def __init__(self, doc_dir):
    """Build the single-page raw-text views this extractor works on.

    Converts page `page` of the document (per the periodic template) into
    raw text, then keeps line-tokenized and newline-stripped variants.
    """
    raw_style = "-raw"
    self._eventextractor = EventExtractor(doc_dir)
    template_parser = Parser(join(ROOT, 'templates', 'periodic.xml'))
    self._template_metadata = template_parser.xml_template_metadata()
    first_page = self._template_metadata['page']
    self._preparator = Preparator(doc_dir)
    self._raw_onepage_doc = self._preparator.raw_text_convertion(
        first_page, first_page, raw_style)
    self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
    self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
def __init__(self, doc_dir):
    """Build single-page text views plus the e-mail regex used by extraction.

    Uses the event template to pick the page, converts it to plain text,
    and keeps line-tokenized and newline-stripped variants.
    """
    plain_style = ""
    template_parser = Parser(join(ROOT, 'templates', 'event.xml'))
    self._template_metadata = template_parser.xml_template_metadata()
    first_page = self._template_metadata['page']
    self._preparator = Preparator(doc_dir)
    self._raw_onepage_doc = self._preparator.raw_text_convertion(
        first_page, first_page, plain_style)
    self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
    self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
    self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')
def __init__(self, doc_dir):
    """Build the one-page and multi-page text views for TCC extraction.

    Converts the title page and the template-declared page range to raw
    text, then derives line-tokenized, word-tokenized and newline-stripped
    variants used by the metadata extraction methods.
    """
    convertion_style = ""
    parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
    self._template_metadata = parse.xml_template_metadata()
    page = self._template_metadata['page']
    pages = self._template_metadata['pages']
    self._preparator = Preparator(doc_dir)
    self._raw_onepage_doc = self._preparator.raw_text_convertion(
        page, page, convertion_style)
    self._raw_variouspages_doc = self._preparator.raw_text_convertion(
        pages[0], pages[1], convertion_style)
    # BUG FIX: the converted-text file was opened via open(...).readlines()
    # and never closed, leaking the handle; close it deterministically.
    with open('%s.txt' % self._preparator.doc_dir) as converted:
        self._linetokenized_onepage_raw_doc = converted.readlines()
    self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
    self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
    self._wordtokenized_onepage_doc = \
        self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
    self.linebreak = "\n"
class TestTccExtractor(unittest.TestCase):
    """Exercises TccExtractor against the sample TCC document."""

    def setUp(self):
        self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
        self.preparator = Preparator(self.doc_dir)
        self.extractor = TccExtractor(self.doc_dir)
        self.parse = Parser('tcc.xml')
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_metadata_extractor_generates_metadata_dict(self):
        self.extractor.all_metadata() |should_not| be_empty

    def test_tcc_document_has_one_or_more_confirmed_by_corpus_author_type_metadata(self):
        len(self.extractor._author_metadata()) |should| be_greater_than_or_equal_to(1)
        self.extractor._author_metadata() |should_not| contain('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_title_type_metadata(self):
        self.extractor._title_metadata() |should_not| equal_to('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_a_confirmed_by_corpus_institution_metadata(self):
        self.extractor._institution_metadata() |should_not| equal_to('Instituto Federal de Educação Ciência e Tecnologia ')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_a_confirmed_by_corpus_campus_metadata(self):
        self.extractor._campus_metadata() |should_not| equal_to('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        doc = self.extractor._clean_variouspages_doc
        matches = re.search(r'resumo:* (.*?) palavr(a|as)(.|\s)chav(e|es).', doc)
        matches.group() |should| start_with('resumo')
        # BUG FIX: _abstract_metadata was referenced without calling it, so the
        # assertion compared a bound method to '' and could never fail.
        self.extractor._abstract_metadata() |should_not| equal_to('')
class TestParser(unittest.TestCase):
    """Exercises Parser template-metadata extraction for both templates."""

    def setUp(self):
        self.tccParse = Parser(join(TEMPLATE_PATH, 'tcc.xml'))
        self.eventParse = Parser(join(TEMPLATE_PATH, 'event.xml'))

    def test_parser_receive_a_xml_directory(self):
        # BUG FIX: `a and b |should| ...` only asserted on `b` because `and`
        # binds looser than `|`; assert each parser's path separately.
        self.tccParse.file_path |should| be_like(r'.*.xml')
        self.eventParse.file_path |should| be_like(r'.*.xml')

    def test_tcc_onepage_metadata_hash_has_valid_keys_and_values(self):
        # BUG FIX: renamed from `tes_...` so the test runner discovers it, and
        # each expected key is now asserted individually — the original comma
        # expression built a tuple and only piped its last element through
        # |should| (it also concatenated "grade_doctoral" "grade_postdoctoral"
        # because of a missing comma).
        onepage_parser = self.tccParse._onepage_metadata()
        expected_keys = ("author_residue", "author_breaker",
                         "institution_validator", "campus_validator",
                         "grade_graduation", "grade_spec",
                         "grade_master_degree", "grade_doctoral",
                         "grade_postdoctoral")
        for expected in expected_keys:
            expected |should| be_into(onepage_parser.keys())
        for key in onepage_parser.keys():
            onepage_parser.get(key) |should_not| equal_to([])

    def test_event_onepage_metadata_hash_has_valid_keys_and_values(self):
        # BUG FIX: this test inspected the tcc parser; use the event parser,
        # and assert each expected key (same tuple-pipe bug as above).
        onepage_parser = self.eventParse._onepage_metadata()
        for expected in ("author_breaker", "author_residue", "title_breaker"):
            expected |should| be_into(onepage_parser.keys())
        for key in onepage_parser.keys():
            onepage_parser.get(key) |should_not| equal_to([])

    def test_tcc_variouspages_metadata_hash_has_valid_keys_and_values(self):
        variouspages_parser = self.tccParse._variouspages_metadata()
        "pages" |should| be_into(variouspages_parser.keys())

    def test_merge_all_template_metadata_into_one_dict(self):
        onepage_metatada = self.tccParse._onepage_metadata()
        variouspages_metatada = self.tccParse._variouspages_metadata()
        xml_template_metadata = self.tccParse.xml_template_metadata()
        xml_template_metadata.keys() |should_not| be_empty
class TestPeriodicExtractor(unittest.TestCase):
    """Exercises PeriodicExtractor against the sample periodic document."""

    def setUp(self):
        self.doc_dir = join(ROOT_PATH, "testdocs", "periodic", "1_pt-br.pdf")
        self.preparator = Preparator(self.doc_dir)
        self.extractor = PeriodicExtractor(self.doc_dir)
        self.parse = Parser("periodic.xml")
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_periodic_document_has_author_type_metadata(self):
        self.extractor._author_metadata() | should_not | be_empty

    def test_event_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        doc = self.extractor._clean_onepage_doc
        matches = re.search(r"resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)", doc)
        matches.group() | should | start_with("resumo")
        # BUG FIX: _abstract_metadata was referenced without calling it, so the
        # assertion compared a bound method to "" and could never fail.
        self.extractor._abstract_metadata() | should_not | equal_to("")
class TestPreparation(unittest.TestCase):
    """Exercises Preparator: PDF-to-text conversion, corpus parsing and
    temporary-file cleanup, against the sample TCC document."""

    def setUp(self):
        self.parse = Parser('tcc.xml')
        self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
        self.preparator = Preparator(self.doc_dir)
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_pdf_document_exists(self):
        # The sample PDF must be present in its testdocs directory.
        document = basename(self.doc_dir)
        documents = listdir(dirname(self.doc_dir))
        document |should| be_into (documents)

    def test_raw_text_convertion(self):
        # Converting a single page should create the temporary text file
        # alongside the source document.
        convertion_style = ""
        page = self.xml_template_metadata['page']
        self.preparator.raw_text_convertion(page, page, convertion_style)
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc |should| be_into(documents)

    def test_name_corpus_has_a_certain_quantity_of_names(self):
        # Pinned size of the bundled names corpus.
        len(self.preparator.parse_corpus('names')) |should| equal_to(6297)

    def test_temporary_text_files_are_being_removed(self):
        # The temp text file exists after conversion and is gone after
        # remove_converted_document().
        convertion_style = ""
        page = self.xml_template_metadata['page']
        documents = listdir(dirname(self.doc_dir))
        self.preparator.raw_text_convertion(page, page, convertion_style)
        self.preparator.temp_text_doc |should| be_into(documents)
        self.preparator.remove_converted_document()
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc |should_not| be_into(documents)

    def test_institution_corpus_is_a_list_of_institution_names_with_respective_prepositions(self):
        # Exact expected content of the institution corpus: each entry is
        # [preposition, institution name] (UTF-8 bytes escaped literally).
        self.preparator.parse_corpus('institution') |should| equal_to([['', 'fluminense'], ['', 'catarinense'], ['', 'baiano'], ['', 'goiano'], ['de ', 'tocantins'], ['do ', 'mato grosso'], ['do ', 'par\xc3\xa1'], ['da ', 'para\xc3\xadba'], ['de ', 'sergipe'], ['do ', 'cear\xc3\xa1'], ['de ', 'roraima'], ['de ', 'alagoas'], ['de ', 'santa catarina'], ['do ', 'sul de minas'], ['do ', 'sul de minas gerais'], ['de ', 's\xc3\xa3o paulo'], ['do ', 'tri\xc3\xa2ngulo mineiro'], ['de ', 'minas gerais'], ['do ', 'sert\xc3\xa3o pernambucano'], ['do ', 'mato grosso do sul'], ['da ', 'bahia'], ['de ', 'rondonia'], ['do ', 'rio grande do sul'], ['do ', 'rio grande do norte'], ['de ', 'bras\xc3\xadlia'], ['do ', 'norte de minas'], ['do ', 'piau\xc3\xad'], ['de ', 'amazonas'], ['do ', 'paran\xc3\xa1'], ['de ', 'amap\xc3\xa1'], ['do ', 'acre'], ['de ', 'maranh\xc3\xa3o'], ['do ', 'rio de janeiro'], ['de ', 'pernambuco'], ['da ', 'bahia'], ['do ', 'esp\xc3\xadrito santo'], ['do ', 'sudeste de minas gerais'], ['de ', 'goi\xc3\xa1s'], ['de ', 'farroupilha'], ['de ', 'goi\xc3\xa1s'], ['de ', 'campinas']])
class TestEventExtractor(unittest.TestCase):
    """Exercises EventExtractor against the sample event document."""

    def setUp(self):
        self.doc_dir = join(ROOT_PATH, 'testdocs', 'event', '1_pt-br.pdf')
        self.preparator = Preparator(self.doc_dir)
        self.extractor = EventExtractor(self.doc_dir)
        self.parse = Parser('event.xml')
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_metadata_extractor_generates_metadata_dict(self):
        self.extractor.all_metadata() |should_not| be_empty

    def test_event_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        doc = self.extractor._clean_onepage_doc
        matches = re.search(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)', doc)
        matches.group() |should| start_with('resumo')
        # BUG FIX: _abstract_metadata was referenced without calling it, so the
        # assertion compared a bound method to '' and could never fail.
        self.extractor._abstract_metadata() |should_not| equal_to('')

    def test_event_document_has_author_type_metadata(self):
        self.extractor._author_metadata() |should_not| be_empty

    def test_event_document_has_title_type_metadata(self):
        self.extractor._title_metadata() |should_not| be_empty
def setUp(self):
    """TCC fixture: sample document, its parser, preparator and extractor."""
    self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
    self.parse = Parser('tcc.xml')
    self.xml_template_metadata = self.parse.xml_template_metadata()
    self.preparator = Preparator(self.doc_dir)
    self.extractor = TccExtractor(self.doc_dir)
def setUp(self):
    """Periodic fixture: sample document, its parser, preparator and extractor."""
    self.doc_dir = join(ROOT_PATH, 'testdocs', 'periodic', '1_pt-br.pdf')
    self.parse = Parser('periodic.xml')
    self.xml_template_metadata = self.parse.xml_template_metadata()
    self.preparator = Preparator(self.doc_dir)
    self.extractor = PeriodicExtractor(self.doc_dir)
def setUp(self):
    """Build one Parser per XML template layout under TEMPLATE_PATH."""
    tcc_template = join(TEMPLATE_PATH, 'tcc.xml')
    event_template = join(TEMPLATE_PATH, 'event.xml')
    self.tccParse = Parser(tcc_template)
    self.eventParse = Parser(event_template)
def setUp(self):
    """Preparation fixture: sample TCC document plus its template metadata."""
    self.doc_dir = join(ROOT_PATH, "testdocs", "obtencaograu", "doctest1.pdf")
    self.preparator = Preparator(self.doc_dir)
    self.parse = Parser("tcc.xml")
    self.xml_template_metadata = self.parse.xml_template_metadata()
class TestPreparation(unittest.TestCase):
    """Exercises Preparator: PDF-to-text conversion, corpus parsing and
    temporary-file cleanup, against the sample TCC document."""

    def setUp(self):
        self.parse = Parser("tcc.xml")
        self.doc_dir = join(ROOT_PATH, "testdocs", "obtencaograu", "doctest1.pdf")
        self.preparator = Preparator(self.doc_dir)
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_pdf_document_exists(self):
        # The sample PDF must be present in its testdocs directory.
        document = basename(self.doc_dir)
        documents = listdir(dirname(self.doc_dir))
        document | should | be_into(documents)

    def test_raw_text_convertion(self):
        # Converting a single page should create the temporary text file
        # alongside the source document.
        convertion_style = ""
        page = self.xml_template_metadata["page"]
        self.preparator.raw_text_convertion(page, page, convertion_style)
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc | should | be_into(documents)

    def test_name_corpus_has_a_certain_quantity_of_names(self):
        # Pinned size of the bundled names corpus.
        len(self.preparator.parse_corpus("names")) | should | equal_to(6297)

    def test_temporary_text_files_are_being_removed(self):
        # The temp text file exists after conversion and is gone after
        # remove_converted_document().
        convertion_style = ""
        page = self.xml_template_metadata["page"]
        documents = listdir(dirname(self.doc_dir))
        self.preparator.raw_text_convertion(page, page, convertion_style)
        self.preparator.temp_text_doc | should | be_into(documents)
        self.preparator.remove_converted_document()
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc | should_not | be_into(documents)

    def test_institution_corpus_is_a_list_of_institution_names_with_respective_prepositions(self):
        # Exact expected content of the institution corpus: each entry is
        # [preposition, institution name] (UTF-8 bytes escaped literally).
        # NOTE(review): this expected list ends at ["de ", "goi\xc3\xa1s"] and
        # lacks the ["de ", "campinas"] entry present in the other copy of this
        # test in the file — confirm which version matches the corpus.
        self.preparator.parse_corpus("institution") | should | equal_to(
            [
                ["", "fluminense"],
                ["", "catarinense"],
                ["", "baiano"],
                ["", "goiano"],
                ["de ", "tocantins"],
                ["do ", "mato grosso"],
                ["do ", "par\xc3\xa1"],
                ["da ", "para\xc3\xadba"],
                ["de ", "sergipe"],
                ["do ", "cear\xc3\xa1"],
                ["de ", "roraima"],
                ["de ", "alagoas"],
                ["de ", "santa catarina"],
                ["do ", "sul de minas"],
                ["do ", "sul de minas gerais"],
                ["de ", "s\xc3\xa3o paulo"],
                ["do ", "tri\xc3\xa2ngulo mineiro"],
                ["de ", "minas gerais"],
                ["do ", "sert\xc3\xa3o pernambucano"],
                ["do ", "mato grosso do sul"],
                ["da ", "bahia"],
                ["de ", "rondonia"],
                ["do ", "rio grande do sul"],
                ["do ", "rio grande do norte"],
                ["de ", "bras\xc3\xadlia"],
                ["do ", "norte de minas"],
                ["do ", "piau\xc3\xad"],
                ["de ", "amazonas"],
                ["do ", "paran\xc3\xa1"],
                ["de ", "amap\xc3\xa1"],
                ["do ", "acre"],
                ["de ", "maranh\xc3\xa3o"],
                ["do ", "rio de janeiro"],
                ["de ", "pernambuco"],
                ["da ", "bahia"],
                ["do ", "esp\xc3\xadrito santo"],
                ["do ", "sudeste de minas gerais"],
                ["de ", "goi\xc3\xa1s"],
                ["de ", "farroupilha"],
                ["de ", "goi\xc3\xa1s"],
            ]
        )
def setUp(self):
    """Periodic fixture: sample document, its parser, preparator and extractor."""
    self.doc_dir = join(ROOT_PATH, "testdocs", "periodic", "1_pt-br.pdf")
    self.parse = Parser("periodic.xml")
    self.xml_template_metadata = self.parse.xml_template_metadata()
    self.preparator = Preparator(self.doc_dir)
    self.extractor = PeriodicExtractor(self.doc_dir)