class TestTccExtractor(unittest.TestCase):

    def setUp(self):
        self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
        self.preparator = Preparator(self.doc_dir)
        self.extractor = TccExtractor(self.doc_dir)
        self.parse = Parser('tcc.xml')
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_metadata_extractor_generates_metadata_dict(self):
        self.extractor.all_metadata() |should_not| be_empty

    def test_tcc_document_has_one_or_more_confirmed_by_corpus_author_type_metadata(self):
        len(self.extractor._author_metadata()) |should| be_greater_than_or_equal_to(1)
        self.extractor._author_metadata() |should_not| contain('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_title_type_metadata(self):
        self.extractor._title_metadata() |should_not| equal_to('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_a_confirmed_by_corpus_institution_metadata(self):
        self.extractor._institution_metadata() |should_not| equal_to('Instituto Federal de Educação Ciência e Tecnologia ')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_a_confirmed_by_corpus_campus_metadata(self):
        self.extractor._campus_metadata() |should_not| equal_to('')
        self.preparator.remove_converted_document()

    def test_tcc_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        doc = self.extractor._clean_variouspages_doc
        matches = re.search(r'resumo:* (.*?) palavr(a|as)(.|\s)chav(e|es).', doc)
        matches.group() |should| start_with('resumo')
        self.extractor._abstract_metadata() |should_not| equal_to('')
def __init__(self, doc_dir): convertion_style = "-raw" self._eventextractor = EventExtractor(doc_dir) parse = Parser(join(ROOT, 'templates', 'periodic.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
class PeriodicExtractor(object):

    def __init__(self, doc_dir):
        convertion_style = "-raw"
        self._eventextractor = EventExtractor(doc_dir)
        parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    ## Author extraction is delegated to EventExtractor, which implements the same logic
    def _author_metadata(self):
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encrypted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0
        metadata = {'author_metadata': self._author_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages}
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata
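# A minimal sketch of what the abstract regex above matches, run against a
# made-up, already-lowercased sample line (the real input is the converted
# page text held in self._clean_onepage_doc):
import re

sample = 'resumo: este trabalho apresenta um estudo de caso. palavras-chave: metadados; extracao.'
regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
match = regex.search(sample)
if match:
    # group(1) holds only the abstract body, delimited by 'resumo' and the keywords heading
    print match.group(1).strip().capitalize()  # Este trabalho apresenta um estudo de caso.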
def __init__(self, doc_dir): convertion_style = "" parse = Parser(join(ROOT, 'templates', 'event.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ') self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')
class TestPreparation(unittest.TestCase):

    def setUp(self):
        self.parse = Parser('tcc.xml')
        self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
        self.preparator = Preparator(self.doc_dir)
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_pdf_document_exists(self):
        document = basename(self.doc_dir)
        documents = listdir(dirname(self.doc_dir))
        document |should| be_into(documents)

    def test_raw_text_convertion(self):
        convertion_style = ""
        page = self.xml_template_metadata['page']
        self.preparator.raw_text_convertion(page, page, convertion_style)
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc |should| be_into(documents)

    def test_name_corpus_has_a_certain_quantity_of_names(self):
        len(self.preparator.parse_corpus('names')) |should| equal_to(6297)

    def test_temporary_text_files_are_being_removed(self):
        convertion_style = ""
        page = self.xml_template_metadata['page']
        self.preparator.raw_text_convertion(page, page, convertion_style)
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc |should| be_into(documents)
        self.preparator.remove_converted_document()
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc |should_not| be_into(documents)

    def test_institution_corpus_is_a_list_of_institution_names_with_respective_prepositions(self):
        self.preparator.parse_corpus('institution') |should| equal_to(
            [['', 'fluminense'], ['', 'catarinense'], ['', 'baiano'], ['', 'goiano'],
             ['de ', 'tocantins'], ['do ', 'mato grosso'], ['do ', 'par\xc3\xa1'],
             ['da ', 'para\xc3\xadba'], ['de ', 'sergipe'], ['do ', 'cear\xc3\xa1'],
             ['de ', 'roraima'], ['de ', 'alagoas'], ['de ', 'santa catarina'],
             ['do ', 'sul de minas'], ['do ', 'sul de minas gerais'], ['de ', 's\xc3\xa3o paulo'],
             ['do ', 'tri\xc3\xa2ngulo mineiro'], ['de ', 'minas gerais'],
             ['do ', 'sert\xc3\xa3o pernambucano'], ['do ', 'mato grosso do sul'],
             ['da ', 'bahia'], ['de ', 'rondonia'], ['do ', 'rio grande do sul'],
             ['do ', 'rio grande do norte'], ['de ', 'bras\xc3\xadlia'], ['do ', 'norte de minas'],
             ['do ', 'piau\xc3\xad'], ['de ', 'amazonas'], ['do ', 'paran\xc3\xa1'],
             ['de ', 'amap\xc3\xa1'], ['do ', 'acre'], ['de ', 'maranh\xc3\xa3o'],
             ['do ', 'rio de janeiro'], ['de ', 'pernambuco'], ['da ', 'bahia'],
             ['do ', 'esp\xc3\xadrito santo'], ['do ', 'sudeste de minas gerais'],
             ['de ', 'goi\xc3\xa1s'], ['de ', 'farroupilha'], ['de ', 'goi\xc3\xa1s'],
             ['de ', 'campinas']])
def __init__(self, doc_dir): convertion_style = "" parse = Parser(join(ROOT, 'templates', 'tcc.xml')) self._template_metadata = parse.xml_template_metadata() page = self._template_metadata['page'] pages = self._template_metadata['pages'] self._preparator = Preparator(doc_dir) self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style) self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style) self._linetokenized_onepage_raw_doc = open('%s.txt' %self._preparator.doc_dir).readlines() self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ') self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc) self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc) self.linebreak = "\n"
class TccExtractor(object):

    def __init__(self, doc_dir):
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        pages = self._template_metadata['pages']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style)
        self._linetokenized_onepage_raw_doc = open('%s.txt' % self._preparator.doc_dir).readlines()
        self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
        self.linebreak = "\n"

    def _author_metadata(self):
        self.authors = []
        name_corpus = self._preparator.parse_corpus('names')
        residues = self._template_metadata['author_residue']
        breakers = self._template_metadata['author_breaker']
        ## Keep lines that share tokens with the names corpus and carry no residue words
        for line in self._linetokenized_onepage_doc:
            line_mod = set(word_tokenize(line))
            corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            has_breaker = bool(line_mod.intersection(breakers))
            if corpus_common and not has_residue:
                self.authors.append(line.title())
            elif has_breaker:
                break
        return self.authors

    def _title_start_point(self):
        self._title_doc = []
        for line in self._linetokenized_onepage_raw_doc:
            self._title_doc.append(line.decode('utf-8').lower().encode('utf-8'))
        title_start_point = 0  # fall back to the top of the page when no author line was found
        authors = self._author_metadata()
        if authors:
            last_author_index = self._title_doc.index(authors[-1].lower() + self.linebreak)
            nextline = last_author_index + 1
            ## Verify line after last author
            if self._title_doc[nextline] == self.linebreak:
                title_start_point = nextline + 1
            else:
                title_start_point = last_author_index
        return title_start_point

    def _title_metadata(self):
        self.title = ''
        title_start_point = self._title_start_point()
        breakers = self._template_metadata['title_breaker']
        for title_index in range(title_start_point, len(self._title_doc)):
            line_mod = self._title_doc[title_index].split()
            has_breaker = bool(set(line_mod).intersection(breakers))
            if not has_breaker:
                self.title += self._title_doc[title_index].replace(self.linebreak, ' ')
            else:
                break
        self.title = self.title.strip().capitalize()
        return self.title

    def _institution_metadata(self):
        self.institution = 'Instituto Federal de Educação Ciência e Tecnologia '
        institution_validator = set(self._template_metadata['institution_validator'])
        has_institution = bool(institution_validator.intersection(self._wordtokenized_onepage_doc))
        if has_institution:
            institution_corpus = self._preparator.parse_corpus('institution')
            for preposition, institution in institution_corpus:
                institution_mod = set(institution.split())
                ## All words of the corpus entry must appear on the page
                if institution_mod.intersection(self._wordtokenized_onepage_doc) == institution_mod:
                    self.institution = self.institution + preposition + institution.title()
                    break
        return self.institution

    def _campus_metadata(self):
        self.campus = ''
        campus_validator = set(self._template_metadata['campus_validator'])
        has_campus = bool(campus_validator.intersection(self._wordtokenized_onepage_doc))
        if has_campus:
            self.campus_corpus = self._preparator.parse_corpus('campus')
            for campus in self.campus_corpus:
                campus_mod = set(campus.split())
                if campus_mod.intersection(self._wordtokenized_onepage_doc) == campus_mod:
                    self.campus = campus.title()
                    break
        return self.campus

    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|abstract)')
        self.abstract = regex.search(self._clean_variouspages_doc).group(1).strip().capitalize()
        return self.abstract

    def _grade_metadata(self):
        self.grade = ''
        temp_grade_level = 0
        doc = self._raw_onepage_doc.replace('\n', ' ')
        self.grade_references = {('Graduação', 1): self._template_metadata['grade_graduation'],
                                 ('Especialização', 2): self._template_metadata['grade_spec'],
                                 ('Mestrado', 3): self._template_metadata['grade_master_degree'],
                                 ('Doutorado', 4): self._template_metadata['grade_doctoral'],
                                 ('Pós-Doutorado', 5): self._template_metadata['grade_postdoctoral']}
        ## Keep the highest grade level whose reference terms appear in the document
        for grade in self.grade_references.iterkeys():
            grade_type, grade_level = grade
            for grade_name in self.grade_references[grade]:
                if grade_name in doc and grade_level > temp_grade_level:
                    temp_grade_level = grade_level
                    self.grade = grade_type
                    break
        return self.grade

    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encrypted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0
        metadata = {'author_metadata': self._author_metadata(),
                    'grade_metadata': self._grade_metadata(),
                    'title_metadata': self._title_metadata(),
                    'institution_metadata': self._institution_metadata(),
                    'campus_metadata': self._campus_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages}
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata
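# A minimal usage sketch for TccExtractor, assuming the project templates and
# corpora are in place; the PDF path below is hypothetical:
doc_dir = '/path/to/monografia.pdf'  # hypothetical test document
extractor = TccExtractor(doc_dir)
metadata = extractor.all_metadata()  # also removes the temporary converted text file
for field, value in metadata.iteritems():
    print field, '->', value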
class EventExtractor(object):

    def __init__(self, doc_dir):
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'event.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
        self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')

    def _author_metadata(self):
        self.authors = []
        breaker = self._template_metadata['author_breaker'][0]
        residues = self._template_metadata['author_residue']
        name_corpus = self._preparator.parse_corpus('names')
        abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
        has_only_email = False
        for line in self._linetokenized_onepage_doc:
            has_breaker = re.match(breaker, line)
            if has_breaker:
                break
            line_mod = set(word_tokenize(line))
            has_corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            if has_corpus_common and not has_residue:
                ## Strip a trailing e-mail address before keeping the author line
                find_email = self._email_regex.search(line)
                if find_email:
                    email = find_email.group()
                    line = line.replace(email, '').strip()
                if line != '':
                    self.authors.append(line)
        if not self.authors:
            ## Fallback: harvest ABNT-style 'INITIALS. SURNAME;' names from the page text
            clean_onepage_doc = self._clean_onepage_doc
            find_author = abnt_name.search(clean_onepage_doc)
            while find_author:
                author = find_author.group()
                self.authors.append(author)
                clean_onepage_doc = clean_onepage_doc.replace(author, '')
                find_author = abnt_name.search(clean_onepage_doc)
        return self.authors

    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def _title_metadata(self):
        self.title = ''
        self.title_catcher = []
        has_author = False
        authors = self._author_metadata()
        breakers = self._template_metadata['title_breaker']
        ## Collect lines from the top of the page until an author, e-mail or breaker shows up
        for line in self._linetokenized_onepage_doc:
            has_breaker = bool(set(word_tokenize(line)).intersection(breakers))
            has_email = self._email_regex.search(line)
            for author in authors:
                has_author = (author in line) or has_author
            if not has_email and not has_author and not has_breaker:
                self.title_catcher.append(line)
            else:
                self.title = ' '.join(self.title_catcher).capitalize()
                break
        return self.title

    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encrypted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0
        metadata = {'author_metadata': self._author_metadata(),
                    'title_metadata': self._title_metadata(),
                    'abstract_metadata': self._abstract_metadata(),
                    'number_pages': self._pdf_num_pages}
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata
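# A minimal sketch of the ABNT fallback in EventExtractor._author_metadata: when no
# line matches the names corpus, the abnt_name regex harvests 'INITIALS. SURNAME;'
# runs from the page text. The sample string is made up:
import re

abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
sample = 'J. A. SILVA; M. B. SOUZA; Instituto Federal'
find_author = abnt_name.search(sample)
while find_author:
    author = find_author.group()
    print author                         # 'J. A. SILVA;' then 'M. B. SOUZA;'
    sample = sample.replace(author, '')  # consume-and-rescan, as in the method above
    find_author = abnt_name.search(sample)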