class TestTccExtractor(unittest.TestCase):
	"""Tests for TccExtractor metadata extraction against a sample TCC PDF."""

	def setUp(self):
		# Sample thesis document shared by every test in this class.
		self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
		self.preparator = Preparator(self.doc_dir)
		self.extractor = TccExtractor(self.doc_dir)
		self.parse = Parser('tcc.xml')
		self.xml_template_metadata = self.parse.xml_template_metadata()

	def test_metadata_extractor_generates_metadata_dict(self):
		self.extractor.all_metadata() |should_not| be_empty

	def test_tcc_document_has_one_or_more_confirmed_by_corpus_author_type_metadata(self):
		len(self.extractor._author_metadata()) |should| be_greater_than_or_equal_to(1)
		self.extractor._author_metadata() |should_not| contain('')
		self.preparator.remove_converted_document()

	def test_tcc_document_has_title_type_metadata(self):
		self.extractor._title_metadata() |should_not| equal_to('')
		self.preparator.remove_converted_document()

	def test_tcc_document_has_a_confirmed_by_corpus_institution_metadata(self):
		# The bare prefix below means "no institution matched"; the extractor
		# must have appended a corpus-confirmed institution name to it.
		self.extractor._institution_metadata() |should_not| equal_to('Instituto Federal de Educação Ciência e Tecnologia ')
		self.preparator.remove_converted_document()

	def test_tcc_document_has_a_confirmed_by_corpus_campus_metadata(self):
		self.extractor._campus_metadata() |should_not| equal_to('')
		self.preparator.remove_converted_document()

	def test_tcc_document_has_an_abstract_metadata_pattern_found_by_regex(self):
		doc = self.extractor._clean_variouspages_doc
		matches = re.search(r'resumo:* (.*?) palavr(a|as)(.|\s)chav(e|es).', doc)
		matches.group() |should| start_with('resumo')
		# FIX: the original compared the bound method object itself to '',
		# which can never be equal — the assertion always passed vacuously.
		self.extractor._abstract_metadata() |should_not| equal_to('')
# --- Example #2 (scraped listing separator) ---
 def __init__(self, doc_dir):
     """Convert the template page of *doc_dir* to raw text for extraction.

     Reads the page number from the periodic XML template, converts that
     single page to text and keeps raw, line-tokenized and newline-stripped
     variants for the extractor methods.
     """
     convertion_style = "-raw"
     self._eventextractor = EventExtractor(doc_dir)
     # FIX: this line was tab-indented while its neighbours use spaces,
     # an IndentationError under `python -tt` (and always under Python 3).
     parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
     self._template_metadata = parse.xml_template_metadata()
     page = self._template_metadata['page']
     self._preparator = Preparator(doc_dir)
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
# --- Example #3 (scraped listing separator) ---
class PeriodicExtractor(object):

    def __init__(self, doc_dir):
        convertion_style = "-raw"
        self._eventextractor = EventExtractor(doc_dir)
    	parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    ## Event authors metadata extractor extends method to periodic author extractor
    def _author_metadata(self):
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encripted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata':      self._author_metadata(),
                    'abstract_metadata':    self._abstract_metadata(),
                    'number_pages':         self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata
# --- Example #4 (scraped listing separator) ---
 def __init__(self, doc_dir):
     """Prepare a one-page text conversion of *doc_dir* for extraction.

     Keeps raw, line-tokenized and newline-stripped variants of the page,
     plus a compiled e-mail pattern used by the extractor methods.
     """
     parse = Parser(join(ROOT, 'templates', 'event.xml'))
     self._template_metadata = parse.xml_template_metadata()
     self._preparator = Preparator(doc_dir)
     page = self._template_metadata['page']
     style = ""
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, style)
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
     self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')
class TestPreparation(unittest.TestCase):
	"""Tests for Preparator: PDF-to-text conversion, temp-file cleanup and
	corpus parsing."""

	def setUp(self):
		self.parse = Parser('tcc.xml')
		self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
		self.preparator = Preparator(self.doc_dir)
		self.xml_template_metadata = self.parse.xml_template_metadata()

	def test_pdf_document_exists(self):
		# The sample PDF must be present in the test-documents folder.
		document = basename(self.doc_dir)
		documents = listdir(dirname(self.doc_dir))
		document |should| be_into (documents)

	def test_raw_text_convertion(self):
		# Converting one page must create a temporary text file next to the PDF.
		convertion_style = ""
		page = self.xml_template_metadata['page']
		self.preparator.raw_text_convertion(page, page, convertion_style)
		documents = listdir(dirname(self.doc_dir))
		self.preparator.temp_text_doc |should| be_into(documents)

	def test_name_corpus_has_a_certain_quantity_of_names(self):
		# NOTE(review): 6297 is the size of the shipped names corpus —
		# update this constant whenever the corpus file changes.
		len(self.preparator.parse_corpus('names')) |should| equal_to(6297)

	def test_temporary_text_files_are_being_removed(self):
		convertion_style = ""
		page = self.xml_template_metadata['page']
		documents = listdir(dirname(self.doc_dir))
		self.preparator.raw_text_convertion(page, page, convertion_style)
		self.preparator.temp_text_doc |should| be_into(documents)
		self.preparator.remove_converted_document()

		# After removal the temp file must be gone from the folder listing.
		documents = listdir(dirname(self.doc_dir))
		self.preparator.temp_text_doc |should_not| be_into(documents)


	def test_institution_corpus_is_a_list_of_institution_names_with_respective_prepositions(self):
		# Each entry pairs a Portuguese preposition ('', 'de ', 'do ', 'da ')
		# with an institution/state name (UTF-8 bytes shown escaped).
		self.preparator.parse_corpus('institution') |should| equal_to([['', 'fluminense'], ['', 'catarinense'],
			['', 'baiano'], ['', 'goiano'], ['de ', 'tocantins'], ['do ', 'mato grosso'], ['do ', 'par\xc3\xa1'],
			['da ', 'para\xc3\xadba'], ['de ', 'sergipe'], ['do ', 'cear\xc3\xa1'], ['de ', 'roraima'], ['de ', 'alagoas'],
			['de ', 'santa catarina'], ['do ', 'sul de minas'], ['do ', 'sul de minas gerais'],
			['de ', 's\xc3\xa3o paulo'], ['do ', 'tri\xc3\xa2ngulo mineiro'], ['de ', 'minas gerais'],
			['do ', 'sert\xc3\xa3o pernambucano'], ['do ', 'mato grosso do sul'], ['da ', 'bahia'],
			['de ', 'rondonia'], ['do ', 'rio grande do sul'], ['do ', 'rio grande do norte'], ['de ', 'bras\xc3\xadlia'],
			['do ', 'norte de minas'], ['do ', 'piau\xc3\xad'], ['de ', 'amazonas'], ['do ', 'paran\xc3\xa1'],
			['de ', 'amap\xc3\xa1'], ['do ', 'acre'], ['de ', 'maranh\xc3\xa3o'], ['do ', 'rio de janeiro'],
			['de ', 'pernambuco'], ['da ', 'bahia'], ['do ', 'esp\xc3\xadrito santo'], ['do ', 'sudeste de minas gerais'],
			['de ', 'goi\xc3\xa1s'], ['de ', 'farroupilha'], ['de ', 'goi\xc3\xa1s'], ['de ', 'campinas']])
# --- Example #6 (scraped listing separator) ---
 def __init__(self, doc_dir):
     """Convert the TCC template pages of *doc_dir* to text for extraction.

     Converts the single title page and the multi-page span named in the
     tcc.xml template, keeping raw, line-tokenized, word-tokenized and
     newline-stripped variants used by the extractor methods.
     """
     convertion_style = ""
     parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
     self._template_metadata = parse.xml_template_metadata()
     page = self._template_metadata['page']
     pages = self._template_metadata['pages']
     self._preparator = Preparator(doc_dir)
     self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
     self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style)
     # FIX: the original `open(...).readlines()` leaked the file handle;
     # the context manager guarantees it is closed.
     with open('%s.txt' % self._preparator.doc_dir) as temp_doc:
         self._linetokenized_onepage_raw_doc = temp_doc.readlines()
     self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
     self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
     self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
     self.linebreak = "\n"
	def setUp(self):
		"""Build the fixture objects shared by the extractor tests."""
		self.parse = Parser('tcc.xml')
		self.xml_template_metadata = self.parse.xml_template_metadata()
		self.doc_dir = join(ROOT_PATH, 'testdocs', 'obtencaograu', 'doctest1.pdf')
		self.preparator = Preparator(self.doc_dir)
		self.extractor = TccExtractor(self.doc_dir)
# --- Example #8 (scraped listing separator) ---
class TccExtractor(object):

    def __init__(self, doc_dir):
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'tcc.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        pages = self._template_metadata['pages']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._raw_variouspages_doc = self._preparator.raw_text_convertion(pages[0], pages[1], convertion_style)
        self._linetokenized_onepage_raw_doc = open('%s.txt' %self._preparator.doc_dir).readlines()
        self._clean_variouspages_doc = self._raw_variouspages_doc.replace('\n', ' ')
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._wordtokenized_onepage_doc = self._preparator.wordtokenized_punctuation_exclusion(self._raw_onepage_doc)
        self.linebreak = "\n"

    def _author_metadata(self):
        self.authors = []
        name_corpus = self._preparator.parse_corpus('names')
        residues = self._template_metadata['author_residue']
        breakers = self._template_metadata['author_breaker']
        for line in self._linetokenized_onepage_doc:
            line_mod = set(word_tokenize(line))
            corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            has_breaker = bool(line_mod.intersection(breakers))
            if corpus_common and not has_residue:
                self.authors.append(line.title())
            elif has_breaker: break
        return self.authors

    def _title_start_point(self):
        self._title_doc = []
        for line in self._linetokenized_onepage_raw_doc:
            self._title_doc.append(line.decode('utf-8').lower().encode('utf-8'))
        authors = self._author_metadata()
        if authors:
            last_author_index = self._title_doc.index(authors[-1].lower() + self.linebreak)
        nextline = last_author_index + 1
        ## Verify line after last author
        if self._title_doc[nextline] == self.linebreak: 
            title_start_point = nextline + 1
        else: title_start_point = last_author_index
        return title_start_point

    def _title_metadata(self):
        self.title = ''
        title_start_point = self._title_start_point()
        breakers = self._template_metadata['title_breaker']
        for title_index in range(title_start_point, len(self._title_doc)):
            line_mod = self._title_doc[title_index].split()
            has_breaker = bool(set(line_mod).intersection(breakers))
            if not has_breaker:
                self.title += self._title_doc[title_index].replace(self.linebreak, ' ')
            else: break
        self.title = self.title.strip().capitalize()
        return self.title

    def _institution_metadata(self):
        self.institution = 'Instituto Federal de Educação Ciência e Tecnologia '
        institution_validator = set(self._template_metadata['institution_validator'])
        has_institution = bool(institution_validator.intersection(self._wordtokenized_onepage_doc))
        if has_institution:
            institution_corpus = self._preparator.parse_corpus('institution')
            for preposition, institution in institution_corpus:
                institution_mod = set(institution.split())
                if institution_mod.intersection(self._wordtokenized_onepage_doc) == institution_mod:
                    self.institution = self.institution + preposition + institution.title()
                    break
        return self.institution

    def _campus_metadata(self):
        self.campus = ''
        campus_validator = set(self._template_metadata['campus_validator'])
        has_campus = bool(campus_validator.intersection(self._wordtokenized_onepage_doc))
        if has_campus:
            self.campus_corpus = self._preparator.parse_corpus('campus')
            for campus in self.campus_corpus:
                campus_mod = set(campus.split())
                if campus_mod.intersection(self._wordtokenized_onepage_doc) == campus_mod:
                    self.campus = campus.title()
                    break
        return self.campus
    
    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|abstract)')
        self.abstract = regex.search(self._clean_variouspages_doc).group(1).strip().capitalize()
        return self.abstract

    def _grade_metadata(self):
        self.grade = ''
        temp_grade_level = 0
        doc = self._raw_onepage_doc.replace('\n', ' ')
        self.grade_references = {('Graduação', 1):      self._template_metadata['grade_graduation'],
                                 ('Especialização', 2): self._template_metadata['grade_spec'],
                                 ('Mestrado', 3):       self._template_metadata['grade_master_degree'],
                                 ('Doutorado', 4):      self._template_metadata['grade_doctoral'],
                                 ('Pós-Doutorado', 5):  self._template_metadata['grade_postdoctoral']
                                 }
        for grade in self.grade_references.iterkeys():
            grade_type, grade_level = grade
            for grade_name in self.grade_references[grade]:
                if grade_name in doc and grade_level > temp_grade_level:
                    temp_grade_level = grade_level
                    self.grade = grade_type
                    break
        return self.grade

    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encripted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata':      self._author_metadata(),
                    'grade_metadata':       self._grade_metadata(),
                    'title_metadata':       self._title_metadata(),
                    'institution_metadata': self._institution_metadata(),
                    'campus_metadata':      self._campus_metadata(),
                    'abstract_metadata':    self._abstract_metadata(),
                    'number_pages':         self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata
 def setUp(self):
     """Build the shared fixtures: template parser and document preparator."""
     self.doc_dir = join(ROOT_PATH, "testdocs", "obtencaograu", "doctest1.pdf")
     self.preparator = Preparator(self.doc_dir)
     self.parse = Parser("tcc.xml")
     self.xml_template_metadata = self.parse.xml_template_metadata()
# --- Example #10 (scraped listing separator) ---
class TestPreparation(unittest.TestCase):
    """Tests for Preparator: PDF-to-text conversion, temp-file cleanup and
    corpus parsing."""

    def setUp(self):
        self.parse = Parser("tcc.xml")
        self.doc_dir = join(ROOT_PATH, "testdocs", "obtencaograu", "doctest1.pdf")
        self.preparator = Preparator(self.doc_dir)
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_pdf_document_exists(self):
        # The sample PDF must be present in the test-documents folder.
        document = basename(self.doc_dir)
        documents = listdir(dirname(self.doc_dir))
        document | should | be_into(documents)

    def test_raw_text_convertion(self):
        # Converting one page must create a temporary text file next to the PDF.
        convertion_style = ""
        page = self.xml_template_metadata["page"]
        self.preparator.raw_text_convertion(page, page, convertion_style)
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc | should | be_into(documents)

    def test_name_corpus_has_a_certain_quantity_of_names(self):
        # NOTE(review): 6297 is the size of the shipped names corpus —
        # update this constant whenever the corpus file changes.
        len(self.preparator.parse_corpus("names")) | should | equal_to(6297)

    def test_temporary_text_files_are_being_removed(self):
        convertion_style = ""
        page = self.xml_template_metadata["page"]
        documents = listdir(dirname(self.doc_dir))
        self.preparator.raw_text_convertion(page, page, convertion_style)
        self.preparator.temp_text_doc | should | be_into(documents)
        self.preparator.remove_converted_document()

        # After removal the temp file must be gone from the folder listing.
        documents = listdir(dirname(self.doc_dir))
        self.preparator.temp_text_doc | should_not | be_into(documents)

    def test_institution_corpus_is_a_list_of_institution_names_with_respective_prepositions(self):
        # Each entry pairs a Portuguese preposition ("", "de ", "do ", "da ")
        # with an institution/state name (UTF-8 bytes shown escaped).
        # NOTE(review): this expected list lacks the ["de ", "campinas"] entry
        # present in the other copy of this test — confirm which is current.
        self.preparator.parse_corpus("institution") | should | equal_to(
            [
                ["", "fluminense"],
                ["", "catarinense"],
                ["", "baiano"],
                ["", "goiano"],
                ["de ", "tocantins"],
                ["do ", "mato grosso"],
                ["do ", "par\xc3\xa1"],
                ["da ", "para\xc3\xadba"],
                ["de ", "sergipe"],
                ["do ", "cear\xc3\xa1"],
                ["de ", "roraima"],
                ["de ", "alagoas"],
                ["de ", "santa catarina"],
                ["do ", "sul de minas"],
                ["do ", "sul de minas gerais"],
                ["de ", "s\xc3\xa3o paulo"],
                ["do ", "tri\xc3\xa2ngulo mineiro"],
                ["de ", "minas gerais"],
                ["do ", "sert\xc3\xa3o pernambucano"],
                ["do ", "mato grosso do sul"],
                ["da ", "bahia"],
                ["de ", "rondonia"],
                ["do ", "rio grande do sul"],
                ["do ", "rio grande do norte"],
                ["de ", "bras\xc3\xadlia"],
                ["do ", "norte de minas"],
                ["do ", "piau\xc3\xad"],
                ["de ", "amazonas"],
                ["do ", "paran\xc3\xa1"],
                ["de ", "amap\xc3\xa1"],
                ["do ", "acre"],
                ["de ", "maranh\xc3\xa3o"],
                ["do ", "rio de janeiro"],
                ["de ", "pernambuco"],
                ["da ", "bahia"],
                ["do ", "esp\xc3\xadrito santo"],
                ["do ", "sudeste de minas gerais"],
                ["de ", "goi\xc3\xa1s"],
                ["de ", "farroupilha"],
                ["de ", "goi\xc3\xa1s"],
            ]
        )
# --- Example #11 (scraped listing separator) ---
class EventExtractor(object):        
        
    def __init__(self, doc_dir):
        convertion_style = ""
        parse = Parser(join(ROOT, 'templates', 'event.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        self._raw_onepage_doc = self._preparator.raw_text_convertion(page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')
        self._email_regex = re.compile(r'(\w+[.|\w])*@(\w+[.])*\w+')

    def _author_metadata(self):
        self.authors = []
        breaker = self._template_metadata['author_breaker'][0]
        residues = self._template_metadata['author_residue']
        name_corpus = self._preparator.parse_corpus('names') 
        abnt_name = re.compile(r'(\w[.]\s)*(\w+[;])')
        has_only_email = False
        for line in self._linetokenized_onepage_doc:
            has_breaker = re.match(breaker, line)
            if has_breaker: break
            line_mod = set(word_tokenize(line))
            has_corpus_common = bool(line_mod.intersection(name_corpus))
            has_residue = bool(line_mod.intersection(residues))
            if has_corpus_common and not has_residue:
                find_email = self._email_regex.search(line)
                if find_email:
                    email = find_email.group()
                    line = line.replace(email, '').strip()
                if line != '': self.authors.append(line)
        if not self.authors:
            clean_onepage_doc = self._clean_onepage_doc
            find_author = abnt_name.search(clean_onepage_doc)
            while find_author:
                author = find_author.group()
                self.authors.append(author)
                clean_onepage_doc = clean_onepage_doc.replace(author, '')
                find_author = abnt_name.search(clean_onepage_doc)
        return self.authors

    def _abstract_metadata(self):
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        self.abstract = regex.search(self._clean_onepage_doc).group(1).strip().capitalize()
        return self.abstract

    def _title_metadata(self):
        self.title = ''
        self.title_catcher = []
        has_author = False
        authors = self._author_metadata()
        breakers = self._template_metadata['title_breaker']
        for line in self._linetokenized_onepage_doc:
            has_breaker = bool(set(word_tokenize(line)).intersection(breakers))
            has_email = self._email_regex.search(line)
            for author in authors:
                has_author = (author in line) or has_author
            if not has_email and not has_author and not has_breaker:
                self.title_catcher.append(line)
            else: 
                self.title = ' '.join(self.title_catcher).capitalize()
                break
        return self.title

    
    def all_metadata(self):
        if self._preparator.doc_ext == '.pdf':
            try:
                pdf_embed_metadata = self._preparator.pdf_embed_metadata()
                self._pdf_num_pages = pdf_embed_metadata.numPages
            except:
                print 'Encripted document'
                self._pdf_num_pages = 0
        else:
            self._pdf_num_pages = 0

        metadata = {'author_metadata':      self._author_metadata(),
                    'title_metadata':       self._title_metadata(),
                    'abstract_metadata':    self._abstract_metadata(),
                    'number_pages':         self._pdf_num_pages
                    }
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print 'Temporary document already removed..'
        return metadata