def __init__(self, doc_dir):
    """Load the periodic template and prepare the text of its target page.

    ``doc_dir`` is handed unchanged to both ``EventExtractor`` and
    ``Preparator``.
    """
    self._eventextractor = EventExtractor(doc_dir)

    # The template metadata names the page this extractor works on.
    template_path = join(ROOT, 'templates', 'periodic.xml')
    self._template_metadata = Parser(template_path).xml_template_metadata()
    target_page = self._template_metadata['page']

    self._preparator = Preparator(doc_dir)
    # The same page is passed as both page arguments, with the "-raw" style.
    raw_text = self._preparator.raw_text_convertion(
        target_page, target_page, "-raw")
    self._raw_onepage_doc = raw_text
    self._linetokenized_onepage_doc = line_tokenize(raw_text)
    # Single-line variant of the page text: newlines become spaces.
    self._clean_onepage_doc = raw_text.replace('\n', ' ')
class PeriodicExtractor(object):
    """Extract author, abstract and page-count metadata from a periodic document.

    Author extraction is delegated to an ``EventExtractor`` built for the same
    document; the abstract is located with a regular expression over the text
    of the page named by the ``periodic.xml`` template.
    """

    def __init__(self, doc_dir):
        """Load the periodic template and prepare the text of its target page.

        ``doc_dir`` is handed unchanged to both ``EventExtractor`` and
        ``Preparator``.
        """
        convertion_style = "-raw"
        self._eventextractor = EventExtractor(doc_dir)
        parse = Parser(join(ROOT, 'templates', 'periodic.xml'))
        self._template_metadata = parse.xml_template_metadata()
        page = self._template_metadata['page']
        self._preparator = Preparator(doc_dir)
        # The same page is passed as both page arguments.
        self._raw_onepage_doc = self._preparator.raw_text_convertion(
            page, page, convertion_style)
        self._linetokenized_onepage_doc = line_tokenize(self._raw_onepage_doc)
        # Single-line variant of the page text: newlines become spaces.
        self._clean_onepage_doc = self._raw_onepage_doc.replace('\n', ' ')

    def _author_metadata(self):
        """Return the authors reported by the wrapped event extractor.

        The result is also stored on ``self.authors``.
        """
        self.authors = self._eventextractor._author_metadata()
        return self.authors

    def _abstract_metadata(self):
        """Return the abstract found in the cleaned page text, capitalized.

        The abstract is the text between ``resumo`` and the keywords heading
        matched by the pattern below.  When the page holds no such section an
        empty string is returned (and stored on ``self.abstract``) instead of
        failing with ``AttributeError`` on the missing match.
        """
        regex = re.compile(r'resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)')
        found = regex.search(self._clean_onepage_doc)
        # ``search`` yields None when nothing matches; guard before ``group``.
        self.abstract = found.group(1).strip().capitalize() if found else ''
        return self.abstract

    def _count_pdf_pages(self):
        """Return the embedded page count for PDFs, or 0 when unavailable."""
        if self._preparator.doc_ext != '.pdf':
            return 0
        try:
            return self._preparator.pdf_embed_metadata().numPages
        except Exception:
            # Best effort: unreadable embedded metadata is reported and the
            # page count falls back to 0.  Unlike a bare ``except`` this lets
            # KeyboardInterrupt/SystemExit propagate.
            print('Encripted document')
            return 0

    def _remove_converted_document(self):
        """Delete the temporary converted document, tolerating its absence."""
        try:
            self._preparator.remove_converted_document()
        except OSError:
            print('Temporary document already removed..')

    def all_metadata(self):
        """Return a dict with the author, abstract and page-count metadata.

        Keys are ``author_metadata``, ``abstract_metadata`` and
        ``number_pages``.  The temporary converted document is removed even
        when one of the extraction steps raises.
        """
        try:
            self._pdf_num_pages = self._count_pdf_pages()
            metadata = {
                'author_metadata': self._author_metadata(),
                'abstract_metadata': self._abstract_metadata(),
                'number_pages': self._pdf_num_pages,
            }
        finally:
            # Cleanup must not be skipped on failure, otherwise the converted
            # file is left behind.
            self._remove_converted_document()
        return metadata
class TestEventExtractor(unittest.TestCase):
    """Checks that ``EventExtractor`` finds metadata in a sample event PDF."""

    def setUp(self):
        """Build the extractor and template metadata for the sample document."""
        self.doc_dir = join(ROOT_PATH, "testdocs", "event", "1_pt-br.pdf")
        self.preparator = Preparator(self.doc_dir)
        self.extractor = EventExtractor(self.doc_dir)
        self.parse = Parser("event.xml")
        self.xml_template_metadata = self.parse.xml_template_metadata()

    def test_metadata_extractor_generates_metadata_dict(self):
        self.extractor.all_metadata() | should_not | be_empty

    def test_event_document_has_an_abstract_metadata_pattern_found_by_regex(self):
        doc = self.extractor._clean_onepage_doc
        matches = re.search(r"resumo:* (.*?) (palavr(a|as)(.|\s)chav(e|es).|unitermos|descritores)", doc)
        matches.group() | should | start_with("resumo")
        # Call the extractor: comparing the bound method itself to "" can
        # never fail, which made this assertion vacuous.
        # NOTE(review): assumes EventExtractor._abstract_metadata is a plain
        # method like its _author_metadata/_title_metadata siblings -- confirm.
        self.extractor._abstract_metadata() | should_not | equal_to("")

    def test_event_document_has_author_type_metadata(self):
        self.extractor._author_metadata() | should_not | be_empty

    def test_event_document_has_title_type_metadata(self):
        self.extractor._title_metadata() | should_not | be_empty
def setUp(self):
    """Create the preparator, extractor and template metadata fixtures."""
    sample_pdf = join(ROOT_PATH, 'testdocs', 'event', '1_pt-br.pdf')
    self.doc_dir = sample_pdf

    # Both helpers are built from the same sample document path.
    self.preparator = Preparator(sample_pdf)
    self.extractor = EventExtractor(sample_pdf)

    template_parser = Parser('event.xml')
    self.parse = template_parser
    self.xml_template_metadata = template_parser.xml_template_metadata()
def setUp(self):
    """Prepare the sample event document and the objects under test."""
    path_parts = ("testdocs", "event", "1_pt-br.pdf")
    self.doc_dir = join(ROOT_PATH, *path_parts)

    document = self.doc_dir
    self.preparator = Preparator(document)
    self.extractor = EventExtractor(document)

    # Template metadata is parsed once per test from the event template.
    self.parse = Parser("event.xml")
    metadata = self.parse.xml_template_metadata()
    self.xml_template_metadata = metadata