def __init__(self, entityparams=ee.ALL):
    """
    Create the helper objects this extractor works with.

    @param entityparams: value stored as C{self.params} and handed to the
                         citation entity extractor (defaults to C{ee.ALL})
    """
    self.params = entityparams

    # Entity and metadata extraction helpers.
    self.entity_extractor = ee.EntityExtractor()
    self.meta_extractor = MetaExtractor()
    # The citation parser is the only helper configured by the parameters.
    self.cita_parser = CitationEntityExtractor(self.params)

    # Document handling helpers.
    self.document_wrapper = DocumentWrapper()
    self.textual_document = TextualDocument()
    self.email_extractor = ee.EntityExtractor.EmailExtractor()
    self.document_info = DocumentInfo()

    # Text preprocessing helpers.
    self.cleaner = TextCleaner()
    self.lang_identifier = LanguageIdentifier()
class ArticleMetaExtractor(object):
    """
    This class extracts metadata from articles.
    """

    # Patterns used to normalize author names before matching.
    _RE_DOT = re.compile(r'\.')
    _RE_TRAILING_SPACE = re.compile(r' $')
    _RE_NOT_NAME_CHAR = re.compile(r'[^A-Za-z ]')
    _RE_FIRSTNAME = re.compile(r'^([A-Z][a-z]*.*) ([A-Z][A-Za-z]*)?')
    _RE_SURNAME = re.compile(r'([A-Z][a-z]*.*) ([A-Z][A-Za-z]*)$')

    # Patterns used to normalize e-mail addresses before matching.
    _RE_LOCAL_PART = re.compile(r'(.*)(@)')
    _RE_NOT_LETTER = re.compile(r'[^A-Za-z]')

    # Patterns used to recognize file kinds by file name.
    _RE_TXT = re.compile(r'\.txt$')
    _RE_PDF = re.compile(r'\.pdf$')

    def __init__(self, entityparams=ee.ALL):
        """
        Create the helper objects this extractor works with.

        @param entityparams: value stored as C{self.params} and handed to the
                             citation entity extractor (defaults to C{ee.ALL})
        """
        self.params = entityparams
        self.entity_extractor = ee.EntityExtractor()
        self.meta_extractor = MetaExtractor()
        self.cita_parser = CitationEntityExtractor(self.params)
        self.document_wrapper = DocumentWrapper()
        self.textual_document = TextualDocument()
        self.email_extractor = ee.EntityExtractor.EmailExtractor()
        self.document_info = DocumentInfo()
        self.cleaner = TextCleaner()
        self.lang_identifier = LanguageIdentifier()

    def _name_forms(self, full_name):
        """
        Build the list of strings that are searched for in an e-mail address.

        The name is reduced to ASCII letters and spaces, split into a
        "first name" part and a surname, and every combination of a prefix of
        the first part followed by a prefix of the surname is returned.

        @param full_name: full name of a person (may be empty or None)
        @type full_name: str
        @return: candidate strings; empty list for an empty or missing name
        @rtype: [str]
        """
        if not full_name:
            return []

        # Dots become spaces, one trailing space is dropped and everything
        # that is neither an ASCII letter nor a space is removed.
        cleaned = self._RE_DOT.sub(' ', full_name)
        cleaned = self._RE_TRAILING_SPACE.sub('', cleaned)
        cleaned = self._RE_NOT_NAME_CHAR.sub('', cleaned)

        # Split the name into the first-name part and the surname; when a
        # pattern does not match, the whole cleaned name is used instead.
        first_match = self._RE_FIRSTNAME.search(cleaned)
        first = first_match.group(1) if first_match else cleaned
        surname_match = self._RE_SURNAME.search(cleaned)
        surname = surname_match.group(2) if surname_match else cleaned
        first = first.replace(' ', '')

        # NOTE(review): the combination of two empty prefixes yields "",
        # which is contained in every address, so each non-empty name scores
        # at least 1 against every e-mail -- confirm this is intended.
        return [first[:len(first) - j] + surname[:len(surname) - k]
                for k in range(len(surname) + 1)
                for j in range(len(first) + 1)]

    def _assign_emails(self, emails, names):
        """
        This method assigns email addresses to correct names.

        Every e-mail is scored against every name by counting how many of
        the name's forms (see L{_name_forms}) occur, case-insensitively, in
        the letters of the address part before "@". A name keeps the e-mail
        with the highest score seen for it; matched persons get a 'contact'
        relationship set on them in place.

        @param emails: list of emails
        @type emails: [RRSEmail]
        @param names: list of person names
        @type names: [RRSPerson]
        @return: list of person names with emails
        @rtype: [RRSPerson]
        """
        # Start assigning only if at least one author was extracted.
        if not names:
            return names

        # Each author gets an own list of (lower-cased) name forms. An empty
        # name simply has no forms and does not affect the other authors.
        forms_per_name = [
            [form.lower() for form in self._name_forms(person.get('full_name'))]
            for person in names
        ]

        # Keep only the letters of the part before "@" of every address.
        local_parts = []
        for email in emails:
            address = email.get_localpart() + '@' + email.get_domain()
            local = self._RE_LOCAL_PART.search(address).group(1)
            local_parts.append(self._RE_NOT_LETTER.sub('', local).lower())

        # scores[i][j] = number of forms of name j contained in e-mail i.
        # Both sides contain ASCII letters (and spaces) only, so comparing
        # lower-cased strings is a case-insensitive substring search.
        scores = [
            [sum(1 for form in forms if form in local)
             for forms in forms_per_name]
            for local in local_parts
        ]

        # Assign e-mails to names by the highest number of matching forms.
        best_score = [0] * len(forms_per_name)
        assigned = [None] * len(forms_per_name)
        i = 0
        while i < len(local_parts):
            max_p, max_j = 0, 0
            for j, score in enumerate(scores[i]):
                # A name is a candidate only when this e-mail is at least as
                # good as the one the name currently holds.
                if score > max_p and best_score[j] <= score:
                    max_p, max_j = score, j
            if best_score[max_j] < max_p:
                best_score[max_j] = max_p
                assigned[max_j] = i
                # An e-mail may have been displaced; re-evaluate from the
                # start. Scores only grow, so the loop terminates.
                i = 0
            else:
                i += 1

        # Attach the assigned e-mails to the persons.
        for person, email_index in zip(names, assigned):
            if email_index is not None:
                _rel = RRSRelationshipContactPerson()
                _rel.set_entity(RRSContact(email=emails[email_index]))
                person.set('contact', _rel)
        return names

    def extract_data(self, document, module=None, files=None, type=None):
        """
        Output of this method is RRSPublication object with extracted data.

        The meta text of the document is consumed step by step: keywords,
        abstract, title, e-mails, authors and publisher are looked up in this
        order and each step continues with the text left over by the
        previous one.

        @param document: text form of a document
        @type document: str
        @param module: module name
        @type module: str
        @param files: list of files with the document (None means no files)
        @type files: [RRSFile]
        @param type: type of the document
        @type type: str
        @return: document's metadata
        @rtype: RRSPublication
        """
        if files is None:
            files = []

        document = self.cleaner.clean_text(document)
        publication = RRSPublication()

        # Create publication text
        rrs_text = RRSText(content=document, length=len(document))

        # Wrap document
        textual_document = self.document_wrapper.wrap(document)
        meta_text = textual_document.get_meta()

        # Store module information into publication
        publication.set('module', module)

        # Get and store publication language; the credibility is twice the
        # identifier's second value, capped at 100.
        lang_data = self.lang_identifier.identify(meta_text)
        lang = RRSLanguage(name=lang_data[0])
        lang.set('credibility', min(int(lang_data[1] * 2), 100))
        publication.set('language', lang)

        # Get files and store them into publication
        txt_file_path = None
        pdf_file_path = None
        for f in files:
            url = f.get('url')[0].get_entities()[0]
            filename = f.get("filename")
            if (self._RE_TXT.search(filename)
                    or (f.isset('type') and f.get('type') == "txt")):
                txt_file_path = url.get('link')
                rrs_text.set('file', f)
            elif (self._RE_PDF.search(filename)
                    or (f.isset('type') and f.get('type') == "pdf")):
                pdf_file_path = url.get('link')
            # NOTE(review): every file is attached to the publication here,
            # whatever its kind -- confirm against the original layout.
            _rel = RRSRelationshipFilePublication()
            _rel.set_entity(f)
            publication.set('file', _rel)
        publication.set('text', rrs_text)

        # Get publication type; it is detected from the files only when the
        # caller did not supply one and a text file is available.
        if type is None:
            if txt_file_path is not None:
                type = self.document_info.get_document_type(txt_file_path,
                                                            pdf_file_path)
                publication.set('type', RRSPublication_type(type=type))
        else:
            publication.set('type', RRSPublication_type(type=type))

        # Get keywords and store them into publication
        keywords = self.meta_extractor.find_keywords(meta_text)
        for keyword in keywords[0]:
            _rel = RRSRelationshipPublicationKeyword()
            _rel.set_entity(keyword)
            publication.set('keyword', _rel)
        meta_text = keywords[1]

        # Get abstract and store it into publication
        abstract = self.meta_extractor.find_abstract(meta_text)
        publication.set('abstract', abstract[0])
        meta_text = abstract[1]

        # Get title from document and store it into publication
        title = self.meta_extractor.find_title(meta_text)
        publication.set('title', title[0])
        meta_text = title[1]

        # Get emails
        emails = self.email_extractor.get_emails(meta_text)
        meta_text = self.email_extractor.get_rest()

        # Get names, assign emails and store them into publication; the
        # author rank is the 1-based position in the returned list.
        names = self.entity_extractor.find_authors(meta_text)
        assigned_names = self._assign_emails(emails, names[0])
        for rank, name in enumerate(assigned_names, 1):
            _rel = RRSRelationshipPersonPublication(author_rank=rank,
                                                    editor=False)
            _rel.set_entity(name)
            publication.set('person', _rel)
        meta_text = names[1]

        # Get publisher from document and store it into publication
        publisher = self.entity_extractor.find_publisher(meta_text)
        publication.set('publisher', RRSOrganization(title=publisher[0]))

        # Get chapters from document and store them into publication
        for chpt in textual_document.get_chapters():
            _rel = RRSRelationshipPublication_sectionPublication()
            _rel.set_entity(chpt)
            publication.set("publication_section", _rel)

        # Get citations from document and store them into publication
        for cit in textual_document.get_citations():
            if cit is None:
                continue
            _cit = self.cita_parser.extract(cit)
            if _cit.isset('reference'):
                # Link the parsed reference back to the citing publication.
                _cit['reference']['publication'] = publication
            _rel = RRSRelationshipPublicationCitation()
            _rel.set_entity(_cit)
            publication.set("citation", _rel)
        return publication