class TEIExtractor: def __init__(self, file, db, xml, data_file, test_tsv=None): self.file = file self.xml = xml self.uni_rank = ReadPickle('uni_rank.pickle') self.sjr = ReadPickle('journal_dictionary.pkl') self.document = test_tsv self.test_csv = data_file self.paper = Paper() with open(file, 'rb') as tei: self.soup = BeautifulSoup(tei, features="lxml") self.db=db # TODO: return paper | redesign extractor to make it more modular to test individual components def get_self_citations(self): # DOI doi = self.soup.teiheader.find("idno", type="DOI") if doi: self.paper.doi = elem_to_text(doi) elif self.document: self.paper.doi = self.document['doi'] # Title title = self.soup.teiheader.find("title") if title: self.paper.title = elem_to_text(title) # Authors authors = self.get_authors(self.soup.analytic.find_all('author')) if authors: self.paper.authors = authors if self.soup.abstract: self.paper.abstract = elem_to_text(self.soup.abstract) # Citations bibliography = self.soup.listbibl.find_all('biblstruct') for bibl in bibliography: citation = Citation() cited_paper = bibl.analytic if cited_paper: citation.title = elem_to_text(cited_paper.find("title", type="main")) citation_authors = self.get_authors(cited_paper.find_all("author")) citation.doi = elem_to_text(cited_paper.find("idno", type="DOI")) if citation_authors: citation.authors = citation_authors cited_journal = bibl.monogr if cited_journal: citation.source = elem_to_text(cited_journal.find("title")) try: citation.publish_year = cited_journal.imprint.date['when'] except TypeError: pass self.paper.citations.append(citation) self.paper.set_self_citations() return {'doi': self.paper.doi, 'title': self.paper.title, 'total_citations': len(self.paper.citations), 'self_citations': self.paper.self_citations, 'abstract': self.paper.abstract} def get_reading_score(self, abstract): if isinstance(abstract,str): if not abstract: return 0 return textstat.flesch_reading_ease(abstract) return 0 def get_subjectivity(self, abstract): if isinstance(abstract,str): if len(abstract)<10: return -1 txtblob = TextBlob(abstract) return txtblob.sentiment.subjectivity return -1 def get_sentiment(self, abstract): if isinstance(abstract,str): if len(abstract)<10: return -1 label = predictor.predict(abstract)['label'] return int(label) return -1 def extract_paper_info(self): # DOI doi = self.soup.teiheader.find("idno", type="DOI") if doi: self.paper.doi = elem_to_text(doi) elif self.document: self.paper.doi = self.document['doi'] # Title title = self.soup.teiheader.find("title") if title: self.paper.title = elem_to_text(title) # Authors authors = self.get_authors(self.soup.analytic.find_all('author')) if authors: self.paper.authors = authors if self.soup.abstract: self.paper.abstract = elem_to_text(self.soup.abstract) # Year published = self.soup.analytic.find("publicationstmt") if published: self.paper.year = elem_to_text(published.find("date", type="when")) # Organization / Affiliations affiliations = self.soup.analytic.find_all('affiliation') for affiliation in affiliations: org = Organization() org.type = "institution" org.name = elem_to_text(affiliation.find("orgname", type="institution")) address = Address() addr = affiliation.find("address") if addr: address.place = elem_to_text(addr.find("settlement")) address.region = elem_to_text(addr.find("region")) address.country = elem_to_text(addr.find("country")) org.address = address self.paper.affiliations.append(org) # University Ranking if self.paper.affiliations: if self.paper.affiliations[0] != '': self.paper.uni_rank = self.uni_rank.get_rank(self.paper.affiliations[0].name) elif len(self.paper.affiliations) > 1: self.paper.uni_rank = self.uni_rank.get_rank(self.paper.affiliations[1].name) else: self.paper.uni_rank = self.uni_rank.get_rank('Random') # Citations bibliography = self.soup.listbibl.find_all('biblstruct') for bibl in bibliography: citation = Citation() cited_paper = bibl.analytic if cited_paper: citation.title = elem_to_text(cited_paper.find("title", type="main")) citation_authors = self.get_authors(cited_paper.find_all("author")) citation.doi = elem_to_text(cited_paper.find("idno", type="DOI")) if citation_authors: citation.authors = citation_authors cited_journal = bibl.monogr if cited_journal: citation.source = elem_to_text(cited_journal.find("title")) try: citation.publish_year = cited_journal.imprint.date['when'] except TypeError: pass self.paper.citations.append(citation) # NER - Ack pairs - Funding status self.paper.ack_pairs = self.get_funding_status() er_list = [org for (entity, org) in self.paper.ack_pairs] if 'ORG' in er_list: self.paper.funded = 1 else: self.paper.funded = 0 # SJR api_resp = self.get_sjr(self.paper.doi, self.paper.title,self.db) # Adding the paragraphs from the paper to the corpus extractor = ClaimEvidenceExtractor(self.xml, self.soup,self.test_csv) os.chdir("/scifact/") extractor.make_corpus() # Get response for claim evidence using request to API response = requests.get('http://0.0.0.0:8000/getclaimevidence') print(response) self.support, self.refute, self.ratio = extractor.get_results() print('support:',self.support,'refute:',self.refute,'ratio:',self.ratio) os.chdir("../") if api_resp: self.paper.cited_by_count = api_resp["num_citations"] self.paper.sjr = api_resp["sjr"] self.paper.subject = api_resp["subject"] self.paper.subject_code = api_resp["subject_code"] self.paper.normalized = api_resp["normalized_citations"] self.paper.velocity = api_resp["citationVelocity"] self.paper.influentialcitations = api_resp["influentialCitationCount"] self.paper.references = api_resp["references_count"] self.paper.flag = api_resp["openaccessflag"] self.paper.influentialref = api_resp["influentialReferencesCount"] self.paper.ref_background = api_resp["reference_background"] self.paper.ref_result = api_resp["reference_result"] self.paper.ref_method = api_resp["reference_methodology"] self.paper.cite_background = api_resp["citations_background"] self.paper.cite_result = api_resp["citations_result"] self.paper.cite_method = api_resp["citations_methodology"] self.paper.cite_next = api_resp["citations_next"] self.paper.influential_references_methodology = api_resp["upstream_influential_methodology_count"] self.paper.issn = api_resp["ISSN"] self.paper.auth = api_resp["authors"] self.paper.age = api_resp["age"] if api_resp["abstract"]: self.paper.abstract = api_resp["abstract"] # Set self-citations self.paper.self_citations = self.paper.set_self_citations() # return paper #calculate coCitations t2,t3 = coCite(self.paper.doi, self.db) #calculate NLP features reading_score = self.get_reading_score(self.paper.abstract) subjectivity = self.get_subjectivity(self.paper.abstract) sentiment = self.get_sentiment(self.paper.abstract) return {"doi": self.paper.doi, "title": self.paper.title, "num_citations": self.paper.cited_by_count, "author_count": len(self.paper.authors),"sjr": self.paper.sjr, "u_rank": self.paper.uni_rank, "funded": self.paper.funded,"self_citations": self.paper.self_citations, "subject": self.paper.subject, "subject_code": self.paper.subject_code, "citationVelocity": self.paper.velocity, "influentialCitationCount": self.paper.influentialcitations, "references_count": self.paper.references, "openaccessflag": self.paper.flag, "influentialReferencesCount": self.paper.influentialref, "normalized_citations": self.paper.normalized, "reference_background": self.paper.ref_background, "reference_result": self.paper.ref_result, "reference_methodology": self.paper.ref_method, "citations_background": self.paper.cite_background, "citations_result": self.paper.cite_result, "citations_methodology": self.paper.cite_method, "citations_next": self.paper.cite_next, "upstream_influential_methodology_count": self.paper.influential_references_methodology, "coCite2":t2, "coCite3":t3, "ISSN":self.paper.issn, "authors":self.paper.auth,"citations":api_resp["citations"],"age":self.paper.age, "reading_score":reading_score, "subjectivity":subjectivity, "sentiment":sentiment, "supporting_sentences":self.support, "refuting_sentences":self.refute, "ratio_support":self.ratio} @staticmethod def get_authors(authors): authors_list = [] for author in authors: person = Author() pers_name = author.persname if not pers_name: continue person.first_name = elem_to_text(pers_name.find("forename", type="first")) person.middle_name = elem_to_text(pers_name.find("forename", type="middle")) person.surname = elem_to_text(pers_name.surname) person.set_name() if not any(auth.name == person.name for auth in authors_list): authors_list.append(person) return authors_list def get_funding_status(self): pairs = NER(XML2ack(self.file)) return pairs @staticmethod def get_sjr(doi,title,db): response = getsemantic(doi,title,db) crossref = response.get_row() scopus_search = response.return_search() serial_title = response.return_serialtitle() semantic = response.return_semantic() #pdb.set_trace() final = {"doi":response.doi, "title":response.title, "sjr": response.sjr, "num_citations": response.citedby,"subject":response.subject,"subject_code":response.subject_code,"normalized_citations":response.normalized,"citationVelocity":response.velocity,"influentialCitationCount":response.incite,"references_count":response.refcount,"openaccessflag":response.openaccess,"influentialReferencesCount":response.inref, "reference_background": response.refback, "reference_result":response.refresult, "reference_methodology":response.refmeth,"citations_background":response.cback,"citations_result":response.cresult,"citations_methodology":response.cmeth, "citations_next":response.next, "upstream_influential_methodology_count": response.upstream_influential_methodology_count, "ISSN": response.issn, "authors":response.auth, "citations":response.citations,"age":response.age,"abstract":response.ab} return final
class TEIExtractor: def __init__(self, file, test_tsv=None): self.file = file self.uni_rank = ReadPickle('uni_rank.pickle') self.sjr = ReadPickle('journal_dictionary.pkl') self.document = test_tsv self.paper = Paper() with open(file, 'rb') as tei: self.soup = BeautifulSoup(tei, features="lxml") # TODO: return paper | redesign extractor to make it more modular to test individual components def get_self_citations(self): # DOI doi = self.soup.teiheader.find("idno", type="DOI") if doi: self.paper.doi = elem_to_text(doi) elif self.document: self.paper.doi = self.document['doi'] # Title title = self.soup.teiheader.find("title") if title: self.paper.title = elem_to_text(title) # Authors authors = self.get_authors(self.soup.analytic.find_all('author')) if authors: self.paper.authors = authors # Citations bibliography = self.soup.listbibl.find_all('biblstruct') for bibl in bibliography: citation = Citation() cited_paper = bibl.analytic if cited_paper: citation.title = elem_to_text( cited_paper.find("title", type="main")) citation_authors = self.get_authors( cited_paper.find_all("author")) citation.doi = elem_to_text( cited_paper.find("idno", type="DOI")) if citation_authors: citation.authors = citation_authors cited_journal = bibl.monogr if cited_journal: citation.source = elem_to_text(cited_journal.find("title")) try: citation.publish_year = cited_journal.imprint.date['when'] except TypeError: pass self.paper.citations.append(citation) self.paper.set_self_citations() return { 'doi': self.paper.doi, 'title': self.paper.title, 'total_citations': len(self.paper.citations), 'self_citations': self.paper.self_citations } def extract_paper_info(self): # DOI doi = self.soup.teiheader.find("idno", type="DOI") if doi: self.paper.doi = elem_to_text(doi) elif self.document: self.paper.doi = self.document['doi'] # Title title = self.soup.teiheader.find("title") if title: self.paper.title = elem_to_text(title) # Authors authors = self.get_authors(self.soup.analytic.find_all('author')) if authors: self.paper.authors = authors # Year published = self.soup.analytic.find("publicationstmt") if published: self.paper.year = elem_to_text(published.find("date", type="when")) # Organization / Affiliations affiliations = self.soup.analytic.find_all('affiliation') for affiliation in affiliations: org = Organization() org.type = "institution" org.name = elem_to_text( affiliation.find("orgname", type="institution")) address = Address() addr = affiliation.find("address") if addr: address.place = elem_to_text(addr.find("settlement")) address.region = elem_to_text(addr.find("region")) address.country = elem_to_text(addr.find("country")) org.address = address self.paper.affiliations.append(org) # University Ranking if self.paper.affiliations: if self.paper.affiliations[0] != '': self.paper.uni_rank = self.uni_rank.get_rank( self.paper.affiliations[0].name) elif len(self.paper.affiliations) > 1: self.paper.uni_rank = self.uni_rank.get_rank( self.paper.affiliations[1].name) else: self.paper.uni_rank = self.uni_rank.get_rank('Random') # Citations bibliography = self.soup.listbibl.find_all('biblstruct') for bibl in bibliography: citation = Citation() cited_paper = bibl.analytic if cited_paper: citation.title = elem_to_text( cited_paper.find("title", type="main")) citation_authors = self.get_authors( cited_paper.find_all("author")) citation.doi = elem_to_text( cited_paper.find("idno", type="DOI")) if citation_authors: citation.authors = citation_authors cited_journal = bibl.monogr if cited_journal: citation.source = elem_to_text(cited_journal.find("title")) try: citation.publish_year = cited_journal.imprint.date['when'] except TypeError: pass self.paper.citations.append(citation) # NER - Ack pairs - Funding status self.paper.ack_pairs = self.get_funding_status() er_list = [org for (entity, org) in self.paper.ack_pairs] if 'ORG' in er_list: self.paper.funded = 1 else: self.paper.funded = 0 # SJR api_resp = self.get_sjr(self.paper.doi, self.paper.title) if api_resp: self.paper.cited_by_count = api_resp["num_citations"] self.paper.sjr = api_resp["sjr"] self.paper.subject = api_resp["subject"] self.paper.subject_code = api_resp["subject_code"] self.paper.normalized = api_resp["normalized_citations"] self.paper.velocity = api_resp["citationVelocity"] self.paper.influentialcitations = api_resp[ "influentialCitationCount"] self.paper.references = api_resp["references_count"] self.paper.flag = api_resp["openaccessflag"] self.paper.influentialref = api_resp["influentialReferencesCount"] self.paper.ref_background = api_resp["reference_background"] self.paper.ref_result = api_resp["reference_result"] self.paper.ref_method = api_resp["reference_methodology"] self.paper.cite_background = api_resp["citations_background"] self.paper.cite_result = api_resp["citations_result"] self.paper.cite_method = api_resp["citations_methodology"] self.paper.cite_next = api_resp["citations_next"] # Set self-citations self.paper.self_citations = self.paper.set_self_citations() # Set influential_methodology_references self.paper.influential_references_methodology = self.set_influential_references_methodology( ) # return paper t2, t3 = coCite(self.paper.doi) return { "doi": self.paper.doi, "title": self.paper.title, "num_citations": self.paper.cited_by_count, "author_count": len(self.paper.authors), "sjr": self.paper.sjr, "u_rank": self.paper.uni_rank, "funded": self.paper.funded, "self_citations": self.paper.self_citations, "subject": self.paper.subject, "subject_code": self.paper.subject_code, "citationVelocity": self.paper.velocity, "influentialCitationCount": self.paper.influentialcitations, "references_count": self.paper.references, "openaccessflag": self.paper.flag, "influentialReferencesCount": self.paper.influentialref, "normalized_citations": self.paper.normalized, "reference_background": self.paper.ref_background, "reference_result": self.paper.ref_result, "reference_methodology": self.paper.ref_method, "citations_background": self.paper.cite_background, "citations_result": self.paper.cite_result, "citations_methodology": self.paper.cite_method, "citations_next": self.paper.cite_next, "upstream_influential_methodology_count": self.paper.influential_references_methodology, "coCite2": t2, "coCite3": t3 } @staticmethod def get_authors(authors): authors_list = [] for author in authors: person = Author() pers_name = author.persname if not pers_name: continue person.first_name = elem_to_text( pers_name.find("forename", type="first")) person.middle_name = elem_to_text( pers_name.find("forename", type="middle")) person.surname = elem_to_text(pers_name.surname) person.set_name() if not any(auth.name == person.name for auth in authors_list): authors_list.append(person) return authors_list def get_funding_status(self): pairs = NER(XML2ack(self.file)) return pairs @staticmethod def get_sjr(doi, title): api = getapi(doi, title) if api.empty: return None else: try: cited_by = api['num_citations'][0] except KeyError: cited_by = 0 try: normalized = api['normalized_citations'][0] except: normalized = 0.0 try: velocity = api['citationVelocity'][0] except: velocity = 0 try: influentialcitations = api['influentialCitationCount'][0] except: influentialcitations = 0 try: references = api['references_count'][0] except: references = 0 try: sjr_score = api['SJR'][0] except KeyError: sjr_score = 0 try: subject = api['subject'][0] except: subject = 0 try: subject_code = api['subject_code'][0] except: subject_code = 900 try: flag = api['openaccessflag'][0] except: flag = 0 try: influentialref = api['influentialReferencesCount'][0] except: influentialref = 0 try: ref_background = api['reference_background'][0] except: ref_background = 0 try: ref_result = api['reference_result'][0] except: ref_result = 0 try: ref_method = api['reference_methodology'][0] except: ref_method = 0 try: cite_background = api['citations_background'][0] except: cite_background = 0 try: cite_result = api['citations_result'][0] except: cite_result = 0 try: cite_method = api['citations_methodology'][0] except: cite_method = 0 try: cite_next = api['citation_next'][0] except: cite_next = 0 return { "sjr": sjr_score, "num_citations": cited_by, "subject": subject, "subject_code": subject_code, "normalized_citations": normalized, "citationVelocity": velocity, "influentialCitationCount": influentialcitations, "references_count": references, "openaccessflag": flag, "influentialReferencesCount": influentialref, "reference_background": ref_background, "reference_result": ref_result, "reference_methodology": ref_method, "citations_background": cite_background, "citations_result": cite_result, "citations_methodology": cite_method, "citations_next": cite_next } def set_influential_references_methodology(self): # Counts the number of influential references in the paper in the context of methodology count = 0 if self.paper.doi: url = 'https://partner.semanticscholar.org/v1/paper/{0}'.format( self.paper.doi) headers = {'x-api-key': 'I6SO5Ckndk67RitJNJOFR4d7jDiVpWOgaMFUhgkM'} response_payload = requests.get(url, headers=headers).json() try: references = response_payload['references'] for reference in references: try: if 'methodology' in reference['intent'] and reference[ 'isInfluential']: count += 1 except KeyError: continue except KeyError: pass return count