def handle_starttag(self, tag, attrs):
    """
    React to a start tag while parsing a Google Scholar alert email.

    - An ``h3`` seen while looking for a title link starts a new pub and
      pub alert, which is appended to ``self.pub_alerts``.
    - An ``a`` seen inside that title link supplies the pub URL, taken
      from the ``q=`` or ``url=`` query argument when one is present.
    - A ``font`` or ``div`` seen when text from the pub is expected next
      starts collecting that text.

    ``attrs`` is the list of (name, value) pairs for the tag; the first
    pair of the title anchor is taken to hold the link.
    """
    if tag == "h3":
        if self._state == GSEmailAlert.STATE_LOOKING_FOR_TITLE_LINK:
            # The link to the paper is shown inside the h3.
            self._state = GSEmailAlert.STATE_IN_TITLE_LINK
            self._current_pub = publication.Pub()
            alert = pub_alert.PubAlert(self._current_pub, self)
            self._current_pub_alert = alert
            self.pub_alerts.append(alert)
    elif tag == "a":
        if self._state == GSEmailAlert.STATE_IN_TITLE_LINK:
            full_url = attrs[0][1]
            # Everything after the first "?"; the whole URL if there is none.
            _, question_mark, query = full_url.partition("?")
            if not question_mark:
                query = full_url
            for arg in query.split("&"):
                if arg.startswith("q="):
                    encoded = arg[2:]
                elif arg.startswith("url="):
                    encoded = arg[4:]
                else:
                    continue
                # Strip the URL encoding from the embedded target.
                self._current_pub.url = urllib.parse.unquote(encoded)
                break
            if not self._current_pub.url:
                # Some URLs link directly to Google Scholar.
                self._current_pub.url = full_url
            self._state = GSEmailAlert.STATE_IN_TITLE_TEXT
    elif tag in ("font", "div"):
        if self._state == GSEmailAlert.STATE_TEXT_FROM_PUB_NEXT:
            self._state = GSEmailAlert.STATE_IN_TEXT_FROM_PUB
            self._current_pub_alert.text_from_pub = ""
    return None
def handle_starttag(self, tag, attrs):
    """
    Advance the Wiley citation alert state machine on a start tag.

    Transitions, keyed by tag:
      h5     -> in the search section (only one h5; it wraps the pub
                being cited)
      p      -> while in the pub list: start a new pub and await its
                author or title
      span   -> while awaiting author or title: an author has started
      em     -> while in the title section: taken to be the journal
      strong -> while in the title section: the volume
      hr     -> while in the pub list: parsing is done
    """
    if tag == "h5":
        self._state = WileyEmailCitationAlert.STATE_IN_SEARCH
    elif tag == "p":
        if self._state == WileyEmailCitationAlert.STATE_IN_PUB_LIST:
            self._state = (
                WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE)
            self._current_pub = publication.Pub()
            new_alert = pub_alert.PubAlert(self._current_pub, self)
            self.pub_alerts.append(new_alert)
    elif tag == "span":
        if (self._state
                == WileyEmailCitationAlert.STATE_AWAITING_AUTHOR_OR_TITLE):
            # Just entered an author.
            self._state = WileyEmailCitationAlert.STATE_IN_AUTHOR
    elif tag == "em":
        if self._state == WileyEmailCitationAlert.STATE_IN_TITLE_SECTION:
            # em here is assumed to mean the journal.
            self._state = WileyEmailCitationAlert.STATE_IN_JOURNAL
    elif tag == "strong":
        if self._state == WileyEmailCitationAlert.STATE_IN_TITLE_SECTION:
            self._state = WileyEmailCitationAlert.STATE_IN_VOLUME
    elif tag == "hr":
        if self._state == WileyEmailCitationAlert.STATE_IN_PUB_LIST:
            self._state = WileyEmailCitationAlert.STATE_DONE
    return None
def handle_starttag(self, tag, attrs):
    """
    Update parser flags, and start new pubs, as start tags are seen.

    - ``b`` while in the search section: the search text follows.
    - ``a`` whose second attribute is ``ref`` and whose first attribute
      value does not contain ``linkname=pubmed_pubmed``: starts a new pub
      and pub alert; the first attribute value becomes the pub URL and
      the title follows.
    - ``td`` while expecting authors: two-step hand-off; the first td
      arms ``_really_expecting_authors``, the next one enters the authors.
    - ``span`` whose first attribute is ``class="jrnl"``: the second
      attribute value is stored as the pub's ref.

    ``attrs`` is the list of (name, value) pairs for the tag. It may be
    empty; a span without attributes is ignored rather than raising
    IndexError.
    """
    if self._in_search and tag == "b":
        self._in_search_text = True
        self._in_search = False
    elif (tag == "a" and len(attrs) > 1 and attrs[1][0] == "ref"
          and "linkname=pubmed_pubmed" not in attrs[0][1]):
        self._current_pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(
            self._current_pub, self)
        self.pub_alerts.append(self._current_pub_alert)
        self._current_pub.url = attrs[0][1]
        self._in_title = True
    elif tag == "td" and self._expecting_authors:
        # This case actually handled by handle_startendtag
        self._expecting_authors = False
        self._really_expecting_authors = True
    elif tag == "td" and self._really_expecting_authors:
        self._really_expecting_authors = False
        self._in_authors = True
    elif (tag == "span" and attrs
          and attrs[0][0] == "class" and attrs[0][1] == "jrnl"):
        # Title tag has better jrnl name than display
        self._current_pub.ref = attrs[1][1]
        self._in_ref = True
    return None
def handle_data(self, data):
    """
    Process one chunk of text from a Web of Science alert (2018 and
    before format).

    The stripped text is either a marker that changes what is expected
    next ("Record m of n.", "Title:", "Authors:", "Source:", or a
    cited-article / alert-query preface), or a value that is stored on
    the current pub or in ``self.search``, depending on which flag is
    currently set.
    """
    data = data.strip()
    if WoSEmailAlert2018AndBefore.paper_start_re.match(data):
        # Each paper starts with: "Record m of n. "
        self._current_pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(
            self._current_pub, self)
        self.pub_alerts.append(self._current_pub_alert)
    elif data == "Title:":
        self._in_title = True
    elif data == "Authors:":
        self._in_authors = True
    elif (WoSEmailAlert2018AndBefore.cited_article_re.match(data)
          or WoSEmailAlert2018AndBefore.alert_query_re.match(data)):
        self._in_query = True
    elif data == "Source:":
        self._in_ref = True
        self._current_pub.ref = ""
    elif self._in_title_value:
        # Title text can arrive in pieces; join pieces with one space.
        title_so_far = self._current_pub.title
        if len(title_so_far) > 0:
            data = title_so_far + " " + data
        self._current_pub.set_title(data)
    elif self._in_authors:
        # WOS Author lists look like:
        #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
        # Everything before the first comma is the first author's name.
        first_author = data.split(",")[0]
        self._current_pub.set_authors(
            data, publication.to_canonical(first_author))
        self._in_authors = False
    elif self._in_query_value:
        # "]]>" must be stripped from anywhere in the text: a WOS bug
        # when the title contains punctuation.
        cleaned = data.replace("]]>", "")
        self.search += cleaned
        self._in_query_value = False
    elif self._in_ref:
        self._current_pub.ref += data + " "
    return None
def handle_starttag(self, tag, attrs):
    """
    Update parser flags, and start new pubs, for a Wiley alert email.

    - ``html`` turns parsing on.
    - ``strong`` while a search is coming: the search text follows (2018).
    - ``a`` with more than two attributes whose third attribute value is
      ``http://journalshelp.wiley.com``: done looking at input.
    - ``a`` while awaiting a title: starts a new pub; its ``href`` becomes
      the pub URL, and a DOI is extracted when the URL path starts with
      ``doi``.
    - ``span`` while awaiting a journal: the journal text follows.

    ``attrs`` is the list of (name, value) pairs for the tag. A title
    anchor with no (or an empty) ``href`` is ignored and the parser keeps
    waiting for the title; previously this raised UnboundLocalError or
    IndexError after a pub had already been created.
    """
    if tag == "html":
        self._parsing = True
    elif self._search_coming and tag == "strong":  # 2018
        self._search_coming = False
        self._in_search = True
    elif (self._parsing and tag == "a" and len(attrs) > 2
          and attrs[2][1] == "http://journalshelp.wiley.com"):
        self._parsing = False  # Done looking at input.
        self._awaiting_title = False
    elif self._parsing and self._awaiting_title and tag == "a":
        # URL looks like
        #   http://onlinelibrary.wiley.com/doi/10.1002/spe.2320/abstract?
        #     campaign=wolsavedsearch
        #   http://onlinelibrary.wiley.com/doi/10.1002/cpe.3533/abstract
        # Resolve the href before touching any state, so an anchor
        # without one leaves the parser untouched.
        base_url = next(
            (value for name, value in attrs if name == "href"), None)
        if base_url:
            self._awaiting_title = False
            self._in_title = True
            self._current_pub = publication.Pub()
            self.pub_alerts.append(
                pub_alert.PubAlert(self._current_pub, self))
            if base_url[0:4] != "http":
                # Wiley sometimes forgets leading http://
                base_url = "http://" + base_url
            self._current_pub.url = base_url
            # scheme, "", host, first path segment, ...
            url_parts = base_url.split("/")
            if len(url_parts) > 3 and url_parts[3] == "doi":
                doi_bits = "/".join(url_parts[4:6])
                self._current_pub.canonical_doi = (
                    publication.to_canonical_doi(doi_bits))
    elif self._awaiting_journal and tag == "span":
        self._in_journal = True
        self._awaiting_journal = False
        self._current_pub.ref = ""
    return None
def handle_starttag(self, tag, attrs):
    """
    Advance the Wiley alert state machine on a start tag.

    - ``html`` starts parsing.
    - ``a`` with more than two attributes whose third attribute value is
      ``http://journalshelp.wiley.com`` (in any state but done): done
      looking at input.
    - ``a`` while awaiting a title: starts a new pub; its ``href`` becomes
      the pub URL, and a DOI is extracted when the URL path starts with
      ``doi``.
    - ``span`` while awaiting a journal: the journal text follows.

    ``attrs`` is the list of (name, value) pairs for the tag. A title
    anchor with no (or an empty) ``href`` is ignored and the parser keeps
    waiting for the title; previously this raised UnboundLocalError or
    IndexError after a pub had already been created.
    """
    if tag == "html":
        self._state = WileyEmailAlert.STATE_PARSING_STARTED
    elif (self._state != WileyEmailAlert.STATE_DONE
          and tag == "a" and len(attrs) > 2
          and attrs[2][1] == "http://journalshelp.wiley.com"):
        self._state = WileyEmailAlert.STATE_DONE  # Done looking at input.
    elif (self._state == WileyEmailAlert.STATE_AWAITING_TITLE
          and tag == "a"):
        # URL looks like
        #   http://el.wiley.com/wf/click?upn=-2F4d0Y8aR13lVHu481a...
        # which in a browser redirects to
        #   https://onlinelibrary.wiley.com/doi/10.15252/embr.201847227
        # From this program (even via CURL) it redirects to
        #   https://onlinelibrary.wiley.com/action/cookieAbsent
        # instead, so the link is stored as given and not resolved.
        #
        # Resolve the href before touching any state, so an anchor
        # without one leaves the parser untouched.
        base_url = next(
            (value for name, value in attrs if name == "href"), None)
        if base_url:
            self._state = WileyEmailAlert.STATE_IN_TITLE
            self._current_pub = publication.Pub()
            self.pub_alerts.append(
                pub_alert.PubAlert(self._current_pub, self))
            self._current_pub.url = base_url
            # scheme, "", host, first path segment, ...
            url_parts = base_url.split("/")
            if len(url_parts) > 3 and url_parts[3] == "doi":
                doi_bits = "/".join(url_parts[4:6])
                self._current_pub.canonical_doi = (
                    publication.to_canonical_doi(doi_bits))
    elif (self._state == WileyEmailAlert.STATE_AWAITING_JOURNAL
          and tag == "span"):
        self._state = WileyEmailAlert.STATE_IN_JOURNAL
        self._current_pub.ref = ""
    return None
def handle_starttag(self, tag, attrs):
    """
    Advance the ScienceDirect alert state machine on a start tag.

    - ``h2``: a citing pub has started; a new pub alert with empty
      authors is created and appended to ``self.pub_alerts``.
    - ``a`` inside that h2: the first attribute value is the link to the
      citing pub; the pub URL is derived from it and the title is reset
      to empty.
    - ``span`` whose first attribute value is ``color:#848484`` while the
      citing journal is expected: the journal text follows.

    ``attrs`` is the list of (name, value) pairs for the tag. A span
    without attributes is ignored rather than raising IndexError.
    """
    if tag == "h2":
        # citing pub has started
        pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(pub, self)
        self._current_pub_alert.pub.set_authors("", "")
        self.pub_alerts.append(self._current_pub_alert)
        self._state = SDEmailAlert.STATE_IN_H2
    elif tag == "a" and self._state == SDEmailAlert.STATE_IN_H2:
        # First "a" inside H2 is link to citing pub at SD
        full_url = urllib.parse.unquote(attrs[0][1])
        # Current email links look like either
        #   https://cwhib9vv.r.us-east-1.awstrack.me/L0/
        #     https:%2F%2Fwww.sciencedirect.com%2Fscience%2F
        #     article%2Fpii%2FB9780128156094000108
        #     %3Fdgcid=raven_sd_search_email/1/
        #     01000164f4ef81a4-8297928b-681a-463a-86c6-30f8eaf2bd7e-
        #     000000/_ewE29jTmNGAovSLl4HHgzWfTRQ=68
        # or
        #   https://www.sciencedirect.com/science/article/pii/
        #     S0262407919306967
        #
        # We want the second HTTPS up to the first number after pii.
        # Proxy links won't work with the full redirect URL.
        try:
            minus_redirect = "https" + full_url.split("https")[2]
            pii_num_only = minus_redirect.split("/")[6]
            self._current_pub_alert.pub.url = gen_pub_url(pii_num_only)
        except IndexError:
            # Not a redirect link (or an unexpected shape): keep as is.
            self._current_pub_alert.pub.url = full_url
        self._current_pub_alert.pub.set_title("")
        self._state = SDEmailAlert.STATE_IN_CITING_PUB_TITLE
    elif (tag == "span"
          and self._state == SDEmailAlert.STATE_EXPECTING_CITING_JOURNAL
          and attrs and attrs[0][1] == "color:#848484"):
        self._state = SDEmailAlert.STATE_IN_CITING_JOURNAL
    return None
def handle_starttag(self, tag, attrs):
    """
    Update parser flags, and start new pubs, for a ScienceDirect email.

    - ``td`` whose first attribute is ``class="txtcontent"``: a paper has
      started; a new pub alert is created and the title link is expected.
    - ``a`` while the title link is expected: the pub URL is generated
      from the ``_piikey=`` argument of the first attribute value, if
      there is one.
    - ``span`` whose first attribute is ``class="artTitle"``: title text
      starts, at span depth 1.
    - any other ``span`` while in title text: span depth goes up by one.
    - ``i`` after the title and before the ref: the ref starts.
    - ``span`` whose first attribute is ``class="authorTxt"``: authors
      start.

    ``attrs`` is the list of (name, value) pairs for the tag. It may be
    empty: attribute-less spans nested in the title are counted (they
    previously raised IndexError), and all other attribute-less tags are
    ignored.
    """
    # Value of the class attribute, but only when it is the first
    # attribute; that is the only position this parser looks at.
    first_class = None
    if attrs and attrs[0][0] == "class":
        first_class = attrs[0][1]

    if tag == "td" and first_class == "txtcontent":
        # Paper has started; next tag is an anchor, and it has paper URL.
        #
        # We now have a long URL that points to a public HTML version of
        # the paper. We don't have a doi. But we will have a title
        # shortly. ScienceDirect has an API we could use to extract the
        # DOI, or we could pull it from the HTML page.
        #
        # TODO: For now, go with title only match
        self._in_title_link = True
        pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(pub, self)
        self.pub_alerts.append(self._current_pub_alert)
    elif tag == "a" and self._in_title_link:
        full_url = attrs[0][1] if attrs else None
        if full_url:
            for url_arg in full_url.split("&"):
                if url_arg.startswith("_piikey="):
                    self._current_pub_alert.pub.url = gen_pub_url(
                        url_arg[len("_piikey="):])
                    break
        self._in_title_link = False
    elif tag == "span" and first_class == "artTitle":
        self._in_title_text = True
        self._in_title_text_span_depth = 1
    elif self._in_title_text and tag == "span":
        # Nested span inside the title; tracked by depth.
        self._in_title_text_span_depth += 1
    elif tag == "i" and self._after_title_before_ref:
        self._in_ref = True
        self._after_title_before_ref = False
    elif tag == "span" and first_class == "authorTxt":
        self._in_authors = True
    return None
def handle_starttag(self, tag, attrs):
    """
    Advance the ScienceDirect (2018 to 2019 format) state machine.

    The search is wrapped in an H1:

      <h1 style="...">
        Showing top results for search alert:<br/>GalaxyProject.org
      </h1>

    There are other H1's, so the data text has to be matched as well.

    Everything of interest about a matched pub is in a TD that starts
    with an H2. There are many TD's, but only paper alerts have H2's:

      <td align="left" valign="top">
        <h2 style="...">
          <a href="https://www.sciencedirect.com/science/article/pii/
                   S0025619618304026?dgcid=raven_sd_search_email"
             style="...">
            C3 Glomerulopathy: Ten Years' Experience at Mayo Clinic
          </a>
        </h2>
        <p align="left" style="...">
          <span style="font-style:italic"> </span>Research article
        </p>
        <p align="left" style="...">
          <span style="color:#848484">
            <span>Mayo Clinic Proceedings, Volume 93, Issue 8,
                  Pages 991-1008, </span>
          </span>
        </p>
        <p align="left" style="...">
          Aishwarya Ravindran, Fernando C. Fervenza, ... Sanjeev Sethi
        </p>
      </td>

    Nothing happens once the parser is in the done state.
    """
    if self._state == SDEmailAlert2018To2019.STATE_DONE:
        return None

    if tag == "td":
        self._in_td_depth += 1
    elif tag == "h1":
        self._state = SDEmailAlert2018To2019.STATE_IN_H1
    elif tag == "h2":
        if self._in_td_depth:
            # The H2 is the first element in the TD, and everything in
            # this TD is about the publication: the paper has started.
            self._state = SDEmailAlert2018To2019.STATE_IN_PUB_TITLE
            new_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(new_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
    elif tag == "a":
        if self._state == SDEmailAlert2018To2019.STATE_IN_PUB_TITLE:
            # The pub title is the content of the a tag, and the pub URL
            # is where the a tag points to.
            full_url = urllib.parse.unquote(attrs[0][1])
            # Current email links look like either
            #   https://cwhib9vv.r.us-east-1.awstrack.me/L0/
            #     https:%2F%2Fwww.sciencedirect.com%2Fscience
            #     %2Farticle%2Fpii%2FB9780128156094000108
            #     %3Fdgcid=raven_sd_search_email/1/
            #     01000164f4ef81a4-8297928b-681a-463a-86c6-30f8eaf2bd7e-
            #     000000/_ewE29jTmNGAovSLl4HHgzWfTRQ=68
            # or
            #   https://www.sciencedirect.com/science/article/pii/
            #     S0262407919306967?dgcid=raven_sd_search_email
            #
            # We want the middle part, the second HTTPS, without its
            # query string. Proxy links won't work with the full
            # redirect URL.
            target_pub = self._current_pub_alert.pub
            try:
                embedded = "https" + full_url.split("https")[2]
                target_pub.url = embedded.split("?")[0]
            except IndexError:
                # No second "https": the link is not a redirect.
                target_pub.url = full_url
            self._current_pub_alert.pub.title = ""
    elif tag == "p":
        # Successive paragraphs after the title hold, in turn, the pub
        # type, the ref and the authors.
        if self._state == SDEmailAlert2018To2019.STATE_EXPECTING_PUB_TYPE:
            self._state = SDEmailAlert2018To2019.STATE_EXPECTING_REF
        elif self._state == SDEmailAlert2018To2019.STATE_EXPECTING_REF:
            self._state = SDEmailAlert2018To2019.STATE_IN_REF
        elif self._state == SDEmailAlert2018To2019.STATE_EXPECTING_AUTHORS:
            self._state = SDEmailAlert2018To2019.STATE_IN_AUTHORS
    return None
def handle_data(self, data):
    """
    Feed one chunk of text through the Web of Science alert state machine.

    Whitespace is normalised first and empty chunks are skipped. What
    happens to the remaining text depends only on the current state: it
    may be matched against a marker pattern, added to ``self.search``, or
    stored as the title, authors, journal or excerpt of the current pub.

    If the text "Terms of Use" arrives in a state not handled above and
    the parser is not done, an error is printed to stderr and the program
    exits with status 1.
    """
    # Drop leading and trailing whitespace, collapse runs in between.
    data = re.sub(r'\s+', ' ', data).strip()
    if not data:
        return None     # nothing to see here folks.

    if self._state == WoSEmailAlert.State.AWAITING_CONTENT:
        if WoSEmailAlert.greetings_re.match(data):
            self._state = WoSEmailAlert.State.STARTING_CONTENT

    elif self._state == WoSEmailAlert.State.STARTING_CONTENT:
        if WoSEmailAlert.saved_search_type_next_re.match(data):
            self._state = WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT
        elif WoSEmailAlert.is_citation_report_re.match(data):
            self._state = WoSEmailAlert.State.CITED_PUB_NEXT

    # Search alert states

    elif self._state == WoSEmailAlert.State.SAVED_SEARCH_TYPE_NEXT:
        self.search += data + " "
        self._state = (
            WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT)

    elif (self._state
          == WoSEmailAlert.State.SAVED_SEARCH_STRING_AND_COUNT_NEXT):
        # form: "(Title or search) has 0 new records as of Mon XXth YYYY."
        # Only the title or search string is wanted. The pattern matches
        # up to "has" in case the title has parens in it.
        parsed = WoSEmailAlert.saved_search_string_re.match(data)
        self.search += parsed.group(1)
        no_new_records = parsed.group(2) == "0"
        if no_new_records:
            self._state = WoSEmailAlert.State.DONE
        else:
            self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    # Citation alert states

    elif self._state == WoSEmailAlert.State.CITED_PUB_NEXT:
        self.search += data
        self._state = WoSEmailAlert.State.CITATION_COUNT_NEXT

    elif self._state == WoSEmailAlert.State.CITATION_COUNT_NEXT:
        # The count itself is ignored.
        self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_NEXT:
        counters = WoSEmailAlert.citing_pubs_over_re.match(data)
        if not counters:
            # Anything else here is a title: create a new pub alert.
            self._current_pub = publication.Pub()
            self._current_pub_alert = pub_alert.PubAlert(
                self._current_pub, self)
            self.pub_alerts.append(self._current_pub_alert)
            self.found_pub_count += 1
            self._current_pub.set_title(data)
            self._state = WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT
        elif int(counters.group(1)) == self.found_pub_count:
            self._state = WoSEmailAlert.State.DONE

    elif self._state == WoSEmailAlert.State.CITING_PUB_AUTHORS_NEXT:
        # WoS author list looks like:
        #   Halbritter, Dale A.; Storer, Caroline G.; Kawahara, Akito Y.
        # Everything before the first comma is the first author's name.
        first_author = data.split(",")[0]
        self._current_pub.set_authors(
            data, publication.to_canonical(first_author))
        self._state = WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_JOURNAL_NEXT:
        self._current_pub.ref = data
        self._state = WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT

    elif self._state == WoSEmailAlert.State.CITING_PUB_EXCERPT_NEXT:
        self._current_pub_alert.text_from_pub = data
        self._state = WoSEmailAlert.State.CITING_PUB_NEXT

    elif (data == "Terms of Use"
          and self._state != WoSEmailAlert.State.DONE):
        print(
            "ERROR: WoS email parsing did not recognize email.",
            file=sys.stderr)
        sys.exit(1)

    return None
def handle_data(self, data):
    """
    Process one chunk of text from a Web of Science alert (2018-08 to
    2019-11 format).

    Empty chunks are skipped. Otherwise the first flag that is set, in
    the order tested below, decides what the stripped text is: a marker
    that moves parsing along, or a value stored in ``self.search``,
    ``self.expected_pub_count`` or on the current pub.
    """
    data = data.strip()
    if not data:
        return None     # nothing to see here folks.

    if self._expecting_search:
        if WoSEmailAlert201808To201911.search_preface_re.match(data):
            self._expecting_search = False
            self._in_search_section = True

    elif self._in_search_text:
        self.search += data
        self._in_search_text = False
        self._expecting_count_section = True

    elif self._in_count_section:
        count_match = WoSEmailAlert201808To201911.count_re.match(data)
        self.expected_pub_count = int(count_match.group(2))
        self._in_count_section = False
        self._expecting_pub_section = True

    elif (self._expecting_pub
          and WoSEmailAlert201808To201911.paper_start_re.match(data)):
        # Each paper starts with: "Record m of n. "
        self._current_pub = publication.Pub()
        self._current_pub_alert = pub_alert.PubAlert(
            self._current_pub, self)
        self.pub_alerts.append(self._current_pub_alert)
        self.found_pub_count += 1
        self._expecting_pub = False
        self._expecting_title = True

    elif self._in_title:
        self._current_pub.set_title(data)
        self._in_title = False
        self._expecting_authors = True

    elif self._expecting_authors and data == "Authors:":
        self._expecting_authors = False
        self._in_authors = True

    elif self._in_authors:
        # WOS Author lists look like:
        #   Galia, W; Leriche, F; Cruveiller, S; Thevenot-Sergentet, D
        # Everything before the first comma is the first author's name.
        first_author = data.split(",")[0]
        self._current_pub.set_authors(
            data, publication.to_canonical(first_author))
        self._in_authors = False
        self._expecting_journal = True

    elif self._in_journal:
        self._current_pub.ref = data
        self._in_journal = False
        self._expecting_citation = True

    elif self._in_citation:
        # Citation details are appended to the journal name.
        self._current_pub.ref += ", " + data

    elif self._expecting_doi and data == "DOI:":
        self._expecting_doi = False
        self._in_doi_section = True

    elif self._in_doi:
        canonical_doi = publication.to_canonical_doi(data)
        self._current_pub.canonical_doi = canonical_doi
        self._in_doi = False
        self._expecting_pub = True

    return None