def pictureDownloaded(self, doi, entry_url, future):
    """Callback to handle the response of the futures downloading a picture"""

    if not self.parent.parsing:
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        response = future.result()
    except concurrent.futures._base.CancelledError:
        self.l.error("future cancelled for {}".format(entry_url))
        params = ("Empty", doi)
    except Exception as e:
        self.l.error("Exception raised in pictureDownloaded:\n{}".format(e))
        self.l.error(traceback.format_exc())
        params = ("Empty", doi)
    else:
        # If the picture was dled correctly
        if response.status_code == requests.codes.ok:
            try:
                # Save the picture on disk
                io = BytesIO(response.content)
                Image.open(io).convert('RGB').save(
                    self.DATA_PATH + functions.simpleChar(response.url),
                    format='JPEG')
                self.l.debug("Image ok")
            except Exception as e:
                self.l.error("An error occurred in pictureDownloaded:\n{}".
                             format(e))
                self.l.error(traceback.format_exc())
                params = ("Empty", doi)
            else:
                params = (functions.simpleChar(response.url), doi)
        else:
            self.l.debug("Bad return code: {} DOI: {}".
                         format(response.status_code, doi))
            params = ("Empty", doi)
    finally:
        query.prepare("UPDATE papers SET graphical_abstract=? WHERE doi=?")
        for value in params:
            query.addBindValue(value)
        self.new_entries_worker += 1
        query.exec_()
        self.count_futures_images += 1
def pictureDownloaded(self, doi, entry_url, future):
    """Callback to handle the response of the futures downloading a picture"""

    if not self.parent.parsing:
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        response = future.result()
    except concurrent.futures._base.CancelledError:
        self.l.error("future cancelled for {}".format(entry_url))
        self.parent.counter_images_failed += 1
        params = ("Empty", doi)
    except Exception as e:
        self.parent.counter_images_failed += 1
        self.l.error("pictureDownloaded: {}".format(e), exc_info=True)
        params = ("Empty", doi)
    else:
        # If the picture was dled correctly
        if response.status_code == requests.codes.ok:
            try:
                # Save the picture on disk
                io = BytesIO(response.content)
                Image.open(io).convert('RGB').save(
                    self.PATH + functions.simpleChar(response.url),
                    format='JPEG')
                self.l.debug("Image ok")
            except Exception as e:
                self.l.error(
                    "An error occurred in pictureDownloaded:\n{}".format(e),
                    exc_info=True)
                params = ("Empty", doi)
            else:
                params = (functions.simpleChar(response.url), doi)
        else:
            self.l.debug("Bad return code: {} DOI: {}".format(
                response.status_code, doi))
            params = ("Empty", doi)
    finally:
        query.prepare("UPDATE papers SET graphical_abstract=? WHERE doi=?")
        for value in params:
            query.addBindValue(value)
        self.new_entries_worker += 1
        query.exec_()
        self.counter_futures_images += 1
def forgeTopicSimple(title: str, abstract: str) -> str:
    """Forge topic_simple, a simplified version of the abstract,
    used for sqlite queries"""

    simple_title = fct.simpleChar(title)

    if abstract is not None:
        simple_abstract = fct.simpleChar(BS(abstract, "html.parser").text)
        topic_simple = " " + simple_abstract + " " + simple_title + " "
    else:
        topic_simple = " " + simple_title + " "

    return topic_simple
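# A minimal usage sketch for forgeTopicSimple (illustrative only): the input
# strings below are made up, and the exact output depends on fct.simpleChar,
# which is assumed here to lower-case the text and strip accents/markup
# characters. The surrounding spaces let SQLite LIKE queries match whole
# words, e.g. topic_simple LIKE "% catalysis %".
#
#     title = "Palladium-Catalyzed C-H Activation"
#     abstract = "<p>A new <b>catalysis</b> strategy is reported.</p>"
#     topic_simple = forgeTopicSimple(title, abstract)
#     # -> " a new catalysis strategy is reported palladium-catalyzed c-h activation "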
def getData(company, journal, entry, response=None):
    """Get the data. Starts from the data contained in the RSS page and,
    if necessary, parses the website for additional information"""

    url = refineUrl(company, journal, entry)

    # If the journal is edited by the RSC
    if company == 'RSC':
        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        soup = BS(entry.summary, "html.parser")

        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]['src']

        if response.status_code == requests.codes.ok:
            # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SS("h2", attrs={"class": "capsule__title fixpadv--m"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # Get the abstract (w/ html)
            strainer = SS("p", xmlns="http://www.rsc.org/schema/rscart38")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SS("meta", attrs={"name": "citation_author"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform
            # the search, even if the tree contains only the result
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag['content'] for tag in r]
                author = ", ".join(author)

    elif company == 'Wiley':
        title, date, author, abstract, graphical_abstract = parseWiley(
            entry, response)

    elif company == 'ACS':
        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        graphical_abstract = None

        soup = BS(entry.summary, "html.parser")
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]['src']

        # If the dl went wrong, print an error
        if response.status_code == requests.codes.ok:
            strainer = SS("p", attrs={"class": "articleBody_abstractText"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("h1", attrs={"class": "articleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Nature':
        title = entry.title
        date = entry.date

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        if entry.summary:
            abstract = BS(entry.summary, "html.parser")

            while abstract.find_all('p'):
                _ = abstract.p.extract()

            try:
                _ = abstract.img.extract()
            except AttributeError:
                pass

            abstract = abstract.renderContents().decode()

        if (response.status_code == requests.codes.ok or
                response.status_code == 401):
            strainer = SS("div", attrs={"class": "article__body serif cleared"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.div
            try:
                abstract = r.text
            except AttributeError:
                pass

            strainer = SS("figure")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"class": "figure__image"})
            if r:
                # Additional verification to correctly forge the URL
                graphical_abstract = "http:" + r[0]["src"]

    elif company == 'Science':
        title = entry.title
        date = entry.date

        graphical_abstract = None

        if entry.author:
            author = entry.author
        else:
            author = None

        abstract = entry.summary
        if not abstract:
            abstract = None

    elif company == 'PNAS':
        title = entry.title
        date = entry.prism_publicationdate

        graphical_abstract = None
        author = None
        abstract = None

        if response.status_code == requests.codes.ok:
            # Get the correct title, not the one in the RSS
            strainer = SS("h1", id="article-title-1")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SS("a", attrs={"class": "name-search"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available,
            # sometimes the article only contains an extract
            strainer = SS("div", attrs={"class": "section abstract"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == 'Elsevier':
        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split(
                    "<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BS(abstract, "html.parser")

            try:
                # First type of abstract formatting
                abstract = soup("simple-para")[0].renderContents().decode()
            except IndexError:
                try:
                    # Second type of abstract formatting
                    abstract = abstract.split("<br />")[3].lstrip()
                except IndexError:
                    abstract = None

            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

        # NOTE: javascript embedded, impossible
        # if response.status_code == requests.codes.ok:
            # url = response.url
            # print(response.url)
            # # Get the abstract
            # soup = BS(response.text)

            # Get the correct title, not the one in the RSS
            # r = soup.find_all("li", attrs={"class": "originalArticleName"})
            # print(r)
            # if r:
                # title = r[0].renderContents().decode()

    elif company == 'Thieme':
        title = entry.title
        date = arrow.get(entry.updated).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None
        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    # Reverse family name/first name
                    field = reversed(element['name'].split(', '))
                    name = " ".join(field)
                    author.append(name)
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            if entry.summary:
                abstract = entry.summary
        except AttributeError:
            pass

    elif company == 'Beilstein':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        abstract = None
        graphical_abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BS(entry.summary, "html.parser")
            r = soup.find_all("p")
            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This company can change the background of the GA through
                # the url. If nothing is done, the bg is black, so turn it
                # to white. Doesn't affect images with unchangeable bg
                graphical_abstract = r[0]['src'] + '&background=FFFFFF'

    elif company == 'Nature2':
        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        try:
            author = [dic['name'] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if (response.status_code == requests.codes.ok or
                response.status_code == 401):
            strainer = SS(
                "h1",
                attrs={"class": "tighten-line-height small-space-below"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SS("div", attrs={"id": "abstract-content"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SS("img")
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == 'PLOS':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        author = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        abstract = BS(entry.summary, "html.parser")

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == 'Springer':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "AuthorNames"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Springer_open':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None

        abstract = BS(entry.summary, "html.parser")

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "MediaObject"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]['src']

            strainer = SS("ul", attrs={"class": "u-listReset"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SS("h1", attrs={"class": "ArticleTitle"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == 'Taylor':
        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            author = []
            for element in entry.authors:
                author.append(element['name'])
            author = ", ".join(author)
        except AttributeError:
            author = None

        if response.status_code == requests.codes.ok:
            strainer = SS("div", attrs={"class": "col-md-2-3 "})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            r = soup.span
            if r is not None:
                # Remove all tags attributes
                for tag in r.findAll(True):
                    tag.attrs = None
                title = r.renderContents().decode()

            strainer = SS("div",
                          attrs={"class": "abstractSection abstractInFull"})
            soup = BS(response.text, "html.parser", parse_only=strainer)

            # Erase the title 'Abstract', useless
            if soup("p") and soup("p")[0].text == "Abstract":
                soup("p")[0].extract()

            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            r = soup.find_all("img")
            if r:
                base = "http://www.tandfonline.com{}"
                graphical_abstract = base.format(r[0]['src'])

    elif company == 'ChemArxiv':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            if entry.authors:
                author = []
                for element in entry.authors:
                    author.append(element['name'])
                author = ", ".join(author)
        except AttributeError:
            pass

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster, w/ no abstract.
            # Filter these entries if it becomes common
            pass

    elif company == 'ChemRxiv':
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format('YYYY-MM-DD')

        graphical_abstract = None
        author = None
        abstract = None

        try:
            abstract = entry.summary
        except AttributeError:
            # I once saw a conference poster, w/ no abstract.
            # Filter these entries if it becomes common
            pass

        if response.status_code == requests.codes.ok:
            strainer = SS("span", attrs={"class": "authors-holder"})
            soup = BS(response.text, "html.parser", parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "normal-link author"})
            if r:
                author = [tag.text.strip() for tag in r]
                author = ", ".join(author)

    else:
        return None

    if title is None:
        return None

    topic_simple = forgeTopicSimple(title, abstract)

    if abstract is None or abstract == '':
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == '':
        author = "Empty"
        author_simple = None
    else:
        # Clean author field
        author = author.replace('  ', ' ')
        author = author.replace(' ,', ',')
        author_simple = " " + fct.simpleChar(author) + " "

    return (title, date, author, abstract, graphical_abstract, url,
            topic_simple, author_simple)
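# A self-contained illustration (hypothetical HTML, not from a real journal
# page) of the SoupStrainer pattern used throughout getData(): parse_only
# restricts the tree that BeautifulSoup builds to the matching tags, which
# saves memory on large article pages.
#
#     from bs4 import BeautifulSoup, SoupStrainer
#
#     html = '<html><h1 class="articleTitle">Title</h1><p>Body</p></html>'
#     strainer = SoupStrainer("h1", attrs={"class": "articleTitle"})
#     soup = BeautifulSoup(html, "html.parser", parse_only=strainer)
#     print(soup.h1.text)  # -> "Title"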
def run(self):
    """Main function. Starts the real business"""

    self.l.debug("Entering worker")
    self.l.debug(self.url_feed)

    # Get the RSS page of the url provided
    try:
        self.feed = feedparser.parse(self.url_feed)
        self.l.debug("RSS page successfully dled")
    except OSError:
        self.l.error("Too many files open, could not start the thread !")
        return

    # Get the journal name
    try:
        journal = self.feed['feed']['title']
    except KeyError:
        self.l.critical("No title for the journal ! Aborting")
        self.l.critical(self.url_feed)
        return

    self.l.info("{0}: {1}".format(journal, len(self.feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the infos
    self.session_images = FuturesSession(max_workers=20)

    # Get the company and the journal_abb by scrolling the dictionary
    # containing all the data regarding the journals implemented in the
    # program. This dictionary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        return

    # Create a list for the journals for which a dl of the article
    # page is not required. All the data are in the rss page
    company_no_dl = ['science', 'elsevier', 'beilstein', 'plos']

    query = QtSql.QSqlQuery(self.bdd)

    self.bdd.transaction()

    # The feeds of these journals are complete
    # if journal in wiley + science + elsevier:
    if company in company_no_dl:

        self.count_futures_urls += len(self.feed.entries)

        for entry in self.feed.entries:

            # Get the DOI, a unique number for a publication
            doi = hosts.getDoi(company, journal, entry)
            url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug("Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                # For these journals, all the infos are in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_images += 1

                continue

            # New article, treat it
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry)
                except TypeError:
                    self.l.error("getData returned None for {}".
                                 format(journal))
                    self.count_futures_images += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.count_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug("Rejecting article {}, no author".
                                 format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to True
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                self.l.debug("Adding {0} to the database".format(doi))
                self.parent.counter += 1
                self.new_entries_worker += 1

                for value in params:
                    query.addBindValue(value)
                query.exec_()

                if graphical_abstract == "Empty" or os.path.exists(
                        self.DATA_PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.count_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                               'Connection': 'close',
                               'Referer': url}

                    future_image = self.session_images.get(
                        graphical_abstract, headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(
                        functools.partial(self.pictureDownloaded, doi, url))
                    self.list_futures.append(future_image)

    # The company requires downloading the article's web page
    else:

        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'close'}

        self.session_pages = FuturesSession(max_workers=20)

        for entry in self.feed.entries:

            doi = hosts.getDoi(company, journal, entry)

            if company == 'acs':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = "http://pubs.acs.org/doi/abs/10.1021/" + url
            elif company == 'npg':
                url = getattr(entry, 'feedburner_origlink',
                              entry.link).split('/')[-1]
                url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"
            else:
                url = getattr(entry, 'feedburner_origlink', entry.link)

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug("Inserting {0} in table debug".format(doi))

                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.count_futures_images += 1
                self.count_futures_urls += 1
                self.l.debug("Skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                dl_page, dl_image, data = hosts.updateData(company,
                                                           journal,
                                                           entry,
                                                           care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(functools.partial(
                        self.completeData, doi, company, journal,
                        journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.count_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(self.DATA_PATH + functions.simpleChar(
                            graphical_abstract)):
                        self.count_futures_images += 1
                    else:
                        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                                   'Connection': 'close',
                                   'Referer': url}

                        future_image = self.session_images.get(
                            graphical_abstract, headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(functools.partial(
                            self.pictureDownloaded, doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.count_futures_urls += 1
                    self.count_futures_images += 1

                continue

            # New article, treat it
            else:
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(functools.partial(
                    self.completeData, doi, company, journal, journal_abb,
                    entry))
                self.list_futures.append(future)

    # Check if the counters are full
    while ((self.count_futures_images + self.count_futures_urls) !=
            len(self.feed.entries) * 2 and self.parent.parsing):
        self.sleep(1)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            self.l.debug("db insertions/modifications: {}".
                         format(self.new_entries_worker))
            self.l.error("Problem when committing data for {}".
                         format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()

    self.l.debug("Exiting thread for {}".format(journal))
def completeData(self, doi, company, journal, journal_abb, entry, future):
    """Callback to handle the response of the futures trying to
    download the page of the articles"""

    self.l.debug("Page dled")
    self.count_futures_urls += 1

    if not self.parent.parsing:
        return

    try:
        response = future.result()
    except requests.exceptions.ReadTimeout:
        self.l.error("ReadTimeout for {}".format(journal))
        self.count_futures_images += 1
        return
    except requests.exceptions.ConnectionError:
        self.l.error("ConnectionError for {}".format(journal))
        self.count_futures_images += 1
        return
    except ConnectionResetError:
        self.l.error("ConnectionResetError for {}".format(journal))
        self.count_futures_images += 1
        return
    except socket.timeout:
        self.l.error("socket.timeout for {}".format(journal))
        self.count_futures_images += 1
        return
    except concurrent.futures._base.CancelledError:
        self.l.error("future cancelled for {}".format(journal))
        self.count_futures_images += 1
        return
    except Exception as e:
        self.l.error("Unknown exception {} for {}".format(e, journal))
        self.l.error(traceback.format_exc())
        self.count_futures_images += 1
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(company, journal, entry, response)
    except TypeError:
        self.l.error("getData returned None for {}".format(journal))
        self.count_futures_images += 1
        return
    except Exception as e:
        self.l.error("Unknown exception completeData {}".format(e))
        self.l.error(traceback.format_exc())
        self.count_futures_images += 1
        return

    # Rejecting the article if no authors
    if authors == "Empty":
        self.count_futures_images += 1
        self.parent.counter_rejected += 1
        self.l.debug("Rejecting article {}, no author".format(title))
        return

    # Check if the DOI is already in the db. Mandatory, because sometimes
    # updateData will tell the worker to dl the page before downloading
    # the picture
    if doi not in self.dico_doi:
        query.prepare("INSERT INTO papers (doi, title, date, journal, \
                      authors, abstract, graphical_abstract, url, new, \
                      topic_simple, author_simple) VALUES(?, ?, ?, ?, ?, \
                      ?, ?, ?, ?, ?, ?)")

        params = (doi, title, date, journal_abb, authors, abstract,
                  graphical_abstract, url, 1, topic_simple, author_simple)

        self.l.debug("Adding {0} to the database".format(doi))
        self.parent.counter += 1

        for value in params:
            query.addBindValue(value)
        query.exec_()

        self.new_entries_worker += 1

    # Don't try to dl the image if its url is 'Empty', or if the image
    # already exists
    if (graphical_abstract == "Empty" or
            os.path.exists(self.DATA_PATH +
                           functions.simpleChar(graphical_abstract))):
        self.count_futures_images += 1
        self.l.debug("Image already dled or Empty")

        # This block is executed when you delete the db, but not the
        # images. Allows to update the graphical_abstract in db accordingly
        if os.path.exists(self.DATA_PATH +
                          functions.simpleChar(graphical_abstract)):
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                          doi=?")
            params = (functions.simpleChar(graphical_abstract), doi)
            for value in params:
                query.addBindValue(value)
            query.exec_()
    else:
        self.l.debug("Page dled, adding future image")
        headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                   'Connection': 'close',
                   'Referer': url}
        future_image = self.session_images.get(graphical_abstract,
                                               headers=headers,
                                               timeout=self.TIMEOUT)
        future_image.add_done_callback(functools.partial(
            self.pictureDownloaded, doi, url))
        self.list_futures.append(future_image)
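# For reference, a self-contained sketch (hypothetical names, not part of the
# worker) of the callback wiring used by run()/completeData()/
# pictureDownloaded(): requests-futures returns a concurrent.futures.Future,
# and add_done_callback() passes only that future to the callback, so the
# extra context (doi, url, ...) is pre-bound with functools.partial and the
# future arrives as the last positional argument.
#
#     import functools
#     from requests_futures.sessions import FuturesSession
#
#     def page_downloaded(doi, url, future):
#         response = future.result()
#         print(doi, url, response.status_code)
#
#     session = FuturesSession(max_workers=2)
#     future = session.get("https://example.org", timeout=10)
#     future.add_done_callback(
#         functools.partial(page_downloaded, "10.1000/example-doi",
#                           "https://example.org"))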
def run(self):
    """Main function. Starts the real business"""

    self.l.debug("Entering worker")

    feed = self._getFeed(timeout=self.TIMEOUT)

    if feed is None:
        self.l.error("Exiting worker, problem w/ the feed")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # Get the journal name
    journal = feed['feed']['title']

    self.l.info("{}: {}".format(journal, len(feed.entries)))

    # Lists to check if the post is in the db, and if
    # it has all the info
    self.session_images = FuturesSession(
        max_workers=self.MAX_WORKERS, session=self.parent.browsing_session)

    # Get the company and the journal_abb by scrolling the dictionary
    # containing all the data regarding the journals implemented in the
    # program. This dictionary is built in gui.py, to avoid multiple calls
    # to hosts.getJournals
    # care_image determines if the Worker will try to dl the graphical
    # abstracts
    for key, tuple_data in self.dict_journals.items():
        if journal in tuple_data[0]:
            company = key
            index = tuple_data[0].index(journal)
            journal_abb = tuple_data[1][index]
            care_image = tuple_data[3][index]
            break

    try:
        self.dico_doi = self.listDoi(journal_abb)
    except UnboundLocalError:
        self.l.error("Journal not recognized ! Aborting")
        self.parent.list_failed_rss.append(self.url_feed)
        return

    # Create a list for the journals for which a dl of the article
    # page is not required. All the data are in the rss page
    company_no_dl = [
        'Science', 'Elsevier', 'Beilstein', 'PLOS', 'ChemArxiv', 'Wiley'
    ]

    query = QtSql.QSqlQuery(self.bdd)

    self.bdd.transaction()

    # The feeds of these journals are complete
    if company in company_no_dl:

        self.counter_futures_urls += len(feed.entries)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception as e:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception as e:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(entry.title):
                title = entry.title
                self.counter_futures_images += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                # Insert the crappy articles in a rescue database
                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    self.l.debug("Inserting {0} in table debug".format(doi))
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                else:
                    continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:
                self.l.debug("Trying to update {}".format(doi))

                # How to update the entry
                dl_page, dl_image, data = hosts.updateData(
                    company, journal, entry, care_image)

                # For these journals, all the infos are in the RSS.
                # Only care about the image
                if dl_image:
                    self.parent.counter_updates += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_images += 1

                continue

            # New article, treat it
            else:
                try:
                    title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
                        company, journal, entry)
                except Exception as e:
                    self.l.error(
                        "Problem with getData: {}".format(journal),
                        exc_info=True)
                    self.counter_futures_images += 1
                    self.parent.counter_articles_failed += 1
                    return

                # Rejecting article if no author
                if authors == "Empty":
                    self.counter_futures_images += 1
                    self.parent.counter_rejected += 1
                    self.l.debug(
                        "Rejecting article {}, no author".format(title))
                    continue

                query.prepare("INSERT INTO papers (doi, title, date, \
                              journal, authors, abstract, \
                              graphical_abstract, url, new, topic_simple, \
                              author_simple) \
                              VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

                # Set new to 1 and not to True
                params = (doi, title, date, journal_abb, authors, abstract,
                          graphical_abstract, url, 1, topic_simple,
                          author_simple)

                for value in params:
                    query.addBindValue(value)

                # Test that the query worked
                if not query.exec_():
                    self.l.error(
                        "SQL ERROR in run(): {}, company_no_dl".format(
                            query.lastError().text()))
                    self.parent.counter_articles_failed += 1
                    continue
                else:
                    self.l.debug("{} added to the database".format(doi))
                    self.new_entries_worker += 1
                    self.parent.counter_added += 1

                # If the article has no graphical abstract, or if it has
                # already been dled
                if graphical_abstract == "Empty" or os.path.exists(
                        self.PATH +
                        functions.simpleChar(graphical_abstract)):
                    self.counter_futures_images += 1

                    # This block is executed when you delete the db, but
                    # not the images. Allows to update the
                    # graphical_abstract in db accordingly
                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        query.prepare("UPDATE papers SET \
                                      graphical_abstract=? WHERE doi=?")
                        params = (functions.simpleChar(graphical_abstract),
                                  doi)
                        for value in params:
                            query.addBindValue(value)
                        query.exec_()
                else:
                    headers = {
                        'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                        'Connection': 'close',
                        'Referer': url
                    }

                    future_image = self.session_images.get(
                        graphical_abstract,
                        headers=headers,
                        timeout=self.TIMEOUT)
                    future_image.add_done_callback(
                        functools.partial(self.pictureDownloaded, doi, url))
                    self.list_futures.append(future_image)

    # The company requires downloading the article's web page
    else:

        headers = {
            'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
            'Connection': 'close'
        }

        self.session_pages = FuturesSession(
            max_workers=self.MAX_WORKERS,
            session=self.parent.browsing_session)

        for entry in feed.entries:

            # Get the DOI, a unique number for a publication
            try:
                doi = hosts.getDoi(company, journal, entry)
            except Exception as e:
                self.l.error("getDoi failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Try to refine the url
            try:
                url = hosts.refineUrl(company, journal, entry)
            except Exception as e:
                self.l.error("refineUrl failed for: {}".format(journal),
                             exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Make sure the entry has a title
            try:
                title = entry.title
            except AttributeError:
                self.l.error("No title for {}".format(doi), exc_info=True)
                self.counter_futures_urls += 1
                self.counter_futures_images += 1
                continue

            # Reject crappy entries: corrigendum, erratum, etc
            if hosts.reject(title):
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.parent.counter_rejected += 1
                self.l.debug("Rejecting {0}".format(doi))

                if self.parent.debug_mod and doi not in self.dico_doi:
                    query.prepare("INSERT INTO debug (doi, title, \
                                  journal, url) VALUES(?, ?, ?, ?)")
                    params = (doi, title, journal_abb, url)
                    for value in params:
                        query.addBindValue(value)
                    query.exec_()
                    self.l.debug("Inserting {0} in table debug".format(doi))

                continue

            # Article complete, skip it
            elif doi in self.dico_doi and self.dico_doi[doi]:
                self.counter_futures_images += 1
                self.counter_futures_urls += 1
                self.l.debug("Article complete, skipping {}".format(doi))
                continue

            # Article not complete, try to complete it
            elif doi in self.dico_doi and not self.dico_doi[doi]:

                url = hosts.refineUrl(company, journal, entry)

                dl_page, dl_image, data = hosts.updateData(
                    company, journal, entry, care_image)

                if dl_page:
                    self.parent.counter_updates += 1

                    future = self.session_pages.get(url,
                                                    timeout=self.TIMEOUT,
                                                    headers=headers)
                    future.add_done_callback(
                        functools.partial(self.completeData, doi, company,
                                          journal, journal_abb, entry))
                    self.list_futures.append(future)

                    # Continue just to be sure. If dl_page is True,
                    # dl_image is likely True too
                    continue

                elif dl_image:
                    self.parent.counter_updates += 1
                    self.counter_futures_urls += 1

                    graphical_abstract = data['graphical_abstract']

                    if os.path.exists(
                            self.PATH +
                            functions.simpleChar(graphical_abstract)):
                        self.counter_futures_images += 1
                    else:
                        headers = {
                            'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
                            'Connection': 'close',
                            'Referer': url
                        }

                        future_image = self.session_images.get(
                            graphical_abstract,
                            headers=headers,
                            timeout=self.TIMEOUT)
                        future_image.add_done_callback(
                            functools.partial(self.pictureDownloaded,
                                              doi, url))
                        self.list_futures.append(future_image)
                else:
                    self.counter_futures_urls += 1
                    self.counter_futures_images += 1

                continue

            # New article, treat it
            else:
                url = hosts.refineUrl(company, journal, entry)
                self.l.debug("Starting adding new entry")

                future = self.session_pages.get(url, timeout=self.TIMEOUT,
                                                headers=headers)
                future.add_done_callback(
                    functools.partial(self.completeData, doi, company,
                                      journal, journal_abb, entry))
                self.list_futures.append(future)

    # Check if the counters are full
    while ((self.counter_futures_images + self.counter_futures_urls) !=
            len(feed.entries) * 2 and self.parent.parsing):
        self.sleep(0.5)

    if self.parent.parsing:
        if not self.bdd.commit():
            self.l.error(self.bdd.lastError().text())
            self.l.debug("db insertions/modifications: {}".format(
                self.new_entries_worker))
            self.l.error(
                "Problem when committing data for {}".format(journal))

    # Free the memory, and clean the remaining futures
    try:
        self.session_pages.executor.shutdown()
    except AttributeError:
        self.l.error("No session_pages to shut down")
    self.session_images.executor.shutdown()

    self.l.debug("Exiting thread for {}".format(journal))
def completeData(self, doi, company, journal, journal_abb, entry, future):
    """Callback to handle the response of the futures trying to
    download the page of the articles"""

    self.l.debug("Page dled")
    self.counter_futures_urls += 1

    if not self.parent.parsing:
        return

    try:
        response = future.result()
    except (requests.exceptions.ReadTimeout,
            requests.exceptions.ConnectionError,
            ConnectionResetError,
            socket.timeout,
            concurrent.futures._base.CancelledError) as e:
        self.l.error("{} raised for {}. Handled".format(e, journal))
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return
    except Exception as e:
        self.l.error("Unknown exception {} for {}".format(e, journal),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return

    query = QtSql.QSqlQuery(self.bdd)

    try:
        title, date, authors, abstract, graphical_abstract, url, topic_simple, author_simple = hosts.getData(
            company, journal, entry, response)
    except TypeError:
        self.l.error("getData returned None for {}".format(journal),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return
    except Exception as e:
        self.l.error("Unknown exception completeData {}".format(e),
                     exc_info=True)
        self.counter_futures_images += 1
        self.parent.counter_articles_failed += 1
        return

    # Rejecting the article if no authors
    if authors == "Empty":
        self.counter_futures_images += 1
        self.parent.counter_rejected += 1
        self.l.debug("Rejecting article {}, no author".format(title))
        return

    # Check if the DOI is already in the db. Mandatory, because sometimes
    # updateData will tell the worker to dl the page before downloading
    # the picture
    if doi not in self.dico_doi:
        query.prepare("INSERT INTO papers (doi, title, date, journal, \
                      authors, abstract, graphical_abstract, url, new, \
                      topic_simple, author_simple) VALUES(?, \
                      ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")

        params = (doi, title, date, journal_abb, authors, abstract,
                  graphical_abstract, url, 1, topic_simple, author_simple)

        self.l.debug("Adding {} to the database".format(doi))
        self.parent.counter_added += 1

        for value in params:
            query.addBindValue(value)

        # Test that the query worked
        if not query.exec_():
            self.l.error("SQL ERROR in completeData(): {}".format(
                query.lastError().text()))
            self.parent.counter_articles_failed += 1
            return
        else:
            self.new_entries_worker += 1

    # Don't try to dl the image if its url is 'Empty', or if the image
    # already exists
    if (graphical_abstract == "Empty" or
            os.path.exists(self.PATH +
                           functions.simpleChar(graphical_abstract))):
        self.counter_futures_images += 1
        self.l.debug("Image already dled or Empty")

        # This block is executed when you delete the db, but not the
        # images. Allows to update the graphical_abstract in db accordingly
        if os.path.exists(self.PATH +
                          functions.simpleChar(graphical_abstract)):
            query.prepare("UPDATE papers SET graphical_abstract=? WHERE \
                          doi=?")
            params = (functions.simpleChar(graphical_abstract), doi)
            for value in params:
                query.addBindValue(value)
            query.exec_()
    else:
        self.l.debug("Page dled, adding future image")
        headers = {
            'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0',
            'Connection': 'close',
            'Referer': url
        }
        future_image = self.session_images.get(graphical_abstract,
                                               headers=headers,
                                               timeout=self.TIMEOUT)
        future_image.add_done_callback(
            functools.partial(self.pictureDownloaded, doi, url))
        self.list_futures.append(future_image)
def getData(company, journal, entry, response=None):
    """Get the data. Starts from the data contained in the RSS feed and,
    if necessary, parses the website for additional information.
    Downloads the graphical abstract"""

    # If the journal is edited by the RSC
    if company == "rsc":
        """Graphical abstract present in RSS. Abstract incomplete
        and w/out html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")
        url = getattr(entry, "feedburner_origlink", entry.link)

        abstract = None
        graphical_abstract = None
        author = None

        soup = BeautifulSoup(entry.summary)

        r = soup("img", align="center")
        if r:
            graphical_abstract = r[0]["src"]

        if response.status_code == requests.codes.ok:
            # Get the title (w/ html)
            # Strainer: get a soup with only the interesting part.
            # Don't load the complete tree in memory. Saves RAM
            strainer = SoupStrainer("h2", attrs={"class": "alpH1"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            title = soup.h2

            if title is not None:
                title = title.renderContents().decode().strip()

            # Get the abstract (w/ html)
            strainer = SoupStrainer("p",
                                    xmlns="http://www.rsc.org/schema/rscart38")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p

            if r is not None:
                abstract = r.renderContents().decode()
                if abstract == "":
                    abstract = None

            strainer = SoupStrainer("meta", attrs={"name": "citation_author"})
            soup = BeautifulSoup(response.text, parse_only=strainer)

            # Here, multiple tags (results) are expected, so perform
            # the search, even if the tree contains only the result
            r = soup("meta", attrs={"name": "citation_author"})
            if r:
                author = [tag["content"] for tag in r]
                author = ", ".join(author)

    elif company == "wiley":
        """Feed complete. Abstract w/ html. Title w/out html"""

        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")
        author = entry.author
        url = entry.prism_url

        graphical_abstract = None
        abstract = None

        soup = BeautifulSoup(entry.summary)
        try:
            # Remove the title "Abstract" from the abstract
            soup("h3")[0].extract()
        except IndexError:
            pass

        r = soup("a", attrs={"class": "figZoom"})
        if r:
            # Define the graphical abstract by extracting it
            # (and deleting it) from the abstract
            graphical_abstract = r[0].extract()
            graphical_abstract = graphical_abstract["href"]

        abstract = soup.renderContents().decode()
        if abstract == "":
            abstract = None

        if response.status_code == requests.codes.ok:
            # Get the title (w/ html)
            strainer = SoupStrainer("span", attrs={"class": "mainTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.span
            if r is not None:
                try:
                    # Remove the sign for the supplementary info
                    r("a", href="#nss")[0].extract()
                except IndexError:
                    pass

                # Remove the image representing a bond
                try:
                    r("img", alt="[BOND]")[0].replaceWith("-")
                    title = r.renderContents().decode().strip()
                except IndexError:
                    title = r.renderContents().decode().strip()

    elif company == "acs":
        """Feed only contains graphical abstract"""

        title = entry.title.rstrip()
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")

        abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
        url = "http://pubs.acs.org/doi/abs/10.1021/" + url

        graphical_abstract = None

        soup = BeautifulSoup(entry.summary)
        r = soup("img", alt="TOC Graphic")
        if r:
            graphical_abstract = r[0]["src"]

        # If the dl went wrong, print an error
        if response.status_code == requests.codes.ok:
            strainer = SoupStrainer("p",
                                    attrs={"class":
                                           "articleBody_abstractText"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("h1", attrs={"class": "articleTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    elif company == "npg":
        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None

        url = getattr(entry, "feedburner_origlink", entry.link).split("/")[-1]
        url = "http://www.nature.com/nature/journal/vaop/ncurrent/abs/" + url + ".html"

        try:
            author = [dic["name"] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if (response.status_code == requests.codes.ok or
                response.status_code == 401):
            strainer = SoupStrainer("h1", attrs={"class": "article-heading"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SoupStrainer("div", attrs={"id": "first-paragraph"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.div
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("figure")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("img")
            if r:
                graphical_abstract = "http://www.nature.com" + r[0]["src"]
                if "carousel" in graphical_abstract:
                    graphical_abstract = graphical_abstract.replace(
                        "carousel", "images_article")

    elif company == "science":
        title = entry.title
        date = entry.date
        url = entry.id

        graphical_abstract = None
        author = None

        abstract = entry.summary
        if not abstract:
            abstract = None
        else:
            if "Author:" in entry.summary:
                abstract = entry.summary.split("Author: ")[0]
                try:
                    author = entry.summary.split("Author: ")[1]
                except IndexError:
                    pass
            elif "Authors:" in entry.summary:
                abstract = entry.summary.split("Authors: ")[0]
                author = entry.summary.split("Authors: ")[1].split(", ")
                author = ", ".join(author)  # To comment if formatName

    elif company == "nas":
        title = entry.title
        date = entry.prism_publicationdate
        url = entry.id

        graphical_abstract = None
        author = None
        abstract = None

        if response.status_code == requests.codes.ok:
            # Get the correct title, not the one in the RSS
            strainer = SoupStrainer("h1", id="article-title-1")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("h1", id="article-title-1")
            if r:
                title = r[0].renderContents().decode()

            # Get the authors
            strainer = SoupStrainer("a", attrs={"class": "name-search"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("a", attrs={"class": "name-search"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            # Try to get the complete abstract. Sometimes it's available,
            # sometimes the article only contains an extract
            strainer = SoupStrainer("div", attrs={"class": "section abstract"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            if soup.p is not None:
                abstract = soup.p.renderContents().decode()
            else:
                abstract = entry.summary

    elif company == "elsevier":
        title = entry.title
        date = arrow.get(mktime(entry.updated_parsed)).format("YYYY-MM-DD")
        url = entry.id

        graphical_abstract = None
        author = None

        abstract = entry.summary

        if abstract:
            try:
                author = abstract.split("Author(s): ")[1].split(
                    "<br")[0].split("<")[0]
                author = author.replace(" , ", ", ")
                author = author.replace("  ", " ")
            except IndexError:
                author = None

            soup = BeautifulSoup(abstract)
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]["src"]

            try:
                abstract = abstract.split("<br />")[3].lstrip()
            except IndexError:
                abstract = ""

        if abstract == "":
            abstract = None

        # NOTE: javascript embedded, impossible
        # if response.status_code == requests.codes.ok:
            # url = response.url
            # print(response.url)
            # # Get the abstract
            # soup = BeautifulSoup(response.text)

            # Get the correct title, not the one in the RSS
            # r = soup.find_all("li", attrs={"class": "originalArticleName"})
            # print(r)
            # if r:
                # title = r[0].renderContents().decode()

    elif company == "thieme":
        title = entry.title
        date = arrow.get(entry.updated).format("YYYY-MM-DD")
        url = entry.id

        abstract = None
        graphical_abstract = None
        author = None

        if response.status_code == requests.codes.ok:
            if entry.summary != "":
                # Get the abstract, and clean it
                strainer = SoupStrainer("section", id="abstract")
                soup = BeautifulSoup(response.text, parse_only=strainer)
                abstract = soup.section

                abstract("div",
                         attrs={"class": "articleFunctions"})[0].extract()
                [tag.extract() for tag in abstract("a", attrs={"name": True})]
                [tag.extract() for tag in abstract("h3")]
                [tag.extract() for tag in abstract("ul",
                                                   attrs={"class": "linkList"})]
                [tag.extract() for tag in abstract("a",
                                                   attrs={"class": "gotolink"})]

                try:
                    abstract("div",
                             attrs={"class": "articleKeywords"})[0].extract()
                except IndexError:
                    pass

                abstract = abstract.renderContents().decode()

            strainer = SoupStrainer("span", id="authorlist")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("span", id="authorlist")
            if r:
                author = r[0].text
                author = author.replace("*a, b", "")
                author = author.replace("*a", "")
                author = author.replace("*", "")

    elif company == "beilstein":
        title = entry.title
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")
        url = entry.link

        abstract = None
        graphical_abstract = None

        author = entry.author
        author = entry.author.split(" and ")
        if len(author) > 1:
            author = ", ".join(author)
        else:
            author = author[0]

        if entry.summary != "":
            soup = BeautifulSoup(entry.summary)
            r = soup.find_all("p")
            if r:
                abstract = r[1].renderContents().decode()

            r = soup.find_all("img")
            if r:
                # This company can change the background of the GA through
                # the url. If nothing is done, the bg is black, so turn it
                # to white. Doesn't affect images with unchangeable bg
                graphical_abstract = r[0]["src"] + "&background=FFFFFF"

    elif company == "npg2":
        title = entry.title
        date = entry.date
        abstract = entry.summary
        graphical_abstract = None
        url = entry.links[0]["href"]

        try:
            author = [dic["name"] for dic in entry.authors]
            if author:
                if len(author) > 1:
                    author = ", ".join(author)
                else:
                    author = author[0]
            else:
                author = None
        except AttributeError:
            author = None

        if (response.status_code == requests.codes.ok or
                response.status_code == 401):
            strainer = SoupStrainer(
                "h1",
                attrs={"class": "tighten-line-height small-space-below"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

            strainer = SoupStrainer("div", attrs={"id": "abstract-content"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.p
            if r is not None:
                abstract = r.renderContents().decode()

            strainer = SoupStrainer("img")
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("img", attrs={"alt": "Figure 1"})
            if r:
                if "f1.jpg" in r[0]["src"]:
                    graphical_abstract = "http://www.nature.com" + r[0]["src"]

    elif company == "plos":
        title = entry.title
        url = entry.link
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")

        if entry.authors:
            author = []
            for element in entry.authors:
                author.append(element["name"])
            author = ", ".join(author)
        else:
            author = None

        abstract = BeautifulSoup(entry.summary)

        # Clean the authors' names from the abstract
        r = abstract.find_all("p")
        if r and str(r[0]).startswith("<p>by "):
            abstract("p")[0].extract()

        try:
            abstract("img")[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        base = "http://journals.plos.org/plosone/article/figure/image?size=medium&id=info:doi/{}.g001"
        graphical_abstract = base.format(getDoi(company, journal, entry))

    elif company == "springer":
        title = entry.title
        url = entry.link
        date = arrow.get(mktime(entry.published_parsed)).format("YYYY-MM-DD")

        graphical_abstract = None
        author = None

        abstract = BeautifulSoup(entry.summary)

        try:
            _ = abstract("h3")[0].extract()
            # Remove the graphical abstract part from the abstract
            _ = abstract(
                "span",
                attrs={
                    "class":
                    "a-plus-plus figure category-standard float-no id-figa"
                })[0].extract()
        except IndexError:
            pass

        abstract = abstract.renderContents().decode().strip()

        if response.status_code == requests.codes.ok:
            strainer = SoupStrainer("div", attrs={"class": "MediaObject"})
            soup = BeautifulSoup(response.text, parse_only=strainer)

            # For now, it's one shot: if the dl fails for the GA, there
            # won't be a retry. That's because too few articles have a GA
            r = soup.find_all("img")
            if r:
                graphical_abstract = r[0]["src"]

            strainer = SoupStrainer("ul", attrs={"class": "AuthorNames"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.find_all("span", attrs={"class": "AuthorName"})
            if r:
                author = [tag.text for tag in r]
                author = ", ".join(author)

            strainer = SoupStrainer("h1", attrs={"class": "ArticleTitle"})
            soup = BeautifulSoup(response.text, parse_only=strainer)
            r = soup.h1
            if r is not None:
                title = r.renderContents().decode()

    else:
        return None

    if abstract is not None:
        topic_simple = (
            " " + functions.simpleChar(BeautifulSoup(abstract).text) +
            " " + functions.simpleChar(title) + " ")
    else:
        topic_simple = " " + functions.simpleChar(title) + " "

    if abstract is None or abstract == "":
        abstract = "Empty"
    if graphical_abstract is None:
        graphical_abstract = "Empty"

    if author is None or author == "":
        author = "Empty"
        author_simple = None
    else:
        author_simple = " " + functions.simpleChar(author) + " "

    return (title, date, author, abstract, graphical_abstract, url,
            topic_simple, author_simple)