def parse_articles(self, articles):
    try:
        if articles:
            print "Parsing articles: " + str(len(articles))
            for article in articles:
                parser = ArticleParser()
                try:
                    html = parser.get_html(article)
                    if html:
                        # Strip <script> blocks before parsing; their
                        # contents routinely trip up HTMLParser.
                        html = ArticleParser.pre_parse(html, "script")
                        try:
                            parser.feed(html)
                            result = parser.results
                            result.append(article)
                            self.__article_results.append(result)
                        except UnicodeDecodeError:
                            print "Bad character."
                        except HTMLParser.HTMLParseError:
                            print "Bad html."
                    del parser
                    del html
                except HTTPError:
                    print "HTTP error."
    except TimeoutException.TimeoutException:
        self.__blacklist_source(self.mech.geturl(), self.__article_results)
    finally:
        sys.stdout.flush()
    return self.__article_results
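# A minimal sketch of what a helper like ArticleParser.pre_parse(html, "script")
# might do -- the real implementation is not shown above, so the regex approach
# here is an assumption. The intent, per the call site, is to drop whole
# <script>...</script> blocks, whose contents routinely break HTMLParser.
import re

def pre_parse(html, tag):
    # Remove every <tag ...>...</tag> span, case-insensitively, across newlines.
    pattern = re.compile(r'<%s\b.*?</%s\s*>' % (tag, tag), re.IGNORECASE | re.DOTALL)
    return pattern.sub('', html)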
def _buildProfileRecommendations(self):
    """Classifies user feedback into one of the tags based on the cosine
    similarity of the closest article in each tag."""
    user_feedback_profile = {}
    for tag in self.user_profile["subscriptions"]:
        user_feedback_profile[tag] = {"text": None}
    if "feedback" in self.user_profile:
        for feedback_link in self.user_profile["feedback"]:
            feedback_text = ArticleParser(feedback_link).fetchArticleText()
            similarity_score = -2  # below any cosine similarity, which lies in [-1, 1]
            for tag in self.user_profile["subscriptions"]:
                feedback_score = self._getScore(feedback_text, tag)
                # Assign this feedback to any tag that beats the running
                # best, replacing a previously stored weaker match.
                if similarity_score < feedback_score:
                    if ("score" not in user_feedback_profile[tag]
                            or user_feedback_profile[tag]["score"] < feedback_score):
                        user_feedback_profile[tag] = {
                            "text": feedback_text,
                            "score": feedback_score,
                        }
                        similarity_score = feedback_score
    return user_feedback_profile
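# A hedged sketch of the cosine-similarity scorer that self._getScore is
# assumed to wrap; neither the method body nor its per-tag article corpus is
# shown above, so the tag_articles parameter here is hypothetical. Per the
# docstring, the score is the similarity to the closest article in the tag.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_score(feedback_text, tag_articles):
    # Vectorise the feedback alongside the tag's articles so they share a vocabulary.
    matrix = TfidfVectorizer(stop_words="english").fit_transform(
        [feedback_text] + tag_articles)
    # Row 0 is the feedback; keep the best match among the tag's articles.
    return float(cosine_similarity(matrix[0:1], matrix[1:]).max())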
def handle_starttag(self, tag, attrs):
    attrs = dict(attrs)
    # The category listing begins at the element with id="mw-pages";
    # depth stays None until that container has been seen.
    if attrs.get('id') == 'mw-pages':
        self.depth = 0
    if self.depth is not None:
        self.depth += 1
        if tag == 'a' and (attrs.get('title'), attrs.get('href')) not in Answer.result:
            ArticleParser(attrs.get('title'), attrs.get('href'))
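# A self-contained sketch of the depth-tracking pattern above (class and
# attribute names here are illustrative, not the project's): depth stays None
# until the element with id="mw-pages" is seen, then tracks nesting so link
# collection stops once the container is left. Void elements without end tags
# would skew the count; a fuller version would special-case them.
from html.parser import HTMLParser

class MwPagesLinks(HTMLParser):
    def __init__(self):
        super().__init__()
        self.depth = None
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if attrs.get('id') == 'mw-pages':
            self.depth = 0
        if self.depth is not None:
            self.depth += 1
            if tag == 'a':
                self.links.append((attrs.get('title'), attrs.get('href')))

    def handle_endtag(self, tag):
        # Mirror the increment so depth returns to None outside the container.
        if self.depth is not None:
            self.depth -= 1
            if self.depth <= 0:
                self.depth = None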
def _extract(self):
    """Extract each article's metadata from the url."""
    logger.info(f"Extracting {self.tag} content of date: {self.date}")
    page = requests.get(self.url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for card in soup.findAll("div", {"class": self.scrape_class}):
        if self._cntComment(card):
            continue  # skip comment cards
        link = self._get_link(card)
        if not link:
            continue
        total_claps = self._get_total_claps(card)
        if not total_claps:
            total_claps = 1
        else:
            total_claps = self._parse_claps(total_claps) + 1
        total_responses = self._get_total_responses(card)
        if not total_responses:
            total_responses = 1
        else:
            # Parse the responses string, not the claps count.
            total_responses = self._parse_responses(total_responses) + 1
        try:
            parsed_article = ArticleParser(link)
            self.data_dict["total_responses"].append(total_responses)
            self.data_dict["link"].append(self._parse_link(link))
            self.data_dict["total_claps"].append(total_claps)
            self.data_dict["text"].append(parsed_article.fetchArticleText())
        except Exception as e:
            logger.error(f'Failed to download {link} article: {e}')
    logger.info(f"Successfully extracted {self.tag}")
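# A hedged sketch of what a count normaliser like self._parse_claps might do;
# the real helper is not shown. Medium renders large counts as strings such
# as "1.2K", so the assumption is a suffix-aware string-to-int conversion
# (self._parse_responses would follow the same pattern).
def parse_claps(raw):
    raw = raw.strip()
    if raw.endswith("K"):
        return int(float(raw[:-1]) * 1000)
    return int(raw)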
def _parseEmailContent(self, recommendations):
    """For a given list of recommendations, parses it into a presentable
    HTML format for email."""
    html_content = ''
    for edition in recommendations:
        html_content += f"""<h1>{edition.title()}-Feed</h1>"""
        for tag in recommendations[edition]:
            html_content += f"""<h2>{tag.title()}</h2>"""
            for link in recommendations[edition][tag]:
                article_obj = ArticleParser(link)
                html_content += f"""
                <a href="{link}">
                <h2><strong>{article_obj.fetchArticleTitle()}</strong></h2></a>
                {article_obj.parseArticleKeywords(article_obj.fetchArticleKeywords())}
                <div><img src="{article_obj.fetchArticleImage()}"></div><br>
                <div>{article_obj.parseArticleSummary(article_obj.fetchArticleSummary())}</div>
                <hr><br><br>"""
    return html_content
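# A minimal sketch of mailing the assembled digest with the standard library;
# host, port and addresses are placeholders, not values from this project.
import smtplib
from email.mime.text import MIMEText

def send_digest(html_content, sender, recipient, host="localhost", port=25):
    msg = MIMEText(html_content, "html")  # mark the body as HTML
    msg["Subject"] = "Your article feed"
    msg["From"] = sender
    msg["To"] = recipient
    with smtplib.SMTP(host, port) as server:
        server.sendmail(sender, [recipient], msg.as_string())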
def processDOI(myDOIs):
    global results
    # create empty array to hold results dicts
    results = []
    '''
    Loop through DOIs and find info about each article.
    Add that information to a python dictionary.
    '''
    # remove empty strings from list
    myDOIs = [doi for doi in myDOIs if doi]
    for DOI in myDOIs:
        # remove new lines
        rawDOI = DOI.strip()
        # deal with custom figures other than the TOC image
        ecHTML = ''
        # check to see if there's extra figure code at the end of the DOI
        # and separate it out
        if ' ' in rawDOI:
            hasAltFigure = True
            # get the actual DOI from rawDOI
            DOI = rawDOI.rsplit(' ')[0]
            # get the code at the end of the DOI after the space
            figCode = rawDOI.rsplit(' ')[1]
            print figCode
            # check fig code for editors choice
            if "e" in figCode:
                hasAltFigure = False
                # for editors choice
                ecHTML = "<div class=\"ec-article\"><img src=\"/pb-assets/images/editorschoice/ec-article.gif\"></div>"
                print "ec test" + ecHTML
            # get number out of the code
            figNumber = ''.join([i for i in figCode if i.isdigit()])
        else:
            hasAltFigure = False
            DOI = rawDOI
        # set up beautiful soup
        html = get_html(DOI)
        soup = soup_setup(html)
        # instantiate article objects
        article_parser = ArticleParser(soup)
        article = article_parser.parse_article()
        # create img URL paths
        cleanDOI = clean_doi(DOI)
        coden = get_coden(cleanDOI)
        datecode = get_datecode()
        # create article URL
        article_link = "/doi/" + str(DOI) + "?ref=highlight"
        # title
        html_title = article.title
        # authors array
        authors_array = article.authors
        # join authors
        authors_joined = join_commas_and(authors_array)
        # get picture link
        gif_url = "https://pubs.acs.org" + article.toc_gif
        toc_href = gif_to_jpeg(gif_url)
        # set other href to nothing in case there is no other image needed
        other_href = ''
        # create TOC img path for Flask, so that the images can be
        # displayed on Flask
        img_path = ("img/generated/" + coden + '/' + str(datecode) + "/"
                    + str(cleanDOI) + ".jpeg")
        # get picture link for alternative images to toc_href using
        # figCode and figNumber
        if hasAltFigure:
            if "f" in figCode:
                fig_id = "fig" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "figure " + fig_id + " gif url: " + other_gif
            elif "s" in figCode:
                fig_id = "sch" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "scheme " + fig_id + " gif url: " + other_gif
            elif "c" in figCode:
                # for the chart
                fig_id = "cht" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "chart " + fig_id + " gif url: " + other_gif
            # get the jpeg out of the gif URL
            other_href = gif_to_jpeg("https://pubs.acs.org" + other_gif)
            # set different img path for other gif
            img_path = ("img/generated/" + coden + '/' + str(datecode) + "/"
                        + str(cleanDOI) + fig_id + ".jpeg")
            # set desired download path name for other gif
            pathEnding = coden + '/' + str(datecode) + '/'
            filename = ("app/static/img/generated/" + pathEnding + cleanDOI
                        + fig_id + '.jpeg')
            # create folder on local computer for images if it doesn't
            # exist already
            create_img_folder(pathEnding)
            try:
                download_toc_image(filename, other_href, coden, datecode, cleanDOI)
            except Exception:
                pass
            # create image URL for PB using fig code
            img_url = ("/pb-assets/images/" + str(coden) + "/highlights/"
                       + str(datecode) + "/" + str(cleanDOI) + fig_id + ".jpeg")
        else:
            # desired file name
            pathEnding = coden + '/' + str(datecode) + '/'
            filename = ("app/static/img/generated/" + pathEnding + cleanDOI
                        + '.jpeg')
            # create image URL for PB
            img_url = ("/pb-assets/images/" + str(coden) + "/highlights/"
                       + str(datecode) + "/" + str(cleanDOI) + ".jpeg")
            # create folder on local computer for images if it doesn't
            # exist already
            create_img_folder(pathEnding)
            try:
                download_toc_image(filename, toc_href, coden, datecode, cleanDOI)
            except Exception:
                pass
        articleinfo = {
            "DOI": DOI,
            "Title": html_title,
            "article-link": article_link,
            "Authors": str(authors_joined),
            "toc_href": str(toc_href),
            "other_href": str(other_href),
            "Image": img_url,
            "Flask-image-path": img_path,
            "Coden": coden,
            "Datecode": datecode,
            "Clean_doi": cleanDOI,
            "editors_choice": ecHTML,
        }
        print "\n"
        print articleinfo
        print "\n"
        results.append(articleinfo)
    print results
    return results
from articleutilities import *
from ArticleParser import ArticleParser, Article


def setup():
    DOI = "10.1021/ed084p443"
    html = get_html(DOI)
    soup = soup_setup(html)
    return soup


soup = setup()
article_parser = ArticleParser(soup)
article = article_parser.parse_article()
print article.title, article.authors, article.year, article.volume, article.issue, article.toc_gif
def _getRecommendations(self, edition, feature_matrix_dict=None):
    """For each tag the user has subscribed to, generate recommendations."""
    for tag in self.user_profile["subscriptions"]:
        if not tag:
            continue
        logger.info(
            f"Generating {tag} tag recommendations for "
            f"{self.user_profile['username']} {edition} edition")
        if edition == 'daily':
            feature_matrix = pd.DataFrame(feature_matrix_dict[tag]["content"])
        elif edition == 'archive':
            feature_matrix = self._getArchiveFeatureMatrix(
                self.user_profile["username"], tag)
        if self.user_feedback_profile[tag]["text"]:
            simm_score = self._getSimmScores(
                self.user_feedback_profile[tag]["text"], tag, edition)
            df_parallel = self.get_parallel(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["parallel"])
            parallel_link = df_parallel["link"].values.tolist()
            df_perpend = self.get_perpendicular(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["perpendicular"])
            perpend_link = df_perpend["link"].values.tolist()
        else:
            # User has no recorded preference: seed with the highest
            # engagement articles, then diversify around the top one.
            parallel_link = feature_matrix.nlargest(
                self.recommendation_struct[edition]["parallel"],
                "ClapRespScore")["link"].values.tolist()
            text = ArticleParser(parallel_link[0]).fetchArticleText()
            simm_score = self._getSimmScores(text, tag, edition)
            df_perpend = self.get_perpendicular(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["perpendicular"])
            perpend_link = df_perpend["link"].values.tolist()
        for link in parallel_link + perpend_link:
            self.recomendationsDict[edition][tag].append(link)
        if edition == 'archive':
            for link in parallel_link + perpend_link:
                self.archive_log["link"].append(link)
            self._updateLog(self.user_profile["username"], tag)
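# A hedged sketch of the parallel/perpendicular split used above; the real
# get_parallel/get_perpendicular methods are not shown. The assumption:
# "parallel" keeps the n articles most similar to the user's feedback and
# "perpendicular" the n least similar, mixing reinforcement with exploration.
# feature_matrix is a pandas DataFrame; simm_score is one score per row.
def get_parallel(feature_matrix, simm_score, n):
    return feature_matrix.assign(simm=simm_score).nlargest(n, "simm")

def get_perpendicular(feature_matrix, simm_score, n):
    return feature_matrix.assign(simm=simm_score).nsmallest(n, "simm")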
def createVI(myDOIs, multiJournal, trackingCode, shortName):
    global results
    # create empty array to hold results dicts
    results = []
    '''
    Loop through the DOIs to find information from each article page.
    Add that info to lists.
    '''
    # remove empty strings from list
    myDOIs = [doi for doi in myDOIs if doi]
    for DOI in myDOIs:
        DOI = DOI.strip()
        cleanDOI = clean_doi(DOI)
        coden = get_coden(cleanDOI)
        datecode = get_datecode()
        if multiJournal:
            # create image URL for PB using shortName
            shortNamePath = str(shortName) + "/"
            img_url = ("/pb-assets/images/selects/" + shortNamePath
                       + str(cleanDOI) + ".jpeg")
            # create image path for Flask to display images from local folder
            img_path = "img/generated/" + shortNamePath + str(cleanDOI) + ".jpeg"
            article_link = "/doi/" + str(DOI) + str(trackingCode)
        else:
            # create image URL for PB using coden and today's date
            codenDatePath = str(coden) + "/" + str(datecode) + "/"
            img_url = ("/pb-assets/images/selects/" + codenDatePath
                       + str(cleanDOI) + ".jpeg")
            # create img path for Flask, so that the images can be
            # displayed on Flask
            img_path = "img/generated/" + codenDatePath + str(cleanDOI) + ".jpeg"
            # create article URL
            article_link = "/doi/" + str(DOI)
        # set up beautiful soup
        html = get_html(DOI)
        soup = soup_setup(html)
        # instantiate article objects
        article_parser = ArticleParser(soup)
        article = article_parser.parse_article()
        # title
        html_title = article.title
        # authors array
        authors_array = article.authors
        # join authors
        authors_joined = join_commas_and(authors_array)
        # get link to the toc image
        gif_url = "https://pubs.acs.org" + article.toc_gif
        toc_href = gif_to_jpeg(gif_url)
        # get journal name
        journal = article.journal
        # Articles still "ASAP" (ahead of print) have no year, volume or
        # issue yet, so leave those fields blank rather than letting the
        # literal string "Article ASAP" end up in the view template.
        journal_string = soup.select('#citation')[0].text
        if "Article ASAP" not in journal_string:
            # get citation year, volume, issue info and pages
            year = article.year
            volume = article.volume
            issue_info = article.issue
        else:
            year = ''
            volume = ''
            issue_info = ''
        articleinfo = {
            'DOI': DOI,
            'Title': html_title,
            'article-link': article_link,
            'Authors': str(authors_joined),
            'toc_href': str(toc_href),
            'Image': img_url,
            'Journal': journal,
            'Volume': volume,
            'Issue-info': issue_info,
            'Year': year,
            'Datecode': datecode,
            'Clean_doi': cleanDOI,
            'Coden': coden,
        }
        print "\n" + str(articleinfo) + "\n"
        results.append(articleinfo)
        '''
        Check to see if there is an existing folder for coden and date;
        if not, create the folder.
        '''
        if multiJournal:
            # group images in a directory named for the short journal name
            pathEnding = "virtualissue/" + shortNamePath
            filename = ("app/static/img/generated/virtualissue/"
                        + shortNamePath + cleanDOI)
        else:
            # group images by journal coden and date stamp
            pathEnding = "virtualissue/" + codenDatePath
            filename = ("app/static/img/generated/virtualissue/"
                        + codenDatePath + cleanDOI)
        create_img_folder(pathEnding)
        try:
            download_toc_image(filename, toc_href, coden, datecode, cleanDOI)
        except Exception:
            pass
    return results
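# A hedged sketch of gif_to_jpeg as it is used in both functions above: the
# scraped TOC graphic URL ends in .gif, and the site is assumed to serve a
# .jpeg rendition at the same path, so the helper would be a suffix rewrite.
def gif_to_jpeg(gif_url):
    if gif_url.endswith(".gif"):
        return gif_url[:-len(".gif")] + ".jpeg"
    return gif_url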