def parse_articles(self, articles):
    try:
        if articles:
            print "Parsing articles: " + str(len(articles))
            for article in articles:
                parser = ArticleParser()
                try:
                    html = parser.get_html(article)
                    if html:
                        # Strip <script> blocks before parsing; their
                        # contents routinely trip up HTMLParser.
                        html = ArticleParser.pre_parse(html, "script")
                        try:
                            parser.feed(html)
                            result = parser.results
                            result.append(article)
                            self.__article_results.append(result)
                        except UnicodeDecodeError:
                            print "Bad character."
                        except HTMLParser.HTMLParseError:
                            print "Bad html."
                    del parser
                    del html
                except HTTPError:
                    print "HTTP error."
    except TimeoutException.TimeoutException:
        self.__blacklist_source(self.mech.geturl(), self.__article_results)
    finally:
        sys.stdout.flush()
    return self.__article_results
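# A minimal sketch of what a helper like ArticleParser.pre_parse(html, "script")
# might do -- the real implementation is not shown above, so the regex approach
# here is an assumption. The intent, per the call site, is to drop whole
# <script>...</script> blocks, whose contents routinely break HTMLParser.
import re

def pre_parse(html, tag):
    # Remove every <tag ...>...</tag> span, case-insensitively, across newlines.
    pattern = re.compile(r'<%s\b.*?</%s\s*>' % (tag, tag), re.IGNORECASE | re.DOTALL)
    return pattern.sub('', html)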
def _buildProfileRecommendations(self):
    """Classifies user feedback into one of the tags based on the cosine
    similarity of the closest article in each tag."""
    user_feedback_profile = {}
    for tag in self.user_profile["subscriptions"]:
        user_feedback_profile[tag] = {"text": None}
    if "feedback" in self.user_profile:
        for feedback_link in self.user_profile["feedback"]:
            feedback_text = ArticleParser(feedback_link).fetchArticleText()
            similarity_score = -2  # below any cosine similarity, which lies in [-1, 1]
            for tag in self.user_profile["subscriptions"]:
                feedback_score = self._getScore(feedback_text, tag)
                # Assign this feedback to any tag that beats the running
                # best, replacing a previously stored weaker match.
                if similarity_score < feedback_score:
                    if ("score" not in user_feedback_profile[tag]
                            or user_feedback_profile[tag]["score"] < feedback_score):
                        user_feedback_profile[tag] = {
                            "text": feedback_text,
                            "score": feedback_score,
                        }
                        similarity_score = feedback_score
    return user_feedback_profile
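# A hedged sketch of the cosine-similarity scorer that self._getScore is
# assumed to wrap; neither the method body nor its per-tag article corpus is
# shown above, so the tag_articles parameter here is hypothetical. Per the
# docstring, the score is the similarity to the closest article in the tag.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_score(feedback_text, tag_articles):
    # Vectorise the feedback alongside the tag's articles so they share a vocabulary.
    matrix = TfidfVectorizer(stop_words="english").fit_transform(
        [feedback_text] + tag_articles)
    # Row 0 is the feedback; keep the best match among the tag's articles.
    return float(cosine_similarity(matrix[0:1], matrix[1:]).max())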
def handle_starttag(self, tag, attrs):
    attrs = dict(attrs)
    # The category listing begins at the element with id="mw-pages";
    # depth stays None until that container has been seen.
    if attrs.get('id') == 'mw-pages':
        self.depth = 0
    if self.depth is not None:
        self.depth += 1
        if tag == 'a' and (attrs.get('title'), attrs.get('href')) not in Answer.result:
            ArticleParser(attrs.get('title'), attrs.get('href'))
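# A self-contained sketch of the depth-tracking pattern above (class and
# attribute names here are illustrative, not the project's): depth stays None
# until the element with id="mw-pages" is seen, then tracks nesting so link
# collection stops once the container is left. Void elements without end tags
# would skew the count; a fuller version would special-case them.
from html.parser import HTMLParser

class MwPagesLinks(HTMLParser):
    def __init__(self):
        super().__init__()
        self.depth = None
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if attrs.get('id') == 'mw-pages':
            self.depth = 0
        if self.depth is not None:
            self.depth += 1
            if tag == 'a':
                self.links.append((attrs.get('title'), attrs.get('href')))

    def handle_endtag(self, tag):
        # Mirror the increment so depth returns to None outside the container.
        if self.depth is not None:
            self.depth -= 1
            if self.depth <= 0:
                self.depth = None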
def _extract(self):
    """Extract each article's metadata from the url."""
    logger.info(f"Extracting {self.tag} content of date: {self.date}")
    page = requests.get(self.url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for card in soup.findAll("div", {"class": self.scrape_class}):
        if self._cntComment(card):
            continue  # skip comment cards
        link = self._get_link(card)
        if not link:
            continue
        total_claps = self._get_total_claps(card)
        if not total_claps:
            total_claps = 1
        else:
            total_claps = self._parse_claps(total_claps) + 1
        total_responses = self._get_total_responses(card)
        if not total_responses:
            total_responses = 1
        else:
            # Parse the responses string, not the claps count.
            total_responses = self._parse_responses(total_responses) + 1
        try:
            parsed_article = ArticleParser(link)
            self.data_dict["total_responses"].append(total_responses)
            self.data_dict["link"].append(self._parse_link(link))
            self.data_dict["total_claps"].append(total_claps)
            self.data_dict["text"].append(parsed_article.fetchArticleText())
        except Exception as e:
            logger.error(f'Failed to download {link} article: {e}')
    logger.info(f"Successfully extracted {self.tag}")
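# A hedged sketch of what a count normaliser like self._parse_claps might do;
# the real helper is not shown. Medium renders large counts as strings such
# as "1.2K", so the assumption is a suffix-aware string-to-int conversion
# (self._parse_responses would follow the same pattern).
def parse_claps(raw):
    raw = raw.strip()
    if raw.endswith("K"):
        return int(float(raw[:-1]) * 1000)
    return int(raw)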
def _parseEmailContent(self, recommendations):
    """For a given list of recommendations, parses it into a presentable
    HTML format for email."""
    html_content = ''
    for edition in recommendations:
        html_content += f"""<h1>{edition.title()}-Feed</h1>"""
        for tag in recommendations[edition]:
            html_content += f"""<h2>{tag.title()}</h2>"""
            for link in recommendations[edition][tag]:
                article_obj = ArticleParser(link)
                html_content += f"""
                <a href="{link}">
                <h2><strong>{article_obj.fetchArticleTitle()}</strong></h2></a>
                {article_obj.parseArticleKeywords(article_obj.fetchArticleKeywords())}
                <div><img src="{article_obj.fetchArticleImage()}"></div><br>
                <div>{article_obj.parseArticleSummary(article_obj.fetchArticleSummary())}</div>
                <hr><br><br>"""
    return html_content
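# A minimal sketch of mailing the assembled digest with the standard library;
# host, port and addresses are placeholders, not values from this project.
import smtplib
from email.mime.text import MIMEText

def send_digest(html_content, sender, recipient, host="localhost", port=25):
    msg = MIMEText(html_content, "html")  # mark the body as HTML
    msg["Subject"] = "Your article feed"
    msg["From"] = sender
    msg["To"] = recipient
    with smtplib.SMTP(host, port) as server:
        server.sendmail(sender, [recipient], msg.as_string())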
def processDOI(myDOIs):
    global results
    # create empty array to hold results dicts
    results = []
    '''
    Loop through DOIs and find info about each article.
    Add that information to a python dictionary.
    '''
    # remove empty strings from list
    myDOIs = [doi for doi in myDOIs if doi]
    for DOI in myDOIs:
        # remove new lines
        rawDOI = DOI.strip()
        # deal with custom figures other than the TOC image
        ecHTML = ''
        # check to see if there's extra figure code at the end of the DOI
        # and separate it out
        if ' ' in rawDOI:
            hasAltFigure = True
            # get the actual DOI from rawDOI
            DOI = rawDOI.rsplit(' ')[0]
            # get the code at the end of the DOI after the space
            figCode = rawDOI.rsplit(' ')[1]
            print figCode
            # check fig code for editors choice
            if "e" in figCode:
                hasAltFigure = False
                # for editors choice
                ecHTML = "<div class=\"ec-article\"><img src=\"/pb-assets/images/editorschoice/ec-article.gif\"></div>"
                print "ec test" + ecHTML
            # get number out of the code
            figNumber = ''.join([i for i in figCode if i.isdigit()])
        else:
            hasAltFigure = False
            DOI = rawDOI
        # set up beautiful soup
        html = get_html(DOI)
        soup = soup_setup(html)
        # instantiate article objects
        article_parser = ArticleParser(soup)
        article = article_parser.parse_article()
        # create img URL paths
        cleanDOI = clean_doi(DOI)
        coden = get_coden(cleanDOI)
        datecode = get_datecode()
        # create article URL
        article_link = "/doi/" + str(DOI) + "?ref=highlight"
        # title
        html_title = article.title
        # authors array
        authors_array = article.authors
        # join authors
        authors_joined = join_commas_and(authors_array)
        # get picture link
        gif_url = "https://pubs.acs.org" + article.toc_gif
        toc_href = gif_to_jpeg(gif_url)
        # set other href to nothing in case there is no other image needed
        other_href = ''
        # create TOC img path for Flask, so that the images can be
        # displayed on Flask
        img_path = ("img/generated/" + coden + '/' + str(datecode) + "/"
                    + str(cleanDOI) + ".jpeg")
        # get picture link for alternative images to toc_href using
        # figCode and figNumber
        if hasAltFigure:
            if "f" in figCode:
                fig_id = "fig" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "figure " + fig_id + " gif url: " + other_gif
            elif "s" in figCode:
                fig_id = "sch" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "scheme " + fig_id + " gif url: " + other_gif
            elif "c" in figCode:
                # for the chart
                fig_id = "cht" + figNumber
                other_gif = choose_alt_figure(article.fig_urls, fig_id)
                print "chart " + fig_id + " gif url: " + other_gif
            # get the jpeg out of the gif URL
            other_href = gif_to_jpeg("https://pubs.acs.org" + other_gif)
            # set different img path for other gif
            img_path = ("img/generated/" + coden + '/' + str(datecode) + "/"
                        + str(cleanDOI) + fig_id + ".jpeg")
            # set desired download path name for other gif
            pathEnding = coden + '/' + str(datecode) + '/'
            filename = ("app/static/img/generated/" + pathEnding + cleanDOI
                        + fig_id + '.jpeg')
            # create folder on local computer for images if it doesn't
            # exist already
            create_img_folder(pathEnding)
            try:
                download_toc_image(filename, other_href, coden, datecode, cleanDOI)
            except Exception:
                pass
            # create image URL for PB using fig code
            img_url = ("/pb-assets/images/" + str(coden) + "/highlights/"
                       + str(datecode) + "/" + str(cleanDOI) + fig_id + ".jpeg")
        else:
            # desired file name
            pathEnding = coden + '/' + str(datecode) + '/'
            filename = ("app/static/img/generated/" + pathEnding + cleanDOI
                        + '.jpeg')
            # create image URL for PB
            img_url = ("/pb-assets/images/" + str(coden) + "/highlights/"
                       + str(datecode) + "/" + str(cleanDOI) + ".jpeg")
            # create folder on local computer for images if it doesn't
            # exist already
            create_img_folder(pathEnding)
            try:
                download_toc_image(filename, toc_href, coden, datecode, cleanDOI)
            except Exception:
                pass
        articleinfo = {
            "DOI": DOI,
            "Title": html_title,
            "article-link": article_link,
            "Authors": str(authors_joined),
            "toc_href": str(toc_href),
            "other_href": str(other_href),
            "Image": img_url,
            "Flask-image-path": img_path,
            "Coden": coden,
            "Datecode": datecode,
            "Clean_doi": cleanDOI,
            "editors_choice": ecHTML,
        }
        print "\n"
        print articleinfo
        print "\n"
        results.append(articleinfo)
    print results
    return results
from articleutilities import *
from ArticleParser import ArticleParser, Article


def setup():
    DOI = "10.1021/ed084p443"
    html = get_html(DOI)
    soup = soup_setup(html)
    return soup


soup = setup()
article_parser = ArticleParser(soup)
article = article_parser.parse_article()
print article.title, article.authors, article.year, article.volume, article.issue, article.toc_gif
def _getRecommendations(self, edition, feature_matrix_dict=None):
    """For each tag the user has subscribed to, generate recommendations."""
    for tag in self.user_profile["subscriptions"]:
        if not tag:
            continue
        logger.info(
            f"Generating {tag} tag recommendations for "
            f"{self.user_profile['username']} {edition} edition")
        if edition == 'daily':
            feature_matrix = pd.DataFrame(feature_matrix_dict[tag]["content"])
        elif edition == 'archive':
            feature_matrix = self._getArchiveFeatureMatrix(
                self.user_profile["username"], tag)
        if self.user_feedback_profile[tag]["text"]:
            simm_score = self._getSimmScores(
                self.user_feedback_profile[tag]["text"], tag, edition)
            df_parallel = self.get_parallel(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["parallel"])
            parallel_link = df_parallel["link"].values.tolist()
            df_perpend = self.get_perpendicular(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["perpendicular"])
            perpend_link = df_perpend["link"].values.tolist()
        else:
            # User has no recorded preference: seed with the highest
            # engagement articles, then diversify around the top one.
            parallel_link = feature_matrix.nlargest(
                self.recommendation_struct[edition]["parallel"],
                "ClapRespScore")["link"].values.tolist()
            text = ArticleParser(parallel_link[0]).fetchArticleText()
            simm_score = self._getSimmScores(text, tag, edition)
            df_perpend = self.get_perpendicular(
                feature_matrix, simm_score,
                n=self.recommendation_struct[edition]["perpendicular"])
            perpend_link = df_perpend["link"].values.tolist()
        for link in parallel_link + perpend_link:
            self.recomendationsDict[edition][tag].append(link)
        if edition == 'archive':
            for link in parallel_link + perpend_link:
                self.archive_log["link"].append(link)
            self._updateLog(self.user_profile["username"], tag)
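# A hedged sketch of the parallel/perpendicular split used above; the real
# get_parallel/get_perpendicular methods are not shown. The assumption:
# "parallel" keeps the n articles most similar to the user's feedback and
# "perpendicular" the n least similar, mixing reinforcement with exploration.
# feature_matrix is a pandas DataFrame; simm_score is one score per row.
def get_parallel(feature_matrix, simm_score, n):
    return feature_matrix.assign(simm=simm_score).nlargest(n, "simm")

def get_perpendicular(feature_matrix, simm_score, n):
    return feature_matrix.assign(simm=simm_score).nsmallest(n, "simm")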
def createVI(myDOIs, multiJournal, trackingCode, shortName):
    global results
    # create empty array to hold results dicts
    results = []
    '''
    Loop through the DOIs to find information from each article page.
    Add that info to lists.
    '''
    # remove empty strings from list
    myDOIs = [doi for doi in myDOIs if doi]
    for DOI in myDOIs:
        DOI = DOI.strip()
        cleanDOI = clean_doi(DOI)
        coden = get_coden(cleanDOI)
        datecode = get_datecode()
        if multiJournal:
            # create image URL for PB using shortName
            shortNamePath = str(shortName) + "/"
            img_url = ("/pb-assets/images/selects/" + shortNamePath
                       + str(cleanDOI) + ".jpeg")
            # create image path for Flask to display images from local folder
            img_path = "img/generated/" + shortNamePath + str(cleanDOI) + ".jpeg"
            article_link = "/doi/" + str(DOI) + str(trackingCode)
        else:
            # create image URL for PB using coden and today's date
            codenDatePath = str(coden) + "/" + str(datecode) + "/"
            img_url = ("/pb-assets/images/selects/" + codenDatePath
                       + str(cleanDOI) + ".jpeg")
            # create img path for Flask, so that the images can be
            # displayed on Flask
            img_path = "img/generated/" + codenDatePath + str(cleanDOI) + ".jpeg"
            # create article URL
            article_link = "/doi/" + str(DOI)
        # set up beautiful soup
        html = get_html(DOI)
        soup = soup_setup(html)
        # instantiate article objects
        article_parser = ArticleParser(soup)
        article = article_parser.parse_article()
        # title
        html_title = article.title
        # authors array
        authors_array = article.authors
        # join authors
        authors_joined = join_commas_and(authors_array)
        # get link to the toc image
        gif_url = "https://pubs.acs.org" + article.toc_gif
        toc_href = gif_to_jpeg(gif_url)
        # get journal name
        journal = article.journal
        # Articles still "ASAP" (ahead of print) have no year, volume or
        # issue yet, so leave those fields blank rather than letting the
        # literal string "Article ASAP" end up in the view template.
        journal_string = soup.select('#citation')[0].text
        if "Article ASAP" not in journal_string:
            # get citation year, volume, issue info and pages
            year = article.year
            volume = article.volume
            issue_info = article.issue
        else:
            year = ''
            volume = ''
            issue_info = ''
        articleinfo = {
            'DOI': DOI,
            'Title': html_title,
            'article-link': article_link,
            'Authors': str(authors_joined),
            'toc_href': str(toc_href),
            'Image': img_url,
            'Journal': journal,
            'Volume': volume,
            'Issue-info': issue_info,
            'Year': year,
            'Datecode': datecode,
            'Clean_doi': cleanDOI,
            'Coden': coden,
        }
        print "\n" + str(articleinfo) + "\n"
        results.append(articleinfo)
        '''
        Check to see if there is an existing folder for coden and date;
        if not, create the folder.
        '''
        if multiJournal:
            # group images in a directory named for the short journal name
            pathEnding = "virtualissue/" + shortNamePath
            filename = ("app/static/img/generated/virtualissue/"
                        + shortNamePath + cleanDOI)
        else:
            # group images by journal coden and date stamp
            pathEnding = "virtualissue/" + codenDatePath
            filename = ("app/static/img/generated/virtualissue/"
                        + codenDatePath + cleanDOI)
        create_img_folder(pathEnding)
        try:
            download_toc_image(filename, toc_href, coden, datecode, cleanDOI)
        except Exception:
            pass
    return results
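# A hedged sketch of gif_to_jpeg as it is used in both functions above: the
# scraped TOC graphic URL ends in .gif, and the site is assumed to serve a
# .jpeg rendition at the same path, so the helper would be a suffix rewrite.
def gif_to_jpeg(gif_url):
    if gif_url.endswith(".gif"):
        return gif_url[:-len(".gif")] + ".jpeg"
    return gif_url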