示例#1
0
def process(source, limit):
    """
    Collects at most `limit` articles from `source` and analyses each of them.
    source must be an RSS feed.
    limit must be an integer (the maximum number of articles to handle).

    Returns: list with one entry per article, each entry being the result of
    parsing.analyse_article for the tags listed in ARTICLE_ELEMENTS.
    """

    # Resolve the feed into individual article links
    links = parsing.get_links(source, limit)

    # Fetch every article first, then analyse the whole batch afterwards
    downloaded = [parsing.get_article(link) for link in links]
    results = [
        parsing.analyse_article(item, ARTICLE_ELEMENTS) for item in downloaded
    ]

    vprint(verbose, str(len(downloaded)) + " articles processed.")

    return results
示例#2
0
def get_article(url):
    """
    Fetches the article located at url.
    url must be a string.

    Returns: the Article instance created for url (language "en"), after its
    download() method has been called.
    """

    startMessage = "Downloading article from " + url + " ..."
    vprint(verbose, startMessage)

    # Build the article object, then pull its content
    fetched = Article(url, language="en")
    fetched.download()

    vprint(verbose, "Article downloaded.\n")
    return fetched
示例#3
0
def get_feed_list(feeds):
    """
    Gets RSS feed urls from specified text file.
    feeds must be a string specifying the location to a .txt file.
    Every line of the file becomes one entry of the result.

    Returns: list of RSS feeds (each line with leading/trailing newline
    characters removed).
    """

    # Getting RSS feed URLs
    vprint(verbose, "Getting link list from source file.")
    # The context manager closes the file even when reading raises, which the
    # previous manual open()/close() pair did not guarantee.
    with open(feeds, "r") as feedsFile:
        feedList = [link.strip("\n") for link in feedsFile]
    vprint(verbose, str(len(feedList)) + " RSS links acquired.")
    return feedList
示例#4
0
def get_links(RSSFeed, numLinks):
    """
    Collects up to numLinks article URLs from an RSS feed.
    RSSFeed must be a string holding an RSS feed URL.
    numLinks must be an integer; fewer links are returned when the feed has
    fewer entries than requested.

    Returns: list of article urls, in the order the feed lists them.
    """

    announcement = ("Getting " + str(numLinks) +
                    " article links from the feed at " + RSSFeed + "...")
    vprint(verbose, announcement)

    parsedFeed = feedparser.parse(RSSFeed)

    # Walk the entries in feed order and stop once the requested count is hit
    articleLinks = []
    for position, entry in enumerate(parsedFeed.entries):
        if position >= numLinks:
            break
        articleLinks.append(entry.link)

    vprint(verbose, "Article links acquired.\n")
    return articleLinks
示例#5
0
def analyse_article(article, elems):
    """
    Runs the article's parse() and nlp() steps, then reads the attributes named in elems.
    article must be an object of type newspaper.article.
    elems must be a list of tags (attribute names) to read from article.

    Returns: dictionary mapping each tag in elems to the matching attribute
    value of article.
    """

    vprint(verbose, "Parsing and analysing article...")

    # newspaper-supplied processing: parse first, then the NLP pass
    article.parse()
    article.nlp()

    # One dictionary entry per requested tag, looked up by attribute name
    collected = {tag: getattr(article, tag) for tag in elems}

    vprint(verbose, "Parsing and analysis complete.")

    return collected