Example #1
def story_top_image(stories_id):
    story = mc.story(stories_id)
    # use the tool key so anyone can see these images
    story_html = apicache.story_raw_1st_download(TOOL_API_KEY, stories_id)
    article = newspaper.Article(url=story['url'])
    article.set_html(story_html)
    article.parse()
    return jsonify({
        'top': article.top_image,
        'all': list(article.images),
    })
Example #2
def scrap_article(article_link):
    article = newspaper.Article(article_link)
    article.download()
    article.parse()

    raw_paragraph = article.text
    paragraph_list = [p for p in raw_paragraph.split("\n") if p]
    return {
        "title": article.title,
        "paragraph_list": paragraph_list
    }
Example #3
def all_the_content(content, article_database_ref, reload_pundits=False):
    """

    :param content: this is the mongo object containing our content up to now
    :param reload_pundits: if true, pundits are re-scraped every time
    :return: returns keywords, entities, and newpundits, as well as storing them in the mongo object for the article
    """

    reload_pundits = True  # NOTE: unconditionally overrides the reload_pundits argument above

    article = newspaper.Article(content['url'])
    article.download()
    article.parse()
    article.nlp()

    print "HERE ARE THE NEWSPAPER KEYWORDS", article.keywords

    content['keywords'] = ""
    content['entities'] = ""

    # if not 'keywords' in content:
    #     content['keywords'] = [x for x in get_keywords(content['text'])
    #         if x['count'] > 2]
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #         {'$set': {'keywords': content['keywords']}})
    #
    # if not 'entities' in content:
    #     content['entities'] = get_entities(content['text'])
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #         {'$set': {'entities': content['entities']}})

    if 'newpundits' not in content or reload_pundits:
        content['newpundits'] = []
        dupe_list = []

        snippets, ratios = pundits.keyword_match(article_database_ref,
                                                 article.keywords)
        content['newpundits'] = snippets

        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {
                            'newpundits': content['newpundits']
                        }})

    if not content['newpundits']:
        print("nothing to see here!")
        failed_snippet = {}
        failed_snippet['name'] = "#shambles"
        failed_snippet['text'] = "we can't seem to find anything."
        content['newpundits'] = [[failed_snippet]]
    else:
        print("HERE ARE NEW PUNDITS:", content['newpundits'])

    return content['keywords'], content['entities'], content['newpundits']
Example #4
def get_article(url):
    article = newspaper.Article(url, language=u'zh')
    try:
        article.download()
        article.parse()
    except Exception as e:
        print(u"Something go wrong...Cannot download it...")
        pass
    article_title = article.title
    article_text = article.text
    return article_title, article_text
Example #5
 def getKeywords(self):
     # UGLY HACK WARNING
     # if a site has a specific scraper written for it, Newspaper is never involved - but Newspaper's keyword functionality is really good and I don't want to write my own function for it
     # so I'm creating a newspaper.Article object and forcibly setting attributes to allow the natural language processing to work and give me keywords
     a = newspaper.Article(self.url)
     a.text = self.text
     a.title = self.title
     a.download_state = 2  # nlp() function below uses direct comparisons to check for download state so I'm getting away with setting it to something arbitrary
     a.is_parsed = True
     a.nlp()
     return a.keywords
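A less invasive variant of the same trick: Example #1 above shows that set_html() marks an Article as downloaded, so feeding it a stub HTML document avoids poking download_state directly. A minimal sketch, assuming set_html() behaves as in Example #1 and that text and title remain plain assignable attributes (the function name is hypothetical):

import newspaper

def keywords_from_scraped_text(url, title, text):
    # Sketch: reuse newspaper's NLP on text scraped by a site-specific scraper.
    # set_html() flags the article as downloaded (as in Example #1), so
    # parse() and nlp() run without a network fetch.
    a = newspaper.Article(url)
    a.set_html(u"<html><body><p>{}</p></body></html>".format(text))
    a.parse()
    a.title = title   # override with the title we already have
    a.text = text     # keep our own cleaner text for keyword extraction
    a.nlp()
    return a.keywords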
Example #6
def get_text_from_url(url,
                      session,
                      cleanwriter,
                      errorwriter,
                      allow_redirects=False,
                      verify=True):

    url_idx = url[0]
    url_str = url[1]

    try:
        response = session.get(url_str,
                               allow_redirects=allow_redirects,
                               verify=verify)
        response.close()

    except (ConnectionError, InvalidSchema) as e:
        errorwriter.writerow([url_str, e.__class__.__name__])
        response = None

        print(("#%s:" % url_idx), e.__class__.__name__, url_str)

        pass

    except (MissingSchema, TooManyRedirects, RetryError) as e:
        errorwriter.writerow([url_str, e.__class__.__name__])
        response = None

        print(("#%s:" % url_idx), e.__class__.__name__, url_str)

        pass

    if response is not None:
        if response.ok:
            article = newspaper.Article(url_str)
            article.download()

            # See https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L31
            if article.download_state == 2:
                article.parse()
                article.nlp()
                date, time = get_date_time(article.html)
                cleanwriter.writerow([
                    article.text, article.title, article.keywords, url_str,
                    article.tags, article.meta_keywords, date, time
                ])

        else:
            errorwriter.writerow([url_str, response.status_code])
            print("#%s: Error with status code %s for URL: %s" %
                  (url_idx, response.status_code, url_str))

    else:
        print("%s is not a valid URL" % url_str)
Example #7
 def get_article(self, url):
     article = newspaper.Article(url, language=u'zh')
     try:
         article.download()
         article.parse()
     except Exception as e:
         self.log.info(
             u"Something go wrong:\n{}\nCannot download it...".format(e))
         pass
     article_title = article.title
     article_text = article.text
     return article_title, article_text
Example #8
def save_raw_html_files(graph, page_id, num_posts, html_save_loc):
    #Unrelated to project. Saving raw html files for TT's work
    page_posts = graph.get_connections(page_id, "posts", limit=num_posts)
    page_posts_data = page_posts['data']

    article_index = 0
    success = 0
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(30)

    for post in page_posts_data:
        try:
            print("Examining article {n}".format(n=article_index))
            attachments = graph.get_connections(post['id'], "attachments")
            link_in_post = attachments['data'][0]['url']

            driver.get(link_in_post)
            redirected_url = driver.current_url  #Javascript redirect smh

            if "facebook.com" in redirected_url:
                print("Redirect failed - probably not an article")
                continue

            article = newspaper.Article(redirected_url, "en")
            article.download()

            html_file = article.html
            #final_save_loc = html_save_loc + str(article_index) + ".html"
            final_save_loc = html_save_loc + post['id'] + ".html"

            f = open(final_save_loc, 'w')
            f.write(html_file)
            f.close()

            article_index = article_index + 1
            success = success + 1
        except KeyError:
            print("This article has no attachments or url on facebook")
            continue
        except TimeoutException:
            print(
                "Time out exception has been thrown. Just go to next article, since we don't care about any particular article."
            )
            continue
        except UnicodeEncodeError:
            print(
                "UnicodeEncodeError thrown. Just go to the next one lol, too many possible reasons why"
            )
            continue

    print("Number of successful html files downloaded: {n}".format(n=success))
    driver.close()
    return
Example #9
    def extract_headline(self):

        try:
            self.article = newspaper.Article(self.news_url)
            self.article.download()
            self.article.parse()
        
        except newspaper.article.ArticleException:  # list possible errors in case of any exception
            print("\nCONNECTION/URL ERROR: There may be a problem with your connection or the URL entered may be invalid")
            self.article.title = "Invalid URL/Could not extract title"

        return self.article.title.strip()
Example #10
 def parse(self, response):
     newsItem = newsPaperItem()
     article = newspaper.Article(response.url)
     article.download()
     article.parse()
     nltk.download('punkt')
     article.nlp()
     newsItem["Author_Name"] = article.authors
     newsItem['Publication_Date'] = article.publish_date
     newsItem['Keywords'] = article.keywords
     newsItem['Article_text'] = article.text
     yield newsItem
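Calling nltk.download('punkt') inside parse() re-checks the corpus on every scraped response; downloading it once at module import (or in a setup step) keeps the spider callback lean. A small sketch:

import nltk

# Run once, e.g. at module import or in a project setup script, so that
# article.nlp() finds the punkt tokenizer without a per-request download.
nltk.download('punkt', quiet=True)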
Example #11
    def link_pull(self, url):
        parser = newspaper.Article(url, request_timeout=10)
        try:
            parser.download()
            parser.parse()
        except (newspaper.article.ArticleException, ValueError):
            return (None, None, [])
        article = parser.text
        title = parser.title
        img = parser.top_image

        return (article, title, [img])
Example #12
def findArticle(links):
    articles = []

    for l in links:
        url = l.strip()
        a = newspaper.Article(url, language='en')
        a.download()
        try:
            a.parse()
            articles.append(a)  # collect the successfully parsed article
        except newspaper.article.ArticleException:
            print("Article not found")
    return articles, links
Example #13
 def scrape(self):
     for source in [self.queue[i].get_urls() for i in range(len(self.queue))]:
         if source:
             for date in source:
                 for url in date:
                     entry = {}
                     article = nw.Article(url)
                     article.download()
                     article.parse()
                     src = re.search('www.(.*).co',url).group(0).replace('.','')
                     entry.update(title=article.title, source=src, text=article.text)
                     self.results = self.results.append(entry,ignore_index=True)
     self.save()
Example #14
def extract_article(url):
    """Function that takes the url string of a news article and returns the
    title and text of the article as a Python dictionary. Built on top of
    Newspaper's article scraping & curation library."""
    link = newspaper.Article(url)
    link.download()
    link.parse()

    article = {}
    article["title"] = link.title
    article["text"] = link.text

    return (article)
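A quick usage sketch for the function above (the URL is a hypothetical placeholder):

if __name__ == "__main__":
    story = extract_article("https://example.com/some-news-story")
    print(story["title"])
    print(story["text"][:200])  # first 200 characters of the body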
Example #15
 def extract(self, url, html_text: str):
     doc = newspaper.Article(url)
     doc.download(input_html=html_text)
     doc.parse()
     self.content = {
         'url': url,
         'text': doc.text,
         'title': doc.title,
         'publish_date': doc.publish_date,
         'top_image_url': doc.top_image,
         'authors': doc.authors,
         'extraction_method': METHOD_NEWSPAPER_3k,
     }
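The download(input_html=...) call above also makes it easy to hand newspaper HTML fetched through your own HTTP client, for instance when custom headers or a shared session are needed. A minimal sketch, assuming the requests library is available (the function name is hypothetical):

import requests
import newspaper

def extract_from_prefetched(url):
    # Fetch the page ourselves, then let newspaper parse the supplied HTML,
    # mirroring the extract() method above.
    response = requests.get(url, timeout=10)
    doc = newspaper.Article(url)
    doc.download(input_html=response.text)
    doc.parse()
    return doc.title, doc.text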
Example #16
def xrun(suburls):
    sh = StructHtml(feilds, 'ex001.xlsx')
    for v in suburls:
        # print type(v)
        url = getInnerPageURLs(v)[0]
        url = "%s%s" % (mainHTTP, url)
        art = newspaper.Article(url, language='zh')
        art.download()
        art.parse()
        sh.getSruct(art.text, art.html)
        yield (v, sh.format_txt)
        pass
    pass
Example #17
 def retrieveContent(self, link):
     try:
         a = newspaper.Article(link)
         a.download()
         a.parse()
         text = a.text
         a.nlp()
         self.keywords = a.keywords
         self.retrieved = str(datetime.datetime.utcnow())
         return a.text
     except Exception as e:
         log.error("Exception retrieving %s" % (link))
         log.exception(e)
Example #18
    def set_article(self):
        try:
            self.article = newspaper.Article(self.url, keep_article_html=True)
        except Exception as e:
            print(e)

        self.article.download()
        if not self.article.is_downloaded:
            time.sleep(1)
        self.article.parse()
        if not self.article.is_parsed:
            time.sleep(1)
        self.article.nlp()
Example #19
def getArticle(url):
    import dateutil.parser
    article = newspaper.Article(url, keep_article_html=True)
    try:
        article.download()
        article.parse()
    except Exception as e:
        raise e
    date = article.publish_date or dateutil.parser.parse(
        extractDate(article.html)).strftime('%Y-%m-%d %H:%M:%S')
    if not date: raise Exception("Cannot find date")
    return (url, article.title, ','.join(article.authors), str(date),
            article.text.replace('\n', ' '))
Example #20
def getText(url):
    try:
        time.sleep(1 / 5)
        a = newspaper.Article(url, language='zh', memoize_articles=False)
        a.download()
        a.parse()
        text = '    ' + a.text
    except Exception as e:
        time.sleep(random.randint(1, 10) / 10)
        print("下载失败:" + url)
        print(e)
        text = getText(url)
    return text.replace('\n\n', '\n\n    ')
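Note that getText() retries by calling itself, so a URL that never downloads will recurse until Python's recursion limit is hit. A bounded-retry variant is sketched below (the function name and retry cap are hypothetical; the download/parse/indent logic mirrors the code above):

import random
import time
import newspaper

def get_text_bounded(url, retries=3):
    # Same behaviour as getText(), but with a retry cap instead of
    # unbounded recursion.
    for _ in range(retries):
        try:
            time.sleep(1 / 5)
            a = newspaper.Article(url, language='zh', memoize_articles=False)
            a.download()
            a.parse()
            text = '    ' + a.text
            return text.replace('\n\n', '\n\n    ')
        except Exception as e:
            print("Download failed: " + url)
            print(e)
            time.sleep(random.randint(1, 10) / 10)
    return ''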
Example #21
 def getText(self, url):
     try:
         time.sleep(3)
         a = newspaper.Article(url, language='zh')
         a.download()
         a.parse()
         text = '    ' + a.text
     except Exception as e:
         time.sleep(random.randint(1, 10))
         print("下载失败:" + url)
         print(e)
         text = self.getText(url)
     return text.replace('\n\n', '\n\n    ')
Example #22
 def build_article_object(article_url):
     '''Build a formatted string with the article title, summary, and url'''
     log.debug("Building article object for article {0}".format(article_url))
     article = newspaper.Article(article_url)
     log.debug("Downloading article {0}".format(article_url))
     article.download()
     log.debug("Finished downloading article {0}, parsing".format(article_url))
     article.parse()
     log.debug("Finished debugging {0}, running nlp".format(article_url))
     article.nlp()
     article_str = "{0} ({1})\n{2}\n".format(
         article.title.encode('ascii', 'ignore'), article_url, article.summary)
     output_strs.append(article_str)
Example #23
def testurl(url, newsClassifier):
    a = newspaper.Article(url)
    a.download()
    a.parse()
    a.nlp()
    l1 = get_named_entities(a.text)
    author = "default"
    try:
        author = a.authors[0]
    except:
        print("Not found")
    art = add_article(a.title, a.summary, url, author, l1)
    test_keywords(art, newsClassifier)
Example #24
    def __init__(self, title=None, url=None, pubDate=None, rssFeed=None):
        super(Article, self).__init__()
        #        self._title = title if title != None else ''
        #        self._url = url if url != None else ''
        #        self._pub_date = pubDate if pubDate != None else ''

        self._title = title
        self._url = url
        self._pub_date = pubDate
        self._rssFeed = rssFeed

        if self._title is not None and self._url is not None and self._pub_date is not None:
            self._articleParse = newspaper.Article(self._url)
Example #25
def get_string_data():
    # Read the text data
    # Fetch the article
    article = newspaper.Article('http://www.bjnews.com.cn/news/2019/07/02/598100.html', language='zh')
    # Download the article
    article.download()
    # Parse the article
    article.parse()
    # Run NLP processing on the article
    article.nlp()
    # Join the keywords produced by the NLP step
    string_data = "".join(article.keywords)
    return string_data
Example #26
 def getArticle(self,url):
   article = newspaper.Article(url)
   for i in range(5):  
     article.download()
     print("Retry:",i)
     html = article.html
     if html and len(html)>0:
       break
     
   article.parse()
   article.nlp()
      
   return article
Example #27
def calculate_article_word_count(url):
    config = newspaper.Config()
    config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                                "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                "Chrome/64.0.3282.186 Safari/537.36"
    article = newspaper.Article(url, config=config)
    article.download()
    article.parse()

    if (len(article.text.split())) < 200:
        raise ValidationError('Could not find article')

    return len(article.text.split()) + len(article.title.split())
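Config is also where the per-request timeout from Example #11 can live, so a single object carries all HTTP settings. A brief sketch, assuming request_timeout is a Config attribute (as suggested by the keyword Example #11 passes straight to Article()); the URL and timeout value are illustrative:

import newspaper

config = newspaper.Config()
config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                            "AppleWebKit/537.36 (KHTML, like Gecko) " \
                            "Chrome/64.0.3282.186 Safari/537.36"
config.request_timeout = 10  # assumption: same option Example #11 passes to Article()

article = newspaper.Article("https://example.com/some-story", config=config)
article.download()
article.parse()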
Example #28
def newspaper_read(article_link, continue_var=0):
    #continue variables
    max_allowed = 7999
    start_character = max_allowed * continue_var
    end_character = start_character + max_allowed
    text = ''
    n_entity = newspaper.Article(article_link)
    n_entity.download()
    n_entity.parse()
    cleaned = n_entity.text.encode('ascii', errors='ignore').decode('ascii')
    cleaned = cleaned.replace('\n', '').replace('\t', '')
    text += cleaned[start_character:end_character]
    return text
Example #29
def main(argv=None):
    urls = []
    if len(argv) < 2:
        raise FileNotFoundError(
            'Please input a file with urls as the first argument')
    with open(argv[1]) as f:
        for line in f.readlines():
            line = line.rstrip("\n")
            # url will be the last "column" in line
            # this leaves room for user-created tags
            if line != '':
                line = line.split()[-1]
                urls.append(line)
    articles = []
    print("downloading")
    for url in urls:
        print(".", url)
        article = newspaper.Article(url, language='en')
        article.download()
        article.parse()
        articles.append([article.title, article.text])
    print("done downloading")
    data_matrix = get_data_matrix(articles)
    # write_matrix(data_matrix)
    # title_combinations = article_combinations(articles)
    # euclidean = get_similarity(title_combinations, data_matrix, euclidean_similarity)
    # cosine = get_similarity(title_combinations, data_matrix, cosine_similarity)
    # jaccard = get_similarity(title_combinations, data_matrix, jaccard_similarity)
    # similarity_matrix = get_similarity_matrix(title_combinations,
    #                                           euclidean,
    #                                           cosine,
    #                                           jaccard)
    # write_matrix(similarity_matrix)

    print("start euclidean")
    euclidean_clusters = k_means(data_matrix, 5, euclidean_similarity)
    print("euclidean", [i[1] for i in euclidean_clusters])
    print("-------\nstart cosine")
    cosine_clusters = k_means(data_matrix, 5, cosine_similarity)
    print("cosine", [i[1] for i in cosine_clusters])
    print("-------\nstart jaccard")
    jaccard_clusters = k_means(data_matrix, 5, jaccard_similarity)
    print("jaccard", [i[1] for i in jaccard_clusters])

    euclidean_sse = sse(data_matrix, euclidean_clusters)
    cosine_sse = sse(data_matrix, cosine_clusters)
    jaccard_sse = sse(data_matrix, jaccard_clusters)

    print(euclidean_sse)
    print(cosine_sse)
    print(jaccard_sse)
Example #30
    def parse_item(self, response):
        sel = Selector(response)

        try:
            # Body text
            new = newspaper.Article(url=response.url, language='zh')
            new.download()
            new.parse()
            content = re.sub(r'\s|\W', '', new.text)
            print(content)
            # Title
            if sel.xpath("//h1[@class='entry-title']/text()").extract_first():
                title = sel.xpath(
                    "//h1[@class='entry-title']/text()").extract_first()
                # print(title)
            else:
                pass
            # Time
            if sel.xpath("//a[1]/time[@class='entry-date']").extract_first():
                time = sel.xpath("//a[1]/time[@class='entry-date']//text()"
                                 ).extract_first()
                # print(time)
            else:
                pass
            # Image URLs
            if sel.xpath(
                    "//img[@class='aligncenter size-full wp-image-1142']/@src"
            ).extract_first():
                img_url = sel.xpath(
                    "//img[@class='aligncenter size-full wp-image-1142']/@src"
                ).extract()
                for url in img_url:
                    print(url)
            else:
                pass
            # Source
            if sel.xpath("//div[@class='entry-content']/p/text()").extract():
                # Content and source together
                content = sel.xpath(
                    "//div[@class='entry-content']/p/text()").extract()
                if '来源' in content[-1]:
                    print(content[-1])
                    # print(content,response.url)
                else:
                    pass
            else:
                pass
            print(response.url)

        except:
            pass