Example #1
def parse_article(url, min_words_count=jg.MIN_WORDS_TO_SCRAPE):
    """ We download an article by ourselves so that we do it behind the Tor
    network and with a random user agent (Don't let Newspaper do it!).
    Then we fool Newspaper to think that it was the one who downloaded it so we
    can parse it and return the article.

    Returns None if the article is smaller than min_words_count.
    """

    try:
        response = get_page(url)
    except Exception as err:
        update_log.error('Error in get_page()')
        update_log.error(err)
        return None

    if response is not None:
        article = ArticleParser(url="http://something")
        article.html = response.content
        article.download_state = 2

        try:
            article.parse()
        except Exception as err:
            update_log.error('Error in article.parse()')
            update_log.error(err)
            return None
        else:
            add_url_to_blacklist(url)
            if len(article.text.split(' ')) >= min_words_count:
                return article

    return None
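The snippet above depends on project-specific helpers (get_page, update_log, add_url_to_blacklist, jg.MIN_WORDS_TO_SCRAPE, and the ArticleParser alias). A minimal, self-contained sketch of the same "pre-downloaded HTML" trick, assuming plain requests and the stock newspaper.Article class (the word threshold is illustrative, not the project's constant):

import requests
from newspaper import Article

def parse_html_manually(url, min_words=300):
    """Fetch the page ourselves, then hand the HTML to Newspaper."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    article = Article(url)
    article.html = response.content
    # Tell Newspaper the download already happened (2 == SUCCESS).
    article.download_state = 2
    article.parse()

    if len(article.text.split()) >= min_words:
        return article
    return None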
Example #2
    def _get_content_from_url(self, url):
        """Take a single URL and return the article content and title."""
        try:
            r = requests.get(url, timeout=6)
        except requests.exceptions.Timeout as e:
            # Maybe set up for a retry
            print(e)
            return ' ', ' '
        except requests.exceptions.RequestException as e:
            print(e)
            return ' ', ' '

        # save the raw page to a file
        with open('file.html', 'wb') as fh:
            fh.write(r.content)

        a = Article(url)

        # set html manually from the saved file
        with open('file.html', 'rb') as fh:
            a.html = fh.read()
        # need to set download_state to 2 for this to work
        a.download_state = 2

        a.parse()

        # Now the article should be populated
        title = a.title
        content = re.sub("\n\n", " ", a.text)
        return content, title
Example #3
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    ## does not work!
    myarticle = Article('https://www.example.org/test/')
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None:
        return None
    date = convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return date
Example #4
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    # throws error on the eval_default dataset
    try:
        myarticle = Article(htmlstring)
    except (TypeError, UnicodeDecodeError):
        return None
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None or myarticle.publish_date == '':
        return None
    return convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S',
                        '%Y-%m-%d')
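Both run_newspaper variants rely on an external convert_date helper. If only the reformatting step is needed, a hedged standard-library equivalent (assuming publish_date comes back as a datetime, which is what Newspaper normally returns) could look like this:

from datetime import datetime

def reformat_publish_date(publish_date):
    """Render Newspaper's publish_date as 'YYYY-MM-DD', or None if absent."""
    if not publish_date:
        return None
    if isinstance(publish_date, datetime):
        return publish_date.strftime('%Y-%m-%d')
    # Fall back to parsing a string such as '2021-03-05 12:00:00'.
    return datetime.strptime(str(publish_date),
                             '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')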
Example #5
def extract_data(url, bert_summary):
    article = Article(url)
    print("article object created")
    article.download()
    if article.download_state != ArticleDownloadState.SUCCESS:
        article.html = urllib.request.urlopen(url).read()
        # Hacking the library
        article.download_state = ArticleDownloadState.SUCCESS
    print("download completed")
    article.parse()
    print("parsing completed")

    top_image = article.top_image
    title = article.title

    if bert_summary:
        print("extracting bert summary")
        summary = extract_bert_summary(article.text)
    else:
        print("extracting short summary")
        summary = extract_short_summary(article)

    return summary, top_image, title
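A short usage sketch for extract_data (the URL is a placeholder; extract_bert_summary and extract_short_summary are assumed to be defined elsewhere in the same module):

summary, top_image, title = extract_data('https://www.example.org/some-article/',
                                         bert_summary=False)
print(title)
print(top_image)
print(summary)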
Example #6
    def parse_article(self, response):
        news_id = 19684  #response.meta.get('news_id')

        # save to file
        with open(str(news_id) + '.html', 'wb') as fh:
            fh.write(response.body)
        article = Article(response.url)
        # set html manually
        with open(str(news_id) + '.html', 'rb') as fh:
            article.html = fh.read()
        os.remove(str(news_id) + '.html')
        # need to set download_state to 2 for this to work
        article.download_state = 2
        article.parse()
        article.nlp()
        date = article.publish_date
        keywords = str([x.replace("'", "''")
                        for x in article.keywords]).replace('"', '\'')
        content = article.text.replace("'", "''")
        summary = article.summary.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([x.replace("'", "''")
                    for x in article.meta_keywords]).replace('"', '\'')

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", '
            + '"Summary", "Authors", "Tags", "Title") ' + 'VALUES (' +
            str(news_id) + ', ' + str(date) + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + summary + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], \'' +
            title + '\')')

        # get main article without comments
        content = extract_content(response.text).replace("'", "''")

        # get article and comments
        content_comments = '[\'' + extract_content_and_comments(
            response.text).replace("'", "''") + '\']'

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") '
            + 'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
            str(content_comments) + '::text[])')

        date = articleDateExtractor.extractArticlePublishedDate(
            articleLink=response.url, html=response.text)
        if date is not None:
            dbconnector.execute(
                self.conn, 'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
                'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

        g = Goose()
        article = g.extract(raw_html=response.text)
        date = article.publish_datetime_utc
        keywords = str([x.replace("'", "''")
                        for x in article.tags]).replace('"', '\'')
        content = article.cleaned_text.replace("'", "''")
        summary = article.meta_description.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([
            x.replace("'", "''") for x in article.meta_keywords.split(",")
        ]).replace('"', '\'')
        tweets = str([x.replace("'", "''")
                      for x in article.tweets]).replace('"', '\'')

        dbconnector.execute(
            self.conn, 'INSERT INTO "ParsedNews-goose"(' +
            '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
            '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' +
            str(news_id) + ', ' + date + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + str(summary) + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) +
            '::text[], ARRAY ' + str(tweets) + '::text[], \'' + str(title) +
            '\')')

        pass
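The INSERT statements above build SQL by string concatenation and escape quotes by hand with .replace("'", "''"). A hedged alternative for the first INSERT, assuming self.conn is a DB-API connection such as psycopg2 (which adapts Python lists to Postgres arrays), passes the values as query parameters instead; it corresponds to the point right after article.nlp(), before article is rebound to the Goose result:

# Sketch only: assumes self.conn is a psycopg2-style connection.
with self.conn.cursor() as cur:
    cur.execute(
        'INSERT INTO "ParsedNews-newspaper"'
        '("IDNews", "Date", "Content", "Keywords", "Summary", "Authors", "Tags", "Title") '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
        (news_id, article.publish_date, article.text, article.keywords,
         article.summary, article.authors, list(article.meta_keywords),
         article.title))
self.conn.commit()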
Example #7
def fetch_main_content(html: str) -> Article:
    a = Article(url='')
    a.html = html
    a.download_state = 2
    a.parse()
    return a
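A brief usage sketch for fetch_main_content, assuming the HTML has already been fetched (here with requests, which is not part of the snippet above):

import requests

html = requests.get('https://www.example.org/test/', timeout=10).text
article = fetch_main_content(html)
print(article.title)
print(article.text[:200])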
Example #8
    source_list = ast.literal_eval(e['source_list'])

    # find the position of o_url in the source list (needed to retrieve the correct .html file)
    o_idx = source_list.index(o_url)
    a = Article(o_url)

    #finds the html file
    article_alias = a_url.rstrip("/").split("/")[-1]
    article_folder = html_folder+"/"+article_alias
    o_html_filename = article_folder+"/"+str(o_idx)+".html"

    # set html manually
    with open(o_html_filename, 'rb') as fh:
        a.html = fh.read()
        # need to set download_state to 2 for this to work
        a.download_state = 2
        a.parse()
        # Now the article should be populated
        print(a.text)


gold_df.to_csv(cwd+"/datasetVeritas3.csv", index=False)

print("average number of annotations per doc:", sum(lenlen)/len(lenlen))
lenlen.sort(reverse = True)
print(lenlen[:200])
print("max num of annotations on the same source")
print(max(lenlen))
print("NEW")
print(count_array.shape)
print(b_array.shape)
Example #9
def scrape(url):
    """
    Scrapes an article from the 'url', extracts meta data using Nespaper3K package
    
    Parameters:
    --------
    url         : str, url to scrape
    
    Returns:
    --------
    doc         : dict,
        {
            'url'      : url,
            'date'     : article publish_date,
            'title'    : article title,
            'text'     : article cleaned_text,
            'keywords' : article meta_keywords,
            'summary'  : article summary
        }
    False       : bool, if get request fails or html < 500
    """
    from newspaper import Article, Config
    import re

    logger.info(f"SCRAPE: trying {url}")
    config = Config()
    config.memoize_articles = False
    config.fetch_images = False
    config.language = 'en'
    config.browser_user_agent = get_ua()
    config.request_timeout = 5
    config.number_threads = 8

    response = get_html_from_url(url)

    if response['status_code'] and response['html']:
        try:
            article = Article(url=url, config=config)
            article.download_state = 2
            article.html = response['html']
            article.parse()
            article.nlp()

            words_count = len((article.text).split())

            if words_count > 200:
                logger.info(
                    f'SCRAPE: Extracted TEXT from URL: {url}\n Title: "{article.title}"'
                )
                return {
                    'url': url,
                    'datetime': article.publish_date,
                    'title': article.title,
                    'text': " ".join(re.split(r'[\n\t]+', article.text)),
                    'keywords': article.keywords,
                    'summary': article.summary
                }
            else:
                logger.info(f'''SCRAPE: Could not extract TEXT from {url}\n 
                    Article too short: {words_count} words''')
        except Exception as e:
            logger.info(
                f'SCRAPE: Could not extract TEXT from {url}\n Error: {e}')
    else:
        logger.info(f'SCRAPE: Could not extract TEXT from {url}')
    return False
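A brief usage sketch for scrape (the URL is a placeholder; get_ua, get_html_from_url, and logger are assumed to be provided by the surrounding module):

doc = scrape('https://www.example.org/some-article/')
if doc:
    print(doc['title'])
    print(doc['summary'])
else:
    print('scrape failed or the article was too short')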