Example #1
import pickle

from newsplease import NewsPlease
from tqdm import tqdm


def get_data(path, destination):
    # Collect candidate URLs: first token of each line, skipping blank lines
    # and tokens too short to be a real link.
    links_set = set()
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                continue
            link = tokens[0]
            if len(link) < 10:
                continue
            links_set.add(link)
    links_list = list(links_set)

    final_outputs = {}
    important_keys = [
        'authors', 'date_publish', 'description', 'image_url', 'language',
        'title', 'maintext'
    ]

    # Crawl in batches of 200 URLs; stepping the range keeps the final
    # partial batch instead of silently dropping it.
    batch_size = 200
    for start in tqdm(range(0, len(links_list), batch_size)):
        batch = links_list[start:start + batch_size]
        values = NewsPlease.from_urls(batch, timeout=6)
        for url, article in values.items():
            paper_data = {}
            for im_key in important_keys:
                # Copy only the fields of interest from the NewsPlease article.
                paper_data[im_key] = article.__dict__[im_key]
            final_outputs[url] = paper_data

    with open(destination, 'wb') as out_file:
        pickle.dump(final_outputs, out_file)
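A minimal usage sketch, continuing directly from the snippet above; both file names are hypothetical and only illustrate how the resulting pickle might be read back:

# Hypothetical input and output paths, for illustration only.
get_data('links.txt', 'articles.pkl')

# The pickle holds a dict mapping each crawled URL to its extracted fields.
with open('articles.pkl', 'rb') as f:
    crawled = pickle.load(f)
for url, fields in list(crawled.items())[:3]:
    print(url, fields['title'])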
Example #2
def parallel_crawl(urls):
    # Split the URL list into batches and crawl each batch with NewsPlease.
    url_batches = get_url_batches(urls)

    crawled_data = []
    for batch in url_batches:
        data = NewsPlease.from_urls(batch)
        crawled_data.append(data)

    # Merge the per-batch results into a single url -> article mapping.
    data_crawled = get_dict(crawled_data)
    logger.info(f"got response: {data_crawled}")

    return data_crawled
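The helpers `get_url_batches` and `get_dict` are called but not shown in the snippet above; a minimal sketch of what they might look like, assuming fixed-size batches and a simple dict merge (the names match the call sites, but the bodies are assumptions, not the original project's code):

def get_url_batches(urls, batch_size=100):
    # Split the URL list into consecutive slices of batch_size URLs.
    return [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]


def get_dict(crawled_data):
    # Merge the per-batch {url: article} dicts into one dict.
    merged = {}
    for batch_result in crawled_data:
        merged.update(batch_result)
    return merged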
Example #3
def article_crawler(batches):
    # Crawl each batch of URLs with NewsPlease and pickle the result of every
    # batch to its own numbered file.
    n = 0
    for i, url_batch in enumerate(batches):
        try:
            print(n)
            slice_name = str(i) + '-NewsPlease-articleCrawl.p'
            article_information = NewsPlease.from_urls(url_batch)
            print(article_information)
            with open(slice_name, 'wb') as out_file:
                pickle.dump(article_information, out_file)
            n += 1
        except Exception:
            # Skip batches that fail outright and continue with the next one.
            continue
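A hedged usage sketch: assuming `batches` is a list of URL lists, the crawler above could be driven like this (the URL and the batch size of 50 are placeholders, not values from the original project):

urls = [
    'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html',
    # ... more URLs ...
]
# Split into batches of 50 URLs each; the batch size is an arbitrary choice.
batches = [urls[i:i + 50] for i in range(0, len(urls), 50)]
article_crawler(batches)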
Example #4
def download_articles(search_term, n_articles, start, end=None):
    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
    end_date = start_date if end is None else datetime.datetime.strptime(
        end, "%Y-%m-%d")
    tbs = get_tbs(start_date, end_date)

    urls = find_urls(search_term, tbs, 10)
    valid_articles = []
    while len(valid_articles) < n_articles and len(urls) > 0:
        # Only request as many articles as are still missing.
        articles_left = n_articles - len(valid_articles)
        articles = NewsPlease.from_urls(urls[:articles_left])
        empty, articles = detect_empty_articles(articles)
        for new in articles:
            valid_articles.append(articles.get(new))
        urls = urls[articles_left:]
    return {
        "search_term": search_term,
        "start": start,
        "end": start if end is None else end,
        "articles": valid_articles
    }
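The snippet relies on a `detect_empty_articles` helper that is not shown; a minimal sketch of what such a helper might do, assuming it separates URLs whose extraction produced no main text (the name matches the call site, the behaviour is an assumption):

def detect_empty_articles(articles):
    # Split the {url: article} dict into empty and non-empty results.
    empty = {}
    non_empty = {}
    for url, article in articles.items():
        if article is None or not getattr(article, 'maintext', None):
            empty[url] = article
        else:
            non_empty[url] = article
    return empty, non_empty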
Example #5
# -*- coding: utf-8 -*-
"""Untitled3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nNtRq9yovCwQmXpYk6Ugq1Uufz7tMthT
"""

from newsplease import NewsPlease
import time

url1 = 'https://timesofindia.indiatimes.com/india/bengaluru-firm-to-build-moon-lander-for-nasa-2020-mission/articleshow/69684821.cms'
url2 = 'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp'

# Build a list of 100 copies of the same URL to time a batched crawl.
urls = [url2 for _ in range(100)]

tic = time.time()
articles = NewsPlease.from_urls(urls)
toc = time.time()
print(toc - tic)
print(articles[url2].title)
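For comparison, newsplease also exposes a single-URL API, `NewsPlease.from_url`; a minimal sketch timing one individual fetch of the same page, continuing from the Colab snippet above (timings vary with the network, so treat the numbers as illustrative only):

tic = time.time()
single = NewsPlease.from_url(url2)
toc = time.time()
print(toc - tic)
print(single.title)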

Example #6
import json
import re

import gensim
import pandas as pd
from gensim.corpora import Dictionary
from newsapi import NewsApiClient
from newsplease import NewsPlease
from nltk.corpus import stopwords

# Project-level helpers assumed to be defined elsewhere in the script:
# Article, tokenize, remove_stopwords, format_topics_sentences, get_topics,
# NUM_TOPICS and path_to_lda_all.


def main(argv):
    if len(argv) < 2:
        print("Usage: crowley date output_file")
        return

    date = argv[0]
    outputfile = argv[1]

    print('Date is', date)
    print('Output file is', outputfile)

    newsapi = NewsApiClient(open("token", 'r').read())

    # Get articles urls
    qc_articles = newsapi.get_everything(q='quantum computing',
                                         from_param=date,
                                         language='en',
                                         sort_by='relevancy')
    qp_articles = newsapi.get_everything(q='quantum physics',
                                         from_param=date,
                                         language='en',
                                         sort_by='relevancy')
    all_urls = []
    for article in qc_articles.get('articles'):
        all_urls.append(article.get('url'))
    for article in qp_articles.get('articles'):
        all_urls.append(article.get('url'))

    print("All articles ", all_urls.__len__())

    # Get content of urls
    all_articles = NewsPlease.from_urls(all_urls)

    articles = {}
    for article in all_articles.values():
        articles[article.title] = Article(article.image_url, article.url,
                                          article.maintext)

    # Collect article texts keyed by URL and write them to the output file as JSON
    json_data = {}
    json_article = {}
    for article in articles:

        art_value = articles[article]
        if art_value.url is not None and article is not None and art_value.image_url is not None and \
                art_value.maintext is not None:
            json_article[art_value.url] = art_value.maintext
            print("Successfully wrote article %s " % article)

    json_data['content'] = json_article
    try:
        with open(outputfile, 'w') as out_file:
            out_file.write(json.dumps(json_data))
    except OSError:
        print("Failed to open %s for writing" % outputfile)

    df = pd.read_json(outputfile)
    df.head()
    data = df.content.values.tolist()
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]  # drop e-mail addresses
    data = [re.sub(r'\s+', ' ', sent) for sent in data]  # collapse whitespace
    data = [re.sub("'", "", sent) for sent in data]  # remove single quotes

    words_tokenized = tokenize(data)

    stop_words = stopwords.words('english')
    stop_words.extend([
        'from', 'subject', 're', 'edu', 'use', 'to', 'the', 'of', 'a', 'and',
        'that', 'in', 'is', 'can', 'with', 'for', 'are', 'has'
    ])

    without_stopwords = remove_stopwords(words_tokenized, stop_words)

    try:
        ldamodel_all = gensim.models.ldamodel.LdaModel.load(path_to_lda_all)
    except Exception:
        print("Could not find models on disk! Will train.")
        print("Will generate dictionaries.")
        dictionary_all = Dictionary(without_stopwords)
        print("Will generate corpus")
        corpus_all = [
            dictionary_all.doc2bow(text) for text in without_stopwords
        ]
        print("Will begin training...")
        ldamodel_all = gensim.models.ldamodel.LdaModel(corpus=corpus_all,
                                                       num_topics=NUM_TOPICS,
                                                       id2word=dictionary_all,
                                                       update_every=5,
                                                       chunksize=10000,
                                                       passes=1)
        ldamodel_all.save(path_to_lda_all)
        print("Done training models. Saved them on disk.")

        df_topic_sents_keywords = format_topics_sentences(
            ldamodel=ldamodel_all, corpus=corpus_all, texts=without_stopwords)

        # Format
        df_dominant_topic = df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = [
            'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
            'Text'
        ]

        # Show
        print(df_dominant_topic.head(5))

        # Group top 5 sentences under each topic
        sent_topics_sorteddf_mallet = pd.DataFrame()

        sent_topics_outdf_grpd = df_topic_sents_keywords.groupby(
            'Dominant_Topic')

        for i, grp in sent_topics_outdf_grpd:
            sent_topics_sorteddf_mallet = pd.concat([
                sent_topics_sorteddf_mallet,
                grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)
            ],
                                                    axis=0)

        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

        # Format
        sent_topics_sorteddf_mallet.columns = [
            'Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"
        ]

        # Show
        print(sent_topics_sorteddf_mallet.head())

    all_topics = get_topics(ldamodel=ldamodel_all, num_words=10)
    for i, topic in enumerate(all_topics):
        print(i, topic)
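`tokenize` and `remove_stopwords` are called above without being defined; a minimal sketch of what they commonly look like in gensim-based LDA pipelines (the implementations below are assumptions, not the original project's helpers):

from gensim.utils import simple_preprocess


def tokenize(sentences):
    # Lower-case and tokenize each document, dropping punctuation.
    return [simple_preprocess(str(sent), deacc=True) for sent in sentences]


def remove_stopwords(texts, stop_words):
    # Drop stop words from every tokenized document.
    return [[word for word in doc if word not in stop_words] for doc in texts]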
Example #7
urls = [
		'https://www.am.com.mx/leon',
		'https://www.am.com.mx/sanfranciscodelrincon',
		'https://www.mural.com/',
		'https://www.eldiariodechihuahua.mx/Delicias/',
		'https://www.elsoldeparral.com.mx/',
		'https://www.elnorte.com/',
		'http://www.el-mexicano.com.mx/inicio.htm',
		'https://www.elsudcaliforniano.com.mx/',
		'https://www.diariodequeretaro.com.mx/',
		'https://www.eloccidental.com.mx/',
		'https://www.elsoldemexico.com.mx/',
		'https://www.lavozdelafrontera.com.mx/',
		'https://www.elsoldesanluis.com.mx/',
		'http://www.milenio.com/temas/torreon',
		'http://www.milenio.com/estado-de-mexico',
		'http://www.milenio.com/leon',
		'http://www.milenio.com/hidalgo',
		'http://www.milenio.com/jalisco',
		'http://www.milenio.com/monterrey',
		'http://www.milenio.com/puebla',
		'http://www.milenio.com/tamaulipas',
		'http://www.milenio.com/temas/xalapa'
	]

articles = NewsPlease.from_urls(urls, timeout=3)
#print(len(articles))
for url in urls:
	#dump(articles[url])
	insert_data_into_db(articles[url])
	retrieve_records()
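`insert_data_into_db` and `retrieve_records` are not shown in the fragment above; a minimal sketch of what they might look like using sqlite3 (the database file, table name, and columns are assumptions made for illustration):

import sqlite3

DB_PATH = 'articles.db'  # hypothetical database file


def insert_data_into_db(article):
    # Store a few extracted fields for one article, keyed by URL.
    conn = sqlite3.connect(DB_PATH)
    conn.execute(
        'CREATE TABLE IF NOT EXISTS articles (url TEXT PRIMARY KEY, title TEXT, maintext TEXT)')
    conn.execute(
        'INSERT OR REPLACE INTO articles (url, title, maintext) VALUES (?, ?, ?)',
        (article.url, article.title, article.maintext))
    conn.commit()
    conn.close()


def retrieve_records():
    # Print the URLs and titles stored so far.
    conn = sqlite3.connect(DB_PATH)
    for url, title in conn.execute('SELECT url, title FROM articles'):
        print(url, title)
    conn.close()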