Example #1
def download_articles(sources, data_path, nthreads=2, verbose=False):

    if verbose:
        print("Downloading articles from {} sources with {} threads.".format(
            len(sources), nthreads))

    for source in sources:
        if verbose:
            print("Building source: {}...".format(source.url))
        source.clean_memo_cache()
        source.download_feeds()
        source.generate_articles()
        for article in source.articles:
            print("\tadded: {}".format(article.url))

    news_pool.papers = sources
    news_pool.pool = newspaper.mthreading.ThreadPool(nthreads)

    for source in sources:
        news_pool.pool.add_task(source.download_articles)
    news_pool.join() 

    import yaml
    
    with open(data_path, "w") as f:    
        for source in sources:
            for article in source.articles:
                article.parse()
                meta = {"title": article.title,
                        "author": article.authors,
                        "url": article.url,
                        "source": source.url,
                        "publish_date": article.publish_date}
                yaml.dump({"meta": meta, "text": article.text}, f)
Example #2
def get_articles(output_path):
    config = dict(
        language='pl',
        fetch_images=False,
        MIN_WORD_COUNT=100,
        MIN_SENT_COUNT=5,
        memoize_articles=False,
    )

    papers = {}
    for url in SITES_URLS:
        paper = newspaper.build(url, **config)
        print(f"{url} contains {paper.size()} articles")
        papers[url] = paper

    print("Downloading...")
    news_pool.set(papers.values(), threads_per_source=2)
    news_pool.join()

    print("Parsing...")
    for paper in papers.values():
        paper.parse_articles()

    articles = [
        art.text for paper in papers.values() for art in paper.articles
    ]
    articles = [art.replace("\n", "") for art in articles]

    print(f"Scraped {len(articles)} articles")

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(articles, f)
Example #3
    def download_and_parse(self, url_list=None):
        if not self.submissions and url_list is None:
            logging.error('Must give a "url_list" or "query_reddit" first.')
            return
        self.set_articles(url_list)
        news_pool.set(self.articles)
        news_pool.join()
Example #4
def loadNews(knownrealSites, s):
    articles = Article.objects.all()
    #for a in articles:
    #print(a)
    papers = []
    for url in knownrealSites:
        real_paper = None
        try:
            real_paper = newspaper.build(url)
            papers.append(real_paper)
            print(url + ' contains ' + str(len(real_paper.articles)) + ' ' +
                  s + ' articles')
        except:
            print(url)
            print('url is bad')
            continue
    news_pool.set(papers, threads_per_source=4)
    news_pool.join()
    for paper in papers:
        for article in paper.articles:
            #due to multithreading above we can assume every article has had download called on it.
            #for article in real_paper.articles:
            try:
                #article.download()
                article.parse()
                #print('article.authors:**************************\n');print(article.authors)
                #print('article.text:**************************\n');print(article.text)
                #print('article.url:**************************\n');print(article.url)
                #print('article.title:**************************\n');print(article.title)
                #article.nlp()
                #print('keywords:**************************\n');print(article.keywords)
                #print('summary:**************************\n');print(article.summary)
            except:
                print('issue with download/parse')
                continue
            #x,y,z = tweetParser.getSentiment(url,2000)
            #print(article.publish_date)
            a = Article(
                address=article.url,
                title=article.title,
                body=article.text,
                date=article.publish_date,
                result=s,
                #positive = x,
                #negative = y,
                #neutral = z,
            )
            #article.parse()
            #article.nlp()
            try:
                a.save()
                print(
                    '**************************article SAVED**************************'
                )
            except:
                print(
                    '**************** article failed to save with field **************'
                )
                continue
Example #5
def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
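The papers returned by get_newspapers() already have their articles downloaded by the pool, so a caller only needs to parse them; a short usage sketch mirroring the loops used elsewhere on this page (URLs are placeholders):

papers = get_newspapers(["http://cnn.com", "http://www.bbc.com"])
for paper in papers:
    for article in paper.articles:
        article.parse()   # download already happened inside the pool
        print(article.title, article.url)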
Example #6
def scrape_news():

    #t = time.time()

    ## connect
    client = MongoDB()

    with client as db:

        # #connect (not necessary)
        # connect(db)

        ## multi-threading
        eu_paper = newspaper.build('https://theguardian.com',
                                   memoize_articles=False,
                                   fetch_images=False)
        us_paper = newspaper.build('https://www.cbsnews.com/',
                                   memoize_articles=False,
                                   fetch_images=False)
        hk_paper = newspaper.build('http://scmp.com',
                                   memoize_articles=False,
                                   fetch_images=False)
        jp_paper = newspaper.build('https://www.japantimes.co.jp/',
                                   memoize_articles=False,
                                   fetch_images=False)

        papers = [eu_paper, us_paper, hk_paper, jp_paper]
        news_pool.set(papers, threads_per_source=2)  # (4*2) = 8 threads total
        news_pool.join()

        print("Size of EU paper: " + str(eu_paper.size()))
        print("Size of US paper: " + str(us_paper.size()))
        print("Size of HK paper: " + str(hk_paper.size()))
        print("Size of JP paper: " + str(jp_paper.size()))

        for paper in papers:
            for article in paper.articles:
                try:
                    article.parse()
                    print(len(article.text))
                    if len(article.text) > 100:
                        article.nlp()
                        item = {
                            'url': article.url,
                            'brand': paper.brand,
                            'title': article.title,
                            'text': article.text,
                            'keywords': article.keywords,
                            'summary': article.summary,
                            'date': dt.today(),
                            'date_str': dt.today().strftime('%Y-%m-%d')
                        }
                        db.news_items.insert_one(item)
                except Exception as e:
                    #In case it fails, skip article
                    print(e)
                    print("continuing...")
                    continue
Example #7
def download_articles_from_urls(urls):
    articles = []
    for url in urls:
        articles.append(Article(url))

    news_pool.set_articles(articles)
    news_pool.join()

    trimmed_articles = trim_articles(articles)
    return trimmed_articles
Example #8
    def __init__(self):

        # create list containing news sites to scrape
        self.web_list = ['http://www.foxnews.com','http://www.usatoday.com']

        # setup newspaper to multi-thread news sources 
        self.newsWebList = [newspaper.build(i, memoize_articles=True, fetch_images=False) for i in self.web_list]
        news_pool.set(self.newsWebList, threads_per_source=10)
        news_pool.join()
        self.connectDB()
        self.compareArticle()
Example #9
    def build_sources(self, param):
        replies = list()
        for sources in param:
            replies.append(
                newspaper.build('http://' + str(sources) + '.com',
                                language='en'))

        news_pool.set(replies, threads_per_source=3)
        news_pool.join()

        return replies
Example #10
    def download_all_articles(self):
        logging.info("Downloading all articles...")

        papers = self.create_source_feed_list()

        news_pool.set(papers, threads_per_source=self.THREADS_PER_NEWS_SOURCE)

        # Download feed from all sources in parallel threads
        news_pool.join()

        logging.info("Download complete.")
        logging.info(datetime.now())
Example #11
File: News.py  Project: MuskoM/KCK-proj
    def get_who_articles(self):

        covid_articles = newspaper.build(self.newspaper_link, memoize_articles=False)
        papers = [covid_articles, ]
        news_pool.set(papers, threads_per_source=4)
        news_pool.join()

        for index, article in enumerate(covid_articles.articles):
            print(article.url)
            article.parse()
            with open('sites/articles/article' + str(index) + '.txt', 'w', encoding='utf-8') as write_file:
                write_file.write(str(article.title) + "\n")
                write_file.write(textwrap.fill(article.text, width=120))
Example #12
def main() -> List[List[str]]:
    papers = [newspaper.build(url,
                              memoize_articles=False,
                              fetch_images=False,
                              verbose=DEBUG)
              for url in SITE_URLS]
    news_pool.set(papers, threads_per_source=THREADS_PER_SOURCE)
    news_pool.join()

    articles = []

    for paper in papers:
        articles.extend(get_articles(paper))

    print('Final number of articles:', len(articles))

    return articles
Example #13
    def download_articles(self):
        crawl_start_time = datetime.now()
        if not os.path.exists(self.crawl_dir):
            os.makedirs(self.crawl_dir)
        crawl_path = os.path.join(
            self.crawl_dir, "{}.yaml.gz".format(
                crawl_start_time.strftime("%Y-%m-%d-%H-%M-%S")))
        if self.verbose:
            print("Downloading articles from {} sources with {} threads.".format(
                len(self.sources), self.nthreads))

        for source in self.sources:
            if self.verbose:
                print("Building source: {}...".format(source.url))
            source.download_feeds()
            source.generate_articles()
            for article in source.articles:
                print("\tadded: {}".format(article.url))

        news_pool.papers = self.sources
        news_pool.pool = newspaper.mthreading.ThreadPool(self.nthreads)

        for source in self.sources:
            news_pool.pool.add_task(source.download_articles)
        news_pool.join() 


        total = 0
        with gzip.open(crawl_path, "w") as f:    
            for source in self.sources:
                for article in source.articles:
                    article.parse()
                    total += 1
                    meta = {"title": article.title,
                            "author": article.authors,
                            "url": article.url,
                            "source": source.url,
                            "publish_date": article.publish_date}
                    yaml.dump({"meta": meta, "text": article.text}, f)
        if self.verbose:
            print("Finished crawling. Wrote {} articles to {}".format(
                total, crawl_path))
        with open(self.crawl_manifest, "a") as f:
            yaml.dump({
                "crawl-time": crawl_start_time.strftime("%Y-%m-%d-%H-%M-%S"),
                "summarized": False, "preprocessed": False}, f)
Example #14
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('Slate has %d articles TC has %d articles ESPN has %d articles' %
              (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
Example #15
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print ('slate has %d articles tc has %d articles espn has %d articles'
               % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #16
def download_newspapers(sources):
    '''
    This function downloads the articles from the newspaper given in the sources
    variable and returns them so they can be saved to a MongoDB database.

    '''

    res = []
    paper = []
    l = newspaper.build(sources, memoize_articles=False)
    paper.append(l)
    news_pool.set(paper, threads_per_source=2)
    news_pool.join()

    for r in paper[0].articles:
        res.append(r)

    return res
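The docstring above mentions saving to a MongoDB database, but download_newspapers() only returns the article objects. One way a caller might persist them, sketched with pymongo; the connection string, database, and collection names are assumptions, not part of the original project:

from pymongo import MongoClient

def save_articles_to_mongo(articles, db_name="news", collection_name="articles"):
    # Hypothetical persistence step; adjust the URI to your deployment.
    client = MongoClient("mongodb://localhost:27017/")
    collection = client[db_name][collection_name]
    for article in articles:
        article.parse()   # articles were downloaded by news_pool but not yet parsed
        collection.insert_one({
            "url": article.url,
            "title": article.title,
            "text": article.text,
            "publish_date": article.publish_date,
        })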
Example #17
def collect_news():

    papers = []
    papers.append(newspaper.build('http://cnn.com', memoize_articles=True))
    papers.append(
        newspaper.build('http://www.bbc.com/news', memoize_articles=True))
    papers.append(
        newspaper.build('http://news.sky.com/world', memoize_articles=True))
    papers.append(
        newspaper.build('https://nytimes.com/section/world',
                        memoize_articles=True))
    papers.append(
        newspaper.build('https://washingtonpost.com/world',
                        memoize_articles=True))
    papers.append(
        newspaper.build('http://reuters.com/news/world',
                        memoize_articles=True))

    news_pool.set(papers, threads_per_source=1)
    news_pool.join()
    news_list = []

    categories = fetch_20newsgroups(subset='train', shuffle=True)
    clf = joblib.load(os.path.join(os.path.dirname(__file__), 'model.pkl'))

    for paper in papers:
        for current_article in itertools.islice(paper.articles, 0, 5):
            current_article.download()
            current_article.parse()
            current_article.nlp()

            news_to_add = {
                'title': current_article.title,
                'keywords': current_article.keywords,
                'url': current_article.url,
                'category': news_predictor([current_article.text], categories,
                                           clf),
                'source': paper.brand,
                'collected':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            news_list.append(news_to_add)

    return news_list
Example #18
def extract_articles(url_list):
    """Extracts article text and keywords from url.

    Inputs
    ------
    url_list: list
    Returns
    -------
    generator with keywords parsed from article url list
    """
    articles = [Article(url) for url in url_list]
    news_pool.set(articles)
    news_pool.join()
    r = Rake()
    for article in articles:
        article.parse()
        r.extract_keywords_from_text(article.text)
        article_kwords = r.get_ranked_phrases()
        yield article_kwords
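Because extract_articles() is a generator, keywords are only produced as the caller iterates it; a minimal usage sketch (URLs are placeholders):

urls = ["http://cnn.com", "http://www.bbc.com/news"]
for keywords in extract_articles(urls):
    print(keywords[:10])   # top-ranked RAKE phrases for one article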
Example #19
def main():

    # Build a news source
    # use memoize_articles flag to turn off article caching
    fox = newspaper.build("http://www.foxnews.com", memoize_articles=False)
    print(fox.size())
    msnbc = newspaper.build("http://www.msnbc.com", memoize_articles=False)
    print(msnbc.size())
    bbc = newspaper.build("http://www.bbc.com", memoize_articles=False)
    print(bbc.size())

    papers = [fox, msnbc, bbc]

    news_pool.set(papers, threads_per_source=2)  #6 total
    news_pool.join()

    # extract and save articles
    saveFile("fox.json", downloadAndParse(fox))
    saveFile("msnbc.json", downloadAndParse(msnbc))
    saveFile("bbc.json", downloadAndParse(bbc))
Example #20
    def test_download_works(self):
        """
        """
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('slate has %d articles tc has %d articles espn has %d articles'
              % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #21
def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize_articles = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(sourcefile, 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source,config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)

        # Build
        s.build()

        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source = 4)
    news_pool.join()

    article_parse(sources)
Example #22
def pool():
    #Download all new news articles from our sources
    papers = []
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10
    config.memoize_articles = False

    #Build a model of all articles from the website. Get only those we haven't retrieved before
    reuters_paper = newspaper.build('https://www.reuters.com/', config=config)
    bbc_paper = newspaper.build('https://www.bbc.co.uk/news', config=config)

    #We add the models of the news sources
    papers.append(reuters_paper)
    papers.append(bbc_paper)

    news_pool.set(papers, threads_per_source=8)
    news_pool.join()

    return papers
Example #23
def scrape_articles(domains=DOMAINS):
    """Crawls domains and scrapes new web articles.
    """
    papers = [newspaper.build(s, memoize_articles=False) for s in domains]
    news_pool.set(papers, threads_per_source=1)
    news_pool.join()

    for domain, paper in zip(domains, papers):
        paper_source = parse_source(domain)
        for article in paper.articles:
            article_source = parse_source(article.url)
            if article_source != paper_source:
                continue
            article.parse()
            a = Article(url=article.url, 
                        title=article.title,
                        text=article.text, 
                        image=article.top_image,
                        domain=domain)
            a.save()

    n_articles = sum(map(lambda p: len(p.articles), papers))
    logmsg = '{} articles crawled'.format(n_articles)
    logger.info(logmsg)
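parse_source() is not shown in this snippet; it is used to compare an article's host against its paper's host so off-domain links are skipped. One plausible shape for it, assuming it only normalizes a URL down to its bare host (a hypothetical helper, not the project's actual code):

from urllib.parse import urlparse

def parse_source(url):
    # Hypothetical: "https://www.bbc.com/news/world" -> "bbc.com"
    host = urlparse(url).netloc.lower()
    return host[4:] if host.startswith("www.") else host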
Example #24
def scrape_website(urls):
    papers = []
    for url in urls:
        if url:
            papers.append(newspaper.build(url, memoize_articles=False))

    for paper in papers:
        delete_queue = [] # articles to be deleted
        for article in paper.articles:
            if 'video' in article.url or 'videos' in article.url:
                delete_queue.append(article)

        for article in delete_queue:
            paper.articles.remove(article)

    news_pool.set(papers, threads_per_source=2) # (2*2) = 4 threads in all
    news_pool.join()

    for paper in papers:
        paper.parse_articles()

    es = ElasticStorage.get_instance(dev=False)
    for paper in papers:
        es.store_articles(paper.articles, paper.url)
Example #25
  google = newspaper.build('http://www.usnews.com/')
  print("google built")
  bbc = newspaper.build('http://www.bbc.com/news/world/us_and_canada/')
  print("bbc built")
  nbc = newspaper.build('http://www.nbcnews.com/news/us-news')
  print("nbc built")
  cnn = newspaper.build('http://www.cnn.com/US/')
  print("cnn built")
  abc = newspaper.build('http://abcnews.go.com/US/')
  print("abc built")
  fox = newspaper.build('http://www.foxnews.com/us/index.html')
  print("fox built")

  papers = [yahoo, google, bbc, nbc, cnn, abc, fox]
  news_pool.set(papers, threads_per_source=2)
  news_pool.join()

  for Source in papers:
      for article in Source.articles:
          url = article.url
          htmlcode = article.html

          print(url)
          filename = "html/" + article.title + ".html"
          filename = filename.replace("'", "")
          print(filename.encode('utf-8'))
          htmlfile = open(filename.encode('utf-8'), "wb")
          htmlfile.write(htmlcode.encode('utf-8'))
          htmlfile.close()
          #HTML(filename).write_png(pngfilename)
Example #26
allpapers = newspaper.popular_urls()
x = len(allpapers)

if int(secondarg) < x:
    print("there are that many papers", x)
    sourcearts = []
    for paper in allpapers[int(firstarg):int(secondarg)]:
        sourcepaper = newspaper.build(paper)
        sourcearts.append(sourcepaper)

    poolset = news_pool.set(sourcearts,
                            threads_per_source=3)  # (3*2) = 6 threads total
    pooljoin = news_pool.join()
    iart = 0
    for iart in range(len(sourcearts)):
        print("newspaper {}: {}".format(iart + 1, sourcearts[iart].size()))

    iart = 0
    try:
        connection = mysql.connector.connect(host='localhost',
                                             database='newspy',
                                             user='',
                                             password='')
        if connection.is_connected():
            db_Info = connection.get_server_info()
            print("Connected to MySQL Server version ", db_Info)
            cursor = connection.cursor()
            cursor.execute("select database();")
Example #27
File: news.py  Project: ludde127/Trader_2.1
    def gather_different(self,
                         extra_urls=None,
                         only_extra=False,
                         ignore_gotten=False,
                         save=True):
        checklang = False
        if extra_urls:
            self.urls["extras"] = set(extra_urls)
            for url_ext in extra_urls:
                mine_article(url_ext)
        if not only_extra:
            print(self.newssites)
            if len(self.newssites) > 1 and type(self.newssites) is list:
                papers = [
                    build(paper, config=self.config)
                    for paper in self.newssites
                ]
            else:
                papers = build(self.newssites, config=self.config)
            log(f"Getting Data from {len(self.newssites)} newssites...")
            news_pool.set(papers, threads_per_source=2)
            news_pool.join()
            for art_pool, url in zip(papers, self.newssites):
                print(
                    f"Handling newssite {int(self.newssites.index(url)) + 1}/{len(self.newssites)}"
                )
                for art in art_pool.articles:
                    art.parse()
                    if (str(art.url)
                            not in self.urls["gotten"]) or ignore_gotten:
                        created = date_to_posix(dates=art.publish_date,
                                                list=False)
                        if created is not None and created != "None":
                            dic_temp = {
                                "link":
                                str(art.url),
                                "text":
                                str(
                                    art.text.replace("  ",
                                                     "").replace("\n", "")),
                                "title":
                                str(art.title),
                                "created":
                                float(created),
                                "keywords":
                                str(art.keywords),
                                "author":
                                str(art.authors)
                            }
                            self.urls["gotten"] = np.append(
                                self.urls["gotten"], art.url)
                            if checklang:
                                try:
                                    if check_lang_is_en(str(art.text)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                except json.decoder.JSONDecodeError as e:
                                    error(e)
                                    if check_lang_is_en(str(art.title)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                    print("fixed?")
                            else:
                                self.df_art = self.df_art.append(
                                    dic_temp, ignore_index=True)

        if save:
            print(self.df_art)
            try:
                pass
                #print(self.df_art.to_string())
            except:
                pass
            update_hdf5(files["news_store"],
                        "news_articles",
                        dataframe=self.df_art,
                        mode="a",
                        append=False)
Example #28
def crawl():
    import newspaper
    from newspaper import news_pool

    memoize_articles = True

    conn = connect()

    threads_per_source = 4

    round = 0

    # loop indefinitely
    while True:
        count = get_news_source_count(conn)
        offset = 0
        limit = 15
        round += 1
        log.info("Crawling round %s.", round)
        while offset <= count:
            papers = []
            sources = get_news_sources(conn, offset, limit)

            offset += limit

            for source in sources:
                log.info("Creating newspaper for source %s", source[1])
                news_paper = newspaper.build(source[1], memoize_articles=memoize_articles, MIN_WORD_COUNT=100)

                papers.append(news_paper)
                log.info("Found %s articles from %s.", news_paper.size(), source[1])

            log.info("Creating a pool of newspapers for %s newspapers.", len(papers))
            news_pool.set(papers, threads_per_source=threads_per_source)

            log.info("Downloading articles for all newspapers.")
            start_time = time.time()
            news_pool.join()

            end_time = time.time() - start_time
            log.info("Downloading finished in %s", end_time)

            log.info("Storing downloaded articles in the database.")
            for paper in papers:
                # get the source id for this newspaper
                news_source_id = get_news_source(conn, paper.url)[0]

                # Get already crawled articles for this newspaper
                crawled_urls = articles_exist(conn, paper.article_urls())
                crawled_urls_size = 0
                if crawled_urls:
                    crawled_urls_size = len(crawled_urls)
                else:
                    crawled_urls = ['']

                log.info("For newspaper %s %s articles already crawled.", paper.url, crawled_urls_size)
                #articles = []
                #crawled_articles = articles_for_news_source(conn, news_source_id)
                article_count = 0
                for article in paper.articles:
                    #if the article is not crawled already
                    if article.url not in crawled_urls:
                        #parse it
                        try:
                            article.parse()
                            #check if its a news article, and not some other page
                            if article.is_valid_body():
                                article_count += 1
                                insert_news_article(conn, article, news_source_id)
                        except:
                            pass

                            #Check if the combination title and publish date already exists for this newspaper
                            #publish_date = article.publish_date
                            #if publish_date:
                            #    publish_date = publish_date.replace(tzinfo=None)

                            #if (article.title, publish_date) not in crawled_articles:
                                #If not, add it for insertion
                            #    articles.append(article)
                            #    crawled_articles.append((article.title, publish_date))
                            #    log.info("Article '%s' publish date '%s' doesn't exists.", article.title, publish_date)
                            #else:
                            #    log.warn("Article '%s' already exists", article.url)
                log.info("For newspaper %s stored %s articles.", paper.url, article_count)
                #Insert the articles in the database
                #insert_news_articles(conn, list(set(articles)), news_source_id)

        time.sleep(1000) #sleep for 1000 seconds before continuing
Example #29
 def post(self, request):
     form = SearchForm(request.POST)
     if form.is_valid():
         search_key = form.cleaned_data['search_keys']
         search_weight = form.cleaned_data['search_weight']
         search_key = search_key.split(',')
         search_weight = search_weight.split(',')
         search_key_weight = {}
         for l in range(len(search_key)):
             search_key_weight[search_key[l]] = search_weight[l]
         if detect(search_key[0]) != 'zh' and detect(
                 search_key[0]) != 'zh-cn':
             # cnn_paper = newspaper.build('http://cnn.com', memoize_articles=False)
             # print(cnn_paper.size())
             # times_paper = newspaper.build('https://www.nytimes.com/', memoize_articles=False)
             # print(times_paper.size())
             # guardian_paper = newspaper.build('https://www.theguardian.com/us', memoize_articles=False)
             # print(guardian_paper.size())
             # abc_paper = newspaper.build('https://abcnews.go.com/', memoize_articles=False)
             # print(abc_paper.size())
             # bbc_paper = newspaper.build('https://www.bbc.com/', memoize_articles=False)
             # print(bbc_paper.size())
             boston_paper = newspaper.build('https://www.bostonglobe.com//',
                                            memoize_articles=False)
             print(boston_paper.size())
             seattle_paper = newspaper.build(
                 'https://www.seattletimes.com/', memoize_articles=False)
             print(seattle_paper.size())
             # papers = [cnn_paper, times_paper, guardian_paper, abc_paper, bbc_paper]
             papers = [boston_paper, seattle_paper]
             news_pool.set(papers,
                           threads_per_source=2)  # (5*2) = 10 threads total
             news_pool.join()
             # for article in cnn_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in times_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in guardian_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in abc_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in bbc_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             for article in boston_paper.articles:
                 self.all_scrapy(article, search_key_weight)
             for article in seattle_paper.articles:
                 self.all_scrapy(article, search_key_weight)
         elif detect(search_key[0]) == 'zh-cn':
             qq_paper = newspaper.build('https://www.qq.com/',
                                        memoize_articles=False)
             print('qq_paper: ' + str(qq_paper.size()))
             # wy_paper = newspaper.build('https://news.163.com/', memoize_articles=False)
             # papers = [qq_paper, wy_paper]
             papers = [qq_paper]
             news_pool.set(papers,
                           threads_per_source=2)  # (3*2) = 6 threads total
             news_pool.join()
             for article in qq_paper.articles:
                 print('processing')
                 self.all_scrapy(article, search_key_weight)
             # for article in wy_paper.articles:
             #     print('processing')
             #     self.all_scrapy(article, search_key_weight)
     else:
         form = SearchForm()
     return HttpResponseRedirect(reverse('searching:results', args=()))
Example #30
def main():
	import newspaper # article download utility
	from newspaper import news_pool, Config, Article, Source
	import re # regex
	import csv # csv file-formatting
	import unicodedata # string cleaning
	from datetime import datetime # time-checking for cache-updates
	from http.client import BadStatusLine # replaces Python 2's httplib

	print("Retrieving sources and update times\n...")

	# Read active list of news/media sources
	f = open("sourcelist","r")
	sources = f.read().splitlines()
	times = []

	#
	# ONGOING: update time storage and retrieval
	#		-dependent on if caching is sufficient

	papers = {} # Empty dictionary

	print("Building papers\n....\n...\n...")

	# Store total and current number of articles for progress metrics
	total_articles = 0; current_articles = 0

	# Build dictionary keyed by source name, e.g. 'http://cnn.com' -> key 'cnn'
	for i in range(len(sources)):
		key = re.sub(r'(^https?:\/\/|\.com$|\.org$)','',sources[i])
		papers[key] = newspaper.build(sources[i],memoize_articles=True)
		
		# Print number of articles added from "recent" list for logging purposes
		total_articles = total_articles + papers[key].size()
		print(key,papers[key].size())

	print("Downloading articles (this may take a while)\n...\n...\n...")

	config = Config()
	config.fetch_images = False
	
	# Download all articles via multi-threading
	news_pool.set([x[1] for x in papers.items()], threads_per_source=2) # Test various thread counts
	news_pool.join()

	print("Extracting text from articles and writing to dump files \n...\n...\n...")

	# Append articles to aggregate and individual csv's
	# Format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors, col(5) = text, col(6) = keywords
	with open('papers.csv','a') as outcsv:

		# Setup aggregate csv writer
		writer = csv.writer(outcsv)
		#writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])

		# Traverse sources
		for i in papers:

			# Setup single_source csv writing
			source = i
			ind_outcsv = open(str(i+".csv"),'a')
			ind_writer = csv.writer(ind_outcsv)

			# Traverse articles in source			
			for j in range(papers[i].size()):

				# Parse articles and extract features
				current_articles += 1
				print("Processing " + str(i) + " article " + str(current_articles) + " of " + str(total_articles) + " (" + str("{0:.2f}".format((current_articles/float(total_articles)*100),2))
 + " %)")

				try:
					papers[i].articles[j].parse()

					# Grab key features
					title = unicodedata.normalize('NFKD',papers[i].articles[j].title).encode('ascii','ignore')
					authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
					text = unicodedata.normalize('NFKD',papers[i].articles[j].text).encode('ascii','ignore')
					date = papers[i].articles[j].publish_date
					keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
					
					# Add new row to both single-source and aggregate files
					ind_writer.writerow([source,date,title,authors,text,keywords])
					writer.writerow([source,date,title,authors,text,keywords])
					papers[i].articles[j].nlp()

				except BadStatusLine:
					print("BadStatusLine, no dice")
Example #31
def auto_article_go_getter():
    print("starting builds ", file=sys.stderr)
    cnn_paper = newspaper.build("https://www.cnn.com", memoize_articles=True, language='en')
    print("cnn_paper built", file=sys.stderr)
    nbc_paper = newspaper.build("https://www.nbcnews.com", memoize_articles=True, language='en')
    #print("nbc_paper built", file=sys.stderr)
    #nyt_paper = newspaper.build("https://www.nytimes.com/", memoize_articles=True, language='en')
    #print("nyt_paper built", file=sys.stderr)
    apn_paper = newspaper.build("https://apnews.com/", memoize_articles=True, language='en')
    print("apn_paper built", file=sys.stderr)
    abc_paper = newspaper.build("https://abcnews.go.com/", memoize_articles=True, language='en')
    print("abc_paper built", file=sys.stderr)
    papers = [cnn_paper, nbc_paper, apn_paper, abc_paper]
    verge_paper = newspaper.build("https://www.theverge.com/", memoize_articles=True, language='en')
    print("verge_paper built", file=sys.stderr)
    techP = [verge_paper]
    espn_paper = newspaper.build("https://www.espn.com/", memoize_articles=True, language='en')
    print("espn_paper built", file=sys.stderr)
    sportP = [espn_paper]
    et_paper = newspaper.build("https://ew.com/", memoize_articles=True, language='en')
    print("ew_paper built", file=sys.stderr)
    entertainmentP = [et_paper]
    crypto_paper = newspaper.build("https://cryptonews.com/", memoize_articles=True, language='en')
    print("crypto_paper built", file=sys.stderr)
    cryptoP = [crypto_paper]
    climate_paper = newspaper.build("https://www.climatechangenews.com/", memoize_articles=True, language='en')
    print("climate_paper built", file=sys.stderr)
    climateP = [climate_paper]
    print("all papers built", file=sys.stderr)
    count = 0
    article_list = []
    print("Starting pool threading", file=sys.stderr)
    print("Starting pool for papers", file=sys.stderr)
    news_pool.set(papers, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for papers", file=sys.stderr)
    print("Starting pool for techp", file=sys.stderr)
    news_pool.set(techP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for techp", file=sys.stderr)
    print("Starting pool for sportp", file=sys.stderr)
    news_pool.set(sportP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for sportp", file=sys.stderr)
    print("Starting pool for entertainmentp", file=sys.stderr)
    news_pool.set(entertainmentP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for entertainmentp", file=sys.stderr)
    print("Starting pool for cryptop", file=sys.stderr)
    news_pool.set(cryptoP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for cryptop", file=sys.stderr)
    print("Starting pool for climatep", file=sys.stderr)
    news_pool.set(climateP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for climatep", file=sys.stderr)
    print("Saving articles to mongodb", file=sys.stderr)
    for build in papers:
        for news in (build.articles):
            # parse once so news.text is populated for the keyword check below
            news.parse()
            if "politics" in news.url and "cnnespanol" not in news.url:
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "political",
                    #text = news.text,
                    text = textSum,
                    title = news.title
                    ).save()
            #email_services = ["hotmail", "gmail", "yahoo"] 
            #email_contains_service = any(email_service in user_email for email_service in email_services)
            elif ["stock", "net", "loss", "Q1", "Q2", "Q3", "Q4", "Gain"] in word_tokenize(news.text):
                news.parse()
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "buisness",
                    text = textSum,
                    title = news.title
                    ).save()
            elif "covid" in news.url or "corona" in news.url:
                news.parse()
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "covid",
                    text = textSum,
                    title = news.title
                    ).save()
                count += 1
    for build in techP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            if "apnews.com" in news.url:
                    textSum = news.text
            if "#comments" not in news.url:
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "tech",
                    text = textSum,
                    title = news.title
                    ).save()
    for build in sportP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "sports",
                text = textSum,
                title = news.title
                ).save()
    for build in entertainmentP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "entertainment",
                text = textSum,
                title = news.title
                ).save()
    for build in cryptoP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "crypto",
                text = textSum,
                title = news.title
                ).save()
    for build in climateP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "climate",
                text = textSum,
                title = news.title
                ).save()            
    print("Articles saved in mongodb", file=sys.stderr)
Example #32
def build_section(section):

    # some articles don't provide a date; we assume they were posted recently and use the date the data was gathered
    for newspaper_source in list_news_obj:
        # print(f'\nName: {newspaper_source.name}')

        if section in newspaper_source.paths:
            newspaper_stack = []
            section_url = newspaper_source.make_path(section)
            newspaper_build = newspaper.build(section_url)
            newspaper_stack.append(newspaper_build)
            news_pool.set(newspaper_stack,
                          threads_per_source=2)  # 1 source * 2 = 2 threads total
            news_pool.join()

            for downloaded_paper in newspaper_stack:
                articles = downloaded_paper.articles

                for article in articles:
                    # print(article.url)
                    # print(article.title)
                    section = filter_junk_results(article.url,
                                                  newspaper_source.name,
                                                  section)

                    if section:
                        try:
                            article.download()
                            article.parse()

                            title = article.title
                            url = article.url
                            publication = newspaper_source.name
                            city = newspaper_source.place
                            section = section
                            body = article.text
                            image = article.top_image

                            if article.authors:
                                authors = article.authors[0]
                            else:
                                authors = ''

                            try:
                                a = Article(title=title,
                                            url=url,
                                            publication=publication,
                                            city=city,
                                            section=section,
                                            authors=authors,
                                            body=body,
                                            image=image)
                                a.save()
                                print(f'created new article: {a.title}')
                            except django.db.utils.IntegrityError as e:
                                print('Duplicate entry, not added.', e)
                            except Exception as e:
                                print(e)
                            print(
                                f'Title: {title}, url: {url}, publication: {publication}, city: {city}\nsection: {section}, authors: {authors}'
                            )
                        except Exception as e:
                            print(e)
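The comment at the top of build_section() mentions assuming a recent posting date when an article provides none, but the fields saved above do not include a date at all; a minimal sketch of that fallback, with the variable name assumed:

from datetime import datetime

# Hypothetical fallback inside the article loop: use the gathering time
# when newspaper could not extract a publish date.
published = article.publish_date or datetime.now()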
Example #33
def get_bot_response():
    while True:
        userText = request.args.get('msg')
        msg = str(userText)
        entrada = msg.lower()
        f = csv.writer(open('inputs.csv', 'a', encoding='utf-8'))
        f.writerow([msg])
        response = searchbot.get_response(userText)
        if float(response.confidence) >= 0.8:
            return str(searchbot.get_response(userText))
        elif userText == str('NÃO'):
            return str('Refaça a pergunta, por favor!')
        elif userText == str("SIM"):
            return str("Agradecemos o seu contato")
        elif float(response.confidence) == 0.0:
            entrada = msg
            # print(entrada)
            p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText='
            p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min'
            html = str(p1 + entrada + p2)
            stop2 = nltk.corpus.stopwords.words('portuguese')
            stop2.append('faço')
            stop2.append('um')
            stop2.append('gostaria')
            stop2.append('fazer')
            stop2.append('saber')
            stop2.append('posso')
            stop2.append('como')
            splitter = re.compile('\\W+')

            lista_palavras = []
            lista = [p for p in splitter.split(entrada) if p != '']
            for p in lista:
                if p not in stop2:
                    if len(p) > 1:
                        lista_palavras.append(p)
            ar = len(lista_palavras)
            ax = str(lista_palavras[0:ar])
            e = str(ax).replace(',', ' ').strip('[]')
            e.strip("'")
            #headers = {'User-Agent': 'Mozilla/5.0'}
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
            }
            try:
                page = requests.get(html,
                                    headers=headers,
                                    verify=False,
                                    stream=False,
                                    timeout=7)
                soup = BeautifulSoup(page.content, 'lxml')
                cla = soup.find(class_='searchResults')
                links = cla.find_all('a')
            except (KeyError, IndexError, AttributeError):
                pass
            # namess = soup.find_all('a')
            # ra = (lista_palavras)
            # CRIAR A LISTA DE LINKS SITE RFB

            listr = []
            for link in links:
                texto = str(link.get_text()).lower().replace('ã', 'a').replace(
                    '-', ' ').replace('ç', 'c').split()
                time.sleep(0.5)
                # print(len(texto))
                url = str(link.get('href'))
                time.sleep(0.5)
                # print(len(url))
                urls = str(link.get('href')).lower().replace('/', ' ').replace(
                    '-', ' ').replace('.', ' ').split()
                time.sleep(0.5)
                # print(len(urls))
                if entrada in texto:
                    listr.append(url)
                for i in range(0, ar):
                    if lista_palavras[i] in texto:
                        listr.append(url)
                    elif lista_palavras[i] in urls:
                        listr.append(url)
            else:
                listr == []
                pass

            listag = []
            rec = 'site:receita.economia.gov.br intitle:' + msg + " -filetype:pdf -.pdf"
            for urla in search(rec,
                               tld='com.br',
                               lang='pt-br',
                               stop=4,
                               pause=8):
                time.sleep(1)
                listag.append(urla)

            g = int(len(listag))
            # print(g)

            listago = []
            for z in range(0, g):
                ur = str(listag[z])
                listago.append(ur)

            # print(listago)
            # print(len(listago))
            qo = int(len(listago))
            # print(listr)
            # print(len(listr))
            listaunida = listago + listr
            conj = list(set(listaunida))
            # print(conj)
            # print(len(conj))
            # print(type(conj))

            # print(p)
            # print(len(p))
            j = len(conj)

            reports2 = []
            news_pool.set(reports2, threads_per_source=2)
            news_pool.join()
            for r in range(0, j):

                try:
                    ia = str(conj[r])
                    article = Article(ia, language="pt")
                    article.download()
                    article.parse()
                    article.text
                    article.nlp()
                    article.summary
                except:
                    pass
                reports2.append(str(article.summary).replace('\n', ' '))
            # print(len(reports2))

            resposta_finalc = set(reports2)
            print(resposta_finalc)

            if resposta_finalc == set():
                wikipedia.set_lang("pt")
                a = msg
                result = wikipedia.search(a, results=1)
                page = wikipedia.summary(result, sentences=6)
                content = page
                return str(content)
            else:
                try:
                    resposta_final = (str(resposta_finalc).replace(
                        '\n', ' ').replace('[', ' ').replace(']', ' ').replace(
                            ',',
                            ' ').replace("'",
                                         ' ').replace('{',
                                                      ' ').replace("}", ' '))
                    f = csv.writer(open('chats.csv', 'a', encoding='utf-8'))
                    f.writerow([msg + '\n' + resposta_final])
                    return str(
                        resposta_final + '\n' +
                        'Encontrou a resposta que precisava? SIM ou NÃO?')
                except:
                    return str(
                        'Desculpe! Não encontrei uma resposta para sua pergunta. Poderia repetir com outros termos?'
                    )
Example #34
#!/usr/bin/python
# -*- coding: utf-8 -*-

import newspaper
from newspaper import news_pool
from pprint import pprint

# slate_paper = newspaper.build('http://slate.com')
# tc_paper = newspaper.build('http://techcrunch.com')
# espn_paper = newspaper.build('http://espn.com')
elpais = newspaper.build('http://elpais.com')
elmundo = newspaper.build('http://www.elmundo.es')
publico = newspaper.build('http://www.publico.es')
papers = [elpais, elmundo, publico]
news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
news_pool.join()

print(len(papers))
pprint(papers)
print(len(elpais.articles))
print(len(elmundo.articles))
Example #35
    def scrape_newspapers(self,
                          company_name,
                          start_date,
                          end_date,
                          bodies=False):
        """ Build a list of the newspapers articles from a given url """
        def build_papers(news_url):
            return newspaper.build(news_url,
                                   language=self.language,
                                   memoize_articles=False)

        """ Return a relevant article matching company name and optional params such as start_date, end_date, bodies """

        def relevant_articles(papers):
            try:
                for article in papers.articles:
                    """
                        Lets analyse the HTML of the article to inspect the h1 (title) of the article. 
                        Reading documentation of newspaper3k suggests parse() is expensive method so 
                        try to limit overhead and only parse articles with a relevant title.
                    """
                    soup = BeautifulSoup(article.html, "html.parser")
                    title = soup.find('h1').get_text()
                    # If the company name is found within the headline of a news article, then parse the article for more information
                    if title and company_name in title.lower():
                        article.parse()
                        if within_date_range(article.publish_date, start_date,
                                             end_date):
                            article_dict = {
                                "headline": article.title,
                                "source": article.url,
                                "published_date": article.publish_date,
                                "company_name": company_name
                            }
                            if bodies:
                                article_dict.update({"body": article.text})
                            yield article_dict

            except Exception as e:
                #log the error to a file, continue
                print("Exception:", e)
                pass

        articles = []
        company_name = company_name.lower()
        papers = []

        try:
            print("Building papers .....")
            papers = [build_papers(src) for src in self.news_urls]
            print("Papers built:", len(papers), papers)
            # Download every article in parallel before filtering them.
            news_pool.set(papers, threads_per_source=2)
            news_pool.join()

        except Exception as e:
            # Should log the error to a file in production, then continue.
            print("Exception:", e)

        finally:
            articles.extend(
                [article for p in papers for article in relevant_articles(p)])

        return articles
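within_date_range is called above but not defined in this snippet. A minimal sketch of what such a helper might look like, assuming start_date and end_date are datetime.date objects (or None for an open bound) and that articles without a detectable publish date should be skipped:

def within_date_range(publish_date, start_date, end_date):
    # publish_date comes from newspaper3k and may be None or a datetime.
    if publish_date is None:
        return False
    # Normalise a datetime down to a date so it compares cleanly with the bounds.
    published = publish_date.date() if hasattr(publish_date, 'date') else publish_date
    if start_date and published < start_date:
        return False
    if end_date and published > end_date:
        return False
    return True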
예제 #36
0
def main():
	import newspaper
	from newspaper import news_pool
	import re
	import csv
	import unicodedata

	# Active list of news/media sources
	
	sources = ['http://fivethirtyeight.com']

	#sources = ['http://cnn.com','http://foxnews.com',
	#'http://npr.org','http://msnbc.com','http://cbs.com',
	#'http://economist.com','http://time.com','http://nytimes.com',
	#'http://espn.com','http://reuters.com','http://usatoday.com',
	#'http://bbc.com','http://fivethirtyeight.com']

	papers = {} # Empty dictionary

	print("Building papers\n....\n...\n...")

	# Build dictionary, using the url name for keys, e.g. the key for 'http://cnn.com' will be 'cnn'
	for i in range(len(sources)):
		key = re.sub(r'(^https?://|\.com$|\.org$)', '', sources[i])
		papers[key] = newspaper.build(sources[i],memoize_articles=False)
		# Print number of articles added from "recent" list for logging purposes
		print(key,papers[key].size())

	print("Downloading articles (this may take a while)\n...\n...\n...")

	# Download all articles via multi-threading
	news_pool.set(list(papers.values()), threads_per_source=2) # Test various thread counts
	news_pool.join()

	print("Extracting text from articles \n...\n...\n...")

	# Parse all articles
	for i in papers:
		for j in range(papers[i].size()):
			#call to "download()" deprecated by news_pool.set & news_pool.join
			#papers[i].articles[j].download()
			papers[i].articles[j].parse()
			#extract keywords
			papers[i].articles[j].nlp()

	print("Writing new articles to dump file \n...\n...\n...")

	# Append articles to csv
	# Prototype format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors, col(5) = text, col(6) = keywords
	with open('papers.csv', 'a', newline='', encoding='utf-8') as outcsv:
		writer = csv.writer(outcsv)
		writer.writerow(["Source", "Date", "Title", "Authors", "Text", "Keywords"])
		for i in papers:
			source = i
			for j in range(papers[i].size()):
				article = papers[i].articles[j]
				# Grab key features; normalise the text and strip non-ASCII characters
				title = unicodedata.normalize('NFKD', article.title).encode('ascii', 'ignore').decode('ascii')
				authors = article.authors
				text = unicodedata.normalize('NFKD', article.text).encode('ascii', 'ignore').decode('ascii')
				date = article.publish_date
				# Identify keywords, while we're at it
				keywords = article.keywords
				writer.writerow([source, date, title, authors, text, keywords])
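Because the script appends to papers.csv and rewrites the header row on every run, any reader has to skip repeated headers. A small companion sketch (not part of the original script) that tallies how many rows were dumped per source:

import csv

counts = {}
with open('papers.csv', newline='', encoding='utf-8') as infile:
	for row in csv.reader(infile):
		if not row or row[0] == 'Source':
			continue  # skip blank lines and repeated header rows
		counts[row[0]] = counts.get(row[0], 0) + 1
for source, n in counts.items():
	print(source, n)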
예제 #37
0
파일: app.py 프로젝트: paulorss/wikitest
def get_bot_response():
    #print(userText)
    userText = unidecode._unidecode(request.args.get('msg')).strip().lower()
    msg = userText
    response = englishBot.get_response(userText)
    if float(response.confidence) > 0.8:
        # High confidence: return the chatbot's own answer.
        return str(response)
    elif float(response.confidence) > 0.4:
        # Medium confidence: fall back to a Portuguese Wikipedia summary.
        wikipedia.set_lang("pt")
        result = wikipedia.search(msg, results=1)
        # wikipedia.search returns a list of titles; summarise the top hit.
        page = wikipedia.summary(result[0], sentences=5)
        content = page
        # print(content)
        return str(content)
    else:
        # Low confidence: search the Receita Federal site and Google for an answer.
        entrada = str(msg).lower()
        p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText='
        p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min'
        html = str(p1 + entrada + p2)
        stop2 = nltk.corpus.stopwords.words('portuguese')
        # Extra domain words that carry no meaning for the query.
        stop2.extend(['faço', 'um', 'gostaria', 'fazer', 'saber', 'posso', 'como'])
        splitter = re.compile(r'\W+')
        # Keep only the words that are not stopwords and have more than one character.
        lista_palavras = [p for p in splitter.split(entrada)
                          if p and p not in stop2 and len(p) > 1]
        page = requests.get(html, verify=False, stream=False)
        soup = BeautifulSoup(page.content, 'lxml')
        cla = soup.find(class_='searchResults')
        # Guard against searches that return no 'searchResults' block.
        links = cla.find_all('a') if cla else []
        # Build the list of candidate links from the RFB (Receita Federal) site.
        listr = []
        for link in links:
            # Normalise the link text and href so individual words can be matched.
            texto = str(link.get_text()).lower().replace('ã', 'a').replace('-', ' ').replace('ç', 'c').split()
            url = str(link.get('href'))
            urls = url.lower().replace('/', ' ').replace('-', ' ').replace('.', ' ').split()
            if entrada in texto:
                listr.append(url)
            for palavra in lista_palavras:
                if palavra in texto or palavra in urls:
                    listr.append(url)

        # Google search restricted to the RFB site, excluding PDF results.
        listag = []
        rec = 'site:receita.economia.gov.br intext:' + msg + " -filetype:pdf -.pdf"
        for urla in search(rec, tld='com.br', lang='pt-br', stop=3, pause=2):
            listag.append(urla)

        # The Google results are already url strings; merge them with the RFB
        # links and remove duplicates.
        conj = list(set(listag + listr))

        reports2 = []
        for ia in conj:
            article = Article(str(ia), language="pt")
            try:
                # Download, parse and summarise each candidate page.
                article.download()
                article.parse()
                article.nlp()
                reports2.append(str(article.summary).replace('\n', ' '))
            except Exception:
                # Skip pages that cannot be downloaded or parsed.
                pass
        # print(len(reports2))

        resposta_finalc = set(reports2)
        # Join the unique summaries into a single answer string.
        resposta_final = ' '.join(resposta_finalc)

        # Log the question/answer pair for later review.
        with open('chats.txt', 'a', encoding='utf-8') as log:
            csv.writer(log).writerow([msg + '\n' + resposta_final])

        return str(resposta_final)
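news_pool is designed for newspaper Source objects built with newspaper.build(), so it does not help with downloading a plain list of article URLs like conj above. A hedged sketch of one way to fetch those pages concurrently using only the standard library; the helper names (summarise, summarise_all) are illustrative and not part of the original app:

from concurrent.futures import ThreadPoolExecutor
from newspaper import Article

def summarise(url, language="pt"):
    # Download, parse and summarise a single page; return '' on failure.
    article = Article(url, language=language)
    try:
        article.download()
        article.parse()
        article.nlp()
        return str(article.summary).replace('\n', ' ')
    except Exception:
        return ''

def summarise_all(urls, max_workers=4):
    # Download the candidate pages in parallel instead of one by one.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return [s for s in pool.map(summarise, urls) if s]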