import logging
from argparse import ArgumentParser
from datetime import datetime, timedelta

# Classificator, taxonomy, initializeNer, ElasticWrapper, analyze_doc,
# START_DAY, DAYS_LOG_PATH and the per-blog crawler modules (whatedsaid,
# vickyloras, chiasuanchong) are assumed to be defined or imported elsewhere
# in the project.


def pipeline_dw():
    """Analyze all DW articles from START_DAY onwards in a single pass."""
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    days_log = open(DAYS_LOG_PATH, 'a')
    t0 = datetime.now()
    logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    articles_count = elastic.articles_count_from(day)
    count = 0
    while count < articles_count:
        articles = elastic.get_articles_from(day, count)
        if not articles:
            break  # guard against an empty page so the loop cannot spin forever
        count += len(articles)  # advance the paging offset by the actual batch size
        logging.info("Received %s articles" % len(articles))
        logging.info("Got %s/%s articles" % (count, articles_count))
        days_log.write("Got %s/%s articles\n" % (count, articles_count))
        result_articles = []
        for article in articles:
            logging.debug("Analyzing article: %s" % article)
            result_article = analyze_doc(classificator, ner, article)
            logging.debug("Result article: %s" % result_article)
            result_article["source"] = "dw"
            result_articles.append(result_article)
        elastic.insert(result_articles, "analyzed", "article")
    t1 = datetime.now()
    logging.info("This day took %s" % (t1 - t0))
    days_log.write("This day took %s\n" % (t1 - t0))
    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
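
# analyze_doc() is provided elsewhere in the project. The stub below is a
# hypothetical stand-in that only documents the contract both pipelines rely
# on: it takes the classifier, the NER pipeline and a source document, and
# returns a dict the caller may extend (e.g. with a "source" key). The
# "categories"/"entities" keys are illustrative assumptions, not the
# project's actual output schema.
def _analyze_doc_stub(classificator, ner, doc):
    result = dict(doc)         # keep the original document fields
    result["categories"] = []  # would hold classificator output
    result["entities"] = []    # would hold NER output
    return result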
def pipeline():
    """Analyze tweets day by day, from START_DAY up to the current date."""
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    today = datetime.now()
    days_log = open(DAYS_LOG_PATH, 'a')
    while day < today:
        t0 = datetime.now()
        logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        tweets_count = elastic.tweets_count_for_day(day)
        count = 0
        while count < tweets_count:
            tweets = elastic.get_day_tweets(day, count)
            if not tweets:
                break  # guard against an empty page so the loop cannot spin forever
            count += len(tweets)  # advance the paging offset by the actual batch size
            logging.info("Received %s tweets" % len(tweets))
            logging.info("Got %s/%s tweets" % (count, tweets_count))
            days_log.write("Got %s/%s tweets\n" % (count, tweets_count))
            result_tweets = []
            for tweet in tweets:
                logging.debug("Analyzing tweet: %s" % tweet)
                result_tweet = analyze_doc(classificator, ner, tweet)
                logging.debug("Result tweet: %s" % result_tweet)
                result_tweet["source"] = "twitter"
                result_tweets.append(result_tweet)
            elastic.insert(result_tweets, "analyzed", "tweet")
        t1 = datetime.now()
        logging.info("This day took %s" % (t1 - t0))
        days_log.write("This day took %s\n" % (t1 - t0))
        day += timedelta(days=1)
    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
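
# Both pipelines depend on an ElasticWrapper defined elsewhere in the
# project. The class below is NOT that implementation; it is a minimal
# sketch of the interface the pipelines assume, written against the official
# elasticsearch-py client. The index name ("tweets") and the timestamp field
# ("created_at") are illustrative assumptions.
from elasticsearch import Elasticsearch, helpers


class _ElasticWrapperSketch:
    PAGE_SIZE = 1000  # the pipelines advance their paging offset in steps of 1000

    def __init__(self, host="localhost"):
        self.es = Elasticsearch([host])

    def _day_query(self, day):
        # Match documents whose timestamp falls within the given day.
        fmt = "%Y-%m-%dT%H:%M:%SZ"
        return {"query": {"range": {"created_at": {
            "gte": day.strftime(fmt),
            "lt": (day + timedelta(days=1)).strftime(fmt)}}}}

    def tweets_count_for_day(self, day):
        return self.es.count(index="tweets", body=self._day_query(day))["count"]

    def get_day_tweets(self, day, offset):
        # Plain from_/size paging; note Elasticsearch caps this at 10,000 hits
        # by default, so a scroll or search_after would be needed for larger days.
        res = self.es.search(index="tweets", body=self._day_query(day),
                             from_=offset, size=self.PAGE_SIZE)
        return [hit["_source"] for hit in res["hits"]["hits"]]

    def insert(self, docs, index, doc_type):
        # Bulk-index the analyzed documents into the target index.
        actions = ({"_index": index, "_type": doc_type, "_source": doc}
                   for doc in docs)
        helpers.bulk(self.es, actions)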
def main():
    parser = ArgumentParser()
    parser.add_argument("--url", type=str, required=True,
                        help="Please provide a blog url for data crawl!")
    args = parser.parse_args()
    es = ElasticWrapper()
    if "whatedsaid" in args.url:
        contributions = whatedsaid.main(args)
        for contribution in contributions:
            es.store_post(contribution)
    if "vickyloras" in args.url:
        vickyloras.main(args)
    if "chiasuanchong" in args.url:
        chiasuanchong.main(args)
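
# Assumed script entry point: how the project chooses between main(),
# pipeline() and pipeline_dw() is not shown in this file, so this guard
# simply configures logging and runs the blog-crawl CLI, e.g.:
#   python crawl.py --url http://whatedsaid.wordpress.com
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()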