Example #1
from datetime import datetime
import logging

# Classificator, taxonomy, initializeNer, ElasticWrapper, analyze_doc,
# START_DAY and DAYS_LOG_PATH are defined in the surrounding project module.
def pipeline_dw():
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    days_log = open(DAYS_LOG_PATH, 'a')
    t0 = datetime.now()
    logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    articles_count = elastic.articles_count_from(day)
    count = 0
    while count < articles_count:
        # Fetch the next page of up to 1000 articles, offset by `count`.
        articles = elastic.get_articles_from(day, count)
        count += 1000
        done = min(count, articles_count)
        logging.info("Received %s articles" % len(articles))
        logging.info("Got %s/%s articles" % (done, articles_count))
        days_log.write("Got %s/%s articles\n" % (done, articles_count))
        result_articles = []
        for article in articles:
            logging.debug("Analyzing article: %s" % article)
            result_article = analyze_doc(classificator, ner, article)
            logging.debug("Result article: %s" % result_article)
            result_article["source"] = "dw"
            result_articles.append(result_article)
        # Write the enriched page back under the "analyzed" index.
        elastic.insert(result_articles, "analyzed", "article")
    t1 = datetime.now()
    logging.info("This day took %s" % (t1 - t0))
    days_log.write("This day took %s\n" % (t1 - t0))
    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
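The same offset-paging idiom (count documents up front, then fetch blocks of 1000 until the offset passes the total) recurs in Example #2 below. A minimal sketch of that idiom factored into a reusable generator; the helper name and the fetch(offset) signature are assumptions for illustration, not part of the project:

def iter_pages(fetch_page, total, page_size=1000):
    # Yield successive pages from an offset-based fetch callable.
    offset = 0
    while offset < total:
        page = fetch_page(offset)
        if not page:
            break  # defensive stop if the backend returns an empty page
        yield page
        offset += page_size

# Hypothetical usage against the wrapper from the example above:
# for articles in iter_pages(lambda off: elastic.get_articles_from(day, off),
#                            elastic.articles_count_from(day)):
#     ...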
Example #2
from datetime import datetime, timedelta
import logging

# Classificator, taxonomy, initializeNer, ElasticWrapper, analyze_doc,
# START_DAY and DAYS_LOG_PATH are defined in the surrounding project module.
def pipeline():
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    today = datetime.now()
    days_log = open(DAYS_LOG_PATH, 'a')
    while day < today:
        t0 = datetime.now()
        logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        tweets_count = elastic.tweets_count_for_day(day)
        count = 0
        while count < tweets_count:
            # Fetch the next page of up to 1000 tweets, offset by `count`.
            tweets = elastic.get_day_tweets(day, count)
            count += 1000
            done = min(count, tweets_count)
            logging.info("Received %s tweets" % len(tweets))
            logging.info("Got %s/%s tweets" % (done, tweets_count))
            days_log.write("Got %s/%s tweets\n" % (done, tweets_count))
            result_tweets = []
            for tweet in tweets:
                logging.debug("Analyzing tweet: %s" % tweet)
                result_tweet = analyze_doc(classificator, ner, tweet)
                logging.debug("Result tweet: %s" % result_tweet)
                result_tweet["source"] = "twitter"
                result_tweets.append(result_tweet)
            elastic.insert(result_tweets, "analyzed", "tweet")
        t1 = datetime.now()
        logging.info("This day took %s" % (t1 - t0))
        days_log.write("This day took %s\n" % (t1 - t0))
        day = day + timedelta(days=1)
    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
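None of the examples include ElasticWrapper itself. Below is a minimal sketch of the interface the two pipelines assume, written against the official elasticsearch-py client; the host, index name, created_at field, and query shape are assumptions for illustration, not taken from the project (the _type field matches the older Elasticsearch versions this code appears to target):

from elasticsearch import Elasticsearch, helpers

class ElasticWrapper:
    # Sketch of the interface used by pipeline() and pipeline_dw().
    def __init__(self, host="http://localhost:9200", page_size=1000):
        self.es = Elasticsearch([host])
        self.page_size = page_size

    def tweets_count_for_day(self, day):
        # Total number of tweets whose created_at falls on the given day.
        return self.es.count(index="tweets",
                             body={"query": self._day_query(day)})["count"]

    def get_day_tweets(self, day, offset):
        # One page of up to page_size tweets, starting at `offset`.
        body = {"query": self._day_query(day),
                "from": offset, "size": self.page_size}
        hits = self.es.search(index="tweets", body=body)["hits"]["hits"]
        return [hit["_source"] for hit in hits]

    def insert(self, docs, index, doc_type):
        # Bulk-index analyzed documents, e.g. insert(tweets, "analyzed", "tweet").
        actions = [{"_index": index, "_type": doc_type, "_source": doc}
                   for doc in docs]
        helpers.bulk(self.es, actions)

    @staticmethod
    def _day_query(day):
        # Elasticsearch date math: anchor timestamp plus one day.
        start = day.strftime("%Y-%m-%dT00:00:00Z")
        return {"range": {"created_at": {"gte": start, "lt": start + "||+1d"}}}

The article-side methods (articles_count_from, get_articles_from) would follow the same count/search pattern with a different index and an open-ended date range.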
Example #3
from argparse import ArgumentParser

# whatedsaid, vickyloras, chiasuanchong and ElasticWrapper are project modules.
def main():
    parser = ArgumentParser()
    parser.add_argument("--url",
                        type=str,
                        required=True,  # avoids a TypeError in the substring checks below
                        help="Blog URL to crawl")
    es = ElasticWrapper()

    args = parser.parse_args()
    # Dispatch to the crawler whose domain appears in the URL.
    if "whatedsaid" in args.url:
        contributions = whatedsaid.main(args)
        for contribution in contributions:
            es.store_post(contribution)
    elif "vickyloras" in args.url:
        vickyloras.main(args)
    elif "chiasuanchong" in args.url:
        chiasuanchong.main(args)
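A hypothetical invocation would look like python crawl_blog.py --url http://whatedsaid.example.com/ (script name and URL are placeholders). Since each branch only matches a substring of the URL, the if/elif chain could also be table-driven; the following is a sketch, not the project's code, and it preserves the detail that only whatedsaid.main returns posts for the caller to store:

# The crawler modules are the ones the example already imports; the
# dispatch table itself is illustrative.
CRAWLERS = {
    "whatedsaid": (whatedsaid, True),       # main() returns posts to store
    "vickyloras": (vickyloras, False),      # return value ignored, as in the original
    "chiasuanchong": (chiasuanchong, False),
}

def dispatch(args, es):
    for key, (crawler, returns_posts) in CRAWLERS.items():
        if key in args.url:
            result = crawler.main(args)
            if returns_posts:
                for contribution in result:
                    es.store_post(contribution)
            break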