def execute():
    print 'Started Entity Aggregation... ',
    start_timing()
    # Load the JavaScript map/reduce functions that MongoDB executes server-side.
    map_function = Code(
        open(join(JAVASCRIPT_PATH, MAP_FUNCTION_FILENAME), 'r').read())
    reduce_function = Code(
        open(join(JAVASCRIPT_PATH, REDUCE_FUNCTION_FILENAME), 'r').read())
    aggregate_map_function = Code(
        open(join(JAVASCRIPT_PATH, AGGREGATION_MAP_ADD_FUNCTION_FILENAME), 'r').read())
    aggregate_reduce_function = Code(
        open(join(JAVASCRIPT_PATH, AGGREGATION_REDUCE_FUNCTION_FILENAME), 'r').read())
    # First pass: reduce the freshly collected tweets into a temporary results collection.
    temp_raw.map_reduce(map_function, reduce_function, TEMP_RESULTS_COLLECTION_NAME)
    # Second pass: merge the temporary results into the persistent results collection;
    # the {'reduce': ...} output option re-reduces against any existing documents.
    temp_results.map_reduce(aggregate_map_function, aggregate_reduce_function,
                            {'reduce': RESULTS_COLLECTION_NAME})
    # Archive the processed raw tweets before the temp collections are dropped.
    if temp_raw.count() > 0:
        copy_into_collection(temp_raw.find(no_cursor_timeout=True), coll)
    temp_results.drop()
    temp_raw.drop()
    client.close()
    print 'Finished'
    stop_timing()
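# A minimal sketch of the copy_into_collection helper used above; its real
# implementation is not shown in this file, so treat the body as an assumption.
# It drains the cursor in batches and bulk-inserts with pymongo's insert_many
# to limit round trips.
def copy_into_collection(cursor, destination_collection, batch_size=1000):
    batch = []
    for document in cursor:
        batch.append(document)
        if len(batch) >= batch_size:
            destination_collection.insert_many(batch)
            batch = []
    if batch:
        destination_collection.insert_many(batch)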
def execute():
    data_files = get_files_in_dir(TEMP_PATH, JSON)
    l = len(data_files)
    print 'Started Preprocessing ' + str(l) + ' files... '
    start_timing()
    cnt = 0
    percent_interval = 1  # increment for the completion percent display
    display_percentage(cnt, l, percent_interval)
    for data_file in data_files:
        data_file_path = join(TEMP_PATH, data_file)
        tweets_data = extract_data(data_file_path)
        processed_tweets = process(tweets_data)
        insert_many(collection, processed_tweets)
        # Delete the source file only after its tweets are safely stored.
        remove(data_file_path)
        # Update the completion status.
        cnt += 1
        display_percentage(cnt, l, percent_interval)
    client.close()
    print
    print 'Finished'
    stop_timing()
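# A possible shape for the display_percentage helper, assuming it prints an
# in-place progress counter every `interval` percent; hypothetical, since the
# helper's source lives elsewhere in the project.
import sys

def display_percentage(done, total, interval):
    percent = int(done * 100.0 / total) if total else 100
    if percent % interval == 0:
        sys.stdout.write('\r%d%% complete' % percent)
        sys.stdout.flush()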
def execute():
    print 'Started URL Aggregation... '
    start_timing()
    topics = get_hot_topics()
    print topics
    aggregate_urls(topics)
    for topic_id in topics:
        results_coll_name = TOPIC_URL_AGGR_COLLECTION_NAME(topic_id)
        results_coll = topics_db[results_coll_name]
        # Keep only the highest-scoring URLs for this topic.
        top_urls = []
        results = results_coll.find(limit=NUMBER_OF_URLS_TO_EXTRACT) \
            .sort([(VALUE, DESCENDING)])
        for result in results:
            top_urls.append(result[GENERAL_ID_TAG])
        write_urls(topic_id, top_urls)
    client.close()
    print 'Finished'
    stop_timing()
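# A hedged sketch of write_urls, assuming it persists one plain-text file of
# URLs per topic. The output format, URLS_PATH, and the filename pattern are
# all assumptions for illustration, not the project's actual behaviour.
def write_urls(topic_id, urls):
    with open(join(URLS_PATH, 'topic_%d_urls.txt' % topic_id), 'w') as f:
        for url in urls:
            f.write(url + '\n')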
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()
    # Rebuild the per-topic tweets database from scratch.
    client.drop_database(TOPIC_TWEETS_DB_NAME)
    # Take the most frequent entities, ordered by their tweet counts.
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES,
                                       no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])
    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities
        # Collect every tweet mentioning any spelling of the entity; the most
        # frequent spelling becomes the entity's canonical form.
        max_tweets = 0
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c
        text = clean(text)
        # Assign the entity to the topic its tweets fit best.
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id
        save_to_collection()
    save_model_data()
    print 'Finished'
    stop_timing()
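# One plausible implementation of get_topic_for_entity, assuming the trained
# gensim LDA model and its dictionary are available as LDA and DICTIONARY.
# This is a sketch, not the project's actual scoring logic: it pools the
# entity's cleaned tweet tokens into a single bag-of-words and returns the
# topic with the highest inferred probability, ignoring the raw tweets.
def get_topic_for_entity(tokenized_texts, tweets):
    pooled = [token for doc in tokenized_texts for token in doc]
    bow = DICTIONARY.doc2bow(pooled)
    topic_scores = LDA.get_document_topics(bow)
    return max(topic_scores, key=lambda pair: pair[1])[0]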
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()
    hot_topics = [4, 5, 7, 8, 17]
    for topic in hot_topics:
        create_wordcloud(topic)
    print 'Finished'
    stop_timing()
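# A minimal sketch of create_wordcloud, assuming the `wordcloud` package is
# used to render one image per topic. get_topic_text and WORDCLOUD_PATH are
# hypothetical names standing in for the project's real helpers.
from wordcloud import WordCloud

def create_wordcloud(topic_id):
    text = get_topic_text(topic_id)  # hypothetical: all tweet text for the topic
    cloud = WordCloud(width=800, height=600).generate(text)
    cloud.to_file(join(WORDCLOUD_PATH, 'topic_%d.png' % topic_id))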
def execute():
    print 'Started LDA at ' + get_time() + '... ',
    start_timing()
    # Train the LDA topic model over the serialized corpus and persist it.
    lda = models.LdaModel(CORPUS, id2word=DICTIONARY,
                          num_topics=NUMBER_OF_TOPICS,
                          passes=NUMBER_OF_PASSES, alpha=ALPHA)
    lda.save(LDA_PATH)
    print 'Finished'
    stop_timing()
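# Usage sketch: reload the persisted model and print its learned topics for a
# quick sanity check. LdaModel.load and show_topics are standard gensim calls;
# only LDA_PATH comes from the code above.
lda = models.LdaModel.load(LDA_PATH)
for topic in lda.show_topics():
    print topic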
def execute():
    start_timing()
    print 'Starting Pre-processing for LDA...',
    documents = get_documents()
    tokenized_documents = clean(documents)
    # Build the token -> id mapping and remove gaps left by filtering.
    dictionary = corpora.Dictionary(tokenized_documents)
    dictionary.compactify()
    dictionary.save(DICTIONARY_PATH)
    # Convert each document to bag-of-words and serialize in Matrix Market format.
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)
    print 'Finished'
    stop_timing()
    client.close()
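# A hedged sketch of the clean step, assuming simple lowercasing, whitespace
# tokenization, and stop-word removal; the project's real cleaning rules are
# not shown in this file.
from gensim.parsing.preprocessing import STOPWORDS

def clean(documents):
    tokenized = []
    for document in documents:
        tokens = [token for token in document.lower().split()
                  if token.isalpha() and token not in STOPWORDS]
        tokenized.append(tokens)
    return tokenized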