Example #1
def execute():
    print 'Started Entity Aggregation... ',

    start_timing()

    map_function = Code(
        open(join(JAVASCRIPT_PATH, MAP_FUNCTION_FILENAME), 'r').read())
    reduce_function = Code(
        open(join(JAVASCRIPT_PATH, REDUCE_FUNCTION_FILENAME), 'r').read())
    aggregate_map_function = Code(
        open(join(JAVASCRIPT_PATH, AGGREGATION_MAP_ADD_FUNCTION_FILENAME),
             'r').read())
    aggregate_reduce_function = Code(
        open(join(JAVASCRIPT_PATH, AGGREGATION_REDUCE_FUNCTION_FILENAME),
             'r').read())

    # First pass: map-reduce the raw batch into a temporary results collection.
    temp_raw.map_reduce(map_function, reduce_function,
                        TEMP_RESULTS_COLLECTION_NAME)
    # Second pass: fold the temporary results into the permanent results
    # collection; the 'reduce' output mode merges with existing documents
    # instead of replacing them.
    temp_results.map_reduce(aggregate_map_function, aggregate_reduce_function,
                            {'reduce': RESULTS_COLLECTION_NAME})

    # Archive the processed raw documents before the temporary collections are dropped.
    if temp_raw.count() > 0:
        copy_into_collection(temp_raw.find(no_cursor_timeout=True), coll)

    temp_results.drop()
    temp_raw.drop()

    client.close()

    print 'Finished'
    stop_timing()
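
This example relies on module-level objects (client, temp_raw, temp_results, coll) and file-name constants defined elsewhere in the project. A minimal sketch of what that setup might look like is below; the database, collection, directory and file names are assumptions, and Collection.map_reduce() requires pymongo older than 4.0:

from os.path import join
from pymongo import MongoClient
from bson.code import Code

JAVASCRIPT_PATH = 'javascript'                   # assumed directory holding the map/reduce JS files
MAP_FUNCTION_FILENAME = 'map.js'                 # assumed file names
REDUCE_FUNCTION_FILENAME = 'reduce.js'
AGGREGATION_MAP_ADD_FUNCTION_FILENAME = 'aggregation_map_add.js'
AGGREGATION_REDUCE_FUNCTION_FILENAME = 'aggregation_reduce.js'

TEMP_RESULTS_COLLECTION_NAME = 'temp_entity_results'   # assumed collection names
RESULTS_COLLECTION_NAME = 'entity_results'

client = MongoClient()                           # connects lazily to localhost:27017
db = client['tweets']                            # assumed database name
temp_raw = db['temp_raw']
temp_results = db[TEMP_RESULTS_COLLECTION_NAME]
coll = db['raw_tweets']                          # archive of processed raw documents
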
Example #2
def execute():
    data_files = get_files_in_dir(TEMP_PATH, JSON)
    l = len(data_files)
    print 'Started Preprocessing ' + str(l) + ' files... '
    start_timing()

    cnt = 0
    percent_interval = 1  # increment for the completion percent display
    display_percentage(cnt, l, percent_interval)

    for data_file in data_files:
        data_file_path = join(TEMP_PATH, data_file)
        tweets_data = extract_data(data_file_path)
        processed_tweets = process(tweets_data)

        insert_many(collection, processed_tweets)
        remove(data_file_path)

        # updating completion status
        cnt += 1
        display_percentage(cnt, l, percent_interval)

    client.close()
    print
    print 'Finished'

    stop_timing()
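
get_files_in_dir(), extract_data(), process(), insert_many() and display_percentage() are project helpers not shown on this page. For illustration, a plausible (assumed, not the project's actual code) display_percentage() that reprints the completion percentage on a single line could look like this:

import sys

def display_percentage(done, total, interval):
    # Print the completion percentage in place, at most every `interval` percent.
    if total == 0:
        return
    percent = done * 100 // total
    if percent % interval == 0:
        sys.stdout.write('\r%d%%' % percent)
        sys.stdout.flush()
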
Example #3
def execute():
    print 'Started Entity Aggregation... '
    start_timing()

    topics = get_hot_topics()
    print topics
    aggregate_urls(topics)

    for topic_id in topics:
        results_coll_name = TOPIC_URL_AGGR_COLLECTION_NAME(topic_id)
        results_coll = topics_db[results_coll_name]

        top_urls = []

        results = results_coll.find(limit=NUMBER_OF_URLS_TO_EXTRACT).sort([
            (VALUE, DESCENDING)
        ])
        for result in results:
            top_urls.append(result[GENERAL_ID_TAG])

        write_urls(topic_id, top_urls)

    client.close()

    print 'Finished'
    stop_timing()
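
The query reads each per-topic aggregation collection and keeps the ids of the top documents sorted by their aggregated value. The constants it uses are defined elsewhere; a sketch with assumed values, based on MongoDB's standard map-reduce output format ({_id, value}):

from pymongo import DESCENDING

VALUE = 'value'                    # field holding the aggregated score
GENERAL_ID_TAG = '_id'             # the URL itself is assumed to be the document _id
NUMBER_OF_URLS_TO_EXTRACT = 10     # assumed cut-off for the top-URL list

def TOPIC_URL_AGGR_COLLECTION_NAME(topic_id):
    # Assumed naming convention for the per-topic aggregation collections.
    return 'topic_%s_url_aggregation' % topic_id
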
Example #4
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()

    client.drop_database(TOPIC_TWEETS_DB_NAME)
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES, no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])

    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities

        # Pick the pseudonym (spelling variant) that matches the most tweets
        # and remember it as the canonical form of this entity.
        max_tweets = 0
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c

        text = clean(text)
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id

    save_to_collection()
    save_model_data()

    print 'Finished'
    stop_timing()
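
The loop groups the raw tweets of every top entity by its spelling variants (pseudonyms), picks the variant with the most tweets as the canonical form, and assigns the entity to a topic. It writes into module-level dictionaries and field-name constants defined elsewhere; a hedged sketch of that setup, with assumed names:

from pymongo import MongoClient

client = MongoClient()
entity_results_coll = client['tweets']['entity_results']   # assumed database/collection names
raw_collection = client['tweets']['raw_tweets']

LOWER_ENTITY = 'lower_entity'   # assumed document field names
VALUE = 'value'
COUNT = 'count'
PSEUDONYMS = 'pseudonyms'
ENTITIES = 'entities'
TWEET = 'tweet'
NUMBER_OF_TOP_ENTITIES = 100    # assumed cut-off

entity_pseudos = {}   # lower-cased entity -> list of spelling variants
actual_entity = {}    # lower-cased entity -> most frequent spelling
entity_topic = {}     # lower-cased entity -> assigned topic id
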
Example #5
def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()

    hot_topics = [4, 5, 7, 8, 17]
    for topic in hot_topics:
        create_wordcloud(topic)
    print 'Finished'
    stop_timing()
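
create_wordcloud() is not shown on this page. A hedged sketch of how a per-topic word cloud could be built from the saved LDA model with the wordcloud package; the paths and parameters are assumptions, and show_topic() returns (word, probability) pairs in recent gensim versions:

from gensim import models
from wordcloud import WordCloud

LDA_PATH = 'models/lda.model'       # assumed model path

def create_wordcloud(topic_id, num_words=30):
    lda = models.LdaModel.load(LDA_PATH)
    frequencies = dict(lda.show_topic(topic_id, topn=num_words))
    cloud = WordCloud(width=800, height=600, background_color='white')
    cloud.generate_from_frequencies(frequencies)
    cloud.to_file('wordcloud_topic_%d.png' % topic_id)
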
Example #6
def execute():

    print 'Started LDA at ' + get_time() + '... ',

    start_timing()

    lda = models.LdaModel(CORPUS, id2word=DICTIONARY,
                          num_topics=NUMBER_OF_TOPICS,
                          passes=NUMBER_OF_PASSES,
                          alpha=ALPHA)

    lda.save(LDA_PATH)

    print 'Finished'
    stop_timing()
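
CORPUS and DICTIONARY are produced by the pre-processing example shown below, which saves them to disk. A sketch of how they might be loaded back, with assumed paths and hyper-parameter values:

from gensim import corpora

DICTIONARY_PATH = 'models/dictionary.dict'   # assumed paths
CORPUS_PATH = 'models/corpus.mm'
LDA_PATH = 'models/lda.model'

DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH)
CORPUS = corpora.MmCorpus(CORPUS_PATH)

NUMBER_OF_TOPICS = 20      # assumed hyper-parameters
NUMBER_OF_PASSES = 10
ALPHA = 'auto'             # gensim also accepts an explicit float or array
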
Example #7
def execute():
    start_timing()
    print 'Starting Pre-processing for LDA...',

    documents = get_documents()
    tokenized_documents = clean(documents)

    dictionary = corpora.Dictionary(tokenized_documents)
    dictionary.compactify()
    dictionary.save(DICTIONARY_PATH)

    corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)

    print 'Finished'
    stop_timing()

    client.close()
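
get_documents() and clean() are project helpers that are not shown here. For illustration only, a plausible (assumed) clean() that tokenises, lower-cases and removes stop words, producing the token lists consumed by doc2bow() above:

import re

STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'is', 'rt'}

def clean(documents):
    tokenized = []
    for doc in documents:
        tokens = re.findall(r'[a-z]+', doc.lower())
        tokenized.append([t for t in tokens if t not in STOP_WORDS and len(t) > 2])
    return tokenized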