def execute():
    print 'Started at ' + get_time() + '... ',
    start_timing()

    # Rebuild the topic/tweet database from scratch for this run.
    client.drop_database(TOPIC_TWEETS_DB_NAME)

    # Take the top entities, ordered by mention count (most frequent first).
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES, no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])

    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities

        # Gather every tweet that mentions any pseudonym of the entity and keep
        # the pseudonym with the most tweets as the canonical spelling.
        max_tweets = 0
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c

        # Clean the tweet text and assign the entity to its dominant topic.
        text = clean(text)
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id

    save_to_collection()
    save_model_data()

    print 'Finished'
    stop_timing()
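
# For context, a minimal sketch of the module-level handles and constants the
# function above assumes. The connection details, database/collection names,
# and field-name values below are illustrative guesses, not the project's
# actual configuration.
from pymongo import MongoClient, DESCENDING

client = MongoClient('localhost', 27017)                  # assumed connection
entity_results_coll = client['entities_db']['results']    # assumed DB/collection names
raw_collection = client['tweets_db']['raw_tweets']        # assumed DB/collection names

# Field-name constants referenced above (values are assumptions).
VALUE, COUNT, LOWER_ENTITY, PSEUDONYMS, ENTITIES, TWEET = \
    'value', 'count', 'lower_entity', 'pseudonyms', 'entities', 'tweet'

# Accumulators filled by execute() and persisted by save_to_collection().
entity_pseudos, actual_entity, entity_topic = {}, {}, {}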
def execute():
    start_timing()
    print 'Starting Pre-processing for LDA...',

    # Fetch the raw documents and tokenize/normalise them.
    documents = get_documents()
    tokenized_documents = clean(documents)

    # Build the token -> id mapping and persist it for later LDA runs.
    dictionary = corpora.Dictionary(tokenized_documents)
    dictionary.compactify()
    dictionary.save(DICTIONARY_PATH)

    # Convert each document to a bag-of-words vector and serialise the corpus.
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)

    print 'Finished'
    stop_timing()

    client.close()
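
# A typical follow-up step (a sketch, not part of the original module): load
# the serialised dictionary and corpus back and fit an LDA model with gensim.
# num_topics, passes, and MODEL_PATH are illustrative assumptions.
from gensim import corpora, models

dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
corpus = corpora.MmCorpus(CORPUS_PATH)

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)
lda.save(MODEL_PATH)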