Example #1
from pymongo import DESCENDING

# client, raw_collection, entity_results_coll, the key/name constants and the
# result dicts (entity_pseudos, actual_entity, entity_topic) are module-level
# definitions elsewhere in the project.
def execute():
    print('Started at ' + get_time() + '... ', end='')
    start_timing()

    client.drop_database(TOPIC_TWEETS_DB_NAME)
    results = entity_results_coll.find(limit=NUMBER_OF_TOP_ENTITIES, no_cursor_timeout=True) \
        .sort([(VALUE + '.' + COUNT, DESCENDING)])

    for result in results:
        tweets = []
        text = []
        lower_entity = result[LOWER_ENTITY]
        entities = result[VALUE][PSEUDONYMS]
        entity_pseudos[lower_entity] = entities

        # Keep the pseudonym that matches the most tweets as the canonical entity.
        max_tweets = 0
        for entity in entities:
            c = 0
            for tweet in raw_collection.find({ENTITIES: entity}):
                c += 1
                tweets.append(tweet)
                text.append(tweet[TWEET])
            if c > max_tweets:
                actual_entity[lower_entity] = entity
                max_tweets = c

        text = clean(text)
        topic_id = get_topic_for_entity(text, tweets)
        entity_topic[lower_entity] = topic_id

    save_to_collection()
    save_model_data()

    print('Finished')
    stop_timing()
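The clean() and get_topic_for_entity() helpers are defined elsewhere in the project. A rough sketch of what they might look like (simple_preprocess, STOPWORDS, the majority-vote strategy and the module-level lda are assumptions, not the project's actual code):

# Hypothetical sketch -- the real helpers live elsewhere in the project.
from collections import Counter
from gensim import models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

lda = models.LdaModel.load(LDA_PATH)  # assumed: trained model from the LDA step

def clean(texts):
    # Lowercase, tokenize and drop stopwords from each tweet text.
    return [[w for w in simple_preprocess(t) if w not in STOPWORDS]
            for t in texts]

def get_topic_for_entity(texts, tweets):
    # One plausible strategy: score every tweet against the trained LDA model
    # and return the topic that wins most often (tweets is unused here).
    votes = Counter()
    for tokens in texts:
        bow = DICTIONARY.doc2bow(tokens)
        topic_id, _ = max(lda[bow], key=lambda pair: pair[1])
        votes[topic_id] += 1
    return votes.most_common(1)[0][0]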
Example #2
def execute():
    print('Started at ' + get_time() + '... ', end='')
    start_timing()

    hot_topics = [4, 5, 7, 8, 17]  # hardcoded topic ids to visualize
    for topic in hot_topics:
        create_wordcloud(topic)
    print('Finished')
    stop_timing()
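create_wordcloud() is defined elsewhere in the project. A minimal sketch of how a per-topic word cloud can be built from the saved gensim LDA model with the wordcloud package (topn, the image size and the output file name are assumptions):

# Hypothetical sketch, assuming the wordcloud package and the saved LDA model.
from gensim import models
from wordcloud import WordCloud

def create_wordcloud(topic_id, topn=50):
    lda = models.LdaModel.load(LDA_PATH)
    # show_topic() returns (word, probability) pairs for the given topic.
    freqs = dict(lda.show_topic(topic_id, topn=topn))
    cloud = WordCloud(width=800, height=400).generate_from_frequencies(freqs)
    cloud.to_file('topic_%d.png' % topic_id)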
Example #3
from gensim import models

# CORPUS, DICTIONARY and the hyper-parameter constants are defined at module
# level elsewhere in the project.
def execute():

    print('Started LDA at ' + get_time() + '... ', end='')

    start_timing()

    lda = models.LdaModel(CORPUS, id2word=DICTIONARY,
                          num_topics=NUMBER_OF_TOPICS,
                          passes=NUMBER_OF_PASSES,
                          alpha=ALPHA)

    lda.save(LDA_PATH)

    print('Finished')
    stop_timing()
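CORPUS and DICTIONARY come from the project's preprocessing step; with gensim they are typically built along these lines (the toy texts below are placeholders for the cleaned tweets):

# Typical gensim preparation -- the sample texts are placeholders.
from gensim import corpora

texts = [['lda', 'topic', 'model'], ['tweet', 'stream', 'topic']]
DICTIONARY = corpora.Dictionary(texts)
CORPUS = [DICTIONARY.doc2bow(tokens) for tokens in texts]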
Example #4
        if tweets_cnt == MAX_TWEETS_IN_FILE:
            tweets_cnt = 0
            change_file()

        return True

    def on_error(self, status):
        print('Error: ' + str(status))  # status is an int HTTP status code


if __name__ == '__main__':

    file_name = get_filename(FILE_PATH, file_number)
    tweets_file = open(file_name, WRITE)

    print "Started extracting tweets at " + get_time() + "... "

    while True:  # ensures continuous stream extraction

        try:
            # This handles Twitter authentication and the connection to Twitter Streaming API

            listener = StdOutListener()
            auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
            auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

            stream = Stream(auth, listener)
            stream.filter(languages=[ENGLISH], track=FILTER_KEYWORDS)

        except Exception:  # TODO: handle specific stream errors
            continue  # restart the stream on any failure
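The snippet starts in the middle of StdOutListener, so the part that counts tweets and writes them to the current file is missing. A rough reconstruction of the class head against the tweepy 3.x streaming API (the on_data body and the global counter are assumptions):

# Hypothetical reconstruction of the elided class head (tweepy 3.x API).
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

class StdOutListener(StreamListener):

    def on_data(self, data):
        global tweets_cnt
        tweets_file.write(data)  # raw tweet JSON, one object per line
        tweets_cnt += 1

        # ... continues with the file-rotation check shown in the example above.
        return True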