Exemplo n.º 1
0
def combine_users_tweets():
    """Clean the tweets of one stored timeline and append them to "temp".

    Timelines are read from ``get_users_timeline_from_db()``. The first nine
    are skipped. For the tenth, the timeline's ``user_id`` is appended to
    "temp" via ``DataOperations.append_to_file``, followed by one token
    array per tweet via ``DataOperations.append_arr_to_file``. The function
    then returns, so at most one timeline is processed per call.

    Each tweet's ``text`` goes through these steps in order:

    1. non-ASCII characters are dropped;
    2. URL matches are handed to ``remove_entities`` together with the text;
    3. ``removeSpecialChars``, ``splitToTokens``, ``removeStopwords`` and
       ``removeSingles`` of a ``TextProcesor`` instance are applied.

    Returns:
        None. Results are only emitted through ``DataOperations``.
    """
    # Raw string: the pattern contains "\(" and "\)", which are invalid
    # escape sequences in a plain literal. The runtime value is unchanged.
    urls_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    for position, tl in enumerate(get_users_timeline_from_db(), start=1):
        # Skip the first nine timelines.
        if position < 10:
            continue

        DataOperations.append_to_file("temp", str(tl['user_id']))

        # One processor is enough for the whole timeline; it is only used
        # through calls that take a value and return the transformed value.
        processor = TextProcesor.TextProcesor()

        for tweet in tl['timeline']:
            # encode() returns a new object instead of modifying the string,
            # so the result has to be kept. Decoding again keeps the value a
            # text string for the steps that follow.
            lean_text = tweet['text'].encode('ascii', 'ignore').decode('ascii')
            lean_text = remove_entities(lean_text, urls_pattern)
            lean_text = processor.removeSpecialChars(lean_text)

            tokens = processor.splitToTokens(lean_text)
            tokens = processor.removeStopwords(tokens)
            tokens = processor.removeSingles(tokens)
            DataOperations.append_arr_to_file("temp", tokens)

        # NOTE(review): this return sits inside the timeline loop, so the
        # function stops after the first timeline it processes. Together
        # with the skip above this looks like leftover debugging scaffolding;
        # confirm before removing it.
        return