Example #1
def main():
    tweets_path = config_lda.get("input_file")
    tweets_clean = load_tweets(tweets_path)

    dictionary = corpora.Dictionary(tweets_clean)
    dtm = [dictionary.doc2bow(text) for text in tweets_clean]
    if reduce:  # 'reduce' presumably comes from the configuration; it is not defined in this excerpt
        tweets_clean = remove_terms_low_median(tweets_clean, dictionary, dtm)
        # Rebuild the dictionary and document-term matrix after pruning terms
        dictionary = corpora.Dictionary(tweets_clean)
        dtm = [dictionary.doc2bow(text) for text in tweets_clean]

    k = config_lda.get("topics")
    if k == 0:
        k = calculate_num_topics()

    ldamodel = gensim.models.LdaMulticore(dtm,
                                          num_topics=k,
                                          id2word=dictionary,
                                          passes=config_lda.get("passes", 20))
    topics_dict = {}
    for topic in ldamodel.show_topics(formatted=False,
                                      num_topics=k,
                                      num_words=config_lda.get("words", 10)):
        topic_words = []
        for pair in topic[1]:
            topic_words.append((pair[0], str(pair[1])))
        topics_dict['topic_' + str(topic[0])] = topic_words

    output_path = config_lda.get("output_file", "./topics_results.json")
    with open(output_path, "w") as json_file:
        json_file.write(json.dumps(topics_dict))
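For reference, the gensim calls used in this example can be exercised end to end on a tiny hand-made corpus. This is a minimal sketch: the token lists stand in for the cleaned tweets returned by load_tweets, and the parameters are chosen only for illustration.

from gensim import corpora
from gensim.models import LdaMulticore

# Toy corpus: each document is a list of tokens, like tweets_clean above.
docs = [["stay", "home", "wash", "hands"],
        ["wash", "hands", "often"],
        ["stay", "safe", "stay", "home"]]

dictionary = corpora.Dictionary(docs)            # maps each token to an integer id
dtm = [dictionary.doc2bow(doc) for doc in docs]  # per-document (token_id, count) pairs

lda = LdaMulticore(dtm, num_topics=2, id2word=dictionary, passes=5)
for topic_id, words in lda.show_topics(formatted=False, num_topics=2, num_words=4):
    print(topic_id, [(word, round(prob, 3)) for word, prob in words])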
Example #2
        str2 = set(str2.split())
        try:
            return float(len(str1 & str2)) / len(str1 | str2)
        except ZeroDivisionError:
            return 0
    

    # set up the verbose helper
    if opts.verbose:
        def verbose(*args):
            print(*args)
    else:
        verbose = lambda *a: None

    # Collect the tweets and their identifiers (tweet id and user id)
    users, ids = load_tweets(opts.DIR, opts.format, False)
    #print (users)
    # strip the links out of the tweets
    clean_users = [[re.sub(r'\bhttps?:\/\/.*[\r\n]*', u'', i) for i in x] for x in users]
    # display them
    histogram_list = []

    for tweets in clean_users:
        #print(tweets)
        
        tweets_1 = list(tweets)
        tweets_2 = list(tweets)
        
        #tweets_1 = list(clean_tweets_2)
        #tweets_2 = list(clean_tweets_2)
        #print(tweets_1)
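The truncated helper at the top of this example appears to compute the Jaccard similarity of two tweets over their word sets. A self-contained sketch of that computation (the function name jaccard is illustrative, not taken from the project):

def jaccard(str1, str2):
    # Jaccard similarity over word sets: |intersection| / |union|
    str1, str2 = set(str1.split()), set(str2.split())
    try:
        return float(len(str1 & str2)) / len(str1 | str2)
    except ZeroDivisionError:
        return 0  # both tweets were empty

print(jaccard("stay home and wash hands", "wash your hands"))  # 2 shared words / 6 total ~= 0.333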
Example #3
        help="Define el valor minimo de cuentas ")
    p.add_argument("-v", "--verbose",
        action="store_true", dest="verbose",
        help="Verbose mode [Off]")
    opts = p.parse_args()
    

    # set up the verbose helper
    if opts.verbose:
        def verbose(*args):
            print(*args)
    else:
        verbose = lambda *a: None

    # Collect the tweets and their identifiers (tweet id and user id)
    tweets, ids = load_tweets(opts.DIR, opts.format, mix=opts.mix)

    # Print some information about the tweets
    if opts.verbose:
        for i, tweet in enumerate(tweets[:10]):
            verbose('Tweet example', i + 1, tweet[:100])
        verbose("Total tweets   : ", len(tweets))
        try:
            verbose("Total users    : ", len(set([id for x, id in ids])))
        except ValueError:
            verbose("Total users    : ", len(ids))

    # Compute the features
    # put the stop words into a list
    if not opts.stopwords:
        my_stop_words = []
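The verbose pattern used here, binding verbose either to a real printer or to a no-op depending on the --verbose flag, can be shown standalone. This is a minimal sketch, not the project's code:

import argparse

p = argparse.ArgumentParser()
p.add_argument("-v", "--verbose", action="store_true", dest="verbose",
               help="Verbose mode [Off]")
opts = p.parse_args([])  # empty argv here, just for illustration

# Same effect as the if/else above: print when verbose, otherwise do nothing.
verbose = print if opts.verbose else (lambda *args: None)
verbose("only shown when --verbose is passed")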
Example #4
                   "--verbose",
                   action="store_true",
                   dest="verbose",
                   help="Verbose mode [Off]")
    opts = p.parse_args()

    # set up the verbose helper
    if opts.verbose:

        def verbose(*args):
            print(*args)
    else:
        verbose = lambda *a: None

    # Collect the tweets and their identifiers (tweet id and user id)
    tweets, ids = load_tweets(opts.DIR, opts.format, mix=opts.mix)

    # Print some information about the tweets
    if opts.verbose:
        for i, tweet in enumerate(tweets[:10]):
            verbose('Tweet example', i + 1, tweet[:100])
        verbose("Total tweets   : ", len(tweets))
        try:
            verbose("Total usuarios : ", len(set([id for x, id in ids])))
        except ValueError:
            verbose("Total usuarios : ", len(ids))

    # Compute the features
    # put the stop words into a list
    if not opts.stopwords:
        my_stop_words = []
Example #5
        str2 = set(str2.split())
        try:
            return float(len(str1 & str2)) / len(str1 | str2)
        except ZeroDivisionError:
            return 0

    # set up the verbose helper
    if opts.verbose:

        def verbose(*args):
            print(*args)
    else:
        verbose = lambda *a: None

    # Collect the tweets and their identifiers (tweet id and user id)
    users, ids = load_tweets(opts.DIR, opts.format, False)
    #print (users)
    # strip the links out of the tweets
    clean_users = [[re.sub(r'\bhttps?:\/\/.*[\r\n]*', u'', i) for i in x]
                   for x in users]
    # display them
    histogram_list = []

    for tweets in clean_users:
        #print(tweets)

        tweets_1 = list(tweets)
        tweets_2 = list(tweets)

        #tweets_1 = list(clean_tweets_2)
        #tweets_2 = list(clean_tweets_2)
Example #6
from load_tweets import load_tweets
from feature_extractors import *
from stats_and_plots import *

import matplotlib.pyplot as plt

tweets = load_tweets('data/sample1.txt')
tdm, vocab = word_counts(tweets)

print(most_used(tdm, vocab, 25))
print(least_used(tdm, vocab, 25))

plt.show(word_bar(tdm, vocab, 25))

feats = {}
feats['num_hashtags'] = num_hashtags(tweets)
feats['reply'] = reply(tweets)
feats['length'] = length(tweets)
feats['num_retweets'] = num_retweets(tweets)

print(most_hashtags(feats))
plt.show(hashtag_hist(feats))

print(proportion_replies(feats))

print(avg_tweet_length(tdm))
plt.show(length_hist(feats))

print(avg_retweets(feats))
plt.show(num_retweets_hist(feats))
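The feature extractors imported at the top are not shown in this excerpt. As a rough illustrative stand-in for one of them, assuming each tweet is a dict with a 'text' field (the convention used in the next excerpt), num_hashtags could simply count '#' characters per tweet:

def num_hashtags(tweets):
    # One count per tweet; '#' occurrences are a simple proxy for hashtags.
    return [tweet['text'].count('#') for tweet in tweets]

print(num_hashtags([{'text': 'stay home #covid #stayhome'}, {'text': 'no tags here'}]))  # [2, 0]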
Example #7
    :return: dictionary containing the word and its count
    '''
    dict = {}
    cv = CountVectorizer()
    text = [t['text'] for t in tweets]
    tdm = cv.fit_transform(text)
    word_counts = tdm.sum(axis=0).tolist()[0]

    assert (len(cv.get_feature_names()) == len(word_counts))
    return zip(cv.get_feature_names(), word_counts)


if __name__ == '__main__':

    print('Loading...')
    samples = [load_tweets('data/sample%d.txt' % i) for i in range(1, 4)]
    print('Loaded.')

    print('Cleaning...')
    for i, tweets in enumerate(samples):
        tweets = remove_stop_words(tweets)
        tweets = remove_punctuation(tweets)
        tweets = remove_non_english(tweets)
        tweets = remove_links(tweets)
        tweets = remove_digits(tweets)
        tweets = remove_empty(tweets)
        samples[i] = tweets  # write the cleaned tweets back so the cleaning takes effect
    print('Cleaned.')

    print('Extract Trends')
    trend_extract_term_frequency(samples)
    print('Extracted')
Example #8
if __name__ == "__main__":
    warnings.simplefilter("ignore")

    tweets = None
    sentiment_dir = "../sentiment/"
    sentiment_models = {
        "text_blob": find_text_blob_sentiment,
        "vader": find_vader_sentiment,
    }

    for model_name, model_function in sentiment_models.items():
        sentiment_path = os.path.join(sentiment_dir, model_name) + ".pickle"
        if not os.path.exists(sentiment_path):
            if tweets is None:
                tweets = load_tweets()
                tweets = list(tweets.items())

            # map the sentiment model selected by this loop iteration over all tweets
            results = Parallel(model_function, tweets, model_name)

            sentiment = {tweet_id: value for tweet_id, value in results}
            save_pickle(sentiment, sentiment_path)

    model_name = "flair"
    sentiment_path = os.path.join(sentiment_dir, model_name) + ".pickle"
    if not os.path.exists(sentiment_path):
        if tweets is None:
            tweets = load_tweets()
            tweets = list(tweets.items())
        sentiment = find_flair_sentiment(tweets, chunk_len=100000)
        sentiment_models[model_name] = sentiment
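The per-tweet sentiment helpers are not shown in this excerpt. Illustrative stand-ins built on the public TextBlob and VADER APIs, under the assumption that each work item is a (tweet_id, text) pair as suggested by tweets.items(), might look like this:

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_vader = SentimentIntensityAnalyzer()

def find_text_blob_sentiment(item):
    tweet_id, text = item
    return tweet_id, TextBlob(text).sentiment.polarity          # polarity in [-1, 1]

def find_vader_sentiment(item):
    tweet_id, text = item
    return tweet_id, _vader.polarity_scores(text)["compound"]   # compound score in [-1, 1]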
Example #9
    # Separate data into train, validation and test sets
    data, labels = load_polarity()
    class_names = ['negatif', 'positif']
    size_data = len(labels)
    train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
        data, labels, test_size=.9, random_state=42)
    train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.1, random_state=42)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)
    val_labels = np.array(val_labels)

elif dataset == 'tweet':
    # For Spanish tweets use load_tweets.load_tweets_es(); otherwise use load_tweets.load_tweets()
    class_names = load_tweets.transform_emoji()
    train, train_labels = load_tweets.load_tweets()
    train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.9, random_state=42)
    train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
        train, train_labels, test_size=.2, random_state=42)
    size_data = len(train_labels) + len(test_labels)

elif 'bert' in dataset:
    class_names = ['negative', 'positive']
    df = pd.read_csv(
        'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
        delimiter='\t',
        header=None)
    # Number of sentences from the train set used to train the BERT model (limited because training is slow)
    batch_1 = df[:size_batch_bert]
    train_vectors = batch_1[0]
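The two-stage splitting used in the branches above can be checked on a toy array: the first train_test_split keeps only 10% of the data for training (test_size=.9), and the second carves a validation set out of that remainder. A minimal sketch with the proportions from the load_polarity branch:

import numpy as np
import sklearn.model_selection

data = np.arange(100)
labels = (data % 2).tolist()

train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
    data, labels, test_size=.9, random_state=42)
train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
    train, train_labels, test_size=.1, random_state=42)

print(len(train), len(val), len(test))  # 9 1 90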