class PtBrTwitter:
    def __init__(self, dir_in, dir_out):
        self.dir_in = dir_in
        self.dir_out = dir_out
        self.tw_files = ([file for root, dirs, files in os.walk(self.dir_in)
                          for file in files if file.endswith('.json')])
        self.doc_list = list()
        self.date_list = list()
        self.tp = TextProcessor()

    def read(self):
        for tw_file in self.tw_files:
            with open(self.dir_in + tw_file) as data_file:
                for line in data_file:
                    tweet = json.loads(line)
                    self.doc_list.append(tweet['text'])
                    self.date_list.append(tweet['created_at'])

    def tokenizeAndSave(self, file_name):
        tweets = self.tp.text_process(self.doc_list)
        tweets = list(itertools.chain.from_iterable(tweets))
        t_count = Counter(tweets)
        with open(self.dir_out + file_name, 'wb') as handle:
            pickle.dump(t_count, handle)

    def loadCounter(self, file_name):
        with open(self.dir_out + file_name, 'rb') as handle:
            t_count = pickle.load(handle)
        return t_count
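# Hedged usage sketch for PtBrTwitter (not part of the original source): the directory paths
# and pickle file name below are hypothetical, and it assumes TextProcessor plus the
# os/json/itertools/pickle/Counter imports used by the class are available in this module.
if __name__ == '__main__':
    pbt = PtBrTwitter("/path/to/tweets_json/", "/path/to/output/")
    pbt.read()                                # load 'text' and 'created_at' from every *.json file
    pbt.tokenizeAndSave("token_counter.pck")  # tokenize and pickle a Counter of all tokens
    t_count = pbt.loadCounter("token_counter.pck")
    print(t_count.most_common(10))            # most frequent tokens in the collection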
file = "/Users/lucasso 1/Documents/validation/nao_politicos.json" data = { 'favorites': [], 'user_id': [], 'text': [], 'retweets': [], 'created_at': [], 'tweet_id': [], 'user_screen_name': [] } for t in load_tweets(file): data['user_id'].append(t['user_id']) data['favorites'].append(t['favorites']) data['text'].append(t['text']) data['retweets'].append(t['retweets']) data['created_at'].append(t['created_at']) data['tweet_id'].append(t['tweet_id']) data['user_screen_name'].append(t['user_screen_name']) df = pd.DataFrame(data) df['created_at'] = pd.to_datetime(df['created_at'], unit='ms') df = df.set_index('created_at') df = df.sort_index(ascending=True) tp = TextProcessor() df['text_processed'] = tp.text_process(df.text.tolist(), hashtags=True) df['political'] = 0 file = file.replace('json', 'pck') df.to_pickle(file)
tp = TextProcessor()
id_rep, names = rt.names_from_xls()
for idx in range(len(names)):
    tweets = list()
    graphs = list()
    tw = nx.Graph()
    data = rt.tweets_election_data(id_rep[idx])
    diction = {k: v for (k, v) in data.items()}
    for i in diction:
        tweets.append(list(itertools.chain.from_iterable(tp.text_process(diction[i].split()))))
    # build a word co-occurrence graph: every pair of words in the same tweet adds 1 to the edge weight
    for tweet in tweets:
        for u, v in itertools.combinations(tweet, 2):
            if tw.has_edge(u, v):
                tw[u][v]['weight'] += 1
            else:
                tw.add_edge(u, v, weight=1)
    nx.write_gml(tw, dir_out + names[idx] + ".gml")

files = list()
for file in os.listdir(dir_out):
    if file.endswith(".gml"):
        files.append(file)
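# Hedged follow-up sketch (not in the original script): read one of the .gml co-occurrence
# graphs back and list its heaviest edges, i.e. the word pairs most often used together.
# The file name is hypothetical; networkx is assumed to be imported as nx.
g = nx.read_gml(dir_out + "example_parliamentarian.gml")
top_pairs = sorted(g.edges(data='weight'), key=lambda e: e[2], reverse=True)[:10]
for u, v, w in top_pairs:
    print("%s -- %s: co-occurrence weight %d" % (u, v, w))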
with open(dir_out + "random-pck/coleta1.pck", 'rb') as handle:
    random_pck.append(pickle.load(handle))
with open(dir_out + "random-pck/coleta2.pck", 'rb') as handle:
    random_pck.append(pickle.load(handle))

month_files = list()
for m in range(10):
    month_files.append(load_files(dir_out + "tw_month/month_" + str(m) + "/"))

tp = TextProcessor()
month_processed = list()
for tw in month_files:
    tmp = list()
    for dep in tw:
        tmp.append(tp.text_process(dep, text_only=True))
    month_processed.append(tmp)

ranked_month = list()
for i, month in enumerate(month_processed):
    tmp = tfidf_month(month, random_pck)
    ranked_month.append(tmp)
    save_pck(dir_out + "tw_month/month_" + str(i) + "/", tmp)

tfidf = TfIdf()
# compute the tf-idf for each folder against the random sample, according to the size of each month
if excel:
    sheet_name = "nao_eleitos"
    col = 4
    rep_dic = {}
    for fname in tw_files:
        rep_dic[fname.split('_', 1)[0]] = fname
    xls = xlrd.open_workbook(dir_xls)
    sheet = xls.sheet_by_name(sheet_name)
    for i in range(sheet.nrows):
        id_rep = str(int(sheet.cell_value(rowx=i, colx=col)))
        if id_rep in rep_dic:
            with open(dir_in + rep_dic[id_rep]) as data_file:
                print('file %s' % data_file)
                for line in data_file:
                    tweet = json.loads(line)
                    tweet['text_processed'] = ' '.join(tp.text_process([tweet['text']], text_only=True)[0])
                    tweet['cond_55'] = sheet_name
                    db.tweets.insert(tweet)
else:
    for tw_file in tw_files:
        with open(dir_in + tw_file) as data_file:
            print('file %s' % data_file)
            for line in data_file:
                tweet = json.loads(line)
                tweet['text_processed'] = ' '.join(tp.text_process([tweet['text']], text_only=True)[0])
                tweet['cond_55'] = 'nao_eleitos'
                db.tweets.insert(tweet)
f = open(dir_down + "sanders-twitter-0.2/full-corpus.csv", "rt")
twitter = csv.reader(f, delimiter=',')
tweets = list()
for tw in twitter:
    tweets.append(tw)
f.close()
random.shuffle(tweets)

topic = list()
txt = list()
for tw in tweets:
    topic.append(tw[0])
    txt.append(tw[4])

txt = tp.text_process(txt, lang="english")
with open(dir_out + "sanders_twitter.pck", 'wb') as handle:
    pickle.dump(txt, handle)

f = open(dir_out + "sanders_twitter.txt", 'w')
for l in txt:
    f.write(" ".join(l) + "\n")
f.close()

f = open(dir_out + "topic_sanders_twitter.txt", 'w')
for t in topic:
    f.write(t + "\n")
f.close()
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_rob = path['dir_rob']

    doc_list, parl_tw_list = load_files(dir_rob)
    tp = TextProcessor()
    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))

    with open(dir_in + "coleta1.pck", 'rb') as handle:
        coleta1 = pickle.load(handle)
    with open(dir_in + "coleta2.pck", 'rb') as handle:
        coleta2 = pickle.load(handle)

    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed))))
    tot_counter = Counter(tweets)
    parl_counters = list()
    for parl in parl_tw_processed:
        tw = list(itertools.chain.from_iterable(parl))
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")
tp = TextProcessor()
texts = list()
tx_class = list()

tmp = list()
with open(POLITICS_FILE) as l_file:
    for line in l_file:
        tmp.append(line)
        tx_class.append('politics')
texts += tp.text_process(tmp, text_only=True)

tmp = list()
with open(NON_POLITICS_FILE) as l_file:
    for line in l_file:
        tmp.append(line)
        tx_class.append('non-politics')
texts += tp.text_process(tmp, text_only=True)

texts = select_texts(texts)
vocab = gen_vocab(word2vec_model)
X, y = gen_sequence(vocab, texts, tx_class)
col = 4
rt = ReadTwitter(dir_in, excel_path, sheet_name, col)
tp = TextProcessor()
id_rep, names = rt.names_from_xls()
parl_words = Counter()
counter_list = list()
tw_apriori = list()
for idx in range(len(names)):
    tweets = list()
    data = rt.tweets_election_data(id_rep[idx])
    for k, v in data.items():
        tweets.append(tp.text_process(v.split()))
    tw_apriori += [[x[0] for x in e if x] for e in tweets if e]
    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(tweets))))
    counter_list.append(Counter(tweets))
    parl_words.update(tweets)

# entropy of each word across the per-parliamentarian counters
word_ent = dict()
for word, count in parl_words.items():
    ent = 0
    for counter in counter_list:
        prob = counter[word] / count
        ent += prob * (-np.log2(prob + 1e-100))
    word_ent[word] = ent
sort = sorted(word_ent.items(), key=lambda x: x[1], reverse=True)
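# Illustrative sketch (not part of the pipeline above): the entropy loop treats a word's counts
# across the per-parliamentarian counters as a distribution, so a word used evenly by everyone
# scores high and a word concentrated in one account scores near zero. The toy counters below
# are made-up data, used only to show the formula.
import numpy as np
from collections import Counter

toy_counters = [Counter({'saude': 3, 'futebol': 1}),
                Counter({'saude': 3}),
                Counter({'saude': 2, 'futebol': 5})]
totals = Counter()
for c in toy_counters:
    totals.update(c)

for word, count in totals.items():
    ent = 0.0
    for c in toy_counters:
        prob = c[word] / count
        ent += prob * (-np.log2(prob + 1e-100))
    print(word, round(ent, 3))  # 'saude' is spread over all three counters, 'futebol' over two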
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list, text_only=True)
    tw_words = add_separator(tweets)
    parl_bigrams = get_bigrams(tw_words, 3, True)

    # process each congressperson's tweets
    parl_processed = list()
    parl_tri_processed = list()
    for l in parl_tw_list:
        temp = add_separator(tp.text_process(l, text_only=True))
        parl_tri_processed.append(get_trigrams(temp, 3, True))
        parl_processed.append(get_bigrams(temp, 3, True))

    with open(dir_out + "list_dept_bigrams_.pck", 'wb') as handle:
        pickle.dump(parl_processed, handle)
    with open(dir_out + "list_dept_trigrams_.pck", 'wb') as handle:
        pickle.dump(parl_tri_processed, handle)
    with open(dir_in + tw_file) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            doc_list.append(tweet['text'])
    return doc_list


if __name__ == '__main__':
    dir_in = "/Users/lucasso/Documents/pck/"
    dir_ent = "/Users/lucasso/Documents/tweets_pedro/"
    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/"

    doc_list = load_files(dir_ent)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list)
    # word list already processed, with entropy-0 and ratio > 1 words excluded;
    # drop every other word of no interest from the tweets
    word_list = set(load_file(dir_out, "word_list.pck"))
    tweets = [[i for i in t if i in word_list] for t in tweets]

    hashtags = re.compile(r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""")
    hs_set = set()
    hastgs_list = list()
    for tweet in tweets:
        v = ','.join(hashtags.findall(' '.join(tweet)))
        l = hashtags.findall(' '.join(tweet))
        hastgs_list.append(l)
        hs_set |= set(v.split(","))
    hastgs_list = [e for e in hastgs_list if e]  # remove the empty lists
cf = configparser.ConfigParser()
cf.read("../file_path.properties")
path = dict(cf.items("file_path"))
dir_out = path['dir_out']
dir_ale = path['dir_ale']
dir_tw = path['dir_tw']

print("load tweet files")
fnames = ([file for root, dirs, files in os.walk(dir_tw)
           for file in files if file.endswith('.json')])
categories_tw = list()
tp = TextProcessor()
for fl in fnames:
    categories_tw.append(tp.text_process(read_tweets(fl)))

# hold out 20% of each category as test data and build a Counter over the remaining 80%
categories_counter = list()
test_data = list()
for categ in categories_tw:
    k = int(len(categ) * 0.2)
    random.shuffle(categ)
    tmp = list(itertools.chain.from_iterable(categ[k:]))
    categories_counter.append(Counter(tmp))
    test_data.append(categ[:k])

print("process tfidf")
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
plot_cloud(sort_tfidf_like, n, "dic_tfidf_like")

dir_path = "/Users/lucasso/Documents/tweets/"
tp = TextProcessor()
tw_files = ([file for root, dirs, files in os.walk(dir_path)
             for file in files if file.endswith('.json')])
tw_list = list()
tweets = list()
for tw_file in tw_files:
    with open(dir_path + tw_file) as data_file:
        doc_list = list()
        for line in data_file:
            tweet = json.loads(line)
            doc_list.append(tweet['text'])
        tw_list.append(list(itertools.chain.from_iterable(tp.text_process(doc_list))))

for i in range(len(tw_list)):
    plot_dep_cloud(tw_list[i], sort_tfidf, n, tw_files[i] + "_dic_tfidf")
    plot_dep_cloud(tw_list[i], sort_tf_log_idf, n, tw_files[i] + "dep_dic_tf_log_idf")
    plot_dep_cloud(tw_list[i], sort_tfidf_like, n, tw_files[i] + "dic_tfidf_like")

# generate the CSV with the rankings
save_ranking(dir_out, sort_tfidf, sort_tf_log_idf, sort_tfidf_like)
# generate the tfidf_like table and its parameters
save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_dep)
# Linux command to swap . for ,: tr '.' ',' < arquivo_in > arquivo_out
# generate the ranking of words for each member of parliament
dic_political = dict(dic_tfidf_like)
VALIDATION_FILE = args.validationfile
cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']

X, y_true = load_validation_file_csv(VALIDATION_FILE)
tp = TextProcessor()
pc = PoliticalClassification(H5_FILE, NPY_FILE, 25)

pol = ''
n_pol = ''
y_pred = list()
X = tp.text_process(X, text_only=True)
for tx in X:
    text = ' '.join(tx)
    if pc.is_political(text):
        pol += text + '\n'
        y_pred.append(1)
    else:
        n_pol += text + '\n'
        y_pred.append(0)

print(classification_report(y_true, y_pred))
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
ff1 = f1_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
dir_w2v = path['dir_w2v']

print('Loading word2vec model...')
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

texts, y_true = load_validation_file_csv(VALIDATION_FILE)
print('Loading ' + MODEL_FILE + ' file...')
model = joblib.load(MODEL_FILE)

pol = ''
n_pol = ''
y_pred = list()
tp = TextProcessor()
texts = tp.text_process(texts, text_only=True)
X = gen_data(texts)

mean_auc, std_auc = generate_roc_curve(
    model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

print('Predicting...')
y_pred = model.predict(X)
print('Classification Report')
print(classification_report(y_true, y_pred))
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

model_name = MODEL_FILE.replace(SKL_FOLDER, '')
model_name = model_name.replace('.politics_ben.skl', '')
    with open(dir_in + tw_file) as data_file:
        for line in data_file:
            tweet = json.loads(line)
            temp.append(tweet['text'])
        doc_list.append(temp)
    return doc_list, tw_files


if __name__ == "__main__":
    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_val']

    tp = TextProcessor()
    tweets = list()
    doc_list, tw_files = load_files(dir_in)
    for txt in doc_list:
        print(len(doc_list))
        tweets.append(tp.text_process(txt, text_only=True))

    for i, fl in enumerate(tw_files):
        f = open(dir_in + "%s.txt" % fl.split('.')[0], 'w')
        for tw in tweets[i]:
            f.write(" ".join(tw) + "\n")
        f.close()
with open(filedir) as data_file:
    doc_set = list()
    doc_tw = set()
    dc = set()
    weeks = list()
    dist = list()
    lamb = list()
    inicial = 1
    final = 603
    for line in data_file:
        tweet = json.loads(line)
        created = int(tweet['created_at'])
        if days2time(inicial) <= created < days2time(final):
            doc_tw.add(tweet['text'])
            doc_set.append(tweet)

texts = tp.text_process(doc_tw)
corpus, dic = tp.create_corpus(texts)
ldamodel = tp.generate_lda(corpus, dic, 5)
# ldamodel = tp.generate_hdp(corpus, dic)
print(tp.print_topics(ldamodel))

with open(lamb_dir) as l_file:
    for line in l_file:
        i = int(line.split('|')[2])
        w = int(line.split('|')[0])
        lamb.append(w)
        for s in range(i - 1):
            lamb.append(w)

for k in range(inicial, final, 7):
    doc = set()
N_ESTIMATORS = int(args.estimators)
LOSS_FUN = args.loss
KERNEL = args.kernel

print('Word2Vec embedding: %s' % (W2VEC_MODEL_FILE))
print('Embedding Dimension: %d' % (EMBEDDING_DIM))

cf = configparser.ConfigParser()
cf.read("../file_path.properties")
path = dict(cf.items("file_path"))
dir_w2v = path['dir_w2v']
dir_in = path['dir_in']

word2vec_model = gensim.models.Word2Vec.load(dir_w2v + W2VEC_MODEL_FILE)
tp = TextProcessor()
doc_list, tw_class = load_files(dir_in)
tweets = tp.text_process(doc_list, text_only=True)
tweets = select_tweets(tweets)

X, Y = gen_data(tweets, tw_class)
model = classification_model(X, Y, MODEL_TYPE)
joblib.dump(model, dir_in + MODEL_TYPE + '.skl')

# python BoWV.py --model logistic --seed 42 -f model_word2vec -d 100 --folds 10
# python BoWV.py --model gradient_boosting --seed 42 -f model_word2vec -d 100 --loss deviance --folds 10
# python BoWV.py --model random_forest --seed 42 -f model_word2vec -d 100 --estimators 20 --folds 10
# python BoWV.py --model svm_linear --seed 42 -f model_word2vec -d 100 --loss squared_hinge --folds 10
# python BoWV.py --model svm --seed 42 -f model_word2vec -d 100 --kernel rbf --folds 10
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list, text_only=True)

    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))

    alea_tw_processed = list()
    for l in list_aleatory:
        alea_tw_processed.append(tp.text_process(l, text_only=True))
    for i, l in enumerate(alea_tw_processed):
        alea_tw_processed[i] = [n for n in l if n]

    with open(dir_out + "bgr_tfidf_like.pck", 'rb') as handle:
        parl_bigrams = pickle.load(handle)