def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    save_to_file = open(processed_file_name, 'w')
    # Detect the input file's encoding from a sample of the raw bytes.
    import chardet
    with open(csv_file_name, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
    print(result)
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
def preprocess_csv(csv_file_name, processed_file_name, test_file):
    """
    Creates a dictionary with slangs and their equivalents and replaces them
    """
    with open('slang.txt') as file:
        slang_dict = dict(map(str.strip, line.partition('\t')[::2])
                          for line in file if line.strip())
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):-1]
            tweet = line
            # Preprocess once; the original called preprocess_tweet twice in a row.
            processed_tweet = preprocess_tweet(tweet, slang_dict)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
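# --- Illustrative sketch (assumption): the slang_dict comprehension above
# expects 'slang.txt' to hold one tab-separated "slang<TAB>expansion" pair per
# line; line.partition('\t')[::2] keeps the text on either side of the first
# tab. A self-contained demo with hypothetical sample data:
sample_lines = ['lol\tlaughing out loud\n', 'brb\tbe right back\n']
demo_dict = dict(map(str.strip, l.partition('\t')[::2])
                 for l in sample_lines if l.strip())
assert demo_dict == {'lol': 'laughing out loud', 'brb': 'be right back'}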
def process_tweets(csv_file, test_file=True):
    """
    Returns a list of tuples of type (tweet_id, feature_vector)
    or (tweet_id, sentiment, feature_vector)
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id = line[:line.find(',')]
                tweet = line[1 + line.find(','):]
            else:
                tweet_id = line[:line.find(',')]
                # Advance past the id before reading the sentiment field;
                # the original read the sentiment out of the id column.
                line = line[1 + line.find(','):]
                sentiment = int(line[:line.find(',')])
                tweet = line[1 + line.find(','):]
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
def process_tweets(csv_file, test_file=True):
    """
    Generates training X, y pairs.
    """
    tweets = []
    labels = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            # Both branches of the original if test_file/else were identical,
            # so the split and the appends are done unconditionally here.
            tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            tweets.append(feature_vector)
            labels.append(int(sentiment))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets, np.array(labels)
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    save_to_file = open(processed_file_name, 'w')
    pos = 0
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = line[:line.find(',')]
                # Map the raw label field onto a 0/1 flag.
                if '1' in positive:
                    pos = 1
                elif '0' in positive:
                    pos = 0
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (tweet_id, pos, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
def process_CONTENT(csv_file, test_file=True):
    """Returns a list of tuples of type (content_id, feature_vector)
    or (content_id, sentiment, feature_vector)

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    CONTENT = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append((CONTENT_id, feature_vector))
            else:
                CONTENT.append((CONTENT_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT
def preprocess_df(structured_file_name):
    overall_file_name = sys.argv[1][:-4] + '-overall.csv'
    room_file_name = sys.argv[1][:-4] + '-room.csv'
    cleanliness_file_name = sys.argv[1][:-4] + '-cleanliness.csv'
    service_file_name = sys.argv[1][:-4] + '-service-linear.csv'
    save_to_file = open(service_file_name, 'w')
    with open(structured_file_name, 'r', encoding='utf-8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if i == 0:  # skip the header row
                continue
            line = line.split(',')
            post_id, content, n_likes, sentiment_hand, relevance = \
                line[0], line[10], line[5], line[9], line[8]
            # Binarize the hand-labeled sentiment (scores above 3 count as
            # positive); note that the label actually written below is derived
            # from n_likes, not from this value.
            sentiment_hand = int(float(sentiment_hand))
            if sentiment_hand > 3:
                sentiment = 1
            else:
                sentiment = 0
            processed_content = preprocess_tweet(content)
            save_to_file.write('%s,%s,%s\n' %
                               (post_id, int(int(n_likes) > 0), processed_content))
            write_status(i + 1, total)
    save_to_file.close()
    return
def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
    or (tweet_id, sentiment, feature_vector)

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
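# --- Usage sketch (assumption): process_tweets reads the processed csv emitted
# by preprocess_csv, one record per line:
#   test file:  "<tweet_id>,<processed tweet text>"
#   train file: "<tweet_id>,<0 or 1>,<processed tweet text>"
# The file names below are hypothetical, not from the repo:
# test_records = process_tweets('test-processed.csv', test_file=True)
# train_records = process_tweets('train-processed.csv', test_file=False)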
def process_CONTENT(csv_file, test_file=True):
    """
    Generates training X, y pairs.
    """
    CONTENT = []
    labels = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append(feature_vector)
                # Test rows carry no label; a random placeholder keeps the
                # X and y arrays aligned.
                labels.append(np.random.randn(1))
            else:
                CONTENT.append(feature_vector)
                labels.append(int(sentiment))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT, np.array(labels)
def preprocess_csv(csv_file_name, processed_file_name, test_file=True):
    # Set test_file=False when training, True when testing.
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
def main(file_name: str):
    num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
    all_words: List[str] = []
    all_bigrams: List[Tuple[str, str]] = []
    with open(file_name) as csv:
        lines = csv.readlines()
        num_tweets = len(lines)
        for i, line in enumerate(lines):
            t_id, if_pos, tweet = line.strip().split(',')
            if_pos = int(if_pos)
            if if_pos:
                num_pos_tweets += 1
            else:
                num_neg_tweets += 1
            words, bigrams = analyze_tweet(tweet)
            all_words.extend(words)
            all_bigrams.extend(bigrams)
            write_status(i + 1, num_tweets)
    unique_words = list(set(all_words))
    unique_words_file_name = 'unique.txt'
    with open(os.path.join('dataset', unique_words_file_name), 'w') as uwf:
        uwf.write('\n'.join(unique_words))
    sys.stdout.write('\nCalculating frequency distribution')
    sys.stdout.flush()
    # Unigrams
    freq_dist = FreqDist(all_words)
    pkl_file_name = 'freqdist.pkl'
    with open(os.path.join('dataset', pkl_file_name), 'wb') as pkl_file:
        pickle.dump(freq_dist, pkl_file)
    # Leading '\n' added so consecutive status messages don't run together.
    sys.stdout.write(f'\nSaved uni-frequency distribution to {pkl_file_name}')
    sys.stdout.flush()
    # Bigrams
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
    bi_pkl_file_name = 'freqdist-bi.pkl'
    with open(os.path.join('dataset', bi_pkl_file_name), 'wb') as pkl_file:
        pickle.dump(bigram_freq_dist, pkl_file)
    sys.stdout.write(f'\nSaved bi-frequency distribution to {bi_pkl_file_name}')
    sys.stdout.write('\n[Analysis Statistics]')
    sys.stdout.write(f'\nTweets => Total: {num_tweets}, Positive: {num_pos_tweets}, Negative: {num_neg_tweets}\n')
    sys.stdout.flush()
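# --- Hypothetical sketch (assumption): analyze_tweet is referenced by main()
# above but not defined in this section; main() only needs it to return the
# tweet's words and its adjacent word pairs (bigrams). A minimal stand-in:
def analyze_tweet_sketch(tweet: str):
    words = tweet.split()
    bigrams = list(zip(words, words[1:]))  # adjacent word pairs
    return words, bigrams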
def process_tweets_test(csv_file):
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            # The test csv still carries a sentiment column; it is parsed
            # but not included in the output tuples.
            tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            tweets.append((tweet_id, feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
def get_glove_vectors(vocab):
    print('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    return glove_vectors
def get_glove_vectors(vocab):
    print('Looking for GLOVE vectors')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r', encoding="utf8") as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    print('Found %d words in GLOVE' % found)
    return glove_vectors
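# --- Illustrative sketch (assumption): a common next step is packing the
# returned {word: vector} dict into an embedding matrix. 'vocab' (word -> row
# index, indices starting at 1) and 'dim' are assumed names here, not part of
# the repo's API.
import numpy as np

def build_embedding_matrix(vocab, glove_vectors, dim=200):
    # Row 0 is reserved for padding; words missing from GloVe keep small
    # random vectors.
    matrix = np.random.uniform(-0.1, 0.1, (len(vocab) + 1, dim))
    for word, idx in vocab.items():
        if word in glove_vectors:
            matrix[idx] = glove_vectors[word]
    return matrix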
def get_glove_vectors(vocab):
    print('Looking for pre-trained vectors')
    pretrained_vectors = {}
    found = 0
    with open(WORD_VECTORS, 'r', encoding='utf-8') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                pretrained_vectors[word] = np.array(vector)
                found += 1
    print('\n')
    print('Found %d words on pre-trained word vectors' % found)
    return pretrained_vectors
def get_glove_vectors(vocab):
    """
    Extracts glove vectors from seed file only for words present in vocab.
    """
    print('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    return glove_vectors
def process_tweets(csv_file, test_file=True):
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
def process_CONTENT(csv_file, test_file=True):
    CONTENT = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                CONTENT_id, CONTENTS = line.split(',')
            else:
                CONTENT_id, sentiment, CONTENTS = line.split(',')
            feature_vector = get_feature_vector(CONTENTS)
            if test_file:
                CONTENT.append((CONTENT_id, feature_vector))
            else:
                CONTENT.append((CONTENT_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return CONTENT
def preprocess_csv(csv_file_name):
    train = "processed_train_data.csv"
    test = "processed_test_data.csv"
    rows = []
    with open(csv_file_name, 'r', encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            rows.append(row)
    # Keep only the positive and negative tweets (drop neutral ones).
    tweets = [row for row in rows if "neutral" not in row[7]]
    # ~20000 tweets for training, the remaining ~10000 for testing.
    train_tweets = tweets[:20000]
    test_tweets = tweets[20000:]
    total = len(tweets)
    success = 0
    save_to_file = open(train, 'w', encoding="utf8")
    for i, tweet in enumerate(train_tweets):
        tweet_content = tweet[1]
        positive = 0 if "positive" not in tweet[7] else 1
        tweet_id = uuid.uuid4()
        processed_tweet = preprocess_tweet(tweet_content)
        save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
        success += 1
        write_status(success, total)
    save_to_file.close()
    save_to_file = open(test, 'w', encoding="utf8")
    for i, tweet in enumerate(test_tweets):
        tweet_content = tweet[1]
        positive = 0 if "positive" not in tweet[7] else 1
        tweet_id = uuid.uuid4()
        processed_tweet = preprocess_tweet(tweet_content)
        save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
        success += 1
        write_status(success, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s\n%s' % (test, train))
def preprocess_csv(csv_file_name, processed_file_name, test_file):
    save_to_file = open(processed_file_name, 'w')
    df = pd.read_csv(csv_file_name, sep="\t")
    total = len(df.index)
    for i, line in df.iterrows():
        if not test_file:
            tweet = line[1]
            sentiment = int(line[0])
        else:
            tweet = line[0]
        processed_tweet = preprocess_tweet(tweet)
        if not test_file:
            save_to_file.write(str(sentiment) + ',' + processed_tweet + '\n')
        else:
            save_to_file.write('%s\n' % (processed_tweet))
        write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
def process_reviews(csv_file, test_file):
    """Returns a list of tuples of type (review_id, feature_vector)
    or (review_id, sentiment, feature_vector)"""
    reviews = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                review_id, review = line.split(',')
            else:
                review_id, sentiment, review = line.split(',')
            feature_vector = get_feature_vector(review)
            if test_file:
                reviews.append((review_id, feature_vector))
            else:
                reviews.append((review_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return reviews
def preprocess_csv(csv_file_name, processed_file_name, test_file):
    """
    Creates a dictionary with slangs and their equivalents and replaces them
    """
    with open('slang.txt') as file:
        slang_dict = dict(map(str.strip, line.partition('\t')[::2])
                          for line in file if line.strip())
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):-1]
            tweet = line
            # Preprocess once; the original ran preprocess_tweet twice and
            # carried leftover hard-coded debug tweets above the loop.
            processed_tweet = preprocess_tweet(tweet, slang_dict)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
            else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
def preprocess_csv(csv_name, processed_name, test_file):
    save_to_file = open(processed_name, 'w')
    with open(csv_name, 'r', encoding="ISO-8859-1") as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            tweet_id = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                label = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            tweet = line
            processed_tweet = preprocess_tweet(tweet)
            if not test_file:
                save_to_file.write('%s, %d, %s\n' % (tweet_id, label, processed_tweet))
            else:
                save_to_file.write('%s, %s\n' % (tweet_id, processed_tweet))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_name)
    return processed_name
def preprocess_csv(csv_file_name, processed_file_name, test_file=True):
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            COMMENT_ID = line[:line.find(',')]
            if not test_file:
                line = line[1 + line.find(','):]
                positive = int(line[:line.find(',')])
            line = line[1 + line.find(','):]
            CONTENT = line
            processed_CONTENT = preprocess_CONTENT(CONTENT)
            if not test_file:
                save_to_file.write('%s,%d,%s\n' % (COMMENT_ID, positive, processed_CONTENT))
            else:
                save_to_file.write('%s,%s\n' % (COMMENT_ID, processed_CONTENT))
            write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed comments to: %s' % processed_file_name)
    return processed_file_name
def preprocess_df(structured_file_name):
    overall_file_name = sys.argv[1][:-4] + '-overall.csv'
    room_file_name = sys.argv[1][:-4] + '-room.csv'
    cleanliness_file_name = sys.argv[1][:-4] + '-cleanliness.csv'
    service_file_name = sys.argv[1][:-4] + '-service-linear.csv'
    save_to_file = open(service_file_name, 'w')
    with open(structured_file_name, 'r', encoding='utf-8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if i == 0:
                continue
            line = line.split(',')
            post_id, content, n_posts = line[0], line[10], line[14]
            processed_content = preprocess_tweet(content)
            save_to_file.write('%s,%s,%s\n' %
                               (post_id, n_posts.replace('\n', ''), processed_content))
            write_status(i + 1, total)
    save_to_file.close()
    return
def preprocess_csv(csv_file_name, processed_file_name, test_file=True):
    save_to_file = open(processed_file_name, 'w')
    with open(csv_file_name, 'r', errors='ignore', encoding="utf8") as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if i > 1:  # skip the first two (header) rows
                tweet_id = line[:line.find(',')]
                # The training-label branch is disabled; every input is
                # treated as a test file here:
                # if not test_file:
                #     line = line[1 + line.find(','):]
                #     positive = int(line[:line.find(',')])
                line = line[1 + line.find(','):]
                tweet = line
                processed_tweet = preprocess_tweet(tweet)
                # if not test_file:
                #     save_to_file.write('%s,%d,%s\n' % (tweet_id, positive, processed_tweet))
                # else:
                save_to_file.write('%s,%s\n' % (tweet_id, processed_tweet))
                write_status(i + 1, total)
    save_to_file.close()
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
            o = model.train_on_batch(training_set_X, training_set_y)
            sys.stdout.write('\rIteration %d/%d, loss:%.4f, acc:%.4f' %
                             (i, n_train_batches, o[0], o[1]))
            sys.stdout.flush()
            i += 1
        val_acc = evaluate_model(model, val_tweets)
        print('\nEpoch: %d, val_acc:%.4f' % (j + 1, val_acc))
        random.shuffle(train_tweets)
        if val_acc > best_val_acc:
            print('Accuracy improved from %.4f to %.4f, saving model' %
                  (best_val_acc, val_acc))
            best_val_acc = val_acc
            model.save('best_model.h5')
    print('Testing')
    del train_tweets
    del model
    model = load_model('best_model.h5')
    test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
    n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
    predictions = np.array([])
    print('Predicting batches')
    i = 1
    for test_set_X, _ in extract_features(test_tweets, feat_type=FEAT_TYPE,
                                          batch_size=batch_size, test_file=True):
        prediction = np.round(model.predict_on_batch(test_set_X).flatten())
        predictions = np.concatenate((predictions, prediction))
        utils.write_status(i, n_test_batches)
        i += 1
    predictions = [(str(j), int(predictions[j])) for j in range(len(test_tweets))]
    utils.save_results_to_csv(predictions, 'logistic.csv')
    print('\nSaved to logistic.csv')
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(
            train_tweets, test_file=False, feat_type=FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets, test_file=False,
                                                     feat_type=FEAT_TYPE,
        num_mentions += result['MENTIONS']
        # TODO: revisit whether tracking the per-tweet maximum is actually needed.
        max_mentions = max(max_mentions, result['MENTIONS'])
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])
        num_words += result['WORDS']
        min_words = min(min_words, result['WORDS'])
        max_words = max(max_words, result['WORDS'])
        all_words.extend(words)
        num_bigrams += result['BIGRAMS']
        all_bigrams.extend(bigrams)
        write_status(i + 1, num_tweets)
    num_emojis = num_pos_emojis + num_neg_emojis
    unique_words = list(set(all_words))
    with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
        uwf.write('\n'.join(unique_words))
    num_unique_words = len(unique_words)
    num_unique_bigrams = len(set(all_bigrams))
    print('\nCalculating frequency distribution')
    # Unigrams
    freq_dist = FreqDist(all_words)
    pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
    with open(pkl_file_name, 'wb') as pf:
        pickle.dump(freq_dist, pf)
    print('Saved uni-frequency distribution to %s' % pkl_file_name)
    # Bigrams
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
        result, words, bigrams = analyze_review(review)
        num_mentions += result['MENTIONS']
        max_mentions = max(max_mentions, result['MENTIONS'])
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])
        num_words += result['WORDS']
        min_words = min(min_words, result['WORDS'])
        max_words = max(max_words, result['WORDS'])
        all_words.extend(words)
        num_bigrams += result['BIGRAMS']
        all_bigrams.extend(bigrams)
        write_status(i + 1, num_reviews)
    num_emojis = num_pos_emojis + num_neg_emojis
    unique_words = list(set(all_words))
    with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
        uwf.write('\n'.join(unique_words))
    num_unique_words = len(unique_words)
    num_unique_bigrams = len(set(all_bigrams))
    print('\nCalculating frequency distribution')
    # Unigrams
    freq_dist = FreqDist(all_words)
    pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
    with open(pkl_file_name, 'wb') as pkl_file:
        pickle.dump(freq_dist, pkl_file)
    print('Saved uni-frequency distribution to %s' % pkl_file_name)
    # Bigrams
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
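# --- Hypothetical sketch (assumption): get_bigram_freqdist is called above but
# not defined in this section. Since all_bigrams is a list of (word, word)
# tuples, a minimal counting version could be:
from collections import Counter

def get_bigram_freqdist_sketch(bigrams):
    # Map each (first_word, second_word) pair to its occurrence count.
    return Counter(bigrams)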