def clean_tokens(self, tweet):
    tweet_cleaner = TweetCleaner()
    tweet_punct_removed = tweet_cleaner.remove_punct(tweet)
    tweet_tokenized = tweet_cleaner.tokenize(tweet_punct_removed)
    tweet_stopwords_removed = tweet_cleaner.remove_stopwords(tweet_tokenized)
    return tweet_stopwords_removed
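# A minimal usage sketch (not from the original source) driving the same cleaning
# pipeline directly through TweetCleaner; the sample tweet and the printed output
# are illustrative only and depend on TweetCleaner's punctuation and stopword rules.
cleaner = TweetCleaner()
no_punct = cleaner.remove_punct("Just landed in NYC, what a view!")
tokens = cleaner.remove_stopwords(cleaner.tokenize(no_punct))
print(tokens)  # e.g. ['landed', 'nyc', 'view'], depending on the stopword list used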
class CategoryAnalyser:
    def __init__(self, cleaned_file="./data/cleaned_train_category.csv", cleaner=False):
        self.tweets = []
        self.filelist = [
            './data/tech_raw.txt', './data/sports_raw.txt', './data/fnl_raw.txt',
            './data/business_raw.txt', './data/politics_raw.txt', './data/ent_raw.txt'
        ]
        self.labels = [
            'Technology', 'Sports', 'Finance', 'Business', 'Politics', 'Entertainment'
        ]
        self.tcl = TweetCleaner()
        if cleaner:
            self.cleaned_trainfile_creator()
        self.df = pd.read_csv(cleaned_file, encoding='latin-1', header=None)
        self.feature_names = None

    def dict_creator(self):
        for filename, label in zip(self.filelist, self.labels):
            with open(filename, 'r') as f:
                self.my_list = f.read().split('\n')
            for line in self.my_list:
                my_dict = {
                    "category": label,
                    "text": self.tcl.clean_single_tweet(line, True)
                }
                self.tweets.append(my_dict)
        return pd.DataFrame(self.tweets)

    def test_feature_count(self, test_feature):
        count_word = CountVectorizer(vocabulary=self.feature_names)
        return count_word.fit_transform(test_feature)

    def counter_value(self, pred):
        return Counter(pred).most_common()

    def cleaned_trainfile_creator(self):
        df = self.dict_creator()
        self.tcl.cleaned_file_creator("./data/cleaned_train_category.csv",
                                      df.category, df.text)

    def modelling(self):
        count_word = CountVectorizer()
        train_features = count_word.fit_transform(self.df[1])
        train_x, test_x, train_y, test_y = train_test_split(train_features,
                                                            self.df[0],
                                                            test_size=0.2)
        model = MultinomialNB().fit(train_x, train_y)
        self.feature_names = count_word.get_feature_names()
        y_pred = model.predict(test_x)
        print(accuracy_score(y_pred, test_y))
        return model
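# A hedged usage sketch for CategoryAnalyser (assumptions: the cleaned training CSV and
# raw .txt files already exist; the sample tweet and printed label are illustrative).
# modelling() must run first so that feature_names is populated for test_feature_count().
analyser = CategoryAnalyser()           # reads ./data/cleaned_train_category.csv
model = analyser.modelling()            # trains MultinomialNB and prints test accuracy
sample = [analyser.tcl.clean_single_tweet("Stocks rallied after the earnings report", True)]
features = analyser.test_feature_count(sample)
print(analyser.counter_value(model.predict(features)))  # e.g. [('Finance', 1)]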
def trending_tweets_file(self):
    for topic, file in zip(self.trending_topics(), self.files_csv):
        tcl = TweetCleaner()
        with open(file, 'w+') as csvFile:
            csvWriter = csv.writer(csvFile)
            for tweet in Cursor(self.api.search, q=topic, lang="en").items(200):
                cleaned_tweet = tcl.clean_single_tweet(tweet.text)
                csvWriter.writerow([cleaned_tweet])
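# A minimal sketch of the context trending_tweets_file relies on. Only self.api,
# self.files_csv and trending_topics() are implied by the method above; the class name,
# the constructor arguments and the worldwide WOEID (1) are assumptions.
class TrendingTweets:
    def __init__(self, api, files_csv):
        self.api = api              # an authenticated tweepy.API instance (assumption)
        self.files_csv = files_csv  # one output CSV path per trending topic

    def trending_topics(self):
        # First N trend names for a place, using the older tweepy trends_place endpoint
        trends = self.api.trends_place(1)[0]['trends']
        return [t['name'] for t in trends[:len(self.files_csv)]]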
def test_lowercases(self):
    input_example = "yo le voy a dedicar la de amor brutal de Traileros Del Norte"
    self.assertEqual(
        "yo le voy a dedicar la de amor brutal de traileros del norte",
        TweetCleaner().clean_tweet(input_example)
    )

def test_preserves_double_quotes(self):
    input_example = ', " SI NO ME QUIERES TE MATO !!! " Jajaja'
    self.assertEqual(
        ', " si no me quieres te mato !!! " jajaja',
        TweetCleaner().clean_tweet(input_example)
    )

def test_preserves_digits(self):
    input_example = 'Concidering the fact that I have too be awake in 30 mins , this sucks'
    self.assertEqual(
        'concidering the fact that i have too be awake in 30 mins , this sucks',
        TweetCleaner().clean_tweet(input_example)
    )

def test_removes_emojis(self):
    input_example = "muajajajaja 😈😈 ya te descubri ante todo twitter lmfao"
    self.assertEqual(
        "muajajajaja ya te descubri ante todo twitter lmfao",
        TweetCleaner().clean_tweet(input_example),
    )

def test_removes_mentions(self):
    input_example = "@Lyanne_DLC muajajajaja"
    self.assertEqual(
        "muajajajaja",
        TweetCleaner().clean_tweet(input_example),
    )
def main():
    print('tf:', tf.__version__)
    print('TRAINING_MODEL_FILENAME =', TRAINING_MODEL_FILENAME)

    nn_input_preparer = NNInputPreparer()
    model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
    loaded_training_model = load_model(TRAINING_MODEL_FILENAME)
    encoder_model, decoder_model = model_creator.derive_inference_models(loaded_training_model)
    inference_runner = InferenceRunner(encoder_model=encoder_model,
                                       decoder_model=decoder_model)

    cleaner = TweetCleaner()
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH, max_length=MAX_TWEET_LENGTH)
    noiser = DisjointNoiser()

    for input_filename in [TRAINING_INPUT_FILENAME, DEV_INPUT_FILENAME]:
        k = 10
        print(f'processing the first {k} selected tweets from {input_filename}')
        raw_tweets = DataLoader(input_filename).load()
        clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
        clean_tweets_as_lists = [list(t) for t in clean_tweets]
        selected_tweets_as_lists = [t for t in clean_tweets_as_lists if selector.select(t)]

        gb_inference = nn_input_preparer.get_batches(selected_tweets_as_lists,
                                                     noiser, batch_size=1)
        for i in range(k):
            noised_batch, originals_batch, original_delayed_batch = next(gb_inference)
            print('[noised    ]', nn_input_preparer.decode_tweet(noised_batch[0]))
            print('[original  ]', nn_input_preparer.decode_tweet(originals_batch[0]))
            print('[original 2]', ''.join(selected_tweets_as_lists[i]))
            print('[or-delayed]', nn_input_preparer.decode_tweet(original_delayed_batch[0]))
            decoded_tweet = inference_runner.decode_sequence(noised_batch)
            print('[decoded   ]', decoded_tweet)
            print()
def main(args):
    host = args.host
    port = args.port
    database = args.database
    collection = args.collection

    client = MongoClient(host=host, port=port)
    db = client[database]
    coll = db[collection]
    cursor = coll.find()

    tc = TweetCleaner()
    counter = 0
    for doc in cursor:
        if 'http' in doc['text']:
            cleaned = tc.clean(doc['text'])
            print(doc['text'])
            print(cleaned)
            print('\n')
            counter += 1
        if counter > 1000:
            break
def main(args):
    host = args.host
    port = args.port
    database = args.database
    source_collection = args.src_collection
    dest_collection = args.dest_collection

    client = MongoClient(host=host, port=port)
    db = client[database]
    src_coll = db[source_collection]
    dest_coll = db[dest_collection]
    src_cursor = src_coll.find()

    tc = TweetCleaner()
    print("Copy started")
    counter = 0
    for doc in src_cursor:
        cleaned_tweet = tc.clean(doc['text'])
        dest_coll.insert_one({'text': cleaned_tweet})
        counter += 1
        if counter % 1000 == 0:
            print("{0} tweets copied".format(counter))
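# A hedged sketch of the argument parsing the Mongo copy script above expects; the flag
# names mirror the attributes read from args (host, port, database, src_collection,
# dest_collection) and the defaults are assumptions. The preceding single-collection
# script would take a --collection flag instead of the source/destination pair.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Clean tweets stored in MongoDB')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=27017)
    parser.add_argument('--database', required=True)
    parser.add_argument('--src-collection', dest='src_collection', required=True)
    parser.add_argument('--dest-collection', dest='dest_collection', required=True)
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())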
def main(self, data_type):
    df_final = pd.DataFrame()
    if data_type == 'tweet':
        # Get cleaner; second argument is the column name
        cleaner = TweetCleaner('', 'tweet')
        # Predict tweets
        TWEEETs = fsp.fetchMysql('tweet')
        input_df = pd.DataFrame(TWEEETs, columns=['id', 'tweet'])
        df_tweet = cleaner.cleaning_table(input_df)
        df_tweet = df_tweet[pd.notnull(df_tweet['tweet'])]
        df_final = df_tweet.drop_duplicates(subset=['tweet'])
    elif data_type == 'comment':
        # Get cleaner; second argument is the column name
        cleaner = TweetCleaner('', 'tweet')
        # Predict comments
        COMMENTs = fsp.fetchMysql('comment')
        input_df = pd.DataFrame(COMMENTs, columns=['id', 'tweet'])
        df_comment = cleaner.cleaning_table(input_df)
        df_comment = df_comment[pd.notnull(df_comment['tweet'])]
        df_final = df_comment.drop_duplicates(subset=['tweet'])
    else:
        '''
        To make predictions from files.
        Change the data kind in insertMysql before running.
        '''
        # f = open("Results/DaggettBeaver.txt", "r+")
        # fl = f.readlines()
        # data = list()
        # for x in fl:
        #     data.append(x.split())
        # df = pd.DataFrame(data, columns=['id', 'predicted_hyperpartisan'])
        # df = df[pd.notnull(df['predicted_hyperpartisan'])]
        df_k = pd.read_csv("TheMEEye.csv", encoding='unicode_escape')
        # df_final = df_k[['text_tweet', 'id_tweet', 'predicted_bias']]
        df_final = df_k[pd.notnull(df_k['text'])]

    print(df_final.head())

    # Get sequences
    X_test = self._convert_texts_to_sequences(df_final)
    print("This is what X_test looks like:", X_test[0:10])

    # Load model
    model = load_model(os.path.join(self.sem_eval_path, 'models', self.model_file_name))

    # Do the prediction
    y_pred = self._predict(model, X_test)

    # Create output dataframe to write to disk
    y_pred_df = self._create_output_dataframe(df_final, y_pred)
    print("This is what y_pred_df looks like:", y_pred_df.head())
    y_pred_df.to_csv(runOutputFileName, index=False)
    fsp.insertMysql(y_pred_df, 'tweet')
    print("DONE!")
def main():
    print('tf:', tf.__version__)
    random.seed(42)

    raw_tweets = DataLoader(TRAINING_INPUT_FILENAME).load()
    cleaner = TweetCleaner()
    clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
    clean_tweets_as_lists = [list(t) for t in clean_tweets]
    print('number of clean_tweets_as_lists:', len(clean_tweets_as_lists))

    selector = TweetSelector(min_length=MIN_TWEET_LENGTH, max_length=MAX_TWEET_LENGTH)
    selected_tweets_as_lists = [t for t in clean_tweets_as_lists if selector.select(t)]
    print('number of selected_tweets_as_lists:', len(selected_tweets_as_lists))

    if CONTINUE_TRAINING:
        training_model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
    else:
        model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
        training_model = model_creator.create_training_model()

    nn_input_preparer = NNInputPreparer()
    num_generations_in_run = 0
    print(time.ctime())
    noiser = DisjointNoiser()

    for de_facto_epoch in range(INITIALLY_COMPLETED_DFEPOCH + 1, NUM_DE_FACTO_EPOCHS):
        gb_training = nn_input_preparer.get_batches(selected_tweets_as_lists, noiser,
                                                    GENERATOR_BATCH_SIZE)

        cp_filepath = BASE_DIR + f'dfepoch_{de_facto_epoch}_' + "{val_accuracy:.5f}.h5"
        checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1,
                                     save_best_only=True, mode='max')

        while True:
            try:
                noised_batch, originals_batch, originals_delayed_batch = next(gb_training)
                assert len(noised_batch) == GENERATOR_BATCH_SIZE
                print(noised_batch.shape, originals_batch.shape,
                      originals_delayed_batch.shape)
                validation_split = 0.125
                fit_batch_size = 32
                # We take care here so as not to trigger the "Your input ran out of data" warning.
                validation_steps = int(GENERATOR_BATCH_SIZE * validation_split) // fit_batch_size
                training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps
                training_model.fit([noised_batch, originals_delayed_batch],
                                   originals_batch,
                                   batch_size=fit_batch_size,
                                   steps_per_epoch=training_steps,
                                   epochs=1,
                                   validation_split=validation_split,
                                   validation_steps=validation_steps,
                                   callbacks=[checkpoint])
                # https://keras.io/api/models/model_training_apis/ says:
                # "The validation data is selected from the last samples in the ... data provided"
                # This means the model is never validated on tweets that we train it on.
            except StopIteration:
                break

            num_generations_in_run += 1
            print(f'num_generations: {num_generations_in_run}')
            print(time.ctime())

        print(f'End of de facto epoch {de_facto_epoch} - saving model')
        training_model.save(BASE_DIR + f'dfepoch_{de_facto_epoch}_end.h5')

    training_model.save(FINAL_TRAINED_MODEL_FILENAME)
class SentimentAnalysis:
    def __init__(self, file_name="./data/cleaned_train_sentiment.csv", clean_required=False):
        self.tcl = TweetCleaner()
        if clean_required:
            self.cleaned_train_file("./data/train.csv")
        self.cleaned_train_tweets = pd.read_csv(file_name, encoding='latin-1', header=None)
        self.max_features = 2000
        self.feature_names = None
        self.Classifiers = [
            LogisticRegression(C=1e-9, solver='liblinear', max_iter=200),
            DecisionTreeClassifier(),
            RandomForestClassifier(n_estimators=30),
            AdaBoostClassifier(),
            MultinomialNB()
        ]

    def set_max_features(self, value):
        self.max_features = value

    def analyser(self):
        count_word = TfidfVectorizer(max_features=self.max_features, analyzer="word")
        train_features = count_word.fit_transform(
            self.cleaned_train_tweets[1].values.astype('U').tolist())
        self.feature_names = count_word.get_feature_names()
        return self.modelling(train_features)

    def count_test_feature(self, testfeature):
        count_word2 = TfidfVectorizer(vocabulary=self.feature_names, analyzer="word")
        return count_word2.fit_transform(testfeature)

    def best_model(self, model_list):
        # Pick the model with the highest accuracy; sorting the raw tuples ascending
        # would return the lowest-scoring model and can fail on accuracy ties.
        return max(model_list, key=lambda entry: entry[0])[1]

    def modelling(self, train_f, run_all_classifiers=False):
        model_list = []
        train_x, test_x, train_y, test_y = train_test_split(
            train_f, self.cleaned_train_tweets[0], test_size=0.2)
        if not run_all_classifiers:
            self.Classifiers = [MultinomialNB()]
        for classifier in self.Classifiers:
            model = classifier.fit(train_x, train_y)
            pred = model.predict(test_x)
            # Evaluation
            accur = accuracy_score(pred, test_y)
            print('Accuracy of ' + classifier.__class__.__name__ + ' is ' + str(accur))
            model_list.append((accur, model))
        return self.best_model(model_list)

    def cleaned_train_file(self, file_name):
        train_tweets = pd.read_csv(file_name, encoding='latin-1')
        # Clean the tweets from the entire training file
        train_tweet_list = self.tcl.clean_tweets(train_tweets.SentimentText, False)
        self.tcl.cleaned_file_creator("./data/cleaned_train_sentiment.csv",
                                      train_tweets.Sentiment, train_tweet_list)
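# A hedged usage sketch for SentimentAnalysis (assumptions: the cleaned training CSV
# already exists; the sample tweet and printed label are illustrative). analyser() must
# run first so that feature_names is populated for count_test_feature().
sa = SentimentAnalysis()                      # reads ./data/cleaned_train_sentiment.csv
best = sa.analyser()                          # fits MultinomialNB by default, prints accuracy
test_vec = sa.count_test_feature(["what a great day"])
print(best.predict(test_vec))                 # e.g. [1] for a positive tweet (assumption)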