Example #1
 def clean_tokens(self, tweet):
     tweet_cleaner = TweetCleaner()
     tweet_punct_removed = tweet_cleaner.remove_punct(tweet)
     tweet_tokenized = tweet_cleaner.tokenize(tweet_punct_removed)
     tweet_stopwords_removed = tweet_cleaner.remove_stopwords(
         tweet_tokenized)
     return tweet_stopwords_removed
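A possible call site for this helper; the owning class and the sample tweet are hypothetical, while remove_punct, tokenize and remove_stopwords are the TweetCleaner methods used above:

# Hypothetical usage sketch - assumes clean_tokens is a method of some preprocessing class.
preprocessor = SomePreprocessor()  # hypothetical class exposing clean_tokens
tokens = preprocessor.clean_tokens("Just landed in NYC, so excited!")
print(tokens)  # e.g. ['landed', 'nyc', 'excited'], depending on TweetCleaner's stopword list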
Example #2
class CategoryAnalyser:
    def __init__(self,
                 cleaned_file="./data/cleaned_train_category.csv",
                 cleaner=False):
        self.tweets = []
        self.filelist = [
            './data/tech_raw.txt', './data/sports_raw.txt',
            './data/fnl_raw.txt', './data/business_raw.txt',
            './data/politics_raw.txt', './data/ent_raw.txt'
        ]
        self.labels = [
            'Technology', 'Sports', 'Finance', 'Business', 'Politics',
            'Entertainment'
        ]
        self.tcl = TweetCleaner()
        if (cleaner):
            self.cleaned_trainfile_creator()
        self.df = pd.read_csv(cleaned_file, encoding='latin-1', header=None)
        self.feature_names = None

    def dict_creator(self):
        for k in range(0, len(self.filelist)):
            my_dict = {}
            with open(self.filelist[k], 'r') as raw_file:
                self.my_list = raw_file.read().split('\n')
            for i in range(0, len(self.my_list)):
                my_dict["category"] = self.labels[k]
                my_dict["text"] = self.tcl.clean_single_tweet(
                    self.my_list[i], True)
                self.tweets.append(my_dict.copy())
        df = pd.DataFrame(self.tweets)
        return df

    def test_feature_count(self, test_feature):
        count_word = CountVectorizer(vocabulary=self.feature_names)
        return count_word.fit_transform(test_feature)

    def counter_value(self, pred):
        return Counter(pred).most_common()

    def cleaned_trainfile_creator(self):
        df = self.dict_creator()
        self.tcl.cleaned_file_creator("./data/cleaned_train_category.csv",
                                      df.category, df.text)

    def modelling(self):
        count_word = CountVectorizer()
        train_features = count_word.fit_transform(self.df[1])
        train_x, test_x, train_y, test_y = train_test_split(train_features,
                                                            self.df[0],
                                                            test_size=0.2)
        model = MultinomialNB().fit(train_x, train_y)
        self.feature_names = count_word.get_feature_names()
        y_pred = model.predict(test_x)
        print(accuracy_score(y_pred, test_y))
        return model
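A usage sketch for CategoryAnalyser; the tweet text is made up, and it assumes the cleaned CSV and raw .txt files referenced in __init__ exist on disk:

analyser = CategoryAnalyser()
model = analyser.modelling()  # trains MultinomialNB and prints the hold-out accuracy
test_features = analyser.test_feature_count(["apple unveils a new iphone at its annual keynote"])
prediction = model.predict(test_features)
print(analyser.counter_value(prediction))  # e.g. [('Technology', 1)]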
Example #3
 def trending_tweets_file(self):
     for topic, file in zip(self.trending_topics(), self.files_csv):
         tcl = TweetCleaner()
         csvFile = open(file, 'w+')
         csvWriter = csv.writer(csvFile)
         for tweet in Cursor(self.api.search, q=topic,
                             lang="en").items(200):
             cleaned_tweet = tcl.clean_single_tweet(tweet.text)
             csvWriter.writerow([cleaned_tweet])
         csvFile.close()
Example #4
    def test_lowercases(self):
        input_example = "yo le voy a dedicar la de amor brutal de Traileros Del Norte"

        self.assertEqual(
            "yo le voy a dedicar la de amor brutal de traileros del norte",
            TweetCleaner().clean_tweet(input_example)
        )
Example #5
    def test_preserves_double_quotes(self):
        input_example = ', " SI NO ME QUIERES TE MATO !!! " Jajaja'

        self.assertEqual(
            ', " si no me quieres te mato !!! " jajaja',
            TweetCleaner().clean_tweet(input_example)
        )
Example #6
    def test_preserves_digits(self):
        input_example = 'Concidering the fact that I have too be awake in 30 mins , this sucks'

        self.assertEqual(
            'concidering the fact that i have too be awake in 30 mins , this sucks',
            TweetCleaner().clean_tweet(input_example)
        )
Example #7
    def test_removes_emojis(self):
        input_example = "muajajajaja 😈😈 ya te descubri ante todo twitter lmfao"

        self.assertEqual(
            "muajajajaja ya te descubri ante todo twitter lmfao",
            TweetCleaner().clean_tweet(input_example),
        )
Example #8
    def test_removes_mentions(self):
        input_example = "@Lyanne_DLC muajajajaja"

        self.assertEqual(
            "muajajajaja",
            TweetCleaner().clean_tweet(input_example),
        )
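Examples #4 to #8 pin down the expected behaviour of clean_tweet: lowercase the text, strip @-mentions and emoji, and keep digits, quotes and other punctuation. Below is a minimal, self-contained sketch that satisfies exactly these five tests; the real TweetCleaner in these projects is likely more elaborate, and the non-ASCII filter is only a crude stand-in for proper emoji handling (it would also strip accented characters):

import re

class TweetCleaner:
    def clean_tweet(self, tweet):
        text = tweet.lower()                      # Example #4: lowercasing
        text = re.sub(r'@\w+', '', text)          # Example #8: drop @mentions
        text = re.sub(r'[^\x00-\x7f]', '', text)  # Example #7: crude emoji removal
        return ' '.join(text.split())             # collapse whitespace; keeps digits and quotes (Examples #5, #6)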
Example #9
def main():
    print('tf:', tf.__version__)
    print('TRAINING_MODEL_FILENAME =', TRAINING_MODEL_FILENAME)

    nn_input_preparer = NNInputPreparer()

    model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
    loaded_training_model = load_model(TRAINING_MODEL_FILENAME)

    encoder_model, decoder_model = model_creator.derive_inference_models(
        loaded_training_model)
    inference_runner = InferenceRunner(encoder_model=encoder_model,
                                       decoder_model=decoder_model)

    cleaner = TweetCleaner()
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH,
                             max_length=MAX_TWEET_LENGTH)
    noiser = DisjointNoiser()

    for input_filename in [TRAINING_INPUT_FILENAME, DEV_INPUT_FILENAME]:
        k = 10
        print(
            f'processing the first {k} selected tweets from {input_filename}')
        raw_tweets = DataLoader(input_filename).load()
        clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
        clean_tweets_as_lists = [list(t) for t in clean_tweets]
        selected_tweets_as_lists = [
            t for t in clean_tweets_as_lists if selector.select(t)
        ]
        gb_inference = nn_input_preparer.get_batches(selected_tweets_as_lists,
                                                     noiser,
                                                     batch_size=1)
        for i in range(k):
            noised_batch, originals_batch, original_delayed_batch = next(
                gb_inference)
            print('[noised    ]',
                  nn_input_preparer.decode_tweet(noised_batch[0]))
            print('[original  ]',
                  nn_input_preparer.decode_tweet(originals_batch[0]))
            print('[original 2]', ''.join(selected_tweets_as_lists[i]))
            print('[or-delayed]',
                  nn_input_preparer.decode_tweet(original_delayed_batch[0]))
            decoded_tweet = inference_runner.decode_sequence(noised_batch)
            print('[decoded   ]', decoded_tweet)
            print()
Example #10
 def __init__(self,
              cleaned_file="./data/cleaned_train_category.csv",
              cleaner=False):
     self.tweets = []
     self.filelist = [
         './data/tech_raw.txt', './data/sports_raw.txt',
         './data/fnl_raw.txt', './data/business_raw.txt',
         './data/politics_raw.txt', './data/ent_raw.txt'
     ]
     self.labels = [
         'Technology', 'Sports', 'Finance', 'Business', 'Politics',
         'Entertainment'
     ]
     self.tcl = TweetCleaner()
     if (cleaner):
         self.cleaned_trainfile_creator()
     self.df = pd.read_csv(cleaned_file, encoding='latin-1', header=None)
     self.feature_names = None
Example #11
 def __init__(self,
              file_name="./data/cleaned_train_sentiment.csv",
              clean_required=False):
     self.tcl = TweetCleaner()
     if (clean_required):
         self.cleaned_train_file("./data/train.csv")
     self.cleaned_train_tweets = pd.read_csv(file_name,
                                             encoding='latin-1',
                                             header=None)
     self.max_features = 2000
     self.feature_names = None
     self.Classifiers = [
         LogisticRegression(C=0.000000001, solver='liblinear',
                            max_iter=200),
         DecisionTreeClassifier(),
         RandomForestClassifier(n_estimators=30),
         AdaBoostClassifier(),
         MultinomialNB()
     ]
Example #12
def main(args):
    host = args.host
    port = args.port
    database = args.database
    collection = args.collection

    client = MongoClient(host=host, port=port)
    db = client[database]
    coll = db[collection]

    cursor = coll.find()

    tc = TweetCleaner()
    counter = 0
    for doc in cursor:
        if 'http' in doc['text']:
            cleaned = tc.clean(doc['text'])
            print(doc['text'])
            print(cleaned)
            print('\n')
            counter += 1
        if counter > 1000:
            break
Example #14
def main(args):
    host = args.host
    port = args.port
    database = args.database
    source_collection = args.src_collection
    dest_collection = args.dest_collection

    client = MongoClient(host=host, port=port)
    db = client[database]
    src_coll = db[source_collection]
    dest_coll = db[dest_collection]

    src_cursor = src_coll.find()

    tc = TweetCleaner()

    print "Copy started"
    counter = 0
    for doc in src_cursor:
        cleaned_tweet = tc.clean(doc['text'])
        dest_coll.insert_one({'text': cleaned_tweet})
        counter += 1
        if counter % 1000 == 0:
            print "{0} tweets copied".format(counter)
Example #15
    def main(self, data_type):
        df_final = pd.DataFrame()

        if data_type == 'tweet':
            #Get cleaner func.
            cleaner = TweetCleaner('', 'tweet')  #-->Column name

            #predict tweet
            TWEEETs = fsp.fetchMysql('tweet')
            input_df = pd.DataFrame(TWEEETs, columns=['id', 'tweet'])
            df_tweet = cleaner.cleaning_table(input_df)
            df_tweet = df_tweet[pd.notnull(df_tweet['tweet'])]
            df_final = df_tweet.drop_duplicates(subset=['tweet'])

        elif data_type == 'comment':
            #Get cleaner func.
            cleaner = TweetCleaner('', 'tweet')  #-->Column name

            #predict tweet
            COMMENTs = fsp.fetchMysql('comment')
            input_df = pd.DataFrame(COMMENTs, columns=['id', 'tweet'])
            df_comment = cleaner.cleaning_table(input_df)
            df_comment = df_comment[pd.notnull(df_comment['tweet'])]
            df_final = df_comment.drop_duplicates(subset=['tweet'])

        else:
            '''
            To make predictions from files. Change data kind in insertMysql before running
            '''
            #f= open("Results/DaggettBeaver.txt","r+")
            #fl = f.readlines()
            #data = list()
            #for x in fl:
            #    data.append(x.split())
            #df = pd.DataFrame(data, columns = ['id','predicted_hyperpartisan'])
            #df = df[pd.notnull(df['predicted_hyperpartisan'])]

            df_k = pd.read_csv("TheMEEye.csv", encoding='unicode_escape')
            #df_final = df_k [['text_tweet', 'id_tweet', 'predicted_bias']]
            df_final = df_k[pd.notnull(df_k['text'])]
            print(df_final.head())

        #Get sequences
        X_test = self._convert_texts_to_sequences(df_final)
        print("This is how X_test looks like:", X_test[0:10])

        #Load model
        model = load_model(
            os.path.join(self.sem_eval_path, 'models', self.model_file_name))

        #Do the prediction
        y_pred = self._predict(model, X_test)

        # Create output dataframe to write on disk
        y_pred_df = self._create_output_dataframe(df_final, y_pred)
        print("This is how y_pred_df looks like:", y_pred_df.head())
        y_pred_df.to_csv(runOutputFileName, index=False)

        fsp.insertMysql(y_pred_df, 'tweet')

        print("DONE!")
Example #16
def main():
    print('tf:', tf.__version__)

    random.seed(42)

    raw_tweets = DataLoader(TRAINING_INPUT_FILENAME).load()
    cleaner = TweetCleaner()
    clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]

    clean_tweets_as_lists = [list(t) for t in clean_tweets]
    print('number of clean_tweets_as_lists:', len(clean_tweets_as_lists))
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH,
                             max_length=MAX_TWEET_LENGTH)
    selected_tweets_as_lists = [
        t for t in clean_tweets_as_lists if selector.select(t)
    ]
    print('number of selected_tweets_as_lists:', len(selected_tweets_as_lists))

    if CONTINUE_TRAINING:
        training_model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
    else:
        model_creator = NNModelCreator(latent_dim=LATENT_DIM,
                                       dense_dim=DENSE_DIM)
        training_model = model_creator.create_training_model()

    nn_input_preparer = NNInputPreparer()

    num_generations_in_run = 0

    print(time.ctime())

    noiser = DisjointNoiser()
    for de_facto_epoch in range(INITIALLY_COMPLETED_DFEPOCH + 1,
                                NUM_DE_FACTO_EPOCHS):
        gb_training = nn_input_preparer.get_batches(selected_tweets_as_lists,
                                                    noiser,
                                                    GENERATOR_BATCH_SIZE)

        cp_filepath = BASE_DIR + f'dfepoch_{de_facto_epoch}_' + "{val_accuracy:.5f}.h5"

        checkpoint = ModelCheckpoint(cp_filepath,
                                     monitor='val_accuracy',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')

        while True:
            try:
                noised_batch, originals_batch, originals_delayed_batch = next(
                    gb_training)
                assert (len(noised_batch) == GENERATOR_BATCH_SIZE)
                print(noised_batch.shape, originals_batch.shape,
                      originals_delayed_batch.shape)
                validation_split = 0.125
                fit_batch_size = 32
                # We take care here so as not to manifest the "Your input ran out of data" warning
                validation_steps = int(
                    GENERATOR_BATCH_SIZE * validation_split) // fit_batch_size
                training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps
                training_model.fit([noised_batch, originals_delayed_batch],
                                   originals_batch,
                                   batch_size=fit_batch_size,
                                   steps_per_epoch=training_steps,
                                   epochs=1,
                                   validation_split=validation_split,
                                   validation_steps=validation_steps,
                                   callbacks=[checkpoint])
                # https://keras.io/api/models/model_training_apis/ says:
                # "The validation data is selected from the last samples in the ... data provided"
                # This means the model is never validated on tweets that we train it on.
            except StopIteration:
                break

            num_generations_in_run += 1
            print(f'num_generations: {num_generations_in_run}')

        print(time.ctime())
        print(f'End of de facto epoch {de_facto_epoch} - saving model')

        training_model.save(BASE_DIR + f'dfepoch_{de_facto_epoch}_end.h5')

    training_model.save(FINAL_TRAINED_MODEL_FILENAME)
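For the steps arithmetic inside the training loop above, a worked example with a hypothetical GENERATOR_BATCH_SIZE of 4096 (the real constant is defined elsewhere in the project):

GENERATOR_BATCH_SIZE = 4096   # hypothetical value, for illustration only
validation_split = 0.125
fit_batch_size = 32
validation_steps = int(GENERATOR_BATCH_SIZE * validation_split) // fit_batch_size  # 512 // 32 = 16
training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps         # 128 - 16 = 112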
Example #17
class SentimentAnalysis:
    def __init__(self,
                 file_name="./data/cleaned_train_sentiment.csv",
                 clean_required=False):
        self.tcl = TweetCleaner()
        if (clean_required):
            self.cleaned_train_file("./data/train.csv")
        self.cleaned_train_tweets = pd.read_csv(file_name,
                                                encoding='latin-1',
                                                header=None)
        self.max_features = 2000
        self.feature_names = None
        self.Classifiers = [
            LogisticRegression(C=0.000000001, solver='liblinear',
                               max_iter=200),
            DecisionTreeClassifier(),
            RandomForestClassifier(n_estimators=30),
            AdaBoostClassifier(),
            MultinomialNB()
        ]

    def set_max_features(self, value):
        self.max_features = value

    def analyser(self):
        count_word = TfidfVectorizer(max_features=self.max_features,
                                     analyzer="word")
        train_features = count_word.fit_transform(
            self.cleaned_train_tweets[1].values.astype('U').tolist())
        self.feature_names = count_word.get_feature_names()
        return self.modelling(train_features)

    def count_test_feature(self, testfeature):
        count_word2 = TfidfVectorizer(vocabulary=self.feature_names,
                                      analyzer="word")
        test_feature = count_word2.fit_transform(testfeature)
        return test_feature

    def best_model(self, model_list):
        # Return the model with the highest accuracy; keying on the score avoids
        # comparing the model objects themselves when accuracies tie.
        return max(model_list, key=lambda entry: entry[0])[1]

    def modelling(self, train_f, run_all_classifiers=False):
        model_list = []
        train_x, test_x, train_y, test_y = train_test_split(
            train_f, self.cleaned_train_tweets[0], test_size=0.2)

        if (not run_all_classifiers):
            self.Classifiers = [MultinomialNB()]

        for classifier in self.Classifiers:
            model = classifier.fit(train_x, train_y)
            pred = model.predict(test_x)

            #Evaluation
            accur = accuracy_score(pred, test_y)
            print('Accuracy of ' + classifier.__class__.__name__ + ' is ' +
                  str(accur))
            model_list.append((accur, model))

        return self.best_model(model_list)

    def cleaned_train_file(self, file_name):
        train_tweets = pd.read_csv(file_name, encoding='latin-1')
        train_tweet_list = self.tcl.clean_tweets(
            train_tweets.SentimentText, False)  # clean tweets from entire file
        self.tcl.cleaned_file_creator("./data/cleaned_train_sentiment.csv",
                                      train_tweets.Sentiment, train_tweet_list)
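A usage sketch for SentimentAnalysis; the input text is made up, it assumes ./data/cleaned_train_sentiment.csv exists, and the meaning of the predicted label depends on how sentiment is encoded in that file:

sa = SentimentAnalysis()
best = sa.analyser()  # fits the TF-IDF vectorizer and returns the best-scoring model
test_features = sa.count_test_feature(["i love this phone, best purchase ever"])
print(best.predict(test_features))  # e.g. [1] if 1 encodes positive sentiment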