def plot_most_frequent_tokens(self, df, column_name):
    """Plot a frequency distribution of the most common tokens in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents to analyse.
    column_name : str
        Name of the column containing the raw text documents.
    """
    count_vectorizer = CountVectorizer()
    tf_original = count_vectorizer.fit_transform(df[column_name])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() is the supported replacement.
    tf_feature_names = count_vectorizer.get_feature_names_out()
    visualizer = FreqDistVisualizer(features=tf_feature_names, orient='v')
    visualizer.fit(tf_original)
    visualizer.show()
def bagOfWords(featureTrain, stopWords=False, countWords=False, plot=False):
    """Fit a bag-of-words model on the ``headline`` column of *featureTrain*.

    Parameters
    ----------
    featureTrain : pandas.DataFrame
        Training data; must expose a ``headline`` text column.
    stopWords : bool, optional
        When True, drop English stop words during vectorization.
    countWords : bool, optional
        When True, plot the top-20 token frequency distribution and the
        hand-rolled bar chart via ``plotBagOfWords``.
    plot : bool, optional
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    tuple
        ``(count_vect, X_train_counts)`` — the fitted vectorizer and the
        sparse document-term count matrix.
    """
    # Idiomatic truthiness test instead of `stopWords == False`.
    if stopWords:
        count_vect = CountVectorizer(stop_words='english')
    else:
        count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(featureTrain.headline)

    if countWords:
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is the supported replacement.
        features = count_vect.get_feature_names_out()
        visualizer = FreqDistVisualizer(features=features, n=20, orient='v')
        visualizer.fit(X_train_counts)
        words = countTopWords(X_train_counts, count_vect, 20)
        if stopWords:
            # First call saves the figure to disk, second displays it.
            visualizer.show(outpath="SWRemovedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Removed", words, 20, stopWords)
        else:
            visualizer.show(outpath="SWIncludedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Included", words, 20, stopWords)
    return count_vect, X_train_counts
print(x_train.shape)
print(x_test.shape)
print('\n')

print("________________________________-Text preparation___________________")
# Convert the NLP text into count vectors with CountVectorizer,
# dropping English stop words from NLTK's list.
print("_____________________Contvervectorizer___________________________")
con_vec = CountVectorizer(stop_words=stopwords.words('english'))
x_train_count = con_vec.fit_transform(x_train)

# Token frequency distribution of the training vocabulary.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
feature = con_vec.get_feature_names_out()
visualizer = FreqDistVisualizer(features=feature, orient='v')
visualizer.fit(x_train_count)
visualizer.show()

# Re-weight the raw counts with TF-IDF.
print("---------------------TfdiTransformer------------------------------------------------------")
tfidftransformer = TfidfTransformer()
x_train_tfidf = tfidftransformer.fit_transform(x_train_count)
print(x_train_tfidf.shape)

# TfidfVectorizer does counting + TF-IDF weighting in a single step.
# NOTE(review): this rebinds x_train_tfidf, discarding the TfidfTransformer
# result computed just above — confirm the overwrite is intended.
print("-------------------------TfidfVectorize---------------------------")
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
print(x_train_tfidf)
print('\n')

# We used 3 algorithms to test our model: SVM, Random Forest and Decision Tree.
# --- Exemplo n.º 4 ---
    df[new_text_field_name] = df[new_text_field_name].apply(
        lambda elem: re.sub(r"\d+", "", elem))
    return df


data_clean = clean_text(train_data, 'text', 'text')
# Remove stop words token-by-token, keeping only words not in `stop`.
data_clean['text'] = data_clean['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop]))

# Token frequency distribution over the full cleaned corpus.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(data_clean['text'])
features = vectorizer.get_feature_names_out()
visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(docs)
visualizer.show()

# Same plot restricted to the rows labelled as disaster tweets (target == 1).
disaster_tweets = data_clean[data_clean['target'] == 1]
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(disaster_tweets['text'])
features_disaster = vectorizer.get_feature_names_out()
visualizer_disaster = FreqDistVisualizer(features=features_disaster,
                                         orient='v')
visualizer_disaster.fit(docs)
visualizer_disaster.show()

### Logistic regression

# Hold out a validation split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data_clean['text'],
                                                    data_clean['target'],
                                                    random_state=0)