import pandas as pd
from gensim import models
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split


def tf_idf_vect_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    # print("token: ", token_array)
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    # print(vectorizer)
    # fit learns the vocabulary and IDF weights on the training split only
    vec = vectorizer.fit(training_token_array)
    vec_matrix = vectorizer.transform(training_token_array)
    # data_frame = pd.DataFrame(vec_matrix.toarray(),
    #                           columns=vectorizer.get_feature_names_out())
    # print(data_frame)
    return test_token_array, vec, vec_matrix
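
# A minimal usage sketch of the same TfidfVectorizer fit/transform flow on an
# inline toy corpus, so shapes and vocabulary can be checked without
# text_processed() or split_string_2_data_array(); corpus and helper name are
# illustrative, not part of the pipeline above.
def _demo_tfidf_vectorizer():
    corpus = ["the cat sat on the mat",
              "the dog barked at the cat",
              "stock prices fell sharply"]
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    vectorizer.fit(corpus)
    matrix = vectorizer.transform(corpus)
    print(matrix.shape)                        # (3, n_vocabulary_terms)
    print(vectorizer.get_feature_names_out())  # the learned vocabulary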
def compute_dissimilarity_matrix():
    token_array = text_processed()
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    tf_idf = vectorizer.fit_transform(token_array)
    print(tf_idf.toarray())
    print(vectorizer.get_feature_names_out())
    # pairwise Euclidean distances between the TF-IDF document vectors
    matrix = euclidean_distances(tf_idf)
    # print(matrix)
    return matrix
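
# Toy check of the dissimilarity computation above (scikit-learn only; data
# is inline and illustrative): the result is a square symmetric matrix with
# zeros on the diagonal, since each document is at distance 0 from itself.
def _demo_euclidean_dissimilarity():
    corpus = ["apples and oranges", "apples and pears", "stock market news"]
    tf_idf = TfidfVectorizer(stop_words='english').fit_transform(corpus)
    matrix = euclidean_distances(tf_idf)
    print(matrix)  # matrix[i, j] == distance between documents i and j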
def tf_idf_vect_feature_vector_df():
    # DataFrame-based variant; named distinctly so it does not shadow
    # tf_idf_vect_feature_vector() above
    df = text_processed()
    vectorizer = TfidfVectorizer()
    vec_train = vectorizer.fit_transform(df.Tweets)
    df['tweets_vec'] = list(vec_train.toarray())
    # df.to_csv('tfidf_vector.csv')
    train, test = train_test_split(df, test_size=0.2)
    print(vectorizer.get_feature_names_out())
    print(df)
    return test, train, df
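
# Sketch of the 80/20 DataFrame split used above on an inline frame (pandas
# and scikit-learn only; the 'Tweets' column name mirrors the function above
# and random_state is fixed purely for reproducibility).
def _demo_dataframe_split():
    df = pd.DataFrame({'Tweets': ["great day", "bad service", "love it",
                                  "never again", "ok I guess"]})
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    print(len(train), len(test))  # 4 1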
def tf_idf_trans_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    print(token_array)
    # TfidfTransformer has no stop_words/analyzer options and expects a
    # term-count matrix, so tokenize and build the vocabulary with
    # CountVectorizer first, then rescale the counts to TF-IDF
    count_vectorizer = CountVectorizer(stop_words='english', analyzer='word')
    counts = count_vectorizer.fit_transform(token_array)
    X = TfidfTransformer().fit_transform(counts)
    analyze = count_vectorizer.build_analyzer()
    print(analyze("subject is not the case"))
    # summarize the learned vocabulary
    print(count_vectorizer.get_feature_names_out())
    # summarize the encoded vectors
    print(X.toarray())
    return X
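
# Sanity-check sketch: CountVectorizer followed by TfidfTransformer should
# produce the same matrix as TfidfVectorizer on one corpus (inline toy data;
# scikit-learn documents the two pipelines as equivalent).
def _demo_transformer_matches_vectorizer():
    corpus = ["the cat sat", "the dog barked"]
    counts = CountVectorizer().fit_transform(corpus)
    via_transformer = TfidfTransformer().fit_transform(counts)
    via_vectorizer = TfidfVectorizer().fit_transform(corpus)
    print((via_transformer != via_vectorizer).nnz == 0)  # True if identical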
def count_vectorizer_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    # binary must be a real boolean: the string 'false' is truthy and would
    # silently switch the vectorizer into binary (0/1) mode
    vectorizer = CountVectorizer(encoding='utf-8', analyzer='word',
                                 stop_words='english', binary=False,
                                 min_df=0.01)
    # tokenize and build the vocabulary once, then encode the training split
    vec = vectorizer.fit(training_token_array)
    vec_matrix = vectorizer.transform(training_token_array)
    # print(vectorizer.get_feature_names_out())
    # print(vec_matrix.shape)
    # print(vec_matrix.toarray())
    return test_token_array, vec, vec_matrix
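
# Toy illustration of the binary flag fixed above: binary=False keeps raw
# term counts, binary=True clips them to 0/1 presence indicators (inline
# data, default vocabulary settings).
def _demo_count_vectorizer_binary():
    corpus = ["buy buy buy now", "limited offer now"]
    counts = CountVectorizer(binary=False).fit_transform(corpus)
    flags = CountVectorizer(binary=True).fit_transform(corpus)
    print(counts.toarray())  # e.g. 'buy' counted 3 times in document 0
    print(flags.toarray())   # same cells capped at 1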
def word2vec_feature_vector():
    # Word2Vec expects an iterable of token lists; a list of raw strings
    # would be treated as sequences of single characters
    token_array = text_processed()
    print(token_array)
    model = models.Word2Vec(token_array, min_count=1)
    print(model)
    return model
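
# Minimal Word2Vec sketch on inline, pre-tokenised sentences (gensim 4.x API
# assumed, hence vector_size; min_count=1 keeps every token in this tiny
# corpus, matching the call above).
def _demo_word2vec():
    sentences = [["machine", "learning", "is", "fun"],
                 ["deep", "learning", "is", "machine", "learning"]]
    model = models.Word2Vec(sentences, vector_size=10, min_count=1, seed=1)
    print(model.wv["learning"].shape)              # (10,)
    print(model.wv.most_similar("learning", topn=2))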