# --- Test-set feature extraction --------------------------------------------
# Derive per-tweet feature frames from the preprocessed test dataframe.
# NOTE(review): get_sentiment_features_df takes a keyword literally named
# `str` — shadows the builtin, but renaming it here would break the call;
# fix belongs in the function definition.
test_df_languages = get_language_df(test_df_tr)
test_df_day_week = get_day_week(test_df_tr)
test_df_part_day = get_part_day(test_df_tr)
test_df_sentiment = get_sentiment_features_df(ROOT_PATH, test_df_tr, str='test')
test_features_count = count_features_and_scale(test_df_tr, test_df_counts)

# --- Target encoding (author identification) --------------------------------
# y1: unique usernames; y2: integer class index per tweet.
y = train_df_tr['username'].values
y1, y2 = np.unique(y, return_inverse=True)

## MODEL ##
# TF-IDF over the cleaned tweet text: sublinear tf scaling, drop terms in
# >25% of documents or <3 documents, L2-normalised rows, unigrams only.
tfidf_vectorizer = funcs.StemmedTfidfVectorizer(
    sublinear_tf=True,
    max_df=0.25,
    min_df=3,
    norm='l2',
    stop_words=funcs.stop_words(),
    ngram_range=(1, 1),
)

# Build the dense train matrix in ONE concatenation instead of five chained
# np.append calls — np.append copies the whole matrix every call, so the
# old form was quadratic in the number of feature blocks.
X_tfidf = np.hstack([
    tfidf_vectorizer.fit_transform(train_df_tr['text_clean']).toarray(),
    train_df_languages,
    train_features_count,
    train_df_day_week,
    train_df_part_day,
    train_df_sentiment,
])
# (removed two bare `len(...)` expressions — notebook residue with no effect)
# --- Target encoding (party classification) ---------------------------------
# Re-encode labels from the `party` column (overwrites any earlier y/y1/y2).
# y1: unique party labels; y2: integer class index per tweet.
y = train_df_tr['party'].values
y1, y2 = np.unique(y, return_inverse=True)

# Count-based features, scaled, for both splits.
train_features_count = count_features_and_scale(train_df_tr, train_df_counts)
test_features_count = count_features_and_scale(test_df_tr, test_df_counts)

# Parameter selection (TFIDF): sublinear tf scaling, drop terms in >25% of
# documents or <3 documents, L2-normalised rows, unigrams only.
tfidf_vectorizer = funcs.StemmedTfidfVectorizer(
    sublinear_tf=True,
    max_df=0.25,
    min_df=3,
    norm='l2',
    stop_words=funcs.stop_words(),
    ngram_range=(1, 1),
)

# Train matrix: TF-IDF + languages + counts + sentiment, concatenated once
# via np.hstack instead of repeated np.append (which copies the full matrix
# on every call). NOTE(review): unlike the username model, day_week and
# part_day features are intentionally absent here — confirm this is desired.
X_tfidf = np.hstack([
    tfidf_vectorizer.fit_transform(train_df_tr['text_clean']).toarray(),
    train_df_languages,
    train_features_count,
    train_df_sentiment,
])

# Test matrix: transform (NOT fit) with the train-fitted vectorizer, then
# append the language features. Only the language block is added at this
# point in the visible code.
X_tfidf_test = np.hstack([
    tfidf_vectorizer.transform(test_df_tr['text_clean']).toarray(),
    test_df_languages,
])