예제 #1
0
    # Feature blocks for the test split, each produced from test_df_tr by a
    # project helper (helpers are defined elsewhere in the project).
    test_df_languages = get_language_df(test_df_tr)
    test_df_day_week = get_day_week(test_df_tr)
    test_df_part_day = get_part_day(test_df_tr)
    test_df_sentiment = get_sentiment_features_df(ROOT_PATH,
                                                  test_df_tr,
                                                  str='test')
    test_features_count = count_features_and_scale(test_df_tr, test_df_counts)

    # Encode the target labels: y1 is the sorted array of distinct usernames,
    # y2 is the integer index into y1 for every training row.
    y = train_df_tr['username'].values
    y1, y2 = np.unique(y, return_inverse=True)

    ## MODEL ##
    tfidf_vectorizer = funcs.StemmedTfidfVectorizer(
        sublinear_tf=True,
        max_df=0.25,
        min_df=3,
        norm='l2',
        stop_words=funcs.stop_words(),
        ngram_range=(1, 1),
    )

    # Fit the vectorizer on the cleaned training text and build the training
    # matrix: TF-IDF columns first, then the extra feature blocks in a fixed
    # order. A single concatenate copies the data once, whereas chaining
    # np.append re-copies the whole growing matrix for every block.
    # NOTE(review): assumes every feature block is 2-D with one row per
    # training example -- confirm against the helpers that build them.
    X_tfidf = np.concatenate(
        (
            tfidf_vectorizer.fit_transform(
                train_df_tr['text_clean']).toarray(),
            train_df_languages,
            train_features_count,
            train_df_day_week,
            train_df_part_day,
            train_df_sentiment,
        ),
        axis=1,
    )
예제 #2
0
    # Encode the target labels: y1 is the sorted array of distinct parties,
    # y2 is the integer index into y1 for every training row.
    y = train_df_tr['party'].values
    y1, y2 = np.unique(y, return_inverse=True)

    # Count-based features for both splits, produced by the project helper
    # count_features_and_scale (defined elsewhere).
    train_features_count = count_features_and_scale(train_df_tr,
                                                    train_df_counts)
    test_features_count = count_features_and_scale(test_df_tr, test_df_counts)

    # Parameter selection (TFIDF).
    # Options left disabled in this configuration: strip_accents='unicode',
    # max_df=0.5, a custom token_pattern (e.g. '#?\w\w+'), max_features=4000.
    tfidf_vectorizer = funcs.StemmedTfidfVectorizer(
        sublinear_tf=True,  # scaling
        max_df=0.25,
        min_df=3,
        norm='l2',
        stop_words=funcs.stop_words(),
        ngram_range=(1, 1),
    )

    # Fit on the cleaned training text and build the training matrix: TF-IDF
    # columns first, then the extra feature blocks in a fixed order. A single
    # concatenate copies the data once, whereas chaining np.append re-copies
    # the whole growing matrix for every block.
    # NOTE(review): assumes every feature block is 2-D with one row per
    # example -- confirm against the helpers that build them.
    X_tfidf = np.concatenate(
        (
            tfidf_vectorizer.fit_transform(
                train_df_tr['text_clean']).toarray(),
            train_df_languages,
            train_features_count,
            train_df_sentiment,
        ),
        axis=1,
    )

    # The test text goes through the already-fitted vectorizer (transform
    # only, no refit), then the test language block is appended column-wise.
    X_tfidf_test = np.concatenate(
        (
            tfidf_vectorizer.transform(test_df_tr['text_clean']).toarray(),
            test_df_languages,
        ),
        axis=1,
    )