示例#1
0
    bags_test = count_vec.transform(docs_test)

    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())

    return X_train.reset_index(drop=True), X_test.reset_index(
        drop=True), y_train, y_test


# Script entry point: load data, build features, configure logging, and set
# up the state used by a 5-fold stratified cross-validation run.
if __name__ == '__main__':

    # Request 1000 rows with shuffle=True from the project loader, then split
    # them into train/test features and labels via preprocess_data.
    df = load_pandas_df(nrows=1000, shuffle=True)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # RUN_NAME selects the output directory for this run's log file.
    RUN_NAME = 'logistic_regression'
    # NOTE(review): the add(sink, colorize=..., format=...) signature looks
    # like loguru's logger -- confirm against the file's imports.
    logger.add(f'data/{RUN_NAME}/result.log',
               colorize=True,
               format='<green>{time}</green> {message}')
    # Record the shapes of the train and test feature matrices.
    logger.info(f'{X_train.shape}, {X_test.shape}')

    # Accumulators for the cross-validation loop that follows.
    y_preds = []
    # Number of columns in oof_train; presumably the number of target
    # classes in the dataset -- TODO confirm.
    NUM_CLASS = 9
    # One row per training sample, one column per class, initialised to zero;
    # presumably filled with out-of-fold predictions -- verify in the loop.
    oof_train = np.zeros((len(X_train), NUM_CLASS))
    # 5 stratified folds, shuffled with a fixed seed.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    for fold_id, (train_index,
                  valid_index) in enumerate(tqdm(cv.split(X_train, y_train))):
示例#2
0
        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            X = X * self._idf_diag

        return X


# Script entry point: normalise and tokenise a small sample of documents and
# turn them into a bag-of-n-grams count matrix.
if __name__ == '__main__':
    # Request 10 rows from the project loader.
    df = load_pandas_df(nrows=10)

    # Normalization: apply neologdn.normalize to every entry of the 'text'
    # column.
    df['text'] = df['text'].apply(neologdn.normalize)

    # Tokenise each document and re-join the tokens with single spaces so
    # that CountVectorizer can consume them as whitespace-separated text.
    # NOTE(review): 'MeCab' presumably selects the MeCab backend of
    # WordTokenizer -- confirm against the tokenizer library in use.
    tokenizer = WordTokenizer('MeCab')
    docs = np.array([
        ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text']
    ])
    print(docs.shape)
    # (10,)

    # Count 1- to 3-grams, keeping only terms that occur in at least two
    # documents and at most 20000 features.
    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    # Learn the vocabulary from docs and build the count matrix.
    bags = count_vec.fit_transform(docs)
示例#3
0
import matplotlib.pyplot as plt
import japanize_matplotlib

from utils_nlp.dataset.livedoor import load_pandas_df

# Script entry point: plot the ten most frequent first characters of the
# 'text' column as a bar chart and save it as a PNG.
if __name__ == '__main__':
    df = load_pandas_df()
    # First character of every document.
    df['first_char'] = df['text'].str[0]

    # Name the output columns explicitly instead of relying on the defaults
    # of value_counts().reset_index(): pandas < 2.0 yields the columns
    # 'index'/'first_char', while pandas >= 2.0 yields 'first_char'/'count',
    # which makes plot_df['index'] raise KeyError. With rename_axis and
    # reset_index(name=...) the frame always has 'index' (the character) and
    # 'first_char' (its count).
    plot_df = (df['first_char']
               .value_counts()
               .head(10)
               .rename_axis('index')
               .reset_index(name='first_char'))

    # Enable Japanese font rendering so the character labels are readable.
    japanize_matplotlib.japanize()
    plt.figure(figsize=(15, 8))
    plt.bar(plot_df['index'], plot_df['first_char'])
    plt.savefig('examples/visualization/japanize.png')