Example #1
Votes: 0
File: Classifier.py — Project: ido90/News
def prepare_data_and_test(df, classifiers, x='article', y='source',
                          fig=None, t0=None, add_heuristics=True,
                          force_balance=True, diagnosis=False):
    """Build (x, y) data from df, extract vocabulary features, then train and
    evaluate the given classifiers, plotting the results.

    :param df: source DataFrame of articles.
    :param classifiers: classifiers to train/test (passed to test_models).
    :param x: input resolution/column for the raw texts (e.g. 'article').
    :param y: label column (e.g. 'source').
    :param fig: optional (figure, axes) pair to plot into; created if None.
    :param t0: reference start time for progress messages; defaults to now.
    :param add_heuristics: whether to add heuristic features in extraction.
    :param force_balance: whether to balance classes when building the data.
    :param diagnosis: whether to also run per-model feature diagnosis.
    """
    # BUG FIX: the original signature used `t0=time()`, which is evaluated
    # once at import time — every call relying on the default then reported
    # elapsed time since module load. Use the None-sentinel idiom instead.
    if t0 is None:
        t0 = time()

    # convert to pairs (x,y) and split to train & test
    data = get_labeled_raw_data(df, verbose=1, force_balance=force_balance,
                                x_resolution=x, y_col=y)
    X_train_raw, X_test_raw, y_train, y_test = \
        train_test_split(data[0], data[1], test_size=0.2, random_state=0)
    print(f'Train & test groups defined ({time()-t0:.0f} [s]).')

    # extract features: vocabulary is built from the TRAIN texts only,
    # restricted to words containing at least one Hebrew letter
    voc = sm.get_vocabulary(texts=X_train_raw, required_freq=20,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f'Vocabulary of {len(voc):d} words is set ({time()-t0:.0f} [s]).')
    X_train = extract_features(X_train_raw, voc, add_heuristics=add_heuristics)
    X_test = extract_features(X_test_raw, voc, add_heuristics=add_heuristics)
    print(f'Features extracted ({time()-t0:.0f} [s]).')

    # train & test
    res, models = test_models(X_train, X_test, y_train, y_test, classifiers,
                              t0=t0, verbose=3)
    print(f'Test finished ({time()-t0:.0f} [s]).')

    # results analysis
    if diagnosis:
        models_diagnosis(models.values(), list(X_train.columns),
                         x+' -> '+y, max_features=30)
    if fig is None:
        fig = plt.subplots(1, 2)
    plt.figure(fig[0].number)
    # baseline accuracy = 100 / number of classes (uniform random guess)
    plot_results(res, fig[1], x + ' -> ' + y,
                 100 / len(np.unique(y_train)))
Example #2
Votes: 0
if __name__ == "__main__":
    ## configuration flags for this analysis run
    build_graphs = False       # rebuild the word graph vs. load a cached one
    save_graphs = False        # persist a freshly built graph to disk
    voc_samples = 0            # >0: subsample the vocabulary to this size
    detailed_cliques = False
    build_word2vec = False

    # load data
    t0 = time()
    df = ba.load_data(r'..\Data\articles')
    print(f'Data loaded ({time()-t0:.0f} [s]).')

    # get vocabulary (words containing at least one Hebrew letter)
    print("\n\n____________________")
    voc = sm.get_vocabulary(df, required_freq=70,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f"Vocabulary loaded ({len(voc):d} words) ({time()-t0:.0f} [s]).")
    if voc_samples > 0:
        voc = list(np.random.choice(voc, voc_samples, replace=False))
        print(f"Vocabulary shrunk ({len(voc):d} words) ({time()-t0:.0f} [s]).")

    # Graph of shared skip-grams neighbors
    if build_graphs:
        A, D, full_voc = words_2gram_adj_matrix(
            df, voc, window=3, normalize=True, min_abs=3, min_perc=0.0, binary=True)
        G = graph_of_words(voc, A=A, filter_singletons=True, A_thresh=2)
        if save_graphs:
            # BUG FIX: originally pickle.dump(..., open(..., 'wb')) leaked the
            # file handle; an unclosed write handle can leave a truncated
            # pickle. Use a context manager to guarantee flush/close.
            with open(r'..\Output\Context based embedding\2gram_based_graph.pkl', 'wb') as f:
                pickle.dump(G, f)
    else:
        # BUG FIX: same handle-leak on the read path — close via `with`.
        with open(r'..\Output\Context based embedding\2gram_based_graph.pkl', 'rb') as f:
            G = pickle.load(f)
    print(f'Graph of 2-grams generated ({time()-t0:.0f} [s]).')