def prepare_data_and_test(df, classifiers, x='article', y='source', fig=None,
                          t0=None, add_heuristics=True, force_balance=True,
                          diagnosis=False):
    """Run the full classification pipeline: split, featurize, train, test, plot.

    Converts ``df`` to labeled (x, y) pairs, splits 80/20 into train/test,
    builds a Hebrew-word vocabulary from the training texts, extracts
    features, trains/tests every classifier, and plots the results.

    :param df: input data frame of articles.
    :param classifiers: classifiers to train and evaluate (passed to
        ``test_models``).
    :param x: resolution of the input text (e.g. 'article') — also used in
        plot titles.
    :param y: name of the label column (e.g. 'source').
    :param fig: optional (figure, axes) pair to plot into; a new
        ``plt.subplots(1, 2)`` is created when None.
    :param t0: reference start time for progress messages; defaults to the
        time of the call.
    :param add_heuristics: whether to add heuristic features on top of the
        vocabulary features.
    :param force_balance: whether to balance classes when building the
        labeled data.
    :param diagnosis: whether to also run per-model feature diagnosis.
    """
    # BUG FIX: the original signature used `t0=time()`, which is evaluated
    # once at import time — so calls omitting t0 reported elapsed time since
    # import, not since the call. Resolve the default at call time instead.
    if t0 is None:
        t0 = time()

    # convert to pairs (x,y) and split to train & test
    data = get_labeled_raw_data(df, verbose=1, force_balance=force_balance,
                                x_resolution=x, y_col=y)
    X_train_raw, X_test_raw, y_train, y_test = \
        train_test_split(data[0], data[1], test_size=0.2, random_state=0)
    print(f'Train & test groups defined ({time()-t0:.0f} [s]).')

    # extract features: vocabulary is built from the TRAINING texts only,
    # keeping words that appear >= 20 times and contain a Hebrew letter.
    voc = sm.get_vocabulary(texts=X_train_raw, required_freq=20,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f'Vocabulary of {len(voc):d} words is set ({time()-t0:.0f} [s]).')
    X_train = extract_features(X_train_raw, voc, add_heuristics=add_heuristics)
    X_test = extract_features(X_test_raw, voc, add_heuristics=add_heuristics)
    print(f'Features extracted ({time()-t0:.0f} [s]).')

    # train & test
    res, models = test_models(X_train, X_test, y_train, y_test, classifiers,
                              t0=t0, verbose=3)
    print(f'Test finished ({time()-t0:.0f} [s]).')

    # results analysis
    if diagnosis:
        models_diagnosis(models.values(), list(X_train.columns),
                         x + ' -> ' + y, max_features=30)
    if fig is None:
        fig = plt.subplots(1, 2)
    plt.figure(fig[0].number)
    # 100 / n_classes is the chance-level accuracy baseline for the plot.
    plot_results(res, fig[1], x + ' -> ' + y, 100 / len(np.unique(y_train)))
if __name__ == "__main__":
    ## configuration
    # Feature flags controlling which pipeline stages run below.
    build_graphs = False        # rebuild the 2-gram graph instead of loading the pickle
    save_graphs = False         # persist a freshly built graph to disk
    voc_samples = 0             # if > 0, subsample the vocabulary to this many words
    # NOTE(review): the two flags below are not used in the visible portion of
    # this script — presumably consumed by later stages of the file; confirm.
    detailed_cliques = False
    build_word2vec = False

    # load data
    t0 = time()
    df = ba.load_data(r'..\Data\articles')
    print(f'Data loaded ({time()-t0:.0f} [s]).')

    # get vocabulary: keep words that occur >= 70 times and contain at least
    # one Hebrew letter (the lambda checks the Unicode range א..ת).
    print("\n\n____________________")
    voc = sm.get_vocabulary(df, required_freq=70,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f"Vocabulary loaded ({len(voc):d} words) ({time()-t0:.0f} [s]).")
    if voc_samples > 0:
        # Random subsample without replacement, for faster experimentation.
        voc = list(np.random.choice(voc, voc_samples, replace=False))
        print(f"Vocabulary shrunk ({len(voc):d} words) ({time()-t0:.0f} [s]).")

    # Graph of shared skip-grams neighbors: either build it from the corpus
    # or load a previously built graph from the pickle on disk.
    if build_graphs:
        # A: adjacency matrix, D: presumably a distance/degree matrix,
        # full_voc: vocabulary actually used — D and full_voc are unused in
        # the visible code; TODO confirm they are needed downstream.
        A, D, full_voc = words_2gram_adj_matrix(
            df, voc, window=3, normalize=True, min_abs=3, min_perc=0.0,
            binary=True)
        G = graph_of_words(voc, A=A, filter_singletons=True, A_thresh=2)
        if save_graphs:
            # NOTE(review): file handles from open() are never closed here or
            # in the else-branch below — consider `with open(...)`.
            pickle.dump(G, open(
                r'..\Output\Context based embedding\2gram_based_graph.pkl',
                'wb'))
    else:
        # Loading a pickle is only safe for trusted, locally produced files.
        G = pickle.load(open(
            r'..\Output\Context based embedding\2gram_based_graph.pkl', 'rb'))
    print(f'Graph of 2-grams generated ({time()-t0:.0f} [s]).')