Example #1
def words_2gram_adj_matrix(df, voc, col='text', window=2,
                           normalize=True, min_abs=0, min_perc=0.0, binary=False):
    full_voc = list(np.unique(sm.get_all_words(
        df[col], filter_fun=lambda w: any('א'<=c<='ת' for c in w)
    )))
    # get list of sentences
    data = sm.get_all_sentences(df[col])
    # fill incidence matrices
    c = np.zeros(len(voc))
    offsets = list(range(-window,0)) + list(range(1,window+1))
    D = {off: np.zeros((len(voc), len(full_voc))) for off in offsets}
    for txt in data:
        sent = sm.get_all_words(
            txt, filter_fun=lambda w: any('א'<=c<='ת' for c in w))
        for k,w in enumerate(sent):
            if w in voc:
                i = voc.index(w)
                c[i] += 1
                for off in offsets:
                    if 0 <= k+off < len(sent):
                        D[off][i, full_voc.index(sent[k+off])] += 1
    # normalize
    for off in offsets:
        D[off][D[off]<min_abs] = 0
        if normalize:
            D[off] = D[off] * np.nan_to_num(1/c)[:, np.newaxis]
            D[off][D[off]<min_perc] = 0
        if binary: D[off][D[off]>0] = 1
    # adj matrix
    A = np.zeros((len(voc),len(voc)))
    for off in offsets:
        d = np.sqrt(D[off])
        A += np.matmul(d, d.transpose())
    np.fill_diagonal(A, 0)
    return (A, D, full_voc)
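
A minimal usage sketch for words_2gram_adj_matrix; the DataFrame df, the helper module sm, and the two-word sample vocabulary are illustrative assumptions, not part of the original listing.

# Hypothetical usage: shared skip-gram contexts of two hand-picked words,
# assuming df is a pandas DataFrame with a Hebrew 'text' column.
sample_voc = ['ממשלה', 'כנסת']   # assumed sample vocabulary
A, D, full_voc = words_2gram_adj_matrix(df, sample_voc, window=2,
                                        normalize=True, min_perc=0.05)
# A[i, j] sums, over all offsets, the square-rooted shared context weights
# of sample_voc[i] and sample_voc[j]; D[off] holds the per-offset rates.
print(A)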
Example #2
def words_local_incidence_matrix(df, voc, col='text', window=3,
                                 normalize=True, min_abs=0, min_perc=0.1, binary=False):
    full_voc = list(np.unique(sm.get_all_words(df[col])))
    # get list of sentences
    data = sm.get_all_sentences(df[col])
    # fill incidence matrix
    c = np.zeros(len(voc))
    D = np.zeros((len(voc), len(full_voc)))
    for txt in data:
        sent = sm.get_all_words(
            txt, filter_fun=lambda w: any('א'<=c<='ת' for c in w))
        for k,w in enumerate(sent):
            if w in voc:
                i = voc.index(w)
                c[i] += 1
                # clamp the left bound so a negative slice start does not
                # silently drop the left context of the first few words
                neighbors = sent[max(0, k-window):k] + sent[k+1:k+window+1]
                for w2 in neighbors:
                    D[i, full_voc.index(w2)] += 1
    # normalize
    D[D<min_abs] = 0
    if normalize:
        D = D * np.nan_to_num(1/c)[:, np.newaxis]
        D[D<min_perc] = 0
    if binary: D[D>0] = 1
    return D
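
A short sketch of consuming the local incidence matrix, e.g. for context-profile similarity between vocabulary words; the scikit-learn import, df, and the query word are assumptions for illustration.

from sklearn.metrics.pairwise import cosine_similarity   # assumed dependency

D = words_local_incidence_matrix(df, voc, window=3, normalize=True)
S = cosine_similarity(D)                 # len(voc) x len(voc) similarity matrix
i = voc.index('ממשלה')                   # hypothetical query word
nearest = np.argsort(S[i])[::-1][1:6]    # 5 most similar words by context profile
print([voc[j] for j in nearest])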
Example #3
def word2vec(df, col='text', size=100, window=3,
             min_count=1, workers=4, save_to=None, **kwargs):
    # split the corpus into sentences, then tokenize each sentence
    sents = sm.get_all_sentences(df[col])
    sents = [sm.get_all_words(s, stopwords=()) for s in sents]
    model = Word2Vec(sents, size=size, window=window,
                     min_count=min_count, workers=workers, **kwargs)
    if save_to:
        pickle.dump(model, open(save_to,'wb'))
    return model
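
A hedged usage sketch for the wrapper above; it assumes the gensim 3.x Word2Vec API (where the keyword is size rather than the 4.x vector_size) and a DataFrame df with a 'text' column.

model = word2vec(df, col='text', size=100, window=3, min_count=5)
# nearest neighbours of a hypothetical query word in the embedding space
print(model.wv.most_similar('ממשלה', topn=5))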
Example #4
File: Classifier.py  Project: ido90/News
def prepare_data_and_test(df, classifiers, x='article', y='source',
                          fig=None, t0=time(), add_heuristics=True,
                          force_balance=True, diagnosis=False):
    # convert to pairs (x,y) and split to train & test
    data = get_labeled_raw_data(df, verbose=1, force_balance=force_balance,
                                x_resolution=x, y_col=y)
    X_train_raw, X_test_raw, y_train, y_test = \
        train_test_split(data[0], data[1], test_size=0.2, random_state=0)
    print(f'Train & test groups defined ({time()-t0:.0f} [s]).')

    # extract features
    voc = sm.get_vocabulary(texts=X_train_raw, required_freq=20,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f'Vocabulary of {len(voc):d} words is set ({time()-t0:.0f} [s]).')
    X_train = extract_features(X_train_raw, voc, add_heuristics=add_heuristics)
    X_test = extract_features(X_test_raw, voc, add_heuristics=add_heuristics)
    print(f'Features extracted ({time()-t0:.0f} [s]).')

    # train & test
    res, models = test_models(X_train, X_test, y_train, y_test, classifiers,
                              t0=t0, verbose=3)
    print(f'Test finished ({time()-t0:.0f} [s]).')

    # results analysis
    if diagnosis:
        models_diagnosis(models.values(), list(X_train.columns),
                         x+' -> '+y, max_features=30)
    if fig is None:
        fig = plt.subplots(1, 2)
    plt.figure(fig[0].number)
    plot_results(res, fig[1], x + ' -> ' + y,
                 100 / len(np.unique(y_train)))
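
A possible call site, sketched under the assumption that classifiers is a name-to-estimator mapping of scikit-learn models (the helpers get_labeled_raw_data, extract_features and test_models are defined elsewhere in Classifier.py and not shown here).

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Assumed: classifiers are passed as a name -> estimator mapping.
classifiers = {'logistic': LogisticRegression(max_iter=1000),
               'forest': RandomForestClassifier(n_estimators=100)}
prepare_data_and_test(df, classifiers, x='article', y='source')
plt.show()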
Example #5
File: Classifier.py  Project: ido90/News
def texts_heuristics(texts):
    return {
        'n_words': ba.count_words(texts),
        'chars_per_word': np.array([len(s) for s in texts]) /
                          np.array(ba.count_words(texts)),
        'heb_chars_rate': np.nan_to_num([np.mean(['א'<=c<='ת'
                                    for w in sm.get_all_words(s) for c in w])
                           for s in texts]),
        'eng_chars_rate': np.nan_to_num(
            [np.mean(['a'<=c<='z' or 'A'<=c<='Z'
                      for w in sm.get_all_words(s) for c in w])
             for s in texts]),
        'num_chars_rate': np.nan_to_num([np.mean(['0'<=c<='9'
                                    for w in sm.get_all_words(s) for c in w])
                           for s in texts])
    }
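
The heuristics above are returned as a dict of equal-length arrays; a small sketch of wrapping them into a feature table (the pandas import and the source column are assumptions):

import pandas as pd

texts = list(df['text'])                     # assumed source column
features = pd.DataFrame(texts_heuristics(texts))
print(features.describe())                   # n_words, chars_per_word, *_chars_rate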
Example #6
def common_context(df, words, col='text', window=2):
    if isinstance(words[0],str):
        words = (words,)
    sents = sm.get_all_sentences(df[col])
    for pair in words:
        print("Words:\t", pair)
        A, D, voc = words_2gram_adj_matrix(df, pair, col=col, window=window)
        context = []
        for o in D:
            ii = [i[0] for i in np.argwhere(D[o][0,:] * D[o][1,:])]
            context.extend([voc[i] for i in ii])
        print("context:\t", context)
        for i, s in enumerate(sents):
            if np.any([w in s for w in pair]) and \
                    np.any([w in s for w in context]):
                print(i, s)
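
A hypothetical call; the word pair is an arbitrary example. Both a single pair and a list of pairs are accepted, as the isinstance check above shows.

common_context(df, ['ממשלה', 'קואליציה'], window=2)   # assumed word pair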
Example #7
def words_vs_texts_incidence_matrix(df, voc, col='text', per='article',
                           normalize=False, min_abs=0, min_perc=0, binary=False):
    # get list of texts
    data = list()
    sep = sm.SEPARATOR[per]
    for txt in df[col]:
        data.extend([s.strip().strip(sm.word_chars_filter) for s in
                     list(filter(None, re.split(sep, txt)))])
    # fill incidence matrix
    c = np.zeros(len(voc))
    D = np.zeros((len(voc), len(data)))
    for j,txt in enumerate(data):
        for w in sm.get_all_words(
                txt, filter_fun=lambda w: any('א'<=c<='ת' for c in w)):
            if w in voc:
                c[voc.index(w)] += 1
                D[voc.index(w), j] += 1
    # normalize
    D[D<min_abs] = 0
    if normalize:
        D = D * np.nan_to_num(1/c)[:, np.newaxis]
        D[D<min_perc] = 0
    if binary: D[D>0] = 1
    return D
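
A sketch of using the words-vs-texts incidence matrix as a simple bag-of-words representation per article; it assumes 'article' is a valid key of sm.SEPARATOR, as the default argument suggests.

D = words_vs_texts_incidence_matrix(df, voc, per='article', binary=True)
# D[i, j] == 1 iff voc[i] appears in article j
doc_freq = D.sum(axis=1)     # number of articles containing each word
print(sorted(zip(doc_freq, voc), reverse=True)[:10])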
Example #8
if __name__ == "__main__":
    ## configuration
    build_graphs = False
    save_graphs = False
    voc_samples = 0
    detailed_cliques = False
    build_word2vec = False

    # load data
    t0 = time()
    df = ba.load_data(r'..\Data\articles')
    print(f'Data loaded ({time()-t0:.0f} [s]).')

    # get vocabulary
    print("\n\n____________________")
    voc = sm.get_vocabulary(df, required_freq=70,
                            filter_fun=lambda w: any('א' <= c <= 'ת' for c in w))
    print(f"Vocabulary loaded ({len(voc):d} words) ({time()-t0:.0f} [s]).")
    if voc_samples > 0:
        voc = list(np.random.choice(voc, voc_samples, replace=False))
        print(f"Vocabulary shrunk ({len(voc):d} words) ({time()-t0:.0f} [s]).")

    # Graph of shared skip-grams neighbors
    if build_graphs:
        A, D, full_voc = words_2gram_adj_matrix(
            df, voc, window=3, normalize=True, min_abs=3, min_perc=0.0, binary=True)
        G = graph_of_words(voc, A=A, filter_singletons=True, A_thresh=2)
        if save_graphs:
            pickle.dump(G, open(r'..\Output\Context based embedding\2gram_based_graph.pkl', 'wb'))
    else:
        G = pickle.load(open(r'..\Output\Context based embedding\2gram_based_graph.pkl', 'rb'))
    print(f'Graph of 2-grams generated ({time()-t0:.0f} [s]).')