Exemplo n.º 1
0
def texts_heuristics(texts):
    """Compute simple per-text word and character statistics.

    Args:
        texts: Collection of strings. It is passed to ``ba.count_words`` and
            also iterated here, so it must support repeated iteration.

    Returns:
        dict with the keys:
            'n_words': the value returned by ``ba.count_words(texts)``
                (presumably one word count per text -- confirm in ``ba``).
            'chars_per_word': raw string length ``len(text)`` of every text
                divided element-wise by 'n_words'.
            'heb_chars_rate': per text, the fraction of characters (taken
                from the words returned by ``sm.get_all_words``) that lie in
                the range 'א'..'ת'.
            'eng_chars_rate': same, for the ASCII letters a-z / A-Z.
            'num_chars_rate': same, for the ASCII digits 0-9.
        The three rates are float arrays; a text that yields no characters
        gets a rate of 0.
    """
    # Counted once and reused for both 'n_words' and 'chars_per_word'.
    n_words = ba.count_words(texts)

    heb_rates, eng_rates, num_rates = [], [], []
    for text in texts:
        # Tokenize once per text and reuse the characters for all three rates.
        chars = [c for word in sm.get_all_words(text) for c in word]
        # With no characters the rate is undefined; dividing the zero counts
        # by 1 reports 0 without triggering numpy's empty-mean warning.
        total = len(chars) or 1
        heb_rates.append(sum('א' <= c <= 'ת' for c in chars) / total)
        eng_rates.append(
            sum('a' <= c <= 'z' or 'A' <= c <= 'Z' for c in chars) / total)
        num_rates.append(sum('0' <= c <= '9' for c in chars) / total)

    return {
        'n_words': n_words,
        'chars_per_word': np.array([len(s) for s in texts]) /
                          np.array(n_words),
        'heb_chars_rate': np.array(heb_rates, dtype=float),
        'eng_chars_rate': np.array(eng_rates, dtype=float),
        'num_chars_rate': np.array(num_rates, dtype=float),
    }
Exemplo n.º 2
0
def word2vec(df, col='text', size=100, window=3,
             min_count=1, workers=4, save_to=None, **kwargs):
    """Train a Word2Vec model on the sentences found in ``df[col]``.

    Args:
        df: Table-like object indexable by column name.
        col: Name of the column holding the texts.
        size, window, min_count, workers: Forwarded to ``Word2Vec`` under the
            same keyword names.
        save_to: Optional path; when truthy, the trained model is pickled
            to this file.
        **kwargs: Extra keyword arguments forwarded to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # NOTE(review): if Word2Vec is gensim's, the ``size`` keyword only exists
    # in gensim < 4 (later renamed ``vector_size``) -- confirm pinned version.
    sentences = sm.get_all_sentences(df[col])
    # stopwords=() is passed explicitly; presumably this disables stopword
    # removal so every token is kept -- confirm in ``sm``.
    tokenized = [sm.get_all_words(s, stopwords=()) for s in sentences]
    model = Word2Vec(tokenized, size=size, window=window,
                     min_count=min_count, workers=workers, **kwargs)
    if save_to:
        # Context manager guarantees the file is flushed and closed even if
        # pickling fails.
        with open(save_to, 'wb') as fh:
            pickle.dump(model, fh)
    return model
Exemplo n.º 3
0
def words_vs_texts_incidence_matrix(df, voc, col='text', per='article',
                           normalize=False, min_abs=0, min_perc=0, binary=False):
    """Build a (vocabulary word) x (text unit) occurrence matrix.

    Every text in ``df[col]`` is split with the regex ``sm.SEPARATOR[per]``
    into units; each unit becomes one column of the matrix.

    Args:
        df: Table-like object indexable by column name.
        voc: Sequence of vocabulary words; row ``i`` corresponds to
            ``voc[i]`` (for a duplicated word, its first position is used).
        col: Name of the column holding the texts.
        per: Key into ``sm.SEPARATOR`` selecting the split pattern.
        normalize: If true, divide every row by the total number of
            occurrences of its word over all units.
        min_abs: Raw counts below this value are set to 0.
        min_perc: After normalization, values below this are set to 0
            (only applied when ``normalize`` is true).
        binary: If true, every positive entry is replaced by 1.

    Returns:
        numpy array of shape ``(len(voc), number_of_units)``.
    """
    # Split every text into units. Empty pieces are dropped *before*
    # stripping, so a piece made only of strippable characters is kept as ''.
    sep = sm.SEPARATOR[per]
    units = []
    for txt in df[col]:
        units.extend(piece.strip().strip(sm.word_chars_filter)
                     for piece in filter(None, re.split(sep, txt)))

    # Map each word to its first position in voc once, instead of scanning
    # voc (``in`` + two ``.index`` calls) for every word occurrence.
    row_of = {}
    for i, word in enumerate(voc):
        row_of.setdefault(word, i)

    # Words are filtered to those containing at least one Hebrew letter
    # (assuming filter_fun keeps words it returns True for -- see ``sm``).
    def has_hebrew(word):
        return any('א' <= ch <= 'ת' for ch in word)

    word_totals = np.zeros(len(voc))
    D = np.zeros((len(voc), len(units)))
    for j, unit in enumerate(units):
        for w in sm.get_all_words(unit, filter_fun=has_hebrew):
            i = row_of.get(w)
            if i is not None:
                word_totals[i] += 1
                D[i, j] += 1

    # Thresholding happens on raw counts; the totals used for normalization
    # below still include the occurrences removed here.
    D[D < min_abs] = 0
    if normalize:
        # Words that never occur get a factor of 0 (their rows are all zero
        # anyway), avoiding a divide-by-zero warning.
        inv_totals = np.divide(1.0, word_totals,
                               out=np.zeros_like(word_totals),
                               where=word_totals > 0)
        D = D * inv_totals[:, np.newaxis]
        D[D < min_perc] = 0
    if binary:
        D[D > 0] = 1
    return D