Example #1
def tsne_plot(model, search_word, n_neighbors=10, ax=None):
    # Credit for function:
    # https://medium.com/@khulasaandh/word-embeddings-fun-with-word2vec-and-game-of-thrones-ea4c24fcf1b8
    labels = [bidi.get_display(search_word)]
    tokens = [model.wv[search_word]]
    similar = [1]
    close_words = model.wv.similar_by_word(search_word, topn=n_neighbors)
    for word in close_words:
        tokens.append(model.wv[word[0]])
        labels.append(bidi.get_display(word[0]))
        similar.append(word[1])

    # newer scikit-learn versions require perplexity < n_samples
    # (here n_samples = n_neighbors + 1)
    tsne_model = TSNE(n_components=2, init='pca',
                      perplexity=min(30, len(tokens) - 1))
    coordinates = tsne_model.fit_transform(np.array(tokens))
    df = pd.DataFrame({'x': coordinates[:, 0],
                       'y': coordinates[:, 1],
                       'words': labels,
                       'similarity': similar})

    if ax is None:
        _, ax = plt.subplots()
    plot = ax.scatter(df.x, df.y, c=df.similarity, cmap='Reds')
    for i in range(len(df)):
        ax.annotate("  {} ({:.2f})".format(df.words[i].title(),
                                           df.similarity[i]),
                    (df.x[i], df.y[i]))

    plt.colorbar(mappable=plot, ax=ax)
    ax.set_title('t-SNE visualization for {}'.format(
        bidi.get_display(search_word)))
    utils.draw()
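
A minimal usage sketch (hypothetical names; assumes a trained gensim Word2Vec model, and that `bidi` refers to `bidi.algorithm` from the python-bidi package):

# hypothetical usage -- `sentences` is any iterable of token lists
from gensim.models import Word2Vec
model = Word2Vec(sentences, vector_size=100, min_count=2)  # gensim 4.x API
tsne_plot(model, 'ממשלה', n_neighbors=15)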
Example #2
def words2sections(G, df, to_plot=False, title=''):
    sections = np.unique(df.section)
    colors = ('cyan', 'red', 'green', 'lime', 'orange', 'gold',
              'grey', 'magenta', 'plum', 'peru')
    # assign to each word the section in which it appears the most
    for w in G.nodes:
        G.nodes[w]['section'] = \
            sections[np.argmax(['\n'.join(df[df.section == sec].text).count(w)
                                for sec in sections])]
    sections = np.unique([G.nodes[w]['section'] for w in G.nodes])
    if to_plot:
        colors = {sec: colors[i % len(colors)]
                  for i, sec in enumerate(sections)}
        G2 = nx.relabel_nodes(
            G, {w: bidi.get_display(w) for w in G.nodes}, copy=True)
        pos = nx.spring_layout(G2)
        for sec in sections:
            # draw_networkx_nodes has no with_labels argument;
            # labels are drawn by draw_networkx_labels below
            nx.draw_networkx_nodes(G2, pos=pos, node_color=colors[sec],
                                   label=bidi.get_display(sec),
                                   nodelist=[w for w in G2
                                             if G2.nodes[w]['section'] == sec])
        nx.draw_networkx_edges(G2, pos=pos, edge_color='pink')
        nx.draw_networkx_labels(G2, pos=pos)
        plt.title(title)
        plt.legend()
        utils.draw()
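
A usage sketch, assuming G is a word graph (e.g. co-occurrence edges between frequent words; `word_pairs` below is hypothetical) and df has 'section' and 'text' columns:

# hypothetical usage -- nodes are words, edges connect co-occurring words
G = nx.Graph()
G.add_edges_from(word_pairs)  # word_pairs: iterable of (word, word) tuples
words2sections(G, df, to_plot=True, title='Words colored by dominant section')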
Example #3
def perceptron_diagnosis(model, col_names=None, title=None, fig=None,
                         max_features=50):
    # input validation: sklearn keeps a single coefficient row for binary
    # problems, so a per-class STD is only meaningful for 3+ classes
    if len(model.coef_) <= 2:
        raise NotImplementedError('Binary classification diagnosis is '
                                  'currently not supported.')
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        col_names = list(range(len(model.coef_[0])))
    # str() lets the default integer names pass through bidi.get_display
    col_names = ['intercept'] + [bidi.get_display(str(nm)) for nm in col_names]
    # get std of coefficients
    coef_std = [np.std(model.intercept_)] + \
               [np.std([cfs[i] for cfs in model.coef_])
                for i in range(len(model.coef_[0]))]
    if max_features:
        ids = np.array(coef_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        coef_std = [coef_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, coef_std, vertical_xlabs=True,
                  title=pre_title + 'Perceptron Diagnosis ' +
                        f'({model.n_iter_:d} iterations)',
                  xlab='Feature', colors=('black',),
                  ylab='STD(coef) over classes\n' + '(not STD(x*coef)!)')
    utils.draw()
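
A sketch for a fitted multiclass sklearn Perceptron; `X_train`, `y_train` and `vectorizer` are hypothetical:

from sklearn.linear_model import Perceptron
clf = Perceptron(max_iter=1000).fit(X_train, y_train)  # 3+ classes required
perceptron_diagnosis(clf,
                     col_names=vectorizer.get_feature_names_out(),
                     fig=plt.subplots(1, 1))  # fig is a (figure, ax) pair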
Example #4
def plot_words_repetitions(tab):
    f, axs = plt.subplots(1, 1)
    axs.plot(list(range(101)),
             utils.dist([t[1] for t in tab], list(range(101)))[2:], 'k-')
    axs.set_yscale('log')
    axs.set_xlim((0, 100))
    axs.set_xlabel(f'Quantile [%]\n(100% = {len(tab):d} words)', fontsize=12)
    axs.set_ylabel('Repetitions', fontsize=12)
    axs.set_title('Frequency of Words in Articles\n' +
                  '(in Hebrew without stopwords)',
                  fontsize=14)
    utils.draw()
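
A sketch assuming `tab` is a list of (word, count) pairs, e.g. as produced by collections.Counter (`all_words` is hypothetical):

from collections import Counter
tab = Counter(all_words).most_common()  # [(word, count), ...] sorted by count
plot_words_repetitions(tab)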
Example #5
def lengths_analysis(df, by=None):
    f, axs = plt.subplots(3, 3)
    # remove blocked haaretz texts before analysis (copy to avoid pandas
    # SettingWithCopyWarning when adding the columns below)
    df = df[np.logical_not(df['blocked'])].copy()
    # count units
    df['words_per_text'] = count_words(df.text)
    df['words_per_title'] = count_words(df.title)
    df['words_per_subtitle'] = count_words(df.subtitle)
    df['characters_per_text'] = [len(s) for s in df.text]
    df['sentences_per_text'] = count_sentences(df.text)
    df['paragraphs_per_text'] = count_paragraphs(df.text)
    df['characters_per_title'] = [len(s) for s in df.title]
    df['unique_words_per_100_words'] =\
        [100*len(np.unique(list(filter(None, re.split(r' |\t|\r\n|\n', s))))) /
         len(list(filter(None, re.split(r' |\t|\r\n|\n', s))))
         for s in df.text]
    df['characters_per_word'] =\
        [len(s)/len(list(filter(None, re.split(r' |\t|\r\n|\n', s))))
         for s in df.text]
    # plot
    columns = ('words_per_text', 'words_per_subtitle', 'words_per_title',
               'characters_per_text', 'sentences_per_text', 'paragraphs_per_text',
               'characters_per_title', 'unique_words_per_100_words',
               'characters_per_word')
    for i, col in enumerate(columns):
        ax = axs[i // 3, i % 3]
        bp = df.boxplot(ax=ax, column=col, by=['source']+([by] if by else []),
                        return_type='both', patch_artist=True)
        colors = np.repeat(('blue','red','green'), int(len(bp[0][1]['boxes'])/3))
        for box, color in zip(bp[0][1]['boxes'], colors):
            box.set_facecolor(color)
        ax.set_xlabel('')
        ax.set_ylabel(col.replace('_',' ').capitalize(), fontsize=12)
        if by:
            ax.set_xticklabels(
                [bidi.get_display(
                    t._text.replace('(', '').replace(')', '').replace(', ', '\n') )
                    for t in ax.get_xticklabels()],
                rotation=90)
        if i==0:
            ax.set_title('TOKENS COUNT', fontsize=14)
        else:
            ax.set_title('')
    # draw
    utils.draw()
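
Hypothetical call; df is expected to carry 'source', 'blocked', 'text', 'title' and 'subtitle' columns, plus any column passed as `by`:

lengths_analysis(articles_df, by='section')  # articles_df is hypothetical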
Example #6
def validity_tests(df):
    sources = np.unique(df['source'])
    # percentage of haaretz articles whose full text is NOT blocked
    blocked_contents = 100 * (
        1 - check_haaretz_blocked_text(df[df['source'] == 'haaretz'])
            / np.sum(df['source'] == 'haaretz'))
    df = df[np.logical_not(df['blocked'])]
    n = {src: np.sum(df['source'] == src) for src in sources}
    # get anomalies
    bad_types = {src: verify_valid(df[df['source']==src],
                                      {'date':datetime,'blocked':np.bool_})
                 for src in sources}
    bad_lengths = {src: check_lengths(df[df['source']==src]) for src in sources}
    bad_tokens = {src: verify_hebrew_words(df[df['source']==src]) for src in sources}
    # plot anomalies
    f, axs = plt.subplots(3, len(sources))
    for i, src in enumerate(sources):
        tit = ('DATA SANITY TESTS\n' if i==int(len(sources)/2) else '\n') +\
              f'[{src:s}] Invalid field types' +\
              (f'\n(out of {blocked_contents:.0f}% unblocked articles)'
               if src=='haaretz' else '\n')
        utils.barplot(axs[0, i], bad_types[src].keys(),
                      100 * np.array(tuple(bad_types[src].values())) / n[src],
                      vertical_xlabs=True, title=tit,
                      ylab='Having invalid type [%]', ylim=(0, 100))
    # read the default (min, max) length limits off check_lengths' signature
    sp = inspect.getfullargspec(check_lengths)
    limits = list(itertools.chain.from_iterable(sp.defaults[0].values()))
    for i, src in enumerate(sources):
        utils.barplot(axs[1, i],
                      [a+f'\n({b:.0f} chars)' for a,b in
                       zip(bad_lengths[src].keys(),limits)],
                      100 * np.array(tuple(bad_lengths[src].values())) / n[src],
                      vertical_xlabs=True,
                      title=f'[{src:s}] Suspicious string-field lengths',
                      ylab='Having invalid length [%]', ylim=(0, 100))
    utils.barplot(axs[2,0], sources, [100*(1-bad_tokens[src][0]) for src in sources],
                  xlab='Source', ylab='Words without numbers\nor Hebrew letters [%]')
    utils.barplot(axs[2,1], sources, [100*(1-bad_tokens[src][1]) for src in sources],
                  xlab='Source', ylab='Words of length <=1 [%]')
    for i in range(2,len(sources)):
        utils.clean_figure(axs[2,i])
    # draw
    utils.draw()
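
Hypothetical call; besides the columns above, df needs 'date' for the field-type checks. The inspect trick assumes check_lengths declares its limits as a default dict argument:

# check_lengths is assumed to look roughly like
#   def check_lengths(df, limits={'title': (10, 100), 'text': (100, 1e5)}):
# so inspect.getfullargspec can read the (min, max) limit pairs back.
validity_tests(articles_df)  # articles_df is hypothetical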
Example #7
def plot_results(res, axs, title='Test Classification', reference=None):
    for i,test in enumerate(res):
        ax = axs[i]
        n_samples = res[test][0]
        # plot reference
        if reference is not None:
            ax.plot((n_samples[0], n_samples[-1]),
                    2 * [reference],
                    'k--', label='Random')
        # plot actual results
        for model in res[test][1]:
            accuracy = res[test][1][model]
            ax.plot(n_samples, accuracy, label=model)
        ax.set_title(title + f'\n({test:s})', fontsize=14)
        ax.set_xlabel('Training samples', fontsize=12)
        ax.set_ylabel('Accuracy [%]', fontsize=12)
        ax.set_xlim((n_samples[0],n_samples[-1]))
        ax.set_ylim((0,101))
        ax.grid(color='k', linestyle=':', linewidth=1)
        ax.legend(loc='upper left')
    utils.draw()
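
A sketch of the expected input structure (all numbers hypothetical): res maps a test-set name to a pair of sample counts and per-model accuracy curves, aligned by index:

res = {'held-out': ([100, 500, 1000],
                    {'Perceptron': [55, 70, 78],
                     'RandomForest': [60, 75, 82]})}
f, axs = plt.subplots(1, len(res), squeeze=False)
plot_results(res, axs[0], reference=33.3)  # 33.3% = random for 3 classes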
Example #8
def data_description(df):
    sources = np.unique(df['source'])
    n = len(sources)
    f, axs = plt.subplots(2, n)
    # counters per source
    bar_per_source(axs[0,0], df, ylab='Articles\n(black = partially blocked contents)',
                   fun=lambda d: d.shape[0], title='\nArticles per Source')
    bar_per_source(axs[0,1], df,
                   ylab='Words [x1000]\n(black = partially blocked contents)',
                   fun=lambda d: sum(len(l.split()) for t in d['text'].values
                                     for l in t.split('\n')) / 1e3,
                   title='BASIC DATA DESCRIPTION\nWords per Source')
    # remove blocked haaretz texts before next analysis
    df = df[np.logical_not(df['blocked'])]
    # sections per source
    articles_per_section =\
        [df[np.logical_and(df.source==src,df.section==sec)].shape[0]
         for src in sources
         for sec in np.unique(df[df.source==src].section)]
    axs[0,2].pie([df[df.source==src].shape[0] for src in sources],
                 labels=sources, colors=utils.DEF_COLORS[:3], startangle=90,
                 frame=True, counterclock=False)
    patches, _ = axs[0, 2].pie(articles_per_section, radius=0.75,
                               startangle=90, counterclock=False)
    centre_circle =\
        plt.Circle((0, 0), 0.5, color='black', fc='white', linewidth=0)
    axs[0,2].add_artist(centre_circle)
    axs[0,2].set_title('\nSources and Sections', fontsize=14)
    axs[0,2].legend(
        patches, [bidi.get_display(sec) for src in sources
                  for sec in np.unique(df[df.source==src].section)],
        ncol=5, loc='upper right', bbox_to_anchor=(1, 0.11), fontsize=8 )
    # dates & authors
    date_hist(axs[1,0], df)
    author_concentration(axs[1,1], df)
    top_authors(axs[1,2], df)
    # draw
    utils.draw()
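
Hypothetical call; relies on the helpers defined alongside it (bar_per_source, date_hist, author_concentration, top_authors):

data_description(articles_df)  # articles_df is hypothetical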
Example #9
def naive_bayes_diagnosis(model, col_names=None, title=None, fig=None,
                          max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        # naive Bayes models expose feature_log_prob_, not feature_importances
        col_names = list(range(len(model.feature_log_prob_[0])))
    col_names = [bidi.get_display(str(nm)) for nm in col_names]
    # get std of log-probabilities over classes
    log_probs_std = [np.std([lp[i] for lp in model.feature_log_prob_])
                     for i in range(len(model.feature_log_prob_[0]))]
    if max_features:
        ids = np.array(log_probs_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        log_probs_std = [log_probs_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, log_probs_std, vertical_xlabs=True,
                  title=pre_title + 'Naive Bayes Diagnosis',
                  xlab='Feature', colors=('black',),
                  ylab='STD(log probability)\nover classes')
    utils.draw()
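
A sketch for a fitted multinomial naive Bayes model (feature_log_prob_ is exposed by sklearn's MultinomialNB/BernoulliNB; `X_counts`, `y` and `vectorizer` are hypothetical):

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_counts, y)  # X_counts: bag-of-words matrix
naive_bayes_diagnosis(nb, col_names=vectorizer.get_feature_names_out())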
Example #10
def random_forest_diagnosis(model, col_names=None, title=None, fig=None,
                            max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        col_names = list(range(len(model.feature_importances_)))
    # str() lets the default integer names pass through bidi.get_display
    col_names = [bidi.get_display(str(nm)) for nm in col_names]
    # get importance
    importance = model.feature_importances_
    if max_features:
        ids = np.array(importance).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        importance = [importance[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, importance, vertical_xlabs=True,
                  title=pre_title + 'Random Forest Diagnosis ' +
                        f'({len(model.estimators_):d} trees)',
                  xlab='Feature', colors=('black',),
                  ylab='Gini importance')
    utils.draw()
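
A sketch for a fitted sklearn RandomForestClassifier (`X_train`, `y_train` and `feature_names` are hypothetical):

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
random_forest_diagnosis(rf, col_names=feature_names)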
Example #11
def count_parties(
    ax,
    df,
    col='text',
    by='source',
    binary_per_text=False,
    logscale=False,
    keys=('ליכוד', ('ביבי', 'נתניהו'), ('כחול לבן', 'כחול-לבן'), 'גנץ',
          'העבודה', 'גבאי', ('חד"ש', 'תע"ל'), 'עודה', 'יהדות התורה', 'ליצמן',
          'איחוד הימין', "סמוטריץ'", 'הימין החדש', 'בנט', 'זהות', 'פייגלין',
          'מרצ', 'זנדברג', 'ש"ס', 'דרעי', 'כולנו', 'כחלון', ('בל"ד', 'רע"ם'),
          'עבאס', ('ישראל ביתנו',
                   'ישראל-ביתנו'), 'ליברמן', 'גשר', 'אורלי לוי')):
    groups = np.unique(df[by])
    sep = SEPARATOR['word']

    count = {grp: len(keys) * [0] for grp in groups}
    for grp in groups:
        for i, txt in enumerate(df[df[by] == grp][col]):
            for j, key in enumerate(keys):
                # multi-word keys: substring search (the word-splitting
                # below cannot match expressions that contain a space)
                appears = 0
                for k in (key if isinstance(key, tuple) else (key,)):
                    if ' ' in k:
                        appears = txt.count(k)
                        count[grp][j] += (bool(appears) if binary_per_text
                                          else appears)
                        if binary_per_text and appears:
                            break
                if binary_per_text and appears:
                    continue
                # one-word keys (str.endswith also accepts a tuple of
                # suffixes, so multi-variant keys are matched here too)
                for w in re.split(sep, txt):
                    w = re.sub(r'\.|,|\(|\)|;|:|\t', '', w).strip()
                    if w.endswith(key):
                        count[grp][j] += 1
                        if binary_per_text:
                            break

    keys = tuple(k[0] + ' /\n' + k[1] if isinstance(k, tuple) else k
                 for k in keys)
    keys = tuple(bidi.get_display(k) for k in keys)
    colors = utils.DEF_COLORS
    bottom = np.zeros(len(keys), dtype=int)

    ylab = ('Texts with the expression' if binary_per_text else 'Total appearances') +\
           '\n(as end of a word)'
    for i, group in enumerate(groups):
        utils.barplot(ax,
                      keys,
                      count[group],
                      bottom=bottom,
                      plot_bottom=False,
                      ylab=ylab,
                      title='Frequency of appearance',
                      vertical_xlabs=True,
                      colors=colors[i % len(colors)],
                      label=bidi.get_display(group))
        bottom += count[group]
    if logscale:
        ax.set_yscale('log')
    ax.legend()
    utils.draw()
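
Hypothetical call; df holds article texts with a 'source' column, and SEPARATOR['word'] is assumed to be a project-level word-splitting regex:

f, ax = plt.subplots(1, 1)
count_parties(ax, df, binary_per_text=True, logscale=True)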