Example #1
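# NOTE: these snippets assume the enclosing module's imports: codecs, os, re,
# random, pandas as pd, string.ascii_lowercase, and the project-local helpers
# fh, defines, labels, html, common, ds, and the code_names list.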
def output_label_index():

    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    output_filename = fh.make_filename(output_dir, 'index_labels', 'html')

    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes'])

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Labels'))
        output_file.write(html.make_body_start())
        output_file.write(common.make_masthead(1))
        output_file.write(html.make_heading('Labels', align='center'))

        table_header = ['Label']
        output_file.write(html.make_table_start(style='sortable'))
        output_file.write(html.make_table_header(table_header))

        for index, code in enumerate(true.columns):
            code_name = code_names[index]
            link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
            row = [link]
            output_file.write(html.make_table_row(row))

        output_file.write(html.make_table_end())

        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
Example #2
def output_response_index():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    output_filename = fh.make_filename(output_dir, 'index_responses', 'html')
    datasets = ['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes']

    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))

    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Democrats vs Republicans'))
        output_file.write(html.make_body_start())

        output_file.write(common.make_masthead(0))

        for dataset in datasets:
            true = labels.get_labels([dataset])
            all_items = ds.get_all_documents(dataset)
            train_items = ds.get_train_documents(dataset, 0, 0)
            dev_items = ds.get_dev_documents(dataset, 0, 0)
            test_items = ds.get_test_documents(dataset, 0)

            output_file.write(html.make_heading(dataset, align='center'))

            table_header = ['Response', 'Split', 'Snippet']
            col_widths = [130, 80, 800]
            output_file.write(html.make_table_start(col_widths=col_widths, style='sortable'))
            output_file.write(html.make_table_header(table_header))

            for subset, split in [(train_items, 'train'), (dev_items, 'dev'), (test_items, 'test')]:
                subset.sort()
                for item in subset:

                    words = text[item]
                    response = ' '.join(words)
                    if len(response) > 100:
                        response = response[:100] + '. . .'
                    num = item.split('_')[1]
                    link = html.make_link(item + '.html', num, new_window=False)
                    link2 = html.make_link(item + '.html', response, new_window=False)
                    row = [link, split, link2]
                    output_file.write(html.make_table_row(row))

            output_file.write(html.make_table_end())

        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
Example #3
def output_label_pages():

    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir, 'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes', 'test_fold_0', 'L1LR_all_groups_a0', 'models')
    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes'])

    for code_index, code in enumerate(true.columns):
        code_name = code_names[code_index]
        output_filename = fh.make_filename(output_dir, 'label_' + html.replace_chars(code_name), 'html')
        with codecs.open(output_filename, 'w') as output_file:

            output_file.write(html.make_header(code_name))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Label: ' + code_name, align='center'))
            output_file.write(html.make_paragraph('Coefficients for unigram model:', align="center"))

            table_header = ['Word', 'Value', 'Scaled']
            output_file.write(html.make_table_start(style='sortable'))
            output_file.write(html.make_table_header(table_header))

            model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
            model = fh.read_json(model_filename)
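            # fall back to 1.0 so the scaling below stays well-defined when the
            # stored model has no intercept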
            intercept = float(model.get('intercept', 1.0))
            if 'coefs' in model:
                coefs = dict(model['coefs'])

                tokens = sorted(coefs.keys())
                for token in tokens:
                    cmax = 255
                    colours = [(0, 0, 0)]*2
                    word = token.split('_')[-1]
                    coef = coefs[token]
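                    # scale by |intercept| and map magnitude to colour intensity:
                    # 255 is white (weight near zero); blue = positive, red = negative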
                    scaled_coef = coef/abs(intercept)
                    val = int(cmax - (min(1, abs(scaled_coef))*cmax))
                    if coef > 0:
                        colours += [(val, val, cmax)]
                    else:
                        colours += [(cmax, val, val)]

                    if len(word) > 0:
                        if word[0] not in ascii_lowercase:
                            word = '_' + word
                        link = html.make_link('wordtype_' + word + '.html', word)
                        row = [link, '{:0.2f}'.format(coef), word]
                        output_file.write(html.make_table_row(row, colours=colours))

            output_file.write(html.make_table_end())

            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())
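The coefficient-to-colour mapping above reappears in Examples #5 and #6. A minimal sketch of it as a standalone helper (the name coef_to_rgb is hypothetical, not part of the original module):

def coef_to_rgb(coef, intercept, cmax=255):
    # Hypothetical helper: scale the coefficient by |intercept| and map its
    # magnitude to colour intensity; cmax (white) corresponds to a weight near zero.
    scaled = coef / abs(intercept) if intercept else coef
    val = int(cmax - min(1, abs(scaled)) * cmax)
    # Positive weights shade toward blue, negative weights toward red.
    return (val, val, cmax) if coef > 0 else (cmax, val, val)

For example, coef_to_rgb(0.0, 1.0) gives (255, 255, 255) (white) and coef_to_rgb(2.0, 1.0) gives (0, 0, 255) (saturated blue).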
Example #4
def output_word_index():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir, 'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes', 'test_fold_0', 'L1LR_all_groups_a0', 'models')

    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes'])

    word_list = set()
    for code_index, code in enumerate(true.columns):
        # load coefficients from unigram model
        model_filename = fh.make_filename(blm_dir, html.replace_chars(code), 'json')
        model = fh.read_json(model_filename)
        if 'coefs' in model:
            coefs = dict(model['coefs'])
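            # coefficient names carry a 4-character '_n1_' unigram prefix (see
            # Example #5); word[4:] strips it to recover the raw word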
            words = [word[4:] for word in coefs.keys()]
            word_list.update(words)
    word_list = list(word_list)
    word_list.sort()

    output_filename = fh.make_filename(output_dir, 'index_words', 'html')
    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Words'))
        output_file.write(html.make_body_start())
        output_file.write(common.make_masthead(2))
        output_file.write(html.make_heading('Words', align='center'))

        table_header = ['Words']
        output_file.write(html.make_table_start(style='sortable'))
        output_file.write(html.make_table_header(table_header))
        for word in word_list:
            link = html.make_link('wordtype_' + html.replace_chars(word) + '.html', word)
            row = [link]
            output_file.write(html.make_table_row(row))

        output_file.write(html.make_table_end())

        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
Example #5
def output_responses(dataset):
    print(dataset)
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    rnn_dir = fh.makedirs(defines.exp_dir, 'rnn', 'bayes_opt_rnn_LSTM_reuse_mod_34_rerun', 'fold0', 'responses')
    blm_dir = fh.makedirs(defines.exp_dir, 'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes', 'test_fold_0', 'L1LR_all_groups_a0', 'models')
    predictions_dir = fh.makedirs(defines.exp_dir, 'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes', 'test_fold_0', 'L1LR_all_groups_a0', 'predictions')
    train_pred = pd.read_csv(fh.make_filename(predictions_dir, dataset + '_train', 'csv'), header=0, index_col=0)
    test_pred = pd.read_csv(fh.make_filename(predictions_dir, dataset + '_test', 'csv'), header=0, index_col=0)

    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))

    true = labels.get_labels([dataset])
    all_items = ds.get_all_documents(dataset)

    word_list = common.get_word_list(true.columns, blm_dir)

    train_words = {}
    test_words = {}

    for i in all_items:
        true_i = true.loc[i]
        rnn_file = fh.make_filename(rnn_dir, i, 'csv')
        rnn_vals = pd.read_csv(rnn_file, header=None)
        rnn_vals.columns = true.columns

        if i in train_pred.index:
            pred_i = train_pred.loc[i]
            train_item = True
        else:
            pred_i = test_pred.loc[i]
            train_item = False

        output_filename = fh.make_filename(output_dir, i, 'html')
        with codecs.open(output_filename, 'w') as output_file:

            output_file.write(html.make_header(i))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Response: ' + i, align='center'))
            output_file.write(html.make_paragraph('The table below shows coefficients for the unigram model (red-blue)',
                                                  align="center"))
            output_file.write(html.make_paragraph('and sequence element probabilities for the LSTM (white-green).',
                                                  align="center"))

            links = [html.make_link('wordtype_' + w + '.html', w) if w in word_list else w for w in text[i]]
            table_header = ['Label'] + links + ['True', 'Pred.']
            output_file.write(html.make_table_start(style='t1'))
            output_file.write(html.make_table_header(table_header))
            for code_index, code in enumerate(true.columns):
                # load coefficients from unigram model
                words = text[i]
                model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
                model = fh.read_json(model_filename)
                intercept = float(model.get('intercept', 1.0))
                if 'coefs' in model:
                    coefs = dict(model['coefs'])
                    colours = [str((0, 0, 0))]
                    for word in words:
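                        # model coefficients are keyed by '_n1_' + word; words
                        # absent from the model get weight 0.0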
                        coef = coefs.get('_n1_' + word, 0.0)/abs(intercept)
                        val = int(255 - (min(1, abs(coef))*255))
                        if coef > 0:
                            colours += [(val, val, 255)]
                        else:
                            colours += [(255, val, val)]
                else:
                    colours = [str((0, 0, 0))]
                    colours += [(255, 255, 255) for w in words]

                colours += [str((0, 0, 0))]*2
                code_name = code_names[code_index]
                link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
                row = [link] + words + [str(true_i[code]), str(int(pred_i[code])) + ' (LR)']
                output_file.write(html.make_table_row(row, colours=colours))

                for i_v, v in enumerate(rnn_vals[code].values):
                    if v >= 0.5:
                        focal_word = text[i][i_v]
                        counts = train_words if train_item else test_words
                        counts[focal_word] = counts.get(focal_word, 0) + 1

                colours = [str((0, 0, 0))]
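                # shade each token from near-white (probability ~0) to green
                # (probability ~1) using the LSTM's per-token output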
                vals = [int(235 - (v*235)) for v in rnn_vals[code]]
                colours += [(v, 235, v) for v in vals]
                colours += [str((0, 0, 0))]*2
                row = [' '] + text[i] + [' ', str(int(rnn_vals[code].max() >= 0.5)) + ' (RNN)']
                output_file.write(html.make_table_row(row, colours=colours))
            output_file.write(html.make_table_end())

            output_file.write(html.make_heading('LSTM Gates', align='center'))
            output_file.write(html.make_paragraph('The plot below shows LSTM gate values at each sequence element.',
                                                  align="center"))
            output_file.write(html.make_paragraph('Each grey line is one dimension; the colored line shows the mean.',
                                                  align="center"))
            output_file.write(html.make_image(os.path.join('gate_plots', i + '_gates.png')))

            output_file.write(html.make_heading('LSTM vectors', align='center'))
            output_file.write(html.make_paragraph('The plot below shows the LSTM hidden and memory nodes for each '
                                                  'sequence element.', align='center'))
            output_file.write(html.make_paragraph('Vectors have been projected to a common space.',
                                                  align="center"))
            output_file.write(html.make_image(os.path.join('vector_plots', i + '_vectors.png')))

            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())

    return train_words, test_words
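A minimal usage sketch for the function above (aggregating the returned counts with collections.Counter is an assumption, not something the original module does):

from collections import Counter

train_counts, test_counts = Counter(), Counter()
for name in ['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes']:
    tr, te = output_responses(name)
    # Counter.update with a dict adds the counts rather than replacing them
    train_counts.update(tr)
    test_counts.update(te)
print(train_counts.most_common(10))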
Example #6
def output_words():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir, 'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes', 'test_fold_0', 'L1LR_all_groups_a0', 'models')

    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))
    vocab = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn_vocab', 'json'))

    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes'])

    word_list = set()
    for code_index, code in enumerate(true.columns):
        # load coefficients from unigram model
        model_filename = fh.make_filename(blm_dir, html.replace_chars(code), 'json')
        model = fh.read_json(model_filename)
        if 'coefs' in model:
            coefs = dict(model['coefs'])
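            # as in Example #4, strip the 4-character '_n1_' prefix from each
            # coefficient name to recover the raw word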
            words = [word[4:] for word in coefs.keys()]
            word_list.update(words)
    word_list = list(word_list)

    word_index = {}
    order = true.index.tolist()
    random.shuffle(order)
    for item in order:
        words = text[item]
        for word in words:
            if word in word_index:
                word_index[word].append(item)
            else:
                word_index[word] = [item]

    for word in word_list:
        output_filename = fh.make_filename(output_dir, 'wordtype_' + word, 'html')
        with codecs.open(output_filename, 'w') as output_file:

            output_file.write(html.make_header(word))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Word: ' + word, align='center'))

            if word in word_index:
                output_file.write(html.make_paragraph('Sample usage:', align='center'))
                item_list = word_index[word][:]
                random.shuffle(item_list)
                for item in item_list[:5]:
                    item_text = text[item]
                    occurrence_index = item_text.index(word)
                    start = max(0, occurrence_index-10)
                    end = min(len(item_text), occurrence_index + 10)
                    item_text = ['<b>' + w + '</b>' if w == word else w for w in item_text]
                    link = html.make_link(item + '.html', ' '.join(item_text[start:end]))
                    output_file.write(html.make_paragraph(link, align="center", id="psmall"))

            output_file.write(html.make_paragraph('Unigram model coefficients for each label:', align='center'))

            table_header = ['Label', 'Value', 'Scaled']
            output_file.write(html.make_table_start(style='sortable'))
            output_file.write(html.make_table_header(table_header))
            for code_index, code in enumerate(true.columns):
                # load coefficients from unigram model
                model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
                model = fh.read_json(model_filename)
                intercept = float(model.get('intercept', 1.0))

                cmax = 255
                if 'coefs' in model:
                    coefs = dict(model['coefs'])
                    colours = [str((0, 0, 0))]*2
                    coef = coefs.get('_n1_' + word, 0.0)
                    scaled_coef = coef/abs(intercept)
                    val = int(cmax - (min(1, abs(scaled_coef))*cmax))
                    if coef > 0:
                        colours += [(val, val, cmax)]
                    else:
                        colours += [(cmax, val, val)]
                else:
                    coef = 0.0
                    colours = [str((0, 0, 0)), str((0, 0, 0)), str((cmax, cmax, cmax))]

                code_name = code_names[code_index]
                link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
                row = [link, '{:0.2f}'.format(coef), word]
                output_file.write(html.make_table_row(row, colours=colours))

            output_file.write(html.make_table_end())

            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())
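Taken together, these functions generate a small static site of index, label, word, and response pages. A plausible driver, assuming they all live in one module (the main() wrapper and the call order are assumptions):

def main():
    # Build the index and per-label/per-word pages first so the per-response
    # pages written by output_responses() can link to them.
    output_label_index()
    output_word_index()
    output_label_pages()
    output_words()
    output_response_index()
    for name in ['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes']:
        output_responses(name)

if __name__ == '__main__':
    main()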