Example #1
0
# read_all_debates comes from the project's data module (as in the other
# scripts in this repository); get_parse is assumed to be a project helper
# that returns (sentence-level parse, document-level parse) for a text chunk.
from src.data.debates import read_all_debates


def generate_all():
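    """Group the debate sentences into chunks, one per contiguous speaker turn,
    record each sentence's chunk index in its 'mod_chunk' feature, then parse
    chunks `start`..`end` and write the document- and sentence-level parses."""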
    X = read_all_debates()
    k = 0
    CHUNKS = [""]
    curr_speaker = X[0].speaker
    for i, sent in enumerate(X):
        if curr_speaker != sent.speaker:
            k += 1
            CHUNKS.append("")
            curr_speaker = sent.speaker
        sent.features['mod_chunk'] = k
        # if 'audit is over' in sent.text:
        #     print(k)
        #     print(CHUNKS[k])
        CHUNKS[k] = CHUNKS[k] + " " + sent.text
    CHUNKS.append("")
    # print(k)
    output_doc = open(
        "/home/pepa/PycharmProjects/claim-rank/data/discourse/chunk_parses_new_680.txt",
        "w")
    output_sent = open(
        "/home/pepa/PycharmProjects/claim-rank/data/discourse/sent_parses_new_680.txt",
        "w")
    j = 0

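    # Only chunks in [start, end] are parsed on this run; the "680" in the
    # output filenames above matches this slice's start index.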
    start = 680
    end = 750
    for i, chunk in enumerate(CHUNKS):
        if i < start or i > end:
            continue
        # print(i)
        parse_sent, parse_doc = get_parse(chunk, "{}".format(i))
        parse_doc = parse_doc.replace("\n", "")
        output_doc.write("{}\t{}\n".format(i, parse_doc))

        # split() consumes the ")\n(" separators, so restore the outer
        # parentheses on every piece; a single-sentence parse is already intact
        sent_parses = parse_sent.split(")\n(")
        for s, p_s in enumerate(sent_parses):
            if len(sent_parses) == 1:
                pass
            elif s == 0:
                p_s = p_s + ")"
            elif s == len(sent_parses) - 1:
                p_s = "(" + p_s
            else:
                p_s = "(" + p_s + ")"

            output_sent.write("{}\t{}\t{}\n".format(X[j].id, X[j].debate.name,
                                                    p_s.replace("\n", "")))
            j += 1

    output_doc.close()
    output_sent.close()
Example #2
0
import numpy as np
from sklearn.metrics import (average_precision_score, precision_score,
                             recall_score, roc_auc_score)
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from src.data import debates
from src.models import models
from src.stats import rank_metrics as rm
from src.features.feature_sets import get_serialized_pipeline

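# Cross-validation folds: each entry unpacks into the held-out debate and its
# test/validation/train sentence splits (see the loop below).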
data_sets = debates.get_for_crossvalidation()

texts = [
    sentence.text for sentence in debates.read_all_debates(sep_by_deb=False)
]  # debates
texts.extend([
    sentence.text for sentence in debates.read_all_speeches(sep_by_deb=False)
])  # speeches

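# Fit the tokenizer on sentences from both the debates and the speeches, so the
# vocabulary and MAX_SENTENCE_LENGTH cover the whole corpus.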
tokenizer, word_index = models.create_tokenizer(texts)

MAX_SENTENCE_LENGTH = max([len(sentence.split()) for sentence in texts])

folder = 'speeches/'
# tests: one hidden layer, dropout 0.3, softmax activation, fixed vs dynamic word embedding (300d glove)
results = []
exp_num = folder + 'bilstm_d'
embedding_filepath = '/usr/users/oliverren/meng/check-worthy/data/glove/glove.6B.300d.txt'
for test_deb, test, val, train in data_sets:
Example #3
0
from src.data.debates import read_all_debates
from src.utils.config import get_config

CONFIG = get_config()

output_file = open("../../reports/all_sources.txt", "w")

all_sentences = read_all_debates()

output_file.write("Overall sentences annotated: " + str(len(all_sentences)) +
                  "\n\n")

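# sent.label holds how many annotation sources agreed on the sentence:
# count_agreement[i] counts sentences where exactly i annotators agreed,
# accumulated_agreement[i] counts sentences where at least i agreed.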
count_agreement = [0 for i in range(10)]
accumulated_agreement = [0 for i in range(10)]
for sent in all_sentences:
    count_agreement[sent.label] += 1

    for j in range(10):
        if sent.label >= j:
            accumulated_agreement[j] += 1

output_file.write("Agreement:\n")
for i in range(10):
    output_file.write(
        str(count_agreement[i]) + " sentences with " + str(i) +
        " annotators agreed\n")

output_file.write("\nAccumulated Agreement:\n")
for i in range(1, 10):
    output_file.write(
        str(accumulated_agreement[i]) + " sentences with at least " + str(i) +
        " annotators agreed\n")
Example #4
0
    if not line:
        continue
    cols = line.split("\t")
    # print(cols[0])
    chunks[int(cols[0])] = parse_list(cols[1])
output_sent = open(CONFIG['sentence_parses'])
sents = {}
# print("Parsing Sentences")
for line in output_sent:
    line = line.strip()
    if not line:
        continue
    cols = line.split("\t")
    # print(cols[0])
    sents[cols[0] + cols[1]] = parse_list(cols[2])
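
# Rebuild the same speaker-turn chunking used when the parses were generated,
# so every sentence can be mapped to the index of its chunk parse.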
X = read_all_debates()
k = 0
sent2chunk = {}
curr_speaker = X[0].speaker
for i, sent in enumerate(X):
    if curr_speaker != sent.speaker:
        k += 1
        curr_speaker = sent.speaker
    sent2chunk["{}{}".format(sent.id, sent.debate.name)] = k


class DiscourseInfo(Feature):
    FEATS = ['discourse_it', 'discourse_rel', 'in_chunk_first_rel',
             'in_chunk_last_rel', 'in_chunk_first_it', 'in_chunk_last_it',
             'in_chunk_it', 'in_chunk_rel']

    RELS = ['DUMMY', 'Joint', 'Condition', 'Contrast', 'Joint', 'Attribution',
Example #5
0
from keras import backend as K
from keras.layers.wrappers import Bidirectional
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Merge

from features.feature_sets import get_experimential_pipeline
from src.utils.config import get_config

from os.path import join
import numpy as np
from src.data.debates import read_all_debates
from gensim.corpora import Dictionary

CONFIG = get_config()
dict = Dictionary([sent.tokens for sent in read_all_debates()])


def precision(y_true, y_pred):
    '''Calculates the precision, a metric for multi-label classification of
    how many selected items are relevant.
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def feats_lstm_model(feat_size):
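    # Text branch of the model: maps token-index sequences to 100-dim
    # embeddings (vocabulary capped at 5200 entries).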
    text_embedding_model = Sequential()
    text_embedding_model.add(Embedding(5200, 100))
Example #6
0
# Imports and globals assumed from the full script this excerpt comes from;
# in particular, WC is taken to be a configured wordcloud.WordCloud instance.
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

from src.data.debates import read_all_debates
from src.utils.config import get_config

CONFIG = get_config()
WC = WordCloud()


def get_tokens(sentences):
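    """Lowercase and tokenize every sentence text, drop English stopwords, and
    return the remaining tokens as one space-separated string."""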
    text = " ".join([sent.text.lower() for sent in sentences])
    stop = list(stopwords.words('english'))
    tokens = [token for token in word_tokenize(text) if token not in stop]
    return " ".join(tokens)


def save_wordcloud(wordcloud, file_name):
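    """Render the word cloud with matplotlib and save it under the configured
    word-cloud output directory."""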
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(CONFIG['word_clouds_dir'] + file_name)


all_sents = read_all_debates()
save_wordcloud(WC.generate(get_tokens(all_sents)), "all_sents.png")

negative_sents = [sent for sent in all_sents if sent.label == 0]
save_wordcloud(WC.generate(get_tokens(negative_sents)), "neg_sents.png")

positive_sents = [sent for sent in all_sents if sent.label > 0]
save_wordcloud(WC.generate(get_tokens(positive_sents)), "pos_sents.png")

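# One word cloud per agreement threshold: sentences where more than i annotators agreed.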
for i in range(1, 9):
    pos_gt_i = [sent for sent in all_sents if sent.label > i]
    save_wordcloud(WC.generate(get_tokens(pos_gt_i)),
                   "pos_gt_" + str(i) + "_sents.png")

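# Repeat for the 'cb'-source annotations, sweeping a score threshold from 0.5 to 1.0.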
all_cb = read_all_debates(source='cb')
for i in np.arange(0.5, 1.1, 0.1):