def generate_all():
    """Group debate sentences into per-speaker chunks, parse them, and dump the parses."""
    X = read_all_debates()
    k = 0
    CHUNKS = [""]
    curr_speaker = X[0].speaker
    for i, sent in enumerate(X):
        # Start a new chunk whenever the speaker changes.
        if curr_speaker != sent.speaker:
            k += 1
            CHUNKS.append("")
            curr_speaker = sent.speaker
        sent.features['mod_chunk'] = k
        # if 'audit is over' in sent.text:
        #     print(k)
        #     print(CHUNKS[k])
        CHUNKS[k] = CHUNKS[k] + " " + sent.text
    CHUNKS.append("")
    # print(k)

    output_doc = open(
        "/home/pepa/PycharmProjects/claim-rank/data/discourse/chunk_parses_new_680.txt", "w")
    output_sent = open(
        "/home/pepa/PycharmProjects/claim-rank/data/discourse/sent_parses_new_680.txt", "w")

    j = 0
    start = 680
    end = 750
    # NOTE: j indexes X from 0 even though chunks before `start` are skipped.
    for i, chunk in enumerate(CHUNKS):
        # Only parse chunks in the [start, end] window.
        if i < start or i > end:
            continue
        # print(i)
        parse_sent, parse_doc = get_parse(chunk, "{}".format(i))
        parse_doc = parse_doc.replace("\n", "")
        output_doc.write("{}\t{}\n".format(i, parse_doc))
        # Re-balance the parentheses stripped off by splitting on ")\n(".
        for s, p_s in enumerate(parse_sent.split(")\n(")):
            if s == 0:
                p_s = p_s + ")"
            elif s == len(parse_sent.split(")\n(")) - 1:
                p_s = "(" + p_s
            else:
                p_s = "(" + p_s + ")"
            output_sent.write("{}\t{}\t{}\n".format(X[j].id, X[j].debate.name,
                                                    p_s.replace("\n", "")))
            j += 1
    output_doc.close()
    output_sent.close()
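# A minimal usage sketch for generate_all(). It assumes `read_all_debates` and
# `get_parse` are importable from elsewhere in the project (the debates reader
# and a discourse-parser wrapper); the `get_parse` import path below is a
# hypothetical example, not taken from this file.
#
#   from src.data.debates import read_all_debates
#   from src.features.discourse import get_parse   # hypothetical module

if __name__ == "__main__":
    # Writes one TSV line per speaker chunk (chunk id, document parse) and one
    # TSV line per sentence (sentence id, debate name, sentence parse) for
    # chunks 680-750.
    generate_all()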
import numpy as np
from sklearn.metrics import (average_precision_score, precision_score,
                             recall_score, roc_auc_score)
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from src.data import debates
from src.models import models
from src.stats import rank_metrics as rm
from src.features.feature_sets import get_serialized_pipeline

data_sets = debates.get_for_crossvalidation()

texts = [sentence.text
         for sentence in debates.read_all_debates(sep_by_deb=False)]         # debates
texts.extend([sentence.text
              for sentence in debates.read_all_speeches(sep_by_deb=False)])  # speeches

tokenizer, word_index = models.create_tokenizer(texts)
MAX_SENTENCE_LENGTH = max([len(sentence.split()) for sentence in texts])

folder = 'speeches/'
# tests: one hidden layer, dropout 0.3, softmax activation,
# fixed vs dynamic word embedding (300d glove)
results = []
exp_num = folder + 'bilstm_d'
embedding_filepath = '/usr/users/oliverren/meng/check-worthy/data/glove/glove.6B.300d.txt'

for test_deb, test, val, train in data_sets:
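    # --- Sketch of a possible loop body (not the original continuation). ---
    # Assumptions: `train`, `val`, and `test` are lists of sentence objects with
    # `.text` and `.label` (label > 0 meaning check-worthy, as elsewhere in the
    # repo); the checkpoint path is illustrative; a sigmoid output is used here
    # for the binary target even though the experiment comment above mentions
    # softmax. Requires the usual Keras layer imports in addition to the ones
    # above: Sequential, Embedding, Bidirectional, LSTM, Dropout, Dense.
    X_train = pad_sequences(tokenizer.texts_to_sequences([s.text for s in train]),
                            maxlen=MAX_SENTENCE_LENGTH)
    X_val = pad_sequences(tokenizer.texts_to_sequences([s.text for s in val]),
                          maxlen=MAX_SENTENCE_LENGTH)
    X_test = pad_sequences(tokenizer.texts_to_sequences([s.text for s in test]),
                           maxlen=MAX_SENTENCE_LENGTH)
    y_train = np.array([min(s.label, 1) for s in train])
    y_val = np.array([min(s.label, 1) for s in val])
    y_test = np.array([min(s.label, 1) for s in test])

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 300, input_length=MAX_SENTENCE_LENGTH))
    model.add(Bidirectional(LSTM(100)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')

    checkpoint = ModelCheckpoint('models/' + exp_num + '.h5', save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), callbacks=[checkpoint])

    y_pred = model.predict(X_test).ravel()
    results.append((average_precision_score(y_test, y_pred),
                    roc_auc_score(y_test, y_pred)))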
from src.data.debates import read_all_debates
from src.utils.config import get_config

CONFIG = get_config()

output_file = open("../../reports/all_sources.txt", "w")

all_sentences = read_all_debates()
output_file.write("Overall sentences annotated: " + str(len(all_sentences)) + "\n\n")

# count_agreement[i]: sentences on which exactly i annotators agreed;
# accumulated_agreement[i]: sentences on which at least i annotators agreed.
count_agreement = [0 for i in range(10)]
accumulated_agreement = [0 for i in range(10)]
for sent in all_sentences:
    count_agreement[sent.label] += 1
    for j in range(10):
        if sent.label >= j:
            accumulated_agreement[j] += 1

output_file.write("Agreement:\n")
for i in range(10):
    output_file.write(str(count_agreement[i]) + " sentences with " + str(i) +
                      " annotators agreed\n")

output_file.write("\nAccumulated Agreement:\n")
for i in range(1, 10):
    output_file.write(str(accumulated_agreement[i]) + " sentences with at least " + str(i) +
                      " annotators agreed\n")
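# Side note (a pure restatement of the loops above, added for clarity): the
# accumulated counts are suffix sums of the per-level counts, so they can be
# sanity-checked without re-reading the data.
assert accumulated_agreement == [sum(count_agreement[j:]) for j in range(10)]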
    # (Tail of the loop that reads the chunk-parse file into `chunks`.)
    if not line:
        continue
    cols = line.split("\t")
    # print(cols[0])
    chunks[int(cols[0])] = parse_list(cols[1])

output_sent = open(CONFIG['sentence_parses'])
sents = {}
# print("Parsing Sentences")
for line in output_sent:
    line = line.strip()
    if not line:
        continue
    cols = line.split("\t")
    # print(cols[0])
    sents[cols[0] + cols[1]] = parse_list(cols[2])

# Rebuild the same speaker-change chunk ids that were used when the parses
# were generated, keyed by sentence id + debate name.
X = read_all_debates()
k = 0
sent2chunk = {}
curr_speaker = X[0].speaker
for i, sent in enumerate(X):
    if curr_speaker != sent.speaker:
        k += 1
        curr_speaker = sent.speaker
    sent2chunk["{}{}".format(sent.id, sent.debate.name)] = k


class DiscourseInfo(Feature):
    FEATS = ['discourse_it', 'discourse_rel', 'in_chunk_first_rel', 'in_chunk_last_rel',
             'in_chunk_first_it', 'in_chunk_last_it', 'in_chunk_it', 'in_chunk_rel']
    RELS = ['DUMMY', 'Joint', 'Condition', 'Contrast', 'Joint', 'Attribution',
from keras import backend as K
from keras.layers.wrappers import Bidirectional
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Merge
from features.feature_sets import get_experimential_pipeline
from src.utils.config import get_config
from os.path import join
import numpy as np
from src.data.debates import read_all_debates
from gensim.corpora import Dictionary

CONFIG = get_config()
dict = Dictionary([sent.tokens for sent in read_all_debates()])


def precision(y_true, y_pred):
    '''Calculates the precision, a metric for multi-label classification of
    how many selected items are relevant.
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def feats_lstm_model(feat_size):
    text_embedding_model = Sequential()
    text_embedding_model.add(Embedding(5200, 100))
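    # --- Sketch of how this two-branch model might continue (not the original
    # code). Assumptions: the Keras 1.x `Merge` layer imported above joins a
    # text branch over token ids with a dense branch over `feat_size`
    # handcrafted features before a sigmoid output; layer sizes are illustrative.
    text_embedding_model.add(Bidirectional(LSTM(64)))

    feats_model = Sequential()
    feats_model.add(Dense(32, input_dim=feat_size, activation='relu'))

    model = Sequential()
    model.add(Merge([text_embedding_model, feats_model], mode='concat'))
    model.add(Dense(1, activation='sigmoid'))
    # The custom `precision` defined above can be passed directly as a metric.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[precision])
    return model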
def get_tokens(sentences):
    text = " ".join([sent.text.lower() for sent in sentences])
    stop = list(stopwords.words('english'))
    tokens = [token for token in word_tokenize(text) if token not in stop]
    return " ".join(tokens)


def save_wordcloud(wordcloud, file_name):
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(CONFIG['word_clouds_dir'] + file_name)


all_sents = read_all_debates()
save_wordcloud(WC.generate(get_tokens(all_sents)), "all_sents.png")

negative_sents = [sent for sent in all_sents if sent.label == 0]
save_wordcloud(WC.generate(get_tokens(negative_sents)), "neg_sents.png")

positive_sents = [sent for sent in all_sents if sent.label > 0]
save_wordcloud(WC.generate(get_tokens(positive_sents)), "pos_sents.png")

for i in range(1, 9):
    pos_gt_i = [sent for sent in all_sents if sent.label > i]
    save_wordcloud(WC.generate(get_tokens(pos_gt_i)), "pos_gt_" + str(i) + "_sents.png")

all_cb = read_all_debates(source='cb')
for i in np.arange(0.5, 1.1, 0.1):
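    # --- Sketch of a plausible loop body (not the original continuation),
    # mirroring the per-threshold clouds above: keep ClaimBuster-sourced
    # sentences whose label (assumed here to be a float score) exceeds the
    # threshold i; the output file name is illustrative.
    cb_gt_i = [sent for sent in all_cb if sent.label > i]
    save_wordcloud(WC.generate(get_tokens(cb_gt_i)),
                   "cb_gt_" + "{:.1f}".format(i) + "_sents.png")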