def parse_ground_truth_triples(df):
    """Group the (head, relation, tail) triples in df by document index."""
    ground_truth_triples = []
    num_dev_documents = len(load_documents(cf.DEV_FILENAME))
    ground_truth_triples_dict = {k: [] for k in range(num_dev_documents)}
    for row in df.itertuples():
        sent_index = int(getattr(row, 'index'))
        head = getattr(row, 's1').split()
        rel = str(getattr(row, 'r')).split()
        tail = getattr(row, 's2').split()
        if sent_index not in ground_truth_triples_dict:
            ground_truth_triples_dict[sent_index] = []
        ground_truth_triples_dict[sent_index].append([head, rel, tail])
    # Flatten the dict into a list with one entry per dev document, in document order.
    for k in range(num_dev_documents):
        ground_truth_triples.append([])
        for t in ground_truth_triples_dict[k]:
            ground_truth_triples[-1].append(t)
    return ground_truth_triples
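# --- Usage sketch (illustrative, not part of the original pipeline) -------------------
# A minimal, hypothetical example of the dataframe layout parse_ground_truth_triples
# expects: one row per triple, with columns 'index' (document index), 's1' (head),
# 'r' (relation) and 's2' (tail). It assumes cf.load_config(<dataset>) has already been
# called so that cf.DEV_FILENAME resolves to the dev documents.
def _example_parse_ground_truth_triples():
    df = pd.DataFrame({
        'index': [0, 0, 1],
        's1': ['John Smith', 'John Smith', 'Acme Corp'],
        'r': ['works for', 'lives in', 'based in'],
        's2': ['Acme Corp', 'London', 'New York'],
    })
    triples = parse_ground_truth_triples(df)
    # triples[0] -> [[['John', 'Smith'], ['works', 'for'], ['Acme', 'Corp']],
    #                [['John', 'Smith'], ['lives', 'in'], ['London']]]
    return triples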
def main(opts):
    if len(opts) == 0:
        raise ValueError("Usage: evaluate.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError(
            "Dataset must be either cateringServices, automotiveEngineering or bbn."
        )
    cf.load_config(dataset)

    # 1. Read the train and dev datasets from their CSV files.
    datasets = {}
    datasets['dev'] = pd.read_csv(cf.DEV_FILENAME)
    datasets['train'] = pd.read_csv(cf.TRAIN_FILENAME, encoding='utf-8')

    # 2. Load the corresponding documents.
    documents = {}
    documents['train'] = load_documents(cf.TRAIN_DOCUMENTS_FILENAME)
    documents['dev'] = load_documents(cf.DEV_DOCUMENTS_FILENAME)

    # 3. Build a data loader for each dataset (train, dev).
    data_loaders = {}
    for ds_name, df in datasets.items():
        logger.info("Building %s dataset..." % ds_name)
        ds = build_dataset(df, ds_name, documents[ds_name])
        data_loader = DataLoader(ds, batch_size=cf.BATCH_SIZE, pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully." % ds_name)

    logger.info("Saving data loaders to file...")
    save_obj_to_pkl_file(data_loaders['train'], 'data loader (train)',
                         cf.ASSET_FOLDER + '/data_loader_train.pkl')
    save_obj_to_pkl_file(data_loaders['dev'], 'data loader (dev)',
                         cf.ASSET_FOLDER + '/data_loader_dev.pkl')
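# --- Entry point sketch (assumption: this module is run as a script) ------------------
# main() validates its first positional argument as the dataset name (see the usage
# string above), so a plausible way to wire it up is to pass everything after the
# script name straight through.
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])  # e.g. python evaluate.py bbn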
def get_encoded_data(args):
    global agree_words, disagree_words, agree_indices, disagree_indices

    # %% load data
    # load sentence data
    sents, labels = load_sentences(domain=args.domain)

    # load sentiment lexicon
    lexicon = load_lexicon()
    pos_words = [word for word in lexicon if lexicon[word] == 1]
    neg_words = [word for word in lexicon if lexicon[word] == 0]
    lex_labels = [1] * len(pos_words) + [0] * len(neg_words)
    lex_word_seqs = pos_words + neg_words

    # load document data
    mdsd_domain = 'dvd' if args.domain == 'dvds' else args.domain
    doc_texts, doc_labels, _ = load_documents(domains=(mdsd_domain,))  # just one domain, ignore domain labels

    ## build vocabulary
    counter = Counter()
    word_seqs = []
    doc_word_seqs = []
    doc_word_sseqs = []

    # tokenize to words
    for sent in sents:
        word_seqs.append(my_tokenize(sent))  # [[w1, w2, ...], ...]
    for doc in doc_texts:
        doc_word_seqs.append(my_tokenize(doc))
        sent_seqs = []
        for sent in sent_tokenize(doc):
            sent_seqs.append(my_tokenize(sent))
        doc_word_sseqs.append(sent_seqs)  # [[[w11, w12, ...], [w21, w22, ...], ...], ...]

    # collect length statistics and update the word counter
    lens = []
    doc_lens = []
    doc_sentlens = []
    doc_wordlens = []
    for word_seq in word_seqs:
        counter.update(word_seq)
        lens.append(len(word_seq))
    for word in lexicon.keys():
        counter.update([word])
    for doc_word_seq in doc_word_seqs:
        # counter.update(doc_word_seq)
        doc_lens.append(len(doc_word_seq))
    for sent_seqs in doc_word_sseqs:
        doc_sentlens.append(len(sent_seqs))
        for sent_seq in sent_seqs:
            counter.update(sent_seq)
            doc_wordlens.append(len(sent_seq))

    percentage = 98
    maxlen = int(np.percentile(lens, percentage))
    doc_maxlen_sent = int(np.percentile(doc_sentlens, percentage))  # max sents per doc
    doc_maxlen_word = int(np.percentile(doc_wordlens, percentage))  # max words per sent
    doc_maxlen_word = max(maxlen, doc_maxlen_word)

    # the vocabulary
    min_freq = 3
    word2index = dict()
    idx = 2  # start from 2: 0 as <PAD>, 1 as <OOV>
    for word, count in counter.most_common():
        if count >= min_freq or word in lexicon:
            word2index[word] = idx
            idx += 1
    n_words = len(word2index) + 2
    print('words:', len(word2index))

    print('[agree] words:')
    for word in agree_words:
        if word in word2index:
            agree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('[disagree] words:')
    for word in disagree_words:
        if word in word2index:
            disagree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('agree: {}\ndisagree: {}'.format(agree_indices, disagree_indices))

    # %% data encoding
    # sentence data, plus its CV version
    seqs = []
    for words in word_seqs:
        seqs.append([word2index.get(word, 1) for word in words])
    padded_seqs_bak = pad_sequences(seqs, maxlen=doc_maxlen_word, padding='post', truncating='post')
    labels_bak = np.asarray(labels, dtype=int)
    print('sent:', padded_seqs_bak.shape, labels_bak.shape)

    # CV-fold split for the sentence data
    kf = StratifiedKFold(n_splits=CV, shuffle=True)
    padded_seqs_trains = dict()
    padded_seqs_tests = dict()
    labels_trains = dict()
    labels_tests = dict()
    print('{} fold train/test splitting'.format(CV))
    for cv, (train_idx, test_idx) in enumerate(kf.split(padded_seqs_bak, labels_bak)):
        padded_seqs_trains[cv] = padded_seqs_bak[train_idx]
        padded_seqs_tests[cv] = padded_seqs_bak[test_idx]
        labels_trains[cv] = labels_bak[train_idx]
        labels_tests[cv] = labels_bak[test_idx]

    # lexicon data
    lex_seqs = []
    for word in lex_word_seqs:
        lex_seqs.append([word2index.get(word, 1)])
    lex_padded_seqs = pad_sequences(lex_seqs, maxlen=1, padding='post', truncating='post')
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (all):', lex_padded_seqs.shape, lex_labels.shape)

    # doc data (hierarchical), padded at both the sentence and word level
    n_samples = len(doc_word_sseqs)
    doc_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent, doc_maxlen_word), dtype=int)
    for i, sseq_1doc in enumerate(doc_word_sseqs):
        for j, seq_1doc in enumerate(sseq_1doc):
            if j < doc_maxlen_sent:
                for k, word in enumerate(seq_1doc):
                    if k < doc_maxlen_word:
                        doc_padded_seqs[i, j, k] = word2index.get(word, 1)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (all):', doc_padded_seqs.shape, doc_labels.shape)

    # relation data for each doc's internal sentences (agree & disagree)
    count_agree, count_disagree = 0, 0
    doc_rel_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent), dtype=int)
    for i in range(0, n_samples):
        for j in range(1, doc_maxlen_sent):
            if doc_padded_seqs[i, j, 0] in agree_indices:
                doc_rel_padded_seqs[i, j] = 1
                count_agree += 1
            if doc_padded_seqs[i, j, 0] in disagree_indices:
                doc_rel_padded_seqs[i, j] = -1
                count_disagree += 1
    print(' - doc sent-rel (all):', doc_rel_padded_seqs.shape)
    print(' - doc sent-rel (all): agree: {}, disagree: {}'.format(count_agree, count_disagree))

    ## sub-sample from lexicon and documents
    print('sub-sampling:')
    # doc data sub-sample
    n_samples = len(padded_seqs_trains[0]) + len(padded_seqs_tests[0])
    doc_padded_seqs, doc_rel_padded_seqs, doc_labels = balanced_subsample3(
        doc_padded_seqs, doc_rel_padded_seqs, doc_labels, subsample_num=n_samples)
    doc_padded_seqs = np.asarray(doc_padded_seqs)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (sampled):', doc_padded_seqs.shape, doc_labels.shape)

    # lex data sub-sample
    lex_padded_seqs, lex_labels = balanced_subsample2(lex_padded_seqs, lex_labels,
                                                      subsample_num=n_samples)
    lex_padded_seqs = np.asarray(lex_padded_seqs)
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (sampled):', lex_padded_seqs.shape, lex_labels.shape)

    ddata = {
        'n_samples': n_samples,
        'n_words': n_words,
        'doc_maxlen_word': doc_maxlen_word,
        'doc_maxlen_sent': doc_maxlen_sent,
        'word2index': word2index,
        'padded_seqs_trains': padded_seqs_trains,
        'labels_trains': labels_trains,
        'padded_seqs_tests': padded_seqs_tests,
        'labels_tests': labels_tests,
        'lex_padded_seqs': lex_padded_seqs,
        'lex_labels': lex_labels,
        'doc_padded_seqs': doc_padded_seqs,
        'doc_labels': doc_labels,
        'doc_rel_padded_seqs': doc_rel_padded_seqs,
    }
    return ddata
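# --- Usage sketch (illustrative, not part of the original pipeline) -------------------
# get_encoded_data() only needs an object with a .domain attribute, so a hypothetical
# caller might pass an argparse-style Namespace; 'dvds' is one of the domain spellings
# handled above. The loop shows how the per-fold dictionaries in the returned ddata
# are typically read back out.
def _example_get_encoded_data():
    from argparse import Namespace
    ddata = get_encoded_data(Namespace(domain='dvds'))
    for cv in range(CV):
        x_train, y_train = ddata['padded_seqs_trains'][cv], ddata['labels_trains'][cv]
        x_test, y_test = ddata['padded_seqs_tests'][cv], ddata['labels_tests'][cv]
        print('fold', cv, x_train.shape, y_train.shape, x_test.shape, y_test.shape)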