def save_vectors(data, tnn, n_tuples, dh, output_basename):
    tuple_vectors = np.zeros([n_tuples, dh])
    keys = range(len(data))
    n_mentions = len(keys)
    mention_vectors = np.zeros([n_mentions, dh])

    print "getting vectors"
    for k_i, k in enumerate(keys):
        d = data[k]
        word_idxs = d['word_idxs']
        edge_idxs = d['edge_idxs']
        pos_idxs = d['pos_idxs']
        tuple_ids = d['tuple_ids']
        mention_id = d['mention_id']
        input_vectors, mention_vector = np.array(tnn.get_vectors(word_idxs, edge_idxs, pos_idxs, 0))
        for t_i, t in enumerate(tuple_ids):
            tuple_vectors[t, :] = input_vectors[t_i, :]
        mention_vectors[mention_id, :] = mention_vector
        if k_i % 100000 == 0 and k_i > 0:
            print k_i

    print "saving vectors"
    output_filename = output_basename + '_inputs.json'
    fh.write_to_json(tuple_vectors.tolist(), output_filename, sort_keys=False)
    output_filename = output_basename + '_mentions.json'
    fh.write_to_json(mention_vectors.tolist(), output_filename, sort_keys=False)
def preprocess_for_brown_clustering():
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()
    items = keys
    print len(items)

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in keys:
            text = articles[k]
            tokens = []
            sentences = text.split('\n')
            for s in sentences:
                sent_tokens = tokenizer.split_into_words(s, reattach=False, split_off_quotes=False, lemmatize=False, replace_numbers=True)
                tokens = tokens + sent_tokens
            if k in items:
                output_file.write(' '.join(tokens) + '\n')
            processed_dict[k] = tokens

    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'processed', 'json')
    fh.write_to_json(processed_dict, output_filename)
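# A small, hypothetical helper (not part of the original code) illustrating the format written
# above for Brown clustering: one whitespace-tokenized document per line in 'input.txt'.
# It assumes codecs is imported at the top of this module, as the function above does.
def check_brown_input(path):
    with codecs.open(path, 'r', encoding='utf-8') as input_file:
        for i, line in enumerate(input_file):
            # every line should hold at least one token
            assert len(line.split()) > 0, "empty document on line %d" % i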
def write_tagged_text(parsed_filename, output_filename):
    data = fh.read_json(parsed_filename)
    tagged_text = {}
    for key, sentences in data.items():
        tagged_sentences = []
        for sentence in sentences:
            tagged_tokens = []
            for token in sentence:
                word = token.get('word', '__MISSING__')
                POS = token.get('POS', '__MISSING__')
                lemma = token.get('lemma', '__MISSING__')
                NER = token.get('NER', '__MISSING__')
                #tagged = word + '_' + POS
                tagged = POS + '_POS_'
                tagged_tokens.append(tagged)
            tagged_sentence = ' '.join(tagged_tokens)
            tagged_sentences.append(tagged_sentence)
        tagged_text[fh.get_basename_wo_ext(key)] = ' '.join(tagged_sentences)
    fh.write_to_json(tagged_text, output_filename, sort_keys=False)
def test_over_time(project_dir, subset, config_file, model_type, field, train_start, train_end, test_start, test_end, n_train=None, n_calib=0, penalty='l2', suffix='', loss='log', objective='f1', do_ensemble=True, dh=300, label='label', intercept=True, n_dev_folds=5, average='micro', seed=None, alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False, annotated_subset=None, nonlinearity='tanh', init_lr=1e-2, min_epochs=2, max_epochs=50, patience=5, tol=1e-4, list_size=1, repeats=1, oracle=False, lower=None, interactive=False, stoplist_file=None, cshift=False, n_cshift=None, do_cfm=True, do_platt=True, dropout=0.0, min_test=None, test_prop=None, verbose=False):
    # Just run a regular model, one per year, training on the past, and save the results
    if seed is not None:
        seed = int(seed)
        np.random.seed(seed)

    log = {
        'project': project_dir, 'subset': subset, 'config_file': config_file, 'model_type': model_type,
        'field': field, 'train_start': train_start, 'train_end': train_end, 'test_start': test_start,
        'test_end': test_end, 'n_train': n_train, 'n_calib': n_calib, 'penalty': penalty,
        'cshift': cshift, 'n_cshift': n_cshift, 'suffix': suffix, 'loss': loss,
        'objective': objective, 'do_ensemble': do_ensemble, 'dh': dh, 'label': label,
        'intercept': intercept, 'n_dev_folds': n_dev_folds, 'average': average, 'seed': seed,
        'alpha_min': alpha_min, 'alpha_max': alpha_max, 'n_alphas': n_alphas, 'sample_labels': sample_labels,
        'group_identical': group_identical, 'annotated_subset': annotated_subset, 'nonlinearity': nonlinearity,
        'init_lr': init_lr, 'min_epochs': min_epochs, 'max_epochs': max_epochs, 'patience': patience,
        'tol': tol, 'interactive': interactive, 'stoplist_file': stoplist_file, 'list_size': list_size
    }

    model_basename = make_model_basename(log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field].values))
    field_vals.sort()
    print("Splitting data according to %s" % field)
    print("Values:", field_vals)

    print("\nTesting on %s to %s" % (test_start, test_end))

    # first, split into training and non-train data based on the field of interest
    all_items = list(metadata.index)
    test_selector_all = (metadata[field] >= int(test_start)) & (metadata[field] <= int(test_end))
    test_subset_all = metadata[test_selector_all]
    test_items_all = test_subset_all.index.tolist()
    n_test_all = len(test_items_all)

    if min_test is not None:
        if n_test_all < min_test:
            print("Not enough test samples; exiting")
            return

    if train_end is None:
        if train_start is None:
            train_selector_all = metadata[field] < int(test_start)
        else:
            train_selector_all = (metadata[field] < int(test_start)) & (metadata[field] >= train_start)
    else:
        if train_start is None:
            train_selector_all = metadata[field] <= int(train_end)
        else:
            train_selector_all = (metadata[field] <= int(train_end)) & (metadata[field] >= train_start)

    train_subset_all = metadata[train_selector_all]
    train_items_all = list(train_subset_all.index)
    n_train_all = len(train_items_all)

    # only keep the items in the train and test sets
    all_items = train_items_all + test_items_all

    print("Train: %d, Test: %d (labeled and unlabeled)" % (n_train_all, n_test_all))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    labels_df = labels_df.loc[all_items]

    # if desired, attempt to learn weights for the training data using techniques for covariate shift
    if cshift:
        print("Training a classifier for covariate shift")
        # start by learning to discriminate train from non-train data
        # Label items based on whether they come from train or test
        train_test_labels = np.zeros((len(all_items), 2), dtype=int)
        train_test_labels[:n_train_all, 0] = 1
        train_test_labels[n_train_all:, 1] = 1
        if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
            cshift_pos_label = 0
        else:
            cshift_pos_label = 1
        train_test_labels_df = pd.DataFrame(train_test_labels, index=all_items, columns=[0, 1])

        if n_cshift is not None and len(all_items) >= n_cshift:
            print("Taking a random sample of %d items for reweighting" % n_cshift)
            #np.random.shuffle(all_items)
            cshift_items = np.random.choice(all_items, size=n_cshift, replace=False)
        else:
            print("Using all train items")
            cshift_items = all_items

        print(train_test_labels_df.loc[cshift_items].mean(axis=0))

        # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + 'cshift'
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs, items_to_use=cshift_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True, do_ensemble=False, dh=dh, seed=seed, pos_label=cshift_pos_label, verbose=False)
        print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

        #X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        cshift_pred_probs = model.predict_probs(X_cshift)
        f_items = features_concat.get_items()
        assert len(f_items) == len(all_items)
        for i in range(len(all_items)):
            assert all_items[i] == f_items[i]

        cshift_pred_probs_df = pd.DataFrame(cshift_pred_probs, index=features_concat.get_items(), columns=range(2))

        # display the min and max probs
        print("Min: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].min())
        print("Mean: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].mean())
        print("Max: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].max())

        # HACK: need to prevent 0s in prob(y=0|x)
        p_train_values = cshift_pred_probs_df[0].values
        threshold = 0.01
        p_train_values[p_train_values < threshold] = threshold
        print("After thresholding")
        print("Min: %0.6f" % p_train_values[:n_train_all].min())
        print("Mean: %0.6f" % p_train_values[:n_train_all].mean())
        print("Max: %0.6f" % p_train_values[:n_train_all].max())

        # use the estimated probability of each item being a training item to compute item weights
        weights = n_train_all / float(n_test_all) * (1.0 / p_train_values - 1)
        weights_df_all = pd.DataFrame(weights, index=all_items)

        # print a summary of the weights from just the training items
        print("Min weight: %0.4f" % weights[:n_train_all].min())
        print("Ave weight: %0.4f" % weights[:n_train_all].mean())
        print("Max weight: %0.4f" % weights[:n_train_all].max())

        # print a summary of all weights
        #print("Min weight: %0.4f" % weights.min())
        #print("Ave weight: %0.4f" % weights.mean())
        #print("Max weight: %0.4f" % weights.max())
        # create a data frame with this information
    else:
        weights_df_all = None

    # find the labeled items
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items" % n_labeled_items)
    labeled_items = set(labels_df.index)

    train_items_labeled = [i for i in train_items_all if i in labeled_items]
    test_items = [i for i in test_items_all if i in labeled_items]
    #n_train = len(train_items)
    n_test = len(test_items)

    for r in range(repeats):
        # set seed very explicitly here to make sure experiments are comparable
        if seed is not None:
            seed += 1
            np.random.seed(seed)
        print("* Starting repetition %d *" % r)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + '_' + str(r)

        if n_train is not None and len(train_items_labeled) >= n_train:
            np.random.shuffle(train_items_labeled)
            train_items = np.random.choice(train_items_labeled, size=n_train, replace=False)
        else:
            print("Using all train items")
            train_items = train_items_labeled
        n_train_r = len(train_items)

        # now, choose a calibration set
        if n_calib > 0 and n_test >= n_calib:
            np.random.shuffle(test_items)
            calib_items = np.random.choice(test_items, size=n_calib, replace=False)
        elif n_test < n_calib:
            print("Error: Only %d labeled test instances available" % n_test)
            calib_items = test_items
        else:
            calib_items = []

        if weights_df_all is not None:
            weights_df = weights_df_all[labeled_item_selector]
        else:
            weights_df = None

        print("Labeled train: %d, test: %d" % (n_train_r, n_test))

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE', '95lcl', '95ucl', 'contains_test'])

        test_labels_df = labels_df.loc[test_items]

        # do a fake adjustment of the test label proportions
        if test_prop is not None:
            test_prop = float(test_prop)
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            order = list(np.argsort(test_label_props))
            true_prop = np.mean(test_label_props)
            if test_prop < true_prop:
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) <= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]
            else:
                order.reverse()
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) >= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]

            test_labels_df = labels_df.loc[test_items]
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            print("New props = %0.3f" % np.mean(test_label_props))

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_labeled_items, 1)), dtype=float)
            samples = np.zeros([n_labeled_items, n_classes], dtype=int)
            for i in range(n_labeled_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()
        if n_calib > 0:
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()
        else:
            calib_labels_df = None

        # get the true proportion of labels in the test OR non-training data (calibration and test combined)
        target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
        output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
        print("Train props:", train_props, train_estimate)
        train_rmse = np.abs(train_estimate - target_estimate)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train_r, 'train', 'test', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # get the same estimate from calibration data
        if n_calib > 0:
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            calib_rmse = np.abs(calib_estimate - target_estimate)
            calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]
        else:
            calib_estimate = 0.0
            calib_std = 1.0
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', np.nan, np.nan, np.nan, np.nan, np.nan]

        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

        # Now train a model on the training data, saving the calibration data for calibration
        if stoplist_file is not None:
            stoplist = fh.read_text(stoplist_file)
            stoplist = {s.strip() for s in stoplist}
            print(stoplist)
        else:
            stoplist = None

        print("Training a LR model")
        model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(project_dir, model_type, 'log', model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=None, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, do_cfm=do_cfm, do_platt=do_platt, lower=lower, stoplist=stoplist, dropout=dropout, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

        X_test, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=test_items)
        test_predictions = model.predict(X_test)
        test_predictions_df = pd.DataFrame(test_predictions, index=features_concat.get_items(), columns=[label])
        test_pred_probs = model.predict_probs(X_test)
        _, n_labels = test_pred_probs.shape
        test_pred_probs_df = pd.DataFrame(test_pred_probs, index=features_concat.get_items(), columns=range(n_labels))

        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
        test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)

        test_cc_estimate, test_pcc_estimate = model.predict_proportions(X_test)
        test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
        test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

        results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

        output_df.loc['CC'] = [n_train_r, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
        output_df.loc['PCC'] = [n_train_r, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

        test_acc_estimate_internal, test_acc_ms_estimate_internal = model.predict_proportions(X_test, do_cfm=do_cfm)
        test_acc_rmse_internal = np.abs(test_acc_estimate_internal[1] - target_estimate)
        test_acc_ms_rmse_internal = np.abs(test_acc_ms_estimate_internal[1] - target_estimate)

        output_df.loc['ACC_internal'] = [n_train_r, 'train', 'test', 'n/a', test_acc_estimate_internal[1], test_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['MS_internal'] = [n_train_r, 'train', 'nontrain', 'predicted', test_acc_ms_estimate_internal[1], test_acc_ms_rmse_internal, np.nan, np.nan, np.nan]

        test_platt1_estimate, test_platt2_estimate = model.predict_proportions(X_test, do_platt=do_platt)
        test_platt1_rmse = np.abs(test_platt1_estimate[1] - target_estimate)
        test_platt2_rmse = np.abs(test_platt2_estimate[1] - target_estimate)

        output_df.loc['PCC_platt1'] = [n_train_r, 'train', 'test', 'n/a', test_platt1_estimate[1], test_platt1_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_platt2'] = [n_train_r, 'train', 'nontrain', 'predicted', test_platt2_estimate[1], test_platt2_rmse, np.nan, np.nan, np.nan]

        if n_calib > 0:
            cc_plus_cal_estimate = (test_cc_estimate[1] + calib_estimate) / 2.0
            pcc_plus_cal_estimate = (test_pcc_estimate[1] + calib_estimate) / 2.0
            cc_plus_cal_mae = np.mean(np.abs(cc_plus_cal_estimate - target_estimate))
            pcc_plus_cal_mae = np.mean(np.abs(pcc_plus_cal_estimate - target_estimate))
            #output_df.loc['CC_plus_cal'] = [n_train, 'train', 'test', 'n/a', cc_plus_cal_estimate, cc_plus_cal_mae, np.nan, np.nan, np.nan]
            output_df.loc['PCC_plus_cal'] = [n_train_r, 'train', 'test', 'n/a', pcc_plus_cal_estimate, pcc_plus_cal_mae, np.nan, np.nan, np.nan]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
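# A minimal, self-contained sketch (an illustration under assumptions, not the implementation above)
# of the covariate-shift reweighting used in test_over_time: a discriminator estimates p(train | x),
# and each training item gets weight (n_train / n_test) * (1 / p(train | x) - 1), i.e. an estimate of
# p_test(x) / p_train(x) up to a constant. The example probabilities below are made up.
import numpy as np

def cshift_weights(p_train, n_train, n_test, threshold=0.01):
    # clip probabilities away from zero, as the code above does, to avoid huge weights
    p = np.maximum(np.asarray(p_train, dtype=float), threshold)
    return n_train / float(n_test) * (1.0 / p - 1.0)

# items that the discriminator thinks look more like test data receive larger weights
print(cshift_weights([0.9, 0.5, 0.2], n_train=3, n_test=3))  # approximately [0.11, 1.0, 4.0]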
def parse_semafor_output(fes_dir, sentences):
    semafor_dir = dirs.data_semafor_dir
    frames = {}
    frame_target = {}
    frame_arguments = {}
    frame_target_arguments = {}
    dicts = [frames, frame_target, frame_arguments, frame_target_arguments]

    files = glob.glob(os.path.join(fes_dir, '*.fes'))
    for f in files:
        basename = fh.get_basename_wo_ext(f)
        for d in dicts:
            d[basename] = []
            for s in sentences[basename]:
                d[basename].append([])
        #frames[basename] = []
        #frame_target[basename] = []
        #frame_arguments[basename] = []
        #frame_target_arguments[basename] = []
        with codecs.open(f, 'r') as input_file:
            for line in input_file:
                parts = line.split('\t')
                n_args = int(parts[1])
                frame = parts[2]
                target_dot_pos = parts[3]
                target_index = parts[4]
                target_phrase = parts[5]
                sentence_num = int(parts[6])
                arg_types = []
                arg_indices = []
                for j in range(n_args - 1):
                    arg_types.append(parts[7 + 2 * j])
                    arg_indices.append(parts[7 + 2 * j + 1])

                # save the frame
                output_token = '<' + frame + '>'
                frames[basename][sentence_num].append(output_token)

                # save the frame_target
                output_token = '<' + frame + '>'
                target_indices = target_index.split('_')
                for ti in target_indices:
                    output_token += '_' + sentences[basename][sentence_num][int(ti)]
                frame_target[basename][sentence_num].append(output_token)

                # save the frame and arguments
                output_token = '<' + frame + '>'
                for j, arg_type in enumerate(arg_types):
                    output_token += '_<' + arg_type + '>'
                    arg_index = arg_indices[j]
                    indices = arg_index.split(':')
                    for ai in indices:
                        output_token += '_' + sentences[basename][sentence_num][int(ai)]
                frame_arguments[basename][sentence_num].append(output_token)

                # save the frame, target, and arguments
                output_token = '<' + frame + '>'
                target_indices = target_index.split('_')
                for ti in target_indices:
                    output_token += '_' + sentences[basename][sentence_num][int(ti)]
                for j, arg_type in enumerate(arg_types):
                    output_token += '_<' + arg_type + '>'
                    arg_index = arg_indices[j]
                    indices = arg_index.split(':')
                    for ai in indices:
                        output_token += '_' + sentences[basename][sentence_num][int(ai)]
                frame_target_arguments[basename][sentence_num].append(output_token)

    output_filename = os.path.join(semafor_dir, 'frames.json')
    print output_filename
    fh.write_to_json(frames, output_filename)

    output_filename = os.path.join(semafor_dir, 'frames_targets.json')
    print output_filename
    fh.write_to_json(frame_target, output_filename)

    output_filename = os.path.join(semafor_dir, 'frame_arguments.json')
    print output_filename
    fh.write_to_json(frame_arguments, output_filename)

    output_filename = os.path.join(semafor_dir, 'frames_target_arguments.json')
    print output_filename
    fh.write_to_json(frame_target_arguments, output_filename)
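# A hypothetical sketch (not part of the original pipeline) of the token format built above:
# a frame, its target words, and its argument spans joined with underscores, for example
# '<Commerce_buy>_bought_<Buyer>_the_company'. The frame and argument names are illustrative only.
def build_frame_token(frame, target_words, args):
    # args: list of (arg_type, arg_words) pairs
    token = '<' + frame + '>'
    for w in target_words:
        token += '_' + w
    for arg_type, arg_words in args:
        token += '_<' + arg_type + '>'
        for w in arg_words:
            token += '_' + w
    return token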
def cross_train_and_eval(project_dir, reference_model_dir, subset, field_name, config_file, n_train=100, field_val=None, vocab_file=None, group_identical=False, suffix='', model_type='MLP', loss='log', do_ensemble=True, dh=100, label='label', n_dev_folds=5, repeats=1, verbose=False, average='micro', objective='calibration', seed=None, init_lr=1e-4, min_epochs=2, max_epochs=50, early_stopping=False, tol=1e-4, patience=8):
    n_calib = 0
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + str(n_calib) + '_' + objective
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))

    log = {
        'project': project_dir, 'subset': subset, 'field_name': field_name, 'config_file': config_file,
        'n_calib': n_calib, 'n_train': n_train, 'suffix': suffix, 'model_type': model_type,
        'loss': loss, 'dh': dh, 'do_ensemble': do_ensemble, 'label': label,
        'field_val': field_val, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'average': average,
        'objective': objective,
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field_name].values))
    field_vals.sort()
    print("Splitting data according to %s" % field_name)
    print(field_vals)

    if field_val is not None:
        field_vals = [field_val]

    # repeat the following for each value of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train_cshift = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train_cshift = len(non_train_items)

        print("Train: %d, non-train: %d" % (n_train_cshift, n_non_train_cshift))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
        n_items, n_classes = labels_df.shape

        weights_df = None

        # add in a stage to eliminate items with no labels?
        print("Subsetting items with labels")
        label_sums_df = labels_df.sum(axis=1)
        labeled_item_selector = label_sums_df > 0
        labels_df = labels_df[labeled_item_selector]
        n_items, n_classes = labels_df.shape
        labeled_items = set(labels_df.index)

        train_items = [i for i in train_items if i in labeled_items]
        non_train_items = [i for i in non_train_items if i in labeled_items]
        n_non_train = len(non_train_items)

        if weights_df is not None:
            weights_df = weights_df[labeled_item_selector]

        print("Starting repeats")
        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):
            print("* Repetition %d *" % r)

            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if n_train > 0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items, size=n_train, replace=False)
            else:
                train_items_r = train_items
            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])

            # create a unique name for this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            #n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            if n_calib > n_non_train:
                n_calib = int(n_non_train / 2)
                print("Warning!!: only %d non-train items; using 1/2 for calibration" % n_non_train)

            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("Train: %d, calibration: %d, test: %d" % (n_train_r, n_calib, n_test))

            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            sampled_labels_df = labels_df
            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(non_train_labels_df)
            output_df.loc['target'] = [n_test, 'nontrain', 'nontrain', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train_r, 'train', 'train', 'n/a', train_estimate, train_rmse, np.nan, np.nan, np.nan]

            print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            # repeat for labeled calibration data
            if n_calib > 0:
                calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df)
                calib_rmse = np.sqrt((calib_estimate - target_estimate)**2)
                # check if the test estimate is within 2 standard deviations of the estimate
                calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
                output_df.loc['calibration'] = [n_calib, 'calibration', 'nontrain', 'given', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]

                # do a test using the number of annotations rather than the number of items
                calib_props2, calib_estimate2, calib_std2 = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
                calib_rmse2 = np.sqrt((calib_estimate2 - target_estimate)**2)
                calib_contains_test2 = target_estimate > calib_estimate2 - 2 * calib_std2 and target_estimate < calib_estimate2 + 2 * calib_std2
                output_df.loc['calibration_n_annotations'] = [n_calib, 'calibration', 'nontrain', 'given', calib_estimate2, calib_rmse2, calib_estimate2 - 2 * calib_std2, calib_estimate2 + 2 * calib_std2, calib_contains_test2]

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_brier_grouped(project_dir, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, vocab_file=vocab_file, group_identical=group_identical, items_to_use=train_items_r, intercept=True, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, early_stopping=early_stopping, tol=tol, patience=patience)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

            # predict on calibration data
            if n_calib > 0:
                calib_predictions_df, calib_pred_probs_df, calib_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose, force_dense=True)
                calib_cc, calib_pcc, calib_acc, calib_pvc = calib_pred_proportions
                f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df, calib_pred_probs_df, pos_label=pos_label, average=average, verbose=False)
                true_calib_vector = np.argmax(calib_labels_df.as_matrix(), axis=1)
                calib_cal_rmse = evaluation.evaluate_calibration_rmse(true_calib_vector, calib_pred_probs_df.as_matrix())
                calib_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_calib_vector, calib_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
                results_df.loc['calibration'] = [f1_cal, acc_cal, calib_cal_rmse, calib_cal_rmse_overall]

            # predict on test data
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=True)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
            test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
            results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall]
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            # predict on calibration and test data combined
            nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=non_train_items, verbose=verbose, force_dense=True)
            nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

            if n_calib > 0:
                cc_calib_rmse = np.sqrt((calib_cc[1] - calib_estimate)**2)
                output_df.loc['CC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_cc[1], cc_calib_rmse, np.nan, np.nan, np.nan]

                pcc_calib_rmse = np.sqrt((calib_pcc[1] - calib_estimate)**2)
                output_df.loc['PCC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_pcc[1], pcc_calib_rmse, np.nan, np.nan, np.nan]

            cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
            pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

            output_df.loc['CC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                output_df.loc['CC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

            """
            nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
            nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)
            output_df.loc['ACC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)
                output_df.loc['ACC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

            # do calibration here using calibration data
            if n_calib > 0:
                # expand the data so as to only have singly-labeled, weighted items
                _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)
                #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values)

                acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                acc_corrected = calibration.apply_acc_binary(nontrain_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC'] = [n_non_train, 'train', 'nontrain', 'predicted', acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

                pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                pvc_corrected = calibration.apply_pvc(nontrain_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC'] = [n_non_train, 'train', 'nontrain', 'predicted', pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

                acc_corrected = calibration.apply_acc_binary(test_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                averaged_acc_estimate = (acc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate, averaged_acc_rmse, np.nan, np.nan, np.nan]

                pvc_corrected = calibration.apply_pvc(test_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                averaged_pvc_estimate = (pvc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate, averaged_pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn internal nontrain")
            #models = list(model._models.values())
            nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)
            pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
            venn_estimate = np.mean(nontrain_preds_internal)
            venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
            venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
            output_df.loc['Venn_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            if n_calib > 0:
                print("Venn internal test")
                test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)
                pred_range = np.mean(test_pred_ranges_internal, axis=0)
                venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper
                output_df.loc['Venn_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            # Venn prediction using proper calibration data
            print("Venn calibration")
            calib_pred_ranges, calib_preds, calib_props_in_range, list_of_n_levels = ivap.estimate_probs_from_labels_cv(project_dir, model, model_name, sampled_labels_df, subset, calib_items=calib_items)
            print("Venn test")
            test_pred_ranges, test_preds = ivap.estimate_probs_from_labels(project_dir, model, model_name, sampled_labels_df, subset, subset, calib_items=calib_items, test_items=test_items)

            nontrain_pred_ranges = np.vstack([calib_pred_ranges, test_pred_ranges])
            nontrain_preds = np.r_[calib_preds, test_preds]

            nontrain_pred_range = np.mean(nontrain_pred_ranges, axis=0)
            nontrain_venn_estimate = np.mean(nontrain_preds)
            nontrain_venn_rmse = np.sqrt((nontrain_venn_estimate - target_estimate)**2)
            nontrain_contains_test = nontrain_pred_range[0] < target_estimate < nontrain_pred_range[1]
            output_df.loc['Venn'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_venn_estimate, nontrain_venn_rmse, nontrain_pred_range[0], nontrain_pred_range[1], nontrain_contains_test]

            test_pred_range = np.mean(test_pred_ranges, axis=0)
            averaged_venn_estimate = (np.mean(test_preds) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
            averaged_venn_rmse = np.sqrt((averaged_venn_estimate - target_estimate)**2)
            averaged_lower = (test_pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
            averaged_upper = (test_pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
            venn_contains_test = averaged_lower < target_estimate < averaged_upper
            output_df.loc['Venn_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_venn_estimate, averaged_venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            fh.write_list_to_text(calib_props_in_range, os.path.join(dirs.dir_models(project_dir), model_name, 'venn_calib_props_in_range.csv'))
            fh.write_list_to_text(list_of_n_levels, os.path.join(dirs.dir_models(project_dir), model_name, 'list_of_n_levels.csv'))

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))

            # now train a model on the training and calibration data combined
            if run_all:
                print("Training model on all labeled data")
                calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
                model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
                results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

                # get labels for test data
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
                results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, 0]
                results_df.loc['test_all'] = [f1_test, acc_test, test_cal_rmse, 0]

                nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=non_train_items, verbose=verbose)
                nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

                cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
                pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

                output_df.loc['CC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                    averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                    averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                    output_df.loc['CC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                    output_df.loc['PCC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

                nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
                nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)

                output_df.loc['ACC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                    averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                    averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)

                    output_df.loc['ACC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                    output_df.loc['PVC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

                print("Venn internal nontrain")
                nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)
                pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
                venn_estimate = np.mean(nontrain_preds_internal)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
                output_df.loc['Venn_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

                if n_calib > 0:
                    print("Venn internal test")
                    test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)
                    pred_range = np.mean(test_pred_ranges_internal, axis=0)
                    venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                    averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    venn_contains_test = averaged_lower < target_estimate < averaged_upper
                    output_df.loc['Venn_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]
            """

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
            output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
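# A short, self-contained sketch (an illustration under assumptions, not the repo's
# predict_proportions API) of the two basic quantification estimates reported above:
# "classify and count" (CC) uses hard predictions, while "probabilistic classify and
# count" (PCC) averages predicted probabilities. Both return the estimated proportion
# of the positive class; the probabilities below are made up.
import numpy as np

def cc_estimate(pred_probs):
    # proportion of items whose predicted probability of class 1 exceeds 0.5
    return float(np.mean(np.asarray(pred_probs)[:, 1] > 0.5))

def pcc_estimate(pred_probs):
    # mean predicted probability of class 1
    return float(np.mean(np.asarray(pred_probs)[:, 1]))

probs = np.array([[0.9, 0.1], [0.4, 0.6], [0.3, 0.7]])
print(cc_estimate(probs), pcc_estimate(probs))  # 0.666..., 0.466...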
def test_over_time(project_dir, subset, config_file, first_year, stage1_logfile=None, penalty='l2', suffix='', model_type='LR', loss='log', objective='f1', do_ensemble=True, dh=100, label='label', intercept=True, n_dev_folds=5, verbose=False, average='micro', seed=None, alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False, annotated_subset=None, n_terms=0, nonlinearity='tanh', init_lr=1e-4, min_epochs=2, max_epochs=100, patience=8, tol=1e-4, early_stopping=False, DL=False):
    # Just run a regular model, one per year, training on the past, and save the results
    log = {
        'project': project_dir, 'subset': subset, 'config_file': config_file, 'first_year': first_year,
        'stage1_logfile': stage1_logfile, 'penalty': penalty, 'suffix': suffix, 'model_type': model_type,
        'loss': loss, 'objective': objective, 'do_ensemble': do_ensemble, 'dh': dh,
        'label': label, 'intercept': intercept, 'n_dev_folds': n_dev_folds, 'average': average,
        'seed': seed, 'alpha_min': alpha_min, 'alpha_max': alpha_max, 'n_alphas': n_alphas,
        'sample_labels': sample_labels, 'group_identical': group_identical, 'annotated_subset': annotated_subset,
        'n_terms': n_terms, 'nonlinearity': nonlinearity, 'init_lr': init_lr, 'min_epochs': min_epochs,
        'max_epochs': max_epochs, 'patience': patience, 'tol': tol, 'early_stopping': early_stopping
    }

    model_basename = make_model_basename(log)
    stage1_model_basename = ''
    if stage1_logfile is not None:
        stage1_log = fh.read_json(stage1_logfile)
        stage1_model_basename = make_model_basename(stage1_log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata['year'].values))
    field_vals.sort()
    print("Splitting data according to year:", field_vals)

    # DEBUG: field_vals = ['2009']
    for target_year in field_vals:
        if int(target_year) >= first_year:
            print("\nTesting on %s" % target_year)
            model_name = model_basename + '_' + str(target_year)
            stage1_model_name = stage1_model_basename + '_' + str(target_year)

            # first, split into training and non-train data based on the field of interest
            ## DEBUG!
            test_selector_all = metadata['year'] >= int(target_year)
            test_subset_all = metadata[test_selector_all]
            test_items_all = test_subset_all.index.tolist()
            n_test_all = len(test_items_all)

            train_selector_all = metadata['year'] < int(target_year)
            train_subset_all = metadata[train_selector_all]
            train_items_all = list(train_subset_all.index)
            n_train_all = len(train_items_all)

            print("Test year: %d Train: %d, Test: %d (labeled and unlabeled)" % (int(target_year), n_train_all, n_test_all))

            # load all labels
            label_dir = dirs.dir_labels(project_dir, subset)
            labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
            n_items, n_classes = labels_df.shape

            vocab = None
            if stage1_logfile is not None:
                fightin_lexicon = None
                if annotated_subset is not None:
                    print("Determining fightin' words")
                    fightin_words.find_most_annotated_features(project_dir, annotated_subset, subset, config_file, items_to_use=train_items_all, remove_stopwords=False)
                    fightin_lexicon, scores = fightin_words.load_from_config_files(project_dir, annotated_subset, subset, config_file, items_to_use=train_items_all, n=n_terms, remove_stopwords=True)
                    fightin_lexicon_test, scores = fightin_words.load_from_config_files(project_dir, annotated_subset, subset, config_file, items_to_use=test_items_all, n=n_terms, remove_stopwords=True)
                    print(fightin_lexicon)
                    #print(fightin_lexicon_test)
                    #vocab = list(fightin_lexicon)
                    #vocab.sort()

                print("Loading features from stage 1")
                # load features from previous model
                top_features = get_top_features.get_top_features(os.path.join(dirs.dir_models(project_dir), stage1_model_name), n_terms)
                lr_features, weights = zip(*top_features)
                vocab = list(lr_features)

                #if annotated_subset is not None:
                #    print("\nTaking intersection:")
                #    intersection = set(lr_features).intersection(set(fightin_lexicon))
                #    vocab = list(intersection)
                #    vocab.sort()
                #    for w in vocab:
                #        print(w)
                #vocab = [w for w in vocab if w not in stopwords]

                for w in vocab:
                    print(w)
                vocab.sort()

                #if annotated_subset is not None:
                #    print("Missing:")
                #    print(set(fightin_lexicon_test) - set(vocab))

            # add in a stage to eliminate items with no labels
            print("Subsetting items with labels")
            label_sums_df = labels_df.sum(axis=1)
            labeled_item_selector = label_sums_df > 0
            labels_df = labels_df[labeled_item_selector]
            n_items, n_classes = labels_df.shape
            labeled_items = set(labels_df.index)

            train_items = [i for i in train_items_all if i in labeled_items]
            test_items = [i for i in test_items_all if i in labeled_items]
            n_train = len(train_items)
            n_test = len(test_items)

            weights_df = None
            if weights_df is not None:
                weights_df = weights_df[labeled_item_selector]

            print("Labeled train: %d, test: %d" % (n_train, n_test))

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE', '95lcl', '95ucl', 'contains_test'])

            test_labels_df = labels_df.loc[test_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels
                temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_df = sampled_labels_df.loc[train_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
            output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train, 'train', 'test', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

            #print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training a model")
            model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol, early_stopping=early_stopping, verbose=verbose)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

            # predict on test data
            force_dense = False
            if model_type == 'MLP':
                force_dense = True
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=force_dense, group_identical=group_identical)

            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
            test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
            test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

            results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

            output_df.loc['CC_test'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
            output_df.loc['PCC_test'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

            test_acc_rmse_internal = np.sqrt((test_acc_estimate_internal[1] - target_estimate)**2)
            test_pvc_rmse_internal = np.sqrt((test_pvc_estimate_internal[1] - target_estimate)**2)

            output_df.loc['ACC_internal'] = [n_train, 'train', 'test', 'n/a', test_acc_estimate_internal[1], test_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_train, 'train', 'nontrain', 'predicted', test_pvc_estimate_internal[1], test_pvc_rmse_internal, np.nan, np.nan, np.nan]

            """
            if DL:
                print("Training a model")
                model_type = 'DL'
                DL_model_name = model_name + '_DL'
                model, _, _, _, _ = train.train_model_with_labels(project_dir, model_type, loss, DL_model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol, early_stopping=early_stopping, verbose=verbose)

                # predict on test data
                force_dense = False
                if model_type == 'MLP':
                    force_dense = True
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, DL_model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=force_dense, group_identical=group_identical)

                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
                test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

                test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
                test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

                output_df.loc['CC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
                output_df.loc['PCC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]
            """

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
            output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
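# A minimal sketch (names and values are made up; not the original helper) of the label
# sampling step used above when sample_labels is True: each item's annotation counts are
# normalized into a distribution and a single label is drawn from it, simulating having
# exactly one label per item.
import numpy as np

def sample_one_label_per_item(label_counts, seed=None):
    rng = np.random.RandomState(seed)
    counts = np.asarray(label_counts, dtype=float)
    probs = counts / counts.sum(axis=1, keepdims=True)
    samples = np.zeros_like(counts, dtype=int)
    for i in range(counts.shape[0]):
        # draw one class index per item according to its annotation proportions
        samples[i, rng.choice(counts.shape[1], p=probs[i])] = 1
    return samples

# e.g. an item annotated 3-to-1 for class 0 gets class 0 with probability 0.75
print(sample_one_label_per_item([[3, 1], [0, 2]], seed=0))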
def make_random_split(input_file, field_name, calib_percent, overwrite=False, sampling='proportional'): """ Split a dataset into multiple overlapping datasets based on some metadata variable (such as year) The idea is to create subsets to test domain adaptation / covariate shift For each value of the variable, create three datasets: train = all those items that don't have that value (training data) calib = random subset of items that do have that value (calibration data) test = remaining items that do have that value (evaluation data) :param input_file: :param field_name: :param calib_percent: :param overwrite: :param sampling: :return: """ basedir = os.path.dirname(input_file) data = fh.read_json(input_file) field_vals = set([data[k][field_name] for k in data.keys()]) if sampling == 'proportional': for val in field_vals: print(val) train = { k: v for k, v in data.items() if data[k][field_name] != val } subset = { k: v for k, v in data.items() if data[k][field_name] == val } keys = list(subset.keys()) random.shuffle(keys) n_items = len(keys) print("Loaded %d items" % n_items) n_calib = int(n_items * calib_percent) calib = {k: data[k] for k in keys[:n_calib]} test = {k: data[k] for k in keys[n_calib:]} print( "Creating train, calibration, and test sets of sizes %d, %d and %d, respectively" % (len(train), len(calib), len(test))) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_train.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(train, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_calib.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(calib, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_test.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(test, output_file) else: keys = list(data.keys()) random.shuffle(keys) n_items = len(keys) print("Loaded %d items" % n_items) n_calib = int(n_items * calib_percent) calib = {k: data[k] for k in keys[:n_calib]} test = {k: data[k] for k in keys[n_calib:]} for val in field_vals: print(val) train = { k: v for k, v in data.items() if data[k][field_name] != val } calib_subset = { k: v for k, v in calib.items() if calib[k][field_name] == val } test_subset = { k: v for k, v in test.items() if test[k][field_name] == val } print( "Creating train, calibration, and test sets of sizes %d, %d and %d, respectively" % (len(train), len(calib_subset), len(test_subset))) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_train.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(train, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_calib.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(calib_subset, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_test.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(test_subset, output_file)
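# ---------------------------------------------------------------------------
# Standalone illustration (not repository code) of the 'proportional' split
# logic in make_random_split above: for one value of the metadata field,
# everything else becomes train, and the matching items are shuffled and cut
# into a calibration slice and a test slice. The toy dict and field name
# 'year' are hypothetical.

import random

def split_for_value(data, field_name, val, calib_percent, seed=0):
    train = {k: v for k, v in data.items() if v[field_name] != val}
    subset_keys = [k for k, v in data.items() if v[field_name] == val]
    random.Random(seed).shuffle(subset_keys)
    n_calib = int(len(subset_keys) * calib_percent)
    calib = {k: data[k] for k in subset_keys[:n_calib]}
    test = {k: data[k] for k in subset_keys[n_calib:]}
    return train, calib, test

if __name__ == '__main__':
    toy = {str(i): {'year': 2000 + (i % 3), 'text': 'doc %d' % i} for i in range(12)}
    train, calib, test = split_for_value(toy, 'year', 2001, calib_percent=0.33)
    print(len(train), len(calib), len(test))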
def extract_story_elements(): min_head_vocab = 5 min_role_vocab = 4 min_tuples = 3 ATTRIBUTE = 0 AGENT_ROLE = 1 PATIENT_ROLE = 2 SURFACE_FORM = 3 parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed') parsed_files = glob.glob(os.path.join(parsed_dir, '*.json')) dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json') dependencies = fh.read_json(dependencies_file) coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json') coref_heads = fh.read_json(coref_file) supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json')) heads = defaultdict(int) tokens = defaultdict(int) attributes = defaultdict(int) agent_roles = defaultdict(int) patient_roles = defaultdict(int) story_elements = {} print "Extracting story elements" for f_i, f in enumerate(parsed_files): sentences = fh.read_json(f) basename = fh.get_basename_wo_ext(f) element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename) story_elements[basename] = element_list for element in element_list: for h in element.head_words: heads[h] += 1 for t in element.attributes: attributes[t] += 1 for t in element.agent_roles: agent_roles[t] += 1 for t in element.patient_roles: patient_roles[t] += 1 print "Finding most common tokens" common_heads = [(v, k) for k, v in heads.items()] common_heads.sort() common_heads.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_heads.json') fh.write_to_json(common_heads, output_filename, sort_keys=False) """ common_tokens = [(v, k) for k, v in tokens.items()] common_tokens.sort() common_tokens.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json') fh.write_to_json(common_tokens, output_filename, sort_keys=False) """ common_attributes = [(v, k) for k, v in attributes.items()] common_attributes.sort() common_attributes.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json') fh.write_to_json(common_attributes, output_filename, sort_keys=False) common_agent_roles = [(v, k) for k, v in agent_roles.items()] common_agent_roles.sort() common_agent_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json') fh.write_to_json(common_agent_roles, output_filename, sort_keys=False) common_patient_roles = [(v, k) for k, v in patient_roles.items()] common_patient_roles.sort() common_patient_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json') fh.write_to_json(common_patient_roles, output_filename, sort_keys=False) print pronoun_list #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list} most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)} most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json') fh.write_to_json(most_common_attributes, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json') fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json') fh.write_to_json(most_common_patient_roles, output_filename, 
sort_keys=False) print len(most_common_attributes) print len(most_common_agent_roles) print len(most_common_patient_roles) print "Filtering tuples" valid_elements = defaultdict(list) for basename, element_list in story_elements.items(): for se in element_list: se.valid_heads = [h for h in se.head_words if h not in pronoun_list] se.valid_phrases = [h for h in se.phrases if h not in pronoun_list] if len(se.valid_heads) > 0: se.valid_attributes = [t for t in se.attributes if t in most_common_attributes] se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles] se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles] se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \ [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \ [(PATIENT_ROLE, t) for t in se.valid_patient_roles] #[(SURFACE_FORM, t) for t in se.valid_heads] if len(se.tuples) >= min_tuples: valid_elements[basename].append(se) print "Constructing vocabulary" n_tuples = 0 vocab = VocabWithCounts('', add_oov=False) n_entities = 0 for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for role, token in se.tuples] vocab.add_tokens(tokens) n_tuples += len(tokens) n_entities += 1 head_word_vocab = VocabWithCounts('', add_oov=False) for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for token in se.valid_heads] head_word_vocab.add_tokens(tokens) head_phrase_vocab = VocabWithCounts('', add_oov=False) for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for token in se.valid_phrases] head_phrase_vocab.add_tokens(tokens) print "Building indices" tuple_vocab = np.zeros(n_tuples, dtype=int) # vocab index of the ith word tuple_entity = np.zeros(n_tuples, dtype=int) tuple_role = [] entity_doc = np.zeros(n_entities, dtype=int) # topic of the ith word docs = valid_elements.keys() docs.sort() vocab_counts = np.zeros(len(vocab), dtype=int) article_mapping = [] entity_index = 0 head_word_vocab_list = [] head_word_entity_list = [] head_phrase_vocab_list = [] head_phrase_entity_list = [] t_i = 0 for d_i, d in enumerate(docs): element_list = valid_elements[d] for se in element_list: entity_doc[entity_index] = d_i for role, token in se.tuples: tuple_entity[t_i] = entity_index tuple_role.append(role) vocab_index = vocab.get_index(token) tuple_vocab[t_i] = vocab_index vocab_counts[vocab_index] += 1 t_i += 1 for token in se.valid_heads: head_word_vocab_index = head_word_vocab.get_index(token) head_word_vocab_list.append(head_word_vocab_index) head_word_entity_list.append(entity_index) for token in se.valid_phrases: head_phrase_vocab_index = head_phrase_vocab.get_index(token) head_phrase_vocab_list.append(head_phrase_vocab_index) head_phrase_entity_list.append(entity_index) article_mapping.append(str(entity_index) + ':' + d + ':' + ','.join(se.head_words) + ':' + ','.join(se.valid_attributes) + ':' + ','.join(se.valid_agent_roles) + ':' + ','.join(se.valid_patient_roles)) entity_index += 1 print len(docs), "valid documents" print entity_index, "entities" print t_i, "tuples" print len(vocab), "word types" print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts) output_filename = os.path.join(dirs.lda_dir, 'tuple_vocab.json') fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'tuple_role.json') fh.write_to_json(list(tuple_role), output_filename, sort_keys=False) output_filename = 
os.path.join(dirs.lda_dir, 'tuple_entity.json') fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'entity_doc.json') fh.write_to_json(list(entity_doc), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'vocab.json') fh.write_to_json(vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'docs.json') fh.write_to_json(list(docs), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'article_map.json') fh.write_to_json(list(article_mapping), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab.json') fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab.json') fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab_list.json') fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_entity_list.json') fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab_list.json') fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_entity_list.json') fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
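# ---------------------------------------------------------------------------
# Toy illustration (not repository code) of the flat index arrays built in
# extract_story_elements above: every (role, token) tuple gets a row in
# tuple_vocab / tuple_entity / tuple_role, every entity gets a row in
# entity_doc, and a plain token -> index dict stands in for VocabWithCounts.

import numpy as np

def build_indices(docs):
    """docs: list of documents; each document is a list of entities;
    each entity is a list of (role, token) tuples."""
    vocab = {}
    tuple_vocab, tuple_entity, tuple_role, entity_doc = [], [], [], []
    entity_index = 0
    for d_i, entities in enumerate(docs):
        for entity in entities:
            entity_doc.append(d_i)
            for role, token in entity:
                tuple_role.append(role)
                tuple_entity.append(entity_index)
                tuple_vocab.append(vocab.setdefault(token, len(vocab)))
            entity_index += 1
    return (np.array(tuple_vocab), np.array(tuple_entity),
            np.array(tuple_role), np.array(entity_doc), vocab)

if __name__ == '__main__':
    AGENT_ROLE, PATIENT_ROLE = 1, 2
    toy_docs = [[[(AGENT_ROLE, 'run'), (PATIENT_ROLE, 'arrest')]],
                [[(AGENT_ROLE, 'flee')], [(PATIENT_ROLE, 'arrest'), (AGENT_ROLE, 'run')]]]
    print(build_indices(toy_docs))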
def parse_xml_files(xml_filelist_filename, output_dir): filelist = fh.read_text(xml_filelist_filename) parsed_files = {} sentiments = {} dependencies = {} dependency_tuples = {} entities = {} coref = {} coref_entities = {} coref_heads = {} all_groups = {} jk_grams = {} amalgram_pairs = {} for file in filelist: file = file.rstrip('\n') print file # peel off both .txt and .xml basename = fh.get_basename_wo_ext(fh.get_basename_wo_ext(file)) sentences, doc_sentiments, doc_dependencies, doc_dependency_tuples, doc_entities, doc_coref, groups, _,\ doc_coref_entities, doc_coref_heads = parse_xml_output(file) parsed_files[basename] = sentences sentiments[basename] = doc_sentiments dependencies[basename] = doc_dependencies dependency_tuples[basename] = doc_dependency_tuples entities[basename] = doc_entities coref[basename] = doc_coref coref_entities[basename] = doc_coref_entities coref_heads[basename] = doc_coref_heads doc_jk_grams, doc_jk_indices = find_jk_grams(sentences) jk_grams[basename] = doc_jk_grams # output documents to amalgram format #amalgram_dir = os.path.join(dirs.data_amalgram_dir, 'input') #if not os.path.exists(amalgram_dir): # os.makedirs(amalgram_dir) tagged_sents = ['\n'.join([t['word'] + '\t' + t['POS'] for t in s]) + '\n' for s in sentences] # save word/tag pairs for amalgram tagged_sents = [[(t['word'], t['POS']) for t in s] for s in sentences] amalgram_pairs[basename] = tagged_sents # uncomment for extracting story elements... parsed_dir = os.path.join(output_dir, 'parsed') if not os.path.exists(parsed_dir): os.makedirs(parsed_dir) parsed_filename = os.path.join(parsed_dir, basename + '.json') fh.write_to_json(sentences, parsed_filename, sort_keys=False) sentiment_filename = fh.make_filename(output_dir, 'sentiments', 'json') fh.write_to_json(sentiments, sentiment_filename, sort_keys=False) dependencies_filename = fh.make_filename(output_dir, 'dependency_tuple_ids', 'json') fh.write_to_json(dependency_tuples, dependencies_filename, sort_keys=False) coref_filename = fh.make_filename(output_dir, 'entities', 'json') fh.write_to_json(coref, coref_filename, sort_keys=False) jkgrams_filename = fh.make_filename(output_dir, 'jkgrams', 'json') fh.write_to_json(jk_grams, jkgrams_filename, sort_keys=False) coref_heads_filename = fh.make_filename(output_dir, 'coref_heads', 'json') fh.write_to_json(coref_heads, coref_heads_filename, sort_keys=False) amalgram_keys = amalgram_pairs.keys() amalgram_keys.sort() amalgram_data_file = os.path.join(dirs.data_amalgram_dir, 'input.txt') with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file: for k in amalgram_keys: sents = amalgram_pairs[k] for s in sents: for p in s: output_file.write(p[0] + '\t' + p[1] + '\n') output_file.write('\n') for k in amalgram_keys: amalgram_data_file = os.path.join(dirs.data_amalgram_dir, k + '.txt') with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file: sents = amalgram_pairs[k] for s in sents: for p in s: output_file.write(p[0] + '\t' + p[1] + '\n') output_file.write('\n') amalgram_index_file = os.path.join(dirs.data_amalgram_dir, 'index.txt') with codecs.open(amalgram_index_file, 'w', encoding='utf-8') as output_file: for k in amalgram_keys: sents = amalgram_pairs[k] for s in sents: output_file.write(k + '\n') #all_groups_filename = fh.make_filename(output_dir, 'all_groups', 'json') #fh.write_to_json(all_groups, all_groups_filename) return parsed_files, dependencies
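# ---------------------------------------------------------------------------
# Sketch of the tab-separated format that parse_xml_files writes for the
# supersense tagger above: one "word<TAB>POS" line per token and a blank line
# between sentences. This helper and its toy sentence are illustrative only
# and are not verified against the AMALGraM distribution's exact expectations.

import codecs

def write_tagged_sentences(tagged_sents, path):
    """tagged_sents: list of sentences, each a list of (word, POS) pairs."""
    with codecs.open(path, 'w', encoding='utf-8') as output_file:
        for sentence in tagged_sents:
            for word, pos in sentence:
                output_file.write(word + '\t' + pos + '\n')
            output_file.write('\n')

if __name__ == '__main__':
    toy = [[('The', 'DT'), ('police', 'NN'), ('arrived', 'VBD'), ('.', '.')]]
    write_tagged_sentences(toy, 'toy_amalgram_input.txt')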
def parse_summary_to_files(parsed, dependencies, output_dir): words = {} lemmas = {} pos = {} ner = {} word_pos = {} lemma_pos = {} word_ner = {} lemma_ner = {} dependency_links = {} dependency_heads = {} dependency_tails = {} dependency_tuples = {} dependency_pairs = {} dicts = [words, lemmas, pos, ner, word_pos, lemma_pos, word_ner, lemma_ner] dicts2 = [dependency_links, dependency_heads, dependency_tails, dependency_tuples, dependency_pairs] jk_gram = [] last_tag = None last_ner_tag = None for key in parsed.keys(): # TODO: Actually want [a*]n+[pn+]* quote = None for d in dicts: d[key] = [] sentences = parsed[key] for s in sentences: for d in dicts: d[key].append([]) for token in s: words[key][-1].append(token['word']) lemmas[key][-1].append(token['lemma']) pos[key][-1].append(token['POS']) word_pos[key][-1].append(token['word'] + '_' + token['POS']) lemma_pos[key][-1].append(token['lemma'] + '_' + token['POS']) if token['NER'] != 'O': # if tag matches last tag, concatenate to old entires if token['NER'] == last_ner_tag and len(word_ner[key][-1]) > 0: word_ner[key][-1][-1] = '_'.join(word_ner[key][-1][-1].split('_')[:-1] + [token['word'], token['NER']]) lemma_ner[key][-1][-1] = '_'.join(lemma_ner[key][-1][-1].split('_')[:-1] + [token['lemma'], token['NER']]) #word_ner[key][-1].append(token['word'] + '_' + token['NER']) #lemma_ner[key][-1].append(token['lemma'] + '_' + token['NER']) else: ner[key][-1].append(token['NER']) word_ner[key][-1].append(token['word'] + '_' + token['NER']) lemma_ner[key][-1].append(token['lemma'] + '_' + token['NER']) last_ner_tag = token['NER'] # join the word and lemma lists into documents words[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in words[key]]) lemmas[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in lemmas[key]]) words_filename = fh.make_filename(output_dir, 'words', 'json') fh.write_to_json(words, words_filename, sort_keys=False) lemmas_filename = fh.make_filename(output_dir, 'lemmas', 'json') fh.write_to_json(lemmas, lemmas_filename, sort_keys=False) pos_filename = fh.make_filename(output_dir, 'pos', 'json') fh.write_to_json(pos, pos_filename, sort_keys=False) ner_filename = fh.make_filename(output_dir, 'ner', 'json') fh.write_to_json(ner, ner_filename, sort_keys=False) word_pos_filename = fh.make_filename(output_dir, 'word_pos', 'json') fh.write_to_json(word_pos, word_pos_filename, sort_keys=False) lemma_pos_filename = fh.make_filename(output_dir, 'lemma_pos', 'json') fh.write_to_json(lemma_pos, lemma_pos_filename, sort_keys=False) word_ner_filename = fh.make_filename(output_dir, 'word_ner', 'json') fh.write_to_json(word_ner, word_ner_filename, sort_keys=False) lemma_ner_filename = fh.make_filename(output_dir, 'lemma_ner', 'json') fh.write_to_json(lemma_ner, lemma_ner_filename, sort_keys=False) for key in dependencies.keys(): for d in dicts2: d[key] = [] sentences = dependencies[key] for s in sentences: for d in dicts2: d[key].append([]) for tuple in s: dependency_links[key][-1].append(tuple[1]) dependency_heads[key][-1].append(tuple[0] + '_' + tuple[1]) dependency_tails[key][-1].append(tuple[1] + '_' + tuple[2]) dependency_tuples[key][-1].append(tuple[0] + '_' + tuple[1] + '_' + tuple[2]) dependency_pairs[key][-1].append(tuple[0] + '_' + tuple[2]) dep_filename = fh.make_filename(output_dir, 'dependency_links', 'json') fh.write_to_json(dependency_links, dep_filename, sort_keys=False) dep_filename = fh.make_filename(output_dir, 'dependency_heads', 'json') fh.write_to_json(dependency_heads, dep_filename, 
sort_keys=False) dep_filename = fh.make_filename(output_dir, 'dependency_tails', 'json') fh.write_to_json(dependency_tails, dep_filename, sort_keys=False) dep_filename = fh.make_filename(output_dir, 'dependency_tuples', 'json') fh.write_to_json(dependency_tuples, dep_filename, sort_keys=False) dep_filename = fh.make_filename(output_dir, 'dependency_pairs', 'json') fh.write_to_json(dependency_pairs, dep_filename, sort_keys=False)
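# ---------------------------------------------------------------------------
# Simplified, standalone sketch of the NER merging rule in
# parse_summary_to_files: consecutive tokens with the same non-'O' tag are
# collapsed into a single "w1_w2_..._TAG" entry (here the run is reset by any
# intervening 'O' token). Toy tokens only; the field names mirror the
# Stanford CoreNLP output used above.

def merge_ner(tokens):
    merged = []
    last_tag = None
    for token in tokens:
        tag = token['NER']
        if tag != 'O':
            if tag == last_tag and merged:
                parts = merged[-1].split('_')
                merged[-1] = '_'.join(parts[:-1] + [token['word'], tag])
            else:
                merged.append(token['word'] + '_' + tag)
        last_tag = tag
    return merged

if __name__ == '__main__':
    toy = [{'word': 'Barack', 'NER': 'PERSON'}, {'word': 'Obama', 'NER': 'PERSON'},
           {'word': 'visited', 'NER': 'O'}, {'word': 'Ohio', 'NER': 'LOCATION'}]
    print(merge_ner(toy))   # ['Barack_Obama_PERSON', 'Ohio_LOCATION']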
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, train_prop=1.0, prefix=None, max_folds=None, min_val=None, max_val=None, model_type='LR', loss='log', do_ensemble=False, dh=0, label='label', penalty='l1', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1', seed=None, use_calib_pred=False, exclude_calib=False, alpha_min=0.01, alpha_max=1000, sample_labels=False): model_basename = subset + '_' + field_name if prefix is not None: model_basename = prefix + '_' + model_basename # save the experiment parameters to a log file logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': field_name, 'config_file': config_file, 'calib_prop': calib_prop, 'train_prop': train_prop, 'prefix': prefix, 'max_folds': max_folds, 'model_type': model_type, 'loss': loss, 'dh': dh, 'alpha_min': alpha_min, 'alpha_max': alpha_max, 'do_ensemble': do_ensemble, 'label': label, 'penalty': penalty, 'cshift': cshift, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'pos_label': pos_label, 'average': average, 'use_calib_pred': use_calib_pred, 'exclude_calib': exclude_calib } fh.write_to_json(log, logfile) # load the features specified in the config file config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) # load the file that contains metadata about each item metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv') metadata = fh.read_csv_to_df(metadata_file) field_vals = list(set(metadata[field_name].values)) field_vals.sort() print(field_vals) # exclude certain values of the partition if desired if min_val is not None: field_vals = [v for v in field_vals if v >= float(min_val)] if max_val is not None: field_vals = [v for v in field_vals if v <= float(max_val)] if max_folds is None: max_folds = len(field_vals) # repeat the following value for each fold of the partition of interest (up to max_folds, if given) for v_i, v in enumerate(field_vals[:max_folds]): print("\nTesting on %s" % v) # first, split into training and non-train data based on the field of interest train_selector = metadata[field_name] != v train_subset = metadata[train_selector] train_items = list(train_subset.index) n_train = len(train_items) non_train_selector = metadata[field_name] == v non_train_subset = metadata[non_train_selector] non_train_items = non_train_subset.index.tolist() n_non_train = len(non_train_items) print("Train: %d, non-train: %d" % (n_train, n_non_train)) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape train_labels = labels_df.loc[train_items] # if desired, attempt to learn weights for the training data using techniques for covariate shift if cshift is not None: print("Training a classifier for covariate shift") # start by learning to discriminate train from non-train data train_test_labels = np.zeros((n_items, 2), dtype=int) train_test_labels[train_selector, 0] = 1 train_test_labels[non_train_selector, 1] = 1 train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1]) # create a cshift model using the same specifiction as our model below (e.g. LR/MLP, etc.) 
model_name = model_basename + '_' + str(v) + '_' + 'cshift' model, dev_f1, dev_acc, dev_cal, _, _ = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=False) print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc)) # take predictions from model on the training data train_test_pred_df, train_test_probs_df = predict.predict(project_dir, model, model_name, subset, label, verbose=verbose) # display the min and max probs print("Min: %0.4f" % train_test_probs_df[1].min()) print("Max: %0.4f" % train_test_probs_df[1].max()) # use the estimated probability of each item being a training item to compute item weights weights = n_train / float(n_non_train) * (1.0/train_test_probs_df[0].values - 1) # print a summary of the weights from just the training items print("Min weight: %0.4f" % weights[train_selector].min()) print("Ave weight: %0.4f" % weights[train_selector].mean()) print("Max weight: %0.4f" % weights[train_selector].max()) # print a summary of all weights print("Min weight: %0.4f" % weights.min()) print("Ave weight: %0.4f" % weights.mean()) print("Max weight: %0.4f" % weights.max()) # create a data frame with this information weights_df = pd.DataFrame(weights, index=labels_df.index) else: weights_df = None # repeat the following process multiple times with different random splits of train / calibration / test data for r in range(repeats): # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items if train_prop < 1.0: np.random.shuffle(train_items) train_items_r = np.random.choice(train_items, size=int(n_train * train_prop), replace=False) n_train_r = len(train_items_r) # create a data frame to hold a summary of the results output_df = pd.DataFrame([], columns=['N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test']) # create a unique name ofr this model model_name = model_basename + '_' + str(v) + '_' + str(r) # now, divide the non-train data into a calibration and a test set n_calib = int(calib_prop * n_non_train) np.random.shuffle(non_train_items) calib_items = non_train_items[:n_calib] test_items = non_train_items[n_calib:] n_test = len(test_items) print("%d %d %d" % (n_train_r, n_calib, n_test)) test_labels_df = labels_df.loc[test_items] non_train_labels_df = labels_df.loc[non_train_items] # if instructed, sample labels in proportion to annotations (to simulate having one label per item) if sample_labels: print("Sampling labels") # normalize the labels temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float) samples = np.zeros([n_items, n_classes], dtype=int) for i in range(n_items): index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :]) samples[i, index] = 1 sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns) else: sampled_labels_df = labels_df train_labels_r_df = sampled_labels_df.loc[train_items_r].copy() calib_labels_df = sampled_labels_df.loc[calib_items].copy() # get the true proportion of labels in the test OR non-training data (calibration and test combined) if exclude_calib: test_props, test_estimate, test_std = get_estimate_and_std(test_labels_df) else: test_props, test_estimate, test_std = get_estimate_and_std(non_train_labels_df) output_df.loc['test'] = [n_test, test_estimate, 0, 
test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1] # get the same estimate from training data train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df) # compute the error of this estimate train_rmse = np.sqrt((train_estimate - test_estimate)**2) train_contains_test = test_estimate > train_estimate - 2 * train_std and test_estimate < train_estimate + 2 * train_std output_df.loc['train'] = [n_train_r, train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test] # repeat for calibration data calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df) calib_rmse = np.sqrt((calib_estimate - test_estimate)**2) # check if the test estimate is within 2 standard deviations of the calibration estimate calib_contains_test = test_estimate > calib_estimate - 2 * calib_std and test_estimate < calib_estimate + 2 * calib_std output_df.loc['calibration'] = [n_calib, calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test] results_df = pd.DataFrame([], columns=['f1', 'acc', 'cal']) print("Training model on all labeled data") # first train a model on the training and calibration data combined calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r)) model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose) results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal] # get labels for test data test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) results_df.loc['test_all'] = [f1_test, acc_test, 0.0] # combine the predictions on the test and calibration data (unless excluding calibration data from this) if exclude_calib: test_predictions = test_predictions_df.values test_pred_probs = test_pred_probs_df.values else: # get labels for calibration data if use_calib_pred: calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) else: calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index) # normalize labels to get (questionable) estimates of probabilities calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index) test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values] test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values]) # get the basic error estimates for this model cc_estimate = np.mean(test_predictions) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[:, 1]) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) output_df.loc['CC_all'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_all']
= [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan] # Now repeat for a model trained on the training data, saving the calibration data for calibration print("Training model on training data only") model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose) results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal] # predict on calibration data calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df, calib_pred_probs_df, pos_label=pos_label, average=average, verbose=False) results_df.loc['calibration'] = [f1_cal, acc_cal, calib_rmse] # predict on test data test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) results_df.loc['test'] = [f1_test, acc_test, 0.0] results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv')) # combine the predictions on the test and calibration data (unless excluding calibration data from this) if exclude_calib: test_predictions = test_predictions_df.values test_pred_probs = test_pred_probs_df.values else: if not use_calib_pred: calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index) # normalize labels to get (questionable) estimates of probabilities calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index) test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values] test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values]) # now evaluate in terms of predicted proportions # average the predictions (assuming binary labels) cc_estimate = np.mean(test_predictions) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[:, 1]) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) pcc_calib_estimate = np.mean(calib_pred_probs_df.values[:, 1]) pcc_calib_rmse = np.sqrt((pcc_calib_estimate - calib_estimate)**2) output_df.loc['PCC_cal'] = [n_calib, pcc_calib_estimate, pcc_calib_rmse, np.nan, np.nan, np.nan] output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan] # expand the data so as to only have singly-labeled, weighted items _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values) # do some sort of calibration here (ACC, PACC, PVC) print("ACC correction") #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values) acc = calibration.compute_acc(calib_labels, 
calib_predictions, n_classes, weights=calib_weights) acc_corrected = calibration.apply_acc_binary(test_predictions, acc) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2) output_df.loc['ACC'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan] print("ACC internal") acc_corrected = calibration.apply_acc_binary(test_predictions, acc_cfm) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2) output_df.loc['ACC_int'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan] print("PVC correction") pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights) pvc_corrected = calibration.apply_pvc(test_predictions, pvc) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2) output_df.loc['PVC'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan] print("PVC internal") pvc_corrected = calibration.apply_pvc(test_predictions, pvc_cfm) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2) output_df.loc['PVC_int'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan] print("Venn") test_pred_ranges, calib_pred_ranges = ivap.estimate_probs_from_labels(project_dir, model, model_name, subset, subset, sampled_labels_df, calib_items, test_items, weights_df=None) if not exclude_calib: test_pred_ranges = np.vstack([test_pred_ranges, calib_pred_ranges]) combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1]) pred_range = np.mean(test_pred_ranges, axis=0) venn_estimate = np.mean(combo) venn_rmse = np.sqrt((venn_estimate - test_estimate)**2) venn_contains_test = pred_range[0] < test_estimate < pred_range[1] output_df.loc['Venn'] = [n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test] output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv') output_df.to_csv(output_filename)
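# ---------------------------------------------------------------------------
# Standalone sketch of the "adjusted classify and count" (ACC) correction that
# the ACC rows above report. The repo's calibration module is not reproduced
# here, so treat this as the textbook binary version of the correction rather
# than its exact implementation: estimate tpr and fpr on calibration data,
# then invert cc = tpr * p + fpr * (1 - p) for the positive proportion p,
# clipping to [0, 1]. The arrays in __main__ are toy data.

import numpy as np

def acc_correct(calib_true, calib_pred, test_pred):
    calib_true = np.asarray(calib_true)
    calib_pred = np.asarray(calib_pred)
    tpr = np.mean(calib_pred[calib_true == 1])    # P(pred = 1 | true = 1)
    fpr = np.mean(calib_pred[calib_true == 0])    # P(pred = 1 | true = 0)
    cc = np.mean(test_pred)                       # raw classify-and-count
    if tpr == fpr:                                # degenerate classifier
        return float(cc)
    return float(np.clip((cc - fpr) / (tpr - fpr), 0.0, 1.0))

if __name__ == '__main__':
    calib_true = [1, 1, 1, 0, 0, 0, 0, 0]
    calib_pred = [1, 1, 0, 0, 0, 1, 0, 0]
    test_pred = [1, 0, 1, 0, 0, 0, 1, 0, 0, 0]
    print(acc_correct(calib_true, calib_pred, test_pred))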
def cross_train_and_eval(project_dir, subset, config_file, n_train=500, suffix='', model_type='LR', loss='log', do_ensemble=True, dh=100, label='label', penalty='l1', intercept=True, n_dev_folds=5, repeats=1, verbose=False, average='micro', objective='f1', seed=None, alpha_min=0.01, alpha_max=1000.0, sample_labels=False, run_all=False): field_name = 'nosplit' model_basename = subset + '_' + label + '_' + field_name + '_' + model_type + '_' + penalty if model_type == 'MLP': model_basename += '_' + str(dh) model_basename += '_' + str(n_train) + '_' + objective if sample_labels: model_basename += '_sampled' model_basename += suffix # save the experiment parameters to a log file logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': 'nosplit', 'config_file': config_file, 'n_train': n_train, 'suffix': suffix, 'model_type': model_type, 'loss': loss, 'dh': dh, 'alpha_min': alpha_min, 'alpha_max': alpha_max, 'do_ensemble': do_ensemble, 'label': label, 'penalty': penalty, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'average': average, #'use_calib_pred': use_calib_pred, #'exclude_calib': exclude_calib, 'sample_labels': sample_labels } fh.write_to_json(log, logfile) # load the features specified in the config file config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape weights_df = None # eliminate items with no labels print("Subsetting items with labels") label_sums_df = labels_df.sum(axis=1) labeled_item_selector = label_sums_df > 0 labels_df = labels_df[labeled_item_selector] n_items, n_classes = labels_df.shape labeled_items = list(set(labels_df.index)) print("Starting repeats") # repeat the following process multiple times with different random splits of train / calibration / test data for r in range(repeats): print("* Repetition %d *" % r) # take a random subset of the training data np.random.shuffle(labeled_items) train_items = labeled_items[:n_train] test_items = labeled_items[n_train:] n_test = len(test_items) n_calib = 0 # create a data frame to hold a summary of the results output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test']) # create a unique name ofr this model model_name = model_basename + '_' + 'nosplit' + '_' + str(r) print("Train: %d, calibration: %d, test: %d" % (n_train, n_calib, n_test)) test_labels_df = labels_df.loc[test_items] # if instructed, sample labels in proportion to annotations (to simulate having one label per item) if sample_labels: print("Sampling labels") # normalize the labels temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float) samples = np.zeros([n_items, n_classes], dtype=int) for i in range(n_items): index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :]) samples[i, index] = 1 sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns) else: sampled_labels_df = labels_df train_labels_df = sampled_labels_df.loc[train_items].copy() # get the true proportion of labels in the test OR non-training data (calibration and test combined) 
target_props, target_estimate, target_std = combo.get_estimate_and_std(labels_df) output_df.loc['target'] = [n_test, 'n/a', 'all', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan] # get the same estimate from training data train_props, train_estimate, train_std = combo.get_estimate_and_std(train_labels_df) # compute the error of this estimate train_rmse = np.sqrt((train_estimate - target_estimate)**2) train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std output_df.loc['train'] = [n_train, 'train', 'train', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test] # do a test using the number of annotations rather than the number of items train_props2, train_estimate2, train_std2 = combo.get_estimate_and_std(train_labels_df, use_n_annotations=True) # compute the error of this estimate train_rmse2 = np.sqrt((train_estimate2 - target_estimate)**2) train_contains_test2 = target_estimate > train_estimate2 - 2 * train_std2 and target_estimate < train_estimate2 + 2 * train_std2 output_df.loc['train_n_annotations'] = [n_train, 'train', 'train', 'n/a', train_estimate2, train_rmse2, train_estimate2 - 2 * train_std2, train_estimate2 + 2 * train_std2, train_contains_test2] print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate)) if train_estimate > 0.5: pos_label = 0 else: pos_label = 1 print("Using %d as the positive label" % pos_label) results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall']) # Now train a model on the training data, saving the calibration data for calibration print("Training model on training data only") model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose) results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall] # predict on test data test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1) test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix()) test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1) results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall] test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions # predict on calibration and test data combined all_predictions_df, all_pred_probs_df, all_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=labeled_items, verbose=verbose) all_cc_estimate, all_pcc_estimate, all_acc_estimate_internal, all_pvc_estimate_internal = all_pred_proportions cc_rmse = 
np.sqrt((all_cc_estimate[1] - target_estimate)**2) pcc_rmse = np.sqrt((all_pcc_estimate[1] - target_estimate)**2) output_df.loc['CC_all'] = [n_items, 'train', 'all', 'predicted', all_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_all'] = [n_items, 'train', 'all', 'predicted', all_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan] averaged_cc_estimate = (test_cc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train) averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train) averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2) averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2) output_df.loc['CC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan] all_acc_rmse_internal = np.sqrt((all_acc_estimate_internal[1] - target_estimate) ** 2) all_pvc_rmse_internal = np.sqrt((all_pvc_estimate_internal[1] - target_estimate) ** 2) output_df.loc['ACC_internal'] = [n_items, 'train', 'all', 'predicted', all_acc_estimate_internal[1], all_acc_rmse_internal, np.nan, np.nan, np.nan] output_df.loc['PVC_internal'] = [n_items, 'train', 'all', 'predicted', all_pvc_estimate_internal[1], all_pvc_rmse_internal, np.nan, np.nan, np.nan] print("Venn internal all") all_pred_ranges_internal, all_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, labeled_items, plot=False) pred_range = np.mean(all_pred_ranges_internal, axis=0) venn_estimate = np.mean(all_preds_internal) venn_rmse = np.sqrt((venn_estimate - target_estimate)**2) venn_contains_test = pred_range[0] < target_estimate < pred_range[1] output_df.loc['Venn_internal'] = [n_items, 'train', 'all', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test] print("Venn internal test") test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items) pred_range = np.mean(test_pred_ranges_internal, axis=0) venn_estimate = (np.mean(test_preds_internal) * n_test + train_estimate * n_train) / float(n_test + n_train) venn_rmse = np.sqrt((venn_estimate - target_estimate)**2) averaged_lower = (pred_range[0] * n_test + (train_estimate - 2 * train_std) * n_train) / float(n_test + n_train) averaged_upper = (pred_range[1] * n_test + (train_estimate + 2 * train_std) * n_train) / float(n_test + n_train) venn_contains_test = averaged_lower < target_estimate < averaged_upper output_df.loc['Venn_internal_averaged'] = [n_items, 'train', 'all', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test] results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv')) output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
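# ---------------------------------------------------------------------------
# Sketch of the count-weighted averaging used for the '*_averaged' rows above:
# the model's estimate on the test items is blended with the known label
# proportion of the training items, weighted by how many items each part
# contributes. The numbers in __main__ are hypothetical.

def weighted_average_estimate(test_estimate, n_test, train_estimate, n_train):
    return (test_estimate * n_test + train_estimate * n_train) / float(n_test + n_train)

if __name__ == '__main__':
    # 500 labeled training items at 42% positive, 1500 test items for which
    # the model predicts 35% positive
    print(weighted_average_estimate(0.35, 1500, 0.42, 500))   # -> 0.3675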
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, nontest_prop=1.0, prefix=None, max_folds=None, model_type='LR', label='label', penalty='l2', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1'): model_basename = subset + '_' + field_name if prefix is not None: model_basename = prefix + '_' + model_basename logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': field_name, 'config_file': config_file, 'calib_prop': calib_prop, 'train_prop': nontest_prop, 'prefix': prefix, 'max_folds': max_folds, 'model_type': model_type, 'label': label, 'penalty': penalty, 'cshift': cshift, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'pos_label': pos_label, 'average': average } fh.write_to_json(log, logfile) config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) weights_file = None metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv') metadata = fh.read_csv_to_df(metadata_file) field_vals = list(set(metadata[field_name].values)) field_vals.sort() print(field_vals) if max_folds is None: max_folds = len(field_vals) for v_i, v in enumerate(field_vals[:max_folds]): print("\nTesting on %s" % v) nontest_selector = metadata[field_name] != v nontest_subset = metadata[nontest_selector] nontest_items = list(nontest_subset.index) n_nontest = len(nontest_items) test_selector = metadata[field_name] == v test_subset = metadata[test_selector] test_items = test_subset.index.tolist() n_test = len(test_items) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape # subsample the non-test items if desired if nontest_prop < 1.0: np.random.shuffle(nontest_items) nontest_items = np.random.choice(nontest_items, size=int(n_nontest * nontest_prop), replace=False) n_nontest = len(nontest_items) nontest_labels = labels_df.loc[nontest_items] if cshift is not None: print("Training a classifier for covariate shift") # start by learning to discriminate test from non-test data train_test_labels = np.zeros((n_items, 2), dtype=int) train_test_labels[nontest_selector, 0] = 1 train_test_labels[test_selector, 1] = 1 train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1]) model_name = model_basename + '_' + str(v) + '_' + 'cshift' model, dev_f1, dev_cal, _, _ = train.train_model_with_labels( project_dir, model_type, model_name, subset, train_test_labels_df, feature_defs, penalty=penalty, intercept=intercept, n_dev_folds=n_dev_folds, verbose=False) train_test_pred_df, train_test_probs_df = predict.predict( project_dir, model, model_name, subset, label, verbose=verbose) print("Min: %0.4f" % train_test_probs_df[1].min()) print("Max: %0.4f" % train_test_probs_df[1].max()) # base the weights on the probability of each item being a training item weights = n_nontest / float(n_test) * ( 1.0 / train_test_probs_df[0].values - 1) print("Min weight: %0.4f" % weights[nontest_selector].min()) print("Ave weight: %0.4f" % weights[nontest_selector].mean()) print("Max weight: %0.4f" % weights[nontest_selector].max()) print("Min weight: %0.4f" % weights.min()) print("Ave weight: %0.4f" % 
weights.mean()) print("Max weight: %0.4f" % weights.max()) weights_df = pd.DataFrame(weights, index=labels_df.index) else: weights_df = None # repeat the following process multiple times with different random splits of calibration / test data for r in range(repeats): output_df = pd.DataFrame([], columns=[ 'N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test' ]) model_name = model_basename + '_' + str(v) + '_' + str(r) # split the non-test items into train and calibration n_calib = int(n_nontest * calib_prop) np.random.shuffle(nontest_items) calib_items = nontest_items[:n_calib] train_items = nontest_items[n_calib:] train_labels = labels_df.loc[train_items] calib_labels = labels_df.loc[calib_items] test_labels = labels_df.loc[test_items] # get the label proportions from the test and non-test data test_props, test_estimate, test_std = get_estimate_and_std( test_labels) output_df.loc['test'] = [ n_test, test_estimate, 0, test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1 ] nontest_props, nontest_estimate, nontest_std = get_estimate_and_std( nontest_labels) nontest_rmse = np.sqrt((nontest_estimate - test_estimate)**2) nontest_contains_test = test_estimate > nontest_estimate - 2 * nontest_std and test_estimate < nontest_estimate + 2 * nontest_std output_df.loc['nontest'] = [ n_nontest, nontest_estimate, nontest_rmse, nontest_estimate - 2 * nontest_std, nontest_estimate + 2 * nontest_std, nontest_contains_test ] # train a model print("Doing training") model, dev_f1, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels( project_dir, model_type, model_name, subset, labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, verbose=verbose) # predict on the calibration and test sets print("Doing prediction on calibration items") calib_predictions, calib_pred_probs = predict.predict( project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) print("Doing prediction on test items") test_predictions, test_pred_probs = predict.predict( project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) # evaluate the model on the calibration and test data print("Doing evaluation") f1_cal, acc_cal = evaluate_predictions.evaluate_predictions( calib_labels, calib_predictions, pos_label=pos_label, average=average) f1_test, acc_test = evaluate_predictions.evaluate_predictions( test_labels, test_predictions, pos_label=pos_label, average=average) results_df = pd.DataFrame([], columns=['f1', 'acc']) results_df.loc['calibration'] = [f1_cal, acc_cal] results_df.loc['test'] = [f1_test, acc_test] results_df.to_csv( os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv')) # first check results without any correction # average the preditions (assuming binary labels) cc_estimate = np.mean(test_predictions[label].values) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[1].values) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, 0, 1, np.nan] output_df.loc['PCC'] = [ n_test, pcc_estimate, pcc_rmse, 0, 1, np.nan ] # do the two basic corrections, based on the calibration data print("ACC internal") calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels( calib_labels.values, calib_predictions.values) acc = 
calibration.compute_acc(calib_labels_expanded, calib_predictions_expanded, n_classes, calib_weights_expanded) acc_corrected = calibration.apply_acc_binary( test_predictions.values, acc) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate)**2) output_df.loc['ACC_int'] = [ n_calib, acc_estimate, acc_rmse, 0, 1, np.nan ] print("PVC internal") pvc = calibration.compute_pvc(calib_labels_expanded, calib_predictions_expanded, n_classes, weights=calib_weights_expanded) pvc_corrected = calibration.apply_pvc(test_predictions.values, pvc) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate)**2) output_df.loc['PVC_int'] = [ n_calib, pvc_estimate, pvc_rmse, 0, 1, np.nan ] # do IVAP for calibration print("Venn") test_pred_ranges = ivap.estimate_probs_from_labels( project_dir, model, model_name, subset, subset, labels_df, calib_items, test_items, weights_df=weights_df) combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1]) pred_range = np.mean(test_pred_ranges, axis=0) venn_estimate = np.mean(combo) venn_rmse = np.sqrt((venn_estimate - test_estimate)**2) venn_contains_test = pred_range[0] < test_estimate < pred_range[1] output_df.loc['Venn'] = [ n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test ] output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv') output_df.to_csv(output_filename)
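# ---------------------------------------------------------------------------
# Sketch of how the Venn rows above turn the per-item (p0, p1) probability
# ranges returned by the IVAP step into a single proportion estimate: each
# pair is merged with p1 / (1 - p0 + p1), the merged values are averaged for a
# point estimate, and the averaged (p0, p1) bounds give the reported interval.
# The toy ranges below are hypothetical.

import numpy as np

def venn_proportion(pred_ranges):
    """pred_ranges: array of shape (n_items, 2) with columns [p0, p1]."""
    pred_ranges = np.asarray(pred_ranges, dtype=float)
    combo = pred_ranges[:, 1] / (1.0 - pred_ranges[:, 0] + pred_ranges[:, 1])
    lower, upper = np.mean(pred_ranges, axis=0)
    return float(np.mean(combo)), (float(lower), float(upper))

if __name__ == '__main__':
    toy_ranges = np.array([[0.2, 0.3], [0.6, 0.8], [0.1, 0.15]])
    print(venn_proportion(toy_ranges))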
def identify_rnn_targets(output_data_filename):
    min_head_vocab = 5
    min_role_vocab = 4
    min_tuples = 3

    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    tokens = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print "Extracting story elements"
    for f_i, f in enumerate(parsed_files):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        print f
        element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename)
        story_elements[basename] = element_list
        # count occurrences of heads, attributes, and roles across all articles
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    """
    common_tokens = [(v, k) for k, v in tokens.items()]
    common_tokens.sort()
    common_tokens.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json')
    fh.write_to_json(common_tokens, output_filename, sort_keys=False)
    """

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    print pronoun_list
    #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list}
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            # need at least one head word that is not a pronoun
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]
                            #[(SURFACE_FORM, t) for t in se.valid_heads]
                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    output_data = []
    for basename, element_list in valid_elements.items():
        used_sentences = set()
        for se in element_list:
            for i in range(len(se.head_indices)):
                assert se.head_indices[i] < len(se.sentences[i].split())
                if se.head_words[i] not in pronoun_list:
                    if se.sentences[i] not in used_sentences:
                        output_data.append((se.head_indices[i], se.sentences[i], basename))
                        # THIS IS TRYING SOMETHING NEW...
                        used_sentences.add(se.sentences[i])

    with codecs.open(output_data_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, indent=2, sort_keys=False)
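# ---------------------------------------------------------------------------
# Hypothetical usage sketch for identify_rnn_targets(): the function writes a
# JSON list of (head_index, sentence, basename) triples, where head_index
# points into the whitespace-split sentence (guaranteed by the assert above).
# A downstream step could recover each target head word like this; the helper
# name and file path are illustrative, not part of the project.
# ---------------------------------------------------------------------------
import codecs
import json


def load_rnn_targets(targets_filename):
    """Read the (head_index, sentence, basename) triples written above and
    yield the head word together with its sentence and source article."""
    with codecs.open(targets_filename, 'r', encoding='utf-8') as input_file:
        triples = json.load(input_file)
    for head_index, sentence, basename in triples:
        tokens = sentence.split()
        yield tokens[head_index], sentence, basename


# Example (path is hypothetical):
# for head, sentence, article in load_rnn_targets('rnn_targets.json'):
#     print('%s\t%s' % (article, head))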
def get_bamman_entities(all_trees, clustered_entity_indices, word2vec_file=None, min_role_vocab=4, min_tuples=3):
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    tokens = defaultdict(int)
    heads = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    for basename, trees in all_trees.items():
        story_elements[basename] = []
        article_clusters = clustered_entity_indices[basename]
        # go through each entity, represented by a list of tree/node locations
        for c_i, cluster_indices in enumerate(article_clusters):
            # create an entity for each cluster in this document
            entity = BammanEntity(basename)
            # for each appearance, create an appearance object for this entity
            for t_i, n_i in cluster_indices:
                word = trees[t_i].node_dict[n_i].word
                compound_word = get_compound_noun(trees[t_i], n_i)
                mention_attributes = get_attributes(trees[t_i], n_i)
                mention_agent_roles = get_agent_roles(trees[t_i], n_i)
                mention_patient_roles = get_patient_roles(trees[t_i], n_i)
                appearance = BammanEntityAppearance(t_i, n_i, word, mention_attributes, mention_agent_roles, mention_patient_roles, compound_word)
                entity.add_appearance(appearance)
                # count the total mentions of these words to build vocabularies
                heads[word] += 1
                for t in mention_attributes:
                    attributes[t[0]] += 1
                for t in mention_agent_roles:
                    agent_roles[t[0]] += 1
                for t in mention_patient_roles:
                    patient_roles[t[0]] += 1
            # add the newly created entity to a dict
            story_elements[basename].append(entity)

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    # filter vocabularies based on frequency and stopwords
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    # save these vocabularies
    output_filename = os.path.join(dirs.persona_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    filtered_indices = {}
    valid_elements = defaultdict(list)
    for basename, entity_list in story_elements.items():
        #filtered_indices[basename] = []
        for e_index, entity in enumerate(entity_list):
            appearances = entity.get_appearances()
            valid_heads = []
            for ap in appearances:
                if ap.head_word not in pronoun_list:
                    valid_heads.append(ap.head_word)
                    ap.valid_heads = [ap.head_word]
                    ap.valid_compound_heads = [ap.compound_word]
                else:
                    ap.valid_heads = []
                    ap.valid_compound_heads = []
            if len(valid_heads) > 0:
                for ap in appearances:
                    ap.valid_attributes = [t for t in ap.attributes if t[0] in most_common_attributes]
                    ap.valid_agent_roles = [t for t in ap.agent_roles if t[0] in most_common_agent_roles]
                    ap.valid_patient_roles = [t for t in ap.patient_roles if t[0] in most_common_patient_roles]
                    ap.tuples = [(ATTRIBUTE, t[0], t[1], t[2], t[3]) for t in ap.valid_attributes] + \
                                [(AGENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_agent_roles] + \
                                [(PATIENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_patient_roles]
                if entity.get_n_tuples() >= min_tuples:
                    valid_elements[basename].append(entity)
                    #filtered_indices[basename].append(clustered_entity_indices[basename][se_index])

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    n_mentions = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for role, token, relation, pos, tuple_token_index in appearance.tuples]
                vocab.add_tokens(tokens)
                n_tuples += len(tokens)
                if len(appearance.tuples) > 0:
                    n_mentions += 1
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_heads]
                head_word_vocab.add_tokens(tokens)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_compound_heads]
                head_phrase_vocab.add_tokens(tokens)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)   # vocab index of the ith word
    tuple_entity = np.zeros(n_tuples, dtype=int)
    tuple_role = []
    mention_entity = np.zeros(n_mentions, dtype=int)
    tuple_mention = np.zeros(n_tuples, dtype=int)
    entity_doc = np.zeros(n_entities, dtype=int)  # document index of the ith entity

    docs = valid_elements.keys()
    docs.sort()

    """
    vocab_vectors = None
    if word2vec_file is not None:
        import gensim
        dx = 300
        vocab_vectors = np.zeros((len(vocab), dx))
        # load pre-trained word vectors
        print "Loading pre-trained word vectors"
        all_vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True)
        word2vec_vocab = set()
        for v in vocab.get_all_tokens():
            v_i = vocab.get_index(v)
            if v in all_vectors:
                vocab_vectors[v_i, :] = all_vectors[v]
                word2vec_vocab.add(v)
            else:
                vocab_vectors[v_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))
        print len(list(set(vocab.get_all_tokens()) - word2vec_vocab)), "words in training vocabulary with no word2vec vector"
    """

    vocab_counts = np.zeros(len(vocab), dtype=int)
    entity_appearances = {}

    entity_index = 0
    mention_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    entity_text_mentions = {}
    t_i = 0
    for d_i, d in enumerate(docs):
        print d
        basename = os.path.basename(d)
        entity_appearances[basename] = {}
        element_list = valid_elements[d]
        entity_text_mentions[d] = {}
        for se in element_list:
            entity_text_mentions[d][entity_index] = {'sent_indices': [], 'token_indices': [], 'roles': []}
            entity_doc[entity_index] = d_i
            for appearance in se.appearances:
                entity_text_mentions[d][entity_index]['sent_indices'].append(appearance.tree_index)
                entity_text_mentions[d][entity_index]['token_indices'].append(appearance.token_index)
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_entity[t_i] = entity_index
                    tuple_mention[t_i] = mention_index
                    tuple_role.append(role)
                    vocab_index = vocab.get_index(token)
                    tuple_vocab[t_i] = vocab_index
                    vocab_counts[vocab_index] += 1
                    t_i += 1
                    entity_text_mentions[d][entity_index]['roles'].append((role, token, appearance.tree_index, tuple_token_index))
                for token in appearance.valid_heads:
                    head_word_vocab_index = head_word_vocab.get_index(token)
                    head_word_vocab_list.append(head_word_vocab_index)
                    head_word_entity_list.append(entity_index)
                for token in appearance.valid_compound_heads:
                    head_phrase_vocab_index = head_phrase_vocab.get_index(token)
                    head_phrase_vocab_list.append(head_phrase_vocab_index)
                    head_phrase_entity_list.append(entity_index)
                # keep track of which document / sentences this entity appears in
                s_i = appearance.tree_index
                if s_i in entity_appearances[basename]:
                    entity_appearances[basename][s_i].append(entity_index)
                else:
                    entity_appearances[basename][s_i] = [entity_index]
                if len(appearance.tuples):
                    mention_entity[mention_index] = entity_index
                    mention_index += 1
            entity_index += 1

    # as initial testing for Gaussian LDA, export a small vector for each tuple
    """
    tuple_vectors = None
    if word2vec_file is not None:
        vec_size = 10
        tuple_vectors = np.zeros([n_tuples, vec_size])
        for v_i, v in enumerate(tuple_vocab):
            tuple_vectors[v_i, :] = vocab_vectors[v, :vec_size]
    """

    # export network data
    rnn_data = []
    t_i = 0
    entity_index = 0
    mention_index = 0
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for entity in element_list:
            appearance_list = []
            for appearance in entity.appearances:
                tuple_list = []
                head_word = appearance.head_word
                head_phrase = appearance.compound_word
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_list.append((t_i, role, token, relation, head_word, pos, head_phrase))
                    t_i += 1
                if len(tuple_list) > 0:
                    appearance_list.append(tuple_list)
            rnn_data.append([d_i, entity_index, appearance_list])
            entity_index += 1
    output_filename = os.path.join(dirs.persona_dir, 'rnn_data.json')
    fh.write_to_json(rnn_data, output_filename, sort_keys=False)

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

    output_filename = os.path.join(dirs.persona_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'tuple_mention.json')
    fh.write_to_json(list(tuple_mention), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'mention_entity.json')
    fh.write_to_json(list(mention_entity), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)
    #output_filename = os.path.join(dirs.persona_dir, 'article_map.json')
    #fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_appearances.json')
    fh.write_to_json(entity_appearances, output_filename, sort_keys=False)
    #if tuple_vectors is not None:
    #    output_filename = os.path.join(dirs.persona_dir, 'tuple_vectors.json')
    #    fh.write_to_json(tuple_vectors.tolist(), output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
    output_filename = os.path.join(dirs.persona_dir, 'entity_text_mentions.json')
    fh.write_to_json(entity_text_mentions, output_filename, sort_keys=False)

    return filtered_indices, valid_elements
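# ---------------------------------------------------------------------------
# Hypothetical consumer sketch for the index files written by
# get_bamman_entities(). It relies only on the parallel-array layout saved
# above (the ith tuple has a vocab index, an entity index, and a role), and
# assumes that vocab.json serializes index2token as a list indexed by vocab
# id. The persona_dir argument and the project's fh.read_json helper are the
# same ones used throughout this file; the function name is illustrative.
# ---------------------------------------------------------------------------
from collections import defaultdict
import os


def load_entity_tuples(persona_dir):
    """Regroup the flat tuple arrays into a per-entity list of (role, token) pairs."""
    tuple_vocab = fh.read_json(os.path.join(persona_dir, 'tuple_vocab.json'))
    tuple_entity = fh.read_json(os.path.join(persona_dir, 'tuple_entity.json'))
    tuple_role = fh.read_json(os.path.join(persona_dir, 'tuple_role.json'))
    vocab = fh.read_json(os.path.join(persona_dir, 'vocab.json'))

    entity_tuples = defaultdict(list)
    for v_i, e_i, role in zip(tuple_vocab, tuple_entity, tuple_role):
        entity_tuples[e_i].append((role, vocab[v_i]))
    return entity_tuples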