def setup_rep(technique, dataset, fold, rep, noise, shuffled, folddir, repdir):
    if shuffled == 0:
        return []
    repfile = ('%s_%s_%d_%d_%.5f_%d.rep'
               % (technique, dataset, fold, rep, noise, shuffled))
    reppath = os.path.join(repdir, repfile)
    if os.path.exists(reppath):
        # Reuse the cached replicate; each line is "bag,orig_bag,inst,label".
        with open(reppath, 'r') as f:
            shuffled_bags = map(lambda s: tuple(s.strip().split(',')), f)
            shuffled_bags = [(int(l[0]), l[1], l[2], int(l[3]))
                             for l in shuffled_bags]
    else:
        ids, _, y = get_dataset(dataset)
        test_ids = set(bid for bid, iid in get_fold(folddir, dataset, fold))
        # Collect instance labels per training bag.
        classes = defaultdict(list)
        for (bid, iid), yi in zip(ids, y.flat):
            if bid not in test_ids:
                classes[bid].append(yi)
        bag_ids = list(classes.keys())
        # A bag is positive if any of its instances is positive.
        Y = [any(classes[bid]) for bid in bag_ids]
        pos_bags = [bid for bid, Yi in zip(bag_ids, Y) if Yi]
        neg_bags = [bid for bid, Yi in zip(bag_ids, Y) if not Yi]
        n_pos = len(pos_bags)
        n_neg = len(neg_bags)
        initial_bag_ids = pos_bags + neg_bags
        initial_labels = ([True]*n_pos) + ([False]*n_neg)
        initial_bags = [[(bid, iid) for bid, iid in ids if bid == ibid]
                        for ibid in initial_bag_ids]
        if technique == 'shuffle_both':
            shuffled_bags, shuffled_labels = shuffling.shuffle_both(
                shuffled, initial_bags, initial_labels, noise)
        elif technique == 'shuffle_pos':
            shuffled_bags, shuffled_labels = shuffling.shuffle_pos(
                shuffled, initial_bags, initial_labels, noise)
        else:
            raise ValueError('Unsupported shuffling technique: %s' % technique)
        all_bags = shuffled_bags
        all_labels = shuffled_labels
        shuffled_bags = []
        # Cache the replicate so reruns reuse the same shuffling.
        with open(reppath, 'w+') as f:
            for i, (bag, label) in enumerate(zip(all_bags, all_labels)):
                for bid, iid in bag:
                    linst = (i, bid, iid, int(label))
                    shuffled_bags.append(linst)
                    f.write('%d,%s,%s,%d\n' % linst)
    return shuffled_bags
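# A minimal usage sketch for setup_rep (the paths and values here are
# hypothetical; the real driver supplies them from its experiment config):
#
#   bags = setup_rep('shuffle_pos', 'musk1', fold=0, rep=1, noise=0.05,
#                    shuffled=10, folddir='folds', repdir='reps')
#
# Each returned tuple is (new_bag_index, original_bag_id, instance_id, label),
# matching the comma-separated format cached in the .rep file.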
def client_target(task, callback):
    key = task['key']
    params = task['params']
    (classifier, dataset, kernel, fold, rep) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    data_dict = data.get_dataset(dataset)
    fold_ids = set(data.get_fold(FOLDIR, dataset, fold))
    test_ids = fold_ids
    if rep == 0:
        train_ids = set(data_dict.keys()) - test_ids
    else:
        # rep > 0 trains on the corresponding bootstrap replicate.
        train_ids = set(data.get_rep(FOLDIR, dataset, fold, rep))

    X_train = [data_dict[bid][0] for bid in train_ids]
    y_train = [data_dict[bid][1] for bid in train_ids]
    X_test = [data_dict[bid][0] for bid in test_ids]
    y_test = [data_dict[bid][1] for bid in test_ids]

    results = {}
    results['stats'] = {}
    results['preds'] = {}

    start = time.time()
    if classifier == 'nsk':
        nsk = SetSVM(SVC, kernel, **params)
        nsk.fit(X_train, y_train)
        predictions = nsk.decision_function(X_test)
    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return
    results['stats']['time'] = time.time() - start

    for i, y in zip(test_ids, predictions):
        results['preds'][i] = float(y)

    if len(y_test) > 1:
        print 'Test AUC Score: %f' % auc_score(y_test, predictions)
    print 'Finished task %s.' % str(key)
    return results
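# Example invocation (hypothetical task dict; the experiment client normally
# builds this from the job queue):
#
#   task = {'key': ('nsk', 'musk1', 'rbf', 0, 0), 'params': {'C': 1.0}}
#   results = client_target(task, callback)
#
# With rep == 0 the model trains on the full training split; with rep > 0 it
# trains on the matching bootstrap replicate produced by the generator below.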
def main(dataset, folddir, outputdir, reps=0):
    data_dict = data.get_dataset(dataset)
    folds = data.get_folds(folddir, dataset)
    all_bag_ids = set(data_dict.keys())

    progress = ProgressMonitor(total=reps * len(folds),
                               msg='Generating Replicates')
    for f in range(len(folds)):
        test = data.get_fold(folddir, dataset, f)
        bag_ids = np.array(list(all_bag_ids - set(test)))
        n = len(bag_ids)
        for r in range(1, reps + 1):
            rep_path = os.path.join(outputdir,
                                    '%s_%04d_%06d.rep' % (dataset, f, r))
            if not os.path.exists(rep_path):
                # Bootstrap sample: draw n bag ids with replacement.
                sample = np.random.randint(n, size=n)
                sampled_bags = bag_ids[sample]
                with open(rep_path, 'w+') as ofile:
                    ofile.write('\n'.join(str(bid)
                                          for bid in sampled_bags.flat))
            progress.increment()
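# Sketch of the bootstrap idea used above, runnable on its own (the names
# here are illustrative, not part of the experiment code):
#
#   import numpy as np
#   bag_ids = np.array(['b1', 'b2', 'b3', 'b4'])
#   n = len(bag_ids)
#   resample = bag_ids[np.random.randint(n, size=n)]  # sample WITH replacement
#
# Each replicate file therefore lists n bag ids, possibly with duplicates,
# i.e. a bootstrap sample of the training bags for that fold.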
def main(args):
    out_dir = args.model_dir
    paramargs = param_t([ln.strip().split("=", 1)
                         for ln in open(os.path.join(out_dir, "param.txt"), "r")])

    print("")
    print("Regression evaluator")
    print(" # Score type: {}".format(paramargs.mp_score_type))
    print(" # Output dir: {}".format(out_dir))
    print(" # Param string:\n{}".format(model.param_str(paramargs)))
    print("")

    # To reduce memory consumption.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Load pretrained embedding.
    pre_embed = data.load_pretrained_embeddings() if args.mp_pretrained else None

    # Load data and normalize scores.
    essayids, essays, org_scores, scores, prompts, scaler = \
        data.load_annotated_essay_with_normalized_score(
            'Path_to_essay_file(.xlsx)',  # e.g. ./data/ICLE_essays.xlsx
            score_source="./essayScore_folds/{}Scores.txt".format(
                paramargs.mp_score_type))

    # Get Persing sequence (paragraph sequence of Persing et al., 2010).
    pseqs = np.array([data.get_persing_sequence(e, p)
                      for e, p in zip(essays, prompts)])

    # Preprocess data (insert sentence and paragraph boundary markers).
    if paramargs.mp_di_aware:
        di_list = data.load_discourse_indicators()
        essays = data.preprocess_essay_withParaBoundary(
            essays, paramargs, boseos=True, bopeop=True)
    elif paramargs.mp_model_type == "nea_aft_pretrain" and not paramargs.mp_para:
        essays = data.preprocess_essay_noParaBoundary(essays, paramargs,
                                                      boseos=True)
    elif paramargs.mp_no_para:
        essays = data.preprocess_essay_noParaBoundary(essays, paramargs,
                                                      boseos=True)
    else:
        essays = data.preprocess_essay_withParaBoundary(
            essays, paramargs, boseos=True, bopeop=True)

    if paramargs.mp_prompt:
        prompts = data.preprocess_essay_noParaBoundary(essays, paramargs,
                                                       boseos=True)

    # Get the test split for this fold.
    id2idx = dict([(v, k) for k, v in enumerate(essayids)])
    folds = data.load_folds(
        "./essayScore_folds/{}Folds.txt".format(paramargs.mp_score_type),
        id2idx=id2idx)

    assert 0 <= args.fold <= 4

    _, _, ts = data.get_fold(folds, args.fold)
    indices = np.arange(len(essays))

    main_essay_t, main_essay_v = [], essays[ts]
    score_t, score_v = [], org_scores[ts]
    indices_t, indices_v = [], indices[ts]
    prompt_t, prompt_v = [], prompts[ts]
    pseq_t, pseq_v = [], pseqs[indices_v]

    print(main_essay_v[:1])
    print(prompt_v[:1])

    # Prepare inputs.
    model_inputs_v = []

    # Text to sequence.
    if paramargs.mp_model_type != "only_pseq":
        tokenizer_m = pickle.load(open(
            os.path.join(args.model_dir,
                         "tokenizer_f{}.pickle".format(args.fold)), "rb"))
        sequences_valid_main = tokenizer_m.texts_to_sequences(main_essay_v)
        lens = [len(e) for e in sequences_valid_main]
        model_inputs_v += [pad_sequences(sequences_valid_main,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        sequence_length_main = model_inputs_v[-1].shape[1]

    # Persing sequence to sequence.
    sequence_length_pseq = None
    if paramargs.mp_pseq or paramargs.mp_model_type == "only_pseq":
        tokenizer_pseq = pickle.load(open(
            os.path.join(args.model_dir,
                         "tokenizer_pseq_f{}.pickle".format(args.fold)), "rb"))
        sequences_valid_pseq = tokenizer_pseq.texts_to_sequences(pseq_v)
        lens = [len(e) for e in sequences_valid_pseq]
        model_inputs_v += [pad_sequences(sequences_valid_pseq,
                                         maxlen=min(max(lens),
                                                    data.MAX_PARAGRAPHS))]
        sequence_length_pseq = model_inputs_v[-1].shape[1]

    # Prompt.
    if paramargs.mp_prompt:
        tokenizer_p = pickle.load(open(
            os.path.join(args.model_dir,
                         "tokenizer_p_f{}.pickle".format(args.fold)), "rb"))
        sequences_valid_prompt = tokenizer_p.texts_to_sequences(prompt_v)
        lens = [len(e) for e in sequences_valid_prompt]
        model_inputs_v += [pad_sequences(sequences_valid_prompt,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        sequence_length_prompt = model_inputs_v[-1].shape[1]

    # Rebuild the model architecture and load the trained weights.
    if paramargs.mp_model_type == "only_pseq":
        mainModel = model.pseq_regression(sequence_length_pseq, paramargs)
    else:
        if paramargs.mp_prompt:
            mainModel = model.create_regression_wprompt(
                None, tokenizer_m.word_index, tokenizer_p.word_index,
                sequence_length_main, sequence_length_pseq,
                sequence_length_prompt, paramargs)
        else:
            mainModel = model.create_regression(
                None, tokenizer_m.word_index, sequence_length_main,
                sequence_length_pseq, paramargs)

    mainModel.summary()
    mainModel.load_weights(
        os.path.join(args.model_dir, "regression_f{}.hdf5".format(args.fold)),
        by_name=True)

    print("Starting evaluation.")

    # Perform prediction.
    score_model = mainModel.predict(model_inputs_v, verbose=1, batch_size=32)

    # Map normalized predictions back to the original score scale and flatten.
    score_model_n = scaler.inverse_transform(score_model)
    score_model_nf = [j for i in score_model_n.tolist() for j in i]

    # Save predictions and metrics to file.
    with open(os.path.join(out_dir,
                           "prediction_f{}.json".format(args.fold)), "w") as f:
        mse = mean_squared_error(score_v, score_model_nf)
        mae = mean_absolute_error(score_v, score_model_nf)
        pr = {
            "system": score_model_nf,
            "gold": score_v.tolist(),
            "MSE": mse,
            "MAE": mae,
        }
        json.dump(pr, f)

    print("MSE:", mse)
    print("MAE:", mae)
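# Typical invocation (assuming an argparse front-end whose flag names mirror
# the attributes used above; the exact flags are this script's convention):
#
#   python evaluate.py --model_dir output/<md5-of-param-string> --fold 0
#
# The evaluator reloads the tokenizers and weights saved by the trainer for
# the same fold and writes prediction_f<fold>.json with MSE/MAE to model_dir.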
def client_target(task, callback):
    key = task['key']
    params = task['params']
    shuffled_bags = task['shuffled_bags']
    (technique, classifier, dataset, kernel,
     fold, rep, noise, shuffled) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    ids, X, y = data.get_dataset(dataset)
    id_index = {}
    for j, i in enumerate(ids):
        id_index[i] = j

    fold = set(data.get_fold(FOLDIR, dataset, fold))
    test_ids = fold
    train_ids = set(ids) - test_ids

    X_train = defaultdict(list)
    y_train = defaultdict(bool)
    for bid, iid in train_ids:
        X_train[bid].append(X[id_index[bid, iid]])
        y_train[bid] |= bool(y[id_index[bid, iid]])
    for bag, bid, iid, yi in shuffled_bags:
        X_train[bag].append(X[id_index[bid, iid]])
        y_train[bag] |= bool(yi)
    bags_train = sorted(X_train.keys())
    X_train = map(np.vstack, [X_train[b] for b in bags_train])
    y_train = [y_train[b] for b in bags_train]

    X_test = defaultdict(list)
    y_test = defaultdict(bool)
    for bid, iid in test_ids:
        X_test[bid].append(X[id_index[bid, iid]])
        y_test[bid] |= bool(y[id_index[bid, iid]])
    bags_test = sorted(X_test.keys())
    X_test = map(np.vstack, [X_test[b] for b in bags_test])
    y_test = [y_test[b] for b in bags_test]

    results = {}
    results['stats'] = {}
    results['preds'] = {}

    start = time.time()
    if classifier == 'nsk':
        nsk = SetSVM(SVC, kernel, **params)
        nsk.fit(X_train, y_train)
        predictions = nsk.decision_function(X_test)
    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return
    results['stats']['time'] = time.time() - start

    for i, y in zip(bags_test, predictions):
        results['preds'][i] = float(y)

    if len(y_test) > 1:
        print 'Test AUC Score: %f' % auc_score(y_test, predictions)
    print 'Finished task %s.' % str(key)
    return results
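# Hypothetical example: the driver first builds the shuffled replicate with
# setup_rep and passes it along in the task dict, e.g.
#
#   task = {
#       'key': ('shuffle_pos', 'nsk', 'musk1', 'rbf', 0, 1, 0.05, 10),
#       'params': {'C': 1.0},
#       'shuffled_bags': setup_rep('shuffle_pos', 'musk1', 0, 1, 0.05, 10,
#                                  FOLDIR, REPDIR),  # REPDIR is illustrative
#   }
#   results = client_target(task, callback)
#
# Shuffled bags are merged into the training dictionaries under their new
# bag index, so they are trained alongside the untouched original bags.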
def main(args):
    pstr = model.param_str(args)
    out_dir = hashlib.md5(pstr.encode("utf-8")).hexdigest()
    out_dir = os.path.join("output", out_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "param.txt"), "w") as f:
        print(pstr, file=f)
    with open(os.path.join(out_dir, "args.json"), "w") as f:
        json.dump(args.__dict__, f)

    print("")
    print("Regression trainer")
    print(" # Score type: {}".format(args.mp_score_type))
    print(" # Output dir: {}".format(out_dir))
    print(" # Param string:\n{}".format(pstr))
    print("")

    # To reduce memory consumption.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Load pretrained embedding.
    pre_embed = data.load_pretrained_embeddings() if args.mp_pretrained else None

    # Load data and normalize scores.
    essayids, essays, _, scores, prompts, _ = \
        data.load_annotated_essay_with_normalized_score(
            'Path_to_essay_file(.xlsx)',  # e.g. ./data/ICLE_essays.xlsx
            score_source="data/{}Scores.txt".format(args.mp_score_type))

    # Get Persing sequence (paragraph sequence of Persing et al., 2010).
    pseqs = np.array([data.get_persing_sequence(e, p)
                      for e, p in zip(essays, prompts)])

    # Preprocess data (insert sentence and paragraph boundary markers).
    if args.mp_di_aware:
        di_list = data.load_discourse_indicators()
        essays = data.preprocess_essay_withParaBoundary(
            essays, args, boseos=True, bopeop=True)
    elif args.mp_model_type == "nea_aft_pretrain" and not args.mp_para:
        essays = data.preprocess_essay_noParaBoundary(essays, args, boseos=True)
    elif args.mp_no_para:
        essays = data.preprocess_essay_noParaBoundary(essays, args, boseos=True)
    else:
        essays = data.preprocess_essay_withParaBoundary(
            essays, args, boseos=True, bopeop=True)

    if args.mp_prompt:
        prompts = data.preprocess_essay_noParaBoundary(essays, args, boseos=True)

    # Get training and validation sets.
    id2idx = dict([(v, k) for k, v in enumerate(essayids)])
    folds = data.load_folds(
        "./essayScore_folds/{}Folds.txt".format(args.mp_score_type),
        id2idx=id2idx)

    assert 0 <= args.fold <= 4

    tr, v, ts = data.get_fold(folds, args.fold)

    # Optionally train on a random subset of the training fold.
    if args.mp_divide_data == "half":
        _, tr = train_test_split(tr, test_size=0.50, random_state=args.mp_seed)
    elif args.mp_divide_data == "one_forth":
        _, tr = train_test_split(tr, test_size=0.25, random_state=args.mp_seed)
    elif args.mp_divide_data == "one_eighth":
        _, tr = train_test_split(tr, test_size=0.125, random_state=args.mp_seed)

    indices = np.arange(len(essays))

    main_essay_t, main_essay_v = essays[tr], essays[v]
    score_t, score_v = scores[tr], scores[v]
    indices_t, indices_v = indices[tr], indices[v]
    prompt_t, prompt_v = prompts[tr], prompts[v]
    pseq_t, pseq_v = pseqs[indices_t], pseqs[indices_v]

    print(main_essay_v[:2])
    print(prompt_v[:10])

    # Prepare inputs.
    model_inputs_t, model_inputs_v = [], []

    # Text to sequence.
    if args.mp_model_type != "only_pseq":
        if args.mp_preenc is not None:
            tokenizer_m = pickle.load(
                open(os.path.join(args.mp_preenc, "tokenizer.pickle"), "rb"))
        else:
            tokenizer_m = model.create_vocab(main_essay_t, args)
        with open(os.path.join(out_dir,
                               "tokenizer_f{}.pickle".format(args.fold)),
                  "wb") as f:
            pickle.dump(tokenizer_m, f)
        sequences_train_main = tokenizer_m.texts_to_sequences(main_essay_t)
        sequences_valid_main = tokenizer_m.texts_to_sequences(main_essay_v)
        lens = [len(e) for e in sequences_train_main]
        model_inputs_t += [pad_sequences(sequences_train_main,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        model_inputs_v += [pad_sequences(sequences_valid_main,
                                         maxlen=model_inputs_t[-1].shape[1])]
        sequence_length_main = model_inputs_t[-1].shape[1]

    # Persing sequence to sequence.
    sequence_length_pseq = None
    if args.mp_pseq or args.mp_model_type == "only_pseq":
        tokenizer_pseq = model.create_vocab_seq(pseq_t, char_level=True)
        with open(os.path.join(out_dir,
                               "tokenizer_pseq_f{}.pickle".format(args.fold)),
                  "wb") as f:
            pickle.dump(tokenizer_pseq, f)
        sequences_train_pseq = tokenizer_pseq.texts_to_sequences(pseq_t)
        sequences_valid_pseq = tokenizer_pseq.texts_to_sequences(pseq_v)
        lens = [len(e) for e in sequences_train_pseq]
        model_inputs_t += [pad_sequences(sequences_train_pseq,
                                         maxlen=min(max(lens),
                                                    data.MAX_PARAGRAPHS))]
        model_inputs_v += [pad_sequences(sequences_valid_pseq,
                                         maxlen=model_inputs_t[-1].shape[1])]
        sequence_length_pseq = model_inputs_t[-1].shape[1]

    # Prompt.
    if args.mp_prompt:
        tokenizer_p = model.create_vocab_prompt(prompt_t, args)
        with open(os.path.join(out_dir,
                               "tokenizer_p_f{}.pickle".format(args.fold)),
                  "wb") as f:
            pickle.dump(tokenizer_p, f)
        sequences_train_prompt = tokenizer_p.texts_to_sequences(prompt_t)
        sequences_valid_prompt = tokenizer_p.texts_to_sequences(prompt_v)
        lens = [len(e) for e in sequences_train_prompt]
        model_inputs_t += [pad_sequences(sequences_train_prompt,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        model_inputs_v += [pad_sequences(sequences_valid_prompt,
                                         maxlen=model_inputs_t[-1].shape[1])]
        sequence_length_prompt = model_inputs_t[-1].shape[1]

    # Create the neural regression model.
    if args.mp_model_type == "only_pseq":
        mainModel = model.pseq_regression(sequence_length_pseq, args)
    else:
        if args.mp_prompt:
            mainModel = model.create_regression_wprompt(
                pre_embed, tokenizer_m.word_index, tokenizer_p.word_index,
                sequence_length_main, sequence_length_pseq,
                sequence_length_prompt, args)
        else:
            mainModel = model.create_regression(
                pre_embed, tokenizer_m.word_index, sequence_length_main,
                sequence_length_pseq, args)
    mainModel.summary()

    # Load pretrained encoder weights, if given.
    if args.mp_preenc is not None:
        mainModel.load_weights(os.path.join(args.mp_preenc, "encoder.hdf5"),
                               by_name=True)

    print("Starting training.")

    optimizer_main = keras.optimizers.Adam(clipnorm=args.mp_clipnorm)
    mainModel.compile(optimizer=optimizer_main, loss='mse', metrics=['mse'])

    es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=15,
                                       verbose=0,
                                       mode='auto',
                                       baseline=None,
                                       restore_best_weights=True)
    nbl = kr_util.NBatchLogger(
        os.path.join(out_dir, "logs_f{}.pickle".format(args.fold)))

    mainModel.fit(model_inputs_t, score_t,
                  validation_data=(model_inputs_v, score_v),
                  epochs=100,
                  verbose=1,
                  callbacks=[es, nbl],
                  batch_size=32)

    mainModel.save_weights(
        os.path.join(out_dir, "regression_f{}.hdf5".format(args.fold)))

    print()
    print("# Output dir: {}".format(out_dir))
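# Typical invocation (assuming an argparse front-end whose flag names mirror
# the attribute names used above; the exact flags are this script's
# convention):
#
#   python train.py --fold 0 --mp_score_type Organization --mp_pretrained
#
# The output directory is the MD5 hash of the parameter string, so reruns
# with identical hyperparameters reuse the same directory under ./output,
# and the evaluator can locate the tokenizers and weights by fold number.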
def client_target(task, callback):
    key = task['key']
    params = task['params']
    labeled = task['labeled']
    (technique, classifier, dataset, kernel, fold,
     rep, initial, shuffled, queries) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    ids, X, y = data.get_dataset(dataset)
    id_index = {}
    for j, i in enumerate(ids):
        id_index[i] = j

    fold = set(data.get_fold(FOLDIR, dataset, fold))
    test_ids = fold
    train_ids = set(ids) - test_ids

    # Split the training set into labeled bags and the unlabeled pool.
    labeled_ids = set((l[1], l[2]) for l in labeled)
    pool_ids = train_ids - labeled_ids

    X_labeled = defaultdict(list)
    y_labeled = defaultdict(bool)
    for bag, bid, iid, yi in labeled:
        X_labeled[bag].append(X[id_index[bid, iid]])
        y_labeled[bag] |= bool(yi)
    bags_labeled = sorted(X_labeled.keys())
    X_labeled = map(np.vstack, [X_labeled[b] for b in bags_labeled])
    y_labeled = [y_labeled[b] for b in bags_labeled]

    X_pool = defaultdict(list)
    y_pool = defaultdict(bool)
    for bid, iid in pool_ids:
        X_pool[bid].append(X[id_index[bid, iid]])
        y_pool[bid] |= bool(y[id_index[bid, iid]])
    bags_pool = sorted(X_pool.keys())
    X_pool = map(np.vstack, [X_pool[b] for b in bags_pool])
    y_pool = [y_pool[b] for b in bags_pool]

    X_test = defaultdict(list)
    y_test = defaultdict(bool)
    for bid, iid in test_ids:
        X_test[bid].append(X[id_index[bid, iid]])
        y_test[bid] |= bool(y[id_index[bid, iid]])
    bags_test = sorted(X_test.keys())
    X_test = map(np.vstack, [X_test[b] for b in bags_test])
    y_test = [y_test[b] for b in bags_test]

    results = {}
    results['stats'] = {}
    results['preds'] = {}

    start = time.time()
    if classifier == 'nsk':
        cls = SetSVM(SVC, kernel, **params)
        active = SVMActiveLearner(cls, queries)
        predictions = active.learn(X_labeled, y_labeled,
                                   X_pool, y_pool, X_test)
    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return
    results['stats']['time'] = time.time() - start

    # One set of test predictions per active-learning query round.
    for q, preds in enumerate(predictions):
        results['preds'][q] = {}
        for bid, y in zip(bags_test, preds):
            results['preds'][q][bid] = float(y)

    predictions = np.column_stack(predictions).T
    print predictions.shape
    if len(bags_test) > 1:
        print 'Test AUC Scores:'
        for row in predictions:
            print '\t%f' % auc_score(np.array(y_test), row.reshape((-1,)))

    print 'Finished task %s.' % str(key)
    return results
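# Hypothetical example: 'labeled' uses the same 4-tuple format as the
# shuffled replicates, (bag_index, original_bag_id, instance_id, label):
#
#   task = {
#       'key': ('al', 'nsk', 'musk1', 'rbf', 0, 1, 10, 0, 25),
#       'params': {'C': 1.0},
#       'labeled': initial_labeled_bags,  # illustrative name
#   }
#   results = client_target(task, callback)
#
# results['preds'][q] holds the test predictions after the q-th query, so
# AUC can be tracked as a function of the number of labels acquired.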