def load(fname):
    """Load (or bootstrap) the VW model stored at ``fname``.

    Rebinds the module-level ``vw`` and ``sequenceLabeler`` globals.

    The function first tries to open ``fname`` as an existing model.  If that
    fails, a fresh search model is created and immediately finished so that
    it is written to ``fname``; the file is then reopened the same way an
    existing model would be.

    Parameters
    ----------
    fname : str
        Path of the model file; used both as the initial regressor (``-i``)
        and as the output regressor (``-f``).
    """
    global vw, sequenceLabeler
    reload_args = "--quiet -i " + fname + " -f " + fname
    try:
        vw = pyvw.vw(reload_args)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not silently turned into
        # "create a brand new model".
        vw = pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + fname)
        vw.finish()
        vw = pyvw.vw(reload_args)
    sequenceLabeler = vw.init_search_task(SequenceLabeler3)
def get_vw(self):
    """Return the wrapped vw instance, building it on first use.

    Returns
    -------
    pyvw.vw instance
    """
    existing = self.vw_
    if existing is not None:
        return existing
    # First access: construct the instance from the stored parameters.
    self.vw_ = vw(**self.params)
    return self.vw_
def get_vw(self):
    """Return the wrapped vw instance, building it on first use.

    When the instance has to be created, its label type is recorded in
    ``self.label_type_`` as well.

    Returns
    -------
    pyvw.vw instance
    """
    if self.vw_ is not None:
        return self.vw_

    self.vw_ = vw(**self.params)
    # set label type
    self.label_type_ = self.vw_.get_label_type()
    return self.vw_
def main(): vw = [] sl = [] while True: inp = raw_input("> ") inp = inp.strip() words = inp.split() cmd = words[0] if cmd == "/save": for temp in vw: temp.finish() sys.exit(1) if cmd == "/train": data = " ".join(words[1:]).strip() for i in range(10): for temp in sl: temp.learn(preprocess([data])) elif cmd == "/query": data = " ".join(words[1:]).strip() output = set() for s in sl: output.add(postprocess(query(s, data))) for out in output: print "\t", out elif cmd == "/start": data = " ".join(words[1:]).strip() if os.path.isfile(data + ".1") and os.path.isfile(data + ".2") and os.path.isfile( data + ".3") and os.path.isfile(data + ".4"): vw = [ pyvw.vw("--quiet -i " + data + ".1 -f "+data + ".1"), pyvw.vw("--quiet -i " + data + ".2 -f "+data + ".2"), pyvw.vw("--quiet -i " + data + ".3 -f "+data + ".3"), pyvw.vw("--quiet -i " + data + ".4 -f "+data + ".4") ] else: vw = [ pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".1"), pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".2"), pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".3"), pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".4") ] sl = [ vw[0].init_search_task(SequenceLabeler), vw[1].init_search_task(SequenceLabeler2), vw[2].init_search_task(SequenceLabeler3), vw[3].init_search_task(SequenceLabeler4) ]
def mini_vw(inputFile, numPasses, otherArgs): vw = pyvw.vw(otherArgs) for p in range(numPasses): print 'pass', (p+1) h = open(inputFile, 'r') for l in h.readlines(): if learnFromStrings: vw.learn(l) else: ex = vw.example(l) vw.learn(ex) ex.finish() h.close() vw.finish()
def mini_vw(inputFile, numPasses, otherArgs): vw = pyvw.vw(otherArgs) for p in range(numPasses): print 'pass', (p + 1) h = open(inputFile, 'r') for l in h.readlines(): if learnFromStrings: vw.learn(l) else: ex = vw.example(l) vw.learn(ex) ex.finish() h.close() vw.finish()
def do_work(train_instances, dev_instances, test_instances, sample_size, samples_per_event, gold_probs, iters, l2, log_time, semsims, dfdeltas, use_best_feats, use_i_only, use_abs_df, doc_condition, output_dir):
    """Train a ``Summarizer`` search task, pick the best iterations, write results.

    For each of ``iters`` passes the task learns on the (shuffled in place)
    ``train_instances`` and is scored on every dev instance; the feature
    weights of that pass are kept.  Afterwards the best iteration is chosen
    separately for F1, E[gain], Comp. (maximum) and Loss (minimum), the test
    instances are summarised once with each of those four weight sets, and
    the summaries, weights and dev scores are written as TSV files under
    ``output_dir`` (created if missing).

    ``sample_size``, ``samples_per_event``, ``gold_probs``, ``log_time``,
    ``semsims`` and ``dfdeltas`` are accepted but not referenced in this body.

    NOTE(review): with ``iters < 1`` the loop never runs and ``df_u`` is
    unbound when the best iterations are looked up.
    """
    vw = pyvw.vw(
        ("-l .001 --l2 {} --search 2 --search_task hook --ring_size 1024 " + \
         "--search_no_caching --noconstant --quiet").format(l2))
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    task._doc_condition = doc_condition
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    print "use doc condition?", task._doc_condition
    all_scores = []
    all_weights = []
    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0
        # Shuffles the caller's list in place.
        random.shuffle(train_instances)
        print "iter", n_iter
        task.learn(train_instances)
        for i, inst in enumerate(dev_instances):
            egain, comp, f1, loss, _ = predict(task, inst, n_iter)
            print egain, comp, f1, loss
            all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
        # Mean dev score per iteration; after reset_index(drop=True) row k
        # holds iteration k + 1.
        df = pd.DataFrame(all_scores)
        df_u = df.groupby("iter").mean().reset_index(drop=True)
        print df_u
        # Snapshot this iteration's weights, tagged by class and iteration.
        select_df, next_df = task.get_feature_weights()
        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter
        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)
    # "+ 1" converts the 0-based row of df_u back to the 1-based iteration.
    best_f1_iter = df_u["F1"].argmax() + 1
    best_egain_iter = df_u["E[gain]"].argmax() + 1
    best_comp_iter = df_u["Comp."].argmax() + 1
    best_loss_iter = df_u["Loss"].argmin() + 1
    weights_df = pd.concat(all_weights)
    all_summaries = []
    F1_weights = weights_df[weights_df["iter"] == best_f1_iter]
    loss_weights = weights_df[weights_df["iter"] == best_loss_iter]
    egain_weights = weights_df[weights_df["iter"] == best_egain_iter]
    comp_weights = weights_df[weights_df["iter"] == best_comp_iter]
    def get_summaries(weights, run):
        # Load ``weights`` into the task, predict every test instance and
        # append one record per SELECT action to ``all_summaries``.
        print "Best", run
        task.set_weights(weights)
        for test_instance in test_instances:
            event = test_instance[0]
            df = test_instance[1]
            print event
            task._keep_scores = True
            task._scores = []
            predictions = task.predict(test_instance)
            assert len(predictions) == len(task._scores)
            for action, (_, row), ascore in zip(predictions, df.iterrows(), task._scores):
                if action == SELECT:
                    # "update id" is split on "-": the first two fields form
                    # the stream id, the third is the sentence id.
                    print "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        event.query_num, "CUNLP", run,
                        "-".join(row["update id"].split("-")[0:2]),
                        row["update id"].split("-")[2],
                        row["timestamp"], ascore)
                    all_summaries.append(
                        {"event": event.query_num,
                         "team": "CUNLP",
                         "run": run,
                         "stream id": "-".join(row["update id"].split("-")[0:2]),
                         "sentence id": row["update id"].split("-")[2],
                         "timestamp": row["timestamp"],
                         "confidence": row["probs"],
                         "partial": ascore,
                         "text": row["sent text"],
                         "pretty text": row["pretty text"]
                         })
    # (A disabled block that re-scored the dev set with the F1 and E[gain]
    # weights used to be commented out here.)
    get_summaries(F1_weights, "L2S.F1")
    get_summaries(loss_weights, "L2S.Loss")
    get_summaries(egain_weights, "L2S.E[gain]")
    get_summaries(comp_weights, "L2S.Comp.")
    df = pd.DataFrame(all_summaries,
        columns=["event", "team", "run", "stream id", "sentence id",
                 "timestamp", "confidence", "partial", "pretty text", "text"])
    submission_path = os.path.join(output_dir, "submission.tsv")
    summary_path = os.path.join(output_dir, "summaries.tsv")
    f1_weights_path = os.path.join(output_dir, "weights.f1.tsv")
    loss_weights_path = os.path.join(output_dir, "weights.loss.tsv")
    egain_weights_path = os.path.join(output_dir, "weights.egain.tsv")
    comp_weights_path = os.path.join(output_dir, "weights.comp.tsv")
    scores_path = os.path.join(output_dir, "scores.tsv")
    # Columns of the submission file (no sentence text).
    no_text = ["event", "team", "run", "stream id", "sentence id",
               "timestamp", "confidence", "partial"]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Clamp negative confidences to 0 before writing.
    df["confidence"] = df["confidence"].apply(lambda x: max(x, 0))
    with open(submission_path, "w") as f:
        df[no_text].to_csv(f, index=False, header=False, sep="\t")
    with open(summary_path, "w") as f:
        df.to_csv(f, index=False, sep="\t")
    with open(f1_weights_path, "w") as f:
        F1_weights.to_csv(f, index=False, sep="\t")
    with open(loss_weights_path, "w") as f:
        loss_weights.to_csv(f, index=False, sep="\t")
    with open(egain_weights_path, "w") as f:
        egain_weights.to_csv(f, index=False, sep="\t")
    with open(comp_weights_path, "w") as f:
        comp_weights.to_csv(f, index=False, sep="\t")
    with open(scores_path, "w") as f:
        df_u.to_csv(f, sep="\t", index=False)
def do_work(training_events, test_event, sample_size, samples_per_event, gold_probs, iters, l2, log_time, semsims, dfdeltas, use_best_feats, use_i_only, use_abs_df):
    """Train on sampled streams of ``training_events`` and score ``test_event``.

    Builds ``samples_per_event`` down-sampled training instances per training
    event, then for each of ``iters`` passes: learns on the shuffled
    instances one at a time, averages the training metrics returned by
    ``predict``, predicts the test stream, records the feature weights and
    computes loss / E[gain] / Comp. on the test stream.

    Returns
    -------
    (scores_df, weights_df, summary_df)
        Per-iteration scores, per-iteration feature weights and the selected
        test updates; ``scores_df`` and ``weights_df`` get an ``"event"``
        column set to ``test_event.query_id``.
    """
    training_streams = []
    summary = []
    for event in training_events:
        df = get_input_stream(event, gold_probs)
        training_streams.append((event, df))
    test_df = get_input_stream(test_event, gold_probs)
    # Each row's "stems" sequence is joined into one space-separated string
    # before being handed to the event-type specific transformer.
    test_X_l = semsims[test_event.type].transform(
        test_df["stems"].apply(lambda x: ' '.join(x)).tolist())
    test_stream = (test_event, test_df, test_X_l, dfdeltas(test_event))
    vw = pyvw.vw(
        ("--l2 {} --search 2 --search_task hook --ring_size 1024 " + \
         "--search_no_caching --noconstant --quiet").format(l2))
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    task.log_time = log_time
    all_scores = []
    all_weights = []
    instances = []
    for sample in xrange(samples_per_event):
        for event, stream in training_streams:
            # Resample until the sample contains at least one row with a
            # non-empty "nuggets" entry.
            # NOTE(review): presumably never terminates if ``ds`` can only
            # return nugget-free samples for this stream — confirm.
            while 1:
                sample_stream = ds(stream, sample_size=sample_size)
                if (sample_stream["nuggets"].apply(len) > 0).any():
                    break
            X_l = semsims[event.type].transform(
                sample_stream["stems"].apply(lambda x: ' '.join(x)).tolist())
            instances.append((event, sample_stream, X_l, dfdeltas(event)))
    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0
        # (A disabled variant that re-sampled the instances on every
        # iteration used to be commented out here.)
        random.shuffle(instances)
        for i, inst in enumerate(instances):
            print "{}.{}.{}/{}".format(
                test_event.fs_name(), n_iter, i, len(instances))
            task.learn([inst])
        print "{}.{}.p".format(
            test_event.fs_name(), n_iter)
        train_egain = 0
        train_comp = 0
        train_f1 = 0
        train_loss = 0
        for i, inst in enumerate(instances):
            egain, comp, f1, loss, train_sum = predict(task, inst, n_iter)
            train_egain += egain
            train_comp += comp
            train_f1 += f1
            train_loss += loss
        # NOTE(review): divides by zero when ``instances`` is empty.
        train_egain = train_egain / float(len(instances))
        train_comp = train_comp / float(len(instances))
        train_f1 = train_f1 / float(len(instances))
        train_loss = train_loss / float(len(instances))
        print "{} {} train loss {}".format(test_event.query_id, n_iter, train_loss)
        pred = task.predict(test_stream)
        select_df, next_df = task.get_feature_weights()
        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter
        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)
        # Map raw actions to the strings "SELECT" / "SKIP".
        pred = ["SELECT" if p == SELECT else "SKIP" for p in pred]
        # Union of every nugget set in the test stream (denominator of Comp.).
        all_nuggets = set()
        for nuggets in test_stream[1]["nuggets"].tolist():
            all_nuggets.update(nuggets)
        loss = 0
        y_int_y_hat = 0
        size_y = 0
        size_y_hat = 0
        # Nuggets covered by the updates selected so far.
        nuggets = set()
        for action, (_, sent) in izip(pred, test_stream[1].iterrows()):
            # Gain = number of this sentence's nuggets not yet covered.
            gain = len(sent["nuggets"] - nuggets)
            if action == "SELECT":
                if gain == 0:
                    loss += 1
                summary.append({
                    "event": test_event.query_id,
                    "iter": n_iter,
                    "update id": sent["update id"],
                    "timestamp": sent["timestamp"],
                    "gain": gain,
                    "nuggets": ",".join(sent["nuggets"]),
                    "update text": sent["pretty text"]
                })
                nuggets.update(sent["nuggets"])
            else:
                if gain > 0:
                    loss += 1
            # The oracle selects exactly the sentences with positive gain.
            if gain > 0:
                oracle = "SELECT"
            else:
                oracle = "SKIP"
            if action == "SELECT" and oracle == "SELECT":
                y_int_y_hat += 1
                size_y += 1
                size_y_hat += 1
            elif action == "SELECT" and oracle == "SKIP":
                size_y_hat += 1
            elif action == "SKIP" and oracle == "SELECT":
                size_y += 1
        # Diagnostic output when nothing was selected at all.
        if size_y_hat == 0:
            print test_event
            print (test_stream[1]["nuggets"].apply(len) > 0).any()
        # This overwrites the per-sentence ``loss`` counter accumulated above.
        # NOTE(review): divides by zero when size_y + size_y_hat == 0.
        loss = 1 - float(y_int_y_hat) / (size_y + size_y_hat)
        if len(nuggets) > 0:
            egain = len(nuggets) / sum([1.0 if a == "SELECT" else 0.0 for a in pred])
        else:
            egain = 0
        # NOTE(review): divides by zero when the test stream has no nuggets.
        comp = len(nuggets) / float(len(all_nuggets))
        all_scores.append({"iter": n_iter, "Comp.": comp, "E[gain]": egain, "Loss": loss,
                           "Avg. Train Loss": train_loss,
                           "Avg. Train E[gain]": train_egain,
                           "Avg. Train Comp.": train_comp,
                           "Avg. Train F1": train_f1,
                           })
        print "{}.{}.p E[gain]={:0.6f} Comp.={:0.6f} Train Loss={:0.6f}".format(
            test_event.fs_name(), n_iter, egain, comp, train_loss)
    scores_df = pd.DataFrame(all_scores, columns=["iter", "E[gain]", "Comp.", "Loss", "Avg. Train Loss", "Avg. Train E[gain]", "Avg. Train Comp.", "Avg. Train F1"])
    weights_df = pd.concat(all_weights)
    weights_df["event"] = test_event.query_id
    scores_df["event"] = test_event.query_id
    summary_df = pd.DataFrame(
        summary,
        columns=["iter", "event", "update id", "timestamp", "gain", "update text", "nuggets"])
    return scores_df, weights_df, summary_df
def train(sequenceLabeler, data): sequenceLabeler.learn(preprocess([data])) def testit(sequenceLabeler): passed = 0 for sample in samples: pred = postprocess(test(sequenceLabeler, sample[0])) if pred == sample[1]: passed += 1 print "\n======== ACCURACY:[", (passed*1.0/len(samples))*100, "% ] ======" print "====================================\n" if __name__ == "__main__": fname = "tagger2.bin" vw = pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + fname) sequenceLabeler = vw.init_search_task(SequenceLabeler3) train(sequenceLabeler, "watch_N big_B bang_I") testit(sequenceLabeler) train(sequenceLabeler, "harry_B potter_I") testit(sequenceLabeler) train(sequenceLabeler, "show_N me_N action_B movies_B") testit(sequenceLabeler) vw.finish()
def main(learner, training_ids, test_ids, sample_size, n_iters, report_dir_base): extractor = "goose" topk = 20 delay = None threshold = .8 res = InputStreamResource() events = [ e for e in cuttsum.events.get_events() if e.query_num in training_ids or e.query_num in test_ids ] training_insts = [] test_insts = [] for event in events: print "Loading event", event.fs_name() corpus = cuttsum.corpora.get_raw_corpus(event) # A list of dataframes. Each dataframe is a document with =< 20 sentences. # This is the events document stream. dataframes = res.get_dataframes(event, corpus, extractor, threshold, delay, topk) if event.query_num in training_ids: training_insts.append((event, dataframes)) if event.query_num in test_ids: test_insts.append((event, dataframes)) # Init l2s task. vw = pyvw.vw( "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet --search_no_caching" ) #task = vw.init_search_task(UpdateSummarizer) if learner == "PerfectOracle": task = vw.init_search_task(PerfectOracle) elif learner == "LessPerfectOracle": task = vw.init_search_task(LessPerfectOracle) elif learner == "SelectLexNextOracle": task = vw.init_search_task(SelectLexNextOracle) elif learner == "SelectLexNextLex": task = vw.init_search_task(SelectLexNextLex) elif learner == "SelectLexNextLexCache": task = vw.init_search_task(SelectLexNextLexCache) elif learner == "SelectLexGenericNextOracle": task = vw.init_search_task(SelectLexGenericNextOracle) elif learner == "SelectBasicNextBias": task = vw.init_search_task(SelectBasicNextBias) elif learner == "SelectBasicNextBiasDocAvg": task = vw.init_search_task(SelectBasicNextBiasDocAvg) for n_iter in range(n_iters): print "iter", n_iter + 1 ds = downsample(training_insts, size=sample_size) task.learn(ds) all_train_df = [df for inst in training_insts for df in inst[1]] feature_weights = task.get_feature_weights(all_train_df) write_model(feature_weights, report_dir_base, n_iter) for event, dataframes in training_insts: # Predict a sequence for 
this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "train", n_iter, report_dir_base) for event, dataframes in test_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "test", n_iter, report_dir_base)
(VERB, 'ate'), (DET , 'a'), (ADJ , 'big'), (NOUN, 'sandwich')], [(DET , 'the'), (NOUN, 'sandwich'), (VERB, 'was'), (ADJ , 'tasty')], [(NOUN, 'it'), (VERB, 'ate'), (NOUN, 'it'), (ADJ , 'all')] ] # initialize VW as usual, but use 'hook' as the search_task vw = pyvw.vw("--search 4 --quiet --search_task hook --ring_size 1024") # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(SequenceLabeler) # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above print >>sys.stderr, 'training!' i = 0 while i < 10: sequenceLabeler.learn(my_dataset) i += 1 # now see the predictions on a test sentence print >>sys.stderr, 'predicting!' print sequenceLabeler.predict( [(0,w) for w in "the sandwich ate a monster".split()] ) print 'should have printed: [1, 2, 3, 1, 2]'
return output # wow! your data can be ANY type you want... does NOT have to be VW examples DET = 1 NOUN = 2 VERB = 3 ADJ = 4 my_dataset = [ [(DET, "the"), (NOUN, "monster"), (VERB, "ate"), (DET, "a"), (ADJ, "big"), (NOUN, "sandwich")], [(DET, "the"), (NOUN, "sandwich"), (VERB, "was"), (ADJ, "tasty")], [(NOUN, "it"), (VERB, "ate"), (NOUN, "it"), (ADJ, "all")], ] # initialize VW as usual, but use 'python_hook' as the search_task vw = pyvw.vw("--search 4 --quiet --search_task python_hook --search_no_snapshot --ring_size 1024") # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(SequenceLabeler) # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above print >> sys.stderr, "training!" for curPass in range(10): sequenceLabeler.learn(my_dataset.__iter__) # now see the predictions on a test sentence print >> sys.stderr, "predicting!" print sequenceLabeler.predict([(0, w) for w in "the sandwich ate a monster".split()]) print "should have printed: [1, 2, 3, 1, 2]"
my_tag=n + 1, oracle=oracle, condition=[(n, 'p'), (n - 1, 'q')]) output[ n] = pred - 1 if pred < n else pred # have to +1 because n==m excluded return output # TODO: if they make sure search=0 <==> ldf <==> csoaa_ldf # demo the non-ldf version: print 'training non-LDF' vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --quiet") task = vw.init_search_task(CovingtonDepParser) for p in range(2): # do two passes over the training data task.learn(my_dataset) print 'testing non-LDF' print task.predict([(w, -1) for w in "the monster ate a sandwich".split()]) print 'should have printed [ 1 2 -1 4 2 ]' # demo the ldf version: print 'training LDF' vw = pyvw.vw( "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet") task = vw.init_search_task(CovingtonDepParserLDF) for p in range(100): # do two passes over the training data task.learn(my_dataset) print 'testing LDF'
def do_work(
    train_instances,
    dev_instances,
    test_instances,
    sample_size,
    samples_per_event,
    gold_probs,
    iters,
    l2,
    log_time,
    semsims,
    dfdeltas,
    use_best_feats,
    use_i_only,
    use_abs_df,
    doc_condition,
    output_dir,
):
    """Train a ``Summarizer`` search task, pick the best iterations, write results.

    For each of ``iters`` passes the task learns on the (shuffled in place)
    ``train_instances`` and is scored on every dev instance; the feature
    weights of that pass are kept.  Afterwards the best iteration is chosen
    separately for F1, E[gain], Comp. (maximum) and Loss (minimum), the test
    instances are summarised once with each of those four weight sets, and
    the summaries, weights and dev scores are written as TSV files under
    ``output_dir`` (created if missing).

    ``sample_size``, ``samples_per_event``, ``gold_probs``, ``log_time``,
    ``semsims`` and ``dfdeltas`` are accepted but not referenced in this body.

    NOTE(review): with ``iters < 1`` the loop never runs and ``df_u`` is
    unbound when the best iterations are looked up.
    """
    vw = pyvw.vw(
        (
            "-l .001 --l2 {} --search 2 --search_task hook --ring_size 1024 "
            + "--search_no_caching --noconstant --quiet"
        ).format(l2)
    )
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    task._doc_condition = doc_condition
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    print "use doc condition?", task._doc_condition
    all_scores = []
    all_weights = []
    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0
        # Shuffles the caller's list in place.
        random.shuffle(train_instances)
        print "iter", n_iter
        task.learn(train_instances)
        for i, inst in enumerate(dev_instances):
            egain, comp, f1, loss, _ = predict(task, inst, n_iter)
            print egain, comp, f1, loss
            all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
        # Mean dev score per iteration; after reset_index(drop=True) row k
        # holds iteration k + 1.
        df = pd.DataFrame(all_scores)
        df_u = df.groupby("iter").mean().reset_index(drop=True)
        print df_u
        # Snapshot this iteration's weights, tagged by class and iteration.
        select_df, next_df = task.get_feature_weights()
        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter
        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)
    # "+ 1" converts the 0-based row of df_u back to the 1-based iteration.
    best_f1_iter = df_u["F1"].argmax() + 1
    best_egain_iter = df_u["E[gain]"].argmax() + 1
    best_comp_iter = df_u["Comp."].argmax() + 1
    best_loss_iter = df_u["Loss"].argmin() + 1
    weights_df = pd.concat(all_weights)
    all_summaries = []
    F1_weights = weights_df[weights_df["iter"] == best_f1_iter]
    loss_weights = weights_df[weights_df["iter"] == best_loss_iter]
    egain_weights = weights_df[weights_df["iter"] == best_egain_iter]
    comp_weights = weights_df[weights_df["iter"] == best_comp_iter]

    def get_summaries(weights, run):
        # Load ``weights`` into the task, predict every test instance and
        # append one record per SELECT action to ``all_summaries``.
        print "Best", run
        task.set_weights(weights)
        for test_instance in test_instances:
            event = test_instance[0]
            df = test_instance[1]
            print event
            task._keep_scores = True
            task._scores = []
            predictions = task.predict(test_instance)
            assert len(predictions) == len(task._scores)
            for action, (_, row), ascore in zip(predictions, df.iterrows(), task._scores):
                if action == SELECT:
                    # "update id" is split on "-": the first two fields form
                    # the stream id, the third is the sentence id.
                    print "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        event.query_num,
                        "CUNLP",
                        run,
                        "-".join(row["update id"].split("-")[0:2]),
                        row["update id"].split("-")[2],
                        row["timestamp"],
                        ascore,
                    )
                    all_summaries.append(
                        {
                            "event": event.query_num,
                            "team": "CUNLP",
                            "run": run,
                            "stream id": "-".join(row["update id"].split("-")[0:2]),
                            "sentence id": row["update id"].split("-")[2],
                            "timestamp": row["timestamp"],
                            "confidence": row["probs"],
                            "partial": ascore,
                            "text": row["sent text"],
                            "pretty text": row["pretty text"],
                        }
                    )

    # (A disabled block that re-scored the dev set with the F1 and E[gain]
    # weights used to be commented out here.)
    get_summaries(F1_weights, "L2S.F1")
    get_summaries(loss_weights, "L2S.Loss")
    get_summaries(egain_weights, "L2S.E[gain]")
    get_summaries(comp_weights, "L2S.Comp.")
    df = pd.DataFrame(
        all_summaries,
        columns=[
            "event",
            "team",
            "run",
            "stream id",
            "sentence id",
            "timestamp",
            "confidence",
            "partial",
            "pretty text",
            "text",
        ],
    )
    submission_path = os.path.join(output_dir, "submission.tsv")
    summary_path = os.path.join(output_dir, "summaries.tsv")
    f1_weights_path = os.path.join(output_dir, "weights.f1.tsv")
    loss_weights_path = os.path.join(output_dir, "weights.loss.tsv")
    egain_weights_path = os.path.join(output_dir, "weights.egain.tsv")
    comp_weights_path = os.path.join(output_dir, "weights.comp.tsv")
    scores_path = os.path.join(output_dir, "scores.tsv")
    # Columns of the submission file (no sentence text).
    no_text = ["event", "team", "run", "stream id", "sentence id", "timestamp", "confidence", "partial"]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Clamp negative confidences to 0 before writing.
    df["confidence"] = df["confidence"].apply(lambda x: max(x, 0))
    with open(submission_path, "w") as f:
        df[no_text].to_csv(f, index=False, header=False, sep="\t")
    with open(summary_path, "w") as f:
        df.to_csv(f, index=False, sep="\t")
    with open(f1_weights_path, "w") as f:
        F1_weights.to_csv(f, index=False, sep="\t")
    with open(loss_weights_path, "w") as f:
        loss_weights.to_csv(f, index=False, sep="\t")
    with open(egain_weights_path, "w") as f:
        egain_weights.to_csv(f, index=False, sep="\t")
    with open(comp_weights_path, "w") as f:
        comp_weights.to_csv(f, index=False, sep="\t")
    with open(scores_path, "w") as f:
        df_u.to_csv(f, sep="\t", index=False)
import pyvw

# Demo: an example built in one go versus one assembled piece by piece.
vw = pyvw.vw('--audit')

# Reference example carrying both namespaces from the start.
full_features = {'a': ['b'], 'x': ['y']}
full = vw.example(full_features)
full.learn()

# Partial example: begins with namespace 'a' only.
part_features = {'a': ['b']}
part = vw.example(part_features)
part.learn()

# Add namespace 'x' afterwards and learn again.
part.push_features('x', ['y'])
part.learn()

# Swap the contents of namespace 'x' (erase, then push a new feature).
x_namespace = ord('x')
part.erase_namespace(x_namespace)
part.push_features('x', ['z'])
part.learn()
def _run(self, y_x): y,(x0,x1) = y_x ex = self.example({'x': [('x0',x0), ('x1',x1)]}) h = self.sch.predict(examples=ex, my_tag=1, oracle=None) * 2 - 3 ex = self.example({'x': [('x0',x0), ('x1',x1), ('x0h',x0*h)]}) p = self.sch.predict(examples=ex, my_tag=2, oracle=y, condition=(1,'h')) self.sch.loss( 0. if p == y else 1. ) return p my_dataset = [ (1, (-1, -1)), (1, (+1, +1)), (2, (-1, +1)), (2, (+1, -1)) ] vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --search_alpha 1e-2") lv = vw.init_search_task(LatentVariableClassifier) print 'training' for i in range(100): lv.learn(my_dataset) print 'testing' for (y,x) in my_dataset: print 'pred =', lv.predict( (0,x) )
import pyvw

# Demo: learn from a complete example, then from one that is extended and
# modified between successive learn() calls.
vw = pyvw.vw('--audit')

# Complete example: namespaces 'a' and 'x' supplied together.
full = vw.example(dict(a=['b'], x=['y']))
full.learn()

# Incomplete example: namespace 'a' only.
part = vw.example(dict(a=['b']))
part.learn()

# Push namespace 'x' onto the existing example and learn again.
part.push_features('x', ['y'])
part.learn()

# Erase namespace 'x', push a different feature into it, learn once more.
part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
label, ex = parseExample(tokens) ldf_example.append((label, ex)) else: if ldf_example: shared = ldf_example[0] ldf = ldf_example[1:] for l in ldf: for k in shared[1]: l[1][k] = shared[1][k] sentence.append(ldf) ldf_example = [] # initialize VW as usual, but use 'hook' as the search_task # vw = pyvw.vw("--search 0 --hash all -b 31 --csoaa_ldf mc --quiet --search_task hook -q t: -q m: --ngram t2 --ngram m2 --ngram g2 --ngram c2") vw = pyvw.vw("--search 0 --hash all -b 31 --csoaa_ldf mc --quiet --search_task hook -q ::") # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(SequenceLabeler) train = data[:77072] test = data[77072:] def prepare(test): for s in range(len(test)): sentence = test[s] oracle = [] for w in range(len(sentence)): word = sentence[w]
def main(input_path, features, loss_metric, fold, lemma_length_cutoff, use_interactions, max_iters, output_dir): if not os.path.exists(output_dir): os.makedirs(output_dir) converters = {"lemmas": eval, "tokens": eval} with open(input_path, "rb") as f: all_inputs = pd.read_csv(f, sep="\t", converters=converters) print("Read {} input sentences from {}".format( len(all_inputs), input_path)) if lemma_length_cutoff > 0: all_inputs = lemma_filter(all_inputs, lemma_length_cutoff) vw_str = "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 " \ "--search_no_caching --quiet --noconstant" vw = pyvw.vw(vw_str) instances = make_instances(vw, all_inputs, features, use_interactions) chunk_size = 20 chunks = [instances[i:i+chunk_size] for i in range(0, 80, 20)] instances_train = [] for i, chunk in enumerate(chunks): if i != fold: instances_train.extend(chunk) else: instances_dev = chunks[i] print "Fold {}".format(fold) task = vw.init_search_task(L2SSum) task.set_loss_func(loss_metric) print task.get_loss_func() from datetime import datetime, timedelta now = datetime.now() total_train_time = timedelta(0) for num_iter in range(1, max_iters + 1): print("iter {}/{}".format(num_iter, max_iters)) task.learn(instances_train) dur = datetime.now() - now total_train_time += dur print("took {}".format(dur)) now = datetime.now() write_weights(output_dir, num_iter, vw, instances[0][-1]) output_paths = [] for instance in instances_train: docset, year, opath = write_output( task.predict(instance), num_iter, instance, output_dir) output_paths.append((docset, year, opath)) write_eval(output_paths, num_iter, output_dir) output_paths_dev = [] for instance in instances_dev: docset, year, opath = write_output( task.predict(instance), num_iter, instance, output_dir, dev=True) output_paths_dev.append((docset, year, opath)) write_eval_dev(output_paths_dev, num_iter, output_dir) print total_train_time, print timedelta(seconds=total_train_time.total_seconds() / 10.)
def main(learner, training_ids, test_ids, sample_size, n_iters, report_dir_base): extractor = "goose" topk = 20 delay = None threshold = .8 res = InputStreamResource() events = [e for e in cuttsum.events.get_events() if e.query_num in training_ids or e.query_num in test_ids] training_insts = [] test_insts = [] for event in events: print "Loading event", event.fs_name() corpus = cuttsum.corpora.get_raw_corpus(event) # A list of dataframes. Each dataframe is a document with =< 20 sentences. # This is the events document stream. dataframes = res.get_dataframes(event, corpus, extractor, threshold, delay, topk) if event.query_num in training_ids: training_insts.append((event, dataframes)) if event.query_num in test_ids: test_insts.append((event, dataframes)) # Init l2s task. vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet --search_no_caching") #task = vw.init_search_task(UpdateSummarizer) if learner == "PerfectOracle": task = vw.init_search_task(PerfectOracle) elif learner == "LessPerfectOracle": task = vw.init_search_task(LessPerfectOracle) elif learner == "SelectLexNextOracle": task = vw.init_search_task(SelectLexNextOracle) elif learner == "SelectLexNextLex": task = vw.init_search_task(SelectLexNextLex) elif learner == "SelectLexNextLexCache": task = vw.init_search_task(SelectLexNextLexCache) elif learner == "SelectLexGenericNextOracle": task = vw.init_search_task(SelectLexGenericNextOracle) elif learner == "SelectBasicNextBias": task = vw.init_search_task(SelectBasicNextBias) elif learner == "SelectBasicNextBiasDocAvg": task = vw.init_search_task(SelectBasicNextBiasDocAvg) for n_iter in range(n_iters): print "iter", n_iter + 1 ds = downsample(training_insts, size=sample_size) task.learn(ds) all_train_df = [df for inst in training_insts for df in inst[1]] feature_weights = task.get_feature_weights(all_train_df) write_model(feature_weights, report_dir_base, n_iter) for event, dataframes in training_insts: # Predict a sequence for 
this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "train", n_iter, report_dir_base) for event, dataframes in test_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "test", n_iter, report_dir_base)
# wow! your data can be ANY type you want... does NOT have to be VW examples DET = 1 NOUN = 2 VERB = 3 ADJ = 4 my_dataset = [[(DET, 'the'), (NOUN, 'monster'), (VERB, 'ate'), (DET, 'a'), (ADJ, 'big'), (NOUN, 'sandwich')], [(DET, 'the'), (NOUN, 'sandwich'), (VERB, 'was'), (ADJ, 'tasty')], [(NOUN, 'it'), (VERB, 'ate'), (NOUN, 'it'), (ADJ, 'all')]] # initialize VW as usual, but use 'hook' as the search_task vw = pyvw.vw( "--search 4 --quiet --search_task hook --search_no_snapshot --ring_size 1024" ) # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(SequenceLabeler) # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above print >> sys.stderr, 'training!' for curPass in range(10): sequenceLabeler.learn(my_dataset.__iter__) # now see the predictions on a test sentence print >> sys.stderr, 'predicting!' print sequenceLabeler.predict([(0, w) for w in "the sandwich ate a monster".split()]) print 'should have printed: [1, 2, 3, 1, 2]'
ex = self.example({'w': [word + '_' + str(p)]}, labelType=self.vw.lCostSensitive) ex.set_label_string(str(p) + ':0') return ex def _run(self, sentence): # it's called _run to remind you that you shouldn't call it directly! output = [] for n in range(len(sentence)): pos,word = sentence[n] # use "with...as..." to guarantee that the example is finished properly ex = [ self.makeExample(word,p) for p in [DET,NOUN,VERB,ADJ] ] pred = self.sch.predict(examples=ex, my_tag=n+1, oracle=pos-1, condition=(n,'p')) output.append(pred + 1) return output # initialize VW as usual, but use 'hook' as the search_task vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 1024") # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(SequenceLabeler) # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above print >>sys.stderr, 'training!' i = 0 while i < 100000000: sequenceLabeler.learn(my_dataset) i += 1 # now see the predictions on a test sentence print >>sys.stderr, 'predicting!' print sequenceLabeler.predict( [(1,w) for w in "the sandwich ate a monster".split()] ) print 'should have printed: [1, 2, 3, 1, 2]'
def compute_reference(prev, truth): if truth == BIO('O') or truth == BIO('B'): ret = [] if truth == BIO('O'): for key in valid_labels: ret.append(BIO('O',key)) if truth == BIO('B'): for key in valid_labels: ret.append(BIO('B',key)) return ret # TODO ret = [] if prev != BIO('I') and prev != BIO('B'): #ret.append(BIO('B')) for key in valid_labels: ret.append(BIO('O',key)) return ret else: for key in valid_labels: ret.append(BIO('I',key)) return ret class MWE(pyvw.SearchTask): def __init__(self, vw, sch, num_actions): # you must must must initialize the parent class # this will automatically store self.sch <- sch, self.vw <- vw pyvw.SearchTask.__init__(self, vw, sch, num_actions) # for now we will use AUTO_HAMMING_LOSS; in Part II, you should remove this and implement a more task-focused loss # like one-minus-F-measure. sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES ) def _run(self, sentence): output = [] prev = BIO('O') # store the previous prediction for n in range(len(sentence)): # label is a BIO, word is a string and pos is a string label,word,lemma,pos = sentence[n] with self.make_example(word, lemma, pos) as ex: # construct the VW example # first, compute the numeric labels for all valid reference actions refs = [ bio.numeric_label for bio in compute_reference(prev, label) ] # next, because some actions are invalid based on the # previous decision, we need to compute a list of # valid actions available at this point valid = [ bio.numeric_label for bio in prev.valid_next() ] # make a prediction pred = self.sch.predict(examples = ex, my_tag = n+1, oracle = refs, condition = [(n, 'p'), (n-1, 'q')], allowed = valid) # map that prediction back to a BIO label this = numeric_label_to_BIO(pred) # append it to output output.append(this) # update the 'previous' prediction to the current prev = this # return the list of predictions as BIO labels return output def make_example(self, word, lemma, pos): ex = self.example({'w': [word], 'l': [lemma], 'p': 
[pos]},labelType=self.vw.lCostSensitive) ex.set_label_string(str(pos)+':0') return ex def make_data(BIO,filename): data = [] sentence = [] f = open(filename,'r') for l in f: l = l.strip() # at end of sentence if l == "": data.append(sentence) sentence = [] else: [offset,word,lemma,pos,mwe,parent,strength,ssense,sid] = l.split('\t') sentence.append((BIO(mwe),word,lemma,pos)) return data if __name__ == "__main__": # input/output files trainfilename='dimsum16.p3.train.contiguous' testfilename='dimsum16.p3.test.contiguous' outfilename='dimsum16.p3.test.contiguous.out' # read in some examples to be used as training/dev set train_data = make_data(BIO,trainfilename) # initialize VW and sequence labeler as learning to search vw = pyvw.vw(search=9, quiet=True, search_task='hook', ring_size=1024, \ search_rollin='learn', search_rollout='none') # tell VW to construct your search task object sequenceLabeler = vw.init_search_task(MWE) # train! # we make 5 passes over the training data, training on the first 80% # examples (we retain the last 20% as development data) print 'training!' N = int(0.8 * len(train_data)) for i in xrange(5): print 'iteration ', i, ' ...' sequenceLabeler.learn(train_data[0:N]) # now see the predictions on 20% held-out sentences print 'predicting!' hamming_loss, total_words = 0,0 for n in range(N, len(train_data)): truth = [label for label,word,lemma,pos in train_data[n]] pred = sequenceLabeler.predict( [(BIO('O'),word,lemma,pos) for label,word,lemma,pos in train_data[n]] ) for i,t in enumerate(truth): if t != pred[i]: hamming_loss += 1 total_words += 1 # print 'predicted:', '\t'.join(map(str, pred)) # print ' truth:', '\t'.join(map(str, truth)) # print '' print 'total hamming loss on dev set:', hamming_loss, '/', total_words
# NOTE(review): _run takes `self`; it is presumably a method of the
# LatentVariableClassifier task created below, whose class header is not part
# of this snippet -- confirm and indent accordingly.
def _run(self, y_x):
    """Run the two-step search for one labelled point and return the prediction.

    `y_x` unpacks as (y, (x0, x1)).  The first predict call has no oracle; its
    result r is turned into h = 2*r - 3.  The second call sees the extra
    feature x0*h, uses y as oracle and is conditioned on the first step.
    A loss of 0 is declared when the final prediction equals y, else 1.
    """
    y, point = y_x
    x0, x1 = point

    # step 1: unsupervised (latent) decision
    latent_example = self.example({'x': [('x0', x0), ('x1', x1)]})
    latent_action = self.sch.predict(examples=latent_example, my_tag=1, oracle=None)
    h = latent_action * 2 - 3

    # step 2: supervised decision, with the latent value folded into 'x0h'
    final_example = self.example({'x': [('x0', x0), ('x1', x1), ('x0h', x0 * h)]})
    p = self.sch.predict(examples=final_example, my_tag=2, oracle=y,
                         condition=(1, 'h'))

    if p == y:
        self.sch.loss(0.)
    else:
        self.sch.loss(1.)
    return p


# Four (label, (x0, x1)) points: label 1 when both coordinates share a sign,
# label 2 when they differ.
my_dataset = [
    (1, (-1, -1)),
    (1, (+1, +1)),
    (2, (-1, +1)),
    (2, (+1, -1)),
]

vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --search_alpha 1e-2")
lv = vw.init_search_task(LatentVariableClassifier)

# 100 passes over the four points
print('training')
for i in range(100):
    lv.learn(my_dataset)

# predict every point again, this time with the label slot set to 0
print('testing')
for (y, x) in my_dataset:
    print('pred =', lv.predict((0, x)))
# NOTE(review): this snippet starts inside a function whose `def` line is not
# part of it (presumably make_data, which the script below calls) -- confirm
# against the full file.
            # one (label, word, lemma, pos) tuple per token
            sentence.append((BIO(mwe),word,lemma,pos))
    return data

if __name__ == "__main__":
    # input/output files
    # (only trainfilename is used within this snippet)
    trainfilename='dimsum16.p3.train.contiguous'
    testfilename='dimsum16.p3.test.contiguous'
    outfilename='dimsum16.p3.test.contiguous.out'

    # read in some examples to be used as training/dev set
    train_data = make_data(BIO,trainfilename)

    # initialize VW and sequence labeler as learning to search
    vw = pyvw.vw(search=9, quiet=True, search_task='hook', ring_size=1024, \
                 search_rollin='learn', search_rollout='none')

    # tell VW to construct your search task object
    sequenceLabeler = vw.init_search_task(MWE)

    # train!
    # we make 5 passes over the training data, training on the first 80%
    # examples (we retain the last 20% as development data)
    print 'training!'
    N = int(0.8 * len(train_data))
    for i in xrange(5):
        print 'iteration ', i, ' ...'
        sequenceLabeler.learn(train_data[0:N])

    # now see the predictions on 20% held-out sentences
    print 'predicting!'
import pyvw def my_predict(vw, ex): pp = 0. for f,v in ex.iter_features(): pp += vw.get_weight(f) * v return pp def ensure_close(a,b,eps=1e-6): if abs(a-b) > eps: raise Exception("test failed: expected " + str(a) + " and " + str(b) + " to be " + str(eps) + "-close, but they differ by " + str(abs(a-b))) ############################################################################### vw = pyvw.vw("--quiet") ############################################################################### vw.learn("1 |x a b") ############################################################################### print '# do some stuff with a read example:' ex = vw.example("1 |x a b |y c") ex.learn() ; ex.learn() ; ex.learn() ; ex.learn() updated_pred = ex.get_updated_prediction() print 'current partial prediction =', updated_pred # compute our own prediction print ' my view of example =', str(list(ex.iter_features())) my_pred = my_predict(vw, ex)
# print 'pred', pred # print 'performing action', action # print parser.perform_action(action) output.append(pred) n += 1 loss = parser.loss() self.sch.loss(loss) parser.stop() print 'parsed doc', doc_id, 'with loss', loss return output def get_label(self, action): return 1 if action['action'] == 'shift' else 2 def get_action(self, label): return self.SHIFT if label == 1 else self.REDUCE if __name__ == '__main__': disco = DiscoSession() dataset = disco.get_doc_ids() vw = pyvw.vw("--search 2 --quiet --search_task hook --ring_size 1024 --search_no_caching -f disco.vw") parser = vw.init_search_task(DiscourseParser) print 'training ...' for i in xrange(5): parser.learn(dataset) vw.finish() print 'done!'
def do_work(
    training_events,
    test_event,
    sample_size,
    samples_per_event,
    gold_probs,
    iters,
    l2,
    log_time,
    semsims,
    dfdeltas,
    use_best_feats,
    use_i_only,
    use_abs_df,
):
    """Train a Summarizer search task and score it on one held-out event.

    Parameters, as used in the body:
      training_events   -- iterable of events; each is passed to
                           get_input_stream together with gold_probs
      test_event        -- held-out event; its type, fs_name() and query_id
                           are read
      sample_size       -- forwarded to ds() when sampling a training stream
      samples_per_event -- number of samples drawn per training event
      gold_probs        -- forwarded to get_input_stream
      iters             -- number of training iterations (1..iters)
      l2                -- value formatted into the "--l2" VW option
      log_time, use_best_feats, use_i_only, use_abs_df
                        -- copied onto the task object as attributes
      semsims           -- mapping indexed by event.type whose values expose
                           transform()
      dfdeltas          -- callable invoked with an event

    Returns (scores_df, weights_df, summary_df): per-iteration scores, the
    concatenated feature-weight frames, and one row per sentence selected on
    the test stream; the first two get an "event" column set to
    test_event.query_id.
    """
    training_streams = []
    summary = []

    for event in training_events:
        df = get_input_stream(event, gold_probs)
        training_streams.append((event, df))

    # the held-out instance: (event, stream, transformed stems, dfdeltas)
    test_df = get_input_stream(test_event, gold_probs)
    test_X_l = semsims[test_event.type].transform(test_df["stems"].apply(lambda x: " ".join(x)).tolist())
    test_stream = (test_event, test_df, test_X_l, dfdeltas(test_event))

    vw = pyvw.vw(
        ("--l2 {} --search 2 --search_task hook --ring_size 1024 " + "--search_no_caching --noconstant --quiet").format(
            l2
        )
    )
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    task.log_time = log_time

    all_scores = []
    all_weights = []

    # draw the training instances once, up front; they are reshuffled (not
    # resampled) in every iteration below
    instances = []
    for sample in xrange(samples_per_event):
        for event, stream in training_streams:
            # resample until at least one sentence of the sample has a nugget
            # NOTE(review): this never terminates if no sample of the stream
            # can contain a nugget -- confirm inputs guarantee one.
            while 1:
                sample_stream = ds(stream, sample_size=sample_size)
                if (sample_stream["nuggets"].apply(len) > 0).any():
                    break
            X_l = semsims[event.type].transform(sample_stream["stems"].apply(lambda x: " ".join(x)).tolist())
            instances.append((event, sample_stream, X_l, dfdeltas(event)))

    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0
        # instances = [(event, ds(stream, sample_size=sample_size))
        #              for event, stream in training_streams
        #              for sample in xrange(samples_per_event)]
        random.shuffle(instances)
        # learn on one instance at a time, printing progress
        for i, inst in enumerate(instances):
            print "{}.{}.{}/{}".format(test_event.fs_name(), n_iter, i, len(instances))
            task.learn([inst])
        print "{}.{}.p".format(test_event.fs_name(), n_iter)

        # average the training-set metrics returned by predict()
        train_egain = 0
        train_comp = 0
        train_f1 = 0
        train_loss = 0
        for i, inst in enumerate(instances):
            egain, comp, f1, loss, train_sum = predict(task, inst, n_iter)
            train_egain += egain
            train_comp += comp
            train_f1 += f1
            train_loss += loss
        # NOTE(review): these divisions raise ZeroDivisionError when
        # `instances` is empty (no training events or samples_per_event == 0).
        train_egain = train_egain / float(len(instances))
        train_comp = train_comp / float(len(instances))
        train_f1 = train_f1 / float(len(instances))
        train_loss = train_loss / float(len(instances))
        print "{} {} train loss {}".format(test_event.query_id, n_iter, train_loss)

        pred = task.predict(test_stream)

        # snapshot the feature weights of this iteration
        select_df, next_df = task.get_feature_weights()
        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter
        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)

        # map the numeric predictions to "SELECT" / "SKIP"
        pred = ["SELECT" if p == SELECT else "SKIP" for p in pred]

        # every nugget that occurs anywhere in the test stream
        all_nuggets = set()
        for nuggets in test_stream[1]["nuggets"].tolist():
            all_nuggets.update(nuggets)

        loss = 0
        y_int_y_hat = 0
        size_y = 0
        size_y_hat = 0
        # nuggets covered so far by the selected sentences
        nuggets = set()
        for action, (_, sent) in izip(pred, test_stream[1].iterrows()):
            # gain = number of nuggets of this sentence not yet covered
            gain = len(sent["nuggets"] - nuggets)
            if action == "SELECT":
                if gain == 0:
                    loss += 1
                summary.append(
                    {
                        "event": test_event.query_id,
                        "iter": n_iter,
                        "update id": sent["update id"],
                        "timestamp": sent["timestamp"],
                        "gain": gain,
                        "nuggets": ",".join(sent["nuggets"]),
                        "update text": sent["pretty text"],
                    }
                )
                nuggets.update(sent["nuggets"])
            else:
                if gain > 0:
                    loss += 1

            # the oracle selects exactly the sentences that add a new nugget
            if gain > 0:
                oracle = "SELECT"
            else:
                oracle = "SKIP"

            # y_int_y_hat counts agreements on SELECT, size_y the oracle's
            # SELECTs and size_y_hat the predicted SELECTs
            if action == "SELECT" and oracle == "SELECT":
                y_int_y_hat += 1
                size_y += 1
                size_y_hat += 1
            elif action == "SELECT" and oracle == "SKIP":
                size_y_hat += 1
            elif action == "SKIP" and oracle == "SELECT":
                size_y += 1

        # diagnostic output when nothing was selected
        if size_y_hat == 0:
            print test_event
            print (test_stream[1]["nuggets"].apply(len) > 0).any()

        # this overwrites the error count accumulated in `loss` above
        # NOTE(review): raises ZeroDivisionError when size_y and size_y_hat
        # are both 0 (nothing selected and nothing worth selecting).
        loss = 1 - float(y_int_y_hat) / (size_y + size_y_hat)

        # expected gain: covered nuggets per selected sentence
        if len(nuggets) > 0:
            egain = len(nuggets) / sum([1.0 if a == "SELECT" else 0.0 for a in pred])
        else:
            egain = 0
        # comprehensiveness: covered nuggets over all nuggets of the stream
        # NOTE(review): raises ZeroDivisionError when the test stream has no
        # nuggets at all.
        comp = len(nuggets) / float(len(all_nuggets))

        all_scores.append(
            {
                "iter": n_iter,
                "Comp.": comp,
                "E[gain]": egain,
                "Loss": loss,
                "Avg. Train Loss": train_loss,
                "Avg. Train E[gain]": train_egain,
                "Avg. Train Comp.": train_comp,
                "Avg. Train F1": train_f1,
            }
        )
        print "{}.{}.p E[gain]={:0.6f} Comp.={:0.6f} Train Loss={:0.6f}".format(
            test_event.fs_name(), n_iter, egain, comp, train_loss
        )

    scores_df = pd.DataFrame(
        all_scores,
        columns=[
            "iter",
            "E[gain]",
            "Comp.",
            "Loss",
            "Avg. Train Loss",
            "Avg. Train E[gain]",
            "Avg. Train Comp.",
            "Avg. Train F1",
        ],
    )
    weights_df = pd.concat(all_weights)
    weights_df["event"] = test_event.query_id
    scores_df["event"] = test_event.query_id
    summary_df = pd.DataFrame(
        summary, columns=["iter", "event", "update id", "timestamp", "gain", "update text", "nuggets"]
    )
    return scores_df, weights_df, summary_df
def my_predict(vw, ex): pp = 0. for f, v in ex.iter_features(): pp += vw.get_weight(f) * v return pp def ensure_close(a, b, eps=1e-6): if abs(a - b) > eps: raise Exception("test failed: expected " + str(a) + " and " + str(b) + " to be " + str(eps) + "-close, but they differ by " + str(abs(a - b))) ###############################################################################3 vw = pyvw.vw("--quiet") ###############################################################################3 vw.learn("1 |x a b") ###############################################################################3 print '# do some stuff with a read example:' ex = vw.example("1 |x a b |y c") ex.learn() ex.learn() ex.learn() ex.learn() updated_pred = ex.get_updated_prediction() print 'current partial prediction =', updated_pred # compute our own prediction
def main(argv): if len(argv) < -1: print "usage python concept_relation_joint_learning.py concept_training_dataset_p concept_test_dataset_p span_concept_dict_p vnpb_words_concepts_dict_p relation_train_dataset_p relation_test_dataset_p kbest_dep_parse_p original_amr_aligned nodes_relation_dict_p" return global edgeLabelsList global span_concept_dict_p global vnpb_words_concepts_dict_p global dep_parse global gold_relation_dict global concept_labels global concept_map global relation_map global training_id_dict global flag global seen_dict global prev_sent_id global nodes_relation_dict_in global nodes_relation_dict_out global nodes_relation_dict_pair global dep_parse_nx #change this is you want to use debugger debug = False if debug: concept_training_dataset_p = "../data/amr-release-1.0-training-proxy/concept_dataset.p" concept_test_dataset_p = "../data/amr-release-1.0-test-proxy/concept_dataset.p" span_concept_dict_p = "../data/amr-release-1.0-training-proxy/span_concept_dict.p" vnpb_words_concepts_dict_p = "../data/amr-release-1.0-training-proxy/vnpb_words_concepts_dict.p" relation_training_dataset_p = "../data/amr-release-1.0-training-proxy/relation_dataset.p" kbest_dep_parse_p_train = "../data/amr-release-1.0-training-proxy/dep_parse.p" kbest_dep_parse_p_test = "../data/amr-release-1.0-test-proxy/dep_parse.p" original_amr_aligned = "../data/amr-release-1.0-test-proxy/amr-release-1.0-test-proxy.aligned" nodes_relation_dict_p = "../data/amr-release-1.0-training-proxy/nodes_relation_dict.p" amr_out_file_name = "proxy-out-temp_quant_d" else: concept_training_dataset_p = argv[0] concept_test_dataset_p = argv[1] span_concept_dict_p = argv[2] vnpb_words_concepts_dict_p = argv[3] relation_training_dataset_p = argv[4] kbest_dep_parse_p_train = argv[5] kbest_dep_parse_p_test = argv[6] original_amr_aligned = argv[7] nodes_relation_dict_p = argv[8] amr_out_file_name = argv[9] print "Starting Up!" 
nodes_relation_dict_out, nodes_relation_dict_in, nodes_relation_dict_pair = pickle.load(open(nodes_relation_dict_p)) #Format of concept_training_dataset #concept_training_dataset = {id: [span, pos, concept]} #concept_training_dataset = pickle.load(open("data/amr-release-1.0-training-proxy/concept_training_dataset_p", "rb")) #Read original amr amr_dict, ids_in_order, id_lines_in_order, sent_lines_in_order = read_amr(open(original_amr_aligned)) #print len(amr_dict) #Prepare training data concept_training_dataset = pickle.load(open(concept_training_dataset_p, "rb")) gold_relation_dict = pickle.load(open(relation_training_dataset_p, "rb")) dep_parse = pickle.load(open(kbest_dep_parse_p_train, "rb")) dep_parse_nx = {} for each_id in dep_parse: each_dp = dep_parse[each_id] dep_parse_graph_u = nx.Graph() dep_parse_graph_d = nx.DiGraph() for each_src in each_dp: for each_tgt in each_dp[each_src]: dep_parse_graph_u.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]}) dep_parse_graph_d.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]}) dep_parse_nx[each_id] = (dep_parse_graph_u, dep_parse_graph_d) training_id_dict = {} count = 0 subcount = 0 lens = [] training_sentences = [] for id, concept_training_data in concept_training_dataset.iteritems(): current_spans = [] training_id_dict[id] = count i = 0 for span_index, [span, pos, concept, name, ner] in enumerate(concept_training_data): num_words = len(span.split()) current_spans.append(Span(span, pos, range(i, i+num_words), ner, concept)) i += num_words training_sentence = Sentence(id, current_spans) lens.append(len(training_sentence.spans)) count += 1 if len(training_sentence.spans) <= 10: subcount += 1 training_sentences.append(training_sentence) #print subcount, count #print sorted(lens, reverse=True)[:100] amr_out_file = open(amr_out_file_name, 'w') #Prepare vw parameters N = len(training_sentences) #N = 1 #N = 10 vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 2048 --search_no_caching -q 
a: ") task = vw.init_search_task(ConceptRelationLearning) prev_sent_id = training_sentences[0].id #Start training print "Learning.." start_time = time.time() for p in range(1): seen_dict = {} task.learn(training_sentences[:]) print "Time taken: " + str(time.time() - start_time) flag = False #Prepare test data concept_test_dataset = pickle.load(open(concept_test_dataset_p, "rb")) dep_parse = pickle.load(open(kbest_dep_parse_p_test, "rb")) dep_parse_nx = {} for each_id in dep_parse: each_dp = dep_parse[each_id] dep_parse_graph_u = nx.Graph() dep_parse_graph_d = nx.DiGraph() for each_src in each_dp: for each_tgt in each_dp[each_src]: dep_parse_graph_u.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]}) dep_parse_graph_d.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]}) dep_parse_nx[each_id] = (dep_parse_graph_u, dep_parse_graph_d) gold_relation_dict = {} test_sentences = [] for id, concept_test_data in concept_test_dataset.iteritems(): current_spans = [] i = 0 for span_index, [span, pos, concept, name, ner] in enumerate(concept_test_data): num_words = len(span.split()) current_spans.append(Span(span, pos, range(i, i+num_words), ner, concept)) #print current_spans[-1].word_positions i += num_words test_sentence = Sentence(id, current_spans) test_sentences.append(test_sentence) #test_sentences = test_sentences[:10] #Start testing start_time = time.time() print "Testing.." 
#print len(test_sentences) predictions = [] t2 = [] test_sentences_dict = {} for test_sentence in test_sentences: id = test_sentence.id test_sentences_dict[id] = test_sentence i = 0 for id in ids_in_order: #print id id_line = id_lines_in_order[i] sent_line = sent_lines_in_order[i] test_sentence = test_sentences_dict[id] predicted, node_exp, root_index = task.predict(eraseAnnotations(test_sentence)) predictions.append(predicted) t2.append(test_sentence) write_amr_to_file(test_sentence.id, predicted, node_exp, concept_map, relation_map, amr_out_file, nodes_relation_dict_out, nodes_relation_dict_in, nodes_relation_dict_pair, root_index, id_line, sent_line) predictions[i] = predicted i+=1 print "Time taken: " + str(time.time() - start_time) amr_out_file.close()
# NOTE(review): this snippet starts inside the feature dict of a
# `with ... as ex:` statement; the enclosing method (it returns `output` and
# is presumably the _run of the CovingtonDepParser task used below) begins
# outside the snippet, so the indentation here is a best guess -- confirm.
                                      # 'p': word-pair features, with and without direction
                                      'p': [wordN + '_' + wordM, dir + '_' + wordN + '_' + wordM],
                                      # 'd': thresholded distance features on m - n
                                      'd': [ str(m - n <= d) + '<=' + str(d) for d in [-8, -4, -2, -1, 1, 2, 4, 8] ] +
                                           [ str(m - n >= d) + '>=' + str(d) for d in [-8, -4, -2, -1, 1, 2, 4, 8] ]
                                      }) as ex:
                    # each (m, n) pair gets its own tag, (m + 1) * N + n + 1
                    pred = self.sch.predict(examples=ex,
                                            my_tag=(m + 1) * N + n + 1,
                                            oracle=isParent,
                                            condition=[
                                                (max(0, (m) * N + n + 1), 'p'),
                                                (max(0, (m + 1) * N + n), 'q')
                                            ])
                    # a prediction of 2 records m in output[n] and stops the
                    # search over m for this n
                    if pred == 2:
                        output[n] = m
                        break
        return output

vw = pyvw.vw("--search 2 --quiet --search_task hook --ring_size 1024")
task = vw.init_search_task(CovingtonDepParser)
for p in range(10):  # do ten passes over the training data
    task.learn(my_dataset.__iter__)
# predict a test sentence given as (word, -1) pairs
print 'testing'
print task.predict([(w, -1) for w in "the monster ate a sandwich".split()])
print 'should have printed [ 1 2 -1 4 2 ]'