def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    checkpoint = model_dir.get_best_weights()
    reader = tf.train.NewCheckpointReader(checkpoint)

    if reader.has_tensor("weight_embed_context_lm/layer_0/w"):
        x = "w"
    else:
        x = "ELMo_W_0"

    for i in reader.get_variable_to_shape_map().items():
        print(i)

    input_w = reader.get_tensor(
        "weight_embed_lm/layer_0/%s/ExponentialMovingAverage" % x)
    output_w = reader.get_tensor(
        "weight_lm/layer_0/%s/ExponentialMovingAverage" % x)

    print("Input")
    print(input_w)
    print("(Softmax): " + str(softmax(input_w)))
    print("Output")
    print(output_w)
    print("(Softmax): " + str(softmax(output_w)))
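# The script above calls a `softmax` helper that is not defined in this file.
# A minimal sketch of what such a helper could look like (a standard,
# numerically stable softmax over a 1-D numpy array); the actual project may
# import an equivalent from scipy or a local utility module instead.
import numpy as np

def softmax(x):
    # Shift by the max value so the exponentials cannot overflow.
    e = np.exp(x - np.max(x))
    return e / e.sum()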
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('name', help='name of output to examine')
    parser.add_argument('--eval', "-e", action="store_true")
    args = parser.parse_args()
    resume_training(ModelDir(args.name), start_eval=args.eval)
def init():
    global model, model_dir
    print("Loading Model...")

    # Load the model
    model_dir = ModelDir(
        "pretrained_models/models/triviaqa-unfiltered-shared-norm")
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only")
def main(): print('Starting...') model_dir = ModelDir(OPTS.model) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError("This script is built to work for ParagraphQuestionModel models only") input_data, vocab = read_input_data(model) print('Loading word vectors...') model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab) print('Starting Tensorflow session...') sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with sess.as_default(): prediction = model.get_prediction() # Take 0-th here because we know we only truncate to one paragraph start_logits_tf = prediction.start_logits[0] end_logits_tf = prediction.end_logits[0] none_logit_tf = prediction.none_logit[0] context_rep_tf = model.context_rep[0] m1_tf = model.predictor.m1[0] m2_tf = model.predictor.m2[0] model_dir.restore_checkpoint(sess) with open(OPTS.output_file, 'w') as f: for doc_raw, q_raw, context, ex in tqdm(input_data): encoded = model.encode(ex, is_train=False) start_logits, end_logits, none_logit, context_rep, m1, m2 = sess.run( [start_logits_tf, end_logits_tf, none_logit_tf, context_rep_tf, m1_tf, m2_tf], feed_dict=encoded) beam, p_na = logits_to_probs( doc_raw, context[0], start_logits, end_logits, none_logit, beam_size=OPTS.beam_size) inputs = [context_rep, m1, m2] vec = np.concatenate([np.amax(x, axis=0) for x in inputs] + [np.amin(x, axis=0) for x in inputs] + [np.mean(x, axis=0) for x in inputs]) #span_logits = np.add.outer(start_logits, end_logits) #all_logits = np.concatenate((np.array([none_logit]), span_logits.flatten())) #log_partition = scipy.special.logsumexp(all_logits) #vec = np.concatenate([ # np.amax(context_rep, axis=0), # np.amin(context_rep, axis=0), # np.mean(context_rep, axis=0), # [np.amax(start_logits), scipy.special.logsumexp(start_logits), # np.amax(end_logits), scipy.special.logsumexp(end_logits), # none_logit, log_partition] #]) out_obj = {'paragraph': doc_raw, 'question': q_raw, 'beam': beam, 'p_na': p_na} if not OPTS.no_vec: out_obj['vec'] = vec.tolist() print(json.dumps(out_obj), file=f)
def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    if isinstance(model, ParagraphQuestionModel):
        run_paragraph_model(model_dir, model)
    elif isinstance(model, ElmoQaModel):
        run_elmo_model(model_dir, model)
    else:
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel and ElmoQaModel models only")
def resume_training(model_to_resume: str, dataset_oversampling: Dict[str, int],
                    checkpoint: Optional[str] = None, epochs: Optional[int] = None):
    """Resume training on a partially trained model (or fine-tune an existing model)

    :param model_to_resume: path to the model directory of the model to resume training
    :param dataset_oversampling: dictionary mapping dataset names to integer counts of
        how much to oversample them
    :param checkpoint: optional string specifying which checkpoint to resume from.
        Uses the latest checkpoint if not specified
    :param epochs: optional int specifying how many epochs to train for.
        Defaults to 24 if not specified
    """
    out = ModelDir(model_to_resume)
    train_params = out.get_last_train_params()
    evaluators = train_params["evaluators"]
    params = train_params["train_params"]
    params.num_epochs = epochs if epochs is not None else 24
    model = out.get_model()
    notes = None
    dry_run = False
    data = prepare_data(model, TrainConfig(), dataset_oversampling)
    if checkpoint is None:
        checkpoint = tf.train.latest_checkpoint(out.save_dir)
    _train_async(model=model,
                 data=data,
                 checkpoint=checkpoint,
                 parameter_checkpoint=None,
                 save_start=False,
                 train_params=params,
                 evaluators=evaluators,
                 out=out,
                 notes=notes,
                 dry_run=dry_run,
                 start_eval=False)
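# A minimal sketch of how `resume_training` above might be driven from the
# command line. The function name and flag names here are illustrative and not
# part of the original script.
import argparse

def resume_training_cli():
    parser = argparse.ArgumentParser(description="Resume or fine-tune a model")
    parser.add_argument("model", help="model directory to resume")
    parser.add_argument("--checkpoint", default=None,
                        help="checkpoint to resume from (default: latest)")
    parser.add_argument("--epochs", type=int, default=None,
                        help="number of epochs to train for")
    parser.add_argument("--oversample", nargs=2, action="append", default=[],
                        metavar=("DATASET", "COUNT"),
                        help="oversample a dataset, e.g. --oversample squad 2")
    args = parser.parse_args()
    oversampling = {name: int(count) for name, count in args.oversample}
    resume_training(args.model, oversampling,
                    checkpoint=args.checkpoint, epochs=args.epochs)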
def convert_model_pickle(model_dir, output_dir):
    print("Updating model...")
    md = ModelDir(model_dir)
    model = md.get_model()

    # Remove the lm model's word embeddings - the cpu model will use the Char-CNN
    model.lm_model.embed_weights_file = None

    dim = model.embed_mapper.layers[1].n_units
    model.embed_mapper.layers = [
        model.embed_mapper.layers[0],
        BiRecurrentMapper(CompatGruCellSpec(dim)),
        model.embed_mapper.layers[2]
    ]
    model.match_encoder.layers = list(model.match_encoder.layers)
    other = model.match_encoder.layers[1].other
    other.layers = list(other.layers)
    other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred = model.predictor.predictor
    pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    with open(join(output_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)
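# A sketch of a small driver for `convert_model_pickle`; the function and
# argument names below are illustrative. The pickled CPU-compatible model it
# writes to `output_dir/model.pkl` can be loaded back with pickle, as the
# conversion test code further below does.
import argparse

def convert_cli():
    parser = argparse.ArgumentParser(description="Convert a model for CPU use")
    parser.add_argument("model_dir", help="directory of the trained model")
    parser.add_argument("output_dir", help="where to write model.pkl")
    args = parser.parse_args()
    convert_model_pickle(args.model_dir, args.output_dir)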
def main(): parser = argparse.ArgumentParser(description='') parser.add_argument("model") args = parser.parse_args() model_dir = ModelDir(args.model) checkpoint = model_dir.get_best_weights() print(checkpoint) if checkpoint is None: print("Show latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() else: print("Show best weights") reader = tf.train.NewCheckpointReader(checkpoint) param_map = reader.get_variable_to_shape_map() total = 0 for k in sorted(param_map): v = param_map[k] print('%s: %s' % (k, str(v))) total += np.prod(v) print("%d total" % total)
def run(): parser = argparse.ArgumentParser() parser.add_argument("squad_path", help="path to squad dev data file") parser.add_argument("output_path", help="path where evaluation json file will be written") parser.add_argument("--model-path", default="model", help="path to model directory") parser.add_argument("--n", type=int, default=None) parser.add_argument("-b", "--batch_size", type=int, default=100) parser.add_argument("--ema", action="store_true") args = parser.parse_args() squad_path = args.squad_path output_path = args.output_path model_dir = ModelDir(args.model_path) nltk.data.path.append("nltk_data") print("Loading data") docs = parse_squad_data(squad_path, "", NltkAndPunctTokenizer(), False) pairs = split_docs(docs) dataset = ParagraphAndQuestionDataset( pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True)) print("Done, init model") model = model_dir.get_model() loader = ResourceLoader(lambda a, b: load_word_vector_file( join(VEC_DIR, "glove.840B.300d.txt"), b)) lm_model = model.lm_model basedir = join(LM_DIR, "squad-context-concat-skip") lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt") lm_model.options_file = join( basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json") lm_model.weight_file = join( basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5") lm_model.embed_weights_file = None model.set_inputs([dataset], loader) print("Done, building graph") sess = tf.Session() with sess.as_default(): pred = model.get_prediction() best_span = pred.get_best_span(17)[0] all_vars = tf.global_variables() + tf.get_collection( tf.GraphKeys.SAVEABLE_OBJECTS) dont_restore_names = { x.name for x in all_vars if x.name.startswith("bilm") } print(sorted(dont_restore_names)) vars = [x for x in all_vars if x.name not in dont_restore_names] print("Done, loading weights") checkpoint = model_dir.get_best_weights() if checkpoint is None: print("Loading most recent checkpoint") checkpoint = model_dir.get_latest_checkpoint() else: print("Loading best weights") saver = tf.train.Saver(vars) saver.restore(sess, checkpoint) if args.ema: ema = tf.train.ExponentialMovingAverage(0) saver = tf.train.Saver( {ema.average_name(x): x for x in tf.trainable_variables()}) saver.restore(sess, checkpoint) sess.run( tf.variables_initializer( [x for x in all_vars if x.name in dont_restore_names])) print("Done, starting evaluation") out = {} for i, batch in enumerate(dataset.get_epoch()): if args.n is not None and i == args.n: break print("On batch: %d" % (i + 1)) enc = model.encode(batch, False) spans = sess.run(best_span, feed_dict=enc) for (s, e), point in zip(spans, batch): out[point.question_id] = point.get_original_text(s, e) sess.close() print("Done, saving") with open(output_path, "w") as f: json.dump(out, f) print("Mission accomplished!")
def main(): parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD') parser.add_argument('model', help='model directory to evaluate') parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file") parser.add_argument('-n', '--sample_questions', type=int, default=None, help="(for testing) run on a subset of questions") parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17], help="Max size of answer") parser.add_argument('-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes can be faster but uses more memory") parser.add_argument('-s', '--step', default=None, help="Weights to load, can be a checkpoint step or 'latest'") parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") parser.add_argument('--none_prob', action="store_true", help="Output none probability for samples") parser.add_argument('--elmo', action="store_true", help="Use elmo model") parser.add_argument('--per_question_loss_file', type=str, default=None, help="Run question by question and output a question_id -> loss output to this file") args = parser.parse_known_args()[0] model_dir = ModelDir(args.model) corpus = SquadCorpus() if args.corpus == "dev": questions = corpus.get_dev() else: questions = corpus.get_train() questions = split_docs(questions) if args.sample_questions: np.random.RandomState(0).shuffle(sorted(questions, key=lambda x: x.question_id)) questions = questions[:args.sample_questions] questions.sort(key=lambda x:x.n_context_words, reverse=True) dataset = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True)) evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")] if args.official_output is not None: evaluators.append(RecordSpanPrediction(args.answer_bounds[0])) if args.per_question_loss_file is not None: evaluators.append(RecordSpanPredictionScore(args.answer_bounds[0], args.batch_size, args.none_prob)) if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() model = model_dir.get_model() if args.elmo: model.lm_model.lm_vocab_file = './elmo-params/squad_train_dev_all_unique_tokens.txt' model.lm_model.options_file = './elmo-params/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json' model.lm_model.weight_file = './elmo-params/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5' model.lm_model.embed_weights_file = None evaluation = trainer.test(model, evaluators, {args.corpus: dataset}, corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus] # Print the scalar results in a two column table scalars = evaluation.scalars cols = list(sorted(scalars.keys())) table = [cols] header = ["Metric", ""] table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols]) print_table([header] + transpose_lists(table)) # Save the official output if args.official_output is not None: quid_to_para = {} for x in questions: quid_to_para[x.question_id] = x.paragraph q_id_to_answers = {} q_ids = evaluation.per_sample["question_id"] spans = evaluation.per_sample["predicted_span"] for q_id, (start, end) in zip(q_ids, spans): text = 
quid_to_para[q_id].get_original_text(start, end) q_id_to_answers[q_id] = text with open(args.official_output, "w") as f: json.dump(q_id_to_answers, f) if args.per_question_loss_file is not None: print("Saving result") output_file = args.per_question_loss_file ids = evaluation.per_sample["question_ids"] f1s = evaluation.per_sample["text_f1"] ems = evaluation.per_sample["text_em"] losses = evaluation.per_sample["loss"] if args.none_prob: none_probs = evaluation.per_sample["none_probs"] """ results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss), 'none_prob': float(none_prob)} for question_id, f1, em, loss, none_prob in zip(ids, f1s, ems, losses, none_probs)} """ results = {question_id: float(none_prob) for question_id, none_prob in zip(ids, none_probs)} else: results = {question_id: {'f1': float(f1), 'em': float(em), 'loss': float(loss)} for question_id, f1, em, loss in zip(ids, f1s, ems, losses)} with open(output_file, 'w') as f: json.dump(results, f)
def main(): parser = argparse.ArgumentParser(description='Run the demo server') parser.add_argument('model', help='Models to use') parser.add_argument( '-v', '--voc', help='vocab to use, only words from this file will be used') parser.add_argument('-t', '--tokens', type=int, default=400, help='Number of tokens to use per paragraph') parser.add_argument('--vec_dir', help='Location to find word vectors') parser.add_argument('--n_paragraphs', type=int, default=12, help="Number of paragraphs to run the model on") parser.add_argument('--span_bound', type=int, default=8, help="Max span size to return as an answer") parser.add_argument( '--tagme_api_key', help="Key to use for TAGME (tagme.d4science.org/tagme)") parser.add_argument('--bing_api_key', help="Key to use for bing searches") parser.add_argument('--tagme_thresh', default=0.2, type=float) parser.add_argument('--no_wiki', action="store_true", help="Dont use TAGME") parser.add_argument('--n_web', type=int, default=10, help='Number of web docs to fetch') parser.add_argument('--blacklist_trivia_sites', action="store_true", help="Don't use trivia websites") parser.add_argument('-c', '--wiki_cache', help="Cache wiki articles in this directory") parser.add_argument('--n_dl_threads', type=int, default=5, help="Number of threads to download documents with") parser.add_argument('--request_timeout', type=int, default=60) parser.add_argument('--download_timeout', type=int, default=25) parser.add_argument('--workers', type=int, default=1, help="Number of server workers") parser.add_argument('--debug', default=None, choices=["random_model", "dummy_qa"]) args = parser.parse_args() span_bound = args.span_bound if args.tagme_api_key is not None: tagme_api_key = args.tagme_api_key else: tagme_api_key = environ.get("TAGME_API_KEY") if args.bing_api_key is not None: bing_api_key = args.bing_api_key else: bing_api_key = environ.get("BING_API_KEY") if bing_api_key is None and args.n_web > 0: raise ValueError("If n_web > 0 you must give a BING_API_KEY") if args.debug is None: model = ModelDir(args.model) else: model = RandomPredictor(5, WithIndicators()) if args.vec_dir is not None: loader = LoadFromPath(args.vec_dir) else: loader = ResourceLoader() if args.debug == "dummy_qa": qa = DummyQa() else: qa = QaSystem( args.wiki_cache, MergeParagraphs(args.tokens), ShallowOpenWebRanker(args.n_paragraphs), args.voc, model, loader, bing_api_key, tagme_api_key=tagme_api_key, n_dl_threads=args.n_dl_threads, blacklist_trivia_sites=args.blacklist_trivia_sites, download_timeout=args.download_timeout, span_bound=span_bound, tagme_threshold=None if args.no_wiki else args.tagme_thresh, n_web_docs=args.n_web) logging.propagate = False formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s") handler = logging.StreamHandler() handler.setFormatter(formatter) logging.root.addHandler(handler) logging.root.setLevel(logging.DEBUG) app = Sanic() app.config.REQUEST_TIMEOUT = args.request_timeout @app.route("/answer") async def answer(request): try: question = request.args["question"][0] if question == "": return response.json({'message': 'No question given'}, status=400) spans, paras = await qa.answer_question(question) answers = select_answers(paras, spans, span_bound, 10) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError("Server Error", status_code=500) @app.route('/answer-from', methods=['POST']) async def answer_from(request): try: args = ujson.loads(request.body.decode("utf-8")) question = 
args.get("question") if question is None or question == "": return response.json({'message': 'No question given'}, status=400) doc = args["document"] if len(doc) > 500000: raise ServerError("Document too large", status_code=400) spans, paras = qa.answer_with_doc(question, doc) answers = select_answers(paras, spans, span_bound, 10) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError("Server Error", status_code=500) app.static('/', './docqa//server/static/index.html') app.static('/about.html', './docqa//service/static/about.html') app.run(host="0.0.0.0", port=8000, workers=args.workers, debug=False)
def main(): parser = argparse.ArgumentParser(description="Run an ELMo model on user input") # parser.add_argument("model", type=int, help="Model directory") parser.add_argument("question", help="Question to answer") parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+') args = parser.parse_args() # Models path SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad' SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm' TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm' TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm' models_directory = [ SQUAD_MODEL_DIRECTORY_PATH, SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH, TRIVIAQA_MODEL_DIRECTORY_PATH, TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH ] print("Preprocessing...") # Load the model # model_dir = ModelDir(args.model) model_dir = ModelDir(models_directory[0]) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError("This script is built to work for ParagraphQuestionModel models only") # Read the documents documents = [] for doc in args.documents: if not isfile(doc): raise ValueError(doc + " does not exist") with open(doc, "r") as f: documents.append(f.read()) print("Loaded %d documents" % len(documents)) # Split documents into lists of paragraphs documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(args.question) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] # Now select the top paragraphs using a `ParagraphFilter` if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=10) context = selector.prune(question, flatten_iterable(documents)) print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [model.preprocessor.encode_text(question, x) for x in context] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores 
being the pre-softmax logit for the span print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(10) # Loads the saved weights model_dir.restore_checkpoint(sess) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ParagraphAndQuestion(x, question, None, "user-question%d"%i) for i, x in enumerate(context)] print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model get the predictions encoded = model.encode(data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax(conf) # We get output for each paragraph, select the most-confident one to print print("Best Paragraph: " + str(best_para)) para_id = int(str(best_para)) # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0]))) print("Best Paragraph: \n" + " ".join(context[para_id])) print("Best span: " + str(best_spans[best_para])) print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1])) print("Confidence: " + str(conf[best_para]))
def main(): parser = argparse.ArgumentParser( description="Run an ELMo model on user input") parser.add_argument("model", help="Model directory") parser.add_argument("ja_filepath", help="File path to japanese questions") parser.add_argument("result_file", help="File path to predicted result json") args = parser.parse_args() print(args) print("Preprocessing...") paragraphs, questions = read_squad_style_database(args.ja_filepath) # Load the model model_dir = ModelDir(args.model) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError( "This script is built to work for ParagraphQuestionModel models only" ) paragraphs, questions = read_squad_style_database(args.ja_filepath) predictions = {} predictions["conf"] = {} for qa in questions: print(qa["id"]) title = qa["title"] para_idx = qa["para_idx"] context = paragraphs[title][para_idx] question = qa["question"] print(context) print(question) if model.preprocessor is not None: context = [ model.preprocessor.encode_text(question, x) for x in context ] print("Setting up model") voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc) print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with sess.as_default(): best_spans, conf = model.get_prediction().get_best_span(8) # Loads the saved weights model_dir.restore_checkpoint(sess) data = [ ParagraphAndQuestion(x, question, None, "user-question%d" % i) for i, x in enumerate(context) ] print("Starting run") encoded = model.encode( data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run( [best_spans, conf], feed_dict=encoded) # feed_dict -> predictions print(best_spans) predictions[qa["id"]] = best_spans predictions["conf"][qa["id"]] = conf print(predictions) result_f = open(args.result_file, "w") json.dump(predictions, result_f) exit() official_evaluator = OfficialEvaluator(args.ja_filepath, args.result_file) evaluation = official_evaluator.evaluate() print(evaluation)
def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only")
    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()

    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), set([',']),
                         word_vec_loader=loader, allow_update=True)
    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
        #best_spans_tf, conf_tf = prediction.get_best_span(MAX_SPAN_LENGTH)
        model_dir.restore_checkpoint(sess)

    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    app = bottle.Bottle()

    @app.route('/')
    def index():
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()
        document = re.split("\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [model.preprocessor.encode_text(question, x) for x in context]
        else:
            context = [flatten_iterable(x.text) for x in context]
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)]
        model.word_embed.update(loader, vocab)
        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf], feed_dict=encoded)
        beam, p_na = logits_to_probs(
            document_raw, context[0], start_logits, end_logits, none_logit,
            beam_size=BEAM_SIZE)
        return bottle.template('results', document=document_raw,
                               question=question_raw, beam=beam, p_na=p_na)

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
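# Example of posting to the `/post_query` route defined above with
# form-encoded fields. The host and port default values here are illustrative;
# the real values come from OPTS.hostname and OPTS.port. The response is the
# rendered HTML of the `results` template.
import requests

def query_demo(document_text, question_text, host="localhost", port=8080):
    r = requests.post("http://%s:%d/post_query" % (host, port),
                      data={"document": document_text, "question": question_text})
    r.raise_for_status()
    return r.text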
def run(): parser = argparse.ArgumentParser() parser.add_argument("input_data") parser.add_argument("output_data") parser.add_argument("--plot_dir", type=str, default=None) parser.add_argument("--model_dir", type=str, default="/tmp/model/document-qa") parser.add_argument("--lm_dir", type=str, default="/home/castle/data/lm/squad-context-concat-skip") parser.add_argument("--glove_dir", type=str, default="/home/castle/data/glove") parser.add_argument("--n", type=int, default=None) parser.add_argument("-b", "--batch_size", type=int, default=30) parser.add_argument("--ema", action="store_true") args = parser.parse_args() input_data = args.input_data output_path = args.output_data model_dir = ModelDir(args.model_dir) nltk.data.path.append("nltk_data") print("Loading data") docs = parse_squad_data(input_data, "", NltkAndPunctTokenizer(), False) pairs = split_docs(docs) dataset = ParagraphAndQuestionDataset(pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True)) print("Done, init model") model = model_dir.get_model() # small hack, just load the vector file at its expected location rather then using the config location loader = ResourceLoader(lambda a, b: load_word_vector_file(join(args.glove_dir, "glove.840B.300d.txt"), b)) lm_model = model.lm_model basedir = args.lm_dir plotdir = args.plot_dir lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt") lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json") lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5") lm_model.embed_weights_file = None model.set_inputs([dataset], loader) print("Done, building graph") sess = tf.Session() with sess.as_default(): pred = model.get_prediction() best_span = pred.get_best_span(17)[0] if plotdir != None: start_logits_op, end_logits_op = pred.get_logits() all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS) dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")} print(sorted(dont_restore_names)) vars = [x for x in all_vars if x.name not in dont_restore_names] print("Done, loading weights") checkpoint = model_dir.get_best_weights() if checkpoint is None: print("Loading most recent checkpoint") checkpoint = model_dir.get_latest_checkpoint() else: print("Loading best weights") saver = tf.train.Saver(vars) saver.restore(sess, checkpoint) if args.ema: ema = tf.train.ExponentialMovingAverage(0) saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()}) saver.restore(sess, checkpoint) sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names])) print("Done, starting evaluation") out = {} for i, batch in enumerate(dataset.get_epoch()): if args.n is not None and i == args.n: break print("On batch size [%d], now in %d th batch" % (args.batch_size, i +1)) enc = model.encode(batch, False) if plotdir != None: spans, start_logits, end_logits = sess.run([best_span, start_logits_op, end_logits_op], feed_dict=enc) for bi, point in enumerate(batch): q = ' '.join(point.question) c = point.paragraph.get_context() gt = ' | '.join(point.answer.answer_text) s, e = spans[bi] pred = point.get_original_text(s, e) start_dist = start_logits[bi] end_dist = end_logits[bi] c_interval = np.arange(0.0, start_dist.shape[0], 1) c_label = c plt.figure(1) plt.subplot(211) plt.plot(c_interval, start_dist, color='r') plt.title("Q : " + q + " // A : " + gt, fontsize=9) plt.text(0, 0, r'Predict : %s [%d:%d]' % (pred, s, e), 
color='b') axes = plt.gca() axes.set_ylim([-20, 20]) plt.subplot(212) plt.plot(c_interval, end_dist, color='g') plt.xticks(c_interval, c_label, rotation=90, fontsize=5) axes = plt.gca() axes.set_ylim([-20, 20]) plt.show() break else: spans = sess.run(best_span, feed_dict=enc) for (s, e), point in zip(spans, batch): out[point.question_id] = point.get_original_text(s, e) sess.close() print("Done, saving") with open(output_path, "w") as f: json.dump(out, f) print("Mission accomplished!")
def getAnswer(self): #parser = argparse.ArgumentParser(description="Run an ELMo model on user input") #parser.add_argument("model", help="Model directory") #parser.add_argument("question", help="Question to answer") #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+') #args = parser.parse_args() #print("Preprocessing...") # Load the model model_dir = ModelDir(MODEL_DIR) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError( "This script is built to work for ParagraphQuestionModel models only" ) conn = pyodbc.connect(DB_CONN) cursor = conn.cursor() #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087) query="select cast(filetext as varchar(max)) as filetext, name, type from dbo.UserworkspaceData where objectmasterid= "+\ str(self.ObjectMasterId)+\ " order by id asc" #query="select cast(filetext as varchar(max)) as filetext from kpl_tmp" documents = [] document = "" name = "" filetype = 0 for doc in cursor.execute(query): document = document + doc[0] name = doc[1] filetype = doc[2] #open("E:/kpl.txt","w+").write(document) documents.append(document) #documents.replace("\n\n","\n") #r.sub("",documents) #documents=" ".join(documents.split()) #open("E:\kpl_test.txt","w+").write(document) #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt" # ============================================================================= # if not isfile(doc): # raise ValueError(doc + " does not exist") # with open(doc, "r") as f: # documents.append(f.read()) # ============================================================================= #print("Loaded %d documents" % len(documents)) #temp=documents[0].split() # Split documents into lists of paragraphs #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)] documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat( self.Question) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] #print(str(len(documents))+" kpl") #kpl # Now select the top paragraphs using a `ParagraphFilter` if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=10) context = selector.prune(question, flatten_iterable(documents)) #print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [ model.preprocessor.encode_text(question, x) for x in context ] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] 
#x=open("E:\context.txt","a+") #[x.write(" ".join(cont)) for cont in context] #x.write("\n.......................................................\n") #print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(self.nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores being the pre-softmax logit for the span #print("Build tf graph") #kpl sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(8) # Loads the saved weights model_dir.restore_checkpoint(sess) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ ParagraphAndQuestion(x, question, None, "user-question%d" % i) for i, x in enumerate(context) ] #print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=True) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run( [best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax( conf ) # We get output for each paragraph, select the most-confident one to print #print("Best Paragraph: " + str(best_para)) #print("Best span: " + str(best_spans[best_para])) #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1])) #print("Confidence: " + str(conf[best_para])) Answer = " ".join(context[best_para] [best_spans[best_para][0]:best_spans[best_para][1] + 1]) print("Confidence: " + str(conf[best_para])) print("Best Paragraph: " + str(best_para)) print("Best span: " + str(best_spans[best_para])) print("Answer text: " + Answer) print(" ".join(context[best_para])) context[best_para][best_spans[best_para][ 0]] = r"<em>" + context[best_para][best_spans[best_para][0]] context[best_para][best_spans[best_para][1]] = context[best_para][ best_spans[best_para][1]] + r"</em>" start = 0 end = len(context[best_para]) positions = [ x for x, n in enumerate(context[best_para] [0:best_spans[best_para][0]]) if n == "." ] if len(positions) >= 2: start = positions[len(positions) - 2] + 1 positions = [ x for x, n in enumerate(context[best_para][best_spans[best_para][1] + 1:]) if n == "." ] if len(positions) > 1: end = best_spans[best_para][1] + 1 + positions[1] d = dict() if conf[best_para] > 10: d["answer"] = Answer else: d["answer"] = "" d["name"] = name d["filetype"] = filetype d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(context[best_para][start:end])) d["ObjectMasterId"] = self.ObjectMasterId return d #if __name__ == "__main__": # main()
def convert_saved_graph(model_dir, output_dir):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()

    # Remove the lm model's word embeddings - the cpu model will use the Char-CNN
    model.lm_model.embed_weights_file = None
    dim = model.embed_mapper.layers[1].n_units

    print("Setting up cudnn version")
    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))

        print("Building graph")
        pred = model.get_prediction()
        test_questions = get_test_questions()

        print("Load vars:")
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars = [x for x in all_vars if x.name not in lm_var_names]
        md.restore_checkpoint(sess, vars)
        sess.run(tf.variables_initializer([x for x in all_vars if x.name in lm_var_names]))

        feed = model.encode([test_questions], False)
        cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

        print("Done, copying files...")
        if not exists(output_dir):
            mkdir(output_dir)
        for file in listdir(model_dir):
            if isfile(file) and file != "model.npy":
                copyfile(join(model_dir, file), join(output_dir, file))

        print("Done, mapping tensors...")
        to_save, to_init = [], []
        for x in tf.trainable_variables():
            if x.name.endswith("/gru_parameters:0"):
                key = x.name[:-len("/gru_parameters:0")]
                indim, outdim = get_dims(x, dim)
                c = cudnn_rnn_ops.CudnnGRUSaveable(x, 1, outdim, indim, scope=key)
                for spec in c.specs:
                    if spec.name.endswith("bias_cudnn 0") or \
                            spec.name.endswith("bias_cudnn 1"):
                        print('Unsupported spec: ' + spec.name)
                        continue
                    if 'forward' in spec.name:
                        new_name = spec.name.replace(
                            'forward/rnn/multi_rnn_cell/cell_0/', 'bidirectional_rnn/fw/')
                    else:
                        new_name = spec.name.replace(
                            'backward/rnn/multi_rnn_cell/cell_0/', 'bidirectional_rnn/bw/')
                    v = tf.Variable(sess.run(spec.tensor), name=new_name)
                    to_init.append(v)
                    to_save.append(v)
            else:
                to_save.append(x)

        save_dir = join(output_dir, "save")
        if not exists(save_dir):
            mkdir(save_dir)

        # save:
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        vars_to_save = [x for x in all_vars if not x.name.startswith("bilm")]
        sess.run(tf.initialize_variables(to_init))
        saver = tf.train.Saver(vars_to_save)
        saver.save(sess, join(save_dir, 'checkpoint'),
                   global_step=123456789, write_meta_graph=False)

    sess.close()
    tf.reset_default_graph()
    return cudnn_out
def main(): parser = argparse.ArgumentParser( description='Evaluate a model on TriviaQA data') parser.add_argument('model', help='model directory') parser.add_argument( '-p', '--paragraph_output', type=str, help="Save fine grained results for each paragraph in csv format") parser.add_argument('-o', '--official_output', type=str, help="Build an official output file with the model's" " most confident span for each (question, doc) pair") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") parser.add_argument( '--n_processes', type=int, default=None, help= "Number of processes to do the preprocessing (selecting paragraphs+loading context) with" ) parser.add_argument('-i', '--step', type=int, default=None, help="checkpoint to load, defaults to latest") parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on") parser.add_argument('-a', '--async', type=int, default=10) parser.add_argument('-t', '--tokens', type=int, default=400, help="Max tokens per paragraph") parser.add_argument('-g', '--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"], help="How to select paragraphs") parser.add_argument( '-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes might be faster but will take more memory" ) parser.add_argument('--max_answer_len', type=int, default=8, help="Max answer span to select") parser.add_argument('-c', '--corpus', choices=[ "web-dev", "web-test", "web-verified-dev", "web-train", "open-dev", "open-train", "wiki-dev", "wiki-test" ], default="web-verified-dev") parser.add_argument("-s", "--source_dir", type=str, default=None, help="where to take input files") parser.add_argument("--n_span_per_q", type=int, default=1, help="Number of answer spans to keep per question") args = parser.parse_args() dataset_name = args.source_dir.split('/')[-1] model_name = args.model.split('/')[-1] ElasticLogger().write_log('INFO', 'Start Evaluation', context_dict={ 'model': model_name, 'dataset': dataset_name }) model_dir = ModelDir(args.model) model = model_dir.get_model() if args.corpus.startswith('web'): dataset = TriviaQaWebDataset() if args.corpus == "web-dev": test_questions = dataset.get_dev() elif args.corpus == "web-test": test_questions = dataset.get_test() elif args.corpus == "web-verified-dev": test_questions = dataset.get_verified() elif args.corpus == "web-train": test_questions = dataset.get_train() else: raise AssertionError() elif args.corpus.startswith("wiki"): dataset = TriviaQaWikiDataset() if args.corpus == "wiki-dev": test_questions = dataset.get_dev() elif args.corpus == "wiki-test": test_questions = dataset.get_test() else: raise AssertionError() else: dataset = TriviaQaOpenDataset(args.source_dir) if args.corpus == "open-dev": # just loading the pkl that was saved in build_span_corpus test_questions = dataset.get_dev() elif args.corpus == "open-train": test_questions = dataset.get_train() else: raise AssertionError() ### ALON debugging #test_questions = test_questions[0:5] corpus = dataset.evidence splitter = MergeParagraphs(args.tokens) per_document = args.corpus.startswith( "web") # wiki and web are both multi-document #per_document = True filter_name = args.filter if filter_name is None: # Pick default depending on the kind of data we are using if per_document: filter_name = "tfidf" else: filter_name = "linear" print("Selecting %d paragraphs using 
method \"%s\" per %s" % (args.n_paragraphs, filter_name, ("question-document pair" if per_document else "question"))) if filter_name == "tfidf": para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs) elif filter_name == "truncate": para_filter = FirstN(args.n_paragraphs) elif filter_name == "linear": para_filter = ShallowOpenWebRanker(args.n_paragraphs) else: raise ValueError() n_questions = args.n_sample docqa.config.SPANS_PER_QUESTION = args.n_span_per_q #n_questions = 1 if n_questions is not None: test_questions.sort(key=lambda x: x.question_id) np.random.RandomState(0).shuffle(test_questions) test_questions = test_questions[:n_questions] print("Building question/paragraph pairs...") # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor if per_document: prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False) else: prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False) prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000) data = [] for q in prepped_data.data: for i, p in enumerate(q.paragraphs): if q.answer_text is None: ans = None else: ans = TokenSpans(q.answer_text, p.answer_spans) data.append( DocumentParagraphQuestion(q.question_id, p.doc_id, (p.start, p.end), q.question, p.text, ans, i)) # Reverse so our first batch will be the largest (so OOMs happen early) questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) print("Done, starting eval") if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() test_questions = ParagraphAndQuestionDataset( questions, FixedOrderBatcher(args.batch_size, True)) evaluation = trainer.test( model, [RecordParagraphSpanPrediction(args.max_answer_len, True)], {args.corpus: test_questions}, ResourceLoader(), checkpoint, not args.no_ema, args. 
async)[args.corpus] if not all(len(x) == len(data) for x in evaluation.per_sample.values()): raise RuntimeError() df = pd.DataFrame(evaluation.per_sample) if args.official_output is not None: print("Saving question result") fns = {} if per_document: # I didn't store the unormalized filenames exactly, so unfortunately we have to reload # the source data to get exact filename to output an official test script print("Loading proper filenames") if args.corpus == 'web-test': source = join(TRIVIA_QA, "qa", "web-test-without-answers.json") elif args.corpus == "web-dev": source = join(TRIVIA_QA, "qa", "web-dev.json") else: raise AssertionError() with open(join(source)) as f: data = json.load(f)["Data"] for point in data: for doc in point["EntityPages"]: filename = doc["Filename"] fn = join("wikipedia", filename[:filename.rfind(".")]) fn = normalize_wiki_filename(fn) fns[(point["QuestionId"], fn)] = filename answers = {} scores = {} for q_id, doc_id, start, end, txt, score in df[[ "question_id", "doc_id", "para_start", "para_end", "text_answer", "predicted_score" ]].itertuples(index=False): filename = dataset.evidence.file_id_map[doc_id] if per_document: if filename.startswith("web"): true_name = filename[4:] + ".txt" else: true_name = fns[(q_id, filename)] # Alon Patch for triviaqa test results true_name = true_name.replace('TriviaQA_Org/', '') key = q_id + "--" + true_name else: key = q_id prev_score = scores.get(key) if prev_score is None or prev_score < score: scores[key] = score answers[key] = txt with open(args.official_output, "w") as f: json.dump(answers, f) output_file = args.paragraph_output if output_file is not None: print("Saving paragraph result") df.to_csv(output_file, index=False) print("Computing scores") if per_document: group_by = ["question_id", "doc_id"] else: group_by = ["question_id"] # Print a table of scores as more paragraphs are used df.sort_values(group_by + ["rank"], inplace=True) df_scores = df.copy(deep=True) df_scores['predicted_score'] = df_scores['predicted_score'].apply( lambda x: pd.Series(x).max()) em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by) f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by) table = [["N Paragraphs", "EM", "F1"]] table += list([str(i + 1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))) table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'], axis=1) ElasticLogger().write_log('INFO', 'Results', context_dict={'model': model_name, 'dataset': dataset_name, \ 'max_EM':table_df.max().ix['EM'], \ 'max_F1':table_df.max().ix['F1'], \ 'result_table': str(table_df)}) df_flat = [] for id, question in df.iterrows(): for text_answer, predicted_span, predicted_score in zip( question['text_answer'], question['predicted_span'], question['predicted_score']): new_question = dict(question.copy()) new_question.update({ 'text_answer': text_answer, 'predicted_span': predicted_span, 'predicted_score': predicted_score }) df_flat.append(new_question) results_df = pd.DataFrame(df_flat) #Alon: outputing the estimates for all the #results_df = results_df.groupby(['question_id', 'text_answer']).apply(lambda df: df.ix[df['predicted_score'].argmax()]).reset_index(drop=True) results_df.sort_values(by=['question_id', 'predicted_score'], ascending=False).set_index([ 'question_id', 'text_answer' ])[['question', 'predicted_score', 'text_em']].to_csv('results.csv') print_table(table)
def main(): parser = argparse.ArgumentParser("Train rejection model on SQuAD") parser.add_argument("--corpus_dir", type=str, default="~/data/document-qa") parser.add_argument("--output_dir", type=str, default="~/model/document-qa/squad") parser.add_argument("--lm_dir", type=str, default="~/data/lm") parser.add_argument("--exp_id", type=str, default="rejection") parser.add_argument("--lr", type=float, default=0.5) parser.add_argument("--epoch", type=int, default=20) parser.add_argument("--dim", type=int, default=100) parser.add_argument("--batch_size", type=int, default=45) parser.add_argument("--l2", type=float, default=0) parser.add_argument("--mode", choices=["input", "output", "both", "none"], default="both") parser.add_argument("--top_layer_only", action="store_true") args = parser.parse_args() print("Arguments : ", args) out = args.output_dir + "_" + args.exp_id + "_lr" + str( args.lr) + "-" + datetime.now().strftime("%m%d-%H%M%S") dim = args.dim batch_size = args.batch_size out = expanduser(out) lm_dir = expanduser(args.lm_dir) corpus_dir = expanduser(args.corpus_dir) print("Make global recurrent_layer...") recurrent_layer = CudnnGru( dim, w_init=tf.keras.initializers.TruncatedNormal(stddev=0.05)) params = trainer.TrainParams(trainer.SerializableOptimizer( "Adadelta", dict(learning_rate=args.lr)), ema=0.999, max_checkpoints_to_keep=2, async_encoding=10, num_epochs=args.epoch, log_period=30, eval_period=1200, save_period=1200, best_weights=("dev", "b17/text-f1"), eval_samples=dict(dev=None, train=8000)) lm_reduce = MapperSeq( ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only), DropoutLayer(0.5), ) model = AttentionWithElmo( encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()), lm_model=SquadContextConcatSkip(lm_dir=lm_dir), append_before_atten=(args.mode == "both" or args.mode == "output"), append_embed=(args.mode == "both" or args.mode == "input"), max_batch_size=128, word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0, learn_unk=False, cpu=True), char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20, init_scale=0.05, force_cpu=True), MaxPool(Conv1d(100, 5, 0.8)), shared_parameters=True), embed_mapper=SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), ), lm_reduce=None, lm_reduce_shared=lm_reduce, per_sentence=False, memory_builder=NullBiMapper(), attention=BiAttention(TriLinear(bias=True), True), match_encoder=SequenceMapperSeq( FullyConnected(dim * 2, activation="relu"), ResidualLayer( SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()), FullyConnected(dim * 2, activation="relu"), )), VariationalDropoutLayer(0.8)), predictor=BoundsPredictor( ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer))) batcher = ClusteredBatcher(batch_size, ContextLenKey(), False, False) data = DocumentQaTrainingData(SquadCorpus(corpus_dir), None, batcher, batcher) with open(__file__, "r") as f: notes = f.read() notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes trainer.start_training( data, model, params, [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
def perform_evaluation(model_name: str,
                       dataset_names: List[str],
                       tokens_per_paragraph: int,
                       filter_type: str,
                       n_processes: int,
                       n_paragraphs: int,
                       batch_size: int,
                       checkpoint: str,
                       no_ema: bool,
                       max_answer_len: int,
                       official_output_path: str,
                       paragraph_output_path: str,
                       aggregated_output_path: str,
                       elmo_char_cnn: bool,
                       n_samples: Union[int, None],
                       per_document: bool = False):
    """Perform an evaluation using cape's answer decoder

    A file will be created listing the answers per question ID for each dataset.

    :param model_name: path to the model to evaluate
    :param dataset_names: list of strings of datasets to evaluate
    :param tokens_per_paragraph: how big to make paragraph chunks
    :param filter_type: how to select the paragraphs to read
    :param n_processes: how many processes to use when multiprocessing
    :param n_paragraphs: how many paragraphs to read per question
    :param batch_size: how many datapoints to evaluate at once
    :param checkpoint: string, checkpoint to load
    :param no_ema: if true, don't use EMA weights
    :param max_answer_len: the maximum allowable length of an answer in tokens
    :param official_output_path: path to write official output to
    :param paragraph_output_path: path to write paragraph output to
    :param aggregated_output_path: path to write aggregated output to
    :param elmo_char_cnn: if true, uses the elmo CNN to make token embeddings,
        less OOV but requires much more memory
    :param n_samples: optional limit on how many questions to evaluate (None evaluates all)
    :param per_document: if false, return the best scoring answer to a question; if true,
        the best scoring answer from each document is used instead
    """
    # `async` is a reserved word in Python 3.7+, so use a different local name
    async_eval = True
    corpus_name = 'all'
    print('Setting Up:')
    model_dir = ModelDir(model_name)
    model = model_dir.get_model()
    dataset = get_multidataset(dataset_names)
    splitter = MergeParagraphs(tokens_per_paragraph)
    para_filter = get_para_filter(filter_type, per_document, n_paragraphs)
    test_questions, n_questions = get_questions(per_document, dataset, splitter,
                                                para_filter, model.preprocessor,
                                                n_processes, batch_size)

    print("Starting eval")
    checkpoint = get_checkpoint(checkpoint, model_dir)
    evaluation = test(model,
                      [RecordParagraphSpanPrediction(max_answer_len, True)],
                      {corpus_name: test_questions}, ResourceLoader(), checkpoint,
                      not no_ema, async_eval, n_samples, elmo_char_cnn)[corpus_name]

    print('Exporting and Post-processing')
    if not all(len(x) == n_questions for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)
    compute_and_dump_official_output(df, official_output_path, per_document=per_document)

    print("Saving paragraph result")
    df.to_csv(paragraph_output_path, index=False)

    print("Computing scores")
    agg_df = get_aggregated_df(df, per_document)
    agg_df.to_csv(aggregated_output_path, index=False)
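# An illustrative call to `perform_evaluation`; every path and setting below is
# an example value chosen for the sketch, not something fixed by the function
# itself.
def example_evaluation():
    perform_evaluation(
        model_name="models/triviaqa-unfiltered-shared-norm",
        dataset_names=["squad", "triviaqa"],
        tokens_per_paragraph=400,
        filter_type="tfidf",
        n_processes=4,
        n_paragraphs=15,
        batch_size=64,
        checkpoint="latest",
        no_ema=False,
        max_answer_len=17,
        official_output_path="official_output.json",
        paragraph_output_path="paragraph_output.csv",
        aggregated_output_path="aggregated_output.csv",
        elmo_char_cnn=True,
        n_samples=None,
        per_document=False)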
def convert(model_dir, output_dir, best_weights=False): print("Load model") md = ModelDir(model_dir) model = md.get_model() dim = model.embed_mapper.layers[1].n_units global_step = tf.get_variable('global_step', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) print("Setting up cudnn version") #global_step = tf.get_variable('global_step', shape=[], dtype='int32', trainable=False) sess = tf.Session() sess.run(global_step.assign(0)) with sess.as_default(): model.set_input_spec( ParagraphAndQuestionSpec(1, None, None, 14), {"the"}, ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)})) print("Buiding graph") pred = model.get_prediction() test_questions = ParagraphAndQuestion( ["Harry", "Potter", "was", "written", "by", "JK"], ["Who", "wrote", "Harry", "Potter", "?"], None, "test_questions") print("Load vars") md.restore_checkpoint(sess) print("Restore finished") feed = model.encode([test_questions], False) cuddn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed) print("Done, copying files...") if not exists(output_dir): mkdir(output_dir) for file in listdir(model_dir): if isfile(file) and file != "model.npy": copyfile(join(model_dir, file), join(output_dir, file)) print("Done, mapping tensors...") to_save = [] to_init = [] for x in tf.trainable_variables(): if x.name.endswith("/gru_parameters:0"): key = x.name[:-len("/gru_parameters:0")] fw_params = x if "map_embed" in x.name: c = cudnn_rnn_ops.CudnnGRU(1, dim, 400) elif "chained-out" in x.name: c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 4) else: c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 2) params_saveable = cudnn_rnn_ops.RNNParamsSaveable( c, c.params_to_canonical, c.canonical_to_params, [fw_params], key) for spec in params_saveable.specs: if spec.name.endswith("bias_cudnn 0") or \ spec.name.endswith("bias_cudnn 1"): # ??? What do these even do? 
continue name = spec.name.split("/") name.remove("cell_0") if "forward" in name: ix = name.index("forward") name.insert(ix + 2, "fw") else: ix = name.index("backward") name.insert(ix + 2, "bw") del name[ix] ix = name.index("multi_rnn_cell") name[ix] = "bidirectional_rnn" name = "/".join(name) v = tf.Variable(sess.run(spec.tensor), name=name) to_init.append(v) to_save.append(v) else: to_save.append(x) other = [ x for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if x not in tf.trainable_variables() ] print(other) sess.run(tf.initialize_variables(to_init)) saver = tf.train.Saver(to_save + other) save_dir = join(output_dir, "save") if not exists(save_dir): mkdir(save_dir) saver.save(sess, join(save_dir, "checkpoint"), sess.run(global_step)) sess.close() tf.reset_default_graph() print("Updating model...") model.embed_mapper.layers = [ model.embed_mapper.layers[0], BiRecurrentMapper(CompatGruCellSpec(dim)) ] model.match_encoder.layers = list(model.match_encoder.layers) other = model.match_encoder.layers[1].other other.layers = list(other.layers) other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim)) pred = model.predictor.predictor pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim)) pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim)) with open(join(output_dir, "model.pkl"), "wb") as f: pickle.dump(model, f) print("Testing...") with open(join(output_dir, "model.pkl"), "rb") as f: model = pickle.load(f) sess = tf.Session() model.set_input_spec( ParagraphAndQuestionSpec(1, None, None, 14), {"the"}, ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)})) pred = model.get_prediction() print("Rebuilding") saver = tf.train.Saver() saver.restore(sess, tf.train.latest_checkpoint(save_dir)) feed = model.encode([test_questions], False) cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed) print("These should be close:") print([np.allclose(a, b) for a, b in zip(cpu_out, cuddn_out)]) print(cpu_out) print(cuddn_out)
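If the surrounding script exposes `convert` directly, the cuDNN-to-CPU conversion above could be driven by a small command-line wrapper like this sketch; the argument names are placeholders and only `convert(model_dir, output_dir)` comes from the code above.

# Hypothetical driver for the conversion above -- argument names are placeholders
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Convert a cuDNN GRU model to a CPU-compatible one")
    parser.add_argument("model_dir", help="directory of the cuDNN-trained model")
    parser.add_argument("output_dir", help="where to write the converted model")
    args = parser.parse_args()
    convert(args.model_dir, args.output_dir)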
def main(): parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD') parser.add_argument('model', help='model directory to evaluate') parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file") parser.add_argument('-n', '--sample_questions', type=int, default=None, help="(for testing) run on a subset of questions") parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17], help="Max size of answer") parser.add_argument('-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes can be faster but uses more memory") parser.add_argument('-s', '--step', default=None, help="Weights to load, can be a checkpoint step or 'latest'") # Add ja_test choice to test Multilingual QA dataset. parser.add_argument( '-c', '--corpus', choices=["dev", "train", "ja_test", "pred"], default="dev") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") # Add ja_test choice to test Multilingual QA pipeline. parser.add_argument('-p', '--pred_filepath', default=None, help="The csv file path if you try pred mode") args = parser.parse_args() model_dir = ModelDir(args.model) corpus = SquadCorpus() if args.corpus == "dev": questions = corpus.get_dev() # Add ja_test choice to test Multilingual QA pipeline. elif args.corpus == "ja_test": questions = corpus.get_ja_test() # This is for prediction mode for MLQA pipeline. elif args.corpus == "pred": questions = create_pred_dataset(args.pred_filepath) else: questions = corpus.get_train() questions = split_docs(questions) if args.sample_questions: np.random.RandomState(0).shuffle( sorted(questions, key=lambda x: x.question_id)) questions = questions[:args.sample_questions] questions.sort(key=lambda x: x.n_context_words, reverse=True) dataset = ParagraphAndQuestionDataset( questions, FixedOrderBatcher(args.batch_size, True)) evaluators = [SpanEvaluator(args.answer_bounds, text_eval="squad")] if args.official_output is not None: evaluators.append(RecordSpanPrediction(args.answer_bounds[0])) if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() model = model_dir.get_model() evaluation = trainer.test(model, evaluators, {args.corpus: dataset}, corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus] # Print the scalar results in a two column table scalars = evaluation.scalars cols = list(sorted(scalars.keys())) table = [cols] header = ["Metric", ""] table.append([("%s" % scalars[x] if x in scalars else "-") for x in cols]) print_table([header] + transpose_lists(table)) # Save the official output if args.official_output is not None: quid_to_para = {} for x in questions: quid_to_para[x.question_id] = x.paragraph q_id_to_answers = {} q_ids = evaluation.per_sample["question_id"] spans = evaluation.per_sample["predicted_span"] for q_id, (start, end) in zip(q_ids, spans): text = quid_to_para[q_id].get_original_text(start, end) q_id_to_answers[q_id] = text with open(args.official_output, "w") as f: json.dump(q_id_to_answers, f)
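The official-output file written above is a flat JSON object mapping each question ID to its predicted answer text, so it can be inspected with a few lines of Python; the file name below is simply whatever was passed to --official_output.

import json

# Load the file written by --official_output; it maps question_id -> answer text
with open("official_output.json") as f:  # placeholder file name
    answers = json.load(f)
for q_id, text in list(answers.items())[:5]:
    print(q_id, "->", text)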
def main(): parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data') parser.add_argument('model', help='model directory') parser.add_argument('-p', '--paragraph_output', type=str, help="Save fine grained results for each paragraph in csv format") parser.add_argument('-o', '--official_output', type=str, help="Build an offical output file with the model's" " most confident span for each (question, doc) pair") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") parser.add_argument('--n_processes', type=int, default=None, help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with") parser.add_argument('-i', '--step', type=int, default=None, help="checkpoint to load, default to latest") parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on") parser.add_argument('-a', '--async', type=int, default=10) parser.add_argument('-t', '--tokens', type=int, default=400, help="Max tokens per a paragraph") parser.add_argument('-g', '--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"], help="How to select paragraphs") parser.add_argument('-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes might be faster but wll take more memory") parser.add_argument('--max_answer_len', type=int, default=8, help="Max answer span to select") parser.add_argument('-c', '--corpus', choices=["web-dev", "web-test", "web-verified-dev", "web-train", "open-dev", "open-train"], default="web-verified-dev") args = parser.parse_args() model_dir = ModelDir(args.model) model = model_dir.get_model() if args.corpus.startswith('web'): dataset = TriviaQaWebDataset() corpus = dataset.evidence if args.corpus == "web-dev": test_questions = dataset.get_dev() elif args.corpus == "web-test": test_questions = dataset.get_test() elif args.corpus == "web-verified-dev": test_questions = dataset.get_verified() elif args.corpus == "web-train": test_questions = dataset.get_train() else: raise RuntimeError() else: dataset = TriviaQaOpenDataset() corpus = dataset.evidence if args.corpus == "open-dev": test_questions = dataset.get_dev() elif args.corpus == "open-train": test_questions = dataset.get_train() else: raise RuntimeError() splitter = MergeParagraphs(args.tokens) per_document = not args.corpus.startswith("open") filter_name = args.filter if filter_name is None: if args.corpus.startswith("open"): filter_name = "linear" else: filter_name = "tfidf" print("Selecting %d paragraphs using %s method per %s" % (args.n_paragraphs, filter_name, ("question-document pair" if per_document else "question"))) if filter_name == "tfidf": para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs) elif filter_name == "truncate": para_filter = FirstN(args.n_paragraphs) elif filter_name == "linear": para_filter = ShallowOpenWebRanker(args.n_paragraphs) else: raise ValueError() n_questions = args.n_sample if n_questions is not None: test_questions.sort(key=lambda x:x.question_id) np.random.RandomState(0).shuffle(test_questions) test_questions = test_questions[:n_questions] print("Building question/paragraph pairs...") # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor if per_document: prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False) 
else: prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False) prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000) data = [] for q in prepped_data.data: for i, p in enumerate(q.paragraphs): if q.answer_text is None: ans = None else: ans = TokenSpans(q.answer_text, p.answer_spans) data.append(DocumentParagraphQuestion(q.question_id, p.doc_id, (p.start, p.end), q.question, p.text, ans, i)) # Reverse so our first batch will be the largest (so OOMs happen early) questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) print("Done, starting eval") if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() test_questions = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True)) evaluation = trainer.test(model, [RecordParagraphSpanPrediction(args.max_answer_len, True)], {args.corpus:test_questions}, ResourceLoader(), checkpoint, not args.no_ema, args.async)[args.corpus] if not all(len(x) == len(data) for x in evaluation.per_sample.values()): raise RuntimeError() df = pd.DataFrame(evaluation.per_sample) if args.official_output is not None: print("Saving question result") # I didn't store the unormalized filenames exactly, so unfortunately we have to reload # the source data to get exact filename to output an official test script fns = {} print("Loading proper filenames") if args.corpus == 'web-test': source = join(TRIVIA_QA, "qa", "web-test-without-answers.json") elif args.corpus == "web-dev": source = join(TRIVIA_QA, "qa", "web-dev.json") else: raise NotImplementedError() with open(join(source)) as f: data = json.load(f)["Data"] for point in data: for doc in point["EntityPages"]: filename = doc["Filename"] fn = join("wikipedia", filename[:filename.rfind(".")]) fn = normalize_wiki_filename(fn) fns[(point["QuestionId"], fn)] = filename answers = {} scores = {} for q_id, doc_id, start, end, txt, score in df[["question_id", "doc_id", "para_start", "para_end", "text_answer", "predicted_score"]].itertuples(index=False): filename = dataset.evidence.file_id_map[doc_id] if filename.startswith("web"): true_name = filename[4:] + ".txt" else: true_name = fns[(q_id, filename)] key = q_id + "--" + true_name prev_score = scores.get(key) if prev_score is None or prev_score < score: scores[key] = score answers[key] = txt with open(args.official_output, "w") as f: json.dump(answers, f) if per_document: group_by = ["question_id", "doc_id"] else: group_by = ["question_id"] # Print a table of scores as more paragraphs are used df.sort_values(group_by + ["rank"], inplace=True) f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by) em = compute_model_scores(df, "predicted_score", "text_em", group_by) table = [["N Paragraphs", "EM", "F1"]] table += list([str(i+1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))) print_table(table) output_file = args.paragraph_output if output_file is not None: print("Saving paragraph result") if output_file.endswith("json"): with open(output_file, "w") as f: json.dump(evaluation.per_sample, f) elif output_file.endswith("pkl"): with open(output_file, "wb") as f: pickle.dump(evaluation.per_sample, f) elif output_file.endswith("csv"): 
df.to_csv(output_file, index=False) else: raise ValueError("Unrecognized file format")
import warnings
warnings.filterwarnings("ignore")

import os
import re

import tensorflow as tf

from docqa.data_processing.document_splitter import MergeParagraphs, TopTfIdf, ShallowOpenWebRanker, PreserveParagraphs
from docqa.data_processing.qa_training_data import ParagraphAndQuestion, ParagraphAndQuestionSpec
from docqa.data_processing.text_utils import NltkAndPunctTokenizer, NltkPlusStopWords
from docqa.doc_qa_models import ParagraphQuestionModel
from docqa.model_dir import ModelDir
from docqa.utils import flatten_iterable

# Load the model
model_dir = ModelDir(os.path.join('.', 'qa-model'))
model = model_dir.get_model()

print('Generating Vocab...', end='')
# Preload all the words
vocab = set()
with open(os.path.join('.', 'qa-model', 'vocab.txt'), encoding='utf-8') as vocfile:
    for line in vocfile:
        word = line.strip()
        if len(word) > 0:
            vocab.add(word)
print('done')
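The preloaded vocab is typically handed to the model when fixing its input spec. The sketch below assumes this snippet continues like the other ParagraphQuestionModel scripts in this document; it is not part of the original snippet.

# Sketch of how the preloaded vocab is typically consumed, following the pattern
# used by the other ParagraphQuestionModel scripts in this document.
# batch_size=None leaves the batch dimension dynamic.
if not isinstance(model, ParagraphQuestionModel):
    raise ValueError("Expected a ParagraphQuestionModel")
model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
with sess.as_default():
    # Limit predicted answer spans to 8 tokens or fewer
    best_spans, conf = model.get_prediction().get_best_span(8)
    model_dir.restore_checkpoint(sess)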
def main(): parser = argparse.ArgumentParser( description='Evaluate a model on document-level SQuAD') parser.add_argument('model', help='model to use') parser.add_argument( 'output', type=str, help="Store the per-paragraph results in csv format in this file") parser.add_argument('-n', '--n_sample', type=int, default=None, help="(for testing) sample documents") parser.add_argument( '-s', '--async', type=int, default=10, help="Encoding batch asynchronously, queueing up to this many") parser.add_argument('-a', '--answer_bound', type=int, default=17, help="Max answer span length") parser.add_argument('-p', '--n_paragraphs', type=int, default=None, help="Max number of paragraphs to use") parser.add_argument( '-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes can be faster but uses more memory") parser.add_argument('-c', '--corpus', choices=["dev", "train", "doc-rd-dev"], default="dev") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") args = parser.parse_args() model_dir = ModelDir(args.model) print("Loading data") questions = [] ranker = SquadTfIdfRanker(NltkPlusStopWords(True), args.n_paragraphs, force_answer=False) if args.corpus == "doc-rd-dev": docs = SquadCorpus().get_dev() if args.n_sample is not None: docs.sort(key=lambda x: x.doc_id) np.random.RandomState(0).shuffle(docs) docs = docs[:args.n_sample] print("Fetching document reader docs...") doc_rd_versions = get_doc_rd_doc(docs) print("Ranking and matching with questions...") for doc in tqdm(docs): doc_questions = flatten_iterable(x.questions for x in doc.paragraphs) paragraphs = doc_rd_versions[doc.title] ranks = ranker.rank([x.words for x in doc_questions], [x.text for x in paragraphs]) for i, question in enumerate(doc_questions): para_ranks = np.argsort(ranks[i]) for para_rank, para_num in enumerate( para_ranks[:args.n_paragraphs]): # Just use dummy answers spans for these pairs questions.append( RankedParagraphQuestion( question.words, TokenSpans(question.answer.answer_text, np.zeros((0, 2), dtype=np.int32)), question.question_id, paragraphs[para_num], para_rank, para_num)) rl = ResourceLoader() else: if args.corpus == "dev": docs = SquadCorpus().get_dev() else: docs = SquadCorpus().get_train() rl = SquadCorpus().get_resource_loader() if args.n_sample is not None: docs.sort(key=lambda x: x.doc_id) np.random.RandomState(0).shuffle(docs) docs = docs[:args.n_sample] for q in ranker.ranked_questions(docs): for i, p in enumerate(q.paragraphs): questions.append( RankedParagraphQuestion( q.question, TokenSpans(q.answer_text, p.answer_spans), q.question_id, ParagraphWithInverse([p.text], p.original_text, p.spans), i, p.paragraph_num)) print("Split %d docs into %d paragraphs" % (len(docs), len(questions))) questions = sorted(questions, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) for q in questions: if len(q.answer.answer_spans.shape) != 2: raise ValueError() checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() if checkpoint is None: raise ValueError("No checkpoints found") data = ParagraphAndQuestionDataset( questions, FixedOrderBatcher(args.batch_size, True)) model = model_dir.get_model() evaluation = trainer.test( model, [RecordParagraphSpanPrediction(args.answer_bound, True)], {args.corpus: data}, rl, checkpoint, not args.no_ema, args. 
async)[args.corpus] print("Saving result") output_file = args.output df = pd.DataFrame(evaluation.per_sample) df.sort_values(["question_id", "rank"], inplace=True, ascending=True) group_by = ["question_id"] f1 = compute_ranked_scores(df, "predicted_score", "text_f1", group_by) em = compute_ranked_scores(df, "predicted_score", "text_em", group_by) table = [["N Paragraphs", "EM", "F1"]] table += list([str(i + 1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))) print_table(table) df.to_csv(output_file, index=False)
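The EM/F1-by-paragraph-count tables printed by the evaluation scripts above come from compute_model_scores / compute_ranked_scores in docqa. The actual implementation lives in that library and may differ; the function below is only a rough, illustrative sketch of the aggregation the table reports, using the DataFrame columns seen above and assuming the "rank" column is 0-based.

import numpy as np
import pandas as pd

def ranked_scores_sketch(df, score_col="predicted_score", target_col="text_f1",
                         group_cols=("question_id",), max_rank=None):
    """Illustrative only: for each question, after reading the top-k paragraphs
    (by rank), keep the answer from the paragraph with the highest model score
    seen so far, then average that answer's metric over questions."""
    results = []
    max_rank = max_rank or int(df["rank"].max()) + 1
    for k in range(1, max_rank + 1):
        chosen = []
        for _, group in df[df["rank"] < k].groupby(list(group_cols)):
            best = group.loc[group[score_col].idxmax()]
            chosen.append(best[target_col])
        results.append(np.mean(chosen))
    return results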
def main(): parser = argparse.ArgumentParser(description='Run the demo server') parser.add_argument( 'model', default= "/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/models/triviaqa-unfiltered-shared-norm/best-weights", help='Models to use') parser.add_argument( '-v', '--voc', default= "/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/data/triviaqa/evidence/vocab.txt", help='vocab to use, only words from this file will be used') parser.add_argument('-t', '--tokens', type=int, default=400, help='Number of tokens to use per paragraph') parser.add_argument('--vec_dir', default="/home/antriv/data/glove", help='Location to find word vectors') parser.add_argument('--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('--paragraphs_to_return', type=int, default=10, help="Number of paragraphs return to the frontend") parser.add_argument('--span_bound', type=int, default=8, help="Max span size to return as an answer") parser.add_argument( '--tagme_api_key', default="1cdc0067-b2de-4774-afbe-38703b11a365-843339462", help="Key to use for TAGME (tagme.d4science.org/tagme)") parser.add_argument('--bing_api_key', default="413239df9faa4f1494a914e0c9cec78e", help="Key to use for bing searches") parser.add_argument( '--bing_version', choices=["v5.0", "v7.0"], default="v7.0", help='Version of Bing API to use (must be compatible with the API key)' ) parser.add_argument( '--tagme_thresh', default=0.2, type=float, help="TAGME threshold for when to use the identified docs") parser.add_argument('--n_web', type=int, default=10, help='Number of web docs to fetch') parser.add_argument('--blacklist_trivia_sites', action="store_true", help="Don't use trivia websites") parser.add_argument( '-c', '--wiki_cache', default= "/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/data/triviaqa/evidence/wikipedia", help="Cache wiki articles in this directory") parser.add_argument('--n_dl_threads', type=int, default=5, help="Number of threads to download documents with") parser.add_argument('--request_timeout', type=int, default=60) parser.add_argument('--download_timeout', type=int, default=25, help="how long to wait before timing out downloads") parser.add_argument('--workers', type=int, default=1, help="Number of server workers") parser.add_argument('--debug', default=None, choices=["random_model", "dummy_qa"]) args = parser.parse_args() span_bound = args.span_bound n_to_return = args.paragraphs_to_return if args.tagme_api_key is not None: tagme_api_key = args.tagme_api_key else: tagme_api_key = environ.get("TAGME_API_KEY") if args.bing_api_key is not None: bing_api_key = args.bing_api_key else: bing_api_key = environ.get("BING_API_KEY") if bing_api_key is None and args.n_web > 0: raise ValueError("If n_web > 0 you must give a BING_API_KEY") if args.debug is None: model = ModelDir(args.model) else: model = RandomPredictor(5, WithIndicators()) if args.vec_dir is not None: loader = LoadFromPath(args.vec_dir) else: loader = ResourceLoader() # Update Sanic's logging to register our class's loggers log_config = LOGGING formatter = "%(asctime)s: %(levelname)s: %(message)s" log_config["formatters"]['my_formatter'] = { 'format': formatter, 'datefmt': '%Y-%m-%d %H:%M:%S', } log_config['handlers']['stream_handler'] = { 'class': "logging.StreamHandler", 'formatter': 'my_formatter', 'stream': sys.stderr } log_config['handlers']['file_handler'] = { 'class': "logging.FileHandler", 'formatter': 'my_formatter', 'filename': 'logging.log' } # It looks like 
we have to go and name every logger our own code might # use in order to register it with Sanic log_config["loggers"]['qa_system'] = { 'level': 'INFO', 'handlers': ['stream_handler', 'file_handler'], } log_config["loggers"]['downloader'] = { 'level': 'INFO', 'handlers': ['stream_handler', 'file_handler'], } log_config["loggers"]['server'] = { 'level': 'INFO', 'handlers': ['stream_handler', 'file_handler'], } app = Sanic() app.config.REQUEST_TIMEOUT = args.request_timeout log = logging.getLogger('server') @app.listener('before_server_start') async def setup_qa(app, loop): # To play nice with iohttp's async ClientSession objects, we need to construct the QaSystem # inside the event loop. if args.debug == "dummy_qa": qa = DummyQa() else: qa = QaSystem( args.wiki_cache, MergeParagraphs(args.tokens), ShallowOpenWebRanker(args.n_paragraphs), args.voc, model, loader, bing_api_key, bing_version=args.bing_version, tagme_api_key=tagme_api_key, n_dl_threads=args.n_dl_threads, blacklist_trivia_sites=args.blacklist_trivia_sites, download_timeout=args.download_timeout, span_bound=span_bound, tagme_threshold=None if (tagme_api_key is None) else args.tagme_thresh, n_web_docs=args.n_web, ) app.qa = qa @app.listener('after_server_stop') async def setup_qa(app, loop): app.qa.close() @app.route("/answer") async def answer(request): try: question = request.args["question"][0] if question == "": return response.json({'message': 'No question given'}, status=400) spans, paras = await app.qa.answer_question(question) answers = select_answers(paras, spans, span_bound, 10) answers = answers[:n_to_return] best_span = max(answers[0].answers, key=lambda x: x.conf) log.info("Answered \"%s\" (with web search): \"%s\"", question, answers[0].original_text[best_span.start:best_span.end]) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError(e, status_code=500) @app.route('/answer-from', methods=['POST']) async def answer_from(request): try: args = ujson.loads(request.body.decode("utf-8")) question = args.get("question") if question is None or question == "": return response.json({'message': 'No question given'}, status=400) doc = args["document"] if len(doc) > 500000: raise ServerError("Document too large", status_code=400) spans, paras = app.qa.answer_with_doc(question, doc) answers = select_answers(paras, spans, span_bound, 10) answers = answers[:n_to_return] best_span = max(answers[0].answers, key=lambda x: x.conf) log.info("Answered \"%s\" (with user doc): \"%s\"", question, answers[0].original_text[best_span.start:best_span.end]) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError(e, status_code=500) app.static('/', './docqa//server/static/index.html') app.static('/about.html', './docqa/server/static/about.html') app.run(host="0.0.0.0", port=5000, workers=args.workers, debug=False, log_config=LOGGING)
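For manual testing, the two endpoints exposed above can be exercised with a small client like the following sketch; it assumes the server is running locally on port 5000, as configured in app.run above.

import requests

BASE = "http://localhost:5000"  # host/port from app.run above

# Web-search-backed answering: /answer takes the question as a query parameter
r = requests.get(BASE + "/answer", params={"question": "Who wrote Harry Potter?"})
print(r.status_code, r.json())

# User-supplied document answering: /answer-from takes a JSON body
payload = {"question": "Who wrote Harry Potter?",
           "document": "Harry Potter was written by J.K. Rowling."}
r = requests.post(BASE + "/answer-from", json=payload)
print(r.status_code, r.json())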
def main(): parser = argparse.ArgumentParser( description="Run an ELMo model on user input") parser.add_argument("model", help="Model directory") parser.add_argument("question", help="Question to answer") parser.add_argument("context", help="Context to answer the question with") args = parser.parse_args() # Tokenize the input, the models expected data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(args.question) context = tokenizer.tokenize_paragraph_flat(args.context) print("Loading model") model_dir = ModelDir(args.model) model = model_dir.get_model() if not isinstance(model, ElmoQaModel): raise ValueError( "This script is build to work for ElmoQaModel models only") # Important! This tells the language model not to use the pre-computed word vectors, # which are only applicable for the SQuAD dev/train sets. # Instead the language model will use its character-level CNN to compute # the word vectors dynamically. model.lm_model.embed_weights_file = None # Tell the model the batch size and vocab to expect, This will load the needed # word vectors and fix the batch size when building the graph / encoding the input print("Setting up model") voc = set(question) voc.update(context) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with sess.as_default(): # 17 means to limit the span to size 17 or less best_spans, conf = model.get_prediction().get_best_span(17) # Now restore the weights, this is a bit fiddly since we need to avoid restoring the # bilm weights, and instead load them from the pre-computed data all_vars = tf.global_variables() + tf.get_collection( tf.GraphKeys.SAVEABLE_OBJECTS) lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")} vars = [x for x in all_vars if x.name not in lm_var_names] model_dir.restore_checkpoint(sess, vars) # Run the initializer of the lm weights, which will load them from the lm directory sess.run( tf.variables_initializer( [x for x in all_vars if x.name in lm_var_names])) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ParagraphAndQuestion(context, question, None, "user-question1")] print("Starting run") # The model is run in two steps, first it "encodes" the paragraph/context pairs # into numpy arrays, then to use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions print("Best span: " + str(best_spans[0])) print("Answer text: " + " ".join(context[best_spans[0][0]:best_spans[0][1] + 1])) print("Confidence: " + str(conf[0]))
def predict():
    json_data = {"success": False, "predictions": []}
    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))
    print("Selected %d paragraphs" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer.
    # The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(
        data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions
    # We get output for each paragraph, select the most-confident one to print
    best_para = np.argmax(conf)

    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " +
          " ".join(context[best_para]
                   [best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))

    y_output = " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])
    print(y_output)
    json_data["predictions"].append(str(y_output))

    # Indicate that the request was a success
    json_data["success"] = True
    # Return the data dictionary as a JSON response
    return flask.jsonify(json_data)
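A question can be sent to this handler as a plain-text POST body. The client sketch below assumes the function is registered as a /predict route on a local Flask server at port 5000; the route and port are assumptions, while the raw-text request body and the JSON response shape come from the code above.

import requests

# Hypothetical client; /predict and port 5000 are assumptions
resp = requests.post("http://localhost:5000/predict",
                     data="Who wrote Harry Potter?".encode("utf-8"))
result = resp.json()
if result["success"]:
    print(result["predictions"])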
def main(): parser = argparse.ArgumentParser("Train our ELMo model on SQuAD") parser.add_argument("loss_mode", choices=['default', 'confidence']) parser.add_argument("output_dir") parser.add_argument("--dim", type=int, default=90) parser.add_argument("--l2", type=float, default=0) parser.add_argument("--mode", choices=["input", "output", "both", "none"], default="both") parser.add_argument("--top_layer_only", action="store_true") parser.add_argument("--no-tfidf", action='store_true', help="Don't add TF-IDF negative examples") args = parser.parse_args() out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S") dim = args.dim recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05)) if args.loss_mode == 'default': n_epochs = 24 answer_encoder = SingleSpanAnswerEncoder() predictor = BoundsPredictor( ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer)) batcher = ClusteredBatcher(45, ContextLenKey(), False, False) data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher) elif args.loss_mode == 'confidence': if args.no_tfidf: prepro = SquadDefault() n_epochs = 15 else: prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True) n_epochs = 50 answer_encoder = DenseMultiSpanAnswerEncoder() predictor = ConfidencePredictor(ChainBiMapper( first_layer=recurrent_layer, second_layer=recurrent_layer, ), AttentionEncoder(), FullyConnected(80, activation="tanh"), aggregate="sum") eval_dataset = RandomParagraphSetDatasetBuilder( 100, 'flatten', True, 0) train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False) data = PreprocessedData(SquadCorpus(), prepro, StratifyParagraphsBuilder(train_batching, 1), eval_dataset, eval_on_verified=False) data.preprocess(1) params = trainer.TrainParams(trainer.SerializableOptimizer( "Adadelta", dict(learning_rate=1.0)), ema=0.999, max_checkpoints_to_keep=2, async_encoding=10, num_epochs=n_epochs, log_period=30, eval_period=1200, save_period=1200, best_weights=("dev", "b17/text-f1"), eval_samples=dict(dev=None, train=8000)) lm_reduce = MapperSeq( ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only), DropoutLayer(0.5), ) model = AttentionWithElmo( encoder=DocumentAndQuestionEncoder(answer_encoder), lm_model=SquadContextConcatSkip(), append_before_atten=(args.mode == "both" or args.mode == "output"), append_embed=(args.mode == "both" or args.mode == "input"), max_batch_size=128, word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0, learn_unk=False, cpu=True), char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20, init_scale=0.05, force_cpu=True), MaxPool(Conv1d(100, 5, 0.8)), shared_parameters=True), embed_mapper=SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), ), lm_reduce=None, lm_reduce_shared=lm_reduce, per_sentence=False, memory_builder=NullBiMapper(), attention=BiAttention(TriLinear(bias=True), True), match_encoder=SequenceMapperSeq( FullyConnected(dim * 2, activation="relu"), ResidualLayer( SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()), FullyConnected(dim * 2, activation="relu"), )), VariationalDropoutLayer(0.8)), predictor=predictor) with open(__file__, "r") as f: notes = f.read() notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes trainer.start_training( data, model, params, [LossEvaluator(), SpanEvaluator(bound=[17], 
text_eval="squad")], ModelDir(out), notes)
def main(): parser = argparse.ArgumentParser( description='Train a model on document-level SQuAD') parser.add_argument( 'mode', choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"]) parser.add_argument("name", help="Output directory") args = parser.parse_args() mode = args.mode out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S") corpus = SquadCorpus() if mode == "merge": # Adds paragraph start tokens, since we will be concatenating paragraphs together pre = WithIndicators(True, para_tokens=False, doc_start_token=False) else: pre = None # model = get_model(50, 100, args.mode, pre) tmp = ModelDir("models/squad-shared-norm") model = tmp.get_model() if mode == "paragraph": # Run in the "standard" known-paragraph setting if model.preprocessor is not None: raise NotImplementedError() n_epochs = 26 train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False) eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False) data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching) eval = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")] else: eval_set_mode = { "confidence": "flatten", "sigmoid": "flatten", "shared-norm": "group", "merge": "merge" }[mode] eval_dataset = RandomParagraphSetDatasetBuilder( 100, eval_set_mode, True, 0) if mode == "confidence" or mode == "sigmoid": if mode == "sigmoid": # needs to be trained for a really long time for reasons unknown, even this might be too small n_epochs = 100 else: n_epochs = 50 # more epochs since we only "see" the label very other epoch-osh train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False) data = PreprocessedData( SquadCorpus(), SquadTfIdfRanker(NltkPlusStopWords(True), 4, True, model.preprocessor), StratifyParagraphsBuilder(train_batching, 1), eval_dataset, eval_on_verified=False, ) else: n_epochs = 26 data = PreprocessedData( SquadCorpus(), SquadTfIdfRanker(NltkPlusStopWords(True), 4, True, model.preprocessor), StratifyParagraphSetsBuilder(25, args.mode == "merge", True, 1), eval_dataset, eval_on_verified=False, ) eval = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")] data.preprocess(1) with open(__file__, "r") as f: notes = f.read() notes = args.mode + "\n" + notes params = train_params(n_epochs) if mode == "paragraph": params.best_weights = ("dev", "b17/text-f1") trainer.start_training(data, model, params, eval, model_dir.ModelDir(out), notes, initialize_from=tmp.get_best_weights())
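For context, the output directory name above is simply the user-supplied name plus a start-time stamp; the small sketch below reproduces that expression with hypothetical argument values to show what a resulting directory name looks like.

# Illustrative only: how the output directory name is formed above
from datetime import datetime
name, mode = "squad-doc", "shared-norm"  # hypothetical CLI arguments
out = name + "-" + datetime.now().strftime("%m%d-%H%M%S")
print(out)  # e.g. "squad-doc-0315-142359"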