def query_to_encoder_features(sentence, vocabs, FLAGS):
    """Convert a natural language query into feature vectors used by the encoder."""
    # Tokenize the query according to the input channel.
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
        init_vocab = data_utils.CHAR_INIT_VOCAB
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB
    else:
        if FLAGS.normalized:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.ner_tokenizer)
        else:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB

    # Map the tokens to ids in the source vocabulary.
    sc_ids = data_utils.tokens_to_ids(tokens, vocabs.sc_vocab)
    encoder_features = [[sc_ids]]

    # For CopyNet, also compute ids in the extended (target + copy) vocabulary:
    # a source token that appears in the target vocabulary keeps its target id,
    # while any other token gets a position-based id past the end of the target
    # vocabulary so it can only be produced by copying.
    if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
        csc_ids = []
        for i, t in enumerate(tokens):
            if t not in init_vocab and t in vocabs.tg_vocab:
                csc_ids.append(vocabs.tg_vocab[t])
            else:
                csc_ids.append(len(vocabs.tg_vocab) + i)
        encoder_features.append([csc_ids])

    return encoder_features
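
# --- Illustration (hypothetical, not part of the module) ---------------------
# A minimal, self-contained sketch of the CopyNet indexing scheme used above,
# with a toy target vocabulary and token list standing in for `vocabs.tg_vocab`
# and the tokenizer output from `data_utils`.
def _copynet_source_ids_demo():
    init_vocab = {'_PAD', '_EOS', '_UNK'}          # placeholder special symbols
    tg_vocab = {'find': 0, 'file': 1, 'print': 2}  # toy target vocabulary
    tokens = ['find', 'every', 'file']             # toy source query tokens

    csc_ids = []
    for i, t in enumerate(tokens):
        if t not in init_vocab and t in tg_vocab:
            # Token is generatable: reuse its target-vocabulary id.
            csc_ids.append(tg_vocab[t])
        else:
            # Token can only be copied: assign a position-based id past the
            # end of the target vocabulary.
            csc_ids.append(len(tg_vocab) + i)
    return csc_ids

assert _copynet_source_ids_demo() == [0, 4, 1]
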
def query_to_copy_tokens(sentence, FLAGS):
    """Tokenize a query into the raw tokens that the decoder may copy.

    Case and lemmatization are preserved so that copied tokens reproduce the
    original surface forms in the query.
    """
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(
            sentence, tokenizer.basic_tokenizer, to_lower_case=False,
            lemmatization=False)
    else:
        tokens = data_utils.nl_to_tokens(
            sentence, tokenizer.basic_tokenizer, to_lower_case=False,
            lemmatization=False)
    return tokens
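
# --- Illustration (hypothetical, not part of the module) ---------------------
# Sketch of why copy tokens keep their original surface form: when the decoder
# emits a copy decision, the raw token is substituted verbatim into the output.
# The `__COPY__` marker and all values below are made up for illustration only.
def _resolve_copies_demo():
    copy_tokens = ['Find', 'README.md', 'in', '/Home/User']  # raw surface forms
    decoded = ['find', '__COPY__1', '__COPY__3']             # toy decoder output
    resolved = []
    for tok in decoded:
        if tok.startswith('__COPY__'):
            # Replace the copy marker with the untouched query token.
            resolved.append(copy_tokens[int(tok[len('__COPY__'):])])
        else:
            resolved.append(tok)
    return resolved

assert _resolve_copies_demo() == ['find', 'README.md', '/Home/User']
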