def __init__(self, data_files, vocab_file, label_file, batch_size=32,
             reverse=False, split_word=True, max_len=1200):
    """Set up the dataset wrapper: load vocab/label mappings, then preprocess.

    Args:
        data_files: input data file paths handed to ``_preprocess``.
        vocab_file: path of the token vocabulary file.
        label_file: path of the label-name file.
        batch_size: number of items per batch (default 32).
        reverse: whether token sequences should be reversed.
        split_word: whether content is split into words.
        max_len: maximum sequence length kept (default 1200).
    """
    self.data_files = data_files
    self.batch_size = batch_size
    self.max_len = max_len
    self.reverse = reverse
    self.split_word = split_word
    # Token vocabulary plus forward (word -> id) and inverse (id -> word) maps.
    self.vocab, self.w2i = read_vocab(vocab_file)
    self.i2w = dict((idx, tok) for tok, idx in self.w2i.items())
    # Label names plus forward and inverse index maps.
    self.label_names, self.l2i = read_vocab(label_file)
    self.i2l = dict((idx, name) for name, idx in self.l2i.items())
    # Binary tag mapping (string tag -> column index) and its inverse.
    # NOTE(review): other code in this file uses a 4-class tag map
    # ({"1","0","-1","-2"}); confirm the binary map is intended here.
    self.tag_l2i = {"0": 0, "1": 1}
    self.tag_i2l = dict((idx, tag) for tag, idx in self.tag_l2i.items())
    self._raw_data = []
    self.items = []
    self._preprocess()
def output_newquery(inputs):
    """Run 20-aspect sentiment inference for one query string.

    Calls ``search_process`` on *inputs*; when it returns a list of model
    items, restores the classifier from ``checkpoint_dir``, predicts one
    tag per aspect label, writes the result to ``out_file`` as a single
    UTF-8 JSON line, and returns the populated result dict.

    NOTE(review): all paths are hard-coded relative to the working
    directory — confirm the caller always runs from the project root.

    Args:
        inputs: the query; coerced to ``str``.

    Returns:
        dict with ``id``, ``content`` and one entry per aspect label
        (only fully populated when ``search_process`` returned a list;
        otherwise whatever ``search_process`` put into it).
    """
    import sys  # local import: top-of-file import block is elsewhere

    inputs = str(inputs)
    vocab_file = 'static/modelData/vocab.txt'
    label_file = 'static/modelData/labels.txt'
    checkpoint_dir = 'static/modelData/elmo_ema_0120'
    out_file = 'static/modelData/new_query.json'
    feature_list = [
        "location_traffic_convenience",
        "location_distance_from_business_district",
        "location_easy_to_find",
        "service_wait_time",
        "service_waiters_attitude",
        "service_parking_convenience",
        "service_serving_speed",
        "price_level",
        "price_cost_effective",
        "price_discount",
        "environment_decoration",
        "environment_noise",
        "environment_space",
        "environment_cleaness",
        "dish_portion",
        "dish_taste",
        "dish_look",
        "dish_recommendation",
        "others_overall_experience",
        "others_willing_to_consume_again",
    ]
    # Plain dict: the original defaultdict() had no default_factory, which
    # behaves exactly like dict (missing keys raise KeyError).
    new_dict = {}
    _, w2i = read_vocab(vocab_file)       # token -> id
    _, l2i = read_vocab(label_file)       # label name -> index
    i2l = {v: k for k, v in l2i.items()}  # index -> label name
    # Sentiment classes "1"/"0"/"-1"/"-2" mapped to logit columns, + inverse.
    tag_l2i = {"1": 0, "0": 1, "-1": 2, "-2": 3}
    tag_i2l = {v: k for k, v in tag_l2i.items()}

    model_item = search_process(inputs, new_dict, feature_list, out_file, w2i)
    if isinstance(model_item, list):
        hparams = load_hparams(
            checkpoint_dir, {
                "mode": 'inference',
                'checkpoint_dir': checkpoint_dir + "/best_eval",
                'embed_file': None
            })
        with tf.Session(config=get_config_proto(
                log_device_placement=False)) as sess:
            model = Model(hparams)
            model.build()
            try:
                model.restore_model(sess)  # restore best checkpoint
            except Exception as e:
                print("unable to restore model with exception", e)
                # sys.exit instead of the site-injected exit() builtin;
                # same SystemExit behavior, but always available.
                sys.exit(1)
            # ids from process_item and the raw logits are unused here.
            source, lengths, _, _ = process_item(model_item)
            predict, _ = model.inference_clf_one_batch(
                sess, source, lengths)
            new_dict['id'] = 'new_query'
            new_dict['content'] = inputs
            for p in predict:
                # One tag per aspect: argmax over the 4 sentiment classes.
                for j in range(20):
                    new_dict[i2l[j]] = tag_i2l[np.argmax(p[j])]
            # ensure_ascii=False emits raw non-ASCII text, so the file
            # must be opened with an explicit UTF-8 encoding.
            with open(out_file, 'w', encoding='utf-8') as f:
                f.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    return new_dict
labels = self.get_label(labels, self.tag_l2i) item_labels.append(labels) self._raw_data.append( DataItem(content=content, labels=np.asarray(item_labels), length=len(content), id=int(item['id']))) self.items.append(item) self.num_batches = len(self._raw_data) // self.batch_size self.data_size = len(self._raw_data) print_out("# Got %d data items with %d batches" % (self.data_size, self.num_batches)) vocab, w2i = read_vocab(flags.vocab_file) UNK_ID = 0 SOS_ID = 1 EOS_ID = 2 def _tokenize(content, w2i, max_tokens=1200, reverse=False, split=True): def get_tokens(content): tokens = content.strip().split() ids = [] for t in tokens: if t in w2i: ids.append(w2i[t]) else: for c in t: