import csv
import gzip
import random
import pickle as cPickle  # Python 3 stand-in for the Python 2 cPickle module
from collections import defaultdict
from os.path import join

import numpy as np

# Project-local helpers (convert2indices, hybrid_convert2indices, preprocess,
# preprocess_nlg_text, the process_* field processors, name_tok/near_tok/food_tok,
# get_first_words, sample_first_word, etc.) are assumed to be imported from the
# rest of the repository.


def load_text_pairs(fname, config_data, vocabulary, noutputs=3):
    max_input_length = config_data['max_input_length']
    max_output_length = config_data['max_output_length']
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1

    inputs_raw = []
    outputs_raw = []
    with open(fname, encoding='utf-8', mode='rt') as ifile:
        for line in ifile:
            sline = line.replace('\n', '').split('\t')
            text0 = sline[0]
            text1 = sline[1]
            inputs_raw.append(text0)
            outputs_raw.append(text1)

    input_idx = convert2indices(inputs_raw, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_input_length)
    target_idx = convert2indices(outputs_raw, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_output_length)

    outputs = [np.ones(len(input_idx))] * noutputs
    return [input_idx, target_idx], outputs
def load_text_gen_data(fname, config_data, vocabulary, noutputs=3):
    max_input_length = config_data['max_input_length']
    max_output_length = config_data['max_output_length']
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1

    reader = csv.DictReader(open(fname, encoding='utf-8', mode='rt'))
    inputs_raw = []
    outputs_raw = []
    for row in reader:
        i1 = row['mr']
        i2 = row['ref']
        inputs_raw.append(i1)
        outputs_raw.append(i2)

    input_idx = convert2indices(inputs_raw, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_input_length)
    target_idx = convert2indices(outputs_raw, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_output_length)

    outputs = [np.ones(len(input_idx))] * noutputs
    return [input_idx, target_idx], outputs
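# --- Illustrative sketch only, not part of the original module. ---
# The loaders above and below delegate token-to-index conversion to a
# project-local convert2indices helper that is not shown in this section.
# Assuming it maps each token to its vocabulary index and pads/truncates every
# sequence to max_sent_length (with the dummy index used for both unknown
# tokens and padding, as the call sites suggest), a minimal stand-in could look
# like this. Whether a "token" is a word or a character depends on the
# vocabulary passed in; this sketch simply splits on whitespace.
def _convert2indices_sketch(texts, vocabulary, unknown_idx, padding_idx, max_sent_length=140):
    batch = np.full((len(texts), max_sent_length), padding_idx, dtype='int32')
    for row, text in enumerate(texts):
        for col, token in enumerate(text.split()[:max_sent_length]):
            batch[row, col] = vocabulary.get(token, unknown_idx)
    return batch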
def generate_data_stream(fname, config_data, vocabulary, batch_size, noutputs=2, skip_data=0):
    max_sentence_len = config_data['max_sentence_length']
    dummy_word_idx = vocabulary['DUMMY_WORD']
    outputs = [np.ones(batch_size)] * noutputs
    # vocabulary = {k: v[0] for k, v in vocabulary.items()}

    current_batch = []
    while True:  # loop over the corpus indefinitely, yielding one batch at a time
        if fname.endswith('.tsv') or fname.endswith('.txt'):
            ifile = open(fname, mode='rt', encoding='utf-8')
        elif fname.endswith('.gz') or fname.endswith('.gzip'):
            ifile = gzip.open(fname, mode='rt', encoding='utf-8')

        for i, line in enumerate(ifile, start=1):
            if skip_data > 0:
                skip_data -= 1
                continue

            current_batch.append(line)
            if i % batch_size == 0:
                random.shuffle(current_batch)
                processed_batch = [preprocess(x.replace('\r', '').split('\t')[-1]) for x in current_batch]
                batch_idx = convert2indices(processed_batch, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_sentence_len)
                yield [batch_idx, batch_idx], outputs
                current_batch = []
        ifile.close()
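# Illustrative usage sketch (hypothetical file name and config keys; assumes a
# compiled Keras-style model with two inputs and `noutputs` output heads):
#
#   train_gen = generate_data_stream('train_corpus.tsv', config_data, vocabulary,
#                                    batch_size=128, noutputs=2)
#   model.fit_generator(train_gen,
#                       steps_per_epoch=config_data['samples_per_epoch'] // 128,
#                       epochs=config_data['nb_epochs'])
#
# Because the generator loops over the corpus forever, steps_per_epoch (or an
# equivalent stopping criterion) has to be supplied by the caller.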
def transform_data(fname, vocabulary_word, vocabulary_char, max_sentence_len, noutputs):
    dummy_word_idx = vocabulary_word['DUMMY_WORD'][0]
    dummy_char_idx = max(vocabulary_char.values()) + 1

    file = open(fname, encoding='utf-8', mode='rt')
    curr_tweets = [x.replace('\r', '').split('\t')[-1] for x in file.readlines()]
    processed_batch = [preprocess(x) for x in curr_tweets]

    text_idx = hybrid_convert2indices(curr_tweets, processed_batch, vocabulary_word, dummy_word_idx, dummy_word_idx, max_sent_length=max_sentence_len)
    char_idx = convert2indices(curr_tweets, vocabulary_char, dummy_char_idx, dummy_char_idx, max_sent_length=max_sentence_len)

    outputs = [np.ones(len(curr_tweets))] * noutputs
    return [char_idx, text_idx], outputs
def transform_data(fname, vocabulary, max_sentence_len, noutputs):
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1

    file = open(fname, encoding='utf-8', mode='rt')
    curr_tweets = [x.replace('\r', '').split('\t')[-1] for x in file.readlines()]

    text_idx = convert2indices(curr_tweets, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_sentence_len)

    outputs = [np.ones(len(curr_tweets))] * noutputs
    return [text_idx, text_idx], outputs
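# Usage note (illustrative; placeholder file name and lengths): the word-level
# transform_data above returns the same index matrix as both model input and
# reconstruction target, i.e. autoencoder-style data, while `outputs` is a list
# of dummy all-ones targets, one per model output head:
#
#   inputs, outputs = transform_data('dev_tweets.tsv', vocabulary,
#                                    max_sentence_len=140, noutputs=2)
#   # inputs == [text_idx, text_idx]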
def load_text_gen_data(fname, config_data, vocabulary, noutputs=3, random_output=False, word_based=False, random_first_word=False):
    max_output_length = config_data['max_sentence_len']
    vocab_path = config_data['vocab_path']
    fw_vocab = cPickle.load(open(join(vocab_path, 'fw_vocab.pkl'), 'rb'))
    overlap_map_for_fw = cPickle.load(open(join(vocab_path, 'overlap_map_for_fw.pkl'), 'rb'))
    dummy_word_idx = len(vocabulary)
    dropout_word_idx = len(vocabulary) + 1

    reader = csv.DictReader(open(fname, encoding='utf-8', mode='rt'))
    if word_based:
        vocabulary = {token: idx for token, (idx, freq) in vocabulary.items()}

    headers = [
        ('name', process_name),
        ('eatType', process_eat_type),
        ('priceRange', process_price_range),
        ('customer rating', process_customer_rating),
        ('near', process_near),
        ('food', process_food),
        ('area', process_area),
        ('familyFriendly', process_family_friendly)
    ]

    field_ops = {
        'eatType': 3,
        'priceRange': 6,
        'customer rating': 6,
        'food': 7,
        'area': 2,
        'familyFriendly': 2
    }

    processed_fields = defaultdict(lambda: [])
    outputs_raw = []
    weights_raw = []
    mr_list = []
    for row in reader:
        i1 = row['mr']
        i2 = row.get('ref', '')
        i3 = row.get('weight', 1.0)

        mr_list.append(i1)
        weights_raw.append(float(i3))
        outputs_raw.append(i2)

        # parse the MR string, e.g. "name[The Eagle], eatType[pub]", into a key/value dict
        keywords = i1.split(',')
        kv = {}
        for keyword in keywords:
            kidx = keyword.find('[')
            key = keyword[:kidx].strip()
            value = keyword[kidx + 1: keyword.find(']')]
            kv[key] = value

        for header, funct in headers:
            val = kv.get(header, None)
            processed_value = funct(val)
            processed_fields[header].append(processed_value)

    # encode each MR field: a 2-dim presence flag for the delexicalised fields
    # (name, near, food), a one-hot vector over the field's possible values otherwise
    inputs = []
    for header, _ in headers:
        values = processed_fields[header]
        if header in ['name', 'near', 'food']:
            value_idx = []
            for value in values:
                x = np.zeros(2)
                if value:
                    x[0] = 1
                else:
                    x[1] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        else:
            value_idx = []
            for value in values:
                x = np.zeros(field_ops[header] + 1)
                if value is not None:
                    x[value] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        inputs.append(value_idx)

    outputs_delex = [preprocess_nlg_text(x, name, near, food, name_tok, near_tok, food_tok, word_based=word_based)
                     for x, name, near, food in zip(outputs_raw, processed_fields['name'], processed_fields['near'], processed_fields['food'])]

    if not random_first_word:
        first_words = get_first_words(outputs_delex, fw_vocab, random_first_word)
    else:
        first_words, _ = sample_first_word(inputs, overlap_map_for_fw, fw_vocab)
    inputs.append(first_words)

    target_idx = convert2indices(outputs_delex, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_output_length)
    if random_output:
        target_idx = np.random.normal(loc=0, scale=0.25, size=target_idx.shape)  # np.ones_like(target_idx)*dropout_word_idx
    inputs.append(target_idx)

    weights = np.array(weights_raw)
    outputs = [np.ones(len(inputs[0]))] * noutputs

    lex_dict = {
        name_tok: processed_fields['name'],
        near_tok: processed_fields['near'],
        food_tok: processed_fields['food'],
    }
    return inputs, outputs, [weights] * noutputs, lex_dict
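# Worked example (illustrative only; the MR string below is invented, not taken
# from the dataset files): this reproduces the parsing step used above, turning
# an E2E-style meaning representation into the key/value dict that the
# per-field processors consume before the one-hot input vectors are built.
def _parse_mr_example():
    mr = 'name[The Eagle], eatType[coffee shop], priceRange[cheap]'
    kv = {}
    for keyword in mr.split(','):
        kidx = keyword.find('[')
        key = keyword[:kidx].strip()
        value = keyword[kidx + 1: keyword.find(']')]
        kv[key] = value
    return kv  # {'name': 'The Eagle', 'eatType': 'coffee shop', 'priceRange': 'cheap'}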
def load_text_gen_data(fname, feature_fname, tree_fname, config_data, vocabulary, noutputs=3, random_output=False, word_based=False, random_first_word=False):
    max_output_length = config_data['max_sentence_len']
    feature_list = config_data['features']
    dummy_word_idx = len(vocabulary)
    if word_based:
        vocabulary = {token: idx for token, (idx, freq) in vocabulary.items()}

    field_ops = {
        'eatType': 3,
        'priceRange': 6,
        'customer rating': 6,
        'food': 7,
        'area': 2,
        'familyFriendly': 2
    }

    inputs = []
    # `headers` is assumed to be the module-level list of (field name, processor) pairs
    outputs_raw, processed_fields, weights_raw = _load_nlg_data(fname)
    for header, _ in headers:
        values = processed_fields[header]
        if header in ['name', 'near', 'food']:
            value_idx = []
            for value in values:
                x = np.zeros(2)
                if value:
                    x[0] = 1
                else:
                    x[1] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        else:
            value_idx = []
            for value in values:
                x = np.zeros(field_ops[header] + 1)
                if value is not None:
                    x[value] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        inputs.append(value_idx)

    outputs_delex = [
        preprocess_nlg_text(x, name, near, food, name_tok, near_tok, food_tok, word_based=word_based)
        for x, name, near, food in zip(outputs_raw, processed_fields['name'], processed_fields['near'], processed_fields['food'])
    ]
    target_idx = convert2indices(outputs_delex, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_output_length)

    if not random_first_word:
        nsentence_embeddings, tr_fwords_full_vectors, tr_fphrase_full_vectors, tr_fpos_full_vectors, tr_fwords_vectors, tr_fpos_vectors, tr_fphrase_vectors = load_lex_features(feature_fname, config_data)
        pos_tag_feature, phrase_tag_feature = load_special_tags(tree_fname, config_data)
    else:
        nsentence_embeddings, tr_fwords_full_vectors, tr_fphrase_full_vectors, tr_fpos_full_vectors, tr_fwords_vectors, tr_fpos_vectors, tr_fphrase_vectors = sample_lex_features(processed_fields, config_data)
        pos_tag_feature, phrase_tag_feature = sample_special_tags(processed_fields, config_data)

    # only append the features that are enabled in config_data['features']
    if 'nsent' in feature_list:
        inputs.append(nsentence_embeddings)
    if 'fout_word_vectors' in feature_list:
        inputs.append(tr_fwords_full_vectors)
    if 'fout_phrase_vectors' in feature_list:
        inputs.append(tr_fphrase_full_vectors)
    if 'fout_pos_vectors' in feature_list:
        inputs.append(tr_fpos_full_vectors)
    if 'fword_vectors' in feature_list:
        inputs.extend(tr_fwords_vectors)
    if 'fphrase_vectors' in feature_list:
        inputs.extend(tr_fphrase_vectors)
    if 'fpos_vectors' in feature_list:
        inputs.extend(tr_fpos_vectors)
    if 'pos_tag_feature' in feature_list:
        inputs.append(pos_tag_feature)
    if 'phrase_tag_feature' in feature_list:
        inputs.append(phrase_tag_feature)

    if random_output:
        target_idx = np.random.normal(loc=0, scale=0.25, size=target_idx.shape)
    inputs.append(target_idx)

    weights = np.array(weights_raw)
    outputs = [np.ones(len(inputs[0]))] * noutputs

    lex_dict = {
        name_tok: processed_fields['name'],
        near_tok: processed_fields['near'],
        food_tok: processed_fields['food'],
    }
    return inputs, outputs, [weights] * noutputs, lex_dict
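# Illustrative usage sketch (file names, config keys and the model are
# placeholders; the actual training script is not shown in this section):
#
#   inputs, outputs, weights, lex_dict = load_text_gen_data(
#       'trainset.csv', 'train_features.pkl', 'train_trees.pkl',
#       config_data, vocabulary, noutputs=3)
#   model.fit(x=inputs, y=outputs, sample_weight=weights,
#             batch_size=config_data['batch_size'], epochs=config_data['nb_epochs'])
#
# The per-sample weights are returned once per output head so they can be passed
# directly as a Keras-style sample_weight list; lex_dict keeps the raw
# name/near/food values needed to re-lexicalise the generated, delexicalised text.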