import cPickle
import os
import sys

import h5py
import numpy
import numpy as np

import model_builder
import utils

# MODEL_PATH and RF_EVAL_PATH are assumed to be module-level constants
# (model directory and role-filler evaluation data directory) defined in
# the surrounding project.


def write_punctuations(net, text_has_pause_duration_tags, unpunctuated_text,
                       output_file_path, punctuation_reverse_map,
                       write_readable_text):
    stream = unpunctuated_text.split()  # + ["<END>"]
    word = None
    pause = 0.
    # Write to stdout when the output path is "-"; only close real files,
    # so that sys.stdout is not closed by accident (the original `with`
    # over sys.stdout would close it).
    output_file = open(output_file_path, 'w') if output_file_path != "-" else sys.stdout
    try:
        if text_has_pause_duration_tags:
            for token in stream:
                if token.startswith("<sil="):
                    previous_pause = pause
                    pause = float(token.replace("<sil=", "").replace(">", ""))
                    word_index = utils.input_word_index(net.in_vocabulary, word)
                    punctuation_index = net.predict_punctuation(
                        [word_index], np.array([previous_pause]))[0]
                    punctuation = punctuation_reverse_map[punctuation_index]
                    if punctuation == " ":
                        output_file.write("%s%s" % (punctuation, word))
                    elif write_readable_text:
                        output_file.write("%s %s" % (punctuation[:1], word))
                    else:
                        output_file.write(" %s %s" % (punctuation, word))
                else:
                    word = token
        else:
            # No pause tags: predict with a zero pause before every word.
            for word in stream:
                word_index = utils.input_word_index(net.in_vocabulary, word)
                punctuation_index = net.predict_punctuation(
                    [word_index], np.array([0.0]))[0]
                punctuation = punctuation_reverse_map[punctuation_index]
                if punctuation == " ":
                    output_file.write("%s%s" % (punctuation, word))
                elif write_readable_text:
                    output_file.write("%s %s" % (punctuation[:1], word))
                else:
                    output_file.write(" %s %s" % (punctuation, word))
    finally:
        if output_file is not sys.stdout:
            output_file.close()
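
# Usage sketch for the pause-aware variant above (hypothetical text and
# paths; assumes a trained `net` and a `punctuation_reverse_map`, e.g. built
# with utils.get_reverse_map, loaded elsewhere):
#
#   text = "this <sil=0.11> is <sil=0.28> an <sil=0.05> example <sil=1.32>"
#   write_punctuations(net, True, text, "-", punctuation_reverse_map,
#                      write_readable_text=True)
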
def convert_files(file_paths, vocabulary, punctuations, batch_size,
                  use_pauses, output_path):
    inputs = []
    outputs = []
    punctuation = " "
    pause = 0.
    if use_pauses:
        pauses = []
    for file_path in file_paths:
        with open(file_path, 'r') as corpus:
            for line in corpus:
                for token in line.split():
                    if token in punctuations:
                        punctuation = token
                        continue
                    elif token.startswith("<sil="):
                        pause = float(token.replace("<sil=", "").replace(">", ""))
                        continue
                    else:
                        inputs.append(utils.input_word_index(vocabulary, token))
                        outputs.append(utils.punctuation_index(punctuations, punctuation))
                        if use_pauses:
                            pauses.append(pause)
                        punctuation = " "
                        pause = 0.
    inputs.append(utils.input_word_index(vocabulary, "<END>"))
    outputs.append(utils.punctuation_index(punctuations, punctuation))
    if use_pauses:
        pauses.append(pause)
    assert len(inputs) == len(outputs)
    # Integer division: np.floor() returns a float, which is not a valid
    # reshape dimension.
    num_batches = len(inputs) // batch_size
    dtype = np.int32 if len(vocabulary) > 32767 else np.int16
    inputs = np.array(inputs, dtype=dtype)[:batch_size * num_batches] \
        .reshape((batch_size, num_batches)).T
    outputs = np.array(outputs, dtype=np.int16)[:batch_size * num_batches] \
        .reshape((batch_size, num_batches)).T
    if use_pauses:
        pauses = np.array(pauses, dtype=np.float32)[:batch_size * num_batches] \
            .reshape((batch_size, num_batches)).T
    total_size = batch_size * num_batches
    data = {"inputs": inputs,
            "outputs": outputs,
            "vocabulary": vocabulary,
            "punctuations": punctuations,
            "batch_size": batch_size,
            "total_size": total_size}
    if use_pauses:
        data["pauses"] = pauses
    with open(output_path, 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)
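
# Usage sketch (hypothetical file names and batch size; the vocabulary and
# punctuation loaders are the same utils helpers used by convert_file below):
#
#   vocabulary = utils.load_vocabulary("vocab.txt")
#   punctuations = utils.load_punctuations("punctuations.txt")
#   convert_files(["train.txt"], vocabulary, punctuations, batch_size=128,
#                 use_pauses=True, output_path="train.pkl")
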
def convert_files(file_paths, vocabulary, punctuations, output_path):
    inputs = []
    outputs = []
    punctuation = " "
    for file_path in file_paths:
        with open(file_path, 'r') as corpus:
            for line in corpus:
                # One-hot <START> marker at the beginning of every line.
                array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
                array[0, utils.input_word_index(vocabulary, "<START>")] = 1
                inputs.append(array)
                array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
                array[0, utils.punctuation_index(punctuations, " ")] = 1
                outputs.append(array)
                for token in line.split():
                    if token in punctuations:
                        punctuation = token
                        continue
                    else:
                        array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
                        array[0, utils.input_word_index(vocabulary, token)] = 1
                        inputs.append(array)
                        array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
                        array[0, utils.punctuation_index(punctuations, punctuation)] = 1
                        outputs.append(array)
                        punctuation = " "
    # One-hot <END> marker closing the whole corpus.
    array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
    array[0, utils.input_word_index(vocabulary, "<END>")] = 1
    inputs.append(array)
    array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
    array[0, utils.punctuation_index(punctuations, punctuation)] = 1
    outputs.append(array)
    assert len(inputs) == len(outputs)
    inputs = np.array(inputs, dtype=np.int8).reshape((len(inputs), 1, len(vocabulary)))
    outputs = np.array(outputs, dtype=np.int16).reshape((len(outputs), len(punctuations)))
    # Use a context manager so the HDF5 file is closed even on error.
    with h5py.File(output_path + '.h5', "w") as f:
        f.create_dataset('inputs', data=inputs, dtype='i8')
        f.create_dataset('outputs', data=outputs, dtype='i8')
    data = {"vocabulary": vocabulary,
            "punctuations": punctuations,
            "total_size": len(inputs)}
    with open(output_path + '.pkl', 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)
def write_punctuations(net, unpunctuated_text, output_file_path,
                       punctuation_reverse_map, write_readable_text):
    stream = unpunctuated_text.split()  # + ["<END>"]
    word = None
    pause = 0.
    with open(output_file_path, 'w') as output_file:
        for token in stream:
            if token.startswith("<sil="):
                previous_pause = pause
                pause = float(token.replace("<sil=", "").replace(">", ""))
                word_index = utils.input_word_index(net.in_vocabulary, word)
                punctuation_index = net.predict_punctuation(
                    [word_index], np.array([previous_pause]))[0]
                punctuation = punctuation_reverse_map[punctuation_index]
                if punctuation == " ":
                    output_file.write("%s%s" % (punctuation, word))
                elif write_readable_text:
                    output_file.write("%s %s" % (punctuation[:1], word))
                else:
                    output_file.write(" %s %s" % (punctuation, word))
            else:
                word = token
def get_top_predictions(inputs, target, model, raw_word_list, n=5):
    """Returns the top predicted fillers for a target role, given a set of
    input roles + fillers.

    Keyword arguments:
    inputs -- A dictionary of inputs with the role as the key and the filler
        as the value.
    target -- A singleton dictionary containing the target role as the key
        and the target filler as the value.
    model -- The loaded model with which to make predictions.
    raw_word_list -- A dictionary mapping every role in the model's role
        vocabulary to a filler word.
    n -- The number of top predictions to retrieve.
    """
    raw_word_list.update(inputs)
    assert len(raw_word_list) == len(model.role_vocabulary)

    t_r = [model.role_vocabulary.get(r, model.unk_role_id) for r in target.keys()]
    t_w = [model.unk_word_id]

    input_roles_words = {}
    for r, w in raw_word_list.items():
        input_roles_words[model.role_vocabulary[r]] = utils.input_word_index(
            model.word_vocabulary, w, model.unk_word_id, warn_unk=False)
    # The target role itself must not be part of the input context.
    input_roles_words.pop(t_r[0])

    x_w_i = numpy.asarray([input_roles_words.values()], dtype=numpy.int64)
    x_r_i = numpy.asarray([input_roles_words.keys()], dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    predicted_word_indices = model.top_words(x_w_i, x_r_i, y_w_i, y_r_i, n)

    results = []
    reverse_vocabulary = utils.get_reverse_map(model.word_vocabulary)
    for t_w_i in predicted_word_indices:
        y_w_i = numpy.asarray([t_w_i], dtype=numpy.int64)
        p = model.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
        lemma = reverse_vocabulary[int(t_w_i)]
        results.append((lemma, p))
    return results
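
# Usage sketch (hypothetical roles and fillers; assumes a model loaded via
# model_builder as in query() below). raw_word_list starts with every role
# mapped to the missing-word placeholder, mirroring the setup in query().
# Note that only the target's role is used here; its filler is ignored:
#
#   reverse_vocab = utils.get_reverse_map(model.word_vocabulary)
#   reverse_roles = utils.get_reverse_map(model.role_vocabulary)
#   raw = dict((reverse_roles[r], reverse_vocab[model.missing_word_id])
#              for r in model.role_vocabulary.values())
#   top = get_top_predictions({"A0": "man", "V": "eat"}, {"A1": "apple"},
#                             model, raw, n=5)  # -> [(lemma, prob), ...]
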
def write_punctuations(net, punctuation_reverse_map, document):
    inputs = document.split()
    inputs = fix_missing_pauses(inputs) + ["<END>"]
    word = None
    pause = 0.
    tags = []
    first_word = True
    for token in inputs:
        if is_pause(token):
            previous_pause = pause
            pause = float(token.replace("<sil=", "").replace(">", ""))
            word_index = utils.input_word_index(net.in_vocabulary, word)
            punctuation_index = net.predict_punctuation(
                [word_index], np.array([previous_pause]))[0]
            if first_word:
                punctuation = ""
            else:
                punctuation = punctuation_reverse_map[punctuation_index]
            tagstring = " ".join(tags) + " " if tags else ""
            tags = []
            if punctuation.strip() == "":
                sys.stdout.write("%s%s%s" % (punctuation, tagstring, word))
            else:
                sys.stdout.write("%s %s%s" % (punctuation[:1], tagstring, word))
            first_word = False
        elif is_word(token):
            word = token
        else:
            tags.append(token)
    sys.stdout.write("\n")
    sys.stdout.flush()
def get_filler_prob(inputs, target, model, raw_word_list):
    """Returns the probability of a target filler for a role, given a set of
    input roles + fillers.

    Keyword arguments:
    inputs -- A dictionary of inputs with the role as the key and the filler
        as the value.
    target -- A singleton dictionary containing the target role as the key
        and the target filler as the value.
    model -- The loaded model with which to make predictions.
    raw_word_list -- A dictionary mapping every role in the model's role
        vocabulary to a filler word.
    """
    raw_word_list.update(inputs)
    assert len(raw_word_list) == len(model.role_vocabulary)

    t_r = [model.role_vocabulary.get(r, model.unk_role_id) for r in target.keys()]
    t_w = [model.word_vocabulary.get(w, model.unk_word_id) for w in target.values()]

    # Out-of-vocabulary target fillers cannot be scored meaningfully.
    if t_w[0] == model.unk_word_id:
        return None

    input_roles_words = {}
    for r, w in raw_word_list.items():
        input_roles_words[model.role_vocabulary[r]] = utils.input_word_index(
            model.word_vocabulary, w, model.unk_word_id, warn_unk=False)
    # The target role itself must not be part of the input context.
    input_roles_words.pop(t_r[0])

    x_w_i = numpy.asarray([input_roles_words.values()], dtype=numpy.int64)
    x_r_i = numpy.asarray([input_roles_words.keys()], dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    return model.p_words(x_w_i, x_r_i, y_w_i, y_r_i)[0]
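
# Usage sketch (same hypothetical setup as for get_top_predictions above;
# returns None when the target filler is out of vocabulary):
#
#   p = get_filler_prob({"A0": "man", "V": "eat"}, {"A1": "apple"},
#                       model, raw)
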
def convert_file(file_path, vocab_file, punct_file, output_path):
    punctuations = utils.load_punctuations(punct_file)
    vocabulary = utils.load_vocabulary(vocab_file)
    punctuation = " "
    time_steps = 1  # to be used in future experiments
    filename = 'database'  # output file name

    f = h5py.File(os.path.join(output_path, filename + '.h5'), "w")
    input_dset = f.create_dataset(
        'inputs', (100, time_steps, len(vocabulary)), dtype='i8',
        maxshape=(None, time_steps, len(vocabulary)))
    output_dset = f.create_dataset(
        'outputs', (100, len(punctuations)), dtype='i8',
        maxshape=(None, len(punctuations)))
    data_counter = 0

    with open(file_path, 'r') as corpus:
        for line in corpus:
            # One-hot <START> marker at the beginning of every line.
            array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
            array[0, utils.input_word_index(vocabulary, "<START>")] = 1
            input_dset[data_counter] = array
            array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
            array[0, utils.punctuation_index(punctuations, " ")] = 1
            output_dset[data_counter] = array
            data_counter += 1
            # Grow both datasets in chunks of 1000 rows when they fill up.
            if data_counter == input_dset.shape[0]:
                input_dset.resize(input_dset.shape[0] + 1000, axis=0)
                output_dset.resize(output_dset.shape[0] + 1000, axis=0)

            for token in line.split():
                if token in punctuations:
                    punctuation = token
                    continue
                else:
                    array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
                    array[0, utils.input_word_index(vocabulary, token)] = 1
                    input_dset[data_counter] = array
                    array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
                    array[0, utils.punctuation_index(punctuations, punctuation)] = 1
                    output_dset[data_counter] = array
                    punctuation = " "
                    data_counter += 1
                    if data_counter == input_dset.shape[0]:
                        input_dset.resize(input_dset.shape[0] + 1000, axis=0)
                        output_dset.resize(output_dset.shape[0] + 1000, axis=0)

    # One-hot <END> marker closing the corpus.
    array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
    array[0, utils.input_word_index(vocabulary, "<END>")] = 1
    input_dset[data_counter] = array
    array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
    array[0, utils.punctuation_index(punctuations, punctuation)] = 1
    output_dset[data_counter] = array
    data_counter += 1
    if data_counter == input_dset.shape[0]:
        input_dset.resize(input_dset.shape[0] + 1000, axis=0)
        output_dset.resize(output_dset.shape[0] + 1000, axis=0)

    # Trim the datasets to the number of rows actually written.
    input_dset.resize(data_counter, axis=0)
    output_dset.resize(data_counter, axis=0)
    f.close()

    data = {"vocabulary": vocabulary,
            "punctuations": punctuations,
            "total_size": data_counter}
    with open(os.path.join(output_path, filename + '.pkl'), 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)
    print("Done!")
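
# Usage sketch (hypothetical paths; writes database.h5 and database.pkl
# into output_path, which must already exist):
#
#   convert_file("corpus.txt", "vocab.txt", "punctuations.txt", "data/")
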
def eval_bicknell_switch(model_name, experiment_name, evaluation, model=None,
                         print_result=True, switch_test=False):
    MODEL_NAME = experiment_name
    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)
    bias = net.set_0_bias()

    if print_result:
        print net.role_vocabulary

    eval_data_file = os.path.join(RF_EVAL_PATH, evaluation + '.txt')
    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + evaluation + '.txt')

    probs = []
    baseline = []
    oov_count = 0

    if print_result:
        print eval_data_file
        print "=" * 60

    dataset = numpy.genfromtxt(eval_data_file, dtype=str, delimiter='\t',
                               usecols=[0, 1, 2, 3, 4])

    samples = []
    i = 0
    while True:
        # Each pair of consecutive rows shares A0 and V; one row is the
        # correct A1 ('yes') and the other the incorrect one ('no').
        d = dataset[i]
        d2 = dataset[i + 1]

        A0 = d[0][:-2]
        V = d[1][:-2]
        assert d2[0][:-2] == A0
        assert d2[1][:-2] == V

        if d[3] == 'yes':
            assert d2[3] == 'no'
            A1_correct = d[2][:-2]
            A1_incorrect = d2[2][:-2]
            b_correct = d[4]
            b_incorrect = d2[4]
        else:
            assert d[3] == 'no'
            A1_correct = d2[2][:-2]
            A1_incorrect = d[2][:-2]
            b_correct = d2[4]
            b_incorrect = d[4]

        if A1_correct not in net.word_vocabulary or A1_incorrect not in net.word_vocabulary:
            if A1_correct not in net.word_vocabulary and print_result:
                print "%s MISSING FROM VOCABULARY. SKIPPING..." % A1_correct
            if A1_incorrect not in net.word_vocabulary and print_result:
                print "%s MISSING FROM VOCABULARY. SKIPPING..." % A1_incorrect
        else:
            roles = net.role_vocabulary.values()
            del roles[net.unk_role_id]

            input_roles_words = dict((r, net.missing_word_id) for r in roles)
            input_roles_words[net.role_vocabulary["A0"]] = utils.input_word_index(
                net.word_vocabulary, A0, net.unk_word_id, warn_unk=True)
            input_roles_words[net.role_vocabulary["V"]] = utils.input_word_index(
                net.word_vocabulary, V, net.unk_word_id, warn_unk=True)

            sample = (
                numpy.asarray([input_roles_words.values(),
                               input_roles_words.values()],
                              dtype=numpy.int64),  # x_w_i
                numpy.asarray([input_roles_words.keys(),
                               input_roles_words.keys()],
                              dtype=numpy.int64),  # x_r_i
                numpy.asarray([net.word_vocabulary[A1_correct],
                               net.word_vocabulary[A1_incorrect]],
                              dtype=numpy.int64),  # y_w_i (1st is correct, 2nd is incorrect)
                numpy.asarray([net.role_vocabulary["A1"],
                               net.role_vocabulary["A1"]],
                              dtype=numpy.int64),  # y_r_i
                [b_correct, b_incorrect],  # bicknell scores
                "\"" + A0 + " " + V + "\"",  # context
                [A1_correct, A1_incorrect])
            samples.append(sample)

        i += 2
        if i > len(dataset) - 2:
            break

    num_samples = len(samples)
    num_correct = 0
    num_total = 0

    if print_result:
        print "context", "correct", "incorrect", "P(correct)", "P(incorrect)", "bicknell_correct", "bicknell_incorrect"

    result_list = []
    for x_w_i, x_r_i, y_w_i, y_r_i, bicknell, context, a1 in samples:
        p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        p_correct = p[0]
        p_incorrect = p[1]
        if print_result:
            print context, a1[0], a1[1], p_correct, p_incorrect, bicknell[0], bicknell[1]

        result_list.append(1 if p_correct > p_incorrect else 0)
        num_correct += p_correct > p_incorrect
        num_total += 1

    assert num_total == num_samples
    accuracy = float(num_correct) / float(num_samples)

    if print_result:
        print "Number of lines %d" % num_samples
        print "Baseline Lenci11 is 43/64=0.671875"
        print "Final score of theano model is %d/%d=%.6f" % (num_correct, num_samples, accuracy)
        print result_list

    if switch_test and print_result:
        print "\nSwitch A0/A1 TEST"

        input_words = []
        input_roles = []
        for i in range(1):
            roles = net.role_vocabulary.values()
            print net.unk_role_id
            roles.remove(net.unk_role_id)

            input_role_word_pairs = dict((r, net.missing_word_id) for r in roles)
            input_role_word_pairs[net.role_vocabulary["V"]] = utils.input_word_index(
                net.word_vocabulary, "buy", net.unk_word_id, warn_unk=True)
            input_words.append(input_role_word_pairs.values())
            input_roles.append(input_role_word_pairs.keys())

        man = utils.input_word_index(net.word_vocabulary, "man",
                                     net.unk_word_id, warn_unk=True)
        car = utils.input_word_index(net.word_vocabulary, "car",
                                     net.unk_word_id, warn_unk=True)
        a1 = net.role_vocabulary["A1"]
        a0 = net.role_vocabulary["A0"]

        a0_test = (
            numpy.asarray(input_words, dtype=numpy.int64),
            numpy.asarray(input_roles, dtype=numpy.int64),
            numpy.asarray([man, car], dtype=numpy.int64),
            numpy.asarray([a0], dtype=numpy.int64),
        )
        x_w_i, x_r_i, y_w_i, y_r_i = a0_test
        p0 = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        print p0

        a1_test = (
            numpy.asarray(input_words, dtype=numpy.int64),
            numpy.asarray(input_roles, dtype=numpy.int64),
            numpy.asarray([man, car], dtype=numpy.int64),
            numpy.asarray([a1], dtype=numpy.int64),
        )
        x_w_i, x_r_i, y_w_i, y_r_i = a1_test
        p1 = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        print p1

        print "man buy", p0[0]
        print "buy man", p1[0]
        print "car buy", p0[1]
        print "buy car", p1[1]

    net.set_bias(bias)

    return num_correct, num_samples, accuracy
def query(model_name, experiment_name, inputs, target):
    MODEL_NAME = experiment_name

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    print net.role_vocabulary
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    # net.set_0_bias()
    net.model.summary()

    propbank_map = {
        "subj": "A0",
        "obj": "A1",
        "ARG0": "A0",
        "ARG1": "A1",
        "ARG2": "A2",
    }

    reverse_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print reverse_role_vocabulary

    # Start from a context in which every role is filled by the missing-word
    # placeholder, then overwrite with the given inputs.
    raw_words = dict((reverse_role_vocabulary[r],
                      reverse_vocabulary[net.missing_word_id])
                     for r in net.role_vocabulary.values())
    raw_words.update(inputs)
    assert len(raw_words) == len(net.role_vocabulary)

    t_r = [net.role_vocabulary.get(r, net.unk_role_id) for r in target.keys()]
    t_w = [net.word_vocabulary.get(w, net.unk_word_id) for w in target.values()]

    input_roles_words = {}
    for r, w in raw_words.items():
        input_roles_words[net.role_vocabulary[r]] = utils.input_word_index(
            net.word_vocabulary, w, net.unk_word_id, warn_unk=True)

    print input_roles_words, t_r

    # The target role itself must not be part of the input context.
    input_roles_words.pop(t_r[0])

    x_w_i = numpy.asarray([input_roles_words.values()], dtype=numpy.int64)
    x_r_i = numpy.asarray([input_roles_words.keys()], dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    topN = 20
    predicted_word_indices = net.top_words(x_w_i, x_r_i, y_w_i, y_r_i, topN)

    print(x_w_i, x_r_i, y_w_i, y_r_i)

    p_w = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
    print('p_t_w: ', p_w)

    # Print the top N predictions with a crude probability bar rendered
    # from full and half block characters.
    for i, t_w_i in enumerate(predicted_word_indices):
        y_w_i = numpy.asarray([t_w_i], dtype=numpy.int64)
        p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
        n = numpy.round(p / 0.005)
        fb = numpy.floor(n)
        hb = n % 2
        print u"{:<5} {:7.6f} {:<20} ".format(i + 1, float(p), reverse_vocabulary[int(t_w_i)]) + u"\u2588" * int(fb) + u"\u258C" * int(hb)
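
# Usage sketch (hypothetical model and experiment names; role labels follow
# the PropBank-style scheme in propbank_map above):
#
#   query("NNRF", "experiment1", {"A0": "man", "V": "eat"}, {"A1": "apple"})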