def examples_from_file(path, seq_length_limit):
    """Return list[EditExample] from file path."""
    examples = []

    # count total lines before loading
    total_lines = num_lines(path)

    with codecs.open(path, 'r', encoding='utf-8') as f:
        lnum = 0
        for line in verboserate(f, desc='Reading data file.', total=total_lines):
            split = line.strip().split('\t')
            lnum += 1
            input_words = []
            try:
                # each source column becomes one whitespace-tokenized input channel
                for c in config.source_cols:
                    input_words.append(split[c].split(' '))
                trg_words = split[config.target_col].split(' ')  # gold answer
                assert len(trg_words) > 0
                ex = EditExample(input_words, trg_words)
                # skip sequences that are too long, because they use up memory
                if max_seq_length(ex) > seq_length_limit:
                    continue
                examples.append(ex)
            except Exception:
                print('bad formatting in line ' + str(lnum))
                print(line)
    return examples
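# Minimal, self-contained sketch of the per-line parsing that examples_from_file
# performs, using made-up column indices in place of config.source_cols /
# config.target_col. The sample TSV line and helper name are illustrative only.
def _demo_parse_tsv_line(line='fix the bug\tin parser.py\tfixed the bug'):
    source_cols, target_col = [0, 1], 2  # hypothetical config values
    split = line.strip().split('\t')
    input_words = [split[c].split(' ') for c in source_cols]
    trg_words = split[target_col].split(' ')  # gold answer
    return input_words, trg_words

# _demo_parse_tsv_line() -> ([['fix', 'the', 'bug'], ['in', 'parser.py']],
#                            ['fixed', 'the', 'bug'])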
def edit():
    events = request.form['events']
    # each comma-separated event is a whitespace-tokenized token sequence
    events = [event.split() for event in events.split(',')]
    processedEvents = [EditExample([[event[0]], [event[1]], event[2:]], ['?'])
                       for event in tqdm(events)]
    valid_eval = ret_model.ret_and_make_ex(processedEvents, new_lsh, examples.train, 0,
                                           train_mode=False)
    beam_list, edit_traces = edit_model.edit(valid_eval)

    # base retriever.
    import gtd.retrieval_func as rf
    lsh, lsh_dict = rf.make_hash(examples.train)
    output_index = rf.grab_nbs(processedEvents, lsh, lsh_dict)
    ret_pred = rf.generate_predictions(examples.train, output_index)

    # eval code
    gen_out = []
    for i in tqdm(range(len(edit_traces))):
        gen = beam_list[i][0]  # top beam hypothesis for example i
        gen_out.append(gen)

    dist = []
    prob = []
    for i in tqdm(range(len(edit_traces))):
        dist.append(str(valid_eval[i].dist))
        prob.append(str(edit_traces[i].decoder_trace.candidates[0].prob))

    output = [gen_out, dist, prob]
    return json.dumps(output)
def output_file(pickle_path):
    # result format: {(name_of_file, total_line_num): [ExampleLines]}
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'

    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1],
                     dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
    df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))

    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words),
                                    len(ex.target_words))
    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]), zip(df[0].tolist(), df[1].tolist())))
        # skip sequences that are too long, because they use up memory
        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))

        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        k = list(result.keys())
        val = ex
        name, l = k[0]

        # encode all lines of the file in batches of 32, in file order
        new_vecs = None
        for batch in chunks(val, 32):
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin

        # for each encoded line x_i, find its closest neighbours within the file
        ne = NearestNeighbors(n_neighbors=10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]

        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            # drop neighbours within +/-2 lines of the current line, keep the closest 5
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame(
                [(' '.join(val[ret_idx].input_words[0]), ' '.join(val[ret_idx].target_words))
                 for ret_idx in filtered_idx]).values.flatten())
            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]), ' '.join(val[idx].target_words)]
                + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)

        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)
    except Exception as e:
        print(e)
        print('bad formatting in file ' + str(pickle_path).split('/')[-1])
        print(pickle_path)
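# Small self-contained illustration of the neighbour filtering used in output_file:
# for each line index idx, neighbours that fall within +/-2 lines of idx are
# discarded (to avoid trivially retrieving adjacent lines) and the closest 5
# remaining ones are kept. The example neighbour row below is made up.
import numpy as np

def _demo_filter_neighbors(row, idx):
    return row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]

# _demo_filter_neighbors(np.array([3, 4, 9, 0, 7, 6, 12]), idx=5)
# -> array([ 9,  0, 12])   (3, 4, 7 and 6 lie inside the +/-2 window around 5)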
def make_editexamples(self, proto_list, edit_list):
    # pair the example being edited with each retrieved prototype: the prototype's
    # input channels and its target are appended as extra input channels
    example_list = []
    for i in range(len(proto_list)):
        el = EditExample(
            edit_list[0].input_words + proto_list[i].input_words + [proto_list[i].target_words],
            edit_list[0].target_words)
        example_list.append(el)
    return example_list
def make_eexs(inlist, outlist):
    fline = []
    for instr, outstr in zip(inlist, outlist):
        # card name: the first field with whitespace, punctuation and math symbols removed
        cardname = regex.sub(r'[\p{P}\p{Sm}]+', '', ''.join(instr[0].split(' ')))
        i1 = [cardname] + instr[0].split(' ')
        i2 = instr[1:9]
        i3 = instr[9].split(' ')
        tmp = EditExample([i1, i2, i3], outstr)
        fline.append(tmp)
    return fline
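# A minimal illustration of the card-name normalisation done in make_eexs:
# whitespace is removed and Unicode punctuation / math symbols are stripped with
# the third-party `regex` module (which, unlike `re`, supports \p{...} property
# classes). The sample string is made up.
import regex

def _demo_cardname(line="Jace, the Mind Sculptor"):
    return regex.sub(r'[\p{P}\p{Sm}]+', '', ''.join(line.split(' ')))

# _demo_cardname() -> 'JacetheMindSculptor'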
def edit(events):
    # fixed placeholder target sequence used for every example
    x = "with enraged yells <PRP> repeatedly throws Synset('entity.n.01') at <ORGANIZATION>8 Synset('natural_phenomenon.n.01') that seals the Synset('action.n.01') startling <PERSON>14 the Synset('defender.n.01') on Synset('group_action.n.01')".split()
    processedEvents = [EditExample([[event[0]], [event[1]], event[2:]], x)
                       for event in events]
    print(processedEvents[0])
    valid_eval = ret_model.ret_and_make_ex(processedEvents, new_lsh, examples.train, 0,
                                           train_mode=False)
    beam_list, edit_traces = edit_model.edit(valid_eval)

    # base retriever.
    import gtd.retrieval_func as rf
    lsh, lsh_dict = rf.make_hash(examples.train)
    output_index = rf.grab_nbs(processedEvents, lsh, lsh_dict)
    ret_pred = rf.generate_predictions(examples.train, output_index)

    # eval code
    gen_out = []
    for i in range(len(edit_traces)):
        gen = beam_list[i][0]  # top beam hypothesis for example i
        gen_out.append(gen)

    dist = []
    prob = []
    for i in range(len(edit_traces)):
        dist.append(str(valid_eval[i].dist))
        prob.append(str(edit_traces[i].decoder_trace.candidates[0].prob))

    output = {'output': gen_out, 'distances': dist, 'beamProb': prob}
    print(output)
    return output
def interact(self, beam_size=8, constrain_vocab=False, verbose=True):
    ex = EditExample.from_prompt()
    beam_list, edit_traces = self.edit([ex], beam_size=beam_size,
                                       constrain_vocab=constrain_vocab)
    beam = beam_list[0]
    output_words = beam[0]  # top beam hypothesis
    edit_trace = edit_traces[0]

    # nll = lambda example: self.loss([example]).data[0]
    # TODO: make this fully generative in the right way.. current NLL is wrong, disabled for now.
    # compare NLL of correct output and predicted output
    # output_ex = EditExample(ex.source_words, ex.insert_words, ex.delete_words, output_words)
    # gold_nll = nll(ex)
    # output_nll = nll(output_ex)

    print('output:')
    print(' '.join(output_words))

    if verbose:
        # print('output NLL: {}, gold NLL: {}'.format(output_nll, gold_nll))
        print(edit_trace)
def input_remapper(self, batch):
    # autoencoding remap: the target of each example becomes the concatenation
    # of all of its input channels
    flatten = lambda l: [item for sublist in l for item in sublist]
    return [EditExample(input_words=ex.input_words, target_words=flatten(ex.input_words))
            for ex in batch]
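# Illustration of the remapping above, using a stand-in namedtuple in place of the
# project's EditExample (assumed, as in the functions above, to expose .input_words
# and .target_words). The token lists are made up.
from collections import namedtuple

_Ex = namedtuple('_Ex', ['input_words', 'target_words'])

def _demo_input_remap():
    ex = _Ex(input_words=[['a', 'b'], ['c']], target_words=['x'])
    flatten = lambda l: [item for sublist in l for item in sublist]
    # the new target is the concatenation of all input channels: ['a', 'b', 'c']
    return _Ex(input_words=ex.input_words, target_words=flatten(ex.input_words))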
def examples_from_file(data_paths, seq_length_limit, fname):
    """Build EditExamples from each CSV in data_paths and pickle them per file."""
    examples = {}
    MAX_LINE_LENGTH = 128

    # path of a combined pickle of all examples; a cached copy could be loaded and
    # returned here, but only per-file pickles are written below
    name = '{}.pickle'.format(fname)
    file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name

    # count total files before loading
    num_direct = len(data_paths)
    for line in verboserate(data_paths, desc='Reading data file.', total=num_direct):
        df = pd.read_csv(line, skiprows=2, header=None, names=[0, 1],
                         dtype=str).fillna(NO_CONTEXT_WORD)
        df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
        df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
        try:
            ex = []
            for i, row in df.iterrows():
                try:
                    ex.append(EditExample(row[0], row[1]))
                except Exception:
                    # skip badly formatted rows
                    continue
            # skip sequences that are too long, because they use up memory
            ex = list(ifilterfalse(lambda x: max_seq_length(x) > seq_length_limit, ex))

            key = (str(line).split('/')[-1], len(ex))
            examples[key] = ex

            # pickle this file's examples individually
            file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
            k = str(line).split('/')[-1].split('.')[0]
            pick_obj = {key: ex}
            obj_name = str(file / k) + '.pickle'
            with open(obj_name, 'wb') as f:
                pickle.dump(pick_obj, f)
        except Exception as e:
            print(e)
            print('bad formatting in file ' + str(line).split('/')[-1])
            print(line)
    return list(examples.values())