def generate_ev_binary(group='dev', output_name='ev_binary'):
    outdir = '{}/sent_classifier/{}/'.format(utils.DATA_DIR, output_name)
    try:
        os.system('mkdir -p {}'.format(outdir))
    except OSError:
        print('Target dir: {} already exists'.format(outdir))
        input('Proceeding with generation...')
    pmids = utils.group_ids('ev_inf', group)
    with open('{}/{}.tsv'.format(outdir, group), 'w') as fout:
        for pmid in pmids:
            sents = utils.readlines('{}/documents/sents/{}.sents'.format(utils.DATA_DIR, pmid))
            frames = utils.readlines('{}/documents/sents/{}.frame_idx'.format(utils.DATA_DIR, pmid))
            pos_sents = [s for s, fs in zip(sents, frames) if len(fs) >= 1]
            neg_sents = [s for s, fs in zip(sents, frames) if len(fs) == 0]
            print(pmid)
            print(pos_sents)
            input()
            shuffle(neg_sents)
            neg_sents = neg_sents[:len(pos_sents)]
            for s in pos_sents:
                fout.write('{}\t{}\n'.format(1, s))
            for s in neg_sents:
                fout.write('{}\t{}\n'.format(0, s))

def read_docs(phase='starting_spans'):
    pmid_groups = {}
    for g in GROUPS:
        pmids = utils.readlines(os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(g)))
        for pmid in pmids:
            pmid_groups[pmid] = g

    def get_e_fname(pmid, e):
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        f = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(config.EBM_NLP_DIR, 'annotations', 'aggregated', phase, e, subdir, f)

    docs = []
    for pmid, group in pmid_groups.items():
        tokens = utils.readlines(os.path.join(config.EBM_NLP_DIR, 'documents', '{}.tokens'.format(pmid)))
        text, token_offsets = utils.join_tokens(tokens)
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs

def generate_frames(output_name, group='dev'):
    icodir = '{}/ico_acceptor/{}/'.format(utils.DATA_DIR, output_name)
    data = {}
    for sample in ['x', 'c', 'o']:
        input_lines = utils.readlines('{}/{}_sample_{}.txt'.format(icodir, group, sample))
        input_lines = [l.split('\t') for l in input_lines]
        output_lines = utils.readlines('{}/{}_sample_{}_results.txt'.format(icodir, group, sample))
        output_lines = [l.split('\t') for l in output_lines]
        # Input columns (as written by the sampling script): pmid, sent_idx, frame_idx, i, c, o, s
        assert len(input_lines) == len(output_lines)
        all_lines = [i_l + o_l for i_l, o_l in zip(input_lines, output_lines)]
        for pmid, pmid_lines in groupby(all_lines, itemgetter(0)):
            if pmid not in data:
                data[pmid] = {}
            for frame_idx, frame_lines in groupby(pmid_lines, itemgetter(2)):
                pmids, sent_idxs, frame_idxs, i_s, c_s, o_s, s_s, p0s, p1s = zip(*frame_lines)
                assert len(set(pmids)) == 1
                assert len(set(sent_idxs)) == 1
                assert len(set(frame_idxs)) == 1
                assert len(set(s_s)) == 1
                if len(nltk.tokenize.word_tokenize(s_s[0])) < 10:
                    continue
                sent_idx = sent_idxs[0]
                frame_idx = frame_idxs[0]
                ev_span = s_s[0]
                top_frames = sorted(zip(p1s, i_s, c_s, o_s), key=itemgetter(0), reverse=True)
                assert frame_idx not in data[pmid]
                frame = {
                    'sent_idx': sent_idx,
                    'frame_idx': frame_idx,
                    'ev': ev_span,
                    'icos': top_frames[:5],
                    'sample': sample,
                }
                data[pmid][frame_idx] = frame
    for pmid, frames in data.items():
        with open('{}/documents/frames/{}.bert_frames'.format(utils.DATA_DIR, pmid), 'w') as fout:
            for frame in frames.values():
                fout.write(json.dumps(frame) + '\n')

def export_gt_depths_kitti():
    parser = argparse.ArgumentParser(description='export_gt_depth')
    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["eigen", "eigen_benchmark", "odom", "AirSim"])
    parser.add_argument('--sequence',
                        type=int,
                        help='which odom sequence to export gt from',
                        required=False,
                        default=0)
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    if opt.split == "odom":
        lines = readlines(os.path.join(split_folder, "{:02d}_exp.txt".format(opt.sequence)))
    else:
        lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen" or opt.split == "odom":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data", "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256
        elif opt.split == "AirSim":
            gt_depth_path = os.path.join(opt.data_path, folder, "{}.pfm".format(frame_id))
            gt_depth, _ = read_pfm(gt_depth_path)

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")
    print("Saving to {}".format(opt.split))
    np.savez_compressed(output_path, data=np.array(gt_depths))

def create_sfnorm_pair_with_pole(opts):
    from datasets_sfgan import SFGAN_Base_Dataset
    from torch.utils.data import DataLoader
    from utils import readlines
    import torch
    from utils import tensor2disp

    fpath = os.path.join(os.path.dirname(__file__), "..", "splits", opts.split, "{}_files.txt")
    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    syn_train_filenames = readlines(fpath.format("syn_train"))
    syn_val_filenames = readlines(fpath.format("syn_val"))

    train_dataset = SFGAN_Base_Dataset(opts.data_path, train_filenames, syn_train_filenames,
                                       opts.height, opts.width, opts.frame_ids, 4,
                                       opts=opts, is_train=False, load_seman=True)
    train_loader = DataLoader(train_dataset, 1, shuffle=not opts.noShuffle,
                              num_workers=opts.num_workers, pin_memory=True, drop_last=False)

    min_num = 100
    poleId = 5
    pole_ind_rec = list()
    for batch_idx, inputs in enumerate(train_loader):
        num_syn = torch.sum(inputs['syn_semanLabel'] == poleId)
        num_real = torch.sum(inputs['real_semanLabel'] == poleId)
        if num_syn > min_num and num_real > min_num:
            pole_ind_rec.append(batch_idx)
            print(batch_idx)

    split_root = '../splits/sfnorm_pole'
    wf1 = open(os.path.join(split_root, 'train_files.txt'), "w")
    for pole_ind in pole_ind_rec:
        wf1.write(train_filenames[pole_ind] + '\n')
    wf1.close()

    wf1 = open(os.path.join(split_root, 'syn_train_files.txt'), "w")
    for pole_ind in pole_ind_rec:
        wf1.write(syn_train_filenames[pole_ind] + '\n')
    wf1.close()

def ibo_tagging(corpus, keywords, output=None, thread=None):
    """
    Arguments:
        corpus(str): Path to the corpus file.
        keywords(str): Path to the keywords dictionary file.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_ibo.tsv"

    # Load and merge dictionary
    # files = [itr for itr in os.listdir(rule) if itr.endswith("_leaf.json")]
    # Load entities
    # entity = dict()
    # for itr in files:
    #     entity.update(json.load(open(rule + itr, "r")))
    entity = json.load(open(keywords, "r"))

    # Read corpus
    raw_data = readlines(corpus)

    # Threading
    param = (entity,)
    result = generic_threading(thread, raw_data, generate_ibo, param)

    # Write result to file
    file_io(output, result)

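# Illustrative usage sketch; the paths below are hypothetical, not from the original project:
#   ibo_tagging("data/corpus.tsv", "data/keywords.json", thread=4)
# With the default output name this writes the IBO-tagged result to data/corpus_ibo.tsv.
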
def main():
    # We preprocess the list with a simple cli call to sort
    lines = utils.readlines("day4/sortedinput")
    guards = {}
    for line in lines:
        guardIdMatch = re.search(r"#(\d+)", line)
        if guardIdMatch is not None:
            guardId = int(guardIdMatch.group(1))
            if guardId in guards:
                currentSleepRecord = guards[guardId]
            else:
                currentSleepRecord = [0] * 60
                guards[guardId] = currentSleepRecord
        asleepMatch = re.search(r":(\d+)] falls asleep", line)
        if asleepMatch is not None:
            asleepMinute = int(asleepMatch.group(1))
        awakeMatch = re.search(r":(\d+)] wakes up", line)
        if awakeMatch is not None:
            awakeMinute = int(awakeMatch.group(1))
            for t in range(asleepMinute, awakeMinute):
                currentSleepRecord[t] += 1

    scenarioOneGuard = max(guards.items(), key=lambda g: sum(g[1]))
    print(scenarioOneGuard[0] * scenarioOneGuard[1].index(max(scenarioOneGuard[1])))

    scenarioTwoGuard = max(guards.items(), key=lambda g: max(g[1]))
    print(scenarioTwoGuard[0] * scenarioTwoGuard[1].index(max(scenarioTwoGuard[1])))

def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Perform preliminary cleanup of the corpus to make it easier for further
    processing. This method can be used to correct missing spaces after
    punctuation; any other customized rules can be added to the rule file
    (see punctuation_cleanup in utils for the formatting of the rules).

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
        limit(int): Maximum number of lines to load from the corpus.
    """
    # Output name
    if output is None:
        output = corpus[:-4] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)

    # Load data
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Threading
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)

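# Illustrative usage sketch; the corpus and rule paths are hypothetical placeholders:
#   preliminary_cleanup("data/corpus.txt", "src/refine_rules/cleanup.tsv", thread=4)
# With the default output name the result is written next to the input as data/corpus_preprocessed.tsv.
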
def remove_old(path):
    files = iter_files(path)
    for file in files:
        filename = os.path.basename(file)
        print(filename)
        f = open(os.path.join('./final', filename), 'a+')
        lines = readlines(file)
        i = 0
        cnt = 0
        for line in lines:
            article = json.loads(line)
            if 'year' in article:
                if int(article['year']) >= 2000:
                    if "author" in article:
                        article['author'] = list(set(article['author']))
                    tmp = json.dumps(article)
                    f.write(tmp + '\n')
                    f.flush()
                    i += 1
                    # if i % 100000 == 0:
                    #     print(i)
                else:
                    cnt += 1
            else:
                cnt += 1
        f.close()
        print('%s skip:%d, save %d' % (filename, cnt, i))

def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to where "parentheses.tsv" and "refine_list.tsv" are.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_cleaned.txt"
    if not rule_path.endswith("/"):
        rule_path += "/"

    # Load rule files
    file_p = rule_path + "parentheses.tsv"
    file_r = rule_path + "refine_list.tsv"
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus
    raw_data = readlines(corpus)

    # Threading
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all results to file
    write_to_file(output, result)

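# Illustrative usage sketch; the paths are hypothetical. rule_path must contain
# "parentheses.tsv" and "refine_list.tsv":
#   refine_corpus("data/corpus.txt", "src/refine_rules/", thread=4)
# With the default output name the cleaned corpus is written to data/corpus_cleaned.txt.
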
def export_gt_depths_kitti(opt):
    """Generate ground-truth data and store as .npz file"""
    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data", "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")
    print("Saving to {}".format(opt.split))
    np.savez_compressed(output_path, data=np.array(gt_depths))

def read_filenames(self):
    # e.g. splits\\eigen_zhou\\train.txt
    split_path = os.path.join(self.dataset.split_folder, self.dataset.split_name).replace('\\', '/')
    self.filenames = readlines(split_path)
    self.num_items = len(self.filenames)
    self.steps_per_epoch = self.num_items // self.batch_size

def parse_subwords(file):
    """
    Parse subword mapping to dictionary.

    Args:
        file(str): Path to the subword mapping file.
            format: <[WORD]>S1,S2,...
    """
    # Load subwords
    raw_data = readlines(file, limit=None)

    # Parsing information
    mentions, subwords = list(), list()
    for itr in raw_data:
        index = itr.find(">")
        mentions.append(itr[1:index])
        subwords.append(itr[index + 2:].split(","))

    # Create dictionary for mentions and their subwords
    dictionary = dict(zip(mentions, subwords))
    write_to_file("data/subwords.json", dictionary)

    # Subword pool for subword embedding
    # subword_pool = np.unique(list(chain.from_iterable(subwords)))
    subword_pool = dict(Counter(list(chain.from_iterable(subwords))))
    # print(subword_pool)
    print("Raw number of subwords: {:8d}".format(len(subword_pool)))
    write_to_file("data/subword_pool.json", subword_pool)

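# Illustrative usage sketch; the mapping-file path is a hypothetical placeholder:
#   parse_subwords("data/subword_mapping.txt")
# writes data/subwords.json (mention -> subwords) and data/subword_pool.json (subword counts).
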
def main():
    lines = utils.readlines("day2/input")

    sum2 = 0
    sum3 = 0
    for line in lines:
        letters = {}
        for c in line:
            if c in letters:
                letters[c] += 1
            else:
                letters[c] = 1
        found2 = False
        found3 = False
        for c in letters:
            if not found2 and letters[c] == 2:
                sum2 += 1
                found2 = True
            elif not found3 and letters[c] == 3:
                sum3 += 1
                found3 = True
    print("Checksum:", sum2 * sum3)

    for line in lines:
        for secondline in lines:
            if len(line) - 1 == len(strIntersection(line, secondline)):
                print("Common letters:", strIntersection(line, secondline))
                return

def extract_vocabularies(corpus, rule, output=None, thread=None):
    """
    Extract vocabularies from the corpus; additional rules to achieve purer
    vocabularies can be defined in src/refine_rules/voc_cleanup.tsv

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_vocabulary_list.json"

    # Load rules
    rules = load_rules(rule)

    # Acquire the corpus
    raw_data = readlines(corpus, limit=None)

    # Threading (TO-BE-IMPLEMENTED)
    # param = (rules, "SPLIT_WORDS")
    # generic_threading(thread, raw_data, punctuation_cleanup, param)
    result = punctuation_cleanup(0, raw_data, rules, mode='SPLIT_WORDS')

    # Count occurrences
    print("Counting occurrences...")
    voc_list = Counter(result)

    # Save vocabulary to file
    write_to_file(output, voc_list)

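# Illustrative usage sketch; the corpus path is a hypothetical placeholder, while the rule
# file is the one named in the docstring:
#   extract_vocabularies("data/corpus.txt", "src/refine_rules/voc_cleanup.tsv")
# With the default output name the counts are written to data/corpus_vocabulary_list.json.
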
def parseData(filepath):
    samples = utils.readlines(filepath)
    output = []
    for sample in samples:
        m = re.search(r"(\d+) (\d+) (\d+) (\d+)", sample)
        e = executionData(*[int(i) for i in m.group(1, 2, 3, 4)])
        output.append(e)
    return output

def export_gt_depths_kitti():
    parser = argparse.ArgumentParser(description="export_gt_depth")
    parser.add_argument(
        "--data_path",
        type=str,
        help="path to the root of the KITTI data",
        required=True,
    )
    parser.add_argument(
        "--split",
        type=str,
        help="which split to export gt from",
        required=True,
        choices=["eigen", "eigen_benchmark"],
    )
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(
                opt.data_path,
                folder,
                "velodyne_points/data",
                "{:010d}.bin".format(frame_id),
            )
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(
                opt.data_path,
                folder,
                "proj_depth",
                "groundtruth",
                "image_02",
                "{:010d}.png".format(frame_id),
            )
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")
    print("Saving to {}".format(opt.split))
    np.savez_compressed(output_path, data=np.array(gt_depths))

def evaluate(opt):
    """Evaluates a pretrained model using a specified test set"""
    lc_window_sz = 1
    ratios = []
    normal_errors = []

    encoder, depth_decoder, thisH, thisW = prepare_model_for_test(opt)
    backproject_depth = BackprojectDepth(1, thisH, thisW)

    filenames = readlines('./splits/nyu_test.txt')
    dataset = datasets.NYUTestDataset(opt.data_path, filenames, thisH, thisW)
    dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=opt.num_workers)

    print("-> Computing predictions with size {}x{}".format(thisH, thisW))

    with torch.no_grad():
        for ind, (data, _, gt_norm, gt_norm_mask, K, K_inv) in enumerate(tqdm(dataloader)):
            input_color = data.cuda()
            if opt.post_process:
                input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0)

            output = depth_decoder(encoder(input_color))
            pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.data.cpu()

            if opt.post_process:
                N = pred_disp.shape[0] // 2
                pred_disp = batch_post_process_disparity(pred_disp[:N], torch.flip(pred_disp[N:], [3]))

            pred_depth = 1 / pred_disp
            cam_points = backproject_depth(pred_depth, K_inv)
            cam_points = cam_points[:, :3, ...].view(1, 3, thisH, thisW)
            normal = depth_2_normal(cam_points, lc_window_sz)
            normal = F.pad(normal, (0, lc_window_sz, 0, lc_window_sz), mode='replicate')
            normal = F.interpolate(normal, (gt_norm.shape[2], gt_norm.shape[3]))
            normal_errors.append(compute_normal_errors(normal, gt_norm, gt_norm_mask))

    mean_normal_errors = np.array(normal_errors).mean(0)

    print("\n " + ("{:>8} | " * 6).format("mean", "rmse", "a1", "a2", "a3", "a4"))
    print(("&{: 8.3f} " * 6).format(*mean_normal_errors.tolist()) + "\\\\")
    print("\n-> Done!")

def export_gt_depths_kitti():
    parser = argparse.ArgumentParser(description='export_gt_depth')
    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["eigen", "eigen_benchmark"])
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data", "{:010d}.bin".format(frame_id))
            # gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)  ## ZMH: This won't work because the generate_depth_map function has been redefined.
            # gt_depth = generate_depth_map_original(calib_dir, velo_filename, 2, True)  ## ZMH: the original function in monodepth2
            # gt_depth = generate_depth_map_original(calib_dir, velo_filename, 2, False)  ## ZMH: the original function in monodepth2, use transformed depth
            velo_rect, P_rect_norm, im_shape = generate_depth_map(calib_dir, velo_filename, 2)
            gt_depth = project_lidar_to_img(velo_rect, P_rect_norm, im_shape)  ## ZMH: the way gt is generated I used in training
        elif opt.split == "eigen_benchmark":
            # gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
            #                              "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth_path = os.path.join(opt.data_path, 'val', folder.split("/")[1], "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            if not os.path.exists(gt_depth_path):
                gt_depth_path = os.path.join(opt.data_path, 'train', folder.split("/")[1], "proj_depth",
                                             "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            if not os.path.exists(gt_depth_path):
                raise ValueError("This file does not exist! {} {}".format(folder, frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths_im_cus.npz")
    print("Saving to {}".format(opt.split))
    np.savez_compressed(output_path, data=np.array(gt_depths))

def network_define(opt, data_path, height, width):
    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)
    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.eval_split, split_file))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")
    encoder_dict = torch.load(encoder_path, map_location=torch.device("cuda:1"))

    if opt.dataset_val[0] == "kitti":
        dataset = datasets.KITTIRAWDataset(data_path, filenames, height, width,
                                           [0], 4, is_train=False)
    elif opt.dataset_val[0] == "vkitti":
        dataset = datasets.VKITTIDataset(data_path, filenames, height, width,
                                         [0], 4, is_train=False)

    # dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers,
    #                         pin_memory=True, drop_last=False)
    # The default collate_fn would fail because samples have non-deterministic lengths.
    dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=opt.num_workers,
                            pin_memory=True, drop_last=False, collate_fn=my_collate_fn)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path, map_location=torch.device("cuda:1")))

    encoder.cuda(1)
    encoder.eval()
    depth_decoder.cuda(1)
    depth_decoder.eval()

    return encoder, depth_decoder, dataloader, filenames

def parseData(filepath, instructions):
    lines = utils.readlines(filepath)
    ipMatch = re.search(r"#ip (\d+)", lines[0])
    ip = int(ipMatch.group(1))
    for line in lines[1:]:
        instrMatch = re.search(r"(.*) (\d+) (\d+) (\d+)", line)
        e = executionData(instrMatch.group(1), *[int(i) for i in instrMatch.group(2, 3, 4)])
        instructions.append(e)
    return ip

def cleanup(file, output, cleanup_only=False, threshold=5, thread=None):
    """
    Cleanup the dataset according to the specs of the task.

    Args:
        file(str): Input corpus filename.
        output(str): Output filename.
        cleanup_only(bool): Just cleanup the words using predefined frequent words.
        threshold(int): The threshold to filter out infrequent words.
        thread(int): Number of threads to run simultaneously.
    """
    # 1. Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)

    # 2. Remove all characters that are not
    #    (1) lowercase characters (a-z),
    #    (2) whitespace, or
    #    (3) hyphen '-'
    data = remove_redundant_char(data, index=1)

    # 3. Tokenize each title into words by splitting on whitespace.
    words, data = tokenize_context(data, index=1, thread=thread)

    # 4. Remove all tokens that appear fewer than `threshold` (default 5) times in the dataset.
    # 4-1. Find frequent words
    if not cleanup_only:
        frequent_words = find_frequent_words(words, threshold=threshold)
        write_to_file("models/frequent_words.txt", frequent_words)
    else:
        print("Loading frequent_words from training set.")
        frequent_words = readlines("models/frequent_words.txt", lower=True)
        frequent_words = list(chain.from_iterable(frequent_words))

    # 4-2. Remove infrequent words in titles
    data = filter_title(data, index=1, frequent_words=frequent_words, thread=thread)

    # Save to file
    write_to_file(output, data, delimiter="\t", row_as_line=True)

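# Illustrative usage sketch; the filenames are hypothetical. A first pass over the training
# set builds models/frequent_words.txt, and later passes can reuse it via cleanup_only:
#   cleanup("data/train.tsv", "data/train_cleaned.tsv", threshold=5, thread=4)
#   cleanup("data/test.tsv", "data/test_cleaned.tsv", cleanup_only=True, thread=4)
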
def get_docs(default_span='mv'):
    ebm_nlp = '/home/ben/Desktop/EBM-NLP/ebm_nlp_2_00'
    ann_dir = '{}/annotations'.format(ebm_nlp)
    pmids = utils.readlines('../data/id_splits/ebm_nlp/test.txt')
    docs = {p: {'i': {}, 'o': {}} for p in pmids}
    for p in pmids:
        token_fname = os.path.join('../data/documents/tokens/', '{}.tokens'.format(p))
        tokens = utils.readlines(token_fname)
        docs[p]['tokens'] = tokens
        for el in ['interventions', 'outcomes']:
            agg_fname = '{}/aggregated/starting_spans/{}/test/gold/{}.AGGREGATED.ann'.format(ann_dir, el, p)
            indv_fnames = glob.glob('{}/individual/phase_1/{}/test/gold/{}.*.ann'.format(ann_dir, el, p))
            e = el[0]
            docs[p][e]['hmm'] = list(map(int, utils.readlines(agg_fname)))
            docs[p][e]['indv'] = []
            for f in indv_fnames:
                docs[p][e]['indv'].append(list(map(int, utils.readlines(f))))
            docs[p][e]['avg'] = list(map(np.mean, zip(*docs[p][e]['indv'])))
            agg_strats = [
                ('mv', lambda x: int(x + 0.5)),
                ('union', lambda x: int(x > 0)),
                ('intersection', lambda x: int(x)),
            ]
            for name, func in agg_strats:
                docs[p][e][name] = list(map(func, docs[p][e]['avg']))
            spans = utils.condense_labels(docs[p][e][default_span])
            docs[p][e]['spans'] = [' '.join(tokens[i:f]) for i, f, l in spans]
    return docs

def compare(group='dev'):
    pmids = utils.group_ids(group)
    with open('frames.csv', 'w') as fout:
        fout.write('\t'.join('label i c o span'.split()) + '\n')
        for pmid in pmids:
            try:
                bert_frames = utils.readlines('{}/documents/frames/{}.bert_frames'.format(utils.DATA_DIR, pmid))
            except FileNotFoundError:
                # no frames for this doc
                continue
            gold_frames = utils.readlines('{}/documents/frames/{}.frames'.format(utils.DATA_DIR, pmid))
            gold_sent_idxs = utils.readlines('{}/documents/frames/{}.sent_idxs'.format(utils.DATA_DIR, pmid))
            gold_lookup = {i: l.split('\t') for i, l in enumerate(gold_sent_idxs) if len(l.split('\t')) == 1}
            for frame_str in bert_frames:
                frame = json.loads(frame_str)
                matching_frames = [i for i, idxs in gold_lookup.items() if frame['sent_idx'] in idxs]
                if matching_frames:
                    matching_frame = gold_frames[matching_frames[0]]
                    i, c, o, _, _, ev = matching_frame.split('\t')
                    if ev not in frame['ev']:
                        continue
                    fout.write('\t'.join(['gold', i, c, o, ev]) + '\n')
                    ico = frame['icos'][0]
                    i_score = ''
                    c_score = ''
                    o_score = ''
                    if o.lower() == ico[3].lower():
                        o_score = '5'
                    if i.lower() == ico[1].lower():
                        i_score = '5'
                    elif i.lower() == ico[2].lower():
                        ico[1], ico[2] = ico[2], ico[1]
                        i_score = '5'
                        if frame['sample'] == 'c':
                            frame['sample'] = 'i'
                    if c.lower() == ico[2].lower():
                        c_score = '5'
                    fout.write('\t'.join([frame['sample'], ico[1], ico[2], ico[3], frame['ev']]) + '\n')

def test_challenge20(self):
    plaintexts = [base64_to_bytes(line) for line in utils.readlines('20.txt')]
    key = encryption_key()
    nonce = '\0' * 8
    ciphertexts = [ctr_encrypt(m, key, nonce) for m in plaintexts]

    # Because of the fixed-nonce, the encrypted keystream bytes are
    # repeated for every plaintext message.
    #
    #   ciphertext[i] ^ keystream[i] = plaintext[i]
    #
    # We can create a transposed ciphertext message by concatenating
    # ciphertext[i] from every encrypted message and then xor'ing that
    # against a guessed keystream byte. Then we can test whether the
    # resulting plaintext looks like English based on character
    # distributions. If so, then we've figured out the keystream byte.
    keystream = ''
    for index in itertools.count():
        transposed = ''.join(m[index:index + 1] for m in ciphertexts)
        if not transposed:
            break
        allowed_chars = None
        if index == 0:
            allowed_chars = string.ascii_uppercase + '"\''
        score, _, key = crack.find_best_single_byte_key(
            transposed, allowed_chars=allowed_chars
        )
        # print 'Best score for index {}: {}'.format(index, score)
        keystream += key[0]

    recovered_plaintexts = [bitops.xor(m, keystream) for m in ciphertexts]
    # for m in recovered_plaintexts:
    #     print m
    self.assertIn(
        '\'Cause my girl is definitely mad / \'Cause it took us too long to do this album',
        recovered_plaintexts
    )

def uncertainty_map(res_path):
    uncertainty_dir = os.path.join(res_path, 'uncertainty_map')
    if not os.path.exists(uncertainty_dir):
        os.mkdir(uncertainty_dir)
    filenames = readlines(os.path.join(splits_dir, "eigen_benchmark", "test_files.txt"))
    for i in range(len(filenames)):
        imageset = np.load(os.path.join(res_path, '{}_stage4.npy'.format(i)))
        uncert = np.std(imageset, 0)
        plt.imsave(os.path.join(uncertainty_dir, '{}_uncert.png'.format(i)), uncert, cmap='Greys')

def format_results(fdir, group='dev'):
    input_lines = utils.readlines('{}/sent_classifier/{}/{}.tsv'.format(utils.DATA_DIR, fdir, group))
    output_lines = utils.readlines('{}/sent_classifier/{}/{}_results.tsv'.format(utils.DATA_DIR, fdir, group))
    assert len(input_lines) == len(output_lines)
    assert utils.SENT_INPUT_FIELDS == 'dummy pmid sent_idx sent'

    input_data = [l.split('\t') for l in input_lines]
    output_probs = [[float(x) for x in l.split('\t')] for l in output_lines]
    output_preds = [l.index(max(l)) for l in output_probs]

    all_data = [inputs + [p] for inputs, p in zip(input_data, output_preds)]
    doc_data = groupby(all_data, itemgetter(1))
    for pmid, lines in doc_data:
        with open('{}/documents/sents/{}.bert_{}'.format(utils.DATA_DIR, pmid, fdir), 'w') as fout:
            for _, pmid, _, sent, label in lines:
                fout.write('{}\n'.format(label))

def sample_dataset(file, amount):
    """
    Sample the given amount of data from the file.

    Args:
        file(str): File to be sampled.
        amount(int): Amount of data to be drawn from the file.
    """
    # Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)
    # Sample
    sample_data(file, data, amount=amount)

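# Illustrative usage sketch; the filename and sample size are hypothetical:
#   sample_dataset("data/train.tsv", amount=10000)
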
def export_gt_poses_kitti():
    parser = argparse.ArgumentParser(description='export_gt_pose')
    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["raw_odometry"])
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    files = readlines(os.path.join(split_folder, "test_files.txt"))
    videos = readlines(os.path.join(split_folder, "test_video_list.txt"))

    print("Exporting ground truth poses for {}".format(opt.split))

    for video in videos:
        oxts_paths = []
        ids = sorted([int(file.stem) for file in Path(opt.data_path).glob(f"{video}/oxts/data/*.txt")])
        for frame_id in ids:
            filepath_oxst = os.path.join(opt.data_path, video, "oxts", "data", "{:010d}.txt".format(frame_id))
            oxts_paths.append(filepath_oxst)

        oxts = load_oxts_packets_and_poses(oxts_paths)
        poses_path = os.path.join(opt.data_path, video, "oxts", "poses.txt")
        poses = np.stack([np.array(o[1]) for o in oxts])
        print("Saving to {}".format(poses_path))
        poses = poses[:, :3, :].reshape(-1, 12)
        np.savetxt(poses_path, poses)

def export_gt_depths_kitti():

    class Opt:
        data_path = '/home/jihyo/PycharmProjects/RNNslam/KITTI_data/KITTI_depth/data_depth_annotated/sum'
        split = 'eigen_benchmark'

    # parser = argparse.ArgumentParser(description='export_gt_depth')
    # parser.add_argument('--data_path',
    #                     type=str,
    #                     help='path to the root of the KITTI data',
    #                     required=True)
    # parser.add_argument('--split',
    #                     type=str,
    #                     help='which split to export gt from',
    #                     required=True,
    #                     choices=["eigen", "eigen_benchmark"])
    # opt = parser.parse_args()
    opt = Opt()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data", "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")
    print("Saving to {}".format(opt.split))
    np.savez_compressed(output_path, data=np.array(gt_depths))

import utils
import recognize
import sys

# Emotion labels (Japanese): joy, trust, anxiety, surprise, sadness, disgust, anger, anticipation
sentlabel = ["喜び", "信頼", "不安", "驚き", "悲しみ", "嫌気", "怒り", "予測"]


def w2vsentiment(w2v, sentence):
    sims = []
    testdata = utils.wakati_sentence(sentence)
    dic = w2v.calc_each_sentiment(testdata)
    for ratio in dic:
        sims.append(ratio / sum(dic))
    return sims


if __name__ == '__main__':
    ratios = [sentlabel]
    w2v = recognize.RecognizeWord2Vec()
    documents = utils.readlines(sys.argv[1])
    for doc in documents:
        ratios.append(w2vsentiment(w2v, doc))
    utils.output_csv(sys.argv[2], ratios)
