def generate_ev_binary(group='dev', output_name='ev_binary'):
    outdir = '{}/sent_classifier/{}/'.format(utils.DATA_DIR, output_name)
    try:
        os.makedirs(outdir)
    except OSError:
        print('Target dir: {} already exists'.format(outdir))
        input('Proceeding with generation...')

    pmids = utils.group_ids('ev_inf', group)
    with open('{}/{}.tsv'.format(outdir, group), 'w') as fout:
        for pmid in pmids:
            sents = utils.readlines('{}/documents/sents/{}.sents'.format(
                utils.DATA_DIR, pmid))
            frames = utils.readlines('{}/documents/sents/{}.frame_idx'.format(
                utils.DATA_DIR, pmid))

            pos_sents = [s for s, fs in zip(sents, frames) if len(fs) >= 1]
            neg_sents = [s for s, fs in zip(sents, frames) if len(fs) == 0]

            # Debug hook: uncomment to inspect the positive sentences per PMID.
            # print(pmid, pos_sents); input()
            shuffle(neg_sents)
            neg_sents = neg_sents[:len(pos_sents)]

            for s in pos_sents:
                fout.write('{}\t{}\n'.format(1, s))
            for s in neg_sents:
                fout.write('{}\t{}\n'.format(0, s))
def read_docs(phase='starting_spans'):
    pmid_groups = {}
    for g in GROUPS:
        pmids = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'pmids_{}.txt'.format(g)))
        for pmid in pmids:
            pmid_groups[pmid] = g

    def get_e_fname(pmid, e):
        if pmid_groups[pmid] == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        f = '{}.AGGREGATED.ann'.format(pmid)
        return os.path.join(config.EBM_NLP_DIR, 'annotations', 'aggregated',
                            phase, e, subdir, f)

    docs = []
    for pmid, group in pmid_groups.items():
        tokens = utils.readlines(
            os.path.join(config.EBM_NLP_DIR, 'documents',
                         '{}.tokens'.format(pmid)))
        text, token_offsets = utils.join_tokens(tokens)
        doc = classes.Doc(pmid, text)
        doc.group = group
        for e in ['participants', 'interventions', 'outcomes']:
            label_name = 'GOLD_{}'.format(e[0])
            labels = [int(l) for l in utils.readlines(get_e_fname(pmid, e))]
            for token_i, token_f, l in utils.condense_labels(labels):
                char_i = token_offsets[token_i][0]
                char_f = token_offsets[token_f - 1][1]
                doc.labels[label_name].append(
                    classes.Span(char_i, char_f, text[char_i:char_f]))
        docs.append(doc)
    return docs
Example #3
def generate_frames(output_name, group='dev'):
    icodir = '{}/ico_acceptor/{}/'.format(utils.DATA_DIR, output_name)

    data = {}

    for sample in ['x', 'c', 'o']:
        input_lines = utils.readlines('{}/{}_sample_{}.txt'.format(
            icodir, group, sample))
        input_lines = [l.split('\t') for l in input_lines]
        output_lines = utils.readlines('{}/{}_sample_{}_results.txt'.format(
            icodir, group, sample))
        output_lines = [l.split('\t') for l in output_lines]

        #sample_c.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s])) don't show Jay
        assert len(input_lines) == len(output_lines)

        all_lines = [i_l + o_l for i_l, o_l in zip(input_lines, output_lines)]

        for pmid, pmid_lines in groupby(all_lines, itemgetter(0)):
            if pmid not in data:
                data[pmid] = {}
            for frame_idx, frame_lines in groupby(pmid_lines, itemgetter(2)):
                pmids, sent_idxs, frame_idxs, i_s, c_s, o_s, s_s, p0s, p1s = zip(
                    *frame_lines)
                assert len(set(pmids)) == 1
                assert len(set(sent_idxs)) == 1
                assert len(set(frame_idxs)) == 1
                assert len(set(s_s)) == 1

                if len(nltk.tokenize.word_tokenize(s_s[0])) < 10:
                    continue

                sent_idx = sent_idxs[0]
                frame_idx = frame_idxs[0]
                ev_span = s_s[0]

                top_frames = sorted(zip(p1s, i_s, c_s, o_s),
                                    key=itemgetter(0),
                                    reverse=True)

                assert frame_idx not in data[pmid]
                frame = {
                    'sent_idx': sent_idx,
                    'frame_idx': frame_idx,
                    'ev': ev_span,
                    'icos': top_frames[:5],
                    'sample': sample,
                }
                data[pmid][frame_idx] = frame

    for pmid, frames in data.items():
        with open(
                '{}/documents/frames/{}.bert_frames'.format(
                    utils.DATA_DIR, pmid), 'w') as fout:
            for frame in frames.values():
                fout.write(json.dumps(frame) + '\n')
Example #4
def export_gt_depths_kitti():
    parser = argparse.ArgumentParser(description='export_gt_depth')

    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["eigen", "eigen_benchmark","odom", "AirSim"])
    parser.add_argument('--sequence',
                        type=int,
                        help='which odom sequence to export gt from',
                        required=False,
                        default=0)
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    if opt.split == "odom":
        lines = readlines(os.path.join(split_folder,"{:02d}_exp.txt".format(opt.sequence)))
    else:
        lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:
        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if (opt.split == "eigen")|(opt.split == "odom"):
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(
                opt.data_path, folder,
                "velodyne_points/data", "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(
                opt.data_path, folder, "proj_depth",
                "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256
        elif opt.split == "AirSim"
            gt_depth_path = os.path.join(
                opt.data_path, folder, "{}.pfm".format(frame_id))
            gt_depth,_ = read_pfm(gt_depth_path)

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")

    print("Saving to {}".format(opt.split))

    np.savez_compressed(output_path, data=np.array(gt_depths))
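As a rough check that the export worked, the archive can be read back like so (a minimal sketch; the split name and path are placeholders, and allow_pickle=True is typically needed because the depth maps can have different resolutions and end up stored as an object array):

import numpy as np

# Hypothetical location: the default output path for the "eigen" split.
gt = np.load("splits/eigen/gt_depths.npz", allow_pickle=True)["data"]
print(len(gt), gt[0].shape, gt[0].dtype)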
Example #5
def create_sfnorm_pair_with_pole(opts):
    from datasets_sfgan import SFGAN_Base_Dataset
    from torch.utils.data import DataLoader
    from utils import readlines
    import torch
    from utils import tensor2disp

    fpath = os.path.join(os.path.dirname(__file__), "..", "splits", opts.split,
                         "{}_files.txt")
    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    syn_train_filenames = readlines(fpath.format("syn_train"))
    syn_val_filenames = readlines(fpath.format("syn_val"))

    train_dataset = SFGAN_Base_Dataset(opts.data_path,
                                       train_filenames,
                                       syn_train_filenames,
                                       opts.height,
                                       opts.width,
                                       opts.frame_ids,
                                       4,
                                       opts=opts,
                                       is_train=False,
                                       load_seman=True)
    train_loader = DataLoader(train_dataset,
                              1,
                              shuffle=not opts.noShuffle,
                              num_workers=opts.num_workers,
                              pin_memory=True,
                              drop_last=False)

    min_num = 100
    poleId = 5
    pole_ind_rec = list()
    for batch_idx, inputs in enumerate(train_loader):
        num_syn = torch.sum(inputs['syn_semanLabel'] == poleId)
        num_real = torch.sum(inputs['real_semanLabel'] == poleId)

        if num_syn > min_num and num_real > min_num:
            pole_ind_rec.append(batch_idx)

        print(batch_idx)

    split_root = '../splits/sfnorm_pole'

    wf1 = open(os.path.join(split_root, 'train_files.txt'), "w")
    for pole_ind in pole_ind_rec:
        wf1.write(train_filenames[pole_ind] + '\n')
    wf1.close()

    wf1 = open(os.path.join(split_root, 'syn_train_files.txt'), "w")
    for pole_ind in pole_ind_rec:
        wf1.write(syn_train_filenames[pole_ind] + '\n')
    wf1.close()
Example #6
def ibo_tagging(corpus, keywords, output=None, thread=None):
    """
    Arguments:
        corpus(str): Path to the corpus file.
        keywords(str): Path to the keywords dictionary (a JSON file).
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_ibo.tsv"

    # Load and merge dictionary
    # files = [itr for itr in os.listdir(rule) if itr.endswith("_leaf.json")]

    # Load entities
    # entity = dict()
    # for itr in files:
    #     entity.update(json.load(open(rule + itr, "r")))
    entity = json.load(open(keywords, "r"))

    # Read corpus
    raw_data = readlines(corpus)

    # Threading
    param = (entity,)
    result = generic_threading(thread, raw_data, generate_ibo, param)

    # Write result to file
    file_io(output, result)
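A minimal usage sketch with placeholder paths (the keywords argument is a JSON dictionary of entities, per the json.load call above); when output is omitted, the result goes to "<corpus>_ibo.tsv":

# Hypothetical file names.
ibo_tagging("data/corpus.txt", keywords="data/keywords.json", thread=4)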
Example #7
def main():
    # We preprocess the list with a simple cli call to sort
    lines = utils.readlines("day4/sortedinput")

    guards = {}
    for line in lines:
        guardIdMatch = re.search(r"#(\d+)", line)
        if guardIdMatch is not None:
            guardId = int(guardIdMatch.group(1))
            if guardId in guards:
                currentSleepRecord = guards[guardId]
            else:
                currentSleepRecord = [0] * 60
                guards[guardId] = currentSleepRecord

        asleepMatch = re.search(r":(\d+)] falls asleep", line)
        if asleepMatch is not None:
            asleepMinute = int(asleepMatch.group(1))

        awakeMatch = re.search(r":(\d+)] wakes up", line)
        if awakeMatch is not None:
            awakeMinute = int(awakeMatch.group(1))
            for t in range(asleepMinute, awakeMinute):
                currentSleepRecord[t] += 1

    scenarioOneGuard = max(guards.items(), key=lambda g: sum(g[1]))
    print(scenarioOneGuard[0] * scenarioOneGuard[1].index(max(scenarioOneGuard[1])))

    scenarioTwoGuard = max(guards.items(), key=lambda g: max(g[1]))
    print(scenarioTwoGuard[0] * scenarioTwoGuard[1].index(max(scenarioTwoGuard[1])))
Example #8
def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Preliminary cleanup of the corpus to make it easier for further
    processing methods. This method can be used to correct missing
    spaces after punctuation; any other customized rules can be
    added to the rule file. See punctuation_cleanup in utils for
    the formatting of the rules.

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)
    # Load data
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Threading
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)
Example #9
def remove_old(path):
    files = iter_files(path)
    for file in files:
        filename = os.path.basename(file)
        print(filename)
        f = open(os.path.join('./final', filename), 'a+')
        lines = readlines(file)
        i = 0
        cnt = 0
        for line in lines:
            article = json.loads(line)
            if 'year' in article:
                if int(article['year']) >= 2000:
                    if "author" in article:
                        article['author'] = list(set(article['author']))
                        tmp = json.dumps(article)
                        f.write(tmp + '\n')
                        f.flush()

                        i += 1
                        # if i % 100000 == 0:
                        #     print(i)
                    else:
                        cnt += 1

            else:
                cnt += 1
        print('%s  skip:%d, save %d' % (filename, cnt, i))
Example #10
def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to where "parentheses.tsv" and 
            "refine_list.tsv" are.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_cleaned.txt"
    if not rule_path.endswith("/"):
        rule_path += "/"

    # Load rule files
    file_p = rule_path + "parentheses.tsv"
    file_r = rule_path + "refine_list.tsv"
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus
    raw_data = readlines(corpus)

    # Threading
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all result to file
    write_to_file(output, result)
def export_gt_depths_kitti(opt):
    """
    Generate ground-truth data and store as .npz file
    """
    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:

        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data",
                                         "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02",
                                         "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(
                np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")

    print("Saving to {}".format(opt.split))

    np.savez_compressed(output_path, data=np.array(gt_depths))
Example #12
    def read_filenames(self):
        # e.g. splits\\eigen_zhou\\train.txt
        split_path = os.path.join(self.dataset.split_folder,
                                  self.dataset.split_name).replace('\\', '/')
        self.filenames = readlines(split_path)
        self.num_items = len(self.filenames)
        self.steps_per_epoch = self.num_items // self.batch_size
def parse_subwords(file):
    """
    Parse subword mapping to dictionary.

    Args:
        file(str): Path to the subword mapping file.
                   format: <[WORD]>S1,S2,...
    """
    # Load subwords
    raw_data = readlines(file, limit=None)

    # Parsing information
    mentions, subwords = list(), list()
    for itr in raw_data:
        index = itr.find(">")
        mentions.append(itr[1:index])
        subwords.append(itr[index + 2:].split(","))

    # Create dictionary for mentions and its subwords
    dictionary = dict(zip(mentions, subwords))

    write_to_file("data/subwords.json", dictionary)

    # Subword pool for subword embedding
    # subword_pool = np.unique(list(chain.from_iterable(subwords)))
    subword_pool = dict(Counter(list(chain.from_iterable(subwords))))
    # print(subword_pool)
    print("Raw number of subwords: {:8d}".format(len(subword_pool)))
    write_to_file("data/subword_pool.json", subword_pool)
Example #14
def main():
    lines = utils.readlines("day2/input")

    sum2 = 0
    sum3 = 0
    for line in lines:
        letters = {}
        for c in line:
            if c in letters:
                letters[c] += 1
            else:
                letters[c] = 1

        found2 = False
        found3 = False
        for c in letters:
            if not found2 and letters[c] == 2:
                sum2 += 1
                found2 = True
            elif not found3 and letters[c] == 3:
                sum3 += 1
                found3 = True

    print("Checksum:", sum2 * sum3)

    for line in lines:
        for secondline in lines:
            if len(line) - 1 == len(strIntersection(line, secondline)):
                print("Common letters:", strIntersection(line, secondline))
                return
Example #15
def extract_vocabularies(corpus, rule, output=None, thread=None):
    """
    Extract vocabularies from the corpus, additional rules to achieve
    purer vocabularies can be defined in src/refine_rules/voc_cleanup.tsv

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to use.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_vocabulary_list.json"

    # Load rules
    rules = load_rules(rule)

    # Acquire the corpus
    raw_data = readlines(corpus, limit=None)

    # Threading (TO-BE-IMPLEMENTED)
    # param = (rules, "SPLIT_WORDS")
    # generic_threading(thread, raw_data, punctuation_cleanup, param)
    result = punctuation_cleanup(0, raw_data, rules, mode='SPLIT_WORDS')

    # Counting occurrences
    print("Counting occurrences...")
    voc_list = Counter(result)

    # Save vocabulary to file
    write_to_file(output, voc_list)
Example #16
def parseData(filepath):
    samples = utils.readlines(filepath)
    output = []
    for sample in samples:
        m = re.search(r"(\d+) (\d+) (\d+) (\d+)", sample)
        e = executionData(*[int(i) for i in m.group(1, 2, 3, 4)])
        output.append(e)
    return output
Example #17
def export_gt_depths_kitti():

    parser = argparse.ArgumentParser(description="export_gt_depth")

    parser.add_argument(
        "--data_path",
        type=str,
        help="path to the root of the KITTI data",
        required=True,
    )
    parser.add_argument(
        "--split",
        type=str,
        help="which split to export gt from",
        required=True,
        choices=["eigen", "eigen_benchmark"],
    )
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:

        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(
                opt.data_path,
                folder,
                "velodyne_points/data",
                "{:010d}.bin".format(frame_id),
            )
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(
                opt.data_path,
                folder,
                "proj_depth",
                "groundtruth",
                "image_02",
                "{:010d}.png".format(frame_id),
            )
            gt_depth = np.array(pil.open(gt_depth_path)).astype(
                np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")

    print("Saving to {}".format(opt.split))

    np.savez_compressed(output_path, data=np.array(gt_depths))
Example #18
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    lc_window_sz = 1

    ratios = []
    normal_errors = []
        
    encoder, depth_decoder, thisH, thisW = prepare_model_for_test(opt)
    backproject_depth = BackprojectDepth(1, thisH, thisW)

    filenames = readlines('./splits/nyu_test.txt')
    dataset = datasets.NYUTestDataset(
            opt.data_path,
            filenames,
            thisH, thisW,
    )
    
    dataloader = DataLoader(
            dataset, 1, shuffle=False, 
            num_workers=opt.num_workers
    )
    print("-> Computing predictions with size {}x{}".format(thisH, thisW))

    with torch.no_grad():
        for ind, (data, _, gt_norm, gt_norm_mask, K, K_inv) in enumerate(tqdm(dataloader)):
            input_color = data.cuda()
            if opt.post_process:
                input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0)
            output = depth_decoder(encoder(input_color))

            pred_disp, _ = disp_to_depth(
                    output[("disp", 0)], 
                    opt.min_depth, 
                    opt.max_depth
            )
            pred_disp = pred_disp.data.cpu() 

            if opt.post_process:
                N = pred_disp.shape[0] // 2
                pred_disp = batch_post_process_disparity(
                        pred_disp[:N], torch.flip(pred_disp[N:], [3]) 
                )
            pred_depth = 1 / pred_disp

            cam_points = backproject_depth(pred_depth, K_inv)
            cam_points = cam_points[:, :3, ...].view(1, 3, thisH, thisW)
            normal = depth_2_normal(cam_points, lc_window_sz)

            normal = F.pad(normal, (0, lc_window_sz, 0, lc_window_sz), mode='replicate')
            normal = F.interpolate(normal, (gt_norm.shape[2], gt_norm.shape[3])) 
            normal_errors.append(compute_normal_errors(normal, gt_norm, gt_norm_mask))

    mean_normal_errors = np.array(normal_errors).mean(0)
    
    print("\n  " + ("{:>8} | " * 6).format("mean", "rmse", "a1", "a2", "a3", "a4"))
    print(("&{: 8.3f}  " * 6).format(*mean_normal_errors.tolist()) + "\\\\")
    print("\n-> Done!")
Example #19
def export_gt_depths_kitti():

    parser = argparse.ArgumentParser(description='export_gt_depth')

    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["eigen", "eigen_benchmark"])
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:

        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data", "{:010d}.bin".format(frame_id))
            # gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True) ## ZMH: This won't work because the generate_depth_map function has been redefined.
            # gt_depth = generate_depth_map_original(calib_dir, velo_filename, 2, True) ## ZMH: the original function in monodepth2
            # gt_depth = generate_depth_map_original(calib_dir, velo_filename, 2, False) ## ZMH: the original function in monodepth2, use transformed depth

            velo_rect, P_rect_norm, im_shape  = generate_depth_map(calib_dir, velo_filename, 2)
            gt_depth = project_lidar_to_img(velo_rect, P_rect_norm, im_shape)                   ## ZMH: the way gt is generated I used in training

        elif opt.split == "eigen_benchmark":
            # gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
            #                              "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id), 'val', folder.split("/")[1], "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
            if not os.path.exists(gt_depth_path):
                gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id), 'train', folder.split("/")[1], "proj_depth",
                                         "groundtruth", "image_02", "{:010d}.png".format(frame_id))
                if not os.path.exists(gt_depth_path):
                    raise ValueError("This file does not exist! {} {}".format(folder, frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths_im_cus.npz")

    print("Saving to {}".format(opt.split))

    np.savez_compressed(output_path, data=np.array(gt_depths))
Example #20
def network_define(opt, data_path, height, width):
    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.eval_split, split_file))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path,
                              map_location=torch.device("cuda:1"))

    if opt.dataset_val[0] == "kitti":
        dataset = datasets.KITTIRAWDataset(data_path,
                                           filenames,
                                           height,
                                           width, [0],
                                           4,
                                           is_train=False)
    elif opt.dataset_val[0] == "vkitti":
        dataset = datasets.VKITTIDataset(data_path,
                                         filenames,
                                         height,
                                         width, [0],
                                         4,
                                         is_train=False)
    # dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers,
    #                         pin_memory=True, drop_last=False)
    dataloader = DataLoader(
        dataset,
        1,
        shuffle=False,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=my_collate_fn
    )  ## the default collate_fn will fail because samples have non-deterministic lengths

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v
         for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device("cuda:1")))

    encoder.cuda(1)
    encoder.eval()
    depth_decoder.cuda(1)
    depth_decoder.eval()

    return encoder, depth_decoder, dataloader, filenames
Example #21
def parseData(filepath, instructions):
    lines = utils.readlines(filepath)
    ipMatch = re.search(r"#ip (\d+)", lines[0])
    ip = int(ipMatch.group(1))

    for line in lines[1:]:
        instrMatch = re.search(r"(.*) (\d+) (\d+) (\d+)", line)
        e = executionData(instrMatch.group(1),
                          *[int(i) for i in instrMatch.group(2, 3, 4)])
        instructions.append(e)
    return ip
Example #22
def cleanup(file, output, cleanup_only=False, threshold=5, thread=None):
    """
    Cleanup the dataset according to the specs of the task.

    Args:
        file(str): Input corpus filename.
        output(str): Output filename.
        cleanup_only(bool): Just clean up the words using the predefined frequent-word list.
        threshold(int): The threshold to filter out infrequent words.
        thread(int): Number of threads to run simultaneously.
    """

    # 1. Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)

    # 2. Remove all characters that are not
    #    (1) lowercase characters (a-z),
    #    (2) whitespace, or
    #    (3) hyphen '-'
    data = remove_redundant_char(data, index=1)

    # 3. Tokenize each title into words by splitting on whitespace.
    words, data = tokenize_context(data, index=1, thread=thread)

    # 4. Remove all tokens that appear fewer than 5 times in the dataset.
    # 4-1. Find frequent words
    if not cleanup_only:
        frequent_words = find_frequent_words(words, threshold=threshold)
        write_to_file("models/frequent_words.txt", frequent_words)
    else:
        print("Loading frequent_words from training set.")
        frequent_words = readlines("models/frequent_words.txt", lower=True)
        frequent_words = list(chain.from_iterable(frequent_words))

    # 4-2. Remove infrequent words in titles
    data = filter_title(data,
                        index=1,
                        frequent_words=frequent_words,
                        thread=thread)
    # Save to file
    write_to_file(output, data, delimiter="\t", row_as_line=True)
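A minimal usage sketch with placeholder paths: the first call builds models/frequent_words.txt from a training file, and the second reuses that list to clean a held-out file:

# Hypothetical file names.
cleanup("data/train.tsv", "data/train_cleaned.tsv", threshold=5, thread=4)
cleanup("data/test.tsv", "data/test_cleaned.tsv", cleanup_only=True, thread=4)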
Example #23
def get_docs(default_span='mv'):
    ebm_nlp = '/home/ben/Desktop/EBM-NLP/ebm_nlp_2_00'
    ann_dir = '{}/annotations'.format(ebm_nlp)

    pmids = utils.readlines('../data/id_splits/ebm_nlp/test.txt')
    docs = {p: {'i': {}, 'o': {}} for p in pmids}

    for p in pmids:

        token_fname = os.path.join('../data/documents/tokens/',
                                   '{}.tokens'.format(p))
        tokens = utils.readlines(token_fname)
        docs[p]['tokens'] = tokens

        for el in ['interventions', 'outcomes']:

            agg_fname = '{}/aggregated/starting_spans/{}/test/gold/{}.AGGREGATED.ann'.format(
                ann_dir, el, p)
            indv_fnames = glob.glob(
                '{}/individual/phase_1/{}/test/gold/{}.*.ann'.format(
                    ann_dir, el, p))
            e = el[0]

            docs[p][e]['hmm'] = list(map(int, utils.readlines(agg_fname)))
            docs[p][e]['indv'] = []
            for f in indv_fnames:
                docs[p][e]['indv'].append(list(map(int, utils.readlines(f))))
            docs[p][e]['avg'] = list(map(np.mean, zip(*docs[p][e]['indv'])))

            agg_strats = [\
                ('mv',           lambda x: int(x + 0.5)),
                ('union',        lambda x: int(x > 0)),
                ('intersection', lambda x: int(x))]

            for name, func in agg_strats:
                docs[p][e][name] = list(map(func, docs[p][e]['avg']))

            spans = utils.condense_labels(docs[p][e][default_span])
            docs[p][e]['spans'] = [' '.join(tokens[i:f]) for i, f, l in spans]

    return docs
Example #24
def compare(group = 'dev'):
  pmids = utils.group_ids(group)

  with open('frames.csv', 'w') as fout:
    fout.write('\t'.join('label i c o span'.split()) + '\n')
    for pmid in pmids:
      try:
        bert_frames = utils.readlines('{}/documents/frames/{}.bert_frames'.format(utils.DATA_DIR, pmid))
      except FileNotFoundError:
        # womp womp no frames for this doc
        continue
      gold_frames = utils.readlines('{}/documents/frames/{}.frames'.format(utils.DATA_DIR, pmid))
      gold_sent_idxs = utils.readlines('{}/documents/frames/{}.sent_idxs'.format(utils.DATA_DIR, pmid))

      gold_lookup = { i: l.split('\t') for i,l in enumerate(gold_sent_idxs) if len(l.split('\t')) == 1 }

      for frame_str in bert_frames:
        frame = json.loads(frame_str)
        matching_frames = [i for i, idxs in gold_lookup.items() if frame['sent_idx'] in idxs]
        if matching_frames:
          matching_frame = gold_frames[matching_frames[0]]
          i, c, o, _, _, ev = matching_frame.split('\t')
          if ev not in frame['ev']:
            continue
          fout.write('\t'.join(['gold', i, c, o, ev]) + '\n')

          ico = frame['icos'][0]
          i_score = ''
          c_score = ''
          o_score = ''
          if o.lower() == ico[3].lower():
            o_score = '5'
          if i.lower() == ico[1].lower():
            i_score = '5'
          elif i.lower() == ico[2].lower():
            ico[1], ico[2] = ico[2], ico[1]
            i_score = '5'
            if frame['sample'] == 'c': frame['sample'] = 'i'
          if c.lower() == ico[2].lower():
            c_score = '5'
          fout.write('\t'.join([frame['sample'], ico[1], ico[2], ico[3], frame['ev']]) + '\n')
Example #25
    def test_challenge20(self):
        plaintexts = [
            base64_to_bytes(line)
            for line in utils.readlines('20.txt')
        ]

        key = encryption_key()
        nonce = '\0' * 8
        ciphertexts = [
            ctr_encrypt(m, key, nonce)
            for m in plaintexts
        ]

        # Because of the fixed-nonce, the encrypted keystream bytes are
        # repeated for every plaintext message.
        #
        # ciphertext[i] ^ keystream[i] = plaintext[i]
        #
        # We can create a transposed ciphertext message by concatenating
        # ciphertext[i] from every encrypted message and then xor'ing that
        # against a guessed keystream byte. Then we can test whether the
        # resulting plaintext looks like english based on character
        # distributions. If so, then we've figured out the keystream byte.

        keystream = ''
        for index in itertools.count():
            transposed = ''.join(m[index:index+1] for m in ciphertexts)
            if not transposed:
                break

            allowed_chars = None
            if index == 0:
                allowed_chars = string.ascii_uppercase + '"\''

            score, _, key = crack.find_best_single_byte_key(
                transposed,
                allowed_chars=allowed_chars
            )
            # print 'Best score for index {}: {}'.format(index, score)
            keystream += key[0]

        recovered_plaintexts = [
            bitops.xor(m, keystream) for m in ciphertexts
        ]

        # for m in recovered_plaintexts:
        #     print m

        self.assertIn(
            '\'Cause my girl is definitely mad / \'Cause it took us too long to do this album',
            recovered_plaintexts
        )
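A toy illustration (made-up strings, not the challenge data) of the keystream-reuse property that the transposition trick relies on: byte i of every ciphertext was XORed with the same keystream byte, so XORing two ciphertexts at position i cancels the keystream:

def xor_str(a, b):
    return ''.join(chr(ord(x) ^ ord(y)) for x, y in zip(a, b))

keystream = '\x13\x37\x42'
msgs = ['foo', 'bar']
cts = [xor_str(m, keystream) for m in msgs]

# The shared keystream byte cancels out, leaving the XOR of the plaintexts.
assert ord(cts[0][0]) ^ ord(cts[1][0]) == ord('f') ^ ord('b')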
def uncertainty_map(res_path):
    uncertainty_dir = os.path.join(res_path, 'uncertainty_map')

    if not os.path.exists(uncertainty_dir):
        os.mkdir(uncertainty_dir)
    filenames = readlines(
        os.path.join(splits_dir, "eigen_benchmark", "test_files.txt"))
    for i in range(len(filenames)):
        imageset = np.load(os.path.join(res_path, '{}_stage4.npy'.format(i)))
        uncert = np.std(imageset, 0)
        plt.imsave(os.path.join(uncertainty_dir, '{}_uncert.png'.format(i)),
                   uncert,
                   cmap='Greys')
Example #27
def format_results(fdir, group='dev'):
    input_lines = utils.readlines('{}/sent_classifier/{}/{}.tsv'.format(
        utils.DATA_DIR, fdir, group))
    output_lines = utils.readlines(
        '{}/sent_classifier/{}/{}_results.tsv'.format(utils.DATA_DIR, fdir,
                                                      group))

    assert len(input_lines) == len(output_lines)
    assert utils.SENT_INPUT_FIELDS == 'dummy pmid sent_idx sent'

    input_data = [l.split('\t') for l in input_lines]
    output_probs = [[float(x) for x in l.split('\t')] for l in output_lines]
    output_preds = [l.index(max(l)) for l in output_probs]

    all_data = [inputs + [p] for inputs, p in zip(input_data, output_preds)]
    doc_data = groupby(all_data, itemgetter(1))
    for pmid, lines in doc_data:
        with open(
                '{}/documents/sents/{}.bert_{}'.format(utils.DATA_DIR, pmid,
                                                       fdir), 'w') as fout:
            for _, pmid, _, sent, label in lines:
                fout.write('{}\n'.format(label))
Example #28
def sample_dataset(file, amount):
    """
    Sample the given amount of data from the file.

    Args:
        file(str): File to be sampled.
        amount(int): Amount of data to be drawn from the file.
    """

    # Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)
    # Sample
    sample_data(file, data, amount=amount)
Example #29
def export_gt_poses_kitti():

    parser = argparse.ArgumentParser(description='export_gt_depth')

    parser.add_argument('--data_path',
                        type=str,
                        help='path to the root of the KITTI data',
                        required=True)
    parser.add_argument('--split',
                        type=str,
                        help='which split to export gt from',
                        required=True,
                        choices=["raw_odometry"])
    opt = parser.parse_args()

    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    files = readlines(os.path.join(split_folder, "test_files.txt"))
    videos = readlines(os.path.join(split_folder, "test_video_list.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    for video in videos:
        oxts_paths = []
        ids = sorted([
            int(file.stem)
            for file in Path(opt.data_path).glob(f"{video}/oxts/data/*.txt")
        ])
        for frame_id in ids:
            filepath_oxst = os.path.join(opt.data_path, video, "oxts", "data",
                                         "{:010d}.txt".format(frame_id))

            oxts_paths.append(filepath_oxst)
        oxts = load_oxts_packets_and_poses(oxts_paths)
        poses_path = os.path.join(opt.data_path, video, "oxts", "poses.txt")
        poses = np.stack([np.array(o[1]) for o in oxts])

        print("Saving to {}".format(poses_path))
        poses = poses[:, :3, :].reshape(-1, 12)
        np.savetxt(poses_path, poses)
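The saved poses.txt holds one flattened 3x4 camera-to-world matrix per row (the reshape(-1, 12) above); a sketch of reading it back under that assumption:

import numpy as np

rows = np.loadtxt("poses.txt").reshape(-1, 3, 4)   # hypothetical local copy of the file
poses = np.tile(np.eye(4), (rows.shape[0], 1, 1))  # promote to 4x4 homogeneous matrices
poses[:, :3, :] = rows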
Example #30
def export_gt_depths_kitti():
    class Opt:
        data_path = '/home/jihyo/PycharmProjects/RNNslam/KITTI_data/KITTI_depth/data_depth_annotated/sum'
        split = 'eigen_benchmark'

    # parser = argparse.ArgumentParser(description='export_gt_depth')
    #
    # parser.add_argument('--data_path',
    #                     type=str,
    #                     help='path to the root of the KITTI data',
    #                     required=True)
    # parser.add_argument('--split',
    #                     type=str,
    #                     help='which split to export gt from',
    #                     required=True,
    #                     choices=["eigen", "eigen_benchmark"])
    # opt = parser.parse_args()

    opt = Opt()
    split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split)
    lines = readlines(os.path.join(split_folder, "test_files.txt"))

    print("Exporting ground truth depths for {}".format(opt.split))

    gt_depths = []
    for line in lines:

        folder, frame_id, _ = line.split()
        frame_id = int(frame_id)

        if opt.split == "eigen":
            calib_dir = os.path.join(opt.data_path, folder.split("/")[0])
            velo_filename = os.path.join(opt.data_path, folder,
                                         "velodyne_points/data",
                                         "{:010d}.bin".format(frame_id))
            gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True)
        elif opt.split == "eigen_benchmark":
            gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth",
                                         "groundtruth", "image_02",
                                         "{:010d}.png".format(frame_id))
            gt_depth = np.array(pil.open(gt_depth_path)).astype(
                np.float32) / 256

        gt_depths.append(gt_depth.astype(np.float32))

    output_path = os.path.join(split_folder, "gt_depths.npz")

    print("Saving to {}".format(opt.split))

    np.savez_compressed(output_path, data=np.array(gt_depths))
Example #31
import utils
import recognize
import sys

sentlabel = ["喜び", "信頼", "不安", "驚き", "悲しみ", "嫌気", "怒り", "予測"]


def w2vsentiment(w2v, sentence):
    sims = []

    testdata = utils.wakati_sentence(sentence)

    dic = w2v.calc_each_sentiment(testdata)

    for ratio in dic:
        sims.append(ratio / sum(dic))

    return sims


if __name__ == '__main__':
    ratios = [sentlabel]

    w2v = recognize.RecognizeWord2Vec()
    documents = utils.readlines(sys.argv[1])

    for doc in documents:
        ratios.append(w2vsentiment(w2v, doc))

    utils.output_csv(sys.argv[2], ratios)