Example #1
def acc_probe_lineplot_main():

    opt = utils.parse_args()
    sep_patt = re.compile(r'[\s/]+')

    #data_str1 = '0.178 / 35.0  0.39 / 289.0 0.495 / 769.0 0.565 / 1465.0 0.618 / 2372.0'
    with_catalyzer = True
    #if with_catalyzer:
    #    km_data_path = osp.join(utils.data_dir, 'km_plot_data_c')
    #else:
    km_data_path = osp.join(utils.data_dir, 'km_plot_data')
    train_data_path = osp.join(utils.data_dir, 'train_plot_data')

    #data format example: data_str_km = '0.788 / 50803.0 0.791 / 53157.0 0.795 / 55564.0 0.798 / 58021.0'

    data_str_km = utils.load_lines(km_data_path)[0]
    data_str_train = utils.load_lines(train_data_path)[0]

    label_ar = ['km', 'neural']
    method_l = []
    acc_l = []
    probe_l = []
    #determine this dynamically
    probe95_plot_l = [True, True]

    for i, data_str in enumerate([data_str_km, data_str_train]):
        data_ar = sep_patt.split(data_str.strip())
        if probe95_plot_l[i]:

            assert (len(data_ar) % 3) == 0
            step = 3
        else:
            assert (len(data_ar) & 1) == 0
            step = 2
        data_ar = list(map(float, data_ar))

        acc_l.extend(data_ar[0::step])
        probe_l.extend(data_ar[1::step])
        method_l.extend([label_ar[i]] * (len(data_ar) // step))

        if False and probe95_plot_l[i]:
            acc_l.extend(data_ar[0::step])
            probe_l.extend(data_ar[2::step])
            method_l.extend([label_ar[i] + '95'] * (len(data_ar) // step))

    acc_ar = np.array(acc_l)
    probe_ar = np.array(probe_l)

    height = 1
    n_clusters = 256
    acc_probe_lineplot(probe_ar, acc_ar, method_l, height, n_clusters, opt)
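Every example in this listing revolves around a load_lines (or utils.load_lines) helper whose implementation is not shown. A minimal sketch, assuming it simply reads a text file and returns its lines, could look like the code below; the real helpers differ per project (Example #13, for instance, uses a utils.load_lines that also takes a field list and returns parsed records).

def load_lines(path, encoding='utf-8'):
    #hypothetical sketch: read a text file and return its lines without trailing newlines
    with open(path, encoding=encoding) as f:
        return [line.rstrip('\n') for line in f]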
Example #2
def main():

    prsd_ids = []

    # If a file with already-parsed IDs exists, we've processed at least some
    # of them before (possibly all, but earlier runs may have hit
    # connection/network failures).
    if not os.path.exists(processed_ids_fpath(FILE_PATH)):
        open(processed_ids_fpath(FILE_PATH), 'a').close()
    else:
        prsd_ids = load_lines(processed_ids_fpath(FILE_PATH))

    src_ids = load_lines(FILE_PATH)

    if prsd_ids:
        s = set(src_ids)
        s.difference_update(set(prsd_ids))
        ids = list(s)
    else:
        ids = src_ids

    if not ids:
        exit(1)

    ids = deque(ids)
    parser = JuridicalInfoParser(FILE_PATH, MAX_FILE_OUTPUT_SIZE, retries=RETRIES,
                                 backoff=BACKOFF, timeout=TIMEOUT)
    cnt = 0
    _row = None  # keeps the finally-block print safe if process() fails on the first id
    print('-' * 20)
    while ids:
        try:
            idx = ids.popleft()
            _row = parser.process(idx)
            cnt += 1

        except ParseError:
            print('Failed to process {}'.format(idx))
            append_file(failed_ids_fpath(FILE_PATH), idx)
            print('-' * 20)
        except Exception as e:
            print('Failed to process {}'.format(idx))
            print(e)
        else:
            append_file(processed_ids_fpath(FILE_PATH), idx)
        finally:
            print(f'{cnt} - {_row}')
            print('-' * 20)
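The example above relies on three small helpers that are not shown: processed_ids_fpath, failed_ids_fpath and append_file. Hypothetical versions, assuming they only derive sibling file names and append one ID per line (the real implementations may differ):

import os

def processed_ids_fpath(src_path):
    #hypothetical: successfully handled IDs go to <name>_processed<ext>
    base, ext = os.path.splitext(src_path)
    return base + '_processed' + ext

def failed_ids_fpath(src_path):
    #hypothetical: IDs that raised ParseError go to <name>_failed<ext>
    base, ext = os.path.splitext(src_path)
    return base + '_failed' + ext

def append_file(path, value):
    #append a single ID per line
    with open(path, 'a') as f:
        f.write(str(value) + '\n')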
Example #3
def read_all_ranks_glove(opt):
    ##need weights!!!!!!!!!!!!
    graph_path = osp.join(utils.glove_dir, 'normalized', 'knn_100', 'graph_10',
                          'graph.txt')
    ranks = []

    lines = utils.load_lines(graph_path)[1:]
    #tuples of 2 indices, and their weights
    idx2weights = {}

    for i, line in enumerate(lines, 1):

        cur_list = line.strip().split(' ')
        cur_ranks = []

        for j in range(0, len(cur_list), 2):
            neigh = int(cur_list[j])
            cur_ranks.append(neigh)
            neigh_weight = int(cur_list[j + 1])
            tup = (i, neigh) if i < neigh else (neigh, i)
            idx2weights[tup] = neigh_weight

        #ensure proper k! for resulting graph
        ranks.append(cur_ranks)

    return ranks, idx2weights
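The parsing loop implies the layout of graph.txt: a header line (skipped by the [1:] slice), then one line per node i (1-based) holding alternating "neighbor weight" pairs. An illustrative three-node file and what it would parse into, assuming that format:

#illustrative graph.txt contents:
#  <header line, skipped>
#  2 7 3 5
#  1 7
#  1 5
#parses into:
#  ranks       = [[2, 3], [1], [1]]
#  idx2weights = {(1, 2): 7, (1, 3): 5}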
Example #4
def words_from_file(filename):
    #strip trailing/leading newlines from each loaded line
    return [line.strip('\n') for line in load_lines(filename)]
Example #5
 def __init__(self):
     #self.n_clusters = n_clusters
     #top-level kahip partition result (64 parts for now)
     #loads the partition data and prepares to make predictions
     self.kahip_path = osp.join(utils.data_dir,
                                'cache_partition64strong_0ht2')
     classes_l = utils.load_lines(self.kahip_path)
     self.classes_l = [int(c) for c in classes_l]
Example #6
def process_child(ranks, graph_path, datalen, branching_l, height, idx2classes, proc_i, ht2cutsz, opt):
    
    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)

    lines = utils.load_lines(parts_path)
    idx2classes[proc_i] = [int(line) for line in lines]

Example #7
def process_child(ranks, graph_path, datalen, branching_l, height, idx2classes,
                  proc_i, ht2cutsz, opt):

    n_edges = create_graph.write_knn_graph(ranks, graph_path)

    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)

    lines = utils.load_lines(parts_path)
    idx2classes[proc_i] = [int(line) for line in lines]

    compute_cut_size_b = True and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(idx2classes[proc_i], ranks)
        ht2cutsz[height].append((cut_sz, n_edges))
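compute_cut_size itself is not included in this listing. Under the usual definition (the number of kNN-graph edges whose endpoints fall in different partition classes), a sketch could look like the following; the actual helper used by these examples may count differently:

def compute_cut_size(classes, ranks):
    #classes[i]: partition class of node i+1; ranks[i]: 1-based neighbors of node i+1
    cut = 0
    for i, neighbors in enumerate(ranks):
        for neigh in neighbors:
            if classes[i] != classes[neigh - 1]:
                cut += 1
    #a symmetric graph sees each edge from both endpoints
    return cut // 2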
Example #8
def get_bgl3_count_df(output_dir=None):
    """ combines the inp and sel variant lists into a single dataframe with counts """
    inp_fn = "source_data/bgl3/unlabeled_Bgl3_mutations.txt"
    sel_fn = "source_data/bgl3/positive_Bgl3_mutations.txt"

    cache_fn = "bgl3_raw_counts.tsv"
    if output_dir is None or not isfile(join(output_dir, cache_fn)):
        print("Computing bgl3 count df from raw counts")
        inp_variant_list = utils.load_lines(inp_fn)
        sel_variant_list = utils.load_lines(sel_fn)
        df = pd.concat([
            parse_bgl3_variant_list(inp_variant_list, "inp"),
            parse_bgl3_variant_list(sel_variant_list, "sel")
        ],
                       axis=1,
                       sort=True).fillna(0)
        if output_dir is not None:
            df.to_csv(join(output_dir, cache_fn), sep="\t")
        return df

    print("Loading cached count df from file: {}".format(
        join(output_dir, cache_fn)))
    return pd.read_csv(join(output_dir, cache_fn), sep="\t", index_col=0)
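A hypothetical call site for the function above; passing an output_dir caches the combined counts as a TSV so later calls skip parsing the raw mutation lists (the 'inp' and 'sel' column names are inferred from the parse_bgl3_variant_list calls):

df = get_bgl3_count_df(output_dir='output')  #hypothetical cache directory
print(df.head())
print(df.sum())  #total counts per column, assuming 'inp' and 'sel' columns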
Example #9
def process_document(path,
                     vocab,
                     title_start="========,",
                     forbidden_start="***LIST***",
                     test=False,
                     ssplit=True):
    print("ssplit: " + str(ssplit))
    lines = ([
        l for l in utils.load_lines(path) if not l.startswith(forbidden_start)
    ]) if ssplit else (sentence_split(utils.load_txt_file(path)))
    stride = 1 if test else config.sent_stride
    lab_lines = []
    lines_txt = []
    for i in range(len(lines)):
        if lines[i].startswith(title_start):
            continue
        if (i - 1) >= 0 and lines[i - 1].startswith(title_start):
            lab_lines.append((lines[i], 1))
        else:
            lab_lines.append((lines[i], 0))
        lines_txt.append(lines[i])

    raw_blocks = []
    i = 0
    while i < len(lab_lines):
        block = lab_lines[i:i + config.sent_window]
        if len(block) < config.sent_window:
            block.extend([(config.fake_sent, 0)] *
                         (config.sent_window - len(block)))
        raw_blocks.append(block)
        i += stride

    if not test:
        random.shuffle(raw_blocks)
        raw_blocks = raw_blocks[:int(config.perc_blocks_train *
                                     len(raw_blocks))]

    doc_recs = []
    for rb in raw_blocks:
        records = create_one_instance(rb, lines_txt, vocab)
        doc_recs.extend(records)

    return doc_recs, len(raw_blocks), raw_blocks if test else None
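The while-loop above slides a window of config.sent_window labeled sentences over the document with step stride, padding the last block with config.fake_sent. A standalone illustration of that blocking, with hypothetical window/stride/pad values standing in for the config settings:

def make_blocks(lab_lines, window=3, stride=2, pad=('<fake>', 0)):
    #same sliding-window blocking as above, with the config values made explicit
    blocks = []
    i = 0
    while i < len(lab_lines):
        block = lab_lines[i:i + window]
        if len(block) < window:
            block.extend([pad] * (window - len(block)))
        blocks.append(block)
        i += stride
    return blocks

#make_blocks([('a', 0), ('b', 1), ('c', 0), ('d', 0)]) returns
#  [[('a', 0), ('b', 1), ('c', 0)], [('c', 0), ('d', 0), ('<fake>', 0)]]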
Example #10
def read_all_ranks(opt, path=None):
    if opt.glove:
        graph_path = osp.join(utils.glove_dir, 'graph.txt')
    elif opt.sift:
        graph_path = osp.join(utils.data_dir, 'sift_graph_10', 'graph.txt')
    elif opt.prefix10m:
        graph_path = osp.join(utils.data_dir, 'prefix10m_graph_10.txt')       
    else:
        if path is not None:
            graph_path = path
        else:
            raise Exception('Cannot read precomputed knn graph for unknown data type')
    
    ranks = []    
    lines = utils.load_lines(graph_path)[1:]
    #tuples of 2 indices, and their weights
    idx2weights = {}
    
    for i, line in enumerate(lines, 1):
                
        cur_list = line.strip().split(' ')
        cur_ranks = []
        
        for j in range(0, len(cur_list), 2):
            neigh = int(cur_list[j])
            cur_ranks.append(neigh)
            
            neigh_weight = int(cur_list[j+1])
            tup = (i, neigh) if i < neigh else (neigh, i)
            idx2weights[tup] = neigh_weight
            
        #ensure proper k! for resulting graph
        ranks.append(cur_ranks)
    
    #ranks = torch.LongTensor(ranks)
    return ranks, idx2weights
Example #11
def convert_file(fname: str, target_fname: str):
    triples = parse_lines(utils.load_lines(fname))
    records = (record_from_triple(*t) for t in triples)
    utils.write_file_and_print_stats(records, target_fname)
Example #12
 def lines(self):
     return load_lines(self._ids_fpath)
Example #13
def main(arguments):
    """Main run function for processing the Cornell Movie Dialog Data."""

    # Parse the arguments
    args = utils.parse_arguments(arguments)

    # movie lines file
    movie_lines_file = os.path.join(args.infile_path, 'movie_lines.txt')
    # movie conversations file
    movie_conversations_file = os.path.join(args.infile_path,
                                            'movie_conversations.txt')

    if not args.pairs:
        tf.logging.log(
            tf.logging.INFO,
            "Selecting and saving {} random lines...".format(args.num_lines))
        lines = []
        try:
            with open(movie_lines_file, encoding='iso-8859-1') as f:
                for line in f:
                    values = line.split(" +++$+++ ")
                    lines.append(values[-1].strip())
        except FileNotFoundError as error:
            tf.logging.log(tf.logging.ERROR, error)
            tf.logging.log(
                tf.logging.ERROR,
                'Input file not found, correct the specified location.')
            sys.exit(0)
        tf.logging.info("Found {} input lines.".format(len(lines)))

        with open(args.outfile, 'w', encoding='iso-8859-1') as f:
            if args.num_lines != 0:
                for item in np.random.choice(lines,
                                             args.num_lines,
                                             replace=False):
                    f.write("%s\n" % item)
            else:
                for item in lines:
                    f.write("%s\n" % item)
        tf.logging.log(
            tf.logging.INFO,
            'Wrote {} lines to {}.'.format(
                args.num_lines if args.num_lines != 0 else len(lines),
                args.outfile))
    else:
        tf.logging.log(
            tf.logging.INFO,
            "Selecting and saving {} random pairs...".format(args.num_lines))
        tf.logging.log(tf.logging.INFO,
                       'CMDC movie_lines_path: {}'.format(movie_lines_file))
        tf.logging.log(
            tf.logging.INFO, 'CMDC movie_conversations_path: {}'.format(
                movie_conversations_file))

        movie_lines_fields = [
            "lineID", "characterID", "movieID", "character", "text"
        ]
        movie_conversations_fields = [
            "character1ID", "character2ID", "movieID", "utteranceIDs"
        ]

        # load the lines
        lines = utils.load_lines(movie_lines_file, movie_lines_fields)
        tf.logging.log(
            tf.logging.INFO,
            "Loaded {} lines: {}".format(len(lines), movie_lines_file))

        # load the conversations
        conversations = utils.load_conversations(movie_conversations_file,
                                                 lines,
                                                 movie_conversations_fields)
        tf.logging.info("Loaded {} conversations: {}".format(
            len(conversations), movie_conversations_file))

        with open(args.outfile, 'w', encoding='iso-8859-1') as outputfile:
            writer = csv.writer(outputfile, delimiter=args.delimiter)
            collected_pairs = utils.extract_pairs(conversations)
            tf.logging.log(tf.logging.INFO,
                           'Total of {} pairs'.format(len(collected_pairs)))
            if int(args.num_lines) != 0:
                random_idxs = np.random.choice(len(collected_pairs),
                                               args.num_lines,
                                               replace=False)
                for random_id in random_idxs:
                    pair = collected_pairs[random_id]
                    writer.writerow(pair)
                tf.logging.info("Wrote {} pairs to {}.".format(
                    args.num_lines, args.outfile))
            else:
                for item in collected_pairs:
                    writer.writerow(item)
                tf.logging.info("Wrote {} pairs to {}.".format(
                    len(collected_pairs), args.outfile))
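For reference, records in the Cornell Movie Dialogs movie_lines.txt are separated by ' +++$+++ ', which is why values[-1] above is the utterance text. A raw line looks roughly like this (fields: lineID, characterID, movieID, character, text):

#L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!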
Example #14
def create_data_tree_root(dataset, all_ranks, ds_idx, train_node, idx2bin,
                          height, branching_l, ht2cutsz, opt):

    #create graph from data.
    data = dataset[ds_idx]
    datalen = len(data)
    if datalen <= opt.k:
        return None
    graph_path = os.path.join(opt.data_dir,
                              opt.graph_file)  #'../data/knn.graph'

    #ranks are 1-based
    if opt.glove or opt.sift:  #and len(branching_l) == 1:
        parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)

        lines = utils.load_lines(parts_path)
        classes = [int(line) for line in lines]
        #read in all_ranks
        if opt.glove:
            all_ranks, idx2weights = read_all_ranks_glove(opt)
        elif opt.sift:
            all_ranks, idx2weights = read_all_ranks_sift(opt)  ###implement!!

        #create root DataNode dataset, ds_idx, parent_train_node, idx2bin, height, opt
        dsnode = add_datanode_children(dataset, (all_ranks, idx2weights),
                                       ds_idx, train_node, idx2bin, height - 1,
                                       branching_l, classes, ht2cutsz, opt)
        return dsnode

    if len(branching_l) == 1:  #this is always the case now
        #only use distance at top level of tree
        ranks = create_graph.create_knn_graph(data, k=opt.k,
                                              opt=opt)  #should supply opt
        all_ranks = ranks
    else:
        assert all_ranks is not None
        #else compute part of previous graph
        ranks = create_graph.create_knn_sub_graph(all_ranks, ds_idx, data, opt)

    n_edges = create_graph.write_knn_graph(ranks, graph_path)

    #graph_dir = create_graph.data_dir
    #graph_file = os.path.join(graph_dir, graph_file)

    #create partition from graph
    #this overrides file each iteration
    #parts_path = opt.parts_path_root

    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)

    lines = utils.load_lines(parts_path)
    classes = [int(line) for line in lines]

    compute_cut_size_b = True and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(classes, ranks)
        ht2cutsz[height].append((cut_sz, n_edges))

    #create root DataNode dataset, ds_idx, parent_train_node, idx2bin, height, opt
    dsnode = add_datanode_children(dataset, (all_ranks, None), ds_idx,
                                   train_node, idx2bin, height - 1,
                                   branching_l, classes, ht2cutsz, opt)

    return dsnode
Example #15
def create_data_tree_root(dataset, all_ranks, ds_idx, train_node, idx2bin, height, branching_l, ht2cutsz, opt):

    datalen = len(ds_idx)
    if datalen <= opt.k:
        return None
    graph_path = os.path.join(opt.data_dir, opt.graph_file) #'../data/knn.graph'
    
    #ranks are 1-based
    if opt.glove or opt.sift or opt.prefix10m: #and len(branching_l) == 1:

        if opt.glove:
            #custom graph path for glove; alternatives (e.g. a 50-NN graph file) are commented out
            #graph_path = os.path.join(opt.data_dir, 'glove50_'+opt.graph_file)
            #graph_path = os.path.join(opt.data_dir, 'glove10_sub10knn.graph')
            graph_path = os.path.join(opt.data_dir, opt.graph_file) #'../data/knn.graph'
            print('graph file {}'.format(graph_path))
        parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
        print('Done partitioning top level!')
        lines = utils.load_lines(parts_path)
        classes = [int(line) for line in lines]
        
        #read in all_ranks, for partitioning on further levels.
        all_ranks, idx2weights = read_all_ranks(opt)
        if opt.dataset_name != 'prefix10m':
            k1 = max(1, int(opt.nn_mult*opt.k))
            ranks = utils.dist_rank(dataset, k=k1)
        else:
            #subtract 1 as graph was created with 1-indexing for kahip.
            ranks = torch.load('/large/prefix10m10knn.graph.pt') - 1
        #create root DataNode dataset, ds_idx, parent_train_node, idx2bin, height, opt
        dsnode = add_datanode_children(dataset, (all_ranks, idx2weights), ds_idx,
                                       train_node, idx2bin, height-1, branching_l,
                                       classes, ht2cutsz, 0, opt, ranks,
                                       toplevel=True, root=True)
        return dsnode

    #create graph from data.
    data = dataset[ds_idx]
    if len(branching_l) == 1: #this is always the case now
        #use tree created at top level throughout the hierarchy
        ranks = create_graph.create_knn_graph(data, k=opt.k, opt=opt) #should supply opt
        all_ranks = ranks
    else:
        assert all_ranks is not None        
        #else compute part of previous graph
        ranks = create_graph.create_knn_sub_graph(all_ranks, ds_idx, data, opt)
    
    n_edges = create_graph.write_knn_graph(ranks, graph_path)
    _, idx2weights = read_all_ranks(opt, path=graph_path)
        
    #create partition from graph
    #this overrides file each iteration        
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)

    lines = utils.load_lines(parts_path)
    classes = [int(line) for line in lines]
    
    compute_cut_size_b = False and not opt.glove
    if compute_cut_size_b:
        cut_sz = compute_cut_size(classes, ranks)
        ht2cutsz[height].append((cut_sz, n_edges))                
    
    #create root DataNode dataset, ds_idx, parent_train_node, idx2bin, height, opt
    dsnode = add_datanode_children(dataset, (all_ranks, idx2weights), ds_idx,
                                   train_node, idx2bin, height-1, branching_l,
                                   classes, ht2cutsz, 0, opt, all_ranks-1,
                                   toplevel=True, root=True)
    #Note: all_ranks above does not contain 5*opt.k nearest neighbors.
    
    return dsnode
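Both tree-building variants above repeat the same load_lines-centred round trip: write the kNN graph to disk, partition it with KaHIP, then read one integer class label per node back from the partition file. A stripped-down sketch of that step, treating write_knn_graph and run_kahip as given (not the authors' exact helper):

def partition_knn_graph(ranks, graph_path, datalen, branching_l, height, opt):
    #write the kNN graph in the format the partitioner expects
    create_graph.write_knn_graph(ranks, graph_path)
    #run KaHIP; the partition file holds one class label per node, one per line
    parts_path = run_kahip(graph_path, datalen, branching_l, height, opt)
    return [int(line) for line in utils.load_lines(parts_path)]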