Example 1
# Assumes module-level `items`, `groups`, the helper `generate_group_data`,
# project utilities `gutils`, and constants `cnt`, as in the original module.
import random
from multiprocessing.pool import ThreadPool
from sklearn.model_selection import train_test_split


def generate_data(test_pct=0.2, validation_pct=0.2):
    pt_abs_id_map = gutils.get_pt_abs_id_map(items)
    pos_data_pairs, neg_data_pairs = [], []

    for pt, abs_ids in pt_abs_id_map.items():
        selected_abs_ids = [x for x in abs_ids if len(groups[x]) >= cnt.MIN_GROUP_SIZE]
        n = len(selected_abs_ids)

        if n > 1:
            random.shuffle(selected_abs_ids)
            group_embeds = {abs_id: gutils.get_wv_embeddings(groups[abs_id]) for abs_id in selected_abs_ids}

            # Pair each group with the next one (cyclically) and generate its
            # positive/negative example pairs in parallel.
            pool = ThreadPool(cnt.NUM_THREADS)
            pt_data_pairs = pool.map(
                lambda x: generate_group_data(x[1], selected_abs_ids[(x[0] + 1) % n], group_embeds[x[1]], pt),
                enumerate(selected_abs_ids))
            pool.close()
            pool.join()
            
            for x in pt_data_pairs:
                pos_data_pairs += x[0]
                neg_data_pairs += x[1]

    data_pairs = pos_data_pairs + neg_data_pairs

    # Note: validation_pct is taken from the post-test-split remainder, so the
    # effective validation share of the full data is validation_pct * (1 - test_pct).
    train_data_pairs, test_data_pairs = train_test_split(data_pairs, test_size=test_pct)
    train_data_pairs, validation_data_pairs = train_test_split(train_data_pairs, test_size=validation_pct)

    gutils.save_data_pkl(train_data_pairs, cnt.TRAIN_DATA_PAIRS_FILE)
    gutils.save_data_pkl(test_data_pairs, cnt.TEST_DATA_PAIRS_FILE)
    gutils.save_data_pkl(validation_data_pairs, cnt.VALIDATION_DATA_PAIRS_FILE)

    return len(train_data_pairs), len(test_data_pairs), len(validation_data_pairs)
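
A minimal usage sketch of the function above (the split sizes it returns depend entirely on the module-level `items`/`groups` data; the call itself is hypothetical):

n_train, n_test, n_val = generate_data(test_pct=0.2, validation_pct=0.2)
# With the defaults this yields roughly 64%/20%/16% train/test/validation.
print(n_train, n_test, n_val)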
Example 2
# Assumes module-level `items`, project utilities `gutils`, and constants `cnt`,
# as in the original module.
import math
import os

import tables


def create_sent_tokens_array():
    tokens_file = sents_arr_file = None
    try:
        tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        # Fixed-width string atom: tokens longer than 16 bytes are truncated on write.
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        vocab = set()
        
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n)/batch_size))

        for m in range(num_batches):
            start, end = m*batch_size, min((m+1)*batch_size, n)
            batch_items = [items[x] for x in range(start, end)]
            tokens = [gutils.padd_fn(gutils.get_tokens(gutils.get_item_text(item))) for item in batch_items]
            tokens_arr.append(tokens)
            vocab.update([x for token in tokens for x in token])
            
        # Start word indices at 1 so that index 0 stays free (e.g. for padding).
        vocab = sorted(vocab)
        word2idx_map = {w: i + 1 for i, w in enumerate(vocab)}
        gutils.save_data_pkl(word2idx_map, cnt.WORD2IDX_FILE)
        
        sent_tokens = tokens_file.root.data
        
        sents_arr_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='w')
        atom = tables.Int32Atom()
        sents_arr = sents_arr_file.create_earray(sents_arr_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n)/batch_size))
        
        for m in range(num_batches):
            start, end = m*batch_size, min((m+1)*batch_size, n)
            # PyTables returns fixed-width byte strings here, so word_to_idx is
            # expected to accept bytes (or decode them) when looking up indices.
            tokens = [sent_tokens[x] for x in range(start, end)]
            sent_arrs = [[gutils.word_to_idx(w, word2idx_map) for w in token] for token in tokens]
            sents_arr.append(sent_arrs)
        
    finally:
        # Guard the closes: either file may never have been opened if an
        # exception was raised earlier.
        if tokens_file is not None:
            tokens_file.close()
        if sents_arr_file is not None:
            sents_arr_file.close()
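
For reference, the index arrays written above can be read back lazily by reopening the HDF5 file; a short sketch using the same constants (the slicing is illustrative):

import os
import tables

with tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='r') as f:
    # EArrays support NumPy-style slicing without loading the whole dataset.
    first_rows = f.root.data[:4]  # shape (4, cnt.MAX_WORDS), dtype int32
    print(first_rows.shape)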
Example 3

                    # Inner accumulation step; the enclosing loops that define
                    # pt, thres, radius and the per-pair counts a, b, c are not
                    # shown in this excerpt.
                    correct_pt_threshold[pt][thres] += a
                    pred_pt_threshold[pt][thres] += b
                    actual_pt_threshold[pt][thres] += c

                    num_correct += a
                    num_predicted += b
                    num_actual += c

        if num_predicted > 0:
            precision = float(num_correct) / num_predicted
            # Guard against an empty ground-truth set for this configuration.
            recall = float(num_correct) / num_actual if num_actual > 0 else 0.0

            print(thres, radius, precision, recall)
            
    gutils.save_data_pkl(correct_pt_threshold, 'correct_pt_threshold.pkl')
    gutils.save_data_pkl(pred_pt_threshold, 'pred_pt_threshold.pkl')
    gutils.save_data_pkl(actual_pt_threshold, 'actual_pt_threshold.pkl')
    
    for pt in actual_pt_threshold:
        print(pt)
        for thres in actual_pt_threshold[pt]:
            if pt in correct_pt_threshold and thres in correct_pt_threshold[pt]:
                if pt in pred_pt_threshold and thres in pred_pt_threshold[pt] and pred_pt_threshold[pt][thres] > 0:
                    precision = float(correct_pt_threshold[pt][thres])/pred_pt_threshold[pt][thres]
                else:
                    precision = 0.0

                # pt and thres are iterated from actual_pt_threshold itself, so
                # only the zero-denominator case needs guarding here.
                if actual_pt_threshold[pt][thres] > 0:
                    recall = float(correct_pt_threshold[pt][thres]) / actual_pt_threshold[pt][thres]
                else:
                    recall = 0.0
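
The three pickled counters can then be consumed downstream; a sketch of reloading them and deriving per-(pt, thres) scores (the F1 step is an illustrative addition, not part of the original):

correct = gutils.load_data_pkl('correct_pt_threshold.pkl')
pred = gutils.load_data_pkl('pred_pt_threshold.pkl')
actual = gutils.load_data_pkl('actual_pt_threshold.pkl')

for pt in actual:
    for thres in actual[pt]:
        c = correct.get(pt, {}).get(thres, 0)
        p_den = pred.get(pt, {}).get(thres, 0)
        precision = float(c) / p_den if p_den else 0.0
        recall = float(c) / actual[pt][thres] if actual[pt][thres] else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        print(pt, thres, precision, recall, f1)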
Example 4
print("Training word2vec...")
gutils.train_wv_model()
gutils.generate_text_wv_embeddings()

print("Getting groups...")
groups = gutils.abstract_groups(items)
print(len(groups))

print("Getting wv embeddings one per group...")
random.seed(42)
group_indices = []
for abs_id, indexes in groups.items():
    # Pick one representative item index per abstract group (seeded above for
    # reproducibility).
    idx = random.sample(indexes, 1)[0]
    group_indices.append(idx)

gutils.save_data_pkl(group_indices, cnt.GROUP_INDICES_FILE)

vectors = gutils.get_wv_embeddings(group_indices)

print("Constructing word embeddings KD Tree...")
gutils.construct_kd_tree(vectors, save_file=cnt.WV_KD_TREE_FILE)

group_indices = gutils.load_data_pkl(cnt.GROUP_INDICES_FILE)
print(items[group_indices[0]][0], items[group_indices[0]][5])
print(gutils.get_item_text(items[group_indices[0]]))

kdtree = gutils.load_data_pkl(cnt.WV_KD_TREE_FILE)
query_vector = gutils.get_wv_embeddings([group_indices[0]])[0]
u = gutils.get_nearest_neighbors_count(kdtree, query_vector, count=5)

for x in u:
    # Loop body truncated in the original source; printing each neighbour is assumed.
    print(x)
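
`construct_kd_tree` and `get_nearest_neighbors_count` are project helpers; a plausible equivalent with SciPy's KD-tree, as an assumption about what they wrap rather than the project's actual implementation:

from scipy.spatial import cKDTree

# Build a KD-tree over the (n_groups, dim) embedding matrix and fetch the
# 5 nearest neighbours of a single query vector.
tree = cKDTree(vectors)
dists, neighbor_idx = tree.query(query_vector, k=5)
for d, i in zip(dists, neighbor_idx):
    print(group_indices[i], d)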