Example #1
 def test_get_lots_of_nns(self):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
     i.build(10)
     for j in xrange(100):
         self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
Example #2
    def test_write_failed(self):
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)

        if sys.platform == "linux" or sys.platform == "linux2":
            # linux
            try:
                t.save("/dev/full") 
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
        elif sys.platform == "darwin":
            volume = "FULLDISK"
            device = os.popen('hdiutil attach -nomount ram://64').read()
            os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
            os.popen('touch "/Volumes/%s/full"' % volume)
            try:
                t.save('/Volumes/%s/annoy.tree' % volume)
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
            finally:
                os.popen("hdiutil detach %s" % device)
Example #3
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
Example #4
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j
        
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
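For each of prec_n random query items, the loop above treats the first k results of a full get_nns_by_item call as ground truth and measures how many of them are recovered when only `limit` neighbours are requested, printing the running precision and average query time per limit. A quick way to try it on a smaller index than the default million items (the sizes here are only illustrative) would be:

precision(f=40, n=10000)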
Example #5
    def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
        # the best movie/variable name
        total_recall = 0.

        for r in range(n_rounds):
            # create random points at distance x
            f = 10
            idx = AnnoyIndex(f, 'dot')

            data = numpy.array([
                [random.gauss(0, 1) for z in range(f)]
                for j in range(n_points)
            ])

            expected_results = [
                sorted(
                    range(n_points),
                    key=lambda j: dot_metric(data[i], data[j])
                )[:n]
                for i in range(n_points)
            ]

            for i, vec in enumerate(data):
                idx.add_item(i, vec)

            idx.build(n_trees)

            for i in range(n_points):
                nns = idx.get_nns_by_vector(data[i], n)
                total_recall += recall(nns, expected_results[i])

        return total_recall / float(n_rounds * n_points)
Example #6
 def test_single_vector(self):
     # https://github.com/spotify/annoy/issues/194
     a = AnnoyIndex(3)
     a.add_item(0, [1, 0, 0])
     a.build(10)
     a.save('1.ann')
     self.assertEquals(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
Example #7
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)

    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])

    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)

        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
Example #8
    def test_zero_vectors(self):
        # Mentioned on the annoy-user list
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEquals(js[0], 0)
        self.assertEquals(ds[:4], [0, 1, 1, 22])
Example #9
 def _test_holes_base(self, n, f=100, base_i=100000):
     annoy = AnnoyIndex(f)
     for i in range(n):
         annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
     annoy.build(100)
     res = annoy.get_nns_by_item(base_i, n)
     self.assertEquals(set(res), set([base_i + i for i in range(n)]))
Example #10
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn)
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example #11
def build_index(df,n_trees = 50,dist_metric='angular',out_dir="./"):
    n_records = df.shape[0]
    n_col = df.shape[1]
    index = AnnoyIndex(n_col,metric=dist_metric)
    patient_dict = {}
    index_dict = {}
    i = 0
    print "Adding items to the index..."
    for patient_id in df.index.values:
        if i % 10000 == 0:
            print str(i)
        vec = df.loc[patient_id].values
        index.add_item(i,vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id
        i += 1
    print "Building the index..."
    index.build(n_trees)
    index.save(out_dir+"annoy_index.ann")
    ## Save the patient_id -> index mapping ##
    w = csv.writer(open(out_dir+"patient_mapping.csv", "w"))
    for key, val in patient_dict.items():
        w.writerow([key, val])
    w = csv.writer(open(out_dir+"index_mapping.csv", "w"))
    for key, val in index_dict.items():
        w.writerow([key, val])
Example #12
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
Example #13
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
Example #14
def build_tree(df, metric):
    '''
    INPUTS: pandas DataFrame, choice of metric space (string)
    OUTPUTS: the built AnnoyIndex tree and a dictionary mapping index
             numbers to the DataFrame's index

    Builds an ANN tree using Spotify's Annoy library. Metric is the
    metric space (either euclidean or angular).
    '''
    tree = AnnoyIndex(len(df.iloc[0, :].values), metric=metric)

    indexes = {}

    for i in xrange(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)

    tree.build(50)

    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)

    return (tree, indexes)
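A brief usage sketch for build_tree above, assuming DATA_DIR is defined as in the original module; the toy DataFrame and the lookup at the end are illustrative only:

import numpy as np
import pandas as pd

# hypothetical data: 100 items with 8-dimensional feature vectors
df = pd.DataFrame(np.random.randn(100, 8),
                  index=['item_%d' % i for i in range(100)])

tree, indexes = build_tree(df, 'euclidean')

# Annoy returns integer ids; the indexes dict maps them back to DataFrame labels
neighbor_ids = tree.get_nns_by_item(0, 5)
print([indexes[i] for i in neighbor_ids])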
Example #15
    def test_tuple(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            i.add_item(j, (random.gauss(0, 1) for x in xrange(f)))

        i.build(n_trees)
Example #16
 def test_no_items(self):
     idx = AnnoyIndex(100)
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 0)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
Example #17
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
Example #18
def build_index(counts,label_to_id,dimension):
    index = AnnoyIndex(dimension,metric='angular')
    for label,cnt_list in counts.items():
        id = label_to_id[label]
        index.add_item(id,cnt_list)

    index.build(100)
    return index
Example #19
    def test_get_nns_by_vector(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [2,2])
        i.add_item(1, [3,2])
        i.build(10)

        self.assertEquals(i.get_nns_by_vector([3,3], 2), [1, 0])
Example #20
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example #21
    def test_wrong_length(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
        self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in xrange(f+1000)])
        self.assertRaises(IndexError, i.add_item, 2, [])

        i.build(n_trees)
Example #22
    def _build_from_model(self, vectors, labels, num_features):
        index = AnnoyIndex(num_features)

        for vector_num, vector in enumerate(vectors):
            index.add_item(vector_num, vector)

        index.build(self.num_trees)
        self.index = index
        self.labels = labels
Example #23
    def test_get_nns_by_vector(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [1,0,0])
        i.add_item(1, [0,1,0])
        i.add_item(2, [0,0,1])
        i.build(10)

        self.assertEquals(i.get_nns_by_vector([3,2,1], 3), [0,1,2])
Example #24
def create_index_tree(clusters):
    features = clusters.shape[1]
    tree = AnnoyIndex(features, metric='euclidean')

    for i, v in enumerate(clusters):
        tree.add_item(i, v.tolist())

    tree.build(features*2)
    return tree
Example #25
 def test_include_dists_check_ranges(self):
     f = 3
     i = AnnoyIndex(f)
     for j in xrange(100000):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
     self.assertTrue(max(dists) < 2.0)
     self.assertAlmostEqual(min(dists), 0.0)
Example #26
    def test_numpy(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            a = numpy.random.normal(size=f)
            a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
            i.add_item(j, a)

        i.build(n_trees)
Example #27
def get_rank(uid):
    """Returns a list of the 10 best ranked items for a user

    This function generates a ranking of items for a given user using
    Approximate Nearest Neighbours. The algorithm is imported from the
    Annoy library (developed by Spotify).

    Todo: The index is built from scratch every time the function is called,
    which should be changed in the future for better performance. This should
    be fairly easy to do, as Annoy can store indexes in files that can easily
    be shared between processes (see the sketch after this example). For now,
    it works well with a few hundred items.

    item_queue: A list of item ids for each user. It acts as a circular queue
    that keeps track of which items the user has seen so far. When two new
    items are shown to the user, they are placed at the back of the queue.

    Args:
        uid (int): User ID

    Returns: 
        List of item ids (str)
    """
    ann = AnnoyIndex(data_dimension)
    try:
        items = db.items.find()
        q = db.users.find({"uid": uid}, {"item_queue" : 1, "_id": 0})[0]["item_queue"]
    except TypeError:
        print "Unable to fetch user from DB"
    ids = [i["vid"] for i in q ]
    # Following line can be deleted or modified. 
    # It removes the last 15 items from the ANN tree, so they will never be recommended
    # for the user. This is done to make sure the user only sees new items in the 
    # recommended list (assuming 15 is the number of comparisons the user has made). 
    # This is sort of a hack and can be removed/modified later on if necessary.
    ids[-15:] = []
    print ids
    id_dict = {}
    # Add items to ANN tree
    for i,item in enumerate(items):
        if item["vid"] in ids:
            # Store all ids in a dictionary
            id_dict[str(i)] = item["vid"]
            ann.add_item(i, item["vals"])
    # Erik Bernhardsson (author of Annoy) suggests using 2 * the data dimension
    # as the number of trees to build.
    ann.build(data_dimension*2)
    try: 
        user = db.users.find({"uid": uid})[0]
    except TypeError:
        print "Unable to fetch user from DB"
    # Get 10 highest ranked items for that user
    nns_tmp = ann.get_nns_by_vector(user["vals"],10)
    nns = [id_dict[str(k)] for k in nns_tmp]
    print nns
    return nns
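The Todo in the docstring above notes that Annoy indexes can be saved to disk and memory-mapped by several processes. A minimal sketch of that pattern (not part of the original code; the helper names, the items iterable and the file path are assumptions) using only Annoy's documented save/load calls:

from annoy import AnnoyIndex

def build_and_save_index(items, data_dimension, path='items.ann'):
    # one-off build step; items is assumed to be an iterable of (id, vector) pairs
    ann = AnnoyIndex(data_dimension, 'angular')
    for i, vals in items:
        ann.add_item(i, vals)
    ann.build(data_dimension * 2)
    ann.save(path)

def rank_for_user(user_vals, data_dimension, path='items.ann', n=10):
    # per-request lookup; load() mmaps the file, so it is cheap and can be
    # shared across worker processes
    ann = AnnoyIndex(data_dimension, 'angular')
    ann.load(path)
    return ann.get_nns_by_vector(user_vals, n)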
Example #28
    def test_get_nns_by_item(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])
        i.build(10)

        self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_item(2, 3), [2, 1, 0])
Example #29
    def test_get_nns_by_item(self):
        f = 3
        i = AnnoyIndex(f)
        i.add_item(0, [2,1,0])
        i.add_item(1, [1,2,0])
        i.add_item(2, [0,0,1])
        i.build(10)

        self.assertEqual(i.get_nns_by_item(0, 3), [0,1,2])
        self.assertEqual(i.get_nns_by_item(1, 3), [1,0,2])
Example #30
 def test_only_one_item(self):
     # reported to annoy-user by Kireet Reddy
     idx = AnnoyIndex(100)
     idx.add_item(0, numpy.random.randn(100))
     idx.build(n_trees=10)
     idx.save('foo.idx')
     idx = AnnoyIndex(100)
     idx.load('foo.idx')
     self.assertEquals(idx.get_n_items(), 1)
     self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
Example #31
class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0

        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0

        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []

        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]

            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.built_capacity = self.curr_capacity
Example #32
k_strided = [fake.k[i] for i in range(0, len(fake.k), opt.stride)]

for window_left in k_strided:
    window_right = window_left + opt.width
    if window_right > np.max(fake.k):
        break
    selection = ((fake.k >= window_left) & (fake.k <= window_right))
    window_k = fake.k[selection]
    window_band_l = fake.E_lower[selection]
    window_band_u = fake.E_upper[selection]
    window_size = np.max(window_k) - np.min(window_k)

    gap = np.min(window_band_u) - np.max(window_band_l)
    window_band_l = interpolate_normalize(window_k, window_band_l,
                                          opt.dimensions)
    window_band_u = interpolate_normalize(window_k, window_band_u,
                                          opt.dimensions)

    #plt.plot(window_band_l)
    #plt.plot(window_band_u)
    #plt.title('k =' + str(window_left) + ' gap = '+ str( gap))
    #plt.show()

    annoyindex.add_item(len(lookuptable),
                        np.concatenate([window_band_l, window_band_u]))
    lookuptable.append([window_left, gap])

annoyindex.build(opt.trees)
annoyindex.save('index_test.ann')
np.save('lookuptable_test', lookuptable)
Example #33
parser.add_argument('--file', help='Input file')
parser.add_argument('--out', help='Outfile base')
parser.add_argument('--L', help='Fingerprint length')
parser.add_argument('--norm', help='Normalize')
args = parser.parse_args()

a = AnnoyIndex(int(args.L))
i = 0
names = []

with gzip.open(args.file, 'rt') as f:
    for line in f:
        id, statements, *v = line.split("\t")
        id = re.sub('.json.gz', '', id)
        id = re.sub('\.', '|', id)
        names.append(id)
        v = [float(j) for j in v]
        if args.norm:
            avg = statistics.mean(v)
            std = statistics.stdev(v)
            v = [(j - avg) / std for j in v]
        a.add_item(i, v)
        i = i + 1

a.build(-1)
a.save(args.out + '.tree')

with open(args.out + '.names', 'w') as f:
    for item in names:
        f.write("%s\n" % item)
Example #34
 def hard_mining_reset(self):
     #import faiss
     from annoy import AnnoyIndex
     data = nd.zeros(self.provide_data[0][1])
     label = nd.zeros(self.provide_label[0][1])
     #label = np.zeros( self.provide_label[0][1] )
     X = None
     ba = 0
     batch_num = 0
     while ba < len(self.oseq):
         batch_num += 1
         if batch_num % 10 == 0:
             print('loading batch', batch_num, ba)
         bb = min(ba + self.batch_size, len(self.oseq))
         _count = bb - ba
         for i in range(_count):
             idx = self.oseq[i + ba]
             s = self.imgrec.read_idx(idx)
             header, img = recordio.unpack(s)
             img = self.imdecode(img)
             data[i][:] = self.postprocess_data(img)
             label[i][:] = header.label
         db = mx.io.DataBatch(data=(data, self.data_extra), label=(label, ))
         self.mx_model.forward(db, is_train=False)
         net_out = self.mx_model.get_outputs()
         embedding = net_out[0].asnumpy()
         nembedding = sklearn.preprocessing.normalize(embedding)
         if _count < self.batch_size:
             nembedding = nembedding[0:_count, :]
         if X is None:
             X = np.zeros((len(self.id2range), nembedding.shape[1]),
                          dtype=np.float32)
         nplabel = label.asnumpy()
         for i in range(_count):
             ilabel = int(nplabel[i])
             #print(ilabel, ilabel.__class__)
             X[ilabel] += nembedding[i]
         ba = bb
     X = sklearn.preprocessing.normalize(X)
     d = X.shape[1]
     t = AnnoyIndex(d, metric='euclidean')
     for i in range(X.shape[0]):
         t.add_item(i, X[i])
     print('start to build index')
     t.build(20)
     print(X.shape)
     k = self.per_identities
     self.seq = []
     for i in range(X.shape[0]):
         nnlist = t.get_nns_by_item(i, k)
         assert nnlist[0] == i
         for _label in nnlist:
             assert _label < len(self.id2range)
             _id = self.header0[0] + _label
             v = self.id2range[_id]
             _list = range(*v)
             if len(_list) < self.images_per_identity:
                 random.shuffle(_list)
             else:
                 _list = np.random.choice(_list,
                                          self.images_per_identity,
                                          replace=False)
             for i in range(self.images_per_identity):
                 _idx = _list[i % len(_list)]
                 self.seq.append(_idx)
Example #35
    log.debug(f'{df_click.head()}')

    article_vec_map = word2vec(df_click, 'user_id', 'click_article_id',
                               model_path)
    f = open(w2v_file, 'wb')
    pickle.dump(article_vec_map, f)
    f.close()

    # build an Annoy index over the embeddings
    article_index = AnnoyIndex(256, 'angular')
    article_index.set_seed(2020)

    for article_id, emb in tqdm(article_vec_map.items()):
        article_index.add_item(article_id, emb)

    article_index.build(100)

    user_item_ = df_click.groupby('user_id')['click_article_id'].agg(
        lambda x: list(x)).reset_index()
    user_item_dict = dict(
        zip(user_item_['user_id'], user_item_['click_article_id']))

    # recall (candidate retrieval)
    n_split = max_threads
    all_users = df_query['user_id'].unique()
    shuffle(all_users)
    total = len(all_users)
    n_len = total // n_split

    # clear the temporary folder
    for path, _, file_list in os.walk('../tmp/w2v'):
Example #36
class PerCategoryTable:
    def __init__(self, db):
        self.db = db
        self.cfg = db.cfg
        self.cache_dir = db.cache_dir

    # def retrieve(self, query_vector, K=1):
    #     if getattr(self, 'nntable') is None:
    #         print('The NNTable has not been built, please run build_nntable first.')
    #         return None
    #     inds = self.nntable.get_nns_by_vector(query_vector, K, search_k=-1, include_distances=False)
    #     inds = list(inds)
    #     if len(inds) > 1:
    #         patches = []
    #         for i in range(len(inds)):
    #             patches.append(self.patchdb[inds[i]])
    #         return patches
    #     else:
    #         return self.patchdb[inds[0]]

    def retrieve(self, query_vector, K=1):
        # if getattr(self, 'nntable') is None:
        #     print('The NNTable has not been built, please run build_nntable first.')
        #     return None
        N = 10
        inds = self.nntable.get_nns_by_vector(query_vector, N, search_k=-1, include_distances=False)
        inds = list(inds)
        tmp = np.random.permutation(range(N))
        return self.patchdb[inds[tmp[0]]]
            

    def build_nntable(self, category_id, patchdb, use_cache=True):
        # keep a reference to the per-category patchdb
        self.patchdb = patchdb
        # cache output directories
        if self.cfg.use_patch_background:
            nntable_folder_name = self.db.split + '_nntables_with_bg'
        else:
            nntable_folder_name = self.db.split + '_nntables_without_bg'
        nntable_dir = osp.join(self.cache_dir, nntable_folder_name)
        maybe_create(nntable_dir)
        nntable_file = osp.join(nntable_dir, '%03d_nntable.ann'%category_id)

        # load or create the files
        if osp.exists(nntable_file) and use_cache:
            #################################################################
            ## Load the files if possible
            #################################################################
            self.nntable = AnnoyIndex(self.cfg.n_patch_features)
            self.nntable.load(nntable_file)
        else:
            #################################################################
            ## create the cache files
            #################################################################
            category = self.db.classes[category_id]
            print("%s NNTable"%category)
            t0 = time()
            self.nntable = AnnoyIndex(self.cfg.n_patch_features)
            for i in range(len(patchdb)):
                x = patchdb[i]
                image_index = x['image_index']
                instance_ind = x['instance_ind']
                feature_path = self.db.patch_path_from_indices(image_index, instance_ind, 'patch_feature', 'pkl', self.cfg.use_patch_background)
                with open(feature_path, 'rb') as fid:
                    features = pickle.load(fid)
                    self.nntable.add_item(i, features)
            n_trees = max(len(patchdb)//100, self.cfg.n_nntable_trees)
            self.nntable.build(n_trees)
            print("%s NNTable completes (time %.2fs)" % (category, time() - t0))

            #####################################################################
            ## Save cache files for faster loading in the future
            #####################################################################
            self.nntable.save(nntable_file)
            print('wrote nntable to {}'.format(nntable_file))
Example #37
for folder in os.listdir(base_url):
    celeb_encoding = {}
    celeb_mapping[folder] = []
    for image in tqdm(listdir(base_url + '/' + folder)):
        try:
            encoding = get_encoding(os.path.join(base_url, folder, image))
        except Exception as e:
            print(e)
            continue

        if encoding is not None:
            c += 1
            celeb_encoding[c] = encoding[0]
            celeb_mapping[folder].append(c)
            ann_index.add_item(c, encoding[0])
    save_json(celeb_mapping)
    pickle.dump(celeb_encoding,
                open(f"celeb_encodings/{folder}_encoding.pkl", "wb"))
    del celeb_encoding

save_json(celeb_mapping)
print("Encoding and mapping files saved successfully")

print("Building ann index...")
ann_index.build(1000)
x = ann_index.save("celeb_index.ann")
if x:
    print("Ann index saved successfully")
else:
    print("Error in saving ann index")
Example #38
    def generate_negative_training_examples(self, df_positives_cases, df_wiki_1_vectors, df_wiki_2_vectors):

        # initialize data frame
        df_negative_examples = pd.DataFrame(columns=['entity_id_wiki_1', 'entity_id_wiki_2', 'vector_entity_1', 'vector_entity_2','label'])

        dict_wiki_1 = df_wiki_1_vectors['entity_id'].to_dict()
        index_map_wiki_1 = dict((v,k) for k,v in dict_wiki_1.items())

        dict_wiki_2 = df_wiki_2_vectors['entity_id'].to_dict()
        index_map_wiki_2 = dict((v,k) for k,v in dict_wiki_2.items())

        t_wiki_1 = None  # Length of item vector that will be indexed
        t_wiki_2 = None


        if len(df_wiki_1_vectors) > 0:
            print('building index for entities in wiki 1')
            t_wiki_1 = AnnoyIndex(EMBEDDING_VECTOR_LENGTH, 'angular')
            for index, row in df_wiki_1_vectors.iterrows():
                v = row['vector']
                t_wiki_1.add_item(index, v)
            t_wiki_1.build(20) # 10 trees
            print('building index for entities in wiki 1 done')
        
        if len(df_wiki_2_vectors) > 0:
            print('building index for entities in wiki 2')
            t_wiki_2 = AnnoyIndex(EMBEDDING_VECTOR_LENGTH, 'angular')    
            for index, row in df_wiki_2_vectors.iterrows():
                v = row['vector']
                t_wiki_2.add_item(index, v)
            t_wiki_2.build(20) # 10 trees
            print('building index for entities in wiki 2 done')
            
        print('total positive cases:', len(df_positives_cases))
        df_positives_cases_slice = df_positives_cases.head(1000)
        for index, row in df_positives_cases_slice.iterrows():
            #print(df_wiki_1_vectors.head())
            #print(row['entity_id_wiki_1'])

            
            entity_1 = row['entity_id_wiki_1']
            entity_2 = row['entity_id_wiki_2']

            if entity_1 in index_map_wiki_1:
                filtered_entity_1 = df_wiki_1_vectors.loc[index_map_wiki_1[entity_1],:]
                

                #print(filtered_entity_1)
                if len(filtered_entity_1) > 0:
                    vector = filtered_entity_1['vector']

                    nn_ent1 = t_wiki_2.get_nns_by_vector(vector, 5, include_distances=False)
                    for i in range(0,len(nn_ent1)):
                        df_negative_examples.loc[len(df_negative_examples)] = [entity_1] + [dict_wiki_2[i]] + [vector] + [t_wiki_2.get_item_vector(i)] +[0]
                        #df_negative_examples = df_negative_examples.append({'entity_id_wiki_1': entity_1, 'entity_id_wiki_2': dict_wiki_2[i], 'vector_entity_1': vector ,'vector_entity_2': t_wiki_2.get_item_vector(i), 'label':0}, ignore_index=True)
            
            
            if entity_2 in index_map_wiki_2:
                filtered_entity_2 = df_wiki_2_vectors.loc[index_map_wiki_2[entity_2],:]
                if len(filtered_entity_2) > 0:
                    vector = filtered_entity_2['vector']
                    nn_ent2 = t_wiki_1.get_nns_by_vector(vector, 5, include_distances=False)

                    for i in range(0,len(nn_ent2)):
                        df_negative_examples = df_negative_examples.append({'entity_id_wiki_1': dict_wiki_1[i], 'entity_id_wiki_2': entity_2, 'vector_entity_1': t_wiki_1.get_item_vector(i) ,'vector_entity_2': vector, 'label':0}, ignore_index=True)
        return df_negative_examples
Example #39
class DatasetCollector:
    """Класс для создания тренировочных и проверочных данных"""

    DATASET_R_KEY_EX = 60 * 60 * 24 * 5
    DATASET_R_KEY = 'dataset:{}'
    DATASET_ALL_R_KEY = 'dataset:{}:all'
    DATASET_START_R_KEY = 'dataset:{}:start'

    def __init__(self, dataset_model: Dataset):
        self.dataset_model = dataset_model
        self.dataset_dir = dataset_model.path
        self.annoy_index = None

    def create_doctor_item_base_matrix(self,
                                       save: bool = True
                                       ) -> Tuple[pd.DataFrame, AnnoyIndex]:
        """Создание item base матрицы врачей и сохранение в индексе annoy и csv"""
        data = pd.DataFrame.from_records(
            Doctor.query.order_by(Doctor.id).all())

        ids = data.iloc[:, 0]
        # Normalize the features (MinMaxScaler maps every point onto the interval (0, 1))
        features = pd.DataFrame.from_records(MinMaxScaler().fit_transform(
            data.iloc[:, 3:]))

        matrix_data = pd.concat([ids, features], axis=1)

        self.annoy_index = AnnoyIndex(AnnoySettings.ITEMS,
                                      AnnoySettings.METRIC)
        for doc_id, doc_feature in zip(ids.values, features.values):
            self.annoy_index.add_item(doc_id, doc_feature)

        self.annoy_index.build(AnnoySettings.TREES, AnnoySettings.JOBS)

        if save:
            matrix_data.to_csv(self.get_save_path(DOCTORS_CSV),
                               header=False,
                               index=False)
            self.annoy_index.save(self.get_save_path(DOCTORS_ANN))

        return matrix_data, self.annoy_index

    def get_save_path(self, file_name: str) -> str:
        return os.path.join(self.dataset_dir, file_name)

    @staticmethod
    def get_appts_by_user(user_id: int) -> List[Appointment]:
        """Получает список записей на прием"""
        appts = (Appointment.query.options(
            load_only(
                'id', 'doctor_id',
                'spec_id')).filter(Appointment.user_id == user_id).order_by(
                    desc(Appointment.dt_created)).distinct().all())
        return [appt for appt in appts]

    @staticmethod
    def get_users(min_appt=1) -> List[int]:
        """Получает список пользователей, у которых записей на прием не меньше, чем min_appt"""
        users = (Appointment.query.with_entities(Appointment.user_id).group_by(
            Appointment.user_id).having(
                func.count(Appointment.doctor_id) >= min_appt))
        return [user[0] for user in users.all()]

    @staticmethod
    def get_doctor_towns() -> Dict[int, int]:
        """Получаем докторов и их города"""
        doctors = Doctor.query.with_entities(Doctor.id, Doctor.town_id).all()
        return {doctor_id: town_id for doctor_id, town_id in doctors}

    @staticmethod
    def get_town_doctor_list(town_id: int, spec_id: int,
                             exclude: Tuple[int]) -> List[int]:
        """Получает список врачей в городе по заданной специальности исключая exclude"""
        doctors = (DoctorTown.query.with_entities(
            DoctorTown.doctor_id).filter(DoctorTown.town_id == town_id).filter(
                DoctorTown.wp_spec_id == spec_id).filter(
                    ~DoctorTown.doctor_id.in_(exclude)).order_by(
                        desc(DoctorTown.rating)).distinct())
        return [doc[0] for doc in doctors.all()]

    def set_appt_dataset(self, to_list, doc_towns, appt) -> None:
        doctors = self.get_town_doctor_list(doc_towns[appt.doctor_id],
                                            appt.spec_id,
                                            exclude=(appt.doctor_id, ))
        for doctor in doctors[:100]:
            to_list.append(
                [0, appt.id, *self.annoy_index.get_item_vector(doctor)])
        to_list.append(
            [1, appt.id, *self.annoy_index.get_item_vector(appt.doctor_id)])

    def get_check_data(self, doc_towns, last_appt, old_appts) -> dict:
        """Предагрегирует данные для финального тестирования модели"""
        doctors = self.get_town_doctor_list(doc_towns[last_appt.doctor_id],
                                            last_appt.spec_id,
                                            exclude=tuple())[:200]

        return {
            'selected_doctor': last_appt.doctor_id,
            'suggested_doctors': doctors,
            'all_appts': [appt.doctor_id for appt in old_appts],
        }

    def create_datasets_for_catboost(
            self,
            min_appts: int = 1,
            save: bool = True
    ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
        """Создает датасет для тренировки и тестирования"""
        assert self.annoy_index is not None, 'annoy_index does not exist'

        all_users = self.get_users(min_appts)
        doc_towns = self.get_doctor_towns()

        test, train, check = [], [], []

        r_con = redis_connection()
        dataset_r_key = self.DATASET_R_KEY.format(self.dataset_model.id)
        r_con.set(dataset_r_key, 0, ex=self.DATASET_R_KEY_EX)
        r_con.set(self.DATASET_ALL_R_KEY.format(self.dataset_model.id),
                  len(all_users),
                  ex=self.DATASET_R_KEY_EX)
        r_con.set(self.DATASET_START_R_KEY.format(self.dataset_model.id),
                  time.time(),
                  ex=self.DATASET_R_KEY_EX)

        for user in all_users:
            last_appt, *old_appts = self.get_appts_by_user(user)

            # keep the most recent appointment for the final evaluation
            check.append(self.get_check_data(doc_towns, last_appt, old_appts))
            r_con.incr(dataset_r_key)

            if not old_appts:
                continue  # the user had only one appointment

            test_appt, *train_user_appts = old_appts
            # keep the next most recent appointment for validation during training
            self.set_appt_dataset(test, doc_towns, test_appt)

            # the remaining older appointments are used for training
            for appt in train_user_appts:
                self.set_appt_dataset(train, doc_towns, appt)

        test_df = pd.DataFrame(test)
        train_df = pd.DataFrame(train)

        if save:
            test_df.to_csv(self.get_save_path(TEST_DATASET),
                           header=False,
                           index=False)
            train_df.to_csv(self.get_save_path(TRAIN_DATASET),
                            header=False,
                            index=False)

            with open(self.get_save_path(CHECK_DATASET), 'w') as fp:
                json.dump(check, fp)

        return test_df, train_df, check

    def load_dataset(self):
        test_df = pd.read_csv(self.get_save_path(TEST_DATASET), header=None)
        train_df = pd.read_csv(self.get_save_path(TRAIN_DATASET), header=None)
        return test_df, train_df

    def load_check_dataset(self):
        with open(self.get_save_path(CHECK_DATASET), 'r') as fp:
            return json.load(fp)

    def load_annoy_index(self):
        if self.annoy_index is None:
            self.annoy_index = AnnoyIndex(AnnoySettings.ITEMS,
                                          AnnoySettings.METRIC)
            self.annoy_index.load(self.get_save_path(DOCTORS_ANN))
        return self.annoy_index
        print("Load pre-computed embeddings from disc")
        with open(embedding_cache_path, "rb") as fIn:
            cache_data = pickle.load(fIn)
            corpus_sentences = cache_data['sentences']
            corpus_embeddings = cache_data['embeddings']

    if not os.path.exists(annoy_index_path):
        # Create Annoy Index
        print("Create Annoy index with {} trees. This can take some time.".
              format(n_trees))
        annoy_index = AnnoyIndex(embedding_size, 'angular')

        for i in range(len(corpus_embeddings)):
            annoy_index.add_item(i, corpus_embeddings[i])

        annoy_index.build(n_trees)
        annoy_index.save(annoy_index_path)
    else:
        #Load Annoy Index from disc
        annoy_index = AnnoyIndex(embedding_size, 'angular')
        annoy_index.load(annoy_index_path)

    corpus_embeddings = torch.from_numpy(corpus_embeddings)

    ######### Search in the index ###########

    print("Corpus loaded with {} sentences / embeddings".format(
        len(corpus_sentences)))

    while True:
        inp_question = input("Please enter a question: ")
Example #41
from annoy import AnnoyIndex
import numpy as np
import time
start = time.time()
data = np.random.randn(1000, 100).astype(np.float32)

index = AnnoyIndex(100, metric="euclidean")

for ind, data_ in enumerate(data):
    index.add_item(ind, data_)
index.build(50)
print(index.get_n_items())

data_1 = np.random.randn(1000, 100).astype(np.float32)
for ind, data_ in enumerate(data_1):
    index.add_item(ind + 1000, data_)

print(index.get_n_items())

index.build(50)
print(index.get_n_items())
end = time.time()

for ind, data_ in enumerate(data_1):
    index.add_item(ind + 1000, data_)
print(index.get_n_items())

index.build(50)
print(index.get_n_items())
print(end - start)
Example #42
import spacy
from utils import load_jsonl
import numpy as np
import annoy
# import faiss
from annoy import AnnoyIndex
import random

if False:
    f = 7
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)

    t.build(10)  # 10 trees
    t.save('test.ann')

    # ...

    u = AnnoyIndex(f)
    u.load('test.ann')  # super fast, will just mmap the file
    print(u.get_nns_by_item(0, 1000))  # will find the 1000 nearest neighbors

    assert False

nlp = spacy.load('predict/my_model')

texts = [
    'Today is sunny',
    'I hate bunnies',
Example #43
# config
dims = 2048
n_nearest_neighbors = 3
trees = 10000
infiles = glob.glob('image_vectors/*.npz')

# build ann index
t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
  file_vector = np.loadtxt(i)
  file_name = os.path.basename(i).split('.')[0]
  file_index_to_file_name[file_index] = file_name
  file_index_to_file_vector[file_index] = file_vector
  t.add_item(file_index, file_vector)
t.build(trees)
t.save('tree.ann')

'''
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
  os.makedirs('nearest_neighbors')

for i in file_index_to_file_name.keys():
  master_file_name = file_index_to_file_name[i]
  master_vector = file_index_to_file_vector[i]

  named_nearest_neighbors = []
  nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)
  for j in nearest_neighbors:
    neighbor_file_name = file_index_to_file_name[j]
Example #44
    ]
    metadata_array, embeddings_array = list(
        zip(*map(load_vectors_and_metadata, file_names)))
    return list(itertools.chain.from_iterable(metadata_array)), np.concatenate(
        embeddings_array, axis=0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--paths", nargs='+', help="folder of parsed pictures")
    args = parser.parse_args()
    metadata_array = list()
    embeddings_array = list()
    for doc_path in args.paths:
        meta_, vec_ = prepare_data(doc_path)
        metadata_array.append(meta_)
        embeddings_array.append(vec_)

    meta, vec = list(
        itertools.chain.from_iterable(metadata_array)), np.concatenate(
            embeddings_array, axis=0)

    f = vec.shape[1]
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    for i in range(vec.shape[0]):
        t.add_item(i, vec[i])

    t.build(30)  # 30 trees
    t.save('test.ann')
    json.dump(meta, open("metadata.json", "w"))
Example #45
def generate_hashes(images, filename_single_edges, filename_corners,
                    filename_opposite_edges, filename_three_edges,
                    filename_four_edges):
    num_images = images.shape[0]
    #tile_index = AnnoyIndex(4 * images.shape[1], metric='euclidean')
    channels = 1
    if len(images.shape) > 3:
        channels = images.shape[3]

    corner_index = AnnoyIndex(
        2 * channels * images.shape[1],
        metric='euclidean')  # Length of item vector that will be indexed
    full_filename_corners = os.path.join(os.getcwd(),
                                         filename_corners.replace('/', '\\'))

    single_edge_index = AnnoyIndex(
        channels * images.shape[1],
        metric='euclidean')  # Length of item vector that will be indexed
    full_filename_edges = os.path.join(
        os.getcwd(), filename_single_edges.replace('/', '\\'))

    opposite_edges_index = AnnoyIndex(
        2 * channels * images.shape[1],
        metric='euclidean')  # Length of item vector that will be indexed
    full_filename_opposite_edges = os.path.join(
        os.getcwd(), filename_opposite_edges.replace('/', '\\'))

    three_edges_index = AnnoyIndex(
        3 * channels * images.shape[1],
        metric='euclidean')  # Length of item vector that will be indexed
    full_filename_three_edges = os.path.join(
        os.getcwd(), filename_three_edges.replace('/', '\\'))

    four_edges_index = AnnoyIndex(
        4 * channels * images.shape[1],
        metric='euclidean')  # Length of item vector that will be indexed
    full_filename_four_edges = os.path.join(
        os.getcwd(), filename_four_edges.replace('/', '\\'))

    identifiers = np.column_stack(
        (np.floor(np.arange(0, num_images,
                            0.25)), np.tile(range(4), (1, num_images))[0]))

    generate_single_edges = not os.path.isfile(full_filename_edges)
    generate_corners = not os.path.isfile(full_filename_corners)
    generate_opposite_edges = not os.path.isfile(full_filename_opposite_edges)
    generate_three_edges = not os.path.isfile(full_filename_three_edges)
    generate_four_edges = not os.path.isfile(full_filename_four_edges)

    if not generate_single_edges:
        single_edge_index.load(full_filename_edges)
        print('loaded single edge index')
    if not generate_corners:  # and os.path.isfile(full_filename_tiles):
        corner_index.load(full_filename_corners)
        print('loaded corner index')
    if not generate_opposite_edges:
        opposite_edges_index.load(full_filename_opposite_edges)
        print('loaded opposite edge index')
    if not generate_three_edges:
        three_edges_index.load(full_filename_three_edges)
        print('loaded three edge index')
    if not generate_four_edges:
        four_edges_index.load(full_filename_four_edges)
        print('loaded four edge index')

    if not generate_corners and not generate_single_edges and not generate_opposite_edges and not generate_three_edges and not generate_four_edges:
        print('found all indices, returning...')
        return single_edge_index, corner_index, opposite_edges_index, three_edges_index, four_edges_index, identifiers  #if all are already loaded from file, no generation needed - return from here

    ct = 0
    for idx, image in enumerate(tqdm(images)):
        (top, right, bottom, left) = get_all_edges_from_array(image)
        if generate_single_edges:
            single_edge_index.add_item(ct, top)
            single_edge_index.add_item(ct + 1, right)
            single_edge_index.add_item(ct + 2, bottom)
            single_edge_index.add_item(ct + 3, left)

        if generate_corners:
            corner_left_top = np.concatenate([left, top])
            corner_top_right = np.concatenate([top, right])
            corner_right_bottom = np.concatenate([right, bottom])
            corner_bottom_left = np.concatenate([bottom, left])

            corner_index.add_item(ct, corner_left_top)
            corner_index.add_item(ct + 1, corner_top_right)
            corner_index.add_item(ct + 2, corner_right_bottom)
            corner_index.add_item(ct + 3, corner_bottom_left)

        if generate_opposite_edges:
            opposite_left_right = np.concatenate([left, right])
            opposite_top_bottom = np.concatenate([top, bottom])
            opposite_right_left = np.concatenate([right, left])
            opposite_bottom_top = np.concatenate([bottom, top])

            opposite_edges_index.add_item(ct, opposite_left_right)
            opposite_edges_index.add_item(ct + 1, opposite_top_bottom)
            opposite_edges_index.add_item(ct + 2, opposite_right_left)
            opposite_edges_index.add_item(ct + 3, opposite_bottom_top)

        if generate_three_edges:
            three_without_top = np.concatenate([right, bottom, left])
            three_without_right = np.concatenate([bottom, left, top])
            three_without_bottom = np.concatenate([left, top, right])
            three_without_left = np.concatenate([top, right, bottom])

            three_edges_index.add_item(ct, three_without_top)
            three_edges_index.add_item(ct + 1, three_without_right)
            three_edges_index.add_item(ct + 2, three_without_bottom)
            three_edges_index.add_item(ct + 3, three_without_left)

        if generate_four_edges:
            tile_edge_top = np.concatenate([top, right, bottom, left])
            tile_edge_right = np.concatenate([right, bottom, left, top])
            tile_edge_bottom = np.concatenate([bottom, left, top, right])
            tile_edge_left = np.concatenate([left, top, right, bottom])

            four_edges_index.add_item(ct, tile_edge_top)
            four_edges_index.add_item(ct + 1, tile_edge_right)
            four_edges_index.add_item(ct + 2, tile_edge_bottom)
            four_edges_index.add_item(ct + 3, tile_edge_left)

        ct += 4

    if generate_single_edges:
        single_edge_index.build(10)  # 10 trees
        single_edge_index.save(filename_single_edges)
        print('generated and saved single edges index')

    if generate_corners:
        corner_index.build(10)  # 10 trees
        corner_index.save(filename_corners)
        print('generated and saved corner index')

    if generate_opposite_edges:
        opposite_edges_index.build(10)  # 10 trees
        opposite_edges_index.save(filename_opposite_edges)
        print('generated and saved opposite edges index')

    if generate_three_edges:
        three_edges_index.build(10)  # 10 trees
        three_edges_index.save(filename_three_edges)
        print('generated and saved three edges index')

    if generate_four_edges:
        four_edges_index.build(10)  # 10 trees
        four_edges_index.save(filename_four_edges)
        print('generated and saved four edges index')

    return single_edge_index, corner_index, opposite_edges_index, three_edges_index, four_edges_index, identifiers
Example #46
class Indexer:
    def __init__(self,
                 dim,
                 repository='',
                 metric='angular',
                 index_name='index.ann',
                 db_name='names.bin',
                 ntrees=500):
        self.metric = metric  # angular or euclidean
        self.dim = dim  # dimension of the indexed feature vectors
        self.repository = repository
        self.index_name = self.repository + '/' + index_name
        self.db_name = self.repository + '/' + db_name
        self.ntrees = ntrees
        self.t = AnnoyIndex(dim, metric)
        self.s = shelve.open(self.db_name)
        self.s['dim'] = self.dim
        self.s['metric'] = self.metric
        self.sm = {}  # in memory

    def __enter__(self):
        return self

    def __exit__(
        self, exc_type, exc_value, traceback
    ):  # to be used with the 'with Indexer(...) as indexer:' statement
        if len(self.sm) > 0:
            for k, v in self.sm.iteritems():
                self.s[k] = v
        self.s.close()

    def index_single(self, c, feature_vector, uri):
        self.t.add_item(c, feature_vector)
        self.s[str(c)] = uri

    def index(self, feature_vectors, uris):
        c = 0
        for f in feature_vectors:
            self.t.add_item(c, f)
            self.s[str(c)] = uris[
                c]  # uris contains the image uri and possibly more complicated structures
            c = c + 1
        self.build_index()
        self.save_index()

    def index_tags_single(self, tags, uri):
        for t in tags:
            #print t['cat'],t['prob']
            cat_key = str(t['cat'])
            if not cat_key in self.sm:
                self.sm[cat_key] = [{'uri': uri, 'prob': t['prob']}]
            else:
                temp = self.sm[cat_key]
                temp.append({'uri': uri, 'prob': t['prob']})
                self.sm[cat_key] = temp

    def build_index(self):
        logger.info('building index in ' + self.repository)
        self.t.build(self.ntrees)

    def save_index(self):
        logger.info('saving index into ' + self.index_name)
        self.t.save(self.index_name)
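A short usage sketch for the Indexer class above; the vectors, uris and repository path are made up, and the repository directory is assumed to already exist:

import numpy as np

feature_vectors = np.random.rand(100, 128)        # 100 vectors of dimension 128
uris = ['img_%03d.jpg' % i for i in range(100)]   # one uri per vector

# the with-statement triggers __exit__, which flushes the tag map and closes the shelve db
with Indexer(dim=128, repository='/tmp/index_repo') as indexer:
    indexer.index(feature_vectors, uris)          # adds items, builds and saves the index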
Example #47
class JerkAgent(threading.Thread):
    def __init__(self, env, solutions=[]):
        threading.Thread.__init__(self)
        self.env = TrackedEnv(env)
        self.solutions = solutions
        #self.history = pickle.load(open('./history.pkl', 'rb'))
        self.history = []
        self.annoy_index = None  #AnnoyIndex(512)
        #self.annoy_index.load('./test.ann')
        self.recorded_episode_count = 0
        #self.replay_buffer = PrioritizedReplayBuffer(1000000, 0.5, 0.4, epsilon=0.1)

    def run(self):
        self.train()

    def should_use_history(self, reward, env):
        reward_percentage = reward / MAX_SCORE

        the_end_is_nigh = (EXPLOIT_BIAS +
                           env.total_steps_ever / TOTAL_TIMESTEPS)**3

        return random.random() < np.mean([reward_percentage, the_end_is_nigh])

    def best_solution(self):
        best_pair = sorted(self.solutions, key=lambda x: np.mean(x[0]))[-1]
        reward = np.mean(best_pair[0])

        return best_pair, reward

    def train(self):
        """Run JERK on the attached environment."""
        new_ep = True
        best_reward = 0.
        keep_percentage = 0.6
        best_pair = None

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        with tf.Session(config=config):
            self.session = tf.get_default_session()
            self.model = policies.CnnPolicy(self.session,
                                            self.env.observation_space,
                                            self.env.action_space, 1, 1)
            self.a0 = self.model.pd.sample()
            params = tf.trainable_variables()
            #print('params', params)
            #for i in tf.get_default_graph().get_operations():
            #    print(i.name)
            #self.output_layer = tf.get_default_graph().get_tensor_by_name('model/fc1/add:0')
            self.output_layer = tf.get_default_graph().get_tensor_by_name(
                'model/Relu_3:0')
            #load_path = '/root/compo/saved_weights.joblib'
            load_path = './saved_weights.joblib'
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            self.session.run(restores)

            print('model created')

            while True:
                if new_ep:
                    if len(self.solutions) > 0:
                        best_pair, best_reward = self.best_solution()

                    if self.solutions and self.should_use_history(
                            best_reward, self.env):
                        new_rew, last_reward_index = self.exploit(
                            self.env, best_pair[1])
                        best_pair[0].append(new_rew)
                        if (new_rew / best_reward > keep_percentage) and len(
                                self.env.best_sequence()) != len(best_pair[1]):
                            self.solutions.append(([
                                max(self.env.reward_history)
                            ], self.env.best_sequence()[0:last_reward_index]))
                            self.record_history()
                        print('replayed best with reward %f' % (new_rew * 100))
                        continue
                    elif best_pair:
                        mutation_rate = (
                            1 - (best_reward / MAX_SCORE)) * MUTATION_DAMPEN
                        mutated = self.mutate(best_pair[1], mutation_rate)
                        new_rew, last_reward_index = self.exploit(
                            self.env, mutated)
                        print('mutated solution rewarded %f vs %f' %
                              ((new_rew * 100), (best_reward * 100)))
                        if (new_rew / best_reward > keep_percentage):
                            self.solutions.append(([
                                max(self.env.reward_history)
                            ], self.env.best_sequence()[0:last_reward_index]))
                            self.record_history()
                        continue
                    else:
                        self.env.reset()
                        new_ep = False
                rew, new_ep = self.move(self.env, 100)
                if not new_ep and rew <= 0:
                    print('backtracking due to negative reward: %f' %
                          (rew * 100))
                    _, new_ep = self.move(self.env, 70, left=True)
                if new_ep:
                    print('Episode rewarded %f vs %f' % ((rew * 100),
                                                         (best_reward * 100)))
                    self.record_history()
                    self.solutions.append(([max(self.env.reward_history)],
                                           self.env.best_sequence()))

    def record_history(self):
        self.recorded_episode_count += 1
        op = self.output_layer

        embeddings = []

        for i in range(len(self.env.reward_history)):
            obs = self.env.obs_history[
                i]  #(i * batch_size):((i + 1) * batch_size)]
            embedding = self.session.run([op],
                                         {self.model.X: [obs]})[0].reshape(
                                             512, )
            reward = self.env.reward_history[i]
            action = self.env.action_history[i]

            #print('obs', np.array(obs).shape)
            #self.replay_buffer.add_sample({
            #    'obs': obs,
            #    'actions': [action],
            #    'rewards': [reward],
            #    'new_obs': i == len(self.env.reward_history)
            #    })

            if reward > 0 and action[6]:
                self.history.append((embedding, reward, action))

        save_interval = 2
        if self.recorded_episode_count % save_interval == 0:
            print('recording history')
            #pickle.dump(self.history, open('./history.pkl', 'wb'))
            self.annoy_index = AnnoyIndex(512)
            for i in range(len(self.history)):
                self.annoy_index.add_item(i, self.history[i][0])
            self.annoy_index.build(20)
            #pickle.dump(self.replay_buffer, open( "./replay_buffer.p", "wb" ))

    def mutate(self, sequence, mutation_rate):
        mutated = copy.deepcopy(sequence)
        sequence_length = len(sequence)
        mutation_count = 0

        #mutation_start_index = min(sequence_length, random.randint(100, 2000))
        if random.random() < sequence_length / 100.:
            deletion_index = random.randint(0, sequence_length - 1)
            deletion_length = random.randint(0, sequence_length // 5)
            del mutated[deletion_index:(deletion_index + deletion_length)]
            print('excised %d of %d actions' %
                  (deletion_length, sequence_length))

        trim_length = random.randint(0, sequence_length // 5)
        if trim_length > 0:  # guard: `del mutated[-0:]` would wipe the whole list
            del mutated[-trim_length:]
            print('trimmed %d of %d actions' % (trim_length, sequence_length))

        mutation_start_index = len(mutated)
        for i, action in reversed(
                list(enumerate(sequence[0:mutation_start_index]))):
            #percent_distance = i + 1 / sequence_length
            exponent = -(mutation_start_index - i - 1) / 1e2
            if random.random() < np.exp(exponent) * mutation_rate:
                #mutated = mutated[0:i]
                #print('trimmed %d of %d actions' % (mutation_start_index - len(mutated), sequence_length))
                #return mutated

                mutated[i] = random.choice(ACTIONS).copy()
                mutation_count += 1
        print('mutated %d out of %d actions' %
              (mutation_count, sequence_length))

        return mutated

    def random_next_step(self,
                         left=False,
                         jump_prob=1.0 / 10.0,
                         jump_repeat=4,
                         jumping_steps_left=0):
        action = random.choice(ACTIONS).copy()  #np.zeros((12,), dtype=np.bool)
        action[6] = left
        action[7] = not left
        if jumping_steps_left > 0:
            action[0] = True
            jumping_steps_left -= 1
        else:
            if random.random() < jump_prob:
                jumping_steps_left = jump_repeat - 1
                action[0] = True

        return action, jumping_steps_left

    def move(self,
             env,
             num_steps,
             left=False,
             jump_prob=1.0 / 10.0,
             jump_repeat=4):
        """
        Move right or left for a certain number of steps,
        jumping periodically.
        """
        #start_time = time.clock()
        total_rew = 0.0
        done = False
        steps_taken = 0
        jumping_steps_left = 0
        #random_prob = 0.5
        use_memory = random.random() > 0.5
        use_model = random.random() > 0.5
        times = {}
        while not done and steps_taken < num_steps:
            if self.model \
                    and self.annoy_index is not None \
                    and len(self.env.obs_history) > 0 \
                    and not left \
                    and use_memory \
                    and self.recorded_episode_count > 5:
                #print('sample', time.clock() - start_time)
                ob = [self.env.obs_history[-1]]

                embedding = self.session.run([self.output_layer],
                                             {self.model.X: ob})[0].reshape(
                                                 512, )
                results = self.annoy_index.get_nns_by_vector(
                    embedding, 100, include_distances=True)
                items = [self.history[i] for i in results[0]]
                rewards = [item[1] for item in items]
                #action = self.history[results[0][np.argmax(np.multiply(rewards, np.divide(1, results[1] + 1e9)))]][2]

                if len(rewards) > 0:
                    action = self.history[results[0][np.argmax(rewards)]][2]
                else:
                    action, jumping_steps_left = self.random_next_step(
                        left, jump_prob, jump_repeat, jumping_steps_left)
                #print(action, 'memory')
                _, rew, done, _ = env.step(action)

                #print('step', time.clock() - start_time)

            elif self.model \
                    and len(self.env.obs_history) > 0 \
                    and not left \
                    and use_model:

                ob = [self.env.obs_history[-1]]
                actions = self.session.run([self.a0], {self.model.X: ob})
                action = ACTIONS[actions[0][0]].copy()
                #print(action, 'model')
                _, rew, done, _ = env.step(action)

            else:
                action, jumping_steps_left = self.random_next_step(
                    left, jump_prob, jump_repeat, jumping_steps_left)
                #print(action, 'random')
                _, rew, done, _ = env.step(action)

            total_rew += rew
            steps_taken += 1
            if done:
                break
        #print('time to move {} steps'.format(steps_taken), time.clock() - start_time)
        return total_rew, done

    def exploit(self, env, sequence):
        """
        Replay an action sequence; pad with NOPs if needed.

        Returns the final cumulative reward.
        """
        env.reset()
        done = False
        idx = 0
        total_reward = 0
        jumping_steps_left = 0
        left = False
        last_reward_index = 0
        while not done:
            if idx >= len(sequence) or idx - last_reward_index > 100:
                while not done:
                    steps = 100
                    reward, done = self.move(env, steps, left)
                    idx += steps
                    if left:
                        left = False
                    if reward == 0:
                        left = True
                    else:
                        last_reward_index = idx
            else:
                action = sequence[idx]
                _, reward, done, info = env.step(action)
                total_reward += reward
                if reward > 0:
                    last_reward_index = idx

            #_, _, done, _ = env.step(action)
            idx += 1
        return env.total_reward, last_reward_index
Example #48
0
class W2V_ANN(Model):
    def __init__(self, config):
        self.requirement = [
            'test_file', 'lastN', 'topN', 'type', 'item_vec_file',
            'index_file_file'
        ]
        self.config = config
        miss = set()
        for item in self.requirement:
            if item not in self.config:
                miss.add(item)
        if len(miss) > 0:
            raise Exception(f"Miss the key : {miss}")

        Model.__init__(self, self.config['test_file'], self.config['lastN'],
                       self.config['topN'])
        self.type = config['type']  # behavior / item

    def train(self):
        b_time = time.time()
        self.item_idx = {}
        self.item_idx_reverse = {}

        with open(self.config['item_vec_file'], 'r') as in_f:
            num_items, dim = in_f.readline().strip().split()
            print(f'Num of items : {num_items}, dim : {dim}')
            self.t = AnnoyIndex(int(dim), 'angular')

            for idx, line in tqdm(enumerate(in_f)):
                tmp = line.split()
                self.item_idx[tmp[0]] = idx
                self.item_idx_reverse[idx] = tmp[0]
                self.t.add_item(idx, list(map(float, tmp[1:])))
        print("Read file finished ...")
        file_name = self.config['index_file_file'] + '.' + self.type

        self.t.build(30)  # 30 trees
        self.t.save(f'{file_name}.ann')

        # self.t.load(f'{file_name}.ann')

        print(f"Train finished ...{time.time() - b_time}")

    def predict(self, last_n_events, topN):
        b_time = time.time()
        candidate_set = set()
        if self.type == 'item':
            last_n_items = [
                self.item_idx[e.split(':', 1)[1]] for e in last_n_events[::-1]
                if e in self.item_idx
            ]
        else:
            last_n_items = [
                self.item_idx[e] for e in last_n_events[::-1]
                if e in self.item_idx
            ]

        if len(last_n_items) == 0:
            return []

        rank_weight = np.array(
            [1 / np.log2(rank + 2) for rank in range(len(last_n_items))])
        # Calculate session vector
        session_vec = np.mean([
            np.array(self.t.get_item_vector(e)) * rank_weight[idx]
            for idx, e in enumerate(last_n_items)
        ],
                              axis=0)
        r_items, r_scores = self.t.get_nns_by_vector(session_vec,
                                                     topN * 2,
                                                     include_distances=True)

        res = []
        for item in r_items:
            if item in last_n_items:
                continue

            try:
                if self.type == 'item':
                    item_raw = self.item_idx_reverse[item]
                else:
                    item_raw = self.item_idx_reverse[item].split(':', 1)[1]

                if item_raw in res:
                    continue
                res.append(item_raw)
            except:
                pass
            if len(res) == topN:
                break
        return res
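A hypothetical driver for the W2V_ANN model above. All file paths are placeholders, 'item_vec_file' is assumed to be in word2vec text format (a header line with the counts, then one vector per line), and the vector keys are assumed to look like 'behavior:item_id' so the 'behavior' branch of predict() can strip the prefix:

config = {
    'test_file': 'data/test_sessions.txt',     # consumed by the Model base class
    'lastN': 5,
    'topN': 20,
    'type': 'behavior',                        # 'behavior' or 'item'
    'item_vec_file': 'data/item_vectors.txt',  # word2vec text format
    'index_file_file': 'data/annoy_index',     # prefix for the saved .ann file
}

model = W2V_ANN(config)
model.train()  # reads the vectors, then builds and saves the Annoy index
print(model.predict(['click:123', 'click:456'], topN=20))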
Example #49
0
class Annoy:
    def __init__(self):
        self.dim = 300
        self.sim_metric = 'angular'
        self.n_trees = 10
        self.search_k = 1
        self.modelLoaded = False # self.loadModelFromDisk(model_location)

    def initAnnoy(self, dim, metric, matrix):
        self.sim_metric = metric
        self.dim = dim

        print('Annoy init index')
        self.a_index = AnnoyIndex(self.dim, self.sim_metric)
        build_ = self.a_index.build(self.n_trees)

        #if build_:
        #    self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_ #self.modelLoaded

    def addVectors(self, documents):
        ids = []
        # unbuild annoy index before adding new data
        self.a_index.unbuild()
        # add vectors
        for document in documents:
            _id = document._id
            vec = document.vector
            ids.append(_id)
            vector_e = vec.e
            vector_e_l = len(vector_e)
            # check if the vector length is below the dimension limit;
            # if so, pad the vector with zeros up to the dimension
            if vector_e_l < self.dim:
                vector_e.extend([0]*(self.dim-vector_e_l))
            # make sure vector length doesn't exceed dimension limit
            vector_e = vector_e[:self.dim]
        
            # add vector
            self.a_index.add_item(int(_id), vector_e)
            
        # build vector
        build_ = self.a_index.build(self.n_trees)
        # if build_:
            # self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_, ids

    def deleteVectors(self, ids):

        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []

        for vec_data in matrix:
            _id, _dist = self.a_index.get_nns_by_vector(vec_data, k, search_k=self.search_k, include_distances=True)
            ids.append(_id)
            dists.append(_dist)

        return True, ids, dists

    def loadModelFromDisk(self, location):
        try:
            # read index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            self.a_index.load(location)
            print('Annoy index loading success')
            return True
        except: 
            print('Annoy index loading failed')
            return False

    def saveModelToDisk(self, location, index):
        try:
            # write index
            index.save(location)
            print('Annoy index writing success')
            return True
        except:
            print('Annoy index writing failed')
            return False
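A hedged usage sketch for the wrapper above. SimpleNamespace objects stand in for whatever document type the surrounding service passes in; addVectors() only reads the _id and vector.e attributes:

import random
from types import SimpleNamespace

ann = Annoy()
ann.initAnnoy(dim=300, metric='angular', matrix=None)  # matrix is not used by initAnnoy
docs = [SimpleNamespace(_id=i,
                        vector=SimpleNamespace(e=[random.gauss(0, 1) for _ in range(300)]))
        for i in range(50)]
ok, ids = ann.addVectors(docs)                    # unbuilds, adds vectors, rebuilds
ok, nn_ids, nn_dists = ann.getNearest([[0.0] * 300], k=5)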
Example #50
0
class NearSentence(object):
    def __init__(self, fn_word, model_name, model_path):
        self.model = QueryModel(fn_word, model_name, model_path)
        self.queries = []
        self.titles = []

        self.query_index = 0
        self.title_index = 0
        self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean')
        self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean')

    def load_queries(self, fn_query, column):
        print '[In load_queries] Load candidate queries'
        sentences = []
        chunk = []

        vecs = []
        with open(fn_query) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_query_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec**2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
        if len(chunk) > 0:
            vec, valid_sentence = self.model.get_query_vec(chunk)
            vec = vec / np.sqrt(np.sum(vec**2, 1, keepdims=True))
            vecs.extend(list(vec))
            sentences.extend(valid_sentence)

        print '[In load_queries] Build query annoy tree'
        for s, v in izip(sentences, vecs):
            self.queries.append(s)
            # if vecs == [0] * self.vectorizer.dim:
            #     continue
            self.query_ann.add_item(self.query_index, v)
            self.query_index += 1

        self.query_ann.build(10)
        print '[In load_queries] Size of tree =', self.query_ann.get_n_items()

    def load_titles(self, fn_title, column):
        print '[In load_titles] Load candidate titles'
        sentences = []

        chunk = []
        vecs = []
        with open(fn_title) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_title_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
            if len(chunk) > 0:
                vec, valid_sentence = self.model.get_title_vec(chunk)
                vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                vecs.extend(list(vec))
                sentences.extend(valid_sentence)

        print '[In load_titles] Build titles annoy tree, size =', len(vecs)

        for s, v in izip(sentences, vecs):
            self.titles.append(s)
            self.title_ann.add_item(self.title_index, v)     # v is a list
            self.title_index += 1
        self.title_ann.build(10)
        print '[In load_titles] Size of tree =', self.title_ann.get_n_items()



    def get_k_nearest_query(self, query, k):

        if isinstance(query, unicode):
            query = query.encode('utf8')

        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title'].decode('utf8')
        vecs, valid_queries= self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]

        k_neighbors, scores = self.query_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.queries[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])

    # def sim(self, u, v):
    #     norm_u = u / np.sqrt(np.sum(u ** 2, keepdims=True))
    #     norm_v = u /np.sqrt(np.sum(v ** 2, keepdims=True))
    #     return np.dot(norm_u, norm_v)

    def get_k_nearest_title(self, title, k):
        if isinstance(title, unicode):
            title = title.encode('utf8')

        cut_data = text_cutter.process({'title': title})
        title = cut_data['cut_title'].decode('utf8')
        vecs, valid_titles = self.model.get_title_vec([title])
        if len(valid_titles) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        k_neighbors, scores = self.title_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.titles[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])



    def get_answers(self, query, k):
        if isinstance(query, unicode):
            query = query.encode('utf8')

        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title'].decode('utf8')
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries)==0:
            return []

        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        # recall titles according to cosine similarity
        candidate_titles_index, scores = self.title_ann.get_nns_by_vector(vec, n=k*10, include_distances=True)

        # rank candidate titles using model
        candidate_titles = []
        for i in candidate_titles_index:
            candidate_titles.append(self.titles[i])

        ranks = self.model.rank_titles(cut_query, candidate_titles)[:k]
        return ranks


    def process(self, data):
        res = {}
        if 'titles' in data:
            res['title_nns'] = self.get_k_nearest_title(data['titles'], 10)
        if 'queries' in data:
            res['query_nns'] = self.get_k_nearest_query(data['queries'], 10)
        return json.dumps(res, ensure_ascii=False).encode('utf8')
Example #51
0
	def evaluate_predictions(self,id_to_latent_factor_dict,userid_to_latent_factor_dict,spectrogram_dir,num_users,id_to_songname_dict): #evaluate latent factor predictions
		get_song_embeddings = Model(inputs=self.model.input,outputs=self.model.get_layer(index=13).output)
		with open('Metadata\\song_id_to_prediction.txt') as f:
			song_id_to_prediction = json.loads(f.read())
		with open('Metadata\\user_id_to_prediction.txt') as f:
			user_id_to_prediction = json.loads(f.read())
		song_ids,song_predictions = zip(*(song_id_to_prediction.items())) #get song vectors generated by the cnn and the corresponding song ids
		user_ids,user_predictions = zip(*(user_id_to_prediction.items())) #get user vectors generated by the cnn and the corresponding user ids
		print('Predictions for {} songs in database created.....'.format(len(song_ids)))
		print('Predictions for {} users in database created.....'.format(len(user_ids)))
		new_subset_songs = [] 
		accuracy = [] # list containing recommendation accuracy for each user
		mAP = [] # list for containing mAP for each user
		song_ids_actual = [] #list of songs
		song_latent_factors = []
	#get latent factors for every song in the dataset
		for song in song_ids:
			try:
				song_latent_factors.append(id_to_latent_factor_dict[song])
			except KeyError:
				pass
		#build vector space with predicted latent factors
		t_pred_space = AnnoyIndex(self.num_factors,'angular')
		for i in range(len(song_predictions)):
			t_pred_space.add_item(i,song_predictions[i]) 
		t_pred_space.build(10)

		#build vector space with actual latent factors
		t_latent_space = AnnoyIndex(self.num_factors,'angular')
		for i in range(len(song_latent_factors)):
			t_latent_space.add_item(i,song_latent_factors[i])
		t_latent_space.build(10)
		user_count = 0
		for i in range(len(user_ids)):
			closest_songs_predicted = []
			closest_songs_actual = []
			user_id = user_ids[i]
			user_count+=1

			closest_songs = t_pred_space.get_nns_by_vector(user_predictions[i],500,include_distances = False) #get 500 closest songs to each user vector
			for index in closest_songs:
				try:
					closest_songs_predicted.append(id_to_songname_dict[song_ids[index]])
				except KeyError:
					pass										 #get the songids of the closest songs to the given user vector
			print("Closest songs generated by our network for user number {} ({}) is:".format(user_count,user_id))
			print(closest_songs_predicted[0:50])
			print('\n============================================\n')
			closest_songs_latent_space = t_latent_space.get_nns_by_vector(userid_to_latent_factor_dict[user_id],500,include_distances=False)
			for index in closest_songs_latent_space:
				try:
					closest_songs_actual.append(id_to_songname_dict[song_ids[index]])
				except KeyError:
					pass				
			print("Closest songs generated by latent factors for user number {} is:".format(user_count))
			print(closest_songs_actual[0:50])
			print('\n')


			map_user = ml_metrics.mapk(closest_songs_actual,closest_songs_predicted,k=500)
			good_recom = set(closest_songs_actual) & set(closest_songs_predicted)
			good_recom_count = len(good_recom)
			accuracy_user = (good_recom_count/500)*100
			#print("The accuracy for this user is {} ".format(accuracy_user))
			print("The map for this user is {} ".format(map_user))
			print('\n')
			accuracy.append(accuracy_user)
			mAP.append(map_user)
		total_accuracy = sum(accuracy)/user_count
		total_map = sum(mAP)/user_count
		print('\n The mAP of the recommendations for {} users is {} '.format(user_count,total_map))
		print('The accuracy of the recommendations for {} users is {}%'.format(user_count,total_accuracy))
		return total_accuracy
Example #52
0
def random_nn_trees(X, num_trees):
    t = AnnoyIndex(X.shape[1], 'euclidean')
    for i in range(X.shape[0]):
        t.add_item(i, X[i, :])
    t.build(num_trees)
    return t
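A short usage sketch for the helper above, assuming X is a NumPy array with one row per data point:

import numpy as np

X = np.random.rand(100, 8)
index = random_nn_trees(X, num_trees=10)
print(index.get_nns_by_item(0, 5))  # 5 approximate nearest neighbours of row 0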
Example #53
0
class Embed(object):
    def __init__(self, data_path, model='w2v', num_walks=100, walk_length=10):
        '''
        data: a dataframe: user, item

        model: 'w2v','deepwalk','gcn', 'gat'
        '''

        self.le = preprocessing.LabelEncoder()
        self.data, self.le = convert_to(data_path, self.le)
        self.w2v_model = None
        self._annoy = None

        self._embeddings = {}
        self.model_type = model
        self.num_walks = num_walks
        self.walk_length = walk_length

        if self.model_type == 'w2v':

            self.sentences = generate_sentences(self.data)

        if self.model_type == 'deepwalk':
            self.sentences = generate_sentences_dw(self.data)
        if self.model_type == 'gat':
            pass

    def train(self,
              window_size=5,
              workers=3,
              iter=5,
              learning_rate=0.01,
              epochs=10,
              dimensions=128,
              num_of_walks=80,
              beta=0.5,
              gamma=0.5,
              **kwargs):
        self.workers = workers
        self.iter = iter
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.dimensions = dimensions
        self.num_of_walks = num_of_walks
        self.beta = beta
        self.gamma = gamma
        self._annoy = AnnoyIndex(dimensions, 'angular')

        if self.model_type == 'w2v' or self.model_type == 'deepwalk':
            kwargs["sentences"] = self.sentences
            kwargs["min_count"] = kwargs.get("min_count", 0)
            kwargs["size"] = self.dimensions
            kwargs["sg"] = 1  # skip gram
            kwargs["hs"] = 1  # deepwalk use Hierarchical Softmax
            kwargs["workers"] = self.workers
            kwargs["window"] = self.window_size
            kwargs["iter"] = self.iter

            print(f"There are {self.data.user.nunique()} users")
            print(f"There are {self.data.item.nunique()} items")

            print("Learning embedding vectors...")
            model_w2v = Word2Vec(**kwargs)
            print("Learning embedding vectors done!")

            self.w2v_model = model_w2v

            words = self.data['user'].unique().tolist(
            ) + self.data['item'].unique().tolist()

            for word in words:
                self._annoy.add_item(
                    self.le.transform([word])[0], self.w2v_model.wv[word])

            self._annoy.build(-1)

        if self.model_type == 'gat':
            model = AttentionWalkTrainer(graph_path=self.data,
                                         dimensions=self.dimensions,
                                         learning_rate=self.learning_rate,
                                         epochs=self.epochs,
                                         window_size=self.window_size,
                                         num_of_walks=self.num_of_walks,
                                         beta=self.beta,
                                         gamma=self.gamma)
            model.fit()
            emb = model.save_embedding()

            for id in emb.id:
                self._annoy.add_item(int(id),
                                     emb[emb.id == id].values.tolist()[0][1:])

            self._annoy.build(-1)

        #return model_w2v

    # def get_embeddings(self,):
    #     if self.w2v_model is None:
    #         print("model not train")
    #         return {}
    #
    #     self._embeddings = {}
    #     words = self.data['user'].unique().tolist() + self.data['item'].unique().tolist()
    #     for word in words:
    #         self._embeddings[word] = self.w2v_model.wv[word]
    #
    #     return self._embeddings

    def search(self, seed, k=5, type=None):
        '''
        seed: seed item to find nearest neighbor
        k: number of cloest neighhbors
        '''

        a_return = self._annoy.get_nns_by_item(
            int(self.le.transform([seed])[0]), k)
        return list(self.le.inverse_transform(a_return))
Example #54
0
def convert(input_file_path, output_file_path=None,
            precision=DEFAULT_PRECISION, subword=False,
            subword_start=DEFAULT_NGRAM_BEG,
            subword_end=DEFAULT_NGRAM_END,
            approx=False, approx_trees=None,
            vocab_path=None, unicode_errors='strict'):

    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) +
            '.magnitude')
        if os.path.isfile(output_file_path):
            try:
                conn = sqlite3.connect(output_file_path)
                db = conn.cursor()
                db.execute(
                    "SELECT value FROM magnitude_format WHERE key='size'") \
                    .fetchall()[0][0]
                conn.close()
                # File already exists and is functioning
                return output_file_path
            except BaseException:
                pass

    # Check args
    meta_1_path = None
    meta_2_path = None
    input_is_text = input_file_path.endswith('.txt') or \
        input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    input_is_hdf5 = input_file_path.endswith('.hdf5')
    input_is_hdf5_weights = input_file_path.endswith('_weights.hdf5')
    if not input_is_text and not input_is_binary and not input_is_hdf5:
        exit("The input file path must be `.txt`, `.bin`, `.vec`, or `.hdf5`")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path must be `.magnitude`")
    if vocab_path and not vocab_path.endswith(".magnitude"):
        exit("The vocab file path must be `.magnitude`")

    # Detect ELMo and ELMo options file
    input_is_elmo = False
    elmo_options_path = None
    if input_is_hdf5:
        elmo_options_path = input_file_path[0:-13] + \
            '_options.json' if input_is_hdf5_weights else input_file_path[0:-5] + '.json'  # noqa
        if not os.path.isfile(elmo_options_path):
            exit(
                "Expected `" +
                elmo_options_path +
                "` to exist. ELMo models require a JSON options file.")
        input_is_elmo = True
        meta_1_path = input_file_path
        meta_2_path = elmo_options_path

    # Detect GloVe format and convert to word2vec if detected
    detected_glove = False
    if input_is_text:
        with io.open(input_file_path, mode="r", encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline().strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
            line1 = line1.replace('\t', ' ')
            line2 = line2.replace('\t', ' ')
            line1 = line1.split()
            line2 = line2.split()
            if len(line1) == len(line2):  # No header line present
                detected_glove = True
    if detected_glove:
        eprint("Detected GloVe format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.mkdtemp(), os.path.basename(input_file_path) + '.txt')
        try:
            import gensim
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert GloVe files.")
        gensim.scripts.glove2word2vec.glove2word2vec(
            input_file_path,
            temp_file_path
        )
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert binary files.")
        keyed_vectors = KeyedVectors.load_word2vec_format(
            input_file_path, binary=input_is_binary, unicode_errors=unicode_errors)
        number_of_keys = len(keyed_vectors.vectors)
        dimensions = len(keyed_vectors.vectors[0])
    elif input_is_text:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            number_of_keys, dimensions = (None, None)
            f = io.open(input_file_path, mode="r", encoding="utf-8",
                        errors="ignore")
            first_line = True
            for line in f:
                line_split = line.strip().replace('\t', ' ').split()
                if len(line_split) == 0:
                    continue
                if first_line:
                    first_line = False
                    number_of_keys = int(line_split[0])
                    dimensions = int(line_split[1])
                    yield (number_of_keys, dimensions)
                else:
                    empty_key = len(line_split) == dimensions
                    vec_floats = line_split if empty_key else line_split[1:]
                    key = "" if empty_key else line_split[0]
                    if len(vec_floats) > dimensions:
                        key = " ".join(
                            [key] + vec_floats[0:len(vec_floats) - dimensions])
                        vec_floats = vec_floats[len(vec_floats) - dimensions:]
                    vector = np.asarray([float(elem)
                                         for elem in vec_floats])
                    yield (key, vector)
        keyed_vectors = KeyedVectors()
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    elif input_is_elmo:
        vocab_magnitude = None
        if vocab_path:
            vocab_magnitude = Magnitude(vocab_path, eager=False, lazy_loading=1)
        else:
            vocab_magnitude = FeaturizerMagnitude(100)

        class KeyedVectors:
            pass
        elmo = ElmoEmbedder(elmo_options_path, input_file_path)
        keyed_vectors = KeyedVectors()
        number_of_keys = len(vocab_magnitude)
        dimensions = np.concatenate(elmo.embed_batch(
            [["test"]])[0], axis=1).flatten().shape[0]
        kv_gen_1, kv_gen_2 = tee(vocab_magnitude)
        keyed_vectors.vectors = chain.from_iterable(
            imap(
                lambda b: imap(
                    lambda e: np.concatenate(
                        e, axis=1).flatten(), elmo.embed_batch(
                        list(
                            imap(
                                lambda k: [k], b)))), ibatch(
                    imap(
                        lambda kv: kv[0], kv_gen_1), 1000)))
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    else:
        class KeyedVectors:
            pass
        keyed_vectors = KeyedVectors()
        number_of_keys = 0
        dimensions = 0
        keyed_vectors.vectors = []
        keyed_vectors.index2word = []

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Temporarily re-direct the output to a tmp file
    output_file_path_tmp = output_file_path + '.tmp'
    output_file_path_orig = output_file_path
    output_file_path = output_file_path_tmp

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Connect to magnitude datastore
    conn = sqlite3.connect(output_file_path)
    db = conn.cursor()

    # Make the database fast
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ +
               ",\n".join([("dim_%d INTEGER" % i) for i in range(dimensions)]) +
               ",\nmagnitude REAL" +
               """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword`
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)

    metas = [('meta_1', meta_1_path), ('meta_2', meta_2_path)]
    for meta_name, meta_path in metas:
        if meta_path:
            db.execute("""
                CREATE TABLE `magnitude_""" + meta_name + """` (
                    meta_file BLOB
                );
            """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
        ",\n".join([("dim_%d" % i) for i in range(dimensions)]) + \
        ",\nmagnitude" \
        + """)
        VALUES (
            """ + \
        (",\n".join(["?"] * (dimensions + 2))) \
        + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        )
        VALUES (
            ?, ?
        );
    """
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = izip(keyed_vectors.index2word, keyed_vectors.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        if i % 100000 == 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        magnitude = np.linalg.norm(vector)
        vector = vector / magnitude
        epsilon = np.random.choice(
            [-1.0 / (10**precision), 1.0 / (10**precision)], dimensions)
        vector = epsilon if np.isnan(vector).any() else vector
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query, (key,) + tuple(int(round(v * (10**precision)))
                                                for v in vector) + (float(magnitude),))  # noqa
        if subword:
            ngrams = set(
                (n.lower() for n in char_ngrams(
                    BOW + key + EOW,
                    subword_start,
                    subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set((n for n in ngrams if not any(
                [c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query,
                       (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Writing metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        )
        VALUES (
            ?, ?
        );
    """

    db.execute(insert_format_query, ('version', CONVERTER_VERSION))
    db.execute(insert_format_query, ('elmo', input_is_elmo))
    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indices
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    for meta_name, meta_path in metas:
        if not meta_path:
            continue
        eprint("Compressing meta file... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(meta_path)
        insert_meta_query = """
            INSERT INTO magnitude_""" + meta_name + """(meta_file)
            VALUES (?);
        """
        with open(meta_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_meta_query,
                               (sqlite3.Binary(chunk),))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_meta_query,
                           (sqlite3.Binary(chunk),))

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Calculate max duplicate keys
    eprint("Finding duplicate keys... (this may take some time)")
    duplicate_keys_query = db.execute("""
        SELECT MAX(key_count)
        FROM (
            SELECT COUNT(key)
            AS key_count
            FROM magnitude
            GROUP BY key
        );
    """).fetchall()
    max_duplicate_keys = (
        duplicate_keys_query[0][0] if duplicate_keys_query[0][0] is not None else 1)  # noqa
    eprint(
        "Found %d as the maximum number of duplicate key(s)" %
        max_duplicate_keys)
    db.execute(insert_format_query, ('max_duplicate_keys', max_duplicate_keys))

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Rename file the temporary output to the real output
    os.rename(output_file_path, output_file_path_orig)
    output_file_path = output_file_path_orig

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))

    return output_file_path
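A hedged usage sketch for the converter above; 'vectors.txt' is a placeholder word2vec/GloVe text file and the keyword values are illustrative, not recommended defaults. With approx=True the Annoy index built in the code is compressed and embedded into the .magnitude file:

out_path = convert('vectors.txt',                       # .txt, .vec, .bin or .hdf5 input
                   output_file_path='vectors.magnitude',
                   subword=True,                         # build the char-ngram FTS table
                   approx=True)                          # build and store the Annoy index
print('wrote', out_path)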
Example #55
0
    print('Centering the dataset and queries')
    center = np.mean(trainDataset, axis=0)
    trainDataset -= center
    queries -= center
    print('Done')

    print('Constructing the index')
    t1 = timeit.default_timer()
    # set the parameters
    dimension = len(trainDataset[0])
    index = AnnoyIndex(dimension, 'angular')

    for (i, vec) in enumerate(trainDataset):
        index.add_item(i, vec)
    index.build(number_of_trees)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format((t2 - t1)))

    print("start querying")
    for k in search_k:
        score = 0.0
        t1 = timeit.default_timer()
        for (i, query) in enumerate(queries):
            score += len(set(index.get_nns_by_vector(query, topk, search_k=k)).intersection(set(groundTruth[i])))
        t2 = timeit.default_timer()
        print("for search_k = {}".format(k))
        print('Query time: {} per query'.format((t2 - t1) * 1000 / float(
            len(queries))))
        print("the recall is {}".format(score / topk / float(len(queries))))
Example #56
0
# encoding=utf-8
from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(f)
for i in range(1000):
    v = [random.uniform(-1, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10)
t.save('./test.ann')

u = AnnoyIndex(f)
u.load('./test.ann')

target = [0.0] * f

# the results are item ids
a, b = u.get_nns_by_vector(target, 5, include_distances=True)
print a, b
for idx in a:
    # get the underlying vector
    print u.get_item_vector(idx)
Example #57
0
    print("[!] Creating a new image similarity search index.")
    print("[!] Loading the inception CNN")
    create_graph("./tensorflow_inception_graph.pb")
    print("[!] Done.")
    input_path = sys.argv[2]
    files = os.listdir(input_path)
    images = [input_path + i for i in files]
    results = extract_features(images, True)

    print("[!] Done extracting features, building search index")
    ann_index = AnnoyIndex(len(results[0]))
    for i in range(len(images)):
        ann_index.add_item(i, results[i])

    print("[!] Constructing trees")
    ann_index.build(80)
    print("[!] Saving the index to '%s'" % sys.argv[3])
    ann_index.save(sys.argv[3])
    print("[!] Saving the filelist to '%s'" % (sys.argv[3] + ".filelist"))
    filelist = open(sys.argv[3] + ".filelist", "wt")
    filelist.write("\n".join(images))
    filelist.close()
elif sys.argv[1] == "search":
    print("[!] Searching for similar images.")
    print("[!] Loading the inception CNN")
    create_graph("./tensorflow_inception_graph.pb")
    print("[!] Done.")
    input_path = sys.argv[2]
    files = os.listdir(input_path)
    images = [input_path + i for i in files]
    results = extract_features(images, True)
Example #58
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from annoy import AnnoyIndex

a = AnnoyIndex(3, 'angular')
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)

print(a.get_nns_by_item(0, 100))
print(a.get_nns_by_vector([1.0, 0.5, 0.5], 100))

import random

f = 40
t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f, 'angular')
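The snippet above is cut off after the second index is created; the usual continuation, following the standard Annoy API, loads the saved file and queries it:

u.load('test.ann')               # memory-maps the index saved by t.save()
print(u.get_nns_by_item(0, 10))  # 10 nearest neighbours of item 0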
Example #59
0
def get_nearest_neighbors(
    node_ids: List[Union[int, str]], embeddings: np.ndarray, num_nearest: int
) -> Dict[Union[int, str], Set[Union[int, str]]]:
    """
    Compute similar nodes among a set of embeddings.

    Parameters
    ----------
    node_ids:
        Names of each embedding sample.
    embeddings:
        Embedding vectors of shape (num_total, embed_dim).
    num_nearest:
        Number of nearest-neighbors to find.

    Returns
    -------
    Dictionary mapping node_id's to nearest-neighbors
    """
    num_total = embeddings.shape[0]
    embed_dim = embeddings.shape[1]

    if num_total != len(node_ids):
        raise SimilaritySearchError("Lengths of node_ids and embeddings must match")

    # normalize embeddings
    embeddings = StandardScaler().fit_transform(embeddings)

    # initialize annoy index w/ angular distance metric
    annoy_index = AnnoyIndex(embed_dim, metric="angular")

    # add items to index
    for i in range(num_total):
        annoy_index.add_item(node_ids[i], embeddings[i])

    # build index
    annoy_index.build(n_trees=16, n_jobs=-1)

    nodes_neighbors = {}

    for node_id in node_ids:
        neighbors = annoy_index.get_nns_by_item(node_id, num_nearest + 1)

        # remove the node itself
        neighbors = [x for x in neighbors if x != node_id]

        # set neighbors
        nodes_neighbors[node_id] = neighbors

    # symmetric filtering
    for node_id, neighbors in nodes_neighbors.items():

        filter_nodes = [x for x in neighbors if node_id in nodes_neighbors[x]]

        nodes_neighbors[node_id] = filter_nodes

    nodes_neighbors = {
        node_id: list(zip(neighbors, range(len(neighbors))))
        for node_id, neighbors in nodes_neighbors.items()
        if len(neighbors) > 0
    }

    return nodes_neighbors
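A minimal usage sketch for the function above with random embeddings and small integer node ids (hypothetical data; Annoy item ids must be integers). With num_nearest=2, only mutually-nearest pairs survive the symmetric filtering step:

import numpy as np

rng = np.random.default_rng(0)
emb = rng.normal(size=(6, 8))  # 6 nodes, 8-dimensional embeddings
print(get_nearest_neighbors(node_ids=[0, 1, 2, 3, 4, 5],
                            embeddings=emb,
                            num_nearest=2))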
        path=model_path,
        is_train=False,
        hyper_overrides={})
    
    predictions = []
    for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
        print("Evaluating language: %s" % language)
        definitions = pickle.load(open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))
        indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']} for d in tqdm(definitions)]
        code_representations = model.get_code_representations(indexes)

        indices = AnnoyIndex(code_representations[0].shape[0], 'angular')
        for index, vector in tqdm(enumerate(code_representations)):
            if vector is not None:
                indices.add_item(index, vector)
        indices.build(10)

        for query in queries:
            for idx, _ in zip(*query_model(query, model, indices, language)):
                predictions.append((query, language, definitions[idx]['identifier'], definitions[idx]['url']))

    df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
    df.to_csv(predictions_csv, index=False)


    if run_id:
        print('Uploading predictions to W&B')
        # upload model predictions CSV file to W&B

        # we checked that there are three path components above
        entity, project, name = args_wandb_run_id.split('/')