def precision(f=40, n=1000000):
    """Benchmark Annoy hit rate and query latency on random Gaussian vectors.

    Builds an index of ``n`` random vectors of dimension ``f`` with ``2 * f``
    trees and saves it to ``test.tree``.  Then, for 1000 randomly chosen
    items, the 10 first results of a query asking for all ``n`` neighbours
    are used as the reference set, and queries with several result-list sizes
    are scored against it.  Running averages are printed after every query.

    Args:
        f: Dimensionality of the generated vectors.
        n: Number of vectors to index.
    """
    t = AnnoyIndex(f)
    for i in range(n):
        t.add_item(i, [random.gauss(0, 1) for _ in range(f)])

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in range(prec_n):
        j = random.randrange(0, n)
        # Single-argument print() behaves the same on Python 2 and 3.
        print('finding nbs for %s' % j)

        # Reference answer: the first k results of an "everything" query.
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        # i + 1 queries have been accumulated so far.
        for limit in limits:
            print('limit: %-9d precision: %6.2f%% avg time: %.6fs' % (
                limit,
                100.0 * prec_sum[limit] / (i + 1),
                time_sum[limit] / (i + 1)))
def test_write_failed(self):
    """Saving an index to a full device must raise a "no space" error.

    On Linux the index is written to ``/dev/full``.  On macOS a small RAM
    disk is attached and formatted through ``hdiutil``/``diskutil`` and used
    as the target.  Other platforms are not exercised.
    """
    f = 40
    expected_message = "No space left on device"

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)

    if sys.platform in ("linux", "linux2"):
        # assertRaises reports "nothing was raised" on its own, instead of
        # the failure being caught by the same handler that inspects the
        # expected error.
        with self.assertRaises(Exception) as caught:
            t.save("/dev/full")
        # assertIn also accepts a message that *starts* with the text
        # (str.find(...) > 0 rejected that case).
        self.assertIn(expected_message, str(caught.exception))
    elif sys.platform == "darwin":
        volume = "FULLDISK"
        device = os.popen('hdiutil attach -nomount ram://64').read()
        os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
        os.popen('touch "/Volumes/%s/full"' % volume)
        try:
            with self.assertRaises(Exception) as caught:
                t.save('/Volumes/%s/annoy.tree' % volume)
            self.assertIn(expected_message, str(caught.exception))
        finally:
            # Always detach the RAM disk, even when the assertions fail.
            os.popen("hdiutil detach %s" % device)
def build_index(df, n_trees=50, dist_metric='angular', out_dir="./"):
    """Index every row of ``df`` with Annoy and persist the id mappings.

    Rows are added in the order of ``df.index.values`` under consecutive
    integer ids.  Three files are written under ``out_dir`` (which is used as
    a plain string prefix, so it should end with a path separator):
    ``annoy_index.ann``, ``patient_mapping.csv`` (patient id -> item id) and
    ``index_mapping.csv`` (item id -> patient id).

    Args:
        df: DataFrame whose index holds the patient ids and whose columns are
            the vector components.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
        dist_metric: Metric name passed to ``AnnoyIndex``.
        out_dir: Prefix for the output files.
    """
    n_col = df.shape[1]
    index = AnnoyIndex(n_col, metric=dist_metric)
    patient_dict = {}
    index_dict = {}

    print("Adding items to the index...")
    for i, patient_id in enumerate(df.index.values):
        if i % 10000 == 0:
            print(str(i))
        vec = df.loc[patient_id].values
        index.add_item(i, vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id

    print("Building the index...")
    index.build(n_trees)
    index.save(out_dir + "annoy_index.ann")

    # Save the patient_id -> index mapping.  The files are opened with
    # context managers so the handles are flushed and closed deterministically.
    with open(out_dir + "patient_mapping.csv", "w") as mapping_file:
        w = csv.writer(mapping_file)
        for key, val in patient_dict.items():
            w.writerow([key, val])

    # Save the reverse (index -> patient_id) mapping.
    with open(out_dir + "index_mapping.csv", "w") as mapping_file:
        w = csv.writer(mapping_file)
        for key, val in index_dict.items():
            w.writerow([key, val])
def test_single_vector(self):
    """An index holding one vector returns just that vector at distance 0."""
    # https://github.com/spotify/annoy/issues/194
    a = AnnoyIndex(3)
    a.add_item(0, [1, 0, 0])
    a.build(10)
    a.save('1.ann')
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+.
    self.assertEqual(
        a.get_nns_by_vector([1, 0, 0], 3, include_distances=True),
        ([0], [0.0]))
def _get_index(self, dataset):
    """Return ``(annoy_index, hdf5_file)`` for a named benchmark dataset.

    The vectors file is downloaded into ``test/`` when missing.  The Annoy
    index is built from the ``train`` dataset and cached next to it on first
    use, and loaded from that cache afterwards.

    Args:
        dataset: Base name of the dataset (without the ``.hdf5`` suffix).

    Returns:
        Tuple of the ready-to-query ``AnnoyIndex`` and the still-open
        ``h5py.File``; the caller owns the file handle.
    """
    url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
    vectors_fn = os.path.join('test', dataset + '.hdf5')
    index_fn = os.path.join('test', dataset + '.annoy')

    if not os.path.exists(vectors_fn):
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)

    # The file is only read here; an explicit mode avoids depending on the
    # h5py default, which is not the same in every h5py version.
    dataset_f = h5py.File(vectors_fn, 'r')
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)

    if not os.path.exists(index_fn):
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)

        print('building index')
        annoy.build(10)
        annoy.save(index_fn)
    else:
        annoy.load(index_fn)
    return annoy, dataset_f
def build_annoy_index(corpus, dimension, winlen, winstep):
    """Build (or load from cache) a euclidean Annoy index over MFCC frames.

    Args:
        corpus: Sequence of ``(filename, frames)`` pairs.  It is iterated
            twice, so it must not be a one-shot iterator.
        dimension: Length of each frame vector.
        winlen: Window length, recorded in the cache file name.
        winstep: Window step, recorded in the cache file name.

    Returns:
        Tuple ``(index, mfcc_list)`` where ``mfcc_list[i]`` is the
        ``(filename, index_in_file)`` pair of Annoy item ``i``.
    """
    print("Adding to Annoy index")
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    for filename, frames in corpus:
        for index_in_file, mfcc in enumerate(frames):
            # The Annoy item id is the position the entry gets in mfcc_list.
            item_id = len(mfcc_list)
            mfcc_list.append((filename, index_in_file))
            index.add_item(item_id, mfcc.tolist())

    opts = {
        "samplerate": desired_samplerate,
        "winlen": winlen,
        "winstep": winstep,
        "numcep": 13,
        "nfilt": 26,
        "nfft": 512,
        "ntrees": ANN_NTREES,
    }
    # hashlib needs bytes on Python 3, so the list repr is encoded first.
    filenames_repr = str([filename for filename, frames in corpus])
    cache_filename = (
        "annoy_index_"
        + hashlib.md5(filenames_repr.encode("utf-8")).hexdigest()
        + "."
        + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items()))
        + ".tree"
    )

    if not os.path.exists(cache_filename):
        print("Building Annoy index with %d trees" % ANN_NTREES)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print("\tWrote cache to %s" % cache_filename)
    else:
        print("\tReading cache from %s" % cache_filename)
        index.load(cache_filename)
    return index, mfcc_list
def test_zero_vectors(self):
    """Hamming neighbours survive a save/load round trip with sane distances."""
    # Mentioned on the annoy-user list
    bitstrings = [
        '0000000000011000001110000011111000101110111110000100000100000000',
        '0000000000011000001110000011111000101110111110000100000100000001',
        '0000000000011000001110000011111000101110111110000100000100000010',
        '0010010100011001001000010001100101011110000000110000011110001100',
        '1001011010000110100101101001111010001110100001101000111000001110',
        '0111100101111001011110010010001100010111000111100001101100011111',
        '0011000010011101000011010010111000101110100101111000011101001011',
        '0011000010011100000011010010111000101110100101111000011101001011',
        '1001100000111010001010000010110000111100100101001001010000000111',
        '0000000000111101010100010001000101101001000000011000001101000000',
        '1000101001010001011100010111001100110011001100110011001111001100',
        '1110011001001111100110010001100100001011000011010010111100100111',
    ]
    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

    f = 64
    idx = AnnoyIndex(f, 'hamming')
    for i, v in enumerate(vectors):
        idx.add_item(i, v)

    idx.build(10)
    idx.save('idx.ann')

    # Query a fresh index loaded from disk rather than the one just built.
    idx = AnnoyIndex(f, 'hamming')
    idx.load('idx.ann')
    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+.
    self.assertEqual(js[0], 0)
    self.assertEqual(ds[:4], [0, 1, 1, 22])
def build_tree(df, metric):
    '''
    INPUTS: Pandas DataFrame, Choice of Metric Space String
    OUTPUTS: Returns the built AnnoyIndex tree, returns a dictionary mapping
    index numbers to the DataFrame's index

    Builds an ANN tree using Spotify's Annoy library. Each row is added under
    its positional number; the tree is built with 50 trees and saved to
    DATA_DIR + 'tree_<metric>.ann', and the position -> row label mapping is
    pickled to DATA_DIR + 'indexes_<metric>'.
    Metric is the metric space (either euclidean or angular)
    '''
    # df.shape[1] gives the vector length without reading row 0, so an empty
    # frame no longer fails before the index is even created.
    tree = AnnoyIndex(df.shape[1], metric=metric)
    indexes = {}
    for i in range(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)
    tree.build(50)
    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)
    return (tree, indexes)
def test_no_items(self):
    """An index built without items saves, reloads and returns no results."""
    idx = AnnoyIndex(100)
    idx.build(n_trees=10)
    idx.save('foo.idx')

    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+.
    self.assertEqual(idx.get_n_items(), 0)
    self.assertEqual(
        idx.get_nns_by_vector(
            vector=numpy.random.randn(100), n=50, include_distances=False),
        [])
def test_save_without_build(self):
    """Save an index that was never built, reload it, and build the copy."""
    # Issue #61
    i = AnnoyIndex(10)
    # range() instead of the Python-2-only xrange(), like the sibling tests.
    i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
    i.save('x.tree')
    j = AnnoyIndex(10)
    j.load('x.tree')
    j.build(10)
def test_only_one_item(self):
    """A one-item index survives save/load and always returns that item."""
    # reported to annoy-user by Kireet Reddy
    idx = AnnoyIndex(100)
    idx.add_item(0, numpy.random.randn(100))
    idx.build(n_trees=10)
    idx.save('foo.idx')

    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+.
    self.assertEqual(idx.get_n_items(), 1)
    self.assertEqual(
        idx.get_nns_by_vector(
            vector=numpy.random.randn(100), n=50, include_distances=False),
        [0])
def build_annoy_index(metric, input_filename, output_filename, n_trees):
    """Create an Annoy index for approximate nearest-neighbour retrieval.

    Args:
        metric: Metric name passed to ``AnnoyIndex``.
        input_filename: ``.npy`` file loaded with ``np.load``; its rows are
            indexed under their row number.
        output_filename: Where the built index is saved.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
    """
    # Messages are kept verbatim; single-argument print() behaves the same on
    # Python 2 and Python 3.
    print('Aproximate Nearest Neighbors for: ' + input_filename)
    centroids_array = np.load(input_filename)
    n_dimensions = centroids_array.shape[1]
    t = AnnoyIndex(n_dimensions, metric=metric)
    for i, centroid in enumerate(centroids_array):
        t.add_item(i, centroid)
    # Two spaces after the colon reproduce the original comma-separated print.
    print("Building Index - Number of Trees:  " + str(n_trees))
    t.build(n_trees)
    t.save(output_filename)
def build_annoy_tree(word2vec_model, output_file_name, n_trees=100):
    """Index every vocabulary vector of a word2vec model and save the result.

    Words are added in the order of ``word2vec_model.index2word``; the item
    id of a word is its position in that list.

    Args:
        word2vec_model: Model exposing ``layer1_size``, ``index2word`` and
            item access by word.
        output_file_name: Destination of the saved index.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.

    Returns:
        ``output_file_name``, unchanged.
    """
    dimension = word2vec_model.layer1_size
    tree = AnnoyIndex(dimension)
    for item_id, token in enumerate(word2vec_model.index2word):
        embedding = list(word2vec_model[token])
        tree.add_item(item_id, embedding)
    tree.build(n_trees)
    tree.save(output_file_name)
    return output_file_name
def build_annoy_index(encoded, outfile):
    """Build an angular Annoy index over the rows of ``encoded``.

    Args:
        encoded: 2-D array; row ``i`` is stored as item ``i``.
        outfile: Path to save the index to, or ``None`` to skip saving.

    Returns:
        The built ``AnnoyIndex``.
    """
    n_dims = encoded.shape[1]  # length of each item vector
    index = AnnoyIndex(n_dims, metric='angular')
    for row_id, row in enumerate(encoded):
        index.add_item(row_id, row)
    index.build(100)  # 100 trees

    if outfile is None:
        return index
    index.save(outfile)
    return index
def create_profile(profile_name, image_folder, crop_width, crop_height, crop_increment):
    """
    given a folder and profile name, gather a series of subimages into a profile
    with which to create a collage

    Every file in ``image_folder`` is copied into the profile's ``images/``
    folder and cut into ``crop_width`` x ``crop_height`` boxes stepped by
    ``crop_increment``.  Each box is resized to ``SAMPLE_DIMENSION``,
    converted to "LA" mode, and its first band is stored as the feature
    vector in a euclidean Annoy index.  The index is saved as
    ``<profile_name>.tree`` and the box metadata is pickled as
    ``<profile_name>.p`` inside the profile folder.
    """
    import sys  # local import: only used for newline-free progress output

    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    images_folder = profile_folder + "images/"
    if not os.path.exists(profile_folder):
        os.makedirs(profile_folder)
    if not os.path.exists(images_folder):
        os.makedirs(images_folder)

    image_file_list = [
        f for f in listdir(image_folder) if isfile(join(image_folder, f))]

    # todo: use crop ratio to calculate variable vector size
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0] * SAMPLE_DIMENSION[1], metric="euclidean")
    image_index = []
    index = 0

    # iterate over images for processing into boxes and associated feature vectors
    for image_file in image_file_list:
        # Written without a newline so "done." lands on the same line.
        sys.stdout.write("processing {}... ".format(image_file))
        image_destination = images_folder + image_file
        # join() matches the listing above and also works when image_folder
        # has no trailing separator.
        copyfile(join(image_folder, image_file), image_destination)

        image = Image.open(image_destination)
        image_width, image_height = image.size[0], image.size[1]

        for x in range(0, image_width - crop_width, crop_increment):
            for y in range(0, image_height - crop_height, crop_increment):
                box = (x, y, x + crop_width, y + crop_height)
                # dimensionality reduction
                image_sample = image.crop(box).resize(SAMPLE_DIMENSION).convert("LA")

                # feature vector for annoy: first band of every pixel
                gs_pixeldata = [pixel[0] for pixel in image_sample.getdata()]

                nns_index.add_item(index, gs_pixeldata)
                # index always equals len(image_index) here, so this is the
                # same position insert(index, ...) would use.
                image_index.append({"image": image_destination, "box": box})
                index += 1
        print("done.")

    # image_index[-1] holds profile metadata.
    # NOTE(review): `index` already equals the number of subimages added, so
    # `index - 1` looks one short; kept because readers of the pickle may
    # depend on the stored value.
    image_index.append({"crop_width": crop_width,
                        "crop_height": crop_height,
                        "total_images": index - 1})

    print("{} total subimages to be indexed...".format(str(index - 1)))
    print("building trees (this can take awhile)...")
    nns_index.build(TREE_SIZE)  # annoy builds trees
    print("done.")

    sys.stdout.write("serializing trees... ")
    nns_index.save(profile_folder + profile_name + ".tree")
    print("done.")

    sys.stdout.write("serializing index... ")
    with open(profile_folder + profile_name + ".p", "wb") as index_file:
        pickle.dump(image_index, index_file)
    print("done.")

    print("{} profile completed. Saved in {}".format(profile_name, profile_folder))
    return
def test_load_save(self):
    """Item vectors are unchanged by save() and identical in a second load."""
    # Issue #61
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    u = i.get_item_vector(99)
    i.save('x.tree')
    v = i.get_item_vector(99)
    self.assertEqual(u, v)

    j = AnnoyIndex(10)
    j.load('test/test.tree')
    # Read from the freshly loaded index `j`; reading from `i` again would
    # only repeat the check above and leave `j` unused.
    w = j.get_item_vector(99)
    self.assertEqual(u, w)
def build(self, index_file, vectors, sender_urn):
    """Build a euclidean index from ``vectors``, save it, and notify the sender.

    Args:
        index_file: Path the new index is saved to.
        vectors: Iterable of vectors; each is stored under its position.
        sender_urn: URN of the pykka actor that receives ``load()`` once the
            file has been written.
    """
    logger.info("Building {0}".format(index_file))
    logger.info("Vectors {0}".format(vectors))

    fresh_index = AnnoyIndex(self.feat_size, metric='euclidean')
    for item_id, vector in enumerate(vectors):
        logger.info("Adding item {0} with id {1}".format(vector, item_id))
        fresh_index.add_item(item_id, vector)
    fresh_index.build(self.n_trees)

    logger.info("Saving index file {0}".format(index_file))
    fresh_index.save(index_file)
    fresh_index.unload()

    # Ask the requesting actor to pick up the file that was just written.
    requester = pykka.ActorRegistry.get_by_urn(actor_urn=sender_urn)
    requester.proxy().load()
    logger.info("Sent load command to worker")
def run(self):
    """Write the corpus ids and build an angular Annoy index of its vectors.

    The ids are written one per line to the ``ids`` output.  The dense
    vectors are then indexed in iteration order, with ``n_components / 2``
    trees, and saved to the ``index`` output.
    """
    # get ids
    with self.output()['ids'].open('w') as ids_fd:
        corpus = FeaCorpus(self.input()[0].fn, onlyID=True)
        for doc_id in corpus:
            # Explicit write works on Python 2 and 3 (``print >> fd`` is
            # Python 2 only); the 1-tuple keeps tuple ids formatting safely.
            ids_fd.write("%s\n" % (doc_id,))

    corpus = FeaCorpus(self.input()[0].fn, sparse=False)
    t = AnnoyIndex(self.n_components, metric='angular')
    for i, v in enumerate(corpus):
        t.add_item(i, v)
    t.build(int(self.n_components / 2))
    t.save(self.output()['index'].fn)
def main(args):
    """ Main entry.

    For every tree count in ``args.ntrees``: build (or load from
    ``args.exp_dir``) an Annoy index over ``data.base``, run all queries,
    and append recall and per-query time to ``report.txt`` in
    ``args.exp_dir``.
    """
    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f)  # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            # NOTE(review): `nbae` is the attribute name used by the original
            # code; confirm against Dataset that it is not a typo of `nbase`.
            for i in range(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d" % (i, data.nbae))
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        # np.int was only an alias of the builtin int and is gone from
        # NumPy >= 1.24; dtype=int yields the same array.
        ids = np.zeros((data.nqry, args.topk), dtype=int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def test_item_vector_after_save(self):
    """Item count, vectors and neighbours are identical before and after save."""
    # Issue #279
    index = AnnoyIndex(3)
    index.verbose(True)
    for item_id, vector in ((1, [1, 0, 0]), (2, [0, 1, 0]), (3, [0, 0, 1])):
        index.add_item(item_id, vector)
    index.build(-1)

    def check_contents():
        # Ids start at 1, so the item count (highest id + 1) is 4.
        self.assertEqual(index.get_n_items(), 4)
        self.assertEqual(index.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(index.get_nns_by_item(1, 999)), {1, 2, 3})

    check_contents()
    index.save('something.annoy')
    check_contents()
def test_save_load(self):
    """A hamming index keeps its neighbours and distances across save/load."""
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    i.build(10)
    i.save('blah.ann')

    j = AnnoyIndex(f, 'hamming')
    j.load('blah.ann')
    rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+.
    self.assertEqual(rs, [0, 1])
    self.assertAlmostEqual(ds[0], 0)
    # For 0/1 vectors the squared difference counts the differing positions.
    self.assertAlmostEqual(ds[1], numpy.dot(u - v, u - v))
class ANN:
    """Thin wrapper around an ``AnnoyIndex`` for single-nearest-neighbour lookup."""

    def __init__(self, dimension):
        """Create an empty index for vectors of length ``dimension``."""
        self.ann = AnnoyIndex(dimension)

    def addVectors(self, vectors):
        """Add ``vectors`` under their positions and build the index with 10 trees."""
        for idx, v in enumerate(vectors):
            self.ann.add_item(idx, v)
        self.ann.build(10)

    def query(self, vector):
        """Return the item id of the nearest neighbour of ``vector``."""
        match = self.ann.get_nns_by_vector(vector, 1)[0]
        return match

    def save(self, filename="analogies.ann"):
        """Save the index to ``filename``.

        The default keeps the previously hard-coded file name, so existing
        ``save()`` calls behave as before, while callers can now choose the
        destination just as they can with ``load``.
        """
        self.ann.save(filename)

    def load(self, filename):
        """Load a previously saved index from ``filename``."""
        self.ann.load(filename)
def baseline_train(olddata, f, trees):
    """Return an Annoy index over the training data, loading or building it.

    If the file named by the module-level ``saving_model`` exists, it is
    loaded.  Otherwise every row of ``olddata`` is added under its index
    label, using the ``latitude``, ``longitude`` and ``time_period`` columns,
    and the index is built with ``trees`` trees and saved to ``saving_model``.

    Args:
        olddata: Training DataFrame.
        f: Number of features per item vector.
        trees: Number of trees to build.

    Returns:
        The loaded or freshly built ``AnnoyIndex``.
    """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print("Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data...")
        t.load(saving_model)
    else:
        print("Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data...")
        for i in olddata.index:
            # .loc is the label-based replacement for .ix, which was removed
            # in pandas 1.0; `i` is an index label here.
            v = list(olddata.loc[i, ["latitude", "longitude", "time_period"]])
            t.add_item(i, v)
        print("Building the trees...")
        t.build(trees)
        # Sanity check: the item count matches the row count only when the
        # index labels are exactly 0..n-1.
        assert t.get_n_items() == olddata.shape[0]
        print("Saving the model...")
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
class SimilarStringStore:
    """Approximate nearest-neighbour store for strings.

    Strings are turned into vectors by a ``FeatureGenerator(k=1)`` and kept
    in an ``AnnoyIndex`` sized to the generator's ``n_features``.
    """

    def __init__(self, **kwargs):
        # kwargs are accepted but not used.
        generator = FeatureGenerator(k=1)
        self.transformer = generator
        print(self.transformer.n_features)
        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        """Return the feature vector of ``s``."""
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index '''
        features = self.transformer.transform(s)
        item_key = int(id)
        self.store.add_item(item_key, features)
        return features

    def build(self):
        """Build the index with 500 trees."""
        self.store.build(500)

    def save(self, filename='store.knn'):
        """Write the index to ``filename``."""
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        """Build the index, then write it to ``filename``."""
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        """Load a saved index from ``filename``."""
        self.store.load(filename)

    def query(self, s):
        ''' query index '''
        features = self.transformer.transform(s)
        return self.store.get_nns_by_vector(features, 40)

    def remove(self, id):
        ''' remove a string from the index '''
        # Not implemented: nothing is removed.
        return None
def build_index_annoy(h5fname, dset, out='data.ann', trees=128, lazy=True):
    """Build an angular Annoy index from a dataset stored in an HDF5 file.

    Args:
        h5fname: Path of the HDF5 file.
        dset: Name of the 2-D dataset inside the file; row ``i`` becomes
            item ``i``.
        out: Path the built index is saved to.
        trees: Number of trees passed to ``AnnoyIndex.build``.
        lazy: When true, rows are read through the dataset handle while
            iterating; otherwise the whole dataset is first read with
            ``[:]``.
    """
    # The context manager closes the HDF5 file once all rows have been added.
    with h5py.File(h5fname, 'r') as h5f:
        X = h5f[dset] if lazy else h5f[dset][:]

        # get dimension
        f = X.shape[1]

        # initialize annoy
        t = AnnoyIndex(f, 'angular')

        # iterate over features, add to annoy
        for i, v in enumerate(X):
            t.add_item(i, v)

    # build and save index
    t.build(trees)
    t.save(out)
def test_overwrite_index(self):
    """Overwriting an index file another index has loaded must not crash."""
    # Issue #335
    f = 40

    def random_vector():
        return [random.gauss(0, 1) for z in range(f)]

    # Build the initial index
    original = AnnoyIndex(f)
    for item_id in range(1000):
        original.add_item(item_id, random_vector())
    original.build(10)
    original.save('test.ann')

    # Load index file
    reader = AnnoyIndex(f)
    reader.load('test.ann')

    # Overwrite index file
    replacement = AnnoyIndex(f)
    for item_id in range(500):
        replacement.add_item(item_id, random_vector())
    replacement.build(10)
    if os.name != 'nt':
        replacement.save('test.ann')
    else:
        # Can't overwrite on Windows
        with self.assertRaises(IOError):
            replacement.save('test.ann')

    # Get nearest neighbors
    query = random_vector()
    reader.get_nns_by_vector(query, 1000)  # Should not crash
def merge_indicies(self, index_file_a, index_file_b, sender_urn):
    """Merge two saved euclidean indexes into ``<index_file_a>.merged``.

    Items of index A keep their order and are followed by the items of
    index B, all renumbered consecutively from 0.  When the merged file has
    been written, ``complete_compaction`` is called on the actor identified
    by ``sender_urn``.
    """
    logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))

    index_a = AnnoyIndex(self.feat_size, metric='euclidean')
    index_b = AnnoyIndex(self.feat_size, metric='euclidean')
    new_index = AnnoyIndex(self.feat_size, metric='euclidean')

    index_a.load(index_file_a)
    index_b.load(index_file_b)

    # Copy A first, then B, under one running item id.
    cnt = 0
    for source in (index_a, index_b):
        for position in range(source.get_n_items()):
            new_index.add_item(cnt, source.get_item_vector(position))
            cnt += 1

    new_index_file = index_file_a + ".merged"

    index_a.unload()
    index_b.unload()

    new_index.build(self.n_trees)
    new_index.save(new_index_file)
    logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
        index_file_a,
        index_file_b,
        sender_urn,
        cnt))

    new_index.unload()
    requester = pykka.ActorRegistry.get_by_urn(sender_urn)
    requester.proxy().complete_compaction(
        new_index_file=new_index_file,
        index_file_a=index_file_a,
        index_file_b=index_file_b
    )
def _get_index(self, f, distance):
    """Return an Annoy index over the GloVe twitter vectors of dimension ``f``.

    The index is cached under ``test/``.  When the cache is missing it is
    built from the gzipped GloVe text file, which is downloaded first if
    necessary.  A fresh ``AnnoyIndex`` loaded from the cache file is
    returned in both cases.

    Args:
        f: Vector dimensionality (selects the GloVe file).
        distance: Metric name passed to ``AnnoyIndex``.
    """
    vectors_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
    index_path = 'test/glove.%d.%s.annoy' % (f, distance)

    if not os.path.exists(index_path):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            url = 'http://www-nlp.stanford.edu/data/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', vectors_path)
            urlretrieve(url, vectors_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, distance)
        # Context manager so the gzip handle is closed after reading.
        with gzip.open(vectors_path, 'rb') as vectors_file:
            for i, line in enumerate(vectors_file):
                # First token is the word; the rest are the components.
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)

        annoy.build(10)
        annoy.save(index_path)

    annoy = AnnoyIndex(f, distance)
    annoy.load(index_path)
    return annoy
def test_load_save_get_item_vector(self):
    """Item vectors are readable before build, after save, and after load."""
    f = 3
    vectors = [
        [1.1, 2.2, 3.3],
        [4.4, 5.5, 6.6],
        [7.7, 8.8, 9.9],
    ]
    check_close = numpy.testing.assert_array_almost_equal

    built = AnnoyIndex(f)
    for item_id, vector in enumerate(vectors):
        built.add_item(item_id, vector)
    check_close(built.get_item_vector(0), vectors[0])

    self.assertTrue(built.build(10))
    self.assertTrue(built.save('blah.ann'))
    check_close(built.get_item_vector(1), vectors[1])

    loaded = AnnoyIndex(f)
    self.assertTrue(loaded.load('blah.ann'))
    check_close(loaded.get_item_vector(2), vectors[2])
def create_index(file_list, start_count, model_filename, redis_index_file):
    """Index tab-separated ``query<TAB>vector`` lines from several files.

    Every non-blank line gets the next consecutive id, starting at
    ``start_count``.  The ``query -> id`` pairs are written to
    ``/raid/ankit/<redis_index_file>``, the vectors are added to a
    100-dimensional Annoy index, and the index is built with 10 trees and
    saved under ``/raid/ankit/ann_models/``.

    Args:
        file_list: File names, relative to ``query_vectors_directory``.
        start_count: First id to assign.
        model_filename: File name of the saved Annoy model.
        redis_index_file: File name of the query/id listing.

    Returns:
        The next unused id (``start_count`` plus the number of lines read).
    """
    f = 100
    t = AnnoyIndex(f)
    t.verbose(True)
    i = start_count

    # Context managers close both files even if a line fails to parse.
    with open("/raid/ankit/" + redis_index_file, "w") as redisindex:
        for fname in file_list:
            print("Processing {} ...".format(fname))
            with open(query_vectors_directory + fname) as cur_f:
                for line in cur_f:
                    if not line.strip():
                        continue
                    if i % 1000000 == 0:
                        print("{} lines complete.".format(i))
                    query, vector = line.split('\t')
                    vector = normalize_redis_vector(vector)
                    redisindex.write(str(query) + "\t\t" + str(i) + "\n")
                    try:
                        t.add_item(i, vector)
                    except Exception:
                        # Best effort: report the offending line and carry
                        # on.  Unlike a bare ``except:``, this lets
                        # KeyboardInterrupt and SystemExit propagate.
                        print("Exception : " + str(line))
                    # The id advances even when the vector was rejected, so
                    # ids stay aligned with the listing file.
                    i += 1

    print("Done adding items, now starting to build 10 trees..")
    t.build(10)
    print("Saving Model on Disk...")
    t.save('/raid/ankit/ann_models/' + model_filename)
    print("Finished Building and Saving Model!")
    return i
img_path = file img = image.load_img(img_path, target_size=(224, 224)) x = image.img_to_array(img) x = np.expand_dims(x, axis=0) x = preprocess_input(x) fc2_features = model.predict(x) annoy_model.add_item(numimg, fc2_features[0]) print(id) numimg += 1 print('num files=' + str(numimg)) #ビルドして、ファイルとして保存する annoy_model.build(numimg) save_path = os.path.join(base_dir, "result.ann") annoy_model.save(save_path) # 確認する #annoy_model.unload() #trained_model.load("D:/python/annoy/images_next.ann") trained_model = AnnoyIndex(4096) trained_model.load('D:/python/annoy/images_next.ann') # モデルを読み込むことも可能です。 print(trained_model.get_nns_by_item(0, 6000)) # インデックス0付近の1000個のデータの返します items = trained_model.get_nns_by_item(1, 6, search_k=-1, include_distances=False) print(items) ####txt出力#### txt1 = 'D:/python/annoy/test.csv' file = open(txt1, "w", encoding = "utf_8")
class AnnoyTools:
    """Build, persist and query an angular Annoy index over word embeddings.

    All paths and the tree count come from ``config``.  The index has a
    fixed dimensionality of 768.
    """

    def __init__(self, config, is_build_annoy=False):
        """Build the index from the embedding file, or load a saved one.

        Args:
            config: Mapping with the keys ``word_embedding_path``,
                ``annoy_path``, ``word2index_path``, ``index2word2_path``
                and ``annoy_tree_num``.
            is_build_annoy: When true, build and save; otherwise load.
        """
        self.word_embedding_path = config['word_embedding_path']
        self.annoy_path = config['annoy_path']
        self.word2index = {}
        self.index2word = {}
        self.annoy_index = AnnoyIndex(768, 'angular')
        self.word2index_path = config['word2index_path']
        self.index2word_path = config['index2word2_path']
        self.annoy_tree_num = config['annoy_tree_num']
        if is_build_annoy:
            self._save_annoy_index()
            logger.info("Build and save annoy index.")
        else:
            self._load_annoy_index()
            logger.info("Load the saved annoy index.")

    def _save_annoy_index(self):
        """Read the text embedding file, then save the mappings and the index.

        The item id of a word is its 1-based line number in the file.
        """
        try:
            with codecs.open(self.word_embedding_path, 'r', encoding="utf-8") as f:
                count = 0
                for line in f:
                    count += 1
                    result = line.strip('\n').split()
                    # Skip two-field lines as before, and also blank or
                    # word-only lines: those raised inside this try block and
                    # silently stopped the loading of every later line.
                    if len(result) <= 2:
                        continue
                    word = result[0]
                    self.word2index[word] = count
                    vector = list(map(float, result[1:]))
                    self.annoy_index.add_item(count, vector)
        except Exception:
            # Still best effort (continue with what was loaded), but log the
            # traceback at error level so a partial index is noticed.
            logger.exception(
                "Failed while reading word embeddings; continuing with the "
                "items loaded so far.")

        self.index2word = {v: k for k, v in self.word2index.items()}
        with open(self.word2index_path, 'wb') as f1, \
                open(self.index2word_path, 'wb') as f2:
            pickle.dump(self.word2index, f1, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.index2word, f2, protocol=pickle.HIGHEST_PROTOCOL)

        self.annoy_index.build(self.annoy_tree_num)
        # Report success only after the file has actually been written.
        self.annoy_index.save(self.annoy_path)
        logger.info("Save annoy tree done.")

    def _load_annoy_index(self):
        """Load the saved index and both pickled word/id mappings."""
        self.annoy_index.load(self.annoy_path)
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(self.word2index_path, "rb") as f1, \
                open(self.index2word_path, 'rb') as f2:
            self.word2index = pickle.load(f1)
            self.index2word = pickle.load(f2)
        logger.info("Loaded the saved word2index and index2word.")

    def get_similar_by_query(self, query, topk=21):
        """Return up to ``topk`` ``(word, distance)`` pairs nearest to ``query``.

        The query is encoded with the module-level ``bc`` encoder.  A
        rescaled score list is printed, but the raw distances are returned.
        """
        query_vec = bc.encode([query])[0]
        idxes, dists = self.annoy_index.get_nns_by_vector(
            query_vec, topk, include_distances=True)
        idxes = [self.index2word[i] for i in idxes]
        similars = list(zip(idxes, dists))
        result = [(i, 0.1 * (abs(1 - score)) + 0.5)
                  for i, score in zip(idxes, dists)]
        print(result)
        return similars

    def _read_vector(self):
        """Convert the binary ``words.vector`` file to the text embedding file."""
        model = KeyedVectors.load_word2vec_format("words.vector", binary=True)
        model.wv.save_word2vec_format(self.word_embedding_path, binary=False)
class FMClassifier:
    """Class implementing the Features Matching Classifier (FMClassifier)

    Descriptors computed on every catalog image are stored in an Annoy index
    (the "matcher").  A query image is scored per label by matching its
    descriptors against that index.

    Args:
        catalog_path (string): Path of the catalog; it is handed to the
            ``utils`` helpers that list the images and the labels.
        params (dict): Optional overrides of the defaults in ``constants``
            (see ``_get_classifier_config`` for the recognised keys).
    """

    ##########################
    # Init
    ##########################
    def __init__(self, catalog_path: str, params: Dict = None):
        # None instead of a shared mutable ``{}`` default.
        self.catalog_path = catalog_path
        self._config_classifier(catalog_path, {} if params is None else params)

    ##########################
    # Config
    ##########################
    def _config_classifier(self, catalog_path, params):
        """Load the configuration, the catalog listing and the fingerprints."""
        self._get_classifier_config(params)
        self._get_catalog_images(catalog_path)
        self._get_catalog_labels(catalog_path)
        self._get_catalog_images2labels()
        self._load_fingerprints()

    def _get_classifier_config(self, params):
        """Store the effective configuration: ``params`` over ``constants``."""
        self.config = edict({
            "verbose": params.get("verbose", constants.VERBOSE),
            "feature_descriptor": params.get("feature_descriptor", constants.FEATURE_DESCRIPTOR),
            "feature_dimension": params.get("feature_dimension", constants.FEATURE_DIMENSION),
            "image_size": params.get("image_size", constants.IMAGE_SIZE),
            "keypoint_stride": params.get("keypoint_stride", constants.KEYPOINT_STRIDE),
            "keypoint_sizes": params.get("keypoint_sizes", constants.KEYPOINT_SIZES),
            "matcher_path": params.get("matcher_path", constants.MATCHER_PATH),
            "matcher_distance": params.get("matcher_distance", constants.MATCHER_DISTANCE),
            "matcher_n_trees": params.get("matcher_n_trees", constants.MATCHER_N_TREES),
            "scoring": params.get("scoring", constants.SCORING),
            "k_nn": params.get("k_nn", constants.K_NN),
            "fingerprint_path": params.get("fingerprint_path", constants.FINGERPRINT_PATH),
        })

    def _get_catalog_images(self, catalog_path):
        self.catalog_images = utils.get_all_images_from_folder(catalog_path)

    def _get_catalog_labels(self, catalog_path):
        self.catalog_labels = utils.get_labels_from_catalog(catalog_path)

    def _get_catalog_images2labels(self):
        self.catalog_images2labels = utils.compute_images2labels(
            self.catalog_images, self.catalog_labels)

    def _load_fingerprints(self):
        """Load the stored fingerprint (if any) and compute the current one."""
        # Previous fingerprint
        if os.path.exists(self.config.fingerprint_path):
            # NOTE(review): pickle.load must only be used on trusted files.
            with open(self.config.fingerprint_path, "rb") as pickle_file:
                self.config.fingerprint = pickle.load(pickle_file)
        else:
            self.config.fingerprint = ""

        # Current fingerprint
        self.fingerprint = fm_utils.compute_fingerprint(
            self.catalog_path, self.config)

    ##########################
    # Train
    ##########################
    def train(self):
        """Method used to train the classifier.

        Creates the matcher index (and saves it with the new fingerprint)
        when the fingerprint changed or no index file exists; otherwise loads
        the saved index.
        """
        # Init matcher
        self.matcher = AnnoyIndex(self.config.feature_dimension,
                                  self.config.matcher_distance)

        # Create or load matcher
        if self._should_create_index():
            self._create_matcher_index()
            self._save_matcher_index()
            self._save_fingerprint()
        else:
            self._load_matcher_index()

    def _should_create_index(self):
        fingerprint_changed = self.config.fingerprint != self.fingerprint
        matcher_file_exists = os.path.isfile(self.config.matcher_path)
        return fingerprint_changed or (not matcher_file_exists)

    def _create_matcher_index(self):
        """Add every catalog descriptor to the matcher and build it."""
        # Get descriptors
        catalog_descriptors = self._get_catalog_descriptors()

        # Get iterator
        descriptors_iterator = utils.get_iterator(
            catalog_descriptors,
            verbose=self.config.verbose,
            description="Creating Index...")

        # Config matcher
        for k, descriptor in enumerate(descriptors_iterator):
            self.matcher.add_item(k, descriptor)
        self.matcher.build(self.config.matcher_n_trees)

    def _get_catalog_descriptors(self):
        """Return the descriptors of all catalog images as one 2-D array."""
        # Init descriptors list
        catalog_descriptors = []

        # Init iterator
        iterator = utils.get_iterator(
            utils.get_all_images_from_folder(self.catalog_path),
            verbose=self.config.verbose,
            description="Computing catalog descriptors")

        # Compute all descriptors
        for path in iterator:
            # Read image
            img = utils.read_image(path, size=self.config.image_size)

            # Compute keypoints
            keypoints = utils.compute_keypoints(img,
                                                self.config.keypoint_stride,
                                                self.config.keypoint_sizes)

            # Compute descriptors
            descriptors = utils.compute_descriptors(
                img, keypoints, self.config.feature_descriptor)

            # Update descriptors list
            catalog_descriptors.append(descriptors)

        # Reshape descriptors list: flatten the per-image axis so every row
        # is one descriptor.
        catalog_descriptors = np.array(catalog_descriptors)
        catalog_descriptors = catalog_descriptors.reshape(
            -1, catalog_descriptors.shape[-1])

        return catalog_descriptors

    @staticmethod
    def _ensure_parent_folder(file_path):
        """Create the folder that will contain ``file_path`` if it is missing.

        A bare file name has no parent folder to create; passing the empty
        string to ``os.makedirs`` would raise.
        """
        folder = os.path.dirname(file_path)
        if folder and not os.path.exists(folder):
            os.makedirs(folder)

    def _save_matcher_index(self):
        self._ensure_parent_folder(self.config.matcher_path)
        if self.config.verbose:
            print("Saving Index...")
        self.matcher.save(self.config.matcher_path)

    def _load_matcher_index(self):
        if self.config.verbose:
            print("Loading Index...")
        self.matcher.load(self.config.matcher_path)

    def _save_fingerprint(self):
        self._ensure_parent_folder(self.config.fingerprint_path)
        with open(self.config.fingerprint_path, "wb") as pickle_file:
            pickle.dump(self.fingerprint, pickle_file)

    ##########################
    # Predict
    ##########################
    def predict(self, query_path: str) -> np.ndarray:
        """Method used to predict a score per class for a given query.

        Args:
            query_path (str): The local path of the query.

        Returns:
            np.ndarray: The list of scores per class.
        """
        # Read img
        query_img = utils.read_image(query_path, size=self.config.image_size)

        # Get keypoints
        query_keypoints = utils.compute_keypoints(query_img,
                                                  self.config.keypoint_stride,
                                                  self.config.keypoint_sizes)

        # Get descriptors
        query_descriptors = utils.compute_descriptors(
            query_img, query_keypoints, self.config.feature_descriptor)

        # Get scores
        scores = self._get_query_scores(query_descriptors)

        # To numpy
        scores = np.array(scores)

        return scores

    def predict_batch(self, query_paths: List[str]) -> np.ndarray:
        """Method used to predict a class for a batch of queries.

        Args:
            query_paths (List[str]): The list of all query paths.

        Returns:
            np.ndarray: The scores per class for each query.
        """
        # Init scores
        scores = []

        # Get iterator
        iterator = utils.get_iterator(query_paths,
                                      verbose=self.config.verbose,
                                      description="Prediction of all queries")

        # Loop over all queries
        for query_path in iterator:
            # Predict score of query
            query_scores = self.predict(query_path)

            # Update scores
            scores.append(query_scores)

        # To numpy
        scores = np.array(scores)

        return scores

    def _get_query_scores(self, query_descriptors):
        """Accumulate, per label, the scores of all matches of a query."""
        # Init scores variables
        scores = np.zeros((len(self.catalog_labels)))
        n_desc = query_descriptors.shape[0]

        # Compute matches
        train_idx, distances = self._compute_query_matches(query_descriptors)

        # Compute score matrix
        scores_matrix = self._compute_scores_matrix(distances)

        # Compute final scores
        for ind, nn_train_idx in enumerate(train_idx):
            for k, idx in enumerate(nn_train_idx):
                # Get image_path
                # NOTE(review): assumes every catalog image contributed
                # exactly n_desc descriptors, like the query -- confirm.
                image_path = self.catalog_images[int(idx // n_desc)]

                # Get image_label
                image_label = self.catalog_images2labels[image_path]

                # Get label_idx
                label_idx = self.catalog_labels.index(image_label)

                # Update score
                scores[label_idx] += scores_matrix[ind, k]

        return scores

    def _compute_query_matches(self, query_descriptors):
        """Return ``(ids, distances)`` arrays with one row per query descriptor."""
        # Init matches variables
        n_matches = query_descriptors.shape[0]
        train_idx = np.zeros((n_matches, self.config.k_nn))
        distances = np.zeros((n_matches, self.config.k_nn))

        # Compute matches
        for i, descriptor in enumerate(query_descriptors):
            idx, dist = self.matcher.get_nns_by_vector(descriptor,
                                                       self.config.k_nn,
                                                       include_distances=True)
            train_idx[i] = idx
            distances[i] = dist

        return train_idx, distances

    def _compute_scores_matrix(self, distances):
        """Dispatch on ``config.scoring``; anything but "count" scores by distance."""
        if self.config.scoring == "distance":
            return self._compute_scores_matrix_distance(distances)
        if self.config.scoring == "count":
            return self._compute_scores_matrix_count(distances)
        return self._compute_scores_matrix_distance(distances)

    def _compute_scores_matrix_distance(self, distances):
        return np.exp(-distances**2)

    def _compute_scores_matrix_count(self, distances):
        # The weight depends only on the neighbour rank k, not on the distance.
        scores_matrix = np.zeros(distances.shape)
        for k in range(self.config.k_nn):
            scores_matrix[:, k] = 1 - k / self.config.k_nn
        return scores_matrix

    ##########################
    # Utils
    ##########################
    def label_id2str(self, label_id: int) -> str:
        """Gets the label_str given the label_id.

        Args:
            label_id (int): The given label_id.

        Returns:
            str: The label_str of the given label_id.
        """
        return self.catalog_labels[label_id]

    def label_str2id(self, label_str: str) -> int:
        """Gets the label_id given the label_str.

        Args:
            label_str (str): The given label_str.

        Returns:
            int: The label_id of the given label_str, or -1 when unknown.
        """
        if label_str in self.catalog_labels:
            return self.catalog_labels.index(label_str)
        return -1
class Face: def __init__(self, app): self.storage = app.config["storage"] self.db = app.db self.faces = [] # storage all faces in caches array of face object self.known_encoding_faces = [] # faces data for recognition self.face_user_keys = {} self.load_all() def load_user_by_index_key(self, index_key=0): key_str = str(index_key) if key_str in self.face_user_keys: return self.face_user_keys[key_str] return None def load_train_file_by_name(self, name): trained_storage = path.join(self.storage, 'trained') return path.join(trained_storage, name) def load_unknown_file_by_name(self, name): unknown_storage = path.join(self.storage, 'unknown') unknown_storage_face = path.join(self.storage, 'unknown_face') return (path.join(unknown_storage, name), path.join(unknown_storage_face, name)) def load_all(self): results = self.db.select( 'SELECT faces.id, faces.user_id, faces.filename, faces.created FROM faces' ) self.layer_size = 0 count = 0 for row in results: user_id = row[1] filename = row[2] face = { "id": row[0], "user_id": user_id, "filename": filename, "created": row[3] } self.faces.append(face) face_image = face_recognition_api.load_image_file( self.load_train_file_by_name(filename)) face_image_encoding = face_recognition_api.face_encodings( face_image)[0] index_key = len(self.known_encoding_faces) self.known_encoding_faces.append(face_image_encoding) index_key_string = str(index_key) self.face_user_keys['{0}'.format(index_key_string)] = user_id print('user_id', user_id) if count == 0: self.layer_size = len(face_image_encoding) self.tree = AnnoyIndex(self.layer_size, metric) # prepare index self.tree.add_item(user_id, face_image_encoding) count += 1 print 'building index...\n' if self.layer_size > 0: print 'layer_size=', self.layer_size self.tree.build(ntrees) self.tree.save('index.ann') def recognize(self, unknown_filename): tree = loadannoy() (unfile, unfile_face) = self.load_unknown_file_by_name(unknown_filename) unknown_image = 
face_recognition_api.load_image_file(unfile) unknown_encoding_image = face_recognition_api.face_encodings( unknown_image)[0] #results = face_recognition.compare_faces(self.known_encoding_faces, unknown_encoding_image); results2 = find_matching_id(unknown_encoding_image, tree) guess_age = age_predict.predict([unfile_face]) guess_gender = gender_predict.predict([unfile_face]) #print("results", results) print("results2", results2) if results2: matching_id, min_dist = results2 user_id = matching_id #self.load_user_by_index_key(matching_id) return (user_id, guess_age, guess_gender) return ('unknown', guess_age, guess_gender) '''
class MLSAT_ANNSet(Dataset):
    """Dataset whose items are small star graphs: a sample plus its K
    approximate nearest neighbours found with Annoy (euclidean metric).

    Subclasses implement ``load`` and are expected to provide ``X``, ``Y``,
    ``dim``, ``shape`` and ``annpth`` (all read below).
    """

    @RedirectWrapper(target_cli=CLI)
    def __init__(self, K=15, train=False):
        """Load the data, load or build the Annoy index, cache neighbours."""
        self.load(train=train)

        # Run Approximate NN search
        self.annoy = AnnoyIndex(self.dim, 'euclidean')  # 'angular' ?
        if not os.path.exists(self.annpth):
            print("Creating ANN indices ...")
            self.X = self.X.view(-1, self.dim)  # 28 * 28 = 784
            for item_id, sample in enumerate(self.X):
                self.annoy.add_item(item_id, sample)
            self.annoy.build(128)
            self.annoy.save(self.annpth)
            print("ANN index complete")
        else:
            self.annoy.load(self.annpth)
            print("Loaded ANN indices from %s" % self.annpth)

        self.size = len(self.X)
        self.X = self.X.view(-1, *self.shape).contiguous().numpy()
        self.K = K

        # Row i holds the K neighbours of item i; the first hit returned by
        # Annoy is dropped.
        neighbour_table = np.zeros((len(self.Y), self.K)).astype(np.int32)
        for row in range(len(self.Y)):
            hits = self.annoy.get_nns_by_item(row, self.K + 1)
            neighbour_table[row] = hits[1:]
        self.ANNIdx = neighbour_table

    def load(self, train=False):
        """Populate the dataset attributes; must be provided by subclasses."""
        raise NotImplementedError

    def __getitem__(self, idx):
        """Return a ``Data`` object for sample ``idx`` and its neighbours."""
        # No GCN now
        neighbors = self.ANNIdx[idx]

        # Edges from local node 0 (the centre) to local nodes 1..K.
        sources = [0] * self.K
        targets = list(range(1, self.K + 1))
        edge_index = torch.tensor([sources, targets], dtype=torch.long)

        # 1-hop neighbor
        features = [self.X[idx]]
        features += [self.X[i] for i in neighbors]
        x = torch.tensor(features, dtype=torch.float)

        sid = [i if i <= self.K else (self.K + 1) for i in range(len(x))]
        scatter_idx = torch.tensor(sid, dtype=torch.long)

        raw_edges = torch.tensor([[idx, i] for i in neighbors],
                                 dtype=torch.long)
        center = torch.tensor([self.X[idx]], dtype=torch.float)
        return Data(x=x,
                    edge_index=edge_index,
                    scatter_idx=scatter_idx,
                    raw_edges=raw_edges,
                    center=center)

    def generateGraph(self, path):
        """Write the K-NN graph of the whole dataset to ``path`` (GEXF)."""
        G = nx.Graph()
        n_items = len(self.X)
        # Nodes first (with their label), edges afterwards.
        for node in range(n_items):
            G.add_node(node, digit=self.Y[node].item())
        for node in range(n_items):
            for other in self.annoy.get_nns_by_item(node, self.K + 1)[1:]:
                G.add_edge(node, other)
        nx.write_gexf(G, path)

    def __len__(self):
        """Number of samples recorded when the dataset was constructed."""
        return self.size
class AnnoySearch(object):
    """Nearest-neighbour lookup by external id on top of an Annoy index.

    Vectors are read from ``input_file`` (one item per line: an id followed
    by space-separated floats) and/or a previously saved index is loaded from
    ``model_path`` together with the pickled id mappings in ``dict_path``.
    """

    def __init__(self,
                 input_file=None,
                 model_path=None,
                 dict_path=None,
                 vec_dim=128,
                 tree_num=10):
        """Store the configuration and load whatever inputs were given."""
        self.input_file = input_file
        self.model_path = model_path
        self.dict_path = dict_path
        self.vec_dim = vec_dim
        self.tree_num = tree_num
        self._vecs_train = []
        self._ids = []
        self._id_index = dict()  # external id -> Annoy item index
        self._index_id = dict()  # Annoy item index -> external id
        self._annoy_tree = None
        self.__load()

    def __load(self):
        """Read training vectors and/or a saved index with its mappings."""
        if self.input_file:
            with open(self.input_file, 'r') as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        # A blank line would register an empty id with an
                        # empty vector; skip it.
                        continue
                    arr = stripped.split(' ')
                    self._vecs_train.append([float(sub) for sub in arr[1:]])
                    self._ids.append(arr[0])

        if self.model_path and self.dict_path:
            self._annoy_tree = AnnoyIndex(self.vec_dim)
            self._annoy_tree.load(self.model_path)
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only load dictionaries written by save_tree().
            with open(self.dict_path, 'rb') as dict_file:
                dict_list = pickle.load(dict_file)
            self._id_index, self._index_id = dict_list

    def build_tree(self):
        """Build the Annoy index and id mappings from the loaded vectors."""
        self._annoy_tree = AnnoyIndex(self.vec_dim)
        for index, item_id in enumerate(self._ids):
            self._id_index[item_id] = index
            self._index_id[index] = item_id
        for index, vec_train in enumerate(self._vecs_train):
            self._annoy_tree.add_item(index, vec_train)
        self._annoy_tree.build(self.tree_num)

    def save_tree(self, model_path, dict_path):
        """Save the index to ``model_path`` and the mappings to ``dict_path``."""
        self._annoy_tree.save(model_path)
        with open(dict_path, 'wb') as dict_file:
            pickle.dump([self._id_index, self._index_id], dict_file)

    def find_nns_by_id(self,
                       id,
                       n_items=40,
                       search_k=-1,
                       include_distances=False):
        """Return the neighbours of ``id``.

        Returns:
            A list of ids, or of ``(id, distance)`` tuples when
            ``include_distances`` is true. ``None`` when no index or no id
            mapping is available.

        Raises:
            KeyError: if ``id`` is not in the id mapping.
        """
        index = self._id_index[id]
        if not (self._annoy_tree and self._id_index):
            return None

        res_found = self._annoy_tree.get_nns_by_item(
            index,
            n_items,
            search_k=search_k,
            include_distances=include_distances)
        if include_distances:
            return [(self._index_id[found], dist)
                    for found, dist in zip(res_found[0], res_found[1])]
        return [self._index_id[found] for found in res_found]

    def print_nns_by_file(self,
                          id_file,
                          n_items,
                          search_k=-1,
                          include_distances=False):
        """Print tab-separated neighbours for every id listed in ``id_file``.

        Each output line is ``id<TAB>neighbour`` or, with
        ``include_distances``, ``id<TAB>neighbour<TAB>distance``.
        """
        with open(id_file, 'r') as id_fh:
            lines = id_fh.readlines()

        for line in lines:
            item_id = line.strip()
            index = self._id_index[item_id]
            res_found = self._annoy_tree.get_nns_by_item(
                index,
                n_items,
                search_k=search_k,
                include_distances=include_distances)
            # print(<single string>) gives identical output on Python 2 and 3.
            if include_distances:
                for found, dist in zip(res_found[0], res_found[1]):
                    id_found = self._index_id[found]
                    print('%s\t%s\t%s' % (item_id, id_found, str(dist)))
            else:
                for found in res_found:
                    id_found = self._index_id[found]
                    print('%s\t%s' % (item_id, id_found))
if __name__ == '__main__':
    # Index the first 50000 word vectors with Annoy (angular metric), persist
    # the index and the id<->word tables, then dump every word's 10 nearest
    # neighbours as "word<TAB>json-list" lines.
    EMBEDDING_PATH = 'data/materials/zh.300.vec.gz'
    DEFAULT_KEYVEC = KeyedVectors.load_word2vec_format(EMBEDDING_PATH,
                                                       limit=50000)

    id2word = dict(enumerate(DEFAULT_KEYVEC.index2word))
    word2id = {word: i for i, word in id2word.items()}

    n_trees = 100
    emb_dim = 300
    ann_index = AnnoyIndex(emb_dim, metric='angular')
    for i, word in enumerate(DEFAULT_KEYVEC.index2word):
        ann_index.add_item(i, DEFAULT_KEYVEC.get_vector(word))
    ann_index.build(n_trees)

    ann_index.save('data/index/annoy.cosine.idx')
    pickle_dump(id2word, 'data/index/id2word.pkl')
    pickle_dump(word2id, 'data/index/word2id.pkl')

    with open('data/index/annoy.cosine.10neighbors.txt', 'w',
              encoding='utf-8') as wf:
        for i, word in enumerate(DEFAULT_KEYVEC.index2word):
            # Ask for 11 hits: the first one is the word itself, drop it.
            neighbor_ids = ann_index.get_nns_by_item(i, 11)[1:]
            neighbors = [id2word[neighbor_id] for neighbor_id in neighbor_ids]
            wf.write('%s\t%s\n' %
                     (id2word[i], json.dumps(neighbors, ensure_ascii=False)))
# Script: embed text from a wiki dump with the TF-Hub model loaded below and
# store the embedding in an Annoy index saved as 'wiki_articles.index'.
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')
print('fetched model.')
# NOTE(review): the Redis client is created but not used anywhere in this
# part of the file.
r = redis.Redis(host='127.0.0.1', port=6379)
# Dimension passed to AnnoyIndex; presumably the width of the model's
# embeddings -- TODO confirm.
D = 512
NUM_TREES = 10
ann = AnnoyIndex(D, metric='angular')
embedding_counter = 0  # only referenced by the commented-out code below
texts = []
with open('wiki/AA/wiki_00') as f:
    for line_index, line in enumerate(f):
        # print(line)
        embeddings = embed(line)
        print(embeddings)
        ann.add_item(line_index, embeddings[0])
        # The loop stops after the first line (line_index is 0 on the first
        # iteration), so at most one item is ever added to the index.
        if line_index == 0:
            texts.append(line)
            break
        # data.append(json.loads(line))
        # ann.add_item(embedding_counter, e)
        # embedding_counter += 1
# Re-embeds the collected texts; the result is not used before the script
# ends in this part of the file.
embeddings = embed(texts)
ann.build(NUM_TREES)
ann.save('wiki_articles.index')
def test_fail_save(self):
    """Saving an index to an empty path must raise IOError."""
    index = AnnoyIndex(40, 'angular')
    self.assertRaises(IOError, index.save, '')
def test_save_twice(self):
    """Saving the same index to the same file twice must not fail."""
    # Issue #100
    index = AnnoyIndex(10)
    for _ in range(2):
        index.save("t.ann")
for i in range(0,160): for j in range(0,320): feat=extract_features("https://gibs.earthdata.nasa.gov/wmts/epsg4326/best/MODIS_Terra_CorrectedReflectance_TrueColor/default/2005-08-29/250m/8/"+str(i)+"/"+str(j)+".jpg", model1) print("length",len(feat)) features.append(feat) print(str(i)+" "+str(j)) time.sleep(30) # Length of item vector that will be indexed t=AnnoyIndex(len(features[0])) for p in range(len(features)): feature = features[p] t.add_item(p, feature) t.build(40) # 40 trees t.save('hurricanes1.ann') except: print("Error Occurred, indexing") t=AnnoyIndex(features[0]) for p in range(len(features)): feature = features[p] t.add_item(p, feature) t.build(40) # 40 trees t.save('hurricanes1.ann')
class AnnoyIndexer(BaseChunkIndexer):
    """Chunk indexer backed by an Annoy index stored at ``data_path``."""

    def __init__(self,
                 num_dim: int,
                 data_path: str,
                 metric: str = 'angular',
                 n_trees=10,
                 *args,
                 **kwargs):
        """
        Args:
            num_dim: dimensionality of the indexed vectors.
            data_path: file the Annoy index is loaded from / saved to.
            metric: Annoy metric name passed to ``AnnoyIndex``.
            n_trees: number of trees used when the index is built.
        """
        super().__init__(*args, **kwargs)
        self.num_dim = num_dim
        self.data_path = data_path
        self.metric = metric
        self.n_trees = n_trees
        self._key_info_indexer = ListKeyIndexer()

    def post_init(self):
        """Create the Annoy index and try to load it from ``data_path``.

        Any failure to load is logged and leaves an empty index in place.
        """
        from annoy import AnnoyIndex
        self._index = AnnoyIndex(self.num_dim, self.metric)
        # True once the forest exists (built here or loaded from disk);
        # Annoy refuses to build an index a second time.
        self._is_built = False
        try:
            if not os.path.exists(self.data_path):
                raise FileNotFoundError('"data_path" is not exist')
            if os.path.isdir(self.data_path):
                raise IsADirectoryError(
                    '"data_path" must be a file path, not a directory')
            self._index.load(self.data_path)
            self._is_built = True
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # are not swallowed.
            self.logger.warning(
                'fail to load model from %s, will create an empty one' %
                self.data_path)

    def add(self, keys: List[Tuple[int, Any]], vectors: np.ndarray,
            weights: List[float], *args, **kwargs):
        """Append ``vectors`` to the index and record their keys/weights.

        Raises:
            ValueError: if the number of vectors and keys differ or the
                vectors are not float32.
        """
        last_idx = self._key_info_indexer.size

        if len(vectors) != len(keys):
            raise ValueError('vectors length should be equal to doc_ids')

        if vectors.dtype != np.float32:
            raise ValueError("vectors should be ndarray of float32")

        if getattr(self, '_is_built', False):
            # New items are only searchable after a rebuild, so drop the
            # existing forest; query() builds it again on demand.
            self._index.unbuild()
            self._is_built = False

        for idx, vec in enumerate(vectors):
            self._index.add_item(last_idx + idx, vec)

        self._key_info_indexer.add(keys, weights)

    def query(self, keys: 'np.ndarray', top_k: int, *args,
              **kwargs) -> List[List[Tuple]]:
        """Return, per query vector, ``(*chunk_info, score)`` tuples for its
        ``top_k`` nearest neighbours.

        Raises:
            ValueError: if ``keys`` is not float32.
        """
        # Build lazily and only once: calling build() on every query fails
        # on an index that is already built or was loaded from disk.
        if not getattr(self, '_is_built', False):
            self._index.build(self.n_trees)
            self._is_built = True

        if keys.dtype != np.float32:
            raise ValueError('vectors should be ndarray of float32')

        res = []
        for k in keys:
            ret, relevance_score = self._index.get_nns_by_vector(
                k, top_k, include_distances=True)
            relevance_score = self.normalize_score(relevance_score,
                                                   self.metric)
            chunk_info = self._key_info_indexer.query(ret)
            res.append([(*r, s) for r, s in zip(chunk_info, relevance_score)])
        return res

    def normalize_score(self, score: List[float], metrics: str, *args,
                        **kwargs) -> List[float]:
        """Map distances to scores in (0, 1]; a smaller distance scores higher.

        Returns ``None`` for a metric name that is not handled below.

        Raises:
            NotImplementedError: for the ``'dot'`` metric.
        """
        if metrics == 'angular':
            return list(map(lambda x: 1 / (1 + x), score))
        elif metrics == 'euclidean':
            import math
            return list(
                map(lambda x: 1 / (1 + math.sqrt(x) / self.num_dim), score))
        elif metrics == 'manhattan':
            return list(map(lambda x: 1 / (1 + x / self.num_dim), score))
        elif metrics == 'hamming':
            return list(map(lambda x: 1 / (1 + x), score))
        elif metrics == 'dot':
            raise NotImplementedError

    @property
    def size(self):
        """Number of items currently held by the Annoy index."""
        return self._index.get_n_items()

    def __getstate__(self):
        """Return the parent's state and save the Annoy index to ``data_path``."""
        d = super().__getstate__()
        self._index.save(self.data_path)
        return d
xrange except NameError: # Python 3 compat xrange = range n, f = 100000, 40 t = AnnoyIndex(f) for i in xrange(n): v = [] for z in xrange(f): v.append(random.gauss(0, 1)) t.add_item(i, v) t.build(2 * f) t.save('test.tree') limits = [10, 100, 1000, 10000] k = 10 prec_sum = {} prec_n = 1000 time_sum = {} for i in xrange(prec_n): j = random.randrange(0, n) print('finding nbs for', j) closest = set(t.get_nns_by_item(j, k, n)) for limit in limits: t0 = time.time() toplist = t.get_nns_by_item(j, k, limit)
def convert(input_file_path, output_file_path=None,
            precision=DEFAULT_PRECISION, subword=False,
            subword_start=DEFAULT_NGRAM_BEG,
            subword_end=DEFAULT_NGRAM_END,
            approx=False, approx_trees=None):
    """Convert a word-vector file into a ``.magnitude`` SQLite file.

    Args:
        input_file_path: path to a ``.txt``/``.vec`` (text) or ``.bin``
            (binary word2vec) file. Text files without a header line are
            treated as GloVe and converted with gensim first.
        output_file_path: destination ``.magnitude`` file. When ``None`` a
            path in the temp directory derived from the input's hash is
            used, and an existing valid file at that path is reused.
        precision: number of decimal digits kept; every unit-normalised
            component is stored as ``int(round(v * 10 ** precision))``.
        subword: truthy to also write the character n-gram table.
        subword_start: smallest n-gram size passed to ``char_ngrams``.
        subword_end: largest n-gram size passed to ``char_ngrams``.
        approx: truthy to also build and embed an Annoy index.
        approx_trees: number of Annoy trees; derived from the number of keys
            when ``None``.

    Returns:
        The path of the written (or reused) ``.magnitude`` file.
    """
    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.gettempdir(),
            fast_md5_file(input_file_path) + '.magnitude')
        if os.path.isfile(output_file_path):
            # Reuse a previous conversion if it is readable. Any failure
            # (corrupt file, missing table, no 'size' row) falls through to
            # a fresh conversion.
            try:
                conn = sqlite3.connect(output_file_path)
                try:
                    conn.cursor().execute(
                        "SELECT value FROM magnitude_format WHERE key='size'") \
                        .fetchall()[0][0]
                finally:
                    conn.close()
                return output_file_path  # File already exists and is functioning
            except Exception:
                pass

    # Check args
    input_is_text = input_file_path.endswith('.txt') or \
        input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    if not input_is_text and not input_is_binary:
        exit("The input file path must be .txt, .bin, or .vec")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path file path must be .magnitude")

    # Detect GloVE format and convert to word2vec if detected
    detected_GloVE = False
    if input_is_text:
        with io.open(input_file_path, mode="r", encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline()
                if not line:
                    # EOF before two non-empty lines were seen; without this
                    # check the loop would never terminate.
                    break
                line = line.strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
        if line1 is not None and line2 is not None:
            line1 = line1.replace('\t', ' ').split()
            line2 = line2.replace('\t', ' ').split()
            if len(line1) == len(line2):  # No header line present
                detected_GloVE = True

    if detected_GloVE:
        eprint("Detected GloVE format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.gettempdir(),
            os.path.basename(input_file_path) + '.txt')
        try:
            import gensim
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert GloVE files.")
        gensim.scripts.glove2word2vec.glove2word2vec(input_file_path,
                                                     temp_file_path)
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert binary files.")
        keyed_vectors = KeyedVectors.load_word2vec_format(
            input_file_path, binary=input_is_binary)
        number_of_keys = len(keyed_vectors.vectors)
        dimensions = len(keyed_vectors.vectors[0])
    else:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            """Yield ``(n_keys, dims)`` from the header, then ``(key, vector)``."""
            number_of_keys, dimensions = (None, None)
            first_line = True
            # ``with`` closes the file once the stream is exhausted.
            with io.open(input_file_path, mode="r", encoding="utf-8",
                         errors="ignore") as f:
                for line in f:
                    line_split = line.strip().replace('\t', ' ').split()
                    if len(line_split) == 0:
                        continue
                    if first_line:
                        first_line = False
                        number_of_keys = int(line_split[0])
                        dimensions = int(line_split[1])
                        yield (number_of_keys, dimensions)
                    else:
                        # A line holding exactly `dimensions` fields has an
                        # empty key.
                        empty_key = len(line_split) == dimensions
                        vec_floats = line_split if empty_key \
                            else line_split[1:]
                        key = "" if empty_key else line_split[0]
                        if len(vec_floats) > dimensions:
                            # The key itself contained whitespace: the
                            # surplus leading fields belong to the key.
                            key = " ".join(
                                [key] +
                                vec_floats[0:len(vec_floats) - dimensions])
                            vec_floats = vec_floats[len(vec_floats) -
                                                    dimensions:]
                        vector = np.asarray([float(elem)
                                             for elem in vec_floats])
                        yield (key, vector)

        keyed_vectors = KeyedVectors()
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Connect to magnitude datastore
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")
    conn = sqlite3.connect(output_file_path)
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")
    db = conn.cursor()

    # Make the database fast
    # isolation_level None: transactions are controlled manually with
    # BEGIN/COMMIT below.
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ +
               ",\n".join([("dim_%d INTEGER" % i)
                           for i in range(dimensions)]) +
               """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword`
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
        ",\n".join([("dim_%d" % i) for i in range(dimensions)]) \
        + """)
        VALUES (
            """ + \
        (",\n".join(["?"] * (dimensions + 1))) \
        + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        )
        VALUES (
            ?, ?
        );
    """
    # One histogram per dimension, used below to rank dimensions by entropy.
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = zip(keyed_vectors.index2word,
                               keyed_vectors.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        # Commit in batches of 100000 rows (the previous truthiness test
        # committed on almost every row).
        if i % 100000 == 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        vector = vector / np.linalg.norm(vector)
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query,
                   (key,) + tuple(int(round(v * (10 ** precision)))
                                  for v in vector))
        if subword:
            ngrams = set(
                (n.lower() for n in char_ngrams(BOW + key + EOW,
                                                subword_start,
                                                subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set(
                (n for n in ngrams
                 if not any([c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query,
                       (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Writing metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        )
        VALUES (
            ?, ?
        );
    """
    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indicies
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.gettempdir(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            # The file is opened in binary mode, so EOF is signalled by
            # b'' (a '' sentinel never matches on Python 3).
            for i, chunk in enumerate(
                    iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))

    return output_file_path
def start_extraction(self):
    """Process one exported node, then call itself for the next one.

    Each call takes the first node of the first content type from
    ``export/nodes_export_process.json`` (refilled from
    ``export/nodes_export.json`` when missing or empty), deletes the node via
    ``self.driver``, extracts information from each of its field texts, stores
    the result via ``self.driver`` and records failures in
    ``export/nodes_failed.json``. When the work file has been emptied the
    Annoy search index is rebuilt and manually added entities are re-attached.
    The function ends the process with ``exit()`` when there is no input left
    or a file cannot be read.
    """
    print('Start extraction')
    file_path = os.path.dirname(os.path.abspath(__file__))
    file_name_process = os.path.join(file_path,
                                     'export/nodes_export_process.json')
    text_arr = None

    # Nodes from nodes_export.json are worked on in a copy named
    # nodes_export_process.json. That way Drupal can keep exporting nodes
    # while nodes from nodes_export_process.json are still being processed,
    # and no two processes add to / remove from the same file at the same
    # time. Because this function calls itself recursively, every call checks
    # whether nodes_export_process.json still exists and whether its content
    # can be loaded with json.loads.
    if (os.path.isfile(file_name_process)):
        try:
            file = open(file_name_process, 'r', encoding="utf-8")
            data = file.read()
            file.close()
            if (data == ''):
                os.remove(file_name_process)
        except:
            self.add_log("Problem opening file " + file_name_process)
            self.add_log("exit Task")
            exit()

        try:
            text_arr = json.loads(data)
            # A length of 0 means all nodes have been processed and the file
            # can be deleted.
            if (len(text_arr) == 0):
                self.add_log('File ' + file_name_process +
                             ' is empty. Delete file.')
                os.remove(file_name_process)
                text_arr = None
        except:
            self.add_log("Cant convert data from " + file_name_process +
                         ' into json dict')

    # If no JSON has been loaded up to here, or nodes_export_process.json no
    # longer exists, try to open nodes_export.json instead.
    if (text_arr == None):
        file_name_default = os.path.join(file_path, 'export/nodes_export.json')
        try:
            file = open(file_name_default, 'r', encoding="utf-8")
            data = file.read()
            json_arr = json.loads(data)
            # If this file is empty because there are currently no nodes to
            # process, the application is terminated.
            # NOTE(review): exit() is called inside this try, so the bare
            # except below presumably catches the resulting SystemExit too
            # and logs/exits a second time -- confirm.
            if (len(json_arr) == 0):
                self.add_log("No input for processing in " + file_name_default)
                self.add_log("exit Task")
                exit()
        except:
            self.add_log("No input for processing in " + file_name_default)
            self.add_log("exit Task")
            exit()

        try:
            # If there are nodes in the file, copy it to
            # nodes_export_process.json, which the next recursive call
            # continues to work on.
            shutil.copy2(file_name_default, file_name_process)
            self.add_log("Copy file " + file_name_default + " to " +
                         file_name_process)
            # The file has been copied, so the source file can now be
            # emptied and saved.
            file = open(file_name_default, "w")
            file.write("{}")
            file.close()
            # Read the content of the new file and keep it in data.
            file = open(file_name_process, 'r', encoding="utf-8")
            data = file.read()
            file.close()
        except:
            self.add_log("Problem opening file " + file_name_process)
            self.add_log("exit Task")
            exit()

        # Try to load the JSON. If this fails the whole application is
        # terminated because there is no data to process.
        try:
            text_arr = json.loads(data)
        except:
            self.add_log("Cant convert data from " + file_name_process +
                         ' into json dict')
            self.add_log("exit Task")
            exit()

    # Failed nodes are stored in a file. Load that file first so that further
    # failed nodes can be appended to it.
    failed_nodes_name = os.path.join(file_path, 'export/nodes_failed.json')
    failed_nodes_arr = None
    if (os.path.isfile(failed_nodes_name)):
        f = open(failed_nodes_name, "r", encoding="utf-8")
        data = f.read()
        f.close()
        if (data == ''):
            failed_nodes_arr = {}
        else:
            try:
                failed_nodes_arr = json.loads(data)
            except:
                failed_nodes_arr = {}
    else:
        failed_nodes_arr = {}

    # Because of the recursive call, one node is processed per call. Take the
    # first content type from the dict, then the first node id of that
    # content type and the values that belong to it.
    content_type = next(iter(text_arr))
    content_type_values = next(iter(text_arr.values()))
    node_id = next(iter(content_type_values))
    node_values = next(iter(content_type_values.values()))
    title = node_values['title']
    created = node_values['created']
    changed = node_values['changed']
    self.add_log("Remaining Nodes: " + str(len(content_type_values.keys())))
    self.add_log("Node ID: " + node_id + "; Title: " + title)
    print("Remaining Nodes: " + str(len(content_type_values.keys())))
    print(node_id)

    # Try to delete the existing node in Neo4j. Entities and synonyms are not
    # deleted, only the root node, content fields, sentences, tags and the
    # relations between them. When a node is re-indexed and may have changed,
    # it is simpler to remove the whole tree spanned by the node, its content
    # fields, sentences etc. once.
    # Up to `tries` attempts; only a ServiceUnavailable error is retried, any
    # other error (or the last failed attempt) is re-raised.
    tries = 3
    for i in range(tries):
        try:
            self.driver.del_node(node_id)
        except Exception as e:
            if (type(e).__name__ == "ServiceUnavailable" and i < tries - 1):
                self.add_log(str(e))
                self.add_log("Retry")
                continue
            else:
                raise
        break

    # Iterate over all fields of the current node, try to extract the
    # information with CoreNLP and then store it in Neo4j.
    for field, content in node_values['fields'].items():
        # Some fields have several contents (for the field Siblings, for
        # example, every listed sibling is a separate content) and Drupal
        # returns an array with the individual contents for every field.
        for text in content:
            # Try to extract the information. Inserting into Neo4j only makes
            # sense if this step succeeded. Otherwise the node with its
            # values is added to nodes_failed.
            extract_success = False
            try:
                extract_dict = self.extractInformations(text)
                extract_success = True
            except RuntimeError as e:
                if (content_type not in failed_nodes_arr):
                    failed_nodes_arr[content_type] = {}
                failed_nodes_arr[content_type][node_id] = node_values
                self.add_log(
                    "Problem occured during extraction. Maybe restart stanford core nlp. Message: "
                    + str(e))
                print('runtimereror')
            except Exception as e:
                if (content_type not in failed_nodes_arr):
                    failed_nodes_arr[content_type] = {}
                failed_nodes_arr[content_type][node_id] = node_values
                self.add_log(
                    "Problem occured during extraction. Maybe restart stanford core nlp. Message: "
                    + str(e))
                print('generic error')

            # If the extraction succeeded, the result is stored in Neo4j.
            # Here too: if storing is not possible, the node with its values
            # is added to nodes_failed.
            if (extract_success):
                tries = 3
                for i in range(tries):
                    try:
                        self.add_log("Insert field " + field +
                                     " with content in database")
                        print(
                            self.driver.create_root_node(
                                extract_dict, node_id, content_type, field,
                                title, created, changed).data())
                        #self.driver.create_root_node(extract_dict, node_id, content_type, field, title, created, changed)
                    except Exception as e:
                        if (type(e).__name__ == "ServiceUnavailable"
                                and i < tries - 1):
                            self.add_log(str(e))
                            self.add_log("Retry")
                            continue
                        else:
                            if (content_type not in failed_nodes_arr):
                                failed_nodes_arr[content_type] = {}
                            failed_nodes_arr[content_type][
                                node_id] = node_values
                            self.add_log(
                                "Problem occured during save. Maybe restart neo4j service. Message: "
                                + str(e))
                    break

    # Whether or not extraction and storing succeeded, the node and its values
    # are removed from the dict afterwards so that the next recursive call can
    # process the next node.
    del (text_arr[content_type][node_id])
    # If the content type has no nodes left, remove it as well so that the
    # next recursive call can continue with the next content type and its
    # nodes.
    if (len(text_arr[content_type].keys()) == 0):
        del (text_arr[content_type])

    # Save the dict of content types and nodes again.
    file = open(file_name_process, "w", encoding="utf-8")
    file.write(json.dumps(text_arr))
    file.close()

    # Save the dict of failed nodes as well.
    file = open(failed_nodes_name, "w", encoding="utf-8")
    file.write(json.dumps(failed_nodes_arr))
    file.close()

    # A length of 0 means all content types and nodes have been processed.
    # Then rebuild the search index and attach manually added entities to the
    # trees in the database.
    if (len(text_arr) == 0):
        # The search index makes it easier to search sentences and clauses
        # for semantic similarity. Loading all sentences from the database
        # and checking each for semantic similarity takes more than 10
        # seconds per call because there are thousands of sentences. With the
        # search index, the sentence vectors are stored so that similar
        # sentences can be found by nearest neighbor search, which brings the
        # time down to milliseconds. The search index cannot be updated,
        # however, and therefore has to be recreated every time.
        result = self.driver.get_all_sent_clauses()
        if (len(result) > 0):
            self.add_log('Creating search index')
            ann = AnnoyIndex(300)
            for res in result:
                nlp_res = self.nlp(res['shorten_original'].lower())
                # The sentence vector is stored under the sentence id.
                ann.add_item(int(res['sen_id']), nlp_res.vector)
                counter = 0
                for clause in res['shorten_clauses']:
                    # clause[0] is the text that is vectorised, clause[1] the
                    # integer used as the Annoy item id.
                    clause_count = clause[1]
                    nlp_clause = self.nlp(clause[0].lower())
                    ann.add_item(int(clause_count), nlp_clause.vector)
                    counter += 1
            ann.build(10)
            ann.save('search_index.ann')

        # The manually added entities come from Drupal and were written to a
        # file that this application can process. Because re-indexing deletes
        # the node and its sub-nodes from Neo4j, the connection to manually
        # added entities is lost; afterwards we check in which sentences of
        # the nodes the entities occur and link them to those sentences.
        self.add_log('Adding manual created nodes')
        manually_entities = None
        try:
            changed_entities = os.path.join(file_path, 'changed_entities.json')
            file = open(changed_entities, 'r', encoding="utf-8")
            data = file.read()
            changed_entities = json.loads(data)
            manually_entities = changed_entities['added_entities']
        except:
            # Best effort: a missing or unreadable file just means there are
            # no manual entities to add.
            pass

        if (manually_entities != None):
            for ent in manually_entities:
                self.driver.add_entity(ent, manually_entities[ent])
            print('manuelaly nodes')

    # Recursive call of the function
    self.start_extraction()
import io
import json

import numpy as np
import rocksdb
from annoy import AnnoyIndex

# Open the two existing RocksDB stores (create_if_missing=False, so they are
# not created here).
db = rocksdb.DB("fastText.db", rocksdb.Options(create_if_missing=False))
emojiDB = rocksdb.DB("emojiFastText.db",
                     rocksdb.Options(create_if_missing=False))

# emojiData.json is iterated as a sequence of records with an "emoji" key.
with io.open('emojiData.json', encoding='utf8') as emoji_file:
    data = json.load(emoji_file)

# Index dimensionality passed to AnnoyIndex.
f = 300
t = AnnoyIndex(f, metric='angular')

# Item i of the index is the vector stored in emojiDB under the i-th emoji.
for i, e in enumerate(data):
    t.add_item(i, np.frombuffer(emojiDB.get(e["emoji"].encode())))

t.build(100)
t.save('emojis.ann')
class EditVectorCombinedDistanceSolver(VectorDistanceSolver):
    """
    A simple baseline model for doing OOV translation that takes the
    translation of an OOV word to be the translation of the in-vocabulary
    word with the highest interpolation of vector similarity + edit
    similarity.

    We find the word with the highest similarity in the source vocabulary,
    and pick its most likely translation (according to the t-table) as our
    predicted translation. We take advantage of the FastText package from
    Facebook to easily generate vectors for unknown words.
    """

    @overrides
    def __init__(self):
        super(EditVectorCombinedDistanceSolver, self).__init__()
        # We don't use self.foreign_vectors, delete to avoid bugs
        del self.foreign_vectors
        self.int_to_foreign = None
        self.annoy_index = None
        self.annoy_index_path = None
        self.vector_dim = None

    @overrides
    def get_state_dict(self):
        """Return the picklable state needed to restore this solver."""
        return {
            "solver_class": self.__class__,
            "solver_init_params": self.solver_init_params,
            "fasttext_model_path": self.fasttext_model_path,
            "foreign_to_english": self.foreign_to_english,
            "int_to_foreign": self.int_to_foreign,
            "annoy_index_path": self.annoy_index_path,
            "vector_dim": self.vector_dim
        }

    @overrides
    def load_from_state_dict(self, state_dict):
        """Restore the solver from ``state_dict`` and load its annoy index.

        Returns ``self`` so calls can be chained.
        """
        self.fasttext_model_path = state_dict["fasttext_model_path"]
        self.foreign_to_english = state_dict["foreign_to_english"]
        self.int_to_foreign = state_dict["int_to_foreign"]
        self.annoy_index_path = state_dict["annoy_index_path"]
        self.vector_dim = state_dict["vector_dim"]
        self.annoy_index = AnnoyIndex(self.vector_dim)
        self.annoy_index.load(self.annoy_index_path)
        self.was_loaded = True
        return self

    @overrides
    def save_to_file(self, save_dir, run_id):
        """Copy the fastText model, save the annoy index and dump the state.

        Side effect: ``self.fasttext_model_path`` and
        ``self.annoy_index_path`` are repointed at the copies in
        ``save_dir``.
        """
        save_path = os.path.join(save_dir, run_id + "_model.pkl")
        # Move the fastText model we used to the save path
        logger.info("Copying fastText model from {} to "
                    "save dir at {}".format(self.fasttext_model_path,
                                            save_dir))
        shutil.copy(self.fasttext_model_path, save_dir)
        # Now edit the model path to point to file we wrote
        self.fasttext_model_path = os.path.join(
            save_dir, os.path.basename(self.fasttext_model_path))

        # Save the annoy index to the save path
        logger.info("Saving annoy index to save dir at {}".format(save_dir))
        self.annoy_index_path = os.path.join(save_dir,
                                             run_id + "_annoy_index.ann")
        self.annoy_index.save(self.annoy_index_path)

        state_dict = self.get_state_dict()
        torch.save(state_dict, save_path, pickle_module=dill)

    @overrides
    def train_model(self, foreign_vectors, foreign_to_english,
                    num_trees=500, log_dir=None, save_dir=None, run_id=None):
        """Build the annoy index over the tokens of ``foreign_to_english``.

        ``foreign_vectors`` is extended in place with FastText vectors for
        dictionary tokens it does not yet cover. ``log_dir`` is accepted
        but not used here. The model is saved only when both ``save_dir``
        and ``run_id`` are given.

        Raises ValueError when called on a loaded model, or when no vector
        is available to determine the index dimensionality.
        """
        if self.was_loaded:
            raise ValueError(
                "EditVectorCombinedDistanceSolver does not support "
                "training from a saved model.")
        # This model has no parameters to optimize
        self.foreign_to_english = foreign_to_english

        # Use FastText to generate vectors for tokens in the
        # foreign_to_english dictionary that aren't in foreign_vectors.
        logger.info("Using FastText to make vectors for tokens that are in "
                    "our foreign to english dictionary, but not in the set "
                    "of pretrained vectors.")
        uncovered_foreign_tokens = [
            tok for tok in self.foreign_to_english
            if tok not in foreign_vectors
        ]
        uncovered_tokens_to_vectors = generate_fasttext_vectors_from_list(
            fasttext_binary_path=self.fasttext_bin_path,
            fasttext_model_path=self.fasttext_model_path,
            input_words=uncovered_foreign_tokens)

        # Add these vectors to the foreign_vectors dict
        for token, vector in uncovered_tokens_to_vectors.items():
            if self.vector_dim is None:
                self.vector_dim = len(vector)
            else:
                assert self.vector_dim == len(vector)
            foreign_vectors[token] = vector

        # Prune the foreign_vectors_dict until the set of foreign tokens in
        # our foreign to english dict is the same as the set of vectors we
        # have
        pruned_foreign_vectors_dict = {
            k: v
            for k, v in foreign_vectors.items()
            if k in self.foreign_to_english
        }

        # When every dictionary token already had a pretrained vector, the
        # loop above never ran, so the dimensionality must be taken from
        # the pretrained vectors instead of being left as None.
        if self.vector_dim is None:
            if not pruned_foreign_vectors_dict:
                raise ValueError(
                    "Cannot build an annoy index: no vectors are available "
                    "for the tokens in foreign_to_english.")
            self.vector_dim = len(
                next(iter(pruned_foreign_vectors_dict.values())))

        self.int_to_foreign = dict(
            enumerate(pruned_foreign_vectors_dict.keys()))

        # Build the annoy index
        logger.info("Building annoy index with {} trees".format(num_trees))
        self.annoy_index = AnnoyIndex(self.vector_dim)
        for index, foreign in self.int_to_foreign.items():
            # If we don't have translations for a foreign word, we don't
            # want to propose that as the source for a translation.
            if foreign not in self.foreign_to_english:
                continue
            self.annoy_index.add_item(index, foreign_vectors[foreign])
        self.annoy_index.build(num_trees)
        assert self.annoy_index.get_n_items() == len(self.foreign_to_english)

        if save_dir is not None and run_id is not None:
            logger.info("Saving trained model to save dir {} with run "
                        "id {}".format(save_dir, run_id))
            self.save_to_file(save_dir=save_dir, run_id=run_id)

    @overrides
    def translate_list(self, oov_list, show_progbar=True, n_jobs=1,
                       debug=False):
        """Predict one English translation per token of ``oov_list``.

        For each OOV token the nearest neighbours in vector space are
        fetched from the annoy index, the candidate with the best edit
        similarity is chosen, and its highest-scoring translation in
        ``foreign_to_english`` is returned. ``debug`` is accepted but not
        used here.
        """
        # Get vectors for all of the uncovered_oovs
        oov_vectors = generate_fasttext_vectors_from_list(
            fasttext_binary_path=self.fasttext_bin_path,
            fasttext_model_path=self.fasttext_model_path,
            input_words=oov_list)

        oov_token_candidates_list = []
        num_to_pick = int(math.ceil(0.2 * len(self.foreign_to_english)))
        logger.info("Using annoy to find top {} nearest "
                    "neighbors for each token".format(num_to_pick))
        # Use Annoy to find the top nearest neighbors for each oov token.
        for oov_token in oov_list:
            oov_vector = oov_vectors[oov_token]
            # Find the top 20% of nearest neighbors (in vector space) with
            # the oov token's vector. This tries to find words that are
            # semantically similar.
            nn_indices = self.annoy_index.get_nns_by_vector(
                oov_vector, num_to_pick, search_k=-1,
                include_distances=False)
            # Get the foreign words corresponding to the found nearest
            # neighbors. These are the candidates we will use in the edit
            # distance translation
            candidate_foreigns = [
                self.int_to_foreign[index] for index in nn_indices
            ]
            oov_token_candidates_list.append((oov_token, candidate_foreigns))

        if n_jobs > 1:
            # Since we can't pickle self.annoy_index, set it to a local
            # variable and then delete it.
            annoy_index = self.annoy_index
            del self.annoy_index
            pool = None
            try:
                logger.info("Translating with {} processes".format(n_jobs))
                pool = multiprocessing.Pool(processes=n_jobs)
                if six.PY2:
                    # Create a multiprocess pool with the
                    # _get_nearest_neighbor alias. This is not used in
                    # python 3 because there's overhead in passing the
                    # object back and forth.
                    _bound_get_nearest_neighbor_mp_alias = functools.partial(
                        _get_nearest_neighbor_mp_alias, self)
                    closest_source_tokens = pool.map(
                        _bound_get_nearest_neighbor_mp_alias,
                        oov_token_candidates_list)
                else:
                    closest_source_tokens = pool.map(
                        self._get_nearest_neighbor,
                        oov_token_candidates_list)
            finally:
                # Always release the workers and restore
                # self.annoy_index, even when a worker raised.
                if pool is not None:
                    pool.close()
                    pool.join()
                self.annoy_index = annoy_index
        else:
            if show_progbar:
                oov_iterable = tqdm(oov_token_candidates_list)
            else:
                oov_iterable = oov_token_candidates_list
            closest_source_tokens = [
                self._get_nearest_neighbor(oov_token_vector_tuple)
                for oov_token_vector_tuple in oov_iterable
            ]

        predicted_translations = []
        for source_token in closest_source_tokens:
            english_translations = self.foreign_to_english[source_token]
            predicted_translation = max(english_translations.keys(),
                                        key=lambda k: english_translations[k])
            predicted_translations.append(predicted_translation)
        return predicted_translations

    def _get_nearest_neighbor(self, oov_token_candidates_tuple):
        """
        Given a single OOV token, find the candidate source word that is
        most similar to it by edit similarity.

        Parameters
        ----------
        oov_token_candidates_tuple: tuple of (str, List[str])
            Tuple of (oov_token, candidates). The oov token is the string
            to predict a translation for. Candidates are the words we can
            choose among as potential source words for translation.

        Raises
        ------
        ValueError
            If the candidate list is empty.
        """
        oov_token, foreign_candidates = oov_token_candidates_tuple
        if not foreign_candidates:
            raise ValueError(
                "No candidate source words for OOV token "
                "{!r}".format(oov_token))

        # Out of the candidates, pick the one with the highest
        # edit similarity.
        def calculate_edit_similarity_with_input_oov(x):
            shorter = min(len(x), len(oov_token))
            if shorter == 0:
                # Nothing to compare against; also avoids dividing by zero.
                return 0
            longer = max(len(x), len(oov_token))
            longest_common_prefix_len = len(
                os.path.commonprefix([x, oov_token]))
            edit_distance = int(editdistance.eval(x, oov_token))
            # float() keeps true division on Python 2, where int / int
            # would truncate both ratios to 0 or 1.
            return (0.75 * (1 - float(edit_distance) / longer) +
                    0.25 * (float(longest_common_prefix_len) / shorter))

        return max(foreign_candidates,
                   key=calculate_edit_similarity_with_input_oov)
class AnnoyIndexer(BaseVectorIndexer):
    """Vector indexer backed by an Annoy index stored under ``data_path``."""

    lock_work_dir = True

    def __init__(self, num_dim: int, data_path: str, metric: str = 'angular',
                 n_trees=10, *args, **kwargs):
        """Record the index configuration; the index is made in post_init.

        :param num_dim: dimensionality passed to ``AnnoyIndex``
        :param data_path: working directory that holds the index file
        :param metric: metric name passed to ``AnnoyIndex``
        :param n_trees: number of trees passed to ``build``
        """
        super().__init__(*args, **kwargs)
        self.num_dim = num_dim
        self.work_dir = data_path
        self.indexer_file_path = os.path.join(self.work_dir,
                                              self.internal_index_path)
        self.metric = metric
        self.n_trees = n_trees
        self._key_info_indexer = ListKeyIndexer()

    def post_init(self):
        """Create the Annoy index and try to load a previously saved one."""
        from annoy import AnnoyIndex
        self._index = AnnoyIndex(self.num_dim, self.metric)
        try:
            self._index.load(self.indexer_file_path)
            # A loaded index is already built and must not be built again.
            self._is_built = True
        except Exception:
            self._is_built = False
            self.logger.warning(
                'fail to load model from %s, will create an empty one' %
                self.indexer_file_path)

    def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray,
            weights: List[float], *args, **kwargs):
        """Append ``vectors`` to the index and register their keys/weights.

        :raises ValueError: if the lengths differ or dtype is not float32
        """
        last_idx = self._key_info_indexer.size
        if len(vectors) != len(keys):
            raise ValueError('vectors length should be equal to doc_ids')
        if vectors.dtype != np.float32:
            raise ValueError("vectors should be ndarray of float32")
        for idx, vec in enumerate(vectors):
            self._index.add_item(last_idx + idx, vec)
        self._key_info_indexer.add(keys, weights)

    def query(self, keys: 'np.ndarray', top_k: int, *args,
              **kwargs) -> List[List[Tuple]]:
        """Return, per query vector, the key info of its ``top_k`` neighbours.

        Each result tuple is the key info followed by the negated distance.

        :raises ValueError: if ``keys`` is not float32
        """
        if keys.dtype != np.float32:
            raise ValueError('vectors should be ndarray of float32')
        # Build lazily and only once: Annoy refuses to build an index that
        # is already built or was loaded from disk.
        if not getattr(self, '_is_built', False):
            self._index.build(self.n_trees)
            self._is_built = True
        res = []
        for k in keys:
            ret, relevance_score = self._index.get_nns_by_vector(
                k, top_k, include_distances=True)
            chunk_info = self._key_info_indexer.query(ret)
            res.append([(*r, -s) for r, s in zip(chunk_info, relevance_score)])
        return res

    @property
    def size(self):
        """Number of items currently held by the Annoy index."""
        return self._index.get_n_items()

    def __getstate__(self):
        """Return the parent state; the index is saved to disk as a side effect."""
        d = super().__getstate__()
        self._index.save(self.indexer_file_path)
        return d
class realtimeTrain:
    """Loads every stored face, encodes it and builds an Annoy index."""

    def __init__(self):
        self.storage = path.join(getcwd(), 'storage')
        self.db = Database()
        self.faces = []  # all face records, cached as dicts
        self.known_encoding_faces = []  # face encodings used for recognition
        self.face_user_keys = {}  # str(position in known_encoding_faces) -> user_id
        self.load_all()

    def load_user_by_index_key(self, index_key=0):
        """Return the user id registered for ``index_key``, or None."""
        return self.face_user_keys.get(str(index_key))

    def load_train_file_by_name(self, name):
        """Return the path of ``name`` inside the 'trained' storage folder."""
        return path.join(self.storage, 'trained', name)

    def load_unknown_file_by_name(self, name):
        """Return the path of ``name`` inside the 'unknown' storage folder."""
        return path.join(self.storage, 'unknown', name)

    def load_all(self):
        """Read all faces from the database and build/save 'index.ann'.

        Images in which no face encoding is found are reported and skipped
        instead of aborting the whole load with an IndexError.
        """
        results = self.db.select(
            'SELECT faces.id, faces.user_id, faces.filename, faces.created '
            'FROM faces')
        self.layer_size = 0
        # Created lazily from the first encoding, whose length fixes the
        # index dimensionality; stays None when there is nothing to index.
        self.tree = None
        for row in results:
            user_id = row[1]
            filename = row[2]
            print('train::', user_id)
            face = {
                "id": row[0],
                "user_id": user_id,
                "filename": filename,
                "created": row[3]
            }
            self.faces.append(face)

            face_image = face_recognition_api.load_image_file(
                self.load_train_file_by_name(filename))
            encodings = face_recognition_api.face_encodings(face_image)
            if len(encodings) == 0:
                print('train:: no face found in %s, skipped' % filename)
                continue
            face_image_encoding = encodings[0]

            index_key = len(self.known_encoding_faces)
            self.known_encoding_faces.append(face_image_encoding)
            self.face_user_keys[str(index_key)] = user_id

            if self.tree is None:
                self.layer_size = len(face_image_encoding)
                self.tree = AnnoyIndex(self.layer_size, metric)  # prepare index
            # NOTE(review): the item id is user_id, while face_user_keys is
            # keyed by index_key; several faces of one user overwrite each
            # other in the index. Confirm which id the query side expects.
            self.tree.add_item(user_id, face_image_encoding)

        print('building index...\n')
        if self.layer_size > 0:
            print('layer_size= %s' % self.layer_size)
            self.tree.build(ntrees)
            self.tree.save('index.ann')
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from annoy import AnnoyIndex

# img_dir_path = 'dataset/All/'
img_dir_path = 'dataDrivenArt/bin/data/images/'
annoy_model_path = 'model/x-fresh-fc2.ann'
annoy_dim = 4096

# Images are named 1.jpg ... 3987.jpg; the image number is the item id.
first_image_id = 1
last_image_id = 3987
# NOTE(review): the tree count equals the image count, which looks
# accidental; kept unchanged so the produced index stays the same.
n_trees = 3987

# Use the output of VGG16's 'fc2' layer as the image feature.
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer('fc2').output)

annoy_model = AnnoyIndex(annoy_dim)
for i in range(first_image_id, last_image_id + 1):
    img_path = img_dir_path + str(i) + '.jpg'
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch axis expected by predict
    x = preprocess_input(x)
    fc2_features = model.predict(x)
    annoy_model.add_item(i, fc2_features[0])
    print(img_path, 'saved')

annoy_model.build(n_trees)
annoy_model.save(annoy_model_path)
class QA_process():
    """Collects question/answer pairs for a theme and answers by similarity.

    Questions are encoded with a BERT service and stored in an Annoy index;
    ``qa_dict`` maps each question text to its full QA record, in the same
    order as the items of the index.
    """

    # Dimensionality used for the sentence vectors in the Annoy index.
    VECTOR_DIM = 768

    def __init__(self):
        self.baiduzhidao = Baiduzhidao_spider()
        with open('./mod/zhishi_entity.bin', 'rb') as load_file:
            self.zhishi_entity = pickle.load(load_file)
        self.bc = BertClient(ip='192.168.1.101', ignore_all_checks=True)
        self.annoyIndex = AnnoyIndex(self.VECTOR_DIM)
        self.annoyIndex.load('./mod/qa_index.mod')
        with open('./mod/qs_dict.bin', 'rb') as load_file:
            self.qa_dict = pickle.load(load_file)

    def getZhishi(self, entity):
        """Return the infobox of ``entity``, from cache or from zhishi.me.

        A freshly fetched infobox is added to the cache and the cache file
        is rewritten. Returns an empty dict when the lookup fails.
        """
        if self.zhishi_entity.get(entity):
            logging.info('find %s from zhishi_entity' % entity)
            return self.zhishi_entity.get(entity)
        url = 'http://zhishi.me/api/entity/%s?property=infobox'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Accept':
            'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8',
            'Accept-Language': 'zh-CN, zh;q = 0.9'
        }
        try:
            wb_data = requests.get(url % entity, headers=headers,
                                   allow_redirects=True)
            wb_data.encoding = 'utf-8'
            content = wb_data.json()
            logging.info('request ' + str(content))
            result = jsonpath.jsonpath(content, "$..'infobox'")[0]
            self.zhishi_entity[entity] = result
            with open('./mod/zhishi_entity.bin', 'wb') as fou:
                pickle.dump(self.zhishi_entity, fou)
            return result
        except Exception:
            # Best effort: any request/parse/write failure yields no
            # properties (KeyboardInterrupt/SystemExit are not swallowed).
            logging.error('未成功取得' + entity + '属性')
            return {}

    def getAllQA(self, theme):
        """Crawl QA pairs for ``theme`` and each of its properties.

        The collected pairs are pickled to './mod/qa_pairs.bin'.
        """
        # Get the properties of the theme
        theme_json = self.getZhishi(theme)
        logging.info(str(theme_json))
        logging.info(str(theme_json.keys()))
        # Drop the last character of every property key.
        properties = [p[:-1] for p in theme_json.keys()]
        qa_pairs = self.baiduzhidao.getQA(theme, 5)
        for p in properties:
            qa_pairs.extend(self.baiduzhidao.getQA(theme + '%20' + p, 5))
        with open('./mod/qa_pairs.bin', 'wb') as fou:
            pickle.dump(qa_pairs, fou)

    def get_sim(self, something):
        """Return the synonyms of ``something`` from the similarity service."""
        url = 'http://10.122.141.12:9006/similar'
        r = requests.post(url,
                          json={
                              "ck": "synonym",
                              "synonym_word": something,
                              "synonym_selectedMode": "auto",
                              "homoionym_word": "",
                              "homoionym_selectedMode": "auto",
                              "homoionym_num": ""
                          })
        payload = r.json()
        return payload['detail']['res']['synonym']

    def selectQA(self, qa_pairs, theme):
        """Keep the QA pairs that mention the theme and index them.

        The selection is pickled to './mod/qs_dict.bin', becomes the
        in-memory ``qa_dict`` and is written to the question vector index.
        """
        qs = dict()
        # Synonym issue: the alias is hard-coded for now
        samenames = ['故宫']
        samenames.append(theme)
        for qa in qa_pairs:
            # The question must contain the theme word (the trigger word);
            # the whole QA record is stored as is.
            if any(samename in qa[0] for samename in samenames):
                qs[qa[0]] = qa
        with open('./mod/qs_dict.bin', 'wb') as fou:
            pickle.dump(qs, fou)
        # Keep the in-memory mapping aligned with the index rebuilt below.
        self.qa_dict = qs
        self.save_qa_vec(qs)

    def save_qa_vec(self, qs):
        """Encode the questions of ``qs`` and save them as the Annoy index."""
        q_arr = [q for q in qs]
        encodes = self.bc.encode(q_arr)
        # The index loaded in __init__ is read-only (Annoy cannot add items
        # to a loaded index), so a fresh one is built here.
        index = AnnoyIndex(self.VECTOR_DIM)
        for i, encode in enumerate(encodes):
            index.add_item(i, encode)
        index.build(10)
        index.save('./mod/qa_index.mod')
        self.annoyIndex = index

    def anwser(self, q):
        """Log the stored question nearest to ``q`` and its answer field."""
        encode = self.bc.encode([q])[0]
        restult, distance = self.annoyIndex.get_nns_by_vector(
            encode, 1, include_distances=True)
        if not restult:
            # Empty index: nothing to compare against.
            logging.info('不知道')
            return
        quest_arr = [question for question in self.qa_dict]
        answer_arr = [self.qa_dict.get(question) for question in quest_arr]
        # NOTE(review): np.cos is applied to the distance Annoy returns,
        # which is not an angle; kept as is so the threshold is unchanged.
        similarity = np.cos(distance)
        if not similarity > 0.8:
            logging.info('不知道')
        logging.info(str(similarity) + quest_arr[restult[0]])
        logging.info(str(answer_arr[restult[0]][5]))

    def test(self):
        """Answer a sample question.

        To rebuild the data first: call getAllQA(theme), unpickle
        './mod/qa_pairs.bin', then call selectQA(qa_pairs, theme).
        """
        q = '千里江山图?'
        self.anwser(q)
class Annoy(VectorIndex):
    """Vector index backed by an ``AnnoyIndex`` plus a JSON metadata file."""

    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        """Open the index described at ``path`` or create a new one.

        If a file exists at ``path``, its metadata is loaded and checked
        against ``dims``/``metric``. Otherwise a new index is created when
        ``dims`` is given, and the metadata file alone is used when it is
        not. The index data itself is only read by ``load``.
        """
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric
        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
                'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
                'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric'
            )
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        """Path of the JSON metadata file that accompanies the index."""
        return self.path + '.meta.json'

    @property
    def files(self):
        """The index file path and the metadata file path."""
        return [self.path, self.meta_path]

    def load_meta(self):
        """Overwrite instance attributes with the stored metadata."""
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        """Write every instance attribute except the index to the meta file."""
        d = {**self.__dict__}
        # The AnnoyIndex object is not stored in the metadata.
        d.pop('index', None)
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        """Build the index with ``num_trees`` trees and save the metadata."""
        logging.debug(f'staring to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        """Save the index to ``path`` and refresh the metadata file."""
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        """Load the index from ``path``; ``memory`` is passed as ``prefault``."""
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        """Unload the underlying index."""
        self.index.unload()

    def __del__(self):
        # __init__ can fail before self.index exists (e.g. a failed dims
        # assertion); there is nothing to unload in that case.
        index = getattr(self, 'index', None)
        if index is not None:
            index.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        """Append ``vector`` at the next free position and return that id."""
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        """Append all ``vectors`` after the current last item; return self."""
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        """Store each vector under the index it is paired with."""
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        """Return the ``num`` nearest neighbours of ``vector``.

        ``depth`` is passed as Annoy's ``search_k`` (-1 when falsy) and
        ``distances`` as ``include_distances``.
        """
        return self.index.get_nns_by_vector(vector, num, depth or -1,
                                            distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        """Like ``search`` but for the item already stored under ``idx``."""
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        """Return the distance between items ``i`` and ``j``."""
        return self.index.get_distance(i, j)
else: relations[w].add(rel) for line in open('predicates_fw.tsv').readlines(): line = line.strip().lower().split('\t') rel = line[0] label = [x for x in ' '.join(line[1:]).split(' ') if x not in stop] for w in label: if w not in relations: relations[w] = set([]) else: relations[w].add(rel) all_relation_words = set([]) all_relation_words.update(relations.keys()) word2vec_pretrain_embed = gensim.models.Word2Vec.load_word2vec_format( '/dccstor/cssblr/amrita/resources/glove/GoogleNews-vectors-negative300.bin', binary=True) f = 300 index = AnnoyIndex(f, metric='euclidean') index_desc = {} count = 0 for word in all_relation_words: word = word if word in word2vec_pretrain_embed: embed = word2vec_pretrain_embed[word] index.add_item(count, embed) index_desc[count] = word count = count + 1 index.build(100) index.save('annoy_index_noisy/glove_embedding_of_vocab.ann') pkl.dump(index_desc, open('annoy_index_noisy/index2word.pkl', 'wb'))
item_vectors = movielens['item_features'] * model.item_embeddings

# Now let's make an annoy index for item to item querying:

# In[93]:

from annoy import AnnoyIndex

f = item_vectors.shape[1]  # Length of item vector that will be indexed
t = AnnoyIndex(f)
for i in range(item_vectors.shape[0]):
    v = item_vectors[i]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('movielens_item_Annoy_idx.ann')

# And query the index for similar movies:

# In[94]:


def nearest_movies_Annoy(movie_id, index, n=10, print_output=True):
    """Print the titles of the ``n`` items nearest to ``movie_id``.

    ``index`` is the Annoy index to query. Nothing is printed when
    ``print_output`` is false, and nothing is returned.
    """
    # Honour n; the neighbour count used to be hard-coded to 10.
    nn = index.get_nns_by_item(movie_id, n)
    if print_output:
        print('Closest to %s : \n' % movielens['item_labels'][movie_id])
    titles = [movielens['item_labels'][i] for i in nn]
    if print_output:
        print("\n".join(titles))
def create_annoy(target_features):
    """Index ``target_features`` by position and save 'annoy.ann' in work_dir.

    The index has ``layer_dimension`` dimensions and is built with 10 trees.
    """
    index = AnnoyIndex(layer_dimension)
    position = 0
    for feature in target_features:
        index.add_item(position, feature)
        position += 1
    index.build(10)
    output_path = os.path.join(work_dir, 'annoy.ann')
    index.save(output_path)
class SearchIndex():
    """The search index manages search indexes on disk

    This supports creating indexes and operations to save/load to/from disk
    """

    def __init__(self):
        """Generates a new SearchIndex, used in Server Class

        The main purpose of this class is to generate an index, so that the
        Server class does not need to know which search index is being used.
        A search index is ready to be used when an index exists and it is
        ready (when an index has been built or loaded).
        """
        self.index = None
        self.ready = False

    def build_from_trained_model(self, trained_model, depth):
        """Creates an index from a trained model

        :param TrainedModel trained_model: The trained model
        :param int depth: Passed to the index ``build`` call
        """
        entities_matrix = trained_model.E
        nrows, emb_size = entities_matrix.shape
        self.index = AnnoyIndex(emb_size)

        # Populate the search index with the trained embedding
        for row in range(nrows):
            self.index.add_item(row, list(entities_matrix[row]))

        # Generate the index itself. This may take long time
        self.index.build(depth)

        # Index ready
        self.ready = True

    def save_to_binary(self, filepath):
        """Dump the search tree on a file on disk

        :param string filepath: The path where the file will be saved
        :return: True if the index was saved, False if it was not ready
        :rtype: boolean
        """
        if self.index is None or self.ready is False:
            print("The index is not ready to be saved")
            return False

        self.index.save(filepath)
        return True

    def load_from_file(self, filepath, emb_size):
        """Load the search tree from a file on disk

        :param string filepath: The path of the file to load
        :param int emb_size: The size of embedding vector used
        :return: True once the index has been loaded
        :rtype: boolean
        """
        self.index = AnnoyIndex(emb_size)
        self.index.load(filepath)
        self.ready = True
        # Return a boolean as documented, consistent with save_to_binary.
        return True
# Directory holding one '<item id>.json' embedding file per item.
emb_dir = EMB_DIR % corpus
logging.info('building index for %s' % emb_dir)

aidx = AnnoyIndex(DIMENSIONS)

for f in os.listdir(emb_dir):
    logging.debug('indexing %s' % f)
    with open('%s/%s' % (emb_dir, f)) as embf:
        data = json.load(embf)
    # The file name without '.json' is the integer item id.
    i = int(f.replace('.json', ''))
    aidx.add_item(i, data['emb'])

logging.info('building %d trees' % NUM_TREES)
aidx.build(NUM_TREES)

index_path = INDEX_FN % corpus
aidx.save(index_path)
logging.debug('%s written.' % index_path)

# To check the result, load the saved file into a fresh AnnoyIndex and query
# it, e.g. get_nns_by_item(0, 1000).
def process(args):
    """Train the model, then encode the training questions into an Annoy index.

    Everything is driven by ``args``: the tokenizer/batch/model factories,
    the file paths in ``args.path``, and the step, batch and GPU settings.

    Phase 1 runs ``args.max_steps`` training steps, evaluating one dev
    batch every ``args.show_steps`` steps and checking whether to save
    every ``args.save_steps`` steps.
    Phase 2 runs the model's inference step over the training data, stores
    the encoded vectors in an Annoy index and saves it to
    ``args.path['ann']``.

    NOTE(review): the source of this function had lost its indentation; the
    nesting of the statements after ``saver.save`` is the most plausible
    reconstruction - confirm against the original file.
    """
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()

        # ---- Phase 1: training -------------------------------------------
        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            # Remember where the next training batch starts.
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)

            # Every show_steps steps (never at step 0): evaluate one dev
            # batch with keep_prob 1.0 and report the speed.
            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r' step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()

            # Every save_steps steps (never at step 0): save the model when
            # the recorder's stats flag it.
            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
                utils.write_result(args, recorder.lowest_loss)

        # ---- Phase 2: encode the training data and build the index -------
        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        # Loop until next_batch returns a true update_epoch flag.
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1

        # Reshape to rows of width args.hidden and keep only the first
        # infer_batch.data_size rows.
        vectors = np.reshape(np.array(vectors),
                             [-1, args.hidden])[:infer_batch.data_size]
        vec_dim = vectors.shape[-1]
        ann = AnnoyIndex(vec_dim)
        # Item n of the index is the n-th encoded vector.
        for n, ii in enumerate(vectors):
            ann.add_item(n, ii)
        ann.build(args.num_trees)
        ann.save(args.path['ann'])
        utils.verbose('Annoy has been dump in {}'.format(args.path['ann']))
# (row[1], row[3], row[5]) of every indexed row; position == Annoy item id.
songs = []
embedding_size = 300
t = AnnoyIndex(embedding_size, 'angular')

with open('lyrics.csv', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    index = 0
    # Take every 10th row of the first 380000; the first one is the header.
    for row in itertools.islice(csv_reader, 0, 380000, 10):
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            fields = row
            line_count += 1
            continue

        lyrics = row[5]
        if line_count == 1:
            # Keep the lyrics of the first data row as a sample phrase.
            testPhrase = lyrics
        if len(lyrics) != 0:
            # Only rows with non-empty lyrics are embedded and indexed.
            t.add_item(index, document_embedding(lyrics))
            songs.append((row[1], row[3], row[5]))
            index += 1
        line_count += 1
    print(f'Processed {line_count} lines.')

t.build(10)  # 10 trees
t.save('test.ann')

song_df = DataFrame(songs, columns=['songtitle', 'artist', 'lyrics'])
song_store.put('/mat', song_df)
self.out = tf.squeeze( self.sess.graph.get_tensor_by_name('vgg_16/avgp5/AvgPool:0')) def feat1(self, image_path): img_data = np.expand_dims(np.array(open(image_path, 'r').read()), 0) return self.sess.run(self.out, {self.img: img_data}) def feat2(self, feat_string): img_data = np.expand_dims(np.array(feat_string), 0) return self.sess.run(self.out, {self.img: img_data}) names = np.load('data/name.npy') if not os.path.exists('model/inshop.ann'): feats = np.load('data/feats.npy') t = AnnoyIndex(512) for i, a in enumerate(feats): t.add_item(i, a) t.build(200) t.save('model/inshop.ann') else: t = AnnoyIndex(512) t.load('model/inshop.ann') worker = Feature2() str_ = open('./test.jpg', 'r').read() feat1 = worker.feat2(str_) feat2 = np.load('rst2.npy') print 'Extract Feature:', t.get_nns_by_vector(feat1, 20) print 'Serving Feature:', t.get_nns_by_vector(feat2, 20)