def loadOrTrainLSHModel(forceGenerate=False):
    lshModel = None
    if os.path.exists("./pickle_files/lshModel.pickle") and not forceGenerate:
        print("LSH model found on disk")
        pickleIn = open("./pickle_files/lshModel.pickle", "rb")
        lshModel = pickle.load(pickleIn)
    else:
        print("Training LSH model")
        trainAudioDataAndRateArray = loadAllFiles("train", '')
        trainingData = generateData(trainAudioDataAndRateArray)
        lshModel = LSH()
        for data in trainingData:
            print("fileName", data[1])
            validFrameList = extractValidFrames(data[0])
            for i in range(0, validFrameList.shape[0]):
                reshapedValidFrame = validFrameList[i].reshape(1, -1)
                # print("reshapedValidFrame", reshapedValidFrame)
                lshModel.train(reshapedValidFrame, {
                    "name": data[1] + "_" + str(i),
                    "frameIndex": i
                })
            # hr.train(validFrames[1:2], data[1])
        # print("lshModel", lshModel)
        pickleOut = open("./pickle_files/lshModel.pickle", "wb")
        pickle.dump(lshModel, pickleOut)
        pickleOut.close()
    return lshModel

def get_similar(num_layers, num_hashes, features, filepaths, query_image, num_results):
    lsh = LSH(num_layers, num_hashes, 10)
    lsh.fit(features)
    query_vec = get_cm_features_by_image_path(query_image)
    similar_indices = lsh.get_similar(query_vec, num_results)[:num_results]
    return ([list(features[x]) for x in similar_indices],
            [filepaths[x] for x in similar_indices])

def setUp(self):
    self.lsh = LSH(3, 2, 1)
    self.lsh_two_tables = LSH(3, 2, 2)
    # Overwrite randomly initialized planes with known values.
    self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
    self.lsh_two_tables.planes = [
        np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
        np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
    ]

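# A minimal sketch of the random-hyperplane LSH that the setUp above
# appears to exercise. The LSH(num_planes, dim, num_tables) signature and
# the `planes` attribute (one num_planes x dim matrix per table) are
# inferred from the test; this is an assumed re-implementation, not the
# class under test.
import numpy as np

class RandomHyperplaneLSH:
    def __init__(self, num_planes, dim, num_tables):
        # One (num_planes x dim) plane matrix per hash table.
        self.planes = [np.random.randn(num_planes, dim)
                       for _ in range(num_tables)]

    def hash(self, vec):
        # The sign of the dot product with each plane yields one bit,
        # giving a num_planes-bit bucket key per table.
        return [''.join('1' if b else '0' for b in (p @ vec) >= 0)
                for p in self.planes]

# Example: with the first plane matrix from setUp, the vector [1, 0]
# hashes to '100' (positive, negative, negative dot products).
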
def query(fname, key='key', topk=10, truncate=80):
    model = pickle.load(open(CONST.MODEL, 'rb'))
    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])
    lsh = LSH(corpus, model)
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]
    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    return (lsh.query(corpus[index, :], int(topk), 10)[0]
            .join(dataframe, on='id')
            .sort_values('distance')
            .iloc[:, 1:])

def _build_lsh(self, minhashes):
    '''Builds the LSH using the given minhash vectors.'''
    self.lsh = LSH(minhashes)
    with open(self.LSH_FILE, 'wb') as f:
        pickle.dump(self.lsh.hash_tables, f)

def __init__(self, digest_length, num_hashtables, dThreshold):
    self._dThreshold = dThreshold
    self._lsh = LSH(digest_length, 2**20,
                    storage_config=None,
                    num_hashtables=num_hashtables)
    self._hashvect = HashVect()

def query(self, input_list, topk=10, key_dist=1, dist_func=LSH.cosine_dist):
    query_docinfo = ''
    if self.docinfo_lsh is None:
        query_docinfo = np.array(input_list, dtype='float32')
    else:
        query_docinfo = bitarray(''.join(self.docinfo_lsh.hash(input_list)))
    rs_dict = {}
    if key_dist >= 0:
        query_key_list = self.lsh.hash(input_list)
        for i, query_key in enumerate(query_key_list):
            for k in LSH.get_keys_str(query_key, key_dist):
                k += ('_' + str(i))
                if k in self.index_dict:
                    for idx in self.index_dict[k]:
                        # rs_dict is keyed by docid, so deduplicate on docid
                        docid = self.docid_list[idx]
                        if docid in rs_dict:
                            continue
                        rs_dict[docid] = dist_func(self.docinfo_list[idx],
                                                   query_docinfo)
    else:
        # brute-force search over all documents
        for docid, docinfo in zip(self.docid_list, self.docinfo_list):
            rs_dict[docid] = dist_func(docinfo, query_docinfo)
    print 'Candidates: %d' % len(rs_dict)
    return (sorted(rs_dict.items(), key=itemgetter(1), reverse=False)[:topk],
            len(rs_dict))

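# A plausible LSH.cosine_dist consistent with its use above, where smaller
# values mean closer matches in the ascending sort; its actual definition
# is not shown in this snippet, so this is an assumption. Note that when
# docinfo_lsh is set the stored vectors are bitarrays, for which a Hamming
# distance would be the natural counterpart.
import numpy as np

def cosine_dist(a, b):
    a = np.asarray(a, dtype='float32')
    b = np.asarray(b, dtype='float32')
    # 1 - cosine similarity: 0 for identical directions, 2 for opposite.
    return 1.0 - float(np.dot(a, b)) / (np.linalg.norm(a) * np.linalg.norm(b))
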
def starter(image_id, m, k, l):
    query_results = Database().retrieve_many()
    id_vector_pairs = [(item["image_id"], item["vector"]) for item in query_results]
    search_results = LSH(l, k, id_vector_pairs).get_search_results(image_id, show=True)
    print("Original dataset size: {} | Reduced search space size: {} | Reduction by {} %"
          .format(
              len(id_vector_pairs),
              len(search_results),
              float(len(id_vector_pairs) - len(search_results)) * 100 / len(id_vector_pairs),
          ))
    query_results = Database().retrieve_many(list(search_results))
    search_id_vector_pairs = [(item["image_id"], item["vector"])
                              for item in query_results]
    source_vector = Database().retrieve_one(image_id)["vector"]
    all_images = functions.find_similarity(source_vector, search_id_vector_pairs)
    similar_images = all_images[:m]
    print(similar_images)
    visualizer.visualize_lsh(image_id, similar_images)
    return similar_images, all_images

def __init__(self, nbits=3, kernel='rbf', kernel_kwds='range',
             para_p_max=300, para_p_exp=0.5, para_t_max=30,
             para_t_ratio=4, weight_pool_size=50):
    LSH.__init__(self, weight_pool_size)
    self._nbits = nbits
    self._kernel_func = None
    self._kernel_kwds = kernel_kwds
    self._check_kernel(kernel, kernel_kwds)
    self._train_data = None
    self._para_p = para_p_max
    self._para_p_exp = para_p_exp
    self._para_t = para_t_max
    self._para_t_ratio = para_t_ratio
    self._K_half = None
    self._weight_pool_size = weight_pool_size
    self._e_s_pool = None
    self._weight_pool = None
    self._k = None

def load_lsh(self):
    '''Loads the buckets from the file and initializes an LSH object using this data.'''
    with open(self.LSH_FILE, 'rb') as f:
        data = pickle.load(f)
    self.lsh = LSH(table=data)

def __init__(self, movie_filename, rating_filename, k, m, c):
    # Hyperparameter
    self.c = c

    # Read the movie file and create the _movie_names dictionary
    self._movie_names = {}
    f = open(movie_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        movieid = line[0]
        moviename = line[1]
        # ignore line[2], genre
        self._movie_names[movieid] = moviename
    f.close()

    # Read the rating file and create the _movie_ratings (ratings for a movie)
    # and _user_ratings (ratings by a user) dicts
    self._movie_ratings, self._movie_time = {}, {}
    self._user_ratings, self._user_time = {}, {}
    f = open(rating_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        userid = line[0]
        movieid = line[1]
        rating = line[2]
        timestamp = line[3]
        if userid not in self._user_ratings:
            # each user is a dict mapping movies to ratings
            self._user_ratings[userid] = {}
            self._user_time[userid] = {}
        self._user_ratings[userid][movieid] = float(rating)
        self._user_time[userid][movieid] = float(timestamp)
        if movieid not in self._movie_ratings:
            self._movie_ratings[movieid] = {}
            self._movie_time[movieid] = {}
        self._movie_ratings[movieid][userid] = float(rating)
        self._movie_time[movieid][userid] = float(timestamp)
    f.close()

    self.me = LSH(k, m, self._user_ratings, self._movie_ratings)

def cloud_main(self, file_count):
    # Establishing the connection
    conn = mysql.connector.connect(user='******',
                                   password='******',
                                   host='127.0.0.1',
                                   database='ImageRetrieval')
    # Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    # Preparing SQL query to select records from the database.
    sql_select_Query = "select * from images"

    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)

    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))

    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)

    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)

    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)

    # Closing the connection
    conn.close()

def run(dataPath):
    nrofBands = 25
    nrOfPerms = 132
    # Note: 132 is not evenly divisible by 25, so int() floors this to 5
    # rows per band and the last 7 permutations go unused.
    nrOfRows = int(nrOfPerms / nrofBands)

    # Load data
    data = np.load(dataPath)
    print("Data is loaded")

    # Create an empty results file (the with-block closes it automatically)
    with open('results.txt', 'w') as file:
        file.write("")

    # Create the signature matrix, then find candidate pairs with LSH
    signatureMatrix = minHashing(data, nrOfPerms)
    pairsFound = LSH(data, signatureMatrix, nrOfRows, nrofBands)
    print("Number of similar pairs found is:", len(pairsFound))
    return pairsFound

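# For reference, a minimal sketch of the banding step the LSH(...) call
# above presumably performs: split the signature matrix (assumed one
# column per document) into nrofBands bands of nrOfRows rows each, bucket
# columns by their band values, and report pairs sharing a bucket. The
# function name and layout are assumptions; the real LSH also receives the
# raw data, presumably to verify candidates.
from collections import defaultdict
from itertools import combinations

def lsh_bands(signatureMatrix, nrOfRows, nrofBands):
    candidates = set()
    for b in range(nrofBands):
        band = signatureMatrix[b * nrOfRows:(b + 1) * nrOfRows, :]
        buckets = defaultdict(list)
        for col in range(band.shape[1]):
            buckets[tuple(band[:, col])].append(col)
        for members in buckets.values():
            candidates.update(combinations(members, 2))
    return candidates
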
from create_vector import vectorize
from lsh import LSH
from parse_document import extract_from_pdf
import sys

if __name__ == "__main__":
    pdfs = sys.argv[1:]
    if pdfs:
        vector_list = [vectorize(extract_from_pdf(pdf)) for pdf in pdfs]
    else:
        print "Usage: python create_vector.py pdf1 [pdf2] [pdf3] .."
        sys.exit()
    lsh = LSH(300)
    for (title, vector) in vector_list:
        lsh.insert_document(title, vector)
    # print lsh.get_similarities()
    print lsh.closest_match(vector_list[0][0])
    # return vector_list

import pandas as pd
import sqlite3

# Construct the protein database
uniDB = UniprotDB("Uniprot_DB.sqlite")
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""

minhash3 = LSH(0.3, 32)
minhash4 = LSH(0.3, 64)
minhash4b = LSH(0.3, 96)
minhash5 = LSH(0.3, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
"""
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes4b, lsh4b = minhash5.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash6.calculateLSH(proteins, 3)
minhash3.saveLSH(32)
minhash4.saveLSH(64)
minhash4b.saveLSH(96)

from lsh import LSH
import cPickle as pickle

# define model
model = LSH(base_vec_num=3, iter_num=2, dimens=2)

# data
vec_dict = {
    "a": [0.1, 0.2],
    "b": [0.5, -0.2],
    "c": [-0.3, -0.1],
    "d": [1.0, 0.0],
    "e": [-3, 2],
    "f": [2, 2]
}

# build
model.build_lsh(vec_dict)

# get candidate set
for name in model.get_candidate_set([0.3, 0.9]):
    print name,
print ""

# test pickle
with open("model.pkl", 'wb') as fout:
    pickle.dump(model, fout)
with open("model.pkl", 'rb') as fin:
    dumped_model = pickle.load(fin)
print "dumped model:"
for name in dumped_model.get_candidate_set([0.3, 0.9]):

# coding: utf-8
from lsh import LSH

lsh = LSH(L=10, k=5, d=11)
lsh.loadDataSet('training_set1.txt')
vect = [1, 10, 1, 11, 1, 13, 1, 12, 1, 1, 9]
knn_vects = lsh.knn(vect, 3)
print knn_vects

print 'create tfidf vectors of documents'
tfidf = TFIDF(doc_dict)

'''
Perform lsh
'''
print time.asctime(time.localtime(time.time()))
digest_length = int(sys.argv[2])
vect_length = tfidf.vect_length
num_hashtables = 1
log += ('perform lsh with hash-length: ' + str(digest_length) +
        ', vect-length: ' + str(vect_length) +
        ', num-hashtables: ' + str(num_hashtables) + '\n')
print ('perform lsh with hash-length: ' + str(digest_length) +
       ', vect-length: ' + str(vect_length) +
       ', num-hashtables: ' + str(num_hashtables))
r = {"dict": None}
lsh = LSH(digest_length, vect_length,
          storage_config=r,
          num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print time.asctime(time.localtime(time.time()))
print 'Query documents'
distance_func = "cosine"
corr = set()
for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)

# MySQL related information needs to be updated here
conn = mysql.connector.connect(user='******',
                               password='******',
                               host='127.0.0.1',
                               database='ImageRetrieval')

# Creating a cursor object using the cursor() method
cursor = conn.cursor()

# Preparing SQL query to INSERT a record into the database.
# (The trailing space matters: without it the two string literals
# concatenate into "...(path)values (%s)", which is invalid SQL.)
insert_stmt = (
    "insert into images (path) "
    "values (%s)"
)

key = (0.1, 0.1)
num_of_random_vectors = 16
hc = HarrisCorner()
sb = SurfBow()
e = Encrypt()
l = LSH()

# Image directory path to be mentioned here
img_dir = "../images/"
feat_vec = []

if not os.path.exists("encrypted_images/"):
    cwd = os.getcwd()
    directory = "/encrypted_images"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_indices/"):
    cwd = os.getcwd()
    directory = "/encrypted_indices"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_vectors/"):
    cwd = os.getcwd()
    directory = "/encrypted_vectors"

def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)
    zxs = torch.cat([xs, zerot], dim=0)

    lsh = LSH(SimHash(ed, K, L), K, L)
    estimates = []
    for batch_idx, MI in enumerate(schedule):
        optimizer.zero_grad()

        # randomly select data from the data distribution
        sdx_iter = (batch_idx // mi_range) * mi_range
        sdx_offset = sdx_iter * batch_size
        sdx = torch.from_numpy(
            np.random.choice(mi_range * batch_size, batch_size, replace=False)
            + sdx_offset).to(device)

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            # Load the first section of the desired size into the LSH hash tables
            lxs = xs[:desired_size, :]
            assert lxs.size(0) == desired_size
            build(lsh, model, lxs)
            # lsh.stats()
            # Full - load all data:
            # build(lsh, model, xs)

        # embed data
        y = F.embedding(sdx, ys).detach()
        ey = model.embed_y(y)

        # For each data sample, query the LSH data structure and remove
        # accidental hits; find the maximum number of samples, then
        # create a matrix and pad appropriately.
        np_indices = lsh.query_remove_matrix(ey, sdx, xs.size(0))
        indices = torch.from_numpy(np_indices).to(device)

        # create a mask distinguishing between samples and padding
        mask = 1.0 - torch.eq(indices, xs.size(0)).float()
        mask = torch.cat([bs_onet, mask], dim=1).detach()

        px = torch.unsqueeze(F.embedding(sdx, xs), dim=1)
        nx = F.embedding(indices, zxs, padding_idx=xs.size(0))
        x = torch.cat([px, nx], dim=1).detach()

        mi = model(x, y, mask, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        estimates.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            print('{} {} MI:{}, E_MI: {:.6f}'.format(
                mi_type.name, batch_idx + 1, MI, mi.item()))
            sys.stdout.flush()
    lsh.stats()
    return estimates

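# A hedged sketch of the build(...) helper used above: embed each data
# point with the model and insert it into the LSH tables so that
# query_remove_matrix can later return hash-collision negatives. The
# embed_x name and the lsh.clear() call are assumptions; lsh.insert(key,
# index) mirrors how this LSH/SimHash class is used elsewhere in the
# corpus (see the DND snippet below).
def build(lsh, model, xs):
    lsh.clear()                      # assumed: drop stale entries first
    ex = model.embed_x(xs).detach()  # assumed counterpart of embed_y
    for idx in range(ex.size(0)):
        lsh.insert(ex[idx], idx)     # index each embedding by position
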
def main(args):
    # Get input params
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as file:
            docs += [file.read()]

    # Clean documents, removing trailing and duplicate blanks
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute Jaccard similarities
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]):
                    compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold
    print(sorted([(k, v) for k, v in similarities.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Compute minHash signatures
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]):
                   compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than the threshold
    print(sorted([(k, v) for k, v in estimations.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Differences between estimations and real similarities
    errors = {(onlyfiles[i], onlyfiles[j]):
              abs(estimations[(onlyfiles[i], onlyfiles[j])] -
                  similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs))
              for j in range(i + 1, len(docs))}
    # Show errors greater than 5%
    print("Estimations with error greater than 5%:")
    print(sorted([(k, v) for k, v in errors.items() if v > 0.05],
                 key=itemgetter(1), reverse=True))

    # Apply LSH to find pairs of probably similar items
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()
    # Show candidates
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])

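# For context, a compare_signatures consistent with how it is called above
# (on columns of the signature matrix); this is an assumed helper, not the
# project's own implementation.
import numpy as np

def compare_signatures(sig_a, sig_b):
    # The fraction of minhash components on which two signatures agree is
    # an unbiased estimate of the Jaccard similarity of the shingle sets.
    return float(np.mean(np.asarray(sig_a) == np.asarray(sig_b)))
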
from mrjob.job import MRJob
from mrjob.step import MRStep
import util
import os
from datastore import SQLiteDatastore
import config
import random

SPACE = u' '
PATH = os.path.dirname(os.path.abspath(__file__))

from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore)


class CandidatesMapReducer(MRJob):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_paragraphs,
                mapper_final=self.mapper_paragraphs_final,
                reducer=self.reducer_minhash,
            ),
            MRStep(reducer=self.reducer_unique),
        ]

class DND:
    MAX_SIZE = 25
    TM = 0.1

    def __init__(self, N, D, K, L):
        self.lsh = LSH(SimHash(D, K, L), K, L)
        self.keys = np.zeros((N, D), dtype=np.float32)
        self.values = np.zeros((N, 1), dtype=np.float32)
        self.lru = np.zeros(N, dtype=np.float32)
        self.key2idx = dict()
        self.size = 0
        self.max_memory = N
        self.K = K
        self.L = L

    def __contains__(self, key):
        return tuple(key) in self.key2idx

    def __getitem__(self, key):
        try:
            index = self.key2idx[tuple(key)]
            self.lru[index] += DND.TM
            return self.values[index]
        except KeyError:
            return None

    def __setitem__(self, key, value):
        item = tuple(key)
        try:
            # 1) Find the memory index for the key vector
            index = self.key2idx[item]
        except KeyError:
            # 2) Add the key vector if not present
            if self.size >= self.max_memory:
                # 3) If memory is full, select the LRU memory index and
                #    remove it from the LSH hash tables
                index = np.argmin(self.lru)
                self.lsh.erase(self.keys[index], index)
            else:
                index = self.size
                self.size += 1
            # Rehash the key into the LSH hash tables
            self.lsh.insert(key, index)
            self.key2idx[item] = index
            # Add the new key to memory
            self.keys[index] = key
        finally:
            # Update the memory value
            self.values[index] = value
            self.lru[index] += DND.TM

    def retrieve(self, query):
        # Collect memory indices from the LSH hash tables
        indices, cL = self.lsh.query(query.data, DND.MAX_SIZE)
        # Gather keys and values from memory
        keys = self.keys[indices]
        values = self.values[indices]
        self.lru[indices] += DND.TM
        assert keys.shape[0] == values.shape[0]
        return keys, values, indices, cL

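# A hedged usage sketch for the DND above. SimHash and LSH come from the
# same codebase and their constructors are assumed from the calls in
# __init__; the torch-style query object expected by retrieve() is
# omitted here.
import numpy as np

dnd = DND(N=1000, D=64, K=8, L=4)             # capacity 1000, 64-d keys
key = np.random.randn(64).astype(np.float32)
dnd[key] = 0.5                                # insert a key/value pair
assert key in dnd                             # exact membership via key2idx
value = dnd[key]                              # exact lookup; bumps LRU score
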
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import TextProtocol
import util
import os
from datastore import SQLiteDatastore
import config

from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore, paragraphs=True)


class GeneratorMapReducer(MRJob):
    INPUT_PROTOCOL = TextProtocol

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_articles,
                reducer=self.reducer_minhash,
            )
        ]

    def mapper_articles(self, article_id, article):

def run(self):
    print(
        """Locality Sensitive Hashing-based protein similarity search.

Options: E[X]it, [L]oad Database, [D]elete Database,
[C]alculate LSH, [RC] Recalculate LSH, [LL] Load LSH, [S]ave LSH,
[Q]uery LSH, Query [A]ll LSH, Read [B]LAST, Compare [R]esults,
""")
    mode = input('Choose option:')
    uniDB = UniprotDB("Uniprot_DB.sqlite")
    minhash = LSH(0.5, 96)
    while mode != 'Exit' and mode != 'X':
        if mode == 'Delete Database' or mode == 'D':
            uniDB.deleteProteins()
        if mode == 'Load Database' or mode == 'L':
            protManager = ProteinsManager()
            uniDB.createTables()
            filename = input(
                'XML filename (e.g. Ecolx.xml or PseA7.xml or Human.xml): ')
            protManager.loadProteins(filename, uniDB)
        if mode == 'Calculate LSH' or mode == 'C':
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            print("Calculated")
        if mode == 'Recalculate LSH' or mode == 'RC':
            jaccardThreshold = float(input(
                "Specify a Jaccard similarity threshold (default: 0.5): "))
            permutations = int(input(
                "Specify the number of permutations (default: 96): "))
            shinglesize = int(input(
                "Specify the shingle size (default: 3): "))
            minhash = LSH(jaccardThreshold, permutations)
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, shinglesize)
            print("Recalculated")
        if mode == 'Query LSH' or mode == 'Q':
            protein = input('Protein accession: ')
            start_time = time.time()
            result = minhash.queryProtein(protein)
            if result is not None:
                jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                    protein, result)
                # Return the results in sorted order, big to small Jaccard score
                sorted_jaccResultsDict = OrderedDict(
                    sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                for jaccRes in sorted_jaccResultsDict.items():
                    print("\nMatch with Jaccard:", jaccRes[1])
                    information = uniDB.extractProteinInformation(jaccRes[0])
                    proteininfo = uniProtein(*information)
                    proteininfo.printUniProtein(printSeq=False)
            print("Runtime of query search: %s seconds"
                  % (time.time() - start_time))
        if mode == 'Calculate All' or mode == 'CA':
            start_time = time.time()
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            # uni_DB.close()
            proteins = uniDB.extractProteins()
            # minhash.calculateLSH([protein[1] for protein in proteins])
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            for protein in proteins:
                print("Protein ", protein[0])
                result = minhash.queryProtein(protein[0])
                if result is not None:
                    jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                        protein[0], result)
                    sorted_jaccResultsDict = OrderedDict(
                        sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                    for jaccRes in sorted_jaccResultsDict.items():
                        print(jaccRes[0], " - Jaccard: ", jaccRes[1])
            print("Runtime of query all: %s seconds"
                  % (time.time() - start_time))
        if mode == 'Query All LSH' or mode == 'A':
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createLSHtable("lshresults")
            resultsDB.deleteTable("lshresults")
            resultsDB.createLSHtable("lshresults")
            for query in minhash.minhashes.keys():
                matches = minhash.queryProtein(query)
                for match in matches:
                    # Filter self-matches
                    if query != match:
                        jaccard = minhash.estimateJaccard(query, match)
                        resultsDB.addLSHresult(query, match, jaccard,
                                               "lshresults")
            print(resultsDB.extractLSHresults("lshresults"))
        if mode == 'Read BLAST Results' or mode == 'B':
            filename = input('Filename: ')
            handle = open(filename, 'r')
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createBLASTtable()
            resultsDB.deleteBLASTresults()
            resultsDB.createBLASTtable()
            for line in handle:
                line = line[:-1].split('\t')
                # Extract accessions from an 'sp|A0A0R6L508|MCR1_ECOLX'-like string
                line[0] = line[0].split('|')[1]
                line[1] = line[1].split('|')[1]
                print(line)
                # Filter self-matches, add to the database
                if line[0] != line[1]:
                    resultsDB.addBLASTresult(line[0], line[1], line[2], line[3])
            print(resultsDB.extractBLASTresults())
        if mode == 'Compare Results' or mode == 'R':
            # Database with all LSH and BLASTp results
            resultsDB = ResultsDB("Results_DB.sqlite")
            identity_th, alignment_th, jaccard_th = 80.0, 100, 0.5
            precisions = []
            recalls = []
            # Load in all protein ids to loop over
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            # Store all precisions and recalls per query, to calculate the average
            for query in proteins:
                intersect = resultsDB.extractIntersectCountPerProtein(
                    query[0], 'lshresults', identity_th, alignment_th,
                    jaccard_th)
                lshresults = resultsDB.extractLSHcountPerProtein(
                    query[0], 'lshresults', jaccard_th)
                blastresults = resultsDB.extractBLASTcountPerProtein(
                    query[0], identity_th, alignment_th)
                tp = intersect
                fp = lshresults - intersect
                fn = blastresults - intersect
                precision = tp / (tp + fp) if (tp + fp) != 0 else -1
                recall = tp / (tp + fn) if (tp + fn) != 0 else -1
                # Exclude results without any similar proteins / division by zero
                if precision != -1:
                    precisions.append(precision)
                if recall != -1:
                    recalls.append(recall)
            print("Comparison of BLAST and LSH results:\n"
                  " Number of proteins queried: %i \n"
                  " Average precision: %0.3f Average recall: %0.3f\n"
                  % (len(proteins),
                     sum(precisions) / len(precisions),
                     sum(recalls) / len(recalls)))
        if mode == 'Save LSH' or mode == 'S':
            number = int(input('Suffix number: '))
            minhash.saveLSH(number)
        if mode == 'Load LSH' or mode == 'LL':
            number = int(input('Suffix number: '))
            minhash.loadLSH(number)
        mode = input('Choose option: ')

if "-h" in sys.argv or "--help" in sys.argv: print( "Usage: ./nn.py [OPTION] \n\n" " -h | --help Show this help message and exit \n" " --fetch <plugin> Fetch new data with proprietary plugin \n" " --train Train LSH model \n" " --query <ID> Nearest Neighbor query \n") exit(0) if "--fetch" in sys.argv: pluginName = sys.argv[2].replace('.py', '') Dimport("%s" % pluginName, pluginName, FULLNAME('plugins'))(CONST.DATASET) if "--train" in sys.argv: dataframe = pd.read_csv(CONST.DATASET) corpus = TfidfVectorizer().fit_transform(dataframe['content']) lsh = LSH(corpus) model = lsh.train() pickle.dump(model, open(CONST.MODEL, 'wb')) if "--query" in sys.argv: print(query(sys.argv[2])) # eof
def lsh_cluster(unlabel):
    return LSH(unlabel)

import pandas as pd
import sqlite3

# Construct the protein database
uniDB = UniprotDB("Uniprot_DB_ec_pa_human.sqlite")
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""
protManager = ProteinsManager()
protManager.loadProteins("Human.xml", uniDB)

minhash3 = LSH(0.3, 96)
minhash4 = LSH(0.5, 96)
minhash5 = LSH(0.5, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash5.calculateLSH(proteins, 3)
minhash3.saveLSH(963)
minhash4.saveLSH(965)
minhash5.saveLSH(1285)
"""
minhash3.loadLSH(963)

with gzip.open(datasetPath + filenameIn + '_sample.warc.gz', mode='rb') as gzf:
    for record in warc.WARCFile(fileobj=gzf):
        record_id = record['WARC-Record-ID']
        payload = record.payload.read()
        doc_uri[record_id] = record['WARC-Target-URI']
        text = HTMLPreprocessing(payload).get_text()
        doc_dict[record_id] = text
        doc_count += 1

print 'create vectors'
tfidf = TFIDF(doc_dict)
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1               # number of iterations
digest_length = 0

print 'perform lsh'
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            # the warc id is appended as extra data in lsh.index()
            candidate_key = c[0][1]
            if candidate_key == query_object[1]: