def evaluate_minhash(self, n, p):
    mean_accuracy = 0
    headers = []
    for i in range(1, self.n_folds + 1):
        headers.append("fold {}".format(i))
    headers.append("Mean")
    accuracy_list = []
    for i in range(1, self.n_folds + 1):
        indices = self.data_folds[i]
        training = self.input.drop(self.input.index[indices])
        training_y = self.output.drop(self.output.index[indices])
        test = self.input.loc[self.input.index[indices], :]
        test_y = self.output.loc[self.output.index[indices], :]
        lsh = LSH.MinHash(training, training_y, n, p)
        lsh.train()
        lsh.predict(test, 5, 1)
        correct, counter, accuracy = lsh.accuracy(test_y)
        accuracy_list.append(accuracy)
        mean_accuracy += accuracy
    accuracy_list.append(float(mean_accuracy) / self.n_folds)
    accuracy_table = pd.DataFrame([accuracy_list], columns=headers)
    accuracy_table = accuracy_table.rename(index={0: "result"})
    print(accuracy_table)
    return accuracy_table
def main(opt):
    dataset = torchvision.datasets.Omniglot(
        root="./data",
        download=True,
        background=False,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.Resize([28, 28]),
            torchvision.transforms.ToTensor(),
        ]))
    model = torch.load(opt['model.model_path'], encoding='utf-8')
    model.eval()

    # 80/20 train/test split; the second length is derived so the two always
    # sum to len(dataset), as torch.utils.data.random_split requires
    n_train = int(0.8 * len(dataset))
    nlen = [n_train, len(dataset) - n_train]
    trainset, testset = torch.utils.data.random_split(dataset, nlen)
    dataset = 0

    train_values = []
    for d in tqdm.tqdm(trainset):
        train_values.append(d[1])
    test_values = []
    for d in tqdm.tqdm(testset):
        test_values.append(d[1])

    n_way = opt['data.test_way']  # 50
    n_shot = opt['data.test_shot']
    acc = 0
    itr = 10000
    lsh = LSH.LSH(64, opt['dist.qbits'], opt['memsize'])
    for it in tqdm.tqdm(range(itr)):
        k = random.sample(train_values, n_way)
        q = random.sample(k, 1)
        while not (q[0] in test_values):
            q = random.sample(k, 1)
        support = []
        support_val = []
        for i in k:
            s = get_values(train_values, i, n_shot)
            for j in s:
                x = model.encoder.forward(
                    1 - trainset[j][0][-1, :, :].reshape([1, 1, 28, 28]))
                lsh.append(x, i)
        s = get_values(test_values, q[0], 1)
        x = model.encoder.forward(
            1 - testset[s[0]][0][-1, :, :].reshape([1, 1, 28, 28]))
        y_s = lsh.search(x)
        if y_s == q[0]:
            acc = acc + 1
        print("Accuracy : ", acc * 100 / (it + 1))
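# `get_values` is referenced above but not defined in this snippet. A minimal
# sketch of what it presumably does -- return the indices of up to `count`
# entries in `values` whose label equals `label` -- is given below; the real
# helper may differ (e.g. it could sample at random rather than scan in order).
import random


def get_values(values, label, count):
    """Hypothetical helper: pick `count` positions in `values` holding `label`."""
    matching = [idx for idx, v in enumerate(values) if v == label]
    return random.sample(matching, min(count, len(matching)))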
def evaluate_minhash(self, b, r):
    """ evaluates Min-hash """
    mean_accuracy = 0
    mean_coverage = 0
    headers = []
    for i in range(1, self.n_folds + 1):
        headers.append("fold {}".format(i))
    headers.append("Mean")
    accuracy_list = []
    coverage_list = []
    for i in range(1, self.n_folds + 1):
        print("fold {}".format(i))
        d = []
        p = []
        indices = self.data_folds[i]
        training = self.input.drop(self.input.index[indices])
        training_y = self.output.drop(self.output.index[indices])
        test = self.input.loc[self.input.index[indices], :]
        test_y = self.output.loc[self.output.index[indices], :]
        # train without fold i
        lsh = LSH.MinHash(training, training_y, b, r)
        lsh.train()
        # test on fold i
        courses = lsh.predict(test)
        # calculate rmse
        rmse = lsh.accuracy(test_y, d, p)
        accuracy_list.append(rmse)
        mean_accuracy += rmse
        # calculate coverage. We have defined coverage as follows:
        # coverage = # of unique items we have recommended on the test set / # of all items
        for item in courses:
            if item not in self.recommended:
                self.recommended.append(item)
        c = len(self.recommended) / float(lsh.item_num)
        mean_coverage += c
        coverage_list.append(c)
    coverage_list.append(float(mean_coverage) / self.n_folds)
    accuracy_list.append(float(mean_accuracy) / self.n_folds)
    accuracy_table = pd.DataFrame([accuracy_list, coverage_list],
                                  columns=headers)
    accuracy_table = accuracy_table.rename(index={
        0: "RMSE",
        1: "Coverage"
    })
    print(accuracy_table)
    return accuracy_table
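# The coverage definition above (unique recommended items / total items) can be
# checked on toy data. The item names and counts below are made up purely for
# illustration; only the formula comes from the method above.
recommended_fold_1 = ["c1", "c2", "c3"]
recommended_fold_2 = ["c2", "c4"]
total_items = 10

unique_recommended = set(recommended_fold_1) | set(recommended_fold_2)
coverage = len(unique_recommended) / float(total_items)
print(coverage)  # 4 unique items out of 10 -> 0.4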
def meta_val(test_loader, model, train_mean=None):
    top1 = AverageMeter()
    model.eval()
    with torch.no_grad():
        tqdm_test_loader = warp_tqdm(test_loader)
        for i, (inputs, target) in enumerate(tqdm_test_loader):
            target = target.cuda(0, non_blocking=True)
            output = model(inputs, True)[0].cuda(0)
            if train_mean is not None:
                output = output - train_mean
            train_out = output[:args.meta_val_way * args.meta_val_shot]
            train_label = target[:args.meta_val_way * args.meta_val_shot]
            test_out = output[args.meta_val_way * args.meta_val_shot:]
            test_label = target[args.meta_val_way * args.meta_val_shot:]
            train_out = train_out.reshape(args.meta_val_way,
                                          args.meta_val_shot, -1).mean(1)
            train_label = train_label[::args.meta_val_shot]
            # prediction = metric_prediction(train_out, test_out, train_label, args.meta_val_metric)

            # perform quantization
            train_out = do_quantize(train_out, args.quantization, True)
            test_out = do_quantize(test_out, args.quantization, True)

            prediction = torch.zeros(75, 5)
            if args.meta_val_metric == "LSH":
                lsh = LSH.LSH(lsh_size, args.quantization, 256)
                for ind, out in enumerate(train_out):
                    lsh.append(out.cpu(), train_label[ind].item())
                prediction = torch.tensor(
                    [lsh.search(item.cpu()) for item in test_out])
            elif args.meta_val_metric == "mcam":
                prediction = mcam_calc(test_out, train_out)
            else:
                prediction = metric_prediction(train_out, test_out,
                                               train_label,
                                               args.meta_val_metric)
            # output = torch.tensor([lsh.search_dist(item.cpu()) for item in query_proto], dtype=torch.float32, requires_grad=True)
            # need to think about order here...
            # output = output.cuda()
            # prediction = torch.tensor([lsh.search(item.cpu()) for item in test_out])

            acc = (prediction.cuda() == test_label).float().mean()
            # print("meta val acc is ", acc)
            top1.update(acc.item())
            if not args.disable_tqdm:
                tqdm_test_loader.set_description(
                    'Acc {:.2f}'.format(top1.avg * 100))
    return top1.avg
class Database:
    """A class representing a database of each user's neighbourhoods and ratings"""

    def __init__(self, df):
        self.neighbour = db.cf_neighbour
        self.lsh = LSH(4503)
        self.users = df.user_id.unique()
        self.df = df

    def populate_by_calculating(self, k=5):
        """
        a populator for generating each user's top k nearest neighbourhoods.
        """
        users = self.users
        for t, user_id in enumerate(users):
            topK = self.lsh.topK(user_id)
            self.neighbour.insert({'_id': user_id, 'neighbours': topK})
            if t % 100 == 0:
                print(t)
def picture_search(request):
    global username
    if request.method == "POST":
        # photo = request.FILES['photo']
        # print(photo)
        # cv2.imwrite("/static/upload_img/target.jpg", photo)
        # signature = request.POST['signature']
        file_content = ContentFile(request.FILES['img'].read())
        photo = request.FILES['img']
        print(photo)
        user = Picture_search.objects.create(photo=photo)
        user.save()
        '''user2 = BBS_users.objects.get(username = username)
        user2.signature = signature
        user2.photo = request.FILES['img']
        #img = ImageStore(name = request.FILES['img'].name, img = request.FILES['img'])
        #img.save()
        user2.save()'''
        pic_list = LSH.LSH(
            "/home/lds/Documents/django_project/Helloworld/media/search_file/"
            + str(photo))
        for i in range(len(pic_list)):
            pic_list[i] = pic_list[i][:-4]
        user1 = BBS_users.objects.get(username=username)
        template = loader.get_template('picture_result.html')
        context = {
            'login_out': 'login out',
            'edit_you': 'edit you',
            'username': user1.username,
            'img': user1.photo,  # assumption: the original referenced an undefined `img`; the user's stored photo seems intended
            'register': 'change user',
            'signature': user1.signature,
            'logout_href': '/login/',
            'change_user_href': '/login/',
            'pic_list': pic_list,
        }
        return HttpResponse(template.render(context, request))
def main():
    df = pd.read_csv("old.csv", names=['user', 'rating'])
    df2 = pd.read_csv("new.csv", names=['user', 'rating'])
    df['rating'] = df.apply(lambda row: ast.literal_eval(row['rating']), axis=1)
    df2['rating'] = df2.apply(lambda row: ast.literal_eval(row['rating']), axis=1)
    item_column = []
    item_column2 = []
    for i in range(df.shape[0]):
        item_column.append(list(df.iloc[i]['rating']))
    for i in range(df2.shape[0]):
        item_column2.append(list(df2.iloc[i]['rating']))
    df['item'] = pd.Series(item_column)
    df2['item'] = pd.Series(item_column2)
    # df['item'] = df.apply(lambda row: list(row['rating'].keys()), axis=1)
    output = df[['rating']]
    input = df[['item']]
    input2 = df2[['item']]
    # print('data loaded')
    # data = Accuracy.CrossValidate(input, output, n_folds=5)
    # data.split()
    # print('data preprocessed')
    # tuned_param = list()
    # for i in range(4, 5):
    #     for j in range(3, 4):
    #         print(i, j)
    #         accuracy = data.evaluate_minhash(i, j)
    #         mean_score = accuracy['Mean'][0]
    #         if len(tuned_param) == 0:
    #             tuned_param = [i, j, mean_score]
    #         elif tuned_param[2] > mean_score:
    #             tuned_param = [i, j, mean_score]
    # print("best param: ", tuned_param[0], tuned_param[1])
    nrows = input.shape[0]
    numbers = list(range(nrows))
    each_fold_size = math.floor(float(nrows) / 5)
    indices = np.random.choice(numbers, each_fold_size, replace=False).tolist()
    nrows2 = input2.shape[0]
    numbers2 = list(range(nrows2))
    each_fold_size2 = math.floor(float(nrows2) / 5)
    indices2 = np.random.choice(numbers2, each_fold_size2, replace=False).tolist()
    training = input.drop(input.index[indices])
    training_y = output.drop(output.index[indices])
    test1 = input.loc[input.index[indices], :]
    test2 = input2.loc[input2.index[indices2], :]
    lsh = LSH.MinHash(training, training_y, 4, 3)
    lsh.train()
    # test on fold i
    courses = lsh.predict(test1)
    courses2 = lsh.predict(test2)
    counter = 0
    for i in courses2:
        if i not in courses:
            counter += 1
    print(counter / float(len(courses2)))
class MusicSearch:
    """
    Class for finding the similar tracks for a given test set, calculating the
    k-nearest-neighbors (knn), classifying the genre of each track and printing
    the genre classification score for each genre.

    Parameters
    ----------
    data_path : str
        Location of the data files (tracks.csv and features.csv).
    n : int
        Number of hash tables to use for LSH.
    l : int
        Length of hashes in hash tables.
    subset : str, default='small'
        Selects the FMA subset.
    feature_fields : str, default=None
        Selects subset of the features. Other choices are e.g.
        ['chroma_cens', 'tonnetz', 'spectral_contrast'].
    measure : str, default='Cosine'
        Measure for computing the similarity between feature vectors.
        Other implemented possibility: "Euclidean".
    k : int, default=5
        Amount of the most-similar tracks to consider for knn.
    magic_number : int, default=800
        Size of the random subset when calculating similarity of similar
        tracks in the course of approx. knn-computation.
    """

    def __init__(self, data_path, n, l, subset='small', feature_fields=None,
                 measure='Cosine', k=5, magic_number=800):
        if feature_fields is None:
            feature_fields = ['mfcc']
        self.data = FMA(data_path, feature_fields=feature_fields, subset=subset)
        self.lsh = LSH(self.data.features.shape[1], n, l)
        self._measure = measure
        self._k = k
        self._magic_number = magic_number
        # holds a reference to a set from FMA. For internal usage only
        self._training_set = None
        self._test_set = None

    def train(self):
        """ Builds the hash tables of LSH from the training data """
        self._training_set = self.data.get_training_data()
        for item in self._training_set:
            self.lsh.hash_data(item)

    def test(self):
        self._test_set = self.data.get_test_data()
        self.print_classification_results(self._test_set)

    def train_with_validation(self):
        """ Builds the hash tables of LSH from the validation data """
        self._training_set = self.data.get_training_with_validation_data()
        for item in self._training_set:
            self.lsh.hash_data(item)

    def test_with_validation(self):
        self._test_set = self.data.get_validation_data()
        self.print_classification_results(self._test_set)

    def find_similar_tracks(self, feature):
        """
        takes a feature vector, which is passed to every hash table and
        returns track_ids of similar tracks
        """
        result = set()
        for hash_table in self.lsh.hashes:
            result.update(hash_table.get(feature))
        return list(result)

    def calculate_similarity(self, feature, track_id):
        index = np.where(self._training_set[0].index == track_id)[0][0]
        training_feature = self._training_set[0].iloc[index]
        if self._measure == "Cosine":
            return self.cosine_similarity(feature, training_feature)
        elif self._measure == "Euclidean":
            return self.euclidean_similarity(feature, training_feature)
        else:
            raise Exception("Invalid similarity measure.\n")

    def k_neighbors(self, feature):
        """
        Returns list of track_ids of knn for given feature vector.
        self._magic_number refers to the size of the random subset of similar
        tracks, needed for the approximation of the knn problem.
        """
        similar_tracks = self.find_similar_tracks(feature)
        k_neighbors = []
        if not similar_tracks:
            return k_neighbors
        # selects a random subset of similar tracks to approximate the problem
        # and only calculates similarities for this subset
        # (replace=False so the subset contains no duplicate track_ids)
        for track_id in np.random.choice(similar_tracks,
                                         min(self._magic_number,
                                             len(similar_tracks)),
                                         replace=False):
            k_neighbors.append((track_id,
                                self.calculate_similarity(feature, track_id)))
        # (track_id, similarity)-pairs are sorted via the similarity and only
        # k-most similar tracks are returned
        if self._measure == "Cosine":
            # ideally 1 --> sorted descending
            k_neighbors = sorted(k_neighbors, key=lambda l: l[1],
                                 reverse=True)[:self._k]
        elif self._measure == "Euclidean":
            # ideally 0 --> sorted ascending
            k_neighbors = sorted(k_neighbors, key=lambda l: l[1],
                                 reverse=False)[:self._k]
        # only return the track_ids
        k_neighbors = [neighbor[0] for neighbor in k_neighbors]
        return k_neighbors

    def predict_genre(self, feature):
        """ Predicts genre for given feature vector """
        k_neighbors = self.k_neighbors(feature)
        indices = [np.where(self._training_set[0].index == track_id)[0][0]
                   for track_id in k_neighbors]
        genres_of_k_neighbors = [self._training_set[1].iloc[index]
                                 for index in indices]
        if genres_of_k_neighbors:
            return self.most_common(genres_of_k_neighbors)
        else:
            print("No similar tracks found.")
            return

    def classification_score(self, test):
        """
        Returns a dictionary containing the absolute number of correct
        predictions per genre. test[0] refers to features and test[1] refers
        to the corresponding genres.
        """
        scores_per_genres = {}
        for track_id, feature in tqdm(test[0].iterrows(), total=test[0].shape[0],
                                      position=0, leave=True):
            predicted_genre = self.predict_genre(feature)
            idx = np.where(test[0].index == track_id)[0][0]
            true_genre = test[1].iloc[idx]
            # Creates/calculates the dict entries
            if true_genre == predicted_genre:
                if true_genre not in scores_per_genres:
                    scores_per_genres[true_genre] = 1
                else:
                    scores_per_genres[true_genre] += 1
        return scores_per_genres

    def print_classification_results(self, test):
        scores_per_genres = self.classification_score(test)
        print('\nClassification Accuracy per genre:\n')
        for genre_score in scores_per_genres:
            # for the FMA "small" dataset the absolute number of correct
            # predictions equals the percentage value, since there are
            # 100 songs per genre.
            print(f'{genre_score}: {scores_per_genres[genre_score]}%')
        overall_score = np.average([scores_per_genres[count]
                                    for count in scores_per_genres])
        print('-----------------------------------------')
        print(f'Overall classification accuracy: {overall_score}%\n')

    @staticmethod
    def cosine_similarity(vec1, vec2):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    @staticmethod
    def euclidean_similarity(vec1, vec2):
        return np.linalg.norm(vec1 - vec2)

    @staticmethod
    def most_common(collection):
        return max(set(collection), key=collection.count)
def train(train_loader, model, criterion, optimizer, epoch, scheduler, log):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    tqdm_train_loader = warp_tqdm(train_loader)
    for i, (input, target) in enumerate(tqdm_train_loader):
        if args.scheduler == 'cosine':
            scheduler.step(epoch * len(train_loader) + i)
        # measure data loading time
        data_time.update(time.time() - end)
        if args.do_meta_train:
            target = torch.arange(args.meta_train_way)[:, None].repeat(
                1, args.meta_train_query).reshape(-1).long()
        target = target.cuda(0, non_blocking=True)

        # compute output
        r = np.random.rand(1)
        if args.beta > 0 and r < args.cutmix_prob:
            # generate mixed sample
            lam = np.random.beta(args.beta, args.beta)
            rand_index = torch.randperm(input.size()[0]).cuda()
            target_a = target
            target_b = target[rand_index]
            bbx1, bby1, bbx2, bby2 = rand_bbox(input.size(), lam)
            input[:, :, bbx1:bbx2, bby1:bby2] = input[rand_index, :, bbx1:bbx2, bby1:bby2]
            # adjust lambda to exactly match pixel ratio
            lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                       (input.size()[-1] * input.size()[-2]))
            # compute output
            output = model(input)
            loss = criterion(output, target_a) * lam + criterion(
                output, target_b) * (1. - lam)
        else:
            input = input.cuda()
            output = model(input)
            if args.do_meta_train:
                output = output.cuda(0)
                # print("shape of output is ", output.size())
                shot_proto = output[:args.meta_train_shot * args.meta_train_way]
                query_proto = output[args.meta_train_shot * args.meta_train_way:]
                shot_proto = shot_proto.reshape(args.meta_train_way,
                                                args.meta_train_shot, -1).mean(1)
                # print("query is ", query_proto.size)
                # print("shot is ", shot_proto.size)
                # output = -get_metric(args.meta_train_metric)(shot_proto, query_proto)

                # perform quantization
                query_proto = do_quantize(query_proto, args.quantization, True)
                shot_proto = do_quantize(shot_proto, args.quantization, True)
                if args.meta_train_metric == "LSH":
                    # LSH code
                    lsh = LSH.LSH(lsh_size, args.quantization, 256)
                    for ind, out in enumerate(shot_proto):
                        # index will work!! (bc of how target reshape happens)
                        lsh.append(out.cpu(), ind)
                    output = torch.tensor(
                        [lsh.search_dist(item.cpu()) for item in query_proto],
                        dtype=torch.float32, requires_grad=True)
                    output = output.float().cuda()
                elif args.meta_train_metric == "mcam":
                    output = -mcam_calc(shot_proto, query_proto)  # TODO: should be negative??
                else:
                    output = -get_metric(args.meta_train_metric)(shot_proto,
                                                                 query_proto)
                # print("final output shape is ", output.shape)
            loss = criterion(output, target)

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))
        if not args.disable_tqdm:
            tqdm_train_loader.set_description('Acc {:.2f}'.format(top1.avg))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            log.info('Epoch: [{0}][{1}/{2}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                     'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                         epoch, i, len(train_loader), batch_time=batch_time,
                         data_time=data_time, loss=losses, top1=top1, top5=top5))
import json

from LSH import *

lsh = LSH(
    datafile="data_for_lsh.csv",
    dim=10,
    r=50,
    b=100,
)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
# lsh.display_contents_of_all_hash_bins_pre_lsh()
# similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors('sample0_1')
# print(similarity_neighborhoods)
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence(
    similarity_groups)
similarity_group_mean_values = lsh.merge_similarity_groups_with_l2norm_sample_based(
    coalesced_similarity_groups)

with open('output.json', 'w') as file:
    json.dump(similarity_group_mean_values, file)
def metric_class_type(gallery, query, train_label, test_label, shot,
                      train_mean=None, norm_type='CL2N'):
    # normalizing
    if norm_type == 'CL2N':
        gallery = gallery - train_mean
        gallery = gallery / LA.norm(gallery, 2, 1)[:, None]
        query = query - train_mean
        query = query / LA.norm(query, 2, 1)[:, None]
    elif norm_type == 'L2N':
        gallery = gallery / LA.norm(gallery, 2, 1)[:, None]
        query = query / LA.norm(query, 2, 1)[:, None]
    gallery = gallery.reshape(args.meta_val_way, shot, gallery.shape[-1]).mean(1)
    train_label = train_label[::shot]

    # perform quantization
    gallery = do_quantize(gallery, args.quantization, False)
    query = do_quantize(query, args.quantization, False)

    nearest_samples = np.zeros((1, 75))
    distance = np.zeros((5, 75))
    if args.eval_metric == "LSH":
        # LSH code
        lsh = LSH.LSH(lsh_size, args.quantization, 64)
        # do hash
        sig_dict = {}
        # gallery = torch.tensor(gallery)
        '''for i, x in enumerate(gallery):
            l = x.as_bytes()
            if l in sig_dict:
                gallery[i] = sig_dict[l]
            else:
                #print("sizeof x is ", x.shape)
                gallery[i] = lsh.signature(x)
                sig_dict.append(y, gallery[i])'''
        # gallery = tf.vectorized_map()
        for ind, out in enumerate(gallery):
            lsh.append(out, train_label[ind])
        nearest_samples = np.array(
            [lsh.search(torch.tensor(item)) for item in query])
    elif args.eval_metric == "mcam":
        # WILL NOT WORK --> NEED NP VERSION
        distance = mcam_calc(gallery, query, True)
    elif args.eval_metric == "cosine":
        distance = 1. - sklearn.metrics.pairwise.cosine_similarity(
            gallery, query)  # cosine (1. - ???)
    elif args.eval_metric == "euclidean_squared":
        distance = ((gallery[:, None, :] - query)**2).sum(2)
    elif args.eval_metric == "manhattan":
        distance = sklearn.metrics.pairwise.manhattan_distances(
            gallery, query)  # man
    elif args.eval_metric == "chebyshev":
        distance = sklearn.metrics.pairwise_distances(
            gallery, query, metric='chebyshev')  # cheb
    else:
        distance = LA.norm(gallery[:, None, :] - query, 2, axis=-1)  # euclidean

    if args.eval_metric != "LSH":
        idx = np.argpartition(distance, args.num_NN, axis=0)[:args.num_NN]
        nearest_samples = np.take(train_label, idx)
    out = mode(nearest_samples, axis=0)[0]
    out = out.astype(int)
    test_label = np.array(test_label)
    acc = (out == test_label).mean()
    return acc
def run_community_detection(self, seeds, n_accounts=50, n_seeds=5,
                            result_interval=10, runtime_file=None):
    """
    runs community detection from seeds
    :param seeds: the seeds to start the community detection with. The process appends to the seeds
    :param community_sizes: the total size of the ground truthed communities
    :param n_accounts: the maximum number of accounts to grow the community to
    :param n_seeds: the number of seeds to use
    :param min_seed_followers: seeds need to have more than this number of followers
    :param max_followers: don't allow any accounts larger than this value to form communities
    :param generate_seeds: if true, randomly select seeds from a given tag class. Otherwise read them from file
    :param result_interval: the number of accounts that are added to the seeds between each reading of the recall
    :param runtime_file: write the runtime of these methods to file
    """
    start_time = time()
    # Use the locality sensitive hashing table to conduct an initial nearest neighbours search
    if not isinstance(self.lsh_table, list):
        print('loading lsh lookup table')
        self.load_lsh_table()
    print('running lsh query')
    print(seeds)
    self.lsh_candidates = LSH.run_query(seeds, self.signatures, self.lsh_table,
                                        return_query_id=True)
    # reduce the signatures matrix to only work with nearby accounts
    self.active_signatures = self.signatures.ix[
        self.lsh_candidates.active_indices, 1:].values
    n_candidates = len(self.active_signatures)
    if n_candidates < n_accounts:
        print("not all community members are active. Will only consider",
              n_candidates, "of the", n_accounts, "accounts")
        n_additions = n_candidates
    else:
        n_additions = n_accounts
    # implement a new lookup
    # find the jaccard distance to all non-seeds averaged over the seeds
    ast0 = time()
    account_similarities = self.calculate_initial_average_similarity(seeds)
    avg_sim_time = time() - ast0
    self.output_best_initial_averages(account_similarities, seeds, tags,
                                      n_seeds, n_accounts, result_interval,
                                      file_name='initial_avgs.csv')
    prt0 = time()
    R = self.pageRank(account_similarities, seeds, print_full_info=True)
    pr_time = time() - prt0
    self.output_best_initial_averages(R, seeds, tags, n_seeds, n_accounts,
                                      result_interval,
                                      file_name='pagerank.csv')
    self.used_ids = {}
    srt0 = time()
    for idx in range(n_additions):
        # Adds the next most similar account to each group of seeds and updates the
        # average distance from the community members to all other accounts
        self.increment_communities(account_similarities, seeds)
        # record the recall every few additions
        if (idx + 1) % 10 == 0:
            print(idx + 1, 'accounts added')
    sim_rank_time = time() - srt0
    if runtime_file:
        writer = csv.writer(runtime_file)
        community = self.outfolder.rsplit('/', 1)[-1]
        writer.writerow(['page_rank', community, pr_time])
        writer.writerow(['min_rank', community, sim_rank_time])
        writer.writerow(['avg_sim_time', community, avg_sim_time])
    print('added', n_accounts, 'into each of', len(seeds),
          'communities in', time() - start_time, 'seconds')
    return seeds
#! encoding utf-8
'''
'''
import LSH
import readNewsFiles
import Apriori

if __name__ == '__main__':
    data_ls = readNewsFiles.get_news_data()
    array = LSH.Shingling(data_ls)
    signature_ls, rowNums = LSH.minHash(array)
    LSH.computeJaccardSimilarityAfterMinhash(signature_ls)
    s = 0.60  # similarity threshold
    r = 20    # rows per band
    bandNum = int(rowNums / r)
    print(bandNum)
    # buckets = LSH.MyLSHashing(signature_ls, bandNum, s, r)
    # buckets = LSH.MylocalitySensitiveHashing(signature_ls, bandNum, s, r)
    buckets = LSH.localitySensitiveHashing(signature_ls, bandNum, s, r)
    I_set_list = LSH.buckets_sort(buckets)
    for ii in I_set_list:
        print(ii)
    buckets_dict = LSH.hash_buckets(I_set_list)
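# For reference, with b bands of r rows each, two documents whose minhash
# signatures agree with Jaccard similarity s become candidates with probability
# 1 - (1 - s**r)**b, and the effective similarity threshold sits near
# (1/b)**(1/r). The quick check below plugs in the values used above
# (r = 20, s = 0.60) under the assumption that the signature length rowNums is
# 200, i.e. bandNum = 10; that signature length is illustrative only.
b, r, s = 10, 20, 0.60
candidate_probability = 1 - (1 - s ** r) ** b
approx_threshold = (1.0 / b) ** (1.0 / r)
print(candidate_probability)  # very small: s = 0.6 lies below the threshold
print(approx_threshold)       # roughly 0.89 for b = 10, r = 20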
# -*- coding:utf-8 -*-
import cv2
import numpy as np
from cv2 import imread, imshow

import LSH

result = []
# image = cv.LoadImage("target.jpg", 1)
lsh = LSH.LSH(100, 128, table_num=5)
kp = [[] for i in range(155)]
sift = cv2.SIFT_create(2000)
for i in range(1, 155):
    print(i)
    image1 = cv2.imread("/Users/mcdreamy/Documents/test2/%d.jpg" % i, 1)
    x = 0
    image1 = cv2.resize(image1, (0, 0), fx=0.4, fy=0.4)
    tkp, des = sift.detectAndCompute(image1, None)
    print(len(tkp))
    kp[i - 1].append(tkp)
    for vec in des:
        # print(vec)
        lsh.index(vec, i * 100000 + x)
        x = x + 1

while 1:
    file_name = input("please input the file name")
    image = cv2.imread(file_name)
    tkp, des = sift.detectAndCompute(image, None)
    x = 0
def run(file_location, seed, k=128, b=16, r=8, partial=False, skip=0, take=-1):
    start_time = timer()
    print("==============================================")
    print(" PREPROCESSING")
    print("==============================================")
    movie_data = None
    user_movies_sparse_matrix = None
    if not partial:
        print("\nRunning sequential...")
        print("\nLoading data and generating matrix...")
        user_movies_sparse_matrix = data.transform_to_sparse_matrix(
            data.load(file_location, skip, take), partial)
    else:
        print("\nRunning partial...")
        print("\nLoading data...")
        movie_data = data.load(file_location, skip, take)
        print("Generating matrix...")
        user_movies_sparse_matrix = data.transform_to_sparse_matrix(
            movie_data, partial)
    matrix_shape = user_movies_sparse_matrix.shape
    print("Shape: " + str(matrix_shape[1]) + " users | " +
          str(matrix_shape[0]) + " movies.")
    nr_movies = matrix_shape[0]
    nr_users = matrix_shape[1]
    util.print_time(start_time)
    print()

    print("==============================================")
    print(" Min Hashing")
    print("==============================================")
    print("\nGenerating signature matrix...")
    print("Length: " + str(k))
    if not partial:
        signature_matrix = mh.generate_signature(user_movies_sparse_matrix, k,
                                                 nr_users, nr_movies, seed)
    else:
        signature_matrix = mh.build_signature_in_parts(
            k, nr_users, nr_movies, user_movies_sparse_matrix, seed)
    util.print_time(start_time)
    print()

    print("==============================================")
    print(" LSH")
    print("==============================================")
    print("\nNumber of bands: " + str(b))
    print("Number of rows per band: " + str(r))
    print()

    # free memory
    del user_movies_sparse_matrix
    gc.collect()

    if movie_data is None:
        movie_data = data.load(file_location, skip, take)
    user_movies_matrix = data.transform_to_dense_matrix(movie_data)
    nr_found, nr_buckets = LSH.apply(signature_matrix, user_movies_matrix, b,
                                     r, start_time)
    print_report(nr_found, nr_buckets, start_time, matrix_shape, k, b, r, seed)
import json
import timeit

from LSH import *

lsh = LSH(
    datafile="data_for_lsh.csv",
    dim=10,
    r=50,
    b=100,
)

start = timeit.default_timer()
new_data = [-0.067, -0.015, 0.907, 0.034, -0.05, -0.017, -0.144, -0.204, -0.042, 0.013]
with open('output.json', 'r') as file:
    similarity_group_mean_values = json.load(file)
similar_set = lsh.search_for_similar_set(similarity_group_mean_values, new_data)
end = timeit.default_timer()
print('==============')
print(similar_set)
print('Running time is ' + str(end - start) + 's')
from LSH import *

lsh = LSH(
    datafile="xyz_1000_new.csv",
    dim=50,
    r=20,
    b=10,
)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()
similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()
def OASIS_example(dataset, update_metric, update_lsh):
    print("Dataset:", dataset)
    print("Metric update method:", update_metric)
    print("LSH update method:", update_lsh)
    total_lsh_update_time = 0
    # update_metric = 'online'
    # update_lsh = 'rebuild'
    X_example, y_example = X[:100], y[:100]
    if dataset == "hepmass":
        tn_neighbors = 10
        lamda = 0.0005
    elif dataset == "made":
        tn_neighbors = 20
        lamda = 0.0001
    lsh_maintainer_flag = False
    recalls_10 = []
    recalls_50 = []
    recalls_100 = []
    result_acc = 0
    accuracy = 0
    metric_L = np.eye(X_test.shape[1])

    # build the LSH index
    t0 = timeit.default_timer()
    index_flag = True
    metric_build = np.copy(metric_L)
    # parameters for synthetic dataset
    if dataset == "made":
        width = 12
    # parameters for hepmass
    elif dataset == "hepmass":
        width = 7
    lsh = LSH.LSH(dataset, width=width)
    lsh.build(X, metric_build)
    total_lsh_build_time = timeit.default_timer() - t0
    print("LSH Index Build...")
    print("Total LSH index building time is", total_lsh_build_time)

    for i in range(X_test.shape[0]):
        new_element = np.dot(X_test[i], metric_L.T)
        # extract candidates
        candidate = lsh.query(X_test[i])
        # extract ground truth distance
        training_set_ = np.dot(X, metric_L.T)
        # extract the ground truth n_neighbors
        e2distances_ = helpers.euclidean_distances(new_element, training_set_)

        # calculate the recall varying number of n
        n_neighbors = 10
        knn_points_index_10 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_10 += [len(np.intersect1d(candidate, knn_points_index_10))]
        n_neighbors = 50
        knn_points_index_50 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_50 += [len(np.intersect1d(candidate, knn_points_index_50))]
        n_neighbors = 100
        knn_points_index_100 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_100 += [len(np.intersect1d(candidate, knn_points_index_100))]

        # perform KNN classification on the example set
        class_label = y_test[i]
        example_set = np.dot(X_example, metric_L.T)
        knn_pred_idx = np.argsort(
            helpers.euclidean_distances(new_element, example_set))[0, :n_neighbors]
        knn_pred = mode(y_example[knn_pred_idx])[0][0]
        cor_pred = (knn_pred == class_label)
        accuracy += cor_pred

        # insert the example if KNN classification is incorrect
        if not cor_pred:
            X_example = np.append(X_example, X_test[i][np.newaxis, ], axis=0)
            y_example = np.append(y_example, y_test[i])
            if update_metric == 'online':
                # update the metric on the new object
                metric_L = Online_Metric_Learning.online_metric_learning(
                    X_example, X_test[i][np.newaxis, ], y_example,
                    tn_neighbors, metric_L, class_label, lamda)
            t0 = timeit.default_timer()
            if update_lsh == 'rebuild':
                lsh.rebuild(X, metric_L)
            elif update_lsh == 'online':
                # if this is the first online update, initialize the LSH maintainer
                if not lsh_maintainer_flag:
                    lsh_maintainer = LSH_update.lsh_maintainer(lsh, X)
                    lsh_maintainer_flag = True
                lsh_maintainer.update(metric_L)
            total_lsh_update_time += timeit.default_timer() - t0

    print("Total lsh update time is:", total_lsh_update_time)
    print("The recall when n = 10:", sum(recalls_10) / 10 / len(y_test))
    print("The recall when n = 50:", sum(recalls_50) / 50 / len(y_test))
    print("The recall when n = 100:", sum(recalls_100) / 100 / len(y_test))
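# Reading the recall numbers above: for every test query, recalls_10 stores how
# many of that query's 10 true nearest neighbours appear among the LSH
# candidates, so sum(recalls_10) / 10 / len(y_test) is the mean recall@10.
# A made-up illustration: if 2 queries retrieve 9 and 7 of their 10 true
# neighbours, the reported recall@10 would be (9 + 7) / 10 / 2 = 0.8.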
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# by zhangzhi @2013-10-11 23:45:57
# Copyright 2013 NONE rights reserved.
import LSH
import SigGen

if __name__ == '__main__':
    sigMFileName = 'SIG_M.pickle'
    # load file
    g_code2Shg, g_code2Doc, g_shg2Code, g_doc2Code, sigM = SigGen.loadSigM(sigMFileName)
    print(sigM)

    ret = LSH.sigMTopN(sigM, 50, 2, g_doc2Code[1], 100)
    print(ret)
def import_biz_user(df):
    uniqueuserid = df.user_id.unique()
    uniquebizid = df.business_id.unique()
    header = ["biz_id"]
    header.extend([i for i in uniqueuserid])
    for t, bizid in enumerate(uniquebizid):
        df2 = get_restaurant_reviews(bizid, df)
        bizname = df2.biz_name.values[0]
        userids = df2.user_id.values
        stars = df2.stars.values
        average = df2.user_avg.values
        row = [0 for i in range(len(uniqueuserid))]
        # use a separate index for the inner loop so `t` keeps counting businesses
        for j, userid in enumerate(userids):
            index = header.index(userid)
            row[index - 1] = stars[j] - average[j]
        biz_user.insert({'_id': bizid, 'biz_name': bizname, 'vector': row})
        if t % 100 == 0:
            print(t)


if __name__ == '__main__':
    fulldf = pd.read_csv("bigdf.csv")
    smallidf = fulldf[(fulldf.user_review_count > 60) &
                      (fulldf.business_review_count > 150)]
    smalldf = recompute_frame(smallidf)
    biz_user = db.biz_user
    import_biz_user(smalldf)
    lsh = LSH(240)
    lsh.populate(biz_user.find())
from LSH import *

lsh = LSH(datafile="img_for_lsh_new.csv", dim=100, r=50, b=100, num_clusters=4)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()
# similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence(
    similarity_groups)
merged_similarity_groups = lsh.merge_similarity_groups_with_l2norm_sample_based(
    coalesced_similarity_groups)
choice = int(input())
if choice == 1:
    num = int(input("How many words do you wish to insert?\n"))
    rndmW.generate(num, 1)
    numOfQueries = int(input("how many queries do you wish to perform?\n"))
    bloomTime = bloomfilter.main(numOfQueries)
    print("Time for %d membership queries in a Bloom Filter is: %.7f"
          % (numOfQueries, bloomTime))
    rndmW.generateCSV(num)
    bplusTime = bplustree.main(numOfQueries)
    print("Time for %d membership queries in b+ Trees is: %.7f"
          % (numOfQueries, bplusTime))
if choice == 2:
    #########################################################
    """now we compare document similarity: LSH versus cosine similarity"""
    numOfArticles = int(input("How many articles do you wish to compare ? \n"))
    SimilarityMatrix1 = LSH.main(numOfArticles)
    cosine_similarity_matrix = CosineSimilarity.main(numOfArticles)
    print("The Cosine Similarity Method suggests: ")
    """
    for x in range(len(cosine_similarity_matrix)):
        for y in range(len(cosine_similarity_matrix[x])):
            print("%.5f" % (cosine_similarity_matrix[x][y]), end=" ")
            pass
        print("")
    """
    for x in range(0, len(cosine_similarity_matrix)):
        for y in range(1, len(cosine_similarity_matrix[x])):
            if (y <= x):
                print(" ", end=" ")
            else: