Example #1
    def __init__(self, data_path, n, l, subset='small', feature_fields=None, measure='Cosine', k=5, magic_number=800):
        if feature_fields is None:
            feature_fields = ['mfcc']
        self.data = FMA(data_path, feature_fields=feature_fields, subset=subset)
        self.lsh = LSH(self.data.features.shape[1], n, l)
        self._measure = measure
        self._k = k
        self._magic_number = magic_number

        # holds a reference to a set from FMA. For internal usage only
        self._training_set = None
        self._test_set = None
Example #2
    def evaluate_minhash(self, n, p):
        mean_accuracy = 0
        headers = []
        for i in range(1, self.n_folds + 1):
            headers.append("fold {}".format(i))
        headers.append("Mean")

        accuracy_list = []
        for i in range(1, self.n_folds + 1):
            indices = self.data_folds[i]
            training = self.input.drop(self.input.index[indices])
            training_y = self.output.drop(self.output.index[indices])
            test = self.input.loc[self.input.index[indices], :]
            test_y = self.output.loc[self.output.index[indices], :]
            lsh = LSH.MinHash(training, training_y, n, p)
            lsh.train()
            lsh.predict(test, 5, 1)
            correct, counter, accuracy = lsh.accuracy(test_y)
            accuracy_list.append(accuracy)
            mean_accuracy += accuracy
        accuracy_list.append(float(mean_accuracy) / self.n_folds)
        accuracy_table = pd.DataFrame([accuracy_list], columns=headers)
        accuracy_table = accuracy_table.rename(index={0: "result"})
        print(accuracy_table)
        return accuracy_table
Example #3
def main(opt):

    dataset = torchvision.datasets.Omniglot(
        root="./data",
        download=True,
        background=False,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.Resize([28, 28]),
            torchvision.transforms.ToTensor(),
        ]))

    model = torch.load(opt['model.model_path'], encoding='utf-8')
    model.eval()

    nlen = [int(0.8 * len(dataset)), int(0.2 * len(dataset))]
    trainset, testset = torch.utils.data.random_split(dataset, nlen)

    dataset = 0

    train_values = []
    for d in tqdm.tqdm(trainset):
        train_values.append(d[1])
    test_values = []
    for d in tqdm.tqdm(testset):
        test_values.append(d[1])
    n_way = opt['data.test_way']  #50
    n_shot = opt['data.test_shot']

    acc = 0
    itr = 10000
    lsh = LSH.LSH(64, opt['dist.qbits'], opt['memsize'])
    for it in tqdm.tqdm(range(itr)):
        k = random.sample(train_values, n_way)
        q = random.sample(k, 1)
        while not (q[0] in test_values):
            q = random.sample(k, 1)
        support = []
        support_val = []
        for i in k:
            s = get_values(train_values, i, n_shot)
            for j in s:
                x = model.encoder.forward(
                    1 - trainset[j][0][-1, :, :].reshape([1, 1, 28, 28]))
                lsh.append(x, i)
        s = get_values(test_values, q[0], 1)
        x = model.encoder.forward(
            1 - testset[s[0]][0][-1, :, :].reshape([1, 1, 28, 28]))

        y_s = lsh.search(x)
        if y_s == q[0]:
            acc = acc + 1

    print("Accuracy : ", acc * 100 / (it + 1))
Example #4
    def evaluate_minhash(self, b, r):
        """
        evaluates Min-hash
        """
        mean_accuracy = 0
        mean_coverage = 0
        headers = []
        for i in range(1, self.n_folds + 1):
            headers.append("fold {}".format(i))
        headers.append("Mean")

        accuracy_list = []
        coverage_list = []
        for i in range(1, self.n_folds + 1):
            print("fold {}".format(i))
            d = []
            p = []
            indices = self.data_folds[i]
            training = self.input.drop(self.input.index[indices])
            training_y = self.output.drop(self.output.index[indices])
            test = self.input.loc[self.input.index[indices], :]
            test_y = self.output.loc[self.output.index[indices], :]
            '''train without fold i'''
            lsh = LSH.MinHash(training, training_y, b, r)
            lsh.train()
            '''test on fold i'''
            courses = lsh.predict(test)
            '''calculate rmse'''
            rmse = lsh.accuracy(test_y, d, p)
            accuracy_list.append(rmse)
            mean_accuracy += rmse
            '''calculate coverage. We have defined coverage as follows:
               coverage = # of unique items we have recommended on the test set / # of all items
            '''
            for item in courses:
                if item not in self.recommended:
                    self.recommended.append(item)
            c = len(self.recommended) / float(lsh.item_num)
            mean_coverage += c
            coverage_list.append(c)
        coverage_list.append(float(mean_coverage) / self.n_folds)
        accuracy_list.append(float(mean_accuracy) / self.n_folds)
        accuracy_table = pd.DataFrame([accuracy_list, coverage_list],
                                      columns=headers)
        accuracy_table = accuracy_table.rename(index={
            0: "RMSE",
            1: "Coverage"
        })
        print(accuracy_table)

        return accuracy_table
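
For orientation, here is a minimal driver sketch for this evaluator, modeled on the commented-out cross-validation calls in Example #8 below; the Accuracy module name, the CSV file and its column layout are assumptions.

import ast
import pandas as pd
import Accuracy  # assumed module providing the CrossValidate class that owns evaluate_minhash

# load user/rating pairs the way Example #8 does (ratings stored as dict literals)
df = pd.read_csv("old.csv", names=["user", "rating"])
df["rating"] = df["rating"].apply(ast.literal_eval)
df["item"] = df["rating"].apply(list)  # the item ids are the dict keys

data = Accuracy.CrossValidate(df[["item"]], df[["rating"]], n_folds=5)
data.split()
accuracy_table = data.evaluate_minhash(4, 3)  # b=4 bands, r=3 rows per band
print(accuracy_table["Mean"])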
Example #5
def meta_val(test_loader, model, train_mean=None):
    top1 = AverageMeter()
    model.eval()

    with torch.no_grad():
        tqdm_test_loader = warp_tqdm(test_loader)
        for i, (inputs, target) in enumerate(tqdm_test_loader):
            target = target.cuda(0, non_blocking=True)
            output = model(inputs, True)[0].cuda(0)
            if train_mean is not None:
                output = output - train_mean
            train_out = output[:args.meta_val_way * args.meta_val_shot]
            train_label = target[:args.meta_val_way * args.meta_val_shot]
            test_out = output[args.meta_val_way * args.meta_val_shot:]
            test_label = target[args.meta_val_way * args.meta_val_shot:]
            train_out = train_out.reshape(args.meta_val_way,
                                          args.meta_val_shot, -1).mean(1)
            train_label = train_label[::args.meta_val_shot]
            #prediction = metric_prediction(train_out, test_out, train_label, args.meta_val_metric)

            #perform quantization
            train_out = do_quantize(train_out, args.quantization, True)
            test_out = do_quantize(test_out, args.quantization, True)

            prediction = torch.zeros(75, 5)
            if args.meta_val_metric == "LSH":
                lsh = LSH.LSH(lsh_size, args.quantization, 256)
                for ind, out in enumerate(train_out):
                    lsh.append(out.cpu(), train_label[ind].item())
                prediction = torch.tensor(
                    [lsh.search(item.cpu()) for item in test_out])
            elif args.meta_val_metric == "mcam":
                prediction = mcam_calc(test_out, train_out)
            else:
                prediction = metric_prediction(train_out, test_out,
                                               train_label,
                                               args.meta_val_metric)

            #output = torch.tensor([lsh.search_dist(item.cpu()) for item in query_proto], dtype=torch.float32, requires_grad=True) #need to think about order here...
            #output = output.cuda()
            #prediction = torch.tensor([lsh.search(item.cpu()) for item in test_out])
            acc = (prediction.cuda() == test_label).float().mean()
            #print("meta val acc is ", acc)

            top1.update(acc.item())
            if not args.disable_tqdm:
                tqdm_test_loader.set_description('Acc {:.2f}'.format(top1.avg *
                                                                     100))
    return top1.avg
Example #6
class Database:
    "A class representing a database of each user's neighbourhoods and ratings"
    
    def __init__(self, df):
        self.neighbour = db.cf_neighbour
        self.lsh = LSH(4503)
        self.users = df.user_id.unique()
        self.df = df
        
    def populate_by_calculating(self, k=5):
        """
        a populator for generating each user's top k nearest neighbourhoods.
        """
        users = self.users
        for t, user_id in enumerate(users):
            topK = self.lsh.topK(user_id)
            self.neighbour.insert({'_id':user_id, 'neighbours':topK})
            if t % 100 == 0:
                print t
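
A brief usage sketch for the Database class above; the CSV name and the presence of a user_id column are assumptions, and `db` plus the LSH implementation are expected to come from the example's own module.

import pandas as pd

# hypothetical driver: any DataFrame with a user_id column will do
ratings_df = pd.read_csv("bigdf.csv")
database = Database(ratings_df)
database.populate_by_calculating(k=5)  # writes each user's top-k neighbours into db.cf_neighbour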
Example #7
File: views.py  Project: DasongLi/a-forum
def picture_search(request):
    global username
    if request.method == "POST":
        #photo=request.FILES['photo']
        #print(photo)
        #cv2.imwrite("/static/upload_img/target.jpg",photo)
        #signature = request.POST['signature']
        file_content = ContentFile(request.FILES['img'].read())
        photo = request.FILES['img']
        print(photo)
        user = Picture_search.objects.create(photo=photo)
        user.save()
        '''user2 = BBS_users.objects.get(username = username)
		user2.signature = signature
		user2.photo =request.FILES['img']
        #img = ImageStore(name = request.FILES['img'].name, img =request.FILES['img'])  
    	#img.save()
    	user2.save()'''
    pic_list = LSH.LSH(
        "/home/lds/Documents/django_project/Helloworld/media/search_file/" +
        str(photo))
    for i in range(len(pic_list)):
        pic_list[i] = pic_list[i][:-4]
    user1 = BBS_users.objects.get(username=username)
    template = loader.get_template('picture_result.html')
    context = {
        'login_out': 'login out',
        'edit_you': 'edit you',
        'username': user1.username,
        'img': img,
        'register': 'change user',
        'signature': user1.signature,
        'logout_href': '/login/',
        'change_user_href': '/login/',
        'pic_list': pic_list,
    }
    return HttpResponse(template.render(context, request))
Example #8
def main():
    df = pd.read_csv("old.csv", names=['user', 'rating'])
    df2 = pd.read_csv("new.csv", names=['user', 'rating'])

    df['rating'] = df.apply(lambda row: ast.literal_eval(row['rating']),
                            axis=1)
    df2['rating'] = df2.apply(lambda row: ast.literal_eval(row['rating']),
                              axis=1)
    item_column = []
    item_column2 = []
    for i in range(df.shape[0]):
        item_column.append(list(df.iloc[i]['rating']))

    for i in range(df2.shape[0]):
        item_column2.append(list(df2.iloc[i]['rating']))
    df['item'] = pd.Series(item_column)
    df2['item'] = pd.Series(item_column2)
    # df['item'] = df.apply(lambda row: list(row['rating'].keys()), axis=1)
    output = df[['rating']]
    input = df[['item']]
    input2 = df2[['item']]
    # print('data loaded')
    # data = Accuracy.CrossValidate(input, output, n_folds=5)
    # data.split()
    # print('data preprocessed')
    # tuned_param = list()
    # for i in range(4, 5):
    #     for j in range(3, 4):
    #         print(i, j)
    #         accuracy = data.evaluate_minhash(i, j)
    #         mean_score = accuracy['Mean'][0]
    #         if len(tuned_param) == 0:
    #             tuned_param = [i, j, mean_score]
    #         elif tuned_param[2] > mean_score:
    #             tuned_param = [i, j, mean_score]
    # print("best param: ", tuned_param[0], tuned_param[1])

    nrows = input.shape[0]
    numbers = list(range(nrows))
    each_fold_size = math.floor(float(nrows) / 5)
    indices = np.random.choice(numbers, each_fold_size, replace=False).tolist()

    nrows2 = input2.shape[0]
    numbers2 = list(range(nrows2))
    each_fold_size2 = math.floor(float(nrows2) / 5)
    indices2 = np.random.choice(numbers2, each_fold_size2,
                                replace=False).tolist()

    training = input.drop(input.index[indices])
    training_y = output.drop(output.index[indices])
    test1 = input.loc[input.index[indices], :]
    test2 = input2.loc[input2.index[indices2], :]

    lsh = LSH.MinHash(training, training_y, 4, 3)
    lsh.train()
    '''test on fold i'''
    courses = lsh.predict(test1)
    courses2 = lsh.predict(test2)

    counter = 0
    for i in courses2:
        if i not in courses:
            counter += 1

    print(counter / float(len(courses2)))
Example #9
class MusicSearch:
    """ Class for finding the similar tracks for a given test set, calculating the k-nearest-neighbors (knn), 
        classifying the genre of each track and printing the genre classification score for each genre. 
        
        Parameters
        ----------
        data_path : str
            Location of the data files (tracks.csv and features.csv).

        n : int
            Number of hash tables to use for LSH.
                
        l : int
            Length of hashes in hash tables.
            
        subset : str, default='small'
            Selects the FMA subset.
            
        feature_fields : list of str, default=None
            Selects a subset of the features; defaults to ['mfcc'] if None. Other choices are e.g. ['chroma_cens', 'tonnetz', 'spectral_contrast'].

        measure : str, default='Cosine'
            Measure for computing the similarity between feature vectors. The other implemented option is "Euclidean".

        k : int, default=5
            Number of most-similar tracks to consider for knn.

        magic_number : int, default=800
            Size of the random subset of similar tracks used to approximate the knn computation.
        """
        
    def __init__(self, data_path, n, l, subset='small', feature_fields=None, measure='Cosine', k=5, magic_number=800):
        if feature_fields is None:
            feature_fields = ['mfcc']
        self.data = FMA(data_path, feature_fields=feature_fields, subset=subset)
        self.lsh = LSH(self.data.features.shape[1], n, l)
        self._measure = measure
        self._k = k
        self._magic_number = magic_number
        
        # holds a reference to a set from FMA. For internal usage only
        self._training_set = None
        self._test_set = None

    def train(self):
        """ Builds the hash tables of LSH from the training data """
        
        self._training_set = self.data.get_training_data()
        for item in self._training_set:
            self.lsh.hash_data(item)

    def test(self):
        self._test_set = self.data.get_test_data()
        self.print_classification_results(self._test_set)

    def train_with_validation(self):
        """ Builds the hash tables of LSH from the validation data """

        self._training_set = self.data.get_training_with_validation_data()
        for item in self._training_set:
            self.lsh.hash_data(item)

    def test_with_validation(self):
        self._test_set = self.data.get_validation_data()
        self.print_classification_results(self._test_set)

    def find_similar_tracks(self, feature):
        """ takes a feature vector, which is passed to every hash table 
         and returns track_ids of similar tracks """ 
         
        result = set()
        for hash_table in self.lsh.hashes:
            result.update(hash_table.get(feature))

        return list(result)

    def calculate_similarity(self, feature, track_id):
        index = np.where(self._training_set[0].index == track_id)[0][0]
        training_feature = self._training_set[0].iloc[index]

        if self._measure == "Cosine":
            return self.cosine_similarity(feature, training_feature)

        elif self._measure == "Euclidean":
            return self.euclidean_similarity(feature, training_feature)

        else:
            raise Exception("Invalid similarity measure.\n")

    def k_neighbors(self, feature):
        """ Returns list of track_ids of knn for given feature vector. 
            self._magic_number refers to the size of the random subset of similar tracks, 
            needed for the approximation of the knn problem. 
            """
            
        similar_tracks = self.find_similar_tracks(feature)

        k_neighbors = []
        if not similar_tracks:
            return k_neighbors

        # selects a random subset of similar tracks to approximate the problem
        # and only calculates similarities for this subset
        for track_id in np.random.choice(similar_tracks, min(self._magic_number, len(similar_tracks)),
                                         replace=True): 
            k_neighbors.append((track_id, self.calculate_similarity(feature, track_id)))

        # (track_id, similarity)-pairs are sorted via the similarity and only
        # k-most similar tracks are returned    
        if self._measure == "Cosine":  # ideally 1 --> sorted descending
            k_neighbors = sorted(k_neighbors, key=lambda l: l[1], reverse=True)[:self._k]

        elif self._measure == "Euclidean":  # ideally 0 --> sorted ascending
            k_neighbors = sorted(k_neighbors, key=lambda l: l[1], reverse=False)[:self._k]

        k_neighbors = [neighbor[0] for neighbor in k_neighbors]  # only return the track_ids

        return k_neighbors

    def predict_genre(self, feature):
        """ Predicts genre for given feature vector """
        
        k_neighbors = self.k_neighbors(feature)
        indices = [np.where(self._training_set[0].index == track_id)[0][0] for track_id in k_neighbors]
        genres_of_k_neighbors = [self._training_set[1].iloc[index] for index in indices]

        if genres_of_k_neighbors:
            return self.most_common(genres_of_k_neighbors)
        else:
            print("No similar tracks found.")
            return

    def classification_score(self, test):
        """ Returns a dictionary containing the absolute number of correct 
            predictions per genre. test[0] refers to features and test[1] refers 
            to the corresponding genres. """
            
        scores_per_genres = {}
    
        for track_id, feature in tqdm(test[0].iterrows(), total=test[0].shape[0], position=0, leave=True):
            predicted_genre = self.predict_genre(feature)
            id = np.where(test[0].index == track_id)[0][0]
            true_genre = test[1].iloc[id]

            # Creates/calculates the dic-entries
            if true_genre == predicted_genre:
                if true_genre not in scores_per_genres:
                    scores_per_genres[true_genre] = 1
                else:
                    scores_per_genres[true_genre] += 1

        return scores_per_genres

    def print_classification_results(self, test):
        scores_per_genres = self.classification_score(test)

        print('\nClassification Accuracy per genre:\n')

        for genre_score in scores_per_genres:
            ''' For the FMA "small" dataset the absolute number of correct predictions 
                equals the percentage value, since there are 100 songs per genre. '''
            print(f'{genre_score}: {scores_per_genres[genre_score]}%')

        overall_score = np.average([scores_per_genres[count] for count in scores_per_genres])
        print('-----------------------------------------')
        print(f'Overall classification accuracy: {overall_score}%\n')

    @staticmethod
    def cosine_similarity(vec1, vec2):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    @staticmethod
    def euclidean_similarity(vec1, vec2):
        return np.linalg.norm(vec1 - vec2)

    @staticmethod
    def most_common(collection):
        return max(set(collection), key=collection.count)
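
A minimal end-to-end sketch of MusicSearch as defined above; the data path and the hash-table parameters n and l are placeholder values, assuming the FMA tracks.csv and features.csv live under that directory.

# hypothetical usage; data_path, n and l are placeholders
searcher = MusicSearch("data/fma_metadata/", n=10, l=5,
                       subset='small', feature_fields=['mfcc'],
                       measure='Cosine', k=5)
searcher.train()  # hash the training data into the LSH tables
searcher.test()   # print per-genre classification accuracy

# or evaluate on the validation split instead
searcher.train_with_validation()
searcher.test_with_validation()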
Example #10
def train(train_loader, model, criterion, optimizer, epoch, scheduler, log):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    tqdm_train_loader = warp_tqdm(train_loader)
    for i, (input, target) in enumerate(tqdm_train_loader):

        if args.scheduler == 'cosine':
            scheduler.step(epoch * len(train_loader) + i)
        # measure data loading time
        data_time.update(time.time() - end)

        if args.do_meta_train:
            target = torch.arange(args.meta_train_way)[:, None].repeat(
                1, args.meta_train_query).reshape(-1).long()
        target = target.cuda(0, non_blocking=True)

        # compute output
        r = np.random.rand(1)
        if args.beta > 0 and r < args.cutmix_prob:
            # generate mixed sample
            lam = np.random.beta(args.beta, args.beta)
            rand_index = torch.randperm(input.size()[0]).cuda()
            target_a = target
            target_b = target[rand_index]
            bbx1, bby1, bbx2, bby2 = rand_bbox(input.size(), lam)
            input[:, :, bbx1:bbx2, bby1:bby2] = input[rand_index, :, bbx1:bbx2,
                                                      bby1:bby2]
            # adjust lambda to exactly match pixel ratio
            lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                       (input.size()[-1] * input.size()[-2]))
            # compute output
            output = model(input)
            loss = criterion(output, target_a) * lam + criterion(
                output, target_b) * (1. - lam)
        else:
            input = input.cuda()
            output = model(input)

            if args.do_meta_train:

                output = output.cuda(0)  #0
                #print("shape of output is ", output.size())
                shot_proto = output[:args.meta_train_shot *
                                    args.meta_train_way]
                query_proto = output[args.meta_train_shot *
                                     args.meta_train_way:]
                shot_proto = shot_proto.reshape(args.meta_train_way,
                                                args.meta_train_shot,
                                                -1).mean(1)
                #print("query is ", query_proto.size)
                #print("shot is ", shot_proto.size)
                #output = -get_metric(args.meta_train_metric)(shot_proto, query_proto)

                #perform quantization
                query_proto = do_quantize(query_proto, args.quantization, True)
                shot_proto = do_quantize(shot_proto, args.quantization, True)

                if args.meta_train_metric == "LSH":
                    #LSH code
                    lsh = LSH.LSH(lsh_size, args.quantization, 256)
                    for ind, out in enumerate(shot_proto):
                        lsh.append(
                            out.cpu(), ind
                        )  #index will work!! (bc of how target reshape happens)
                    output = torch.tensor(
                        [lsh.search_dist(item.cpu()) for item in query_proto],
                        dtype=torch.float32,
                        requires_grad=True)  #dtype=torch.float32,
                    output = output.float().cuda()
                elif args.meta_train_metric == "mcam":
                    output = -mcam_calc(
                        shot_proto, query_proto)  #TODO: should be negative??
                else:
                    output = -get_metric(args.meta_train_metric)(shot_proto,
                                                                 query_proto)
                    #print("final output shape is ", output.shape)

            loss = criterion(output, target)

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))
        if not args.disable_tqdm:
            tqdm_train_loader.set_description('Acc {:.2f}'.format(top1.avg))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            log.info('Epoch: [{0}][{1}/{2}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                     'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                         epoch,
                         i,
                         len(train_loader),
                         batch_time=batch_time,
                         data_time=data_time,
                         loss=losses,
                         top1=top1,
                         top5=top5))
Example #11
from LSH import *
import json  # needed for json.dump below

lsh = LSH(datafile="data_for_lsh.csv",
          dim=10,
          r=50,
          b=100)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
# lsh.display_contents_of_all_hash_bins_pre_lsh()
# similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors('sample0_1')
# print(similarity_neighborhoods)
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()

coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence( similarity_groups )
similarity_group_mean_values = lsh.merge_similarity_groups_with_l2norm_sample_based( coalesced_similarity_groups )

with open('output.json','w') as file:
    json.dump(similarity_group_mean_values, file)
Example #12
    def __init__(self, df):
        self.neighbour = db.cf_neighbour
        self.lsh = LSH(4503)
        self.users = df.user_id.unique()
        self.df = df
Example #13
def metric_class_type(gallery,
                      query,
                      train_label,
                      test_label,
                      shot,
                      train_mean=None,
                      norm_type='CL2N'):

    # normalizing
    if norm_type == 'CL2N':
        gallery = gallery - train_mean
        gallery = gallery / LA.norm(gallery, 2, 1)[:, None]
        query = query - train_mean
        query = query / LA.norm(query, 2, 1)[:, None]
    elif norm_type == 'L2N':
        gallery = gallery / LA.norm(gallery, 2, 1)[:, None]
        query = query / LA.norm(query, 2, 1)[:, None]
    gallery = gallery.reshape(args.meta_val_way, shot,
                              gallery.shape[-1]).mean(1)
    train_label = train_label[::shot]

    # perform quantization
    gallery = do_quantize(gallery, args.quantization, False)
    query = do_quantize(query, args.quantization, False)

    nearest_samples = np.zeros((1, 75))
    distance = np.zeros((5, 75))

    if args.eval_metric == "LSH":
        #LSH code
        lsh = LSH.LSH(lsh_size, args.quantization, 64)
        #do hash
        sig_dict = {}
        #gallery = torch.tensor(gallery)
        '''for i, x in enumerate(gallery):
            l = x.as_bytes()
            if l in sig_dict:
                gallery[i] = sig_dict[l]
            else:
                #print("sizeof x is ", x.shape)
                gallery[i] = lsh.signature(x)
                sig_dict.append(y, gallery[i])'''
        #gallery = tf.vectorized_map()

        for ind, out in enumerate(gallery):
            lsh.append(out, train_label[ind])
        nearest_samples = np.array(
            [lsh.search(torch.tensor(item)) for item in query])
    elif args.eval_metric == "mcam":
        #WILL NOT WORK --> NEED NP VERSION
        distance = mcam_calc(gallery, query, True)
    elif args.eval_metric == "cosine":
        distance = 1. - sklearn.metrics.pairwise.cosine_similarity(
            gallery, query)  #cosine (1. - ???)
    elif args.eval_metric == "euclidean_squared":
        distance = ((gallery[:, None, :] - query)**2).sum(2)
    elif args.eval_metric == "manhattan":
        distance = sklearn.metrics.pairwise.manhattan_distances(
            gallery, query)  #man
    elif args.eval_metric == "chebyshev":
        distance = sklearn.metrics.pairwise_distances(
            gallery, query, metric='chebyshev')  #cheb
    else:
        distance = LA.norm(gallery[:, None, :] - query, 2, axis=-1)  #euclidean

    if args.eval_metric != "lsh":
        idx = np.argpartition(distance, args.num_NN, axis=0)[:args.num_NN]
        nearest_samples = np.take(train_label, idx)

    out = mode(nearest_samples, axis=0)[0]
    out = out.astype(int)
    test_label = np.array(test_label)
    acc = (out == test_label).mean()
    return acc
Example #14
    def run_community_detection(self,
                                seeds,
                                n_accounts=50,
                                n_seeds=5,
                                result_interval=10,
                                runtime_file=None):
        """
        runs community detection from seeds
        :param seeds: - the seeds to start the community detection with. The process appends to the seeds
        :param community sizes - the total size of the ground truthed communities
        :param n_accounts - the maximum number of accounts to grow the community to
        :param n_seeds: The number of seeds to use
        :param min_seed_followers - seeds need to have more than this number of followers
        :param max_followers - don't allow any accounts larger than this value to form communities
        :param generate_seeds - if true randomly select seeds from a given tag class. Otherwise read them from file
        :param result_interval - the number of accounts that are added to the seeds between each reading of the recall
        :param runtime_file: write the runtime of these methods to file
        """
        start_time = time()

        # Use the locality sensitive hashing table to conduct an initial nearest neighbours search
        if not isinstance(self.lsh_table, list):
            print 'loading lsh lookup table'
            self.load_lsh_table()
        print 'running lsh query'
        print seeds
        self.lsh_candidates = LSH.run_query(seeds,
                                            self.signatures,
                                            self.lsh_table,
                                            return_query_id=True)
        # reduce the signatures matrix to only work with nearby accounts
        self.active_signatures = self.signatures.ix[
            self.lsh_candidates.active_indices, 1:].values
        n_candidates = len(self.active_signatures)
        if n_candidates < n_accounts:
            print "not all community members are active. Will only consider ", n_candidates, ' of the ', n_accounts, ' accounts'
            n_additions = n_candidates
        else:
            n_additions = n_accounts
            # implement a new lookup

        # find the jaccard distance to all non-seeds averaged over the seeds
        ast0 = time()
        account_similarities = self.calculate_initial_average_similarity(seeds)
        avg_sim_time = time() - ast0
        self.output_best_initial_averages(account_similarities,
                                          seeds,
                                          tags,
                                          n_seeds,
                                          n_accounts,
                                          result_interval,
                                          file_name='initial_avgs.csv')
        prt0 = time()
        R = self.pageRank(account_similarities, seeds, print_full_info=True)
        pr_time = time() - prt0
        self.output_best_initial_averages(R,
                                          seeds,
                                          tags,
                                          n_seeds,
                                          n_accounts,
                                          result_interval,
                                          file_name='pagerank.csv')

        self.used_ids = {}
        srt0 = time()
        for idx in range(n_additions):
            # Adds the next most similar account to each group of seeds and updates the average distance from the community members to all other accounts
            self.increment_communities(account_similarities, seeds)
            # record the recall every
            if (idx + 1) % 10 == 0:
                print idx + 1, 'accounts added'
        sim_rank_time = time() - srt0

        if runtime_file:
            writer = csv.writer(runtime_file)
            community = self.outfolder.rsplit('/', 1)[-1]
            writer.writerow(['page_rank', community, pr_time])
            writer.writerow(['min_rank', community, sim_rank_time])
            writer.writerow(['avg_sim_time', community, avg_sim_time])

        print 'added', n_accounts, 'into each of', len(
            seeds), ' communities in ', time() - start_time, 'seconds'
        return seeds
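
A hedged invocation sketch for the method above; the owning class name, the seed accounts and the runtime CSV are all hypothetical, since only the method body is shown in the example.

# hypothetical call site: CommunityDetector, the seed accounts and the file name
# are illustrative only and not part of the original example
detector = CommunityDetector(outfolder="results/my_community")
seeds = ["account_a", "account_b", "account_c"]
with open("runtimes.csv", "a") as runtime_file:
    grown_seeds = detector.run_community_detection(seeds,
                                                   n_accounts=50,
                                                   n_seeds=5,
                                                   result_interval=10,
                                                   runtime_file=runtime_file)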
Example #15
# -*- coding: utf-8 -*-
'''
'''

import LSH
import readNewsFiles

import Apriori

if __name__ == '__main__':
    data_ls = readNewsFiles.get_news_data()

    array = LSH.Shingling(data_ls)
    signature_ls, rowNums = LSH.minHash(array)
    LSH.computeJaccardSimilarityAfterMinhash(signature_ls)

    s = 0.60  # similarity threshold
    r = 20  # rows per band
    bandNum = int(rowNums / r)
    print(bandNum)

    #buckets = LSH.MyLSHashing(signature_ls,bandNum,s,r)
    # buckets = LSH.MylocalitySensitiveHashing(signature_ls,bandNum,s,r)

    buckets = LSH.localitySensitiveHashing(signature_ls, bandNum, s, r)
    I_set_list = LSH.buckets_sort(buckets)

    for ii in I_set_list:
        print(ii)

    buckets_dict = LSH.hash_buckets(I_set_list)
Example #16
# -*- coding:utf-8 -*-

import cv2
import numpy as np
from cv2 import imread, imshow
import LSH

result = []
#image = cv.LoadImage("target.jpg", 1)

lsh = LSH.LSH(100, 128, table_num=5)
kp = [[] for i in range(155)]
sift = cv2.SIFT(2000)
for i in range(1, 155):
    print i
    image1 = cv2.imread(("/Users/mcdreamy/Documents/test2/%d.jpg" % i), 1)
    x = 0
    image1 = cv2.resize(image1, (0, 0), fx=0.4, fy=0.4)
    tkp, des = sift.detectAndCompute(image1, None)
    print len(tkp)
    kp[i - 1].append(tkp)
    for vec in des:
        #print vec
        lsh.index(vec, i * 100000 + x)
        x = x + 1
while 1:
    file_name = raw_input("please input the file name")
    image = cv2.imread(file_name)

    tkp, des = sift.detectAndCompute(image, None)
    x = 0
Example #17
def run(file_location, seed, k=128, b=16, r=8, partial=False, skip=0, take=-1):
    start_time = timer()

    print("==============================================")
    print("               PREPROCESSING")
    print("==============================================")

    movie_data = None
    user_movies_sparse_matrix = None

    if not partial:
        print("\nRunning sequential...")
        print("\nLoading data and generating matrix...")
        user_movies_sparse_matrix = data.transform_to_sparse_matrix(
            data.load(file_location, skip, take), partial)
    else:
        print("\nRunning partial...")
        print("\nLoading data...")
        movie_data = data.load(file_location, skip, take)
        print("Generating matrix...")
        user_movies_sparse_matrix = data.transform_to_sparse_matrix(
            movie_data, partial)

    matrix_shape = user_movies_sparse_matrix.shape
    print("Shape: " + str(matrix_shape[1]) + " users | " +
          str(matrix_shape[0]) + " movies.")
    nr_movies = matrix_shape[0]
    nr_users = matrix_shape[1]
    util.print_time(start_time)

    print()
    print("==============================================")
    print("                Min Hashing")
    print("==============================================")
    print("\nGenerating signature matrix...")
    print("Length: " + str(k))
    if not partial:
        signature_matrix = mh.generate_signature(user_movies_sparse_matrix, k,
                                                 nr_users, nr_movies, seed)
    else:
        signature_matrix = mh.build_signature_in_parts(
            k, nr_users, nr_movies, user_movies_sparse_matrix, seed)

    util.print_time(start_time)

    print()
    print("==============================================")
    print("                    LSH")
    print("==============================================")

    print("\nNumber of bands: " + str(b))
    print("Number of rows per band: " + str(r))
    print()

    # free memory
    del user_movies_sparse_matrix
    gc.collect()

    if movie_data is None:
        movie_data = data.load(file_location, skip, take)

    user_movies_matrix = data.transform_to_dense_matrix(movie_data)
    nr_found, nr_buckets = LSH.apply(signature_matrix, user_movies_matrix, b,
                                     r, start_time)

    print_report(nr_found, nr_buckets, start_time, matrix_shape, k, b, r, seed)
Example #18
from LSH import *
import json    # needed for json.load below
import timeit  # needed for the timing calls below

lsh = LSH(datafile="data_for_lsh.csv",
          dim=10,
          r=50,
          b=100)

start = timeit.default_timer()

new_data = [-0.067, -0.015, 0.907, 0.034, -0.05, -0.017, -0.144, -0.204, -0.042, 0.013]

with open('output.json', 'r') as file:
    similarity_group_mean_values = json.load(file)

similar_set = lsh.search_for_similar_set(similarity_group_mean_values, new_data)
end = timeit.default_timer()

print('==============')
print(similar_set)
print('Running time is '+ str(end-start) + 's')

Example #19
from LSH import *

lsh = LSH(datafile="xyz_1000_new.csv",
          dim=50,
          r=20,
          b=10)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()
similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()


Example #20
def OASIS_example(dataset, update_metric, update_lsh):
    print("Dataset:", dataset)
    print("Metric update method:", update_metric)
    print("LSH update method:", update_lsh)
    total_lsh_update_time = 0
    # update_metric = 'online'
    # update_lsh = 'rebuild'
    X_example, y_example = X[:100], y[:100]
    if dataset == "hepmass":
        tn_neighbors = 10
        lamda = 0.0005
    elif dataset == "made":
        tn_neighbors = 20
        lamda = 0.0001

    lsh_maintainer_flag = False

    recalls_10 = []
    recalls_50 = []
    recalls_100 = []
    result_acc = 0
    accuracy = 0
    metric_L = np.eye(X_test.shape[1])

    #build the LSH index
    t0 = timeit.default_timer()
    index_flag = True
    metric_build = np.copy(metric_L)
    # parameters for synthetic dataset
    if dataset == "made":
        width = 12
    # parameters for hepmass
    elif dataset == "hepmass":
        width = 7
    lsh = LSH.LSH(dataset, width=width)
    lsh.build(X, metric_build)
    total_lsh_build_time = timeit.default_timer() - t0
    print("LSH Index Build...")
    print("Total LSH index building time is", total_lsh_build_time)

    for i in range(X_test.shape[0]):

        new_element = np.dot(X_test[i], metric_L.T)

        #extract candidates
        candidate = lsh.query(X_test[i])
        #extract ground truth distance
        training_set_ = np.dot(X, metric_L.T)

        #extract the ground truth n_neighbors
        e2distances_ = helpers.euclidean_distances(new_element, training_set_)

        #calculate the recall varying number of n
        n_neighbors = 10
        knn_points_index_10 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_10 += [len(np.intersect1d(candidate, knn_points_index_10))]

        n_neighbors = 50
        knn_points_index_50 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_50 += [len(np.intersect1d(candidate, knn_points_index_50))]

        n_neighbors = 100
        knn_points_index_100 = np.argsort(e2distances_)[0, :n_neighbors]
        recalls_100 += [len(np.intersect1d(candidate, knn_points_index_100))]

        #perform KNN classification on the example set
        class_label = y_test[i]
        example_set = np.dot(X_example, metric_L.T)
        knn_pred_idx = np.argsort(
            helpers.euclidean_distances(new_element,
                                        example_set))[0, :n_neighbors]
        knn_pred = mode(y_example[knn_pred_idx])[0][0]
        cor_pred = (knn_pred == class_label)
        accuracy += cor_pred

        # insert the example if KNN classification is incorrect
        if not cor_pred:
            X_example = np.append(X_example, X_test[i][np.newaxis, ], axis=0)
            y_example = np.append(y_example, y_test[i])

        if update_metric == 'online':
            #update the metric on new object
            metric_L = Online_Metric_Learning.online_metric_learning \
            (X_example, X_test[i][np.newaxis,], y_example,tn_neighbors, metric_L, class_label, lamda)

        t0 = timeit.default_timer()
        if update_lsh == 'rebuild':
            lsh.rebuild(X, metric_L)

        elif update_lsh == 'online':
            #if the first time perform online update, then initialize the LSH maintainer
            if not lsh_maintainer_flag:
                lsh_maintainer = LSH_update.lsh_maintainer(lsh, X)
                lsh_maintainer_flag = True

            lsh_maintainer.update(metric_L)

        total_lsh_update_time += timeit.default_timer() - t0

    print("Tototal lsh update time is:", total_lsh_update_time)
    print("The recall when n = 10:", sum(recalls_10) / 10 / len(y_test))
    print("The recall when n = 50:", sum(recalls_50) / 50 / len(y_test))
    print("The recall when n = 100:", sum(recalls_100) / 100 / len(y_test))
Example #21
#!/usr/bin/env python
# -*- coding: utf-8 -*-  
# by zhangzhi @2013-10-11 23:45:57 
# Copyright 2013 NONE rights reserved.
import LSH
import SigGen

if __name__ == '__main__':
    sigMFileName = 'SIG_M.pickle'
    # load file
    g_code2Shg, g_code2Doc, g_shg2Code, g_doc2Code, sigM = SigGen.loadSigM(sigMFileName)
    print sigM
    ret = LSH.sigMTopN(sigM, 50, 2, g_doc2Code[1], 100)
    print ret
Example #22
def import_biz_user(df):
    uniqueuserid = df.user_id.unique()
    uniquebizid = df.business_id.unique()
    header = ["biz_id"]
    header.extend([i for i in uniqueuserid])
    for t, bizid in enumerate(uniquebizid):
        df2 = get_restaurant_reviews(bizid, df)
        bizname = df2.biz_name.values[0]
        userids = df2.user_id.values
        stars = df2.stars.values
        average = df2.user_avg.values
        row = [0 for i in range(len(uniqueuserid))]
        for u, userid in enumerate(userids):
            index = header.index(userid)
            row[index - 1] = stars[u] - average[u]
        biz_user.insert({'_id':bizid, 'biz_name':bizname, 'vector':row})
        if t % 100 == 0:
            print t

if __name__ == '__main__':
    fulldf=pd.read_csv("bigdf.csv")
    smallidf=fulldf[(fulldf.user_review_count > 60) & (fulldf.business_review_count > 150)]
    smalldf=recompute_frame(smallidf)
    
    biz_user = db.biz_user
    import_biz_user(smalldf)

    lsh = LSH(240)
    lsh.populate(biz_user.find())
Example #23
from LSH import *

lsh = LSH(datafile="img_for_lsh_new.csv", dim=100, r=50, b=100, num_clusters=4)
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()
# similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence(
    similarity_groups)

merged_similarity_groups = lsh.merge_similarity_groups_with_l2norm_sample_based(
    coalesced_similarity_groups)
Example #24
		choice = int(input())
		if choice == 1:
			num = int(input("How many words do you wish to insert?\n"))
			rndmW.generate(num, 1)
			numOfQueries = int(input("how many queries do you wish to perform?\n"))
			bloomTime = bloomfilter.main(numOfQueries)
			print("Time for %d membership queries in a Bloom Filter is: %.7f" %(numOfQueries, bloomTime))
			rndmW.generateCSV( num)
			bplusTime = bplustree.main(numOfQueries)
			print("Time for %d membership queries in b+ Trees is: %.7f" % (numOfQueries, bplusTime))

		if choice == 2:
			#########################################################
			"""now we start with the b+ tree"""
			numOfArticles = int(input("How many articles do you wish to compare ? \n"))
			SimilarityMatrix1 = LSH.main(numOfArticles)
			cosine_similarity_matrix = CosineSimilarity.main(numOfArticles)
			print("The Cosine Similarity Method suggests: ")
			"""
			for x in range(len(cosine_similarity_matrix)):
				for y in range(len(cosine_similarity_matrix[x])):

					print("%.5f" % (cosine_similarity_matrix[x][y]), end=" ")
				pass
				print("")
			"""
			for x in range(0, len(cosine_similarity_matrix)):
				for y in range(1, len(cosine_similarity_matrix[x])):
					if(y<=x):
						print("     ", end=" ")
					else: