Example #1
def unsuper_simple_rasar_multiclass(X_train, X_test, y_train, y_test):
    
    df_simple_train = pd.DataFrame()
    df_simple_test = pd.DataFrame()
    
    for i in range(1,6):
        # in order to train K-NN
        X_train_i = X_train[y_train == i].copy()
        
        ##########################
        ######## DF RASAR -- TRAIN
        ##########################
        
        knn_train = KNeighborsClassifier(n_jobs = -2, leaf_size = 30, n_neighbors = 2)
        knn_train.fit(X_train_i, y_train[y_train == i])
        
        neigh_i = knn_train.kneighbors(X_train, return_distance = True)
        idx_neigh_i, dist_neigh_i = right_neighbor(neigh_i, X_train, X_train_i)
        
        df_simple_train.loc[:, 'LC50_MOR_' + str(i)] = dist_neigh_i        
        
        ##########################
        ######## DF RASAR -- TEST
        ##########################
        
        knn_test = KNeighborsClassifier(n_jobs = -2, leaf_size = 30, n_neighbors = 1)
        knn_test.fit(X_train_i, y_train[y_train == i])
        
        neigh_i = knn_test.kneighbors(X_test, return_distance = True)
        
        df_simple_test.loc[:, 'LC50_MOR_' + str(i)] = neigh_i[0].ravel()
        
    return df_simple_train, df_simple_test
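A note on the helper: right_neighbor is not defined in this snippet (it is also called in Example #6 below). A minimal sketch of one plausible implementation, assuming X_query and X_fit are DataFrames and that the classifier was fit with n_neighbors=2 so the self-match can be skipped:

import numpy as np

def right_neighbor(neigh, X_query, X_fit):
    # neigh is the (distances, indices) pair from kneighbors(..., return_distance=True)
    distances, indices = neigh
    fit_labels = X_fit.index.to_numpy()[indices]   # row labels of both neighbors
    query_labels = X_query.index.to_numpy()
    # when the first hit is the query row itself, fall back to the second hit
    is_self = fit_labels[:, 0] == query_labels
    idx = np.where(is_self, indices[:, 1], indices[:, 0])
    dist = np.where(is_self, distances[:, 1], distances[:, 0])
    return idx, dist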
Example #2
class Search_Engine(object):
    def __init__(self, n_neighbors=1, thre=0.3, rej='top', weights='uniform',
                 algorithm='ball_tree', metric='euclidean'):

        self.engin = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                          algorithm=algorithm, metric=metric)
        self.thre = thre
        if rej not in {'top','mean'}:
            raise ValueError("rej must be 'top' or 'mean'")
        self.rej = rej
        self.n_neighbors = n_neighbors

    def fit(self,feas,labels):
        if feas.ndim == 2:
            fea_norm = np.linalg.norm(feas, axis=1)[:, np.newaxis]
        else:
            raise Exception('Wrong dimension')
        feas = feas/fea_norm
        self.engin.fit(feas,labels)
        
    def predict(self, query):
        if query.ndim == 1:
            # normalize, then reshape to the (1, n_features) shape kneighbors expects
            query = (query / np.linalg.norm(query)).reshape(1, -1)
        else:
            raise Exception('Wrong dimension')
        if self.rej == 'top':
            dis = self.engin.kneighbors(query, 1)[0][0][0]
        elif self.rej == 'mean':
            # integer division: n_neighbors must be an int
            dis = np.mean(self.engin.kneighbors(query, self.n_neighbors // 2 + 1)[0][0])
        if dis > self.thre:
            label = -1
        else:
            label = self.engin.predict(query)[0]
        return label,dis
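A quick usage sketch (the vectors and labels below are made up): fit() L2-normalizes the rows, so distances lie in [0, 2], and a query farther than thre from its nearest neighbor is rejected as -1.

import numpy as np

feas = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
labels = np.array(['a', 'a', 'b'])

se = Search_Engine(n_neighbors=1, thre=0.3, rej='top')
se.fit(feas, labels)
label, dis = se.predict(np.array([0.95, 0.05]))
print(label, dis)   # 'a' with a small distance; -1 whenever dis > thre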
Example #3
 def synthesize_hybrid(self, data, label):
     """
     处理混合聚类簇
     :param data:
     :param label:
     :return:
     """
     major_data = []
     minor_data = []
     major_label = []
     minor_label = []
     synthetics = []
     # split the cluster into its majority and minority samples
     for data_, label_ in zip(data, label):
         if label_ == 1.:
             minor_data.append(data_)
             minor_label.append(label_)
         else:
             major_data.append(data_)
             major_label.append(label_)
     border_minor = []
     border_major = []
     knn_major = KNeighborsClassifier(n_neighbors=1)
     knn_minor = KNeighborsClassifier(n_neighbors=1)
     knn_minor.fit(X=minor_data, y=minor_label)
     knn_major.fit(X=major_data, y=major_label)
     # find the borderline majority and minority samples
     for major in major_data:
         index = knn_minor.kneighbors(X=[major], n_neighbors=1, return_distance=False)[0][0]
         border_minor.append(minor_data[index])
     for minor in minor_data:
         index = knn_major.kneighbors(X=[minor], n_neighbors=1, return_distance=False)[0][0]
         border_major.append(major_data[index])
     border_minor = self.set(border_minor)
     border_major = self.set(border_major)
     n_neighbors_major = self.n
     n_neighbors_minor = self.n
     if n_neighbors_minor > len(minor_data):
         n_neighbors_minor = len(minor_data)
     if n_neighbors_major > len(major_data):
         n_neighbors_major = len(major_data)
     # synthesize minority-class samples
     for minor in minor_data:
         if minor in border_minor:
             index = knn_major.kneighbors(X=[minor], n_neighbors=n_neighbors_major, return_distance=False)[0]
             length = len(index)
             for i in range(self.n):
                 index_ = index[i % length]
                 point_y = major_data[index_]
                 synthetics.append(self.synthesize(point_x=minor, point_y=point_y))
         else:
             index = knn_minor.kneighbors(X=[minor], n_neighbors=n_neighbors_minor, return_distance=False)[0]
             index = index[1:]
             length = len(index)
             for i in range(self.n):
                 index_ = index[i % length]
                 point_y = minor_data[index_]
                 synthetics.append(self.synthesize(point_x=minor, point_y=point_y))
     return synthetics
Example #4
def print_acc(emb_table):
    classifier = KNeighborsClassifier(n_neighbors=20)  
    classifier.fit(emb_table, range(len(emb_table))) 
    
    classifier_cos = KNeighborsClassifier(n_neighbors=20, metric='cosine')  
    classifier_cos.fit(emb_table, range(len(emb_table))) 

    artist_range = range(len(artist_related))
    
    id_sim = 0    
    id_sim_cos = 0
    for _ in range(1000):      
        
        a = random.choice(artist_range)
        
        emb_related = classifier.kneighbors(emb_table[a].reshape(1, -1), len(artist_related[a]))
        emb_related_cos = classifier_cos.kneighbors(emb_table[a].reshape(1, -1), len(artist_related[a]))

        spot_id = set(artist_related[a])
        emb_id = set(emb_related[1][0]) 
        emb_id_cos = set(emb_related_cos[1][0])
        
        id_sim += float(len(emb_id & spot_id)) / len(emb_id | spot_id) * 100
        id_sim_cos += float(len(emb_id_cos & spot_id)) / len(emb_id_cos | spot_id) * 100

    mse_loss = 0
    mse_mean = 0
    rand_mse_loss = 0
    rand_mse_mean = 0
    for a, r in artist_related.items():
        related_indices = list(r)
        random_indices = random.sample(artist_range, len(related_indices))
        repeated_artist_indices = [a] * len(related_indices)

        mse_vals = (np.square(emb_table[repeated_artist_indices] - emb_table[related_indices])).mean(axis=1)
        mse_loss += mse_vals.sum(axis=0)
        mse_mean += mse_vals.mean(axis=0)
        
        rand_mse_vals = (np.square(emb_table[repeated_artist_indices] - emb_table[random_indices])).mean(axis=1)
        rand_mse_loss += rand_mse_vals.sum(axis=0)
        rand_mse_mean += rand_mse_vals.mean(axis=0)
        
    print(f'ID Similarity MSE    : {id_sim / 1000:.20f}')
    print(f'ID Similarity COS    : {id_sim_cos / 1000:.20f}')
    print(f'Related Total MSE    : {mse_loss:.20f}')
    print(f'Random  Total MSE    : {rand_mse_loss:.20f}')
    print(f'Total MSE Ratio      : {rand_mse_loss / mse_loss:.20f}')
    print(f'Related Average MSE  : {mse_mean / len(artist_related):.20f}')
    print(f'Random  Average MSE  : {rand_mse_mean / len(artist_related):.20f}')
    print(f'Average MSE Ratio    : {(rand_mse_mean / len(artist_related)) / (mse_mean / len(artist_related)):.20f}')

    with open(model_name + '_temp.emb.pickle', 'wb') as f:
        save = {
            'embedding_lookup': emb_table
        }
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        del save
Example #5
    def generate_nuclei_centers(self, n_pix_per_nuc=9, dtype='int32'):
        '''
            Generates the nuclei randomly within a cell
            --------------------
            parameters:
                n_pix_per_nuc: number of pixels in a nucleus (default: 9)
                dtype: data type for the nuclei tensor (default: int32)
            --------------------
            adds:
                self.nuclei: 3d tensor of nuclei, -1 is no nucleus, number is according
                    to the cell ID
        '''
        if self.cell_centers is None:
            self.generate_cell_centers()

        nuclei = np.zeros(self.true_map.shape, dtype=dtype)
        nuclei -= 1

        check_nuclei_surroundings = KNeighborsClassifier(n_neighbors=27)

        non_zero_cell_locs = np.where(self.true_map != -1)
        cell_ids = self.true_map.ravel()
        xs = non_zero_cell_locs[0]
        ys = non_zero_cell_locs[1]
        zs = non_zero_cell_locs[2]
        cell_ids = cell_ids[cell_ids != -1]
        check_nuclei_surroundings.fit(np.vstack((xs, ys, zs)).T, cell_ids)

        for i in np.unique(self.cell_ids):
            if i != -1:
                cell_coords = np.where(self.true_map == i)
                rand_index = randint(0, len(cell_coords[0]) - 1)
                xs = cell_coords[0]
                ys = cell_coords[1]
                zs = cell_coords[2]

                locs_as_mat = np.vstack((xs, ys, zs)).T
                clf_seed = KNeighborsClassifier(
                    n_neighbors=min(n_pix_per_nuc, locs_as_mat.shape[0]))
                nuclei_seed = locs_as_mat[rand_index, :]

                clf_seed.fit(locs_as_mat, np.arange(locs_as_mat.shape[0]))
                nuc_pix_locs = clf_seed.kneighbors([nuclei_seed])[1][0]
                nuc_pix = locs_as_mat[nuc_pix_locs, :]

                non_same_celltype = cell_ids[
                    check_nuclei_surroundings.kneighbors(nuc_pix)[1]]
                #remove nuc pixels that are on the border
                nuc_pix = nuc_pix[np.sum(non_same_celltype == i, axis=1) ==
                                  27, :]
                if nuc_pix.shape[0] > 0:
                    nuclei[nuc_pix[:, 0], nuc_pix[:, 1], nuc_pix[:, 2]] = i
                else:
                    nuclei[nuclei_seed[0], nuclei_seed[1], nuclei_seed[2]] = i
                #print(np.unique(nuclei))

        self.nuclei = nuclei
Example #6
def unsuper_simple_rasar(train_distance_matrix, test_distance_matrix, X_train,
                         X_test, y_train, y_test):
    ######## starting DATAFRAME ##########

    X_train0 = X_train[y_train == 0].copy()
    X_train1 = X_train[y_train == 1].copy()

    # in order to train 1-NN
    dist_matr_train_0 = train_distance_matrix.iloc[y_train == 0, y_train == 0]
    dist_matr_train_1 = train_distance_matrix.iloc[y_train == 1, y_train == 1]

    # To find neighbors for train experiments --> df_rasar_train
    dist_matr_train_train_0 = train_distance_matrix.iloc[:, y_train == 0]
    dist_matr_train_train_1 = train_distance_matrix.iloc[:, y_train == 1]

    # To find neighbors for test experiments --> df_rasar_test
    dist_matr_test_train_0 = test_distance_matrix.iloc[:, y_train == 0]
    dist_matr_test_train_1 = test_distance_matrix.iloc[:, y_train == 1]

    ####### DF train RASAR ###############

    # finding the nearest 0s experiments for training experiments that is not itself
    knn0 = KNeighborsClassifier(metric='precomputed', n_jobs=-2, n_neighbors=2)
    knn0.fit(dist_matr_train_0, y_train[y_train == 0])
    neigh0 = knn0.kneighbors(dist_matr_train_train_0, return_distance=True)
    _, dist0 = right_neighbor(neigh0, X_train, X_train0)

    # finding the nearest 1s experiments for training experiments that is not itself
    knn1 = KNeighborsClassifier(metric='precomputed', n_jobs=-2, n_neighbors=2)
    knn1.fit(dist_matr_train_1, y_train[y_train == 1])
    neigh1 = knn1.kneighbors(dist_matr_train_train_1, return_distance=True)
    _, dist1 = right_neighbor(neigh1, X_train, X_train1)

    df_rasar_train = pd.DataFrame({'dist_neigh0': dist0, 'dist_neigh1': dist1})

    ####### DF test RASAR ################

    # finding the nearest 0s experiments to test data
    knn0 = KNeighborsClassifier(metric='precomputed', n_neighbors=1, n_jobs=-2)
    knn0.fit(dist_matr_train_0, y_train[y_train == 0])
    neigh0 = knn0.kneighbors(dist_matr_test_train_0, return_distance=True)
    #     idx_neigh_0 = pd.DataFrame(neigh0[1])[0].apply(lambda x: X_train.iloc[y_train==0].iloc[x].name)

    # finding the nearest 1s experiments to test data
    knn1 = KNeighborsClassifier(metric='precomputed', n_neighbors=1, n_jobs=-2)
    knn1.fit(dist_matr_train_1, y_train[y_train == 1])
    neigh1 = knn1.kneighbors(dist_matr_test_train_1, return_distance=True)
    #     idx_neigh_1 = pd.DataFrame(neigh1[1])[0].apply(lambda x: X_train.iloc[y_train==1].iloc[x].name)

    df_rasar_test = pd.DataFrame({
        'dist_neigh0': neigh0[0].ravel(),
        'dist_neigh1': neigh1[0].ravel()
    })

    return df_rasar_train, df_rasar_test
Example #7
def get_summary(positive_sentences, negative_sentences, num_clusters=3):

    #Tokenize the sentences
    print("Tokenizing")

    pos_token = [tokenizer(i) for i in positive_sentences]
    neg_token = [tokenizer(i) for i in negative_sentences]

    #Preparing Vocabulary
    print("Preparing vocabulary")

    stop = set(stopwords.words('english'))

    vocab = set(pos_model.wv.vocab) - stop
    vocab = [i for i in vocab if can_be_adjective(i)]
    s1 = []
    pos_words = []  # keep the word for each row appended to s1
    for sent in pos_token:
        for word in sent:
            if word in vocab:
                s1.append(pos_model[word])
                pos_words.append(word)
    s1 = np.array(s1)

    vocab = set(neg_model.wv.vocab) - stop
    vocab = [i for i in vocab if can_be_adjective(i)]
    s2 = []
    neg_words = []
    for sent in neg_token:
        for word in sent:
            if word in vocab:
                s2.append(neg_model[word])
                neg_words.append(word)
    s2 = np.array(s2)

    #Clustering
    print("Clustering")

    pos_kmeans = KMeans(n_clusters=num_clusters).fit(s1)
    pos_centers = pos_kmeans.cluster_centers_
    pos_neigh = KNeighborsClassifier(n_neighbors=1)
    pos_neigh.fit(s1, pos_kmeans.labels_)
    neg_kmeans = KMeans(n_clusters=num_clusters).fit(s2)
    neg_centers = neg_kmeans.cluster_centers_
    neg_neigh = KNeighborsClassifier(n_neighbors=1)
    neg_neigh.fit(s2, neg_kmeans.labels_)

    print("Most significant words")
    print("--Positives--")
    for i in pos_centers:
        print(vocab[pos_neigh.kneighbors(i.reshape(1, -1),
                                         return_distance=False)[0, 0]])
    print("\n\n")
    print("--Negatives--")
    for i in neg_centers:
        print(vocab[neg_neigh.kneighbors(i.reshape(1, -1),
                                         return_distance=False)[0, 0]])
Example #8
def testing(input):
    vect = CountVectorizer()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_ = vect.fit_transform(X)
    x_test = vect.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
    # note: the model is fit on the full corpus, so the accuracy on X_test is optimistic
    knn.fit(X_, y)
    y_pred = knn.predict(x_test)
    test = vect.transform([input])
    neighbor_idx = knn.kneighbors(test, 1)[1][0][0]
    nearest_neighbor = df['QUES'][neighbor_idx]
    ans = df['ANS'][neighbor_idx]
    label = get_key(knn.predict(test)[0])
    accuracy = metrics.accuracy_score(y_test, y_pred)
    return nearest_neighbor, ans, label, accuracy
Example #9
def process_grid_cell(train, test, grid_id, threshold, model, grid_variable):
    """ Creates model and generates predictions for row_ids in a particular grid cell.
    """
    start = time.time()
    # Filter data onto single grid cell
    train_cell = train[train[grid_variable] == grid_id]
    test_cell = test[test[grid_variable] == grid_id]
    test_ids = test_cell.index

    # Remove place ids from train data with frequency below threshold
    place_counts = train_cell.place_id.value_counts()
    mask = place_counts[train_cell.place_id.values] >= threshold
    train_cell = train_cell.loc[mask.values]

    # Encode place id as labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_cell.place_id.values)
    X_train = train_cell.drop(['place_id', grid_variable], axis = 1).values
    X_test = test_cell.drop(['place_id', grid_variable], axis = 1).values
        
    # NN as features
    model_nn = KNeighborsClassifier(n_neighbors = 31, n_jobs = -1, weights = 'distance', metric = 'manhattan')
    model_nn.fit(X_train, y_train)
    train_neighbors = pd.DataFrame(model_nn.kneighbors(X_train, n_neighbors = 31, return_distance = True)[0])
    test_neighbors = pd.DataFrame(model_nn.kneighbors(X_test, n_neighbors = 31, return_distance = True)[0])
    train_nn_cols = train_neighbors.columns
    test_nn_cols = test_neighbors.columns    
    
    # assign via .values to sidestep index misalignment between the cell frame
    # and the freshly built neighbor frames
    train_cell[train_nn_cols] = train_neighbors.values

    test_cell[test_nn_cols] = test_neighbors.values
   
    X_train = train_cell.drop(['place_id', grid_variable], axis = 1).values
    X_test = test_cell.drop(['place_id', grid_variable], axis = 1).values
        
    # Build training classifier and predict
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)

    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]).astype(str)   
    end = time.time()
    time_elapsed = (end - start)
    
    # Generate CV score
    map3 = MAP3(test_cell['place_id'], pred_labels)
    
    # Return data
    return pred_labels, test_ids, time_elapsed, map3
Example #10
def para_func2(arg):
    num, shape, shape2, metric, cnum = arg
    X = _sharedX
    X2 = _sharedX2
    centers = choice(X.shape[0], cnum, False)
    mod = KClass(1, metric=metric)
    mod.fit(X[centers, :], range(centers.size))
    dista1, ma1 = mod.kneighbors(X, return_distance=True)
    distb1, mb1 = mod.kneighbors(X2, return_distance=True)

    return mb1, ma1
Example #11
def neighbors(data, pred_c, new_user):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(data, pred_c)
    dist, idx = knn.kneighbors(new_user, n_neighbors=5)
    print("Users closest to the new user:", idx)
    return idx
Example #12
class Identifier:
    def __init__(self):
        self.model = KNeighborsClassifier(n_neighbors=1, n_jobs=4)
        self.threshold = 0.7
        self.data = {}
        self.data_classes = []

    def identify(self, face):
        if face.embedding is not None:
            nearest_neighbour = self.model.kneighbors([face.embedding])
            distance = nearest_neighbour[0][0][0]
            print('nearest_neighbour: {}'.format(nearest_neighbour))
            if distance >= self.threshold:
                return 'Unknown'

            return self.data_classes[nearest_neighbour[1][0][0]]

    def add_identities(self, faces):
        for face in faces:
            if face.name not in self.data:
                self.data[face.name] = []
            self.data[face.name].append(face.embedding)
        data = []
        classes = []
        for name, deep_features in self.data.items():
            for deep_feature in deep_features:
                data.append(deep_feature)
                classes.append(name)
        self.data_classes = classes
        self.model.fit(data, classes)
        return face
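A usage sketch with stand-in face objects (anything exposing .name and .embedding works; the 128-dim embeddings here are random placeholders):

import numpy as np
from types import SimpleNamespace

known = [SimpleNamespace(name='alice', embedding=np.random.rand(128)),
         SimpleNamespace(name='bob', embedding=np.random.rand(128))]

identifier = Identifier()
identifier.add_identities(known)

probe = SimpleNamespace(name=None, embedding=known[0].embedding)
print(identifier.identify(probe))   # 'alice': distance 0 is under the 0.7 threshold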
Example #13
def get_recommendations(X, y, n_neighs=100):

  neigh = KNeighborsClassifier(n_neighbors=n_neighs, metric='cosine')
  neigh.fit(X, y)

  distances, neigh_ids = neigh.kneighbors(X, n_neighs)

  titles_recommendations = {}

  for i, title in enumerate(y):
    titles_scores = [distances[i]]
    titles_neighs = [neigh_ids[i]]
    normalized_scores = 1 - normalize(titles_scores)

    grouped_title_scores = {}
    for idx, titles_id in enumerate(titles_neighs):
      neighs_title_name = y[tuple(titles_neighs)]
      neighs_title_distance = normalized_scores[idx]

      for neigh_title, neigh_distance in zip(neighs_title_name, neighs_title_distance):
        grouped_title_scores.setdefault(neigh_title, []).append(neigh_distance)

    titles_recommendations[title] = sorted(
      [(tit, dist[0]) for tit, dist in grouped_title_scores.items()],
      key=lambda x: x[1],
      reverse=True
    )
  return titles_recommendations
Example #14
 def get_types(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
     knn = KNeighborsClassifier(n_neighbors=5)
     knn.fit(X, y)
     types = []
     neighbors = knn.kneighbors(
         X, n_neighbors=6,
         return_distance=False)
     actual_neighbors = neighbors[:, 1:]  # drop column 0: each point is its own nearest neighbor
     for i in range(len(actual_neighbors)):
         instance_neighbors_of_the_same_class = 0
         for neighbor in actual_neighbors[i]:
             if y[i] == y[neighbor]:
                 instance_neighbors_of_the_same_class += 1
         if instance_neighbors_of_the_same_class >= 4:
             types.append(Types.SAFE)
             continue
         elif instance_neighbors_of_the_same_class >= 2:
             types.append(Types.BORDERLINE)
             continue
         elif instance_neighbors_of_the_same_class >= 1:
             types.append(Types.RARE)
             continue
         else:
             types.append(Types.OUTLIER)
     return np.array(types)
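The Types enum is not shown in this snippet; a minimal stand-in consistent with the four categories used above:

from enum import Enum

class Types(Enum):
    SAFE = 0
    BORDERLINE = 1
    RARE = 2
    OUTLIER = 3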
Example #15
class KNN():
    def __init__(self, knn, stage):
        self.knn = knn
        _, self.kernel_pca = pre_process(stage)
        sample_pos, sample_neg, X_c1, self.Y_c1 = read_label_data(stage)

        X_reduced = self.kernel_pca.transform(X_c1)

        self.knn1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
        self.y = []
        self.knn1.fit(X_reduced, self.Y_c1)
        self.knn2 = KNeighborsClassifier(n_neighbors=knn, weights='distance')
        self.knn2.fit(X_reduced, self.Y_c1)


    def predict(self, sample):
        prop = []
        X_reduced_valid = self.kernel_pca.transform(sample)
        for j in range(X_reduced_valid.shape[0]):
            kNeighbour = self.knn2.kneighbors([X_reduced_valid[j]], n_neighbors=self.knn)[1]
            if np.min(self.Y_c1[kNeighbour]) <= 0:
                self.y.append(0)
            else:
                self.y.append(1)
            prop.append(np.mean(self.Y_c1[kNeighbour]))
        y = np.array(self.y)
        self.y=[]
        return y, np.array(prop)
Example #16
 def hybrid_preprocessor(self, data, label):
     major = []
     major_label = []
     minor = []
     minor_label = []
     border_major = []
     border_major_label = []
     for data_, label_ in zip(data, label):
         if label_ == 1.0:
             minor.append(data_)
             minor_label.append(label_)
         else:
             major.append(data_)
             major_label.append(label_)
     knn = KNeighborsClassifier(n_neighbors=1)
     knn.fit(X=major, y=major_label)
     for m in minor:
         neighbor = knn.kneighbors(X=[m],
                                   n_neighbors=1,
                                   return_distance=False)[0][0]
         border_major.append(major[neighbor])
         border_major_label.append(0.0)
     return_data = []
     return_label = []
     for d in major:
         if d not in border_major:
             return_data.append(d)
             return_label.append(0.0)
     return return_data, return_label, border_major, border_major_label
Example #17
File: KNORAU.py Project: w4k2/weles
class KNORAU(BaseEstimator, ClassifierMixin):
    """
    Implementation of the KNORA-Union dynamic ensemble selection (DES) method.
    """
    def __init__(self, ensemble=[], k=7, metric="euclidean"):
        self.ensemble = ensemble
        self.k = k
        self.metric = metric

    def fit(self, X, y):
        self.X_dsel = X
        self.y_dsel = y

        self.knn = KNeighborsClassifier(n_neighbors=self.k, metric=self.metric)

        self.knn.fit(self.X_dsel, self.y_dsel)
        return self

    def estimate_competence(self, X):
        self.competences = np.zeros((X.shape[0], len(self.ensemble)))
        _, self.neighbors = self.knn.kneighbors(X=X, n_neighbors=self.k)

        local_X = np.reshape(self.X_dsel[self.neighbors], (-1, X.shape[-1]))
        local_y = np.reshape(self.y_dsel[self.neighbors], (-1))

        self.competences = np.sum(
            np.array([
                np.reshape(
                    clf.predict(local_X) == local_y, (X.shape[0], self.k))
                for clf in self.ensemble
            ]),
            axis=2,
        ).T
        # print(self.competences)

    def ensemble_matrix(self, X):
        """EM."""
        return np.array(
            [member_clf.predict(X) for member_clf in self.ensemble]).T

    def predict(self, X):
        if self.X_dsel.shape[0] >= self.k:
            self.estimate_competence(X)
            em = self.ensemble_matrix(X)
            predict = []

            for i, row in enumerate(em):
                decision = np.bincount(row, weights=self.competences[i])
                predict.append(np.argmax(decision))
        else:
            em = self.ensemble_matrix(X)
            predict = []

            for i, row in enumerate(em):
                decision = np.bincount(row)
                predict.append(np.argmax(decision))

        return np.array(predict)

    def score(self, X, y):
        return balanced_accuracy_score(y, self.predict(X))
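A usage sketch on made-up data, with a small pool of bootstrap-trained trees standing in for the ensemble:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.rand(300, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
X_train, X_dsel, y_train, y_dsel = train_test_split(X, y, random_state=0)

pool = []
for seed in range(5):
    idx = rng.randint(0, len(X_train), len(X_train))   # bootstrap resample
    pool.append(DecisionTreeClassifier(max_depth=3, random_state=seed)
                .fit(X_train[idx], y_train[idx]))

knorau = KNORAU(ensemble=pool, k=7)
knorau.fit(X_dsel, y_dsel)
print(knorau.predict(X_dsel[:5]))   # majority vote weighted by local competence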
Example #18
def KnnPrediction(df_Movies,df_movie_id):

    movie_id = df_movie_id.iloc[0]
    cluster = df_Movies[df_Movies["tconst"] == movie_id]["cluster"].iloc[0]

    df_inter=df_Movies.loc[df_Movies['cluster']==cluster]

    # fewer than 6 movies share the cluster: fall back to the whole catalogue
    if df_inter.shape[0] < 6:
        df_Cluster = df_Movies
    else:
        columns=['isAdult','startYear','runtimeMinutes','averageRating','numVotes']

        X=df_inter[columns]
        y=df_inter['tconst']

        model_KNN = KNeighborsClassifier(n_neighbors=5)
        model_KNN.fit(X,y)

        MovieTemp = model_KNN.kneighbors(df_inter.loc[df_inter['tconst']==movie_id, columns],n_neighbors=6)

        clusterList = []
        for i in range(1,6):
            clusterList.append(df_inter.iloc[MovieTemp[1][0][i]]['tconst'])

        df_Cluster = df_Movies[df_Movies["tconst"].isin(clusterList)]
    return df_Cluster
Example #19
def classify(data):
    X, y = generate_X_and_Y()
    X_data = strip_song_and_artist(X)
    X_scaled = scale(X_data)

    knn = KNeighborsClassifier(n_neighbors=20)
    knn.fit(X_scaled, y)

    # scale() standardizes each array independently; a single StandardScaler
    # fit on the training data would treat train and query data consistently
    data_scaled = scale(data)
    print("Predicted Mood:", knn.predict(data_scaled))
    print("accuracy:", knn.score(X_scaled, y))
    distances, indices = knn.kneighbors(data_scaled, n_neighbors=20)

    # Moods + songs & artists of the K nearest neighbors
    moods = [y[index] for index in indices[0]]
    songs_and_artists = [X[index][0:2] for index in indices[0]]
    for i in range(len(songs_and_artists)):
        print(songs_and_artists[i])
        print(moods[i])

    return moods
Example #20
def ex713():
    L=40
    # Cross-validation not necessary. Instead, compute matrix of nearest neighbor
    # distances between each pair of data points ..
    knclassifier = KNeighborsClassifier(n_neighbors=L+1).fit(X, ravel(y))
    neighbors = knclassifier.kneighbors(X)
    # .. and extract matrix where each row contains class labels of subsequent neighbours
    # (sorted by distance)
    ndist, nid = neighbors[0], neighbors[1]
    print(len(ndist))
    print(len(nid))
    print("=" * 20)
    nclass = y[nid].flatten().reshape(N,L+1)

    # Use the above matrix to compute the class labels of majority of neighbors
    # (for each number of neighbors l), and estimate the test errors.
    errors = np.zeros(L)
    nclass_count = np.zeros((N,C))
    for l in range(1,L+1):
        for c in range(C):
            nclass_count[:,c] = sum(nclass[:,1:l+1]==c,1).A.ravel()
        y_est = np.argmax(nclass_count, 1)
        errors[l-1] = (y_est != y.A.ravel()).sum()


    # Plot the classification error rate
    figure(1)
    plot(100*errors/N)
    xlabel('Number of neighbors')
    ylabel('Classification error rate (%)')

    figure(2)
    imshow(nclass, cmap='binary', interpolation='None')
    xlabel("k'th neighbor")
    ylabel('data point')
    title("Neighbors class matrix")

    show()
Example #21
File: TP2.py Project: jffp113/AA-tp2
def tuning_eps(data):
    zeros = [0] * 563
    neigh = KNeighborsClassifier(n_neighbors=5).fit(X=data, y=zeros)
    distances, indices = neigh.kneighbors(data)
    fig, ax = plt.subplots(figsize=(8, 6))
    sort = np.sort(distances, axis=0)
    ax.plot(np.linspace(0, N_IMAGES, num=N_IMAGES), sort[:, 4])
Example #22
class KNN_model():
    def __init__(self):
        self.datasets=datasets()
        self.movie_name = self.datasets.get_movie_name()
        self.movie_dict=self.datasets.get_movie_dict()
        self.movie_score = self.datasets.get_movie_score()
        self.OneHot=self.datasets.get_OneHot()

        self.knn = KNeighborsClassifier(n_neighbors=3)
        self.knn.fit(self.OneHot, range(0,len(self.OneHot)))

    def knn_predict(self, movie, neighbors=10, out_number=5):
        self.location_id = self.movie_dict[movie]
        self.movie_id = self.OneHot[self.location_id]
        self.neighbors = self.knn.kneighbors(np.reshape(self.movie_id,[-1,self.OneHot.shape[-1]]), neighbors, False)
        self.get_neighbor_name = np.reshape(self.movie_name[self.neighbors],[-1,neighbors])
        self.get_neighbor_score = np.reshape(self.movie_score[self.neighbors], [-1,neighbors])
        self.id = np.argsort(self.get_neighbor_score)
        self.result = []
        for i in range(0, len(self.get_neighbor_name)):
            rst = self.get_neighbor_name[i][self.id[i]].tolist()
            rst.remove(movie)
            self.result.append(rst[0:out_number])
        return self.result


Example #23
def show_neighbors(item_names, item_features, i=None):
    # item_names = np.array of n_samples names
    # item_features = np.array of n_samples x D encoded features
    # i = item to show neighbors for
    
    if i is None:
        i = np.random.choice(range(len(item_names)))
    
    neigh = KNeighborsClassifier(n_neighbors=20)
    neigh.fit(item_features, item_names)
    # i = 10822
    nei_dists, nei_idx = neigh.kneighbors(item_features[[i], :])
    print(item_names[i])
    print(item_names[nei_idx])
    print(nei_dists)
Example #24
def recommend_by_userid(user_id: int):
    # load user and movie embeddings
    user_embeddings = np.load(USER_EMBED_PATH)
    movie_embeddings = np.load(MOVIE_EMBED_PATH)
    
    # load user, movie and user_movie mappings
    uid_lbl_mapping = pickle.load(open(USER_LABEL_MAPPING, "rb"))
    mid_lbl_mapping = pickle.load(open(MOVIE_LABEL_MAPPING, "rb"))
    lbl_mid_mapping = pickle.load(open(LABEL_MOVIE_MAPPING, "rb"))
    user_movie_mapping = pickle.load(open(USER_MOVIE_MAPPING, "rb"))
    id_title_mapping = pickle.load(open(ID_TITLE_MAPPING, "rb"))
    
    user_label = uid_lbl_mapping.get(user_id)
    user_embedding = user_embeddings[user_label]
    
    user_watched_movies = user_movie_mapping[user_id]
    movies = list(mid_lbl_mapping.keys())
    user_unwatched_movies = list(set(movies) - set(user_watched_movies))
    user_unwatched_movies_labels = [mid_lbl_mapping[mid] for mid in user_unwatched_movies]
    
    clf = KNeighborsClassifier(n_neighbors=11)
    unwatched_movie_embeddings = movie_embeddings[user_unwatched_movies_labels]
    clf.fit(unwatched_movie_embeddings, user_unwatched_movies_labels)
    
    distances, indices = clf.kneighbors(user_embedding.reshape(1, -1), n_neighbors=10)
    distances, indices = zip(*sorted(zip(distances[0], indices[0])))
    distances, indices = list(distances), list(indices)
    
    sorted_movie_ids = [lbl_mid_mapping[m_idx] for m_idx in indices if m_idx != 0]
    
    recommend_movies = [id_title_mapping[mid] for mid in sorted_movie_ids]
    print("Recommended movies:", recommend_movies)

    return recommend_movies
Example #25
class PriorNetwork(nn.Module):
    def __init__(self,
                 size_training_set,
                 code_length,
                 n_hidden=512,
                 k=5,
                 random_seed=4543):
        super(PriorNetwork, self).__init__()
        self.rdn = np.random.RandomState(random_seed)
        self.k = k
        self.size_training_set = size_training_set
        self.code_length = code_length
        self.fc1 = nn.Linear(self.code_length, n_hidden)
        self.fc2_u = nn.Linear(n_hidden, self.code_length)
        self.fc2_s = nn.Linear(n_hidden, self.code_length)

        self.knn = KNeighborsClassifier(n_neighbors=self.k, n_jobs=-1)
        # codes are initialized randomly - Alg 1: initialize C: c(x)~N(0,1)
        codes = self.rdn.standard_normal(
            (self.size_training_set, self.code_length))
        self.fit_knn(codes)

    def fit_knn(self, codes):
        ''' will reset the knn  given an nd array
        '''
        st = time.time()
        self.codes = codes
        assert (len(self.codes) > 1)
        y = np.zeros((len(self.codes)))
        self.knn.fit(self.codes, y)

    def batch_pick_close_neighbor(self, codes):
        '''
        :code latent activation of training example as np
        '''
        neighbor_distances, neighbor_indexes = self.knn.kneighbors(
            codes, n_neighbors=self.k, return_distance=True)
        bsize = neighbor_indexes.shape[0]
        if self.training:
            # randomly choose neighbor index from top k
            chosen_neighbor_index = self.rdn.randint(0,
                                                     neighbor_indexes.shape[1],
                                                     size=bsize)
        else:
            chosen_neighbor_index = np.zeros((bsize), dtype=int)
        return self.codes[neighbor_indexes[np.arange(bsize),
                                           chosen_neighbor_index]]

    def forward(self, codes):
        st = time.time()
        np_codes = codes.cpu().detach().numpy()
        previous_codes = self.batch_pick_close_neighbor(np_codes)
        previous_codes = torch.FloatTensor(previous_codes).to(DEVICE)
        return self.encode(previous_codes)

    def encode(self, prev_code):
        h1 = F.relu(self.fc1(prev_code))
        mu = self.fc2_u(h1)
        logstd = self.fc2_s(h1)
        return mu, logstd
Example #26
def main():
	list_data = training()
	dict_vect, entity_names = entities(list_data)
	vec = DictVectorizer()
	transformer = TfidfTransformer()
	vectors = vec.fit_transform(dict_vect).toarray()
	#tfidf_vectors = transformer.fit_transform(vectors).toarray()
	
	clf = GaussianNB().fit(vectors, entity_names)
	ne = KNeighborsClassifier(n_neighbors=5).fit(vectors, entity_names)
	#clf = MultinomialNB().fit(tfidf_vectors, entity_names)
	#clf = BernoulliNB().fit(tfidf_vectors, entity_names)
	allfiles = glob.glob('test/*.txt')	
	for each in allfiles:
		with open(each, 'r') as f:
			data = f.read()
			dict_vectors, entity_n = test(data)
			vectors_pred = vec.transform(dict_vectors).toarray()
			#tfidf_vectors_pred = transformer.transform(vectors_pred).toarray()
			pred = clf.predict(vectors_pred)
			from sklearn.metrics import accuracy_score
			#print(accuracy_score(entity_names,clf.predict(vectors)))
			predict = ne.kneighbors(vectors_pred, n_neighbors= 5,return_distance=False)
			print("\n \nPredictions for the given file: \t"+each)
			print("\n GussianNB algorithm prediction: \t")
			for x,y in zip(entity_n,pred):
				print("\ninput word in the given file: \t"+x,"\n predicted word for the input: \t"+y)
			print("\n5 Nearest neighbors using k-nearest neighbors algorithm prediction: \n")
			for each,z in zip(predict,entity_n):
				print("The input word given in the file:\t"+z+"\n")
				for each1 in each:
					print(entity_names[each1])
				print("\n")
Example #27
class Recommender:
    def __init__(self, data, num_neighbors=20, metric='cosine'):
        print('Fit')
        self.data = data
        self.kn = KNeighborsClassifier(n_neighbors=num_neighbors,
                                       weights='distance',
                                       algorithm='brute',
                                       metric=metric,
                                       p=2)
        matrix = data.to_sparse_matrix()
        confs_name = ['' for i in range(len(data.conferences_set))]
        for conf in data.conferences_set:
            confs_name[data.conf_to_num[conf]] = conf
        self.kn.fit(matrix, confs_name)

    def recommend(self, user_conferences, bound=0.4, num_neighbors=10):
        print('Predict')
        rec_conf_with_dist = {i:0 for i in self.data.conferences_set}
        rec_conf_number = {i:0 for i in self.data.conferences_set}
        for c in user_conferences:
            number_of_current_user_conf = self.data.conf_to_num[c]
            dst1, ind1 = self.kn.kneighbors(self.data.conf_to_vector(c),
                                            return_distance=True,
                                            n_neighbors=num_neighbors)
            conf_with_dist = zip(normalize(dst1[0])[0], ind1[0])
            for dist, conf in conf_with_dist:
                rec_conf_with_dist[self.data.num_to_conf[conf]] += dist
                rec_conf_number[self.data.num_to_conf[conf]] += 1
        for conf, number in rec_conf_number.items():
            if number > 1:
                rec_conf_with_dist[conf] /= number
        result = sorted(rec_conf_with_dist.items(), key=operator.itemgetter(1))
        result = list(filter(lambda tmp: tmp[1] != 0 and tmp[1] < bound, result))
        return result
Example #28
def df_datafusion_rasar(db_datafusion, db):

    grouped_datafusion = db_datafusion.groupby(
        by=['endpoint', 'effect', 'target'])

    db_datafusion_rasar = pd.DataFrame()

    for group in grouped_datafusion.groups:

        name = group[0] + '_' + group[1] + '_' + str(group[2])

        train_X = grouped_datafusion.get_group(group).drop(
            columns=['endpoint', 'effect', 'target'])
        test_X = db.copy()

        train_y = grouped_datafusion.get_group(group)['target'].values

        knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
        knn.fit(train_X, train_y)

        neigh = knn.kneighbors(test_X, return_distance=True)

        db_datafusion_rasar[name] = neigh[0].ravel()

    return db_datafusion_rasar
Example #29
def test(example, population, k):
    """
    Compare your results against the sklearn KNN classifier.
    
    This function should create a sklearn KNeighborsClassifier and verify
    that get_neighbors method returns the same result as the KNeighborsClassifier.
    
    >>> example = np.array([[1, 1]])
    >>> population = np.array([
            [0, 0, 0], # point at coordinate (0, 0) belongs to class 0 
            [100, 100, 1] # point at coordinate (100, 100) belongs to class 1
        ])
    >>> # Provided get_neighbors implemented correctly
    >>> test(example, population, k=1)
    True
    """
    # YOUR CODE HERE
    #############################
    knn = KNeighborsClassifier()
    knn.fit(np.array([x[:2] for x in population]),
            np.array([x[2] for x in population]))
    neigh = knn.kneighbors(example, k, return_distance=False)
    res = sorted([x for i, x in enumerate(population) if i in neigh],
                 key=lambda x: x[0])
    res2 = sorted(get_neighbors(example, population, k), key=lambda x: x[0])
    for x, y in zip(res, res2):
        if all(x == y):
            pass
        else:
            return False
    return True
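get_neighbors itself is the exercise; one plain-NumPy sketch that matches the row layout above (features first, class label last) and passes this test:

import numpy as np

def get_neighbors(example, population, k):
    # rank population rows by Euclidean distance from the single query point
    dists = np.linalg.norm(population[:, :-1] - example[0], axis=1)
    return population[np.argsort(dists)[:k]]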
Example #30
def frienemy_pruning(X_query, X_dsel, y_dsel, ensemble, k):
    """Implements the Online Pruning method (frienemy) which prunes base
    classifiers that do not cross the region of competence of a given instance.
    A classifier crosses the region of competence if it correctly
    classifies at least one sample for each different class in the region.

    Parameters
    ----------
    X_query : array-like of shape (n_samples, n_features)
        Test set.
    X_dsel : array-like of shape (n_samples, n_features)
        Dynamic selection set.
    y_dsel : array-like of shape (n_samples,)
        The target values (Dynamic selection set).
    ensemble : list of shape = [n_classifiers]
        The ensemble of classifiers to be pruned.
    k : int
        Number of neighbors used to compute the regions of competence.

    Returns
    -------
    DFP_mask : array-like of shape = [n_samples, n_classifiers]
               Mask containing 1 for the selected base classifier and 0
               otherwise.

    """
    predictions = np.zeros((X_dsel.shape[0], len(ensemble)),
                           dtype=np.intp)
    for index, clf in enumerate(ensemble):
        predictions[:, index] = clf.predict(X_dsel)
    hit_miss = predictions == y_dsel[:, np.newaxis]
    competence_region = KNeighborsClassifier(n_neighbors=k).fit(X_dsel, y_dsel)
    neighbors = competence_region.kneighbors(X_query, return_distance=False)
    return frienemy_pruning_preprocessed(neighbors, y_dsel, hit_miss)
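The companion routine frienemy_pruning_preprocessed is not shown (in DESlib it lives in deslib.util.dfp). A rough functional sketch of what the docstring describes, under the assumption that an all-zero row falls back to keeping the whole pool:

import numpy as np

def frienemy_pruning_preprocessed(neighbors, y_dsel, hit_miss):
    n_samples = neighbors.shape[0]
    n_classifiers = hit_miss.shape[1]
    dfp_mask = np.zeros((n_samples, n_classifiers))
    for i, region in enumerate(neighbors):
        classes_in_region = np.unique(y_dsel[region])
        if classes_in_region.size == 1:
            dfp_mask[i, :] = 1.0            # "safe" region: keep every classifier
            continue
        for j in range(n_classifiers):
            hit_classes = y_dsel[region][hit_miss[region, j]]
            # keep classifiers that hit at least one sample of every class present
            if np.isin(classes_in_region, hit_classes).all():
                dfp_mask[i, j] = 1.0
        if not dfp_mask[i].any():           # nothing crosses the region: keep all
            dfp_mask[i, :] = 1.0
    return dfp_mask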
Example #31
def neighborhood_hit(X, y, k):
    # X is data, y labels and k number of neighbors
    logging.info("Computing neighborhood hit")
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    neighbors = knn.kneighbors(X, return_distance=False)
    return np.mean(np.mean((y[neighbors] == np.tile(y.reshape((-1, 1)), k)).astype('uint8'), axis=1))
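A toy check (made-up blobs): well-separated classes should score close to 1.0, while shuffled labels hover near chance.

import numpy as np
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=200, centers=2, random_state=0)
print(neighborhood_hit(X_demo, y_demo, k=5))   # ~1.0 for separable blobs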
Example #32
def NSC_k_NN(df_treatment, embeds_cols, plot_conf=False, savepath=None):
    # Create classes for each moa
    class_dict = dict(zip(df_treatment['moa'].unique(), np.arange(len(df_treatment['moa'].unique()))))
    df_treatment['moa_class'] = df_treatment['moa'].map(class_dict)

    # Create nearest neighbors classifier
    predictions = list()
    labels = list()
    label_names = list()
    for comp in df_treatment['compound'].unique():
        df_ = df_treatment.loc[df_treatment['compound'] != comp, :]
        knn = KNeighborsClassifier(n_neighbors=4, algorithm='brute', metric='cosine')
        knn.fit(df_.loc[:, embeds_cols], df_.loc[:, 'moa_class'])

        nn = knn.kneighbors(df_treatment.loc[df_treatment['compound'] == comp, embeds_cols])
        for p in range(nn[1].shape[0]):
            predictions.append(list(df_.iloc[nn[1][p]]['moa_class']))
        labels.extend(df_treatment.loc[df_treatment['compound'] == comp, 'moa_class'])
        label_names.extend(df_treatment.loc[df_treatment['compound'] == comp, 'moa'])

    predictions = np.asarray(predictions)
    k_nn_acc = [accuracy_score(labels, predictions[:, 0]),
                accuracy_score(labels, predictions[:, 1]),
                accuracy_score(labels, predictions[:, 2]),
                accuracy_score(labels, predictions[:, 3])]

    if plot_conf:
        print('There are {} treatments'.format(len(df_treatment)))
        print('NSC is: {:.2f}%'.format(accuracy_score(labels, predictions[:, 0]) * 100))
        plot_confusion_matrix(labels, predictions[:, 0], class_dict, 'NSC', savepath)
    return k_nn_acc
Example #33
def main():
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20,
                cmap='cool')  # c=y colors the points by class; cmap selects the colormap
    plt.scatter(c[:, 0], c[:, 1], c='orange', s=50, marker='^')
    # train the model
    k = 5
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, y)
    # predict with the model
    X_sample = np.array([[0, 1], [-2, 4],
                         [3, 1]])  # newer sklearn requires a 2-D array, hence the wrapped np.array
    y_sample = clf.predict(X_sample)
    print(y_sample)
    # print(X_sample[:,1])
    neighbors = clf.kneighbors(X_sample, return_distance=False)
    # print(neighbors)
    plt.scatter(X_sample[:, 0],
                X_sample[:, 1],
                c=y_sample,
                marker='x',
                cmap='cool')  # note: if the predictions cover only two classes, the colors may not match the training plot
    for index, element in enumerate(neighbors):  # enumerate yields each test sample's index
        for i in element:
            # print(index)
            plt.plot([X[i][0], X_sample[index][0]],
                     [X[i][1], X_sample[index][1]],
                     'g--',
                     linewidth=0.6)  # connect each test point to its k nearest samples
    # plt.plot([1,2,3], [1,2,3], 'go-', label='line 1', linewidth=2)
    plt.show()
Example #34
def calculateDiscriminant(prototypes, classes, values):
    d = []
    c1 = classes[0]
    c2 = classes[1]
    bool_c1 = False
    bool_c2 = False

    knn = KNeighborsClassifier(n_neighbors=len(classes))
    knn.fit(prototypes, classes)

    for i in values:
        dist, kn_index = knn.kneighbors(X=[i], return_distance=True)
        dist = dist[0]
        kn_index = kn_index[0]

        for j in range(len(kn_index)):
            if ((classes[kn_index[j]] == c1) and not bool_c1):
                p_c1 = kn_index[j]
                dist_c1 = dist[j]
                bool_c1 = True
            elif ((classes[kn_index[j]] == c2) and not bool_c2):
                p_c2 = kn_index[j]
                dist_c2 = dist[j]
                bool_c2 = True

        d.append(dist_c1 - dist_c2)
        bool_c1 = False
        bool_c2 = False

    return d
Example #35
def para_func(arg):
    num, shape, metric, cnum = arg
    X = _sharedX
    centers = choice(X.shape[0], cnum, False)
    mod = KClass(1, metric=metric)
    mod.fit(X[centers, :], range(centers.size))
    dist, m = mod.kneighbors(X, return_distance=True)
    return m
Example #36
def kNNEngine(k,train,test):
    #create classifier
    clf= KNeighborsClassifier(k)
    clf.fit(train, [0]*len(train))  # labels are dummy values; only the neighbor search is used

    #get nearest neighbors of my test point
    x = clf.kneighbors(test)  # returns (distances array, neighbor indices array)
    similarity_scores, indices= x[0][0], x[1][0]
    return similarity_scores, indices
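A usage sketch (toy points): since the fit labels are throwaways, any test point simply gets back its k closest training rows.

import numpy as np

train = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]])
scores, indices = kNNEngine(2, train, np.array([[0.9, 0.9]]))
print(scores, indices)   # distances to, and indices of, the 2 nearest rows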
Example #37
class KNN_c():

    def __init__(self, k=5):
        self.k = k
        self._model = KNeighborsClassifier(n_neighbors=k)

    def description(self):
        return 'KNN %s' % (self.k)

    def predict_p(self, X_train, y_train, X_test):
        self._model.fit(X_train, y_train)
        # Empirical probability of class 1 among the k nearest neighbors
        return np.array([
            np.mean(y_train[self._model.kneighbors(X_test[i:i + 1, :])[1]] == 1)
            for i in range(X_test.shape[0])
        ])
Example #38
class RoomClassifier:
    """
    Class to convert fingerprints into a room label

    Train the classifier using a set of labeled fingerprints by calling fit().
    New fingerprints can be labeled using predict(). To recognize fingerprints
    outside of the calibrated rooms, call predict_outlier() to check whether
    the fingerprints are outliers.

    This class is a simple wrapper around sklearn's PCA and KNeighborsClassifier.
    """
    def __init__(self, outlier_threshold=10.0):
        """
        Instantiate room classifier
        :param outlier_threshold: Threshold in dB for outlier detection
        """
        self.dimred = PCA(n_components=5)
        self.classifier = KNeighborsClassifier(n_neighbors=5)
        self.outlier_threshold = outlier_threshold

    def fit(self, fingerprints, label):
        """
        Train room classifier using labeled fingerprints
        :param fingerprints: list of fingerprints
        :param label: list of labels corresponding to fingerprints
        """
        fp = self.dimred.fit_transform(fingerprints)
        self.classifier.fit(fp, label)

    def predict(self, fingerprints):
        """
        Predict room label for all fingerprints
        :param fingerprints: list of fingerprints
        :return: list of room labels
        """
        fp = self.dimred.transform(fingerprints)
        return self.classifier.predict(fp)

    def predict_outlier(self, fingerprints):
        """
        Predict whether the fingerprints are taken in an unlabeled room
        :param fingerprints: list of fingerprints
        :return: list of booleans, True if the room is unlabeled, False otherwise
        """
        fp = self.dimred.transform(fingerprints)
        dist, ind = self.classifier.kneighbors(fp, n_neighbors=1, return_distance=True)
        return (dist > self.outlier_threshold).reshape(-1)
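A usage sketch with hypothetical fingerprints (rows of RSSI values in dB for 8 access points; all numbers below are synthetic):

import numpy as np

rng = np.random.RandomState(0)
kitchen = rng.normal(-50, 2, size=(20, 8))
hallway = rng.normal(-70, 2, size=(20, 8))
fingerprints = np.vstack([kitchen, hallway])
labels = ['kitchen'] * 20 + ['hallway'] * 20

rc = RoomClassifier(outlier_threshold=10.0)
rc.fit(fingerprints, labels)
print(rc.predict(kitchen[:2]))                               # ['kitchen' 'kitchen']
print(rc.predict_outlier(rng.normal(-90, 2, size=(1, 8))))   # likely [ True]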
Example #39
def smote(X, y, target, family, k=None, sp=np.array([])):
    """
    INPUT:
    X, y - your data
    target - the percentage of positive class 
             observations in the output
    k - k in k nearest neighbors
    OUTPUT:
    X_oversampled, y_oversampled - oversampled data
    `smote` generates new observations from the positive (minority) class:
    For details, see: https://www.jair.org/media/953/live-953-2037-jair.pdf
    """
    if target <= np.sum([y==family])/float(len(y)):
        return X, y
    if k is None:
        k = int(len(X) ** 0.5)  # n_neighbors must be an integer

    # fit kNN model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X[y==family], y[y==family])
    neighbors = knn.kneighbors()[1]  # [1]: neighbor indices within the positive class
    positive_observations = X[y==family]

    # determine how many new positive observations to generate
    positive_count = np.sum([y==family])
    negative_count = len(y) - positive_count
    target_positive_count = target*negative_count / (1. - target)
    target_positive_count = int(round(target_positive_count))
    number_of_new_observations = target_positive_count - positive_count

    # generate synthetic observations
    synthetic_observations = np.empty((0, X.shape[1]))
    while len(synthetic_observations) < number_of_new_observations:
        obs_index = np.random.randint(len(positive_observations))
        observation = positive_observations[obs_index]
        neighbor_index = np.random.choice(neighbors[obs_index])
        neighbor = positive_observations[neighbor_index]  # index refers to the positive subset
        obs_weights = np.random.random(len(neighbor))
        neighbor_weights = 1 - obs_weights
        new_observation = obs_weights*observation + neighbor_weights*neighbor
        synthetic_observations = np.vstack((synthetic_observations, new_observation))

    X_smoted = np.vstack((X, synthetic_observations))
    y_smoted = np.concatenate((y, [family]*len(synthetic_observations)))

    return X_smoted, y_smoted
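A usage sketch on a made-up imbalanced set (90 negatives, 10 positives labeled 1):

import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(0, 1, (90, 2)), rng.normal(3, 1, (10, 2))])
y_demo = np.array([0] * 90 + [1] * 10)

X_bal, y_bal = smote(X_demo, y_demo, target=0.5, family=1, k=3)
print(np.mean(y_bal == 1))   # roughly 0.5 after oversampling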
Example #40
def main(noun_to_vect_dict_loc, labels_loc, centroids_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load in pickled noun to vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(noun_to_vect_dict_loc, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)

    # Create nouns array
    nouns = np.array(list(noun_to_vect_dict.keys()))
    # Create vectors array
    vectors = list(noun_to_vect_dict.values())

    # Create labels array
    labels = []
    # Load in labels
    logger.info('Loading in labels')
    with open(labels_loc, 'r') as f:
        for line in f:
            labels.append(int(line))
    labels = np.array(labels)

    # Load in pickled centroids
    logger.info('Loading pickled centroids')
    with open(centroids_loc, 'rb') as f:
        centroids = pickle.load(f)

    # Create empty dictionary for top nouns for a cluster
    top_nouns_dict = {}

    # Instantiate and fit kNN model
    knc = KNeighborsClassifier(n_jobs=-1)
    knc.fit(vectors, labels)

    # Get indices of top vectors
    for i, centroid in enumerate(centroids):
        # Determine number of representative vectors to get
        class_size = sum(labels == i)
        n_neighbors = 50 if class_size >= 50 else class_size
        # Get indices of n_neighbors vectors nearest to centroid
        indices = knc.kneighbors(X=centroid.reshape(1, -1), n_neighbors=n_neighbors,
                                 return_distance=False)
        # Add top nouns corresponding to those indices to dictionary
        top_nouns_dict[i] = nouns[indices[0]]
Example #41
def kNearestNeighbours(X, y, C, L=40, s=""):    
    print "Doing k-nearest neighbours for: " 
    print s
    minError = 500
    bestK = -1
    N = len(X)
    
    # Cross-validation not necessary. Instead, compute matrix of nearest neighbor
    # distances between each pair of data points ..
    knclassifier = KNeighborsClassifier(n_neighbors=L+1).fit(X, y)
    neighbors = knclassifier.kneighbors(X)
    # .. and extract matrix where each row contains class labels of subsequent neighbours
    # (sorted by distance)
    ndist, nid = neighbors[0], neighbors[1]
    nclass = y[nid].flatten().reshape(N,L+1)
    
    # Use the above matrix to compute the class labels of majority of neighbors
    # (for each number of neighbors l), and estimate the test errors.
    errors = np.zeros(L)
    nclass_count = np.zeros((N,C))
    for l in range(1,L+1):
        for c in range(C):
            nclass_count[:,c] = sum(nclass[:,1:l+1]==c,1).A.ravel()
        y_est = np.argmax(nclass_count, 1)
        errors[l-1] = (y_est != y.A.ravel()).sum()
        if errors[l-1] < minError:
            minError = errors[l-1]
            bestK = l
    
        
    # Plot the classification error rate
    figure()
    plot(100*errors/N)
    xlabel('Number of neighbors')
    ylabel('Classification error rate (%)')
    
    figure()
    imshow(nclass, cmap='binary', interpolation='None')
    xlabel("k'th neighbor")
    ylabel('data point')
    title("Neighbors class matrix")
    
    show()
    
    print('\n')
    
    return (bestK, minError / N)
Example #42
def similar_users(user,genre_arrays,neighbors):
	''' Pass: the active user object and the number of neighbors to calculate
		Returns: An array of similar users, containing the username and the distance to that user on the genre-dimensional plot
	 '''
	# if user.pk == 18:
	# 	pdb.set_trace()
	id_array, x_array = genre_arrays
	copy_id_array = id_array.copy()
	copy_x_array = x_array.copy()	
	user_id = user.pk
	id_index = copy_id_array.index(user_id)
	user_array = copy_x_array[id_index]
	del copy_id_array[id_index]
	del copy_x_array[id_index]
	if len(copy_x_array) < neighbors:
		neighbors = len(copy_x_array)
	y_array = [random.random() for x in range(len(copy_x_array))]
	neigh = KNeighborsClassifier(n_neighbors=neighbors)
	neigh.fit(copy_x_array, y_array)
	result = neigh.kneighbors([user_array], neighbors)
	similar_users = [[copy_id_array[result[1][0][x]],result[0][0][x]] for x in range(neighbors)]
	return similar_users
Example #43
def getSimilarArticles(target, numOfDays, numOfNeighbors):
    articles = app.getTrainingSet(500, 70)
    neigh = KNeighborsClassifier()
    count_vect = CountVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf_trans = TfidfTransformer()
    trainingTitle = [x.title for x in articles]
    trainingLabels = [1 for x in articles]
    targetTitleCounts = count_vect.fit_transform([target.title])
    targetCounts = count_vect.transform([target.text]) + targetTitleCounts
    trainingCounts = count_vect.transform(trainingTitle)
    print(count_vect.get_feature_names())
    trainingCountsTfidf = tfidf_trans.fit_transform(trainingCounts)
    targetCountsTfidf = tfidf_trans.transform(targetTitleCounts)
    print(targetCounts)
    print('After weighting by tf-idf:')
    targetCounts = targetCounts.multiply(targetCountsTfidf)
    print(targetCounts)
    neigh.fit(trainingCounts, trainingLabels)
    similar_articles_index = neigh.kneighbors(targetCounts, numOfNeighbors, False)
    similar_articles = []
    for index in similar_articles_index[0]:
        similar_articles.append(articles[index].title)
    return similar_articles
Example #44
knn.fit(X, y)                                       # fit with data

knn.predict([[3, 5, 4, 2]])                         # predict for a new observation (2-D input)


# predict for multiple observations at once
X_new = [[3, 5, 4, 2], [3, 5, 2, 2]]
knn.predict(X_new)

# try a different value of K
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.predict(X_new)              # predictions
knn.predict_proba(X_new)        # predicted probabilities
knn.kneighbors([[3, 5, 4, 2]])  # distances to nearest neighbors (and their indices)

# compute the accuracy for K=5 and K=1

# K = 5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.score(X, y)
# the score function will return the accuracy of your prediction
# the number of correct predictions / the number of rows


# K = 1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
knn.score(X, y)
Example #45
def relieff(X,y, ind, _class, no_iter, nneighbors):
    df = pd.DataFrame(X)
    df['class'] = y
    knn = KNeighborsClassifier(n_neighbors=nneighbors)
    #group dataframe by class
    grouped = df.groupby(_class)
    
    #extract dataframe groups
    by_class_dfs = [df for name, df in grouped]
    
    #Retrieve selected class dataframe   
    df0 = by_class_dfs.pop(ind)
    pos_class = df0['class'].iloc[0]   
    
    #convert dataframe for each group into X matrices of values and y vectors of class labels
    Xy_list = [df_to_Xy(df) for df in by_class_dfs]
    
    #train Knn models
    knn_models = []
    for X, y in Xy_list:
        knn = KNeighborsClassifier()
        knn_models.append(knn.fit(X, y))
   
    num_attrs = len(df0.sample().columns)-1
    weights = np.zeros(num_attrs)
    
    num_classes = len(by_class_dfs)
    i = 0
    while i < no_iter:
        #print 'i: '+str(i)
        inst0 = df0.drop('class', axis=1).sample()
        a = inst0.to_numpy()[0]
        _df0 = df0.drop(inst0.index)
        X0, y0 = df_to_Xy(_df0)
        knn0 = KNeighborsClassifier(n_neighbors=nneighbors)
        knn0 = knn0.fit(X0, y0)
        nn_hit_indices = knn0.kneighbors(a.reshape([1,-1]), return_distance=False)[0]
        
        nn_misses = [knn.kneighbors(a.reshape([1,-1]), return_distance=False)[0] for knn in knn_models]       
                    
        for j in range(num_attrs):   
            nn_hit = [abs(a[j] - X0[l][j]) for l in nn_hit_indices]
            nn_hit_val = np.mean(nn_hit)    
              
            nn_miss = np.zeros(num_classes)
            # for each negative class, average the attribute distance to its nearest misses
            for k in range(num_classes):
                X1 = Xy_list[k][0]
                nn_miss_k = [abs(a[j] - X1[l][j]) for l in nn_misses[k]]
                nn_miss[k] = np.mean(nn_miss_k)
            nn_miss_val = np.mean(nn_miss)    
            # good attributes differ little on hits and a lot on misses
            weights[j] = weights[j] - nn_hit_val + nn_miss_val
    return weights, pos_class
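
# Added usage sketch for relieff above. The helper df_to_Xy is referenced
# in the snippet but not shown; the one-liner below is a plausible guess at
# its behavior, not the original. Iris serves as stand-in data.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

def df_to_Xy(df):
    # split a frame into a feature matrix and a label vector (assumed behavior)
    return df.drop('class', axis=1).values, df['class'].values

X_iris, y_iris = load_iris(return_X_y=True)
weights, pos_class = relieff(X_iris, y_iris, ind=0, _class='class',
                             no_iter=20, nneighbors=3)
print(pos_class, weights)   # larger weights suggest more relevant attributes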
    subdirname = basedir + actions[actionnum] + '/'
    subdir = os.listdir(subdirname)
    for seqnum in range(len(subdir)):
        allMHIs[:,:,allMHIs_counter] = computeMHI(subdirname + subdir[seqnum])
        allMoments[allMHIs_counter,:] = huMoments(allMHIs[:,:,allMHIs_counter])
        allMHIs_counter += 1

allMoments = allMoments/np.linalg.norm(allMoments)
testMoments = allMoments[k,:]
allLabels = [i for i in xrange(1,6) for j in xrange(4)]
allLabels = np.array(allLabels)


neigh = KNeighborsClassifier(n_neighbors=5)
#neigh.fit([trainMHI[:,:,i].flatten() for i in xrange(20)], trainLabels)
#print(neigh.predict([testMHI.flatten()]))
neigh.fit(allMoments, allLabels)
dist, ind = neigh.kneighbors(testMoments.reshape(1, -1), n_neighbors=4)  # kneighbors expects 2-D input

fig=plt.figure()
ax=fig.add_subplot(3,2,1)
ax.set_title("Input")
ax.imshow(allMHIs[:,:,k], cmap = cm.Greys_r)
for i in xrange(ind.size):
    bx=fig.add_subplot(3,2,i+3)
    bx.set_title("Nearest - %d" %i)
    bx.imshow(allMHIs[:,:,ind[0][i]], cmap = cm.Greys_r)
#    cv2.imshow("pic %d"%i, allMHIs[:,:,i])
plt.show()
if (cv2.waitKey(0) & 0xFF) == 27:  # Esc closes the windows
    cv2.destroyAllWindows()
# and take those 140 samples' labels as the training labels
iris_x_test = iris_x[indices[-10:]]
# the remaining 10 samples form the test set
iris_y_test = iris_y[indices[-10:]]
# and their labels form the test labels

knn = KNeighborsClassifier()
# create a KNN classifier object
knn.fit(iris_x_train, iris_y_train)
# fit takes two arguments: the training data and its labels

iris_y_predict = knn.predict(iris_x_test)
# predict takes one argument: the test data
probability = knn.predict_proba(iris_x_test)
# per-class predicted probabilities for each test sample
neighborpoint = knn.kneighbors([iris_x_test[-1]], 5, False)
# indices of the 5 training points closest to the last test sample
# (kneighbors expects a 2-D array, hence the extra brackets)
score = knn.score(iris_x_test, iris_y_test, sample_weight=None)
# score computes the prediction accuracy

print('iris_y_predict = ')
print(iris_y_predict)
# print the predictions

print('iris_y_test = ')
print(iris_y_test)
# print the true test labels for comparison
print('Accuracy:', score)
# print the accuracy
print('neighborpoint of last test sample:', neighborpoint)
 
from pylab import *
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier

# requires data from exercise 4.1.1
from ex4_1_1 import *

# Maximum number of neighbors
L=40


# Cross-validation not necessary. Instead, compute matrix of nearest neighbor
# distances between each pair of data points ..
knclassifier = KNeighborsClassifier(n_neighbors=L+1).fit(X, ravel(y))
neighbors = knclassifier.kneighbors(X)
# .. and extract matrix where each row contains class labels of subsequent neighbours
# (sorted by distance)
ndist, nid = neighbors[0], neighbors[1]
nclass = y[nid].flatten().reshape(N,L+1)

# Use the above matrix to compute the class labels of majority of neighbors
# (for each number of neighbors l), and estimate the test errors.
errors = np.zeros(L)
nclass_count = np.zeros((N,C))
for l in range(1,L+1):
    for c in range(C):
        nclass_count[:,c] = sum(nclass[:,1:l+1]==c,1).A.ravel()
    y_est = np.argmax(nclass_count, 1)
    errors[l-1] = (y_est!=y.A.ravel()).sum()
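
# Added sketch: the error counts computed above are never displayed in this
# snippet; in the same pylab style one might plot the estimated error rate
# as a function of the number of neighbors (N and L as defined above).
figure()
plot(range(1, L+1), 100.0*errors/N)
xlabel('Number of neighbors')
ylabel('Classification error rate (%)')
show()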
Example #49
#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

nbrs = KNeighborsClassifier(n_neighbors=25, algorithm='ball_tree').fit(features_train, labels_train)
distances, indices = nbrs.kneighbors(features_train)

predicted = nbrs.predict(features_test)
accuracy = accuracy_score(labels_test, predicted)
print accuracy

try:
    prettyPicture(nbrs, features_test, labels_test)
except NameError:
    pass
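
# Added sketch: n_neighbors=25 above is hard-coded; a small sweep over K on
# the same held-out split (variables reused from the snippet) can justify
# the choice.
best_k, best_acc = None, 0.0
for k in [1, 5, 10, 25, 50]:
    clf_k = KNeighborsClassifier(n_neighbors=k, algorithm='ball_tree')
    clf_k.fit(features_train, labels_train)
    acc = accuracy_score(labels_test, clf_k.predict(features_test))
    if acc > best_acc:
        best_k, best_acc = k, acc
print best_k, best_acc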
class MagentoClassifier(object):
    N_NEIGHBORS = 2 # was 10
    KNN_WEIGHTS = 'distance'

    @staticmethod
    def test_on_dataset(buildings, test_images_per_building=1, train_images_per_building=-1, class_count=-1,
                        n_neighbors=N_NEIGHBORS,
                        weights=KNN_WEIGHTS, method='mode', iterations=1, seed=-1):
        """
        :type test_images_per_building: int
        :type train_images_per_building: int
        :type weights: str
        :type n_neighbors: int
        :type buildings: list[Building]
        :rtype: float
        """
        if method not in METHODS:
            raise Exception('Pick valid method from ' + str(METHODS))
        method_idx = METHODS.index(method)
        ult_score = np.zeros(len(METHODS))
        for iter in range(1, iterations + 1):
            print_info("Starting testing iteration " + str(iter) + "/" + str(iterations))
            train_images, test_images = MagentoClassifier._test_train_split_buildings(buildings,
                                                                                      train_images_per_building=train_images_per_building,
                                                                                      test_images_per_building=test_images_per_building,
                                                                                      class_count=class_count,
                                                                                      seed=seed)
            mc = MagentoClassifier(n_neighbors=n_neighbors, weights=weights)
            mc.fit(train_images)
            score = mc.score(test_images, method=method)
            print_result("Iteration " + str(iter) + "/" + str(iterations) + " score is " + str(
                    score[method_idx]) + " (all scores: " + str(zip(METHODS, score)) + ")")
            ult_score += score
            seed += 1 if seed != -1 else 0
            print_result("Ultimate score so far is " + str(ult_score[method_idx] / iter) + " (all scores: " + str(
                    zip(METHODS, ult_score / iter)) + ")")
        score_iterations = ult_score / iterations

        print_result("Ultimate score is " + str(score_iterations[method_idx]) + " (all scores: " + str(
                zip(METHODS, score_iterations)) + ")")
        return score_iterations

    @staticmethod
    def _test_train_split_buildings(buildings, test_images_per_building=1, train_images_per_building=-1, class_count=-1,
                                    seed=-1):
        """
        :type test_images_per_building: int
        :type train_images_per_building: int
        :type buildings: list[Building]
        """
        mapped = map(lambda building: building.get_test_train_images(train_count=train_images_per_building,
                                                                     test_count=test_images_per_building, seed=seed),
                     buildings)
        random.shuffle(mapped)
        all_trains, all_tests = [], []
        all_classes = 0
        for train, test in mapped:
            if class_count != -1 and all_classes == class_count:
                break
            if (len(train) < train_images_per_building) or (len(test) < test_images_per_building):
                pass
            else:
                all_trains.extend(train)
                all_tests.extend(test)
                all_classes += 1

        if all_classes != class_count:
            assert class_count == -1, "Database too small"
            print_warn("There are not enough samples for all classes")

        print_info(
                "Loaded " + str(all_classes) + " classes, with " + str(train_images_per_building) + " train and " + str(
                        test_images_per_building) + " test images, with " + str(
                        ("seed=" + str(seed)) if seed != -1 else "default seed"))
        return all_trains, all_tests

    def __init__(self, n_neighbors=N_NEIGHBORS, weights=KNN_WEIGHTS):
        self._classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        self._buildings = None  # type: None|list[Building]
        self._classes = None  # type: None|list[int]
        self._features_all = None  # type: None|list[Feature]

    def fit(self, images):
        """

        :type images: list[Images]
        """
        print_info("Starting fitting process")
        assert all([image.get_building() is not None for image in images])
        self._buildings = list(set([image.get_building() for image in images]))
        self._buildings.sort(key=Building.get_identifier)
        self._classes = map(Building.get_identifier, self._buildings)

        features_all = []
        for image in images:
            print_result("Processing image " + str(image))
            features_all.append(image.get_all_features())
        features_all = [feature for features in features_all for feature in features]

        self._features_all = features_all

        descriptors_all = np.array([feature.get_descriptor() for feature in features_all])
        assert len(descriptors_all.shape) == 2

        classes_all = np.array(
                [feature.get_image().get_building().get_identifier() for feature in features_all])
        assert len(classes_all.shape) == 1

        assert descriptors_all.shape[0] == classes_all.shape[0]

        self._classifier.fit(descriptors_all, classes_all)

    def predict(self, images, method='mode'):
        """

        :type images: list[Image]
        :rtype: list[int]
        """
        print_info("Starting predict process")
        size = float(len(images))
        data = [(image, self, idx / size, method) for idx, image in enumerate(images)]
        if CPU_COUNT == 1:
            return [predict(d) for d in data]
        else:
            pool = Pool(CPU_COUNT)
            results = pool.map(predict, data)
            pool.close()
            pool = None
            gc.collect()
            return results

    def score(self, images, method='mode'):
        """
        :type images: list[Image]
        :rtype: np.ndarray
        """
        print_info("Starting scoring process")
        y_pred = np.array(self.predict(images, method=method))
        y_true = np.array([image.get_building().get_identifier() for image in images])[:, np.newaxis]
        return np.sum((y_true - y_pred) == 0, axis=0) / float(y_pred.shape[0])

    def show_match(self, image_test, descriptors_all):
        """
        :type image_test: Image
        :type matches: np.array
        :type distances: np.array
        """
        distances, matches = self._classifier.kneighbors(descriptors_all, return_distance=True, n_neighbors=1)

        image_test_rgb = image_test.get_rgb()  # type: Image

        for feature, match_ids, match_dists in zip(image_test.get_all_features(), matches, distances):
            xy1, w1 = feature.get_global_xy_w()
            for m in match_ids:
                other_feature = self._features_all[m]
                image_train_rgb = other_feature.get_image().get_rgb()
                xy2, w2 = other_feature.get_global_xy_w()

                offset = image_test_rgb.shape[1]
                size = offset + image_train_rgb.shape[1]
                xy2 += [offset, 0]

                showoff = np.zeros((Image.DEFAULT_HEIGHT, size, 3), np.uint8)

                showoff[0:image_test_rgb.shape[0], 0:image_test_rgb.shape[1], :] = image_test_rgb
                showoff[0:image_train_rgb.shape[0], 0 + offset:image_train_rgb.shape[1] + offset, :] = image_train_rgb

                cv2.line(showoff, tuple(xy1), tuple(xy2), (0, 0, 255), thickness=1)
                cv2.circle(showoff, tuple(xy1), w1, (0, 0, 255), thickness=1)
                cv2.circle(showoff, tuple(xy2), w2, (0, 0, 255), thickness=1)
                plt.imshow(cv2.cvtColor(showoff, cv2.COLOR_RGB2BGR)), plt.show()
Example #51
# 		else:
# 			score[l] = 1/dist[i][j]
# 			count[l] = 1
# 	for key in range(numCategories):
# 		if key in score:
# 			p.append(score[key] / count[key])
# 		else:
# 			p.append(0)
# 	p = np.array(p)
# 	proba.append(p / np.sum(p))
# 	prediction.append(np.argmax(p)+1)

# find neighbors
neighborFile = open(outputDir + "/neighbors.txt", "w")
for i in range(len(features["test"])):
    dist, match = neigh.kneighbors([features["test"][i]])  # 2-D input: one sample
    neighborFile.write(str(labels["test"][i]) + " " + str(match[0][0]) + " " + str(dist[0][0]) + "\n")
neighborFile.close()

# output data
file = open(outputFile, "w")
file.write("labels ")
for c in classes:
    file.write(str(c) + " ")
file.write("\n")
for i in range(len(prediction)):
    l = prediction[i]
    file.write(str(l) + " ")
    for p in proba[i]:
        file.write(str(p) + " ")
    file.write("\n")
Example #52
knc = KNeighborsClassifier(n_neighbors=5)
knc_1 = []
knc_2 = []
knc_3 = []
knc_4 = []
knc_5 = []

print("Train KNN classifier (step 10/13)")
for d in tqdm(range(reduced.shape[0])):
    dest = destinations.loc[d, 'srch_destination_id']
    # Fitting model
    knc = knc.fit(reduced[destinations.srch_destination_id != dest],
                  destinations.srch_destination_id[destinations.
                                                   srch_destination_id != dest])

    # .values replaces the removed DataFrame.as_matrix()
    nearest_neighbors = knc.kneighbors(np.reshape(
        reduced.loc[d, :].values, [1, -1]), return_distance=False)
    nearest_neighbors = nearest_neighbors[0]
    # For each destination, list of first, second, etc. nearest neighbours.
    knc_1.append(nearest_neighbors[0])
    knc_2.append(nearest_neighbors[1])
    knc_3.append(nearest_neighbors[2])
    knc_4.append(nearest_neighbors[3])
    knc_5.append(nearest_neighbors[4])

# Now we need to match the destinations in the testing set with their
# corresponding neighbours. For that we need to create temporary dataframes to
# help us merge the results we obtained with the model and the test dataframe.
# There might be a more elegant way to do so!

print("Updating test dataframe (step 11/13)")
temp1 = pd.DataFrame()
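# Added sketch of the merge step described above, since the snippet breaks
# off here. The test dataframe `test` and its srch_destination_id column
# are assumptions about the surrounding code, not taken from the snippet.
temp1 = pd.DataFrame({
    'srch_destination_id': destinations['srch_destination_id'].values,
    'neighbor_1': knc_1,   # index of each destination's nearest neighbour
    'neighbor_2': knc_2,   # knc_3/knc_4/knc_5 would be added the same way
})
test = test.merge(temp1, on='srch_destination_id', how='left')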
Example #53
# create classifier

neigh = KNeighborsClassifier(n_neighbors=3, weights='distance')
print np.vstack(inputs).shape
print np.vstack(outputs)
neigh.fit(np.vstack(inputs), np.vstack(outputs))

# test

while True:
    ret, frame = cap.read()

    feat = cnn.calculate_features(frame)

    print neigh.kneighbors(feat)
    location = np.squeeze(neigh.predict(feat))
    print location
    x = int(location[0])  # cv2.circle needs integer pixel coordinates
    y = int(location[1])

    # show display
    resized = cv2.resize(frame, (0,0), fx=.25, fy=.25)
    height, width = resized.shape[:2]
    img = np.zeros((512,512,3), np.uint8)
    cv2.circle(img,(x,y),10,(255,0,0),-1)
    img[-height:,-width:] = resized

    cv2.imshow('frame',img)
    if (cv2.waitKey(1) & 0xFF) == ord('q'):
        break
Example #54
class project:
    def __init__(self):
        self.train_dataframe = pd.read_csv("data/training.csv", header=0)
        self.test_dataframe = pd.read_csv("data/test.csv", header=0)
        self.test_dataframe_refId = self.test_dataframe["RefId"]

        self.preprocess_data()

        # Initialise classifiers
        self.attributes = list(self.train_dataframe.columns.values)[1:]
        self.lsh_neighbours = 2000
        self.initialise_knn()
        self.initialise_pca()
        self.initialise_svm()
        self.initialise_nn()

        self.lsh = lsh.lsh(self.train_dataframe)

    def preprocess_data(self):
        print "Preprocessing Data"
        self.train_dataframe = preprocess.preprocess(self.train_dataframe)
        self.test_dataframe = preprocess.preprocess(self.test_dataframe)

        # Add dummy column to test dataframe to match dimensions
        # Quick hack: should take away
        self.test_dataframe["IsBadBuy"] = 0

    def initialise_knn(self):
        print "Initialising KNN"
        k = int(np.sqrt(self.lsh_neighbours / 2))  # n_neighbors must be an integer
        # k = 150	# Testing
        self.knn_clf = KNeighborsClassifier(n_neighbors=k)

    def initialise_pca(self):
        print "Initialsing PCA"
        self.pca_clf = PCA(n_components=len(self.attributes) / 2)

    def initialise_svm(self):
        print "Initialising SVM"
        self.svm_clf = svm.SVC(kernel="linear")

    def initialise_nn(self):
        print "Initialising Neural Network"
        num_hidden_nodes = 3
        learning_rate = 0.05
        batch_size = 30
        self.nn_clf = BernoulliRBM(n_components=num_hidden_nodes, learning_rate=learning_rate, batch_size=batch_size)

    def run(self):
        predictions = []
        refId = []

        for idx, row in self.test_dataframe.iterrows():
            print "Querying LSH"
            # query_vector = self.train_dataframe.iloc[1]	# Testing query vector
            query_vector = row

            lsh_idx = self.lsh.query(query_vector, self.lsh_neighbours)
            # print lsh_idx

            print "K Nearest Neighbours"
            kneighbours = self.k_nearest_neighbours(lsh_idx, query_vector)

            # For PCA
            # train_pca, query_pca = self.perform_pca(kneighbours, query_vector)
            # prediction = self.neural_network(train_pca, query_pca)

            try:
                prediction = self.neural_network(self.train_dataframe.ix[kneighbours], query_vector)
            except Exception:  # fall back to 0 when the pipeline fails on this row
                prediction = 0
            predictions.append(prediction)
            refId.append(self.test_dataframe_refId.ix[idx])

            # print str(prediction) + " " + str(self.test_dataframe_refId.ix[idx])

            # Quick hack for testing
            """
			if idx == 3:
				break
			"""

        self.output_data(predictions, refId)

    def k_nearest_neighbours(self, lsh_idx, query_vector):
        """
		This function finds num_neighbours k-nearest-neighbours
		- Default k value: sqrt(num_k_neighbours/2)
		- Default Distance: Euclidean
		Reference: http://blog.yhathq.com/posts/classification-using-knn-and-python.html
		
		Returns: np.array([]) of row indices of dataframe that are closest to query vector
		TODO: Graph of accuracy as k increases? Or modify how to calculate distance between points
		"""
        lsh_dataframe = self.train_dataframe.ix[lsh_idx]
        self.knn_clf.fit(lsh_dataframe[self.attributes], lsh_dataframe["IsBadBuy"])
        neighbours = self.knn_clf.kneighbors([query_vector[self.attributes]], return_distance=False)  # 2-D input

        # print neighbours
        return neighbours.flatten()

    def perform_pca(self, kneighbours, query_vector):
        print "Performing PCA"
        dataframe = self.train_dataframe.ix[kneighbours]
        self.pca_clf.fit(dataframe)
        components = self.pca_clf.components_

        train_pca = self.pca_clf.transform(dataframe)
        query_pca = self.pca_clf.transform(query_vector)

        return train_pca.flatten(), query_pca.flatten()

    def neural_network(self, dataframe, query_vector):
        """
		This function trains a neural network based on a PCA transformed dataframe and query vector
		Using: BernoulliRBM, SVM (because 2 classes) pipeline 
		
		Output: prediction for query vector
		"""

        # Drop the predicted variable which was previously put in as dummy to match indices
        query_vector = query_vector.drop(["IsBadBuy"])

        classifier = Pipeline(steps=[("neural", self.nn_clf), ("svm", self.svm_clf)])
        classifier.fit(dataframe[self.attributes], dataframe["IsBadBuy"])
        prediction = classifier.predict(query_vector)

        # print prediction
        return prediction[0]

    def output_data(self, predictions, refID):
        print "Writing to file"
        array = np.vstack((refID, predictions))
        array_transpose = np.array(np.matrix(array).transpose())

        df_results = pd.DataFrame({"RefId": array_transpose[:, 0], "Predicted": array_transpose[:, 1]})
        df_results.to_csv("results.csv", index=False, cols=["RefId", "Predicted"])
def recommend(data, user, user_conferences, bound=0.4, num_neighbors=20):
    """
    :param data: Data object
    :param user: Username
    :param user_conferences: List of visited user conferences
    :param bound: The bound of distance with the recommendations that are derived
    :param num_neighbors: Number of neighbors in KNN
    :return: List of pair (conference, distance)
    """
    # fill mapping to numbers

    # print('Creates user model')
    # def make_user_model():
    #     u_confs = lil_matrix((1, data.members_set.__len__()))
    #     for c in data.list_of_members_with_conferences[user]:
    #         for m in data.list_of_conferences_with_members[c]:
    #             u_confs[0, data.member_to_num[m]] += 1
    #     return normalize(np.divide(u_confs, data.list_of_members_with_conferences[user].__len__()))
    # user_model_vector = make_user_model()
    # fit
    print('Fit')
    kn = KNeighborsClassifier(n_neighbors=num_neighbors, weights='distance', metric='minkowski', p=2)
    matrix = data.to_sparse_matrix()
    confs_name = ['' for _ in range(len(data.conferences_set))]
    for conf in data.conferences_set:
        confs_name[data.conf_to_num[conf]] = conf
    kn.fit(matrix, confs_name)

    # predict
    print('Creates sparse matrix')
    user_confs = data.user_confs_to_sparse_matrix(user)
    # res = sorted(filter(lambda conf: conf not in data.list_of_members_with_conferences[user],
    #                      kn.predict(user_confs)))
    print('Predict')
    rec_conf_with_dist = {i:0 for i in data.conferences_set}
    rec_conf_number = {i:0 for i in data.conferences_set}
    for c in user_conferences:
        dst1, ind1 = kn.kneighbors(data.conf_to_vector(c), return_distance=True)
        conf_with_dist = zip(normalize(dst1[0])[0], ind1[0])
        for dist, conf in conf_with_dist:
            rec_conf_with_dist[data.num_to_conf[conf]] += dist
            rec_conf_number[data.num_to_conf[conf]] += 1
    for conf, number in rec_conf_number.items():
        if number > 1:
            rec_conf_with_dist[conf] /= number
    result = sorted(rec_conf_with_dist.items(), key=operator.itemgetter(1))
    result = list(filter(lambda tmp: tmp[1] != 0 and tmp[1] < bound, result))
    return result

    # dst1, ind1 = kn.kneighbors(user_model_vector, n_neighbors=20, return_distance=True)
    # return dst1, ind1
    # confs_with_simularuty = []
    # for conf in data.list_of_members_with_conferences[user]:
    #     conf_lst = lil_matrix((1, data.members_set.__len__()))
    #     for member in data.list_of_conferences_with_members[conf]:
    #         conf_lst[0, data.member_to_num[member]] = 1
    #     dist, ind = kn.kneighbors(conf_lst, n_neighbors=6, return_distance=True)
    #     dist = dist[0]
    #     ind = [data.num_to_conf[i] for i in ind[0]]
    #     tmp_res = zip(dist, ind)
    #     print(dist)
    #     print(ind)
    #     print(conf)
    #     print('---------')
    #
        # for i in range(ind.__len__()):
        #     confs_with_simularuty[ind[i]] = max(dist[i], confs_with_simularuty[data.num_to_conf[ind[i]]])
    # return confs_with_simularuty
Example #56
    return i

print ("Loading model ...")
data = np.loadtxt(os.path.join(args.d, args.features), delimiter = ',') / 255
labels = np.loadtxt(os.path.join(args.d, args.labels), np.uint8)
lines = [i.strip().split(" ") for i in open(os.path.join(args.d, args.labelmapping))]
labelmap = dict([(int(n), l) for l, n in lines])

clf = KNeighborsClassifier(n_neighbors = 7, algorithm = 'brute')
clf.fit(data, labels)

img = resize_rgb_image(read_rgb_image(args.i), (32, 32))
d = []
for i, details in normed_windows(img, [1.0], details = True):
    arr = flatten(i) / 255
    arr = np.reshape(arr, (1, -1))
    t = clf.predict(arr)
    c = t[0]
    print (c, labelmap[c])
    distances, idx = clf.kneighbors(arr)
    distances = [d for d, p in zip(distances.flatten(), idx.flatten()) if labels[p] == c]
    s = sum(distances) / len(distances)
    d.append([s, c, details])

d = sorted(d)
for s, c, details in d[:3]:
    print (s, labelmap[c], details)
    img = box(img, details[1], details[2], details[0])

write_rgb_image(args.o, img)
Example #57
File: mapper.py  Project: stbman/cs5228
                      max_depth=1, random_state=0, loss='ls')

for chunk in iter_csv:
    chunk = chunk.drop(['DateTime'], axis=1)
    
    attributes = chunk.columns.values
    attributes = np.delete(attributes, attributes.tolist().index('TotalDelay'))    
    
    training_chunk = chunk.iloc[:-1]
    predict_chunk = chunk.iloc[-1]
    actual = predict_chunk['TotalDelay']
    predict_chunk = predict_chunk.drop(['TotalDelay'])
        
    # KNN
    knn.fit(training_chunk[attributes], training_chunk['TotalDelay'])
    neighbours = knn.kneighbors([predict_chunk[attributes]], return_distance=False).flatten()  # 2-D input
    
    # Other classifiers
    new_training_chunk = training_chunk.ix[neighbours]

    try:
        gbr.fit(new_training_chunk[attributes], new_training_chunk['TotalDelay'])
        prediction = gbr.predict([predict_chunk])[0]  # 2-D input: one row
    except:
        prediction = new_training_chunk['TotalDelay'].mean()
    
    print prediction, actual
   


Example #58
# Setting number of neighbors = 5
k = 5
# Running KNN model 
result,neigh = knn(data, test, k) 
# Predicted class 
print(result) 
#-> Iris-virginica
# 5 nearest neighbors
print(neigh)
#-> [141, 139, 120, 145, 144]
 

#Comparing our model with scikit-learn
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data.iloc[:,0:4], data['Name'])

# Predicted class
print(neigh.predict(test))

#-> ['Iris-virginica']

# 3 nearest neighbors
print(neigh.kneighbors(test)[1])
#-> [[141 139 120]]    
"""
Reference : 
https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/
"""
Example #59
def main():
    # open a csv writer
    c = csv.writer(open(Config.output_path + "candle.csv", "wb"))
    c.writerow(
        [
            "TICKER",
            "STRAT_PROFIT",
            "STRAT_CAGR",
            "BUY_HOLD",
            "POS_DAYS",
            "NEG_DAYS",
            "TRUE_POS",
            "TRUE_NEG",
            "FALSE_POS",
            "FALSE_NEG",
            "NUM_TRADES",
        ]
    )
    tickers = Config.sp500
    equities = pd.DataFrame([])
    for ticker in tickers:
        # ohlcva
        raw_data = load_data(ticker)
        dates = load_dates(ticker)

        # feature extraction
        features = gap_features(raw_data)
        returns = get_o2c_returns(raw_data, 2)

        k = 50
        neigh = KNeighborsClassifier(n_neighbors=k, weights="uniform", algorithm="brute")
        dtc = tree.DecisionTreeClassifier()
        rf = RandomForestClassifier()

        print "Beginning analysis of: " + ticker.upper()
        bhold = np.ones(len(returns) + 1)
        equity = np.ones(len(returns) + 1)
        equity_second = np.ones(len(returns) + 1)
        for x in range(len(returns) / 4, len(returns)):
            # if x == len(returns)/4 or x % 500 == 0:
            #    rf.fit(features[x-1200:x], returns[x-1200:x])

            neigh.fit(features[:x], returns[:x])
            # predict = rf.predict(features[x])

            feature_dist, feature_ind = neigh.kneighbors([features[x]])  # 2-D input: one sample

            summation = sum(returns[feature_ind[0]])
            stdev = np.std(returns[feature_ind[0]])

            if summation / k > 0.001:
                equity[x + 1] = 1 + returns[x]

            if summation > stdev:
                equity_second[x + 1] = 1 + returns[x]

            bhold[x + 1] = 1 + returns[x]

        annualRet = annRet(equity.cumprod()[-1], 0.75 * len(returns))
        # NB: only five of the eleven header columns are populated here
        c.writerow([ticker.upper(), equity.cumprod()[-1], annualRet, bhold.cumprod()[-1], equity_second.cumprod()[-1]])

        # combine equity curves
        series = pd.DataFrame({ticker.upper(): equity[1:]}, index=dates[2:])
        equities = equities.join(series, how="outer")
Example #60
# ## Tuning a KNN model

# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
knn.predict(X_new)

# calculate predicted probabilities of class membership
knn.predict_proba(X_new)

# print distances to nearest neighbors (and their identities)
knn.kneighbors([[3, 5, 4, 2]])

# ## Comparing KNN with other models

# Advantages of KNN:
# - Simple to understand and explain
# - Model training is fast
# - Can be used for classification and regression!

# Disadvantages of KNN:
# - Must store all of the training data
# - Prediction phase can be slow when the training set is large
# - Sensitive to irrelevant features
# - Sensitive to the scale of the data
# - Accuracy is (generally) not competitive with the best supervised learning methods
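
# Added sketch for the scale-sensitivity point above: wrapping KNN in a
# scaler usually helps when features have very different ranges. The wine
# data is used here as a stand-in example.
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

X_wine, y_wine = load_wine(return_X_y=True)
raw = KNeighborsClassifier(n_neighbors=5)
scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
print(cross_val_score(raw, X_wine, y_wine, cv=5).mean())     # unscaled baseline
print(cross_val_score(scaled, X_wine, y_wine, cv=5).mean())  # typically much higher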