Example #1
def kernel(input1, ker, arg, input2=None):
    # Inputs are transposed before calling sklearn's pairwise kernels,
    # so they are expected as (n_features, n_samples).
    if input2 is None:
        input1 = input1.T
        if ker == 'linear':
            K = kernels.linear_kernel(input1)
        # polynomial is not implemented; its parameters seem to differ.
        # elif ker == 'poly':
        #     K = kernels.polynomial_kernel(input1, )
        elif ker == 'rbf':
            gamma = 0.5 / (arg * arg)  # arg is the RBF width sigma
            K = kernels.rbf_kernel(input1, gamma=gamma)
        elif ker == 'sigmoid':
            K = kernels.sigmoid_kernel(input1, gamma=arg[0], coef0=arg[1])
        else:
            raise ValueError('unknown kernel: %s' % ker)

        return K

    else:
        input1 = input1.T
        input2 = input2.T
        if ker == 'linear':
            K = kernels.linear_kernel(input1, input2)
        # polynomial is not implemented; its parameters seem to differ.
        # elif ker == 'poly':
        #     K = kernels.polynomial_kernel(input1, input2, )
        elif ker == 'rbf':
            gamma = 0.5 / (arg * arg)  # arg is the RBF width sigma
            K = kernels.rbf_kernel(input1, input2, gamma=gamma)
        elif ker == 'sigmoid':
            K = kernels.sigmoid_kernel(input1, input2, gamma=arg[0], coef0=arg[1])
        else:
            raise ValueError('unknown kernel: %s' % ker)

        return K
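
A minimal usage sketch for the function above (assumption: data is laid out features-by-samples, which is why the function transposes, and kernels is sklearn.metrics.pairwise):

import numpy as np
from sklearn.metrics import pairwise as kernels

X = np.random.rand(5, 100)  # 5 features, 100 samples
K = kernel(X, 'rbf', 2.0)   # arg plays the role of sigma: gamma = 0.5 / sigma**2
assert K.shape == (100, 100)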
Example #2
    def kernelMatrix(self, X, y=None):

        if self.K_type == 'linear':
            """
            if y != None:
                if self.mu == None:
                    reg = Lasso(self.param) #TODO change with a model for classification and let the possibility to specify regression or classification
                    self_mu = reg.fit(X, y).coef_
                    self.Xtr = self.Xtr[:, mp.where(self_mu != 0)]

                self.X = self.X[:, mp.where(self_mu != 0)]
            """
            if self.normalize:
                self.K = normalize(linear_kernel(X, self.Xtr))
            else:
                self.K = linear_kernel(X, self.Xtr)

            return self.K

        if self.K_type == 'polynomial':
            if self.normalize:
                self.K = normalize(
                    polynomial_kernel(X, self.Xtr, degree=self.param))
            else:
                self.K = polynomial_kernel(X, self.Xtr, degree=self.param)

            return self.K

        if self.K_type == 'gaussian':
            if self.normalize:
                self.K = normalize(rbf_kernel(X, self.Xtr, gamma=self.param))
            else:
                self.K = rbf_kernel(X, self.Xtr, gamma=self.param)

            return self.K

        if self.K_type == 'laplacian':
            if self.normalize:
                self.K = normalize(
                    laplacian_kernel(X, self.Xtr, gamma=self.param))
            else:
                self.K = laplacian_kernel(X, self.Xtr, gamma=self.param)

            return self.K

        if self.K_type == 'sigmoid':
            if self.normalize:
                self.K = normalize(
                    sigmoid_kernel(X, self.Xtr, gamma=self.param))
            else:
                self.K = sigmoid_kernel(X, self.Xtr, gamma=self.param)
            return self.K
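
A note on the normalize calls above, as a hedged sketch (assumption: normalize is sklearn.preprocessing.normalize, i.e. row-wise L2 scaling, not the symmetric cosine normalization K_ij / sqrt(K_ii * K_jj)):

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel

X = np.random.rand(6, 4)
K = normalize(linear_kernel(X, X))  # each row rescaled to unit L2 norm
print(np.linalg.norm(K, axis=1))    # all ones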
Example #3
def recommend(search_word):
    movie_df = pre_process()

    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['bow'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index,
                      index=movie_df['original_title']).drop_duplicates()

    try:
        idx = index[search_word]
        sig_scores = list(enumerate(sig[idx]))
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]
        movie_indices = [i[0] for i in sig_scores]

        return list(movie_df['original_title'].iloc[movie_indices])
    except KeyError:
        return None
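
The same TF-IDF + sigmoid_kernel pattern recurs in most examples below; here is a self-contained toy version (the corpus is invented for illustration):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

docs = pd.Series(["space opera with lasers", "romantic comedy in paris",
                  "lasers and space battles"])
tfv = TfidfVectorizer(stop_words='english')
tfv_matrix = tfv.fit_transform(docs)
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)  # (3, 3) similarity matrix
ranked = sig[0].argsort()[::-1][1:]           # most similar to doc 0, itself excluded
print(ranked)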
Example #4
def product_recommendation(title):
    tfidf_v = TfidfVectorizer(
        max_features=None,
        strip_accents="unicode",
        analyzer="word",
        min_df=10,
        token_pattern=r"\w{1,}",
        ngram_range=(1,
                     3),  #take the combination of 1-3 different kind of words
        stop_words="english")
    products["description"] = products["description"].fillna("")
    products["product_name"] = products["product_name"].str.lower()
    tfidf_matrix = tfidf_v.fit_transform(products["description"])
    sig = sigmoid_kernel(tfidf_matrix, tfidf_matrix)
    indices = pd.Series(products.index,
                        index=products["product_name"]).drop_duplicates()
    index = indices.get(title.lower())
    if index is not None:
        sorted_sig_scores = list(enumerate(sig[index]))
        sorted_sig_scores = sorted(sorted_sig_scores,
                                   key=lambda item: item[1],
                                   reverse=True)
        # skip index 0 (the product itself) and keep the next 10 matches
        top_10_products = [sorted_sig_scores[i][0] for i in range(1, 11)]
        return products["product_name"].iloc[top_10_products].unique()
    return []
Example #5
    def reset_vec(self, kernel='rbf_ap'):
        feat = preprocessing.scale(self.fake_features)
        count = feat.shape[1]
        if kernel == 'rbf':
            temp = rbf_kernel(feat, gamma=1.0 / count).sum(axis=0)
        elif kernel == 'cos':
            temp = ((cosine_similarity(feat) + 1) / 2.0).sum(axis=0)
        elif kernel == 'euc':
            temp = (1.0 / (euclidean_distances(feat) + 1)).sum(axis=0)
        elif kernel == 'sigmoid':
            Sig = sigmoid_kernel(feat, coef0=0, gamma=1.0 / count)
            temp = ((Sig + 1.0) / 2.0).sum(axis=0)
        elif kernel == 'rbf_ap':
            gamma = 1.0 / count
            expVec = np.exp(-gamma * np.einsum("ij, ij -> i", feat, feat))
            feaVec = np.einsum("i, ij -> j", expVec, feat) * (2.0 * gamma)
            outMat = np.einsum("i,ij,ik->jk", expVec, feat, feat)
            outMat *= (2.0 * gamma**2)

            first = expVec * np.sum(expVec)
            second = np.einsum("i, j, ij -> i", expVec, feaVec, feat)
            third = np.einsum("i, jk, ij, ik -> i", expVec, outMat, feat, feat)
            temp = first + second + third
        else:
            raise ValueError('unknown kernel: %s' % kernel)

        return temp / np.sum(temp)
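
The 'rbf_ap' branch reads as a second-order Taylor expansion of exp(2 * gamma * <x_i, x_j>) inside the RBF kernel; a quick numerical sanity check of that interpretation (small feature norms keep the expansion accurate):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

feat = np.random.randn(50, 10) * 0.1
gamma = 1.0 / feat.shape[1]

exact = rbf_kernel(feat, gamma=gamma).sum(axis=0)
exact /= exact.sum()

expVec = np.exp(-gamma * np.einsum("ij, ij -> i", feat, feat))
feaVec = np.einsum("i, ij -> j", expVec, feat) * (2.0 * gamma)
outMat = np.einsum("i,ij,ik->jk", expVec, feat, feat) * (2.0 * gamma**2)
approx = (expVec * np.sum(expVec)
          + np.einsum("i, j, ij -> i", expVec, feaVec, feat)
          + np.einsum("i, jk, ij, ik -> i", expVec, outMat, feat, feat))
approx /= approx.sum()

print(np.max(np.abs(exact - approx)))  # should be small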
Example #6
    def _apply_kernel(self, x, y):
        """Apply the selected kernel function to the data."""
        if self.kernel == 'linear':
            phi = linear_kernel(x, y)
        elif self.kernel == 'rbf':
            phi = rbf_kernel(x, y, self.coef1)
        elif self.kernel == 'poly':
            phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
        elif self.kernel == 'sigmoid':
            coef0 = self.coef0 if self.coef0 is not None else 1
            phi = sigmoid_kernel(x, y, self.gamma, coef0)
        elif self.kernel == 'chi2':
            gamma = self.gamma if self.gamma is not None else 1
            phi = chi2_kernel(x, y, gamma)
        elif self.kernel == 'laplacian':
            phi = laplacian_kernel(x, y, self.gamma)
        elif callable(self.kernel):
            phi = self.kernel(x, y)
            if len(phi.shape) != 2:
                raise ValueError(
                    "Custom kernel function did not return 2D matrix")
            if phi.shape[0] != x.shape[0]:
                raise ValueError(
                    "Custom kernel function did not return matrix with rows"
                    " equal to number of data points.")
        else:
            raise ValueError("Kernel selection is invalid.")

        if self.bias_used:
            phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)

        return phi
Example #7
def tf_sig(tfidf_matrix):
    print("Using TFIDF with sigmoid kernel")
    Ke = sigmoid_kernel(tfidf_matrix[0:1], tfidf_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
Example #8
    def _get_kernel_matrix(self, X1, X2):
        # K is len(X1)-by-len(X2) matrix
        if self._kernel == 'rbf':
            K = pairwise.rbf_kernel(X1, X2, gamma=self._gamma)
        elif self._kernel == 'poly':
            K = pairwise.polynomial_kernel(X1,
                                           X2,
                                           degree=self._degree,
                                           gamma=self._gamma,
                                           coef0=self._coef0)
        elif self._kernel == 'linear':
            K = pairwise.linear_kernel(X1, X2)
        elif self._kernel == 'laplacian':
            K = pairwise.laplacian_kernel(X1, X2, gamma=self._gamma)
        elif self._kernel == 'chi2':
            K = pairwise.chi2_kernel(X1, X2, gamma=self._gamma)
        elif self._kernel == 'additive_chi2':
            K = pairwise.additive_chi2_kernel(X1, X2)
        elif self._kernel == 'sigmoid':
            K = pairwise.sigmoid_kernel(X1,
                                        X2,
                                        gamma=self._gamma,
                                        coef0=self._coef0)
        else:
            print('[Error] Unknown kernel')
            K = None

        return K
Example #9
def hsh_sig(hash_matrix):
    print("Using Hashing with sigmoid kernel")
    Ke = sigmoid_kernel(hash_matrix[0:1], hash_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
Example #10
def cal_km(params, X_fit, X, type):
    if type == 'interface':
        if params['kernel'] == 'linear':
            km = linear_kernel(X_fit, X)
        elif params['kernel'] == 'rbf':
            km = rbf_kernel(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'poly':
            km = polynomial_kernel(X_fit, X, gamma=params['gamma'], coef0=0.0)
        elif params['kernel'] == 'sigmoid':
            km = sigmoid_kernel(X_fit, X, gamma=params['gamma'], coef0=0.0)
        else:
            print('Unknown kernel')
            km = None
    elif type == 'realize':
        if params['kernel'] == 'linear':
            km = cal_linear(X_fit, X)
        elif params['kernel'] == 'rbf':
            km = cal_rbf(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'poly':
            km = cal_poly(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'sigmoid':
            km = cal_sigmoid(X_fit, X, gamma=params['gamma'])
        else:
            print('Unknown kernel')
            km = None
    else:
        print('Unknown type')
        km = None
    return km
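
The 'realize' branch calls hand-rolled cal_* helpers that are not shown here; a plausible sketch of what cal_rbf would look like (an assumption, mirroring sklearn's rbf_kernel):

import numpy as np

def cal_rbf(X_fit, X, gamma):
    # squared distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq = (np.sum(X_fit**2, axis=1)[:, None]
          + np.sum(X**2, axis=1)[None, :]
          - 2.0 * X_fit @ X.T)
    return np.exp(-gamma * sq)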
Example #11
def cnt_sig(count_matrix):
    print("Using Count with sigmoid kernel")
    Ke = sigmoid_kernel(count_matrix[0:1], count_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
Example #12
    def margin_kernel(self, X1, kernel_type='linear', gamma=1.0):
        """
        Forms the kernel matrix using the samples X1
        Parameters:
        ----------
        X1: np.ndarray
            data (n_samples,n_features) to form a kernel of shape (n_samples,n_samples)
        kernel_type : str
            type of kernel to be used
        gamma: float
            kernel parameter
        Returns:
        -------
        X: np.ndarray
            the kernel of shape (n_samples,n_samples)
        """
        
        if kernel_type == 'linear':
            X = linear_kernel(X1, X1)
        elif kernel_type == 'rbf':
            X = rbf_kernel(X1, X1, gamma)
        elif kernel_type == 'tanh':
            X = sigmoid_kernel(X1, X1, -gamma)
        elif kernel_type == 'sin':
            # X = np.sin(gamma * manhattan_distances(X1, X1))
            X = np.sin(gamma * pairwise_distances(X1, X1)**2)
        elif kernel_type == 'TL1':
            X = np.maximum(0, gamma - manhattan_distances(X1, X1))
        else:
            print('no kernel_type, returning None')
            return None
        return X
Example #13
    def kernel_mean_matching(self, X, Z, kern='lin', B=1.0, eps=None):
        nx = X.shape[0]
        nz = Z.shape[0]

        print("nx: ", nx, " nz: ", nz)

        if eps is None:
            eps = B / math.sqrt(nz)

        if kern == 'lin':
            # Z is apparently expected to be a scipy sparse matrix here
            K = np.dot(Z, Z.T)
            K = K.todense()
            kappa = np.sum(np.dot(Z, X.T) * float(nz) / float(nx), axis=1)
        elif kern == 'rbf':
            K = sk.rbf_kernel(Z, Z)
            kappa = np.sum(sk.rbf_kernel(Z, X), axis=1) * float(nz) / float(nx)
        elif kern == 'poly':
            K = sk.polynomial_kernel(Z, Z)
            kappa = np.sum(sk.polynomial_kernel(Z, X),
                           axis=1) * float(nz) / float(nx)
        elif kern == 'laplacian':
            K = sk.laplacian_kernel(Z, Z)
            kappa = np.sum(sk.laplacian_kernel(Z, X),
                           axis=1) * float(nz) / float(nx)
        elif kern == 'sigmoid':
            K = sk.sigmoid_kernel(Z, Z)
            kappa = np.sum(sk.sigmoid_kernel(Z, X),
                           axis=1) * float(nz) / float(nx)

        else:
            raise ValueError('unknown kernel')

        K = K.astype(np.double)
        K = matrix(K)
        kappa = matrix(kappa)

        G = matrix(np.r_[np.ones((1, nz)), -np.ones((1, nz)),
                         np.eye(nz), -np.eye(nz)])
        h = matrix(np.r_[nz * (1 + eps), nz * (eps - 1), B * np.ones((nz, )),
                         np.zeros((nz, ))])

        print("starting solver")
        solvers.options['show_progress'] = False
        sol = solvers.qp(K, -kappa, G, h)
        print(sol)
        coef = np.array(sol['x'])
        return coef
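
This is the Kernel Mean Matching (KMM) quadratic program: it reweights the source samples Z so that their kernel mean matches that of X, with weights bounded by B. A hedged call sketch (assumes an instance obj of this class and cvxopt installed):

import numpy as np

X_target = np.random.randn(200, 5)        # distribution to match
Z_source = np.random.randn(100, 5) + 0.5  # samples to reweight
# coef = obj.kernel_mean_matching(X_target, Z_source, kern='rbf', B=10.0)
# coef[i] is then the importance weight assigned to Z_source[i]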
Example #14
    def rec_hotel(amenities, city):
        my_dataframe = pd.DataFrame({
            'Rating': travel_df['Rating'],
            'Amenities': travel_df['Amenities'],
            'Hotel Names': travel_df['Hotel Names'],
            'City': travel_df['City'],
            'Address': travel_df['Address']
        })
        df = pd.DataFrame({
            "Rating": 2,
            "Amenities": amenities,
            "Hotel Names": ['abc'],
            "City": [city]
        })
        # print(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        my_dataframe = pd.concat([my_dataframe, df], ignore_index=True)
        my_dataframe = my_dataframe[my_dataframe['City'] == city]
        my_dataframe.reset_index(inplace=True)
        # print(my_dataframe.iloc[-1])
        # print(my_dataframe)
        # print(df.columns)

        tfv = TfidfVectorizer(min_df=3,
                              max_features=None,
                              strip_accents='unicode',
                              analyzer='word',
                              token_pattern=r'\w{1,}',
                              ngram_range=(1, 3),
                              stop_words='english')
        # Filling NaNs with empty string
        my_dataframe['Amenities'] = my_dataframe['Amenities'].fillna('')
        # print(amenities)
        # Fitting the TF-IDF on the 'Amenities' text
        tfv_matrix = tfv.fit_transform(my_dataframe['Amenities'])
        # Compute the sigmoid kernel
        sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
        my_ratings = np.array(
            my_dataframe[my_dataframe['City'] == city]['Rating']) / 5
        indices = pd.Series(
            my_dataframe.index,
            index=my_dataframe['Hotel Names']).drop_duplicates()
        # print(indices)
        idx = indices['abc']
        # l=np.add(sig[idx]*0.5,my_ratings)
        # print("rating:",my_ratings.shape,"sig:",sig[idx].shape)
        # Get the pairwsie similarity scores
        sig_scores = list(enumerate(sig[idx]))

        # Sort the hotels
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

        # Scores of the 5 most similar hotels
        sig_scores = sig_scores[2:7]

        # Hotel indices
        hotel_indices = [i[0] for i in sig_scores]
        my_dataframe = my_dataframe.iloc[:-1, :]
        return my_dataframe[['Hotel Names', 'Address',
                             'Rating']].iloc[hotel_indices]
Example #15
def abc():
    data = pd.read_csv('final.csv')
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), stop_words = 'english')
    tfv_matrix = tfv.fit_transform(data['DESCRIPTION'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    indices = pd.Series(data.index, index = data['TRACK NAME']).drop_duplicates()
    
    return data, sig, indices
Example #16
    def ResetProbVec(self, kernel='rbf_ap'):
        """
        Calculate the reset probability vector with assigned kernel
        rbf: Radial basis function
        cos: (cosine similarity + 1) / 2.0
        euc: 1.0 / (1 + euclidean distances)
        sigmoid: (tanh(gamma <X_i, X_j>) + 1) / 2.0
        rbf_ap: Taylor-expansion approximated Radial basis function
        """

        if kernel == 'rbf':
            RBF = rbf_kernel(self.featMat, gamma=1.0 / self.featCount)
            RBF = RBF.sum(axis=0)
            resetProbVec = RBF / np.sum(RBF)

        elif kernel == 'cos':
            Cos = (cosine_similarity(self.featMat) + 1) / 2.0
            Cos = Cos.sum(axis=0)
            resetProbVec = Cos / np.sum(Cos)

        elif kernel == 'euc':
            Euc = 1.0 / (euclidean_distances(self.featMat) + 1)
            Euc = Euc.sum(axis=0)
            resetProbVec = Euc / np.sum(Euc)

        elif kernel == 'sigmoid':
            gamma = 1.0 / self.featCount
            Sig = sigmoid_kernel(self.featMat, coef0=0, gamma=gamma)
            Sig = (Sig + 1.0) / 2.0
            Sig = Sig.sum(axis=0)
            resetProbVec = Sig / np.sum(Sig)

        elif kernel == 'rbf_ap':
            parameter = 1.0 / self.featCount
            # w
            lengths = np.einsum("ij, ij -> i", self.featMat, self.featMat)
            expNormVector = np.exp(-parameter * lengths)
            # y
            f_normVec = np.einsum("i, ij -> j", expNormVector, self.featMat)
            featureNormVector = f_normVec * (2.0 * parameter)
            # Z
            outerMat = np.einsum("i, ij, ik -> jk", expNormVector,
                                 self.featMat, self.featMat)
            featureOuterNorm = outerMat * (2.0 * parameter**2)
            # r'
            first = expNormVector * np.sum(expNormVector)
            second = np.einsum("i, j, ij -> i", expNormVector,
                               featureNormVector, self.featMat)
            third = np.einsum("i, jk, ij, ik -> i", expNormVector,
                              featureOuterNorm, self.featMat, self.featMat)
            resetProbVec = first + second + third
            # r
            resetProbVec /= np.sum(resetProbVec)

        else:
            raise ValueError('unknown kernel: %s' % kernel)

        self.resetProbVec = resetProbVec
Example #17
def kernel_func(X1, X2, kernel_name, gamma, d, r):
    if kernel_name == 'rbf':
        return rbf_kernel(X1, X2, gamma=gamma)
    elif kernel_name == 'polynomial':
        return polynomial_kernel(X1, X2, gamma=gamma, degree=d, coef0=r)
    elif kernel_name == 'sigmoid':
        return sigmoid_kernel(X1, X2, gamma=gamma, coef0=r)
    elif kernel_name == 'linear':
        return linear_kernel(X1, X2)
    else:
        raise NotImplementedError
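
For reference, a quick check of what the sigmoid wrapper computes, namely tanh(gamma * <x, y> + coef0):

import numpy as np
from sklearn.metrics.pairwise import sigmoid_kernel

X, Y = np.random.rand(4, 3), np.random.rand(5, 3)
gamma, r = 0.5, 1.0
assert np.allclose(sigmoid_kernel(X, Y, gamma=gamma, coef0=r),
                   np.tanh(gamma * X @ Y.T + r))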
Example #18
def gen_similarity(args, X):

    if args.sim_method == 'sigmoid_kernel':
        sim_UXU = sigmoid_kernel(X=X, Y=None, gamma=None, coef0=1)
        sim_MXM = sigmoid_kernel(X=X.T, Y=None, gamma=None, coef0=1)
    elif args.sim_method == 'cosine_similarity':
        sim_UXU = cosine_similarity(X=X, Y=None)
        sim_MXM = cosine_similarity(X=X.T, Y=None)
    ## =====================================================================
    # Save similarity matrices (note: pickled objects, despite the .npy suffix)
    fn_str = args.RESULTPATH + 'sim_%s_UXU.npy' % (args.sim_method)
    with open(fn_str, 'wb') as f:
        pickle.dump(sim_UXU, f)

    fn_str = args.RESULTPATH + 'sim_%s_MXM.npy' % (args.sim_method)
    with open(fn_str, 'wb') as f:
        pickle.dump(sim_MXM, f)
    print('saving similarity matrix is done!')
    ## =====================================================================
    return sim_UXU, sim_MXM
Example #19
def calc_gaussian_sim(data_matrix, method):
    if method == "rbf":
        return rbf_kernel(data_matrix)
    elif method == "chi2":
        return chi2_kernel(data_matrix)
    elif method == "laplacian":
        return laplacian_kernel(data_matrix)
    elif method == "sigmoid":
        return sigmoid_kernel(data_matrix)
    else:
        raise ValueError("Wron method parameter ind calc_gaussian_sim()")
Example #20
 def transform(self, X, Y):
     if self.type == 'rbf':
         return rbf_kernel(X, Y, self.gamma)[0]
     elif self.type == 'Chi2':
         return chi2_kernel(X, Y, self.gamma)[0]
     elif self.type == 'AChi2':
         return -additive_chi2_kernel(X, Y)[0]
     elif self.type == 'laplacian':
         return laplacian_kernel(X, Y, self.gamma)[0]
     elif self.type == 'sigmoid':
         return sigmoid_kernel(X, Y, self.gamma, self.coef0)[0]
Example #21
def calculate_gram_matrix(x, kernel='linear', gamma=0, degree=0, coef0=0):
    if kernel == 'linear':
        gram = linear_kernel(x, x)
    elif kernel == 'poly':
        gram = polynomial_kernel(x, x, degree=degree, gamma=gamma, coef0=coef0)
    elif kernel == 'sigmoid':
        gram = sigmoid_kernel(x, x, gamma=gamma, coef0=coef0)
    elif kernel == 'rbf':
        gram = rbf_kernel(x, x, gamma=gamma)
    else:
        raise ValueError('unknown kernel: %s' % kernel)

    return gram
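
One caveat when choosing 'sigmoid' here: unlike the linear, polynomial, and RBF options, the sigmoid Gram matrix is not guaranteed to be positive semi-definite, which matters if gram feeds a solver that assumes a valid Mercer kernel:

import numpy as np
from sklearn.metrics.pairwise import sigmoid_kernel

x = np.random.RandomState(1).randn(20, 5)
gram = sigmoid_kernel(x, x, gamma=2.0, coef0=-1.0)
print(np.linalg.eigvalsh(gram).min())  # frequently negative for such parameters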
Example #22
 def _apply_kernel(self, X, Y):
     if self.kernel == "rbf":
         return rbf_kernel(X, Y, self.gamma)
     elif self.kernel == "sigmoid":
         return sigmoid_kernel(X, Y, self.gamma, self.coef0)
     elif self.kernel == "poly":
         return polynomial_kernel(X, Y, self.degree, self.gamma, self.coef0)
     elif self.kernel == "linear":
         return linear_kernel(X, Y)
     elif callable(self.kernel):
         return self.kernel(X, Y)
     else:
         raise ValueError("Unknown kernel: " + str(self.kernel))
Example #23
def preprocessing():
    # Model definition with the feature declaration
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 5), stop_words='english')
    # Filling NaNs with empty string
    js['ArticleTitle'] = js['ArticleTitle'].fillna('')
    tfv_matrix = tfv.fit_transform(js['ArticleTitle'])
    # Compute the sigmoid kernel
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Generate the indices for the recommender system, removing duplicates
    indices = pd.Series(js.index, index=js['ArticleFullPath']).drop_duplicates()
    # Return the indices and the sigmoid kernel matrix
    return indices, sig
Example #24
def call_recommend(m):
    m = m.lower()
    movie = pd.read_csv('movie_data_final.csv')

    # check if the movie is in our database or not
    if m not in movie['original_title'].unique():
        return (
            'This movie is not in our database.\nPlease check if you spelled it correct.'
        )
    else:
        ## Content Based Recommendation System
        ### Using Tf-IDF Vectorizer to formulate vectorization matrix

        tf = TfidfVectorizer(min_df=3,
                             max_features=None,
                             strip_accents='unicode',
                             analyzer='word',
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 3),
                             stop_words='english')

        #Fitting the TF-IDF on 'overview' text
        tf_matrix = tf.fit_transform(movie['overview'].values.astype('U'))

        #Compute sigmoid kernel
        sig = sigmoid_kernel(tf_matrix, tf_matrix)

        #Reverse mapping of indices and movie titles
        indices = pd.Series(movie.index,
                            index=movie['original_title']).drop_duplicates()

        #Get the index corresponding to original_title
        index = indices[m]

        #Get the pairwise similarity scores
        sig_scores = list(enumerate(sig[index]))

        #Sort the movies
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

        #Score of 10 most similar movies
        sig_scores = sig_scores[1:11]

        #Movie indices
        movie_indices = [i[0] for i in sig_scores]

        movieList = movie['original_title'].iloc[movie_indices]
        #movieList.columns = ['Movie Name','Rating']
        #movieList = movieList.sort_values(['Rating'],ascending=False)
        #Top 10 most similar movies
    return movieList
Example #25
def recommendation(input_json, input_param):
    # FETCHING DATA FROM API
    counter = 0
    job_recommended = []
    jobs = input_json

    # Getting user input - insert to list
    input_value = [{'title': 'INPUT', 'description': input_param}]
    title = input_value[0].get('title')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    added_jobs = pd.concat([jobs, pd.DataFrame(input_value)],
                           ignore_index=True, sort=False)

    # Term frequency and inverse document frequency
    tfv = TfidfVectorizer(min_df=0,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    added_jobs['description'] = added_jobs['description'].fillna('')
    tfv_matrix = tfv.fit_transform(added_jobs['description'])

    # Sigmoid kernel for similarity calculations
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    indices = pd.Series(added_jobs.index, index=added_jobs['title'])

    # Search for the very peculiar input
    jobs_list_length = len(added_jobs)
    jobs_last = jobs_list_length - 1

    for i in range(jobs_list_length):
        # 0.7615941559557649 == tanh(1): with the default coef0=1, this is the
        # kernel value for orthogonal TF-IDF vectors (zero dot product)
        if sig[jobs_last][i] <= 0.7615941559557649:
            counter += 1

    if counter >= jobs_last:
        return None
    else:
        # Getting final result
        id_input = indices[title]
        sig_scores = list(enumerate(sig[id_input]))
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]
        job_indices = [i[0] for i in sig_scores]

        for job in job_indices:
            job_dic = {'title': added_jobs['title'].iloc[job]}
            job_recommended.append(job_dic)

        # Return all recommended titles
        return job_recommended
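
The threshold above comes straight from the kernel definition; a one-liner confirms it:

import numpy as np
print(np.tanh(1.0))  # 0.7615941559557649, i.e. tanh(gamma * 0 + coef0) with coef0=1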
Example #26
def get_recommendations(movie_id):
    sql_engine = create_engine(os.path.join('sqlite:///' +
                                            os.path.join(basedir, 'site.db')),
                               echo=False)
    movies_results = pd.read_sql_query('select movie_id,genres from Movie',
                                       sql_engine)
    movies_results.to_csv(os.path.join(basedir, 'Movie.csv'),
                          index=False,
                          sep=";")

    user_movies_results = pd.read_sql_query('select movie_id from Credits',
                                            sql_engine)
    user_movies_results.to_csv(os.path.join(basedir, 'Credits.csv'),
                               index=False,
                               sep=";")

    movies_df = pd.read_csv(os.path.join(basedir, 'Movie.csv'), sep=';')
    user_df = pd.read_csv(os.path.join(basedir, 'Credits.csv'), sep=';')
    user_df.drop_duplicates(subset='movie_id', inplace=True)
    user_df.reset_index(drop=True, inplace=True)
    movies_df_merge = movies_df.merge(user_df, on='movie_id')
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    tfv_matrix = tfv.fit_transform(movies_df_merge['genres'])
    indices = pd.Series(movies_df_merge.index,
                        index=movies_df_merge['movie_id']).drop_duplicates()

    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Get the index corresponding to original_title
    idx = indices[movie_id]

    # Get the pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of all remaining movies, most similar first
    sig_scores = sig_scores[1:]

    # Movie indices
    movie_indices = [i[0] for i in sig_scores]

    # All similar movies, in descending order of similarity
    rem_movie = list(movies_df_merge['movie_id'].iloc[movie_indices])
    return rem_movie
Example #27
        def tanhFunc():
            # fall back to defaults when kernel parameters are not supplied
            if "gamma" in self.parameters["kernel"]:
                g = self.parameters["kernel"]["gamma"]
            else:
                g = 0.01

            if "offset" in self.parameters["kernel"]:
                c = self.parameters["kernel"]["offset"]
            else:
                c = 1

            K = smp.sigmoid_kernel(X, Y, gamma=g, coef0=c)

            return K
Example #28
    def train_and_test (self, dataTest, realOutput=None, aval=False, reg=0.01, deg=3, gamm=None, coef=1):  
                
        if self.kernelType == 'rbf':
            K = rbf_kernel(self.inTrain, self.inTrain, gamm)
            Ktest = rbf_kernel(dataTest, self.inTrain, gamm)
        elif self.kernelType == 'pol':
            K = polynomial_kernel(self.inTrain, self.inTrain, deg, gamm, coef)
            Ktest = polynomial_kernel(dataTest, self.inTrain, deg, gamm, coef)
        elif self.kernelType == 'sig':
            K = sigmoid_kernel(self.inTrain, self.inTrain, gamm, coef)
            Ktest = sigmoid_kernel(dataTest, self.inTrain, gamm, coef)
        else:
            raise ValueError('unknown kernelType: %s' % self.kernelType)

        I = np.eye(self.inTrain.shape[0])
        outNet = np.dot(np.dot(Ktest, np.linalg.inv(K + reg*I)), self.outTrain)
        
        if aval:
            miss = float(cont_error(realOutput, outNet))
            si = float(outNet.shape[0])
            acc = (1 - miss / si) * 100
            print('Misclassification on the test:', miss, 'of', si, '- Accuracy:', acc, '%')
            return outNet, acc

        return outNet, None
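
The closed form above is kernel ridge regression, f(X*) = K(X*, X) (K + reg * I)^{-1} y; a quick cross-check against sklearn's KernelRidge:

import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
Xtr, ytr, Xte = rng.rand(40, 3), rng.rand(40), rng.rand(10, 3)
reg, gamma = 0.01, 0.7

K = rbf_kernel(Xtr, Xtr, gamma)
Ktest = rbf_kernel(Xte, Xtr, gamma)
manual = Ktest @ np.linalg.solve(K + reg * np.eye(40), ytr)

skl = KernelRidge(alpha=reg, kernel='rbf', gamma=gamma).fit(Xtr, ytr).predict(Xte)
print(np.allclose(manual, skl))  # True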
Example #29
def process():
    global movie_df, sig, index
    movie_df = pre_process()
    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index,
                      index=movie_df['title']).drop_duplicates()
Example #30
def calculateMultipleKernel(x, y):
    theta = random.sample(range(1, 47), 46)  # given a random theta for now

    # Convert our 2d arrays to numpy arrays
    x = np.array(x)
    y = np.array(y)

    # Reshape the array-like input vectors since we only have one sample
    x = x.reshape(1, -1)
    y = y.reshape(1, -1)

    # Variables to aggregate the kernel result
    kernelResult = 0
    index = 0

    for i in range(0, 3):
        kernelResult += theta[index] * additive_chi2_kernel(x, y)
        index += 1

    for i in range(0, 3):
        kernelResult += theta[index] * chi2_kernel(x, y, theta[index + 1])
        index += 2

    for i in range(0, 3):
        kernelResult += theta[index] * cosine_similarity(x, y)
        index += 1

    for i in range(0, 3):
        kernelResult += theta[index] * linear_kernel(x, y)
        index += 1

    for i in range(0, 3):
        kernelResult += theta[index] * polynomial_kernel(
            x, y, theta[index + 1], theta[index + 2], theta[index + 3])
        index += 4

    for i in range(0, 3):
        kernelResult += theta[index] * rbf_kernel(x, y, theta[index + 1])
        index += 2

    for i in range(0, 3):
        kernelResult += theta[index] * laplacian_kernel(x, y, theta[index + 1])
        index += 2

    for i in range(0, 3):
        kernelResult += theta[index] * sigmoid_kernel(x, y, theta[index + 1])
        index += 2

    return kernelResult
Example #32
 def compare_data(self):
     # getting all movies that are animated or not depending on the movie searched
     data = Movies.query.filter_by(animation=self.movie_details.animation)
     # Converting the list of SQLalchemy movies objects into a data frame of movies
     movie_data_frame = pd.DataFrame([(d.title, d.overview, d.image, d.popularity, d.release_date) for d in data],
                                     columns=['title', 'overview', 'image', 'popularity', 'release_date'])
     # Specifying parameters for the comparison
     tfv = TfidfVectorizer(min_df=1, max_features=None, strip_accents='unicode', analyzer='word',
                           token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words='english')
     # Specify the column for the comparison
     tfv_matrix = tfv.fit_transform(movie_data_frame['overview'])
     # Form a matrix
     sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
     # Remove duplicate movies and index them
     indices = pd.Series(movie_data_frame.index, index=movie_data_frame['title']).drop_duplicates()
     return movie_data_frame, sig, indices
Example #33
    def kernel_function(self, x1, x2):
        features = []

        # linear kernel:
        # Cosine distance
        features += np.squeeze(1 -
                               pairwise.paired_cosine_distances(x1, x2)[0]),

        # Manhattan distance
        features += pairwise.paired_manhattan_distances(x1, x2)[0],

        # Euclidean distance
        features += pairwise.paired_euclidean_distances(x1, x2)[0],

        # Chebyshev distance
        features += pairwise.pairwise_distances(x1, x2,
                                                metric="chebyshev")[0][0],

        # stat kernel:
        # Pearson coefficient
        pearson = stats.pearsonr(np.squeeze(np.asarray(x1)),
                                 np.squeeze(np.asarray(x2)))[0]
        features += 0 if np.isnan(pearson) else pearson,

        # Spearman coefficient
        spearman = stats.spearmanr(x1, x2, axis=1).correlation
        features += 0 if np.isnan(spearman) else spearman,

        # Kendall tau coefficient
        kendall = stats.kendalltau(x1, x2).correlation
        features += 0 if np.isnan(kendall) else kendall,

        # non-linear kernel:
        # polynomial
        features += pairwise.polynomial_kernel(x1, x2, degree=2)[0][0],

        # rbf
        features += pairwise.rbf_kernel(x1, x2)[0][0],

        # laplacian
        features += pairwise.laplacian_kernel(x1, x2)[0][0],

        # sigmoid
        features += pairwise.sigmoid_kernel(x1, x2)[0][0],

        return features
Example #34
def Recommendation_System(df,player_id,k):

  query = str(Playerdata.objects.all().query) 
  df1 = pd.read_sql_query(query, connection)    
  ID2namesmapper=df1.set_index('sofifa_id')['short_name']  
  sc=StandardScaler()  
  df_sc=sc.fit_transform(df)
  kn=sigmoid_kernel(df_sc,df_sc)
  so_fifa_id=list(df.index)
  kn_df=pd.DataFrame(kn,index=so_fifa_id,columns=so_fifa_id)
  try:
    temp_dict=kn_df[player_id].to_dict()
    temp_list=list({k: v for k, v in sorted(temp_dict.items(), key=lambda item: item[1], reverse=True)}.keys())
    temp_list.remove(player_id)
    return ID2namesmapper[temp_list[0:k]].to_list()
  except KeyError:
    print('PlayerID not present in the database')
Example #35
def drawAlgoCompGraph():
    h = 0.02
    names = ["ridge", "KNN", "Linear SVM", "RBF SVM", "LDA",
             "Random Forest", "AdaBoost", "Naive Bayes", "QDA", "Logistic"]

    kernel_names =['laplacian kernel', 'RBF kernel', 'Sigmoid kernel']
    classifiers = [
        linear_model.Ridge(),
        KNeighborsClassifier(9),
        SVC(kernel="linear", C=0.025),
        SVC(kernel="rbf", gamma=0.25),
        LDA(),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QDA(),
        linear_model.LogisticRegression()]

    filename = '/Users/guichengwu/Desktop/208_mid term/exam.dat'

    data = np.loadtxt(filename, dtype='str')

    for i in range(data.shape[0]):
        for j in range(1,data.shape[1]):
            data[i][j] = data[i][j][2:]

    data_matrix = np.matrix(data).astype(float)  # np.float was removed in NumPy 1.24
    X = data_matrix[:, 1:5]
    y = np.asarray(data_matrix[:, 0])
    X = preprocessing.scale(X)

    Lap_X = laplacian_kernel(X)
    pca1 = decomposition.PCA(n_components=2)
    pca1.fit(Lap_X)
    Lap_X = pca1.transform(Lap_X)

    RBF_X = rbf_kernel(X)
    pca2 = decomposition.PCA(n_components=2)
    pca2.fit(RBF_X)
    RBF_X = pca2.transform(RBF_X)

    Sig_X = sigmoid_kernel(X)
    pca3 = decomposition.PCA(n_components=2)
    pca3.fit(Sig_X)
    Sig_X = pca3.transform(Sig_X)

    linearly_separable1 = (Lap_X, y)
    linearly_separable2 = (RBF_X, y)
    linearly_separable3 = (Sig_X, y)

    datasets = [linearly_separable1, linearly_separable2, linearly_separable3]

    figure = plt.figure(figsize=(30, 10))
    i = 1

    for kernel_name, ds in zip(kernel_names, datasets):
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
        x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max()+0.5
        y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max()+0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers)+1, i)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        ax.scatter(X_test[:, 0], X_test[:,1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(kernel_name)
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)

            ax.scatter(X_train[:, 0], X_train[:,1], c=y_train, cmap=cm_bright)
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            ax.set_title(name)
            ax.text(xx.max() - 0.3, yy.min() + 0.3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1

    figure.subplots_adjust(left=0.02, right=0.98)
    plt.show()
    figure.savefig('/Users/guichengwu/Desktop/algorithm_comparison2.png')