示例#1
0
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            feature_weighting="none",
            **similarity_args):
        """Compute the top-K similarity matrix and store it in ``self.W_sparse``.

        Parameters
        ----------
        topK : int
            Number of neighbours to keep. It is incremented by one before
            being used (and stored incremented in ``self.topK``) because the
            self similarity is also returned and is zeroed out afterwards.
        shrink : int or float
            Shrink term forwarded to the similarity function.
        similarity : str
            One of "cosine", "jaccard", "dice", "tversky" or "splus".
        feature_weighting : str
            Must belong to ``self.FEATURE_WEIGHTING_VALUES``. "BM25" and
            "TF-IDF" re-weight the matrix and overwrite ``self.URM_train``
            with the result; other accepted values leave it untouched.
        **similarity_args
            Extra keyword arguments forwarded to the similarity function.

        Raises
        ------
        ValueError
            If ``feature_weighting`` or ``similarity`` is not recognized.
        """
        # Similaripy returns also self similarity, which will be set to 0 afterwards
        topK += 1
        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # The weighting helpers are applied on the transposed matrix and the
        # result is transposed back before being stored as CSR.
        if feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        if similarity == "cosine":
            self.W_sparse = sim.cosine(self.URM_train,
                                       k=topK,
                                       shrink=shrink,
                                       **similarity_args)
        elif similarity == "jaccard":
            self.W_sparse = sim.jaccard(self.URM_train,
                                        k=topK,
                                        shrink=shrink,
                                        **similarity_args)
        elif similarity == "dice":
            self.W_sparse = sim.dice(self.URM_train,
                                     k=topK,
                                     shrink=shrink,
                                     **similarity_args)
        # This branch used to test for "jaccard" a second time, which made
        # the tversky similarity impossible to select.
        elif similarity == "tversky":
            self.W_sparse = sim.tversky(self.URM_train,
                                        k=topK,
                                        shrink=shrink,
                                        **similarity_args)
        elif similarity == "splus":
            self.W_sparse = sim.s_plus(self.URM_train,
                                       k=topK,
                                       shrink=shrink,
                                       **similarity_args)
        else:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        # Drop the self similarity, then store the transposed matrix as CSR.
        self.W_sparse.setdiag(0)
        self.W_sparse = self.W_sparse.transpose().tocsr()
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            feature_weighting="none"):
        """Build ``self.W_sparse`` from ``self.ICM_train`` with similaripy.

        Parameters
        ----------
        topK : int
            Passed as ``k`` to the similarity function; stored in ``self.topK``.
        shrink : int or float
            Shrink term passed to the similarity function; stored in
            ``self.shrink``.
        similarity : str
            One of "cosine", "s_plus", "dice", "rp3beta", "p3alpha" or
            "jaccard". "rp3beta" is always called with alpha=0.3, beta=0.61.
        feature_weighting : str
            "bm25", "bm25plus" or "tfidf" overwrite ``self.ICM_train`` with
            its normalized version; any other value applies no weighting.

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported names (previously
            this surfaced as an UnboundLocalError on the final assignment).
        """
        self.topK = topK
        self.shrink = shrink

        if feature_weighting == "bm25":
            self.ICM_train = similaripy.normalization.bm25(self.ICM_train)
        elif feature_weighting == "bm25plus":
            self.ICM_train = similaripy.normalization.bm25plus(self.ICM_train)
        elif feature_weighting == "tfidf":
            self.ICM_train = similaripy.normalization.tfidf(self.ICM_train)

        # Keyword arguments shared by every similarity call below.
        common_args = dict(k=self.topK,
                           shrink=self.shrink,
                           binary=False,
                           verbose=False)

        if similarity == "cosine":
            similarity_matrix = similaripy.cosine(self.ICM_train,
                                                  **common_args)
        elif similarity == "s_plus":
            similarity_matrix = similaripy.s_plus(self.ICM_train,
                                                  **common_args)
        elif similarity == "dice":
            similarity_matrix = similaripy.dice(self.ICM_train, **common_args)
        elif similarity == "rp3beta":
            similarity_matrix = similaripy.rp3beta(self.ICM_train,
                                                   alpha=0.3,
                                                   beta=0.61,
                                                   **common_args)
        elif similarity == "p3alpha":
            similarity_matrix = similaripy.p3alpha(self.ICM_train,
                                                   **common_args)
        elif similarity == "jaccard":
            similarity_matrix = similaripy.jaccard(self.ICM_train,
                                                   **common_args)
        else:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        self.W_sparse = similarity_matrix.transpose().tocsr()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    def fit(self, topK=50, shrink=100, similarity='cosine', pre_normalization="none", post_normalization="none", **similarity_args):
        """Build ``self.W_sparse`` from ``self.URM_train`` with similaripy.

        Parameters
        ----------
        topK : int
            Passed as ``k`` to the similarity function; stored in ``self.topK``.
        shrink : int or float
            Shrink term passed to the similarity function; stored in
            ``self.shrink``.
        similarity : str
            One of "cosine", "s_plus", "dice", "rp3beta" (alpha=0.3,
            beta=0.61), "asym" (asymmetric cosine, alpha=0.5) or "jaccard".
        pre_normalization : str
            "bm25plus" or "tfidf" normalize the matrix the similarity is
            computed on. The result is bound to a local variable;
            ``self.URM_train`` is not reassigned by this step.
        post_normalization : str
            "bm25plus_once", "bm25plus_twice", "tfidf" or "bm25" overwrite
            ``self.URM_train`` AFTER the similarity has been computed, so
            they do not influence the similarity matrix itself.
        **similarity_args
            Accepted but not used by this method.

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported names (previously
            this surfaced as an UnboundLocalError on the final assignment).
        """
        self.topK = topK
        self.shrink = shrink

        # NOTE: an experiment that horizontally stacked a user content matrix
        # ("FULL_UCM.npz") onto the interactions was sketched here and is
        # currently disabled.
        interactions = self.URM_train

        if pre_normalization == "bm25plus":
            interactions = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.85, tf_mode="raw", idf_mode="bm25", inplace=False)
        if pre_normalization == "tfidf":
            interactions = similaripy.normalization.tfidf(self.URM_train, axis=1)

        # Keyword arguments shared by every similarity call below.
        common_args = dict(k=self.topK, shrink=self.shrink, binary=False, verbose=False)

        if similarity == "cosine":
            similarity_matrix = similaripy.cosine(interactions, **common_args)
        elif similarity == "s_plus":
            similarity_matrix = similaripy.s_plus(interactions, **common_args)
        elif similarity == "dice":
            similarity_matrix = similaripy.dice(interactions, **common_args)
        elif similarity == "rp3beta":
            similarity_matrix = similaripy.rp3beta(interactions, alpha=0.3, beta=0.61, **common_args)
        elif similarity == "asym":
            similarity_matrix = similaripy.asymmetric_cosine(interactions, alpha=0.5, **common_args)
        elif similarity == "jaccard":
            similarity_matrix = similaripy.jaccard(interactions, **common_args)
        else:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        # Every metric stores the transposed result in CSR format.
        similarity_matrix = similarity_matrix.transpose().tocsr()

        # Post-normalization only re-weights the stored URM.
        if post_normalization == "bm25plus_once":
            self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.8, tf_mode='raw', idf_mode='bm25', inplace=False)
        if post_normalization == "bm25plus_twice":
            # Deliberately applied two times in a row.
            self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.8, tf_mode='raw', idf_mode='bm25', inplace=False)
            self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.8, tf_mode='raw', idf_mode='bm25', inplace=False)
        if post_normalization == "tfidf":
            self.URM_train = similaripy.normalization.tfidf(self.URM_train, axis=1)
        if post_normalization == "bm25":
            self.URM_train = similaripy.normalization.bm25(self.URM_train, axis=1)

        self.W_sparse = similarity_matrix
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
示例#4
0
                    choices=['thresh', 'knn'],
                    type=str,
                    required=True)
args = parser.parse_args()
# first load the data
df_train = pd.read_csv(f'../dataset/{args.split}/train.csv', escapechar="\\")
df_test = pd.read_csv(f'../dataset/{args.split}/test.csv', escapechar="\\")
# ALWAYS sort the data by record_id
df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True)
df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True)
# Missing addresses become empty strings before vectorization.
df_train.address = df_train.address.fillna('').astype(str)
df_test.address = df_test.address.fillna('').astype(str)
# Fit the vectorizer on train followed by test so both share one vocabulary;
# the train rows therefore come first in X.
corpus = list(df_train.address) + list(df_test.address)
vectorizer = CountVectorizer(preprocessor=remove_spaces,
                             analyzer=remove_spaces)
X = vectorizer.fit_transform(corpus)
X_train = X[:df_train.shape[0], :]
X_test = X[df_train.shape[0]:, :]
# NOTE: despite its name, `cosmatrixxx` holds Jaccard similarities.
if args.mode == 'thresh':
    # Threshold mode: larger k (2000), then zero out values <= 0.3.
    cosmatrixxx = sim.jaccard(X_test, X_train.T, k=2000)
    cosmatrixxx.data[cosmatrixxx.data <= 0.3] = 0
else:
    cosmatrixxx = sim.jaccard(X_test, X_train.T, k=300)

# exist_ok=True replaces the former isdir() check, which could race with
# another process creating the directory between the check and the creation.
os.makedirs(f"../dataset/{args.split}/similarities", exist_ok=True)

# The output path does not depend on args.mode: both modes write this file.
save_npz(
    f'../dataset/{args.split}/similarities/jaccard_uncleaned_address_300k_{args.split}_2ngrams.npz',
    cosmatrixxx.tocsr())
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalization="none",
            feature_weighting="none",
            **similarity_args):
        """Build ``self.W_sparse`` from the URM stacked with the ICM.

        The ICM is loaded through ``DataReader().load_icm()``. The transposed
        ``self.URM_train`` and the ICM are stacked horizontally and the
        similarity is computed on the stacked matrix. Unlike the transposing
        variants, the similarity result is stored as returned (after
        ``check_matrix(..., format='csr')``).

        Parameters
        ----------
        topK : int
            Passed as ``k`` to the similarity function; stored in ``self.topK``.
        shrink : int or float
            Shrink term passed to the similarity function; stored in
            ``self.shrink``.
        similarity : str
            One of "cosine", "dice", "jaccard", "asym" (asymmetric cosine)
            or "rp3beta" (alpha=0.3, beta=0.61).
        normalization : str
            "bm25", "tfidf" or "bm25plus" overwrite ``self.URM_train`` with
            its normalized version; any other value applies none.
        feature_weighting : str
            "bm25", "tfidf" or "bm25plus" normalize the loaded ICM; any
            other value applies none.
        **similarity_args
            Accepted but not used by this method.

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported names (previously
            this surfaced as an UnboundLocalError on the final assignment).
        """
        self.topK = topK
        self.shrink = shrink

        reader = DataReader()
        icm = reader.load_icm()

        if normalization == "bm25":
            self.URM_train = similaripy.normalization.bm25(self.URM_train,
                                                           axis=1)
        if normalization == "tfidf":
            self.URM_train = similaripy.normalization.tfidf(self.URM_train,
                                                            axis=1)
        if normalization == "bm25plus":
            self.URM_train = similaripy.normalization.bm25plus(self.URM_train,
                                                               axis=1)

        if feature_weighting == "bm25":
            icm = similaripy.normalization.bm25(icm, axis=1)
        if feature_weighting == "tfidf":
            icm = similaripy.normalization.tfidf(icm, axis=1)
        if feature_weighting == "bm25plus":
            icm = similaripy.normalization.bm25plus(icm, axis=1)

        matrix = sps.hstack((self.URM_train.transpose().tocsr(), icm))

        # Keyword arguments shared by every similarity call below.
        common_args = dict(k=self.topK,
                           shrink=self.shrink,
                           binary=False,
                           threshold=0)

        if similarity == "cosine":
            similarity_matrix = similaripy.cosine(matrix, **common_args)
        elif similarity == "dice":
            similarity_matrix = similaripy.dice(matrix, **common_args)
        elif similarity == "jaccard":
            similarity_matrix = similaripy.jaccard(matrix, **common_args)
        elif similarity == "asym":
            similarity_matrix = similaripy.asymmetric_cosine(
                matrix, **common_args)
        elif similarity == "rp3beta":
            similarity_matrix = similaripy.rp3beta(matrix,
                                                   alpha=0.3,
                                                   beta=0.61,
                                                   **common_args)
        else:
            raise ValueError(
                "Unknown value '{}' for similarity".format(similarity))

        self.W_sparse = similarity_matrix
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
示例#6
0
def check_similarity(m, k, rtol=0.0001, full=False):
    """Compare the `sim` similarities of ``m`` with the ``py_*`` references.

    For each of dot product, cosine, asymmetric cosine, jaccard, dice,
    tversky, p3alpha and rp3beta, the `sim` result and the ``py_*`` result
    are computed with the same ``k`` and fixed alpha/beta values. Their
    ``check_sum`` values must agree within ``rtol``. When ``full`` is true,
    ``check_full`` must additionally return 0 for every pair.

    Raises AssertionError (through ``np.testing``) on the first mismatch.
    """
    # Results under test, paired with the message used when a check fails.
    under_test = [
        ('dot error', sim.dot_product(m, k=k)),
        ('cosine error', sim.cosine(m, k=k)),
        ('asy_cosine error', sim.asymmetric_cosine(m, alpha=0.2, k=k)),
        ('jaccard error', sim.jaccard(m, k=k)),
        ('dice error', sim.dice(m, k=k)),
        ('tversky error', sim.tversky(m, alpha=0.8, beta=0.4, k=k)),
        ('p3alpha error', sim.p3alpha(m, alpha=0.8, k=k)),
        ('rp3beta error', sim.rp3beta(m, alpha=0.8, beta=0.4, k=k)),
    ]

    # Reference results, listed in the same order as `under_test`.
    references = [
        py_dot(m, k),
        py_cosine(m, k).tocsr(),
        py_asy_cosine(m, 0.2, k=k),
        py_jaccard(m, k),
        py_dice(m, k),
        py_tversky(m, alpha=0.8, beta=0.4, k=k),
        py_p3alpha(m, alpha=0.8, k=k),
        py_rp3beta(m, alpha=0.8, beta=0.4, k=k),
    ]

    cases = [(message, actual, expected)
             for (message, actual), expected in zip(under_test, references)]

    # Aggregate comparison first, for every metric.
    for message, actual, expected in cases:
        np.testing.assert_allclose(check_sum(actual),
                                   check_sum(expected),
                                   rtol=rtol,
                                   err_msg=message)

    # Optional full comparison.
    if full:
        for message, actual, expected in cases:
            np.testing.assert_(check_full(actual, expected, rtol) == 0,
                               msg=message)

    return
示例#7
0
    def fit(self,
            matrix,
            k,
            distance,
            shrink=0,
            threshold=0,
            implicit=True,
            alpha=None,
            beta=None,
            l=None,
            c=None,
            verbose=False):
        """
        Initialize the model and compute the similarity matrix S with a distance metric.
        Access the similarity matrix using: self._sim_matrix

        Parameters
        ----------
        matrix : csr_matrix
            A sparse matrix. For example, it can be the URM of shape (number_users, number_items).
        k : int
            K nearest neighbours to consider.
        distance : str
            One of the supported distance metrics, check collaborative_filtering_base constants.
        shrink : float, optional
            Shrink term used in the normalization.
        threshold : float, optional
            All the values under this value are cut from the final result.
        implicit : bool, optional
            If true, treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM.
            Forwarded to the similarity functions as ``binary``.
        alpha : float, optional, included in [0,1]
            Required by asymmetric cosine, tversky, p3alpha, rp3beta and s_plus
            (forwarded as ``t1`` for s_plus).
        beta : float, optional, included in [0,1]
            Required by tversky, rp3beta and s_plus (forwarded as ``t2`` for s_plus).
        l : float, optional, balance coefficient used in s_plus distance, included in [0,1]
        c : float, optional, cosine coefficient, included in [0,1]
        verbose : bool, optional
            Accepted but not used by this method.

        Returns
        -------
        The computed similarity matrix (also stored in ``self._sim_matrix``),
        or None, after logging an error, when a parameter required by the
        metric is missing/out of range or when the metric is unknown.
        """
        # Record which values were really supplied BEFORE substituting the
        # sentinel: once replaced by -1, an "is None" test can never be true.
        alpha_missing = alpha is None
        beta_missing = beta is None

        # -1 lies outside [0, 1], so an omitted value fails the range checks
        # below instead of raising on a comparison with None.
        alpha = -1 if alpha_missing else alpha
        beta = -1 if beta_missing else beta
        l = -1 if l is None else l
        c = -1 if c is None else c
        if distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
            log.error(
                'Invalid parameter alpha in asymmetric cosine similarity!')
            return
        if distance == self.SIM_TVERSKY and not (0 <= alpha <= 1
                                                 and 0 <= beta <= 1):
            log.error('Invalid parameter alpha/beta in tversky similarity!')
            return
        if distance == self.SIM_P3ALPHA and alpha_missing:
            log.error('Invalid parameter alpha in p3alpha similarity')
            return
        # Either missing value would reach rp3beta as the -1 sentinel, so
        # both are required.
        if distance == self.SIM_RP3BETA and (alpha_missing or beta_missing):
            log.error('Invalid parameter alpha/beta in rp3beta similarity')
            return
        if distance == self.SIM_SPLUS and not (0 <= l <= 1 and 0 <= c <= 1
                                               and 0 <= alpha <= 1
                                               and 0 <= beta <= 1):
            log.error('Invalid parameter alpha/beta/l/c in s_plus similarity')
            return

        # compute and stores the similarity matrix using one of the distance metric: S = R•R'
        if distance == self.SIM_COSINE:
            self._sim_matrix = sim.cosine(matrix,
                                          k=k,
                                          shrink=shrink,
                                          threshold=threshold,
                                          binary=implicit)
        elif distance == self.SIM_ASYMCOSINE:
            self._sim_matrix = sim.asymmetric_cosine(matrix,
                                                     k=k,
                                                     shrink=shrink,
                                                     threshold=threshold,
                                                     binary=implicit,
                                                     alpha=alpha)
        elif distance == self.SIM_JACCARD:
            self._sim_matrix = sim.jaccard(matrix,
                                           k=k,
                                           shrink=shrink,
                                           threshold=threshold,
                                           binary=implicit)
        elif distance == self.SIM_DICE:
            self._sim_matrix = sim.dice(matrix,
                                        k=k,
                                        shrink=shrink,
                                        threshold=threshold,
                                        binary=implicit)
        elif distance == self.SIM_TVERSKY:
            self._sim_matrix = sim.tversky(matrix,
                                           k=k,
                                           shrink=shrink,
                                           threshold=threshold,
                                           binary=implicit,
                                           alpha=alpha,
                                           beta=beta)
        elif distance == self.SIM_P3ALPHA:
            self._sim_matrix = sim.p3alpha(matrix,
                                           k=k,
                                           shrink=shrink,
                                           threshold=threshold,
                                           binary=implicit,
                                           alpha=alpha)
        elif distance == self.SIM_RP3BETA:
            self._sim_matrix = sim.rp3beta(matrix,
                                           k=k,
                                           shrink=shrink,
                                           threshold=threshold,
                                           binary=implicit,
                                           alpha=alpha,
                                           beta=beta)
        elif distance == self.SIM_SPLUS:
            self._sim_matrix = sim.s_plus(matrix,
                                          k=k,
                                          shrink=shrink,
                                          threshold=threshold,
                                          binary=implicit,
                                          l=l,
                                          t1=alpha,
                                          t2=beta,
                                          c=c)
        else:
            log.error('Invalid distance metric: {}'.format(distance))
            # Nothing was computed: do not fall through and hand back a
            # stale (or not yet existing) self._sim_matrix.
            return
        #self.SIM_DOTPRODUCT: sim.dot_product(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
        return self._sim_matrix

# Set up the command-line parser.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--split",
    type=str,
    required=True,
    choices=['original', 'validation'],
    help="The dataset split to use",
)
args = parser.parse_args()

# Load the train and test tables of the selected split.
df_train = pd.read_csv(f"../dataset/{args.split}/train.csv", escapechar="\\")
df_test = pd.read_csv(f"../dataset/{args.split}/test.csv", escapechar="\\")

# ALWAYS sort the data by record_id
df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True)
df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True)

df_train.address = df_train.address.astype(str)
df_test.address = df_test.address.astype(str)

# Gather every address (train first, then test) to fit TF-IDF on.
all_adds = df_train.address.tolist() + df_test.address.tolist()

# Fit TF-IDF with the custom `ngrams` analyzer.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_adds)

# Split back: the first len(df_train) rows belong to the train set.
tf_idf_train = tf_idf_matrix[:len(df_train), :]
tf_idf_test = tf_idf_matrix[len(df_train):, :]

# Jaccard similarity of the test rows against the train rows (k=300).
jac = sim.jaccard(tf_idf_test, tf_idf_train.T, k=300)
save_npz(f'jaccard_tfidf_address_{args.split}.npz', jac.tocsr())
示例#9
0
def similarity(matrix,
               k=100,
               sim_type='cosine',
               shrink=0,
               threshold=0,
               implicit=True,
               alpha=None,
               beta=None,
               l=None,
               c=None):
    """Run the `sim` similarity function selected by ``sim_type``.

    The input is transposed (``matrix.T``) before being handed to the
    similarity function. ``k``, ``shrink`` and ``threshold`` are forwarded
    unchanged and ``implicit`` is forwarded as ``binary``. ``alpha`` is also
    forwarded for 'asymcosine', 'tversky', 'p3alpha' and 'rp3beta', and
    ``beta`` for 'tversky' and 'rp3beta'. ``l`` and ``c`` are accepted but
    not forwarded to any function ('splus' receives only the common
    arguments).

    Returns the result of the selected function. For an unrecognized
    ``sim_type`` it prints 'Error wrong distance metric' and returns None.
    """
    transposed = matrix.T

    # (sim_type value, attribute of `sim`, metric-specific keyword arguments)
    metrics = (
        ('cosine', 'cosine', {}),
        ('asymcosine', 'asymmetric_cosine', {'alpha': alpha}),
        ('jaccard', 'jaccard', {}),
        ('tversky', 'tversky', {'alpha': alpha, 'beta': beta}),
        ('p3alpha', 'p3alpha', {'alpha': alpha}),
        ('rp3beta', 'rp3beta', {'alpha': alpha, 'beta': beta}),
        ('splus', 's_plus', {}),
        ('dice', 'dice', {}),
    )

    for label, function_name, extra_args in metrics:
        if sim_type == label:
            # `sim` is only looked up once a metric has matched.
            compute = getattr(sim, function_name)
            return compute(transposed,
                           k=k,
                           shrink=shrink,
                           threshold=threshold,
                           binary=implicit,
                           **extra_args)

    print('Error wrong distance metric')
    def fit(self):
        """Compute the similarity matrix for ``self.distance``.

        Reads ``self.matrix``, ``self.k``, ``self.shrink``,
        ``self.threshold``, ``self.implicit`` (forwarded as ``binary``) and,
        depending on the metric, ``self.alpha``, ``self.beta``, ``self.l``
        and ``self.c``. The result is stored in ``self._sim_matrix``; for the
        s_plus metric it is additionally passed through
        ``prep.normalize(..., norm='l2', axis=0)``.

        The stored ``self.alpha`` / ``self.beta`` / ``self.l`` / ``self.c``
        are not modified: validation works on local copies, so a parameter
        that was never configured is still recognised as missing on later
        calls.

        Returns
        -------
        The computed similarity matrix, or None, after logging an error,
        when a parameter required by the metric is missing/out of range or
        when the metric is unknown.
        """
        # Record what is actually configured before substituting sentinels;
        # after the substitution an "is None" test could never be true.
        alpha_missing = self.alpha is None
        beta_missing = self.beta is None

        # -1 lies outside [0, 1], so an unset value fails the range checks
        # below instead of raising on a comparison with None.
        alpha = -1 if alpha_missing else self.alpha
        beta = -1 if beta_missing else self.beta
        l_coef = -1 if self.l is None else self.l
        c_coef = -1 if self.c is None else self.c

        if self.distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
            log.error(
                'Invalid parameter alpha in asymmetric cosine Similarity_MFD!')
            return
        if self.distance == self.SIM_TVERSKY and not (0 <= alpha <= 1
                                                      and 0 <= beta <= 1):
            log.error(
                'Invalid parameter alpha/beta in tversky Similarity_MFD!')
            return
        if self.distance == self.SIM_P3ALPHA and alpha_missing:
            log.error('Invalid parameter alpha in p3alpha Similarity_MFD')
            return
        # Either missing value would reach rp3beta as the -1 sentinel, so
        # both are required.
        if self.distance == self.SIM_RP3BETA and (alpha_missing
                                                  or beta_missing):
            log.error('Invalid parameter alpha/beta in rp3beta Similarity_MFD')
            return
        if self.distance == self.SIM_SPLUS and not (
                0 <= l_coef <= 1 and 0 <= c_coef <= 1 and 0 <= alpha <= 1
                and 0 <= beta <= 1):
            log.error(
                'Invalid parameter alpha/beta/l/c in s_plus Similarity_MFD')
            return

        # compute and stores the Similarity_MFD matrix using one of the distance metric: S = R•R'
        if self.distance == self.SIM_COSINE:
            self._sim_matrix = sim.cosine(self.matrix,
                                          k=self.k,
                                          shrink=self.shrink,
                                          threshold=self.threshold,
                                          binary=self.implicit)
        elif self.distance == self.SIM_ASYMCOSINE:
            self._sim_matrix = sim.asymmetric_cosine(self.matrix,
                                                     k=self.k,
                                                     shrink=self.shrink,
                                                     threshold=self.threshold,
                                                     binary=self.implicit,
                                                     alpha=alpha)
        elif self.distance == self.SIM_JACCARD:
            self._sim_matrix = sim.jaccard(self.matrix,
                                           k=self.k,
                                           shrink=self.shrink,
                                           threshold=self.threshold,
                                           binary=self.implicit)
        elif self.distance == self.SIM_DICE:
            self._sim_matrix = sim.dice(self.matrix,
                                        k=self.k,
                                        shrink=self.shrink,
                                        threshold=self.threshold,
                                        binary=self.implicit)
        elif self.distance == self.SIM_TVERSKY:
            self._sim_matrix = sim.tversky(self.matrix,
                                           k=self.k,
                                           shrink=self.shrink,
                                           threshold=self.threshold,
                                           binary=self.implicit,
                                           alpha=alpha,
                                           beta=beta)
        elif self.distance == self.SIM_P3ALPHA:
            self._sim_matrix = sim.p3alpha(self.matrix,
                                           k=self.k,
                                           shrink=self.shrink,
                                           threshold=self.threshold,
                                           binary=self.implicit,
                                           alpha=alpha)
        elif self.distance == self.SIM_RP3BETA:
            self._sim_matrix = sim.rp3beta(self.matrix,
                                           k=self.k,
                                           shrink=self.shrink,
                                           threshold=self.threshold,
                                           binary=self.implicit,
                                           alpha=alpha,
                                           beta=beta)
        elif self.distance == self.SIM_SPLUS:
            self._sim_matrix = prep.normalize(sim.s_plus(
                self.matrix,
                k=self.k,
                shrink=self.shrink,
                threshold=self.threshold,
                binary=self.implicit,
                l=l_coef,
                t1=alpha,
                t2=beta,
                c=c_coef),
                                              norm='l2',
                                              axis=0)
        else:
            log.error('Invalid distance metric: {}'.format(self.distance))
            # Nothing was computed: do not fall through and hand back a
            # stale (or not yet existing) self._sim_matrix.
            return
        return self._sim_matrix
示例#11
0
    string = re.sub(r'[,-./]',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

# Set up the command-line parser.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--split",
    type=str,
    required=True,
    choices=['original', 'validation'],
    help="The dataset split to use",
)
args = parser.parse_args()

# Load the train and test tables of the selected split.
df_train = pd.read_csv(f"../dataset/{args.split}/train.csv", escapechar="\\")
df_test = pd.read_csv(f"../dataset/{args.split}/test.csv", escapechar="\\")

# ALWAYS sort the data by record_id
df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True)
df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True)

df_train.name = df_train.name.astype(str)
df_test.name = df_test.name.astype(str)

# Gather every name (train first, then test) to fit TF-IDF on.
all_names = df_train.name.tolist() + df_test.name.tolist()

# Fit TF-IDF with the custom `ngrams` analyzer.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_names)

# Split back: the first len(df_train) rows belong to the train set.
tf_idf_train = tf_idf_matrix[:len(df_train), :]
tf_idf_test = tf_idf_matrix[len(df_train):, :]

# NOTE: despite its name, `cos_tfidf` holds Jaccard similarities (k=300).
cos_tfidf = sim.jaccard(tf_idf_test, tf_idf_train.T, k=300)
save_npz(f'jaccard_tfidf_name_{args.split}_noclean.npz', cos_tfidf.tocsr())