def __calc_distances__(self, v1s, v2s, is_sparse=True):
        if is_sparse:
            dcosine     = np.array([cosine(x.toarray(), y.toarray())       for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dcityblock  = np.array([cityblock(x.toarray(), y.toarray())    for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dcanberra  = np.array([canberra(x.toarray(), y.toarray())     for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            deuclidean = np.array([euclidean(x.toarray(), y.toarray())    for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dminkowski  = np.array([minkowski(x.toarray(), y.toarray(), 3) for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dbraycurtis = np.array([braycurtis(x.toarray(), y.toarray())   for (x, y) in zip(v1s, v2s)]).reshape((-1,1))

            dskew_q1 = [skew(x.toarray().ravel()) for x in v1s]
            dskew_q2 = [skew(x.toarray().ravel()) for x in v2s]
            dkur_q1  = [kurtosis(x.toarray().ravel()) for x in v1s]
            dkur_q2  = [kurtosis(x.toarray().ravel()) for x in v2s]

            dskew_diff = np.abs(np.array(dskew_q1) - np.array(dskew_q2)).reshape((-1,1))
            dkur_diff  = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape((-1,1))
        else:
            dcosine     = np.array([cosine(x, y)       for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dcityblock  = np.array([cityblock(x, y)    for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dcanberra  = np.array([canberra(x, y)     for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            deuclidean = np.array([euclidean(x, y)    for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dminkowski  = np.array([minkowski(x, y, 3) for (x, y) in zip(v1s, v2s)]).reshape((-1,1))
            dbraycurtis = np.array([braycurtis(x, y)   for (x, y) in zip(v1s, v2s)]).reshape((-1,1))

            dskew_q1 = [skew(x) for x in v1s]
            dskew_q2 = [skew(x) for x in v2s]
            dkur_q1  = [kurtosis(x) for x in v1s]
            dkur_q2  = [kurtosis(x) for x in v2s]

            dskew_diff = np.abs(np.array(dskew_q1) - np.array(dskew_q2)).reshape((-1,1))
            dkur_diff  = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape((-1,1))
        return np.hstack((dcosine,dcityblock,dcanberra,deuclidean,dminkowski,dbraycurtis,dskew_diff,dkur_diff))
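# A minimal, self-contained sketch of the same distance-feature pattern on dense
# vectors (toy data; the names below are illustrative, not part of the class above):
import numpy as np
from scipy.spatial.distance import (braycurtis, canberra, cityblock, cosine,
                                    euclidean, minkowski)
from scipy.stats import kurtosis, skew

rng = np.random.default_rng(0)
v1s, v2s = rng.random((5, 16)), rng.random((5, 16))   # 5 pairs of 16-dim vectors

pair_dists = [cosine, cityblock, canberra, euclidean,
              lambda x, y: minkowski(x, y, 3), braycurtis]
cols = [np.array([fn(x, y) for x, y in zip(v1s, v2s)]).reshape(-1, 1)
        for fn in pair_dists]
cols.append(np.abs(skew(v1s, axis=1) - skew(v2s, axis=1)).reshape(-1, 1))
cols.append(np.abs(kurtosis(v1s, axis=1) - kurtosis(v2s, axis=1)).reshape(-1, 1))
features = np.hstack(cols)   # shape (5, 8): one row per pair, one column per feature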
 def calculateL2(self, feat1, feat2, c_type='euclidean'):
     assert np.shape(feat1) == np.shape(feat2)
     if config.insight:
         [
             len_,
         ] = np.shape(feat1)
         #print(np.shape(feat1))
     else:
         _, len_ = np.shape(feat1)
     #print("len ",len_)
     if c_type == "cosine":
         s_d = distance.cosine(feat1, feat2)
     elif c_type == "euclidean":
         #s_d = np.sqrt(np.sum(np.square(feat1-feat2)))
         #s_d = distance.euclidean(feat1,feat2,w=1./len_)
         # w=1 was a no-op uniform weighting; recent SciPy requires w to be a 1-D array, so drop it
         s_d = distance.euclidean(feat1, feat2)
     elif c_type == "correlation":
         s_d = distance.correlation(feat1, feat2)
     elif c_type == "braycurtis":
         s_d = distance.braycurtis(feat1, feat2)
     elif c_type == 'canberra':
         s_d = distance.canberra(feat1, feat2)
     elif c_type == "chebyshev":
         s_d = distance.chebyshev(feat1, feat2)
     return s_d
def get_braycurtis(
    input_file: pathlib.Path,
    db_file: pathlib.Path,
    db_name: str,
    level_str: str,
    threshold: int,
):
    levels = level_str.split(",")
    input_table = pd.read_csv(input_file)
    db_data = load_table(str(db_file))
    db_df = db_data.to_dataframe(dense=True)
    samples = list(db_df.columns)
    obs_metadata = db_data.metadata_to_dataframe(axis="observation")
    db_table = pd.concat([obs_metadata, db_df], axis=1)
    data = []
    print(f"Calculating braycurtis dissimilarity for {db_name}")
    for level in levels:
        for u, v, otu_ids, col in get_vectors(input_table, db_table, level,
                                              samples, threshold):
            value = braycurtis(u, v)
            data.append({
                "database": db_name,
                "sample": col,
                "tax_level": level,
                "braycurtis": value,
            })
    return data
Example #4
    def step(self, action=None):
        """
        Render HTML and return state, reward, done for each step
        :param action: 
        :return: 
        """
        # print(self.idx)
        if action is None:
            action = self.action_sample()
        self.html_vec[self.idx] = action
        if self.html_vec[:3] == [2, 1, 3]:
            print('HTML vec: ', self.html_vec)
        html = self.html_covr.convert(
            self.html_vec, direction=HTML2VECConverter.VEC2HTML_DIRECTION)
        html = self.fill_text_for_html(html)

        state = self.renderer.render_html(html) / 255.0
        dist = distance.braycurtis(self.result_image.flatten(),
                                   state.flatten())
        reward = HTMLGame.REWARD if dist < 1e-6 else 0
        if set([2, 1, 3]) < set(self.html_vec):
            reward = HTMLGame.REWARD / 2.0
        if set([4, 1, 5]) < set(self.html_vec):
            reward = HTMLGame.REWARD / 2.0
        # reward = HTMLGame.REWARD if self.html_vec == [2, 1, 3, 4, 1, 5] else 0

        self.idx += 1

        done = False
        if reward == HTMLGame.REWARD:
            done = True
        return state, np.array([
            np.identity(6)[v:v + 1] for v in self.html_vec
        ]).flatten(), reward, done
Example #5
def Dist(array1, array2, dist):
    if dist == 'braycurtis':
        return distance.braycurtis(array1, array2)
    elif dist == 'correlation':
        return distance.correlation(array1, array2)
    elif dist == 'mahalanobis':
        # NOTE: scipy's mahalanobis also requires the inverse covariance matrix VI;
        # as written this call raises a TypeError.
        return distance.mahalanobis(array1, array2)
    elif dist == 'minkowski':
        return distance.minkowski(array1, array2)
    elif dist == 'seuclidean':
        # NOTE: scipy's seuclidean also requires the variance vector V.
        return distance.seuclidean(array1, array2)
    elif dist == 'sqeuclidean':
        return distance.sqeuclidean(array1, array2)
    elif dist == 'pearsonp':
        r, p = pearsonr(array1, array2)
        return p
    elif dist == 'pearsonr':
        r, p = pearsonr(array1, array2)
        return r
    elif dist == 'spearmanp':
        r, p = spearmanr(array1, array2)
        return p
    elif dist == 'spearmanr':
        r, p = spearmanr(array1, array2)
        return r
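# Usage sketch for Dist (illustrative values; assumes the imports used above --
# scipy.spatial.distance as distance, and pearsonr/spearmanr from scipy.stats).
# As noted in the comments above, the 'mahalanobis' and 'seuclidean' branches would
# still need the extra VI/V arguments to run.
import numpy as np

a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([2.0, 2.0, 4.0, 3.0])
print(Dist(a, b, 'braycurtis'))   # Bray-Curtis dissimilarity, 0 = identical
print(Dist(a, b, 'spearmanr'))    # Spearman rank correlation coefficient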
Example #6
def calculate_featureset4(dataframe, q1_vectors, q2_vectors):
    dataframe['cosine_dist'] = [
        cosine(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['cityblock_dist'] = [
        cityblock(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['jaccard_dist'] = [
        jaccard(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['canberra_dist'] = [
        canberra(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['euclidean_dist'] = [
        euclidean(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['minkowski_dist'] = [
        minkowski(x, y, 3)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['braycurtis_dist'] = [
        braycurtis(x, y)
        for (x, y) in zip(np.nan_to_num(q1_vectors), np.nan_to_num(q2_vectors))
    ]
    dataframe['skew_q1'] = [skew(x) for x in np.nan_to_num(q1_vectors)]
    dataframe['skew_q2'] = [skew(x) for x in np.nan_to_num(q2_vectors)]
    dataframe['kurtosis_q1'] = [kurtosis(x) for x in np.nan_to_num(q1_vectors)]
    dataframe['kurtosis_q2'] = [kurtosis(x) for x in np.nan_to_num(q2_vectors)]
    return dataframe
Example #7
def compare_locations(c1, c2, method='Average'):
    rssi1 = []
    rssi2 = []
    wifi1 = c1['fingerprints']['wifi']
    wifi2 = c2['fingerprints']['wifi']

    common_aps = list(set(wifi1.keys()) & set(wifi2.keys()))

    # No APs in common -> return maximal dissimilarity (1)
    if not common_aps:
        return 1

    # TODO: find the best metric
    # If not enough common APs -> return maximal dissimilarity (1)
    if len(common_aps) * 10 < len(wifi1.keys()):
        return 1

    for ap in common_aps:
        # Take only the first RSSI value
        if method == 'First':
            rssi1.append(wifi1[ap]['rssi'][0])
            rssi2.append(wifi2[ap]['rssi'][0])

        # Make an average of all RSSI values
        if method == 'Average':
            rssi1.append(np.average(wifi1[ap]['rssi']))
            rssi2.append(np.average(wifi2[ap]['rssi']))

    return braycurtis(tuple(rssi1), tuple(rssi2))
Example #8
def braycurtis(x, y):
    # Wrap scipy's braycurtis so that any failure (e.g. mismatched shapes) yields NaN
    try:
        return distance.braycurtis(x, y)
    except Exception:
        return np.nan
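# Usage sketch for the guarded wrapper above (illustrative values):
print(braycurtis([1, 2, 3], [3, 2, 1]))   # ~0.333
print(braycurtis([1, 2, 3], [1, 2]))      # shape mismatch raises ValueError -> nan
print(braycurtis([0, 0, 0], [0, 0, 0]))   # 0/0 yields nan directly (a warning, not an exception)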
Example #9
def extend_with_features(data):
    stop_words = stopwords.words('english')
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)

    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    data['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['minkowski_distance'] = [
        minkowski(x, y, 3)
        for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['braycurtis_distance'] = [
        braycurtis(x, y)
        for (x, y) in zip(question1_vectors, question2_vectors)
    ]

    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
Example #10
def feature3(data):
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)

    question2_vectors  = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]  
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

    data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
    return data
Example #11
 def calculate_distance(self, method, histA, histB):
     """ use chi by default """
     if method == "braycurtis":
         return braycurtis(histA, histB)
     elif method == "intersection":
         return self.intersection(histA, histB)
     return self.chi2_distance(histA, histB)
def vectors_features(in_data: pd.DataFrame,
                     sent2vec: Callable[[str], np.ndarray]) -> pd.DataFrame:
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns
    vectors1 = np.array([sent2vec(x) for x in in_data['question1']])
    vectors2 = np.array([sent2vec(x) for x in in_data['question2']])
    in_data['cos'] = np.array(
        [cosine(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['jaccard'] = np.array(
        [jaccard(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['euclidean'] = np.array(
        [euclidean(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['minkowski'] = np.array(
        [minkowski(x, y) for x, y in zip(vectors1, vectors2)])
    in_data['cityblock'] = np.array(
        [cityblock(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['canberra'] = np.array(
        [canberra(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['braycurtis'] = np.array(
        [braycurtis(x, y) for (x, y) in zip(vectors1, vectors2)])
    in_data['skew_q1'] = np.array([skew(x) for x in vectors1])
    in_data['skew_q2'] = np.array([skew(x) for x in vectors2])
    in_data['kur_q1'] = np.array([kurtosis(x) for x in vectors1])
    in_data['kur_q2'] = np.array([kurtosis(x) for x in vectors2])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
Example #13
def distance_features(data,genismModel):
    w2v_q1 = np.array([sent2vec(q, genismModel) for q in data.question1])
    w2v_q2 = np.array([sent2vec(q, genismModel) for q in data.question2])
    a=np.zeros(300)
    for i in range(len(w2v_q1)):
        if w2v_q1[i].size==1:
            w2v_q1[i]=a
    for i in range(len(w2v_q2)):
        if w2v_q2[i].size==1:
            w2v_q2[i]=a
    
    data['cosine_distance'] = [cosine(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['cityblock_distance'] = [cityblock(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['jaccard_distance'] = [jaccard(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['canberra_distance'] = [canberra(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['euclidean_distance'] = [euclidean(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['minkowski_distance'] = [minkowski(x,y,3) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['braycurtis_distance'] = [braycurtis(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
    data['skew_q1vec'] = [skew(x) for x in w2v_q1]
    data['skew_q2vec'] = [skew(x) for x in w2v_q2]
    data['kur_q1vec'] = [kurtosis(x) for x in w2v_q1]
    data['kur_q2vec'] = [kurtosis(x) for x in w2v_q2]
    fs_4 = ['cosine_distance', 'cityblock_distance', 'jaccard_distance', 'canberra_distance', 
         'euclidean_distance', 'minkowski_distance','braycurtis_distance','skew_q1vec',
         'skew_q2vec','kur_q1vec','kur_q2vec']
    return data,fs_4
Example #14
def get_w2v_simi(query, title):
    q_vec = np.nan_to_num(sent2vec(query))
    t_vec = np.nan_to_num(sent2vec(title))

    w2v_cosine = cosine(q_vec, t_vec)
    w2v_cityblock = cityblock(q_vec, t_vec)
    w2v_jaccard = jaccard(q_vec, t_vec)
    w2v_canberra = canberra(q_vec, t_vec)
    w2v_euclidean = euclidean(q_vec, t_vec)
    w2v_minkowski = minkowski(q_vec, t_vec)
    w2v_braycurtis = braycurtis(q_vec, t_vec)

    w2v_skew_qvec = skew(q_vec)
    w2v_skew_tvec = skew(t_vec)
    w2v_kur_qvec = kurtosis(q_vec)
    w2v_kur_tvec = kurtosis(t_vec)

    outlist = [w2v_cosine,
               w2v_cityblock,
               w2v_jaccard,
               w2v_canberra,
               w2v_euclidean,
               w2v_minkowski,
               w2v_braycurtis,
               w2v_skew_qvec,
               w2v_skew_tvec,
               w2v_kur_qvec,
               w2v_kur_tvec
               ]
    outformat = ':'.join(['{}']*len(outlist))

    return outformat.format(*outlist)
Example #15
def calculate_distance(X, Y, metric='euclidean'):
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # NOTE: wminkowski also requires p and w arguments (and was removed in SciPy 1.8);
        # as written this call raises a TypeError.
        return distance.wminkowski(X, Y)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # NOTE: mahalanobis also requires the inverse covariance matrix VI.
        return distance.mahalanobis(X, Y)
    elif metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))

    elif metric == METRIC_COSINE:
        # NOTE: this branch returns cosine *similarity*, not a distance;
        # the corresponding distance would be 1 - similarity.
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
Example #16
def _compute_per_level_accuracy(exp, obs, metadata, depth):
    results = []
    vectors = {}
    for level in range(1, depth + 1):
        vectors[level] = {'exp': [], 'obs': []}
        # collapse taxonomy strings to level
        exp_collapsed = _collapse_table(exp, level)
        obs_collapsed = _collapse_table(obs, level)
        # compute stats for each sample individually
        for sample in obs_collapsed.index:
            result = [sample, level]
            # if metadata are passed, map exp sample ID to value in metadata
            if metadata is not None:
                exp_id = metadata[sample]
            else:
                exp_id = sample
            # concatenate obs/exp observations to align features
            joined_table = pd.concat(
                [exp_collapsed.loc[exp_id], obs_collapsed.loc[sample]],
                axis=1,
                sort=True).fillna(0)
            # split joined table apart again for computing stats
            exp_vector = joined_table.iloc[:, 0]
            obs_vector = joined_table.iloc[:, 1]
            exp_features = exp_vector[exp_vector != 0]
            obs_features = obs_vector[obs_vector != 0]
            # Count observed taxa
            observed_feature_count = len(obs_features)
            observed_feature_ratio = (observed_feature_count /
                                      len(exp_features))
            result.extend([observed_feature_count, observed_feature_ratio])
            # compute TAR/TDR
            result.extend(compute_taxon_accuracy(exp_features, obs_features))
            # compute linear least-squares regression results
            if len(exp_vector) == len(obs_vector) == 1:
                # linear regression cannot compute if vector length < 2
                reg_results = [np.nan] * 5
            else:
                reg_results = linregress(exp_vector, obs_vector)
            result.extend(reg_results)
            # compute Bray-Curtis dissimilarity
            result.append(braycurtis(exp_vector, obs_vector))
            # compute Jaccard distance, must convert to bool array
            result.append(
                jaccard(list(map(bool, exp_vector)),
                        list(map(bool, obs_vector))))
            results.append(result)
            # store vectors for constructing regplots
            vectors[level]['exp'].extend(exp_vector)
            vectors[level]['obs'].extend(obs_vector)
    results = pd.DataFrame(results,
                           columns=[
                               'sample', 'level', 'Observed Taxa',
                               'Observed / Expected Taxa', 'TAR', 'TDR',
                               'Slope', 'Intercept', 'r-value', 'P value',
                               'Std Err', 'Bray-Curtis', 'Jaccard'
                           ])
    results['r-squared'] = results['r-value']**2
    return results, vectors
    def features_similarity(cls, df):
        cls.load_model(normed=True)
        question1_vectors, question2_vectors = cls.get_questions_vector(df)
        cls.resetmodel()

        cls.dict_features['cosine_distance'] = [
            cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
        ]
        print("1/11 Cosine Distance finished.")
        cls.dict_features['cityblock_distance'] = [
            cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                              np.nan_to_num(question2_vectors))
        ]
        print("2/11 Cityblock Distance finished.")
        cls.dict_features['jaccard_distance'] = [
            jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                            np.nan_to_num(question2_vectors))
        ]
        print("3/11 Jaccard Distance finished.")
        cls.dict_features['canberra_distance'] = [
            canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
        ]
        print("4/11 Canberra Distance finished.")
        cls.dict_features['euclidean_distance'] = [
            euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                              np.nan_to_num(question2_vectors))
        ]
        print("5/11 Euclidean Distance finished.")
        cls.dict_features['minkowski_distance'] = [
            minkowski(x, y, 3)
            for (x, y) in zip(np.nan_to_num(question1_vectors),
                              np.nan_to_num(question2_vectors))
        ]
        print("6/11 Minkowski Distance finished.")
        cls.dict_features['braycurtis_distance'] = [
            braycurtis(x, y)
            for (x, y) in zip(np.nan_to_num(question1_vectors),
                              np.nan_to_num(question2_vectors))
        ]
        print("7/11 Braycurtis Distance finished.")
        cls.dict_features['skew_q1vec'] = [
            skew(x) for x in np.nan_to_num(question1_vectors)
        ]
        print("8/11 Skew Q1 Vec finished.")
        cls.dict_features['skew_q2vec'] = [
            skew(x) for x in np.nan_to_num(question2_vectors)
        ]
        print("9/11 Skew Q2 Vec finished.")
        cls.dict_features['kur_q1vec'] = [
            kurtosis(x) for x in np.nan_to_num(question1_vectors)
        ]
        print("10/11 Kurtosis Q1 Vec finished.")
        cls.dict_features['kur_q2vec'] = [
            kurtosis(x) for x in np.nan_to_num(question2_vectors)
        ]
        print("11/11 Kurtosis Q2 Vec finished.")
        return question1_vectors, question2_vectors
Example #18
def compare_features(feature_array_1, feature_array_2, key):

    br = braycurtis(feature_array_1, feature_array_2)
    # concatenate features of two images and reshape to a two-dimensional matrix
    features = np.hstack((feature_array_1, feature_array_2))
    br_features = np.hstack((features, br)).reshape(1, -1)
    proba = xgboost_model.predict_proba(br_features).tolist()[0]

    return {'name': key, 'different': proba[0], 'same': proba[1]}
Example #19
 def distance(self, u, v, distancemetric):
     # use == for string comparison; 'is' checks identity and is unreliable for strings
     if distancemetric == "cosine":
         return distance.cosine(u, v)
     elif distancemetric == "euclidean":
         return distance.euclidean(u, v)
     elif distancemetric == "cityblock":
         return distance.cityblock(u, v)
     elif distancemetric == "braycurtis":
         return distance.braycurtis(u, v)
Example #20
def feats_tfidf(row):
    out_list = []
    que1 = str(row['question1'])
    que2 = str(row['question2'])

    #Calculate que1 lsa vector
    que1_vec = []
    que1_bow = dictionary.doc2bow(que1.lower().split())
    que1_lsi = lsi[que1_bow]
    for (index, value) in que1_lsi:
        que1_vec.append(value)

    #Calculate que2 lsa vector
    que2_vec = []
    que2_bow = dictionary.doc2bow(que2.lower().split())
    que2_lsi = lsi[que2_bow]
    for (index, value) in que2_lsi:
        que2_vec.append(value)

    # drop trailing dimensions if the two LSA vectors differ in length
    if len(que1_vec) != len(que2_vec):
        min_len = min(len(que1_vec), len(que2_vec))
        que1_vec = que1_vec[:min_len]
        que2_vec = que2_vec[:min_len]

    # Calculate distances between the LSA vectors
    try:
        lsa_cosine = cosine(que1_vec, que2_vec)
    except Exception:
        lsa_cosine = 1

    lsa_cityblock = cityblock(que1_vec, que2_vec)
    lsa_jaccard = jaccard(que1_vec, que2_vec)
    lsa_canberra = canberra(que1_vec, que2_vec)

    try:
        lsa_euclidean = euclidean(que1_vec, que2_vec)
    except Exception:
        lsa_euclidean = np.nan

    lsa_minkowski = minkowski(que1_vec, que2_vec, 3)
    lsa_braycurtis = braycurtis(que1_vec, que2_vec)

    lsa_q1_skew = skew(que1_vec)
    lsa_q1_kurtosis = kurtosis(que1_vec)

    lsa_q2_skew = skew(que2_vec)
    lsa_q2_kurtosis = kurtosis(que2_vec)


    out_list.extend([lsa_cosine,lsa_cityblock,lsa_jaccard,lsa_canberra,lsa_euclidean, \
                     lsa_minkowski,lsa_braycurtis,lsa_q1_skew,lsa_q1_kurtosis,lsa_q2_skew, lsa_q2_kurtosis])

    return out_list
Example #21
def dist_features(data):
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0

    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q, model)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q, model)

    data['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
    ]

    data['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    data['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
    ]

    data['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
    ]

    data['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    data['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
    ]

    data['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
    ]

    data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

    cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
    cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)

    return data
Example #22
def identifiability(sub_list, ses_list, gv_array, measure, ses1, ses2):
    '''
    This function calculates the identifiability of subjects, I_diff = I_self - I_others,
    where I_self is the similarity between the same subject in two different sessions,
    averaged over all subjects, and I_others is the similarity between a given subject
    and all the others in two different sessions, averaged over all subjects.
    (Note: the implementation below returns the ratio I_self / I_others.)

    Input:
    sub_list - vector of subjects,
    ses_list - vector with session numbers,
    gv_array - array of shape (number of subjects * number of sessions) x (number of graph measures),
    measure - distance used to compare subjects: 'euclidean', 'cityblock' or 'braycurtis',
    ses1, ses2 - numbers of sessions to compare (integers).

    Output:
    I_diff - identifiability (scalar).
    '''
    
    ###--- Import packages
    from scipy.stats.stats import pearsonr
    from scipy.spatial.distance import cityblock, euclidean, minkowski, braycurtis
    
    ###--- Define cosine similarity between two vectors
    def dot(A,B): 
        return (sum(a*b for a,b in zip(A,B)))
    def cosine_similarity(a,b):
        return dot(a,b) / ((dot(a,a)**.5) * (dot(b,b)**.5))
    
    ###--- Find number of subjects and number of sessions
    N_ses = int(max(ses_list))
    N_sub = (len(sub_list))
    
    ###--- Calculate identifiability matrix
    I_mat = np.zeros((N_sub,N_sub))
    if measure == 'euclidean':
        for sub1 in range(N_sub):
            for sub2 in range(N_sub):
                I_mat[int(sub1)-1,int(sub2)-1] = euclidean(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:])
    elif measure == 'cityblock':
        for sub1 in range(N_sub):
            for sub2 in range(N_sub):
                I_mat[int(sub1)-1,int(sub2)-1] = cityblock(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:])
    elif measure == 'braycurtis':
        for sub1 in range(N_sub):
            for sub2 in range(N_sub):
                I_mat[int(sub1)-1,int(sub2)-1] = braycurtis(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:])
                
    ###--- Create an out-of-diagonal elements mask
    out = np.ones((N_sub, N_sub), dtype=bool)
    np.fill_diagonal(out, 0)
    ###---Similarity of subject to others, averaged over all subjects
    I_others=np.mean(I_mat[out])
    ###---Similarity of subject to himself, averaged over all subjects
    I_self = np.mean(np.diagonal(I_mat))
    I_diff=I_self/I_others
    return I_diff
Example #23
def compute_sim_bray(df, currentVec, simArray):

    for index, row in df.iterrows():
        imageName = row[0] + '.jpg'
        brayDis = distance.braycurtis(currentVec, row[1:])
        temp = pd.DataFrame([[imageName, brayDis]],
                            columns=['ImageName', 'BrayCurtis'])
        # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
        simArray = pd.concat([simArray, temp], ignore_index=True)

    return simArray
Example #24
def get_distance_features(data, emb):
    data['cosine_distance'] = pd.Series([cosine(x, y) for x, y in emb])
    data['cityblock_distance'] = pd.Series([cityblock(x, y) for x, y in emb])
    data['jaccard_distance'] = pd.Series([jaccard(x, y) for x, y in emb])
    data['canberra_distance'] = pd.Series([canberra(x, y) for x, y in emb])
    data['euclidean_distance'] = pd.Series([euclidean(x, y) for x, y in emb])
    data['minkowski_distance'] = pd.Series(
        [minkowski(x, y, 3) for x, y in emb])
    data['braycurtis_distance'] = pd.Series([braycurtis(x, y) for x, y in emb])
    return data
def test_compare_braycurtis_definitions():
    x = np.random.uniform(0, 10, 10)
    y = np.random.uniform(0, 10, 10)

    bc1 = braycurtis(x, y)
    bc2 = BrayCurtis(x, y)
    print("The Bray-Curtis distance of the scipy.spatial.distance package is:",
          bc1)
    print("The Bray-Curtis distance ( sum(abs(x-y)) / sum(x+y) ) is:", bc2)
    print("The difference between both definitions is ", bc1 - bc2)
Example #26
    def build_features(self, net, photo):
        size_arr = self.clients_features.shape[0]
        self.photo_features = net.get_features(photo)
        self.main_arr_broadcasted = broadcast_array(self.photo_features,
                                                    size_arr)
        cls_arr = np.hstack((self.clients_features, self.main_arr_broadcasted))

        braycurtis_dist = braycurtis(self.clients_features,
                                     self.main_arr_broadcasted).reshape(
                                         1, size_arr)
        self.cls_arr_br = np.concatenate((cls_arr, braycurtis_dist.T), axis=1)
Example #27
def bray_curtis_dist(user_predict, adoptable_dogs, images):
    '''
    Calculating Bray-Curtis distance between two 1D arrays and return similarity score
    '''
    sim_score = []
    for idx in range(0, len(adoptable_dogs)):
        sim_score.append(
            distance.braycurtis(user_predict.flatten(),
                                adoptable_dogs[idx].flatten()))
    print('Maximum SimScore: ' + str(max(sim_score)))
    return pd.DataFrame({'imgFile': images, 'SimScore': sim_score})
Example #28
def feature_construct(city,
                      model_name,
                      friends,
                      walk_len=100,
                      walk_times=20,
                      num_features=128):
    '''construct the feature matrix
    Args:
        city: city
        model_name: 20_locid
        friends: friends list (asymmetric) [u1, u2]
        walk_len: walk length
        walk_times: walk times
        num_features: dimension of the embedding vector
    Returns:
    '''

    if os.path.exists('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                      str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature'):
        os.remove('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                  str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature')

    emb = pd.read_csv('dataset/'+city+'/emb/'+city+'_'+model_name+'_'+\
                      str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.emb',\
                      header=None, skiprows=1, sep=' ')

    emb = emb.rename(columns={0: 'uid'})  # last column is user id
    emb = emb.loc[emb.uid > 0]  # only take users, no loc_type, not necessary

    pair = pair_construct(emb.uid.unique(), friends)

    for i in range(len(pair)):
        u1 = pair.loc[i, 'u1']
        u2 = pair.loc[i, 'u2']
        label = pair.loc[i, 'label']

        u1_vector = emb.loc[emb.uid == u1, range(1, emb.shape[1])]
        u2_vector = emb.loc[emb.uid == u2, range(1, emb.shape[1])]

        i_feature = pd.DataFrame([[
            u1, u2, label,
            cosine(u1_vector, u2_vector),
            euclidean(u1_vector, u2_vector),
            correlation(u1_vector, u2_vector),
            chebyshev(u1_vector, u2_vector),
            braycurtis(u1_vector, u2_vector),
            canberra(u1_vector, u2_vector),
            cityblock(u1_vector, u2_vector),
            sqeuclidean(u1_vector, u2_vector)
        ]])

        i_feature.to_csv('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
                         str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature',\
                         index = False, header = None, mode = 'a')
Example #29
    def make_prediction(self, img1, img2):
        image1_features = self.get_features(img1)
        image2_features = self.get_features(img2['base64'])

        br = braycurtis(image1_features, image2_features)

        #concatenate features of two images and reshape to two dimensional matrix
        features = np.hstack((image1_features, image2_features))
        br_features = np.hstack((features, br)).reshape(1, 805)
        proba = self.xgboost_model.predict_proba(br_features).tolist()[0]

        return {'name': img2['name'], 'different': proba[0], 'same': proba[1]}
Example #30
    def run_on(self, df_run):
        if self.col1 not in dicts:
            self.dict1 = self.pd.read_csv(workdir+'dict_'+self.col1+'.csv', dtype={'value': object}).set_index('key')["value"].to_dict()
        else:
            self.dict1 = {v:k for k,v in dicts[self.col1].items()} # make key=number, value=string
        if self.col2 not in dicts:
            self.dict2 = self.pd.read_csv(workdir+'dict_'+self.col2+'.csv', dtype={'value': object}).set_index('key')["value"].to_dict()
        else:
            self.dict2 = {v:k for k,v in dicts[self.col2].items()} # make key=number, value=string
            
        self.dfx = self.pd.DataFrame()
        self.dfx[self.col1] = df_run[self.col1].map(self.dict1)
        self.dfx[self.col2] = df_run[self.col2].map(self.dict2)

        block = int(len(df_run)/50)
        i = 0

        for index, row in self.dfx.iterrows():
            i+=1
            if type(row[self.col1])==str:
                sline1 = self.func(row[self.col1])
            else:
                sline1 = ''
            if type(row[self.col2])==str:
                sline2 = self.func(row[self.col2])
            else:
                sline2 = ''

            wta = word_tokenize(sline1.lower())
            wtb = word_tokenize(sline2.lower())
            s2v_a = self.sent2vec(wta)
            s2v_b = self.sent2vec(wtb)

            # DataFrame.set_value was removed in pandas 1.0; .at is the supported equivalent
            df_run.at[index, self.fldprefix + '_1'] = self.wmd(sline1, sline2)
            df_run.at[index, self.fldprefix + '_2'] = self.norm_wmd(sline1, sline2)
            df_run.at[index, self.fldprefix + '_3'] = cosine(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_4'] = cityblock(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_5'] = jaccard(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_6'] = canberra(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_7'] = euclidean(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_8'] = minkowski(s2v_a, s2v_b, 3)
            df_run.at[index, self.fldprefix + '_9'] = braycurtis(s2v_a, s2v_b)
            df_run.at[index, self.fldprefix + '_10'] = skew(s2v_a)
            df_run.at[index, self.fldprefix + '_11'] = skew(s2v_b)
            df_run.at[index, self.fldprefix + '_12'] = kurtosis(s2v_a)
            df_run.at[index, self.fldprefix + '_13'] = kurtosis(s2v_b)


            if i>=block and block>=1000:
                i=0
                print (index)

        df_run[[self.fldprefix + '_3',self.fldprefix + '_5',self.fldprefix + '_9']]=df_run[[self.fldprefix + '_3',self.fldprefix + '_5',self.fldprefix + '_9']].fillna(value=1.0)
Example #31
def compute_distance(net, img1, img2):
    id1 = get_scores(net, img1)
    id1 = np.mean(id1, axis=0)
    id1_norm = id1 / np.linalg.norm(id1)
    id2 = get_scores(net, img2)
    id2 = np.mean(id2, axis=0)

    id2_norm = id2 / np.linalg.norm(id2)
    comp_dist = ssd.braycurtis(id1_norm, id2_norm)
    print(comp_dist)
    dist_eucl = ssd.euclidean(id1_norm, id2_norm)
    dist_cosine = ssd.cosine(id1_norm, id2_norm)
    return comp_dist, dist_cosine, dist_eucl
Example #32
    def score_braycurtis(self, term1, term2, **kwargs):

        """
        Compute a weighting score based on the Bray-Curtis distance between
        the kernel density estimates of two terms.
        :param term1: The first term.
        :param term2: The second term.
        """

        t1_kde = self.kde(term1, **kwargs)
        t2_kde = self.kde(term2, **kwargs)

        return 1-distance.braycurtis(t1_kde, t2_kde)
Example #33
def BrayCurtis(X):
    '''
    compute Bray-Curtis dissimilarity.
    Args:
      X: input N x K data matrix. N ... the number of samples, K ... the number of features.
    Return:
      N x N data matrix. The value of (i,j) shows the distance between sample-i and sample-j.
    '''
    import itertools
    from scipy.spatial.distance import braycurtis, squareform
    X = np.array(X)
    n_samples = X.shape[0]
    n_distance = n_samples * (n_samples - 1) // 2  # integer division: number of pairs
    d_array = np.zeros(n_distance)
    for i, (idx1, idx2) in enumerate(itertools.combinations(range(n_samples),2)):
        d_array[i] = braycurtis(X[idx1], X[idx2])
    return squareform(d_array)
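# Usage sketch for the matrix version above (illustrative; assumes numpy is
# imported as np at module level, the rest is imported inside the function):
X = [[1.0, 2.0, 3.0],
     [3.0, 2.0, 1.0],
     [1.0, 2.0, 3.0]]
D = BrayCurtis(X)
# D is a symmetric 3 x 3 matrix with a zero diagonal;
# D[0, 1] ~ 0.333, and D[0, 2] == 0.0 because rows 0 and 2 are identical.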
Example #34
File: diff.py, Project: davidmcclure/tdiff
    def kde_best_match(self, n=500, show_matches=False, **kwargs):

        """
        For each term in text 1, find the term in text 2 with the most similar
        pattern of distribution.

        Args:
            n (int): Consider N most-frequent words.
            show_matches (bool): Show identity (A -> A) matches.

        Returns:
            list: Tuples of (t1 term, t2 term, weight).
        """

        mft1 = self.text1.most_frequent_terms(n)
        mft2 = self.text2.most_frequent_terms(n)

        # For each term in text 1.
        links = []
        for t1 in mft1:

            # Score against each term in text 2.
            scores = []
            for t2 in mft2:

                t1_kde = self.text1.kde(t1, **kwargs)
                t2_kde = self.text2.kde(t2, **kwargs)

                score = 1-distance.braycurtis(t1_kde, t2_kde)
                scores.append((t2, score))

            # Get the nearest neighbor.
            scores = sorted(scores, key=lambda x: x[1], reverse=True)
            t2 = scores[0][0]

            if show_matches or t1 != t2:
                links.append((
                    self.text1.unstem(t1),
                    self.text2.unstem(t2),
                    scores[0][1]
                ))

        # Sort strongest -> weakest.
        links = sorted(links, key=lambda x: x[2], reverse=True)

        return links
Example #35
File: text.py, Project: ChengQikai/textplot
    def score_braycurtis(self, term1, term2, **kwargs):

        """
        Compute a weighting score based on the Bray-Curtis distance between
        the kernel density estimates of two terms.

        Args:
            term1 (str)
            term2 (str)

        Returns: float
        """

        t1_kde = self.kde(term1, **kwargs)
        t2_kde = self.kde(term2, **kwargs)

        return 1-distance.braycurtis(t1_kde, t2_kde)
Example #36
def wvBray(a):
	return [distance.braycurtis(x[0], x[1]) for x in a]
Example #37
File: metric.py, Project: dridon/aml2
def metric_braycurtis_2(i, j):
    return dist.braycurtis(i, j)  
Example #38
ATvectorizer=TfidfVectorizer()

all_questions=[d['question'] for d in data]
all_questions=[userinput]+all_questions
all_answers=[d['answer'] for d in data]
all_answers=[userinput]+all_answers

QuestionTVectorArray=QTvectorizer.fit_transform(all_questions)
AnswerTVectorArray=ATvectorizer.fit_transform(all_answers)

#print "question cosine similairity-->",cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
#print "answer cosine similarity-->",cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
Qcosines=cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
Acosines=cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)

Qbray=[dist.braycurtis(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Abray=[dist.braycurtis(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcanberra=[dist.canberra(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acanberra=[dist.canberra(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qhamming=[dist.hamming(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Ahamming=[dist.hamming(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcorrelation=[dist.correlation(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acorrelation=[dist.correlation(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qcityblock=[dist.cityblock(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acityblock=[dist.cityblock(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]

Qdice=[dist.dice(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Example #39
def braycurtis(pair):
    # Python 2 tuple-parameter unpacking is not valid syntax in Python 3
    x, y = pair
    return distance.braycurtis(x, y)
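# The original Python 2 tuple parameter suggests this wrapper was meant to be mapped
# over zipped vector pairs (e.g. with multiprocessing.Pool.map); a usage sketch under
# that assumption, with illustrative data:
import numpy as np

v1s, v2s = np.random.rand(4, 8), np.random.rand(4, 8)
dists = list(map(braycurtis, zip(v1s, v2s)))   # one Bray-Curtis value per row pair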
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)

data.to_csv('data/quora_features.csv', index=False)
Example #41
File: cortest.py, Project: Klim314/twomat
def test(lst1,lst2):
	print("PEARSON: ", str(pearsonr(lst1, lst2)))
	print("SPEARMAN: ", str(spearmanr(lst1, lst2)))
	print("BRAYCURTIS: ", str(braycurtis(lst1, lst2)))
	print("KULLMANLEIBER: ", str(entropy(lst1, lst2)))