Example #1
    def _analyse_second_cossim(self, queries, normed_embs, nodes, k, pair):
        """
        This function is called in the multiprocessing of the second order cosine similarity.
        """

        # Convert the indices of nearest neighbors back into numpy
        indices_0 = np.asarray(queries[pair[0]])
        indices_1 = np.asarray(queries[pair[1]])

        # Convert the embeddings and nodes back into numpy
        norm_emb_0 = np.asarray(normed_embs[pair[0]])
        norm_emb_1 = np.asarray(normed_embs[pair[1]])
        nodes = np.asarray(nodes)

        # Compute the second order cosine similarity
        pair_results = []
        for i in range(len(nodes)):
            # Build the set of nearest neighbors w.r.t. both embeddings
            # Use indices from 1 to k+1, because the first entry will always be the node itself
            neighbors_union = np.union1d(indices_0[i, 1:(k + 1)],
                                         indices_1[i, 1:(k + 1)])

            # Vectors of cosine similarity values of nearest neighbors,
            # one per embedding, flattened to 1-D so np.dot yields a scalar
            m0 = np.squeeze(cos_sim(norm_emb_0[neighbors_union],
                                    norm_emb_0[nodes[i]].reshape(1, -1)))
            m1 = np.squeeze(cos_sim(norm_emb_1[neighbors_union],
                                    norm_emb_1[nodes[i]].reshape(1, -1)))

            # Cosine similarity between the two similarity vectors
            pair_results.append(
                float(
                    np.dot(m0, m1) /
                    (np.linalg.norm(m0) * np.linalg.norm(m1))))
        return pair_results
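A minimal standalone sketch of the same second-order cosine similarity computation on made-up embeddings (the data, k, and node index below are illustrative only):

# Hedged sketch: second-order cosine similarity for a single node on toy data.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

rng = np.random.default_rng(0)
emb_a = rng.normal(size=(10, 4))   # toy embedding A: 10 nodes, 4 dims
emb_b = rng.normal(size=(10, 4))   # toy embedding B
node, k = 0, 3

# k nearest neighbours of `node` in each embedding (excluding the node itself)
nn_a = np.argsort(-cos_sim(emb_a[node].reshape(1, -1), emb_a).ravel())[1:k + 1]
nn_b = np.argsort(-cos_sim(emb_b[node].reshape(1, -1), emb_b).ravel())[1:k + 1]
union = np.union1d(nn_a, nn_b)

# similarity vectors of the node to the union of neighbours, one per embedding
m0 = cos_sim(emb_a[union], emb_a[node].reshape(1, -1)).ravel()
m1 = cos_sim(emb_b[union], emb_b[node].reshape(1, -1)).ravel()

# second-order similarity = cosine similarity between the two similarity vectors
second_order = float(np.dot(m0, m1) / (np.linalg.norm(m0) * np.linalg.norm(m1)))
print(second_order)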
Example #2
def next_prob(h, h_next, window, sim, w2v):
    h_next_words = [''.join(w)
                    for w in h_next.seg['SEG']]  # ['abc', 'de', 'f']
    if h.m == h_next.m:
        return h.prob
    else:
        prob_i = []
        for i in range(1, h_next.m - h.m + 1):
            prob_j = []
            center = h.m - 1 + i
            for j in range(1, min(window + 1, center + 1)):
                # print(center, j, h_next_words[center], h_next_words[center - j])
                try:
                    prob_j.append(sim['|'.join(
                        sorted(
                            [h_next_words[center],
                             h_next_words[center - j]]))])
                except Exception:
                    prob_j.append(
                        float(
                            cos_sim(
                                np.asarray(w2v[h_next_words[center]],
                                           dtype='float32').reshape(1, -1),
                                np.asarray(w2v[h_next_words[center - j]],
                                           dtype='float32').reshape(1, -1))))
            prob_j = sum(prob_j) / len(prob_j)
            prob_i.append(prob_j)
        return ((h.m - 1) * h.prob + sum(prob_i)) / (h_next.m - 1)
Example #3
def similarity(X, Y=None, Slice=None):
    '''
    Pairwise cosine similarity between the rows of X and the rows of Y.

    Parameters
    ----------
    X : DataFrame
        Rows to compare.
    Y : DataFrame, optional
        Rows to compare against. The default is None, meaning X is compared
        with itself.
    Slice : index, optional
        Slice of single table for memory management purposes. The default is None.

    Returns
    -------
    DataFrame
        Pairwise cosine similarities, indexed by the rows of X.

    '''
    if Y is None:
        Y = X
    index = X.index
    columns = Y.index  # cos_sim compares rows, so label columns by the rows of Y
    if Slice is not None:
        # NOTE: `text_vec` is not defined in this snippet; it comes from the surrounding source
        Y = [text_vec.T[Slice]]
        columns = [Slice]

    out = pd.DataFrame(
        data=cos_sim(X, Y),
        index=index,
        columns=columns)
    return out
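A short usage sketch for the helper above on toy data; it assumes the function (with its cos_sim and pandas imports) is already defined, and the row labels below are made up:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

X = pd.DataFrame(np.random.rand(3, 5), index=['doc1', 'doc2', 'doc3'])
sims = similarity(X)          # Y omitted, so X is compared with itself
print(sims.round(3))          # 3 x 3 DataFrame labelled by the rows of X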
Example #4
def get_cos_sim_list(model, patient_word_list, collocation_function=None):
    """
    calculates pairwise cosine similarities of words or collocations in a word list

    cosine similarity for a word pair that has a collocation is calculated as
    cosine similarity between a word vector and an average of word vectors from the collocation

    :param model: gensim.word2vec model
    :param patient_word_list: list of strings, words produced by the patient
    :param collocation_function: function combining two word vectors in a collocation, if None (default) mean is taken
    :return patient_cos_sim_list: list of float, pairwise cosine similarities
    :return not_found: int, number of words missing from the model vocabulary
    """
    patient_cos_sim_list = []
    not_found = 0
    for j, word in enumerate(patient_word_list):
        if j > 0:
            previous_word = patient_word_list[j - 1]
            word_vector, nf = collocation_handler(model, word,
                                                  collocation_function)
            not_found += nf
            previous_word_vector, nf = collocation_handler(
                model, previous_word, collocation_function)
            not_found += nf
            if word_vector and previous_word_vector:
                patient_cos_sim_list.append(
                    cos_sim(word_vector, previous_word_vector))
            else:
                continue
    not_found = math.ceil(not_found / 2)
    return patient_cos_sim_list, not_found
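A standalone sketch of the consecutive-word similarity idea with a toy vector lookup standing in for the gensim model (all words and vectors below are made up, and collocation handling is ignored):

# Hedged sketch: cosine similarity between consecutive words, toy vectors only.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

toy_vectors = {
    'dog': np.array([1.0, 0.2, 0.0]),
    'cat': np.array([0.9, 0.3, 0.1]),
    'car': np.array([0.0, 0.1, 1.0]),
}
words = ['dog', 'cat', 'car']
sims = [
    float(cos_sim(toy_vectors[a].reshape(1, -1),
                  toy_vectors[b].reshape(1, -1))[0][0])
    for a, b in zip(words, words[1:])
]
print(sims)   # e.g. high for dog/cat, low for cat/car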
Example #5
    def evaluateSimilarity(self, corpusFileName, outputFileName):
        print("Evaluating Word Similarity")
        machine_scores = []
        human_scores = []
        not_found = 0
        words_not_found = []

        output = open(outputFileName, 'w')
        output.write("# Word 1\tWord 2\tHuman (mean)\tMachine\n")

        with open(corpusFileName) as corpus_lines:
            for corpus_line in corpus_lines:
                if corpus_line[0] == "#":
                    continue

                # Reading word from the corpus
                line = {}
                line['tag'], line['word_1'], line['word_2'], line[
                    'human_score'] = corpus_line.rstrip().split('\t')

                # Retrieving the vectors of the words
                if line['word_1'] not in self.glove:
                    not_found += 1
                    words_not_found.append(line['word_1'])
                    continue

                if line['word_2'] not in self.glove:
                    not_found += 1
                    words_not_found.append(line['word_2'])
                    continue

                word1_vec = np.array(self.glove[line['word_1']])
                word2_vec = np.array(self.glove[line['word_2']])

                # Computing the score based on the two vectors
                machine_score = cos_sim(word1_vec.reshape(1, -1),
                                        word2_vec.reshape(1, -1))[0][0] * 10

                machine_scores.append(machine_score)

                # Human score
                human_scores.append(float(line['human_score']))

                # Write the pair, the human score, and the word-embedding score to the output file.
                o = '\t'.join([
                    line['tag'], line['word_1'], line['word_2'],
                    line['human_score'],
                    str(round(machine_score, 4))
                ])
                output.write(o + '\n')

        # Evaluate score - compute correlation of the two scores
        evaluation = correlation(human_scores, machine_scores)
        evaluation = round(evaluation[0], 4)
        output.write("# Correlation = " + str(evaluation) + "\n")
        output.close()
        print("Evaluation complete.")
        return evaluation
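A hedged standalone sketch of the score-then-correlate step on made-up word pairs; scipy.stats.spearmanr stands in for the snippet's correlation helper (an assumption, the original may use a different correlation measure):

# Hedged sketch: cosine-based machine scores vs. human ratings, then rank correlation.
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

toy_glove = {                       # made-up 3-d "GloVe" vectors
    'king':  np.array([0.8, 0.1, 0.3]),
    'queen': np.array([0.7, 0.2, 0.4]),
    'apple': np.array([0.1, 0.9, 0.2]),
    'fruit': np.array([0.2, 0.8, 0.3]),
}
pairs = [('king', 'queen', 9.1), ('apple', 'fruit', 8.5), ('king', 'apple', 1.2)]

machine = [cos_sim(toy_glove[a].reshape(1, -1),
                   toy_glove[b].reshape(1, -1))[0][0] * 10 for a, b, _ in pairs]
human = [h for _, _, h in pairs]
rho, _ = spearmanr(human, machine)
print(round(rho, 4))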
Example #6
def chapter10(in_data, out_path):
    model = word2vec.load('out90.bin')
    with open(out_path, 'w') as f_out:
        for a, b, _ in in_data:
            try:
                cs = cos_sim([model[a]], [model[b]])[0][0]
            except Exception as e:
                cs = -1
            print(f'{a} {b} {_} {cs:f}', file=f_out)
Example #7
def similarite_offsets(list_offsets):
    sim_offsets = []
    for i in range(len(list_offsets)):
        sim_offsets.append([])
        list_tuples = list(list_offsets[i])
        for j in range(len(list_tuples)):
            for k in range(j+1,len(list_tuples)):
                sim_offsets[-1].append(cos_sim([list_tuples[j]], [list_tuples[k]])[0][0])
    return(np.array(sim_offsets))
Example #8
def mutual_information_similarity(file_name):
    """
    Calculates MI between all pairs of short_genre based on their word's MI.

    Prints to file the similarity

    :return:
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))  # a type

    # fetch all short genres
    mi_coll = MutualInformation()
    # all possible pairs of genre with no repeat
    genres = []

    # calculate cosine similarity b/w pairs
    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        if genre not in genres:
            genres.append(genre)

        new_bow = {}

        for k in bow.keys():

            curr = bow[k]
            if math.isnan(curr) or math.isinf(curr):
                # Replace NaN/inf scores with 0 and log the elimination
                new_bow[k] = 0
                print("Eliminated element")
            else:
                new_bow[k] = curr

        return new_bow

    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow) for mi_obj in mi_coll.iterable()
    )

    print("Done with making vector")
    # sort the pairs by the cosine similarity score
    similarity_matrix = cos_sim(bow_matrix)

    print("Done with similarity calculation")
    sorted_list = []
    # sort the similarity scores
    for x, y in itertools.combinations(range(0, len(genres)), 2):
        sorted_list.append(SimilarityScore(genres[x], genres[y], similarity_matrix[x][y]))
    # sort!
    sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as file:
        for l in sorted_list:
            file.write("{}, {} value: {}\n".format(l[0], l[1], l[2]))
Example #9
def store_sim(w2v, coo, sim_path):
    print("Storing cosine similarity of co-occurring word pairs ...")
    sim_dic = {}
    for keys in tqdm(coo):
        x_1, x_2 = w2v[keys.split('|')[0]], w2v[keys.split('|')[1]]
        sim_dic[keys] = float(cos_sim(x_1, x_2))
    with open(sim_path, 'w') as f:
        json.dump(sim_dic, f)
    print("Cosine similarity stored in {}".format(sim_path))
    return sim_dic
Example #10
def chapter09(in_data, out_path):
    ft = load('ft')
    t2i = {token: i for i, token in enumerate(ft)}
    vec = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']
    with open(out_path, 'w') as f_out:
        for a, b, _ in in_data:
            try:
                cs = cos_sim([vec[t2i[a]]], [vec[t2i[b]]])[0][0]
            except Exception as e:
                cs = -1
            print(f'{a} {b} {_} {cs:f}', file=f_out)
Example #11
    def _analyse_angle_divergence(self, queries, normed_embs, nodes, k, pair):
        """
        This function is called in the multiprocessing of the k-NN angle divergence.
        """

        # Convert the indices of nearest neighbors back into numpy
        indices_0 = np.asarray(queries[pair[0]])
        indices_1 = np.asarray(queries[pair[1]])

        # Convert the embeddings and nodes back into numpy
        norm_emb_0 = np.asarray(normed_embs[pair[0]])
        norm_emb_1 = np.asarray(normed_embs[pair[1]])
        nodes = np.asarray(nodes)

        # Compute the k-NN angle divergence
        pair_results = []
        for i in range(len(nodes)):
            # Build the set of nearest neighbors w.r.t. both embeddings
            # Use indices from 1 to k+1, because the first entry will always be the node itself
            neighbors_union = np.union1d(indices_0[i, 1:(k + 1)],
                                         indices_1[i, 1:(k + 1)])

            # Vectors of cosine similarity values of nearest neighbors
            cossim_vec0 = np.squeeze(
                cos_sim(norm_emb_0[neighbors_union],
                        norm_emb_0[nodes[i]].reshape(1, -1)))
            cossim_vec1 = np.squeeze(
                cos_sim(norm_emb_1[neighbors_union],
                        norm_emb_1[nodes[i]].reshape(1, -1)))

            # clip cossim values to feasible interval, which it might leave due to numerical issues
            cossim_vec0 = np.clip(cossim_vec0, a_min=-1, a_max=1)
            cossim_vec1 = np.clip(cossim_vec1, a_min=-1, a_max=1)

            # convert to degrees
            m0 = np.degrees(np.arccos(cossim_vec0))
            m1 = np.degrees(np.arccos(cossim_vec1))

            # Mean absolute angle difference between the two neighborhood similarity vectors
            pair_results.append(np.mean(np.abs(m0 - m1)))
        return pair_results
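A standalone sketch of the angle-divergence step for a single node, on made-up embeddings and a made-up neighbour set:

# Hedged sketch: k-NN angle divergence for one node on toy vectors.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

rng = np.random.default_rng(1)
emb_a = rng.normal(size=(6, 3))            # toy embedding A
emb_b = rng.normal(size=(6, 3))            # toy embedding B
node, neighbours = 0, np.array([1, 2, 4])  # made-up neighbour set

cos_a = np.clip(cos_sim(emb_a[neighbours], emb_a[node].reshape(1, -1)).ravel(), -1, 1)
cos_b = np.clip(cos_sim(emb_b[neighbours], emb_b[node].reshape(1, -1)).ravel(), -1, 1)

# mean absolute difference of the neighbour angles, in degrees
divergence = np.mean(np.abs(np.degrees(np.arccos(cos_a)) - np.degrees(np.arccos(cos_b))))
print(round(float(divergence), 2))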
Example #12
def calc_sim(dic, smiles_0, smiles_1, func, pickle_dic, conf_type, fp_kwargs):
    """
    Calculate the similarity between conformers of two different species.
    Args:
        dic (dict): prediction dictionary
        smiles_0 (str): first SMILES string
        smiles_1 (str): second SMILES string
        func (callable): actual external fingerprinting function
        pickle_dic (dict): dictionary of the form {smiles:
            full_pickle_path} for each smiles
        conf_type (str): whether you're comparing conformers picked
            randomly for each species or based on their attention weight.
        fp_kwargs (dict): any keyword arguments you may need for your
            fingerprinting function.
    Returns:
        sim (float): cosine similarity between two conformers, one from
            each species.

    """

    sub_dic_0 = dic[smiles_0]
    sub_dic_1 = dic[smiles_1]

    if func is not None:
        paths = [pickle_dic[smiles_0], pickle_dic[smiles_1]]
        fp_0_choices, fp_1_choices = choices_from_pickle(paths)
    else:
        fp_0_choices = sub_dic_0["conf_fps"]
        fp_1_choices = sub_dic_1["conf_fps"]

    if conf_type == "att":

        conf_0_idx = sub_dic_0["max_weight_conf"]
        conf_1_idx = sub_dic_1["max_weight_conf"]

        fp_0 = fp_0_choices[conf_0_idx]
        fp_1 = fp_1_choices[conf_1_idx]

    elif conf_type == "random":
        fp_0 = random.choice(fp_0_choices)
        fp_1 = random.choice(fp_1_choices)

    fps = [fp_0, fp_1]
    for j, fp in enumerate(fps):
        if fp_kwargs is None:
            fp_kwargs = {}
        if isinstance(fp, Chem.rdchem.Mol):
            fps[j] = func(fp, **fp_kwargs)

    sim = cos_sim(fps[0].reshape(1, -1), fps[1].reshape(1, -1)).item()

    return sim
Example #13
def plot_sims(
        W,  # (n_samples, n_features)
        points,
        labels,
        title=None):
    im = plt.imshow(cos_sim(W), vmin=-1.0, vmax=1.0, cmap='seismic')  #'hot')
    im.axes.xaxis.tick_top()
    plt.colorbar()
    plt.xticks(points, labels, rotation='vertical', verticalalignment='bottom')
    plt.yticks(points, labels)
    if title:
        plt.xlabel(title)
    plt.show()
Example #14
def get_cosine(input_list, y_train):
    X_train, X_test = input_list
    if type(X_train) is sparse.csr.csr_matrix:
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    n_samples = X_train.shape[0]
    n_categs = len(np.unique(y_train))
    kfolds = StratifiedKFold(y_train, 4)
    X_train_features = np.zeros([n_samples, n_categs])
    for train, test in kfolds:
        X1 = X_train[train, :]
        y1 = y_train[train]
        X2 = X_train[test, :]
        temp = pd.DataFrame(np.c_[y1.reshape(-1, 1), X1])
        m = np.array(temp.groupby(0).mean())
        X_train_features[test, :] = cos_sim(X2, m)

    temp = pd.DataFrame(np.c_[y_train, X_train])
    m = np.array(temp.groupby(0).mean())
    features_euc = [X_train_features, cos_sim(X_test, m)]
    return features_euc
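The helper above appears to rely on an older scikit-learn cross-validation call (StratifiedKFold(y, 4)); a rough standalone sketch of the same out-of-fold class-centroid cosine features with the current API, on made-up data, might look like this:

# Hedged sketch: out-of-fold cosine similarity of each sample to per-class means.
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

rng = np.random.default_rng(0)
X = rng.random((40, 6))
y = np.arange(40) % 3                      # 3 balanced toy classes

features = np.zeros((len(X), len(np.unique(y))))
for train_idx, test_idx in StratifiedKFold(n_splits=4).split(X, y):
    frame = pd.DataFrame(X[train_idx])
    centroids = frame.groupby(y[train_idx]).mean().values   # per-class mean vectors
    features[test_idx] = cos_sim(X[test_idx], centroids)
print(features[:3].round(3))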
Example #15
 def thread(self, analogy):
     a, b, c = analogy
     a_, b_, c_ = self.__getVectors(a), self.__getVectors(
         b), self.__getVectors(c)
     d_ = b_ - a_ + c_
     d = ""
     max_score = 0
     for i in self.glove:
         if i == a or i == b or i == c:
             continue
         score = cos_sim(d_,
                         np.array(self.glove[i]).reshape(1, -1))[0][0] * 10
         if score > max_score:
             max_score = score
             d = i
     return d, max_score
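A standalone sketch of the same b - a + c analogy search over a tiny made-up vector table (the words and vectors are illustrative only):

# Hedged sketch: nearest word to b - a + c by cosine similarity, toy vectors.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

toy_vectors = {
    'man':   np.array([0.9, 0.1, 0.0]),
    'woman': np.array([0.8, 0.3, 0.0]),
    'king':  np.array([0.9, 0.1, 0.9]),
    'queen': np.array([0.8, 0.3, 0.9]),
}
a, b, c = 'man', 'woman', 'king'
target = toy_vectors[b] - toy_vectors[a] + toy_vectors[c]

best, best_score = None, -np.inf
for word, vec in toy_vectors.items():
    if word in (a, b, c):
        continue
    score = cos_sim(target.reshape(1, -1), vec.reshape(1, -1))[0][0]
    if score > best_score:
        best, best_score = word, score
print(best, round(float(best_score), 4))     # expected: queen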
Example #16
    def rating_recommender(self, user):
        similarity_matrix = cos_sim(self.ratings_matrix)
        prediction_matrix = np.zeros(self.ratings_matrix.shape)
        # Indices of the 30 most similar users (excluding the user itself)
        index_top30 = np.argsort(similarity_matrix[:, user])[-2:-30 - 2:-1]
        for item in range(self.ratings_matrix.shape[1]):
            if self.ratings_matrix[user][item] == 0:
                # Denominator is the sum of similarity for each user with its top 30 users:
                denom = np.sum(similarity_matrix[user, :][index_top30])

                # Numerator
                numer = similarity_matrix[user, :][index_top30].dot(
                    self.ratings_matrix[:, item][index_top30])

                prediction_matrix[user, item] = numer / denom

        movie_ids = [i for i in np.argsort(prediction_matrix[user, :])[-30:]]
        return movie_ids
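A minimal sketch of the same user-based prediction idea on a toy ratings matrix (values and the neighbour count below are made up):

# Hedged sketch: predict one unrated item for one user from similar users.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

ratings = np.array([[5, 3, 0, 1],      # rows = users, columns = items, 0 = unrated
                    [4, 0, 0, 1],
                    [1, 1, 0, 5],
                    [1, 0, 0, 4],
                    [0, 1, 5, 4]], dtype=float)
user, item = 0, 2
sims = cos_sim(ratings)                           # user x user similarities
neighbours = np.argsort(sims[user])[-2:-5:-1]     # a few most similar other users
pred = sims[user, neighbours].dot(ratings[neighbours, item]) / sims[user, neighbours].sum()
print(round(float(pred), 3))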
Example #17
def get_graph(df):
    embeddings = None
    use_module = hub.Module(USE_MODEL_PATH)
    df['text'] = df['title'] + ' ' + df['summary']
    df = df[df['text'].apply(lambda x: isinstance(x, str) and len(x) >=
                             MINIMUM_CHARACTER_THRESHOLD)]
    df['text'] = df['text'].apply(lambda x: x[:MAXIMUM_CHARACTER_THRESHOLD])
    df = df.reset_index(drop=True)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings_tf = use_module(df['text'].values)
        embeddings = sess.run(embeddings_tf)
    similarities = cos_sim(embeddings)
    edges = np.argwhere(similarities >= SIMILARITY_THRESHOLD)
    weights = [(u, v, similarities[u, v]) for u, v in edges]
    weights.sort(key=itemgetter(2), reverse=True)
    return weights
Example #18
def fetch_list(movie_user_likes):
    ##Step 1: Read CSV File
    #print(df.head())

    #print(df.columns)
    ##Step 2: Select Features
    features = ['keywords', 'cast', 'genres', 'director']
    ##Step 3: Create a column in DF which combines all selected features
    for feature in features:
        df[feature] = df[feature].fillna('')

    df["combined"] = df.apply(combine_features, axis=1)

    #print(df["combined"].head())

    ##Step 4: Create count matrix from this new combined column
    cv = CountVectorizer()

    count = cv.fit_transform(df["combined"])

    ##Step 5: Compute the Cosine Similarity based on the count_matrix
    similarity_score = cos_sim(count)
    #print(similarity_score)

    #movie_user_likes = "Avatar"

    ## Step 6: Get index of this movie from its title
    index = get_index_from_title(movie_user_likes)

    ## Step 7: Get a list of similar movies in descending order of similarity score
    movies_to_recommend_scores = list(similarity_score[index])

    numbers = list(range(len(movies_to_recommend_scores)))
    result = dict(zip(numbers, movies_to_recommend_scores))
    sorted_keys = sorted(result, key=result.get)
    sorted_keys = sorted_keys[::-1]

    ## Step 8: Collect the titles of the top similar movies (skipping the queried movie itself)
    movies_to_recommend_list = sorted_keys[1:10]

    movies_to_recommend = []
    for i in movies_to_recommend_list:
        movies_to_recommend.append(get_title_from_index(i))

    return movies_to_recommend
Example #19
    def sentence_similarity_with_all_sentence(sentence_bow,document_bow):
        """
        Attribute 5: Similarity between each sentence and all the other sentences in the model

        Difference between sentence s and all text in the page

        First, convert the sentence and target sentence to TF-IDF.

        Then, get the cosine similarity.
        :param sentence_bow: bag of words of the current sentence under consideration
        :param document_bow: bag of words of all other sentences in the document
        :return:
        """

        s_cosine_sim=functools.reduce(lambda mean,curr_other_sentence:
                         mean+cos_sim(sentence_bow,curr_other_sentence)/len(document_bow),
                         document_bow,0)
        return s_cosine_sim
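A hedged standalone sketch of the same idea, averaging the cosine similarity of one sentence's TF-IDF vector against the rest of a toy document (TfidfVectorizer is an assumption; the original presumably builds its bags of words elsewhere):

# Hedged sketch: mean cosine similarity of one sentence to all other sentences.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

sentences = ['the cat sat on the mat',
             'a cat was sitting on a mat',
             'stock markets fell sharply today']
tfidf = TfidfVectorizer().fit_transform(sentences)

sentence_bow = tfidf[0]            # sentence under consideration
document_bow = tfidf[1:]           # all other sentences
mean_sim = cos_sim(sentence_bow, document_bow).mean()
print(round(float(mean_sim), 3))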
Example #20
def search(vectorizer, index_matrix, query=''):
    """

    :param vectorizer: CountVectorizer or TfIdfVectorizer
    :param index_matrix: tf-idf array
    :param query: string
    :return: an array of document paths, sorted by relevance
    """
    if query != '':
        clean_req = [preproc_req(query)]
        with open('files_index.txt', 'r') as f_idx:
            paths = f_idx.read().strip('\n').split('\n')
        q = vectorizer.transform(clean_req).toarray().reshape(1, -1)
        rel_dict = defaultdict()
        for i in range(len(index_matrix)):
            rel_dict[paths[i]] = cos_sim(index_matrix[i].reshape(1, -1), q)
        result = sorted(rel_dict, key=rel_dict.get, reverse=True)
        print(result)
        return search(
            vectorizer, index_matrix,
            input('Enter a query or press Enter to finish'))
    else:
        return None
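A self-contained sketch of the same query-ranking pattern on a made-up corpus (documents, paths, and query are illustrative; TfidfVectorizer stands in for whatever vectorizer the original uses):

# Hedged sketch: rank documents against a query by cosine similarity of TF-IDF vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

docs = ['the cat sat on the mat', 'dogs chase cats', 'stock markets fell today']
paths = ['doc_cat.txt', 'doc_dog.txt', 'doc_finance.txt']

vectorizer = TfidfVectorizer()
index_matrix = vectorizer.fit_transform(docs).toarray()

query_vec = vectorizer.transform(['cat on a mat']).toarray().reshape(1, -1)
scores = {p: cos_sim(index_matrix[i].reshape(1, -1), query_vec)[0][0]
          for i, p in enumerate(paths)}
print(sorted(scores, key=scores.get, reverse=True))   # most relevant path first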
Example #21
def embed_drinks(corpus_path):
    """
    Embed recipe instruction corpus to pretrained fasttext model

    Params
    ----
    corpus_path: str    filepath to recipe instruction corpus to train and embed

    Returns
    ----
    pd.DataFrame    pandas dataframe that contains cosine similarity of embedded drinks
    """
    df = pd.read_csv("../data/recipe_cleaned_v1.csv", index_col=0, dtype=str)
    df = df.fillna("0")
    
    # train embeddings on the supplied instruction corpus
    model = fasttext.train_unsupervised(corpus_path)
    embedded_drinks = [model.get_word_vector(x) for x in df.columns]

    # compute cosine similarity between drinks
    sim_matrix = pd.DataFrame(cos_sim(embedded_drinks), columns=phrase_to_word(list(df.columns)),
                            index=phrase_to_word(list(df.columns)))

    return sim_matrix
Example #22
def get_features(df_train, df_test):
    n_dep = len(np.unique(np.concatenate(
        [df_train['Department_'], df_test['Department_']])))
    n_fn = len(np.unique(np.concatenate(
        [df_train['FinelineNumber_'], df_test['FinelineNumber_']])))
    n_upc = len(np.unique(np.concatenate(
        [df_train['Upc_'], df_test['Upc_']])))

    # labels
    y_train = df_train.groupby(['VisitNumber_']).first()['TripType_']
    Y_train = pd.get_dummies(y_train).as_matrix()
    eps = 2**-52

    tfidf = TfidfTransformer(norm='l2', sublinear_tf=True, use_idf=True)
    # tfidf = TfidfTransformer(norm='l2', sublinear_tf=False, use_idf=True)

    n_br_fn = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'FinelineNumber_']).sum().reset_index()
        g['br'] = np.logical_and(
            g['ScanCount_binary'] > 0, g['ScanCount_binary_neg'] > 0)
        n_br_fn.append(
            g.groupby(['VisitNumber_']).sum().reset_index()['br'])

    n_br_upc = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Upc_', 'ScanCount_binary']).sum().reset_index()
        g['br'] = np.logical_and(
            g['ScanCount_binary'] > 0, g['ScanCount_binary_neg'] > 0)
        n_br_upc.append(
            g.groupby(['VisitNumber_']).sum().reset_index()['br'])

    b_bought = []
    n_bought = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_']).sum().reset_index()
        b_bought.append(g['ScanCount_binary'] > 0)
        n_bought.append(g['ScanCount_rect'])

    b_returned = []
    n_returned = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_']).sum().reset_index()
        b_returned.append(g['ScanCount_binary_neg'] > 0)
        n_returned.append(g['ScanCount_rect_neg'])

    # fn raw and tfidf
    fn = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'FinelineNumber_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['ScanCount_rect'], (g['VisitNumber_'], g['FinelineNumber_'])),
            shape=(n, n_fn), dtype='float64')
        fn.append(s)

    # upc raw and tfidf
    upc = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Upc_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['ScanCount_rect'], (g['VisitNumber_'], g['Upc_'])),
            shape=(n, n_upc), dtype='float64')
        upc.append(s)

    tfidf.fit(fn[0])
    fn_tfidf = []
    for sm in fn:
        fn_tfidf.append(tfidf.transform(sm))

    print('Getting dot product between mean fn and datasets')
    fn_dot = get_dot(fn, y_train)

    print('Getting dot product between mean fn_tfidf and datasets')
    fn_tfidf_dot = get_dot(fn_tfidf, y_train)

    tfidf.fit(upc[0])
    upc_tfidf = []
    for sm in upc:
        upc_tfidf.append(tfidf.transform(sm))

    print('Doing SVD on Fineline ScanCounts...')
    svd = TruncatedSVD(n_components=100)
    svd.fit(sparse.hstack([fn[0], upc[0]]))
    fnupc_red = []
    for sm1, sm2 in zip(fn, upc):
        fnupc_red.append(svd.transform(sparse.hstack([sm1, sm2])))

    print('Doing SVD on Fineline/UPC TFIDF ScanCounts...')
    svd = TruncatedSVD(n_components=1500)
    svd.fit(sparse.hstack([fn_tfidf[0], upc_tfidf[0]]))

    fnupc_tfidf_red = []
    for sm1, sm2 in zip(fn_tfidf, upc_tfidf):
        fnupc_tfidf_red.append(svd.transform(sparse.hstack([sm1, sm2])))

    print('Doing SVD on Fineline TFIDF ScanCounts...\n')
    svd = TruncatedSVD(n_components=100)
    svd.fit(fn_tfidf[0])
    fn_tfidf_red = []
    for sm in fn_tfidf:
        fn_tfidf_red.append(svd.transform(sm))

    fn_r = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'FinelineNumber_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 0]
        s = sparse.csr_matrix(
            (g['ScanCount_rect_neg'], (g['VisitNumber_'], g['FinelineNumber_'])),
            shape=(n, n_fn), dtype='float64')
        fn_r.append(s)

    tfidf.fit(fn_r[0])
    fn_r_tfidf = []
    for sm in fn_r:
        fn_r_tfidf.append(tfidf.transform(sm))

    print('Getting dot product between mean fn_r and datasets')
    fn_r_dot = get_dot(fn_r, y_train)

    print('Getting dot product between mean fn_r_tfidf and datasets')
    fn_r_tfidf_dot = get_dot(fn_r_tfidf, y_train)

    print('Doing SVD on Fineline Return TFIDF ScanCounts...')
    svd = TruncatedSVD(n_components=50)
    svd.fit(fn_r_tfidf[0])
    fn_r_tfidf_red = []
    for sm in fn_r_tfidf:
        fn_r_tfidf_red.append(svd.transform(sm))

    # #########################################
    print('Doing SVD on Fineline Difference ScanCounts...\n')
    diff_br = []
    diff_br.append(fn[0] - fn_r[0])
    diff_br.append(fn[1] - fn_r[1])
    svd = TruncatedSVD(n_components=100)
    svd.fit(diff_br[0])
    diff_br_red = []
    for sm in diff_br:
        diff_br_red.append(svd.transform(sm))

    # department total scan counts
    dep = []
    dep_p = []
    dep_entropy = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['ScanCount_rect'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep.append(s.toarray())

        m = s.toarray()
        p = m / np.sum(m, axis=1)[:, np.newaxis]
        p[np.isnan(p)] = 0
        entropy = -np.sum(p * np.log(p + eps), axis=1)
        dep_p.append(p)
        dep_entropy.append(entropy)

    tfidf.fit(dep[0])
    dep_tfidf = []
    for sm in dep:
        dep_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep[0].T, dep[0].T)
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep = [d.dot(sim_matrix) for d in dep]

    # dep = [np.log(i + 1) for i in dep]

    print('Getting dot product between mean dep and datasets')
    dep_dot = get_dot(dep, y_train)

    print('Getting dot product between mean dep_p and datasets')
    dep_p_dot = get_dot(dep_p, y_train)

    print('Getting dot product between mean dep_tfidf and datasets')
    dep_tfidf_dot = get_dot(dep_tfidf, y_train)

    print('Getting distances between mean dep and datasets')
    # dep_maha = get_mahalanobis(dep, y_train)
    dep_manh = get_manhattan(dep, y_train)

    print('Getting distances between mean dep_p and datasets')
    # dep_p_maha = get_mahalanobis(dep_p, y_train)
    dep_p_manh = get_manhattan(dep_p, y_train)

    print('Getting distances between mean dep_tfidf and datasets')
    # dep_tfidf_maha = get_mahalanobis(dep_tfidf, y_train)
    dep_tfidf_manh = get_manhattan(dep_tfidf, y_train)

    print('Getting euclidean for dep')
    dep_euclidean = get_euclidean(dep, y_train)

    print('Getting euclidean for dep_p')
    dep_p_euclidean = get_euclidean(dep_p, y_train)

    print('Getting euclidean for dep_tfidf\n')
    dep_tfidf_euclidean = get_euclidean(dep_tfidf, y_train)

    print('Getting cosine for dep')
    dep_cosine = get_cosine(dep, y_train)

    print('Getting cosine for dep_p')
    dep_p_cosine = get_cosine(dep_p, y_train)

    print('Getting cosine for dep_tfidf\n')
    dep_tfidf_cosine = get_cosine(dep_tfidf, y_train)

    enc = OneHotEncoder(n_values=n_dep)
    enc.fit(np.argmax(dep_p[0], axis=1).reshape(-1, 1))
    top_dep = []
    for m in dep_p:
        onehot = enc.transform(np.argmax(m, axis=1).reshape(-1, 1)).toarray()
        no_buy = m.sum(axis=1) == 0
        onehot[no_buy, :] = 0
        top_dep.append(onehot)

    dep_sorted = []
    dep_p_sorted = []
    for m1, m2 in zip(dep, dep_p):
        dep_sorted.append(np.sort(m1, axis=1)[:, -20:])
        dep_p_sorted.append(np.sort(m2, axis=1)[:, -20:])

    dep_sorted = [np.log(i + 1) for i in dep_sorted]

    print('Getting dot product between mean dep_sorted and datasets')
    dep_sorted_dot = get_dot(dep_sorted, y_train)

    print('Getting dot product between mean dep_p_sorted and datasets')
    dep_p_sorted_dot = get_dot(dep_p_sorted, y_train)

    print('Getting distances between mean dep_sorted and datasets')
    # dep_sorted_maha = get_mahalanobis(dep_sorted, y_train)
    dep_sorted_manh = get_manhattan(dep_sorted, y_train)

    print('Getting distances between mean dep_p_sorted and datasets')
    # dep_p_sorted_maha = get_mahalanobis(dep_p_sorted, y_train)
    dep_p_sorted_manh = get_manhattan(dep_p_sorted, y_train)

    print('Getting euclidean for dep_sorted')
    dep_sorted_euclidean = get_euclidean(dep_sorted, y_train)

    print('Getting euclidean for dep_p_sorted\n')
    dep_p_sorted_euclidean = get_euclidean(dep_p_sorted, y_train)

    print('Getting cosine for dep_sorted')
    dep_sorted_cosine = get_cosine(dep_sorted, y_train)

    print('Getting cosine for dep_p_sorted\n')
    dep_p_sorted_cosine = get_cosine(dep_p_sorted, y_train)

    # department unique UPCs
    dep_uniq = []
    dep_uniq_p = []
    dep_uniq_entropy = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).size().reset_index()
        g.rename(columns={0: 'n_unique'}, inplace=True)
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['n_unique'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_uniq.append(s.toarray())

        m = s.toarray()
        p = m / np.sum(m, axis=1)[:, np.newaxis]
        p[np.isnan(p)] = 0
        entropy = -np.sum(p * np.log(p + eps), axis=1)
        dep_uniq_p.append(p)
        dep_uniq_entropy.append(entropy)

    tfidf.fit(dep_uniq[0])
    dep_uniq_tfidf = []
    for sm in dep_uniq:
        dep_uniq_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_uniq[0].T, dep_uniq[0].T)
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_uniq = [d.dot(sim_matrix) for d in dep_uniq]

    # dep_uniq = [np.log(i + 1) for i in dep_uniq]

    print('Getting dot product between mean dep_uniq and datasets')
    dep_uniq_dot = get_dot(dep_uniq, y_train)

    print('Getting dot product between mean dep_uniq_p and datasets')
    dep_uniq_p_dot = get_dot(dep_uniq_p, y_train)

    print('Getting dot product between mean dep_uniq_tfidf and datasets')
    dep_uniq_tfidf_dot = get_dot(dep_uniq_tfidf, y_train)

    print('Getting distances between mean dep_uniq and datasets')
    # dep_uniq_maha = get_mahalanobis(dep_uniq, y_train)
    dep_uniq_manh = get_manhattan(dep_uniq, y_train)

    print('Getting distances between mean dep_uniq_p and datasets')
    # dep_uniq_p_maha = get_mahalanobis(dep_uniq_p, y_train)
    dep_uniq_p_manh = get_manhattan(dep_uniq_p, y_train)

    print('Getting distances between mean dep_uniq_tfidf and datasets')
    # dep_uniq_tfidf_maha = get_mahalanobis(dep_uniq_tfidf, y_train)
    dep_uniq_tfidf_manh = get_manhattan(dep_uniq_tfidf, y_train)

    print('Getting euclidean for dep_uniq')
    dep_uniq_euclidean = get_euclidean(dep_uniq, y_train)

    print('Getting euclidean dep_uniq_p')
    dep_uniq_p_euclidean = get_euclidean(dep_uniq_p, y_train)

    print('Getting euclidean for mean dep_uniq_tfidf\n')
    dep_uniq_tfidf_euclidean = get_euclidean(dep_uniq_tfidf, y_train)

    print('Getting cosine for dep_uniq')
    dep_uniq_cosine = get_cosine(dep_uniq, y_train)

    print('Getting cosine dep_uniq_p')
    dep_uniq_p_cosine = get_cosine(dep_uniq_p, y_train)

    print('Getting cosine for mean dep_uniq_tfidf\n')
    dep_uniq_tfidf_cosine = get_cosine(dep_uniq_tfidf, y_train)

    # department unique Finelines
    dep_uniq_fn = []
    dep_uniq_fn_p = []
    dep_uniq_fn_entropy = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'FinelineNumber_', 'ScanCount_binary']).size().reset_index()
        g.rename(columns={0: 'n_unique'}, inplace=True)
        g['n_unique'][g['n_unique'] > 1] = 1
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['n_unique'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_uniq_fn.append(s.toarray())

        m = s.toarray()
        p = m / np.sum(m, axis=1)[:, np.newaxis]
        p[np.isnan(p)] = 0
        entropy = -np.sum(p * np.log(p + eps), axis=1)
        dep_uniq_fn_p.append(p)
        dep_uniq_fn_entropy.append(entropy)

    tfidf.fit(dep_uniq_fn[0])
    dep_uniq_fn_tfidf = []
    for sm in dep_uniq_fn:
        dep_uniq_fn_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_uniq_fn[0].T, dep_uniq_fn[0].T)
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_uniq_fn = [d.dot(sim_matrix) for d in dep_uniq_fn]

    # dep_uniq_fn = [np.log(i + 1) for i in dep_uniq_fn]

    print('Getting dot product between mean dep_uniq_fn and datasets')
    dep_uniq_fn_dot = get_dot(dep_uniq_fn, y_train)

    print('Getting dot product between mean dep_uniq_fn_tfidf and datasets')
    dep_uniq_fn_tfidf_dot = get_dot(dep_uniq_fn_tfidf, y_train)

    print('Getting dot product between mean dep_uniq_fn_p and datasets')
    dep_uniq_fn_p_dot = get_dot(dep_uniq_fn_p, y_train)

    print('Getting distances between mean dep_uniq_fn and datasets')
    # dep_uniq_fn_maha = get_mahalanobis(dep_uniq_fn, y_train)
    dep_uniq_fn_manh = get_manhattan(dep_uniq_fn, y_train)

    print('Getting distances between mean dep_uniq_fn_tfidf and datasets')
    # dep_uniq_fn_tfidf_maha = get_mahalanobis(dep_uniq_fn_tfidf, y_train)
    dep_uniq_fn_tfidf_manh = get_manhattan(dep_uniq_fn_tfidf, y_train)

    print('Getting distances between mean dep_uniq_fn_p and datasets')
    # dep_uniq_fn_p_maha = get_mahalanobis(dep_uniq_fn_p, y_train)
    dep_uniq_fn_p_manh = get_manhattan(dep_uniq_fn_p, y_train)

    print('Getting euclidean for dep_uniq_fn')
    dep_uniq_fn_euclidean = get_euclidean(dep_uniq_fn, y_train)

    print('Getting euclidean for dep_uniq_fn_tfidf')
    dep_uniq_fn_tfidf_euclidean = get_euclidean(dep_uniq_fn_tfidf, y_train)

    print('Getting euclidean for mean dep_uniq_fn_p\n')
    dep_uniq_fn_p_euclidean = get_euclidean(dep_uniq_fn_p, y_train)

    print('Getting cosine for dep_uniq_fn')
    dep_uniq_fn_cosine = get_cosine(dep_uniq_fn, y_train)

    print('Getting cosine for dep_uniq_fn_tfidf')
    dep_uniq_fn_tfidf_cosine = get_cosine(dep_uniq_fn_tfidf, y_train)

    print('Getting cosine for mean dep_uniq_fn_p\n')
    dep_uniq_fn_p_cosine = get_cosine(dep_uniq_fn_p, y_train)

    dep_uniq_fn_sorted = []
    dep_uniq_fn_p_sorted = []
    for m1, m2 in zip(dep_uniq_fn, dep_uniq_fn_p):
        dep_uniq_fn_sorted.append(np.sort(m1, axis=1)[:, -20:])
        dep_uniq_fn_p_sorted.append(np.sort(m2, axis=1)[:, -20:])

    dep_uniq_fn_sorted = [np.log(i + 1) for i in dep_uniq_fn_sorted]

    print('Getting dot product between mean dep_uniq_fn_sorted and datasets')
    dep_uniq_fn_sorted_dot = get_dot(dep_uniq_fn_sorted, y_train)

    print('Getting dot product between mean dep_uniq_fn_p_sorted and datasets')
    dep_uniq_fn_p_sorted_dot = get_dot(dep_uniq_fn_p_sorted, y_train)

    print('Getting distances between mean dep_uniq_fn_sorted and datasets')
    # dep_uniq_fn_sorted_maha = get_mahalanobis(dep_uniq_fn_sorted, y_train)
    dep_uniq_fn_sorted_manh = get_manhattan(dep_uniq_fn_sorted, y_train)

    print('Getting distances between mean dep_uniq_fn_p_sorted and datasets')
    dep_uniq_fn_p_sorted_manh = get_manhattan(dep_uniq_fn_p_sorted, y_train)

    print('Getting euclidean for dep_uniq_fn_sorted')
    dep_uniq_fn_sorted_euclidean = get_euclidean(dep_uniq_fn_sorted, y_train)

    print('Getting euclidean for dep_uniq_fn_p_sorted\n')
    dep_uniq_fn_p_sorted_euclidean = get_euclidean(dep_uniq_fn_p_sorted, y_train)

    print('Getting cosine for dep_uniq_fn_sorted')
    dep_uniq_fn_sorted_cosine = get_cosine(dep_uniq_fn_sorted, y_train)

    print('Getting cosine for dep_uniq_fn_p_sorted\n')
    dep_uniq_fn_p_sorted_cosine = get_cosine(dep_uniq_fn_p_sorted, y_train)

    # departments scan binaries
    dep_bin = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 1]
        s = sparse.csr_matrix(
            (g['ScanCount_binary'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_bin.append(s.toarray())

    tfidf.fit(dep_bin[0])
    dep_bin_tfidf = []
    for sm in dep_bin:
        dep_bin_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_bin[0].T, dep_bin[0].T)
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_bin = [d.dot(sim_matrix) for d in dep_bin]

    print('Getting dot product between mean dep_bin and datasets')
    dep_bin_dot = get_dot(dep_bin, y_train)

    print('Getting dot product between mean dep_bin_tfidf and datasets')
    dep_bin_tfidf_dot = get_dot(dep_bin_tfidf, y_train)

    print('Getting distances between mean dep_bin and datasets')
    # dep_bin_maha = get_mahalanobis(dep_bin, y_train)
    dep_bin_manh = get_manhattan(dep_bin, y_train)

    print('Getting distances between mean dep_bin_tfidf and datasets')
    # dep_bin_tfidf_maha = get_mahalanobis(dep_bin_tfidf, y_train)
    dep_bin_tfidf_manh = get_manhattan(dep_bin_tfidf, y_train)

    print('Getting euclidean for dep_bin')
    dep_bin_euclidean = get_euclidean(dep_bin, y_train)

    print('Getting euclidean for dep_bin_tfidf\n')
    dep_bin_tfidf_euclidean = get_euclidean(dep_bin_tfidf, y_train)

    print('Getting cosine for dep_bin')
    dep_bin_cosine = get_cosine(dep_bin, y_train)

    print('Getting cosine for dep_bin_tfidf\n')
    dep_bin_tfidf_cosine = get_cosine(dep_bin_tfidf, y_train)

    # departments returns
    dep_r = []
    dep_r_p = []
    dep_r_entropy = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 0]
        s = sparse.csr_matrix(
            (g['ScanCount_rect_neg'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_r.append(s.toarray())

        m = s.toarray()
        p = m / np.sum(m, axis=1)[:, np.newaxis]
        p[np.isnan(p)] = 0
        entropy = -np.sum(p * np.log(p + eps), axis=1)
        dep_r_p.append(p)
        dep_r_entropy.append(entropy)

    tfidf.fit(dep_r[0])
    dep_r_tfidf = []
    for sm in dep_r:
        dep_r_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_r[0].T, dep_r[0].T)
    sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_r = [d.dot(sim_matrix) for d in dep_r]

    # dep_r = [np.log(i + 1) for i in dep_r]

    print('Getting dot product between mean dep_r and datasets')
    dep_r_dot = get_dot(dep_r, y_train)

    print('Getting dot product between mean dep_r_tfidf and datasets')
    dep_r_tfidf_dot = get_dot(dep_r_tfidf, y_train)

    print('Getting distances between mean dep_r and datasets')
    dep_r_manh = get_manhattan(dep_r, y_train)

    print('Getting distances between mean dep_r_tfidf and datasets')
    dep_r_tfidf_manh = get_manhattan(dep_r_tfidf, y_train)

    print('Getting euclidean for dep_r')
    dep_r_euclidean = get_euclidean(dep_r, y_train)

    print('Getting euclidean for dep_r_tfidf\n')
    dep_r_tfidf_euclidean = get_euclidean(dep_r_tfidf, y_train)

    print('Getting cosine for dep_r')
    dep_r_cosine = get_cosine(dep_r, y_train)

    print('Getting cosine for dep_r_tfidf\n')
    dep_r_tfidf_cosine = get_cosine(dep_r_tfidf, y_train)

    dep_bought_mr = []
    dep_r_sorted = []
    dep_r_p_sorted = []
    for i, (m1, m2) in enumerate(zip(dep_r, dep_r_p)):
        n = dep[i].shape[0]
        no_buy = dep_p[i].sum(axis=1) == 0

        temp = dep[i][np.arange(n), np.argmax(m1, axis=1)]
        temp[no_buy] = 0
        dep_bought_mr.append(temp)

        dep_r_sorted.append(np.sort(m1, axis=1)[:, -5:])
        dep_r_p_sorted.append(np.sort(m2, axis=1)[:, -5:])

    # departments uniques return
    dep_r_uniq = []
    dep_r_uniq_p = []
    dep_r_uniq_entropy = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).size().reset_index()
        g.rename(columns={0: 'n_unique'}, inplace=True)
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 0]
        s = sparse.csr_matrix(
            (g['n_unique'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_r_uniq.append(s.toarray())

        m = s.toarray()
        p = m / np.sum(m, axis=1)[:, np.newaxis]
        p[np.isnan(p)] = 0
        entropy = -np.sum(p * np.log(p + eps), axis=1)
        dep_r_uniq_p.append(p)
        dep_r_uniq_entropy.append(entropy)

    tfidf.fit(dep_r_uniq[0])
    dep_r_uniq_tfidf = []
    for sm in dep_r_uniq:
        dep_r_uniq_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_r_uniq[0].T, dep_r_uniq[0].T)
    sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_r_uniq = [d.dot(sim_matrix) for d in dep_r_uniq]

    # dep_r_uniq = [np.log(i + 1) for i in dep_r_uniq]

    # departments scan binaries returned
    dep_r_bin = []
    for df in [df_train, df_test]:
        g = df.groupby(
            ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index()
        n = len(np.unique(df['VisitNumber_']))
        g = g[g['ScanCount_binary'] == 0]
        s = sparse.csr_matrix(
            (g['ScanCount_binary_neg'], (g['VisitNumber_'], g['Department_'])),
            shape=(n, n_dep), dtype='float64')
        dep_r_bin.append(s.toarray())

    tfidf.fit(dep_r_bin[0])
    dep_r_bin_tfidf = []
    for sm in dep_r_bin:
        dep_r_bin_tfidf.append(tfidf.transform(sm).toarray())

    sim_matrix = cos_sim(dep_r_bin[0].T, dep_r_bin[0].T)
    sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1
    sim_matrix /= np.sum(sim_matrix, axis=0)
    dep_r_bin = [d.dot(sim_matrix) for d in dep_r_bin]

    print('Getting dot product between mean dep_r_bin and datasets')
    dep_r_bin_dot = get_dot(dep_r_bin, y_train)

    print('Getting dot product between mean dep_r_bin_tfidf and datasets')
    dep_r_bin_tfidf_dot = get_dot(dep_r_bin_tfidf, y_train)

    print('Getting distances between mean dep_r_bin and datasets')
    dep_r_bin_manh = get_manhattan(dep_r_bin, y_train)

    print('Getting distances between mean dep_r_bin_tfidf and datasets')
    dep_r_bin_tfidf_manh = get_manhattan(dep_r_bin_tfidf, y_train)

    print('Getting euclidean for dep_r_bin\n')
    dep_r_bin_euclidean = get_euclidean(dep_r_bin, y_train)

    print('Getting euclidean for dep_r_bin_tfidf\n')
    dep_r_bin_tfidf_euclidean = get_euclidean(dep_r_bin_tfidf, y_train)

    print('Getting cosine for dep_r_bin\n')
    dep_r_bin_cosine = get_cosine(dep_r_bin, y_train)

    print('Getting cosine for dep_r_bin_tfidf\n')
    dep_r_bin_tfidf_cosine = get_cosine(dep_r_bin_tfidf, y_train)

    n_unique_dep = []
    for df in [df_train, df_test]:
        n_unique_dep.append(
            df.groupby(['VisitNumber_', 'Department_']).size().reset_index().
            groupby(['VisitNumber_']).size().as_matrix())

    n_unique_fn = []
    for df in [df_train, df_test]:
        n_unique_fn.append(
            df.groupby(['VisitNumber_', 'FinelineNumber_']).size().reset_index().
            groupby(['VisitNumber_']).size().as_matrix())

    n_unique_upc = []
    for df in [df_train, df_test]:
        n_unique_upc.append(
            df.groupby(['VisitNumber_', 'Upc_']).size().reset_index().
            groupby(['VisitNumber_']).size().as_matrix())

    max_scan_count = []
    for df in [df_train, df_test]:
        max_scan_count.append(df.groupby(['VisitNumber_'])['ScanCount'].max())

    min_scan_count = []
    for df in [df_train, df_test]:
        min_scan_count.append(df.groupby(['VisitNumber_'])['ScanCount'].min())

    mean_scan_count_per_dep = []
    for i, df in enumerate([df_train, df_test]):
        mean_scan_count_per_dep.append(
            1. * df.groupby(['VisitNumber_'])['ScanCount'].sum() / n_unique_dep[i])

    # Weekday
    onehot = OneHotEncoder()
    day_train = onehot.fit_transform(
        df_train.groupby(['VisitNumber_'])
        .first()['Weekday_'][:, np.newaxis]).toarray()
    day_test = onehot.fit_transform(
        df_test.groupby(['VisitNumber_'])
        .first()['Weekday_'][:, np.newaxis]).toarray()

    X_train = np.c_[
        fnupc_red[0],
        fnupc_tfidf_red[0],
        # fn_tfidf_red[0],
        fn_r_tfidf_red[0],
        diff_br_red[0],

        dep[0], dep_tfidf[0], dep_p[0], dep_entropy[0],
        dep_uniq[0], dep_uniq_tfidf[0], dep_uniq_p[0], dep_uniq_entropy[0],
        dep_uniq_fn[0], dep_uniq_fn_tfidf[0], dep_uniq_fn_p[0], dep_uniq_fn_entropy[0],
        dep_bin[0], dep_bin_tfidf[0],

        dep_sorted[0], dep_p_sorted[0], dep_uniq_fn_sorted[0], dep_uniq_fn_p_sorted[0],

        dep_r[0], dep_r_tfidf[0],
        # dep_r_uniq_tfidf[0],
        dep_r_bin[0], dep_r_bin_tfidf[0],

        dep_r_sorted[0], dep_r_p_sorted[0],

        dep_bought_mr[0],

        top_dep[0],
        n_br_fn[0], # n_br_upc[0],
        b_bought[0], n_bought[0],
        b_returned[0], n_returned[0],
        n_unique_dep[0], n_unique_fn[0], n_unique_upc[0],
        max_scan_count[0], min_scan_count[0], mean_scan_count_per_dep[0],
        day_train,

        fn_dot[0], fn_tfidf_dot[0],
        dep_dot[0], dep_tfidf_dot[0], dep_p_dot[0],
        dep_uniq_dot[0], dep_uniq_tfidf_dot[0], dep_uniq_p_dot[0],
        dep_uniq_fn_dot[0], dep_uniq_fn_tfidf_dot[0], dep_uniq_fn_p_dot[0],
        dep_bin_dot[0], dep_bin_tfidf_dot[0],
        dep_sorted_dot[0], dep_p_sorted_dot[0],
        dep_uniq_fn_sorted_dot[0], dep_uniq_fn_p_sorted_dot[0],

        dep_manh[0], dep_tfidf_manh[0], dep_p_manh[0],
        dep_uniq_manh[0], dep_uniq_tfidf_manh[0], dep_uniq_p_manh[0],
        dep_uniq_fn_manh[0], dep_uniq_fn_tfidf_manh[0], dep_uniq_fn_p_manh[0],
        dep_bin_manh[0], dep_bin_tfidf_manh[0],
        dep_sorted_manh[0], dep_p_sorted_manh[0],
        dep_uniq_fn_sorted_manh[0], dep_uniq_fn_p_sorted_manh[0],

        dep_euclidean[0], dep_tfidf_euclidean[0], dep_p_euclidean[0],
        dep_uniq_euclidean[0], dep_uniq_tfidf_euclidean[0], dep_uniq_p_euclidean[0],
        dep_uniq_fn_euclidean[0], dep_uniq_fn_tfidf_euclidean[0], dep_uniq_fn_p_euclidean[0],
        dep_bin_euclidean[0], dep_bin_tfidf_euclidean[0],
        dep_sorted_euclidean[0], dep_p_sorted_euclidean[0],
        dep_uniq_fn_sorted_euclidean[0], dep_uniq_fn_p_sorted_euclidean[0],

        dep_cosine[0], dep_tfidf_cosine[0], dep_p_cosine[0],
        dep_uniq_cosine[0], dep_uniq_tfidf_cosine[0], dep_uniq_p_cosine[0],
        dep_uniq_fn_cosine[0], dep_uniq_fn_tfidf_cosine[0], dep_uniq_fn_p_cosine[0],
        dep_bin_cosine[0], dep_bin_tfidf_cosine[0],
        dep_sorted_cosine[0], dep_p_sorted_cosine[0],
        dep_uniq_fn_sorted_cosine[0], dep_uniq_fn_p_sorted_cosine[0],

        fn_r_dot[0], fn_r_tfidf_dot[0],

        dep_r_dot[0], dep_r_bin_dot[0],
        dep_r_tfidf_dot[0], dep_r_bin_tfidf_dot[0],

        dep_r_manh[0], dep_r_bin_manh[0],
        dep_r_tfidf_manh[0], dep_r_bin_tfidf_manh[0],

        dep_r_euclidean[0], dep_r_bin_euclidean[0],
        dep_r_tfidf_euclidean[0], dep_r_bin_tfidf_euclidean[0],

        dep_r_cosine[0], dep_r_bin_cosine[0],
        dep_r_tfidf_cosine[0], dep_r_bin_tfidf_cosine[0],
    ]

    X_test = np.c_[
        fnupc_red[1],
        fnupc_tfidf_red[1],
        # fn_tfidf_red[1],
        fn_r_tfidf_red[1],
        diff_br_red[1],

        dep[1], dep_tfidf[1], dep_p[1], dep_entropy[1],
        dep_uniq[1], dep_uniq_tfidf[1], dep_uniq_p[1], dep_uniq_entropy[1],
        dep_uniq_fn[1], dep_uniq_fn_tfidf[1], dep_uniq_fn_p[1], dep_uniq_fn_entropy[1],
        dep_bin[1], dep_bin_tfidf[1],

        dep_sorted[1], dep_p_sorted[1], dep_uniq_fn_sorted[1], dep_uniq_fn_p_sorted[1],

        dep_r[1], dep_r_tfidf[1],
        # dep_r_uniq_tfidf[1],
        dep_r_bin[1], dep_r_bin_tfidf[1],

        dep_r_sorted[1], dep_r_p_sorted[1],

        dep_bought_mr[1],

        top_dep[1],
        n_br_fn[1], # n_br_upc[1],
        b_bought[1], n_bought[1],
        b_returned[1], n_returned[1],
        n_unique_dep[1], n_unique_fn[1], n_unique_upc[1],
        max_scan_count[1], min_scan_count[1], mean_scan_count_per_dep[1],
        day_test,

        fn_dot[1], fn_tfidf_dot[1],
        dep_dot[1], dep_tfidf_dot[1], dep_p_dot[1],
        dep_uniq_dot[1], dep_uniq_tfidf_dot[1], dep_uniq_p_dot[1],
        dep_uniq_fn_dot[1], dep_uniq_fn_tfidf_dot[1], dep_uniq_fn_p_dot[1],
        dep_bin_dot[1], dep_bin_tfidf_dot[1],
        dep_sorted_dot[1], dep_p_sorted_dot[1],
        dep_uniq_fn_sorted_dot[1], dep_uniq_fn_p_sorted_dot[1],

        dep_manh[1], dep_tfidf_manh[1], dep_p_manh[1],
        dep_uniq_manh[1], dep_uniq_tfidf_manh[1], dep_uniq_p_manh[1],
        dep_uniq_fn_manh[1], dep_uniq_fn_tfidf_manh[1], dep_uniq_fn_p_manh[1],
        dep_bin_manh[1], dep_bin_tfidf_manh[1],
        dep_sorted_manh[1], dep_p_sorted_manh[1],
        dep_uniq_fn_sorted_manh[1], dep_uniq_fn_p_sorted_manh[1],

        dep_euclidean[1], dep_tfidf_euclidean[1], dep_p_euclidean[1],
        dep_uniq_euclidean[1], dep_uniq_tfidf_euclidean[1], dep_uniq_p_euclidean[1],
        dep_uniq_fn_euclidean[1], dep_uniq_fn_tfidf_euclidean[1], dep_uniq_fn_p_euclidean[1],
        dep_bin_euclidean[1], dep_bin_tfidf_euclidean[1],
        dep_sorted_euclidean[1], dep_p_sorted_euclidean[1],
        dep_uniq_fn_sorted_euclidean[1], dep_uniq_fn_p_sorted_euclidean[1],

        dep_cosine[1], dep_tfidf_cosine[1], dep_p_cosine[1],
        dep_uniq_cosine[1], dep_uniq_tfidf_cosine[1], dep_uniq_p_cosine[1],
        dep_uniq_fn_cosine[1], dep_uniq_fn_tfidf_cosine[1], dep_uniq_fn_p_cosine[1],
        dep_bin_cosine[1], dep_bin_tfidf_cosine[1],
        dep_sorted_cosine[1], dep_p_sorted_cosine[1],
        dep_uniq_fn_sorted_cosine[1], dep_uniq_fn_p_sorted_cosine[1],

        fn_r_dot[1], fn_r_tfidf_dot[1],

        dep_r_dot[1], dep_r_bin_dot[1],
        dep_r_tfidf_dot[1], dep_r_bin_tfidf_dot[1],

        dep_r_manh[1], dep_r_bin_manh[1],
        dep_r_tfidf_manh[1], dep_r_bin_tfidf_manh[1],

        dep_r_euclidean[1], dep_r_bin_euclidean[1],
        dep_r_tfidf_euclidean[1], dep_r_bin_tfidf_euclidean[1],

        dep_r_cosine[1], dep_r_bin_cosine[1],
        dep_r_tfidf_cosine[1], dep_r_bin_tfidf_cosine[1],
    ]

    print('Scaling...')
    scl = StandardScaler()
    for i in range(X_train.shape[1]):
        if len(np.unique(X_train[:, i])) > 2:
            scl.fit(X_train[:, i].reshape(-1, 1))
            xtrain = scl.transform(X_train[:, i].reshape(-1, 1)).flatten()
            xtest = scl.transform(X_test[:, i].reshape(-1, 1)).flatten()
            X_train[:, i] = np.clip(xtrain, -25, 25)
            X_test[:, i] = np.clip(xtest, -25, 25)
        else:
            continue

    return X_train, X_test, Y_train, y_train
Example #23
 def calc_similarity(self):
     self.user_similarity = cos_sim(self.training_set)
     print('User based similarity matrix built...')
Example #24
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

text = ["London Paris London", "Paris Paris London"]

cv = CountVectorizer()
count = cv.fit_transform(text)

# print(count.toarray())
similarity_score = cos_sim(count)

print(similarity_score)
Example #25
            axis=1,
            inplace=True)

#%%
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

i = 1
handles = merged.value_counts("Handle").index
# sim_df = pd.DataFrame(columns = ["Handle", "Similarity"])
sim_df = merged.value_counts("Handle").to_frame('Counts')
sim_df.reset_index(inplace=True)
for i, handles_temp in enumerate(handles):
    df_temp = merged.loc[merged["Handle"].astype("str") == handles_temp]
    X = df_temp.loc[:, ["dif"]].values
    Y = df_temp.loc[:, ["sent_score"]].values
    sim_score = cos_sim(np.transpose(X), np.transpose(Y))
    sim_df.loc[i, 'Handle'] = handles_temp
    sim_df.loc[i, 'Similarity'] = sim_score[0][0]
    # print(sim_df)
sim_df
# %%
plt.plot(sim_df.index, sim_df.Similarity.values)
# %%
sim_df["Sim_abs"] = abs(sim_df.Similarity.values)
sim_df.drop(sim_df.loc[sim_df.Counts < 100].index, inplace=True)
sim_df.sort_values("Sim_abs", inplace=True, ascending=False)
sim_df
# %%
sim_df.Counts.values
# %%
plt.plot(sim_df.Counts.values[10:])
Example #26
def similarity(x, y):
    return cos_sim(x, y)
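A brief usage note for the wrapper above (assuming its cos_sim import): sklearn's cosine_similarity expects 2-D arrays, so single vectors should be reshaped first.

import numpy as np

a = np.array([1.0, 2.0, 3.0]).reshape(1, -1)   # 1-D vectors must be reshaped to 2-D
b = np.array([1.0, 2.0, 2.5]).reshape(1, -1)
print(similarity(a, b)[0][0])                  # scalar cosine similarity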
Example #27
def cosine_eval(trainer, target, features):
    return [(cos_sim([target], [f]), i) for i, f in enumerate(features)]
Example #28
out_path_ch09 = 'out94_ch09.txt'
out_path_ch10 = 'out94_ch10.txt'

word2vec_model = word2vec.load('out90.bin')
ft = load('ft')
t2i = {token: i for i, token in enumerate(ft)}
X_300 = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']

with zipfile.PyZipFile(in_path, "r") as myzip, open(out_path_ch09,
                                                    "w") as f_out_ch09, open(
                                                        out_path_ch10,
                                                        "w") as f_out_ch10:
    with myzip.open('combined.tab') as f_in:
        for line in map(lambda x: x.decode().rstrip(), f_in):
            words = line.split('\t')
            try:
                cs_09 = cos_sim([X_300[t2i[words[0]]]],
                                [X_300[t2i[words[1]]]])[0][0]
            except Exception as e:
                cs_09 = -1
            try:
                cs_10 = cos_sim([word2vec_model[words[0]]],
                                [word2vec_model[words[1]]])[0][0]
            except Exception as e:
                cs_10 = -1
            print(f"{line}\t{cs_09:f}", file=f_out_ch09)
            print(f"{line}\t{cs_10:f}", file=f_out_ch10)

end = time.time()
print(f"elapsed time = {end - start} s")
Example #29
        data = pickle.load(f_in)
    return data


in_path = 'out91.txt'
out_path = 'out93.txt'

ft = load('ft')
t2i = {token: i for i, token in enumerate(ft)}
vec = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']
cnt = [0, 0]

with open(out_path, 'w') as f_out:
    for line in open(in_path):
        a, b, x, y = line.split()  # a - b = x - y <=> y = b - a + x
        try:
            tgt = [vec[t2i[b]] - vec[t2i[a]] + vec[t2i[x]]]
            ranking = [(cos_sim([vec[t2i[key]]], tgt)[0][0], key)
                       for key in ft]
            cs, word = max(ranking)
        except Exception as e:
            word = '***'
            cs = -1
        cnt[y == word] += 1
        print(f'{a} {b} {x} {y} {word} {cs:f}', file=f_out)

message(f'ok = {cnt[True]}, ng = {cnt[False]}')  # => ok = ???, ng = ???
'''
# TODO: execution time
'''
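
# The ranking loop above calls cos_sim once per vocabulary entry; an equivalent
# but much faster variant (a sketch, assuming the rows of vec are aligned with
# ft exactly as in t2i) scores the whole vocabulary with a single call and
# takes the argmax.
import numpy as np

i2t = list(ft)  # index -> token, mirroring t2i

def nearest_word(a, b, x):
    tgt = (vec[t2i[b]] - vec[t2i[a]] + vec[t2i[x]]).reshape(1, -1)
    sims = cos_sim(vec[:len(i2t)], tgt).ravel()
    best = int(np.argmax(sims))
    return i2t[best], float(sims[best])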
示例#30
0
    def pair_texts_similarity(self, ori_sentences, adv_sentences):
        cls_token = self.tokenizer_embed_lm.cls_token_id
        sep_token = self.tokenizer_embed_lm.sep_token_id

        ori_sentences = self.tokenizer_embed_lm(ori_sentences)['input_ids']
        adv_sentences = self.tokenizer_embed_lm(adv_sentences)['input_ids']

        ori_exclude_ids = []
        adv_exclude_ids = []
        for ori in range(len(ori_sentences)):
            for adv in range(len(adv_sentences)):
                if ori not in ori_exclude_ids and adv not in adv_exclude_ids:
                    distance, operations = edit_distance(
                        ori_sentences[ori], adv_sentences[adv])
                    if distance == 0:
                        ori_exclude_ids.append(ori)
                        adv_exclude_ids.append(adv)
                        break
        ori_input = []
        for i in range(len(ori_sentences)):
            if i not in ori_exclude_ids:
                ori_input += ori_sentences[i][1:-1]
        ori_input = [cls_token] + ori_input + [sep_token]
        adv_input = []
        for i in range(len(adv_sentences)):
            if i not in adv_exclude_ids:
                adv_input += adv_sentences[i][1:-1]
        adv_input = [cls_token] + adv_input + [sep_token]

        distance, operations = edit_distance(ori_input, adv_input)
        if distance == 0:
            return 1.0
        operations = operations[1:].split(',')
        operations_o = [int(o.split(';')[0].split()[1]) for o in operations]
        operations_a = [int(o.split(';')[1].split()[1]) for o in operations]
        partial_ids_o = [[max(operations_o[0] - 2, 0), operations_o[0] + 2]]
        partial_ids_a = [[max(operations_a[0] - 2, 0), operations_a[0] + 2]]
        for o, a in zip(operations_o[1:], operations_a[1:]):
            if o - 2 < partial_ids_o[-1][1]:
                partial_ids_o[-1][1] = o + 2
            else:
                partial_ids_o.append([o - 2, o + 2])
            if a - 2 < partial_ids_a[-1][1]:
                partial_ids_a[-1][1] = a + 2
            else:
                partial_ids_a.append([a - 2, a + 2])

        partial_ori = []
        partial_adv = []
        for o, a in zip(partial_ids_o, partial_ids_a):
            partial_o = ori_input[o[0]:o[1]]
            if partial_o[0] != cls_token: partial_o = [cls_token] + partial_o
            if partial_o[-1] != sep_token: partial_o = partial_o + [sep_token]
            partial_a = adv_input[a[0]:a[1]]
            if partial_a[0] != cls_token: partial_a = [cls_token] + partial_a
            if partial_a[-1] != sep_token: partial_a = partial_a + [sep_token]
            partial_ori.append(partial_o)
            partial_adv.append(partial_a)

        if self.verbose:
            for i in range(len(partial_ori)):
                print(get_time() + '[INFO] Modification number: %d' % i)
                print(
                    self.tokenizer_embed_lm.convert_ids_to_tokens(
                        partial_ori[i]))
                print(
                    self.tokenizer_embed_lm.convert_ids_to_tokens(
                        partial_adv[i]))

        ori_inputs = [ori_input] + partial_ori
        adv_inputs = [adv_input] + partial_adv

        with torch.no_grad():
            ori_sentence_emb = []
            for i in range(len(ori_inputs)):
                output = self.model_embed_lm(
                    torch.tensor(ori_inputs[i]).unsqueeze(0).to(self.device))
                ori_sentence_emb.append(output.pooler_output if self.pooler ==
                                        'cls' else output.last_hidden_state[:,
                                                                            0])
            ori_sentence_emb = torch.cat(ori_sentence_emb, axis=0).cpu()

            adv_sentence_emb = []
            for i in range(len(adv_inputs)):
                output = self.model_embed_lm(
                    torch.tensor(adv_inputs[i]).unsqueeze(0).to(self.device))
                adv_sentence_emb.append(output.pooler_output if self.pooler ==
                                        'cls' else output.last_hidden_state[:,
                                                                            0])
            adv_sentence_emb = torch.cat(adv_sentence_emb, axis=0).cpu()

        similarity = np.array([cos_sim(o.reshape(1, -1), a.reshape(1, -1))[0][0] \
                               for o, a in zip(ori_sentence_emb.numpy(), adv_sentence_emb.numpy())])

        all_sim = similarity[0]
        if len(similarity) > 1:
            avg_sim = np.average(similarity[1:])
            min_sim = similarity[1:].min()
            if self.verbose:
                print(get_time() +
                      '[INFO] Original similarity score: %f' % all_sim)
                print(get_time() +
                      '[INFO] Average partial similarity score: %f' % avg_sim)
                print(get_time() +
                      '[INFO] Minimum partial similarity score: %f' % min_sim)
                print(similarity[1:])
        else:
            avg_sim = min_sim = all_sim

        similarity = self.lambda1 * min_sim + self.lambda2 * avg_sim + (
            1 - self.lambda1 - self.lambda2) * all_sim

        return similarity
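
# A stripped-down version of the embed-and-compare step above (a sketch,
# assuming a generic Hugging Face encoder such as 'bert-base-uncased'; the
# original class wires in its own tokenizer_embed_lm / model_embed_lm and
# pooling choice):
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def sentence_similarity(a, b, pooler='cls'):
    with torch.no_grad():
        embs = []
        for text in (a, b):
            output = model(**tokenizer(text, return_tensors='pt'))
            embs.append(output.pooler_output if pooler == 'cls'
                        else output.last_hidden_state[:, 0])
    return cos_sim(embs[0].numpy(), embs[1].numpy())[0][0]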
示例#31
0
def get_corr(args):
    #classes, class_to_idx, idx_to_class = utils.get_classes(dataset)

    f = open(os.getcwd() + '/results/files/' + args.run_name + '/encoding_dict.json', 'r')
    for line in f:
        reps = json.loads(line)


    f = open(os.getcwd() + '/data/files/sketchy_classes.json', 'r')
    for line in f: class_splits = json.loads(line)

    f = open(os.getcwd() + '/data/files/class_to_idx.json', 'r')
    for line in f: class_to_idx = json.loads(line)
    classes = class_splits['train']

    #attr_dict, n_attrs = get_attrs(class_to_idx)


    for name in reps:
        print(name)
        print(len(reps[name]))
    return
    '''
    f = open('/Users/romapatel/Desktop/avg_vgg128_nouns.csv', 'r')
    lines = f.readlines()

    vgg_dict = {}
    for line in lines:
        items = line.strip().split(',')
        #print(items[0])
        if items[0] in classes:
            vgg_dict[items[0]] = [float(item) for item in items[1:]]

    '''


    # finally run this using the function in utils

    print(len(classes))

    f = open(os.getcwd() + '/data/files/sem-vis-sketchy.tsv', 'r')
    lines = [line.strip().split('\t') for line in f.readlines()]
    # evaluate only the first


    class_rep_dict, sims, true = {}, [], []
    for key in reps:
        val = class_to_idx[int(key)]
        if val not in classes: continue
        class_name = classes[int(key)]
        # evaluate only the first
        class_rep_dict[class_name] = reps[key]



    encoding_dict = class_rep_dict
    for key in encoding_dict:
        val = class_to_idx[key]
        if key not in encoding_dict.keys(): continue
        print(len(encoding_dict[key]))
        #print(encoding_dict[key])
        sims = []
        for rep1 in encoding_dict[key]:
            for rep2 in encoding_dict[key]:
                sims.append(cos_sim(np.array(rep1).reshape(1, -1), \
                                    np.array(rep2).reshape(1, -1)))
        #print(sims)
        print(np.mean(sims))
    return

    '''
    f = open(os.getcwd() + '/data/files/wvecs.json', 'r')
    for line in f: wvecs = json.loads(line)
    print(wvecs)
    '''

    #class_rep_dict = attr_dict
    for line in lines:
        word1, word2 = line[0], line[1]
        if word1 not in class_rep_dict.keys(): continue
        if word2 not in class_rep_dict.keys(): continue

        print(len(class_rep_dict[word1]))
        rep1 = np.array(class_rep_dict[word1]).reshape(1, -1)
        rep2 = np.array(class_rep_dict[word2]).reshape(1, -1)

        sim = cos_sim(rep1, rep2)[0][0]
        sims.append(sim)
        true.append(float(line[2]))
        s = word1 + '-' + word2
        print(s)
        print(cos_sim(rep1, rep2))




    pearson = pearsonr(sims, true)
    spearman = spearmanr(sims, true)
    print(pearson)
    print(spearman)
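
# Note that the within-class loop above also pairs each representation with
# itself (cosine similarity 1.0), which pulls the mean upward; a small sketch
# (hypothetical reps) that averages only the off-diagonal entries instead:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

reps = np.random.rand(6, 128)   # six representations of one class
S = cos_sim(reps)
off_diag = S[~np.eye(len(S), dtype=bool)]
print(off_diag.mean())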
示例#32
0
    def getDistance(self, x1, x2):
        return np.sum(cos_sim(x1, x2))
示例#33
0
    def cluster_sentences(self, enc, seq_len, words_conf):
        # ------------------------------------------------------------
        # dynamic clustering depending on num low confidence samples
        # ------------------------------------------------------------
        n_clusters = int(len(seq_len) / config.num_clusters)

        print('\nSimilarity metric is {}\n'.format(config.similarity))
        if config.similarity == 'siamese':

            print("\nReloading the sentence similarity model...\n")
            graph = tf.Graph()
            with graph.as_default():
                sess = tf.Session()
                siamese = Siamese_Model(sess)

            #---------------------------------------------------
            # take all possible pairwise combinations of confused
            # samples to obtain the similarity scores pairwise.

            # The similarity matrix is symmetric.
            #---------------------------------------------------

            split1, split2 = np.array_split(np.arange(len(seq_len)), 2)
            max_len = max(seq_len)
            seq_len1, seq_len2 = \
                [seq_len[i] for i in split1], [seq_len[i] for i in split2]

            if not config.model_aware:
                sent1, sent2 = [
                    np.array(enc[i][0][0]).tolist() for i in split1
                ], [np.array(enc[i][0][0]).tolist() for i in split2]
            else:
                sent1, sent2 = [enc[i]
                                for i in split1], [enc[i] for i in split2]

            if config.model.split()[1] == 'LSTM' or not config.model_aware:
                dim = config.hidden_size_lstm
            else:
                dim = 2 * config.hidden_size_lstm

            for i, row in enumerate(sent1):
                if len(row) <= max_len:
                    sent1[i] += [np.zeros(dim).tolist()] * (max_len - len(row))
                    try:
                        sent2[i] += [np.zeros(dim).tolist()
                                     ] * (max_len - len(sent2[i]))
                    except IndexError:
                        sent2 += [[np.zeros(dim).tolist()] * len(sent1[i])]
                        seq_len2 += [1]

            siamese_enc = np.concatenate(
                siamese.run(sent1, sent2, seq_len1, seq_len2, max_len,
                            len(split1), len(split2)))

            def similarity_scores(enc):
                shape = np.array(enc).shape
                out = np.reshape(np.repeat(enc, [shape[0]], axis=0),
                                 (-1, shape[0], shape[1]))
                X = np.exp(-1 * np.sqrt(
                    np.sum(np.square(out - np.transpose(out, (1, 0, 2))),
                           2,
                           keepdims=False)))
                return X

            X = similarity_scores(siamese_enc)
            clustering = self.spectral_clustering(X, n_clusters)

        elif config.similarity == 'cosine':
            enc1 = [emb[-1] for emb in enc]
            X = np.exp(cos_sim(enc1, enc1))
            clustering = self.spectral_clustering(X, n_clusters)

        elif config.similarity == 'skipthoughts':
            model = skipthoughts.load_model()
            encoder = skipthoughts.Encoder(model)
            vectors = encoder.encode([' '.join(list) for list in words_conf])
            X = np.exp(cos_sim(vectors, vectors))
            #vectors = vectors / np.linalg.norm(vectors)
            #X = np.cos(np.dot(vectors, np.transpose(vectors)))
            clustering = self.spectral_clustering(X, n_clusters)

        return clustering
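
# A minimal standalone sketch of the 'cosine' branch above (hypothetical data):
# an exponentiated cosine-similarity matrix used as a precomputed affinity,
# here fed to sklearn's SpectralClustering instead of the class's own
# spectral_clustering helper.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

rng = np.random.default_rng(0)
u = np.array([1.0, 0.0, 0.0, 0.0])
v = np.array([0.0, 0.0, 0.0, 1.0])
enc1 = np.vstack([u + 0.05 * rng.normal(size=(10, 4)),
                  v + 0.05 * rng.normal(size=(10, 4))])
X = np.exp(cos_sim(enc1, enc1))   # symmetric, strictly positive affinity
labels = SpectralClustering(n_clusters=2, affinity='precomputed',
                            random_state=0).fit_predict(X)
print(labels)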
示例#34
0
def get_graph_d3(old1, new1, csim, cstars, cenr, chours):
    """
    determines optimal path (shortest path)

    Parameters
    ----------

    old1 : int
    index of old topic

    new1 : int
    index of new topic

    csim : int or float
    weight for course similarity

    cstars : int or float
    weight for course rating

    cenr : int or float
    weight for course enrollment

    chours : int or float
    weight for course length

    Returns
    -------

    shortpath : array
    shortest path in the course graph
    """

    # load Graph
    file = open('networkx_graph.pkl', 'rb')
    G = pickle.load(file)
    file.close()

    # load positions
    file = open('networkx_pos.pkl', 'rb')
    pos = pickle.load(file)
    file.close()

    # load node values
    file = open('networkx_values.pkl', 'rb')
    values = pickle.load(file)
    file.close()

    # load titles
    file = open('course_titles.pkl', 'rb')
    titles = pickle.load(file)
    file.close()

    # topic scores
    mat = loadmat('scoremat.mat')
    scoremat = mat['scoremat']
    scorecorrs = cos_sim(scoremat)
    for d in range(len(scorecorrs)):
        scorecorrs[d, d] = 0
    print('corr test 1:', scorecorrs[old1, new1])

    # numeric course info
    mat = loadmat('course_numeric_info.mat')
    stars = mat['stars']
    hours = mat['hours']
    enrollment = mat['enrollment']

    Gdir = nx.DiGraph(G)
    list_edges = list(Gdir.edges)

    # add weighted costs to edges

    stars_norm = normalize_cost(stars, 1)
    enrollment_norm = normalize_cost(np.log10(enrollment), 1)
    hours_norm = normalize_cost(hours)

    weighted_costs = cstars * stars_norm + cenr * enrollment_norm + chours * hours_norm
    if np.shape(weighted_costs)[0] == 1:
        weighted_costs = weighted_costs.T

    list_weighted_costs = []
    list_weights = []
    for edge in Gdir.edges:
        sim = scorecorrs[edge[0], edge[1]]
        dissim = 1 - sim
        edge_cost = weighted_costs[edge[1]] + csim * dissim
        if edge_cost < 0:
            print(edge)
        Gdir.edges[edge[0], edge[1]]['weighted_cost'] = edge_cost
        Gdir.edges[edge[0], edge[1]]['weight'] = 1 - edge_cost
        list_weighted_costs.append(edge_cost)
        list_weights.append(1 - edge_cost)
    print(np.min(np.array(list_weighted_costs)))
    print('corr:', scorecorrs[old1, new1])
    #edge_weights = [Gdir[u][v]['weight']-.4 for u,v in G.edges()] # min is .5; -.4 so that min is .1

    # shortest path
    shortpath = shortest_path(Gdir, old1, new1, weight='weighted_cost')
    print('shortpath:', shortpath)
    mytuples = []
    mytuples_directed = []
    for i in range(len(shortpath) - 1):
        if shortpath[i] < shortpath[i + 1]:
            newlink = (shortpath[i], shortpath[i + 1])
        else:
            newlink = (shortpath[i + 1], shortpath[i])
        mytuples.append(newlink)
        mytuples_directed.append((shortpath[i], shortpath[i + 1]))

    # write nodes_output.csv
    # only nodes on the shortest path are written out
    with open('static/nodes_output.csv', mode='w') as fp:
        fwriter = csv.writer(fp,
                             delimiter=',',
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(['x', 'y', 'strength', 'radius', 'title'])
        for i in range(len(pos)):
            if i in shortpath:
                fwriter.writerow(
                    [pos[i][0], pos[i][1],
                     int(values[i]), 4, titles[i]])
            else:
                pass

    # write edges_output.csv
    with open('static/edges_output.csv', mode='w') as fp:
        fwriter = csv.writer(fp,
                             delimiter=',',
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(['x1', 'x2', 'y1', 'y2', 'width', 'color'])
        for i in range(len(list_edges)):
            if list_edges[i] in mytuples:
                if list_edges[i] in mytuples_directed:
                    x1 = pos[list_edges[i][0]][0]
                    x2 = pos[list_edges[i][1]][0]
                    y1 = pos[list_edges[i][0]][1]
                    y2 = pos[list_edges[i][1]][1]
                else:
                    x1 = pos[list_edges[i][1]][0]
                    x2 = pos[list_edges[i][0]][0]
                    y1 = pos[list_edges[i][1]][1]
                    y2 = pos[list_edges[i][0]][1]
                fwriter.writerow([x1, x2, y1, y2, 2, '#ff0000'])

    return shortpath
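
# The core routing idea above as a small self-contained sketch (hypothetical
# 4-node graph): cosine similarity between topic-score vectors is turned into a
# dissimilarity cost, and networkx picks the cheapest path. Here the chain of
# gradually shifting topics 0 -> 1 -> 2 -> 3 beats the direct 0 -> 3 jump.
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

scores = np.array([[1.0, 0.0, 0.0],
                   [0.8, 0.2, 0.0],
                   [0.1, 0.9, 0.0],
                   [0.0, 0.7, 0.3]])
sim = cos_sim(scores)

G = nx.DiGraph()
for u, v in [(0, 1), (1, 2), (2, 3), (0, 3)]:
    G.add_edge(u, v, weighted_cost=1.0 - sim[u, v])  # dissimilar hops cost more

print(nx.shortest_path(G, 0, 3, weight='weighted_cost'))  # [0, 1, 2, 3]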
示例#35
0
File: eval.py Project: roma-patel/proto
def prototype_model(pixel_type, num):
    coarse, fine, cat_dict = get_categories()
    f = open(path + 'data/tu-berlin/train_1.json', 'r')
    for line in f:
        train = json.loads(line)

    f = open(path + 'data/tu-berlin/test_1.json', 'r')
    for line in f:
        test = json.loads(line)

    print('Inside prototype model\n')
    prototypes = {}
    f = open('/Users/romapatel/Desktop/prototypes_20.json', 'r')
    for line in f.readlines()[:num]:
        temp = json.loads(line)
        prototypes[temp['category']] = temp

    all_cats = sorted(prototypes.keys())

    results = {}
    num = len(all_cats)
    cos_matrix, sp_matrix = np.zeros((num, num)), np.zeros((num, num))
    abs_matrix, ce_matrix = np.zeros((num, num)), np.zeros((num, num))

    for i in range(len(all_cats)):
        category = sorted(prototypes.keys())[i]
        print(category)

        cat_path = path + 'data/tu-berlin/sketches_png/' + category + '/'
        if os.path.isdir(cat_path) is False: continue
        filenames = test[category]

        for filename in filenames:
            if '.DS' in filename: continue
            print(filename)
            a = Image(cat_path + filename)
            if pixel_type == 'bin':
                pixels = a.get_pixel_features()

            else:
                pixels = a.get_pixels()
            flat_pixels = [val for sublist in pixels for val in sublist]

            cos_temp, sp_temp = [], []
            for j in range(len(all_cats)):
                cat = sorted(prototypes.keys())[j]
                prototype = prototypes[cat]['prototype_arr']
                print(len(prototype))
                flat_prototype = [
                    val for sublist in prototype for val in sublist
                ]

                cos_matrix[i][j] += np.mean(cos_sim(pixels, prototype))
                sp_matrix[i][j] += spearmanr(flat_pixels, flat_prototype)[0]
                abs_matrix[i][j] += np.mean(pixels - prototype)
                ce_matrix[i][j] += np.mean(log_loss(pixels, prototype))

            break

    cos_matrix = [list(item) for item in cos_matrix]
    sp_matrix = [list(item) for item in sp_matrix]
    abs_matrix = [list(item) for item in abs_matrix]
    ce_matrix = [list(item) for item in ce_matrix]

    f = open(
        path + 'results/tu-berlin/prototype/' + pixel_type + '_' + str(num) +
        '.json', 'w+')

    results = {
        'cos_sim': list(cos_matrix),
        'spearman': list(sp_matrix),
        'abs': list(abs_matrix),
        'ce': list(ce_matrix),
        'indices': all_cats
    }
    f.write(json.dumps(results))
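
# A minimal sketch of nearest-prototype classification with the same metric
# (hypothetical flattened pixel vectors): each test sample is assigned to the
# category whose prototype has the highest cosine similarity.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

prototype_matrix = np.random.rand(5, 64)        # 5 category prototypes
test_samples = np.random.rand(3, 64)            # 3 flattened test sketches

sims = cos_sim(test_samples, prototype_matrix)  # shape (3, 5)
predicted_category = sims.argmax(axis=1)        # best-matching prototype index
print(predicted_category)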