Example #1
def acc_cosine(anchor_embedding, positive_embedding, negative_embedding):
    num_correct_cos_triplets = 0.
    num_triplets = 0.
    pos_cos_distance = paired_cosine_distances(anchor_embedding, positive_embedding)
    neg_cos_distances = paired_cosine_distances(anchor_embedding, negative_embedding)
    for idx in range(len(pos_cos_distance)):
        num_triplets += 1.
        if pos_cos_distance[idx] < neg_cos_distances[idx]:
            num_correct_cos_triplets += 1.
    return num_correct_cos_triplets / num_triplets
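A minimal usage sketch for acc_cosine above (hypothetical toy data; assumes numpy and sklearn's paired_cosine_distances are imported in the module that defines it). Triplet accuracy is the fraction of rows whose anchor is closer, in cosine distance, to its positive than to its negative.

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances  # needed by acc_cosine

# Three toy triplets: one row per (anchor, positive, negative) embedding.
anchor = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
positive = np.array([[0.9, 0.1], [0.1, 1.0], [1.0, 0.9]])
negative = np.array([[0.0, 1.0], [1.0, 0.0], [-1.0, 1.0]])

print(acc_cosine(anchor, positive, negative))  # 1.0 here: every anchor is nearer its positive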
Example #2
    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
        embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

        cosine_scores = 1-paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)

        if self.log_missclassified:
            misclassed = self.get_missclassified(self.sentences1, self.sentences2, self.labels, cosine_scores)
            self.save_misclassed(output_path, misclassed)

        labels = np.asarray(self.labels)

        file_output_data = [epoch, steps]

        main_score = None
        for name, scores, reverse in [['Cosine-Similarity', cosine_scores, True], ['Manhatten-Distance', manhattan_distances, False], ['Euclidean-Distance', euclidean_distances, False]]:
            acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
            f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
            ap = average_precision_score(labels, scores * (1 if reverse else -1))

            logging.info("Accuracy with {}:           {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold))
            logging.info("F1 with {}:                 {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
            logging.info("Precision with {}:          {:.2f}".format(name, precision * 100))
            logging.info("Recall with {}:             {:.2f}".format(name, recall * 100))
            logging.info("Average Precision with {}:  {:.2f}\n".format(name, ap * 100))

            file_output_data.extend([acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

            if main_score is None: #Use AveragePrecision with Cosine-Similarity as main score
                main_score = ap

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow(file_output_data)
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(file_output_data)

        return main_score
Example #3
def paired_di(params):
    X, models_ = params
    m = sp.lil_matrix((1, len(models_) * X.shape[1]))
    for i, ci_dom in enumerate(models_):
        m[0, i * X.shape[1]:(i + 1) * X.shape[1]] = paired_cosine_distances(
            ci_dom, X)
    return m.tocsr()
Example #4
def main(args):
    # Read the dataset
    df = pd.read_csv(args.file)
    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=args.layer)
    model = SentenceEncoder(modules=[embedder, pooler])
    model.eval()

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)

    if args.t2s:
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)

    tmp = model.encode(df["text_1"].tolist() + df["text_2"].tolist(),
                       batch_size=16,
                       show_progress_bar=True)
    embeddings1, embeddings2 = tmp[:df.shape[0]], tmp[df.shape[0]:]

    spearman_score = evaluator(embeddings1,
                               embeddings2,
                               labels=df["similarity"].values)
    print(spearman_score)

    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_zero_shot_pred.csv", index=False)
    print(f"Pred {pd.Series(preds).describe()}")
    return preds, df["similarity"].values
Example #5
def evaluate(model,
             loader,
             distance_fn=lambda a, b: 1 - paired_cosine_distances(a, b),
             device='cpu'):
    with torch.no_grad():
        model = model.to(device)
        model.eval()
        a_embeds, b_embeds = [], []
        # scores = []
        labels = []
        for batch in loader:
            batch = [e.to(device) for e in batch]
            a_input_ids, a_input_mask, b_input_ids, b_input_mask, label = batch

            a_embed, b_embed = model(a_input_ids, a_input_mask), model(
                b_input_ids, b_input_mask)
            a_embeds += a_embed.tolist()
            b_embeds += b_embed.tolist()
            # score = distance_fn(a_embed.cpu().numpy(), b_embed.cpu().numpy())
            # scores += score.tolist()
            labels += label.tolist()
    model.train()
    scores = distance_fn(np.asarray(a_embeds), np.asarray(b_embeds))
    return spearmanr(labels, scores)[0]
Example #6
    def update(self, embeddings, labels, n, **kwargs):
        scores = 1 - paired_cosine_distances(embeddings[0], embeddings[1])
        rows = list(zip(scores, labels))
        rows = sorted(rows, key=lambda x: x[0], reverse=True)

        nextract = 0
        ncorrect = 0
        total_num = sum(labels)

        for i in range(len(rows) - 1):
            score, label = rows[i]
            nextract += 1
            if label == 1:
                ncorrect += 1
            if ncorrect > 0:
                precision = ncorrect / nextract
                recall = ncorrect / total_num
                f1 = 2 * precision * recall / (precision + recall)
                if f1 > self.best_f1:
                    self.best_f1 = f1
                    self.best_precision = precision
                    self.best_recall = recall
                    self.best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
        #self.max_acc, self.best_threshold = get_accuracy_and_best_threshold_from_pr_curve(preds, labels)
        labels = [r[1] for r in rows]
        preds = [1 if r[0] >= self.best_threshold else 0 for r in rows]
        assert (len(labels) == len(preds))
        self.val = self.best_f1
        self.sum += self.val * n
        self.count += n
        self.avg = self.sum / self.count
Example #7
    def classify(self, data_v):
        if ((len(self.normal_labels) + len(self.mal_labels)) < 2):
            print(
                "Calling classification without enough image samples - returning zero vector"
            )
            return np.zeros(self.number_of_classes)

        if (self.normal_data_storage == None):
            all_data = self.mal_data_storage
        elif (self.mal_data_storage == None):
            all_data = self.normal_data_storage
        else:
            all_data = self.normal_data_storage + self.mal_data_storage
        all_labels = self.normal_labels + self.mal_labels
        data_length = len(all_labels)

        min_distance = np.ones(self.number_of_classes) * self.max_distance

        if (self.similarity == 'cos'):
            distances = paired_cosine_distances(
                all_data, np.tile(data_v, (data_length, 1)))
        elif (self.similarity == 'l1'):
            distances = paired_manhattan_distances(
                all_data, np.tile(data_v, (data_length, 1)))
        else:
            distances = paired_euclidean_distances(
                all_data, np.tile(data_v, (data_length, 1)))

        for index_i in range(data_length):
            if (abs(distances[index_i]) < min_distance[all_labels[index_i]]):
                min_distance[all_labels[index_i]] = abs(distances[index_i])

        min_distance = self.max_distance - min_distance
        return min_distance
Example #8
def raw(args, encoder):
    df = pd.read_csv(args.file)

    if args.t2s:
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)

    tmp = encoder.encode(
        df["text_1"].tolist() + df["text_2"].tolist(), batch_size=32,
        show_progress_bar=True
    )
    embeddings1, embeddings2 = tmp[:df.shape[0]], tmp[df.shape[0]:]

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE
    )

    spearman_score = evaluator(
        embeddings1, embeddings2, labels=df["similarity"].values
    )
    print(f"Spearman: {spearman_score:.4f}")

    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_pred.csv", index=False)
    return preds, df["similarity"].values
Example #9
    def __init__(self, data_path, word2vecpath, pkl):
        self.pkl = pkl
        self.data = pd.read_csv(data_path)[['question1', 'question2']].dropna()
        self.vectorizer_corpus = np.reshape(
            self.data.values, newshape=[len(self.data.values) * 2])
        self.word2vecModel = Word2Vec.load(word2vecpath)
        self.word2vecModel.init_sims(replace=True)

        # tf-idf vectors
        self.vectorizer = TfidfVectorizer(max_df=0.5,
                                          max_features=3000,
                                          min_df=3,
                                          lowercase=False,
                                          decode_error='ignore').fit(
                                              self.vectorizer_corpus)

        self.x = np.squeeze(pd.read_csv(data_path)[['question1'
                                                    ]].fillna("").values,
                            axis=1)
        self.y = np.squeeze(pd.read_csv(data_path)[['question2'
                                                    ]].fillna("").values,
                            axis=1)

        self.X = self.vectorizer.transform(self.x)
        self.Y = self.vectorizer.transform(self.y)
        self.cosine = pairwise.paired_cosine_distances(self.X, self.Y)
        self.euclidean = pairwise.paired_euclidean_distances(self.X, self.Y)
        self.manhattan = pairwise.paired_manhattan_distances(self.X, self.Y)
        print(self.cosine.shape)
        print(self.euclidean.shape)
        print(self.manhattan.shape)
Example #10
def HOAD(SimV1,SimV2,k,keval,m):
    
    n = SimV1.shape[0]

    Z = np.block([
                [SimV1, m*np.identity(n)],
                [m*np.identity(n), SimV2]
                ])
    
    D = np.diag(np.sum(Z,axis=1))
    
    L = D - Z 
    
    w,vr = eigh(L,eigvals=(0,k-1))
    
    adscore = np.zeros((n,len(keval)))
    
    for i in range(len(keval)):
    
        Hv1 = vr[0:n,0:keval[i]]
        Hv2 = vr[n:,0:keval[i]]
    
        adscore[:,i] = 1 - paired_cosine_distances(Hv1,Hv2)
    
    return adscore 
Example #11
    def ranked_answers_for_question(self, model, question: str,
                                    answers_text: List[str]):
        qas_examples = self.examples_for_q_answers(question, answers_text)
        qas_loader = self._dataloader_from_examples(list(qas_examples), model)
        qas_loader.collate_fn = model.smart_batching_collate

        embeddings1, embeddings2 = paired_embeddings_for_dataloader(
            self.device, model, qas_loader, get_labels=False)

        try:
            cosine_distances = 1 - (paired_cosine_distances(
                embeddings1, embeddings2))
            manhattan_distances = -paired_manhattan_distances(
                embeddings1, embeddings2)
            euclidean_distances = -paired_euclidean_distances(
                embeddings1, embeddings2)
            dot_products = [
                np.dot(emb1, emb2)
                for emb1, emb2 in zip(embeddings1, embeddings2)
            ]
            all_dists = [
                cosine_distances, manhattan_distances, euclidean_distances,
                dot_products
            ]
        except Exception as e:
            print(embeddings1)
            print(embeddings2)
            raise e

        return np.argsort(all_dists[int(self.main_similarity.value)])
Example #12
def raw(args, model):
    df = pd.read_csv(f"data/LCQMC/{args.filename}",
                     delimiter="\t",
                     header=None,
                     names=["text_1", "text_2", "label"])

    tmp = model.encoder.encode(df["text_1"].tolist() + df["text_2"].tolist(),
                               batch_size=32,
                               show_progress_bar=True)
    embeddings1, embeddings2 = tmp[:df.shape[0]], tmp[df.shape[0]:]

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)

    spearman_score = evaluator(embeddings1,
                               embeddings2,
                               labels=df["label"].values)
    print(f"Spearman: {spearman_score:.4f}")

    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)

    df["pred"] = preds
    df.to_csv(f"cache/{Path(args.filename).stem}_pred.csv", index=False)

    return preds, df["label"].values
Example #13
def cosDistance(fitted, b, h):
    bodyTfidf = fitted.transform(b)
    headlineTfidf = fitted.transform(h)
    # print("Article Body")
    #print(bodyTfidf)
    #print("Headline")
    #print(headlineTfidf)
    cosDist = paired_cosine_distances(bodyTfidf, headlineTfidf)
    return cosDist
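A hypothetical call for cosDistance above, assuming fitted is a scikit-learn TfidfVectorizer fitted on a shared corpus; paired_cosine_distances accepts the sparse TF-IDF matrices directly.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat on the mat", "stocks fell sharply today"]
fitted = TfidfVectorizer().fit(corpus)

bodies = ["the cat sat quietly on the mat"]     # article bodies
headlines = ["cat sits on the mat"]             # candidate headlines
print(cosDistance(fitted, bodies, headlines))   # one cosine distance per body/headline pair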
Example #14
def evaluate_sbert(model, batch_size=16):
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'test':
                score = float(
                    row['score']) / 5.0  #Normalize score to range 0 ... 1
                test_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=score))

    sentences1 = []
    sentences2 = []
    scores = []

    examples = test_samples

    for example in examples:
        sentences1.append((example.texts[0], 'none'))
        sentences2.append((example.texts[1], 'none'))
        scores.append(example.label)

    _, embeddings1 = model.forward(sentences1, checkpoint=False)
    _, embeddings2 = model.forward(sentences2, checkpoint=False)
    labels = scores

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]

    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)

    print("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_cosine, eval_spearman_cosine))
    print("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_manhattan, eval_spearman_manhattan))
    print("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_euclidean, eval_spearman_euclidean))
    print("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_dot, eval_spearman_dot))
Example #15
File: stability.py Project: pscrapy/cade
def stables(mod_a, mod_b, top_n=20):
    """
    Return <top_n> vocabulary elements with lower cosine similarity between models
    """

    vocab = shared_voc(mod_a, mod_b)
    sims = 1 - paired_cosine_distances(mod_a.wv[vocab], mod_b.wv[vocab])
    sim_dict = {k: v for k, v in zip(vocab, sims)}

    return sorted(sim_dict, key=sim_dict.get, reverse=True)[:top_n]
Example #16
def correlation_score(features, labels, size):
    from sklearn.metrics.pairwise import paired_cosine_distances
    from scipy.stats import pearsonr
    logging.info("Sampling 2x{} stimuli from a total of {}".format(size, len(labels)))
    indices = np.array(random.sample(range(len(labels)), size*2))
    y = labels[indices]
    x = features[indices]
    y_sim = y[: size] == y[size :]
    x_sim = 1 - paired_cosine_distances(x[: size], x[size :])
    return pearsonr(x_sim, y_sim)[0]
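A toy invocation sketch for correlation_score above, with hypothetical class-aligned features; the function also relies on numpy, random, and logging being imported at module level where it is defined.

import numpy as np

rng = np.random.default_rng(0)
labels = rng.integers(0, 4, size=400)                            # four stimulus classes
features = np.eye(4)[labels] + 0.1 * rng.normal(size=(400, 4))   # one direction per class, plus noise

print(correlation_score(features, labels, size=100))  # expected to be clearly positive here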
Example #17
def make_comparison_fig(
    group2embedding_matrices: Dict[str, List[np.ndarray]],
    vmin: float = 0.0,
) -> plt.Figure:
    """
    Returns fig showing similarity matrix of probe similarity matrices of multiple models
    """

    group_names = sorted(group2embedding_matrices.keys())

    # get a flat list of embedding matrices, preserving group order
    embedding_matrices_flat = []
    group_names_flat = []
    for k, v in sorted(group2embedding_matrices.items()):
        embedding_matrices_flat.extend(v)
        group_names_flat.extend([k] * len(v))

    avg_sims = np.zeros(
        (len(embedding_matrices_flat), len(embedding_matrices_flat)))
    for i, embeddings_i in enumerate(embedding_matrices_flat):
        for j, embeddings_j in enumerate(embedding_matrices_flat):
            avg_sims[i, j] = 1 - paired_cosine_distances(
                embeddings_i, embeddings_j).mean()

    # fig
    fig, ax = plt.subplots(figsize=config.Fig.fig_size, dpi=config.Fig.dpi)
    mask = np.zeros_like(avg_sims, dtype=bool)
    mask[np.triu_indices_from(mask, 1)] = True
    sns.heatmap(avg_sims,
                ax=ax,
                square=True,
                annot=False,
                annot_kws={"size": 5},
                cbar_kws={"shrink": .5},
                vmin=vmin,
                vmax=1.0,
                cmap='jet')  # , mask=mask

    # colorbar
    cbar = ax.collections[0].colorbar
    cbar.set_ticks([vmin, 1.0])
    cbar.set_ticklabels([str(vmin), '1.0'])
    cbar.set_label('Similarity between Semantic Spaces')

    # ax (needs to be below plot for axes to be labeled)
    ax.set_yticks(np.arange(len(group_names_flat)) + 0.5)
    ax.set_xticks(np.arange(len(group_names_flat)) + 0.5)
    ax.set_yticklabels(group_names_flat, rotation=0)
    ax.set_xticklabels(group_names_flat, rotation=90)
    plt.tight_layout()

    return fig
Example #18
def raw(args):
    df = pd.read_csv(f"data/LCQMC/{args.filename}",
                     delimiter="\t",
                     header=None,
                     names=["text_1", "text_2", "label"])

    g = tf.Graph()
    with g.as_default():
        text_input = tf.placeholder(dtype=tf.string, shape=[None])
        embed = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-multilingual/1"
        )
        embedded_text = embed(text_input)
        init_op = tf.group(
            [tf.global_variables_initializer(),
             tf.tables_initializer()])
        g.finalize()

    # Initialize session.
    session = tf.Session(graph=g)
    session.run(init_op)

    # Compute embeddings.
    embs_1, embs_2 = [], []
    for i in range(0, len(df), args.batch_size):
        embs_1.append(
            session.run(embedded_text,
                        feed_dict={
                            text_input: df.text_1.values[i:i + args.batch_size]
                        }))
        embs_2.append(
            session.run(embedded_text,
                        feed_dict={
                            text_input: df.text_2.values[i:i + args.batch_size]
                        }))
    embeddings1 = np.concatenate(embs_1)
    embeddings2 = np.concatenate(embs_2)

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)

    spearman_score = evaluator(embeddings1,
                               embeddings2,
                               labels=df["label"].values)
    print(f"Spearman: {spearman_score:.4f}")

    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)

    df["pred"] = preds
    df.to_csv(f"cache/{Path(args.filename).stem}_pred.csv", index=False)

    return preds, df["label"].values
Example #19
def getRecommendation(data):
    dataDict = json.loads(data)
    length = len(dataDict['pic'])
    weights = np.array([sum(col)/length for col in zip(*[pictureDict[int(picdata)]['weights'] for picdata in dataDict['pic']])]).reshape(1,-1)
    minHeap = []
    for restaurant, thatWeights in restaurantWeights.items():
        heapq.heappush(minHeap, (-paired_cosine_distances(weights, np.array(thatWeights).reshape(1,-1)), restaurant))
        if len(minHeap) > 10:
            heapq.heappop(minHeap)
    res = []
    while minHeap:
        res.append(heapq.heappop(minHeap)[1])
    return {'restaurants': [getElement(i) for i in res[::-1]] }
Example #20
    def get_dists(gen, fname):
        qs, ds, ixs = [], [], []
        for qdi in gen:
            qs.append(' '.join(qdi.q_terms))
            ds.append(' '.join(qdi.d_terms))
            ixs.append(qdi.ixs[:3])
        if len(qs) == 0:
            return
        qvecs = model.transform(qs)
        dvecs = model.transform(ds)

        dists = paired_cosine_distances(qvecs, dvecs)
        np.savez(fname, vals=np.hstack([ixs, np.array([dists]).T]))
Example #21
    def __call__(self,
                 model: 'SequentialSentenceEmbedder',
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)

        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        for step, batch in enumerate(iterator):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)

        try:
            cosine_scores = 1 - (paired_cosine_distances(
                embeddings1, embeddings2))
        except Exception as e:
            print(embeddings1)
            print(embeddings2)
            raise (e)

        mse_cosine = mean_squared_error(labels, cosine_scores)
        mae_cosine = mean_absolute_error(labels, cosine_scores)

        return (mse_cosine, mae_cosine)
Example #22
    def update(self, embeddings, labels, n, **kwargs):
        assert embeddings.shape[0] == 2
        scores = 1 - paired_cosine_distances(embeddings[0], embeddings[1])
        items = list(zip(scores, labels))
        sorted_items = sorted(items, key=lambda x: x[0], reverse=True)
        labels = [i[1] for i in items]
        scores = [i[0] for i in items]
        if self.return_predictions:
            self.all_predictions.extend(scores)
            self.all_labels.extend(labels)
        self.val = metrics.average_precision_score(labels, scores)
        self.sum += self.val * n
        self.count += n
        self.avg = self.sum / self.count
Example #23
    def kernel_function(self, x1, x2):
        features = []

        # linear kernel:
        # Cosine distance
        features += np.squeeze(1 -
                               pairwise.paired_cosine_distances(x1, x2)[0]),

        # Manhanttan distance
        features += pairwise.paired_manhattan_distances(x1, x2)[0],

        # Euclidean distance
        features += pairwise.paired_euclidean_distances(x1, x2)[0],

        # Chebyshev distance
        features += pairwise.pairwise_distances(x1, x2,
                                                metric="chebyshev")[0][0],

        # stat kernel:
        # Pearson coefficient
        pearson = stats.pearsonr(np.squeeze(np.asarray(x1)),
                                 np.squeeze(np.asarray(x2)))[0]
        features += 0 if np.isnan(pearson) else pearson,

        # Spearman coefficient
        spearman = stats.spearmanr(x1, x2, axis=1).correlation
        features += 0 if np.isnan(spearman) else spearman,

        # Kendall tau coefficient
        kendall = stats.kendalltau(x1, x2).correlation
        features += 0 if np.isnan(kendall) else kendall,

        # non-linear kernel:
        # polynomial
        features += pairwise.polynomial_kernel(x1, x2, degree=2)[0][0],

        # rbf
        features += pairwise.rbf_kernel(x1, x2)[0][0],

        # laplacian
        features += pairwise.laplacian_kernel(x1, x2)[0][0],

        # sigmoid
        features += pairwise.sigmoid_kernel(x1, x2)[0][0],

        return features
Example #24
def evaluate(wpairs, inv, M):
    A = np.full(shape=len(wpairs), fill_value=0.5)
    l = []  # list of vectors pairs
    f = []  # filter for known pairs
    for w1, w2 in wpairs:
        u = get_vec(w1, inv, M)
        v = get_vec(w2, inv, M)
        if u is None or v is None:
            f.append(False)
        else:
            f.append(True)
            l.append((u, v))
    print(f.count(False), 'missing results (use 0.5)', file=sys.stderr)
    I, J = map(vstack, zip(*l))
    dst = paired_cosine_distances(I, J)
    A[np.array(f)] = 1 - dst
    return A
Example #25
    def _get_vectors(self, df):
        def mean_vector(lemmatized_text):
            res = list([
                np.zeros(self.word2vec_vector_length),
            ])
            for word in lemmatized_text:
                try:
                    res.append(self.word2vec_model[word])
                except KeyError:
                    second_candidate = self._synonyms_vocabulary.get(word)
                    if second_candidate:
                        res.append(self.word2vec_model[second_candidate])
                    elif self.word2vec_stopwords and ('NOUN' in word
                                                      or 'VERB' in word):
                        # print(f'There is no "{word}" in vocabulary of the given model; ommited', file=sys.stderr)
                        pass

            mean = sum(np.array(res)) / (len(res) - 1 + 1e-25)
            return mean

        if not self.word2vec_stopwords:
            df.lemmas_x = df.lemmas_x.map(self._remove_stop_words)
            df.lemmas_y = df.lemmas_y.map(self._remove_stop_words)

        # Add the required UPoS postags (as in the rusvectores word2vec model's vocabulary)
        if self.word2vec_tag_required:
            df.lemmas_x = df.snippet_x_locs.map(self._tag_postags_morph)
            df.lemmas_y = df.snippet_y_locs.map(self._tag_postags_morph)

        # Make two dataframes with average vectors for x and y,
        # merge them with the original dataframe
        df_embed_x = df.lemmas_x.apply(mean_vector).values.tolist()
        df_embed_y = df.lemmas_y.apply(mean_vector).values.tolist()
        embeddings = pd.DataFrame(df_embed_x).merge(pd.DataFrame(df_embed_y),
                                                    left_index=True,
                                                    right_index=True)
        embeddings['cos_embed_dist'] = paired_cosine_distances(
            df_embed_x, df_embed_y)
        embeddings['eucl_embed_dist'] = paired_euclidean_distances(
            df_embed_x, df_embed_y)
        df = pd.concat(
            [df.reset_index(drop=True),
             embeddings.reset_index(drop=True)],
            axis=1)

        return df
Example #26
    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
        embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

        cosine_scores = 1-paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)


        labels = np.asarray(self.labels)

        file_output_data = [epoch, steps]

        for s1, s2 in zip(self.sentences1, self.sentences2):
            print(s1, s2)
            output = model([s1, s2])

        main_score = None
        for name, scores, reverse in [['Cosine-Similarity', cosine_scores, True], ['Manhatten-Distance', manhattan_distances, False], ['Euclidean-Distance', euclidean_distances, False]]:
            acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
            f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
            ap = average_precision_score(labels, scores * (1 if reverse else -1))

            logger.info("Accuracy with {}:           {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold))
            logger.info("F1 with {}:                 {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
            logger.info("Precision with {}:          {:.2f}".format(name, precision * 100))
            logger.info("Recall with {}:             {:.2f}".format(name, recall * 100))
            logger.info("Average Precision with {}:  {:.2f}\n".format(name, ap * 100))

            file_output_data.extend([acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

            if main_score is None: #Use AveragePrecision with Cosine-Similarity as main score
                main_score = ap
        return main_score
Example #27
def main(args):

    model = SentenceTransformer(args.model_name)

    if args.device == 'cuda' and torch.cuda.is_available():
        model.cuda()

    ids = []
    src_sentences = []
    tgt_sentences = []
    programs = []

    with open(args.input_file, 'r') as fin:
        for i, line in enumerate(fin):
            row = list(map(lambda part: part.strip(), line.split('\t')))
            ids.append(row[0])
            src_sentences.append(row[1])
            tgt_sentences.append(row[2])
            if len(row) > 3:
                programs.append(row[3])

            if args.subsample != -1 and i >= args.subsample:
                break

    embeddings1 = model.encode(src_sentences,
                               batch_size=args.batch_size,
                               show_progress_bar=True,
                               convert_to_numpy=True)
    embeddings2 = model.encode(tgt_sentences,
                               batch_size=args.batch_size,
                               show_progress_bar=True,
                               convert_to_numpy=True)

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

    with open(args.output_file, 'w') as fout:
        for i in range(len(ids)):
            id_, src, tgt, score = ids[i], src_sentences[i], tgt_sentences[
                i], cosine_scores[i]
            prog = None
            if programs:
                prog = programs[i]
            fout.write('\t'.join([
                id_, src, tgt, '{:0.4f}'.format(score), prog if prog else ''
            ]) + '\n')
Example #28
def test_neuron_cosine_sim(X_neuron, adv_sig, neuron_mask=None):
    nb_sample = X_neuron.shape[0]

    # neuron_mask_expand = np.expand_dims(neuron_mask, axis=0)
    # neuron_mask_repeat = np.repeat(neuron_mask_expand, nb_sample, axis=0)

    adv_sig_repeat = np.expand_dims(adv_sig, axis=0)
    adv_sig_repeat = np.repeat(adv_sig_repeat, nb_sample, axis=0)
    adv_sig_flatten = np.reshape(adv_sig_repeat, (nb_sample, -1))

    X_neuron_mask = X_neuron
    X_flatten = np.reshape(X_neuron_mask, (nb_sample, -1))

    cosine_sim = 1 - paired_cosine_distances(X_flatten, adv_sig_flatten)

    # print(list(np.percentile(cosine_sim, [0, 5, 25, 50, 75, 95, 100])))

    return cosine_sim
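A shape-level sketch of calling test_neuron_cosine_sim above with hypothetical activations (numpy and paired_cosine_distances are assumed to be imported where the function is defined); each sample is flattened and compared against the repeated reference signal.

import numpy as np

X_neuron = np.random.rand(8, 4, 4)   # 8 samples of a 4x4 activation map
adv_sig = np.random.rand(4, 4)       # one reference activation pattern

sims = test_neuron_cosine_sim(X_neuron, adv_sig)
print(sims.shape)                    # (8,): one cosine similarity per sample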
Example #29
    def predict(self, data_df):
        if "text_a" in data_df.columns and "text_b" in data_df.columns:
            if self.args.do_lower_case:
                data_df.loc[:, 'text_a'] = data_df['text_a'].str.lower()
                data_df.loc[:, 'text_b'] = data_df['text_b'].str.lower()

            # Compute embedding for text pairs
            embeddings1 = self.model.encode(list(data_df["text_a"]),
                                            convert_to_numpy=True)
            embeddings2 = self.model.encode(list(data_df["text_b"]),
                                            convert_to_numpy=True)

            cosine_scores = 1 - (paired_cosine_distances(
                embeddings1, embeddings2))
            return cosine_scores
        else:
            raise KeyError(
                'Prediction data processing - Required columns not found!')
Example #30
    def predict(self, to_predict, verbose=True):
        sentences1 = []
        sentences2 = []

        for text_1, text_2 in to_predict:
            sentences1.append(text_1)
            sentences2.append(text_2)

        embeddings1 = self.encode(sentences1,
                                  show_progress_bar=verbose,
                                  convert_to_numpy=True)
        embeddings2 = self.encode(sentences2,
                                  show_progress_bar=verbose,
                                  convert_to_numpy=True)

        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

        return cosine_scores
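Common to nearly all of the examples above is the idiom 1 - paired_cosine_distances(a, b). A minimal self-contained sketch of what the function itself computes, namely the row-wise cosine distance between two equally shaped matrices:

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

a = np.array([[1.0, 0.0], [1.0, 1.0]])
b = np.array([[0.0, 1.0], [2.0, 2.0]])

dist = paired_cosine_distances(a, b)   # shape (2,), values in [0, 2]
print(dist)                            # approximately [1.0, 0.0]
print(1 - dist)                        # cosine similarities, approximately [0.0, 1.0]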