Пример #1
0
    def transform(self, X):
        """Encode both headlines of each row as bigram-cluster indicators.

        The result has one row per row of ``X`` and two equally wide column
        halves: the first half is filled from ``claimHeadline`` and the
        second from ``articleHeadline``. A cell is set to 1 when
        ``_get_bigram_clusters`` yields the key that maps to that column.

        Args:
            X: Object supporting ``len()`` and ``iterrows()`` whose rows
                expose ``claimHeadline`` and ``articleHeadline``
                (presumably a pandas DataFrame -- confirm against callers).

        Returns:
            A ``dok_matrix`` of dtype ``float32`` with shape
            ``(len(X), 2 * len(index))`` where ``index`` is the second value
            returned by ``get_brown_cluster_data``.
        """
        cluster_lookup, column_of = get_brown_cluster_data(self.cluster_size)

        half_width = len(column_of.values())
        features = dok_matrix((len(X), half_width * 2), dtype=np.float32)

        def flag_clusters(row, headline, half=0):
            # half == 1 shifts the column into the second half of the matrix.
            # NOTE(review): a key missing from column_of would raise here --
            # presumably the helper only yields indexed keys; confirm.
            for key in _get_bigram_clusters(headline, cluster_lookup):
                column = column_of[key]
                features[row, column + (half_width * half)] = 1

        row = 0
        for _, record in X.iterrows():
            flag_clusters(row, record.claimHeadline)
            flag_clusters(row, record.articleHeadline, 1)
            row += 1

        return features
Пример #2
0
    def transform(self, X):
        """Return a sparse 0/1 matrix of bigram-cluster features per row.

        Columns are laid out as two consecutive halves of identical width.
        Keys produced by ``_get_bigram_clusters`` for ``claimHeadline`` set
        cells in the first half; keys produced for ``articleHeadline`` set
        cells in the second half.

        Args:
            X: Object supporting ``len()`` and ``iterrows()`` whose rows
                expose ``claimHeadline`` and ``articleHeadline``
                (presumably a pandas DataFrame -- confirm against callers).

        Returns:
            A ``float32`` ``dok_matrix`` with ``len(X)`` rows and twice as
            many columns as there are entries in the index returned by
            ``get_brown_cluster_data``.
        """
        clusters, index = get_brown_cluster_data(self.cluster_size)

        width = len(index.values())
        out = dok_matrix((len(X), width * 2), dtype=np.float32)

        # (row attribute, which half of the columns it is written to);
        # the claim headline is handled before the article headline.
        sources = (("claimHeadline", 0), ("articleHeadline", 1))

        for row_no, (_, row) in enumerate(X.iterrows()):
            for field, half in sources:
                text = getattr(row, field)
                found = _get_bigram_clusters(text, clusters)
                for key in found:
                    col = index[key]
                    out[row_no, col + (width * half)] = 1

        return out
Пример #3
0
    def transform(self, X):
        """Build sparse indicators for cluster pairs across the two headlines.

        For each row, every (article token, claim token) combination is
        looked up in the token-to-cluster mapping. When both tokens have a
        cluster, the column indexed by the ``(article_cluster,
        claim_cluster)`` tuple is set to 1 for that row.

        Args:
            X: Object supporting ``len()`` and ``iterrows()`` whose rows
                expose ``claimHeadline`` and ``articleHeadline``
                (presumably a pandas DataFrame -- confirm against callers).

        Returns:
            A ``float32`` ``dok_matrix`` with one row per row of ``X`` and
            one column per entry of the pair index returned by
            ``get_brown_cluster_data``.
        """
        cluster_of, pair_column = get_brown_cluster_data(self.cluster_size)
        n_columns = len(pair_column.values())
        features = dok_matrix((len(X), n_columns), dtype=np.float32)

        for row, (_, record) in enumerate(X.iterrows()):
            claim_tokens = get_tokenized_lemmas(record.claimHeadline)
            article_tokens = get_tokenized_lemmas(record.articleHeadline)

            # Article token comes first in each pair, matching the key order
            # used to index pair_column below.
            for article_tok, claim_tok in it.product(article_tokens, claim_tokens):
                article_cluster = cluster_of.get(article_tok)
                claim_cluster = cluster_of.get(claim_tok)
                # Tokens without a cluster contribute no feature.
                if article_cluster is not None and claim_cluster is not None:
                    column = pair_column[(article_cluster, claim_cluster)]
                    features[row, column] = 1

        return features
Пример #4
0
    def transform(self, X):
        """Mark, per row, which headline cluster pairs are present.

        Tokens of ``articleHeadline`` are crossed with tokens of
        ``claimHeadline``. Each token is mapped to a cluster via ``.get``;
        pairs where either lookup returns ``None`` are skipped. The
        remaining ``(article_cluster, claim_cluster)`` tuples select the
        column that is set to 1.

        Args:
            X: Object supporting ``len()`` and ``iterrows()`` whose rows
                expose ``claimHeadline`` and ``articleHeadline``
                (presumably a pandas DataFrame -- confirm against callers).

        Returns:
            A ``float32`` ``dok_matrix`` of shape
            ``(len(X), len(index))`` where ``index`` is the second value
            returned by ``get_brown_cluster_data``.
        """
        clusters, index = get_brown_cluster_data(self.cluster_size)
        shape = (len(X), len(index.values()))
        out = dok_matrix(shape, dtype=np.float32)

        for row_no, (_, row) in enumerate(X.iterrows()):
            claim = get_tokenized_lemmas(row.claimHeadline)
            article = get_tokenized_lemmas(row.articleHeadline)

            for token_pair in it.product(article, claim):
                # Both lookups are always performed, article token first.
                cluster_pair = tuple(clusters.get(token) for token in token_pair)
                if any(cluster is None for cluster in cluster_pair):
                    continue

                out[row_no, index[cluster_pair]] = 1

        return out