Example #1 (score: 0)
File: clusters.py — project: zxlzr/spikex
def cluster_chunks(
    chunks: List[Span],
    stopwords: bool = False,
    filter_pos: List[str] = None,
    min_score: float = None,
):
    """
    Cluster chunks by using a revisited **Radial Ball Mapper** algorithm.

    Parameters
    ----------
    chunks : List[Span]
        Chunks to cluster.
    stopwords : bool, optional
        Flag to exclude stopwords from chunks, by default False.
    filter_pos : List[str], optional
        POS tags to filter chunk words, by default None.
    min_score : float, optional
        Threshold for clustering chunks, by default None.

    Returns
    -------
    List[List[Span]]
        Clusters of chunks; empty when no chunk yields a usable vector.
    """
    key2index, key2vector = _map_key_to_vector(chunks, stopwords, filter_pos)
    if not key2index or not key2vector:
        # FIX: previously returned None here, contradicting the documented
        # List[List[Span]] return type; an empty list keeps truthiness
        # identical while honoring the contract.  This guard also protects
        # the chunks[0] access below from an IndexError on empty input.
        return []
    model = KeyedVectors(chunks[0].vector.size)
    keys = list(key2vector.keys())
    weights = list(key2vector.values())
    model.add_vectors(keys, weights)
    clusters = cluster_balls_multi(model, keys, min_score=min_score)
    # Map each clustered key back to the chunk it came from.
    return [[chunks[key2index[i]] for i in cluster] for cluster in clusters]
Example #2 (score: 0)
 def to_gensim_model(self):
     """Convert the ``wv`` mapping into a gensim ``KeyedVectors`` model."""
     from gensim.models import KeyedVectors

     words = list(self.wv)
     vectors = [self.wv[w] for w in words]
     model = KeyedVectors(self.hiddenSize)
     model.add_vectors(words, vectors)
     return model
Example #3 (score: 0)
 def gensim_wv(self):
     """Lazily build (and cache) a gensim ``KeyedVectors`` view of ``self``."""
     if self.gensim_result is not None:
         # Already converted once — reuse the cached model.
         return self.gensim_result
     from gensim.models import KeyedVectors

     words = list(self.keys())
     vectors = list(self.values())
     model = KeyedVectors(self.hiddenSize)
     model.add_vectors(words, vectors)
     self.gensim_result = model
     return model
Example #4 (score: 0)
def test_cluster_balls(nlp):
    """Smoke-test cluster_balls with and without an explicit root key."""
    docs = [nlp(word) for word in ("apple", "pear", "orange", "lemon")]
    keys = [doc.text.lower() for doc in docs]
    vectors = [doc.vector for doc in docs]
    model = KeyedVectors(vectors[0].size)
    model.add_vectors(keys, vectors)
    print(cluster_balls(model))  # is not None  # no root
    print(cluster_balls(model, root="orange"))  # with root
Example #5 (score: 0)
 def to_gensim_model(self):
     """Convert the ``wv`` attribute into a gensim ``KeyedVectors`` model.

     Raises
     ------
     AttributeError
         If the installed gensim lacks ``add_vectors`` (gensim < 4.0.1).
     """
     vocabList = list(self.wv.keys())
     weights = list(self.wv.values())
     from gensim.models import KeyedVectors
     gensim_w2v = KeyedVectors(self.hiddenSize)
     try:
         gensim_w2v.add_vectors(vocabList, weights)
     except AttributeError as err:
         # FIX: chain the original exception (``from err``) so the
         # underlying gensim error stays visible in the traceback instead
         # of being replaced silently.
         raise AttributeError(str(err) + \
         "\nPlease ensure gensim >= 4.0.1 is installed!") from err
     return gensim_w2v
Example #6 (score: 0)
    def handle(self, output_file, debug_output_file, **options):
        """Build and save a KeyedVectors file of definition vectors.

        For each Definition that has no auto-translation source, extract
        keyed words from its semantic definition, sum their Google-News
        vectors, and save the resulting KeyedVectors to ``output_file``.
        One JSON line per definition is written to ``debug_output_file``.
        """
        logger.info("Building definition vectors")

        # Skip definitions that are themselves auto-translations.
        definitions = Definition.objects.filter(
            auto_translation_source_id__isnull=True).prefetch_related(
                "wordform__lemma")

        # Total used only for the tqdm progress bar below.
        count = definitions.count()

        news_vectors = google_news_vectors()

        definition_vector_keys = []
        definition_vector_vectors = []

        # Accumulates words missing from the news-vector vocabulary
        # (filled in by extract_keyed_words — presumably for reporting;
        # TODO confirm downstream use).
        unknown_words = set()

        with create_debug_output(debug_output_file) as debug_output:
            for d in tqdm(definitions.iterator(), total=count):
                keys = extract_keyed_words(d.semantic_definition, news_vectors,
                                           unknown_words)
                # Emit one JSON record per definition for offline inspection.
                debug_output(
                    json.dumps(
                        {
                            "definition": d.text,
                            "wordform_text": d.wordform.text,
                            "extracted_keys": keys,
                        },
                        ensure_ascii=False,
                    ))
                if keys:
                    # Aggregate vector for this definition's key words;
                    # definitions with no extracted keys are skipped.
                    vec_sum = vector_for_keys(news_vectors, keys)

                    definition_vector_keys.append(definition_to_cvd_key(d))
                    definition_vector_vectors.append(vec_sum)

            definition_vectors = KeyedVectors(
                vector_size=news_vectors.vector_size)
            definition_vectors.add_vectors(definition_vector_keys,
                                           definition_vector_vectors)
            # Ensure the target directory exists before writing the model.
            output_file.parent.mkdir(exist_ok=True)
            definition_vectors.save(fspath(output_file))
Example #7 (score: 0)
    def as_keyed_vectors(self) -> KeyedVectors:
        """
        Generate a KeyedVectors instance with an embedding for every
        unordered pair of keys (combinations with replacement).

        :return: Edge embeddings
        """

        edge_generator = combinations_with_replacement(getattr(
            self.kv, self.INDEX_MAPPING_KEY),
                                                       r=2)

        if not self.quiet:
            vocab_size = len(getattr(self.kv, self.INDEX_MAPPING_KEY))
            # FIX: number of pairs is C(n + 1, 2) = n * (n + 1) / 2.  The
            # closed form replaces the previous factorial-based reduce()
            # computation, which built enormous bignum intermediates and
            # produced an inexact float via true division.
            total_size = vocab_size * (vocab_size + 1) // 2

            edge_generator = tqdm(edge_generator,
                                  desc='Generating edge features',
                                  total=total_size)

        # Generate features
        tokens = []
        features = []
        for edge in edge_generator:
            # Canonical, order-independent token for the pair.
            token = str(tuple(sorted(edge)))
            embedding = self._embed(edge)

            tokens.append(token)
            features.append(embedding)

        # Build KV instance; gensim renamed add() to add_vectors() in 4.0.
        # FIX: compare parsed versions, not raw strings — lexicographic
        # string comparison breaks on multi-digit components ("10" < "4").
        edge_kv = KeyedVectors(vector_size=self.kv.vector_size)
        gensim_version = pkg_resources.get_distribution("gensim").version
        if pkg_resources.parse_version(gensim_version) < \
                pkg_resources.parse_version('4.0.0'):
            edge_kv.add(entities=tokens, weights=features)
        else:
            edge_kv.add_vectors(keys=tokens, weights=features)

        return edge_kv
Example #8 (score: 0)
    print("Loading vectors in gensim...")
    model = KeyedVectors.load_word2vec_format(args.filename)

    # Without --compress, just evaluate the original vectors and exit.
    if not args.compress:
        compute_accuracy(model)
        exit(0)

    embeddings = model.vectors
    size = embeddings.nbytes  # original footprint, baseline for reports

    print("Reduce dimensions using PCA...")
    embeddings = reduce_dimensions_pca(embeddings)
    new_size = embeddings.nbytes
    print("Size reduction: {:f}%".format((size - new_size) * 100 / size))

    print("Compress embeddings using product quantization...")
    embeddings, codes, centroids = product_quantize(embeddings)

    # Compressed representation = quantization codes + centroid codebook.
    new_size = codes.nbytes + centroids.nbytes
    print("Size reduction: {:f}%".format((size-new_size)*100/size))

    # Rebuild a KeyedVectors model from the compressed embeddings so
    # accuracy can be measured the same way as for the originals.
    words = [model.index_to_key[idx] for idx in range(len(embeddings))]
    model = KeyedVectors(vector_size=embeddings.shape[1])
    model.add_vectors(words, embeddings, replace=True)
    compute_accuracy(model)

    save_model('generated', embeddings.shape[1], words, codes, centroids)