# NOTE: these snippets use the pre-4.0 gensim KeyedVectors API
# (`vocab`, `index2word`, `vectors_norm`); gensim >= 4.0 replaced these
# with `key_to_index` / `index_to_key`.
import numpy as np
from gensim.models import KeyedVectors


def restrict(model: KeyedVectors):
    """Restrict a ConceptNet Numberbatch model to single-word English terms,
    stripping the '/c/en/' prefix from the surviving keys."""
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    new_vectors_norm = []
    for i in range(len(model.vocab)):
        if i % 10000 == 0:
            print(i)  # progress indicator
        word = model.index2entity[i]
        vec = model.vectors[i]
        vocab = model.vocab[word]
        vec_norm = model.vectors_norm[i]
        # Keep only English, single-word entries, e.g. '/c/en/apple' -> 'apple'
        if word.startswith('/c/en/') and '_' not in word:
            word = word[6:]
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
            new_vectors_norm.append(vec_norm)
    model.vocab = new_vocab
    model.vectors = np.array(new_vectors)
    model.index2entity = np.array(new_index2entity)
    model.index2word = np.array(new_index2entity)
    model.vectors_norm = np.array(new_vectors_norm)
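# Hedged usage sketch for restrict(): the file name below is illustrative, and
# this assumes gensim < 4.0. init_sims() must run first so that
# model.vectors_norm exists before restrict() reads it.
model = KeyedVectors.load_word2vec_format('numberbatch-en.txt')  # hypothetical path
model.init_sims()
restrict(model)
print(model.most_similar('apple'))  # keys are now bare English words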
def __init_model_out(self):
    """Create KeyedVectors for w_OUT."""
    model_out = KeyedVectors(self.vector_size)
    # Share the vocabulary with the input model, but point the vectors at the
    # output (context) weight matrix learned by negative sampling.
    model_out.vocab = self.model_in.wv.vocab
    model_out.index2word = self.model_in.wv.index2word
    model_out.vectors = self.model_in.trainables.syn1neg
    return model_out
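# Hedged context for __init_model_out(): `self.model_in` is assumed to be a
# trained gensim 3.x Word2Vec model using negative sampling, whose output
# ("context") weight matrix lives in `model_in.trainables.syn1neg`. The
# returned KeyedVectors answers the usual queries against the w_OUT space:
#
#     model_out = self.__init_model_out()
#     model_out.most_similar('paris')  # neighbours measured among OUT vectors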
def create_keyed_vector(old_keyed_vector, new_matrix):
    """Wrap `new_matrix` in a KeyedVectors that reuses an existing vocabulary."""
    vector_size = new_matrix.shape[1]
    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = old_keyed_vector.vocab
    keyed_vector.index2word = old_keyed_vector.index2word
    keyed_vector.vectors = new_matrix
    # One row per vocabulary entry, in the same order as the old model
    assert (len(old_keyed_vector.vocab), vector_size) == keyed_vector.vectors.shape
    return keyed_vector
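# Hypothetical use of create_keyed_vector(): re-wrap a transformed copy of an
# embedding matrix (simple L2 normalisation here, purely illustrative; the
# load path is made up) so it stays queryable under the old vocabulary:
old = KeyedVectors.load('embeddings.kv')
normalised = old.vectors / np.linalg.norm(old.vectors, axis=1, keepdims=True)
new = create_keyed_vector(old, normalised)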
from gensim.models.keyedvectors import Vocab  # gensim 3.x
from tqdm import tqdm


def load_emb(fp):
    """Load a word2vec-format text file from an open handle into KeyedVectors."""
    n_vocab, dim = map(int, fp.readline().split())  # header: "<count> <dim>"
    emb = KeyedVectors(dim)
    emb.vectors = np.empty((n_vocab, dim), dtype=np.float32)
    for i, line in tqdm(enumerate(fp), total=n_vocab, unit='word'):
        word, vec_str = line.split(' ', 1)
        # np.fromstring(vec_str, sep=' ') is deprecated; parse explicitly
        emb.vectors[i] = np.array(vec_str.split(), dtype=np.float32)
        emb.vocab[word] = Vocab(index=i)
        emb.index2word.append(word)
    return emb
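# Usage sketch for load_emb(): it expects an already-open text handle whose
# first line is the word2vec header; the file name is illustrative:
with open('embeddings.vec', encoding='utf-8') as fp:
    emb = load_emb(fp)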
from operator import itemgetter


def __create_keyed_vector(matrix, orig_vocab):
    """Build a KeyedVectors from an embedding matrix and a vocabulary object
    exposing `token2id` and `word_freq`."""
    vocab = dict()
    index_to_word = []
    # Iterate words in id order so row i of `matrix` lines up with word id i
    for word, word_id in sorted(orig_vocab.token2id.items(), key=itemgetter(1)):
        index_to_word.append(word)
        vocab[word] = Vocab(index=word_id, count=orig_vocab.word_freq[word_id])
    vector_size = matrix.shape[1]
    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = vocab
    keyed_vector.index2word = index_to_word
    keyed_vector.vectors = matrix
    assert (len(vocab), vector_size) == keyed_vector.vectors.shape
    return keyed_vector
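# Hedged sketch of the vocabulary object __create_keyed_vector() expects:
# anything with a `token2id` dict and a `word_freq` mapping keyed by word id.
# The class below is hypothetical, just to make the contract concrete:
class OrigVocab:
    def __init__(self, token2id, word_freq):
        self.token2id = token2id    # e.g. {'the': 0, 'cat': 1}
        self.word_freq = word_freq  # e.g. {0: 1204, 1: 87}

kv = __create_keyed_vector(np.random.rand(2, 50).astype(np.float32),
                           OrigVocab({'the': 0, 'cat': 1}, {0: 1204, 1: 87}))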
# The converter below mirrors pymagnitude's converter module. Standard-library
# and third-party imports are listed here; helpers such as eprint,
# fast_md5_file, try_deleting, char_ngrams, entropy, and the constants
# (DEFAULT_PRECISION, DEFAULT_NGRAM_BEG, DEFAULT_NGRAM_END, BOW, EOW,
# SQLITE_TOKEN_SPLITTERS, CONVERTER_VERSION) are assumed to be defined
# elsewhere in that module.
import io
import os
import sqlite3
import tempfile
from collections import Counter
from functools import partial
from itertools import tee

import lz4.frame
import numpy as np
from annoy import AnnoyIndex

try:  # Python 2 iterators; fall back to the Python 3 builtins
    from itertools import imap, izip
except ImportError:
    imap, izip = map, zip


def convert(input_file_path, output_file_path=None,
            precision=DEFAULT_PRECISION, subword=False,
            subword_start=DEFAULT_NGRAM_BEG, subword_end=DEFAULT_NGRAM_END,
            approx=False, approx_trees=None, vocab_path=None):
    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.magnitude')
        if os.path.isfile(output_file_path):
            try:
                # Probe the format table; if this succeeds the cached file
                # already exists and is functioning
                conn = sqlite3.connect(output_file_path)
                db = conn.cursor()
                db.execute(
                    "SELECT value FROM magnitude_format WHERE key='size'") \
                    .fetchall()[0][0]
                conn.close()
                return output_file_path
            except BaseException:
                pass

    # Check args
    meta_1_path = None
    meta_2_path = None
    input_is_text = input_file_path.endswith('.txt') or \
        input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    input_is_hdf5 = input_file_path.endswith('.hdf5')
    input_is_hdf5_weights = input_file_path.endswith('_weights.hdf5')
    if not input_is_text and not input_is_binary and not input_is_hdf5:
        exit("The input file path must be `.txt`, `.bin`, `.vec`, or `.hdf5`")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path must be `.magnitude`")
    if vocab_path and not vocab_path.endswith(".magnitude"):
        exit("The vocab file path must be `.magnitude`")

    # Detect ELMo and ELMo options file
    input_is_elmo = False
    elmo_options_path = None
    if input_is_hdf5:
        elmo_options_path = input_file_path[0:-13] + '_options.json' \
            if input_is_hdf5_weights else input_file_path[0:-5] + '.json'
        if not os.path.isfile(elmo_options_path):
            exit("Expected `" + elmo_options_path +
                 "` to exist. ELMo models require a JSON options file.")
        input_is_elmo = True
        meta_1_path = input_file_path
        meta_2_path = elmo_options_path

    # Detect GloVe format (no header line) and convert to word2vec if detected
    detected_glove = False
    if input_is_text:
        with io.open(input_file_path, mode="r", encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline()
                if line == '':  # guard against files with fewer than 2 lines
                    exit("The input file must contain at least two lines.")
                line = line.strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
            line1 = line1.replace('\t', ' ').split()
            line2 = line2.replace('\t', ' ').split()
            if len(line1) == len(line2):
                # No header line present
                detected_glove = True
    if detected_glove:
        eprint("Detected GloVe format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.mkdtemp(),
            os.path.basename(input_file_path) + '.txt')
        try:
            from gensim.scripts.glove2word2vec import glove2word2vec
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
(`pip install gensim`) to convert GloVe files.")
        glove2word2vec(input_file_path, temp_file_path)
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            # FastText (not KeyedVectors) is what this branch actually uses
            from gensim.models import FastText
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
(`pip install gensim`) to convert binary files.")
        keyed_vectors = FastText.load_fasttext_format(input_file_path)
        number_of_keys = len(keyed_vectors.wv.vectors)
        dimensions = len(keyed_vectors.wv.vectors[0])
    elif input_is_text:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            number_of_keys, dimensions = (None, None)
            f = io.open(input_file_path, mode="r", encoding="utf-8",
                        errors="ignore")
            first_line = True
            for line in f:
                line_split = line.strip().replace('\t', ' ').split()
                if len(line_split) == 0:
                    continue
                if first_line:
                    first_line = False
                    number_of_keys = int(line_split[0])
                    dimensions = int(line_split[1])
                    yield (number_of_keys, dimensions)
                else:
                    # Handle empty keys and keys that contain spaces
                    empty_key = len(line_split) == dimensions
                    vec_floats = line_split if empty_key else line_split[1:]
                    key = "" if empty_key else line_split[0]
                    if len(vec_floats) > dimensions:
                        key = " ".join(
                            [key] + vec_floats[0:len(vec_floats) - dimensions])
                        vec_floats = vec_floats[len(vec_floats) - dimensions:]
                    vector = np.asarray([float(elem) for elem in vec_floats])
                    yield (key, vector)

        keyed_vectors = KeyedVectors()
        keyed_vectors.wv = keyed_vectors  # shim so `.wv` works on all branches
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    else:
        class KeyedVectors:
            pass
        keyed_vectors = KeyedVectors()
        keyed_vectors.wv = keyed_vectors  # shim so `.wv` works on all branches
        number_of_keys = 0
        dimensions = 0
        keyed_vectors.vectors = []
        keyed_vectors.index2word = []

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Temporarily re-direct the output to a tmp file
    output_file_path_tmp = output_file_path + '.tmp'
    output_file_path_orig = output_file_path
    output_file_path = output_file_path_tmp

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Connect to magnitude datastore
    conn = sqlite3.connect(output_file_path)
    db = conn.cursor()

    # Make the database fast
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ + ",\n".join([("dim_%d INTEGER" % i)
                              for i in range(dimensions)]) +
               ",\nmagnitude REAL" + """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword`
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)
    metas = [('meta_1', meta_1_path), ('meta_2', meta_2_path)]
    for meta_name, meta_path in metas:
        if meta_path:
            db.execute("""
                CREATE TABLE `magnitude_""" + meta_name + """` (
                    meta_file BLOB
                );
            """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
        ",\n".join([("dim_%d" % i) for i in range(dimensions)]) + \
        ",\nmagnitude" + """)
        VALUES (
            """ + \
        (",\n".join(["?"] * (dimensions + 2))) + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        )
        VALUES (
            ?, ?
        );
    """
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = izip(keyed_vectors.wv.index2word,
                                keyed_vectors.wv.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        if i % 100000 == 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        magnitude = np.linalg.norm(vector)
        vector = vector / magnitude
        # Zero vectors divide to NaN; replace them with a tiny random epsilon
        epsilon = np.random.choice(
            [-1.0 / (10 ** precision), 1.0 / (10 ** precision)], dimensions)
        vector = epsilon if np.isnan(vector).any() else vector
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query,
                   (key,) +
                   tuple(int(round(v * (10 ** precision))) for v in vector) +
                   (float(magnitude),))  # noqa
        if subword:
            ngrams = set(
                (n.lower() for n in char_ngrams(
                    BOW + key + EOW, subword_start, subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set(
                (n for n in ngrams
                 if not any([c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query,
                       (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Write metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        )
        VALUES (
            ?, ?
        );
    """
    db.execute(insert_format_query, ('version', CONVERTER_VERSION))
    db.execute(insert_format_query, ('elmo', input_is_elmo))
    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indices
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600  # 100 MiB
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file)
            VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(
                    iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    # Write compressed meta files (e.g. the ELMo weights and options)
    for meta_name, meta_path in metas:
        if not meta_path:
            continue
        eprint("Compressing meta file... (this may take some time)")
        chunk_size = 104857600  # 100 MiB
        full_size = os.path.getsize(meta_path)
        insert_meta_query = """
            INSERT INTO magnitude_""" + meta_name + """(meta_file)
            VALUES (?);
        """
        with open(meta_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(
                    iter(partial(ifh.read, chunk_size), b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_meta_query, (sqlite3.Binary(chunk),))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_meta_query, (sqlite3.Binary(chunk),))

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Calculate max duplicate keys
    eprint("Finding duplicate keys... (this may take some time)")
    duplicate_keys_query = db.execute("""
        SELECT MAX(key_count)
        FROM (
            SELECT COUNT(key) AS key_count
            FROM magnitude
            GROUP BY key
        );
    """).fetchall()
    max_duplicate_keys = (
        duplicate_keys_query[0][0]
        if duplicate_keys_query[0][0] is not None else 1)  # noqa
    eprint("Found %d as the maximum number of duplicate key(s)" %
           max_duplicate_keys)
    db.execute(insert_format_query,
               ('max_duplicate_keys', max_duplicate_keys))

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Rename the temporary output to the real output
    os.rename(output_file_path, output_file_path_orig)
    output_file_path = output_file_path_orig

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))
    return output_file_path
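# Example invocation (hedged): the paths and options below are illustrative.
# This mirrors how pymagnitude's converter is typically driven (it also ships
# a CLI: `python -m pymagnitude.converter -i <input> -o <output>`):
out_path = convert('glove.6B.300d.txt',
                   output_file_path='glove.6B.300d.magnitude',
                   subword=True,   # store char-ngrams for OOV lookups
                   approx=False)   # skip building the Annoy ANN index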