# Example #1
# Project unigram vectors and composed phrase vectors into one shared SVD space.
#
# A TruncatedSVD is fitted on the unigram matrix; every chunk of composed
# phrase vectors is then vectorized with the unigram columns and passed
# through that same fitted SVD, so both end up with DIMS columns.
#
# Parameters:
#   composed_dir -- directory that is globbed for "*apt.vec.gz" files
#   unigrams     -- handed to Vectors.from_tsv (presumably a path to a TSV
#                   vectors file -- confirm against the caller); the name is
#                   rebound to the loaded Vectors object inside the function
#   output       -- prefix interpolated into the "%s-unigrams-SVD%d" and
#                   "%s-phrases-chunk%d-SVD%d" names
#   workers      -- passed as n_jobs to Parallel (presumably joblib)
#   chunk_size   -- first argument to grouper, i.e. files handled per chunk
#
# Side effects: overwrites the class attribute DocumentFeature.ngram_separator
# and replaces unigrams.v.vocabulary_. No return statement is visible here.
#
# NOTE(review): this listing looks truncated by extraction -- several call
# openers and one `if` body are missing (marked below), so the block does not
# parse as shown. Restore them from the upstream source before running.
def merge_vectors(composed_dir, unigrams, output, workers=4, chunk_size=10000):
    # This particular dataset uses spaces instead of underscores. State this to avoid parsing issues.
    # The assignment is on the class itself, so it stays in effect after this function returns.
    DocumentFeature.ngram_separator = " "
    DIMS = 100  # SVD dimensionality

    files = glob(os.path.join(composed_dir, "*apt.vec.gz"))
    logging.info("Found %d composed phrase files", len(files))

    # Ignore stuff that isn't unigrams, it will cause problems later.
    unigrams = Vectors.from_tsv(unigrams, row_filter=lambda x, y: y.type == "1-GRAM")
    logging.info("Found %d unigram vectors", len(unigrams))

    mat, cols, rows = unigrams.to_sparse_matrix()
    # Pin the vectorizer's column order to `cols` so that transform() in the
    # loop below emits columns in the same order as `mat`.
    # (presumably unigrams.v is a scikit-learn DictVectorizer -- confirm)
    unigrams.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
    # A set makes the membership test in the column_filter lambda cheap.
    cols = set(cols)
    # Fixed random_state keeps the decomposition reproducible between runs.
    svd = TruncatedSVD(DIMS, random_state=0)
    logging.info("Reducing dimensionality of matrix of shape %r...", mat.shape)
    start = time.time()
    reduced_mat = svd.fit_transform(mat)
    # NOTE(review): the opening of this call is missing -- presumably
    # `logging.info(`; its closing parentheses are missing too.
        "Reduced using {} from shape {} to shape {} in {} seconds".format(
            svd, mat.shape, reduced_mat.shape, time.time() - start
    # NOTE(review): the next two lines are trailing arguments (feature names,
    # output name) of a call whose opening line is missing; it presumably
    # writes reduced_mat to disk -- the callee is not visible in this listing.
        ["SVD:feat{0:03d}".format(i) for i in range(reduced_mat.shape[1])],
        "%s-unigrams-SVD%d" % (output, DIMS),
    # The full unigram matrix is no longer needed; drop the reference before
    # the chunk loop starts allocating.
    del mat

    for i, chunk in enumerate(grouper(chunk_size, files)):
        # phrase -> features, for this chunk only
        d = {}
        logging.info("Reading composed vectors, chunk %d...", i)
        # `if f` skips falsy entries (presumably padding that grouper adds to
        # the last chunk -- confirm against grouper's implementation).
        for phrase, features in Parallel(n_jobs=workers)(delayed(_read_vector)(f) for f in chunk if f):
            # Keep only phrases that actually have features.
            if features:
                d[phrase] = features

        logging.info("Found %d non-empty composed vectors in this chunk, running SVD now...", len(d))
        # NOTE(review): the body of this `if` is missing -- presumably
        # `continue`, to skip a chunk with no usable vectors.
        if not d:

        # Drop any feature that is not a column of the unigram matrix.
        composed_vec = Vectors(d, column_filter=lambda foo: foo in cols)
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" composed matrix into space of unigram thesaurus
        extra_matrix = unigrams.v.transform([dict(fv) for fv in composed_vec.values()])
        # Sanity check: one row per phrase, one column per unigram feature.
        assert extra_matrix.shape == (len(composed_vec), len(cols))
        logging.info("Composed matrix is of shape %r before SVD", extra_matrix.shape)

        # Reuse the SVD fitted on the unigrams; it is not refitted per chunk.
        extra_matrix = svd.transform(extra_matrix)
        # NOTE(review): as above, the opening line of the call that consumes
        # these two arguments is missing (presumably the per-chunk write).
            ["SVD:feat{0:03d}".format(i) for i in range(extra_matrix.shape[1])],
            "%s-phrases-chunk%d-SVD%d" % (output, i, DIMS),
        # Drop this chunk's vectors before the next chunk is read.
        del composed_vec