# Example #1
def main():
    """
    Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Reads ``<spacePrefix>.npz`` plus the ``.words.vocab`` and
    ``.contexts.vocab`` files next to it, takes the log of the stored values,
    subtracts ``log(k)``, sets non-positive values to zero and saves the
    resulting sparse space to ``<outPath> + 'ppmi.sm'``.
    """

    # Get the arguments
    # docopt needs an explicit "Usage:" section; the blank line after the
    # pattern must be truly empty because it terminates that section.
    args = docopt(
        '''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Get npz matrix
    with np.load(spacePrefix + '.npz') as loader:
        # NOTE(review): the last argument was truncated in the source; the
        # archive is assumed to store the matrix shape under 'shape' - confirm.
        matrix = csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])

    with open(spacePrefix + '.words.vocab') as f:
        id2row = [line.strip() for line in f if line]

    with open(spacePrefix + '.contexts.vocab') as f:
        id2column = [line.strip() for line in f if line]

    # Apply log weighting
    matrix.data = np.log(matrix.data)

    # Shift values
    matrix.data -= np.log(k)

    # Eliminate negative counts
    matrix.data[matrix.data <= 0] = 0.0

    # Eliminate zero counts (drop the explicitly stored zeros created above)
    matrix.eliminate_zeros()

    # Create new space
    sparseSpace = Space(SparseMatrix(matrix), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(sparseSpace, outPath + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #2
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Keeps every row whose label contains no '_' or whose label is
    ``<word>_<ref>``, relabels the kept rows with the part before the '_',
    and saves the reduced space in the requested format.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers; without "Options:"
    # the short and long flag spellings are not linked and '--w2v'/'--sps'
    # would be missing from the result.
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format

    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Select plain words and words tagged with the reference string
    targets, indices = [], []
    for i, word in enumerate(id2row):
        parts = word.split('_')
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ref):
            targets.append(parts[0])
            indices.append(i)

    if not targets:
        # The original unpacking of zip(*[]) failed with an opaque ValueError
        raise SystemExit('No rows match the reference string!')

    new_matrix = matrix[indices, :]

    # Save the Space objects
    # NOTE(review): both save calls were missing from the source, so nothing
    # was written and <outPath> was unused. The `save_as_w2v` keyword is
    # assumed from the '-w' help text - confirm against save_pkl_files.
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), targets, id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    elif is_sps:
        new_space = Space(SparseMatrix(new_matrix), targets, id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Convert txt matrix to w2v matrix and save.

    Reads ``<spacePrefix>.txt`` as a space-separated table whose first column
    holds the target words and whose remaining columns hold the vector
    values, wraps it in a dense space and saves it.
    """

    # Get the arguments
    # docopt needs an explicit "Usage:" section; the blank line after the
    # pattern must be truly empty because it terminates that section.
    args = docopt('''Convert txt matrix to w2v matrix and save.

    Usage:
        convert_matrix_txt2w2v.py <spacePrefix> <outPath>

        <spacePrefix> = path to txt matrix without suffix
        <outPath> = output path for space

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # NOTE(review): the trailing loadtxt arguments were truncated in the
    # source. dtype=object keeps the word column as text; comments=None stops
    # words containing '#' from being cut off - confirm against the input.
    space_array = np.loadtxt(spacePrefix + '.txt',
                             dtype=object,
                             delimiter=' ',
                             comments=None)
    targets = space_array[:, 0].flatten()
    # `np.float` was only an alias of the builtin and is gone from NumPy >= 1.24
    values = space_array[:, 1:].astype(float)

    # Create new space
    denseSpace = Space(DenseMatrix(coo_matrix(values)), list(targets), [])

    # Save the Space object in pickle format
    # NOTE(review): the save call was missing from the source; the
    # `save_as_w2v` keyword is assumed from the script's purpose - confirm
    # against save_pkl_files.
    save_pkl_files(denseSpace,
                   outPath,
                   save_in_one_file=True,
                   save_as_w2v=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #4
def main():
    """
    Align two sparse matrices by intersecting their columns.

    Both spaces are cut down to the columns they share (in the same order),
    optionally L2-normalized row-wise, and saved as '<outPathN>.sm'.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers; without "Options:"
    # '-l' and '--len' are not linked and args['--len'] would be missing.
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()

    # Sorted so that the column order of the output is reproducible between
    # runs (plain set iteration order is arbitrary).
    intersected_columns = sorted(set(id2column1).intersection(id2column2))
    intersected_columns_id1 = [
        column2id1[item] for item in intersected_columns
    ]
    intersected_columns_id2 = [
        column2id2[item] for item in intersected_columns
    ]
    reduced_matrix1 = space1.get_cooccurrence_matrix(
    )[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix(
    )[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    # Both spaces get the same column labels, in the order used for slicing
    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #5
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix. Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The result is optionally L2-normalized row-wise and saved to
    ``<outPath> + '.ppmi.sm'``.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers; without "Options:"
    # '-l' and '--len' are not linked and args['--len'] would be missing.
    args = docopt(
        '''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    outPath = args['<outPath>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Get probabilities
    matrix_ = dsm.cooccurrence_matrix

    row_sum = matrix_.sum(axis=1)
    col_sum = matrix_.sum(axis=0)

    # Compute smoothed P_alpha(c)
    smooth_col_sum = np.power(col_sum, alpha)
    col_sum = smooth_col_sum / smooth_col_sum.sum()

    # Invert the row sums and smoothed column probabilities so that the
    # scaling below divides by them
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)

    # Apply epmi weighting (without log)
    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)

    # Apply log weighting
    matrix_.mat.data = np.log(matrix_.mat.data)

    # Shift values
    matrix_.mat.data -= np.log(k)

    # Eliminate negative counts
    matrix_.mat.data[matrix_.mat.data <= 0] = 0.0

    # Eliminate zero counts (drop the explicitly stored zeros created above)
    matrix_.mat.eliminate_zeros()

    matrix_ = matrix_.get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(SparseMatrix(matrix_), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False)
    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #6
def main():
    """
    Make count-based vector space from corpus.

    Counts, for every word, the words occurring within <windowSize> positions
    on either side in the same sentence, stores the counts in a square sparse
    matrix (rows and columns share one vocabulary) and saves it as a space.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers, and the usage pattern
    # must be followed by an empty line; otherwise the argument descriptions
    # below are parsed as additional usage patterns.
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <windowSize> <corpDir> <outPath> <lowerBound> <upperBound>

        <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        <lowerBound> = lower bound for time period
        <upperBound> = upper bound for time period

    Options:
        -l, --len   normalize final vectors to unit length

    """)

    is_len = args['--len']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])
    lowerBound = int(args['<lowerBound>'])
    upperBound = int(args['<upperBound>'])

    # INFO level so that the progress messages below are actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    # NOTE(review): the trailing constructor arguments were truncated in the
    # source; the keyword names are assumed from the CLI arguments - confirm
    # against PathLineSentences_mod.
    sentences = PathLineSentences_mod(corpDir,
                                      lowerBound=lowerBound,
                                      upperBound=upperBound)
    vocabulary = list(
        set([
            word for sentence in sentences for word in sentence
            if len(sentence) > 1
        ]))  # Skip one-word sentences to avoid zero-vectors
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(int)

    # Get counts from corpus (second pass, hence a fresh iterator)
    sentences = PathLineSentences_mod(corpDir,
                                      lowerBound=lowerBound,
                                      upperBound=upperBound)
    logging.info("Counting context words")
    for sentence in sentences:
        for i, word in enumerate(sentence):
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = (sentence[lowerWindowSize:i] +
                      sentence[i + 1:upperWindowSize + 1])
            if not window:  # Skip one-word sentences
                continue
            windex = w2i[word]
            for contextWord in window:
                cooc_mat[(windex, w2i[contextWord])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    # NOTE(review): the dtype argument and the try-body were truncated in the
    # source; only the `except NotImplementedError:` line survived - confirm.
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        # Fast path: bulk dictionary update (older SciPy versions)
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        # Newer SciPy forbids direct dict updates; use public item assignment
        for (row, col), count in cooc_mat.items():
            cooc_mat_sparse[row, col] = count

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(cooc_mat_sparse, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        cooc_mat_sparse /= l2norm1.reshape(len(l2norm1), 1)

    # Make space
    vocabulary = [v.encode('utf-8') for v in vocabulary]
    countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary)

    # Save the Space object in pickle format
    save_pkl_files(countSpace, outPath, save_in_one_file=False)

    logging.info("Corpus has size %d", sentences.corpusSize)
    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #7
def main():
    """
    Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The left singular vectors are weighted by the singular values raised to
    <gamma>, optionally L2-normalized row-wise, and saved to
    ``<outPath> + '.svd.dm'``.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers; without "Options:"
    # '-l' and '--len' are not linked and args['--len'] would be missing.
    args = docopt(
        '''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <dsm_prefix> <dim> <gamma> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd)
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    dim = int(args['<dim>'])
    gamma = float(args['<gamma>'])
    outPath = args['<outPath>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)

    id2row = dsm.get_id2row()

    # Get matrix from space
    matrix_ = dsm.get_cooccurrence_matrix()

    # Apply SVD
    # NOTE(review): the arguments after the matrix were truncated in the
    # source. `n_components=dim` follows from <dim>; `n_iter` and `transpose`
    # are assumed values - confirm.
    u, s, v = randomized_svd(matrix_.get_mat(),
                             n_components=dim,
                             n_iter=5,
                             transpose=False)

    # Weight matrix (s is the flattened diagonal, so `s * u` scales the
    # columns of u exactly like np.dot(u, np.diag(s)))
    if gamma == 0.0:
        matrix_ = u
    elif gamma == 1.0:
        matrix_ = s * u
    else:
        # Without this branch other gamma values left `matrix_` unweighted
        matrix_ = np.power(s, gamma) * u

    if is_len:
        # L2-normalize vectors
        l2norm1 = np.linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(DenseMatrix(matrix_), id2row, [])

    # Save the Space object in pickle format
    # NOTE(review): only the path argument of this call survived in the
    # source; `save_in_one_file=False` mirrors how the other suffixed outputs
    # in this collection are saved - confirm.
    save_pkl_files(dsm,
                   outPath + ".svd.dm",
                   save_in_one_file=False)
    logging.info("--- %s seconds ---", time.time() - start_time)
# Example #8
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:

       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.

    Every context word in the union of both column vocabularies gets one
    random "elemental" vector. Each target row becomes the sum of the
    elemental vectors of its contexts, weighted by the co-occurrence values
    (optionally downsampled). Both aligned spaces and the elemental space are
    saved with a '.dm' suffix.
    """

    # Get the arguments
    # docopt needs the "Usage:" and "Options:" headers; without "Options:"
    # the short and long flag spellings are not linked and '--len', '--see'
    # and '--aut' would be missing from the result.
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] (-s <seeds> | -a) <dim> <t> <outPath1> <outPath2> <outPathElement> <spacePrefix1> <spacePrefix2>

        <seeds> = number of non-zero values in each random vector
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <outPathElement> = output path for elemental space (context vectors)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
        [2] D. Achlioptas, 2001, "Database-friendly random projections",

    ''')

    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        # Without this `else`, float('None') was evaluated and raised
        t = float(args['<t>'])
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    outPathElement = args['<outPathElement>']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']

    # INFO level so that the timing message at the end is actually emitted
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    start_time = time.time()

    # Load input spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    matrix1 = csc_matrix(space1.get_cooccurrence_matrix().get_mat())
    matrix2 = csc_matrix(space2.get_cooccurrence_matrix().get_mat())

    # Get mappings between rows/columns and words
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()

    # Get union of rows and columns in both spaces
    unified_rows = sorted(set(id2row1).union(id2row2))
    unified_columns = sorted(set(id2column1).union(id2column2))
    columns_diff1 = list(set(unified_columns) - set(id2column1))
    columns_diff2 = list(set(unified_columns) - set(id2column2))

    # Get mappings of indices of columns in original spaces to indices of columns in unified space
    c2i = {w: i for i, w in enumerate(unified_columns)}
    cj2i1 = {j: c2i[w] for j, w in enumerate(id2column1 + columns_diff1)}
    cj2i2 = {j: c2i[w] for j, w in enumerate(id2column2 + columns_diff2)}

    if t is not None:
        rows_diff1 = list(set(unified_rows) - set(id2row1))
        rows_diff2 = list(set(unified_rows) - set(id2row2))

        r2i = {w: i for i, w in enumerate(unified_rows)}
        rj2i1 = {j: r2i[w] for j, w in enumerate(id2row1 + rows_diff1)}
        rj2i2 = {j: r2i[w] for j, w in enumerate(id2row2 + rows_diff2)}

        # Build spaces with unified COLUMNS: first append empty columns for
        # the additional context words, then order the columns according to
        # unified_columns
        new_columns1 = csc_matrix((len(id2row1), len(columns_diff1)))
        unified_matrix1 = hstack(
            (matrix1, new_columns1))[:, sorted(cj2i1, key=cj2i1.get)]

        new_columns2 = csc_matrix((len(id2row2), len(columns_diff2)))
        unified_matrix2 = hstack(
            (matrix2, new_columns2))[:, sorted(cj2i2, key=cj2i2.get)]

        # Build spaces with unified ROWS
        new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns)))
        final_unified_matrix1 = csc_matrix(vstack(
            (unified_matrix1, new_rows1)))[sorted(rj2i1, key=rj2i1.get)]

        new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns)))
        final_unified_matrix2 = csc_matrix(vstack(
            (unified_matrix2, new_rows2)))[sorted(rj2i2, key=rj2i2.get)]

        # Add up final unified matrices
        common_unified_matrix = final_unified_matrix1 + final_unified_matrix2

        # Get number of total occurrences of any word
        totalOcc = common_unified_matrix.sum()

        # Define function for downsampling. `otypes` is set explicitly so the
        # vectorized function also accepts empty input (a row with no
        # non-zero context), which np.vectorize otherwise rejects.
        downsample = np.vectorize(
            lambda f: np.sqrt(float(t) / f) if f > t else 1.0,
            otypes=[float])

        # Get total normalized co-occurrence frequency of all contexts in both spaces
        # NOTE(review): the end of this expression was truncated in the
        # source; dividing by totalOcc and flattening to 1-D is assumed from
        # how context_freqs is indexed below - confirm.
        context_freqs = np.asarray(
            common_unified_matrix.sum(axis=0) / totalOcc).ravel()

    ## Generate ternary random vectors
    if is_seeds:
        if seeds > dim:
            # The base vector has only `dim` slots for the non-zero seeds
            raise SystemExit(
                'Number of seeds must not be greater than number of dimensions!'
            )
        elementalMatrix = lil_matrix((len(unified_columns), dim))
        # Generate base vector for random vectors: the first half of the
        # seeds is +1, the second half -1 (`//` keeps the bound an integer
        # on Python 3 as well)
        baseVector = np.zeros(dim)
        half = seeds // 2
        baseVector[:half] = 1.0
        baseVector[half:seeds] = -1.0
        for i in range(
                len(unified_columns)
        ):  # To-do: make this more efficient by generating random indices for a whole array
            # NOTE(review): the shuffle was missing from the source, which
            # made every elemental vector identical - confirm.
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    elif is_aut:
        elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T

    # Initialize target vectors
    alignedMatrix1 = np.zeros((len(id2row1), dim))
    alignedMatrix2 = np.zeros((len(id2row2), dim))

    # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words
    for (space, id2row, cj2i,
         alignedMatrix) in [(space1, id2row1, cj2i1, alignedMatrix1),
                            (space2, id2row2, cj2i2, alignedMatrix2)]:
        # Iterate over targets
        for i, target in enumerate(id2row):
            # Get co-occurrence values as matrix
            m = space.get_row(target).get_mat()
            # Map the non-zero column indexes to indexes in the unified space
            nonzeros = [cj2i[j] for j in m.nonzero()[1]]
            data = m.data
            pos_context_vectors = elementalMatrix[nonzeros]
            if t is not None:
                # Apply subsampling
                rfs = downsample(context_freqs[nonzeros])
                data *= rfs
            # Weight context vectors by occurrence frequency
            pos_context_vectors = pos_context_vectors.multiply(
                data.reshape(-1, 1))
            # Add up context vectors and store as row for target
            alignedMatrix[i] = np.sum(pos_context_vectors, axis=0)

    if is_len:
        # L2-normalize vectors
        l2norm1 = np.linalg.norm(alignedMatrix1, axis=1, ord=2)
        l2norm2 = np.linalg.norm(alignedMatrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        alignedMatrix1 /= l2norm1.reshape(len(l2norm1), 1)
        alignedMatrix2 /= l2norm2.reshape(len(l2norm2), 1)

    # Make spaces
    alignedSpace1 = Space(DenseMatrix(alignedMatrix1), id2row1, [])
    alignedSpace2 = Space(DenseMatrix(alignedMatrix2), id2row2, [])
    elementalSpace = Space(SparseMatrix(elementalMatrix), unified_columns, [])

    # Save the Space objects in pickle format
    save_pkl_files(alignedSpace1, outPath1 + '.dm', save_in_one_file=False)
    save_pkl_files(alignedSpace2, outPath2 + '.dm', save_in_one_file=False)
    # NOTE(review): only the path argument of this call survived in the
    # source; it is completed to match the two calls above - confirm.
    save_pkl_files(elementalSpace,
                   outPathElement + '.dm',
                   save_in_one_file=False)

    logging.info("--- %s seconds ---", time.time() - start_time)