Example No. 1
def main():
    """
    Align two sparse matrices by intersecting their columns.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        ci.py <matrix1> <matrix2> <outPath1> <outPath2>

        <matrix1> = path to matrix1 in npz format
        <matrix2> = path to matrix2 in npz format
        <outPath1> = output path for aligned matrix 1
        <outPath2> = output path for aligned matrix 2
    
    ''')

    matrix1 = args['<matrix1>']
    matrix2 = args['<matrix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices, rows and columns
    space1 = Space(matrix1)
    space2 = Space(matrix2)
    matrix1 = space1.matrix
    rows1 = space1.rows
    columns1 = space1.columns
    column2id1 = space1.column2id
    matrix2 = space2.matrix
    rows2 = space2.rows
    columns2 = space2.columns
    column2id2 = space2.column2id

    # Intersect columns of matrices
    intersected_columns = list(set(columns1).intersection(columns2))
    intersected_columns_id1 = [
        column2id1[item] for item in intersected_columns
    ]
    intersected_columns_id2 = [
        column2id2[item] for item in intersected_columns
    ]
    reduced_matrix1 = matrix1[:, intersected_columns_id1]
    reduced_matrix2 = matrix2[:, intersected_columns_id2]

    # Save matrices
    Space(matrix=reduced_matrix1, rows=rows1,
          columns=intersected_columns).save(outPath1)
    Space(matrix=reduced_matrix2, rows=rows2,
          columns=intersected_columns).save(outPath2)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
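
A minimal, self-contained sketch of the column-intersection step above, using plain numpy/scipy instead of the Space helper class (the toy matrices and vocabularies are invented for illustration):

import numpy as np
from scipy.sparse import csr_matrix

# Toy spaces: rows are target words, columns are context words (hypothetical data)
columns1 = ['cat', 'dog', 'fish']
columns2 = ['dog', 'bird', 'cat']
m1 = csr_matrix(np.array([[1, 2, 0],
                          [0, 3, 1]]))
m2 = csr_matrix(np.array([[4, 0, 1],
                          [2, 1, 0]]))

column2id1 = {c: i for i, c in enumerate(columns1)}
column2id2 = {c: i for i, c in enumerate(columns2)}

# Shared contexts and their indices in each space
intersected_columns = list(set(columns1).intersection(columns2))
ids1 = [column2id1[c] for c in intersected_columns]
ids2 = [column2id2[c] for c in intersected_columns]

# After column slicing, both matrices share the same column order
reduced1 = m1[:, ids1]
reduced2 = m2[:, ids2]
print(intersected_columns)   # e.g. ['cat', 'dog'] (set order is not deterministic)
print(reduced1.toarray())
print(reduced2.toarray())
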
Example No. 2
def main():
    """
    Create low-dimensional matrix from count matrix by multiplication with random matrix.
    """

    # Get the arguments
    args = docopt(
        '''Create low-dimensional matrix from count matrix by multiplication with random matrix.

    Usage:
        multiply.py [-l] [-c] <countPath> <randomPath> <outPath>

        <countPath> = path to count matrix
        <randomPath> = path to random matrix
        <outPath> = output path for reduced matrix

    Options:
        -l, --len   normalize final vectors to unit length
        -c, --cen   mean center columns of final matrix

    ''')

    is_len = args['--len']
    is_cen = args['--cen']
    countPath = args['<countPath>']
    randomPath = args['<randomPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices
    countSpace = Space(countPath)
    countMatrix = countSpace.matrix
    randomSpace = Space(randomPath)
    randomMatrix = randomSpace.matrix

    logging.info("Multiplying matrices")
    reducedMatrix = np.dot(countMatrix, randomMatrix)
    reducedSpace = Space(matrix=reducedMatrix,
                         rows=countSpace.rows,
                         columns=[])

    if is_len:
        logging.info("L2-normalize vectors")
        reducedSpace.l2_normalize()

    if is_cen:
        logging.info("Mean center columns")
        reducedSpace.mean_center()

    # Save the reduced matrix
    reducedSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 3
def main():
    """
    Apply the similarity-order transformation to obtain a higher-order similarity matrix.
    """

    # Get the arguments
    args = docopt('''Apply the similarity order transformation.

    Usage:
        sot.py [-l] <matrixPath> <outPath> <alpha>

        <matrixPath>    = path to matrix
        <outPath>       = output path for space
        <alpha>         = the desired similarity-order

    Options:
        -l, --len   normalize vectors to unit length before the transformation

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    alpha = float(args['<alpha>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space = Space(matrixPath, format='npz')
    except ValueError:
        space = Space(matrixPath, format='w2v')

    # L2-normalize vectors
    if is_len:
        space.l2_normalize()

    # Similarity matrix
    space.transform_similarity_order(alpha)

    # Save the matrix
    space.save(outPath, format="w2v")

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 4
def main():
    """
    Mean center matrix.
    """

    # Get the arguments
    args = docopt('''Mean center matrix.

    Usage:
        center.py [-l] <matrixPath> <outPath>

        <matrixPath> = path to matrix
        <outPath> = output path for space

    Options:
        -l, --len   normalize vectors to unit length before centering

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load matrices and rows
    try:
        space = Space(matrixPath, format='npz')   
    except ValueError:
        space = Space(matrixPath, format='w2v')   

    if is_len:
        # L2-normalize vectors
        space.l2_normalize()

    # Mean center    
    space.mean_center()
        
    # Save the matrix
    space.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
Example No. 5
def main():
    """
    Create low-dimensional and sparse random matrix from vocabulary file.
    """

    # Get the arguments
    args = docopt(
        '''Create low-dimensional and sparse random matrix from vocabulary file.

    Usage:
        random.py <vocabFile> <outPath> <dim>

        <vocabFile> = row and column vocabulary
        <outPath> = output path for random matrix
        <dim> = dimensionality for random vectors

    Note:
        Calculates number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')
    #np.random.seed(0) # uncomment for reproducibility

    vocabFile = args['<vocabFile>']
    outPath = args['<outPath>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load vocabulary
    logging.info("Loading vocabulary")
    with open(vocabFile, 'r', encoding='utf-8') as f_in:
        vocabulary = [line.strip() for line in f_in]

    # Generate random vectors
    randomMatrix = sparse_random_matrix(dim, len(vocabulary)).toarray().T

    # Store random matrix
    Space(matrix=randomMatrix, rows=vocabulary, columns=[]).save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
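
The "automatic" number of seeds mentioned in the Note comes from the very sparse random projection results in [1,2]: each random vector gets non-zero entries with density 1/sqrt(D), where D is the original dimensionality (here the vocabulary size), split between +1 and -1. A rough numpy-only sketch of that construction is below; sklearn's sparse_random_matrix additionally rescales the non-zero entries, which is omitted here:

import numpy as np

def very_sparse_random_vectors(vocab_size, dim, seed=None):
    """Ternary random vectors (one dim-dimensional row per vocabulary item)
    with non-zero density 1/sqrt(vocab_size), as proposed by Li et al. (2006)."""
    rng = np.random.default_rng(seed)
    density = 1.0 / np.sqrt(vocab_size)        # expected fraction of non-zeros
    signs = rng.choice([-1.0, 1.0], size=(vocab_size, dim))
    mask = rng.random((vocab_size, dim)) < density
    return signs * mask

R = very_sparse_random_vectors(vocab_size=10000, dim=300, seed=0)
print(R.shape)             # (10000, 300): one random vector per vocabulary word
print((R != 0).mean())     # about 1/sqrt(10000) = 0.01 non-zero entries
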
Example No. 6
def main():
    """
    Optionally mean-center the matrix, then remove the top n PCA components.
    """

    # Get the arguments
    args = docopt(
        '''Optionally mean-center the matrix, then remove the top n PCA components.

    Usage:
        pcr.py [-m] <matrixPath> <outPath> <threshold>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <threshold> = number of top PCA components to remove

    Options:
        -m, --mean  flag, if mean centering should be applied

    ''')

    matrix_path = args['<matrixPath>']
    out_path = args['<outPath>']
    threshold = args['<threshold>']

    is_mean = args['--mean']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    try:
        space = Space(matrix_path, format='npz')
        _format_flag = 'npz'
    except ValueError:
        space = Space(matrix_path, format='w2v')
        _format_flag = 'w2v'

    # MC+PCR
    space.mc_pcr(int(threshold), is_mean)

    # Save the matrix
    space.save(out_path, format=_format_flag)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
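
The mc_pcr method itself is not shown in this snippet. A common form of this post-processing (mean-centering followed by removal of the projection onto the top principal components) can be sketched with sklearn as below; the helper name and the exact behaviour of space.mc_pcr are assumptions, not taken from the original code:

import numpy as np
from sklearn.decomposition import PCA

def mean_center_and_remove_top_pcs(X, n_components):
    """Sketch: mean-center X, then subtract its projection onto the
    top n principal components."""
    X = np.asarray(X, dtype=float)
    X = X - X.mean(axis=0)
    pca = PCA(n_components=n_components).fit(X)
    projection = X @ pca.components_.T @ pca.components_
    return X - projection

rng = np.random.RandomState(0)
X = rng.rand(100, 20)                                          # toy embedding matrix
print(mean_center_and_remove_top_pcs(X, n_components=3).shape) # (100, 20)
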
Example No. 7
def main():
    """
    Compute local neighborhood distance for target pairs from two vector spaces.
    """

    # Get the arguments
    args = docopt(
        """Compute local neighborhood distance for target pairs from two vector spaces.

    Usage:
        lnd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <k>

        <testset> = path to file with tab-separated word pairs
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath> = output path for result file
        <k> = parameter k (k nearest neighbors)

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file
        
    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    testset = args['<testset>']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    #logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space1 = Space(matrixPath1, format='npz')
    except ValueError:
        space1 = Space(matrixPath1, format='w2v')
    try:
        space2 = Space(matrixPath2, format='npz')
    except ValueError:
        space2 = Space(matrixPath2, format='w2v')

    matrix1 = space1.matrix
    row2id1 = space1.row2id
    id2row1 = space1.id2row
    matrix2 = space2.matrix
    row2id2 = space2.row2id
    id2row2 = space2.id2row

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1])
                   for line in f_in]

    nbrs1 = NearestNeighbors(n_neighbors=k, metric='cosine',
                             algorithm='brute').fit(matrix1)
    nbrs2 = NearestNeighbors(n_neighbors=k, metric='cosine',
                             algorithm='brute').fit(matrix2)

    scores = {}
    neighborUnionSizes = {}
    for (t1, t2) in targets:

        # Get nearest neighbors
        try:
            index1 = row2id1[t1]
            index2 = row2id2[t2]
        except KeyError:
            scores[(t1, t2)] = 'nan'
            neighborUnionSizes[(t1, t2)] = 'nan'
            continue

        v1 = matrix1[index1].toarray().flatten()
        v2 = matrix2[index2].toarray().flatten()

        distances1, indices1 = nbrs1.kneighbors(matrix1[index1])
        distances2, indices2 = nbrs2.kneighbors(matrix2[index2])

        neighbors1 = list(
            zip([id2row1[i] for i in indices1.flatten().tolist()],
                distances1.flatten().tolist()))
        neighbors2 = list(
            zip([id2row2[i] for i in indices2.flatten().tolist()],
                distances2.flatten().tolist()))

        neighborUnion = sorted(
            set(a for (a, b) in neighbors1 + neighbors2
                if a in row2id1 and a in row2id2 and a not in [t1, t2]))

        # Filter out vectors with 0-length in either matrix
        neighborUnion = [
            a for a in neighborUnion if (len(matrix1[row2id1[a]].data) > 0
                                         and len(matrix2[row2id2[a]].data) > 0)
        ]

        simVec1 = [
            1.0 - cosine_distance(matrix1[index1].toarray().flatten(),
                                  matrix1[row2id1[n]].toarray().flatten())
            for n in neighborUnion
        ]
        simVec2 = [
            1.0 - cosine_distance(matrix2[index2].toarray().flatten(),
                                  matrix2[row2id2[n]].toarray().flatten())
            for n in neighborUnion
        ]

        # Compute cosine distance of vectors
        distance = cosine_distance(simVec1, simVec2)
        scores[(t1, t2)] = distance
        neighborUnionSizes[(t1, t2)] = len(neighborUnion)

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                f_out.write('\t'.join(
                    (t1, str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))
            elif is_scd:  # output only second target string
                f_out.write('\t'.join(
                    (t2, str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))
            else:  # standard outputs both target strings
                f_out.write('\t'.join(
                    ('%s,%s' % (t1, t2), str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
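
A toy-sized sketch of the idea behind the local neighborhood distance computed above: each target is represented by its vector of cosine similarities to a shared set of neighbor words (the "second-order" vector), and those vectors are then compared across the two spaces. The words and numbers below are invented:

import numpy as np
from scipy.spatial.distance import cosine

# Hypothetical dense row vectors of the target in space 1 and space 2
target1 = np.array([1.0, 0.0, 2.0])
target2 = np.array([0.5, 1.5, 0.0])

# Hypothetical vectors of three neighbor words present in both spaces
neighbors1 = {'walk': np.array([1.0, 0.1, 1.8]),
              'run':  np.array([0.2, 0.0, 1.0]),
              'talk': np.array([0.0, 1.0, 0.1])}
neighbors2 = {'walk': np.array([0.4, 1.2, 0.1]),
              'run':  np.array([0.6, 1.4, 0.0]),
              'talk': np.array([0.1, 0.2, 1.0])}

neighborUnion = sorted(set(neighbors1) & set(neighbors2))

# Second-order vectors: similarity of the target to each shared neighbor
simVec1 = [1.0 - cosine(target1, neighbors1[n]) for n in neighborUnion]
simVec2 = [1.0 - cosine(target2, neighbors2[n]) for n in neighborUnion]

# Local neighborhood distance = cosine distance between the similarity vectors
print(cosine(simVec1, simVec2))
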
Example No. 8
def main():
    """
    Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt('''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <matrixPath> <outPath> <dim> <gamma>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    dim = int(args['<dim>'])
    gamma = float(args['<gamma>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   
    matrix = space.matrix
    
    # Get mappings between rows/columns and words
    rows = space.rows
    id2row = space.id2row
    id2column = space.id2column

    # Apply SVD
    u, s, v = randomized_svd(matrix, n_components=dim, n_iter=5, transpose=False)

    # Weight matrix
    if gamma == 0.0:
        matrix_reduced = u
    elif gamma == 1.0:
        #matrix_reduced = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix)
        matrix_reduced = s * u
    else:
        #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula
        matrix_reduced = np.power(s, gamma) * u
       
    outSpace = Space(matrix=matrix_reduced, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrix
    outSpace.save(outPath, format='w2v')

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
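
A small numpy check of the eigenvalue-weighting step: since randomized_svd returns the singular values s as a flat vector, broadcasting np.power(s, gamma) over the columns of u is the same as multiplying u by the diagonal matrix diag(s**gamma), which is what the in-code comments claim. The toy matrix below is arbitrary:

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
M = rng.rand(20, 10)      # stand-in for a PPMI matrix
dim, gamma = 5, 0.5

u, s, v = randomized_svd(M, n_components=dim, n_iter=5, transpose=False)

weighted_broadcast = np.power(s, gamma) * u                  # as in the script
weighted_diagonal = np.dot(u, np.diag(np.power(s, gamma)))   # explicit diagonal form

print(np.allclose(weighted_broadcast, weighted_diagonal))    # True
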
Example No. 9
def main():
    """
    Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.
    """

    # Get the arguments
    args = docopt(
        '''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.

    Usage:
        ri.py [-l] <matrixPath> <outPath> <dim>

        <matrixPath> = path to matrix
        <outPath> = output path for reduced space 
        <dim> = number of dimensions for random vectors

    Options:
        -l, --len   normalize final vectors to unit length

    Note:
        Parameters -s, -a and <t> have been removed from an earlier version for efficiency.

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrix
    countSpace = Space(matrixPath)
    countMatrix = countSpace.matrix
    rows = countSpace.rows
    columns = countSpace.columns

    # Generate random vectors
    randomMatrix = csr_matrix(
        sparse_random_matrix(dim, len(columns)).toarray().T)

    logging.info("Multiplying matrices")
    reducedMatrix = np.dot(countMatrix, randomMatrix)
    outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()

    # Save the matrix
    outSpace.save(outPath, format='w2v')

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 10
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.
    """

    # Get the arguments
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] (-s <seeds> | -a) <matrixPath1> <matrixPath2> <outPath1> <outPath2> <outPathElement> <dim> <t>

        <seeds> = number of non-zero values in each random vector
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <outPathElement> = output path for elemental space (context vectors)
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]
  
    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    outPathElement = args['<outPathElement>']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrices
    space1 = Space(matrixPath1)
    matrix1 = space1.matrix
    space2 = Space(matrixPath2)
    matrix2 = space2.matrix

    # Get mappings between rows/columns and words
    rows1 = space1.rows
    id2row1 = space1.id2row
    row2id1 = space1.row2id
    columns1 = space1.columns
    column2id1 = space1.column2id
    rows2 = space2.rows
    id2row2 = space2.id2row
    row2id2 = space2.row2id
    columns2 = space2.columns
    column2id2 = space2.column2id

    # Get union of rows and columns in both spaces
    unified_rows = sorted(list(set(rows1).union(rows2)))
    unified_columns = sorted(list(set(columns1).union(columns2)))
    columns_diff1 = sorted(list(set(unified_columns) - set(columns1)))
    columns_diff2 = sorted(list(set(unified_columns) - set(columns2)))

    # Get mappings of indices of columns in original spaces to indices of columns in unified space
    c2i = {w: i for i, w in enumerate(unified_columns)}
    cj2i1 = {j: c2i[w] for j, w in enumerate(columns1 + columns_diff1)}
    cj2i2 = {j: c2i[w] for j, w in enumerate(columns2 + columns_diff2)}

    if t != None:
        rows_diff1 = list(set(unified_rows) - set(rows1))
        rows_diff2 = list(set(unified_rows) - set(rows2))

        r2i = {w: i for i, w in enumerate(unified_rows)}
        rj2i1 = {j: r2i[w] for j, w in enumerate(rows1 + rows_diff1)}
        rj2i2 = {j: r2i[w] for j, w in enumerate(rows2 + rows_diff2)}

        # Build spaces with unified COLUMNS
        # Get empty columns for additional context words
        new_columns1 = csc_matrix((len(rows1), len(columns_diff1)))
        # First concatenate matrix and empty columns, then order columns according to unified_columns
        unified_matrix1 = csc_matrix(hstack((matrix1, new_columns1)))[:, sorted(cj2i1, key=cj2i1.get)]

        new_columns2 = csc_matrix((len(rows2), len(columns_diff2)))
        unified_matrix2 = csc_matrix(hstack(
            (matrix2, new_columns2)))[:, sorted(cj2i2, key=cj2i2.get)]

        # Build spaces with unified ROWS
        new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns)))
        final_unified_matrix1 = csc_matrix(vstack(
            (unified_matrix1, new_rows1)))[sorted(rj2i1, key=rj2i1.get)]

        new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns)))
        final_unified_matrix2 = csc_matrix(vstack(
            (unified_matrix2, new_rows2)))[sorted(rj2i2, key=rj2i2.get)]

        # Add up final unified matrices
        common_unified_matrix = np.add(final_unified_matrix1,
                                       final_unified_matrix2)

        # Get number of total occurrences of any word
        totalOcc = np.sum(common_unified_matrix)

        # Define function for downsampling
        downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0
        downsample = np.vectorize(downsample)

        # Get total normalized co-occurrence frequency of all contexts in both spaces
        context_freqs = np.array(common_unified_matrix.sum(axis=0) /
                                 totalOcc)[0]

    ## Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(unified_columns), dim))
        # Generate base vector for random vectors
        # Note: Make sure that the number of seeds is not greater than the number of dimensions
        baseVector = np.zeros(dim)
        for i in range(0, int(seeds / 2)):
            baseVector[i] = 1.0
        for i in range(int(seeds / 2), seeds):
            baseVector[i] = -1.0
        # To-do: make this more efficient by generating random indices for a whole array
        for i in range(len(unified_columns)):
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    if is_aut:
        elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T

    # Initialize target vectors
    alignedMatrix1 = np.zeros((len(rows1), dim))
    alignedMatrix2 = np.zeros((len(rows2), dim))

    # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words
    for (matrix, id2row, cj2i,
         alignedMatrix) in [(matrix1, id2row1, cj2i1, alignedMatrix1),
                            (matrix2, id2row2, cj2i2, alignedMatrix2)]:
        # Iterate over targets
        for i in id2row:
            # Get co-occurrence values as matrix
            m = matrix[i]
            # Get nonzero indexes
            nonzeros = m.nonzero()
            nonzeros = [cj2i[j] for j in nonzeros[1]]
            data = m.data
            pos_context_vectors = elementalMatrix[nonzeros]
            if t != None:
                # Apply subsampling
                rfs = context_freqs[nonzeros]
                rfs = downsample(rfs)
                data *= rfs
            # Weight context vectors by occurrence frequency
            pos_context_vectors = pos_context_vectors.multiply(
                data.reshape(-1, 1))
            # Add up context vectors and store as row for target
            alignedMatrix[i] = np.sum(pos_context_vectors, axis=0)

    outSpace1 = Space(matrix=alignedMatrix1, rows=rows1, columns=[])
    outSpace2 = Space(matrix=alignedMatrix2, rows=rows2, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace1.l2_normalize()
        outSpace2.l2_normalize()

    # Save the matrices
    outSpace1.save(outPath1)
    outSpace2.save(outPath2)
    Space(matrix=elementalMatrix, rows=unified_columns,
          columns=[]).save(outPathElement)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 11
def main():
    """
    Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix. Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt('''Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix and save it.

    Usage:
        ppmi.py [-l] <matrixPath> <outPath> <k> <alpha>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <k> = shifting parameter
        <alpha> = smoothing parameter

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   

    # Apply EPMI weighting
    space.epmi_weighting(alpha)
    
    # Apply log weighting
    space.log_weighting()

    # Shift values
    space.shifting(k)

    # Eliminate negative counts
    space.eliminate_negative()

    # Eliminate zero counts
    space.eliminate_zeros()
        
    outSpace = Space(matrix=space.matrix, rows=space.rows, columns=space.columns)

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
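
For reference, a dense toy version of what the weighting chain above is assumed to compute, following Levy et al. (2015): exponential PMI with context-distribution smoothing alpha, a log, a shift by log(k), and clipping of negative values. The Space methods themselves are not shown in this snippet, so the function below is a sketch of the formula rather than of their exact implementation:

import numpy as np

def ppmi(counts, k=1, alpha=0.75):
    """Smoothed, shifted PPMI of a dense co-occurrence matrix (toy sketch)."""
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    p_wc = counts / total
    p_w = counts.sum(axis=1) / total              # row (target) marginals
    ctx = counts.sum(axis=0) ** alpha
    p_c_alpha = ctx / ctx.sum()                   # smoothed column (context) marginals
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(p_wc / np.outer(p_w, p_c_alpha))
    pmi[~np.isfinite(pmi)] = 0.0                  # zero counts contribute nothing
    return np.maximum(pmi - np.log(k), 0.0)       # shift and eliminate negatives

toy = np.array([[10, 0, 2],
                [ 3, 5, 0],
                [ 0, 1, 8]])
print(ppmi(toy, k=1, alpha=0.75).round(2))
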
Example No. 12
def main():
    """
    Make count-based vector space from corpus.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <corpDir> <outPath> <windowSize>
        
    Arguments:
       
        <corpDir> = path to corpus or corpus directory (iterates through files)
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction

    Options:
        -l, --len   normalize final vectors to unit length

    """)

    is_len = args['--len']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    sentences = PathLineSentences(corpDir)  # Use the same corpus iterator as in the counting loop below
    vocabulary = sorted(
        list(
            set([
                word for sentence in sentences for word in sentence
                if len(sentence) > 1
            ])))  # Skip one-word sentences to avoid zero-vectors
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts from corpus
    sentences = PathLineSentences(corpDir)
    logging.info("Counting context words")
    for sentence in sentences:
        for i, word in enumerate(sentence):
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1]
            if len(window) == 0:  # Skip one-word sentences
                continue
            windex = w2i[word]
            for contextWord in window:
                cooc_mat[(windex, w2i[contextWord])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        cooc_mat_sparse._update(cooc_mat)

    outSpace = Space(matrix=cooc_mat_sparse,
                     rows=vocabulary,
                     columns=vocabulary)

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()

    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
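
A quick illustration of the symmetric window extraction used in the counting loop above, on a single invented sentence with windowSize = 2; each (target, context) pair found this way increments cooc_mat[(w2i[word], w2i[contextWord])]:

sentence = ['the', 'cat', 'sat', 'on', 'the', 'mat']
windowSize = 2

for i, word in enumerate(sentence):
    lowerWindowSize = max(i - windowSize, 0)
    upperWindowSize = min(i + windowSize, len(sentence))
    window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1]
    print(word, '->', window)
# e.g. cat -> ['the', 'sat', 'on']
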
Example No. 13
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.
    """

    # Get the arguments
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] <matrixPath1> <matrixPath2> <outPath1> <outPath2> <dim>

        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <dim> = number of dimensions for random vectors

    Options:
        -l, --len   normalize final vectors to unit length

    Note:
        Assumes intersected and ordered columns. Parameters -s, -a and <t> have been removed from an earlier version for efficiency; columns are now intersected instead of unified.
  
    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrices
    countSpace1 = Space(matrixPath1)
    countMatrix1 = countSpace1.matrix
    rows1 = countSpace1.rows
    columns1 = countSpace1.columns

    countSpace2 = Space(matrixPath2)
    countMatrix2 = countSpace2.matrix
    rows2 = countSpace2.rows
    columns2 = countSpace2.columns

    # Generate random vectors
    randomMatrix = csr_matrix(
        sparse_random_matrix(dim, len(columns1)).toarray().T)

    logging.info("Multiplying matrices")
    reducedMatrix1 = np.dot(countMatrix1, randomMatrix)
    reducedMatrix2 = np.dot(countMatrix2, randomMatrix)

    outSpace1 = Space(matrix=reducedMatrix1, rows=rows1, columns=[])
    outSpace2 = Space(matrix=reducedMatrix2, rows=rows2, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace1.l2_normalize()
        outSpace2.l2_normalize()

    # Save the matrices
    outSpace1.save(outPath1)
    outSpace2.save(outPath2)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 14
def main():
    """
    Make count-based vector space from corpus.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py <corpDir> <vocabFile> <outPath> <windowSize>
               
        <corpDir> = path to corpus or corpus directory (iterates through files)
        <vocabFile> = row and column vocabulary
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        
    Note:
        Skips one-word sentences to avoid zero-vectors. Does not increase window size when out-of-vocabulary words are found.

    """)

    corpDir = args['<corpDir>']
    vocabFile = args['<vocabFile>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load vocabulary
    logging.info("Loading vocabulary")
    with open(vocabFile, 'r', encoding='utf-8') as f_in:
        vocabulary = [line.strip() for line in f_in]

    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts from corpus
    logging.info("Counting context words")
    sentences = PathLineSentences(corpDir)
    for sentence in sentences:
        for i, word in enumerate(sentence):
            try:
                windex = w2i[word]
            except KeyError:
                continue
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1]
            if len(window) == 0:  # Skip one-word sentences
                continue
            for contextWord in window:
                try:
                    cindex = w2i[contextWord]
                except KeyError:
                    continue
                cooc_mat[(windex, cindex)] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        cooc_mat_sparse._update(cooc_mat)

    outSpace = Space(matrix=cooc_mat_sparse,
                     rows=vocabulary,
                     columns=vocabulary)

    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 15
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        CountBasedVectors.py  <pathMatrix> <pathw2i> <pathCorpus> <pathTestSentences> <outPathVectors> <sentenceType> <windowSize2> 
        CountBasedVectors.py  <pathCorpus> <pathTestSentences> <sentenceType> <windowSize2>
        
    Arguments:
       
        <pathMatrix> = Path to the word vector matrix
        <pathw2i> = Path to the word-to-index
        <pathCorpus> = path to the corpus 
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors
        <sentenceType> = "lemma" or "token"
        <windowSize2> = Window size (20 works fine)
        
        
    """)

    pathMatrix = args['<pathMatrix>']
    pathTestSentences = args['<pathTestSentences>']
    pathw2i = args['<pathw2i>']
    outPathVectors = args['<outPathVectors>']
    windowSize2 = int(args['<windowSize2>'])
    pathCorpus = args['<pathCorpus>']
    sentenceType = args['<sentenceType>']

    if len(sys.argv) == 5:
        pathMatrix = "Files/Vectors/FirstOrder/matrix.npz"
        pathw2i = "Files/Vectors/FirstOrder/w2i.npz.npy"
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("ContextVectors start")

    #Load w2i
    w2i = np.load(pathw2i, allow_pickle='TRUE').item()

    if sentenceType == "token":
        sentType = "sentence_token"
    else:
        sentType = "sentence"

    #Load saved wordVectorMatrix
    try:
        inSpace = Space(path=pathMatrix, format='w2v')
    except UnicodeDecodeError:
        inSpace = Space(path=pathMatrix)

    #inSpace =  Space(path=pathMatrix, format='w2v')
    #inSpace = Space(path=pathMatrix)
    cooc_mat_sparse = inSpace.matrix

    #Calculate IDF for every word
    docFreq = {}

    for i in range(0, len(w2i)):
        docFreq[i] = 0
    with gzip.open(pathCorpus, 'rt', encoding="utf-8") as sentences:
        count = 0
        try:
            for sentence in sentences:
                count = count + 1
                for word in set(sentence.split()):
                    if word in w2i:
                        docFreq[w2i[word]] += 1
        except Exception:
            pass  # Silently ignore read/decoding errors in the corpus
        for key, value in w2i.items():
            docFreq[value] = math.log10(count / max(docFreq[value], 1))

    #Load TestSentences
    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

    #Calculate contextVectorMatrix
    logging.critical("Calculate contextVectorMatrix")
    nonExisting = False
    target = str(testSentences[0]["original_word"])
    for dic in testSentences:
        sentence = dic[sentType].split()
        for i, word in enumerate(sentence):
            if str(i) == dic['target_index'] and word == target:
                toMelt = []
                toMeltIDF = []
                lowerWindowSize = max(i - windowSize2, 0)
                upperWindowSize = min(i + windowSize2, len(sentence))
                window = sentence[lowerWindowSize:i] + sentence[
                    i + 1:upperWindowSize + 1]
                if word in w2i:
                    windex = w2i[word]
                    for contextWord in window:
                        if contextWord != "$":
                            if contextWord in w2i:
                                contextWordIndex = w2i[contextWord]
                                toMelt.append(
                                    cooc_mat_sparse[contextWordIndex].toarray()[0] *
                                    math.pow(docFreq[contextWordIndex], 1))
                    contextVectorList.append(getContextVector(toMelt))
                else:
                    nonExisting = True

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("ContextVectors end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example No. 16
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        W2v.py  <pathTestSentences> <outPathVectors> <windowSize2> <sentenceType>
        W2v.py  <pathTestSentences> <windowSize2> <sentenceType>
        
    Arguments:
       
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors 
        <windowSize2> = Window size (20 works well)
        <sentenceType> = "lemma" or "token"
    
    """)

    pathTestSentences = args['<pathTestSentences>']
    outPathVectors = args['<outPathVectors>']
    windowSize2 = int(args['<windowSize2>'])
    sentenceType = args['<sentenceType>']

    if len(sys.argv) == 4:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("W2V start")

    if sentenceType == "token":
        sentType = "sentence_token"
    else:
        sentType = "sentence"

    if not isinstance(windowSize2, int):
        windowSize2 = 20

    #Load Word2Vec
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'Data/GoogleNews-vectors-negative300.bin', binary=True)

    #Load TestSentences
    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

    #Calculate contextVectorMatrix
    logging.critical("Calculate contextVectorMatrix")

    nonExisting = False
    #self.target=str(testSentences[0]["original_word"])
    for dic in testSentences:
        sentence = dic[sentType].split()
        for i, word in enumerate(sentence):
            if str(i) == dic['target_index']:

                toMelt = []
                toMeltIDF = []
                lowerWindowSize = max(i - windowSize2, 0)
                upperWindowSize = min(i + windowSize2, len(sentence))
                window = sentence[lowerWindowSize:i] + sentence[
                    i + 1:upperWindowSize + 1]
                if word in model.wv.vocab:
                    for contextWord in window:
                        if contextWord in model.wv.vocab:
                            if contextWord != "$":
                                toMelt.append(
                                    preprocessing.normalize(
                                        [model.wv[contextWord]], norm='l2')[0])

                    contextVectorList.append(getContextVector(toMelt))
                else:
                    contextVectorList.append(np.zeros(300))

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("W2V end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example No. 17
def main():
    """
    Compute cosine distance for targets in two matrices.
    """

    # Get the arguments
    args = docopt("""Compute cosine distance for targets in two matrices.

    Usage:
        cd.py <testset> <matrix1> <matrix2> <outPath>

        <testset> = path to file with one target per line
        <matrix1> = path to matrix1 in npz format
        <matrix2> = path to matrix2 in npz format
        <outPath> = output path for result file

     Note:
         Important: spaces must be already aligned (columns in same order)!
        
    """)
    
    matrix1 = args['<matrix1>']
    matrix2 = args['<matrix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    
    
    # Load matrices and rows
    space1 = Space(path=matrix1)   
    space2 = Space(path=matrix2)   
    matrix1 = space1.matrix
    row2id1 = space1.row2id
    matrix2 = space2.matrix
    row2id2 = space2.row2id
    
    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [line.strip() for line in f_in]
        
    scores = {}
    for target in targets:
        
        # Get row vectors
        try:
            v1 = matrix1[row2id1[target]].toarray().flatten()
            v2 = matrix2[row2id2[target]].toarray().flatten()
        except KeyError:
            scores[target] = 'nan'
            continue
        
        # Compute cosine distance of vectors
        distance = cosine(v1, v2)
        scores[target] = distance
        
        
    with open(outPath, 'w', encoding='utf-8') as f_out:
        for target in targets:
            f_out.write('\t'.join((target, str(scores[target])+'\n')))

                
    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
Example No. 18
def main():
    """
    Compute number of context types for all rows of a vector space and save their scores.
    """

    # Get the arguments
    args = docopt(
        """Compute number of context types for all rows of a vector space and save their scores.

    Usage:
        typs.py [(-n <normConst>)] <testset> <matrixPath> <outPath>

        <normConst> = normalization constant
        <testset> = path to file with one target per line in first column
        <matrixPath> = path to matrix
        <outPath> = output path for result file

    Options:
        -n, --nrm  normalize values by normalization constant
        
    """)

    is_norm = args['--nrm']
    if is_norm:
        normConst = float(args['<normConst>'])
    testset = args['<testset>']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrix
    space = Space(matrixPath)
    matrix = space.matrix

    # Get rows
    row2id = space.row2id

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [line.strip().split('\t')[0] for line in f_in]

    scores = {}
    # Iterate over targets
    for target in targets:

        try:
            row = matrix[row2id[target]]
        except KeyError:
            scores[target] = 'nan'
            continue

        # Get number of non-zero elements in row
        types = row.getnnz()

        scores[target] = types

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for target in targets:
            if is_norm:
                scores[target] = float(scores[target]) / normConst
            f_out.write('\t'.join((target, str(scores[target]) + '\n')))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example No. 19
def main():
    """
    Compute entropy for rows of targets from vector space.
    """

    # Get the arguments
    args = docopt("""Compute entropy for rows of targets from vector space.

    Usage:
        entropy.py [-n] <testset> <matrixPath> <outPath>

        <testset> = path to file with one target per line in first column
        <matrixPath> = path to matrix
        <outPath> = output path for result file
        
    Options:
        -n, --nrm  normalize values by log of number of types

    """)

    is_norm = args['--nrm']
    testset = args['<testset>']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrix
    space = Space(matrixPath)
    matrix = space.matrix

    # Get rows
    row2id = space.row2id

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [line.strip().split('\t')[0] for line in f_in]

    scores = {}
    norms = {}
    # Iterate over targets
    for target in targets:

        try:
            row = matrix[row2id[target]]
        except KeyError:
            scores[target] = 'nan'
            norms[target] = 'nan'
            continue

        # Get all counts in row (non-zero elements)
        counts = row.data

        # Compute entropy of row
        H = entropy(counts, base=2)
        scores[target] = H

        if is_norm:
            # Get number of non-zero elements in row
            types = row.getnnz()
            norms[target] = np.log2(types)

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for target in targets:
            if is_norm:
                scores[target] = float(scores[target]) / float(norms[target])
            f_out.write('\t'.join((target, str(scores[target]) + '\n')))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
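
A tiny worked example of the entropy score above: scipy's entropy normalizes the counts to a probability distribution, and the optional -n flag divides by log2 of the number of context types, which bounds the score at 1. The counts below are invented:

import numpy as np
from scipy.stats import entropy

counts = np.array([4, 3, 2, 1])       # non-zero co-occurrence counts of one row
H = entropy(counts, base=2)           # Shannon entropy of the normalized counts
H_norm = H / np.log2(len(counts))     # normalization by log2 of number of types
print(round(H, 3), round(H_norm, 3))  # roughly 1.846 and 0.923
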
Example No. 20
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        Bert.py  <pathTestSentences> <outPathVectors> <vecType> 
        Bert.py  <pathTestSentences> <vecType>
        
    Arguments:
       
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors
        <vecType> = "token" or "lemma"

    """)

    pathTestSentences = args['<pathTestSentences>']
    outPathVectors = args['<outPathVectors>']
    vecType = args['<vecType>']

    if len(sys.argv) == 3:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("Bert start")

    #Load TestSentences
    # Load pre-trained model tokenizer (vocabulary)
    global tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load pre-trained model (weights)
    global model
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_hidden_states=True)

    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

        #Token vs. Lemma
        if vecType == "token":
            vecTypeString = "sentence_token"
        else:
            vecTypeString = "sentence"

        #Create the vectors
        logging.critical("Create Bert embeddings")
        for i in range(0, len(testSentences)):
            #Create target word(s)
            targetWord = str(testSentences[i][vecTypeString].split()[int(
                testSentences[i]["target_index"])])
            targetWords = []
            targetWords.append(tokenizer.tokenize(targetWord))
            targetWords = targetWords[0]

            #Tokenize text
            text = testSentences[i][vecTypeString]
            marked_text = "[CLS] " + text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)

            #Search the indices of the tokenized target word in the tokenized text
            targetWordIndices = []

            # j indexes the tokenized text (i is the index of the current test sentence)
            for j in range(0, len(tokenized_text)):
                if tokenized_text[j] == targetWords[0]:
                    for l in range(0, len(targetWords)):
                        if tokenized_text[j + l] == targetWords[l]:
                            targetWordIndices.append(j + l)
                        if len(targetWordIndices) == len(targetWords):
                            break

            #Create BERT Token Embeddings
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            segments_ids = [1] * len(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            model.eval()
            with torch.no_grad():
                outputs = model(tokens_tensor, segments_tensors)
                hidden_states = outputs[2]
            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1, 0, 2)
            vectors = []
            for number in targetWordIndices:
                token = token_embeddings[number]
                sum_vec = np.sum([np.array(token[12]),
                                  np.array(token[1])],
                                 axis=0)
                vectors.append(np.array(sum_vec))
            contextVectorList.append(np.sum(vectors, axis=0, dtype=float))

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("Bert end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example No. 21
def main():
    """
    Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.
    """

    # Get the arguments
    args = docopt('''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.

    Usage:
        ri.py [-l] (-s <seeds> | -a) <matrixPath> <outPath> <outPathElement> <dim> <t>

        <seeds> = number of non-zero values in each random vector
        <matrixPath> = path to matrix
        <outPath> = output path for reduced space 
        <outPathElement> = output path for elemental space (context vectors)
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')
    
    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    outPathElement = args['<outPathElement>']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   
    matrix = space.matrix
    
    # Get mappings between rows/columns and words
    rows = space.rows
    id2row = space.id2row
    row2id = space.row2id
    columns = space.columns
    id2column = space.id2column
    column2id = space.column2id

    ## Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(columns), dim))
        # Generate base vector for random vectors
        baseVector = np.zeros(dim)  # Note: Make sure that number of seeds is not greater than dimensions
        for i in range(0, int(seeds / 2)):
            baseVector[i] = 1.0
        for i in range(int(seeds / 2), seeds):
            baseVector[i] = -1.0
        for i in range(len(columns)):
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    if is_aut:
        elementalMatrix = sparse_random_matrix(dim, len(columns)).toarray().T

    elementalMatrix = csc_matrix(elementalMatrix)
    # to-do: get rid of transformation into sparse matrices by initializing them as such

    # Initialize target vectors
    reducedMatrix = np.zeros((len(rows),dim))    

    # Get number of total occurrences of any word
    totalOcc = np.sum(matrix)

    # Define function for downsampling
    downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0
    downsample = np.vectorize(downsample)

    # Get total normalized co-occurrence frequency of all contexts in space
    context_freqs = np.array(matrix.sum(axis=0)) / totalOcc
    
    # To-do: matrix multiplication is done row-wise, do this matrix-wise
    # Iterate over rows of space, find context words and update reduced matrix with low-dimensional random vectors of these context words
    for i in id2row:
        # Get co-occurrence values as matrix
        m = matrix[i]
        # Get nonzero indexes and data
        nonzeros = m.nonzero()
        data = m.data
        # Get low-dimensional random vectors of the context words
        pos_context_vectors = elementalMatrix[nonzeros[1]]
        if t is not None:
            # Apply subsampling
            rfs = context_freqs[0, nonzeros[1]]
            rfs = downsample(rfs)
            data *= rfs
        data = csc_matrix(data)
        # Weight context vectors by occurrence frequency
        pos_context_vectors = pos_context_vectors.multiply(data.reshape(-1, 1))
        pos_context_vectors = np.sum(pos_context_vectors, axis=0)
        # Add up context vectors and store as row for target
        reducedMatrix[i] = pos_context_vectors
    
    outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrices
    outSpace.save(outPath, format='w2v')
    Space(matrix=elementalMatrix, rows=columns, columns=[]).save(outPathElement)

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
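The to-do note inside the loop above asks for a matrix-wise formulation. A minimal sketch of that idea, reusing the variable names from the function (matrix, elementalMatrix, context_freqs, downsample, t) together with scipy's diags helper; this function is an illustration and not part of the original script:

from scipy.sparse import diags

def project_matrixwise(matrix, elementalMatrix, context_freqs, downsample, t):
    # Sketch only: collapse the row-wise loop into a single sparse product.
    # Scaling every context column by its downsampling factor is equivalent to
    # multiplying each nonzero co-occurrence count by rfs inside the loop.
    if t is not None:
        scale = diags(downsample(context_freqs).ravel())
        return (matrix @ scale @ elementalMatrix).toarray()
    return (matrix @ elementalMatrix).toarray()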
Exemplo n.º 22
0
def main():
    """
    Compute cosine distance for targets in two matrices.
    """

    # Get the arguments
    args = docopt("""Compute cosine distance for targets in two matrices.

    Usage:
        cd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath>

        <testset> = path to file with tab-separated word pairs
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file

    Note:
        Important: spaces must already be aligned (columns in the same order)!
        Vectors for the targets in the first/second column of the testset are taken from matrix1/matrix2.

    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    testset = args['<testset>']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space1 = Space(matrixPath1, format='npz')
    except ValueError:
        space1 = Space(matrixPath1, format='w2v')
    try:
        space2 = Space(matrixPath2, format='npz')
    except ValueError:
        space2 = Space(matrixPath2, format='w2v')

    matrix1 = space1.matrix
    row2id1 = space1.row2id
    matrix2 = space2.matrix
    row2id2 = space2.row2id

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1])
                   for line in f_in]

    scores = {}
    for (t1, t2) in targets:

        # Get row vectors
        try:
            v1 = matrix1[row2id1[t1]].toarray().flatten()
            v2 = matrix2[row2id2[t2]].toarray().flatten()
        except KeyError:
            scores[(t1, t2)] = 'nan'
            continue

        # Compute cosine distance of vectors
        distance = cosine_distance(v1, v2)
        scores[(t1, t2)] = distance

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only the first target string
                f_out.write('\t'.join((t1, str(scores[(t1, t2)]))) + '\n')
            elif is_scd:  # output only the second target string
                f_out.write('\t'.join((t2, str(scores[(t1, t2)]))) + '\n')
            else:  # default: output both target strings
                f_out.write('\t'.join(('%s,%s' % (t1, t2), str(scores[(t1, t2)]))) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))
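A hypothetical invocation of the script above (the file names are placeholders, not taken from the original):

    python cd.py -f targets.tsv matrix1.npz matrix2.npz distances.tsv

cosine_distance is not defined in this snippet; a minimal stand-in with the usual definition (1 minus cosine similarity, as in scipy.spatial.distance.cosine) could look like this, though the helper actually used may differ:

import numpy as np

def cosine_distance(v1, v2):
    # Assumed definition: 1 - cosine similarity; expects non-zero 1-D vectors
    return 1.0 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))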
Exemplo n.º 23
0
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        LSC_W2V.py  <pathSentences1> <pathSentences2> <outPathVectors> <outPathLabels> <outPathResults> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize>
        LSC_W2V.py  <pathSentences1> <pathSentences2> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize>  
    
    Arguments:
       
        <pathSentences1> = Path to the test sentences from time1
        <pathSentences2> = Path to the test sentences from time2
        <outPathVectors> = Path to store the vectors
        <outPathLabels> = Path to store the clustering labels
        <outPathResults> = Path to store the lsc scores
        <sentenceType> = "lemma" or "token"
        <clusteringInitialization> = "gaac" for precalculated initializations, else random
        <clustering> = "kmeans" or "hierarchical"
        <limitAGL> = Change score limit for AGL above which a change is assigned (about 0.2 works well)
        <limitCOS> = Change score limit for cosine distance above which a change is assigned (about 0.02 works well)
        <limitCluster> = Minimum number of elements a cluster must contain in one time period, while containing fewer in the other, for a change to be assigned (5-10 works well)
        <windowSize> = Window size for words to count as context of other words (20 works well)
        


    """)

    pathSentences1 = args['<pathSentences1>']
    pathSentences2 = args['<pathSentences2>']
    outPathVectors = args['<outPathVectors>']
    outPathLabels = args['<outPathLabels>']
    clusteringInitialization = args['<clusteringInitialization>']
    clustering = args['<clustering>']
    pathResults = args['<outPathResults>']
    limitAGL = float(args['<limitAGL>'])
    limitCOS = float(args['<limitCOS>'])
    limitCluster = int(args['<limitCluster>'])
    windowSize = int(args['<windowSize>'])
    sentenceType = args['<sentenceType>']

    # Second usage pattern (no output paths given): fall back to default paths
    if len(sys.argv) == 10:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"
        outPathLabels = "Files/Clustering/cluster_labels.csv"
        pathResults = "Files/LSC/lsc_scores.csv"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("W2v LSC start")

    # Create the vectors of corpus 1
    logging.critical("Create the vectors of corpus 1")
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/W2v.py $pathSentences1 $outPathVectors $windowSize $sentenceType'
    )

    inSpace = Space(path=outPathVectors)
    vectors1 = inSpace.matrix.toarray()

    # Create the vectors of corpus 2
    logging.critical("Create the vectors of corpus 2")
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/W2v.py $pathSentences2 $outPathVectors $windowSize $sentenceType'
    )
    inSpace = Space(path=outPathVectors)
    vectors2 = inSpace.matrix.toarray()

    #Create the lists to store the binary results in
    cosineDistanceBinary = []
    APDBinary = []
    clusterScoreBinary = []

    #Calculate cosineDistance for the two vectors
    cosineDistance = getCOS(vectors1, vectors2)
    if cosineDistance >= limitCOS:
        cosineDistanceBinary.append(1)
    else:
        cosineDistanceBinary.append(0)

    #Calculate Average pairwise distance for the two vectors
    APD = getAPD(vectors1, vectors2, 200)
    if APD >= limitAGL:
        APDBinary.append(1)
    else:
        APDBinary.append(0)

    #Create and cluster the combined vectors of both corpora
    logging.critical("Create and cluster the combined vectors of both corpora")
    vectors = np.concatenate((vectors1, vectors2), axis=0)
    outSpace = Space(matrix=vectors, rows=" ", columns=" ")
    outSpace.save(outPathVectors)
    #Cluster the combined vectors
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/Clustering.py $outPathVectors 0 $outPathLabels 0 $clusteringInitialization 0 $clustering'
    )

    # Load list of labels (the last line of the CSV holds the most recent clustering);
    # splitting on commas also handles labels with more than one digit
    with open(outPathLabels, 'r') as file:
        data = file.readlines()
    labels = [int(i) for i in data[-1].strip().split(",") if i != ""]

    # Calculate cluster-based LSC score
    labelA_1 = []
    labelA_2 = []

    maximum = len(vectors1)
    for i in range(0, len(vectors1)):
        labelA_1.append(labels[i])

    for i in range(maximum, maximum + len(vectors2)):
        labelA_2.append(labels[i])

    changeA = 0
    for j in set(labels):
        if labelA_1.count(j) >= limitCluster:
            if labelA_2.count(j) < limitCluster:
                changeA = 1
        if labelA_2.count(j) >= limitCluster:
            if labelA_1.count(j) < limitCluster:
                changeA = 1

    clusterScoreBinary.append(changeA)

    # Use a common binning over all cluster labels so that the two distributions
    # are comparable (np.histogram would otherwise pick different bin edges per list)
    bins = np.arange(min(labels), max(labels) + 2)
    p = np.histogram(labelA_1, bins=bins)[0] / len(labelA_1)
    q = np.histogram(labelA_2, bins=bins)[0] / len(labelA_2)

    dist = distance.jensenshannon(p, q)

    filename1 = os.path.splitext(os.path.basename(pathSentences1))[0]
    filename2 = os.path.splitext(os.path.basename(pathSentences2))[0]

    cos = [filename1, filename2, "cosineDistance", cosineDistance]
    apd = [filename1, filename2, "APD", APD]
    cluster = [filename1, filename2, "clusterScore", dist]
    cosBin = [
        filename1, filename2, "cosineDistanceBinary", cosineDistanceBinary[0]
    ]
    APDBin = [filename1, filename2, "APDBinary", APDBinary[0]]
    clusterBin = [
        filename1, filename2, "clusterScoreBinary", clusterScoreBinary[0]
    ]

    print("Graded LSC:")
    print("")
    print("cosine distance:")
    print(cosineDistance)
    print("")
    print("Average pairwise distance:")
    print(APD)
    print("")
    print("JSD:")
    print(dist)
    print("")
    print("")
    print("Binary LSC:")
    print("")
    print("cosine distance binary:")
    print(cosineDistanceBinary[0])
    print("APD distance binary:")
    print(APDBinary[0])
    print("JSD binary:")
    print(clusterScoreBinary[0])

    with open(pathResults, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([cos, apd, cluster, cosBin, APDBin, clusterBin])

    logging.critical("W2v LSC end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Exemplo n.º 24
0
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        Clustering.py  <pathVectors> <pathTestSentences> <outPathLabels> <outPathResults> <initializationType> <numberClusters> <clustering> 
        Clustering.py  <pathTestSentences> <initializationType> <numberClusters> <clustering>
        
    Arguments:

        <pathVectors> = Path to the vectors
        <pathTestSentences> = Path to the test sentences that contain the gold clustering; set to 0 if no performance evaluation is needed
        <outPathLabels> = Path to store the labels
        <outPathResults> = Path to store the performance scores; set to 0 if no performance evaluation is needed
        <initializationType> = "gaac" for precalculated initialization, else random (only used for kmeans)
        <numberClusters> = Number of desired clusters; if 0, it is determined via the silhouette score
        <clustering> = Either "hierarchical" or "kmeans"

    
    """)
    
    pathVectors = args['<pathVectors>']
    pathTestSentences = args['<pathTestSentences>']
    initializationType = args['<initializationType>']
    numberClusters = int(args['<numberClusters>'])
    outPathLabels = args['<outPathLabels>']
    outPathResults = args['<outPathResults>']
    clustering = args['<clustering>']
    

    # Second usage pattern (no vector/output paths given): fall back to default paths
    if len(sys.argv) == 5:
        pathVectors = "Files/Vectors/SecondOrder/Vectors.npz"
        outPathLabels = "Files/Clustering/cluster_labels.csv"
        outPathResults = "Files/Clustering/cluster_scores.csv"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL)
    print("")
    start_time = time.time()  
    logging.critical("Clustering start") 

    # Load vectors
    inSpace = Space(path=pathVectors)
    loaded_contextVectorList_sparse = inSpace.matrix

    if pathTestSentences != "0":
        # Get gold clustering if it exists
        testSentences = []
        gold = []
        with open(pathTestSentences, 'r') as csvFile:
            reader = csv.DictReader(csvFile, delimiter="\t")
            for row in reader:
                testSentences.append(dict(row))
        for dic in testSentences:
            gold.append(int(dic['cluster']))
            
    if numberClusters == 0:
        # Calculate the silhouette score for each number of clusters and keep the best
        range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
        maxIndex = 0
        maxValue = 0
        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(loaded_contextVectorList_sparse.toarray())
            silhouette_avg = silhouette_score(loaded_contextVectorList_sparse.toarray(), cluster_labels)
            if maxValue <= silhouette_avg:
                maxValue = silhouette_avg
                maxIndex = n_clusters
        numberClusters = maxIndex

      
    
    if clustering == "hierarchical":
        clustering = AgglomerativeClustering(n_clusters=numberClusters).fit(
            loaded_contextVectorList_sparse.toarray())
        label = clustering.labels_

    else:

        if initializationType == "gaac":

            # Calculate GAAC on sample vectors for initial centroids
            testList = []
            size = min(len(loaded_contextVectorList_sparse.toarray()), 50)
            randoms = random.sample(range(0, len(loaded_contextVectorList_sparse.toarray())), size)
            for i in randoms:
                testList.append(loaded_contextVectorList_sparse[i].toarray()[0])
            initialCentroids = preprocessing.normalize(gaac(testList, numberClusters), norm='l2')

            # Calculate kmeans
            centroid, label = kmeans2(loaded_contextVectorList_sparse.toarray(),
                                      initialCentroids, 5, minit='matrix')

        else:
            # Calculate kmeans
            centroid, label = kmeans2(loaded_contextVectorList_sparse.toarray(),
                                      numberClusters, 5, minit='points')

    if outPathResults != "0":
        filename = os.path.splitext(os.path.basename(pathTestSentences))[0]

        ADJ = [filename, "ADJ", round(adjusted_rand_score(gold, label), 3)]
        ACC = [filename, "ACC", cluster_accuracy(np.array(gold), np.array(label))]

        with open(outPathResults, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerows([ADJ, ACC])

        #Show results 
        print("")
        print(filename)
        print("")
        print("Adjusted rand index:")
        print(round(adjusted_rand_score(gold, label),3))
        print("Accuracy:")
        print(cluster_accuracy(np.array(gold), np.array(label)))
        print("")
        #plotClusters(loaded_contextVectorList_sparse.toarray(), gold, label)                                  

    #Save labels
    with open(outPathLabels, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([label])    
    logging.critical("Clustering end") 
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")