def main(): """ Align two sparse matrices by intersecting their columns. """ # Get the arguments args = docopt('''Align two sparse matrices by intersecting their columns. Usage: ci.py <matrix1> <matrix2> <outPath1> <outPath2> <matrix1> = path to matrix1 in npz format <matrix2> = path to matrix2 in npz format <outPath1> = output path for aligned matrix 1 <outPath2> = output path for aligned matrix 2 ''') matrix1 = args['<matrix1>'] matrix2 = args['<matrix2>'] outPath1 = args['<outPath1>'] outPath2 = args['<outPath2>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices, rows and columns space1 = Space(matrix1) space2 = Space(matrix2) matrix1 = space1.matrix rows1 = space1.rows columns1 = space1.columns column2id1 = space1.column2id matrix2 = space2.matrix rows2 = space2.rows columns2 = space2.columns column2id2 = space2.column2id # Intersect columns of matrices intersected_columns = list(set(columns1).intersection(columns2)) intersected_columns_id1 = [ column2id1[item] for item in intersected_columns ] intersected_columns_id2 = [ column2id2[item] for item in intersected_columns ] reduced_matrix1 = matrix1[:, intersected_columns_id1] reduced_matrix2 = matrix2[:, intersected_columns_id2] # Save matrices Space(matrix=reduced_matrix1, rows=rows1, columns=intersected_columns).save(outPath1) Space(matrix=reduced_matrix2, rows=rows2, columns=intersected_columns).save(outPath2) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Create low-dimensional matrix from count matrix by multiplication with random matrix. """ # Get the arguments args = docopt( '''Create low-dimensional matrix from count matrix by multiplication with random matrix. Usage: multiply.py [-l] [-c] <countPath> <randomPath> <outPath> <countPath> = path to count matrix <randomPath> = path to random matrix <outPath> = output path for reduced matrix Options: -l, --len normalize final vectors to unit length -c, --cen mean center columns of final matrix ''') is_len = args['--len'] is_cen = args['--cen'] countPath = args['<countPath>'] randomPath = args['<randomPath>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices countSpace = Space(countPath) countMatrix = countSpace.matrix randomSpace = Space(randomPath) randomMatrix = randomSpace.matrix logging.info("Multiplying matrices") reducedMatrix = np.dot(countMatrix, randomMatrix) reducedSpace = Space(matrix=reducedMatrix, rows=countSpace.rows, columns=[]) if is_len: logging.info("L2-normalize vectors") reducedSpace.l2_normalize() if is_cen: logging.info("Mean center columns") reducedSpace.mean_center() # Save the reduced matrix reducedSpace.save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Higher-order similarity matrix. """ # Get the arguments args = docopt('''Apply the similarity order transformation. Usage: sot.py [-l] <matrixPath> <outPath> <alpha> <matrixPath> = path to matrix <outPath> = output path for space <alpha> = the desired similarity-order Options: -l, --len normalize vectors to unit length before centering ''') is_len = args['--len'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] alpha = float(args['<alpha>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space = Space(matrixPath, format='npz') except ValueError: space = Space(matrixPath, format='w2v') # L2-normalize vectors if is_len: space.l2_normalize() # Similarity matrix space.transform_similarity_order(alpha) # Save the matrix space.save(outPath, format="w2v") logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Mean center matrix. """ # Get the arguments args = docopt('''Mean center matrix. Usage: center.py [-l] <matrixPath> <outPath> <matrixPath> = path to matrix <outPath> = output path for space Options: -l, --len normalize vectors to unit length before centering ''') is_len = args['--len'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space = Space(matrixPath, format='npz') except ValueError: space = Space(matrixPath, format='w2v') if is_len: # L2-normalize vectors space.l2_normalize() # Mean center space.mean_center() # Save the matrix space.save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Create low-dimensional and sparse random matrix from vocabulary file. """ # Get the arguments args = docopt( '''Create low-dimensional and sparse random matrix from vocabulary file. Usage: random.py <vocabFile> <outPath> <dim> <vocabFile> = row and column vocabulary <outPath> = output path for random matrix <dim> = dimensionality for random vectors Note: Calculates number of seeds automatically as proposed in [1,2] References: [1] Ping Li, T. Hastie and K. W. Church, 2006, "Very Sparse Random Projections". http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf [2] D. Achlioptas, 2001, "Database-friendly random projections", http://www.cs.ucsc.edu/~optas/papers/jl.pdf ''') #np.random.seed(0) # uncomment for reproducibility vocabFile = args['<vocabFile>'] outPath = args['<outPath>'] dim = int(args['<dim>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load vocabulary logging.info("Loading vocabulary") with open(vocabFile, 'r', encoding='utf-8') as f_in: vocabulary = [line.strip() for line in f_in] # Generate random vectors randomMatrix = sparse_random_matrix(dim, len(vocabulary)).toarray().T # Store random matrix Space(matrix=randomMatrix, rows=vocabulary, columns=[]).save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Mean center matrix, depending on flag, and remove top n PCA components """ # Get the arguments args = docopt( '''Depending on the flag, mean centers matrix and applies and removes the top n PCA components. Usage: pcr.py [-m] <matrixPath> <outPath> <threshold> <matrixPath> = path to matrix <outPath> = output path for space <threshold> = threshold, amount of PCA components Options: -m, --mean flag, if mean centering should be applied ''') matrix_path = args['<matrixPath>'] out_path = args['<outPath>'] threshold = args['<threshold>'] is_mean = args['--mean'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() try: space = Space(matrix_path, format='npz') _format_flag = 'npz' except ValueError: space = Space(matrix_path, format='w2v') _format_flag = 'w2v' # MC+PCR space.mc_pcr(int(threshold), is_mean) # Save the matrix space.save(out_path, format=_format_flag) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Compute local neighborhood distance for target pairs from two vector spaces. """ # Get the arguments args = docopt( """Compute local neighborhood distance for target pairs from two vector spaces. Usage: lnd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <k> <testset> = path to file with tab-separated word pairs <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath> = output path for result file <k> = parameter k (k nearest neighbors) Options: -f, --fst write only first target in output file -s, --scd write only second target in output file """) is_fst = args['--fst'] is_scd = args['--scd'] testset = args['<testset>'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath = args['<outPath>'] k = int(args['<k>']) #logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,}) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space1 = Space(matrixPath1, format='npz') except ValueError: space1 = Space(matrixPath1, format='w2v') try: space2 = Space(matrixPath2, format='npz') except ValueError: space2 = Space(matrixPath2, format='w2v') matrix1 = space1.matrix row2id1 = space1.row2id id2row1 = space1.id2row matrix2 = space2.matrix row2id2 = space2.row2id id2row2 = space2.id2row # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in f_in] nbrs1 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix1) nbrs2 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix2) scores = {} neighborUnionSizes = {} for (t1, t2) in targets: # Get nearest neighbors try: index1 = row2id1[t1] index2 = row2id2[t2] except KeyError: scores[(t1, t2)] = 'nan' neighborUnionSizes[(t1, t2)] = 'nan' continue v1 = matrix1[index1].toarray().flatten() v2 = matrix2[index2].toarray().flatten() distances1, indices1 = nbrs1.kneighbors(matrix1[index1]) distances2, indices2 = nbrs2.kneighbors(matrix2[index2]) neighbors1 = list( zip([id2row1[i] for i in indices1.flatten().tolist()], distances1.flatten().tolist())) neighbors2 = list( zip([id2row2[i] for i in indices2.flatten().tolist()], distances2.flatten().tolist())) neighborUnion = sorted( list( set([ a for (a, b) in neighbors1 + neighbors2 if (a in row2id1 and a in row2id2 and not a in [t1, t2]) ]))) # Filter out vectors with 0-length in either matrix neighborUnion = [ a for a in neighborUnion if (len(matrix1[row2id1[a]].data) > 0 and len(matrix2[row2id2[a]].data) > 0) ] simVec1 = [ 1.0 - cosine_distance(matrix1[index1].toarray().flatten(), matrix1[row2id1[n]].toarray().flatten()) for n in neighborUnion ] simVec2 = [ 1.0 - cosine_distance(matrix2[index2].toarray().flatten(), matrix2[row2id2[n]].toarray().flatten()) for n in neighborUnion ] # Compute cosine distance of vectors distance = cosine_distance(simVec1, simVec2) scores[(t1, t2)] = distance neighborUnionSizes[(t1, t2)] = len(neighborUnion) with open(outPath, 'w', encoding='utf-8') as f_out: for (t1, t2) in targets: if is_fst: # output only first target string f_out.write('\t'.join( (t1, str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) elif is_scd: # output only second target string f_out.write('\t'.join( (t2, str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) else: # standard outputs both target strings f_out.write('\t'.join( ('%s,%s' % 
(t1, t2), str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
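# Compact sketch of the local neighborhood distance computed above, for dense toy
# matrices that share one row vocabulary (the script works on Space objects, sparse rows,
# and possibly different vocabularies; names and shapes here are illustrative assumptions).
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
from sklearn.neighbors import NearestNeighbors

def _demo_lnd(matrix1, matrix2, word2id, target, k=10):
    i = word2id[target]
    nn1 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix1)
    nn2 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix2)
    _, idx1 = nn1.kneighbors(matrix1[i:i + 1])
    _, idx2 = nn2.kneighbors(matrix2[i:i + 1])

    # Union of nearest neighbors from both spaces, excluding the target itself
    union = sorted((set(idx1.flatten()) | set(idx2.flatten())) - {i})

    # Second-order vectors: similarity of the target to every word in the neighbor union
    sims1 = [1.0 - cosine_distance(matrix1[i], matrix1[j]) for j in union]
    sims2 = [1.0 - cosine_distance(matrix2[i], matrix2[j]) for j in union]

    # The LND score is the cosine distance between the two similarity profiles
    return cosine_distance(sims1, sims2)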
def main(): """ Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3. """ # Get the arguments args = docopt('''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format. Usage: svd.py [-l] <matrixPath> <outPath> <dim> <gamma> <matrixPath> = path to matrix <outPath> = output path for space <dim> = dimensionality of low-dimensional output vectors <gamma> = eigenvalue weighting parameter Options: -l, --len normalize final vectors to unit length ''') is_len = args['--len'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] dim = int(args['<dim>']) gamma = float(args['<gamma>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix space = Space(matrixPath) matrix = space.matrix # Get mappings between rows/columns and words rows = space.rows id2row = space.id2row id2column = space.id2column # Apply SVD u, s, v = randomized_svd(matrix, n_components=dim, n_iter=5, transpose=False) # Weight matrix if gamma == 0.0: matrix_reduced = u elif gamma == 1.0: #matrix_reduced = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix) matrix_reduced = s * u else: #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula matrix_reduced = np.power(s, gamma) * u outSpace = Space(matrix=matrix_reduced, rows=rows, columns=[]) if is_len: # L2-normalize vectors outSpace.l2_normalize() # Save the matrix outSpace.save(outPath, format='w2v') logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. """ # Get the arguments args = docopt( '''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. Usage: ri.py [-l] <matrixPath> <outPath> <dim> <matrixPath> = path to matrix <outPath> = output path for reduced space <dim> = number of dimensions for random vectors Options: -l, --len normalize final vectors to unit length Note: Paramaters -s, -a and <t> have been removed from an earlier version for efficiency. References: [1] Ping Li, T. Hastie and K. W. Church, 2006, "Very Sparse Random Projections". http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf [2] D. Achlioptas, 2001, "Database-friendly random projections", http://www.cs.ucsc.edu/~optas/papers/jl.pdf ''') is_len = args['--len'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] dim = int(args['<dim>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix countSpace = Space(matrixPath) countMatrix = countSpace.matrix rows = countSpace.rows columns = countSpace.columns # Generate random vectors randomMatrix = csr_matrix( sparse_random_matrix(dim, len(columns)).toarray().T) logging.info("Multiplying matrices") reducedMatrix = np.dot(countMatrix, randomMatrix) outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[]) if is_len: # L2-normalize vectors outSpace.l2_normalize() # Save the matrix outSpace.save(outPath, format='w2v') logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in: Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing. """ # Get the arguments args = docopt( '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices. Usage: srv_align.py [-l] (-s <seeds> | -a) <matrixPath1> <matrixPath2> <outPath1> <outPath2> <outPathElement> <dim> <t> <seeds> = number of non-zero values in each random vector <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath1> = output path for aligned space 1 <outPath2> = output path for aligned space 2 <outPathElement> = output path for elemental space (context vectors) <dim> = number of dimensions for random vectors <t> = threshold for downsampling (if t=None, no subsampling is applied) Options: -l, --len normalize final vectors to unit length -s, --see specify number of seeds manually -a, --aut calculate number of seeds automatically as proposed in [1,2] References: [1] Ping Li, T. Hastie and K. W. Church, 2006, "Very Sparse Random Projections". http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf [2] D. Achlioptas, 2001, "Database-friendly random projections", http://www.cs.ucsc.edu/~optas/papers/jl.pdf ''') is_len = args['--len'] is_seeds = args['--see'] if is_seeds: seeds = int(args['<seeds>']) is_aut = args['--aut'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath1 = args['<outPath1>'] outPath2 = args['<outPath2>'] outPathElement = args['<outPathElement>'] dim = int(args['<dim>']) if args['<t>'] == 'None': t = None else: t = float(args['<t>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrices space1 = Space(matrixPath1) matrix1 = space1.matrix space2 = Space(matrixPath2) matrix2 = space2.matrix # Get mappings between rows/columns and words rows1 = space1.rows id2row1 = space1.id2row row2id1 = space1.row2id columns1 = space1.columns column2id1 = space1.column2id rows2 = space2.rows id2row2 = space2.id2row row2id2 = space2.row2id columns2 = space2.columns column2id2 = space2.column2id # Get union of rows and columns in both spaces unified_rows = sorted(list(set(rows1).union(rows2))) unified_columns = sorted(list(set(columns1).union(columns2))) columns_diff1 = sorted(list(set(unified_columns) - set(columns1))) columns_diff2 = sorted(list(set(unified_columns) - set(columns2))) # Get mappings of indices of columns in original spaces to indices of columns in unified space c2i = {w: i for i, w in enumerate(unified_columns)} cj2i1 = {j: c2i[w] for j, w in enumerate(columns1 + columns_diff1)} cj2i2 = {j: c2i[w] for j, w in enumerate(columns2 + columns_diff2)} if t != None: rows_diff1 = list(set(unified_rows) - set(rows1)) rows_diff2 = list(set(unified_rows) - set(rows2)) r2i = {w: i for i, w in enumerate(unified_rows)} rj2i1 = {j: r2i[w] for j, w in enumerate(rows1 + rows_diff1)} rj2i2 = {j: r2i[w] for j, w in enumerate(rows2 + rows_diff2)} # Build spaces with unified COLUMNS new_columns1 = csc_matrix( (len(rows1), len(columns_diff1) )) # Get empty columns for additional context words unified_matrix1 = csc_matrix(hstack( (matrix1, new_columns1) ))[:, sorted( cj2i1, key=cj2i1.get )] # First concatenate matrix and empty columns and then order columns according to unified_columns new_columns2 = 
csc_matrix((len(rows2), len(columns_diff2))) unified_matrix2 = csc_matrix(hstack( (matrix2, new_columns2)))[:, sorted(cj2i2, key=cj2i2.get)] # Build spaces with unified ROWS new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns))) final_unified_matrix1 = csc_matrix(vstack( (unified_matrix1, new_rows1)))[sorted(rj2i1, key=rj2i1.get)] new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns))) final_unified_matrix2 = csc_matrix(vstack( (unified_matrix2, new_rows2)))[sorted(rj2i2, key=rj2i2.get)] # Add up final unified matrices common_unified_matrix = np.add(final_unified_matrix1, final_unified_matrix2) # Get number of total occurrences of any word totalOcc = np.sum(common_unified_matrix) # Define function for downsampling downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0 downsample = np.vectorize(downsample) # Get total normalized co-occurrence frequency of all contexts in both spaces context_freqs = np.array(common_unified_matrix.sum(axis=0) / totalOcc)[0] ## Generate ternary random vectors if is_seeds: elementalMatrix = lil_matrix((len(unified_columns), dim)) # Generate base vector for random vectors baseVector = np.zeros( dim ) # Note: Make sure that number of seeds is not greater than dimensions for i in range(0, int(seeds / 2)): baseVector[i] = 1.0 for i in range(int(seeds / 2), seeds): baseVector[i] = -1.0 for i in range( len(unified_columns) ): # To-do: make this more efficient by generating random indices for a whole array np.random.shuffle(baseVector) elementalMatrix[i] = baseVector if is_aut: elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T # Initialize target vectors alignedMatrix1 = np.zeros((len(rows1), dim)) alignedMatrix2 = np.zeros((len(rows2), dim)) # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words for (matrix, id2row, cj2i, alignedMatrix) in [(matrix1, id2row1, cj2i1, alignedMatrix1), (matrix2, id2row2, cj2i2, alignedMatrix2)]: # Iterate over targets for i in id2row: # Get co-occurrence values as matrix m = matrix[i] # Get nonzero indexes nonzeros = m.nonzero() nonzeros = [cj2i[j] for j in nonzeros[1]] data = m.data pos_context_vectors = elementalMatrix[nonzeros] if t != None: # Apply subsampling rfs = context_freqs[nonzeros] rfs = downsample(rfs) data *= rfs # Weight context vectors by occurrence frequency pos_context_vectors = pos_context_vectors.multiply( data.reshape(-1, 1)) # Add up context vectors and store as row for target alignedMatrix[i] = np.sum(pos_context_vectors, axis=0) outSpace1 = Space(matrix=alignedMatrix1, rows=rows1, columns=[]) outSpace2 = Space(matrix=alignedMatrix2, rows=rows2, columns=[]) if is_len: # L2-normalize vectors outSpace1.l2_normalize() outSpace2.l2_normalize() # Save the matrices outSpace1.save(outPath1) outSpace2.save(outPath2) Space(matrix=elementalMatrix, rows=unified_columns, columns=[]).save(outPathElement) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix. Smoothing is performed as described in Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3. """ # Get the arguments args = docopt('''Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix and save it. Usage: ppmi.py [-l] <matrixPath> <outPath> <k> <alpha> <matrixPath> = path to matrix <outPath> = output path for space <k> = shifting parameter <alpha> = smoothing parameter Options: -l, --len normalize final vectors to unit length ''') is_len = args['--len'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] k = int(args['<k>']) alpha = float(args['<alpha>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix space = Space(matrixPath) # Apply EPMI weighting space.epmi_weighting(alpha) # Apply log weighting space.log_weighting() # Shift values space.shifting(k) # Eliminate negative counts space.eliminate_negative() # Eliminate zero counts space.eliminate_zeros() outSpace = Space(matrix=space.matrix, rows=space.rows, columns=space.columns) if is_len: # L2-normalize vectors outSpace.l2_normalize() # Save the matrix outSpace.save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Make count-based vector space from corpus. """ # Get the arguments args = docopt("""Make count-based vector space from corpus. Usage: count.py [-l] <corpDir> <outPath> <windowSize> Arguments: <corpDir> = path to corpus or corpus directory (iterates through files) <outPath> = output path for vectors <windowSize> = the linear distance of context words to consider in each direction Options: -l, --len normalize final vectors to unit length """) is_len = args['--len'] corpDir = args['<corpDir>'] outPath = args['<outPath>'] windowSize = int(args['<windowSize>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Build vocabulary logging.info("Building vocabulary") sentences = LineSentence(corpDir) # sentences = PathLineSentences(corpDir) vocabulary = sorted( list( set([ word for sentence in sentences for word in sentence if len(sentence) > 1 ]))) # Skip one-word sentences to avoid zero-vectors w2i = {w: i for i, w in enumerate(vocabulary)} # Initialize co-occurrence matrix as dictionary cooc_mat = defaultdict(lambda: 0) # Get counts from corpus sentences = PathLineSentences(corpDir) logging.info("Counting context words") for sentence in sentences: for i, word in enumerate(sentence): lowerWindowSize = max(i - windowSize, 0) upperWindowSize = min(i + windowSize, len(sentence)) window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1] if len(window) == 0: # Skip one-word sentences continue windex = w2i[word] for contextWord in window: cooc_mat[(windex, w2i[contextWord])] += 1 # Convert dictionary to sparse matrix logging.info("Converting dictionary to matrix") cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)), dtype=float) try: cooc_mat_sparse.update(cooc_mat) except NotImplementedError: cooc_mat_sparse._update(cooc_mat) outSpace = Space(matrix=cooc_mat_sparse, rows=vocabulary, columns=vocabulary) if is_len: # L2-normalize vectors outSpace.l2_normalize() # Save the matrix outSpace.save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in: Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing. """ # Get the arguments args = docopt( '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices. Usage: srv_align.py [-l] <matrixPath1> <matrixPath2> <outPath1> <outPath2> <dim> <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath1> = output path for aligned space 1 <outPath2> = output path for aligned space 2 <dim> = number of dimensions for random vectors Options: -l, --len normalize final vectors to unit length Note: Assumes intersected and ordered columns. Paramaters -s, -a and <t> have been removed from an earlier version for efficiency. Also columns are now intersected instead of unified. References: [1] Ping Li, T. Hastie and K. W. Church, 2006, "Very Sparse Random Projections". http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf [2] D. Achlioptas, 2001, "Database-friendly random projections", http://www.cs.ucsc.edu/~optas/papers/jl.pdf ''') is_len = args['--len'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath1 = args['<outPath1>'] outPath2 = args['<outPath2>'] dim = int(args['<dim>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrices countSpace1 = Space(matrixPath1) countMatrix1 = countSpace1.matrix rows1 = countSpace1.rows columns1 = countSpace1.columns countSpace2 = Space(matrixPath2) countMatrix2 = countSpace2.matrix rows2 = countSpace2.rows columns2 = countSpace2.columns # Generate random vectors randomMatrix = csr_matrix( sparse_random_matrix(dim, len(columns1)).toarray().T) logging.info("Multiplying matrices") reducedMatrix1 = np.dot(countMatrix1, randomMatrix) reducedMatrix2 = np.dot(countMatrix2, randomMatrix) outSpace1 = Space(matrix=reducedMatrix1, rows=rows1, columns=[]) outSpace2 = Space(matrix=reducedMatrix2, rows=rows2, columns=[]) if is_len: # L2-normalize vectors outSpace1.l2_normalize() outSpace2.l2_normalize() # Save the matrices outSpace1.save(outPath1) outSpace2.save(outPath2) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Make count-based vector space from corpus. """ # Get the arguments args = docopt("""Make count-based vector space from corpus. Usage: count.py <corpDir> <vocabFile> <outPath> <windowSize> <corpDir> = path to corpus or corpus directory (iterates through files) <vocabFile> = row and column vocabulary <outPath> = output path for vectors <windowSize> = the linear distance of context words to consider in each direction Note: Skips one-word sentences to avoid zero-vectors. Does not increase window size when out-of-vocabulary words are found. """) corpDir = args['<corpDir>'] vocabFile = args['<vocabFile>'] outPath = args['<outPath>'] windowSize = int(args['<windowSize>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load vocabulary logging.info("Loading vocabulary") with open(vocabFile, 'r', encoding='utf-8') as f_in: vocabulary = [line.strip() for line in f_in] w2i = {w: i for i, w in enumerate(vocabulary)} # Initialize co-occurrence matrix as dictionary cooc_mat = defaultdict(lambda: 0) # Get counts from corpus logging.info("Counting context words") sentences = PathLineSentences(corpDir) for sentence in sentences: for i, word in enumerate(sentence): try: windex = w2i[word] except KeyError: continue lowerWindowSize = max(i - windowSize, 0) upperWindowSize = min(i + windowSize, len(sentence)) window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1] if len(window) == 0: # Skip one-word sentences continue for contextWord in window: try: cindex = w2i[contextWord] except KeyError: continue cooc_mat[(windex, cindex)] += 1 # Convert dictionary to sparse matrix logging.info("Converting dictionary to matrix") cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)), dtype=float) try: cooc_mat_sparse.update(cooc_mat) except NotImplementedError: cooc_mat_sparse._update(cooc_mat) outSpace = Space(matrix=cooc_mat_sparse, rows=vocabulary, columns=vocabulary) # Save the matrix outSpace.save(outPath) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): # Get the arguments args = docopt(""" Usage: CountBasedVectors.py <pathMatrix> <pathw2i> <pathCorpus> <pathTestSentences> <outPathVectors> <sentenceType> <windowSize2> CountBasedVectors.py <pathCorpus> <pathTestSentences> <sentenceType> <windowSize2> Arguments: <pathMatrix> = Path to the word vector matrix <pathw2i> = Path to the word-to-index <pathCorpus> = path to the corpus <pathTestSentences> = Path to the test sentences <outPathVectors> = Path for storing the vectors <sentenceType> = "lemma" or "token" <windowSize2> = Window size (20 works fine) """) pathMatrix = args['<pathMatrix>'] pathTestSentences = args['<pathTestSentences>'] pathw2i = args['<pathw2i>'] outPathVectors = args['<outPathVectors>'] windowSize2 = int(args['<windowSize2>']) pathCorpus = args['<pathCorpus>'] sentenceType = args['<sentenceType>'] if len(sys.argv) == 5: pathMatrix = "Files/Vectors/FirstOrder/matrix.npz" pathw2i = "Files/Vectors/FirstOrder/w2i.npz.npy" outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz" logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL) print("") start_time = time.time() logging.critical("ContextVectors start") #Load w2i w2i = np.load(pathw2i, allow_pickle='TRUE').item() if sentenceType == "token": sentType = "sentence_token" else: sentType = "sentence" #Load saved wordVectorMatrix try: inSpace = Space(path=pathMatrix, format='w2v') except UnicodeDecodeError: inSpace = Space(path=pathMatrix) #inSpace = Space(path=pathMatrix, format='w2v') #inSpace = Space(path=pathMatrix) cooc_mat_sparse = inSpace.matrix #Calculate IDF for every word docFreq = {} for i in range(0, len(w2i)): docFreq[i] = 0 with gzip.open(pathCorpus, 'rt', encoding="utf-8") as sentences: count = 0 try: for sentence in sentences: count = count + 1 for word in set(sentence.split()): if word in w2i: docFreq[w2i[word]] += 1 except: pass for key, value in w2i.items(): docFreq[value] = math.log10(count / max(docFreq[value], 1)) #Load TestSentences contextVectorList = [] testSentences = [] with open(pathTestSentences, 'r') as csvFile: reader = csv.DictReader(csvFile, delimiter="\t") for row in reader: testSentences.append(dict(row)) #Calculate contextVectorMatrix logging.critical("Calculate contextVectorMatrix") nonExisting = False target = str(testSentences[0]["original_word"]) for dic in testSentences: sentence = dic[sentType].split() for i, word in enumerate(sentence): if str(i) == dic['target_index'] and word == target: toMelt = [] toMeltIDF = [] lowerWindowSize = max(i - windowSize2, 0) upperWindowSize = min(i + windowSize2, len(sentence)) window = sentence[lowerWindowSize:i] + sentence[ i + 1:upperWindowSize + 1] if word in w2i: windex = w2i[word] for contextWord in window: if contextWord != "$": if contextWord in w2i: contextWordIndex = w2i[contextWord] toMelt.append( cooc_mat_sparse[contextWordIndex].toarray( )[0] * math.pow(docFreq[contextWordIndex], 1)) contextVectorList.append(getContextVector(toMelt)) else: nonExisting = True #Normalize vectors in length contextVectorList = preprocessing.normalize(contextVectorList, norm='l2') #Save contextVectorList_sparse matrix outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ") outSpace.save(outPathVectors) logging.critical("ContextVectors end") logging.critical("--- %s seconds ---" % (time.time() - start_time)) print("")
def main(): # Get the arguments args = docopt(""" Usage: W2v.py <pathTestSentences> <outPathVectors> <windowSize2> <sentenceType> W2v.py <pathTestSentences> <windowSize2> <sentenceType> Arguments: <pathTestSentences> = Path to the test sentences <outPathVectors> = Path for storing the vectors <windowSize2> = Window size (20 works good) <sentenceType> = "lemma" or "token" """) pathTestSentences = args['<pathTestSentences>'] outPathVectors = args['<outPathVectors>'] windowSize2 = int(args['<windowSize2>']) sentenceType = args['<sentenceType>'] if len(sys.argv) == 4: outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz" logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL) print("") start_time = time.time() logging.critical("W2V start") if sentenceType == "token": sentType = "sentence_token" else: sentType = "sentence" if not isinstance(windowSize2, int): windowSize2 = 20 #Load Word2Vec model = gensim.models.KeyedVectors.load_word2vec_format( 'Data/GoogleNews-vectors-negative300.bin', binary=True) #Load TestSentences contextVectorList = [] testSentences = [] with open(pathTestSentences, 'r') as csvFile: reader = csv.DictReader(csvFile, delimiter="\t") for row in reader: testSentences.append(dict(row)) #Calculate contextVectorMatrix logging.critical("Calculate contextVectorMatrix") nonExisting = False #self.target=str(testSentences[0]["original_word"]) for dic in testSentences: sentence = dic[sentType].split() for i, word in enumerate(sentence): if str(i) == dic['target_index']: toMelt = [] toMeltIDF = [] lowerWindowSize = max(i - windowSize2, 0) upperWindowSize = min(i + windowSize2, len(sentence)) window = sentence[lowerWindowSize:i] + sentence[ i + 1:upperWindowSize + 1] if word in model.wv.vocab: for contextWord in window: if contextWord in model.wv.vocab: if contextWord != "$": toMelt.append( preprocessing.normalize( [model.wv[contextWord]], norm='l2')[0]) contextVectorList.append(getContextVector(toMelt)) else: contextVectorList.append(np.zeros(300)) #Normalize vectors in length contextVectorList = preprocessing.normalize(contextVectorList, norm='l2') #Save contextVectorList_sparse matrix outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ") outSpace.save(outPathVectors) logging.critical("W2V end") logging.critical("--- %s seconds ---" % (time.time() - start_time)) print("")
def main(): """ Compute cosine distance for targets in two matrices. """ # Get the arguments args = docopt("""Compute cosine distance for targets in two matrices. Usage: cd.py <testset> <matrix1> <matrix2> <outPath> <testset> = path to file with one target per line <matrix1> = path to matrix1 in npz format <matrix2> = path to matrix2 in npz format <outPath> = output path for result file Note: Important: spaces must be already aligned (columns in same order)! """) matrix1 = args['<matrix1>'] matrix2 = args['<matrix2>'] testset = args['<testset>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows space1 = Space(path=matrix1) space2 = Space(path=matrix2) matrix1 = space1.matrix row2id1 = space1.row2id matrix2 = space2.matrix row2id2 = space2.row2id # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [line.strip() for line in f_in] scores = {} for target in targets: # Get row vectors try: v1 = matrix1[row2id1[target]].toarray().flatten() v2 = matrix2[row2id2[target]].toarray().flatten() except KeyError: scores[target] = 'nan' continue # Compute cosine distance of vectors distance = cosine(v1, v2) scores[target] = distance with open(outPath, 'w', encoding='utf-8') as f_out: for target in targets: f_out.write('\t'.join((target, str(scores[target])+'\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Compute number of context types for all rows of a vector space and save their scores. """ # Get the arguments args = docopt( """Compute number of context types for all rows of a vector space and save their scores. Usage: typs.py [(-n <normConst>)] <testset> <matrixPath> <outPath> <normConst> = normalization constant <testset> = path to file with one target per line in first column <matrixPath> = path to matrix <outPath> = output path for result file Options: -n, --nrm normalize values by normalization constant """) is_norm = args['--nrm'] if is_norm: normConst = float(args['<normConst>']) testset = args['<testset>'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix space = Space(matrixPath) matrix = space.matrix # Get rows row2id = space.row2id # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [line.strip().split('\t')[0] for line in f_in] scores = {} # Iterate over targets for target in targets: try: row = matrix[row2id[target]] except KeyError: scores[target] = 'nan' continue # Get number of non-zero elements in row types = row.getnnz() scores[target] = types with open(outPath, 'w', encoding='utf-8') as f_out: for target in targets: if is_norm: scores[target] = float(scores[target]) / normConst f_out.write('\t'.join((target, str(scores[target]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Compute entropy for rows of targets from vector space. """ # Get the arguments args = docopt("""Compute entropy for rows of targets from vector space. Usage: entropy.py [-n] <testset> <matrixPath> <outPath> <testset> = path to file with one target per line in first column <matrixPath> = path to matrix <outPath> = output path for result file Options: -n, --nrm normalize values by log of number of types """) is_norm = args['--nrm'] testset = args['<testset>'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix space = Space(matrixPath) matrix = space.matrix # Get rows row2id = space.row2id # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [line.strip().split('\t')[0] for line in f_in] scores = {} norms = {} # Iterate over targets for target in targets: try: row = matrix[row2id[target]] except KeyError: scores[target] = 'nan' norms[target] = 'nan' continue # Get all counts in row (non-zero elements) counts = row.data # Compute entropy of row H = entropy(counts, base=2) scores[target] = H if is_norm: # Get number of non-zero elements in row types = row.getnnz() norms[target] = np.log2(types) with open(outPath, 'w', encoding='utf-8') as f_out: for target in targets: if is_norm: scores[target] = float(scores[target]) / float(norms[target]) f_out.write('\t'.join((target, str(scores[target]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): # Get the arguments args = docopt(""" Usage: Bert.py <pathTestSentences> <outPathVectors> <vecType> Bert.py <pathTestSentences> <vecType> Arguments: <pathTestSentences> = Path to the test sentences <outPathVectors> = Path for storing the vectors <vecType> = "token" or "lemma" """) pathTestSentences = args['<pathTestSentences>'] outPathVectors = args['<outPathVectors>'] vecType = args['<vecType>'] if len(sys.argv) == 3: outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz" logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL) print("") start_time = time.time() logging.critical("Bert start") #Load TestSentences # Load pre-trained model tokenizer (vocabulary) global tokenizer tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Load pre-trained model (weights) global model model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True) contextVectorList = [] testSentences = [] with open(pathTestSentences, 'r') as csvFile: reader = csv.DictReader(csvFile, delimiter="\t") for row in reader: testSentences.append(dict(row)) #Token vs. Lemma if vecType == "token": vecTypeString = "sentence_token" else: vecTypeString = "sentence" #Create the vectors logging.critical("Create Bert embeddings") for i in range(0, len(testSentences)): #Create target word(s) targetWord = str(testSentences[i][vecTypeString].split()[int( [testSentences[i]["target_index"]][0])]) targetWords = [] targetWords.append(tokenizer.tokenize(targetWord)) targetWords = targetWords[0] #Tokenize text text = testSentences[i][vecTypeString] marked_text = "[CLS] " + text + " [SEP]" tokenized_text = tokenizer.tokenize(marked_text) #Search the indices of the tokenized target word in the tokenized text targetWordIndices = [] for i in range(0, len(tokenized_text)): if tokenized_text[i] == targetWords[0]: for l in range(0, len(targetWords)): if tokenized_text[i + l] == targetWords[l]: targetWordIndices.append(i + l) if len(targetWordIndices) == len(targetWords): break #Create BERT Token Embeddings indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) segments_ids = [1] * len(tokenized_text) tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) model.eval() with torch.no_grad(): outputs = model(tokens_tensor, segments_tensors) hidden_states = outputs[2] token_embeddings = torch.stack(hidden_states, dim=0) token_embeddings = torch.squeeze(token_embeddings, dim=1) token_embeddings = token_embeddings.permute(1, 0, 2) vectors = [] for number in targetWordIndices: token = token_embeddings[number] sum_vec = np.sum([np.array(token[12]), np.array(token[1])], axis=0) vectors.append(np.array(sum_vec)) contextVectorList.append(np.sum(vectors, axis=0, dtype=float)) #Normalize vectors in length contextVectorList = preprocessing.normalize(contextVectorList, norm='l2') #Save contextVectorList_sparse matrix outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ") outSpace.save(outPathVectors) logging.critical("Bert end") logging.critical("--- %s seconds ---" % (time.time() - start_time)) print("")
def main(): """ Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. """ # Get the arguments args = docopt('''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. Usage: ri.py [-l] (-s <seeds> | -a) <matrixPath> <outPath> <outPathElement> <dim> <t> <seeds> = number of non-zero values in each random vector <matrixPath> = path to matrix <outPath> = output path for reduced space <outPathElement> = output path for elemental space (context vectors) <dim> = number of dimensions for random vectors <t> = threshold for downsampling (if t=None, no subsampling is applied) Options: -l, --len normalize final vectors to unit length -s, --see specify number of seeds manually -a, --aut calculate number of seeds automatically as proposed in [1,2] References: [1] Ping Li, T. Hastie and K. W. Church, 2006, "Very Sparse Random Projections". http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf [2] D. Achlioptas, 2001, "Database-friendly random projections", http://www.cs.ucsc.edu/~optas/papers/jl.pdf ''') is_len = args['--len'] is_seeds = args['--see'] if is_seeds: seeds = int(args['<seeds>']) is_aut = args['--aut'] matrixPath = args['<matrixPath>'] outPath = args['<outPath>'] outPathElement = args['<outPathElement>'] dim = int(args['<dim>']) if args['<t>']=='None': t = None else: t = float(args['<t>']) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load input matrix space = Space(matrixPath) matrix = space.matrix # Get mappings between rows/columns and words rows = space.rows id2row = space.id2row row2id = space.row2id columns = space.columns id2column = space.id2column column2id = space.column2id ## Generate ternary random vectors if is_seeds: elementalMatrix = lil_matrix((len(columns),dim)) # Generate base vector for random vectors baseVector = np.zeros(dim) # Note: Make sure that number of seeds is not greater than dimensions for i in range(0,int(seeds/2)): baseVector[i] = 1.0 for i in range(int(seeds/2),seeds): baseVector[i] = -1.0 for i in range(len(columns)): np.random.shuffle(baseVector) elementalMatrix[i] = baseVector if is_aut: elementalMatrix = sparse_random_matrix(dim,len(columns)).toarray().T elementalMatrix = csc_matrix(elementalMatrix) # to-do: get rid of transformation into sparse matrices by initializing them as such # Initialize target vectors reducedMatrix = np.zeros((len(rows),dim)) # Get number of total occurrences of any word totalOcc = np.sum(matrix) # Define function for downsampling downsample = lambda f: np.sqrt(float(t)/f) if f>t else 1.0 downsample = np.vectorize(downsample) # Get total normalized co-occurrence frequency of all contexts in space context_freqs = np.array(matrix.sum(axis=0))/totalOcc #to-do: matrix multiplication is done row-wise, do this matrix-wise # Iterate over rows of space, find context words and update reduced matrix with low-dimensional random vectors of these context words for i in id2row: # Get co-occurrence values as matrix m = matrix[i] #print(m) # Get nonzero indexes and data nonzeros = m.nonzero() #print(nonzeros) data = m.data # Smooth context distribution pos_context_vectors = elementalMatrix[nonzeros[1]] if t!=None: # Apply subsampling rfs = context_freqs[0,nonzeros[1]] rfs = downsample(rfs) data *= rfs data = csc_matrix(data) # Weight context vectors by occurrence frequency pos_context_vectors = pos_context_vectors.multiply(data.reshape(-1,1)) pos_context_vectors = 
np.sum(pos_context_vectors, axis=0) # Add up context vectors and store as row for target reducedMatrix[i] = pos_context_vectors outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[]) if is_len: # L2-normalize vectors outSpace.l2_normalize() # Save the matrices outSpace.save(outPath, format='w2v') Space(matrix=elementalMatrix, rows=columns, columns=[]).save(outPathElement) logging.info("--- %s seconds ---" % (time.time() - start_time))
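# Small illustration of the frequency downsampling used above: context weights stay at
# 1.0 for rare contexts and are damped by sqrt(t / f) for contexts whose relative
# frequency f exceeds the threshold t (the word2vec-style subsampling formula).
import numpy as np

def _demo_downsample_weights(relative_freqs, t=0.001):
    freqs = np.asarray(relative_freqs, dtype=float)
    return np.where(freqs > t, np.sqrt(t / freqs), 1.0)

# _demo_downsample_weights([0.00001, 0.001, 0.1], t=0.001) -> [1.0, 1.0, 0.1]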
def main(): """ Compute cosine distance for targets in two matrices. """ # Get the arguments args = docopt("""Compute cosine distance for targets in two matrices. Usage: cd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <testset> = path to file with tab-separated word pairs <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath> = output path for result file Options: -f, --fst write only first target in output file -s, --scd write only second target in output file Note: Important: spaces must be already aligned (columns in same order)! Targets in first/second column of testset are computed from matrix1/matrix2. """) is_fst = args['--fst'] is_scd = args['--scd'] testset = args['<testset>'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space1 = Space(matrixPath1, format='npz') except ValueError: space1 = Space(matrixPath1, format='w2v') try: space2 = Space(matrixPath2, format='npz') except ValueError: space2 = Space(matrixPath2, format='w2v') matrix1 = space1.matrix row2id1 = space1.row2id matrix2 = space2.matrix row2id2 = space2.row2id # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in f_in] scores = {} for (t1, t2) in targets: # Get row vectors try: v1 = matrix1[row2id1[t1]].toarray().flatten() v2 = matrix2[row2id2[t2]].toarray().flatten() except KeyError: scores[(t1, t2)] = 'nan' continue # Compute cosine distance of vectors distance = cosine_distance(v1, v2) scores[(t1, t2)] = distance with open(outPath, 'w', encoding='utf-8') as f_out: for (t1, t2) in targets: if is_fst: # output only first target string f_out.write('\t'.join((t1, str(scores[(t1, t2)]) + '\n'))) elif is_scd: # output only second target string f_out.write('\t'.join((t2, str(scores[(t1, t2)]) + '\n'))) else: # standard outputs both target strings f_out.write('\t'.join( ('%s,%s' % (t1, t2), str(scores[(t1, t2)]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): # Get the arguments args = docopt(""" Usage: LSC_W2V.py <pathSentences1> <pathSentences2> <outPathVectors> <outPathLabels> <outPathResults> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize> LSC_W2V.py <pathSentences1> <pathSentences2> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize> Arguments: <pathSentences1> = Path to the test sentences from time1 <pathSentences2> = Path to the test sentences from time2 <outPathVectors> = Path to store the vectors <outPathLabels> = Path to store the clustering labels <outPathResults> = Path to store the lsc scores <sentenceType> = "lemma" or "token" <clusteringInitialization> = "gaac" for precalculated initializations, else random <clustering> = "kmeans" or "hierarchical" <limitAGL> = Change score limit for AGL to still be consiered as change (Good is about 0.2) <limitCOS> = Change score limit for Cosine to still be consiered as change (Good is about 0.02) <limitCluster> = Minimum number of elements a cluster has to contain from one time and less from the other, to get assigned a change (Good is 5-10) <windowSize> = Window size for words to be in context of other words (Good is 20) """) pathSentences1 = args['<pathSentences1>'] pathSentences2 = args['<pathSentences2>'] outPathVectors = args['<outPathVectors>'] outPathLabels = args['<outPathLabels>'] clusteringInitialization = args['<clusteringInitialization>'] clustering = args['<clustering>'] pathResults = args['<outPathResults>'] limitAGL = float(args['<limitAGL>']) limitCOS = float(args['<limitCOS>']) limitCluster = int(args['<limitCluster>']) windowSize = int(args['<windowSize>']) sentenceType = args['<sentenceType>'] if len(sys.argv) == 10: outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz" outPathLabels = "Files/Clustering/cluster_labels.csv" pathResults = "Files/LSC/lsc_scores.csv" logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL) print("") start_time = time.time() logging.critical("W2v LSC start") #Create the vectors of corpora 1 logging.critical("Create the vectors of corpora 1") get_ipython().run_line_magic( 'run', 'WordSenseClustering/W2v.py $pathSentences1 $outPathVectors $windowSize $sentenceType' ) inSpace = Space(path=outPathVectors) vectors1 = inSpace.matrix.toarray() #Createthe vectors of corpora 2 logging.critical("Create the vectors of corpora 2") get_ipython().run_line_magic( 'run', 'WordSenseClustering/W2v.py $pathSentences2 $outPathVectors $windowSize $sentenceType' ) inSpace = Space(path=outPathVectors) vectors2 = inSpace.matrix.toarray() #Create the lists to store the binary results in cosineDistanceBinary = [] APDBinary = [] clusterScoreBinary = [] #Calculate cosineDistance for the two vectors cosineDistance = getCOS(vectors1, vectors2) if cosineDistance >= limitCOS: cosineDistanceBinary.append(1) else: cosineDistanceBinary.append(0) #Calculate Average pairwise distance for the two vectors APD = getAPD(vectors1, vectors2, 200) if APD >= limitAGL: APDBinary.append(1) else: APDBinary.append(0) #Create and cluster the combined vectors of both corpora logging.critical("Create and cluster the combined vectors of both corpora") vectors = np.concatenate((vectors1, vectors2), axis=0) outSpace = Space(matrix=vectors, rows=" ", columns=" ") outSpace.save(outPathVectors) #Cluster the combined vectors get_ipython().run_line_magic( 'run', 'WordSenseClustering/Clustering.py $outPathVectors 0 $outPathLabels 0 
$clusteringInitialization 0 $clustering' ) #Load list of labels labels = [] with open(outPathLabels, 'r') as file: data = file.readlines() for i in data[-1]: if i != ",": if i != "\n": labels.append(int(i)) # Calculated cluster LSC score labelA_1 = [] labelA_2 = [] maximum = len(vectors1) for i in range(0, len(vectors1)): labelA_1.append(labels[i]) for i in range(maximum, maximum + len(vectors2)): labelA_2.append(labels[i]) changeA = 0 for j in set(labels): if labelA_1.count(j) >= limitCluster: if labelA_2.count(j) < limitCluster: changeA = 1 if labelA_2.count(j) >= limitCluster: if labelA_1.count(j) < limitCluster: changeA = 1 clusterScoreBinary.append(changeA) p = np.histogram(labelA_1)[0] / len(labelA_1) q = np.histogram(labelA_2)[0] / len(labelA_2) dist = distance.jensenshannon(p, q) filename1 = os.path.splitext(os.path.basename(pathSentences1))[0] filename2 = os.path.splitext(os.path.basename(pathSentences2))[0] cos = [filename1, filename2, "cosineDistance", cosineDistance] apd = [filename1, filename2, "APD", APD] cluster = [filename1, filename2, "clusterScore", dist] cosBin = [ filename1, filename2, "cosineDistanceBinary", cosineDistanceBinary[0] ] APDBin = [filename1, filename2, "APDBinary", APDBinary[0]] clusterBin = [ filename1, filename2, "clusterScoreBinary", clusterScoreBinary[0] ] print("Graded LSC:") print("") print("cosine distance:") print(cosineDistance) print("") print("Average pairwise distance:") print(APD) print("") print("JSD:") print(dist) print("") print("") print("Binary LSC:") print("") print("cosine distance binary:") print(cosineDistanceBinary[0]) print("APD distance binary:") print(APDBinary[0]) print("JSD binary:") print(clusterScoreBinary[0]) with open(pathResults, 'a', newline='') as file: writer = csv.writer(file) writer.writerows([cos, apd, cluster, cosBin, APDBin, clusterBin]) logging.critical("W2v LSC end") logging.critical("--- %s seconds ---" % (time.time() - start_time)) print("")
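# Minimal sketch of the cluster-based change score above: compare how the usages from
# the two time periods distribute over cluster labels and take the Jensen-Shannon
# distance between the two distributions. Here the shared label set is made explicit;
# the script bins the labels with np.histogram instead.
import numpy as np
from scipy.spatial import distance

def _demo_cluster_jsd(labels_t1, labels_t2):
    all_labels = sorted(set(labels_t1) | set(labels_t2))
    p = np.array([labels_t1.count(l) for l in all_labels], dtype=float) / len(labels_t1)
    q = np.array([labels_t2.count(l) for l in all_labels], dtype=float) / len(labels_t2)
    return distance.jensenshannon(p, q)

# _demo_cluster_jsd([0, 0, 1], [1, 1, 1]) is positive, while identical label
# distributions give 0.0.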
def main(): # Get the arguments args = docopt(""" Usage: Clustering.py <pathVectors> <pathTestSentences> <outPathLabels> <outPathResults> <initializationType> <numberClusters> <clustering> Clustering.py <pathTestSentences> <initializationType> <numberClusters> <clustering> Arguments: <pathVectors> = Path to the vectors <pathTestSentences> = Path to the test sentecens that contain the gold clustering, if no performance is needed set to 0 <outPathLabels> = Path to store the labels <outPathResults> = path to store the performance in, if no performance is needed set to 0 <initializationType> = "gaac" for precalculated initialization, else random. (Only for kmeans used) <numberClusters> = Number of desired clusters, if 0 than its calculated by sillhouette <clustering> = Either "hierarchical" or "kmeans" """) pathVectors = args['<pathVectors>'] pathTestSentences = args['<pathTestSentences>'] initializationType = args['<initializationType>'] numberClusters = int(args['<numberClusters>']) outPathLabels = args['<outPathLabels>'] outPathResults = args['<outPathResults>'] clustering = args['<clustering>'] if len(sys.argv) == 5: pathVectors = "Files/Vectors/SecondOrder/Vectors.npz" outPathLabels = "Files/Clustering/cluster_labels.csv" outPathResults = "Files/Clustering/cluster_scores.csv" logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL) print("") start_time = time.time() logging.critical("Clustering start") #Load vectors inSpace = Space(path=pathVectors) loaded_contextVectorList_sparse=inSpace.matrix if pathTestSentences != "0": #Get gold clustering if exists testSentences=[] gold=[] with open(pathTestSentences, 'r') as csvFile: reader = csv.DictReader(csvFile, delimiter="\t") for row in reader: testSentences.append(dict(row)) for dic in testSentences: gold.append(int(dic['cluster'])) if numberClusters == 0: #Calculate silhouette score for eaach number of clusters range_n_clusters = [2,3,4,5,6,7,8,9,10] maxIndex=0 maxValue=0 for n_clusters in range_n_clusters: clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(loaded_contextVectorList_sparse.toarray()) silhouette_avg = silhouette_score(loaded_contextVectorList_sparse.toarray(), cluster_labels) if maxValue <=silhouette_avg: maxValue=silhouette_avg maxIndex=n_clusters numberClusters = maxIndex if clustering == "hierarchical": clustering = AgglomerativeClustering(n_clusters=numberClusters).fit(loaded_contextVectorList_sparse.toarray()) label=clustering.labels_ else: if initializationType == "gaac": #Calculate GAAC on sample vectors for initial centroids testList=[] size = min(len(loaded_contextVectorList_sparse.toarray()), 50 ) randoms=random.sample(range(0, len(loaded_contextVectorList_sparse.toarray())), size) for i in randoms: testList.append(loaded_contextVectorList_sparse[i].toarray()[0]) initialCentroids=preprocessing.normalize(gaac(testList, numberClusters), norm='l2') #Calculate kmeans centroid, label = kmeans2(loaded_contextVectorList_sparse.toarray(), initialCentroids , 5, minit='matrix') else: #Calculate kmeans centroid, label = kmeans2(loaded_contextVectorList_sparse.toarray(), numberClusters , 5, minit='points') if outPathResults != "0": filename = os.path.splitext(os.path.basename(pathTestSentences))[0] ADJ=[filename, "ADJ", (round(adjusted_rand_score(gold, label),3)) ] ACC=[filename, "ACC", cluster_accuracy(np.array(gold), np.array(label)) ] with open(outPathResults, 'a', newline='') as file: writer = csv.writer(file) writer.writerows([ADJ, ACC]) 
#Show results print("") print(filename) print("") print("Adjusted rand index:") print(round(adjusted_rand_score(gold, label),3)) print("Accuracy:") print(cluster_accuracy(np.array(gold), np.array(label))) print("") #plotClusters(loaded_contextVectorList_sparse.toarray(), gold, label) #Save labels with open(outPathLabels, 'a', newline='') as file: writer = csv.writer(file) writer.writerows([label]) logging.critical("Clustering end") logging.critical("--- %s seconds ---" % (time.time() - start_time)) print("")