analysis_version, datasetabbrev)):
        # NOTE(review): this chunk begins in the middle of the condition above
        # and inside a loop whose header is not visible here. The indentation
        # below was reconstructed from a whitespace-collapsed view -- confirm
        # the nesting (in particular which statements belong to the `with` and
        # to the `if` bodies) against the real file.
        # Skip this dataset when the (truncated) condition holds.
        continue

    # Write a small text file named after the analysis version and dataset
    # abbreviation. Presumably this marks the dataset as "in progress" so that
    # other runs skip it -- verify against the truncated condition above.
    with open('datasets_in_progress_{0}/{1}.txt'.format(
            analysis_version, datasetabbrev),
            mode='wt', encoding="utf-8", errors="surrogateescape") as fw:
        fw.write('working on {}...'.format(datasetabbrev))

    print('working on {}...'.format(datasetabbrev))

    # Load the gene-by-attribute matrix for this dataset from the path stored
    # in datasetinfo. The file is read as tab-delimited float64 values.
    # skiprows/skipcolumns presumably skip 3 leading metadata rows/columns --
    # TODO confirm against loaddatamatrix.
    gene_atb = datasetselection.loaddatamatrix(
        datasetpath=datasetinfo['path'],
        rowname='gene',
        columnname='atb',
        matrixname='gene_atb_associations',
        skiprows=3,
        skipcolumns=3,
        delimiter='\t',
        dtype='float64',
        getmetadata=True,  # need to fix False case
        getmatrix=True)

    # Check that the matrix is binary. The test requires the set of distinct
    # values to be exactly {0, 1}, so a matrix containing only zeros or only
    # ones also takes this branch (the conversion is then a no-op value-wise,
    # but the warning is still printed and the matrix becomes boolean).
    if set(np.unique(gene_atb.matrix)) != {0, 1}:
        print('warning: converting matrix to binary values')
        # Every nonzero entry becomes True, every zero entry becomes False.
        gene_atb.matrix = gene_atb.matrix != 0
        # Presumably re-syncs the object's stored dtype with the new boolean
        # matrix -- confirm in the datamatrix class.
        gene_atb.updatedtypeattribute()

    # Compute attribute-by-attribute similarity with the configured metric.
    # axis=1 presumably selects the column (atb) dimension -- TODO confirm.
    atb_atb = gene_atb.tosimilarity(axis=1, metric=similarity_metric)

    # Align with clusters: keep the row labels of gene_atb that also appear in
    # gene_cluster.rowlabels. Boolean-mask indexing preserves gene_atb's order.
    # NOTE(review): np.in1d is deprecated as of NumPy 2.0 in favour of np.isin;
    # consider switching once the minimum supported NumPy version is known.
    commongenes = gene_atb.rowlabels[np.in1d(gene_atb.rowlabels, gene_cluster.rowlabels)]
# NOTE(review): this chunk appears to start inside a loop over `custompaths`
# whose header is not visible here; the line structure below was reconstructed
# from a whitespace-collapsed view -- confirm the nesting against the real file.
# Add the custom path to the module search path only if it is not already there.
if custompath not in sys.path:
    sys.path.append(custompath)
# Remove the helper names now that sys.path has been extended.
del custompath, custompaths

# These imports follow the sys.path changes above; presumably the
# `machinelearning` package is found via one of the custom paths -- confirm.
import numpy as np
import pickle
import machinelearning.datasetselection as ds
import os

# Load the cleaned gene-by-attribute matrix from a gzipped, tab-delimited
# text file as float32 values. skiprows/skipcolumns presumably skip 3 leading
# metadata rows/columns -- TODO confirm against loaddatamatrix.
gene_atb = ds.loaddatamatrix(
    'data/original_data/gene_attribute_matrix_cleaned.txt.gz',
    rowname='gene',
    columnname='atb',
    matrixname='gene_atb_associations',
    skiprows=3,
    skipcolumns=3,
    delimiter='\t',
    dtype='float32',
    getmetadata=True,  # need to fix False case
    getmatrix=True)

# Shuffle the data: apply an independent random permutation along each
# dimension (the second argument of reorder is presumably the axis, matching
# shape[0] / shape[1]). No seed is set in the visible code, so the ordering
# differs between runs unless the generator is seeded elsewhere.
gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0)
gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1)

# Per-row summary statistics, then z-scores of those statistics across rows.
# Note that the lines visible here standardize the row means and row standard
# deviations themselves, not the matrix entries.
# Assumes gene_atb.matrix is a NumPy array, so mean(1)/std(1) reduce over
# columns and yield one value per row -- TODO confirm.
row_mean = gene_atb.matrix.mean(1)
row_stdv = gene_atb.matrix.std(1)
# NOTE(review): if all rows share the same mean (or the same stdv), the
# denominator is 0 and the result is NaN/inf -- confirm this cannot happen.
standardized_row_mean = (row_mean - row_mean.mean()) / row_mean.std()
standardized_row_stdv = (row_stdv - row_stdv.mean()) / row_stdv.std()