import codecs
import itertools
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, normalize

# FileUtility and GenotypeReader are project-local helpers; their import
# paths depend on the repository layout.


def generate_kmers_all(self, k, save=False):
    '''
    Extract k-mer distributions of all FASTA files in parallel.

    :param k: k-mer length
    :param save: output path prefix; if given, the matrix and logs are saved
    :return: L1-normalized k-mer frequency matrix (scipy CSR)
    '''
    self.k = k
    # enumerate all 4^k k-mers over the DNA alphabet as the vocabulary
    self.vocab = [''.join(xs) for xs in itertools.product('atcg', repeat=k)]
    self.vocab.sort()
    self.vectorizer = TfidfVectorizer(use_idf=False, vocabulary=self.vocab,
                                      analyzer='char', ngram_range=(k, k),
                                      norm=None, stop_words=[],
                                      lowercase=True, binary=False)
    data = np.zeros((len(self.fasta_files), len(self.vocab))).astype(np.float64)

    # multiprocessing extraction of k-mer distributions
    t_steps = []
    s_steps = []
    pool = Pool(processes=self.num_p)
    for ky, (v, t, s) in tqdm.tqdm(
            pool.imap_unordered(self.get_kmer_distribution,
                                self.fasta_files, chunksize=1),
            total=len(self.fasta_files)):
        data[self.indexing[ky], :] = v
        t_steps.append(t)
        s_steps.append(s)
    pool.close()

    # normalize the frequencies
    data = normalize(data, axis=1, norm='l1')
    data = sparse.csr_matrix(data)

    if save:
        FileUtility.save_sparse_csr(save, data)
        FileUtility.save_list(save + '_meta', self.fasta_files)
        FileUtility.save_list(save + '_log', [
            ': '.join(['mean_time', str(np.mean(t_steps))]),
            ': '.join(['std_time', str(np.std(t_steps))]),
            ': '.join(['mean_size', str(np.mean(s_steps))]),
            ': '.join(['std_size', str(np.std(s_steps))])
        ])
    return data
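# Usage sketch (hypothetical; assumes this method belongs to a representation
# class whose constructor sets `fasta_files`, `indexing`, and `num_p`;
# `RepresentationCreator` is a placeholder name, not the actual class):
#
#   rep = RepresentationCreator(fasta_files, num_p=10)
#   kmer_mat = rep.generate_kmers_all(6, save='results/samples_6mers')
#   # kmer_mat: (n_samples x 4^6) L1-normalized CSR matrix; the save prefix
#   # also produces 'results/samples_6mers_meta' and '..._log'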
def create_kmer_table(self, path, k, cores=4, override=False):
    '''
    Build a k-mer feature table for every sequence file in a directory.

    :param path: directory containing the sequence files
    :param k: k-mer length
    :param cores: number of worker processes
    :param override: recompute even if the output already exists
    :return: status message
    '''
    save_path = self.output_path + 'sequence_' + str(k) + 'mer'
    if override or not os.path.exists('_'.join([save_path, 'feature', 'vect.npz'])):
        files = FileUtility.recursive_glob(path, '*')
        files.sort()
        # (strain name, file path, k) tuples for the worker processes
        input_tuples = []
        for file in files:
            input_tuples.append((file.split('/')[-1].split('.')[0], file, k))
        strains = []
        mat = []
        kmers = []
        pool = Pool(processes=cores)
        for strain, vec, vocab in tqdm.tqdm(
                pool.imap_unordered(self._get_kmer_rep, input_tuples,
                                    chunksize=cores),
                total=len(input_tuples)):
            strains.append(strain)
            mat.append(vec)
            kmers = vocab
        pool.close()
        mat = sparse.csr_matrix(mat)
        FileUtility.save_sparse_csr(save_path + '_feature_vect', mat)
        FileUtility.save_list('_'.join([save_path, 'strains', 'list.txt']), strains)
        FileUtility.save_list('_'.join([save_path, 'feature', 'list.txt']), kmers)
        return save_path + ' created'
    return save_path + ' already exists'
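# Usage sketch (hypothetical; `output_path` and the `_get_kmer_rep` worker
# are assumed to be provided by the enclosing class):
#
#   msg = creator.create_kmer_table('data/assemblies/', k=6, cores=8)
#   # writes <output_path>sequence_6mer_feature_vect.npz along with the
#   # '..._strains_list.txt' and '..._feature_list.txt' companion files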
def generate_npes_all(self, save=False, norm=False):
    '''
    Extract NPE (segmentation) distributions of all FASTA files in parallel.

    :param save: output path prefix; if given, the matrix and logs are saved
    :param norm: if True, L1-normalize the frequencies per sample
    :return: NPE frequency matrix (scipy CSR)
    '''
    data = np.zeros((len(self.fasta_files), len(self.npe_vocab))).astype(np.float64)

    # multiprocessing extraction of npe distributions
    t_steps = []
    s_steps = []
    pool = Pool(processes=self.num_p)
    for ky, (v, t, s) in tqdm.tqdm(
            pool.imap_unordered(self._get_npe_distribution,
                                self.fasta_files, chunksize=self.num_p),
            total=len(self.fasta_files)):
        data[self.indexing[ky], :] = v
        t_steps.append(t)
        s_steps.append(s)
    pool.close()

    # normalize the frequencies
    if norm:
        data = normalize(data, axis=1, norm='l1')
    data = sparse.csr_matrix(data)

    if save:
        FileUtility.save_sparse_csr(save, data)
        FileUtility.save_list(save + '_meta', self.fasta_files)
        FileUtility.save_list(save + '_features', self.npe_vocab)
        FileUtility.save_list(save + '_log', [
            ': '.join(['mean_time', str(np.mean(t_steps))]),
            ': '.join(['std_time', str(np.std(t_steps))]),
            ': '.join(['mean_size', str(np.mean(s_steps))]),
            ': '.join(['std_size', str(np.std(s_steps))])
        ])
    return data
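# Usage sketch (hypothetical; assumes `npe_vocab` and the
# `_get_npe_distribution` worker were initialized from a trained NPE
# segmentation model before this call):
#
#   npe_mat = rep.generate_npes_all(save='results/samples_npe', norm=True)
#   # rows follow self.indexing; '..._features' stores the NPE vocabulary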
def create_continous_mics():
    '''
    Parse the MIC table, strip inequality signs ('<=', '>=', '≤', '≥') so the
    values parse as floats, drop the incomplete isolate (row 509), scale the
    MICs to [0, 1], and save the result as a sparse matrix.
    '''
    scaler = MinMaxScaler()
    df = pd.read_table("../data_config/Final_MICs_16.06.16.txt")
    res = df[['Isolates', 'CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC']]
    matrix = np.array([[
        float(str(x).replace('<=', '').replace('≤', '')
              .replace('≥', '').replace('>=', ''))
        for x in row
    ] for row in res[['CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC',
                      'MEM MIC']].values])
    # exploratory check used to locate the NaN entries (found in row 509):
    # [[(idx, idy) for idy, y in enumerate(x) if y]
    #  for idx, x in enumerate(np.isnan(matrix))]
    resistances = np.delete(matrix, [509], axis=0)
    isolates = [x[0] for idx, x in enumerate(list(df[['Isolates']].values))
                if not idx == 509]
    # scale to 0-1
    resistances = scaler.fit_transform(resistances)
    features = ['CIP', 'TOB', 'COL', 'CAZ', 'MEM']
    base_path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/pseudomonas/data_v3/continous_mic_vals'
    resistances = csr_matrix(resistances)
    FileUtility.save_sparse_csr(base_path + '_feature_vect', resistances)
    FileUtility.save_list(base_path + '_isolates_list.txt', isolates)
    FileUtility.save_list(base_path + '_feature_list.txt', features)
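# Usage sketch: with the hard-coded input table in place, a single call
# writes three files next to the hard-coded `base_path`:
#
#   create_continous_mics()
#   # -> continous_mic_vals_feature_vect.npz  (isolate x antibiotic MICs, CSR)
#   # -> continous_mic_vals_isolates_list.txt
#   # -> continous_mic_vals_feature_list.txt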
def create_read_tabular_file(path, save_pref='_', feature_normalization=None,
                             transpose=False, override=False):
    '''
    Read a tab-separated table and save it as a sparse feature matrix.

    :param path: path to the tab-separated input file
    :param save_pref: prefix for the output files
    :param feature_normalization: 'binary': {0,1}, '01': [0,1],
           'percent': {0,1,..,100}, 'zu': zero mean, unit variance
    :param transpose: set True if isolates are the columns
    :param override: recompute even if the output already exists
    :return: status message
    '''
    print('Start creating ', save_pref)
    if override or not os.path.exists('_'.join([save_pref, 'feature', 'vect.npz'])):
        rows = [l.strip() for l in codecs.open(path, 'r', 'utf-8').readlines()]
        tf_vec = sparse.csr_matrix([[GenotypeReader.get_float_or_zero(x)
                                     for x in entry.split('\t')[1::]]
                                    for entry in rows[1::]])
        if transpose:
            tf_vec = sparse.csr_matrix(tf_vec.toarray().T)
            isolates = [feat.replace(' ', '')
                        for feat in rows[0].rstrip().split('\t')]
            feature_names = [row.split()[0] for row in rows[1::]]
        else:
            isolates = [row.split()[0] for row in rows[1::]]
            feature_names = [feat.replace(' ', '')
                             for feat in rows[0].rstrip().split('\t')]
        # normalizer / discretizer
        if feature_normalization:
            if feature_normalization == 'binary':
                tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec))
            elif feature_normalization == '01':
                tf_vec = MaxAbsScaler().fit_transform(tf_vec)
            elif feature_normalization == 'percent':
                tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec) * 100)
            elif feature_normalization == 'zu':
                tf_vec = sparse.csr_matrix(
                    preprocessing.StandardScaler().fit_transform(tf_vec.toarray()))
        FileUtility.save_sparse_csr('_'.join([save_pref, 'feature', 'vect.npz']), tf_vec)
        FileUtility.save_list('_'.join([save_pref, 'feature', 'list.txt']), feature_names)
        FileUtility.save_list('_'.join([save_pref, 'strains', 'list.txt']), isolates)
        message = ' '.join([save_pref, 'created successfully containing',
                            str(len(isolates)), 'strains and',
                            str(len(feature_names)), 'features'])
        print(message)
        return message
    print(save_pref, ' already exists')
    return save_pref + ' already exists'
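# Usage sketch (hypothetical input file and prefix):
#
#   create_read_tabular_file('data/gene_presence_absence.tab',
#                            save_pref='results/genePA',
#                            feature_normalization='binary')
#   # writes results/genePA_feature_vect.npz plus the matching
#   # feature/strain list files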