def kmer2distance(testset_dir, num_testset, size_testset, size_phy, k_mer=6):
    """Compute pairwise Euclidean distances between k-mer vectors of MSA
    sequences and write them as lower-triangular MEGA-format distance files.

    param: testset_dir: root directory holding one subdirectory per testset
    param: num_testset: number of testset subdirectories (named 1..N)
    param: size_testset: number of MSA files per testset
    param: size_phy: number of taxa per MSA (rows/cols of the matrix)
    param: k_mer: k-mer length used for the vector representation
    """
    for i in range(num_testset):
        print('testset[{}] calculate the distance matrix ...'.format(i + 1))
        for j in range(size_testset):
            msa_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rn.fasta' % (j + 1))
            out_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_dist.meg' % (j + 1))
            with open(msa_file, 'rt') as f_read:
                seq_list = [seq for _, seq in SimpleFastaParser(f_read)]
            vecs = np.array([get_kmer(seq, k_mer=k_mer) for seq in seq_list])
            dist = euclidean_distances(vecs, vecs)
            with open(out_file, 'wt') as f_write:
                # MEGA distance format header
                f_write.write('#mega\n')
                # BUGFIX: the matrix is produced by euclidean_distances, but
                # the title previously claimed "cosine distance" — corrected
                # so downstream tools/readers are not misled.
                f_write.write(
                    '!TITLE euclidean distance matrix of k-mer vectors;\n')
                f_write.write(
                    '!Format DataType=distance NTaxa={};\n'.format(size_phy))
                f_write.write('\n')
                f_write.write('\n'.join('#{}'.format(k + 1)
                                        for k in range(size_phy)))
                f_write.write('\n\n\n')
                # lower-triangular distance matrix: row r lists cols 0..r-1
                for r in range(1, size_phy):
                    f_write.write(' '.join(str(a) for a in dist[r, :r]))
                    f_write.write('\n')
def _get_kmer_vecs(self, seqs_batch, k_mer=6):
    """Convert every batch of sequences to its k-mer vector representation.

    param: seqs_batch: list of sequence lists
    param: k_mer: k-mer length forwarded to get_kmer
    return: nested list of k-mer vectors mirroring seqs_batch's structure
    """
    batch_vectors = []
    for sequences in seqs_batch:
        batch_vectors.append([get_kmer(s, k_mer) for s in sequences])
    return batch_vectors
def infer_kmer_vector(seq_file, pickle_file, k_mer=5):
    """Read sequences from a FASTA file, compute their k-mer vectors, and
    persist them as a float tensor with torch.save.

    param: seq_file: path of the input FASTA file
    param: pickle_file: destination path for the saved tensor
    param: k_mer: k-mer length for the vector representation
    """
    print('Inferring k-mer vector with k={}'.format(k_mer))
    with open(seq_file, 'rt') as handle:
        # tqdm shows progress while iterating the FASTA records
        kmer_rows = [
            get_kmer(seq, k_mer=k_mer)
            for _, seq in tqdm(SimpleFastaParser(handle))
        ]
    tensor = torch.tensor(kmer_rows, dtype=torch.float)
    torch.save(tensor, pickle_file)
def compare_variance(method_types, testset_dir, num_testset, size_testset,
                     size_phy, out_filename=None, k_mer=6, silent=False):
    """Score each tree-inference method's variance against the k-mer
    Euclidean distance matrix, writing a per-testset CSV and an overall
    CSV of per-testset averages.

    param: method_types: method-name suffixes of the tree files to score
    param: testset_dir: root directory holding one subdirectory per testset
    param: num_testset: number of testset subdirectories (named 1..N)
    param: size_testset: number of MSA/tree files per testset
    param: size_phy: number of taxa per MSA
    param: out_filename: CSV file name (defaults to 'variance.csv')
    param: k_mer: k-mer length for the vector representation
    param: silent: currently unused; kept for backward compatibility
    """
    if out_filename is None:
        out_filename = 'variance.csv'
    total_result_file = os.path.join(testset_dir, out_filename)
    total_results = {method_type: [] for method_type in method_types}
    for i in range(num_testset):
        # BUGFIX: corrected the "varience" typo in the progress message.
        print('testset[{}] Calculating variance ...'.format(i + 1))
        results = {method_type: [] for method_type in method_types}
        for j in range(size_testset):
            # distance matrix from k-mer vectors of the renamed MSA
            msa_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rn.fasta' % (j + 1))
            with open(msa_file, 'rt') as f_read:
                seq_list = [seq for _, seq in SimpleFastaParser(f_read)]
            vecs = np.array([get_kmer(seq, k_mer=k_mer) for seq in seq_list])
            dist = euclidean_distances(vecs, vecs)
            for method_type in method_types:
                tree_file = os.path.join(testset_dir, str(i + 1),
                                         '%d_%s.nwk' % (j + 1, method_type))
                tree = TreeIO.read(tree_file, 'newick')
                # NOTE: cal_tree_varience is an external helper; its
                # (misspelled) name cannot be changed here.
                var = cal_tree_varience(tree, dist, size_phy)
                results[method_type].append(var)
        avg = {
            method_type: np.mean(results[method_type])
            for method_type in method_types
        }
        for method_type in method_types:
            total_results[method_type].append(avg[method_type])
        # per-testset CSV: one row per method, one column per MSA
        result_file = os.path.join(testset_dir, str(i + 1), out_filename)
        with open(result_file, 'wt') as f_write:
            for method_type in method_types:
                ss = ['{:.2f}'.format(num) for num in results[method_type]]
                f_write.write('{},{}\n'.format(method_type, ','.join(ss)))
    # overall CSV: one row per method, one column per testset average
    with open(total_result_file, 'wt') as f_write:
        for method_type in method_types:
            ss = ['{:.2f}'.format(num) for num in total_results[method_type]]
            f_write.write('{},{}\n'.format(method_type, ','.join(ss)))
def __init__(self, seq_file=None, vec_path=None, k_mer=5, seq2vec=None):
    """Load precomputed vectors from vec_path, or build k-mer vectors from
    a FASTA file; raises if neither source is provided.

    param: seq_file: FASTA file to vectorize when vec_path is absent
    param: vec_path: path of a torch-saved tensor to load directly
    param: k_mer: k-mer length used when vectorizing seq_file
    param: seq2vec: currently unused; kept for backward compatibility
    """
    if vec_path is not None:
        # a precomputed tensor takes precedence over the FASTA source
        self.vectors = torch.load(vec_path)
        return
    if seq_file is None:
        raise RuntimeError('vec_path or seq_file is required!')
    with open(seq_file, 'rt') as handle:
        rows = [
            get_kmer(seq, k_mer=k_mer)
            for _, seq in SimpleFastaParser(handle)
        ]
    self.vectors = torch.tensor(rows, dtype=torch.float)
def infer_tree_tsp(testset_dir, num_testset, size_testset, k_mer=6):
    """Infer one tree per MSA by solving a TSP over the k-mer vectors with
    Gurobi, clustering the circular order into a tree, and writing Newick.

    param: testset_dir: root directory holding one subdirectory per testset
    param: num_testset: number of testset subdirectories (named 1..N)
    param: size_testset: number of MSA files per testset
    param: k_mer: k-mer length for the vector representation
    """
    for set_idx in range(num_testset):
        print('testset[{}] Inferring tree by tsp ...'.format(set_idx + 1))
        subdir = os.path.join(testset_dir, str(set_idx + 1))
        for msa_idx in range(size_testset):
            msa_file = os.path.join(subdir, '%d_rn.fasta' % (msa_idx + 1))
            out_file = os.path.join(subdir, '%d_tsp.nwk' % (msa_idx + 1))
            with open(msa_file, 'rt') as handle:
                sequences = [s for _, s in SimpleFastaParser(handle)]
            kmer_vecs = [get_kmer(s, k_mer=k_mer) for s in sequences]
            circular_order = gurobi_tsp(kmer_vecs)
            tree = cluster_co2tree(kmer_vecs, circular_order)
            TreeIO.write([tree], out_file, 'newick')
def infer_tree_rl(model_path, node_dim, embedding_dim, n_encode_layers,
                  testset_dir, num_testset, size_testset, k_mer=6,
                  out_filename=''):
    """Infer one tree per MSA with a pretrained AttentionModel: k-mer
    vectors are fed to the model in greedy decode mode to obtain a
    circular order, which is converted to a tree and written as Newick.

    param: model_path: checkpoint path containing a 'model' state dict
    param: node_dim: AttentionModel input dimension
    param: embedding_dim: AttentionModel embedding size
    param: n_encode_layers: number of encoder layers in the model
    param: testset_dir: root directory holding one subdirectory per testset
    param: num_testset: number of testset subdirectories (named 1..N)
    param: size_testset: number of MSA files per testset
    param: k_mer: k-mer length for the vector representation
    param: out_filename: suffix inserted into the output file name
    """
    model = AttentionModel(node_dim=node_dim,
                           embedding_dim=embedding_dim,
                           n_encode_layers=n_encode_layers)
    print(' [*] Loading data from {}'.format(model_path))
    checkpoint = torch_load_cpu(model_path)
    model.load_state_dict(checkpoint['model'])
    model.set_decode_type('greedy')
    for set_idx in range(num_testset):
        print('testset[{}] Inferring tree by rl ...'.format(set_idx + 1))
        subdir = os.path.join(testset_dir, str(set_idx + 1))
        for msa_idx in range(size_testset):
            msa_file = os.path.join(subdir, '%d_rn.fasta' % (msa_idx + 1))
            out_file = os.path.join(
                subdir, '%d_rl%s.nwk' % (msa_idx + 1, out_filename))
            with open(msa_file, 'rt') as handle:
                sequences = [s for _, s in SimpleFastaParser(handle)]
            kmer_vecs = [get_kmer(s, k_mer=k_mer) for s in sequences]
            # batch dimension of 1 for a single-MSA forward pass
            batch = torch.tensor(kmer_vecs, dtype=torch.float).unsqueeze(0)
            _, _, circular_orders = model(batch, return_pi=True)
            order = circular_orders.tolist()[0]
            tree = co2tree(kmer_vecs, order)
            TreeIO.write([tree], out_file, 'newick')