def kmer2distance(testset_dir, num_testset, size_testset, size_phy, k_mer=6):
    """Compute pairwise euclidean distances between k-mer vectors of each
    MSA file and write them out as MEGA-format distance matrices.

    param:
        testset_dir: root directory with one subdirectory per testset
        num_testset: number of testset subdirectories (named '1', '2', ...)
        size_testset: number of '<j>_rn.fasta' files per testset
        size_phy: number of taxa expected in each alignment
        k_mer: k-mer length used to vectorize each sequence
    """
    for i in range(num_testset):
        print('testset[{}] calculate the distance matrix ...'.format(i + 1))
        for j in range(size_testset):
            msa_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rn.fasta' % (j + 1))
            out_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_dist.meg' % (j + 1))
            with open(msa_file, 'rt') as f_read:
                seq_list = [seq for _, seq in SimpleFastaParser(f_read)]
            vecs = np.array([get_kmer(seq, k_mer=k_mer) for seq in seq_list])
            dist = euclidean_distances(vecs, vecs)

            with open(out_file, 'wt') as f_write:
                # MEGA distance format header
                f_write.write('#mega\n')
                # BUG FIX: the matrix is computed with euclidean_distances,
                # so the title must not claim it is a cosine distance matrix.
                f_write.write(
                    '!TITLE euclidean distance matrix of k-mer vectors;\n')
                f_write.write(
                    '!Format DataType=distance NTaxa={};\n'.format(size_phy))
                f_write.write('\n')

                # taxa labels: #1 .. #NTaxa
                f_write.write('\n'.join('#{}'.format(k + 1)
                                        for k in range(size_phy)))
                f_write.write('\n\n\n')

                # lower-triangular matrix: row r lists d(r, 0) .. d(r, r-1)
                for r in range(1, size_phy):
                    f_write.write(' '.join(str(a) for a in dist[r, :r]))
                    f_write.write('\n')
 def _get_kmer_vecs(self, seqs_batch, k_mer=6):
     """Vectorize every sequence in a batch of sequence lists.

     param:
         seqs_batch: list of sequence lists
         k_mer: k-mer length forwarded to get_kmer
     return:
         nested list mirroring seqs_batch, with each sequence replaced
         by its k-mer vector
     """
     batch_vectors = []
     for sequences in seqs_batch:
         batch_vectors.append([get_kmer(s, k_mer) for s in sequences])
     return batch_vectors
def infer_kmer_vector(seq_file, pickle_file, k_mer=5):
    """Convert every FASTA sequence to a k-mer vector and save the result.

    param:
        seq_file: path of the input FASTA file
        pickle_file: destination path passed to torch.save
        k_mer: k-mer length
    """
    print('Inferring k-mer vector with k={}'.format(k_mer))
    with open(seq_file, 'rt') as f_read:
        records = tqdm(SimpleFastaParser(f_read))
        kmer_rows = [get_kmer(record[1], k_mer=k_mer) for record in records]

    tensor = torch.tensor(kmer_rows, dtype=torch.float)
    torch.save(tensor, pickle_file)
def _write_variance_csv(path, method_types, values_by_method):
    """Write one CSV row per method: '<method>,v1,v2,...' (2-decimal floats)."""
    with open(path, 'wt') as f_write:
        for method_type in method_types:
            ss = ['{:.2f}'.format(num)
                  for num in values_by_method[method_type]]
            f_write.write('{},{}\n'.format(method_type, ','.join(ss)))


def compare_variance(method_types,
                     testset_dir,
                     num_testset,
                     size_testset,
                     size_phy,
                     out_filename=None,
                     k_mer=6,
                     silent=False):
    """Compare tree variance across inference methods on every testset.

    For each testset and each MSA file, builds the euclidean distance
    matrix of k-mer vectors, scores each method's inferred tree against
    it with cal_tree_varience, then writes a per-testset CSV and an
    overall CSV of per-testset averages.

    param:
        method_types: suffixes identifying the '<j>_<method>.nwk' tree files
        testset_dir: root directory with one subdirectory per testset
        num_testset: number of testset subdirectories (named '1', '2', ...)
        size_testset: number of samples per testset
        size_phy: number of taxa per tree
        out_filename: CSV filename (defaults to 'variance.csv')
        k_mer: k-mer length used to vectorize sequences
        silent: unused; kept for backward-compatible interface
    """
    if out_filename is None:
        out_filename = 'variance.csv'

    total_result_file = os.path.join(testset_dir, out_filename)
    total_results = {method_type: [] for method_type in method_types}

    for i in range(num_testset):
        # fixed typo in progress message ('varience' -> 'variance')
        print('testset[{}] Calculating variance ...'.format(i + 1))
        results = {method_type: [] for method_type in method_types}

        for j in range(size_testset):
            # distance matrix from the k-mer vectors of this MSA
            msa_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rn.fasta' % (j + 1))
            with open(msa_file, 'rt') as f_read:
                seq_list = [seq for _, seq in SimpleFastaParser(f_read)]
            vecs = np.array([get_kmer(seq, k_mer=k_mer) for seq in seq_list])
            dist = euclidean_distances(vecs, vecs)

            for method_type in method_types:
                tree_file = os.path.join(testset_dir, str(i + 1),
                                         '%d_%s.nwk' % (j + 1, method_type))
                tree = TreeIO.read(tree_file, 'newick')
                var = cal_tree_varience(tree, dist, size_phy)
                results[method_type].append(var)

        # per-testset averages feed the overall summary file
        for method_type in method_types:
            total_results[method_type].append(np.mean(results[method_type]))

        _write_variance_csv(
            os.path.join(testset_dir, str(i + 1), out_filename),
            method_types, results)

    _write_variance_csv(total_result_file, method_types, total_results)
# Example #5
    def __init__(self, seq_file=None, vec_path=None, k_mer=5, seq2vec=None):
        """Load sequence vectors from a saved tensor, or build them from FASTA.

        param:
            seq_file: FASTA file to vectorize when no saved tensor is given
            vec_path: path of a torch-saved vector tensor (takes priority)
            k_mer: k-mer length used when vectorizing seq_file
            seq2vec: unused; kept for backward-compatible interface
        raises:
            RuntimeError: when neither vec_path nor seq_file is provided
        """
        if vec_path is not None:
            self.vectors = torch.load(vec_path)
            return
        if seq_file is None:
            raise RuntimeError('vec_path or seq_file is required!')
        with open(seq_file, 'rt') as f_read:
            kmer_rows = [get_kmer(seq, k_mer=k_mer)
                         for _, seq in SimpleFastaParser(f_read)]
        self.vectors = torch.tensor(kmer_rows, dtype=torch.float)
def infer_tree_tsp(testset_dir, num_testset, size_testset, k_mer=6):
    """Infer a tree per MSA file by solving a TSP over k-mer vectors.

    The TSP solution is a circular order of the sequences, which is then
    converted into a tree and written in newick format.

    param:
        testset_dir: root directory with one subdirectory per testset
        num_testset: number of testset subdirectories (named '1', '2', ...)
        size_testset: number of '<j>_rn.fasta' files per testset
        k_mer: k-mer length
    """
    for idx in range(num_testset):
        print('testset[{}] Inferring tree by tsp ...'.format(idx + 1))
        subdir = os.path.join(testset_dir, str(idx + 1))
        for sample in range(size_testset):
            msa_file = os.path.join(subdir, '%d_rn.fasta' % (sample + 1))
            out_file = os.path.join(subdir, '%d_tsp.nwk' % (sample + 1))
            with open(msa_file, 'rt') as handle:
                sequences = [s for _, s in SimpleFastaParser(handle)]
            kmer_vecs = [get_kmer(s, k_mer=k_mer) for s in sequences]

            # circular order from the TSP solver, then order -> tree
            order = gurobi_tsp(kmer_vecs)
            tree = cluster_co2tree(kmer_vecs, order)
            TreeIO.write([tree], out_file, 'newick')
def infer_tree_rl(model_path,
                  node_dim,
                  embedding_dim,
                  n_encode_layers,
                  testset_dir,
                  num_testset,
                  size_testset,
                  k_mer=6,
                  out_filename=''):
    """Infer trees with a pretrained RL attention model.

    Loads an AttentionModel checkpoint, greedily decodes a circular order
    for the k-mer vectors of each MSA file, and writes the tree built
    from that order in newick format.

    param:
        model_path: checkpoint file containing the 'model' state dict
        node_dim, embedding_dim, n_encode_layers: AttentionModel config
        testset_dir: root directory with one subdirectory per testset
        num_testset: number of testset subdirectories (named '1', '2', ...)
        size_testset: number of '<j>_rn.fasta' files per testset
        k_mer: k-mer length
        out_filename: suffix inserted into the '<j>_rl<suffix>.nwk' name
    """
    model = AttentionModel(node_dim=node_dim,
                           embedding_dim=embedding_dim,
                           n_encode_layers=n_encode_layers)
    print('  [*] Loading data from {}'.format(model_path))
    checkpoint = torch_load_cpu(model_path)
    model.load_state_dict(checkpoint['model'])
    model.set_decode_type('greedy')

    for idx in range(num_testset):
        print('testset[{}] Inferring tree by rl ...'.format(idx + 1))
        subdir = os.path.join(testset_dir, str(idx + 1))
        for sample in range(size_testset):
            msa_file = os.path.join(subdir, '%d_rn.fasta' % (sample + 1))
            out_file = os.path.join(subdir,
                                    '%d_rl%s.nwk' % (sample + 1, out_filename))
            with open(msa_file, 'rt') as handle:
                sequences = [s for _, s in SimpleFastaParser(handle)]
            kmer_vecs = [get_kmer(s, k_mer=k_mer) for s in sequences]

            # batch of one for the model's expected input shape
            batch = torch.tensor(kmer_vecs, dtype=torch.float).unsqueeze(0)
            _, _, circular_orders = model(batch, return_pi=True)
            order = circular_orders.tolist()[0]

            tree = co2tree(kmer_vecs, order)
            TreeIO.write([tree], out_file, 'newick')