def dict_to_bioalignment(d, alphabet='generic_alphabet', sorted=True):
    """
    Create a Biopython MultipleSeqAlignment from a dict
    mapping sequence ids to sequences.
    """
    alignment = MultipleSeqAlignment([])
    # look up the named alphabet in Bio.Alphabet (module removed in Biopython 1.78+)
    bio_alphabet = getattr(Bio.Alphabet, alphabet)
    for id, seq in d.items():
        seq_record = SeqRecord(Seq(seq, alphabet=bio_alphabet), id=id)
        alignment.append(seq_record)
    if sorted:
        alignment.sort()
    return alignment
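A minimal usage sketch for the helper above, assuming a Biopython release older than 1.78 (the last series that still ships Bio.Alphabet) and the imports the function relies on:

import Bio.Alphabet
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = dict_to_bioalignment({"seq_b": "ACGT", "seq_a": "ACGA"})
print(len(aln))   # 2
print(aln[0].id)  # "seq_a" -- records are sorted by id because sorted=True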
Example #2
def retrieve_alignment(tre, alnpath, taxonset=range(0, 101), delimiter='_'):
    """
    Parameters
    ----------------
    tre : single-copy treeswift tree generated from James's code
    alnpath : path to the phylip formatted alignment of the genes. The row labels should be a superset of the leafset of 'tre'
    seqlen : sequence length parameter, only the first seqlen columns are taken from the MSA
    taxonset: set, the taxon set of the entire dataset

    Returns the MSA that corresponds to the input tree.
    """
    aln = AlignIO.read(alnpath, "phylip")
    seqlen = len(aln[0].seq)
    blank = "-" * seqlen
    whitelist = set(tre.labels(True, False))
    rest = set(taxonset)
    res = MultipleSeqAlignment([])
    for r in aln[:, :seqlen]:
        if r.id in whitelist:
            # truncate the row label at the delimiter to obtain the taxon name
            rid = r.id.split(delimiter)[0]
            res.append(SeqRecord(r.seq, id=rid))
            rest.remove(rid)  # taxonset entries must match these truncated labels
    # pad taxa that are absent from the tree with all-gap rows
    for rst in rest:
        res.append(SeqRecord(Seq(blank), id=str(rst)))
    res.sort()
    return res
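A hypothetical invocation of retrieve_alignment; the file names, taxon labels and the treeswift import are placeholders rather than part of the original snippet:

import treeswift
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

tre = treeswift.read_tree_newick("gene1.tre")
# taxonset entries must match the row labels once truncated at the delimiter
msa = retrieve_alignment(tre, "gene1.phy", taxonset={"1", "2", "3"}, delimiter="_")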
Example #3
def test_trimal2(alb_resources, hf):
    tester = Alb.trimal(alb_resources.get_one("o p n"), 'all')
    assert hf.buddy2hash(tester) == "8faaf09741ddb3137653cb77ee66974a"
    tester = alb_resources.get_one("o p n")
    tester.alignments[0]._records = tester.alignments[0]._records[:5]
    Alb.trimal(tester, 'clean')
    assert hf.buddy2hash(tester) == "93a2aa21e6baf5ca70eb2de52ae8dbea"
    tester = alb_resources.get_one("o p n")
    tester_dir = TEMPDIR.subdir()
    tester.write("%s%strimal" % (tester_dir, os.path.sep))
    assert hf.buddy2hash(Alb.trimal(
        tester, 'gappyout')) == "2877ecfb201fc35211a4625f34c7afdd"
    """ Probably not a good idea to be calling binaries like this...
    real_trimal = Popen("trimal -in %s%strimal -gappyout" % (tester_dir, os.path.sep),
                        stdout=PIPE, shell=True).communicate()
    real_trimal = real_trimal[0].decode()
    with open("%s%strimal" % (tester_dir, os.path.sep), "w") as ofile:
        ofile.write(real_trimal)
    tester = Alb.AlignBuddy("%s%strimal" % (tester_dir, os.path.sep))
    assert hf.buddy2hash(tester) == "2877ecfb201fc35211a4625f34c7afdd"
    """
    records = [
        SeqRecord(Seq("A--G-")),
        SeqRecord(Seq("--T--")),
        SeqRecord(Seq("--TG-")),
        SeqRecord(Seq("A---C"))
    ]
    tester = Alb.AlignBuddy([MultipleSeqAlignment(records)])
    Alb.trimal(tester, "gappyout")
    assert "".join([str(rec.seq) for rec in tester.records()]) == ""
Example #4
 def test_get_subalignment_sequence_order_maintained(self):
     """
     Sequences given rearranged are still output in input order
     """
     result = AlignedSeq.get_sub_alignment_by_list_id(["s3", "s1"], self.alignment)
     expected = MultipleSeqAlignment([self.alignment[0], self.alignment[2]])
     self.assertTrue(msas_equal(expected, result))
Example #5
    def identify_polymorphisms(self):

        self.alignment = MultipleSeqAlignment(self.sequences)
        for col in range(self.alignment.get_alignment_length()):
            polmorphs = set(self.alignment[:,col])
            if len(polmorphs) > 1:
                self.polymorphisms[col] = polmorphs
Example #6
 def test_get_subalignment_with_interval(self):
     result = AlignedSeq.get_sub_alignment_by_list_id(
         ["s2", "s3"], self.alignment, [0, 2]
     )
     expected = MultipleSeqAlignment(
         [SeqRecord(Seq("C--"), id="s2"), SeqRecord(Seq("AAT"), id="s3"),]
     )
     self.assertTrue(msas_equal(expected, result))
Example #7
 def get_sub_alignment_by_list_id(self,
                                  id_list: List[str],
                                  alignment: MultipleSeqAlignment,
                                  interval=None):
     list_records = [record for record in alignment if record.id in id_list]
     sub_alignment = MultipleSeqAlignment(list_records)
     if interval:
         sub_alignment = sub_alignment[:, interval[0]:interval[1] + 1]
     return sub_alignment
Example #8
 def setUpClass(cls):
     cls.alignment = MultipleSeqAlignment(
         [
             SeqRecord(Seq("AAAT"), id="s1"),
             SeqRecord(Seq("C--C"), id="s2"),
             SeqRecord(Seq("AATT"), id="s3"),
             SeqRecord(Seq("GNGG"), id="s4"),
         ]
     )
Example #9
 def test_two_identical_sequences_clustered_together(self):
     alignment = MultipleSeqAlignment(
         [
             SeqRecord(Seq("AAAT"), id="s1"),
             SeqRecord(Seq("AAAT"), id="s2"),
             SeqRecord(Seq("C-CC"), id="s3"),
         ]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)
     self.assertEqual([["s1", "s2"], ["s3"]], result)
Example #10
 def test_sequences_in_short_interval_separate_clusters(self):
     alignment = MultipleSeqAlignment(
         [
             SeqRecord(Seq("AAAT"), id="s1"),
             SeqRecord(Seq("AATT"), id="s2"),
             SeqRecord(Seq("AAGT"), id="s3"),
         ]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], alignment, 5)
     # Each sequence is below min_match_len (5), so goes into own cluster
     self.assertEqual([["s1"], ["s2"], ["s3"]], result)
Example #11
 def test_all_sequences_below_min_match_len(self):
     alignment = MultipleSeqAlignment(
         [
             SeqRecord(Seq("AA---AT"), id="s1"),
             SeqRecord(Seq("AA---TT"), id="s2"),
             SeqRecord(Seq("CA--CAT"), id="s3"),
         ]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval(
         [0, len(alignment[0])], alignment, 6
     )
     self.assertEqual([["s1"], ["s2"], ["s3"]], result)
Example #12
 def modeller_automodel(self, query: SeqRecord, results: Path,
                        num_align: int, atom_files_dir: Path):
     from modeller import environ
     from modeller.automodel import automodel
     for model_index, r in enumerate(
             np.load(results, allow_pickle=True)[:num_align]):
         try:
             aln = AlignIO.read(StringIO(r[-2][0]), 'clustal')
          except Exception:
             logging.error(
                 f'Failed to parse alignment: {r[0]} -> {r[2]} -> {r[4]} -> {r[6]}'
             )
             continue
         assert query.id == aln[0].id and aln[-1].id == r[-3]
         q_rec, t_rec = self._remove_gaps(aln[0], aln[-1])
         try:
             t_rec = self._remove_missing_res(
                 t_rec, (atom_files_dir / aln[-1].id[2:4] /
                         f'{aln[-1].id}.ent').resolve().as_posix())
         except FileNotFoundError as e:
             logging.exception(e)
             continue
         q_rec.name, t_rec.name = '', ''
         q_rec.description = f'sequence:{q_rec.id}::::::::'
         t_rec.description = f'structureX:{t_rec.id}::{t_rec.id[-2].upper()}::{t_rec.id[-2].upper()}::::'
         aln = MultipleSeqAlignment([q_rec, t_rec])
         out_d = results.resolve().parent
         if (out_d / f'{aln[0].id}_{model_index+1}.pdb').exists():
             continue
         cwd = os.getcwd()
         with tempfile.TemporaryDirectory() as tmpdir:
             try:
                 os.chdir(tmpdir)
                 AlignIO.write(aln, 'aln.pir', 'pir')
                 env = environ()
                 env.io.atom_files_directory = [
                     (atom_files_dir / aln[1].id[2:4]).resolve().as_posix()
                 ]
                 mod = automodel(env,
                                 'aln.pir',
                                 knowns=[aln[1].id],
                                 sequence=aln[0].id)
                 mod.make()
                 shutil.copy(
                     list(Path().glob('*.B*.pdb'))[0],
                     out_d / f'{aln[0].id}_{model_index+1}.pdb')
             except Exception as e:
                 logging.error(
                     f'knowns=[{aln[1].id}], sequence={aln[0].id}')
                 logging.exception(e)
             finally:
                 os.chdir(cwd)
Example #13
 def _parse(self):
     for i, l in enumerate(self.stdout):
          if re.match(r'^\(":" denotes', l):
             a = SeqRecord(Seq(self.stdout[i + 1],
                               alphabet=generic_protein),
                           id='Protein_A')
             b = SeqRecord(Seq(self.stdout[i + 3],
                               alphabet=generic_protein),
                           id='Protein_B')
             self.alignment = MultipleSeqAlignment([a, b])
             break
     for l in self.stdout:
         if re.match('^TM-score', l):
             self.tmscore = float(
                 l.split('=')[1].split('(')[0].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^MaxSub-score', l):
             self.maxsub = float(
                 l.split('=')[1].split('(')[0].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^GDT-TS-score', l):
             self.gdtts = float(
                 l.split('=')[1].split('%')[0].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^GDT-HA-score', l):
             self.gdtha = float(
                 l.split('=')[1].split('%')[0].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^RMSD of', l):
             self.rmsd = float(l.split('=')[1].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^Number of residues in common', l):
             self.num_res_in_common = int(l.split('=')[1].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^Structure1: ', l):
             self.len_a = int(
                 l.split('=')[1].split('(')[0].replace(' ', ''))
             break
     for l in self.stdout:
         if re.match('^Structure2: ', l):
             self.len_b = int(
                 l.split('=')[1].split('(')[0].replace(' ', ''))
             break
Example #14
    def test_one_long_one_short_sequence_separate_and_ordered_clusters(self):
        alignment = MultipleSeqAlignment(
            [
                SeqRecord(Seq("AATTAATTATATAATAAC"), id="s1"),
                SeqRecord(Seq("A--------------AAT"), id="s2"),
            ]
        )
        order_1 = AlignedSeq.kmeans_cluster_seqs_in_interval(
            [0, len(alignment[0])], alignment, 5
        )
        self.assertEqual(order_1, [["s1"], ["s2"]])

        order_2 = AlignedSeq.kmeans_cluster_seqs_in_interval(
            [0, len(alignment[0])], alignment[::-1], 5
        )
        self.assertEqual(order_2, [["s2"], ["s1"]])
Example #15
 def _parse(self):
     for i, l in enumerate(self.stdout):
         if re.match('^TM-score=', l):
             self.tmscore = (float(self.stdout[i].split(' ')[1]),
                             float(self.stdout[i + 1].split(' ')[1]))
             break
     for i, l in enumerate(self.stdout):
          if re.match(r'^\(":" denotes', l):
             a = SeqRecord(Seq(self.stdout[i + 1],
                               alphabet=generic_protein),
                           id=Path(self.protein_A).stem + '&' +
                           Path(self.protein_B).stem,
                           description=f'TM-score={self.tmscore[0]}')
             b = SeqRecord(Seq(self.stdout[i + 3],
                               alphabet=generic_protein),
                           id=Path(self.protein_B).stem + '&' +
                           Path(self.protein_A).stem,
                           description=f'TM-score={self.tmscore[1]}')
             self.alignment = MultipleSeqAlignment([a, b])
             break
Example #16
 def test_first_sequence_placed_in_first_cluster(self):
     """
     Runs kmeans clustering on randomly generated multiple sequence alignments
     """
     seq_len = 20
     num_seqs = 20
     bases = list(standard_bases)
     # Function has different behaviour at below and above seq_len
     for seq_len in [seq_len - 1, seq_len + 1]:
         with self.subTest(min_match_len=seq_len):
             for _ in range(20):  # Run on a number of random alignments
                 records = []
                 for i in range(num_seqs):
                     rand_seq = "".join(
                         [random.choice(bases) for _ in range(seq_len)]
                     )
                     records.append(SeqRecord(Seq(rand_seq), id=f"s{i}"))
                 alignment = MultipleSeqAlignment(records)
                 result = AlignedSeq.kmeans_cluster_seqs_in_interval(
                     [0, seq_len - 1], alignment, 1
                 )
                 self.assertTrue(result[0][0] == "s0")
Example #17
 def _local_align(self, record_a: SeqRecord, record_b: SeqRecord,
                  open_gap_score: int):
     aligner = Align.PairwiseAligner()
     aligner.mode = 'local'
     aligner.substitution_matrix = substitution_matrices.load('BLOSUM62')
     aligner.open_gap_score = open_gap_score
     aligner.extend_gap_score = -1
     aln = aligner.align(
         record_a.seq.ungap('-').upper(),
         record_b.seq.ungap('-').upper())[0]
      # str(aln) is assumed to render as three lines (target, match line, query),
      # which is the PairwiseAligner text formatting this parsing relies on
      seq_a = Seq(
          str(aln).splitlines()[0].replace(' ', '-'), generic_protein)
      seq_b = Seq(
          str(aln).splitlines()[2].replace(' ', '-'), generic_protein)
     return MultipleSeqAlignment([
         SeqRecord(seq_a, id=record_a.id),
         SeqRecord(seq_b, id=record_b.id)
     ],
                                 annotations={
                                     'score': aln.score,
                                     'path': aln.path,
                                     'aligned': aln.aligned
                                 })
Example #18
                    counter -
                    1] + " but gene " + keyList[n] + " (has " + j + ")\n"

        n = n + 1
    counter = counter + 1

print "I can correct these spelling mistakes...If you want me to..;) \n Do you?? \n 1. Yes \n 2. No \n"

choice = input('\n')
while choice == 1:
    print "Enter the correct taxon name \n"
    usrInp = raw_input('\n')

    for filename in fileList:
        handle = open(filename, 'rU')
        record = list(SeqIO.parse(handle, 'nexus'))
        msa = MultipleSeqAlignment(record)
        for i, val in enumerate(msa):
            if 1.0 > float([x == y for (x, y) in zip(usrInp, msa[i].id)
                            ].count(True)) / len(msa[i].id) > 0.8:
                msa[i].id = usrInp
            else:
                print "No Spelling mistakes found in file %s \n" % filename

        fp = open(filename, 'w')
        SeqIO.write(msa, fp, typeList[file_format - 1])
        handle.close()
        fp.close()
    print "Want me to do some more editing?? \n Well! I can do this whole day \n What about you?? \n 1. Yes \n 2. No \n"
    choice = input('\n')
Example #19
    #print(dir(multiple_alignment))
    #sys.exit(0)

    seqs = []
    for seqrec in multiple_alignment:
        #print(dir(seqrec))
        try:
            name_id = seqrec.id.partition(" ")[0].partition(".")[0]
            #print(name_id)
            seqrec.id = organisms[name_id]
            if seqrec.id in organisms.values():
                seqs.append(seqrec)
        except Exception:  # record id could not be mapped to an organism; skip it
            continue

    new_alignment = MultipleSeqAlignment(records=seqs)
    #print(len(new_alignment))

    AlignIO.write(new_alignment, "mm9_" + rg.name + ".fa", "fasta")

    process = subprocess.Popen([
        "/home/joseph/Apps/PhyloCSF/PhyloCSF", "29mammals",
        "mm9_" + rg.name + ".fa", "--removeRefGaps", "--strategy=omega",
        "--orf=StopStop3", "--minCodons=25", "--frames=3"
    ],
                               stdout=subprocess.PIPE)
    out, err = process.communicate()
    print(out)
    #print(out.split("\t")[2])
    #print(out.split("\t")[3])
    #print(out.split("\t")[4])
Example #20
        seqAA = safe_translate(seqCDS_ungapped)

        scoreAA, refalnAA, seqalnAA = align_pairwise(refAA, seqAA)
        if scoreAA < 0 or sum(
                seqAA.count(x)
                for x in ['*', 'X']) > 5 or refalnAA.count('-') > 5:
            print(seq.id, "didn't translate properly")
            continue

        seqCDS_aln = seq5pUTR
        pos = 0
        for aa_ref, aa_seq in zip(refalnAA, seqalnAA):
            if aa_seq == '-':
                seqCDS_aln += '---'
                # if the nucleotide sequence is also gapped here
                # (i.e. missing data at the 5' or 3' end), advance pos
                if seqCDS_ungapped[pos:pos + 3] == '---':
                    pos += 3
            else:
                if len(seqCDS_ungapped) >= pos + 3:
                    seqCDS_aln += seqCDS_ungapped[pos:pos + 3]
                else:
                    seqCDS_aln += '---'
                pos += 3

        seq.seq = Seq.Seq(''.join(seqCDS_aln) + seq3pUTR)
        alignment.append(seq)

    # output
    AlignIO.write(MultipleSeqAlignment(alignment), args.output, 'fasta')
Example #21
 def test_ambiguous_sequences_in_short_interval_separate_clusters(self):
     alignment = MultipleSeqAlignment(
         [SeqRecord(Seq("ARAT"), id="s1"), SeqRecord(Seq("WAAT"), id="s2"),]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], alignment, 5)
     self.assertEqual([["s1"], ["s2"]], result)
Example #22
 def test_get_subalignment_sequence_order_maintained2(self):
     result = AlignedSeq.get_sub_alignment_by_list_id(["s1", "s3"], self.alignment)
     expected = MultipleSeqAlignment([self.alignment[0], self.alignment[2]])
     self.assertTrue(msas_equal(expected, result))
Example #23
 def test_one_seq_returns_single_id(self):
     alignment = MultipleSeqAlignment([SeqRecord(Seq("AAAT"), id="s1")])
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)
     self.assertEqual(result, [["s1"]])
Example #24
def impute_ancestors_dnapars(seqs,
                             gl_seq,
                             scratch_dir,
                             gl_name='germline',
                             verbose=True):
    """
    Compute ancestral states via maximum parsimony

    @param seqs: list of sequences
    @param gl_seq: germline sequence
    @param scratch_dir: where to write intermediate dnapars files
    @param gl_name: name of germline (must be less than 10 characters long)
    @param verbose: if True, print the dnapars command line before running it

    @return genes_line: information needed to output imputed germline data
    @return seqs_line: information needed to output imputed sequence data
    """
    from gctree.bin.phylip_parse import parse_outfile

    assert (len(gl_name) < 10)

    infile, config, outfile = [
        os.path.join(scratch_dir, fname) for fname in [
            'infile',
            'dnapars.cfg',
            'outfile',
        ]
    ]

    aln = MultipleSeqAlignment([SeqRecord(Seq(gl_seq), id=gl_name)])

    # sequence ID must be less than ten characters, but also dnapars sets internal node
    # names to 1, 2, 3, ..., so name them numbers descending from 100 million, hoping
    # we won't ever have a clone that big...
    for idx, seq in enumerate(seqs):
        aln.append(SeqRecord(Seq(seq), id=str(99999999 - idx)))

    # dnapars uses the name "infile" as default input phylip file
    with open(infile, 'w') as phylip_file:
        phylip_file.write(aln.format('phylip'))

    # and we need to tell it the line where the root sequence occurs
    with open(infile, 'r') as phylip_file:
        for lineno, line in enumerate(phylip_file):
            if line.startswith(gl_name):
                naive_idx = str(lineno)

    # arcane user options for dnapars
    # 'O', naive_idx: the location of the outgroup root
    # 'S', 'Y': less thorough search; runs much faster but output is less exhaustive
    # 'J', 13, 10: randomize input ("jumble") using seed 13 and jumbling 10 times
    # 4: print out steps in each site (to get all nucleotide info)
    # 5: print sequences in at all nodes (to get ancestors)
    # '.': use dot-differencing for display
    # 'Y': accept these options
    with open(config, 'w') as cfg_file:
        cfg_file.write('\n'.join(
            ['O', naive_idx, 'S', 'Y', 'J', '13', '10', '4', '5', '.', 'Y']))
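    # With the germline written first it sits on line 1 of the phylip file, so
    # naive_idx is normally "1" and dnapars.cfg ends up containing the options
    # O, 1, S, Y, J, 13, 10, 4, 5, ., Y -- one per line.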

    # defer to command line to construct parsimony trees and ancestral states
    # dnapars has weird behavior if outfile and outtree already exist o_O
    cmd = [
        'cd', scratch_dir, '&& rm -f outfile outtree && dnapars <',
        os.path.basename(config), '> dnapars.log'
    ]
    if verbose:
        print "Calling:", " ".join(cmd)
    res = subprocess.call([" ".join(cmd)], shell=True)

    # phew, finally got some trees
    trees = parse_outfile(outfile, countfile=None, naive=gl_name)

    # take first parsimony tree
    genes_line = []
    seq_line = []
    for idx, descendant in enumerate(trees[0].traverse('preorder')):
        if descendant.is_root():
            descendant.name = gl_name
        else:
            # use dummy name for internal node sequences
            descendant.name = '-'.join([descendant.up.name, descendant.name])
            if [descendant.up.name,
                    descendant.up.sequence.lower()] not in genes_line:
                genes_line.append(
                    [descendant.up.name,
                     descendant.up.sequence.lower()])
            seq_line.append([
                descendant.up.name, descendant.name,
                descendant.sequence.lower()
            ])

    return genes_line, seq_line
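A hypothetical call to the function above; it needs PHYLIP's dnapars binary on the PATH and gctree installed, and the sequences and scratch directory below are placeholders:

genes_line, seq_line = impute_ancestors_dnapars(
    seqs=["ACGTACGTAC", "ACGAACGTAC", "ACGTACGTTC"],
    gl_seq="ACGTACGTAA",
    scratch_dir="/tmp/dnapars_scratch",
    gl_name="germline",
)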
Example #25
 def test_two_seqs_one_below_min_match_len_separate_clusters(self):
     alignment = MultipleSeqAlignment(
         [SeqRecord(Seq("AATTTAT"), id="s1"), SeqRecord(Seq("AA---AT"), id="s2")]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 5], alignment, 5)
     self.assertEqual(result, [["s1"], ["s2"]])
Example #26
 def get_sub_alignment_by_list_id(self, list_of_id, interval=None):
     list_records = [record for record in self.alignment if record.id in list_of_id]
     sub_alignment = MultipleSeqAlignment(list_records)
     if interval:
         sub_alignment = sub_alignment[:, interval[0] : interval[1] + 1]
     return sub_alignment
Example #27
class PolymorphismSampler(object):
    """docstring for PolymorphismSampler"""
    def __init__(self):
        super(PolymorphismSampler, self).__init__()
        self.sequences = None
        self.alignment = None
        self.polymorphisms = dict()
        self.subsampled = set()
        self.num_positions = []
        self.num_polymorphisms = []
        
    def read_sequences(self, handle):
        sequences = [s for s in SeqIO.parse(handle, 'fasta')]

        counts = Counter([len(s) for s in sequences])
        mc_length = counts.most_common(1)[0][0]
        
        filtered_sequences = list()
        for s in sequences:
            if len(s.seq) == mc_length:
                filtered_sequences.append(s)
        self.sequences = filtered_sequences

    def identify_polymorphisms(self):

        self.alignment = MultipleSeqAlignment(self.sequences)
        for col in range(self.alignment.get_alignment_length()):
            polmorphs = set(self.alignment[:,col])
            if len(polmorphs) > 1:
                self.polymorphisms[col] = polmorphs

    def number_of_polymorphisms(self):
        total_polymorphisms = 0
        for pos, polymorphs in self.polymorphisms.items():
            total_polymorphisms += len(polymorphs)

        return total_polymorphisms

    def subsample(self):
        while len(self.polymorphisms.keys()) > 0:
            try:

                # Choose a seqrecord at random, based on LH sampling criteria.
                pos = choice(list(self.polymorphisms.keys()))
                letter = choice(list(self.polymorphisms[pos]))
                filtered = MultipleSeqAlignment([s for s in self.alignment if s[pos] == letter])
                seqrecord = choice(filtered)
                self.subsampled.add(seqrecord)

                # Remove polymorphisms
                for pos in self.polymorphisms.keys():
                    if seqrecord.seq[pos] in self.polymorphisms[pos]:
                        self.polymorphisms[pos].remove(seqrecord.seq[pos])

                # Update data
                self.polymorphisms = {k:v for k,v in self.polymorphisms.items() if len(v) > 0}
                self.num_polymorphisms.append(self.number_of_polymorphisms())
                self.num_positions.append(len(self.polymorphisms.keys()))

            except IndexError:
                break
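A minimal usage sketch of the sampler above; the FASTA path is a placeholder, and the records are expected to be pre-aligned so that most of them share a single length:

sampler = PolymorphismSampler()
with open("aligned_sequences.fasta") as handle:
    sampler.read_sequences(handle)
sampler.identify_polymorphisms()
sampler.subsample()
# the retained records jointly cover every polymorphic state seen per column
print(len(sampler.subsampled))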
Example #28
 def _aligned_number(self, aln: MultipleSeqAlignment):
     return len([
         i for i in range(aln.get_alignment_length())
         if '-' not in aln[:, i]
     ])
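An illustration of what _aligned_number counts, assuming the Biopython imports used in the other examples; it returns the number of columns that contain no gap in any row:

aln = MultipleSeqAlignment([
    SeqRecord(Seq("A-CG"), id="a"),
    SeqRecord(Seq("AT-G"), id="b"),
])
# columns 0 and 3 are gap-free in both rows, so _aligned_number(aln) == 2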
Example #29
    def search_layer2_evalue_sum_i_blast_merged(self, query: SeqRecord,
                                                graphml: Path, num_align: int):
        graph = networkx.read_graphml(graphml)
        path = []
        for nei1 in graph.neighbors(query.id):
            if 'UniRef50' not in graph.nodes[nei1]['labels'].split(':'):
                continue
            for nei2 in graph.neighbors(nei1):
                if 'UniRef50' not in graph.nodes[nei2]['labels'].split(':'):
                    continue
                for nei3 in graph.neighbors(nei2):
                    if 'SCOP95' not in graph.nodes[nei3]['labels'].split(':'):
                        continue
                    score = graph.get_edge_data(query.id, nei1)['evalue'] \
                            + graph.get_edge_data(nei1, nei2)['evalue'] + graph.get_edge_data(nei2, nei3)['evalue']
                    path.append([
                        query.id, None, nei1, None, nei2, None, nei3, None,
                        score
                    ])
        path = sorted(path, key=lambda _: _[-1])
        # dedup
        results, seen = [], []
        for p in path:
            if p[-3] in seen:
                continue
            seen.append(p[-3])
            results.append(p)
        for r in results[:num_align]:
            n1_seq = query if r[2] == query.id else self._get_seq(
                'uniref50', r[2])
            n2_seq = query if r[4] == query.id else self._get_seq(
                'uniref50', r[4])
            n3_seq = query if r[6] == query.id else self._get_seq(
                'scop95', r[6])
            with tempfile.TemporaryDirectory() as t:
                tmpdir = Path(t)
                SeqIO.write(query, tmpdir / 'query.fasta', 'fasta')
                SeqIO.write(n1_seq, tmpdir / 'n1.fasta', 'fasta')
                r[1], _ = NcbideltablastCommandline(
                    query=(tmpdir / 'query.fasta').as_posix(),
                    subject=(tmpdir / 'n1.fasta').as_posix(),
                    use_sw_tback=True,
                    outfmt=5,
                    rpsdb='cdd_delta')()
                hsp1 = SearchIO.read(StringIO(r[1]), 'blast-xml')[0][0]
                rec1, rec2 = hsp1.aln[0], hsp1.aln[1]
                seq1, seq2 = rec1.seq.tomutable(), rec2.seq.tomutable()
                aln1 = hsp1.aln
                SeqIO.write(aln1[1], tmpdir / 'n1.fasta', 'fasta')
                SeqIO.write(n2_seq, tmpdir / 'n2.fasta', 'fasta')
                r[3], _ = NcbideltablastCommandline(
                    query=(tmpdir / 'n1.fasta').as_posix(),
                    subject=(tmpdir / 'n2.fasta').as_posix(),
                    use_sw_tback=True,
                    outfmt=5,
                    rpsdb='cdd_delta')()
                hsp2 = SearchIO.read(StringIO(r[3]), 'blast-xml')[0][0]
                rec3, rec4 = hsp2.aln[0], hsp2.aln[1]
                seq3, seq4 = rec3.seq.tomutable(), rec4.seq.tomutable()
                aln2 = hsp2.aln
                SeqIO.write(aln2[1], tmpdir / 'n2.fasta', 'fasta')
                SeqIO.write(n3_seq, tmpdir / 'n3.fasta', 'fasta')
                r[5], _ = NcbideltablastCommandline(
                    query=(tmpdir / 'n2.fasta').as_posix(),
                    subject=(tmpdir / 'n3.fasta').as_posix(),
                    use_sw_tback=True,
                    outfmt=5,
                    rpsdb='cdd_delta')()
                try:
                    hsp3 = SearchIO.read(StringIO(r[5]), 'blast-xml')[0][0]
                except IndexError:
                    print(hsp1)
                    r[7] = (None, None)
                    continue
                rec5, rec6 = hsp3.aln[0], hsp3.aln[1]
                seq5, seq6 = rec5.seq.tomutable(), rec6.seq.tomutable()

                seq3 = '-' * hsp2.query_start + seq3
                seq4 = '-' * hsp2.query_start + seq4
                seq5 = '-' * (hsp2.query_start + hsp3.query_start) + seq5
                seq6 = '-' * (hsp2.query_start + hsp3.query_start) + seq6

                for i in range(hsp2.query_start, len(seq2)):
                    if i >= len(seq3):
                        seq3.append('-')
                        seq4.append('-')
                    elif seq2[i] == '-' and seq3[i] != '-':
                        seq3.insert(i, '-')
                        seq4.insert(i, '-')
                    elif seq3[i] == '-' and seq2[i] != '-':
                        seq2.insert(i, '-')
                        seq1.insert(i, '-')

                append = len(seq2) - len(seq3)
                seq3 = seq3 + '-' * append
                seq4 = seq4 + '-' * append

                for i in range(hsp2.query_start + hsp3.query_start, len(seq4)):
                    if i >= len(seq5):
                        seq5.append('-')
                        seq6.append('-')
                    elif seq4[i] == '-' and seq5[i] != '-':
                        seq5.insert(i, '-')
                        seq6.insert(i, '-')
                    elif seq5[i] == '-' and seq4[i] != '-':
                        seq4.insert(i, '-')
                        seq3.insert(i, '-')
                        seq2.insert(i, '-')
                        seq1.insert(i, '-')

                append = len(seq4) - len(seq5)
                seq5 = seq5 + '-' * append
                seq6 = seq6 + '-' * append

                rec1.seq = seq1
                rec2.seq = seq2
                rec3.seq = seq3
                rec4.seq = seq4
                rec5.seq = seq5
                rec6.seq = seq6
                try:
                    r[7] = (MultipleSeqAlignment(
                        [rec1, rec2, rec3, rec4, rec5,
                         rec6]).format('clustal'), None)
                except ValueError:
                    print(hsp1)
                    r[7] = (None, None)
                    continue

        return results
Example #30
 def test_two_identical_seqs_returns_two_ids_clustered(self):
     alignment = MultipleSeqAlignment(
         [SeqRecord(Seq("AAAT"), id="s1"), SeqRecord(Seq("AAAT"), id="s2"),]
     )
     result = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)
     self.assertEqual(result, [["s1", "s2"]])