示例#1
0
 def test_incompletegenenameend(self):
     """Gene id 'NP_0010352' must produce only empty (all-None) records.

     The test name suggests the identifier is deliberately cut short at its
     end.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     records = internal.get_gene_fastas(genes=['NP_0010352'],
                                        dbpaths=self.dbpaths,
                                        specieslist=self.specieslist)
     for header, peptide, organism in records:
         # Same order as the three fields of each record.
         for field in (header, peptide, organism):
             self.assertEqual(field, None)
示例#2
0
    def test_nogenelist(self):
        """``genes`` given as a bare string (not a list) must still match.

        Every yielded record is expected to carry the
        '>Amel|NP_001035293.1' defline, the peptide sequence below and a
        species of None.
        NOTE(review): the assertions live inside the loop, so nothing is
        checked if get_gene_fastas yields no records at all.
        """
        expected_defline = '>Amel|NP_001035293.1'
        expected_seq = 'MPILIPHRNPASANYYENKDGARIVKASHFELDYMLGRKITFFCMATGFPRPEITWLKDGIELYHHKFFQVHEWPVGNDTLKSKMEIDPATQKDAGYYECQADNQYAVDRRGFRTDYVMISY'
        hits = internal.get_gene_fastas(genes='NP_001035293.1',
                                        dbpaths=self.dbpaths,
                                        specieslist=self.specieslist)
        for header, peptide, organism in hits:
            self.assertEqual(header, expected_defline)
            self.assertEqual(peptide, expected_seq)
            self.assertEqual(organism, None)
示例#3
0
 def test_incompletegenenameend(self):
     """Looking up 'NP_0010352' must give records whose fields are all None.

     The test name suggests the identifier is intentionally incomplete at
     its end.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     results = internal.get_gene_fastas(
         genes=['NP_0010352'],
         dbpaths=self.dbpaths,
         specieslist=self.specieslist)
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, None)
         self.assertEqual(peptide, None)
         self.assertEqual(organism, None)
示例#4
0
 def test_duplicate_matches(self):
     """Both 'XP' and 'XP_006570708.1' must resolve to the same single entry.

     The first requested name is a prefix of the second; every record that
     comes back has to be the '>Amel|XP_006570708.1' entry with the
     sequence below and a species of None.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     expected_defline = '>Amel|XP_006570708.1'
     expected_seq = 'MEIAASAMLDGLKNNRISKLALSRFLSQSLVSCILLGLLLEFRAQLETTGSPANKPASSASGSGTGGTSGTSVGANNLTTSGIATGSSGSGSGATVSGIGTVNAGSSGINIGANVGTGNVASGTVESRTSTIGVQNKQLQNVKGEHPSKAFLQNRSMSLVDMYIDNSEPSENVGQIHFSLEYDFQNTTLILRIIQGKDLPAKDLSGTSDPYVRVTLLPDKKHRLETKIKRRTLNPRWNETFYFEGFPIQKLQSRVLHLHVFDYDRFSRDDSIGEMFLPLCQVDFSDKPSFWKALKPPAKDKCGELLCSLCYHPSNSVLTLTLLKARNLKAKDINGKSDPYVKVWLQFGDKRIEKRKTPIFKCTLNPVFNEAFSFNVPWEKIRECSLDVMVMDFDNIGRNELIGRIQLAGKNGSGASETKHWQDMITKPRQTIVQWHRLKPE'
     hits = internal.get_gene_fastas(genes=['XP', 'XP_006570708.1'],
                                     dbpaths=self.dbpaths,
                                     specieslist=self.specieslist)
     for header, peptide, organism in hits:
         self.assertEqual(header, expected_defline)
         self.assertEqual(peptide, expected_seq)
         self.assertEqual(organism, None)
示例#5
0
 def test_onlyfastafilegiven(self):
     """With only ``fastafile`` supplied, the file's own record is returned.

     Every yielded record must have the '>Cbir|LOC12345 testgene' defline,
     the sequence below and a species of None.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     expected_defline = '>Cbir|LOC12345 testgene'
     expected_seq = 'ABCDEFGABCDEFGABCDEFGABCDEFGABCDEFGhijklmnopHIJKLMhijklmnopHIJKLMNOPQRSTUV'
     parsed = internal.get_gene_fastas(fastafile=self.fastafile)
     for header, residues, organism in parsed:
         self.assertEqual(header, expected_defline)
         self.assertEqual(residues, expected_seq)
         self.assertEqual(organism, None)
示例#6
0
 def test_duplicate_matches(self):
     """Requesting 'XP' and 'XP_006570708.1' together yields one known entry.

     Each record produced must equal the expected
     (defline, sequence, species) triple defined below; species is None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     wanted_defline = '>Amel|XP_006570708.1'
     wanted_seq = 'MEIAASAMLDGLKNNRISKLALSRFLSQSLVSCILLGLLLEFRAQLETTGSPANKPASSASGSGTGGTSGTSVGANNLTTSGIATGSSGSGSGATVSGIGTVNAGSSGINIGANVGTGNVASGTVESRTSTIGVQNKQLQNVKGEHPSKAFLQNRSMSLVDMYIDNSEPSENVGQIHFSLEYDFQNTTLILRIIQGKDLPAKDLSGTSDPYVRVTLLPDKKHRLETKIKRRTLNPRWNETFYFEGFPIQKLQSRVLHLHVFDYDRFSRDDSIGEMFLPLCQVDFSDKPSFWKALKPPAKDKCGELLCSLCYHPSNSVLTLTLLKARNLKAKDINGKSDPYVKVWLQFGDKRIEKRKTPIFKCTLNPVFNEAFSFNVPWEKIRECSLDVMVMDFDNIGRNELIGRIQLAGKNGSGASETKHWQDMITKPRQTIVQWHRLKPE'
     results = internal.get_gene_fastas(
         genes=['XP', 'XP_006570708.1'],
         dbpaths=self.dbpaths,
         specieslist=self.specieslist)
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, wanted_defline)
         self.assertEqual(peptide, wanted_seq)
         self.assertEqual(organism, None)
示例#7
0
 def test_nogenelist(self):
     """A plain string passed as ``genes`` is accepted in place of a list.

     Each record produced must be the '>Amel|NP_001035293.1' entry with the
     sequence below and a species of None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     wanted_defline = '>Amel|NP_001035293.1'
     wanted_seq = 'MPILIPHRNPASANYYENKDGARIVKASHFELDYMLGRKITFFCMATGFPRPEITWLKDGIELYHHKFFQVHEWPVGNDTLKSKMEIDPATQKDAGYYECQADNQYAVDRRGFRTDYVMISY'
     results = internal.get_gene_fastas(
         genes='NP_001035293.1',
         dbpaths=self.dbpaths,
         specieslist=self.specieslist)
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, wanted_defline)
         self.assertEqual(peptide, wanted_seq)
         self.assertEqual(organism, None)
示例#8
0
 def test_dbpath_no_specieslist(self):
     """``genes`` plus ``dbpaths`` without ``specieslist`` gives empty records.

     Every record yielded must have defline, sequence and species all None.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     records = internal.get_gene_fastas(genes=['NP_001035293.1'],
                                        dbpaths=self.dbpaths)
     for header, peptide, organism in records:
         for field in (header, peptide, organism):
             self.assertEqual(field, None)
示例#9
0
 def test_onlygenegiven(self):
     """A gene list with no dbpaths, specieslist or fastafile gives empty records.

     Every record yielded must have defline, sequence and species all None.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     records = internal.get_gene_fastas(genes=['NP_001035293.1'])
     for header, peptide, organism in records:
         for field in (header, peptide, organism):
             self.assertEqual(field, None)
示例#10
0
 def test_onlyfastafilegiven(self):
     """Passing just ``fastafile`` returns the record stored in that file.

     Each record produced must have the '>Cbir|LOC12345 testgene' defline,
     the sequence below and a species of None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     wanted_defline = '>Cbir|LOC12345 testgene'
     wanted_seq = 'ABCDEFGABCDEFGABCDEFGABCDEFGABCDEFGhijklmnopHIJKLMhijklmnopHIJKLMNOPQRSTUV'
     for record in internal.get_gene_fastas(fastafile=self.fastafile):
         # Unpacking keeps the implicit check that each record has 3 items.
         header, residues, organism = record
         self.assertEqual(header, wanted_defline)
         self.assertEqual(residues, wanted_seq)
         self.assertEqual(organism, None)
示例#11
0
 def test_no_argsgiven(self):
     """Calling get_gene_fastas with no arguments gives only empty records.

     Every record yielded must have defline, sequence and species all None.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     records = internal.get_gene_fastas()
     for header, peptide, organism in records:
         for field in (header, peptide, organism):
             self.assertEqual(field, None)
示例#12
0
 def test_badfastaseq(self):
     """The ``self.fastafileillegal`` fixture must give only empty records.

     The fixture is defined outside this method; its name suggests a fasta
     file containing an invalid sequence -- confirm against the test setup.
     NOTE(review): the assertions live inside the loop, so nothing is
     checked if get_gene_fastas yields no records at all.
     """
     records = internal.get_gene_fastas(fastafile=self.fastafileillegal)
     for header, peptide, organism in records:
         for field in (header, peptide, organism):
             self.assertEqual(field, None)
示例#13
0
 def test_dbpath_no_specieslist(self):
     """Omitting ``specieslist`` while giving genes and dbpaths finds nothing.

     Each record produced must have defline, sequence and species all None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     results = internal.get_gene_fastas(
         genes=['NP_001035293.1'], dbpaths=self.dbpaths)
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, None)
         self.assertEqual(peptide, None)
         self.assertEqual(organism, None)
示例#14
0
 def test_onlygenegiven(self):
     """Giving only ``genes`` (no database or fasta source) finds nothing.

     Each record produced must have defline, sequence and species all None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     results = internal.get_gene_fastas(
         genes=['NP_001035293.1'])
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, None)
         self.assertEqual(peptide, None)
         self.assertEqual(organism, None)
示例#15
0
 def test_no_argsgiven(self):
     """An argument-less get_gene_fastas call must not return any real data.

     Each record produced must have defline, sequence and species all None.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     for record in internal.get_gene_fastas():
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, None)
         self.assertEqual(peptide, None)
         self.assertEqual(organism, None)
示例#16
0
 def test_badfastaseq(self):
     """Reading ``self.fastafileillegal`` must not return any real data.

     The fixture is defined outside this method; its name suggests a fasta
     file containing an invalid sequence -- confirm against the test setup.
     NOTE(review): if get_gene_fastas yields nothing, the loop body never
     runs and the test passes without asserting anything.
     """
     results = internal.get_gene_fastas(
         fastafile=self.fastafileillegal)
     for record in results:
         # Unpacking keeps the implicit check that each record has 3 items.
         header, peptide, organism = record
         self.assertEqual(header, None)
         self.assertEqual(peptide, None)
         self.assertEqual(organism, None)
示例#17
0
    # Walk the homologs in sorted key order. Each homologlist value is indexed
    # as [0] (used as the species) and [1] (formatted with %d below --
    # presumably a numeric score/rank; TODO confirm against hmmer_search).
    for homolog in sorted(homologlist):
        # remove excluded genes before bothering to look up their sequence:
        searchname = internal.fix_leaky_pipes(homolog)
        if searchname in excluded_genes:
            continue
        if homologlist[homolog][0] in excluded_species:
            continue

        # extract sequences of remaining genes and add to conversion dictionary
        # (itercount only advances for homologs that survive the exclusions)
        itercount += 1

        # The comment passed along is the [1] value and the running counter,
        # concatenated as strings with no separator.
        for defline, seq, spec in internal.get_gene_fastas(
                genes=[searchname],
                species=homologlist[homolog][0],
                fastafile=None,
                dbpaths=dbpaths,
                specieslist=specieslist,
                comment=str(homologlist[homolog][1]) + str(itercount),
                short=False):

            # A truthy sequence_filter result means the sequence is skipped
            # (presumably it falls outside the min/max length -- confirm).
            if sequence_filter(seq, args.maxlength, args.minlength):
                continue
            else:
                # Keyed by sequence: a later record with an identical sequence
                # replaces the defline stored for an earlier one.
                seqdic[seq] = internal.remove_illegal_characters(defline)

        # The conversion entry is written for every non-excluded homolog,
        # whether or not any of its sequences passed the filter above.
        shortname = internal.phylipise(homologlist[homolog][0], itercount)
        conv_handle.write("%s %-5d %s\n" %
                          (shortname, homologlist[homolog][1], homolog))
        conv_dic[shortname] = (homolog, homologlist[homolog][1])
    # conv_handle is opened outside this view; it is closed once all homologs
    # have been processed.
    conv_handle.close()
示例#18
0
def get_similar_sequences(temp_dir, buildhmmer=False, fastafile=None,
                        specieslist={}, species=None, genes=[], dbpaths={},
                        mincollect=2, globalthresh=0.2, localthresh=0.8,
                        verbalise=lambda *a: None):
    """Search for sequences similar to the supplied genes and/or fasta file.

    Query sequences come from ``genes`` (looked up through
    internal.get_gene_fastas using ``specieslist`` and ``dbpaths``) and/or
    from ``fastafile``. When more than one query is counted, or
    ``buildhmmer`` is true, the queries are written to a temporary fasta
    file, aligned with mafft_align, turned into a model by the ``hmmbuild``
    command and searched with hmmer_search(hmmfile=...). Otherwise the
    single query sequence is handed to hmmer_search directly.

    Parameters:
        temp_dir: directory in which all intermediate files are created.
        buildhmmer: force the model-building path even for a single query.
        fastafile: optional fasta file of query sequences; a copy is written
            to ``temp_dir/query_fasta`` and passed to hmmer_search as
            ``extra_file_search``.
        specieslist, dbpaths: forwarded to internal.get_gene_fastas and
            hmmer_search. The mutable defaults are only read here, never
            modified by this function.
        species: species passed to the single-query lookup.
            NOTE(review): the name is rebound by both lookup loops, so the
            ``query_species`` given to hmmer_search is the species of the
            last record yielded, not necessarily the caller's value.
        genes: gene name or list of gene names; empty strings are dropped.
        mincollect, globalthresh, localthresh: forwarded to hmmer_search
            (``localthresh`` as ``minthresh``).
        verbalise: callable(colour_code, message) used for progress output.

    Returns:
        Whatever hmmer_search returns, or an empty dict when no query
        sequence could be found.
    """

    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y", "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against with hmmer:
    if fastafile:
        extra_file = os.path.join(temp_dir, "query_fasta")
        # 'with' guarantees the copy is closed even if parsing fails midway.
        with open(extra_file, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))
        extra_file_search = extra_file
    else:
        extra_file_search = None

    # more than one query sequence can only be searched via a built model
    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        verbalise("B", "Extracting sequence data from %d peptides" % len(genes))

        with open(hmminput, 'w') as handle:
            for defline, seq, species in internal.get_gene_fastas(
                    genes=genes,
                    species=None,
                    fastafile=fastafile,
                    specieslist=specieslist,
                    dbpaths=dbpaths):
                # records without a sequence are not counted or written
                if seq:
                    seqcount += 1
                    handle.write("%s\n%s\n" % (defline, seq))

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}
        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        verbalise("B", "Creating hidden markov model from %d sequences" % seqcount)
        # create hmmbuild model of alignment:
        hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
        # make sure the model file exists before hmmbuild is invoked
        open(hmmmodel, 'a').close()
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command line, so a temp_dir containing spaces or shell
        # metacharacters would break this call.
        handle = os.popen(" ".join(['hmmbuild --informat afa', hmmmodel, mafft_align1]))
        handle.close()

        homologlist = hmmer_search(None,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   temp_dir=temp_dir,
                                   dbpaths=dbpaths,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=hmmmodel,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

        # the intermediate fasta and alignment are no longer needed
        os.remove(mafft_align1)
        os.remove(hmminput)

    else:
        verbalise("B", "Extracting sequence from %s" % genes)
        # run phmmer on a single input gene/sequence:
        # (seq stays "" if the lookup yields nothing; if several records are
        # yielded only the last one is kept in fasta_seq)
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(
                genes=genes,
                species=species,
                fastafile=fastafile,
                specieslist=specieslist,
                dbpaths=dbpaths):

            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}
        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
示例#19
0
def get_similar_sequences(temp_dir,
                          buildhmmer=False,
                          fastafile=None,
                          specieslist={},
                          species=None,
                          genes=[],
                          dbpaths={},
                          mincollect=2,
                          globalthresh=0.2,
                          localthresh=0.8,
                          verbalise=lambda *a: None):
    """Search for sequences similar to the supplied genes and/or fasta file.

    Query sequences come from ``genes`` (looked up through
    internal.get_gene_fastas using ``specieslist`` and ``dbpaths``) and/or
    from ``fastafile``. When more than one query is counted, or
    ``buildhmmer`` is true, the queries are written to a temporary fasta
    file, aligned with mafft_align, turned into a model by the ``hmmbuild``
    command and searched with hmmer_search(hmmfile=...). Otherwise the
    single query sequence is handed to hmmer_search directly.

    Parameters:
        temp_dir: directory in which all intermediate files are created.
        buildhmmer: force the model-building path even for a single query.
        fastafile: optional fasta file of query sequences; a copy is written
            to ``temp_dir/query_fasta`` and passed to hmmer_search as
            ``extra_file_search``.
        specieslist, dbpaths: forwarded to internal.get_gene_fastas and
            hmmer_search. The mutable defaults are only read here, never
            modified by this function.
        species: species passed to the single-query lookup.
            NOTE(review): the name is rebound by both lookup loops, so the
            ``query_species`` given to hmmer_search is the species of the
            last record yielded, not necessarily the caller's value.
        genes: gene name or list of gene names; empty strings are dropped.
        mincollect, globalthresh, localthresh: forwarded to hmmer_search
            (``localthresh`` as ``minthresh``).
        verbalise: callable(colour_code, message) used for progress output.

    Returns:
        Whatever hmmer_search returns, or an empty dict when no query
        sequence could be found.
    """

    # clean gene list type and content:
    if not isinstance(genes, list):
        genes = [genes]
    genes = [g for g in genes if g != '']

    # count genes provided:
    genelist_num, fasta_num = internal.count_genes(genes, fastafile)
    verbalise("Y",
              "Genelist size:%d\nFasta size:%d" % (genelist_num, fasta_num))

    # if fasta files are provided, create a temp fastafile to search against with hmmer:
    if fastafile:
        extra_file = os.path.join(temp_dir, "query_fasta")
        # 'with' guarantees the copy is closed even if parsing fails midway.
        with open(extra_file, 'w') as handle:
            for defline, seq in internal.parsefasta(fastafile):
                handle.write(">%s\n%s\n" % (defline, seq))
        extra_file_search = extra_file
    else:
        extra_file_search = None

    # more than one query sequence can only be searched via a built model
    if genelist_num + fasta_num > 1:
        buildhmmer = True

    if buildhmmer:
        hmminput = os.path.join(temp_dir, "hmminput.fa")
        seqcount = 0
        verbalise("B",
                  "Extracting sequence data from %d peptides" % len(genes))

        with open(hmminput, 'w') as handle:
            for defline, seq, species in internal.get_gene_fastas(
                    genes=genes,
                    species=None,
                    fastafile=fastafile,
                    specieslist=specieslist,
                    dbpaths=dbpaths):
                # records without a sequence are not counted or written
                if seq:
                    seqcount += 1
                    handle.write("%s\n%s\n" % (defline, seq))

        if seqcount == 0:
            verbalise("R", "No gene sequences were found.")
            return {}
        # create alignment of input sequences:
        mafft_align1 = os.path.join(temp_dir, "mafft_align_input.fa")
        mafft_align(hmminput, mafft_align1)

        verbalise("B",
                  "Creating hidden markov model from %d sequences" % seqcount)
        # create hmmbuild model of alignment:
        hmmmodel = os.path.join(temp_dir, "hmmmodel.fa")
        # make sure the model file exists before hmmbuild is invoked
        open(hmmmodel, 'a').close()
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command line, so a temp_dir containing spaces or shell
        # metacharacters would break this call.
        handle = os.popen(" ".join(
            ['hmmbuild --informat afa', hmmmodel, mafft_align1]))
        handle.close()

        homologlist = hmmer_search(None,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   temp_dir=temp_dir,
                                   dbpaths=dbpaths,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=hmmmodel,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

        # the intermediate fasta and alignment are no longer needed
        os.remove(mafft_align1)
        os.remove(hmminput)

    else:
        verbalise("B", "Extracting sequence from %s" % genes)
        # run phmmer on a single input gene/sequence:
        # (seq stays "" if the lookup yields nothing; if several records are
        # yielded only the last one is kept in fasta_seq)
        seq = ""
        for defline, seq, species in internal.get_gene_fastas(
                genes=genes,
                species=species,
                fastafile=fastafile,
                specieslist=specieslist,
                dbpaths=dbpaths):

            fasta_seq = "%s\n%s\n" % (defline, seq)
            verbalise("C", fasta_seq)

        if not seq:
            verbalise("R", "No genes sequences were found.")
            return {}
        ## phmmer all lpep files
        homologlist = hmmer_search(fasta_seq,
                                   specieslist,
                                   query_species=species,
                                   minthresh=localthresh,
                                   dbpaths=dbpaths,
                                   temp_dir=temp_dir,
                                   mincollect=mincollect,
                                   globalthresh=globalthresh,
                                   hmmfile=None,
                                   verbalise=verbalise,
                                   extra_file_search=extra_file_search)

    return homologlist
示例#20
0
    # Walk the homologs in sorted key order. Each homologlist value is indexed
    # as [0] (used as the species) and [1] (formatted with %d below --
    # presumably a numeric score/rank; TODO confirm against hmmer_search).
    for homolog in sorted(homologlist):
        # remove excluded genes before bothering to look up their sequence:
        searchname = internal.fix_leaky_pipes(homolog)
        if searchname in excluded_genes:
            continue
        if homologlist[homolog][0] in excluded_species:
            continue

        # extract sequences of remaining genes and add to conversion dictionary
        # (itercount only advances for homologs that survive the exclusions)
        itercount += 1

        # The comment passed along is the [1] value and the running counter,
        # concatenated as strings with no separator.
        for defline, seq, spec in internal.get_gene_fastas(
            genes=[searchname],
            species=homologlist[homolog][0],
            fastafile=None,
            dbpaths=dbpaths,
            specieslist=specieslist,
            comment=str(homologlist[homolog][1]) + str(itercount),
            short=False,
        ):

            # A truthy sequence_filter result means the sequence is skipped
            # (presumably it falls outside the min/max length -- confirm).
            if sequence_filter(seq, args.maxlength, args.minlength):
                continue
            else:
                # Keyed by sequence: a later record with an identical sequence
                # replaces the defline stored for an earlier one.
                seqdic[seq] = internal.remove_illegal_characters(defline)

        # The conversion entry is written for every non-excluded homolog,
        # whether or not any of its sequences passed the filter above.
        shortname = internal.phylipise(homologlist[homolog][0], itercount)
        conv_handle.write("%s %-5d %s\n" % (shortname, homologlist[homolog][1], homolog))
        conv_dic[shortname] = (homolog, homologlist[homolog][1])
    # conv_handle is opened outside this view; it is closed once all homologs
    # have been processed.
    conv_handle.close()