Exemplo n.º 1
0
    def __init__(self, args, ref_set=None, og_set=None, load=True):
        """Store the read file(s) and either map them or reload a mapping.

        :param args: argument object; ``args.reads`` (one path, or several
            paths separated by single spaces) and ``args.single_mapping``
            are read here.
        :param ref_set: object handed to the mapping helpers. With
            ``load=True`` and ``ref_set=None`` nothing is mapped and neither
            ``mapped_records`` nor ``og_records`` is set.
        :param og_set: object handed to ``_sort_by_og``; ``og_records`` is
            only built when it is given.
        :param load: when true, map the reads against ``ref_set``; otherwise
            reload an existing mapping via ``_read_mapping_from_folder``.
        """
        self.args = args

        # A single read file is kept as a string, several files as a list.
        # Trailing whitespace is dropped before deciding, so "reads.fq " is
        # not turned into a one-element list.
        reads = args.reads
        if " " in reads:
            reads = reads.rstrip()
            if " " in reads:
                reads = reads.split(" ")
        self._reads = reads

        # The species name is the basename of the (first) read file up to
        # its first dot. The type is tested instead of ``len(...) == 2`` so
        # that a two-character path is not indexed like a list and a list of
        # any other length is not treated like a string.
        if isinstance(self._reads, list):
            first_read = self._reads[0]
        else:
            first_read = self._reads
        self._species_name = first_read.split("/")[-1].split(".")[0]

        # load pyopa related stuff
        self.defaults = pyopa.load_default_environments()
        self.envs = self.defaults['environments']
        self.env = self.envs[515]

        if load:
            # ``mapped_records`` only exists once a mapping has been run, so
            # it is consulted inside this branch only; without a reference
            # set there is nothing to map and nothing to sort.
            if ref_set is not None:
                if self.args.single_mapping is None:
                    self.mapped_records = self._map_reads_to_references(
                        ref_set)
                else:
                    # Reference species: basename of the single-mapping path
                    # up to its first underscore.
                    self.ref_species = self.args.single_mapping.split(
                        "/")[-1].split("_")[0]
                    self.mapped_records = self._map_reads_to_single_reference(
                        ref_set)
                if self.mapped_records and og_set is not None:
                    self.og_records = self._sort_by_og(og_set)
        elif og_set is not None:
            self.mapped_records = self._read_mapping_from_folder()
            self.og_records = self._sort_by_og(og_set)
Exemplo n.º 2
0
    def setUp(self):
        """Create two fixture sequences and load pyopa's default data.

        Sets ``s1``/``s2`` (the two sequences), ``envs`` and ``log_pam1``
        (taken from the default environments) and ``env`` (entry 515 of
        ``envs``).
        """
        self.s1 = pyopa.Sequence('PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY')
        self.s2 = pyopa.Sequence('PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK')

        # One call provides both the environment list and the log-PAM1 entry.
        loaded = pyopa.load_default_environments()
        environments = loaded['environments']
        self.envs = environments
        self.log_pam1 = loaded['log_pam1']
        self.env = environments[515]
Exemplo n.º 3
0
    def setUp(self):
        """Load the test sequences, pyopa defaults and reference results.

        Populates ``sequences`` (one ``pyopa.Sequence`` per line of
        ``data/testseqs.txt``), ``alignment_environments``, ``log_pam1``,
        ``dms`` and ``darwin_results`` (one ``DarwinResult`` per data row of
        ``data/reference_test_results.dat``). Both data files are looked up
        relative to this module's directory.
        """
        # NOTE(review): presumably the number of digits used when comparing
        # floating-point results -- confirm against the test methods.
        self.precision = 10
        #resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

        # Read the raw lines first ...
        with open(
                os.path.join(os.path.dirname(__file__), 'data',
                             'testseqs.txt')) as f:
            self.sequences = f.readlines()

        # ... then replace them by Sequence objects built from the stripped
        # lines.
        self.sequences = list(
            map(lambda s: pyopa.Sequence(s.strip()), self.sequences))
        self.darwin_results = []

        defaults = pyopa.load_default_environments()
        self.alignment_environments = defaults['environments']
        self.log_pam1 = defaults['log_pam1']
        self.dms = pyopa.MutipleAlEnv(self.alignment_environments,
                                      self.log_pam1)
        # The bare string below is disabled code kept for reference; as a
        # plain string expression it is evaluated but has no effect.
        """
        write_all_env_files(self.alignment_environments)
        with open(os.path.dirname(__file__) + '/data/matrices/json/logPAM1.json') as lp:
            json_data = json.load(lp)
            #json_data["Scores"] = map(lambda l: map(lambda s: s/(2048*2048*2048), l), json_data["Scores"])
            logPAM1 = pyopa.read_env_json(json_data, self.alignment_environments[0].columns)
            write_env_file(logPAM1, "logPAM1")
        """

        with open(
                os.path.join(os.path.dirname(__file__), 'data',
                             'reference_test_results.dat')) as f:
            # skip the header line before handing the file to the CSV reader
            next(f)
            reader = csv.reader(f, delimiter='\t')

            # Every tab-separated row must unpack into exactly these 14
            # fields; a row of any other width raises ValueError.
            for s1, s2, matrix_nr, pam, threshold, score_d, score_f, score_s,\
                score_b, als1, als2, ep_sim, ep_pamn, ep_var, in reader:
                curr = DarwinResult()
                # Ids and the matrix number are stored as ints, scores and
                # estimates as floats, the two als fields as read.
                curr.s1_id = int(s1)
                curr.s2_id = int(s2)
                curr.matrix_nr = int(matrix_nr)
                curr.pam = float(pam)
                curr.threshold = float(threshold)
                curr.score_double = float(score_d)
                curr.score_float = float(score_f)
                curr.score_short = float(score_s)
                curr.score_byte = float(score_b)
                curr.als1 = als1
                curr.als2 = als2
                curr.ep_sim = float(ep_sim)
                curr.ep_pamn = float(ep_pamn)
                curr.ep_var = float(ep_var)

                self.darwin_results.append(curr)
                '''
Exemplo n.º 4
0

def nt_epam(s1, s2, dms, env):
    """Align ``s1`` with ``s2`` and print the PAM estimate of the alignment.

    The two aligned strings returned by ``pyopa.align_strings`` are passed
    to ``dms.estimate_pam``; the first three entries of its result are
    printed as similarity, PAM number and variance. Nothing is returned.
    """
    alignment = pyopa.align_strings(s1, s2, env)
    print('EstimatePam:')
    estimate = dms.estimate_pam(alignment[0], alignment[1])

    # One output line per entry of the estimate, in result order.
    line_formats = ('\tSim: %f', '\tPam Number: %f', '\tVariance: %f')
    for position, line_format in enumerate(line_formats):
        print(line_format % estimate[position])


# Two example sequences, wrapped in pyopa.Sequence objects, that are aligned
# against each other below.
s1 = pyopa.Sequence('PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY')
s2 = pyopa.Sequence('PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK')

# Load the matrices and gap costs from JSON (pyopa's default environments)
# and pick the environment stored at index 515.
# NOTE(review): which PAM distance index 515 stands for is not visible here.
defaults = pyopa.load_default_environments()
envs = defaults['environments']
env = envs[515]

print('Aligning\n%s\nto\n%s\n' % (s1, s2))

# Calculate local and global scores for the given sequences.
# NOTE(review): the second call passes three extra positional flags
# (False, True, True); their exact meaning is defined by pyopa.align_double
# and is not visible in this script.
local_double = pyopa.align_double(s1, s2, env)
global_double = pyopa.align_double(s1, s2, env, False, True, True)

# The first element is the score; the other elements of the returned list
# contain the ranges for the local alignment.
print('Local score: %f' % local_double[0])
print('Global score: %f' % global_double[0])

#the align_double function is an efficient vectorized C implementation, however, it is possible to call the
#  reference implementation, and compare the double score given by it to the vectorized version (the scores of course
Exemplo n.º 5
0
 def setUp(self):
    """Prepare generated and pre-built environments for the tests.

    ``log_pam1`` is read from ``logPAM1.json`` in pyopa's matrix directory,
    ``env_generated`` is built from it by ``pyopa.generate_all_env`` with
    the argument 1266, and ``env_loaded`` holds the environments shipped as
    pyopa defaults.
    """
    log_pam1_path = os.path.join(pyopa.matrix_dir(), 'logPAM1.json')
    self.log_pam1 = pyopa.read_env_json(log_pam1_path)
    self.env_generated = pyopa.generate_all_env(self.log_pam1, 1266)
    default_data = pyopa.load_default_environments()
    self.env_loaded = default_data['environments']
Exemplo n.º 6
0
    seq_num = len(sequences)
    for i in range(seq_num):
        profile = pyopa.AlignmentProfile()
        profile.create_profile_short(sequences[i], env.int16_matrix)

        for j in range(i + 1, seq_num):
            profile.align_short(sequences[j], env)


def read_sequences(seq_file):
    """Return one ``pyopa.Sequence`` per line of the file ``seq_file``.

    Every line is stripped of surrounding whitespace before it is wrapped;
    the file is fully read and closed before any sequence is created.
    """
    with open(seq_file) as handle:
        raw_lines = list(handle)

    stripped_lines = (raw.strip() for raw in raw_lines)
    return list(map(pyopa.Sequence, stripped_lines))


# The benchmark sequences are located relative to this script's real path.
script_dir = os.path.dirname(os.path.realpath(__file__))
seqs = read_sequences(os.path.join(script_dir, '../test/data/cmp_seqs.txt'))
default_environments = pyopa.load_default_environments()['environments']
env = default_environments[515]

# Time each all-against-all variant in turn and report the seconds taken.
elapsed = measure_performance(all_against_all_double, seqs, env)
print('All against all (double precision): %fs' % elapsed)

elapsed = measure_performance(all_against_all_double_old, seqs, env)
print('All against all (double precision old): %fs' % elapsed)

elapsed = measure_performance(all_against_all_byte, seqs, env)
print('All against all (byte estimation): %fs' % elapsed)

elapsed = measure_performance(all_against_all_short, seqs, env)
print('All against all (short estimation): %fs' % elapsed)