def test_outdir(self):
    """Check the files written when a directory path is the output prefix.

    ``mask_sequence`` is handed the path of a fresh temporary directory as
    its third argument.  The test then reads ``<outdir>.match`` and
    ``<outdir>.non_match`` -- i.e. siblings of the directory, not files
    inside it -- and compares their lines with the expectation.

    All temporary artefacts are removed even if the call under test or
    the reading of its output fails.
    """
    exp_filecontents = {
        'non_match': [
            ('>NZ_GG666849.1_2_251-330 # 798 # 2885 # -1 # ID=1_2;parti'
             'al=00;start_type=TTG;rbs_motif=AGxAGG/AGGxGG;rbs_spacer=5-10bp'
             ';gc_cont=0.499\n'),
            ('IGIQGDTYSEDEDYPELPRTANGRLSSYILVNHKEQVHVYNQIATKLGLQKESGEVVMLPSQ'
             'FINRFSLRNEHGRGIPDQ\n')],
        'match': []
    }

    outdir = tempfile.mkdtemp(prefix='splitseq_')
    filenames = {type_: outdir + '.' + type_
                 for type_ in ('match', 'non_match')}
    obs_filecontents = dict()
    try:
        mask_sequence(self.file_missed, self.file_query, outdir,
                      max_evalue=0.1, min_fragment_length=40)
        for type_, filename in filenames.items():
            # context manager closes the handle even if reading fails
            with open(filename, 'r') as f:
                obs_filecontents[type_] = f.readlines()
    finally:
        # clean up unconditionally so a failing run leaves nothing behind
        for filename in filenames.values():
            if os.path.exists(filename):
                os.remove(filename)
        shutil.rmtree(outdir)

    self.assertDictEqual(obs_filecontents, exp_filecontents)
def test_outdir(self):
    """Check the files written when a directory path is the output prefix.

    The path of a freshly created temporary directory is passed to
    ``mask_sequence`` as third argument.  Afterwards the test reads
    ``<outdir>.match`` and ``<outdir>.non_match`` (files next to the
    directory, named by appending a suffix to its path) and compares
    their lines with the expected content.

    Temporary files and the directory are removed in a ``finally`` block
    so that a failing run does not leave artefacts behind.
    """
    exp_filecontents = {
        'non_match': [
            ('>NZ_GG666849.1_2_251-330 # 798 # 2885 # -1 # ID=1_2;parti'
             'al=00;start_type=TTG;rbs_motif=AGxAGG/AGGxGG;rbs_spacer=5-10bp'
             ';gc_cont=0.499\n'),
            ('IGIQGDTYSEDEDYPELPRTANGRLSSYILVNHKEQVHVYNQIATKLGLQKESGEVVMLPSQ'
             'FINRFSLRNEHGRGIPDQ\n')],
        'match': []}

    outdir = tempfile.mkdtemp(prefix='splitseq_')
    filenames = {type_: outdir + '.' + type_
                 for type_ in ('match', 'non_match')}
    obs_filecontents = dict()
    try:
        mask_sequence(self.file_missed, self.file_query, outdir,
                      max_evalue=0.1, min_fragment_length=40)
        for type_, filename in filenames.items():
            # 'with' guarantees the handle is closed if readlines() raises
            with open(filename, 'r') as f:
                obs_filecontents[type_] = f.readlines()
    finally:
        # remove whatever was produced, regardless of the test outcome
        for filename in filenames.values():
            if os.path.exists(filename):
                os.remove(filename)
        shutil.rmtree(outdir)

    self.assertDictEqual(obs_filecontents, exp_filecontents)
def test_split_search_parseerror_2(self):
    """Smoke test: the second hhsearch example must be processed cleanly.

    There are no assertions; the test succeeds as long as neither
    ``mask_sequence`` nor ``parse_pdb_match`` raises on this input.
    """
    hhsearch_fp = self.file_hhsearch2

    mask_sequence(hhsearch_fp, self.file_fasta2,
                  min_prob=95.0,
                  min_fragment_length=40)
    parse_pdb_match(hhsearch_fp)
def test_mask_sequence(self):
    """Compare the selected hits with the stored expectation.

    ``None`` is passed as third argument, so only the returned value is
    inspected here.
    """
    hits = mask_sequence(
        self.file_missed,
        self.file_query,
        None,
        max_evalue=0.1,
        min_fragment_length=40,
    )

    # the hit selection has to reproduce the reference result exactly
    self.assertEqual(hits, self.exp_hits)
def test_level2(self):
    """Check the fragment coordinates reported for the second example.

    For every result type (in sorted order) and every entry of that type
    (in sorted order) the ``<start>-<stop>`` suffix of the header's first
    ``' # '``-separated field is extracted.  The collected
    ``(type, start, stop)`` triples -- coordinates kept as strings --
    must equal ``self.pos2``.
    """
    def _coordinates(entry):
        # first ' # ' field ends in '_<start>-<stop>'; the unpacking
        # insists on exactly two '-'-separated values
        identifier = entry[0].split(' # ')[0]
        start, stop = identifier.split('_')[-1].split('-')
        return start, stop

    obs = mask_sequence(self.file_out2, self.file_fasta2,
                        max_evalue=0.1, min_fragment_length=40)

    positions = [
        (type_,) + _coordinates(entry)
        for type_ in sorted(obs.keys())
        for entry in sorted(obs[type_])
    ]

    self.assertEqual(positions, self.pos2)
def test_pretty_output(self):
    """Compare the text printed by ``pretty_output`` with a stored file.

    The masking result for ``self.file_a`` (fragments of at least five
    residues) is rendered while stdout is captured; the captured text
    must equal the content of the reference ``.pretty`` file.
    """
    reference_fp = get_data_path('test_split_search/NC_000913.3_2.pretty')
    with open(reference_fp, 'r') as handle:
        expected_text = handle.read()

    masked = mask_sequence(self.file_a, self.file_query,
                           min_fragment_length=5)

    with captured_output() as (out, err):
        pretty_output(masked)
        printed_text = out.getvalue()

    # lift the diff size limit so a mismatch is reported in full
    self.maxDiff = None
    self.assertEqual(printed_text, expected_text)
def test_mask_sequence_information(self):
    """Check header and sequence of a match, in memory and on disk.

    With ``min_fragment_length=450`` the first returned match must be the
    expected ``(header, sequence)`` pair.  The same call with an output
    prefix must write that record to ``<prefix>.match`` and leave
    ``<prefix>.non_match`` empty.  Finally, using ``'/dev'`` as output
    prefix has to raise ``IOError``.

    Output files are written into a private temporary directory instead
    of a fixed ``/tmp`` path, so the test neither collides with parallel
    runs nor leaves files behind when an assertion fails.
    """
    # function-scope import keeps this block self-contained
    import tempfile

    seq = ('MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALP'
           'NISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALIC'
           'RGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVL'
           'MAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKS'
           'MSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGIS'
           'NLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAER'
           'AMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGS'
           'SERSISVVVNNDDATTGVRVTHQMLFN')
    header = ('gi|556503834|ref|NC_000913.3|_2_1-461 # 2j0w_A # 337 # 2799'
              ' # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG'
              ';rbs_spacer=5-10bp;gc_cont=0.531')
    exp = (header, seq)

    # in-memory result
    obs = mask_sequence(self.file_a, self.file_query,
                        min_fragment_length=450)
    self.assertEqual(obs['match'][0], exp)

    # on-disk result; the directory and its content vanish on exit
    with tempfile.TemporaryDirectory(prefix='splitseq_') as tmpdir:
        filename = os.path.join(tmpdir, 'test.mfa')
        mask_sequence(self.file_a, self.file_query, filename,
                      min_fragment_length=450)

        with open(filename + '.match', 'r') as f:
            obs = f.readlines()
        self.assertIn(seq + "\n", obs)
        self.assertIn(">" + header + "\n", obs)

        with open(filename + '.non_match', 'r') as f:
            obs = f.readlines()
        self.assertFalse(obs)

    # an unusable output prefix must be reported as IOError
    with self.assertRaises(IOError):
        mask_sequence(self.file_a, self.file_query, '/dev')
def test_mask_sequence(self):
    """Check the match headers with and without an identity threshold.

    With default settings two matches are expected.  Requiring
    ``min_identity=0.33`` changes the first match header (different
    range and template id) while the second match header stays the same.
    """
    def _match_headers(result):
        # each match is a sequence whose first item is the header
        return [entry[0] for entry in result['match']]

    second_header = (
        'gi|556503834|ref|NC_000913.3|_2_464-815 # 1ebf_A # 337 # 2799 # '
        '1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spa'
        'cer=5-10bp;gc_cont=0.531')
    first_header_default = (
        'gi|556503834|ref|NC_000913.3|_2_1-461 # 2j0w_A # 337 # 2799 # 1 '
        '# ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_space'
        'r=5-10bp;gc_cont=0.531')
    first_header_identity = (
        'gi|556503834|ref|NC_000913.3|_2_1-462 # 3c1m_A # 337 # 2799 # 1 '
        '# ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_space'
        'r=5-10bp;gc_cont=0.531')

    # default behaviour
    default_result = mask_sequence(self.fp_out, self.fp_seqs)
    self.assertEqual(_match_headers(default_result),
                     [first_header_default, second_header])

    # hits must reach at least 33% sequence identity
    # --> the first match differs from the default run above
    identity_result = mask_sequence(self.fp_out, self.fp_seqs,
                                    min_identity=0.33)
    self.assertEqual(_match_headers(identity_result),
                     [first_header_identity, second_header])
def test_mask_sequence(self):
    """Check masking and hit parsing for the T0831 example.

    Three things are verified:

    * the sequence of the first match returned by ``mask_sequence``,
    * every field of the first hit returned by ``parse_pdb_match``,
    * every field returned by ``_parse_hit_block`` for the stored text
      of the first hit block.

    Both parser results are compared field by field against ``exp_0``.

    The original loops contained an ``if type(value) == dict():`` guard
    in front of an ``assertCountEqual`` call.  That condition compares a
    type object with an empty dict instance and is therefore never true,
    so only the ``assertEqual`` branch ever ran.  The dead branch has
    been removed; ``assertEqual`` is kept for all fields because it also
    compares the values of nested dicts, whereas ``assertCountEqual`` on
    two dicts would only compare their keys.
    """
    # NOTE(review): presumably this call writes 'kurt_.match' and
    # 'kurt_.non_match' into the working directory and nothing removes
    # them afterwards -- confirm against mask_sequence.
    obs = mask_sequence(self.fp_out, self.fp_seqs,
                        subsequences_fp='kurt_',
                        min_prob=95.0,
                        max_evalue=0.1,
                        min_fragment_length=40)
    self.assertEqual(
        obs['match'][0][1],
        ('TMEELLTSLQKKCGTECEEAHRQLVCALNGLAGIHIIKGEYALAAELYREVLRSSEEHKGKLK'
         'TDSLQRLHATHNLMELLIARHPGIPPTLRDGRLEEEAKQLREHYMSKCNTEVAEAQQALYPVQ'
         'QTIHELQRKIHSNSPWWLNVIHRAIEFTIDEELVQRVRNEITSNYKQQTGKLSMSEKFRDCRG'
         'LQFLLTTQMEELNKCQKLVREAVKNLEGPPSRNVIESATVCHLRPARLPLNCCVFCKADELFT'
         'EYESKLFSNTVKGQTAIFEEMIEDEEGLVDDRAPTTTRGLWAISETERSMKAILSFAKSHRFD'
         'VEFVDEGSTSMDLFEAWKKEYKLLHEYWMALRNRVSAVDELAMATERLRVRDPREPKPNPPVL'
         'HIIEPHEVEQNRIKLLNDKAVATSQLQKKLGQLLYLTNLEK'))

    exp_0 = {
        'Probab': 100.0,
        'Template_Neff': 8.5,
        'P-value': 2.8e-85,
        'Similarity': 1.445,
        'Sum_probs': 363.5,
        'Score': 555.49,
        'Cols': 419,
        'No': 1,
        'Identities': 1.0,
        'SS': 0.0,
        'alignment': {
            'Q T0831': {
                'start': 1,
                'end': 419,
                'sequence':
                    ('TMEELLTSLQKKCGTECEEAHRQLVCALNGLAGIHIIKGEYALAAELYREVLRSSE'
                     'EHKGKLKTDSLQRLHATHNLMELLIARHPGIPPTLRDGRLEEEAKQLREHYMSKCN'
                     'TEVAEAQQALYPVQQTIHELQRKIHSNSPWWLNVIHRAIEFTIDEELVQRVRNEIT'
                     'SNYKQQTGKLSMSEKFRDCRGLQFLLTTQMEELNKCQKLVREAVKNLEGPPSRNVI'
                     'ESATVCHLRPARLPLNCCVFCKADELFTEYESKLFSNTVKGQTAIFEEMIEDEEGL'
                     'VDDRAPTTTRGLWAISETERSMKAILSFAKSHRFDVEFVDEGSTSMDLFEAWKKEY'
                     'KLLHEYWMALRNRVSAVDELAMATERLRVRDPREPKPNPPVLHIIEPHEVEQNRIK'
                     'LLNDKAVATSQLQKKLGQLLYLTNLEK'),
                'totallen': 419
            },
            'Q Consensus': {
                'start': 1,
                'end': 419,
                'sequence':
                    ('tmeelltslqkkcgteceeahrqlvcalnglagihiikgeyalaaelyrevlrsse'
                     'ehkgklktdslqrlhathnlmelliarhpgipptlrdgrleeeakqlrehymskcn'
                     'tevaeaqqalypvqqtihelqrkihsnspwwlnvihraieftideelvqrvrneit'
                     'snykqqtgklsmsekfrdcrglqfllttqmeelnkcqklvreavknlegppsrnvi'
                     'esatvchlrparlplnccvfckadelfteyesklfsntvkgqtaifeemiedeegl'
                     'vddraptttrglwaisetersmkailsfakshrfdvefvdegstsmdlfeawkkey'
                     'kllheywmalrnrvsavdelamaterlrvrdprepkpnppvlhiiepheveqnrik'
                     'llndkavatsqlqkklgqllyltnlek'),
                'totallen': 419
            },
            'column score': {
                'sequence':
                    ('||+|++..|-++|-+|||+++|++|.++|||||||||+|+|..|+++||+||+..+'
                     '++++++++|+||++|+.|||.+++...+||+||+++|..+.+++.+++..|++++.'
                     '..+..|++.+.++.+.+++++.+.++.++||+.+++.+++..++..++++|+++++'
                     '.+|.+..|..++..+|++.+||.+.+++.+++|.++.+-+.+++++|++||..+++'
                     '+++..||++|.+-+...|.+|++++.|..||+.||+.+.+|.+..+++++++++|.'
                     '.++.....++|.|+.|+.|+.+|.|++|++++.|+.+++.+|..-++++++|||||'
                     '+.++.+|++.+..++|.|||.|++-|+|.++|.++.|+||..++|.|+++++.+.+'
                     '+.+++.++...|++++|||.||.||.|')
            },
            'T Consensus': {
                'start': 2,
                'end': 420,
                'sequence':
                    ('tmeell~~Li~k~~~eceea~R~~v~~~NgLAgl~~l~~~~~~A~~~YrevL~~~~'
                     '~~~~~~~~D~Lq~iH~l~NL~~~l~~~~~~~~~~~~~~~l~~~~~~l~~~Yl~~~~'
                     '~~~~~a~~~~~~~~~~~~~~~~~~~~~~~Ww~~~l~~~~~~~~~~~l~~~i~~~l~'
                     '~~~~~~~~~~~~~~~~~s~~gL~~~l~~~l~~L~~~R~~l~~~l~~L~~~~~~~~v'
                     '~~~~~Ch~~~~~~~~~~C~~C~~~~~~~~yE~~Lf~~~~~~~~~~~~~~~~~~~~~'
                     '~~~~~~~~~~g~~~~S~~e~~lk~i~~~~r~~~~~~~~~~~~~~hl~~le~~rkEf'
                     '~~~r~lw~~~~~~l~a~DEL~ma~~Rlrl~~~~e~~~~~~~~~~i~~~ev~~~~~~'
                     '~~~e~~~a~~~l~r~~gqLrYL~nL~k'),
                'totallen': 420
            },
            'T 4QN1_A': {
                'start': 2,
                'end': 420,
                'sequence':
                    ('TMEELLTSLQKKCGTECEEAHRQLVCALNGLAGIHIIKGEYALAAELYREVLRSSE'
                     'EHKGKLKTDSLQRLHATHNLMELLIARHPGIPPTLRDGRLEEEAKQLREHYMSKCN'
                     'TEVAEAQQALYPVQQTIHELQRKIHSNSPWWLNVIHRAIEFTIDEELVQRVRNEIT'
                     'SNYKQQTGKLSMSEKFRDCRGLQFLLTTQMEELNKCQKLVREAVKNLEGPPSRNVI'
                     'ESATVCHLRPARLPLNCCVFCKADELFTEYESKLFSNTVKGQTAIFEEMIEDEEGL'
                     'VDDRAPTTTRGLWAISETERSMKAILSFAKSHRFDVEFVDEGSTSMDLFEAWKKEY'
                     'KLLHEYWMALRNRVSAVDELAMATERLRVRDPREPKPNPPVLHIIEPHEVEQNRIK'
                     'LLNDKAVATSQLQKKLGQLLYLTNLEK'),
                'totallen': 420
            },
            'T ss_dssp': {
                'sequence':
                    ('HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHTCHHHHHHHHHHHHHHHH'
                     'HTTTTCCCCHHHHHHHHHHHHHCCCCCTTSSCCCTTTTTHHHHHHHHHHHHHHHHH'
                     'HHHHHHHHTTHHHHHHHHHHHHSSCSSSCHHHHHHHHHHHTTCHHHHHHHHHHHHC'
                     'CC----------GGGCSSHHHHHHHHHHHHHHHHHHHHHHHHHHHTTCSSCCHHHH'
                     'HHHCCCCCSCSSSCCCCSHHHHHHHHHHHHHHHHBCCC------------------'
                     '-----------CCSBCHHHHHHHHHHHHHHHTTCCHHHHHHHHHHHHHHHHHHHHH'
                     'HHHHHHHHHHHHHHHHHHHHHHHHCCCEECCC---------CCEECTTCHHHHHHH'
                     'HHHHHHHHHHHHHHHHHHHHHHHTTCC')
            },
            'T ss_pred': {
                'sequence':
                    ('CHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHhCCHHHHHHHHHHHHHHHH'
                     'HhhcCCccchHHHHHHHhhHHHHHHhcCCCCCCCcchhHHHHHHHHHHHHHHHHHH'
                     'HHHHHHHHHHHHHHHHHHHHHHhhccCCcHHHHHHHHHHHCCCcHHHHHHHHHHHH'
                     'hhcccccCCcccccccccHHHHHHHHHHHHHHHHHHHHHHHHHHHhhcCCCcHHHH'
                     'HHhhcCCCCCCCCCCCCCCccccHHHHHHHHHHHhhcccCCCccchHhhhhccccc'
                     'cccCCCcccCCcccccHHHHHHHHHHHHHHhcCCCHHHHHHHHHHHHHHHHHHHHH'
                     'HHHHHHHHHHHHHHHHHHHHHHchhhheeCCCCCCCCCCCcccccCHHHHHHHHHH'
                     'HHHHHHHHHHHHHHHHHHHHHHhcccC')
            },
            'Confidence': {
                'sequence':
                    ('79999999999999999999999999999999999999999999999999999999'
                     '99999999999999999999999999999999999999999999999999999999'
                     '99999999999999999999988888999999999998877899999999999999'
                     '99987777778999999999999999999999999999999999999999999999'
                     '99999999999877789999999999999999999999999999999999999999'
                     '99888889999999999999999999999999999999999999999999999999'
                     '99999999999999999999999999999999999999999999999999999999'
                     '999999999999999999999999976')
            },
        },
        'Aligned_cols': 419,
        'E-value': 1.6e-80,
        # NOTE(review): 'H**o' looks like a mangled 'Homo'; left untouched
        # here -- confirm against the fixture file before changing it.
        'Hit': ('4QN1_A E3 ubiquitin-protein ligase SHPRH; SHPRH, E3 ligas'
                'e, RING, Ubiquitin; 2.48A {H**o sapiens}')
    }

    # first hit as parsed from the complete result file
    first_hit = parse_pdb_match(self.fp_out)[0]
    for key in first_hit:
        self.assertEqual(first_hit[key], exp_0[key])

    # the same hit parsed from the isolated text of its block
    block_fp = get_data_path('test_split_search/T0831_block0.out')
    with open(block_fp, 'r') as f:
        block = f.read()
    parsed_block = _parse_hit_block(block)
    for key in parsed_block:
        self.assertEqual(parsed_block[key], exp_0[key])
def test_mask_sequence_filtering(self):
    """Exercise the hit filters of ``mask_sequence`` on one example.

    Six ``(header, sequence)`` fragments of the query are defined up
    front.  The default call and a series of calls with different
    thresholds (``min_prob``, ``max_evalue``, ``max_pvalue``,
    ``min_fragment_length`` and combinations) must each return the
    expected ``{'match': [...], 'non_match': [...]}`` partition.
    """
    # residues 1-461, matched by template 2j0w_A
    frag_1_461 = (
        ('gi|556503834|ref|NC_000913.3|_2_1-461 # 2j0w_A # 337 # 2799 # '
         '1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_s'
         'pacer=5-10bp;gc_cont=0.531'),
        ('MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALP'
         'NISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALIC'
         'RGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVL'
         'MAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKS'
         'MSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGIS'
         'NLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAER'
         'AMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGS'
         'SERSISVVVNNDDATTGVRVTHQMLFN'))
    # two-residue gap between both templates
    frag_462_463 = (
        ('gi|556503834|ref|NC_000913.3|_2_462-463 # 337 # 2799 # 1 # ID='
         '1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer='
         '5-10bp;gc_cont=0.531'),
        'TD')
    # residues 464-815, matched by template 1ebf_A
    frag_464_815 = (
        ('gi|556503834|ref|NC_000913.3|_2_464-815 # 1ebf_A # 337 # 2799 '
         '# 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs'
         '_spacer=5-10bp;gc_cont=0.531'),
        ('QVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQEEL'
         'AQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVTPNKKANTSSMD'
         'YYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDE'
         'GMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNA'
         'EGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKVKN'
         'GENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLS'))
    # five trailing residues
    frag_816_820 = (
        ('gi|556503834|ref|NC_000913.3|_2_816-820 # 337 # 2799 # 1 # ID='
         '1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer='
         '5-10bp;gc_cont=0.531'),
        'WKLGV')
    # everything behind the first match as one unmatched stretch
    frag_462_820 = (
        ('gi|556503834|ref|NC_000913.3|_2_462-820 # 337 # 2799 # 1 # ID'
         '=1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer='
         '5-10bp;gc_cont=0.531'),
        ('TDQVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQE'
         'ELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVTPNKKANTSS'
         'MDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKL'
         'DEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEF'
         'NAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKV'
         'KNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV'))
    # the complete, unmasked query
    frag_1_820 = (
        ('gi|556503834|ref|NC_000913.3|_2_1-820 # 337 # 2799 # 1 # ID='
         '1_2;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer='
         '5-10bp;gc_cont=0.531'),
        self.query)

    # default thresholds, output prefix explicitly disabled
    default_result = mask_sequence(self.file_a, self.file_query, None)
    self.assertEqual(default_result,
                     {'match': [frag_1_461, frag_464_815],
                      'non_match': [frag_462_463, frag_816_820]})

    # (keyword arguments, expected partition), checked in this order
    cases = [
        (dict(min_prob=100.0),
         {'match': [frag_1_461, frag_464_815],
          'non_match': [frag_462_463, frag_816_820]}),
        (dict(max_evalue=4.1e-58),
         {'match': [frag_1_461], 'non_match': [frag_462_820]}),
        (dict(max_pvalue=1e-58),
         {'match': [frag_1_461], 'non_match': [frag_462_820]}),
        (dict(min_fragment_length=500),
         {'match': [], 'non_match': [frag_1_820]}),
        (dict(min_prob=99.0, max_evalue=4.90e-41, max_pvalue=0.00011,
              min_fragment_length=200),
         {'match': [frag_1_461, frag_464_815], 'non_match': []}),
        (dict(min_prob=99.0, max_evalue=4.90e-41, max_pvalue=0.00011,
              min_fragment_length=4),
         {'match': [frag_1_461, frag_464_815],
          'non_match': [frag_816_820]}),
    ]
    for filters, expected in cases:
        result = mask_sequence(self.file_a, self.file_query, **filters)
        self.assertEqual(result, expected)