def test_apply_lane_mask_only(self):
    # Exercises lane-mask filtering of self.aln1 with 1 passed as the third
    # (gap fraction) argument in every call.

    def check(mask, expected):
        # Materialise the complete output and compare it with the complete
        # expectation, so a short or empty result fails instead of
        # silently skipping assertions.
        observed = list(apply_lane_mask_and_gap_filter(self.aln1, mask, 1))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # a mask of all ones is expected to return the alignment unchanged
    check('111111', self.aln1)

    # filtering all positions results in a ValueError
    with self.assertRaises(ValueError):
        list(apply_lane_mask_and_gap_filter(self.aln1, '000000', 1))

    check('101010', [
        '>s1', 'AC-',
        '>s2', 'A--',
        '>s3', 'TT-',
        '>s4', 'AG-',
        '>s5', '---',
    ])

    check('000111', [
        '>s1', '--T',
        '>s2', '--T',
        '>s3', '--T',
        '>s4', '--T',
        '>s5', 'A--',
    ])
def main():
    """Filter the input alignment and write it to <output_dir>/<name>_pfiltered.fasta.

    Options are read via parse_command_line_parameters(**script_info).  The
    output file is opened before any filtering starts so that an unwritable
    destination is reported up front.

    Raises:
        ValueError: if the input fasta file is empty (zero bytes).
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Best effort: an OSError from mkdir is deliberately ignored here.  If
    # the directory is genuinely unusable, opening the output file below
    # reports it.
    try:
        mkdir(opts.output_dir)
    except OSError:
        pass

    input_basename = splitext(split(opts.input_fasta_file)[1])[0]

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The original message referenced an undefined name
        # (output_filepath), which raised NameError instead of this IOError.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    try:
        use_lane_mask = (opts.lane_mask_fp and
                         not opts.suppress_lane_mask_filter and
                         not opts.entropy_threshold)
        if use_lane_mask:
            # read the lane_mask, if one was provided
            if opts.verbose:
                # Parenthesised single argument prints identically under
                # the Python 2 print statement.
                print("Reading lane mask...")
            with open(opts.lane_mask_fp) as lane_mask_f:
                lane_mask = lane_mask_f.read().strip()
        else:
            lane_mask = None

        infile = open(opts.input_fasta_file, 'U')
        try:
            filtered = apply_lane_mask_and_gap_filter(
                infile, lane_mask, opts.allowed_gap_frac,
                verbose=opts.verbose,
                entropy_threshold=opts.entropy_threshold)

            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                filtered_aln = remove_outliers(filtered, opts.threshold)
                for seq in filtered_aln.Seqs:
                    outfile.write(seq.toFasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in filtered:
                    outfile.write(result)
        finally:
            infile.close()
    finally:
        # Close the output even when filtering raises part-way through.
        outfile.close()
def main():
    """Filter the input alignment and write it to <output_dir>/<name>_pfiltered.fasta.

    Options are read via parse_command_line_parameters(**script_info).  The
    output file is opened before any filtering starts so that an unwritable
    destination is reported up front.

    Unless lane-mask filtering is suppressed or an entropy threshold is
    given, the mask is read from opts.lane_mask_fp when that is set, and
    otherwise obtained from get_template_alignment_column_mask().

    Raises:
        ValueError: if the input fasta file is empty (zero bytes).
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Best effort: an OSError from mkdir is deliberately ignored here.  If
    # the directory is genuinely unusable, opening the output file below
    # reports it.
    try:
        mkdir(opts.output_dir)
    except OSError:
        pass

    input_basename = splitext(split(opts.input_fasta_file)[1])[0]

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The original message referenced an undefined name
        # (output_filepath), which raised NameError instead of this IOError.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    try:
        if not opts.suppress_lane_mask_filter and not opts.entropy_threshold:
            if opts.lane_mask_fp is not None:
                with open(opts.lane_mask_fp, 'U') as lane_mask_f:
                    lane_mask = lane_mask_f.read().strip()
            else:
                lane_mask = get_template_alignment_column_mask()
        else:
            lane_mask = None

        infile = open(opts.input_fasta_file, 'U')
        try:
            filtered = apply_lane_mask_and_gap_filter(
                infile, lane_mask, opts.allowed_gap_frac,
                entropy_threshold=opts.entropy_threshold)

            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                filtered_aln = remove_outliers(filtered, opts.threshold)
                for seq in filtered_aln:
                    outfile.write(seq.to_fasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in filtered:
                    outfile.write(result)
        finally:
            infile.close()
    finally:
        # Close the output even when filtering raises part-way through.
        outfile.close()
def test_apply_lane_mask_and_gap_filter_w_entropy_threshold(self):
    # Entropy-threshold filtering of self.aln1 with no lane mask and a gap
    # fraction argument of 1.0.

    # With entropy_threshold=0.0 the alignment is expected back unchanged.
    # The whole output is compared at once so a short or empty result
    # fails instead of silently skipping assertions.
    observed = list(apply_lane_mask_and_gap_filter(
        self.aln1, None, 1.0, entropy_threshold=0.0))
    self.assertEqual(observed, [line + '\n' for line in self.aln1])

    # filtering all positions results in a ValueError
    with self.assertRaises(ValueError):
        list(apply_lane_mask_and_gap_filter(
            self.aln1, None, 1.0, entropy_threshold=1.0))
def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
    # Runs the filter over a small alignment defined inline rather than
    # the shared self.aln1 fixture.
    aln = ['>ACT009', 'AACT-',
           '>ACT019', 'AACT-',
           '>ACT011', '-TCT-']

    def check(expected, *filter_args):
        # Compare the complete output with the complete expectation so a
        # short or empty result fails instead of passing vacuously.
        observed = list(apply_lane_mask_and_gap_filter(aln, *filter_args))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # no mask, gap fraction 1.0: alignment expected back unchanged
    check(aln, None, 1.0)

    # mask plus the default gap fraction
    check(['>ACT009', 'CT',
           '>ACT019', 'CT',
           '>ACT011', 'CT'],
          '00111')
def test_apply_lane_mask_and_gap_filter(self):
    """apply_lane_mask_and_gap_filter: functions as expected
    """
    def check(expected, *filter_args):
        # Compare the complete output with the complete expectation so a
        # short or empty result fails instead of passing vacuously.
        observed = list(
            apply_lane_mask_and_gap_filter(self.aln1, *filter_args))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # all-ones mask, or no mask, with gap fraction 1.0: unchanged alignment
    check(self.aln1, '111111', 1.0)
    check(self.aln1, None, 1.0)

    # gap filter only
    check(['>s1', 'ACC-T',
           '>s2', 'AC--T',
           '>s3', 'TCT-T',
           '>s4', 'ACG-T',
           '>s5', '---A-'],
          '111111')

    # lm filter only
    check(['>s1', 'CC--T',
           '>s2', 'C---T',
           '>s3', 'CT--T',
           '>s4', 'CG--T',
           '>s5', '--A--'],
          '011111', 1.0)

    # gap and lm filter
    check(['>s1', 'CC-T',
           '>s2', 'C--T',
           '>s3', 'CT-T',
           '>s4', 'CG-T',
           '>s5', '--A-'],
          '011111')
def test_apply_lane_mask_and_gap_filter(self):
    """apply_lane_mask_and_gap_filter: functions as expected
    """
    def assert_filtered(expected, lm, *gap_frac):
        # The output is collected in full before comparing; the previous
        # item-by-item loop passed when too few (or no) lines came back.
        observed = list(
            apply_lane_mask_and_gap_filter(self.aln1, lm, *gap_frac))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # nothing should be removed in these two cases
    assert_filtered(self.aln1, '111111', 1.0)
    assert_filtered(self.aln1, None, 1.0)

    # gap filter only
    assert_filtered(['>s1', 'ACC-T',
                     '>s2', 'AC--T',
                     '>s3', 'TCT-T',
                     '>s4', 'ACG-T',
                     '>s5', '---A-'],
                    '111111')

    # lm filter only
    assert_filtered(['>s1', 'CC--T',
                     '>s2', 'C---T',
                     '>s3', 'CT--T',
                     '>s4', 'CG--T',
                     '>s5', '--A--'],
                    '011111', 1.0)

    # gap and lm filter
    assert_filtered(['>s1', 'CC-T',
                     '>s2', 'C--T',
                     '>s3', 'CT-T',
                     '>s4', 'CG-T',
                     '>s5', '--A-'],
                    '011111')
def test_apply_lane_mask_and_gap_filter_w_precomputed_mask(self):
    # Covers mask-only, gap-only and combined filtering of self.aln1 using
    # masks supplied directly as strings.

    def check(expected, *filter_args):
        # Compare the complete output with the complete expectation so a
        # short or empty result fails instead of passing vacuously.
        observed = list(
            apply_lane_mask_and_gap_filter(self.aln1, *filter_args))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # all-ones mask, or no mask, with gap fraction 1.0: unchanged alignment
    check(self.aln1, '111111', 1.0)
    check(self.aln1, None, 1.0)

    # gap filter only
    check(['>s1', 'ACC-T',
           '>s2', 'AC--T',
           '>s3', 'TCT-T',
           '>s4', 'ACG-T',
           '>s5', '---A-'],
          '111111')

    # lm filter only
    check(['>s1', 'CC--T',
           '>s2', 'C---T',
           '>s3', 'CT--T',
           '>s4', 'CG--T',
           '>s5', '--A--'],
          '011111', 1.0)

    # gap and lm filter
    check(['>s1', 'CC-T',
           '>s2', 'C--T',
           '>s3', 'CT-T',
           '>s4', 'CG-T',
           '>s5', '--A-'],
          '011111')
def test_apply_lane_mask_and_gap_filter_invalid(self):
    # Supplying a lane mask together with an entropy threshold must be
    # rejected with ValueError.
    mask = '111111'

    def run_with_mask_and_threshold():
        # list() consumes the whole result, so an error raised either at
        # call time or while iterating is seen by assertRaises.
        return list(apply_lane_mask_and_gap_filter(
            self.aln1, mask, entropy_threshold=0.0))

    self.assertRaises(ValueError, run_with_mask_and_threshold)
def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
    # Runs the filter over a small alignment defined inline rather than
    # the shared self.aln1 fixture.
    aln = [
        '>ACT009', 'AACT-',
        '>ACT019', 'AACT-',
        '>ACT011', '-TCT-',
    ]

    def with_newlines(lines):
        # each expected line is compared with a trailing newline appended
        return [line + '\n' for line in lines]

    # No mask, gap fraction 1.0: alignment expected back unchanged.  The
    # full output list is compared so missing lines cannot go unnoticed.
    unmasked = list(apply_lane_mask_and_gap_filter(aln, None, 1.0))
    self.assertEqual(unmasked, with_newlines(aln))

    # Mask plus the default gap fraction.
    masked = list(apply_lane_mask_and_gap_filter(aln, '00111'))
    self.assertEqual(masked, with_newlines([
        '>ACT009', 'CT',
        '>ACT019', 'CT',
        '>ACT011', 'CT',
    ]))
def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
    """apply_lane_mask_and_gap_filter: functions as expected with alt aln
    """
    aln = ['>ACT009', 'AACT-',
           '>ACT019', 'AACT-',
           '>ACT011', '-TCT-']

    def check(expected, *filter_args):
        # Compare the complete output with the complete expectation so a
        # short or empty result fails instead of passing vacuously.
        observed = list(apply_lane_mask_and_gap_filter(aln, *filter_args))
        self.assertEqual(observed, [line + '\n' for line in expected])

    # no mask, gap fraction 1.0: alignment expected back unchanged
    check(aln, None, 1.0)

    # mask plus the default gap fraction
    check(['>ACT009', 'CT',
           '>ACT019', 'CT',
           '>ACT011', 'CT'],
          '00111')
def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
    """apply_lane_mask_and_gap_filter: functions as expected with alt aln
    """
    aln = ['>ACT009', 'AACT-',
           '>ACT019', 'AACT-',
           '>ACT011', '-TCT-']
    masked_expected = ['>ACT009', 'CT',
                       '>ACT019', 'CT',
                       '>ACT011', 'CT']

    # Each case pairs the expected lines with the arguments that follow
    # the alignment in the call.
    cases = [
        (aln, (None, 1.0)),
        (masked_expected, ('00111',)),
    ]
    for expected, filter_args in cases:
        # The full output is collected before comparing, so too few (or
        # zero) returned lines is reported as a failure.
        observed = list(apply_lane_mask_and_gap_filter(aln, *filter_args))
        self.assertEqual(observed, [line + '\n' for line in expected])
def test_apply_gap_filter_only(self):
    # Gap filtering with no lane mask (mask argument is None throughout),
    # over self.aln1 and a second alignment adapted from PyCogent's tests.

    def check(aln, expected, *gap_frac):
        # Compare the complete output with the complete expectation so a
        # short or empty result fails instead of passing vacuously.
        observed = list(
            apply_lane_mask_and_gap_filter(aln, None, *gap_frac))
        self.assertEqual(observed, [line + '\n' for line in expected])

    check(self.aln1, self.aln1, 1.0)

    # default gap fraction
    check(self.aln1,
          ['>s1', 'ACC-T',
           '>s2', 'AC--T',
           '>s3', 'TCT-T',
           '>s4', 'ACG-T',
           '>s5', '---A-'])

    # 0.75 and 0.40 are expected to give the same result
    for gap_frac in (0.75, 0.40):
        check(self.aln1,
              ['>s1', 'ACCT',
               '>s2', 'AC-T',
               '>s3', 'TCTT',
               '>s4', 'ACGT',
               '>s5', '----'],
              gap_frac)

    check(self.aln1,
          ['>s1', 'ACT',
           '>s2', 'ACT',
           '>s3', 'TCT',
           '>s4', 'ACT',
           '>s5', '---'],
          0.30)

    # filtering all positions results in a ValueError
    with self.assertRaises(ValueError):
        list(apply_lane_mask_and_gap_filter(self.aln1, None, 0.10))

    # the following tests were adapted from test_alignment.py in PyCogent
    aln = ['>a', '--A-BC-', '>b', '-CB-A--', '>c', '--D-EF-']

    # default should strip out cols that are 100% gaps
    check(aln, ['>a', '-ABC', '>b', 'CBA-', '>c', '-DEF'])

    # if allowed_gap_frac is 1, shouldn't delete anything
    check(aln, ['>a', '--A-BC-', '>b', '-CB-A--', '>c', '--D-EF-'], 1)

    # if allowed_gap_frac is 0, should strip out any cols containing gaps
    check(aln, ['>a', 'AB', '>b', 'BA', '>c', 'DE'], 0)

    # intermediate numbers should work as expected
    check(aln, ['>a', 'ABC', '>b', 'BA-', '>c', 'DEF'], 0.4)
    check(aln, ['>a', '-ABC', '>b', 'CBA-', '>c', '-DEF'], 0.7)
def test_apply_lane_mask_and_gap_filter_real(self):
    """apply_lane_mask_and_gap_filter: no error on full length seqs
    """
    # No error when applying to full-length sequence.
    # The result is consumed with list() so that, if the function
    # evaluates lazily, the filtering actually runs; previously the result
    # was bound to an unused local and never iterated.
    # NOTE(review): assumes self.aln2 / self.aln2_lm are a consistent
    # alignment and mask pair -- confirm against the fixture in setUp.
    list(apply_lane_mask_and_gap_filter(self.aln2, self.aln2_lm))
def test_apply_gap_filter_only(self):
    # Gap filtering with no lane mask (mask argument is None throughout),
    # over self.aln1 and a second alignment adapted from PyCogent's tests.

    def filtered(aln, *gap_frac):
        # Collect the complete output so that missing lines show up in
        # the comparison instead of being silently skipped.
        return list(apply_lane_mask_and_gap_filter(aln, None, *gap_frac))

    def terminated(lines):
        # each expected line is compared with a trailing newline appended
        return [line + '\n' for line in lines]

    self.assertEqual(filtered(self.aln1, 1.0), terminated(self.aln1))

    # default gap fraction
    self.assertEqual(filtered(self.aln1), terminated([
        '>s1', 'ACC-T',
        '>s2', 'AC--T',
        '>s3', 'TCT-T',
        '>s4', 'ACG-T',
        '>s5', '---A-',
    ]))

    self.assertEqual(filtered(self.aln1, 0.75), terminated([
        '>s1', 'ACCT',
        '>s2', 'AC-T',
        '>s3', 'TCTT',
        '>s4', 'ACGT',
        '>s5', '----',
    ]))

    self.assertEqual(filtered(self.aln1, 0.40), terminated([
        '>s1', 'ACCT',
        '>s2', 'AC-T',
        '>s3', 'TCTT',
        '>s4', 'ACGT',
        '>s5', '----',
    ]))

    self.assertEqual(filtered(self.aln1, 0.30), terminated([
        '>s1', 'ACT',
        '>s2', 'ACT',
        '>s3', 'TCT',
        '>s4', 'ACT',
        '>s5', '---',
    ]))

    # filtering all positions results in a ValueError
    with self.assertRaises(ValueError):
        list(apply_lane_mask_and_gap_filter(self.aln1, None, 0.10))

    # the following tests were adapted from test_alignment.py in PyCogent
    aln = [
        '>a', '--A-BC-',
        '>b', '-CB-A--',
        '>c', '--D-EF-',
    ]

    # default should strip out cols that are 100% gaps
    self.assertEqual(filtered(aln), terminated([
        '>a', '-ABC',
        '>b', 'CBA-',
        '>c', '-DEF',
    ]))

    # if allowed_gap_frac is 1, shouldn't delete anything
    self.assertEqual(filtered(aln, 1), terminated([
        '>a', '--A-BC-',
        '>b', '-CB-A--',
        '>c', '--D-EF-',
    ]))

    # if allowed_gap_frac is 0, should strip out any cols containing gaps
    self.assertEqual(filtered(aln, 0), terminated([
        '>a', 'AB',
        '>b', 'BA',
        '>c', 'DE',
    ]))

    # intermediate numbers should work as expected
    self.assertEqual(filtered(aln, 0.4), terminated([
        '>a', 'ABC',
        '>b', 'BA-',
        '>c', 'DEF',
    ]))
    self.assertEqual(filtered(aln, 0.7), terminated([
        '>a', '-ABC',
        '>b', 'CBA-',
        '>c', '-DEF',
    ]))
def test_apply_lane_mask_and_gap_filter_real(self):
    # No error when applying to full-length sequence.
    # The result is consumed with list() so that, if the function
    # evaluates lazily, the filtering actually runs; previously the result
    # was bound to an unused local and never iterated.
    # NOTE(review): assumes self.aln2 / self.aln2_lm are a consistent
    # alignment and mask pair -- confirm against the fixture in setUp.
    list(apply_lane_mask_and_gap_filter(self.aln2, self.aln2_lm))
def test_apply_lane_mask_and_gap_filter_invalid(self):
    # A call that provides both a mask and an entropy threshold is invalid
    # and is expected to raise ValueError.
    conflicting_kwargs = {'entropy_threshold': 0.0}

    def consume_invalid_call():
        # list() drains the result, so the error is observed whether it is
        # raised when the function is called or while it is iterated.
        return list(apply_lane_mask_and_gap_filter(
            self.aln1, '111111', **conflicting_kwargs))

    self.assertRaises(ValueError, consume_invalid_call)