class SegCheck(unittest.TestCase):

    mMasker = Masker.MaskerSeg()

    def testEmpty(self):
        """test empty input."""
        self.assertEqual(self.mMasker(""), "")

    def testProtein(self):
        """test protein input."""
        self.assertEqual(
            self.mMasker("ACDEFGHIKLWWWWWWWWWWWWWWwwwwwwwwwwwacdefghikl"),
            "ACDEFGHIKLXXXXXXXXXXXXXXxxxxxxxxxxxacdefghikl")

    def testCoding(self):
        """test coding sequence input."""
        self.assertEqual(
            self.mMasker("GCCTGCGACGAGTTCGGCCACATCAAGCT"
                         "GTGGTGGTGGTGGTGGTGGTGGTGGTGGT"
                         "GGTGGTGGTGGTGGTGGTGGTGGTGGTGG"
                         "tggtggtggtggtggtgggcctgcgacga"
                         "gttcggccacatcaagctg"),
            "GCCTGCGACGAGTTCGGCCACATCAAGCT"
            "GNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
            "NNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
            "nnnnnnnnnnnnnnnnnngcctgcgacga"
            "gttcggccacatcaagctg")
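# The expected strings above encode the masking convention used throughout:
# masked stretches keep their case but are overwritten with 'X'/'x' for
# protein and 'N'/'n' for nucleotide sequence. A small case-preserving helper
# sketching that convention (mask_span is illustrative only and not part of
# the original code):
def mask_span(seq, start, end, mask_char="x"):
    # hard-mask seq[start:end], preserving the case of each residue
    out = list(seq)
    for i in range(start, end):
        out[i] = mask_char.upper() if seq[i].isupper() else mask_char.lower()
    return "".join(out)

assert mask_span("ACDEFwwwwwWWWWW", 5, 15) == "ACDEFxxxxxXXXXX"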
def maskSequences(sequences, masker=None):
    '''return a list of masked sequences.

    *masker* can be one of

    dust/dustmasker
       * run dustmasker on sequences
    softmask
       * use softmask to hardmask sequences
    '''

    if masker in ("dust", "dustmasker"):
        masker_object = Masker.MaskerDustMasker()
    else:
        masker_object = None

    if masker == "softmask":
        # the genome sequence is repeat soft-masked
        masked_seq = sequences
    elif masker in ("dust", "dustmasker"):
        # run dust
        masked_seq = masker_object.maskSequences(
            [x.upper() for x in sequences])
    elif masker is None:
        masked_seq = [x.upper() for x in sequences]
    else:
        raise ValueError("unknown masker %s" % masker)

    # hard mask softmasked characters
    masked_seq = [re.sub("[a-z]", "N", x) for x in masked_seq]

    return masked_seq
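# The final re.sub call above is what turns soft-masked (lowercase) residues
# into hard-masked ones. A self-contained illustration of just that step,
# with made-up input (no Masker import required):
import re

softmasked = "ACGTacgtacgtACGT"          # repeats soft-masked to lowercase
hardmasked = re.sub("[a-z]", "N", softmasked)
assert hardmasked == "ACGTNNNNNNNNACGT"  # lowercase residues become N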
def maskMali(mali, method="seg"):
    """mask a multiple alignment using an external masker.
    """
    if method == "seg":
        masker = Masker.MaskerSeg()
    elif method == "bias":
        masker = Masker.MaskerBias()
    elif method == "random":
        masker = Masker.MaskerRandom()

    if mali.getAlphabet() == "na" and method in ("seg", "bias"):
        for id, s in mali.items():
            ss = Genomics.TranslateDNA2Protein(s.mString)
            mss = masker(ss)
            columns = []
            for x in range(0, len(mss)):
                if mss[x] in ("X", "x"):
                    # protein position x corresponds to the three
                    # nucleotide columns of its codon
                    columns += range(x * 3, x * 3 + 3)
            mali.getEntry(id).maskColumns(columns)
    else:
        for id, s in mali.items():
            mali[id].mString = masker(s.mString)
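# The only subtle step above is mapping a masked position in the translated
# (protein) sequence back to alignment columns: protein position x covers the
# three nucleotide columns of its codon. A standalone sketch of that
# expansion, without the Mali classes (values are illustrative):
masked_protein_positions = [2, 5]

columns = []
for x in masked_protein_positions:
    columns.extend(range(x * 3, x * 3 + 3))

assert columns == [6, 7, 8, 15, 16, 17]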
def maskSequences(sequences, masker):

    if masker == "repeatmasker":
        # the genome sequence is repeat masked
        masked_seq = sequences
    elif masker == "dust":
        masker_object = Masker.MaskerDustMasker()
        masked_seq = [masker_object(x.upper()) for x in sequences]
    else:
        masked_seq = [x.upper() for x in sequences]

    # hard mask softmasked characters
    masked_seq = [re.sub("[a-z]", "N", x) for x in masked_seq]

    return masked_seq
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.Stop()
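# In "leftright" mode each interval yields two flanks of the same length as
# the interval itself, clamped to the contig. The arithmetic is easier to see
# in isolation (coordinates below are made up for the sketch):
start, end, lcontig = 1000, 1200, 5000
l = end - start

left = (max(0, start - l), end - l)          # upstream flank: (800, 1000)
right = (start + l, min(lcontig, end + l))   # downstream flank: (1200, 1400)

assert left == (800, 1000) and right == (1200, 1400)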
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases [%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatible"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features has in common the same transcript ID, in the
    # case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" %
                             (name, contig, strand,
                              ";".join(["%i-%i" % x for x in out]),
                              seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
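# The --extend-at handling above only touches the first and/or last interval
# and clamps the result to the contig. A standalone sketch of the
# "5"/"3"/"both" cases with illustrative values:
extend_by = 50
lcontig = 10000
intervals = [(100, 200), (300, 400)]

extend_at = "both"
if extend_at in ("5", "both"):
    intervals[0] = (max(0, intervals[0][0] - extend_by), intervals[0][1])
if extend_at in ("3", "both"):
    intervals[-1] = (intervals[-1][0],
                     min(lcontig, intervals[-1][1] + extend_by))

assert intervals == [(50, 200), (300, 450)]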
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("translate",
                 "translate-to-stop",
                 "truncate-at-stop",
                 "back-translate",
                 "mark-codons",
                 "apply-map",
                 "build-map",
                 "pseudo-codons",
                 "filter",
                 "interleaved-codons",
                 "map-codons",
                 "remove-gaps",
                 "mask-seg",
                 "mask-bias",
                 "mask-codons",
                 "mask-incomplete-codons",
                 "mask-stops",
                 "mask-soft",
                 "remove-stops",
                 "upper",
                 "lower",
                 "reverse-complement",
                 "sample",
                 "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x", "--ignore-errors", dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion", dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern", dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern", dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method", dest="filter_methods",
                      type="string", action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t", "--sequence-type", dest="type", type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l", "--template-identifier", dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [line[:-1] for line in
                              IOTools.openFile(f.split("=")[1])]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError(
                "length of sequence %s not divisible by 3" % (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = " ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace
                # N with lowercase sequence from unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                sequence = " ".join([sequence[x:x + 3]
                                     for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3
                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))

        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
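# Most method branches above walk the sequence in codon-sized steps. A
# self-contained sketch of the "truncate-at-stop" idea, using a plain set of
# standard stop codons instead of Genomics.IsStopCodon (an assumption made
# only to keep the example standalone):
STOP_CODONS = {"TAA", "TAG", "TGA"}

def truncate_at_stop(sequence):
    # walk codon by codon and stop at the first stop codon
    seq = []
    for x in range(0, len(sequence), 3):
        codon = sequence[x:x + 3]
        if codon.upper() in STOP_CODONS:
            break
        seq.append(codon)
    return "".join(seq)

assert truncate_at_stop("ATGAAATAGCCC") == "ATGAAA"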
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge-adjacent", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes."
        " [default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'."
        " If set to the empty string, all entries are output "
        "[%default].")

    parser.add_option(
        "-f", "--maskregions-bed-file", dest="filename_masks",
        type="string", metavar="gff",
        help="mask sequences with regions given in gff file "
        "[%default].")

    parser.add_option(
        "--remove-masked-regions", dest="remove_masked_regions",
        action="store_true",
        help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-interval-length", dest="min_length", type="int",
        help="set minimum length for sequences output "
        "[%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output "
        "[%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--extend-with", dest="extend_with", type="string",
        help="extend using base [default=%default]")

    parser.add_option(
        "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker [%default].")

    parser.add_option(
        "--fold-at", dest="fold_at", type="int",
        help="fold sequence every n bases [%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatible"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # for item in iterator:
    #     print len(item)  # 3, 2
    #     for i in item:
    #         print len(i)  # 9, 9, 9, 9, 9
    #         print i.contig
    #         print i.strand
    #         print i.transcript_id

    # iterator is a list containing groups (lists) of features.
    # Each group of features has in common the same transcript ID, in the
    # case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" %
                             (name, contig, strand,
                              ";".join(["%i-%i" % x for x in out]),
                              seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
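# The --fold-at option simply wraps the concatenated sequence at a fixed line
# width before writing the FASTA record. In isolation:
s = "ACGTACGTACGT"
n = 5
folded = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
assert folded == "ACGTA\nCGTAC\nGT"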
class DustMaskerCheck(unittest.TestCase):

    mMasker = Masker.MaskerDustMasker()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or "
                      "5only are set, only the added sequence is returned "
                      "[default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand",
                      action="store_false",
                      help="use strand information and return reverse "
                      "complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.Stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.stop()
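# The "segments" mode relies on Bed.toIntervals() to turn a bed12 record's
# blocks into genomic intervals. The underlying arithmetic, sketched without
# the Bed class (per the BED12 convention, blockStarts are relative to the
# interval start; values below are made up):
start = 1000
block_sizes = [100, 50]
block_starts = [0, 200]

intervals = [(start + bs, start + bs + sz)
             for bs, sz in zip(block_starts, block_sizes)]

assert intervals == [(1000, 1100), (1200, 1250)]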
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or "
                      "5only are set, only the added sequence is returned "
                      "[default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand",
                      action="store_false",
                      help="use strand information and return reverse "
                      "complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f", "--filename-masks", dest="filename_masks", type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length", dest="min_length", type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        masker=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # for item in iterator:
    #     print len(item)  # 3, 2
    #     for i in item:
    #         print len(i)  # 9, 9, 9, 9, 9
    #         print i.contig
    #         print i.strand
    #         print i.transcript_id

    # iterator is a list containing groups (lists) of features.
    # Each group of features has in common the same transcript ID, in the
    # case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from %s:%i..%i - %s" %
                   (ichunk[0].contig, ichunk[0].start, ichunk[0].end,
                    str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)
        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# skipped because length out of bounds "
                    "%s: regions=%s len=%i\n" % (name, str(intervals), l))
            continue

        options.stdout.write(">%s %s:%s:%s\n%s\n" %
                             (name, contig, strand,
                              ";".join(["%i-%i" % x for x in out]),
                              "\n".join(s)))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
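# Mask lookup in the scripts above uses bx-python interval intersecters. A
# minimal usage sketch mirroring that code (assumes the bx-python package is
# installed; coordinates are illustrative):
from bx.intervals.intersection import Intersecter, Interval

intersector = Intersecter()
for start, end in [(100, 200), (500, 600)]:
    intersector.add_interval(Interval(start, end))

# all mask regions overlapping the query interval [150, 550)
hits = sorted((x.start, x.end) for x in intersector.find(150, 550))
assert hits == [(100, 200), (500, 600)]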