def main(argv=None):
    """Split the records of a FASTA file over several output files.

    Parses command line options in sys.argv, unless *argv* is given.

    For every record the identifier is taken from the first capture
    group of ``--input-pattern`` applied to the record title (or the
    whole title if no pattern is set). If a ``--map`` file is given the
    identifier is translated through it and records whose identifier is
    not in the map are dropped. The record is then passed to
    ``Write(identifier, seq)`` of a ``FilesChunks`` object (when
    ``--num-sequences`` is set) or a ``Files`` object. With
    ``--min-size`` the writer's ``DeleteFiles()`` is called after all
    input has been read.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i", "--input-pattern", dest="input_pattern", type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o", "--output-filename-pattern", dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n", "--num-sequences", dest="num_sequences", type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    # a boolean flag takes no value, hence no metavar
    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern=r"^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    if options.input_filename:
        infile = IOTools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    map_id2filename = {}
    if options.map_filename:
        with open(options.map_filename, "r") as map_file:
            map_id2filename = IOTools.ReadMap(map_file)

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    rx = re.compile(options.input_pattern) if options.input_pattern else None

    ninput = 0
    noutput = 0

    for seq in FastaIterator.iterate(infile):
        ninput += 1

        if rx:
            match = rx.search(seq.title)
            if match is None:
                print("# parsing error in description line %s" % (seq.title))
                # Skip the record: writing it would file it under the
                # identifier of the previous record (or None).
                continue
            identifier = match.groups()[0]
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier not in map_id2filename:
                continue
            identifier = map_id2filename[identifier]

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
def main(argv=None):
    """Rewrite FASTA header lines read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    The identifier of each header line (text after ``>`` up to the first
    ``/``, space or tab) is split on ``options.separator``. The first
    field is the species; with two fields the second is the gene, with
    three or more the second is the transcript and the third the gene.
    The species is optionally translated through the ``--map`` file.
    Headers are written as ``>transcript_species GENEID=gene`` or
    ``>species_gene``; all other lines are copied unchanged.

    Headers that cannot be parsed into at least two fields are counted
    as errors and copied unchanged.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-m", "--map", dest="filename_map", type="string",
        help="filename with mapping of species ids to swissprot species ids.")

    parser.set_defaults(
        separator="|",
        filename_map=None,
    )

    (options, args) = E.start(parser, argv=argv)

    # always bound, so the check in the loop works without --map
    map_species2sp = {}
    if options.filename_map:
        with open(options.filename_map, "r") as map_file:
            map_species2sp = IOTools.ReadMap(map_file)

    header_rx = re.compile(">([^/ \t]+)")

    ninput, noutput, nerrors = 0, 0, 0

    for line in sys.stdin:
        if not line.startswith(">"):
            options.stdout.write(line)
            continue

        ninput += 1

        match = header_rx.match(line.rstrip("\n"))
        data = match.group(1).split(options.separator) if match else []

        if len(data) < 2:
            # no gene can be derived from this header
            nerrors += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# could not parse header %s\n" % line.rstrip("\n"))
            options.stdout.write(line)
            continue

        species = data[0]
        if len(data) == 2:
            gene = data[1]
            transcript = None
        else:
            gene = data[2]
            transcript = data[1]

        if map_species2sp:
            try:
                species = map_species2sp[species]
            except KeyError:
                # keep the unmapped species name, but record the failure
                nerrors += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# could not map species %s\n" % species)

        if transcript:
            options.stdout.write(
                ">%s_%s GENEID=%s\n" % (transcript, species, gene))
        else:
            options.stdout.write(">%s_%s\n" % (species, gene))
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nerrors=%i\n" %
                             (ninput, noutput, nerrors))

    E.stop()
def main(argv=None):
    """Compare two lists of values with a statistical test run through R.

    Parses command line options in sys.argv, unless *argv* is given.

    Values are read with ``IOTools.ReadList`` from ``--infile1`` and
    ``--infile2``, optionally translating categories through
    ``--infile-map``. Depending on ``--method`` either ``R.ks_test``
    (``ks``) or ``R.wilcox_test`` (``mwu``) is applied. A boxplot, a
    quantile-quantile plot and two histograms are drawn, and the
    ``p.value``, ``statistic``, ``alternative`` and ``method`` entries
    of the test result are printed.

    Raises:
        ValueError: if the method is unknown or an input filename is
            missing.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")

    parser.add_option("-1", "--infile1", dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")

    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")

    parser.add_option("-p", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    # Fail early with a clear message instead of a NameError on `result`
    # after all plots have been drawn.
    if options.method not in ("ks", "mwu"):
        raise ValueError("unknown method %s" % options.method)

    if not options.filename_input1 or not options.filename_input2:
        raise ValueError("please supply both --infile1 and --infile2")

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as map_file:
            map_category2value = IOTools.ReadMap(map_file,
                                                 map_functions=(str, float))

    with open(options.filename_input1, "r") as infile1:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_category=map_category2value)
    with open(options.filename_input2, "r") as infile2:
        values2, errors2 = IOTools.ReadList(infile2,
                                            map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        # NOTE(review): the device is never explicitly closed here;
        # presumably it is flushed when R shuts down - confirm.
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    else:
        result = R.wilcox_test(values1, values2, paired=False)

    # make the values available to the R snippets below
    R.assign("v1", values1)
    R.assign("v2", values2)

    # 2 x 2 grid of panels, filled row by row
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for key in ['p.value', 'statistic', 'alternative', 'method']:
        print(key, result[key])

    E.stop()
def main(argv=None):
    """Apply a chain of manipulation methods to FASTA records.

    Parses command line options in sys.argv, unless *argv* is given.

    Records are read from ``options.stdin``. Records can be dropped by
    ``--include-pattern`` / ``--exclude-pattern``, by sampling
    (``--method=sample`` with ``--sample-proportion``) or by
    ``--filter-method`` (``min-length=N``, ``max-length=N``,
    ``id-file=FILE``). Each remaining record is passed through every
    ``--method`` in the order given and written to ``options.stdout``.

    Methods that need an auxiliary file (``apply-map``, ``map-codons``,
    ``mask-soft``, ``mask-codons``/``back-translate``) consume filenames
    from the comma-separated ``--parameters`` stack, in that order. For
    ``build-map`` the identifier map is written at the end to the file
    named by the next parameter, or to stdout if that is empty.

    Raises:
        ValueError: on inconsistent input (sequence length not a
            multiple of three, mismatching titles or lengths, duplicate
            identifiers, missing sample proportion, exhausted auxiliary
            stream).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("translate",
                 "translate-to-stop",
                 "truncate-at-stop",
                 "back-translate",
                 "mark-codons",
                 "apply-map",
                 "build-map",
                 "pseudo-codons",
                 "filter",
                 "interleaved-codons",
                 "map-codons",
                 "remove-gaps",
                 "mask-seg",
                 "mask-bias",
                 "mask-codons",
                 "mask-incomplete-codons",
                 "mask-stops",
                 "mask-soft",
                 "remove-stops",
                 "upper",
                 "lower",
                 "reverse-complement",
                 "sample",
                 "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p", "--parameters", dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x", "--ignore-errors", dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion", dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern", dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern", dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method", dest="filter_methods",
                      type="string", action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t", "--sequence-type", dest="type", type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l", "--template-identifier", dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.start(parser, argv=argv)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    # leading run of non-whitespace characters in a title
    rx_identifier = re.compile(r"^(\S+)")

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_seq2nid = IOTools.ReadMap(inf)
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_codon2code = IOTools.ReadMap(inf)
        del options.parameters[0]

    # The two auxiliary FASTA streams below are consumed in lockstep
    # with the main input, so their files must stay open for the whole
    # run.
    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or \
            "back-translate" in options.methods:
        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]
        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            # a set gives constant-time membership tests per record
            with IOTools.open_file(f.split("=")[1]) as inf:
                filter_id_list = set(line[:-1] for line in inf)

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError(
                "length of sequence %s not divisible by 3" % (title))

    for cur_record in iterator:
        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or
                cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            # Re-measure for every method: an earlier method in the
            # chain may have changed the length of the sequence.
            l = len(sequence)

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # length without gap characters
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        # moves on to the next method for this record
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" % (
                            cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for y in c:
                                if y in options.gap_chars:
                                    new_sequence.append(y)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # trailing characters that did not complete a codon
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    # no more hard masked records: stop applying methods
                    # to this record
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" %
                        (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    # only replace the sequence if there was something
                    # to mask; identical sequences are kept as they are
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                sequence = "".join(
                    s for s in sequence if s not in options.gap_chars)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                seq_id = rx_identifier.match(cur_record.title).groups()[0]
                if seq_id in map_seq2nid:
                    rest = cur_record.title[len(seq_id):]
                    cur_record.title = map_seq2nid[seq_id] + rest

            elif method == "build-map":
                # build a map of identifiers
                seq_id = rx_identifier.match(cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if seq_id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        seq_id)
                map_seq2nid[seq_id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for start in range(0, l, 3):
                    nm = len([c for c in seq[start:start + 3]
                              if c in mask_chars])
                    # partially masked codon: mask it completely
                    if 0 < nm < 3:
                        seq[start:start + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    other_record = None

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" %
                        (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i" %
                        (cur_record.title, len(other_sequence) * 3,
                         len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        # keep the case of the amino acid mask character
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.open_file(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.stop()
def main(argv=None):
    """Write a table of per-sequence properties for FASTA input.

    Parses command line options in sys.argv, unless *argv* is given.

    For every record read from ``options.stdin`` one row is written to
    ``options.stdout``: the identifier (first capture group of
    ``--regex-identifier`` applied to the title, optionally cut at the
    first whitespace) followed by the fields of one
    ``SequenceProperties`` counter per requested ``--section``. With
    ``--add-total`` a final ``total`` row with the accumulated counters
    is appended.

    If ``--weights-tsv-file`` is given, the codon usage tables are
    loaded (``uniform`` selects ``Genomics.GetUniformCodonUsage()``) and
    the pairwise differences between them are written to
    ``options.stdlog``.

    Raises:
        ValueError: for an empty sequence, an unknown section or
            sequence type, or a title the identifier regex does not
            match.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w", "--weights-tsv-file", dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s", "--section", dest="sections", type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa",
                               "cpg", "dn", "degeneracy", "gaps",
                               "codons", "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t", "--sequence-type", dest="seqtype", type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e", "--regex-identifier", dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier", dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total", dest="add_total", action="store_true",
        help="add a row with column totals at the end of the table"
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                with IOTools.open_file(filename, "r") as inf:
                    reference_codons.append(
                        IOTools.ReadMap(inf,
                                        has_header=True,
                                        map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)
                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y],
                                      d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """Return a new SequenceProperties counter for *section*.

        Which sections are available depends on ``options.seqtype``.
        """
        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections requires sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(
                    reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        else:
            # guard against returning an unbound name
            raise ValueError("unknown sequence type %s" % options.seqtype)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    # NOTE(review): this counter is loaded with a dummy sequence and
    # then discarded; kept in case loading has a side effect - confirm
    # whether it can be removed.
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        match = rx.search(cur_record.title)
        if match is None:
            raise ValueError(
                "could not extract identifier from %s with pattern %s" %
                (cur_record.title, options.regex_identifier))
        identifier = match.groups()[0]

        if options.split_id:
            options.stdout.write("%s" % identifier.split()[0])
        else:
            options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write(
                "\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.stop()
def main(argv=None):
    """Convert alignments against junction sequences into spliced alignments.

    Parses command line options in sys.argv, unless *argv* is given.

    BAM records are read from stdin and grouped by query name. For each
    group only the reads with the lowest value of the mismatch tag
    (``CM`` with ``--colour``, ``NM`` otherwise) are considered; groups
    that are unmapped, ambiguous or whose best read has more than one
    CIGAR operation are dropped. The reference name of the remaining
    read is parsed as
    ``contig|first_exon_start|first_exon_end-last_exon_start|last_exon_end|splice|strand``
    and the read is rewritten as match / reference-skip / match on
    ``contig`` before being written as BAM to stdout. Reads lying
    entirely within one exon are dropped.

    The output header is taken from ``--template-bam-file`` or built
    from the names and sizes in ``--contigs-tsv-file``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--template-bam-file",
                      dest="filename_genome_bam",
                      type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s", "--contigs-tsv-file",
                      dest="filename_contigs",
                      type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i", "--ignore-mismatches",
                      dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs",
        dest="remove_contigs",
        type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f", "--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique",
                      dest="unique",
                      action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    genomefile = None
    referencenames = None
    referencelengths = None

    if options.filename_genome_bam:
        genomefile = pysam.AlignmentFile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigs = IOTools.ReadMap(IOTools.open_file(options.filename_contigs))
        # transpose (name, size) pairs into a names row and a sizes row
        columns = list(zip(*contigs.items()))
        referencenames = columns[0]
        referencelengths = [int(size) for size in columns[1]]
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-", "wb", template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    tag = "CM" if options.colour_mismatches else "NM"

    ninput = nunmapped = nambiguous = ncigar = nfull = noutput = 0

    contig2tid = {name: tid for tid, name in enumerate(outfile.references)}

    for qname, readgroup in itertools.groupby(infile,
                                              lambda aln: aln.qname):
        ninput += 1
        candidates = list(readgroup)

        if candidates[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        scores = [aln.opt(tag) for aln in candidates]
        best = min(scores)
        candidates = [aln for aln, score in zip(candidates, scores)
                      if score == best]

        if len(candidates) > 1:
            nambiguous += 1
            continue

        read = candidates[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        tags = dict(read.tags)
        tags['NH'] = 1
        read.tags = list(tags.items())

        sname = infile.getrname(read.tid)

        (contig, first_exon_start, middle,
         last_exon_end, splice, strand) = sname.split("|")
        first_exon_end, last_exon_start = middle.split("-")

        first_exon_start = int(first_exon_start)
        first_exon_end = int(first_exon_end) + 1
        last_exon_start = int(last_exon_start)
        last_exon_end = int(last_exon_end)

        # bases of the read falling into the first and the last exon
        match1 = (first_exon_end - first_exon_start) - read.pos
        match2 = read.qlen - match1
        intron_length = last_exon_start - first_exon_end

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Compute chi-squared statistics on one or more matrices from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Lines starting with ``#`` are ignored. Lines starting with ``>``
    separate matrices and name them; without any such line the whole
    input is read as a single matrix and no test name column is
    written.

    * ``chi-squared``: ``Stats.doChiSquaredTest`` is applied to the row
      pairs selected by ``--iteration`` (``pairwise`` or
      ``all-vs-all``); pairs for which it raises ValueError are counted
      as skipped.
    * ``pearson-chi-squared``: ``Stats.doPearsonChiSquaredTest`` is
      applied to every column of a two-row matrix. The first
      ``--parameters`` entry is a probability or, for named matrices,
      a file mapping matrix names to probabilities.

    Raises:
        ValueError: if no parameter is supplied for
            ``pearson-chi-squared`` or the matrix does not have two
            rows.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("chi-squared", "pearson-chi-squared"),
                      help="statistical methods to apply.")

    parser.add_option("-t", "--header-names", dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers", dest="headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option("-p", "--parameters", dest="parameters",
                      action="append", type="string",
                      help="parameters for various functions.")

    parser.add_option("-a", "--iteration", dest="iteration", type="choice",
                      choices=("pairwise", "all-vs-all"),
                      help="""how to compute stats [%default].""")

    parser.set_defaults(
        method="chi-squared",
        headers=True,
        value_format="%6.4f",
        pvalue_format="%6.4e",
        input_format="full",
        write_separators=True,
        parameters=[],
        iteration=None,
    )

    (options, args) = E.start(parser, argv=argv)

    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    # indices of the lines that start a new matrix
    chunks = [index for index, line in enumerate(lines) if line[0] == ">"]

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    ninput, noutput, nskipped = 0, 0, 0

    if options.write_separators:
        options.stdout.write("test\t")

    header_prefix = ""

    if options.method == "chi-squared":
        header_prefix = "observed\texpected"
        options.stdout.write("\t".join(
            (header_prefix, "n", "min", "max", "chi", "df", "P",
             "passed", "phi")) + "\n")

    elif options.method in ("pearson-chi-squared", ):
        options.stdout.write("column\t")
        # NOTE(review): header_prefix is empty here, so the header has
        # an empty column that the data rows below do not have; left
        # unchanged so existing consumers are unaffected - confirm.
        options.stdout.write("\t".join(
            (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P",
             "passed", "phi")) + "\n")

        if len(options.parameters) == 0:
            raise ValueError(
                "out of parameters - please supply probability or filename with probabilities.")

        param = options.parameters[0]
        del options.parameters[0]

        if options.write_separators:
            with IOTools.open_file(param, "r") as inf:
                probabilities = IOTools.ReadMap(inf,
                                                map_functions=(str, float))
        else:
            probability = float(param)

    for x in range(len(chunks) - 1):
        ninput += 1
        matrix, row_headers, col_headers = MatlabTools.readMatrix(
            StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])),
            format=options.input_format,
            headers=options.headers)
        nrows, ncols = matrix.shape

        if options.loglevel >= 2:
            options.stdlog.write(
                "# read matrix: %i x %i, %i row titles, %i colum titles.\n" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        # name of the current matrix: separator line without ">" and
        # its last character
        if options.write_separators:
            options.stdout.write(lines[chunks[x]][1:-1] + "\t")

        pairs = []
        if options.iteration == "pairwise":
            pairs = [(row1, row2)
                     for row1 in range(0, len(row_headers))
                     for row2 in range(row1 + 1, len(row_headers))]
        elif options.iteration == "all-vs-all":
            pairs = [(row1, row2)
                     for row1 in range(0, len(row_headers))
                     for row2 in range(0, len(row_headers))
                     if row1 != row2]

        if options.method == "chi-squared":

            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest(
                        numpy.vstack((matrix[row1], matrix[row2])))
                except ValueError:
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write("\t".join((
                    "%s" % row_header1,
                    "%s" % row_header2,
                    "%i" % result.mSampleSize,
                    "%i" % min(matrix.flat),
                    "%i" % max(matrix.flat),
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)) + "\n")

        elif options.method == "pearson-chi-squared":

            if nrows != 2:
                raise ValueError("only implemented for 2xn table")

            if options.write_separators:
                name = re.match(r"(\S+)", lines[chunks[x]][1:-1]).groups()[0]
                probability = probabilities[name]

            for col in range(ncols):
                options.stdout.write("%s\t" % col_headers[col])
                result = Stats.doPearsonChiSquaredTest(
                    probability, sum(matrix[:, col]), matrix[0, col])
                options.stdout.write("\t".join((
                    "%i" % result.mSampleSize,
                    "%f" % probability,
                    "%i" % result.mObserved,
                    "%f" % result.mExpected,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)))
                # start the next row, repeating the matrix name
                if col < ncols - 1:
                    options.stdout.write("\n")
                    if options.write_separators:
                        options.stdout.write(lines[chunks[x]][1:-1] + "\t")

            # terminate the last row of this matrix
            options.stdout.write("\n")

    E.info("# ninput=%i, noutput=%i, nskipped=%i\n" %
           (ninput, noutput, nskipped))

    E.stop()
def main(argv=None):
    """Compare two clusterings and report how their clusters correspond.

    Parses command line options in sys.argv, unless *argv* is given.

    Exactly two positional arguments are expected: the filenames of two
    identifier-to-cluster maps. A bipartite graph is built with one node
    per cluster, ``(1, cluster)`` for the first file and ``(2, cluster)``
    for the second, and an edge wherever two clusters share a member.
    Three tables are written to the files returned by ``getFile``:

    * ``components``: one row per connected component with its size and
      the clusters of each clustering it contains.
    * ``counts``: how often each ``(n1, n2)`` component shape occurs,
      with percentages relative to the number of clusters in each file.
    * ``subsets``: for each 1:1 component, the sizes of the union,
      intersection and set differences of the two member sets.

    A summary of the 1:1 components is written to ``options.stdout``.
    Percentages whose denominator is zero are reported as ``na``.

    Raises:
        ValueError: if the number of positional arguments is not two.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-o", "--output-filename-pattern",
                      dest="output_pattern", type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    # NOTE(review): argv is not forwarded to E.start here; confirm whether
    # E.start reads sys.argv itself or should receive argv explicitly.
    (options, args) = E.start(parser, add_pipe_options=True)

    if len(args) != 2:
        raise ValueError("please supply to filenames with the clusters")

    def read_clusters(filename):
        """Read a cluster map in both directions and close the file."""
        infile = IOTools.open_file(filename)
        try:
            return IOTools.ReadMap(infile, both_directions=True)
        finally:
            infile.close()

    def percent(count, total):
        """Format count/total as a percentage; 'na' if total is zero."""
        if total == 0:
            return "na"
        return options.format % (100.0 * count / total)

    def finish(outfile):
        """Close a dedicated output file, or terminate a stdout section."""
        if outfile != options.stdout:
            outfile.close()
        else:
            outfile.write("//\n")

    # the id -> cluster direction is only needed for the second file
    _, map_cluster2ids1 = read_clusters(args[0])
    map_id2cluster2, map_cluster2ids2 = read_clusters(args[1])

    graph = networkx.Graph()

    for a in map_cluster2ids1:
        graph.add_node((1, a))
    for b in map_cluster2ids2:
        graph.add_node((2, b))

    # build graph between clusters: connect clusters sharing a member
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    #######################################################
    # write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")

    counts = {}
    # (component id, cluster in file 1, cluster in file 2) per 1:1 component
    subsets = []

    # connected_components may yield sets lazily, so everything needed
    # later is collected in this single pass instead of indexing into the
    # result afterwards.
    for component_id, component in enumerate(
            networkx.connected_components(graph), 1):

        # sorted so that the output does not depend on set ordering
        m1 = sorted(x[1] for x in component if x[0] == 1)
        m2 = sorted(x[1] for x in component if x[0] != 1)

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)

        cc = (n1, n2)
        counts[cc] = counts.get(cc, 0) + 1

        if cc == (1, 1):
            # same 1-based id as written to the components table, so that
            # the subsets table can be cross-referenced against it
            subsets.append((component_id, m1[0], m2[0]))

        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (component_id, t, n1, n2, ",".join(m1), ",".join(m2)))

    finish(outfile)

    #######################################################
    # write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")

    nclusters1 = len(map_cluster2ids1)
    nclusters2 = len(map_cluster2ids2)
    for cc, c in counts.items():
        outfile.write("%i\t%i\t%i\t%s\t%s\n" %
                      (cc[0], cc[1], c,
                       percent(c, nclusters1),
                       percent(c, nclusters2)))

    finish(outfile)

    #######################################################
    # analyze subsets - how many of the 1:1 clusters
    # contain the exact members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    for component_id, id1, id2 in subsets:

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))
        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id,
                       len(members1), len(members2),
                       union, intersection, rest1, rest2))

    finish(outfile)

    # write subset statistics
    ntotal = len(subsets)
    options.stdout.write(
        "# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    for label, count in (("true", ntrue),
                         ("unique1", nrest1),
                         ("unique2", nrest2),
                         ("other", nother)):
        options.stdout.write("%s\t%i\t%s\n" %
                             (label, count, percent(count, ntotal)))

    E.stop()