Пример #1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file)  [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_referenec=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separatar
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(allele.exon_starts[1:],
                                                                allele.cds_starts[
                                                                    1:],
                                                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig - allele.reference_first_stop_start, )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Пример #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true",
                      help="add percentage information to each line.")

    parser.add_option("-t", "--header-names", dest="headers", type="string",
                      help="comma separated list of headers. If empty or set to '-', filenames are used.")

    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header", dest="write_header", action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title", dest="with_title", action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title", dest="with_title", action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

        if options.write_header:
            sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(
               IOTools.open_file(f, "r"), with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers and titles:
        headers = titles
    else:
        headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)
            options.stdout.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" % (headers[x], headers[y],
                                                                     l1, l2,
                                                                     len(set1.union(
                                                                         set2)),
                                                                     len(set1.intersection(
                                                                         set2)),
                                                                     len(set1.difference(
                                                                         set2)),
                                                                     len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (
                        i / float(len(set1.union(set2))),
                        len(set1.difference(set2)) / float(l1),
                        len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + ("\t".join([options.percent_format for z in range(6)])) % (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.stop()
Пример #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "reverse-complement", "grep"),
                      help="method to apply [%default]")

    parser.add_option("--target-format",
                      dest="target_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file",
        dest="map_tsv_file",
        type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases",
                      dest="nbases",
                      type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier",
        dest="renumber_pattern",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern",
        dest="grep_pattern",
        type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(method=None,
                        change_format=None,
                        guess_format=None,
                        sample_size=0.1,
                        nbases=0,
                        pair=None,
                        apply=None,
                        seed=None,
                        renumber_pattern="read_%010i",
                        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method is None:
        raise ValueError("no method specified, please use --method")

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.open_file(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()