Example #1
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--Infile",
                        dest="Infile",
                        type=str,
                        help="Supply file containing filtered 16S fasta file")

    parser.add_argument("--Outfile",
                        dest="Outfile",
                        type=str,
                        help="Supply desired outfile name")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    ###############################################
    ###############################################
    ############## Execute Functions ##############
    ###############################################
    ###############################################

    specformatter(args.Infile, args.Outfile)

    # write footer and output benchmark information.
    E.stop()
Example #2
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-f",
                        "--fasta",
                        dest="input_filename_fasta",
                        type=str,
                        help="filename with fasta sequences. ")

    parser.add_argument("-o",
                        "--output-filename-sequences",
                        dest="output_filename_sequences",
                        type=str,
                        help="output per sequence information to filename")

    parser.set_defaults(input_filename_fasta=None, )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    sequence_pairs = []

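    # use the faidx index if one exists; otherwise scan the fasta records once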
    if args.input_filename_fasta != "-" and os.path.exists(
            args.input_filename_fasta + ".fai"):
        has_index = 1
        fastafile = pysam.FastaFile(args.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(args.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append((record.name, len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    args.stdout.write("\t".join(("has_index", "nsequences", "total_length",
                                 "min_length", "max_length", "median_length",
                                 "mean_length")) + "\n")

    if len(lengths) > 0:
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), lengths.sum(),
                      lengths.min(), lengths.max(), numpy.median(lengths),
                      lengths.mean()))) + "\n")
    else:
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), 0, "", "", "", ""))) +
                          "\n")

    if args.output_filename_sequences:
        with iotools.open_file(args.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write(
                "\n".join(["\t".join(map(str, x))
                           for x in sequence_pairs]) + "\n")

    E.stop()
Example #3
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--regex-filename",
        dest="regex_filename",
        type=str,
        help="extract column name from filename via regular expression ")

    parser.add_argument("--filter",
                        dest="filters",
                        type=str,
                        action="append",
                        choices=("PASS", "SNP"),
                        help="apply filters to VCFs when reading ")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in unknown:
        if args.regex_filename:
            try:
                name = re.search(args.regex_filename, filename).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        args.regex_filename, filename))
        else:
            name = iotools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename, filters=args.filters)
        df[name] = 1
        dfs.append(df)

    ndata = len(dfs)
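    # outer-join positions across all files; absent positions become 0 after fillna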
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    merged_df = merged_df.fillna(0)
    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(args.stdout, sep="\t", index=False)
    E.stop()
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o", "--output-section", dest="output", type=str,
                        choices=("full", "name"),
                        help="output either ``full`` overlapping entries, or only the ``name``s.")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two arguments required")

    if unknown[0] == "-":
        infile1 = args.stdin
    else:
        infile1 = iotools.open_file(unknown[0], "r")

    infile2 = iotools.open_file(unknown[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = args.output
    outfile = args.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.stop()
Example #5
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 0:
        files = [args.stdin]
    else:
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        c = counters[-1]
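        # exhaust the iterator chain so that the counters are populated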
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        if infile != args.stdin:
            infile.close()

    E.stop()
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
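    # zip_longest pads the shorter file with None, exposing unpaired reads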
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2):
            raise PairedReadError(
                "unpaired reads detected. Are files sorted? Are "
                "files of equal length?")
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
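Example #6 relies on a PairedReadError exception that is not part of the snippet; a minimal sketch of the assumed class:

class PairedReadError(Exception):
    """raised when the two fastq files fall out of sync (assumed helper)."""
    pass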
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--test-option",
                        dest="test_option",
                        type=str,
                        help="test option")

    parser.set_defaults(test_option="test")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    files = glob.glob(os.path.join(os.path.dirname(__file__), "*.pyx"))

    # count processed, skipped and rebuilt scripts
    ninput, nskipped, noutput = 0, 0, 0

    for f in files:
        E.info("rebuilding %s" % f)
        ninput += 1
        prefix, suffix = os.path.splitext(f)
        for ext in (".c", ".pyxbldc"):
            try:
                os.remove(prefix + ext)
            except OSError:
                pass

        dirname, basename = os.path.split(prefix)
        assert basename.startswith("_")

        scriptname = os.path.join(dirname, basename[1:]) + ".py"
        if not os.path.exists(scriptname):
            E.warn("script %s does not exist - skipped" % scriptname)
            nskipped += 1
            continue

        E.info("compiling %s" % scriptname)
        os.system("%s %s --help > /dev/null" % (sys.executable, scriptname))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Example #8
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-o", "--outdir", dest="outdir", type=str,
                        help="supply output directory")
    parser.add_argument("-p", "--prefix", dest="prefix", type=str,
                        help="supply output file prefix")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    prefix = os.path.join(args.outdir, args.prefix)
    levels = ["domain", "kingdom", "phylum", "class",
              "order", "family", "genus", "species"]
    outfiles = dict(
        (level, open(prefix + "_%s.tsv" % level, "w")) for level in levels)

    for line in args.stdin:
        data = line[:-1].split("\t")
        taxon = data[0]
        counts = data[1:]
        taxonomy = taxon.split("|")

        # route each row to the outfile matching its last taxonomic level
        for tag, level in zip("dkpcofgs", levels):
            if tag + "__" in taxonomy[-1]:
                outfiles[level].write("\t".join([taxon] + counts) + "\n")
                break

    for outf in outfiles.values():
        outf.close()

    # write footer and output benchmark information.
    E.stop()
Example #9
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-d",
                        "--delimiter",
                        dest="delimiter",
                        type=str,
                        help="delimiter to separate columns ")

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=["row-describe", "column-describe"],
                        help="additional methods to apply ")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (args) = E.start(parser, argv=argv, add_output_options=True)

    if not args.methods:
        args.methods = ["summary"]

    table = pandas.read_csv(args.stdin, sep=args.delimiter)

    args.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in args.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                args.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""), info))) + "\n")
        elif method == "column-describe":
            df = table.describe().T.stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")
        elif method == "row-describe":
            df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = E.ArgumentParser()

    parser.add_argument("-p", "--arguments", type=str, dest="arguments",   
                        default="",
                        help="Pass options and arguments to the executable. Please surround options in \"\"")

    parser.add_argument("-o", "--output-dir", type=str, dest="output",
                        default=".",
                        help="Output for the fastq files.")

    parser.add_argument("-f", "--fastqc", dest="fastqc",
                        action="store_true",
                        help="After demultiplexing open the fastq files in FastQC.")

    parser.add_argument("-F", "--fastqc-options", type=str, dest="fastqc_options",
                        default="",
                        help="Options for FastQC. Please surround options in \"\"")

    parser.add_argument("-H", "--bcl2fastq-help", dest="bcl2fastq_help", 
                        action="store_true",
                        help="Print help for Illumina's bcl2fastq conversion software")

    (args) = E.start(parser)

    if subprocess.run("which bcl2fastq", shell=True).returncode:
        raise ValueError("bcl2fastq cannot be found")

    if args.bcl2fastq_help:
        subprocess.run("bcl2fastq --help", shell=True)
        return

    subprocess.run(f"bcl2fastq {args.arguments} -o {args.output}", shell=True)

    for infile in glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True):
        with gzip.GzipFile(f"{infile}", "r") as f:
            if sum(1 for char in f.read().decode('utf-8') if char == "\n") % 4 != 0:
                raise ValueError(f"{infile} is either corrupt or incomplete.")

    if args.fastqc:
        for infile in glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True):
            subprocess.run(f"fastqc {infile} {args.fastqc_options}", shell=True)
Example #11
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file")

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=["ont2pacbio"],
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (args, unknown) = E.start(parser,
                              argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    outf = args.stdout
    line_width = args.line_width
    well_no = 0
    for record in pysam.FastqFile(args.input_fastq_file):
        well_no += 1
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        outf.write(">{}/{}/{}_{} RQ=0.{}\n".format("test", well_no, 1,
                                                   len(seq) + 1, qv))
        for x in range(0, len(seq), line_width):
            outf.write(seq[x:x + line_width] + "\n")

    E.stop()
Example #12
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq-file",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        infile = fileinput.FileInput(unknown)

    # count input, skipped and output lines
    ninput, nskipped, noutput = 0, 0, 0

    header = False

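    # keep the first header line; skip later lines identical to it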
    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Example #14
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    infile = argv[-1]
    for record in makeSplicedFasta(infile):
        args.stdout.write(record)

    # write footer and output benchmark information.
    E.stop()
Example #15
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-k",
                        "--keep-header",
                        dest="keep_header",
                        type=int,
                        help="randomize, but keep header in place ")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    inf = args.stdin
    outf = args.stdout
    c = E.Counter()
    for x in range(args.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-s", "--sort-order", dest="sort", type=str,
        help="fields to take (in sorted order).")

    (args) = E.start(parser, argv=argv, add_csv_options=True)

    reader = csv.DictReader(args.stdin, dialect=args.csv_dialect)

    if args.sort:
        fields = args.sort.split(",")
    else:
        fields = reader.fieldnames

    writer = csv.DictWriter(args.stdout,
                            fields,
                            dialect=args.csv_dialect,
                            lineterminator=args.csv_lineterminator,
                            extrasaction='ignore')

    args.stdout.write("\t".join(fields) + "\n")

    for row in reader:
        row = iotools.convertDictionary(row)
        writer.writerow(row)

    E.stop()
Example #17
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-t",
                        "--test",
                        dest="test",
                        type=str,
                        help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    # write footer and output benchmark information.
    E.stop()
Example #18
def main():
    parser = E.ArgumentParser()
    (args, unknown) = E.start(parser, unknowns=True)
    bamfile = pysam.AlignmentFile(unknown[0])
    outbam = pysam.AlignmentFile(unknown[1], "wb", template=bamfile)

    chunks = 0

    for read1s, read2s in chunk_bam_by_readname(bamfile):
        chunks += 1
        if chunks % 1000000 == 0:
            E.info("Done %s fragments" % chunks)

        for contig in read1s:
            for read in read1s[contig]:
                read2 = find_read2(read2s, read)
                if read2 is not None:
                    outbam.write(read)
                    outbam.write(read2)

    outbam.close()
    E.stop()
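The helpers chunk_bam_by_readname and find_read2 are not shown in this listing. A minimal sketch of what Example #18 appears to assume (a name-sorted BAM, with mates matched via the mate-coordinate fields; the logic here is an assumption, not the original code):

import itertools

def chunk_bam_by_readname(bamfile):
    # group reads by query name (input must be name-sorted) and split each
    # group into read1/read2 dictionaries keyed by reference contig
    for qname, group in itertools.groupby(bamfile, key=lambda r: r.query_name):
        read1s, read2s = {}, {}
        for read in group:
            target = read1s if read.is_read1 else read2s
            target.setdefault(read.reference_name, []).append(read)
        yield read1s, read2s

def find_read2(read2s, read1):
    # return the read2 whose alignment start matches read1's mate
    # coordinates, or None if there is no such read
    for read2 in read2s.get(read1.next_reference_name, []):
        if read2.reference_start == read1.next_reference_start:
            return read2
    return None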
Example #19
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--task", dest="task", type=str,
                        choices=["extract_table", "get_coverage",
                                 "clean_table"],
                        help="task to perform")

    parser.add_argument("-t", "--table-name", dest="table", type=str,
                        help="table in SQLite DB to extract")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_database_options=True)

    if args.task == "extract_table":
        out_df = getTableFromDb(args.database_url, args.table)

    elif args.task == "get_coverage":
        out_df = getModelCoverage(args.database_url,
                                  table_regex=r"(\S+)_transcript_counts")

    elif args.task == "clean_table":
        infile = argv[-1]
        out_df = cleanStatsTable(infile)
    else:
        raise ValueError("please specify --task")

    out_df.to_csv(args.stdout,
                  sep="\t", index_label="track")

    # write footer and output benchmark information.
    E.stop()
Example #20
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--Mapping-file", dest="mappingfile", type=str,
                        help="Supply mapping file in tsv format "
                        "(filename<tab>taxonomy)")

    parser.add_argument("--Outfile", dest="outfile", type=str,
                        help="Desired outfile name")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    ###############################################
    ###############################################
    ############## Execute Functions ##############
    ###############################################
    ###############################################


    file2tax = FileMap(args.mappingfile)
    outfile = OutFile(args.outfile)
    for fastafile, taxonomy in file2tax.items():
        RenameFastaTitle(fastafile, file2tax, outfile)

    # write footer and output benchmark information.
    E.stop()
Example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-r",
                        "--remove",
                        dest="remove",
                        action="store_true",
                        help="remove specified columns, keep all others.")

    parser.add_argument("-u",
                        "--unique",
                        dest="unique",
                        action="store_true",
                        help="output rows are unique.")

    parser.add_argument(
        "-l",
        "--large",
        dest="large",
        action="store_true",
        help="large columns. Do not use native python csv module.")

    parser.add_argument("-f",
                        "--filename-fields",
                        dest="filename_fields",
                        type=str,
                        help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (args, unknown) = E.start(parser,
                              add_csv_options=True,
                              quiet=True,
                              unknowns=True)

    input_fields = unknown

    if args.filename_fields:
        with iotools.open_file(args.filename_fields, "r") as inf:
            input_fields = [
                line[:-1].split("\t")[0]
                for line in inf if not line.startswith("#")]

    if args.unique:
        outfile = UniqueBuffer(args.stdout)
    else:
        outfile = args.stdout

    while True:
        line = args.stdin.readline()

        if not line:
            E.stop()
            sys.exit(0)

        if line[0] == "#":
            continue

        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
    for f in input_fields:
        # do pattern search
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)

    if args.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if args.large:
        reader = DictReaderLarge(CommentStripper(args.stdin),
                                 fieldnames=old_fields,
                                 dialect=args.csv_dialect)
    else:
        reader = csv.DictReader(CommentStripper(args.stdin),
                                fieldnames=old_fields,
                                dialect=args.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=args.csv_dialect,
                            lineterminator=args.csv_lineterminator,
                            extrasaction='ignore')

    outfile.write("\t".join(fields) + "\n")

    first_row = True
    ninput, noutput, nerrors = 0, 0, 0

    while True:
        ninput += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            args.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break
        if not row:
            break
        writer.writerow(row)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
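UniqueBuffer, CommentStripper and DictReaderLarge are defined elsewhere in the original script. Minimal sketches of the first two, matching how the code above uses them (an assumption, not the original implementation):

class UniqueBuffer:
    # file-like wrapper that writes each distinct line only once
    def __init__(self, outfile):
        self.outfile = outfile
        self.seen = set()

    def write(self, line):
        if line not in self.seen:
            self.seen.add(line)
            self.outfile.write(line)

def CommentStripper(infile):
    # iterate over infile, skipping comment lines starting with '#'
    for line in infile:
        if not line.startswith("#"):
            yield line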
Example #22
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="filename with reference sequence in fasta format ")

    parser.add_argument(
        "--input-filename-bam", dest="input_filename_bam", type=str,
        help="filename with aligned reads ")

    parser.add_argument(
        "--method", dest="methods", type=str, action="append",
        choices=["add-strelka-genotype",
                 "lift-over"],
        help="methods to apply ")

    parser.add_argument(
        "--input-filename-chain", dest="input_filename_chain", type=str,
        help="filename with alignment chain for lift-over ")

    parser.add_argument(
        "--normal-sample-regex", dest="normal_sample_regex", type=str,
        help="regular expression to apply to header to identify normal "
        "sample id ")

    parser.add_argument(
        "--output-filename-unmapped", dest="output_filename_unmapped", type=str,
        help="filename with variants that could not be lifted over ")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) > 0:
        args.input_filename_vcf = unknown[0]

    vcf_in = pysam.VariantFile(args.input_filename_vcf)

    if "lift-over" in args.methods:
        if args.input_filename_chain is None:
            raise ValueError("--method=lift-over requires --input-filename-chain")
        if not os.path.exists(args.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                args.input_filename_chain))
        E.info("reading chain from {}".format(args.input_filename_chain))
        with iotools.open_file(args.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if args.input_filename_fasta:
        fasta = pysam.FastaFile(args.input_filename_fasta)
    else:
        fasta = None

    if args.input_filename_bam:
        bam = pysam.AlignmentFile(args.input_filename_bam)
    else:
        bam = None

    outf = args.stdout

    c = E.Counter()

    if "add-strelka-genotype" in args.methods:
        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by cgatcore vcf2vcf.">')

        header = "\n".join(header)
        if args.normal_sample_regex:
            normal_sample = re.search(
                r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {
                    record.alts[0]: "1",
                    record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in args.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length) for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                args.input_filename_chain,
                args.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} the chain "
                            "file says {}, but the fasta file says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain and fasta files correspond.")

        if args.output_filename_unmapped:
            outfile_unmapped = iotools.open_file(args.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_multimapping_position\t{}".format(str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

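            # map the position into target coordinates; inverted blocks count from the block end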
            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    ref_base = None
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write("{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

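            # the lifted reference matches the old ALT: swap alleles and flip genotypes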
            if swap_alleles:
                fields[3] = alt_base
                fields[4] = ref_base
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join((gt.translate(trans_genotypes), rest))

                # remove reference only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write("reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(",".join(sorted(unmapped_contigs))))

    E.stop()
Example #23
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s",
                        "--session",
                        dest="session",
                        type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d",
                        "--snapshot-dir",
                        dest="snapshotdir",
                        type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f",
                        "--format",
                        dest="format",
                        type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o",
                        "--host",
                        dest="host",
                        type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p",
                        "--port",
                        dest="port",
                        type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e",
                        "--extend",
                        dest="extend",
                        type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x",
                        "--expand",
                        dest="expand",
                        type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only",
                        dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("-n",
                        "--name",
                        dest="name",
                        type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.add_argument("--new-instance",
                        dest="new_instance",
                        action="store_true",
                        help="start a new IGV instance ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connecting to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" % os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin, parser=pysam.asBed()):

            c.input += 1

            # IGV can not deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
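            # grow the interval symmetrically so its length scales by --expand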
            if args.expand:
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
Example #24
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-t",
                        "--template-bam-file",
                        dest="filename_genome_bam",
                        type=str,
                        help="input bam file for header information ")

    parser.add_argument("-s",
                        "--contigs-tsv-file",
                        dest="filename_contigs",
                        type=str,
                        help="filename with contig sizes ")

    parser.add_argument(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) ")

    parser.add_argument("-i",
                        "--ignore-mismatches",
                        dest="ignore_mismatches",
                        action="store_true",
                        help="ignore mismatches ")

    parser.add_argument("-c",
                        "--remove-contigs",
                        dest="remove_contigs",
                        type=str,
                        help="','-separated list of contigs to remove ")

    parser.add_argument("-f",
                        "--force-output",
                        dest="force",
                        action="store_true",
                        help="force overwriting of existing files ")

    parser.add_argument("-u",
                        "--unique",
                        dest="unique",
                        action="store_true",
                        help="remove reads not matching uniquely ")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if args.filename_genome_bam:
        genomefile = pysam.AlignmentFile(args.filename_genome_bam, "rb")
    elif args.filename_contigs:
        contigs = iotools.ReadMap(iotools.open_file(args.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-",
                                  "wb",
                                  template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if args.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

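    # group alignments by read name; input is assumed to be name-sorted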
    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = list(
            map(int, (first_exon_start, first_exon_end, last_exon_start,
                      last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.stop()
Example #25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        action="append", help="features to collect ")

    parser.add_argument("-w", "--window-size", dest="window_size", type=int,
                        help="window size in bp for histogram computation. "
                        "Determines the bin size.  ")

    parser.add_argument("-b", "--num-bins", dest="num_bins", type=int,
                        help="number of bins for histogram computation "
                        "if window size is not given. ")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=("genomic", "histogram", ),
                        help="methods to apply. ")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (args) = E.start(parser, argv=argv, add_output_options=True)

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.method == "histogram":

        gff = GTF.readFromFile(args.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk(last_contig, chunk, args, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        processChunk(last_contig, chunk, args, fasta)

    elif args.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(args.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source, entry.feature)
                  ] += entry.end - entry.start
            total += entry.end - entry.start

        args.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            args.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            args.stdout.write("\n")

        if fasta:
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            args.stdout.write("\t".join(("\t".join(key),
                                         str(nintervals),
                                         str(nbases))))
            if fasta:
                args.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                args.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                args.stdout.write("\n")

    E.stop()
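The "genomic" branch above is just a grouped sum keyed on (contig, source, feature). A self-contained sketch of that aggregation, with a namedtuple standing in for real GTF entries (the sample records below are made up):

import collections

Entry = collections.namedtuple("Entry", "contig source feature start end")

entries = [
    Entry("chr1", "ensembl", "exon", 100, 200),
    Entry("chr1", "ensembl", "exon", 300, 450),
    Entry("chr2", "ensembl", "CDS", 0, 90),
]

intervals = collections.defaultdict(int)
bases = collections.defaultdict(int)

for e in entries:
    key = (e.contig, e.source, e.feature)
    intervals[key] += 1
    bases[key] += e.end - e.start  # interval length in bp

for key in sorted(intervals):
    print("\t".join(map(str, key + (intervals[key], bases[key]))))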
Example no. 26
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        action="store",
                        choices=("hierarchy", "set-field", "set-pattern",
                                 "set-none"),
                        help="Method to use for conversion")

    parser.add_argument("-g",
                        "--gene-type",
                        dest="gene_type",
                        type=str,
                        help="feature type to get gene_id from if possible ")

    parser.add_argument(
        "-t",
        "--transcript-type",
        dest="transcript_type",
        type=str,
        help="feature type to get transcript_id from if possible ")

    parser.add_argument(
        "-d",
        "--no-discard",
        dest="discard",
        action="store_false",
        help="Do not discard feature types specified by GENE_TYPE "
        "and TRANSCRIPT_TYPE")

    parser.add_argument("--gene-id",
                        dest="gene_field_or_pattern",
                        type=str,
                        help="Either field or pattern for the gene_id ")

    parser.add_argument("--transcript-id",
                        dest="transcript_field_or_pattern",
                        type=str,
                        help="Either field or pattern for the transcript_id ")

    parser.add_argument(
        "--parent-field",
        dest="parent",
        type=str,
        help="field that specifies the parent relationship. Currently only"
        "if left as Parent will features with multiple parents be parsed"
        "correctly"
        "")

    parser.add_argument(
        "--read-twice",
        dest="read_twice",
        action="store_true",
        help="Instead of holding the whole file in memory, read it once to "
        "parse the hierarchy and then again to actually do the conversion. "
        "This requires a real file rather than a pipe.")

    parser.add_argument(
        "--by-chrom",
        dest="by_chrom",
        action="store_true",
        help="Parse input file one choromosome at a time. Reduces memory usage, "
        "but input must be sorted by chromosome and features may not split accross "
        " multiple chromosomes"
        "")

    parser.add_argument(
        "--fail-missing-gene",
        dest="missing_gene",
        action="store_false",
        help="Fail if no feature of type GENE_TYPE is found instead of using "
        "defaulting to highest object in hierarchy"
        "")

    parser.set_defaults(method="hierarchy",
                        gene_type="gene",
                        transcript_type="mRNA",
                        discard=True,
                        gene_field_or_pattern="ID",
                        transcript_field_or_pattern="ID",
                        read_twice=False,
                        by_chrom=False,
                        missing_gene=True,
                        parent="Parent")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    gffs = GFF3.flat_file_iterator(args.stdin)

    if args.by_chrom:
        gffs = GFF3.chrom_iterator(gffs)
    else:
        gffs = [gffs]

    # open the second stream early so that a misconfiguration fails fast
    if args.read_twice:
        # Will throw IOError if args.stdin is not a normal file
        second_gff = GFF3.flat_file_iterator(iotools.open_file(
            args.stdin.name))

        if args.by_chrom:
            second_gff = GFF3.chrom_iterator(second_gff)
        else:
            second_gff = iter([second_gff])
    else:
        second_gff = None

    for chunk in gffs:

        if args.read_twice:
            second_gff_chunk = next(second_gff)
        else:
            chunk = list(chunk)
            second_gff_chunk = chunk

        if args.method == "hierarchy":

            convert_hierarchy(chunk, second_gff_chunk, args)
        elif args.method == "set-field":
            gene_id_pattern = "%%(%s)s" % args.gene_field_or_pattern
            transcript_id_pattern = "%%(%s)s" % args.transcript_field_or_pattern
            convert_set(chunk, gene_id_pattern, transcript_id_pattern, args)
        elif args.method == "set-pattern":
            convert_set(chunk, args.gene_field_or_pattern,
                        args.transcript_field_or_pattern, args)
        elif args.method == "set-none":
            convert_set(chunk, None, None, args)

    # write footer and output benchmark information.
    E.stop()
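The set-field branch relies on a small formatting trick: "%%(%s)s" % field builds a printf-style template such as "%(ID)s", which can later be filled in from an attribute dictionary. A quick illustration (the attribute values are made up):

field = "ID"
pattern = "%%(%s)s" % field        # yields "%(ID)s"

attributes = {"ID": "gene0001", "Name": "my_gene"}
print(pattern % attributes)        # prints gene0001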
Example no. 27
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-q",
                        "--quality-file",
                        dest="quality_file",
                        type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_files",
                        type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i",
                        "--bigwig-file",
                        dest="bigwig_file",
                        type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f",
                        "--gff-file",
                        dest="filename_gff",
                        type=str,
                        action="append",
                        metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format",
                        dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source",
                        dest="gff_sources",
                        type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature",
                        dest="gff_features",
                        type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r",
                        "--reporter",
                        dest="reporter",
                        type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c",
        "--counter",
        dest="counters",
        type=str,
        action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source",
                        dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance",
                        dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method",
                        dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes",
                        dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability",
                        dest="sample_probability",
                        type=float,
                        help="Specify the probability of whether any"
                        "given read or read pair in a file bam is counted"
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix",
                        dest="prefixes",
                        type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type",
                        dest="library_type",
                        type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality",
                        type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    args = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)
    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(section=section,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(section=section,
                                                     options=args,
                                                     prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if fasta is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality, prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(fasta=fasta,
                                                            section=section,
                                                            options=args,
                                                            prefix=prefix))

        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            if c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=args.filename_gff,
                                             fasta=fasta,
                                             options=args,
                                             prefix=prefix))

        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(header + [x.getHeader()
                                          for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) + ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
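The long if/elif ladder above that maps each --counter choice to a class could also be written as a dispatch table, which keeps the mapping declarative and easier to extend. A self-contained sketch of that alternative (CounterA and CounterB are stand-ins, not classes from GeneModelAnalysis):

class CounterA:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

class CounterB(CounterA):
    pass

# one entry per counter choice, replacing the elif chain
DISPATCH = {"counter-a": CounterA, "counter-b": CounterB}

counters = []
for choice in ("counter-a", "counter-b"):
    template = DISPATCH[choice]
    counters.append(template(section="exons", prefix=None))

print([type(x).__name__ for x in counters])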
Example no. 28
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-t",
                        "--no-titles",
                        dest="input_has_titles",
                        action="store_false",
                        help="no titles in input .")

    parser.add_argument("--ignore-titles",
                        dest="ignore_titles",
                        action="store_true",
                        help="ignore titles in input ")

    parser.add_argument("-i",
                        "--skip-titles",
                        dest="skip_titles",
                        action="store_true",
                        help="skip output of titles.")

    parser.add_argument("-m",
                        "--missing-value",
                        dest="missing_value",
                        type=str,
                        help="entry to use for missing values.")

    parser.add_argument("--header-names",
                        dest="headers",
                        type=str,
                        help="add headers for files as a ,-separated "
                        "list .")

    parser.add_argument("-c",
                        "--columns",
                        dest="columns",
                        type=str,
                        help="columns to use for joining. Multiple columns "
                        "can be specified as a comma-separated list ")

    parser.add_argument("-k",
                        "--take",
                        dest="take",
                        type=str,
                        action="append",
                        help="columns to take. If not set, all columns "
                        "except for "
                        "the join columns are taken ")

    parser.add_argument("-g",
                        "--glob",
                        dest="glob",
                        type=str,
                        help="wildcard expression for table names.")

    parser.add_argument(
        "-s",
        "--sort-order",
        dest="sort",
        type=str,
        help="sort by column titles in particular given order: "
        "alphabetical|numeric|list of columns.")

    parser.add_argument("-e",
                        "--merge-overlapping",
                        dest="merge",
                        action="store_true",
                        help="simply merge tables without matching up "
                        "rows.")

    parser.add_argument("-a",
                        "--cat",
                        dest="cat",
                        type=str,
                        help="simply concatenate tables. Adds an "
                        "additional column called X with the filename ")

    parser.add_argument("--sort-keys",
                        dest="sort_keys",
                        type=str,
                        choices=("numeric", "alphabetic"),
                        help="sort key columns by value.")

    parser.add_argument("--keep-empty",
                        dest="ignore_empty",
                        action="store_false",
                        help="keep empty tables. The default is "
                        "to ignore them.")

    parser.add_argument("--ignore-empty",
                        dest="ignore_empty",
                        action="store_true",
                        help="ignore empty tables - this is "
                        "the default .")

    parser.add_argument("--add-file-prefix",
                        dest="add_file_prefix",
                        action="store_true",
                        help="add file prefix to "
                        "columns headers. Suitable for multi-column"
                        "tables")

    parser.add_argument("--use-file-prefix",
                        dest="use_file_prefix",
                        action="store_true",
                        help="use file prefix as column headers. "
                        "Suitable for two-column tables ")

    parser.add_argument("--prefixes",
                        dest="prefixes",
                        type=str,
                        help="list of prefixes to use. "
                        ", separated list of prefixes. "
                        "The number of prefixes need to correspond to the "
                        "number of input files")

    parser.add_argument("--regex-filename",
                        dest="regex_filename",
                        type=str,
                        help="pattern to apply to filename to "
                        "build prefix")

    parser.add_argument("--regex-start",
                        dest="regex_start",
                        type=str,
                        help="regular expression to start "
                        "collecting table in a file")

    parser.add_argument("--regex-end",
                        dest="regex_end",
                        type=str,
                        help="regular expression to end collecting "
                        "table in a file")

    parser.add_argument(
        "--sep",
        dest="separator",
        type=str,
        help="table separator to use. The default is to use tabs. ")

    parser.add_argument("--test",
                        dest="test",
                        type=int,
                        help="test combining tables with "
                        "first X rows")

    parser.set_defaults(
        input_has_titles=True,
        skip_titles=False,
        missing_value=None,
        headers=None,
        sort=None,
        glob=None,
        columns="1",
        sort_keys=False,
        merge=False,
        ignore_empty=True,
        regex_start=None,
        regex_end=None,
        add_file_prefix=False,
        use_file_prefix=False,
        cat=None,
        take=[],
        regex_filename="(.*)",
        prefixes=None,
        test=0,
        separator="\t",
    )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if args.headers:
        if "," in args.headers:
            args.headers = args.headers.split(",")
        else:
            args.headers = re.split(r"\s+", args.headers.strip())

    if args.sort and args.sort not in ("numeric", "alphabetic"):
        if "," in args.sort:
            args.sort = args.sort.split(",")
        else:
            args.sort = re.split(r"\s+", args.sort)

    if args.merge:
        args.columns = []
    else:
        args.columns = [int(x) - 1 for x in args.columns.split(",")]

    args.filenames = []

    if args.glob:
        args.filenames += glob.glob(args.glob)

    args.filenames += unknown

    if len(args.filenames) < 1:
        raise ValueError("no tables found.")

    E.info("combining %i tables" % len(args.filenames))

    if args.cat:
        table = concatenate_tables(args.filenames,
                                   regex_filename=args.regex_filename,
                                   separator=args.separator,
                                   headers=args.headers,
                                   missing_value=args.missing_value,
                                   cat=args.cat)
    else:
        # only the --cat path is shown in this excerpt; fail explicitly
        # rather than with a NameError on the undefined `table` below
        raise NotImplementedError(
            "this excerpt only implements --cat (concatenation)")

    table.to_csv(args.stdout, sep=args.separator, index=False)
    E.stop()
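Since the combined table ends in DataFrame.to_csv, concatenate_tables is presumably pandas-based. A minimal sketch of what concatenation with a filename-derived key column might look like (concat_with_key is a hypothetical stand-in, not the actual implementation):

import re
import pandas

def concat_with_key(filenames, regex_filename=r"(.*)", cat="track", sep="\t"):
    # hypothetical helper: read each table, tag its rows with a key
    # extracted from the filename, then stack all tables
    frames = []
    for filename in filenames:
        df = pandas.read_csv(filename, sep=sep)
        match = re.search(regex_filename, filename)
        df[cat] = match.group(1) if match else filename
        frames.append(df)
    return pandas.concat(frames, ignore_index=True)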
Example no. 29
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    # do sth
    if len(unknown) == 1:
        fastqfile1 = unknown[0]
        fastqfile2 = args.output_filename_pattern % "2"
    elif len(unknown) == 2:
        fastqfile1, fastqfile2 = unknown
    else:
        fastqfile1 = args.output_filename_pattern % "1"
        fastqfile2 = args.output_filename_pattern % "2"

    # only output compressed data
    if not fastqfile1.endswith(".gz"):
        fastqfile1 += ".gz"
    if not fastqfile2.endswith(".gz"):
        fastqfile2 += ".gz"

    if args.stdin != sys.stdin:
        samfile = pysam.AlignmentFile(args.stdin.name, "rb")
    else:
        samfile = pysam.AlignmentFile("-", "rb")

    tmpdir = tempfile.mkdtemp()

    outtemp1 = os.path.join(tmpdir, "pair1.gz")
    outtemp2 = os.path.join(tmpdir, "pair2.gz")

    outstream1 = iotools.open_file(outtemp1, "w")
    outstream2 = iotools.open_file(outtemp2, "w")

    E.info('writing fastq files to temporary directory %s' % tmpdir)

    found1, found2 = set(), set()
    read1_qlen, read2_qlen = 0, 0

    c = E.Counter()
    for read in samfile.fetch(until_eof=True):
        c.input += 1
        if not read.is_paired:
            outstream1.write("\t".join((read.qname, read.seq, read.qual)) +
                             "\n")
            found1.add(read.qname)
            if not read1_qlen:
                read1_qlen = read.qlen
            c.unpaired += 1
        elif read.is_read1:
            outstream1.write("\t".join((read.qname, read.seq, read.qual)) +
                             "\n")
            found1.add(read.qname)
            if not read1_qlen:
                read1_qlen = read.qlen
            c.output1 += 1
        elif read.is_read2:
            if read.qname not in found2:
                outstream2.write("\t".join((read.qname, read.seq, read.qual)) +
                                 "\n")
                found2.add(read.qname)
                if not read2_qlen:
                    read2_qlen = read.qlen
                c.output2 += 1

    if c.unpaired == 0 and c.output1 == 0 and c.output2 == 0:
        outstream1.close()
        outstream2.close()
        shutil.rmtree(tmpdir)
        E.warn("no reads were found")
        return

    sort_statement = '''gunzip < %s
    | sort -k1,1
    | awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}'
    | gzip > %s'''

    if c.output1 == 0 and c.output2 == 0:
        # single end data:
        outstream1.close()
        outstream2.close()
        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))

    else:
        # paired end data
        for qname in found2.difference(found1):
            outstream1.write("\t".join((qname, "N" * read1_qlen,
                                        "B" * read1_qlen)) + "\n")
            c.extra1 += 1

        for qname in found1.difference(found2):
            outstream2.write("\t".join((qname, "N" * read2_qlen,
                                        "B" * read2_qlen)) + "\n")
            c.extra2 += 1

        E.info("%s" % str(c))

        outstream1.close()
        outstream2.close()

        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))
        E.run(sort_statement % (outtemp2, fastqfile2))

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.stop()
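The sort_statement pipeline sorts the three-column (name, sequence, quality) intermediate by read name and reinflates each line into a four-line FASTQ record. The same reshaping in pure Python, for comparison (the records are made up):

records = [
    ("read2", "ACGT", "IIII"),
    ("read1", "TTGA", "IIII"),
]

# sort by read name, as `sort -k1,1` does in the shell pipeline
for name, seq, qual in sorted(records):
    print("@%s\n%s\n+\n%s" % (name, seq, qual))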
Example no. 30
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-u",
                        "--unique",
                        dest="unique",
                        action="store_true",
                        help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (args, unknown) = E.start(parser, argv=argv, add_csv_options=True,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError("please specify two files to join")

    args.filename1, args.filename2 = unknown

    table1 = readTable(iotools.open_file(args.filename1, "r"))
    table2 = readTable(iotools.open_file(args.filename2, "r"))

    if args.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = args.stdout

    # build the new field list: the join columns first, then the
    # remaining columns of each input table. args.join_fields1 and
    # args.join_fields2 are assumed to be set by options omitted from
    # this excerpt, and readTable is assumed to return a (fields, rows)
    # pair with rows as dictionaries keyed by field name.
    fields1, rows1 = table1
    fields2, rows2 = table2

    new_fields = list(args.join_fields1)

    for x in fields1:
        if x not in args.join_fields1:
            new_fields.append(x)

    for x in fields2:
        if x not in args.join_fields2:
            new_fields.append(x)

    writer = csv.DictWriter(outfile,
                            new_fields,
                            dialect=args.csv_dialect,
                            lineterminator=args.csv_lineterminator,
                            extrasaction='ignore')
    writer.writeheader()

    # index the second table by its join columns, then emit one merged
    # row per matching row of the first table
    index2 = {}
    for row in rows2:
        key = tuple(row[x] for x in args.join_fields2)
        index2[key] = row

    for row in rows1:
        key = tuple(row[x] for x in args.join_fields1)
        if key not in index2:
            continue
        merged = dict(index2[key])
        merged.update(iotools.convertDictionary(row))
        writer.writerow(merged)

    E.stop()
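UniqueBuffer is not defined in this excerpt. A plausible minimal version, assuming it only needs a write() method that suppresses duplicate rows (a guess at the missing helper, not its actual source):

class UniqueBuffer:
    """File-like wrapper that writes each distinct line only once."""

    def __init__(self, outfile):
        self.outfile = outfile
        self.seen = set()

    def write(self, line):
        if line not in self.seen:
            self.seen.add(line)
            self.outfile.write(line)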