def main(argv=None):
    """Script entry point: reformat every gene summary below a directory.

    Parses command line options in sys.argv, unless *argv* is given.
    Files matching ``<directory>/*/*genes*summary*`` are collected, a
    tab-separated header line is written to stdout and each file is
    handed to ``reformat``.

    Raises:
        ValueError: if ``--directory`` was not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--directory",
        dest="directory",
        type="string",
        help="supply directory where the input summaries are located")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # Fail with a clear message instead of the TypeError that
    # os.path.join(None, ...) would raise when the option is omitted.
    if options.directory is None:
        raise ValueError("please supply a directory with --directory")

    infiles = glob.glob(os.path.join(options.directory, "*/*genes*summary*"))
    sys.stdout.write("category\tnreads\tpreads\tsample\n")
    for infile in infiles:
        reformat(infile)

    # write footer and output benchmark information.
    E.stop()
示例#2
0
def main(argv=None):
    """Parse the command line and purge one run from the database.

    The run id, database url and dry-run flag are read from the command
    line (or from *argv*) and forwarded unchanged to ``purge_run_id``.
    """
    parser = E.OptionParser(usage=globals()["__doc__"],
                            version="%prog version: $Id$")

    parser.add_option(
        "-r", "--run-id",
        dest="run_id", type="int",
        help="numerical identifier of a run [%default]")

    parser.add_option(
        "-d", "--database-url",
        dest="database_url", type="string",
        help="database url [%default]")

    parser.add_option(
        "-n", "--dry-run",
        dest="dry_run", action="store_true",
        help="only show statements to be executed [%default]")

    parser.set_defaults(run_id=None,
                        database_url="sqlite:///./csvdb",
                        dry_run=False)

    options, _unused_args = E.start(parser,
                                    argv=argv,
                                    add_output_options=True)

    purge_run_id(options.run_id,
                 options.database_url,
                 dry_run=options.dry_run)

    E.stop()
def main(argv):
    """Tabulate per-character counts of an input file against a reference.

    The first positional argument names the input file, the second the
    reference file. Both are read completely and stripped of surrounding
    whitespace; one row per character seen in either file is written to
    ``options.stdout`` with the counts from both.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("-o", "--option", dest="option", type="string")

    (options, args) = E.start(parser, argv)

    with IOTools.open_file(args[0]) as handle:
        data_counts = Counter(handle.read().strip())
    with IOTools.open_file(args[1]) as handle:
        ref_counts = Counter(handle.read().strip())

    # union of all characters observed in either file
    all_chars = set(data_counts) | set(ref_counts)

    out = options.stdout
    out.write("key\tinput\treference\n")
    for char in sorted(all_chars):
        # a Counter returns 0 for characters missing from one side
        row = (char, str(data_counts[char]), str(ref_counts[char]))
        out.write("\t".join(row) + "\n")

    E.stop()
示例#4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Everything after the script's own options is treated as a command:
    the first positional argument is passed to the shell verbatim, the
    remaining ones are appended as individually quoted words. The command
    runs in the current working directory and its return code becomes
    the exit status of this script (0 when no command is given).
    """
    import shlex

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # stop parsing options at the first argument
    parser.disable_interspersed_args()

    # forward argv; it used to be computed above but never passed on,
    # so an explicit *argv* was silently ignored.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    returncode = 0
    if args:
        # shlex.quote escapes embedded single quotes, which the previous
        # manual "'...'" wrapping did not.
        cmd = " ".join([args[0]] + [shlex.quote(arg) for arg in args[1:]])

        s = subprocess.Popen(cmd, shell=True, cwd=os.getcwd(), close_fds=True)

        # no pipes are attached, so communicate() only waits for completion
        s.communicate()
        returncode = s.returncode

    E.Stop()

    sys.exit(returncode)
示例#5
0
def main(argv=None):
    """Count how variant positions are shared between several VCF files.

    Each positional argument is a VCF file. It is loaded through
    ``read_vcf_positions_into_dataframe`` and tagged with a column named
    after the file (or after the first group of ``--regex-filename``).
    The frames are outer-merged, the ``chrom``/``pos`` columns dropped and
    the number of rows per presence/absence combination is written as a
    tab-separated table to ``options.stdout``.

    Raises:
        ValueError: if fewer than two files are given or the regular
            expression does not match a filename.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename", dest="regex_filename", type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option(
        "--filter", dest="filters", type="choice", action="append",
        choices=("PASS", "SNP"),
        help="apply filters to VCFs when reading "
        "[%default]")

    parser.set_defaults(regex_filename=None, filters=[])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    frames = []
    for vcf_path in args:
        if not options.regex_filename:
            column = iotools.snip(os.path.basename(vcf_path), ".vcf.gz")
        else:
            try:
                # re.search returns None on a miss, hence AttributeError
                column = re.search(options.regex_filename,
                                   vcf_path).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, vcf_path))

        E.debug("reading data from {}".format(vcf_path))
        frame = read_vcf_positions_into_dataframe(vcf_path,
                                                  filters=options.filters)
        # mark presence of every position in this file
        frame[column] = 1
        frames.append(frame)

    combined = frames[0]
    for frame in frames[1:]:
        combined = combined.merge(frame, how="outer")

    # positions missing from a file become 0 in that file's column
    presence = combined.fillna(0).drop(["chrom", "pos"], axis=1)
    counts = presence.groupby(by=list(presence.columns)).size().reset_index()
    counts.columns = list(counts.columns[:-1]) + ["counts"]

    counts.to_csv(options.stdout, sep="\t", index=False)
    E.stop()
示例#6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Writes the path of every file below the current working directory
    into a new file named ``CWD_<timestamp>``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # The tree is walked completely before the report file is opened.
    paths = []
    for root, _subdirs, names in os.walk("."):
        paths.extend(os.path.join(root, name) for name in names)

    now = datetime.datetime.fromtimestamp(time.time())
    stamp = now.strftime('%Y-%m-%d_%H:%M:%S')
    report_name = "CWD_%s" % stamp
    E.info("outputting directory state to %s" % report_name)

    with iotools.openFile(report_name, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % stamp)
        for path in paths:
            outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()
示例#7
0
def main(argv=None):
    """Write a VCF with randomly sampled, non-N positions of a fasta file.

    The fasta file is the first positional argument. For each contig a
    number of positions is drawn at random: a sample size below 1 is a
    proportion of the contig length, a size of 1 or more is an absolute
    count. Positions whose base is ``N`` are never emitted. Each sampled
    position is written with identical REF and ALT base and genotype 0/0.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # The help text used to say "less than 0", contradicting the
    # `< 1.0` test applied below.
    parser.add_option(
        "-s", "--sample-size", dest="sample_size", type="float",
        help="sample size. If less than 1, take a proportion of the chromosome size. "
        "If 1 or greater, take a fixed number of variants [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        sample_size=0.001,
        sample_name="NA12878"
    )

    (options, args) = E.start(parser,
                              argv=argv,
                              add_output_options=True)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    if options.input_filename_fasta == "-":
        options.input_filename_fasta = options.stdin

    outf = options.stdout
    outf.write("##fileformat=VCFv4.1\n")
    outf.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(options.sample_name))

    with pysam.FastxFile(options.input_filename_fasta) as inf:
        for record in inf:
            contig = record.name
            sequence = record.sequence
            if options.sample_size < 1.0:
                nsamples = int(float(len(sequence)) * options.sample_size)
            else:
                nsamples = int(options.sample_size)

            # Only non-N positions are accepted below; asking for more
            # than exist would make the resampling loop run forever (or
            # make random.sample raise), so cap the request.
            neligible = len(sequence) - sequence.count("N")
            if nsamples > neligible:
                E.warn("contig {}: requested {} variants, but only {} "
                       "non-N positions available".format(
                           contig, nsamples, neligible))
                nsamples = neligible

            E.info("generating {} sampled variants for contig {}".format(nsamples, contig))

            positions = range(len(sequence))
            sampled_positions = set()
            # redraw until enough distinct non-N positions are collected
            while len(sampled_positions) < nsamples:
                raw_positions = random.sample(positions, nsamples - len(sampled_positions))
                filtered_positions = [x for x in raw_positions if sequence[x] != "N"]
                sampled_positions.update(filtered_positions)
                E.debug("sample update: total={}, raw={}, filtered={}".format(
                        len(sampled_positions),
                        len(raw_positions),
                        len(filtered_positions)))

            for position in sorted(sampled_positions):
                base = sequence[position]
                # VCF coordinates are 1-based
                outf.write("{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t0/0\n".format(
                        contig, position + 1, base, base))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every ``*.pyx`` file next to this script the derived ``.c`` and
    ``.pyxbldc`` files are removed and the companion script (the same
    name without the leading underscore, with ``.py``) is run with
    ``--help``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--test-option", dest="test_option", type="string",
        help="test option [default=%default].")

    parser.set_defaults(test_option="test")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    here = os.path.dirname(__file__)
    ninput = nskipped = noutput = 0

    for pyx in glob.glob(os.path.join(here, "*.pyx")):
        E.info("rebuilding %s" % pyx)
        ninput += 1

        stem = os.path.splitext(pyx)[0]
        for stale in (stem + ".c", stem + ".pyxbldc"):
            # best effort: a derived file that cannot be removed is ignored
            try:
                os.remove(stale)
            except OSError:
                pass

        dirname, basename = os.path.split(stem)
        assert basename.startswith("_")

        scriptname = os.path.join(dirname, basename[1:]) + ".py"
        if os.path.exists(scriptname):
            E.info("compiling %s" % scriptname)
            # presumably running the script triggers the rebuild of the
            # extension - TODO confirm
            os.system("%s %s --help > /dev/null" % (sys.executable, scriptname))
            noutput += 1
        else:
            E.warn("script %s does not exist - skipped" % scriptname)
            nskipped += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
示例#9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For each positional argument a directory named after the argument's
    basename is created next to this script and a ``tests.yaml`` holding
    ``YAML_TEMPLATE`` is written into it. Existing directories are
    skipped.

    Raises:
        ValueError: if no positional arguments are given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("script", "module"),
        help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not args:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)
    counter = E.Counter()

    for arg in args:
        counter.input += 1
        basename = os.path.basename(arg)
        test_dir = os.path.join(targetdir, basename)

        if not os.path.exists(test_dir):
            os.mkdir(test_dir)
            template_path = os.path.join(test_dir, "tests.yaml")
            with open(template_path, "w") as outf:
                outf.write(YAML_TEMPLATE)
            counter.created += 1
        else:
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
示例#10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta records from the file named by ``options.stdin.name``,
    keeps those whose title matches the tRNA naming pattern and is not
    flagged as "pseudo", and writes each kept sequence with a ``cca``
    tail appended to ``options.stdout``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    E.info(options.stdin)

    # This is a temp fix because bedtools getfasta --name seems to have
    # changed the way it names the fasta titles.
    # Raw string: the pattern text is unchanged, but "\d" and "\S" in a
    # plain literal are invalid escape sequences.
    title_pattern = re.compile(
        r"(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])")

    # title -> sequence, in order of first appearance
    d = collections.OrderedDict()

    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            title = cur_record.title.replace("(", "").replace(")", "")
            m = title_pattern.match(title)

            # skip titles that do not match and pseudo sequences
            if m is None or m.group(2) == "pseudo":
                continue

            # key is the matched name followed by the strand character
            d[m.group(1) + m.group(3)] = cur_record.sequence

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
示例#11
0
def main(argv):
    """Print the contents of the file given as first argument, reversed.

    The whole file is read into memory and printed character-reversed.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv)

    # context manager closes the handle, which was previously leaked
    with open(args[0]) as inf:
        data = inf.read()

    print(data[::-1])
示例#12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the fasta files given as positional arguments in lockstep and
    writes, for every position, one record that carries the title of the
    first file's record and the concatenation of all sequences (with
    blanks removed).

    Raises:
        ValueError: if fewer than two files are given or the files do
            not contain the same number of records.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    # forward argv; it used to be resolved above but never passed on
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    iterators = [FastaIterator.FastaIterator(iotools.open_file(a, "r"))
                 for a in args]

    ninput, noutput, nerrors = 0, 0, 0

    # private sentinel marking an exhausted iterator
    exhausted = object()

    while True:
        # Advance every iterator. The previous version stopped at the
        # first exhausted one, so extra records in later files went
        # unnoticed whenever the first file was the shortest.
        records = [next(iterator, exhausted) for iterator in iterators]
        nfinished = sum(1 for record in records if record is exhausted)

        if nfinished == len(records):
            break
        ninput += 1

        if nfinished:
            raise ValueError("unequal number of sequences in files")

        noutput += 1

        options.stdout.write(">%s\n%s\n" % (
            records[0].title,
            "".join(record.sequence.replace(" ", "") for record in records)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
示例#13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Iterates over two fastq files in parallel and writes each read pair
    as two fasta records to ``options.stdout``. The two files may be
    given via ``-a``/``-b`` or as exactly two positional arguments.

    Raises:
        ValueError: if one of the two input files is not specified.
        PairedReadError: if one file holds more reads than the other.
        AssertionError: if identifiers do not end in ``/1`` and ``/2``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-fastq-file", dest="fastq1", type="string",
        help="supply read1 fastq file")
    parser.add_option(
        "-b", "--second-fastq-file", dest="fastq2", type="string",
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    # fail early with a clear message instead of trying to open None
    if not options.fastq1 or not options.fastq2:
        raise ValueError("please supply both read1 and read2 fastq files")

    E.info("iterating over fastq files")
    f1_count = 0
    with iotools.open_file(options.fastq1) as fastq1:
        with iotools.open_file(options.fastq2) as fastq2:
            for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                                      Fastq.iterate(fastq2)):
                # zip_longest pads the shorter file with None
                if f1 is None or f2 is None:
                    raise PairedReadError(
                        "unpaired reads detected. Are files sorted? are "
                        "files of equal length?")

                assert f1.identifier.endswith("/1") and \
                    f2.identifier.endswith("/2"), \
                    "Reads in file 1 must end with /1 and reads in file 2 with /2"
                options.stdout.write(
                    ">%s\n%s\n>%s\n%s\n" %
                    (f1.identifier, f1.seq, f2.identifier, f2.seq))
                f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated gene set lines from ``options.stdin`` (first
    column: set name, second column: evidence, remaining columns: genes)
    and writes one line per gene with the columns ontology, gene, name,
    description and evidence. With ``--filter`` only sets named in the
    ``--filter-list`` file are kept.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--ontology",
                      dest="ontology",
                      type="string",
                      help="ontology label")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      action="store_true",
                      help="filter out genesets")

    parser.add_option("-l",
                      "--filter-list",
                      dest="filter_list",
                      type="string",
                      help="list of pathways to keep")

    parser.set_defaults(ontology=None, filter=False, filter_list=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    filter_set = set()
    if options.filter:
        assert options.filter_list, "must specify a list of pathways to keep"
        # rstrip("\n") instead of [:-1]: a last line without a trailing
        # newline must not lose its final character.
        with open(options.filter_list) as filter_file:
            filter_set = {line.rstrip("\n") for line in filter_file}

    for line in options.stdin:
        data = line.rstrip("\n").split("\t")
        # NOTE(review): description repeats the name (data[0]) - confirm
        # that this is intended.
        name, description, evidence = data[0], data[0], data[1]
        if options.filter and name not in filter_set:
            continue
        for gene in data[2:]:
            options.stdout.write("\t".join(
                [options.ontology, gene, name, description, evidence]) + "\n")

    # write footer and output benchmark information.
    E.stop()
示例#15
0
def main(argv=None):
    """Summarize a delimited table read from ``options.stdin``.

    Without ``--method`` the rows yielded by ``compute_table_summary``
    are written to ``options.stdout``. The methods ``column-describe``
    and ``row-describe`` write the stacked output of pandas ``describe``
    (per column, or per row via the transposed table) to a separate
    output file opened with ``E.open_output_file``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter to separate columns [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=["row-describe", "column-describe"],
                      help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # "summary" is not a command line choice; it is the implicit default
    if not options.methods:
        options.methods = ["summary"]

    # sep must be passed by keyword: recent pandas releases accept only
    # the input as a positional argument of read_csv.
    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                options.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""), info))) + "\n")
        elif method in ("column-describe", "row-describe"):
            if method == "column-describe":
                df = table.describe().T.stack()
            else:
                df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
示例#16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads whitespace-separated interval lines from the file named by
    ``options.stdin.name``, widens start and end by 50 and adjusts the
    block columns accordingly. The widened start/end are also written to
    output columns 7 and 8. Lines whose fourth column contains "pseudo"
    are not written.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    infile = IOTools.open_file(options.stdin.name)

    for line in infile.readlines():
        fields = line.split()

        # every line is converted before the pseudo check, so malformed
        # lines raise regardless of their name column
        raw_start = int(fields[1])
        raw_end = int(fields[2])
        ext_start = str(raw_start - 50)
        ext_end = str(raw_end + 50)
        out_fields = [fields[0], ext_start, ext_end,
                      fields[3], fields[4], fields[5],
                      ext_start, ext_end,
                      fields[8], fields[9]]

        if "pseudo" in fields[3]:
            continue

        if int(fields[9]) == 2:
            # two blocks: both sizes grow by 50, the second block start
            # is recomputed relative to the widened interval
            size_a, size_b = fields[10].split(",")
            second_start = raw_end - raw_start - int(size_b) + 50
            sizes = str(int(size_a) + 50) + ',' + str(int(size_b) + 50)
            starts = '0' + ',' + str(second_start)
            out_fields.extend([sizes, starts])
        else:
            # otherwise: the single size value grows by 100
            out_fields.extend([str(int(fields[10]) + 100), fields[11]])

        options.stdout.write('\t'.join(out_fields) + '\n')

    E.stop()
示例#17
0
def main(argv):
    """Print the file given as first argument with every "o" replaced by "a".

    The whole file is read into memory before substitution.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--option", dest="option", type="string")

    (options, args) = E.start(parser, argv)

    # context manager closes the handle, which was previously leaked
    with open(args[0]) as inf:
        data = inf.read()

    print(re.sub("o", "a", data))

    E.stop()
示例#18
0
def main(argv=sys.argv):
    """Write name and sequence length of every read in a fastq file.

    The fastq file is taken from ``-i`` or from a single positional
    argument. Only the method list ``["length"]`` is implemented.

    Raises:
        ValueError: if no input file is given.
        NotImplementedError: if the selected methods are not exactly
            ``["length"]``.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-fastq-file", dest="input_fastq_file", type="string",
        help="input fastq file. [%default]")

    parser.add_option(
        "-m", "--method", dest="methods", action="append", type="choice",
        choices=("length", ),
        help="methods to apply [%default]")

    parser.set_defaults(methods=[], input_fastq_file=None)

    (options, args) = E.start(parser, argv)

    # a single positional argument overrides -i
    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    out = options.stdout
    with pysam.FastqFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            out.write("{}\t{}\n".format(read.name, len(read.sequence)))
            counter.output += 1

    E.info(counter)
    E.stop()
示例#19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Checks every file given as positional argument with
    ``iotools.isComplete`` (the companion ``<file>.log`` first, if it
    exists, then the file itself) and deletes the files that are not
    complete. With ``--dry-run`` nothing is deleted.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()
    for filename in args:
        c.checked += 1

        logfile = filename + ".log"
        log_complete = os.path.exists(logfile) and iotools.isComplete(logfile)
        # the file itself is only inspected when its log does not
        # already mark it as complete
        if log_complete or iotools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        # the message is logged in dry-run mode as well
        E.info('deleting %s' % filename)
        if not options.dry_run:
            os.unlink(filename)
            c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
示例#20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta records from the file named by ``options.stdin.name``,
    drops every record whose title contains "pseudo" and writes the
    remaining sequences with a ``cca`` tail appended.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    infile = IOTools.open_file(options.stdin.name)

    # title -> sequence; a repeated title keeps its first position but
    # takes the sequence of the last record
    sequences = collections.OrderedDict()
    for record in FastaIterator.FastaIterator(infile):
        title = record.title
        if "pseudo" not in title:
            sequences[title] = record.sequence

    for title, sequence in sequences.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (title, sequence))

    E.stop()
示例#21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Copies lines from stdin (or from the files given as arguments) to
    ``options.stdout``. The first line not starting with "#" is
    remembered as the header; later lines identical to it are skipped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    read_stdin = len(args) == 0 or (len(args) == 1 and args[0] == "-")
    if read_stdin:
        infile = options.stdin
    else:
        infile = fileinput.FileInput(args)

    ninput = nskipped = noutput = 0

    # False until the first non-comment line has been seen
    header = False

    for line in infile:
        ninput += 1
        # comment lines are always passed through
        if not line.startswith("#"):
            if not header:
                header = line
            elif line == header:
                nskipped += 1
                continue

        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
示例#22
0
def main(argv=None):
    """Split a fastq file into kept and dropped reads.

    Read identifiers listed (one per line) in the ``--to-drop-single``
    file are written to ``--fastq-drop1``; all other reads of
    ``--fastq1`` go to ``--fastq-out1``. A read is matched on the first
    whitespace-separated token of its identifier.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    # forward argv; it used to be resolved above but never passed on
    (options, args) = E.start(parser, argv=argv)

    with IOTools.open_file(options.to_remove_singletons) as inf:
        reads_to_remove = {line.strip() for line in inf}

    reads = 0
    dropped_reads = 0
    # options.fastq1 replaces the undefined bare name `fastq1`, which
    # raised a NameError before any read was processed.
    with IOTools.open_file(options.fq_out1, 'w') as fastq_out, \
            IOTools.open_file(options.fq_dropped1, 'w') as fastq_host, \
            IOTools.open_file(options.fastq1) as fastq_in:
        for read in Fastq.iterate(fastq_in):
            reads += 1
            if read.identifier.split()[0] in reads_to_remove:
                target = fastq_host
                dropped_reads += 1
            else:
                target = fastq_out
            target.write("@%s\n%s\n+\n%s\n" %
                         (read.identifier, read.seq, read.quals))

    # avoid a division by zero for empty input
    percent_dropped = dropped_reads / float(reads) * 100 if reads else 0.0

    E.info('Dropped %i of %i reads (%f percent)'
           % (dropped_reads, reads, percent_dropped))

    # write footer and output benchmark information.
    E.stop()
示例#23
0
def main(argv=sys.argv):
    """Convert fastq records into fasta records with a synthetic header.

    Every read is written to ``options.stdout`` under a header of the
    form ``>test/<n>/1_<len+1> RQ=0.<q>``, where ``n`` is the running
    read number and ``q`` the floored mean of the read's quality array.
    The sequence is wrapped at ``line_width`` characters.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-fastq", dest="input_fastq_file", type="string",
        help="input fastq file")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=["ont2pacbio"],
        help="methods to apply [%default]")

    parser.set_defaults(input_fastq_file=None,
                        line_width=80,
                        method=None)

    (options, args) = E.start(parser, argv, add_output_options=True)

    # a single positional argument overrides -i
    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file == "-":
        options.input_fastq_file = options.stdin

    outf = options.stdout
    width = options.line_width

    reads = pysam.FastqFile(options.input_fastq_file)
    for well_no, record in enumerate(reads, start=1):
        quals = record.get_quality_array()
        seq = record.sequence
        mean_quality = int(math.floor(sum(quals) / len(quals)))

        header = ">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, mean_quality)
        outf.write(header)

        for offset in range(0, len(seq), width):
            outf.write(seq[offset:offset + width] + "\n")

    E.stop()
示例#24
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    Copies the first ``--keep-header`` lines of stdin unchanged and
    writes all remaining lines in random order.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-k", "--keep-header", dest="keep_header", type="int",
        help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    source = options.stdin
    sink = options.stdout
    c = E.Counter()

    # pass the header lines through untouched
    for _ in range(options.keep_header):
        c.header += 1
        sink.write(source.readline())

    # the rest is held in memory and shuffled in place
    body = source.readlines()
    c.lines_input = len(body)
    random.shuffle(body)
    for shuffled_line in body:
        sink.write(shuffled_line)
    c.lines_output = len(body)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
示例#25
0
def main(argv=None):
    """Write every record produced by ``makeSplicedFasta`` to stdout.

    Command line options are taken from sys.argv, unless *argv* is given.
    The input file name is the last element of the argument vector.
    """
    argv = sys.argv if argv is None else argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # common options (-h/--help, ...) are added by E.start
    (options, args) = E.start(parser, argv=argv)

    # the input file is read from the raw argument vector, not from *args*
    spliced_records = makeSplicedFasta(argv[-1])
    for entry in spliced_records:
        options.stdout.write(entry)

    # footer and benchmark information
    E.stop()
示例#26
0
def main(argv=None):
    """Report the version of every registered tool, metric, split and collate runner.

    For each task in the four runner maps the runner is instantiated and
    asked for its version via ``get_version()``. A tab-separated table
    with the columns ``section``, ``task``, ``version`` and ``comments``
    is written to ``options.stdout``. Runners that raise while being
    instantiated or queried are listed with an empty version and the
    comment ``unavailable``.

    Per-section counters and the counter accumulated over all sections
    are logged through ``E.info``.

    *argv* is passed on to ``E.start`` for command line parsing.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                # best effort: a runner whose version cannot be determined
                # is still listed, just flagged as unavailable.
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    # report the accumulated counter, not the counter of the last section
    E.info("{}: {}".format("total", total_counter))
    E.stop()
示例#27
0
def main(argv=None):
    """Write selected columns of a CSV table read from stdin to stdout.

    parses command line options in sys.argv, unless *argv* is given.

    The columns named in the comma-separated ``-s`` option are written in
    the order given; columns of the input that are not listed are dropped.
    Without ``-s`` all columns of the input are written in their original
    order. Each row is passed through ``iotools.convertDictionary`` before
    being written.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    # NOTE(review): the long option string below looks like the residue of
    # an automated rename ("--sort-order" was probably intended). It is
    # kept unchanged so existing command lines behave as before - confirm.
    parser.add_option("-s",
                      "--method=sort --sort-order",
                      dest="sort",
                      type="string",
                      help="fields to take (in sorted order).")

    # hand the argument vector on so that an explicit *argv* is honoured
    (options, args) = E.start(parser, argv=argv, add_csv_options=True)

    reader = csv.DictReader(E.stdin, dialect=options.csv_dialect)

    if options.sort:
        fields = options.sort.split(",")
    else:
        # no selection given: fall back to the header of the input.
        # ``fieldnames`` is None for empty input, hence the ``or``.
        fields = list(reader.fieldnames or [])

    writer = csv.DictWriter(E.stdout,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    E.stdout.write("\t".join(fields) + "\n")

    for row in reader:
        row = iotools.convertDictionary(row)
        writer.writerow(row)

    E.stop()
示例#28
0
def main(argv=None):
    """Run one of the table tasks and write the resulting table to stdout.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on ``--task`` the output table is built by

    * ``extract_table``: ``getTableFromDb`` on the table named by ``-t``,
    * ``get_coverage``: ``getModelCoverage`` over all tables matching
      ``(\\S+)_transcript_counts``,
    * ``clean_table``: ``cleanStatsTable`` on the file given as the last
      command line argument.

    The result is written tab-separated to ``options.stdout`` with
    ``track`` as index label.

    Raises ``ValueError`` if no task was given on the command line.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task", dest="task", type="choice",
                      choices=["extract_table", "get_coverage",
                               "clean_table"],
                      help="task to perform")

    parser.add_option("-t", "--table-name", dest="table", type="string",
                      help="table in SQLite DB to extract")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_database_options=True)

    if options.task == "extract_table":
        out_df = getTableFromDb(options.database_url, options.table)

    elif options.task == "get_coverage":
        # raw string: "\S" is not a valid escape in a plain string literal
        out_df = getModelCoverage(options.database_url,
                                  table_regex=r"(\S+)_transcript_counts")

    elif options.task == "clean_table":
        # the input file is read from the raw argument vector
        infile = argv[-1]
        out_df = cleanStatsTable(infile)

    else:
        # --task has no default; fail with a clear message instead of an
        # UnboundLocalError on out_df below.
        raise ValueError(
            "no task specified, use --task with one of: "
            "extract_table, get_coverage, clean_table")

    out_df.to_csv(options.stdout,
                  sep="\t", index_label="track")

    # write footer and output benchmark information.
    E.stop()
示例#29
0
def main(argv=None):
    """Parse the command line and exit.

    Options are read from sys.argv, unless *argv* is given. Apart from
    option parsing and the start/stop bookkeeping done by ``E`` no work
    is performed.
    """
    argv = sys.argv if argv is None else argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option(
        "-t", "--test",
        dest="test",
        type="string",
        help="supply help")

    # common options (-h/--help, ...) are added by E.start
    (options, args) = E.start(parser, argv=argv)

    # footer and benchmark information
    E.stop()
示例#30
0
def main(argv=None):
    """Import a module and call one of its functions from the command line.

    The module given with ``-m`` is imported by name, after its directory
    (if a path was given) has been appended to ``sys.path``. The function
    given with ``-f`` is looked up in that module and called as one of

    * ``function(infiles, outfiles, params)``
    * ``function(infiles, outfiles)``
    * ``function(params)``

    depending on which of ``-i``, ``-o`` and ``-p`` were supplied. A
    single input or output file is passed as a string, several as a
    list. The literal file name ``None`` stands for "no file".

    *argv* is passed on to ``E.start`` for command line parsing.

    Raises ``ValueError`` if module or function are missing or if the
    supplied arguments match none of the call signatures above, and
    ``AttributeError`` if the module has no attribute of the requested
    name.
    """

    # Parse the options
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p",
        "--params",
        "--args",
        dest="params",
        type="string",
        help="comma separated list of addtional parameter strings")

    parser.add_option("-m",
                      "--module",
                      dest="module",
                      type="string",
                      help="the full path to the module file",
                      default=None)

    parser.add_option("-i",
                      "--input",
                      dest="input_filenames",
                      type="string",
                      action="append",
                      help="input filename")

    parser.add_option("-o",
                      "--output-section",
                      dest="output_filenames",
                      type="string",
                      action="append",
                      help="output filename")

    parser.add_option("-f",
                      "--function",
                      dest="function",
                      type="string",
                      help="the module function",
                      default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    # hand the argument vector on so that an explicit *argv* is honoured
    (options, args) = E.start(parser, argv=argv)

    # Check a module and function have been specified
    if not options.module or not options.function:
        raise ValueError("Both a function and Module must be specified")

    # initialize defaults
    P.get_parameters()

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)

    module = importlib.import_module(module_base_name)
    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        # exceptions have no ``.message`` attribute in Python 3; use the
        # string form so that the helpful listing is actually raised.
        raise AttributeError(
            str(msg) + "unknown function, available functions are: %s" %
            ",".join([x for x in dir(module) if not x.startswith("_")]))

    # ``False`` marks "not supplied"; the literal name "None" counts as such
    if options.input_filenames and options.input_filenames != ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and options.output_filenames != ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "Expecting infile+outfile+params or infile+outfile or params")

    E.stop()