Пример #1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
Пример #2
0
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options
        threads = self.threads

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''pandaseq -f %(infile1)s -r %(infile2)s
        %(processing_options)s
        -T %(threads)i
        -U >(gzip > %(outfile)s.unpaired.gz)
        -w >(gzip > %(outfile)s)
        -F
        -G %(output_prefix)s-pandaseq.log.bgz;
        >& %(output_prefix)s-pandaseq.log;
        gzip %(outdir)s/*;
        ''' % locals()

        return cmd
Пример #3
0
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options
        r = {33: 'sanger', 64: 'illumina', 59: 'solexa'}
        quality = r[offset]

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            cmd = '''sickle se
            -g %(processing_options)s
            --qual-type %(quality)s
            --output-file %(outfile)s
            --fastq-file %(infile)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            cmd = '''sickle pe
            -g -s %(processing_options)s
            --qual-type %(quality)s
            -f %(infile1)s -r %(infile2)s
            -o %(outfile1)s -p %(outfile2)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        return cmd
Пример #4
0
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match primer sequence (14bp)'''
    to_cluster = True
    primer = "a"
    if infile.find("_b.") > 0:
        primer = "b"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    infile2 = track + ".fastq.2.gz"
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match
    fastq_in = open(infile, "r")
    fastq_out = open(tempfile, "wb")
    for read in fq.iterate(fastq_in):
        if read.seq[:grep_filter_length] == primer_subseq:
            fastq_out.writeln("@" + read.id)
            fastq_out.writeln(read.seq)
            fastq_out.writeln("+")
            fastq_out.writeln(read.qual)
    fastq_in.close()
    fastq_out.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulateds
    data. This involes creating maps between different identifiers
    from the NCBI taxonomy. This is so that the results are comparable
    to species level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[
            0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical
    coverage we would get over genomes in the 
    sample i.e. at 1X coverage
    '''

    # if paired end then will have to multiply
    # by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.items():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
Пример #7
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    for read in Fastq.iterate(IOTools.open_file(fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))

    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)' \
           % (dropped_reads, reads, percent_dropped))
Пример #8
0
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        assert len(infiles) == len(outfiles)

        cmds = []
        for infile, outfile in zip(infiles, outfiles):

            cmds.append('''zcat %(infile)s
            | fastx_trimmer
            -Q%(offset)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            | gzip > %(outfile)s
            ;''' % locals())

        return " ; ".join(cmds)
Пример #9
0
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        gzip %(outdir)s/*;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
Пример #10
0
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options
        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            outdir = os.path.dirname(outfile)
            trim_out = "%s/%s_trimmed.fq.gz" % (outdir, infile.replace(".fastq.gz", ""))
            cmd = '''trim_galore %(processing_options)s
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile)s
            2>>%(output_prefix)s.log;
            mv %(trim_out)s %(outfile)s;
            ''' % locals()
            outfiles = (outfile,)

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            outdir = os.path.dirname(outfile1)
            cmd = '''trim_galore %(processing_options)s
            --paired
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile1)s %(infile2)s
            2>>%(output_prefix)s.log;
            mv %(outdir)s/%(infile1)s_val_1.fq.gz %(outfile1)s;
            mv %(outdir)s/%(infile2)s_val_2.fq.gz %(outfile2)s;
            ''' % locals()

        return cmd
Пример #11
0
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        threads = self.threads
        processing_options = self.processing_options
        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]

            cmd = '''trimmomatic SE
            -threads %(threads)i
            -phred%(offset)s
            %(infile)s %(outfile)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles

            cmd = '''trimmomatic PE
            -threads %(threads)i
            -phred%(offset)s
            %(infile1)s %(infile2)s
            %(outfile1)s %(output_prefix)s.1.unpaired
            %(outfile2)s %(output_prefix)s.2.unpaired
            %(processing_options)s
            2>> %(output_prefix)s.log;
            gzip %(output_prefix)s.*.unpaired;
            ''' % locals()

        return cmd
Пример #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Пример #13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64',
                 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess \
        the quality format of the input fastq file. The user \
        can specify the quality format of the input file using \
        the --format option. The script will use this format if \
        sequences qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f", "--target-format", dest="change_format",
        type="choice", choices=('sanger', 'solexa', 'phred64',
                                'illumina-1.8', 'integer'),
        help="The script guesses the quality format of the input \
        file and converts quality scores to the destination \
        format unless --format is specified [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += len([x for x in quals if x < min_quality])
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\tmean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads, number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.stop()
Пример #14
0
def process_cgat(options):

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(iotools.read_list(iotools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
    return c
Пример #15
0
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters
    ----------

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.

    Returns
    -------
    files : list
        A list of fastq formatted files that are contained in the archive.
    format : string
        The quality score format in the :term:`fastq` formatted files.

    """

    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file cotains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith("_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(iotools.open_file(f[0], "r"),
                                     raises=False)
    fastq_datatype = Fastq.guessDataType(iotools.open_file(f[0], "r"),
                                         raises=True)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format, fastq_datatype
Пример #16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(method="join", )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(iotools.open_file(fn1))
        iter2 = Fastq.iterate(iotools.open_file(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            s2 = Genomics.reverse_complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                c = s2[x:x + tuple_size]
                for y in d[c]:
                    offsets[x - y] += 1

            # find maximum diagonal
            sorted = sorted([(y, x) for x, y in list(offsets.items())])
            max_count, max_offset = sorted[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier, max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Пример #17
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--fastq2", dest="fastq2")
    parser.add_option("--fastq3", dest="fastq3")

    parser.add_option("--to-drop-paired", dest='to_remove_paired')
    parser.add_option("--to-drop-single", dest='to_remove_singletons')

    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-out2", dest="fq_out2")
    parser.add_option("--fastq-out3", dest="fq_out3")

    parser.add_option("--fastq-drop1", dest="fq_dropped1")
    parser.add_option("--fastq-drop2", dest="fq_dropped2")
    parser.add_option("--fastq-drop3", dest="fq_dropped3")

    (options, args) = E.start(parser)

    # Fetch the reads to remove
    pairs_to_remove = IOTools.open_file(options.to_remove_paired).readlines()
    pairs_to_remove = set([x.strip() for x in pairs_to_remove])

    print(pairs_to_remove)

    singles_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    singles_to_remove = set([x.strip() for x in singles_to_remove])

    # open the outfiles
    fastq1_out = IOTools.open_file(options.fq_out1, 'w')
    fastq2_out = IOTools.open_file(options.fq_out2, 'w')
    fastq3_out = IOTools.open_file(options.fq_out3, 'w')

    fastq1_host = IOTools.open_file(options.fq_dropped1, 'w')
    fastq2_host = IOTools.open_file(options.fq_dropped2, 'w')
    fastq3_host = IOTools.open_file(options.fq_dropped3, 'w')

    dropped_pairs = 0
    pairs = 0
    # Drop the paired reads
    for read1, read2 in zip(Fastq.iterate(IOTools.open_file(options.fastq1)),
                            Fastq.iterate(IOTools.open_file(options.fastq2))):
        pairs += 1

        # bmtagger truncates fastq headers at space and won't accept
        # non-identical headers therefore, if one read matches, both
        # are chucked.
        r1_id = read1.identifier.split()[0]
        r2_id = read2.identifier.split()[0]

        print(r1_id)
        print(r2_id)

        if r1_id in pairs_to_remove or r2_id in pairs_to_remove:
            # Both are host
            fastq1_host.write("@%s\n%s\n+\n%s\n" %
                              (read1.identifier, read1.seq, read1.quals))
            fastq2_host.write("@%s\n%s\n+\n%s\n" %
                              (read2.identifier, read2.seq, read2.quals))
            dropped_pairs += 1
        else:
            # Neither are host
            fastq1_out.write("@%s\n%s\n+\n%s\n" %
                             (read1.identifier, read1.seq, read1.quals))
            fastq2_out.write("@%s\n%s\n+\n%s\n" %
                             (read2.identifier, read2.seq, read2.quals))
    # Drop singletons
    singletons = 0
    dropped_singletons = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq3)):
        singletons += 1
        if read.identifier.split()[0] in singles_to_remove:
            fastq3_host.write("@%s\n%s\n+\n%s\n" %
                              (read.identifier, read.seq, read.quals))
            dropped_singletons += 1
        else:
            fastq3_out.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))

    fastq1_out.close()
    fastq2_out.close()
    fastq3_out.close()
    fastq1_host.close()
    fastq2_host.close()
    fastq3_host.close()

    try:
        percent_pairs = dropped_pairs / float(pairs) * 100
    except ZeroDivisionError:
        percent_pairs = 0.0
    try:
        percent_singletons = dropped_singletons / float(singletons) * 100
    except ZeroDivisionError:
        percent_singletons = 0.0

    E.info('Dropped %i of %i read pairs (%f percent)' \
           % (dropped_pairs, pairs, percent_pairs))
    E.info('Dropped %i of %i singletons (%f percent)' \
           % (dropped_singletons, singletons, percent_singletons))