Exemplo n.º 1
0
def test_check_is_left():
    """Exercise check_is_left() on a table of read names.

    Names ending in '/1' or carrying a ' 1:...' suffix must be accepted;
    '/2', ' 2:...' and untagged names must be rejected.
    """
    # (read name, expected truthiness), in the order the checks run.
    cases = [
        ("seq1/1", True),
        ("seq1 1::N", True),
        ("seq1/2", False),
        ("seq1 2::N", False),
        ("seq", False),
        ("seq 1", False),
        ("@HWI-ST412:261:d15khacxx:8:1101:3149:2157 1:N:0:ATCACG", True),
    ]
    for read_name, expected in cases:
        assert bool(check_is_left(read_name)) is expected
Exemplo n.º 2
0
def test_check_is_left():
    """Verify which read names check_is_left() accepts and rejects."""
    is_left = check_is_left

    # '/1' and ' 1:...' suffixes are accepted ...
    assert is_left('seq1/1') and is_left('seq1 1::N')
    # ... while the matching '/2' and ' 2:...' suffixes are not.
    assert not (is_left('seq1/2') or is_left('seq1 2::N'))

    # Names without a recognised pair tag are rejected too.
    assert not (is_left('seq') or is_left('seq 1'))

    full_header = '@HWI-ST412:261:d15khacxx:8:1101:3149:2157 1:N:0:ATCACG'
    assert is_left(full_header)
Exemplo n.º 3
0
def test_check_is_left():
    """Check check_is_left() against accepted and rejected read names."""
    accepted = ('seq1/1', 'seq1 1::N')
    for name in accepted:
        assert check_is_left(name)

    # Right-hand tags first, then names with no usable tag at all.
    rejected = ('seq1/2', 'seq1 2::N', 'seq', 'seq 1')
    for name in rejected:
        assert not check_is_left(name)

    long_header = '@HWI-ST412:261:d15khacxx:8:1101:3149:2157 1:N:0:ATCACG'
    assert check_is_left(long_header)
Exemplo n.º 4
0
def main():
    """Interleave the left and right read files into one output.

    Parses the command line, checks both inputs, then walks the two files
    in lockstep and writes each pair via write_record_pair(). Unless
    ``args.no_reformat`` is set, '/1' and '/2' are appended to names that
    check_is_left()/check_is_right() reject, and each pair is validated
    with check_is_pair(). Exits with status 1 if the inputs hold different
    numbers of records or a pair fails validation. Progress and the final
    summary are printed to stderr.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    left_path, right_path = args.left, args.right
    for path in (left_path, right_path):
        check_input_files(path, args.force)
    check_space([left_path, right_path], args.force)

    print("Interleaving:\n\t%s\n\t%s" % (left_path, right_path),
          file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    pair_count = 0
    left_reads = screed.open(left_path)
    right_reads = screed.open(right_path)
    paired = zip_longest(left_reads, right_reads)
    for seen, (read1, read2) in enumerate(paired):
        # zip_longest pads the shorter input with None.
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."),
                  file=sys.stderr)
            sys.exit(1)

        # 'seen' is the number of pairs handled before this one.
        if seen % 100000 == 0:
            print('...', seen, 'pairs', file=sys.stderr)
        pair_count = seen + 1

        left_name = read1.name
        right_name = read2.name

        if not args.no_reformat:
            new_left = (left_name if check_is_left(left_name)
                        else left_name + '/1')
            new_right = (right_name if check_is_right(right_name)
                         else right_name + '/2')

            read1.name = new_left
            read2.name = new_right

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name),
                      file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % pair_count, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
Exemplo n.º 5
0
def main():
    """Interleave the left and right read files into one output.

    Parses the command line, checks both inputs, then reads the two files
    in lockstep and writes each pair via write_record_pair(). Unless
    ``args.no_reformat`` is set, '/1' and '/2' are appended to names that
    check_is_left()/check_is_right() reject, and each pair is validated
    with check_is_pair(). Exits with status 1 if the inputs hold different
    numbers of records or a pair fails validation. Progress and the final
    summary are printed to stderr.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        # zip_longest pads the shorter input with None, so a None on either
        # side means the two files are out of step.
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        # Progress report before counting the current pair.
        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            # Tag names that are not already recognised as left/right.
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
Exemplo n.º 6
0
def main():
    """Interleave an R1/R2 pair of read files into ``args.output``.

    Takes one or two input names from ``args.infiles``. With a single name
    the second is derived by replacing '_R1_' with '_R2_'. Missing inputs
    abort with status 1 unless ``args.force`` is set. Reads are taken from
    both files in lockstep; '/1' and '/2' are appended to names that
    check_is_left()/check_is_right() reject, every pair is validated with
    check_is_pair(), and pairs are written with write_record_pair().
    Messages go to stderr. (Python 2 print-statement syntax.)
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for filename in args.infiles:
        check_file_status(filename, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) != 2:
        # No explicit mate file: guess it from the R1 name.
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s2_file == s1_file:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)
    else:
        s2_file = args.infiles[1]

    r1_missing = not os.path.exists(s1_file)
    if r1_missing:
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file

    r2_missing = not os.path.exists(s2_file)
    if r2_missing:
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file

    if (r1_missing or r2_missing) and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    pair_count = 0
    left_reads = screed.open(s1_file, parse_description=False)
    right_reads = screed.open(s2_file, parse_description=False)
    paired = itertools.izip_longest(left_reads, right_reads)
    for seen, (read1, read2) in enumerate(paired):
        # izip_longest pads the shorter input with None.
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        # 'seen' is the number of pairs handled before this one.
        if seen % 100000 == 0:
            print >> sys.stderr, '...', seen, 'pairs'
        pair_count = seen + 1

        left_name = read1.name
        new_left = (left_name if check_is_left(left_name)
                    else left_name + '/1')
        right_name = read2.name
        new_right = (right_name if check_is_right(right_name)
                     else right_name + '/2')

        read1.name = new_left
        read2.name = new_right

        if not check_is_pair(read1, read2):
            print >>sys.stderr, ("ERROR: This doesn't look like paired data! "
                                 "%s %s" % (read1.name, read2.name))
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % pair_count
    print >> sys.stderr, 'output written to', args.output.name
Exemplo n.º 7
0
def main():
    """Split a file of paired reads into a left file and a right file.

    Output names default to ``<basename(infile)>.1`` and ``.2`` (inside
    ``args.output_directory`` when given, created if missing) and can be
    overridden by ``args.output_first`` / ``args.output_second``. Reads are
    walked with broken_paired_reader(): both members of a pair are written
    to their respective files, while an unpaired read is routed by
    check_is_left()/check_is_right(). Exits with status 1 when
    ``args.force_paired`` is set and an unpaired read is met, or when an
    unpaired read's name is recognised as neither left nor right.
    Messages go to stderr. (Python 2 print-statement syntax.)
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The with-statement closes (and flushes) both outputs on every exit
    # path, including the sys.exit(1) calls inside the loop.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index:
                print >> sys.stderr, '...', index

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                    record1.name
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # Unpaired read: route it by the tag in its name.
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    print >>sys.stderr, \
                        "Unrecognized format for read pair information: %s" \
                        % name
                    print >> sys.stderr, "Exiting."
                    sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (counter1 + counter2, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
Exemplo n.º 8
0
def main():
    """Split a file of paired reads into a left file and a right file.

    Output names default to ``<basename(infile)>.1`` and ``.2`` (inside
    ``args.output_directory`` when given, created if missing) and can be
    overridden by ``args.output_first`` / ``args.output_second``. Reads are
    walked with broken_paired_reader(): both members of a pair are written
    to their respective files, while an unpaired read is routed by
    check_is_left()/check_is_right(). Exits with status 1 when
    ``args.force_paired`` is set and an unpaired read is met, or when an
    unpaired read's name is recognised as neither left nor right.
    Messages go to stderr. (Python 2 print-statement syntax.)
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # Opening both outputs in a with-statement guarantees they are closed
    # (and flushed) however the loop ends, including via sys.exit(1).
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index:
                print >> sys.stderr, '...', index

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                    record1.name
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # Unpaired read: decide its side from the name.
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    print >>sys.stderr, \
                        "Unrecognized format for read pair information: %s" \
                        % name
                    print >>sys.stderr, "Exiting."
                    sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (counter1 + counter2, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
Exemplo n.º 9
0
def main():
    """Merge an R1 file and an R2 file into one interleaved output.

    ``args.infiles`` holds one or two names; when only one is given, the
    R2 name is obtained by substituting '_R2_' for '_R1_'. If either file
    is missing the script exits with status 1 unless ``args.force`` is
    set. Each R1/R2 record pair gets '/1' and '/2' appended where
    check_is_left()/check_is_right() reject the name, is validated with
    check_is_pair(), and is written to ``args.output``. Messages go to
    stderr. (Python 2 print-statement syntax.)
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    infiles = args.infiles
    for path in infiles:
        check_file_status(path, args.force)

    check_space(args.infiles, args.force)

    s1_file = infiles[0]
    if len(infiles) == 2:
        s2_file = infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        # An unchanged name means there was no '_R1_' to substitute.
        if s1_file == s2_file:
            print >> sys.stderr, ("ERROR: given only one filename, that "
                                  "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    have_r1 = os.path.exists(s1_file)
    if not have_r1:
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file

    have_r2 = os.path.exists(s2_file)
    if not have_r2:
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file

    if not (have_r1 and have_r2) and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    total_pairs = 0
    r1_records = screed.open(s1_file, parse_description=False)
    r2_records = screed.open(s2_file, parse_description=False)
    lockstep = itertools.izip_longest(r1_records, r2_records)
    for done, (read1, read2) in enumerate(lockstep):
        # A None on either side means one file ran out before the other.
        if read1 is None or read2 is None:
            print >> sys.stderr, ("ERROR: Input files contain different number"
                                  " of records.")
            sys.exit(1)

        # 'done' counts the pairs processed before the current one.
        if done % 100000 == 0:
            print >> sys.stderr, '...', done, 'pairs'
        total_pairs = done + 1

        name1 = read1.name
        if not check_is_left(name1):
            name1 = name1 + '/1'
        name2 = read2.name
        if not check_is_right(name2):
            name2 = name2 + '/2'

        read1.name, read2.name = name1, name2

        if not check_is_pair(read1, read2):
            print >>sys.stderr, ("ERROR: This doesn't look like paired data! "
                                 "%s %s" % (read1.name, read2.name))
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % total_pairs
    print >> sys.stderr, 'output written to', args.output.name
Exemplo n.º 10
0
def main():
    """Split a file of paired reads into a left file and a right file.

    When the input is '/dev/stdin' or '-', both ``args.output_first`` and
    ``args.output_second`` must be supplied; otherwise the script exits
    with status 1. For regular inputs the outputs default to
    ``<basename(infile)>.1`` / ``.2`` (inside ``args.output_directory``
    when given, created if missing). ``args.output_first`` /
    ``args.output_second`` are used directly as output handles when
    provided, and their ``.name`` is reported in the summary.

    Reads are walked with broken_paired_reader(): both members of a pair
    are written to their respective outputs, while an unpaired read is
    routed by check_is_left()/check_is_right(). Exits with status 1 when
    ``args.force_paired`` is set and an unpaired read is met, or when an
    unpaired read's name is recognised as neither left nor right. All
    messages go to stderr.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if infile == '/dev/stdin' or infile == '-':
        if not (args.output_first and args.output_second):
            # Use the print function like the rest of this block; the
            # Python 2 ``print >> stream`` form raises TypeError when print
            # is a function, hiding the actual error message.
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = args.output_first
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = open(out1, 'w')
    if args.output_second:
        fp_out2 = args.output_second
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = open(out2, 'w')

    counter1 = 0
    counter2 = 0

    screed_iter = screed.open(infile, parse_description=False)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(screed_iter)
    for index, is_pair, record1, record2 in paired_iter:
        if index % 10000 == 0:
            print('...', index, file=sys.stderr)

        # are we requiring pairs?
        if args.force_paired and not is_pair:
            print('ERROR, %s is not part of a pair' % record1.name,
                  file=sys.stderr)
            sys.exit(1)

        if is_pair:
            write_record(record1, fp_out1)
            counter1 += 1
            write_record(record2, fp_out2)
            counter2 += 1
        else:
            # Unpaired read: route it by the tag in its name.
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print("Unrecognized format for read pair information: %s" %
                      name,
                      file=sys.stderr)
                print("Exiting.", file=sys.stderr)
                sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)