示例#1
0
 def test_raise_exception(self):
     '''open_file_read() and open_file_write() must raise utils.Error when the file cannot be opened'''
     # Each entry is (opener, path that cannot be opened): reads of files
     # that do not exist, and writes into a directory that does not exist,
     # each tried with a plain name and a .gz name.
     attempts = [
         (utils.open_file_read,
          'this_file_is_not_here_so_throw_error'),
         (utils.open_file_read,
          'this_file_is_not_here_so_throw_error.gz'),
         (utils.open_file_write,
          os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error')),
         (utils.open_file_write,
          os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz')),
     ]

     for opener, path in attempts:
         with self.assertRaises(utils.Error):
             opener(path)
示例#2
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
        bad_files = [
            'fastn_unittest_fail_no_AT.fq', 'fastn_unittest_fail_no_seq.fq',
            'fastn_unittest_fail_no_plus.fq', 'fastn_unittest_fail_no_qual.fq'
        ]

        # Every malformed file must make the parser raise fastn.Error at some
        # point while it is being read through.
        for fname in bad_files:
            f_in = utils.open_file_read(fname)
            try:
                fq = fastn.Fastq()
                with self.assertRaises(fastn.Error):
                    while fq.get_next_from_file(f_in):
                        pass
            finally:
                # Close the handle even when the assertion above fails.
                utils.close(f_in)

        # Open the good file the same way as the bad ones. A missing fixture
        # then surfaces as an error in this test instead of sys.exit()
        # terminating the whole test run.
        f_in = utils.open_file_read('fastn_unittest_good_file.fq')
        try:
            fq = fastn.Fastq()
            # Every record in the good file is expected to be identical.
            while fq.get_next_from_file(f_in):
                self.assertEqual(fq, fastn.Fastq('ID', 'ACGTA', 'IIIII'))
        finally:
            utils.close(f_in)
示例#3
0
文件: blast.py 项目: JTumelty/madansi
def file_reader(fname):
    '''Generator that yields one BlastHit for every line of the file fname.

    The file is opened with utils.open_file_read() and is always closed with
    utils.close(): when the last line has been consumed, when BlastHit()
    raises, and when the caller abandons the generator early (closing a
    generator runs the finally clause).
    '''
    f = utils.open_file_read(fname)

    try:
        for line in f:
            yield BlastHit(line)
    finally:
        utils.close(f)
def get_scaff_results(dir):
    '''Return flag counts read from the file check_scaffolds.log inside dir.

    The returned dict has one entry per value in possible_flags plus the keys
    'skipped', 'lost' and 'bad_joins', all starting at zero. Log lines whose
    first field is a number set the count for that (integer) flag; lines
    whose first field is 'lost' or 'skipped' set those counts. Other lines,
    including blank ones, are ignored. 'bad_joins' is the sum of every count
    except those for flag 0, flag 16 and 'skipped'.

    If the log file does not exist a warning is printed to stderr and all
    counts stay at zero.
    '''
    flag_counts = {k: 0 for k in possible_flags}
    flag_counts['skipped'] = 0
    flag_counts['lost'] = 0

    log_file = dir + '/check_scaffolds.log'

    if os.path.exists(log_file):
        f = utils.open_file_read(log_file)
        try:
            for line in f:
                a = line.split()

                # A blank line has no fields; skip it rather than fail on a[0]
                if not a:
                    continue

                if a[0].isdigit():
                    flag_counts[int(a[0])] = int(a[1])
                elif a[0] in ['lost', 'skipped']:
                    flag_counts[a[0]] = int(a[1])
        finally:
            # Close the handle even if a count fails to parse
            utils.close(f)
    else:
        print('Warning: no log file', log_file, file=sys.stderr)
        flag_counts['bad_joins'] = 0

    # Everything that is not a correct join (0), flag 16 or a skipped tag
    # counts as a bad join; this includes 'lost'.
    flag_counts['bad_joins'] = sum(
        count for flag, count in flag_counts.items()
        if flag not in [0, 16, 'skipped']
    )

    return flag_counts
示例#5
0
    def __init__(self, filename):
        '''Load a genome diff file.

        The first line must start with '#=GENOME_DIFF'; its last
        whitespace-separated field is stored in self.version. Other lines
        starting with '#' are ignored. Every remaining line whose first
        tab-separated field is in mutation_types is parsed into a Mutation
        and stored in self.mutations.

        Raises Error if the first line is not a '#=GENOME_DIFF' line. The
        file is closed in every case, including when that error is raised.
        '''
        f = utils.open_file_read(filename)

        self.version = None
        # (seq name, pos) -> Mutation. A later line with the same key
        # replaces the earlier one.
        # NOTE(review): the original comment said the values were lists of
        # mutations - confirm whether several per position should be kept.
        self.mutations = {}

        try:
            for line in f:
                # first line should define that this is a genome diff file
                if self.version is None:
                    if not line.startswith('#=GENOME_DIFF'):
                        raise Error("Error. first line of file '" + filename +
                                    "' should start with: #=GENOME_DIFF")

                    self.version = line.rstrip().split()[-1]
                    continue

                # for now, ignore the rest of the metadata
                if line.startswith('#'):
                    continue

                fields = line.rstrip().split('\t')

                if fields[0] in mutation_types:
                    mutation = Mutation(line)
                    self.mutations[mutation.seq_id, mutation.position] = mutation
        finally:
            utils.close(f)
示例#6
0
def file_reader(fname):
    '''Generator that yields one MpileupLine for every line of the file fname.

    The file is opened with utils.open_file_read() and is always closed with
    utils.close(), including when MpileupLine() raises or the caller stops
    iterating before the end of the file.
    '''
    f = utils.open_file_read(fname)

    try:
        for line in f:
            yield MpileupLine(line)
    finally:
        utils.close(f)
示例#7
0
def file_reader(fname):
    '''Generator that yields a BlastHit built from each line of the file fname.

    The handle from utils.open_file_read() is released with utils.close() in
    a finally clause, so it is not leaked when parsing a line fails or when
    the generator is closed before reaching the end of the file.
    '''
    f = utils.open_file_read(fname)

    try:
        for line in f:
            yield BlastHit(line)
    finally:
        utils.close(f)
示例#8
0
def file_reader(fname):
    '''Generator that yields the records of the file fname as Caf objects.

    A single Caf instance is created and that same object is yielded after
    each successful get_next_from_file() call, so a caller that wants to
    keep a record beyond the next iteration should copy it (presumably each
    call overwrites the previous record - verify against Caf).

    The file is always closed with utils.close(), including when parsing
    raises or the caller stops iterating early.
    '''
    f = utils.open_file_read(fname)
    c = Caf()

    try:
        while c.get_next_from_file(f):
            yield c
    finally:
        utils.close(f)
示例#9
0
def file_reader(fname):
    '''Generator that yields a SamRecord for each non-header line of fname.

    Lines starting with '@' are skipped. The file is always closed with
    utils.close(), including when SamRecord() raises or the caller stops
    iterating before the end of the file.
    '''
    f = utils.open_file_read(fname)

    try:
        for line in f:
            if line.startswith('@'):
                continue

            yield SamRecord(line)
    finally:
        utils.close(f)
示例#10
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read every sequence correctly, including weirdness in file'''
        handle = utils.open_file_read('fastn_unittest.fa')
        record = fastn.Fasta()
        # Each record is compared against a name equal to its 1-based
        # position in the file and the sequence ACGTA.
        expected_name = 1

        while True:
            if not record.get_next_from_file(handle):
                break
            self.assertEqual(record, fastn.Fasta(str(expected_name), 'ACGTA'))
            expected_name += 1

        utils.close(handle)
示例#11
0
def file_reader(fname):
    '''Generator that yields a NucmerHit for each data line of the file fname.

    Everything up to and including the first line that starts with '[' is
    treated as header and skipped; each later line is passed to NucmerHit().
    The file is always closed with utils.close(), including when NucmerHit()
    raises or the caller stops iterating early.
    '''
    f = utils.open_file_read(fname)
    in_header = True

    try:
        for line in f:
            if in_header:
                # the line starting with '[' is itself the last header line
                if line.startswith('['):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        utils.close(f)
    def __init__(self, bsub_o, log_file, max_joins, extra_cpu=0, extra_mem=0):
        '''Collect the results of one scaffolding run.

        bsub_o    -- bsub stdout file, passed to bsub-out2stats.py to get
                     the CPU and memory usage
        log_file  -- log file of flag counts; a missing file only produces
                     a warning on stderr and leaves all counts at zero
        max_joins -- number of joins that could have been made; stored as
                     self.potential_joins
        extra_cpu -- additional CPU added to the total CPU
        extra_mem -- additional memory; self.max_mem is the larger of this
                     and the memory reported for the run

        An AssertionError is raised if bsub-out2stats.py does not return
        exactly one line or reports a non-zero exit code.
        '''
        # get flag counts etc from the log file
        self.flag_counts = {k: 0 for k in ScaffResults.possible_flags}
        self.stats = {k: 0 for k in ScaffResults.evaluation_score_keys}

        if os.path.exists(log_file):
            f = utils.open_file_read(log_file)
            try:
                for line in f:
                    a = line.split()

                    # A blank line has no fields; skip it rather than fail
                    # on a[0]
                    if not a:
                        continue

                    if a[0].isdigit():
                        self.flag_counts[int(a[0])] = int(a[1])
                    elif a[0] == 'lost':
                        self.stats['Lost tags'] = int(a[1])
                    elif a[0] == 'skipped':
                        self.stats['Skipped tags'] = int(a[1])
            finally:
                # Close the handle even if a count fails to parse
                utils.close(f)

            # Every flag other than 0 and 16, plus the lost tags, is a bad
            # join
            self.stats['Bad joins'] = sum(
                count for flag, count in self.flag_counts.items()
                if flag not in [0, 16]
            ) + self.stats['Lost tags']
        else:
            print('Warning: no log file', log_file, file=sys.stderr)

        # get cpu and mem from bsub file
        # NOTE(review): bsub_o is concatenated into a command string. If
        # utils.syscall_get_stdout runs it through a shell, a filename with
        # spaces or shell metacharacters will break - confirm and quote.
        bsub_out = utils.syscall_get_stdout('bsub-out2stats.py -s ' + bsub_o)
        assert len(bsub_out) == 1
        # Unpacking also checks that the line has exactly eight fields
        (attempt_no, exit_code, wall_hrs, cpu_secs, cpu_hrs, mem, swap,
         filename) = bsub_out[0].split('\t')
        assert exit_code == '0'

        self.stats['Correct joins'] = self.flag_counts[0]
        self.cpu = int(round(float(cpu_secs), 0))
        self.mem = int(mem)
        self.extra_cpu = extra_cpu
        self.extra_mem = extra_mem
        self.stats['Total CPU'] = self.cpu + extra_cpu
        self.max_mem = max(self.mem, extra_mem)
        self.scores = {k: -1 for k in ScaffResults.evaluation_score_keys}
        self.worksheet_row = -1
        self.potential_joins = max_joins
        self.total_joins = self.stats['Correct joins'] + self.stats['Bad joins']

        if self.total_joins > 0:
            self.percent_joins_correct = self.stats[
                'Correct joins'] / self.total_joins
        else:
            self.percent_joins_correct = 0

        # Guard the second ratio in the same way as the first: with no
        # potential joins the fraction made is reported as 0 instead of
        # raising ZeroDivisionError.
        if self.potential_joins != 0:
            self.percent_correct_joins_made = self.stats[
                'Correct joins'] / self.potential_joins
        else:
            self.percent_correct_joins_made = 0
示例#13
0
def get_sequence_lengths(fname):
    '''Return a dict of sequence name -> length taken from the header of fname.

    Header lines are those starting with '@'; reading stops at the first
    line that is not a header line. For every '@SQ' line the tab-separated
    fields after the first are read as XX:value tags, and the SN tag (name)
    is mapped to the LN tag (length, converted to int).

    Raises Error if an '@SQ' line has no SN or LN tag, or if its LN value is
    not an integer. The file is closed in every case.
    '''
    lengths = {}
    f = utils.open_file_read(fname)

    try:
        for line in f:
            if not line.startswith('@'):
                break

            if line.startswith('@SQ'):
                try:
                    tag_fields = line.rstrip().split('\t')[1:]
                    # two-letter tag -> text after the separator character
                    tags = {x[:2]: x[3:] for x in tag_fields}
                    lengths[tags['SN']] = int(tags['LN'])
                except (KeyError, ValueError):
                    # Only a missing tag or a non-integer length is a header
                    # problem; KeyboardInterrupt and similar must not be
                    # converted into Error.
                    raise Error('Error getting length from line of BAM header\n' + line)
    finally:
        utils.close(f)

    return lengths
def file2regions(fname):
    '''Return a dict of sequence name -> list of intervals read from fname.

    Lines starting with '#' are skipped. Every other line must consist of
    exactly three whitespace-separated fields (name, start, end), otherwise
    the unpacking raises ValueError. The start and end fields are passed to
    genome_intervals.Interval() as the strings read from the file
    (presumably Interval converts them - verify against genome_intervals).

    The file is closed in every case, including when a line is malformed.
    '''
    regions = {}

    f = utils.open_file_read(fname)

    try:
        for line in f:
            if line.startswith('#'):
                continue

            (seq_name, start, end) = line.rstrip().split()
            regions.setdefault(seq_name, []).append(
                genome_intervals.Interval(start, end))
    finally:
        utils.close(f)

    return regions
示例#15
0
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should round-trip data whether or not the file is gzipped'''
        for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
            # Write the numbers 0, 1, 2, one per line
            out_handle = utils.open_file_write(filename)
            for number in range(3):
                print(number, file=out_handle)
            utils.close(out_handle)

            # Read them back: line k (counting from zero) must hold k
            in_handle = utils.open_file_read(filename)
            for expected, line in enumerate(in_handle):
                self.assertEqual(expected, int(line.strip()))
            utils.close(in_handle)

            os.unlink(filename)
示例#16
0
def file_reader(fname, read_quals=False):
    '''Generator that yields the sequences of a FASTA or FASTQ file.

    The file type is taken from the first line: '>' means FASTA and '@'
    means FASTQ. An empty file yields nothing. Any other first line raises
    Error.

    The first line is stored in the module-level previous_lines dict, keyed
    by the file handle (presumably get_next_from_file() reads it from there
    - verify). read_quals is passed through to get_next_from_file().

    A single Fasta/Fastq object is reused: the same instance is yielded
    after each successful get_next_from_file() call.

    The file is always closed with utils.close(), including when Error is
    raised or the caller stops iterating early.
    '''
    f = utils.open_file_read(fname)

    try:
        line = f.readline()

        if line.startswith('>'):
            seq = Fasta()
        elif line.startswith('@'):
            seq = Fastq()
        elif line == '':
            # empty file: nothing to yield
            return
        else:
            raise Error('Error determining file type from file "' + fname +
                        '".  First line is:\n' + line.rstrip())

        # The first line has already been consumed from the handle, so
        # remember it.
        previous_lines[f] = line

        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)
# Script fragment: builds sequences whose names and lengths come from an fai
# file. Only the start of the script is visible in this snippet.
parser = argparse.ArgumentParser(
    description=
    'Makes a random genome with sequence lengths and names determined by an fai file. IMPORTANT: not really random, at the moment every base will be an A (or an N if --gaps_file used)',
    usage='%(prog)s [options] <fai file> <outfile>')
parser.add_argument(
    '--gaps_file',
    help='File of gaps, each line in the form: "chr start end" (tab separated)'
)
parser.add_argument('fai_file', help='Name of fai file')
parser.add_argument('outfile', help='Name of output fasta file')
options = parser.parse_args()

# sequence name -> list of gap intervals, filled only when --gaps_file is given
gaps = {}
if options.gaps_file:
    f = utils.open_file_read(options.gaps_file)
    for line in f:
        # each line must have exactly three tab-separated fields
        (id, start, end) = line.rstrip().split('\t')
        # both coordinates are shifted down by one
        # (presumably 1-based input to 0-based intervals - TODO confirm)
        gap = genome_intervals.Interval(int(start) - 1, int(end) - 1)
        if id not in gaps:
            gaps[id] = []
        gaps[id].append(gap)
    utils.close(f)

f_in = utils.open_file_read(options.fai_file)
f_out = utils.open_file_write(options.outfile)

# One sequence per fai line: column 1 is the name, column 2 the length.
# The sequence is that many 'A' characters.
# NOTE(review): the snippet ends inside this loop; the use of gaps, the
# writing to f_out and the closing of both handles are not visible here.
for line in f_in:
    a = line.rstrip().split()
    fa = fastn.Fasta(a[0], 'A' * int(a[1]))
# Script fragment: the code that defines `options` and `Tag` is not visible
# in this snippet.
# Filled in place by fastn.lengths_from_fai() from the reference fai file
reference_lengths = {}
fastn.lengths_from_fai(options.reference_fai, reference_lengths)

tags = {}  # id -> tag
tags_by_chr = {}  # chr -> list of tags, in file order
tags_tsv_file = options.tags_files_prefix + '.tags.tsv'
tags_fa_file = options.tags_files_prefix + '.uniquely-tagged.tags.fa'

# options.circular may be empty/None, in which case no sequence is circular
if options.circular:
    circular_seqs = set(options.circular)
else:
    circular_seqs = set()

# load tags from file
f = utils.open_file_read(tags_tsv_file)
for line in f:
    a = line.rstrip().split('\t')
    # the last column must be a single chr:::pos:::strand triple
    assert ' ' not in a[-1]
    (chr, pos, strand) = a[-1].split(':::')
    # only forward-strand tags are accepted
    assert strand == '+'
    # first column is the tag id; the length of the fourth column is passed
    # as the last Tag() argument
    tag = Tag(a[0], chr, pos, len(a[3]))
    # tag ids must be unique within the file
    assert tag.id not in tags
    tags[tag.id] = tag

    if tag.chr not in tags_by_chr:
        tags_by_chr[tag.chr] = []

    tags_by_chr[tag.chr].append(tag)

utils.close(f)
示例#19
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read caf records from file correctly'''

        f_in = utils.open_file_read('caf_unittest.caf')

        # Read the first record from the file
        c = caf.Caf()
        c.get_next_from_file(f_in)

        # Build the expected first record by hand
        e = caf.Caf()
        e.id = 'pknbac5b2Aa01.p1k'
        # The first two literals have no comma between them, so they are joined
        # by implicit string concatenation; the result of join() is the same.
        seq = ''.join(['NGGAGAGACTCGGACTAGTTCTACACCCTCACACCTTTGTCCTAAACCTTGAATCTAAGT'
                       'CCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCCTACGACC',
                       'CTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCAACACCCT',
                       'TACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCAACAACCTTCATTC',
                       'CAACACCCCAAACAACATCATTCCAACACCCCAAACAACATCATTCCAACACCCCAAACA',
                       'ACATCATTCCAACACCCCAAACAACATCATTCCAACACGGCAACAACATCATTCGAACAC',
                       'CCCTACAACATCATTCCAGCACCCCAACAACCTCCCTGCGAAACCCCGAATCCGAATTTT',
                       'GACACCCCTACAACCTTATTCTGACACCCCCAACAAACTTTCTCTAACACCCCAACAACG',
                       'TGACTACTAATACACCTAAAACCTTACTCCTAAACCCGGAATCCGACTTCTAATACCGCA',
                       'ACAACCTTCATTCCTAAACCCGGAATCTGAACCCTGAACCATTAAAACATAAAACGTGGA',
                       'AAATGAACCCCTGAACCATGAAAACCGTGAAAACCTATAACTTGGACCATGAACCTCTCA',
                       'ACCCCGAAATATGAGAACTTTGGAAACCCTAAATTTTGGGAAAACTCCTTTTTTTTTTTT',
                       'TTATTGTACATCCTGTGCGATGGTATACATTTTGGCGAATGCAAAAGAATTAGCATATAT',
                       'ATATGTGTAGGTCTTTGTGATGGTCAGGGGGGAGATCGACTAGGGTGTAGGTCTTTGTGA',
                       'TGGTCAAGGGAGATGGGCCAAAGGGAAGTCGGACAAGGTGAGATGGGCCAAGGAGATGGG',
                       'CCTAGGGTGGATGGGACAAGGGTGGATGGTCAGAGGTGGATGGTCAAGGGTGGATGGTCA',
                       'AGGATGAATGGGCAAGGGAGATGGGCAAAGTAGATGGGCAAGGGTGGATGGACAAGGTGG',
                       'ATGGCCAAAGTGGATGGCAAGGAGGATGGCCCAGGTAATAGGCAAGGAAATGGCCAGGTG',
                       'GATGGACCAGGTGGTGCCCTAATGGAGGCAGGGTGAAGTCCAGGAGGAGGCCCAGGAAAA',
                       'GGCCCAGAGAAACCCAAGGAAAGGCCCAGGGGGTGGGACAGGGGAAGCGCCAAGGGATGC',
                       'CAAGGTGGGGGCCAGAAAATAGCCCAGAAAAGGCCAAAATAAGCCAAGAAAAGCCCCAGA',
                       'AAACCCAAGAAA'])

        # Expected quality scores for the first record, passed to to_Fastq() below
        quals = [4, 4, 4, 4, 6, 6, 8, 6, 6, 6, 6, 10, 12, 11, 13, 13, 20, 19, 9, 10, 9, 9, 9, 19, 19, 34, 34, 39, 35, 35, 35, 37, 35, 34, 26, 26, 16, 17, 11, 21, 21, 32, 35, 37, 37, 32, 45, 23, 17, 17, 18, 27, 29, 32, 35, 32, 32, 32, 32, 39, 35, 35, 35, 35, 35, 37, 42, 31, 31, 14, 13, 13, 25, 25, 35, 40, 33, 29, 23, 23, 15, 25, 24, 35, 35, 35, 35, 23, 36, 18, 18, 23, 28, 33, 29, 29, 32, 32, 32, 32, 35, 35, 32, 35, 35, 32, 35, 35, 44, 44, 37, 35, 28, 26, 24, 19, 23, 30, 33, 40, 32, 32, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 37, 30, 30, 27, 27, 21, 21, 21, 29, 26, 29, 29, 23, 23, 28, 37, 37, 50, 50, 40, 35, 35, 32, 32, 32, 35, 44, 37, 35, 35, 35, 35, 35, 35, 32, 32, 35, 35, 35, 35, 44, 42, 42, 41, 41, 41, 41, 41, 42, 41, 41, 41, 41, 41, 41, 44, 44, 42, 42, 42, 42, 42, 35, 37, 35, 35, 33, 37, 37, 44, 44, 44, 41, 42, 50, 42, 42, 42, 44, 44, 50, 50, 44, 44, 44, 44, 44, 44, 50, 50, 44, 44, 44, 44, 44, 41, 42, 44, 42, 42, 42, 44, 44, 42, 42, 41, 41, 41, 42, 44, 50, 50, 50, 44, 44, 44, 44, 44, 37, 37, 37, 37, 39, 41, 41, 44, 44, 44, 44, 47, 47, 44, 44, 44, 43, 43, 42, 42, 37, 37, 37, 41, 41, 42, 44, 44, 44, 44, 44, 42, 42, 42, 41, 41, 41, 44, 44, 44, 46, 42, 41, 37, 37, 37, 37, 37, 41, 42, 35, 35, 35, 35, 35, 35, 35, 42, 41, 42, 44, 50, 42, 42, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 37, 37, 41, 44, 44, 47, 37, 37, 33, 33, 33, 27, 27, 37, 37, 47, 47, 47, 47, 47, 50, 44, 44, 42, 50, 35, 35, 35, 42, 42, 44, 50, 50, 50, 42, 42, 42, 42, 35, 35, 37, 42, 50, 44, 44, 44, 44, 44, 44, 47, 47, 47, 47, 44, 50, 44, 44, 44, 44, 47, 47, 44, 47, 50, 50, 50, 48, 37, 17, 17, 13, 22, 22, 35, 36, 42, 42, 35, 35, 35, 37, 37, 42, 50, 35, 35, 35, 35, 37, 37, 35, 35, 33, 33, 33, 33, 42, 42, 42, 41, 41, 41, 41, 41, 41, 50, 37, 44, 44, 44, 42, 37, 37, 21, 21, 21, 33, 33, 42, 50, 50, 44, 44, 44, 44, 44, 44, 44, 42, 37, 44, 44, 44, 44, 42, 42, 42, 42, 42, 44, 44, 44, 50, 50, 44, 44, 44, 37, 37, 35, 33, 33, 21, 21, 33, 33, 33, 41, 42, 42, 41, 44, 44, 44, 44, 42, 42, 42, 42, 44, 41, 
44, 37, 42, 37, 41, 41, 42, 42, 50, 50, 44, 44, 44, 44, 42, 42, 27, 33, 27, 33, 33, 37, 37, 50, 35, 35, 35, 37, 37, 44, 44, 50, 44, 44, 44, 37, 37, 35, 31, 31, 37, 37, 44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 37, 37, 28, 28, 23, 28, 26, 33, 33, 33, 29, 29, 29, 33, 35, 46, 33, 23, 23, 26, 33, 33, 50, 44, 37, 37, 30, 37, 37, 42, 50, 30, 30, 30, 37, 37, 23, 28, 15, 15, 11, 15, 27, 37, 33, 37, 26, 26, 28, 37, 42, 48, 48, 37, 23, 23, 23, 31, 31, 33, 23, 23, 24, 31, 31, 31, 33, 24, 25, 21, 21, 21, 28, 31, 33, 42, 42, 42, 42, 44, 44, 44, 44, 30, 23, 16, 10, 10, 16, 24, 33, 24, 24, 24, 30, 33, 36, 42, 42, 44, 44, 42, 39, 39, 33, 46, 27, 28, 28, 33, 33, 37, 37, 37, 22, 22, 17, 19, 19, 33, 31, 33, 27, 27, 18, 18, 24, 29, 32, 33, 35, 33, 40, 40, 37, 34, 27, 27, 14, 14, 13, 13, 18, 12, 20, 25, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 47, 56, 56, 56, 47, 42, 42, 42, 42, 27, 23, 15, 11, 11, 27, 33, 42, 42, 33, 24, 10, 10, 10, 13, 15, 18, 14, 14, 14, 14, 14, 25, 30, 33, 30, 21, 21, 27, 27, 22, 15, 13, 22, 22, 19, 19, 15, 15, 11, 10, 17, 27, 27, 21, 15, 18, 13, 13, 16, 22, 24, 37, 31, 40, 40, 37, 47, 40, 37, 27, 27, 24, 24, 17, 20, 13, 10, 10, 11, 11, 14, 12, 19, 10, 10, 12, 14, 11, 10, 10, 10, 10, 15, 11, 15, 15, 25, 12, 12, 8, 8, 10, 17, 10, 21, 21, 8, 8, 8, 10, 10, 19, 25, 21, 19, 10, 10, 8, 9, 10, 12, 14, 17, 24, 22, 16, 16, 10, 9, 8, 14, 12, 12, 9, 9, 9, 9, 9, 9, 13, 19, 15, 18, 22, 22, 15, 15, 15, 15, 9, 10, 9, 8, 8, 9, 10, 14, 10, 10, 19, 15, 12, 9, 15, 4, 4, 4, 8, 8, 10, 12, 9, 8, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 16, 10, 10, 10, 8, 7, 7, 7, 7, 7, 13, 20, 19, 15, 15, 10, 10, 8, 8, 10, 10, 10, 15, 10, 8, 8, 9, 8, 9, 10, 11, 10, 8, 8, 8, 8, 8, 4, 8, 4, 7, 7, 9, 13, 16, 11, 10, 12, 11, 13, 8, 8, 8, 8, 8, 9, 10, 9, 9, 9, 8, 8, 8, 12, 8, 9, 9, 11, 10, 10, 7, 7, 9, 7, 8, 9, 11, 10, 9, 10, 9, 10, 7, 7, 7, 9, 8, 8, 10, 8, 8, 4, 7, 4, 4, 4, 4, 4, 8, 7, 7, 8, 9, 9, 7, 7, 9, 9, 9, 9, 8, 7, 7, 7, 7, 10, 10, 7, 8, 8, 9, 10, 10, 10, 14, 13, 9, 8, 7, 7, 7, 6, 6, 7, 7, 
6, 6, 6, 6, 6, 8, 15, 10, 8, 8, 8, 8, 6, 7, 6, 6, 6, 6, 7, 7, 7, 8, 7, 4, 4, 4, 6, 6, 6, 6, 7, 13, 7, 7, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 9, 7, 7, 7, 6, 6, 6, 6, 6, 7, 9, 7, 7, 7, 8, 10, 8, 8, 8, 8, 9, 6, 6, 6, 6, 6, 6, 6, 7, 7, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 7, 6, 6, 7, 9, 7, 7, 11, 6, 6, 7, 6, 6, 8, 7, 7, 8, 8, 10, 8, 8, 8, 6, 6, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 7, 7, 7, 12, 9, 14, 10, 10, 10, 10, 8, 9, 8, 8, 8, 7, 7, 7, 7, 7, 13, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 6, 6, 6, 6, 6, 8, 8, 8, 9, 7, 7, 7, 8, 9, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 6, 9, 6, 6, 6, 7, 7, 7, 7, 7, 8, 10, 10, 14, 9, 12, 7, 7, 7, 4, 4]
        # The expected sequence is built as a Fasta and then converted to a
        # Fastq carrying the quality scores above.
        e.seq = fastn.Fasta(e.id, seq)
        e.seq = e.seq.to_Fastq(quals)
        e.insert_min = 2000
        e.insert_max = 4000
        e.ligation = '96781'
        e.clone = 'pknbac5b2'
        e.clip_start = 23
        e.clip_end = 789

        self.assertEqual(c, e)

        # Read the second record into the same Caf object and compare it with
        # a freshly built expected record.
        c.get_next_from_file(f_in)

        e = caf.Caf()
        e.id = 'pknbac5b2Aa02.p1k'

        seq = ''.join(['AAAGACATACGACCTTTTTTTTTTTCGATAACAAAGGGTATCCTTTCACCAGAAAAAAAA',
                       'AAAGAACATTCTTCTTTTTTCTTGAAGAACATACATTCTTTTTTTTATTTTATTTTTTTT',
                       'TTTCGACCCCTCAGTGTTGTGGTAGCATGATGTGTTGGACTTGAATGGTATATGTATTGA',
                       'TTGTTTCGTTCGTTATGTAATTTCCGGTTTTTCCCCGTGGCATCCGGATAGTGTATAGTA',
                       'TCCGGTCCCTGTGTTCAAAAAGTTTTTCCTTTTCCCCTTAAAGCAACTGAAGTTAAACCC',
                       'TGAACCTTACTACTGAACCCGGAATTTGACTTCTAAAACCCTGAAGAATGATTCCTATAA',
                       'CCCTAAAAAATCCAACCTAAAACATCCAAACTGAACCATAGAACCTTCCTCCTAAACCCG',
                       'GAATCTATGTTCTAACACCCTGACATCTTTGTCCTAAACCCTGAATCTAAGTTCTAACAT',
                       'CCTGACAACTCTCCCTCCTAAACCCGGAATCTAAATTCGTACACCCTGACACCTCCCCCC',
                       'TAAACCCGGAATCCGCATTCTAACACCCTGACAATTTCCTCCTGAAAAGCGGAATCTGAC',
                       'TTCTAACACCCTGACACCTTTGTCCTGAACCCGGAATCTAAGTTCTTACACCCGGACACC',
                       'TCCCTCCTAAATCCGGAATCTAAGTTCTAACACCCTCACACCTTTGTCCTAAACCTTGAA',
                       'TCTAAGTCCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCC',
                       'TACGACCCTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCA',
                       'ACACCCTTACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCTACAACT',
                       'TCATTCCTACACCCCAAACAACATCATCCCTACACCCCAAACAACATCATTCCTACACCC',
                       'CAAACACATCATCCAACACCCCATAACACATCATTCCAACACGGCAACAACATCATTCGA',
                       'AACACCCCTACAAATCATTGCAGCACCCCCACTACCTCCCTGCGTATACCCGTATTCGAA',
                       'ATTTTGACACCCCTACTACCTTTATCTGACACCCCCAAAAAACTCCTCTTAAACCCAACA',
                       'AGGGGACTATAATACCCCTAAAACTTTATCTTAACCGGAATCCGAATTCTATACCGAAAA',
                       'AACTTCTTTCCTAACCGGGATCTGTACCCCGAACTTTTAAAATTAAAGGGGAAATGAACC',
                       'CCTGACCAGATAACGGGAAACCTTTATTGTGACAGGAACTCCTACCGCAATATGAAAATT',
                       'GGACCCCAAATTTGGGAAACCCCTTTT'])


        # Expected quality scores for the second record
        quals = [9, 9, 6, 4, 4, 4, 4, 7, 6, 6, 8, 6, 6, 6, 7, 7, 14, 8, 8, 8, 10, 17, 21, 12, 9, 10, 10, 9, 11, 8, 9, 11, 11, 21, 12, 15, 15, 21, 24, 33, 32, 35, 29, 29, 22, 22, 15, 29, 25, 26, 18, 18, 18, 31, 31, 47, 56, 56, 56, 42, 36, 44, 28, 28, 28, 39, 33, 35, 30, 36, 33, 35, 35, 36, 35, 37, 42, 35, 35, 31, 29, 26, 26, 20, 33, 15, 22, 22, 29, 29, 32, 35, 35, 36, 35, 35, 42, 42, 37, 37, 42, 47, 47, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 44, 47, 47, 47, 47, 47, 35, 30, 30, 23, 24, 30, 45, 37, 37, 37, 35, 23, 23, 11, 11, 13, 23, 31, 21, 21, 19, 20, 23, 29, 23, 20, 16, 16, 30, 29, 29, 28, 28, 28, 24, 24, 17, 29, 29, 33, 33, 35, 31, 37, 18, 15, 12, 16, 16, 23, 27, 24, 32, 29, 32, 32, 24, 26, 29, 37, 29, 30, 35, 35, 33, 35, 35, 31, 33, 31, 31, 35, 31, 31, 31, 31, 27, 33, 33, 42, 35, 37, 37, 21, 21, 21, 21, 37, 37, 50, 50, 50, 50, 50, 33, 33, 18, 16, 15, 25, 19, 20, 33, 33, 33, 35, 35, 33, 33, 33, 18, 18, 18, 33, 24, 33, 33, 33, 27, 33, 33, 33, 33, 33, 22, 33, 33, 33, 24, 24, 21, 24, 24, 31, 31, 11, 11, 11, 31, 33, 44, 44, 37, 42, 42, 47, 44, 44, 44, 44, 44, 44, 44, 47, 50, 50, 42, 42, 42, 41, 42, 42, 47, 47, 37, 37, 27, 33, 33, 33, 33, 35, 35, 42, 41, 37, 37, 44, 50, 50, 33, 33, 27, 33, 37, 42, 42, 42, 41, 41, 33, 33, 27, 27, 33, 33, 37, 50, 35, 35, 35, 35, 35, 35, 35, 42, 35, 37, 35, 37, 35, 41, 37, 42, 42, 42, 42, 50, 50, 50, 42, 35, 33, 33, 21, 21, 16, 23, 19, 27, 27, 33, 35, 41, 50, 37, 35, 35, 42, 50, 50, 50, 44, 44, 44, 50, 42, 42, 37, 37, 35, 35, 35, 44, 44, 50, 50, 41, 37, 37, 37, 37, 35, 35, 35, 37, 37, 37, 44, 37, 37, 33, 33, 22, 33, 37, 35, 33, 33, 21, 21, 21, 33, 33, 41, 41, 44, 44, 44, 44, 44, 50, 50, 44, 44, 37, 50, 33, 33, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 50, 50, 44, 47, 44, 44, 48, 33, 21, 24, 21, 33, 33, 35, 37, 50, 50, 37, 35, 35, 50, 50, 56, 50, 50, 50, 50, 48, 33, 27, 33, 27, 33, 33, 44, 50, 50, 42, 37, 35, 42, 42, 50, 50, 50, 44, 44, 44, 33, 33, 18, 19, 18, 33, 33, 42, 35, 35, 44, 44, 44, 50, 44, 44, 44, 50, 50, 44, 44, 
37, 37, 33, 33, 35, 35, 35, 35, 35, 37, 50, 37, 27, 27, 24, 37, 33, 35, 35, 37, 35, 37, 37, 46, 33, 24, 24, 21, 33, 33, 39, 42, 42, 44, 50, 50, 56, 50, 50, 37, 35, 35, 33, 37, 33, 33, 35, 35, 35, 35, 33, 33, 33, 33, 27, 27, 27, 37, 37, 44, 37, 41, 41, 41, 50, 46, 33, 24, 24, 16, 31, 19, 27, 31, 37, 37, 44, 44, 44, 37, 50, 23, 23, 22, 29, 31, 33, 23, 23, 23, 23, 23, 28, 25, 33, 26, 26, 22, 28, 37, 42, 44, 42, 42, 44, 44, 44, 44, 46, 33, 16, 19, 14, 27, 31, 42, 50, 50, 50, 44, 44, 44, 50, 50, 26, 26, 21, 28, 31, 29, 29, 26, 26, 26, 30, 30, 39, 27, 37, 26, 30, 30, 42, 42, 42, 36, 33, 29, 33, 33, 33, 20, 21, 23, 17, 23, 31, 36, 42, 43, 56, 56, 47, 47, 42, 42, 33, 33, 29, 29, 23, 31, 25, 26, 26, 26, 30, 30, 36, 27, 33, 28, 31, 33, 35, 44, 33, 33, 28, 33, 35, 44, 48, 48, 48, 42, 47, 42, 42, 42, 48, 44, 44, 37, 34, 34, 44, 48, 42, 37, 34, 42, 48, 33, 33, 34, 30, 30, 33, 33, 40, 30, 37, 28, 28, 26, 27, 27, 25, 19, 16, 25, 29, 40, 31, 27, 15, 18, 13, 25, 27, 40, 40, 33, 40, 33, 33, 33, 40, 37, 23, 12, 12, 17, 11, 10, 15, 15, 13, 13, 13, 18, 27, 23, 28, 28, 28, 28, 37, 28, 32, 26, 23, 26, 26, 19, 29, 25, 24, 25, 24, 15, 15, 15, 12, 17, 24, 24, 21, 21, 21, 25, 22, 29, 25, 22, 21, 24, 25, 17, 17, 14, 14, 12, 14, 19, 24, 18, 18, 14, 21, 11, 15, 10, 15, 18, 22, 27, 25, 25, 29, 29, 29, 25, 26, 25, 21, 22, 25, 22, 22, 18, 15, 15, 15, 25, 19, 25, 25, 16, 24, 24, 20, 20, 22, 20, 15, 10, 10, 10, 12, 13, 20, 20, 12, 14, 14, 12, 12, 12, 15, 15, 15, 18, 18, 11, 10, 11, 11, 10, 10, 14, 15, 18, 18, 19, 17, 12, 11, 10, 10, 20, 15, 19, 24, 24, 24, 23, 15, 13, 7, 6, 6, 6, 6, 6, 12, 13, 12, 9, 8, 10, 10, 9, 6, 6, 6, 6, 10, 10, 13, 15, 15, 15, 15, 17, 9, 9, 9, 9, 9, 11, 11, 9, 7, 7, 7, 6, 4, 4, 6, 9, 9, 8, 8, 8, 10, 9, 8, 7, 7, 7, 7, 7, 9, 13, 10, 10, 10, 15, 12, 9, 9, 9, 15, 19, 15, 15, 11, 7, 7, 7, 7, 7, 7, 8, 8, 19, 10, 10, 10, 12, 12, 19, 11, 15, 18, 11, 14, 9, 9, 6, 6, 6, 6, 6, 6, 6, 8, 11, 20, 13, 17, 14, 14, 9, 9, 10, 17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 11, 10, 12, 11, 10, 12, 
9, 12, 8, 8, 8, 9, 12, 12, 8, 11, 7, 8, 8, 8, 8, 11, 9, 8, 6, 4, 4, 4, 6, 6, 7, 10, 10, 12, 9, 7, 7, 6, 6, 6, 6, 8, 6, 9, 10, 13, 8, 11, 8, 7, 7, 8, 7, 7, 7, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 14, 10, 8, 12, 8, 8, 6, 7, 9, 8, 7, 6, 8, 7, 4, 4, 7, 7, 6, 7, 6, 6, 6, 6, 8, 11, 8, 8, 8, 8, 12, 10, 12, 11, 11, 11, 10, 12, 10, 7, 7, 9, 4, 4, 8, 6, 6, 6, 6, 6, 6, 7, 10, 7, 7, 7, 7, 7, 9, 9, 9, 7, 7, 7, 6, 6, 6, 7, 7, 7, 10, 11, 9, 7, 6, 6, 8, 6, 6, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 9, 12, 10, 15, 15, 16, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 8, 7, 7, 8, 7, 8, 7, 7, 9, 8, 7, 7, 8, 8, 9, 7, 6, 7, 6, 9, 6, 7, 11, 7, 7, 11, 8, 8, 7, 10, 8, 9, 8, 6, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 8, 7, 7, 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 10, 12, 19, 13, 13, 10, 9, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 6, 6, 6, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 6, 6, 6, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 11, 12, 9]

        e.seq = fastn.Fasta(e.id, seq)
        e.seq = e.seq.to_Fastq(quals)
        e.insert_min = 2000
        e.insert_max = 4000
        e.ligation = '96781'
        e.clone = 'pknbac5b2'
        e.clip_start = 33
        e.clip_end = 848
        self.assertEqual(c, e)

        utils.close(f_in)
示例#20
0
# Script fragment: pulls contig and scaffold statistics out of the stdout
# file of a GAGE run. Only the start of the script is visible in this
# snippet.
parser = argparse.ArgumentParser(
    description='Gets GAGE stats from bsub stdout file',
    usage='%(prog)s <gage.o>')
parser.add_argument('infile', help='Name of input gage.o bsub stout file')
options = parser.parse_args()

# -1 means "not found in the file"
contigs = -1
scaffs = -1
contig_N50 = -1
scaff_N50 = -1
contig_corr_N50 = -1
scaff_corr_N50 = -1
contig_errs = -1
scaff_errs = -1

# Read the whole file into memory: the parsing below looks ahead by fixed
# line offsets from each section title.
f = utils.open_file_read(options.infile)
lines = f.readlines()
utils.close(f)

i = 0
# NOTE(review): the snippet ends inside this loop; the increment of i and the
# parsing of the remaining statistics are not visible here.
while i < len(lines):
    line = lines[i].rstrip()

    if line == 'Contig Stats':
        # the count is the last field of the line after the section title
        contigs = int(lines[i + 1].split()[-1])
        # the N50 line is looked for eight lines below the section title
        if lines[i + 8].startswith('N50'):
            contig_N50 = int(lines[i + 8].split()[1])
    elif line == 'Scaffold Stats':
        scaffs = int(lines[i + 1].split()[-1])
        if lines[i + 8].startswith('N50'):
            scaff_N50 = int(lines[i + 8].split()[1])
# Script fragment: the code that defines worksheet_all, eval_keys_columns,
# options and datasets is not visible in this snippet.
# Write the column headers into row 0 of the worksheet and remember the
# column index of every "<key> score" header whose key is an evaluation key.
for i in range(len(ScaffResults.headers)):
    h = ScaffResults.headers[i]
    worksheet_all.cell(row=0, column=i).value = h
    # h[:-6] is the header with the 6-character suffix ' score' removed;
    # the dict is keyed by the full header, suffix included
    if h.endswith(' score') and h[:-6] in ScaffResults.evaluation_score_keys:
        eval_keys_columns[h] = i


current_row = 1



# get the extra cpu and mem usage
extra_cpu = {}  # (dataset, scaffolder) -> cpu
extra_mem = {}  # (dataset, scaffolder) -> mem

f = utils.open_file_read(options.extra_cpu_file)

for line in f:
    if line.startswith('#'):
        continue
    # each data line must have exactly four tab-separated fields; int()
    # tolerates the trailing newline left on the last field
    (dataset, scaffolder, cpu, mem) = line.split('\t')
    mem = int(mem)
    cpu = int(cpu)
    extra_cpu[(dataset, scaffolder)] = cpu
    extra_mem[(dataset, scaffolder)] = mem

utils.close(f)

# gather all the counts for each scaffolding run
results = {k:{} for k in datasets}
示例#22
0
def lengths_from_fai(fai_file, d):
    '''Add the length of every sequence listed in fai_file to the dict d.

    For each line, the first whitespace-separated column is used as the key
    and the second column, converted to int, as the value. d is updated in
    place and nothing is returned. A line with fewer than two columns, or
    with a non-integer second column, raises ValueError.

    The file is closed in every case, including when a line is malformed.
    '''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            (seq_name, length) = line.rstrip().split()[:2]
            d[seq_name] = int(length)
    finally:
        utils.close(f)