示例#1
0
 def test_write_gg_record(self):
     """Writes a gg record"""
     # Fields explicitly populated on the record; all others stay empty.
     filled = {'prokmsa_id': 123, 'ncbi_acc_w_ver': 'xyz', 'ncbi_gi': '333',
               'pubmed': 123, 'clone': 'foo', 'non_acgt_percent': '0.5'}
     blank = ['gg_id', 'hugenholtz_tax_string', 'n_pos_aligned',
              'n_pos_unaligned', 'db_name', 'gold_id', 'decision',
              'prokmsaname', 'isolation_source', 'organism', 'strain',
              'specific_host', 'authors', 'title', 'journal', 'study_id',
              'submit_date', 'country', 'ncbi_tax_string',
              'silva_tax_string', 'rdp_tax_string', 'greengenes_tax_string',
              'perc_ident_to_invariant_core', 'small_gap_intrusions',
              'bellerophon', 'bel3_div_ratio', 'chim_slyr_a', 'chim_slyr_b',
              'chim_slyr_a_tax', 'chim_slyr_b_tax', 'aligned_seq',
              'unaligned_seq']
     # Expected output: BEGIN/END delimiters, a trailing blank line, one
     # "field=value" line per populated field and "field=" per empty one.
     expected = ['BEGIN', 'END', '']
     expected.extend('%s=%s' % (k, v) for k, v in filled.items())
     expected.extend('%s=' % name for name in blank)
     expected = sorted(expected)

     record = GreengenesRecord(filled)
     out = StringIO()
     write_gg_record(out, record)
     out.seek(0)
     observed = sorted(out.read().splitlines())
     self.assertEqual(observed, expected)
示例#2
0
 def test_write_gg_record(self):
     """Writes a gg record"""
     # The values we set on the record.
     set_fields = {'prokmsa_id': 123,
                   'ncbi_acc_w_ver': 'xyz',
                   'ncbi_gi': '333',
                   'pubmed': 123,
                   'clone': 'foo',
                   'non_acgt_percent': '0.5'}
     # Every other field is expected to serialize as "name=" (empty).
     empty_fields = ('gg_id', 'hugenholtz_tax_string', 'n_pos_aligned',
                     'n_pos_unaligned', 'db_name', 'gold_id', 'decision',
                     'prokmsaname', 'isolation_source', 'organism',
                     'strain', 'specific_host', 'authors', 'title',
                     'journal', 'study_id', 'submit_date', 'country',
                     'ncbi_tax_string', 'silva_tax_string',
                     'rdp_tax_string', 'greengenes_tax_string',
                     'perc_ident_to_invariant_core',
                     'small_gap_intrusions', 'bellerophon',
                     'bel3_div_ratio', 'chim_slyr_a', 'chim_slyr_b',
                     'chim_slyr_a_tax', 'chim_slyr_b_tax', 'aligned_seq',
                     'unaligned_seq')
     exp = ['BEGIN', 'END', '']
     for field in empty_fields:
         exp.append(field + '=')
     for field, value in set_fields.items():
         exp.append('%s=%s' % (field, value))
     exp.sort()

     ggrec = GreengenesRecord(set_fields)
     f = StringIO()
     write_gg_record(f, ggrec)
     f.seek(0)
     obs = sorted(f.read().splitlines())
     self.assertEqual(obs, exp)
def main():
    """Assign gg_ids to GG records and write record/mapping output files.

    Reads GG records, attaches aligned/unaligned sequences and derived
    statistics, assigns sequential gg_ids starting from
    ``opts.starting_gg_id``, and writes out the full records, a
    gg_id -> accession mapping, and the records that either never got a
    gg_id or failed the sanity check.

    NOTE(review): several names used below are not defined anywhere in
    this file as shown (``aligned``, ``unaligned``, ``f`` in the first
    loop, ``records``, ``seq``, ``domain`` in the first loop, and
    ``output_gg_broken_fp``) -- this function looks unfinished and will
    raise NameError as-is; confirm against the full project.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id  # next id to hand out; incremented per record

    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    # NOTE(review): this filename ends in .gz but the file is opened with a
    # plain open() below, so its contents are NOT actually gzip-compressed.
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir,
                                       "%s.records.noggid.txt" % tag)

    # accessions already seen in previous runs/releases
    existing_records = parse_column(open(existing_fp))

    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        # Pull this accession from each index; drop misses (falsy results).
        aln = [s for s in (get_indexed_sequence(i, acc) for i in aligned)
               if s]
        noaln = [s for s in (get_indexed_sequence(i, acc) for i in unaligned)
                 if s]

        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # Bug fix: this previously tested the undefined name ``unaln``
        # (NameError) and reused the "aligned" message for the unaligned case.
        if not noaln:
            logline = log_f("GG record %s does not have unaligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # if > 1 rec, complain

        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Aligned seq %s does not have a GG record" %
                                id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if id_ in existing_records:
                logline = log_f("%s has previously been observed!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if record['gg_id'] is not None:
                logline = log_f("%s already has gg_id %d!" %
                                (id_, record['gg_id']))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['gg_id'] = gg_id
            # eukarya do not receive a prokMSA id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(seq, invariants)
            non_ACGT = calc_nonACGT(seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = seq
            # aligned length excluding gap characters
            record['n_pos_aligned'] = len(seq) - seq.count('-')

    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %
                                id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    # Bug fix: iterating records.items() yields (key, value) tuples which
    # were then subscripted with string keys; iterate the record values.
    for record in records.values():
        if record['gg_id'] is None:
            # never picked up a gg_id -> dump separately for inspection
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                record.sanityCheck()
            except Exception:  # was a bare except: also trapped SystemExit
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" %
                                 (record['gg_id'], record['ncbi_acc_w_ver']))

    # Bug fix: only output_gg was closed; close all output handles so
    # buffered data is flushed.
    output_map.close()
    output_gg.close()
    output_gg_noggid.close()
    output_gg_broken.close()
def main():
    """Assign gg_ids to GG records and write record/mapping output files.

    Reads GG records, attaches aligned/unaligned sequences and derived
    statistics, assigns sequential gg_ids starting from
    ``opts.starting_gg_id``, and writes out the full records, a
    gg_id -> accession mapping, and the records that either never got a
    gg_id or failed the sanity check.

    NOTE(review): several names used below are not defined anywhere in
    this file as shown (``aligned``, ``unaligned``, ``f`` in the first
    loop, ``records``, ``seq``, ``domain`` in the first loop, and
    ``output_gg_broken_fp``) -- this function looks unfinished and will
    raise NameError as-is; confirm against the full project.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id  # next id to hand out; incremented per record

    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    # NOTE(review): this filename ends in .gz but the file is opened with a
    # plain open() below, so its contents are NOT actually gzip-compressed.
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir,
                                       "%s.records.noggid.txt" % tag)

    # accessions already seen in previous runs/releases
    existing_records = parse_column(open(existing_fp))

    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        # Pull this accession from each index; drop misses (falsy results).
        aln = [s for s in (get_indexed_sequence(i, acc) for i in aligned)
               if s]
        noaln = [s for s in (get_indexed_sequence(i, acc) for i in unaligned)
                 if s]

        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # Bug fix: this previously tested the undefined name ``unaln``
        # (NameError) and reused the "aligned" message for the unaligned case.
        if not noaln:
            logline = log_f("GG record %s does not have unaligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # if > 1 rec, complain

        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Aligned seq %s does not have a GG record" %
                                id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if id_ in existing_records:
                logline = log_f("%s has previously been observed!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if record['gg_id'] is not None:
                logline = log_f("%s already has gg_id %d!" %
                                (id_, record['gg_id']))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['gg_id'] = gg_id
            # eukarya do not receive a prokMSA id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(seq, invariants)
            non_ACGT = calc_nonACGT(seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = seq
            # aligned length excluding gap characters
            record['n_pos_aligned'] = len(seq) - seq.count('-')

    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %
                                id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    # Bug fix: iterating records.items() yields (key, value) tuples which
    # were then subscripted with string keys; iterate the record values.
    for record in records.values():
        if record['gg_id'] is None:
            # never picked up a gg_id -> dump separately for inspection
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                record.sanityCheck()
            except Exception:  # was a bare except: also trapped SystemExit
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" %
                                 (record['gg_id'], record['ncbi_acc_w_ver']))

    # Bug fix: only output_gg was closed; close all output handles so
    # buffered data is flushed.
    output_map.close()
    output_gg.close()
    output_gg_noggid.close()
    output_gg_broken.close()
示例#5
0
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
                continue

            # gg_record contains gb summary data  
            try:
                gg_record = get_genbank_summary(next_record)
            except KeyError, e:
                failure_count += 1
                continue

            seen.add(accession)
            write_sequence(sequences, accession, sequence)
            write_gg_record(gg_records, gg_record)
            write_obs_record(obs_records, accession)
            
        if failure_count >= max_failures:
            logline = log_f("MAX FAILURES OF %d REACHED IN %s" % (max_failures, \
                                                                  gb_fp))
            logger.write(logline)
            stderr.write(logline)
        else:
            logline = log_f("Parsed %s, %d failures observed." % (gb_fp, \
                                                                  failure_count))
            logger.write(logline)

            if verbose:
                stdout.write(logline)