Exemplo n.º 1
0
def main():
    """Parse the input GenBank files, tolerating a limited number of errors.

    Options are read with ``parse_command_line_parameters(**script_info)``:
    ``input_gbs`` (comma separated GenBank file paths), ``output_dir``,
    ``verbose``, ``tag``, ``existing`` and ``max_failures``.

    The output directory is created, a workflow log is started, and three
    output files named after ``tag`` are opened for writing. Each GenBank
    file is then read record by record with ``MinimalGenbankParser``; parser
    errors are counted per file and reading of a file stops once
    ``max_failures`` errors have been seen.

    NOTE(review): this listing appears to end inside the record loop --
    nothing visible consumes ``next_record``, ``alpha``, ``seen``,
    ``observed_records`` or the three output handles. Confirm against the
    complete script.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_gbs = opts.input_gbs.split(',')
    output_dir = opts.output_dir
    verbose = opts.verbose
    tag = opts.tag
    existing_fp = opts.existing
    max_failures = opts.max_failures

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    observed_records = parse_column(open(existing_fp))

    # NOTE(review): these names end in .gz but the files are opened with the
    # plain open() below -- confirm whether compression happens elsewhere.
    sequences_fp = os.path.join(output_dir, '%s_sequences.fasta.gz' % tag)
    gg_records_fp = os.path.join(output_dir, '%s_ggrecords.txt.gz' % tag)
    obs_records_fp = os.path.join(output_dir, '%s_obsrecords.txt.gz' % tag)

    sequences = open(sequences_fp, 'w')
    gg_records = open(gg_records_fp, 'w')
    obs_records = open(obs_records_fp, 'w')

    seen = set([])

    # The generic error handler below reports the "previous accession".
    # Bind the name up front so an error raised before any accession is
    # known is logged instead of failing with a NameError.
    accession = None

    for gb_fp in input_gbs:
        logline = log_f("Start parsing of %s..." % gb_fp)
        logger.write(logline)

        if verbose:
            stdout.write(logline)

        records = MinimalGenbankParser(open(gb_fp))

        # the failure budget is per input file
        failure_count = 0
        alpha = set(['A', 'T', 'G', 'C',
                     'a', 't', 'g', 'c',
                     'N', 'n',
                     'R', 'Y', 'S', 'M',
                     'r', 'y', 's', 'm',
                     'K', 'k', 'W', 'w',
                     'V', 'v', 'H', 'h', 'B', 'b', 'D', 'd'])

        while failure_count < max_failures:
            # gracefully handle parser errors to a limit
            try:
                next_record = records.next()
            except PartialRecordError:
                failure_count += 1
                continue
            except StopIteration:
                break
            except Exception as e:
                logline = log_f("Caught: %s, previous accession: %s" %
                                (e, accession))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
def main():
    """Assign new gg_ids to Greengenes records and write the result files.

    Options are read with ``parse_command_line_parameters(**script_info)``:
    ``gg_records``, ``output_dir``, ``verbose``, ``existing``, ``tag``,
    ``starting_gg_id``, ``invariants`` and ``unaligned``.

    Output files, all inside ``output_dir``:

    * ``<tag>.records.txt``         records with a gg_id that pass
                                    ``sanityCheck()``
    * ``<tag>.mapping.txt.gz``      ``gg_id<TAB>ncbi_acc_w_ver`` for those
                                    same records
    * ``<tag>.records.broken.txt``  records with a gg_id that fail
                                    ``sanityCheck()``
    * ``<tag>.records.noggid.txt``  records that never received a gg_id

    NOTE(review): this function is mid-refactor and cannot run yet. Unless
    they are module-level globals, ``aligned``, ``unaligned``, ``f`` and
    ``domain`` are used in the first loop before anything here binds them,
    and ``records`` is used throughout although the statement that built it
    is commented out. Those need a decision from the author; only the
    unambiguous defects were fixed here.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id

    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    def log(message):
        # every message goes to the workflow log; echo it only if verbose
        logline = log_f(message)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    # NOTE(review): the mapping file is named .gz but is written through a
    # plain open() below -- confirm whether it is compressed elsewhere.
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir,
                                       "%s.records.noggid.txt" % tag)
    # This path was opened below without ever being defined; the file name
    # follows the pattern of the "noggid" file.
    output_gg_broken_fp = os.path.join(output_dir,
                                       "%s.records.broken.txt" % tag)

    existing_records = parse_column(open(existing_fp))

    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #                for r in MinimalGreengenesParser(open(gg_records_fp))])

    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        # keep only the truthy lookups; built as lists so the emptiness
        # checks below do not depend on what filter() returns
        aln = [s for s in (get_indexed_sequence(i, acc) for i in aligned)
               if s]
        unaln = [s for s in (get_indexed_sequence(i, acc) for i in unaligned)
                 if s]

        if not aln:
            log("GG record %s does not have aligned seq!" % acc)
            continue

        if not unaln:
            log("GG record %s does not have unaligned seq!" % acc)
            continue

        # if > 1 rec, complain

        # NOTE(review): this inner loop rebinds ``record`` (the outer loop
        # variable) and reads ``f``, ``records`` and ``domain``; see the
        # docstring.
        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                log("Aligned seq %s does not have a GG record" % id_)
                continue

            if id_ in existing_records:
                log("%s has previously been observed!" % id_)
                continue

            if record['gg_id'] is not None:
                log("%s already has gg_id %d!" % (id_, record['gg_id']))
                continue

            record['gg_id'] = gg_id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(aln_seq, invariants)
            non_ACGT = calc_nonACGT(aln_seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = aln_seq
            # gap characters do not count as aligned positions
            record['n_pos_aligned'] = len(aln_seq) - aln_seq.count('-')

    for f in opts.unaligned.split(','):
        log("Parsing %s..." % f)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                log("Unaligned seq %s does not have a GG record" % id_)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                log("%s should have a gg_id by now!" % (id_))
                continue

            record['unaligned_seq'] = unaln_seq
            record['n_pos_unaligned'] = len(unaln_seq)

    log("Beginning output...")

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    try:
        # iterate the record objects themselves, not (accession, record)
        # pairs
        for record in records.values():
            if record['gg_id'] is None:
                write_gg_record(output_gg_noggid, record)
                continue

            try:
                record.sanityCheck()
            except Exception:
                # a failed check routes the record to the "broken" file;
                # KeyboardInterrupt / SystemExit still propagate
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" %
                                 (record['gg_id'], record['ncbi_acc_w_ver']))
    finally:
        # flush and release every output file, not only the main one
        for handle in (output_map, output_gg, output_gg_noggid,
                       output_gg_broken):
            handle.close()
def main():
    """Assign new gg_ids to Greengenes records and write the result files.

    Options are read with ``parse_command_line_parameters(**script_info)``:
    ``gg_records``, ``output_dir``, ``verbose``, ``existing``, ``tag``,
    ``starting_gg_id``, ``invariants`` and ``unaligned``.

    Output files, all inside ``output_dir``:

    * ``<tag>.records.txt``         records with a gg_id that pass
                                    ``sanityCheck()``
    * ``<tag>.mapping.txt.gz``      ``gg_id<TAB>ncbi_acc_w_ver`` for those
                                    same records
    * ``<tag>.records.broken.txt``  records with a gg_id that fail
                                    ``sanityCheck()``
    * ``<tag>.records.noggid.txt``  records that never received a gg_id

    NOTE(review): this function is mid-refactor and cannot run yet. Unless
    they are module-level globals, ``aligned``, ``unaligned``, ``f`` and
    ``domain`` are used in the first loop before anything here binds them,
    and ``records`` is used throughout although the statement that built it
    is commented out. Those need a decision from the author; only the
    unambiguous defects were fixed here.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id

    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    def log(message):
        # every message goes to the workflow log; echo it only if verbose
        logline = log_f(message)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    # NOTE(review): the mapping file is named .gz but is written through a
    # plain open() below -- confirm whether it is compressed elsewhere.
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir,
                                       "%s.records.noggid.txt" % tag)
    # This path was opened below without ever being defined; the file name
    # follows the pattern of the "noggid" file.
    output_gg_broken_fp = os.path.join(output_dir,
                                       "%s.records.broken.txt" % tag)

    existing_records = parse_column(open(existing_fp))

    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #                for r in MinimalGreengenesParser(open(gg_records_fp))])

    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        # keep only the truthy lookups; built as lists so the emptiness
        # checks below do not depend on what filter() returns
        aln = [s for s in (get_indexed_sequence(i, acc) for i in aligned)
               if s]
        unaln = [s for s in (get_indexed_sequence(i, acc) for i in unaligned)
                 if s]

        if not aln:
            log("GG record %s does not have aligned seq!" % acc)
            continue

        if not unaln:
            log("GG record %s does not have unaligned seq!" % acc)
            continue

        # if > 1 rec, complain

        # NOTE(review): this inner loop rebinds ``record`` (the outer loop
        # variable) and reads ``f``, ``records`` and ``domain``; see the
        # docstring.
        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                log("Aligned seq %s does not have a GG record" % id_)
                continue

            if id_ in existing_records:
                log("%s has previously been observed!" % id_)
                continue

            if record['gg_id'] is not None:
                log("%s already has gg_id %d!" % (id_, record['gg_id']))
                continue

            record['gg_id'] = gg_id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(aln_seq, invariants)
            non_ACGT = calc_nonACGT(aln_seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = aln_seq
            # gap characters do not count as aligned positions
            record['n_pos_aligned'] = len(aln_seq) - aln_seq.count('-')

    for f in opts.unaligned.split(','):
        log("Parsing %s..." % f)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                log("Unaligned seq %s does not have a GG record" % id_)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                log("%s should have a gg_id by now!" % (id_))
                continue

            record['unaligned_seq'] = unaln_seq
            record['n_pos_unaligned'] = len(unaln_seq)

    log("Beginning output...")

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    try:
        # iterate the record objects themselves, not (accession, record)
        # pairs
        for record in records.values():
            if record['gg_id'] is None:
                write_gg_record(output_gg_noggid, record)
                continue

            try:
                record.sanityCheck()
            except Exception:
                # a failed check routes the record to the "broken" file;
                # KeyboardInterrupt / SystemExit still propagate
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" % (record['gg_id'],
                                               record['ncbi_acc_w_ver']))
    finally:
        # flush and release every output file, not only the main one
        for handle in (output_map, output_gg, output_gg_noggid,
                       output_gg_broken):
            handle.close()