def main():
    """Screen every alignment for N/X bases and log the outcome.

    One work item is built per alignment file and handed to
    ``screen_files`` -- through a process pool when ``args.cores > 1``,
    serially otherwise.  Each ``None`` result is counted as a copied
    ("good") alignment; any other result is reported as a removed locus
    using its first two elements (locus, offending base type).

    Raises:
        AssertionError: if ``args.cores`` exceeds the number of CPUs.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # compile our regexes once
    n_bases = re.compile("N|n+")
    x_bases = re.compile("X|x+")
    work = [
        [
            aln_file,
            n_bases,
            x_bases,
            args.input_format,
            args.output,
            args.do_not_screen_n,
            args.do_not_screen_x,
        ]
        for aln_file in files
    ]
    # The message has no placeholder, so no formatting call is needed.
    log.info("Screening alignments for problematic bases")
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(screen_files, work)
        # every task has finished; shut the workers down and reap them
        pool.close()
        pool.join()
    else:
        # build a real list so behaviour does not depend on map() semantics
        results = [screen_files(item) for item in work]
    count = 0
    for result in results:
        if result is None:
            count += 1
        else:
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning("Removed locus {} due to presence of {} bases".format(
                result[0],
                result[1]
            ))
    log.info("Copied {} good alignments".format(count))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Write one taxon's ungapped sequence from each alignment to a FASTA file.

    For every alignment file, each record whose ``id`` equals ``args.taxon``
    has ``-`` and ``?`` characters stripped from its sequence.  A non-empty
    result is written to ``args.output`` as a FASTA record named after the
    alignment file (base name without extension); an empty result is logged
    instead.  A dot is printed to stdout per record written as a progress
    indicator.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    sys.stdout.write("Running")
    sys.stdout.flush()
    with open(args.output, 'w') as outf:
        for f in files:
            aln = AlignIO.read(f, args.input_format)
            locus = os.path.splitext(os.path.basename(f))[0]
            for taxon in aln:
                if taxon.id != args.taxon:
                    continue
                seq = str(taxon.seq).replace('-', '').replace('?', '')
                if not seq:
                    # nothing but gap / missing characters for this taxon
                    log.info("Could not write {}".format(locus))
                    continue
                record = SeqRecord(Seq(seq), id=locus, name="", description="")
                outf.write(record.format("fasta"))
                sys.stdout.write(".")
                sys.stdout.flush()
    # terminate the progress line; works on both Python 2 and Python 3,
    # unlike the bare `print ""` statement
    sys.stdout.write("\n")
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Run ``add_designators`` over every alignment and log dropped loci.

    Reads the match-count config (case-sensitive option names), optionally
    reads an incomplete-matrix config to obtain the missing loci, collects
    the names in the ``Organisms`` section, and builds one work item per
    alignment file.  Work items are processed in a process pool when
    ``args.cores > 1`` and serially otherwise.  Every non-``None`` result is
    logged as a locus dropped for having too few taxa.

    Raises:
        AssertionError: if ``args.cores`` exceeds the number of CPUs.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read config file output by match_count_config.py
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # make case sensitive
    config.optionxform = str
    config.read(args.match_count_output)
    # read the incomplete matrix file that contains loci that are incomplete
    if args.incomplete_matrix:
        incomplete = ConfigParser.RawConfigParser(allow_no_value=True)
        incomplete.optionxform = str
        incomplete.read(args.incomplete_matrix)
        missing = get_missing_loci_from_conf_file(incomplete)
    else:
        missing = None
    # get the taxa in the alignment
    organisms = get_names_from_config(log, config, 'Organisms')
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [
        [
            aln_file,
            args.input_format,
            organisms,
            args.check_missing,
            missing,
            args.verbatim,
            args.min_taxa,
            args.output,
            args.output_format,
        ]
        for aln_file in files
    ]
    log.info("Adding missing data designators using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(add_designators, work)
        # every task has finished; shut the workers down and reap them
        pool.close()
        pool.join()
    else:
        # build a real list so behaviour does not depend on map() semantics
        results = [add_designators(item) for item in work]
    for result in results:
        if result is not None:
            log.info("Dropped {} because of too few taxa (N < {})".format(
                result,
                args.min_taxa
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Collect the distinct taxon names found across all alignments and log them.

    ``worker`` is called once per alignment file with an ``(args, file)``
    tuple -- in a process pool when ``args.cores > 1``, serially otherwise --
    and is expected to return an iterable of names.  The union of all
    returned names is logged as a comma-separated list (in set order, i.e.
    unordered).

    Raises:
        AssertionError: if ``args.cores`` exceeds the number of CPUs.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [(args, f) for f in files]
    sys.stdout.write("Running")
    sys.stdout.flush()
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(worker, work)
        # every task has finished; shut the workers down and reap them
        pool.close()
        pool.join()
    else:
        # build a real list so behaviour does not depend on map() semantics
        results = [worker(item) for item in work]
    # flatten results
    all_taxa = {item for sublist in results for item in sublist}
    # terminate the "Running" progress line; works on both Python 2 and
    # Python 3, unlike the bare `print ""` statement
    sys.stdout.write("\n")
    log.info("Taxon names in alignments: {0}".format(
        ','.join(all_taxa)
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
# Example #5
0
def main():
    """Compute per-alignment statistics and log a series of summaries.

    ``get_stats`` is run on every alignment file (in a process pool when
    ``args.cores > 1``, serially otherwise).  The resulting list is then
    passed to the length, taxa, missing-data, character, matrix and
    distribution helpers, each of which logs its own section.

    Raises:
        AssertionError: if ``args.cores`` exceeds the number of CPUs.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [[aln_file, args.input_format] for aln_file in files]
    log.info("Computing summary statistics using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        summary = pool.map(get_stats, work)
        # every task has finished; shut the workers down and reap them
        pool.close()
        pool.join()
    else:
        # `summary` is measured with len() and traversed by several helpers
        # below, so it must be a real list rather than a one-shot iterator
        # (which is what map() returns on Python 3)
        summary = [get_stats(item) for item in work]
    # alignments
    a_vars = get_lengths(summary)
    log_length_summary(log, len(summary), a_vars)
    # taxa
    t_vars = get_taxa(summary)
    log_taxa_summary(log, t_vars)
    # missing
    m_vars = get_percent_missing(summary)
    log_missing_summary(log, m_vars)
    # characters
    all_bases, sum_characters = total_characters(summary)
    sum_nucleotides = total_nucleotides(summary)
    log_char_summary(log, sum_characters, sum_nucleotides)
    # matrix
    percentages = get_matrix_percentages(t_vars[0])
    log_matrix_summary(log, percentages)
    # taxa dist.
    log_taxa_dist(log, args.show_taxon_counts, t_vars[0])
    # character dist
    log_character_dist(log, all_bases)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Run ``copy_over_files`` on every alignment and log how many were copied.

    The minimum taxon count per alignment is
    ``floor(args.percent * args.taxa)``.  ``copy_over_files`` is called once
    per alignment file (in a process pool when ``args.cores > 1``, serially
    otherwise); its return values are summed to give the number of copied
    alignments, and their count gives the total.

    Raises:
        AssertionError: if ``args.cores`` exceeds the number of CPUs.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # determine the minimum count of taxa needed in each alignment, given --percent
    min_count = int(math.floor(args.percent * args.taxa))
    work = [[aln_file, args.input_format, min_count, args.output] for aln_file in files]
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(copy_over_files, work)
        # every task has finished; shut the workers down and reap them
        pool.close()
        pool.join()
    else:
        # `results` is both summed and measured with len() below, so it must
        # be a real list rather than a one-shot iterator (which is what
        # map() returns on Python 3)
        results = [copy_over_files(item) for item in work]
    log.info("Copied {0} alignments of {1} total containing ≥ {2} proportion of taxa (n = {3})".format(
        sum(results),
        len(results),
        args.percent,
        min_count
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))