Example #1
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """prepare qsub text files.

    commands: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        fd, job_name = mkstemp(dir=jobs_dir, prefix=job_prefix + "_",
                              suffix=".txt")
        close(fd)
        out_fh = open(job_name, "w")

        out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                  keep_output, command))
        out_fh.close()
        filenames.append(job_name)
    return filenames
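A minimal usage sketch for make_jobs follows; the QSUB_TEXT template below is an assumption (the real module-level template is not shown above), and job submission itself is left to the caller:

# Hypothetical PBS template with placeholders in the order used by make_jobs:
# walltime, ncpus, nodes, queue, job name, keep_output flag, command.
QSUB_TEXT = """#!/bin/bash
#PBS -l walltime=%s
#PBS -l ncpus=%d
#PBS -l nodes=%d
#PBS -q %s
#PBS -N %s
#PBS -k %s
%s
"""

commands = ["pick_otus.py -i seqs.fna -o otus/"]
job_fps = make_jobs(commands, job_prefix="otu", queue="friendlyq")
# Submitting is up to the caller, e.g. by building "qsub <job file>" strings:
submit_cmds = ["qsub %s" % job_fp for job_fp in job_fps]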
Example #2
def write_checkpoint(current_key, ctr, cluster_mapping,
                     ids, bestscores, order, out_fp):
    """write intermediate results to checkpoint file

    current_key: the identifier of the current denoiser round
    ctr: a unique counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    order: a list of ids that defines the order in which flowgrams are clustered
    bestscores: a dict of
    """

    checkpoint_dir = out_fp + "/checkpoints/"
    if (not exists(checkpoint_dir)):
        create_dir(checkpoint_dir)
    out_fp = checkpoint_dir + "/checkpoint%d.pickle" % ctr
    out_fh = open(out_fp, "w")
    pickle.dump(
        (current_key,
         ctr,
         cluster_mapping,
         ids,
         bestscores,
         order),
        out_fh)

    return out_fp
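For context, a hedged counterpart that loads such a checkpoint back; the function name read_checkpoint is illustrative and not taken from the code above:

import pickle

def read_checkpoint(checkpoint_fp):
    # Load the tuple written by write_checkpoint, in the same field order.
    with open(checkpoint_fp, "rb") as checkpoint_fh:
        (current_key, ctr, cluster_mapping,
         ids, bestscores, order) = pickle.load(checkpoint_fh)
    return current_key, ctr, cluster_mapping, ids, bestscores, order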
Example #3
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    ignore_pass_filter = opts.ignore_pass_filter

    for lane in lanes:
        read1_fps = sorted(
            glob('%s/s_%s_%d_*qseq.txt' %
                 (input_dir, lane.replace(',', ''), read)))
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        output_f = open(output_fp, 'w')
        for read1_fp in read1_fps:
            for record in iter_split_lines(open(read1_fp, 'U')):
                fastq_s, pass_filter = illumina_data_to_fastq(record,
                                                              number_of_bases=bases)
                if ignore_pass_filter or pass_filter != 0:
                    output_f.write('%s\n' % fastq_s)
        output_f.close()
Example #4
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error("To use --attempt_read_reorientation, one must "
                                "supply a mapping file that contains both LinkerPrimerSequence "
                                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error("To use input_type of barcode_paired_end, "
                                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1, opts.rev_comp_bc2,
                     opts.char_delineator, opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
Example #5
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    ignore_pass_filter = opts.ignore_pass_filter

    for lane in lanes:
        read1_fps = sorted(
            glob('%s/s_%s_%d_*qseq.txt' %
                 (input_dir, lane.replace(',', ''), read)))
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        output_f = open(output_fp, 'w')
        for read1_fp in read1_fps:
            for record in iter_split_lines(open(read1_fp, 'U')):
                fastq_s, pass_filter = illumina_data_to_fastq(
                    record, number_of_bases=bases)
                if ignore_pass_filter or pass_filter != 0:
                    output_f.write('%s\n' % fastq_s)
        output_f.close()
Example #6
    def test_create_dir(self):
        """create_dir creates dir and fails meaningful."""

        # create a directory
        tmp_dir_path = mkdtemp()

        # create a random temporary directory name
        tmp_dir_path2 = join(mkdtemp(), str(uuid4()))
        tmp_dir_path3 = join(mkdtemp(), str(uuid4()))

        self.dirs_to_remove += [tmp_dir_path, tmp_dir_path2, tmp_dir_path3]

        # create on existing dir raises OSError if fail_on_exist=True
        self.assertRaises(OSError,
                          create_dir,
                          tmp_dir_path,
                          fail_on_exist=True)
        self.assertEqual(
            create_dir(tmp_dir_path,
                       fail_on_exist=True,
                       handle_errors_externally=True), 1)

        # return should be 1 if dir exist and fail_on_exist=False
        self.assertEqual(create_dir(tmp_dir_path, fail_on_exist=False), 1)

        # if dir not there make it and return always 0
        self.assertEqual(create_dir(tmp_dir_path2), 0)
        self.assertEqual(create_dir(tmp_dir_path3, fail_on_exist=True), 0)
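The contract exercised by this test can be summarised with a minimal sketch of a create_dir-like helper (an illustration of the tested behaviour, not the library's implementation):

import os

def create_dir_sketch(dir_path, fail_on_exist=False,
                      handle_errors_externally=False):
    # Return 0 if the directory was created, 1 if it already existed.
    if os.path.exists(dir_path):
        if fail_on_exist:
            if handle_errors_externally:
                return 1
            raise OSError("Directory already exists: %s" % dir_path)
        return 1
    os.makedirs(dir_path)
    return 0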
Example #7
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode,
        correlation_type=opts.correlation_type,
        tail_type=opts.tail_type,
        num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
       basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2], range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir, 'detailed_comparisons.txt'),
                          'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
Example #8
def main():
    args = handle_program_options()

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    metagenomes = []
    if args.metagenome_id is not None:
        metagenomes.append(args.metagenome_id)
    elif args.metagenome_file is not None:
        metagenomes.extend(parse_metagenome_file(args.metagenome_file))
        
    if args.verbose:
        msg = 'Processing requested for {} metagenome(s) found in: {}'
        print(msg.format(len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derepp_rsp = mgapi.mgrast_request('download', mg_id,
                                          {'file': derep_passed},
                                          auth_key=args.auth_key)
        derepp_sc = SequenceCollection.read(StringIO(derepp_rsp.text),
                                            format='fastq',
                                            variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(derepp_sc)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screenp_rsp = mgapi.mgrast_request('download', mg_id,
                                           {'file': screen_passed},
                                           auth_key=args.auth_key)
        screenp_ids = extract_seq_ids(screenp_rsp.text, fmt='fastq',
                                      variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screenp_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derepp_sc, screenp_ids)
        if args.verbose:
            nsp = len(screenp_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(nsp))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode, correlation_type=opts.correlation_type,
        tail_type=opts.tail_type, num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
       basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2], range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir,
                               'detailed_comparisons.txt'), 'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
Example #10
def main():

    with open('/data/input/AppSession.json', 'U') as fd_json:
        app = json.load(fd_json)

    jobs = '1'
    # get command attributes, etc
    for item in app['Properties']['Items']:
        if item['Name'] == 'Input.Projects':
            project_id = item['Items'][0]['Id']
        if item['Name'] == 'Input.number-of-jobs':
            jobs = item['Content']

    # from BaseSpace's documentation
    input_dir = '/data/input/samples/'
    base = join('/data/output/appresults/', project_id)
    output_dir = join(base, 'sl-out')

    # for sanity
    create_dir(output_dir)

    # split libraries
    cmd = ("multiple_split_libraries_fastq.py "
           "-i '{input_dir}' -o '{output_dir}'")
    params = {'input_dir': input_dir, 'output_dir': output_dir}

    system_call(cmd.format(**params))
    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    # OTU picking
    input_dir = join(output_dir, 'seqs.fna')
    output_dir = join(base, 'closed-ref')
    cmd = ("pick_closed_reference_otus.py "
           "-i '{input_seqs}' -o '{output_dir}'")
    params = {'input_seqs': input_dir, 'output_dir': output_dir, 'jobs': jobs}

    # see https://github.com/biocore/qiime/issues/2034
    if jobs != '1':
        cmd += ' -a -O {jobs}'

    system_call(cmd.format(**params))
    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    input_dir = join(output_dir, 'otu_table.biom')
    output_dir = join(base, 'closed-ref', 'table-summary.txt')
    cmd = ("biom summarize-table "
           "-i '{input_table}' -o '{output_summary}'")
    params = {'input_table': input_dir, 'output_summary': output_dir}
    system_call(cmd.format(**params))

    return 0
Example #11
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field
    results = make_distance_boxplots(dm_f, map_f, fields, width=opts.width,
                                     height=opts.height, suppress_all_within=opts.suppress_all_within,
                                     suppress_all_between=opts.suppress_all_between,
                                     suppress_individual_within=opts.suppress_individual_within,
                                     suppress_individual_between=opts.suppress_individual_between,
                                     y_min=opts.y_min, y_max=opts.y_max,
                                     whisker_length=opts.whisker_length, box_width=opts.box_width,
                                     box_color=opts.box_color,
                                     color_individual_within_by_field=color_individual_within_by_field,
                                     sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir, "%s_Distances.%s" %
                              (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w')
            sig_tests_results = all_pairs_t_test(plot_labels, plot_data,
                                                 tail_type=opts.tail_type,
                                                 num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            raw_data_f = open(raw_data_fp, 'w')

            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
Example #12
def main():

    with open('/data/input/AppSession.json', 'U') as fd_json:
        app = json.load(fd_json)

    jobs = '1'
    # get command attributes, etc
    for item in app['Properties']['Items']:
        if item['Name'] == 'Input.Projects':
            project_id = item['Items'][0]['Id']
        if item['Name'] == 'Input.number-of-jobs':
            jobs = item['Content']

    # from BaseSpace's documentation
    input_dir = '/data/input/samples/'
    base = join('/data/output/appresults/', project_id)
    output_dir = join(base, 'sl-out')

    # for sanity
    create_dir(output_dir)

    # split libraries
    cmd = ("multiple_split_libraries_fastq.py "
           "-i '{input_dir}' -o '{output_dir}'")
    params = {'input_dir': input_dir, 'output_dir': output_dir}

    system_call(cmd.format(**params))
    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    # OTU picking
    input_dir = join(output_dir, 'seqs.fna')
    output_dir = join(base, 'closed-ref')
    cmd = ("pick_closed_reference_otus.py "
           "-i '{input_seqs}' -o '{output_dir}'")
    params = {'input_seqs': input_dir, 'output_dir': output_dir, 'jobs': jobs}

    # see https://github.com/biocore/qiime/issues/2034
    if jobs != '1':
        cmd += ' -a -O {jobs}'

    system_call(cmd.format(**params))
    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    input_dir = join(output_dir, 'otu_table.biom')
    output_dir = join(base, 'closed-ref', 'table-summary.txt')
    cmd = ("biom summarize-table " "-i '{input_table}' -o '{output_summary}'")
    params = {'input_table': input_dir, 'output_summary': output_dir}
    system_call(cmd.format(**params))

    return 0
Example #13
    def setUp(self):
        # create the temporary input files that will be used

        self.iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C',
                      'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]',
                      'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]',
                      'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

        self.output_dir = mkdtemp()
        self.output_dir += '/'

        create_dir(self.output_dir)
Example #14
    def setUp(self):
        # create the temporary input files that will be used

        self.iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C',
                      'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]',
                      'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]',
                      'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

        self.output_dir = mkdtemp()
        self.output_dir += '/'

        create_dir(self.output_dir)
Example #15
    def write_xml_file(self, element, fp):
        """Writes an XML file after calling one of the XML generation
        functions

        Parameters
        ----------
        element : ET.Element
            The Element to be written
        fp : str
            The filepath to which the XML will be written
        """
        create_dir(self.xml_dir)
        ET.ElementTree(element).write(fp, encoding='UTF-8')
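A standalone sketch of what this method ends up doing with ElementTree (the element name and output path are illustrative, not taken from the class above):

import xml.etree.ElementTree as ET

# Build a minimal element and serialise it, mirroring write_xml_file's body.
sample_set = ET.Element('SAMPLE_SET')
ET.SubElement(sample_set, 'SAMPLE', {'alias': 'sample_1'})
ET.ElementTree(sample_set).write('sample_set.xml', encoding='UTF-8')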
Example #16
    def write_xml_file(self, element, fp):
        """Writes an XML file after calling one of the XML generation
        functions

        Parameters
        ----------
        element : ET.Element
            The Element to be written
        fp : str
            The filepath to which the XML will be written
        """
        create_dir(self.xml_dir)
        ET.ElementTree(element).write(fp, encoding='UTF-8')
Example #17
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_dir = opts.output_dir
    categories = opts.categories.split(",")

    # Create the output dir if it doesn't already exist.
    try:
        if not exists(out_dir):
            create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory '%s' "
                            "specified with the -o option." % out_dir)

    compare_categories(opts.input_dm, opts.mapping_file, opts.method,
                       categories, opts.num_permutations, out_dir)
Example #18
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_dir = opts.output_dir
    categories = opts.categories.split(',')

    # Create the output dir if it doesn't already exist.
    try:
        if not exists(out_dir):
            create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory '%s' "
                            "specified with the -o option." % out_dir)

    compare_categories(opts.input_dm, opts.mapping_file, opts.method,
                       categories, opts.num_permutations, out_dir)
Example #19
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.num_permutations < 1:
        option_parser.error(
            "--num_permutations must be greater than or equal to 1.")

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0])
                              for k, v in fields_to_dict(open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    if opts.method == 'mantel':
        output_f = open(path.join(opts.output_dir, 'mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('mantel', input_dm_fps, distmats,
                       opts.num_permutations, opts.tail_type,
                       comment_mantel_pmantel, sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        output_f = open(path.join(opts.output_dir,
                        'partial_mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('partial_mantel', input_dm_fps,
                       distmats, opts.num_permutations, opts.tail_type,
                       comment_mantel_pmantel, control_dm_fp=opts.control_dm,
                       control_dm=parse_distmat(open(opts.control_dm, 'U')),
                       sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        output_f = open(path.join(opts.output_dir,
                        'mantel_correlogram_results.txt'), 'w')
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr,
            opts.alpha, sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)

        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type),
                         format=opts.image_type)
    output_f.close()
Example #20
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c

    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        sequence_output_f = open(sequence_output_fp, 'w')
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir,
                                                      input_basename)
        barcode_output_f = open(barcode_output_fp, 'w')
        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual =\
                iseq_to_qseq_fields(
                    line,
                    barcode_in_header,
                    barcode_length,
                    barcode_qual_c)

            sequence_s, pass_filter_s = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7], sequence, sequence_qual))

            barcode_s, pass_filter_b = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7], barcode, barcode_qual),
                barcode_length)
            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)
        sequence_output_f.close()
        barcode_output_f.close()
Example #21
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0])
                              for k, v in fields_to_dict(open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    if opts.method == 'mantel':
        output_f = open(path.join(opts.output_dir, 'mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('mantel', input_dm_fps, distmats,
                       opts.num_permutations, opts.tail_type,
                       comment_mantel_pmantel, sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        output_f = open(path.join(opts.output_dir,
                        'partial_mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('partial_mantel', input_dm_fps,
                       distmats, opts.num_permutations, opts.tail_type,
                       comment_mantel_pmantel, control_dm_fp=opts.control_dm,
                       control_dm=parse_distmat(open(opts.control_dm, 'U')),
                       sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        output_f = open(path.join(opts.output_dir,
                        'mantel_correlogram_results.txt'), 'w')
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr,
            opts.alpha, sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)

        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type),
                         format=opts.image_type)
    output_f.close()
Example #22
def make_jobs(commands,
              job_prefix,
              queue,
              jobs_dir="jobs/",
              walltime="72:00:00",
              ncpus=1,
              nodes=1,
              keep_output="oe"):
    """prepare qsub text files.

    commands: list of commands

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """

    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        fd, job_name = mkstemp(dir=jobs_dir,
                               prefix=job_prefix + "_",
                               suffix=".txt")
        close(fd)
        out_fh = open(job_name, "w")

        out_fh.write(
            QSUB_TEXT %
            (walltime, ncpus, nodes, queue, job_prefix, keep_output, command))
        out_fh.close()
        filenames.append(job_name)
    return filenames
Example #23
    def setUp(self):
        # create the temporary input files that will be used

        self._files_to_remove = []

        fd, self.qual_fp = mkstemp(prefix='qual_scores_', suffix='.qual')
        close(fd)
        seq_file = open(self.qual_fp, 'w')
        seq_file.write(qual_scores)
        seq_file.close()

        self.output_dir = mkdtemp()
        self.output_dir += '/'

        create_dir(self.output_dir)

        self.expected_output_text_file = expected_output_text_file

        self._files_to_remove =\
            [self.qual_fp]
Example #24
    def test_create_dir(self):
        # create a directory
        tmp_dir_path = mkdtemp()

        # create a random temporary directory name
        tmp_dir_path2 = join(mkdtemp(), str(uuid4()))
        tmp_dir_path3 = join(mkdtemp(), str(uuid4()))

        self.dirs_to_remove += [tmp_dir_path, tmp_dir_path2, tmp_dir_path3]

        # create on existing dir raises OSError if fail_on_exist=True
        self.assertRaises(OSError, create_dir, tmp_dir_path, fail_on_exist=True)
        self.assertEqual(create_dir(tmp_dir_path, fail_on_exist=True, handle_errors_externally=True), 1)

        # return should be 1 if dir exist and fail_on_exist=False
        self.assertEqual(create_dir(tmp_dir_path, fail_on_exist=False), 1)

        # if dir not there make it and return always 0
        self.assertEqual(create_dir(tmp_dir_path2), 0)
        self.assertEqual(create_dir(tmp_dir_path3, fail_on_exist=True), 0)
Example #25
    def setUp(self):
        # create the temporary input files that will be used

        self._files_to_remove = []

        fd, self.qual_fp = mkstemp(prefix='qual_scores_', suffix='.qual')
        close(fd)
        seq_file = open(self.qual_fp, 'w')
        seq_file.write(qual_scores)
        seq_file.close()

        self.output_dir = mkdtemp()
        self.output_dir += '/'

        create_dir(self.output_dir)

        self.expected_output_text_file = expected_output_text_file

        self._files_to_remove =\
            [self.qual_fp]
Example #26
def write_checkpoint(current_key, ctr, cluster_mapping, ids, bestscores, order,
                     out_fp):
    """write intermediate results to checkpoint file

    current_key: the identifier of the current denoiser round
    ctr: a unique counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    order: a list of ids that defines the order in which flowgrams are clustered
    bestscores: a dict of
    """

    checkpoint_dir = out_fp + "/checkpoints/"
    if (not exists(checkpoint_dir)):
        create_dir(checkpoint_dir)
    out_fp = checkpoint_dir + "/checkpoint%d.pickle" % ctr
    out_fh = open(out_fp, "w")
    pickle.dump((current_key, ctr, cluster_mapping, ids, bestscores, order),
                out_fh)

    return out_fp
Example #27
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        out_fileprefix: string used to create output filepath - output filepaths
         are <out_prefix>.<i>.fasta where i runs from 0 to number of output files
        working_dir: directory to prepend to temp filepaths (defaults to
         empty string -- files written to cwd)

        List of output filepaths is returned.

    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    for seq_id, seq in parse_fasta(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' \
                % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    if not current_out_file.closed:
        current_out_file.close()

    return out_files
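A hedged usage sketch; the in-memory FASTA lines below follow the docstring's note that infile may be a list of fasta lines, and the resulting paths are only indicative:

fasta_lines = ['>seq1', 'ACCGT', '>seq2', 'TTGCA', '>seq3', 'GGATC']
chunk_fps = split_fasta(fasta_lines, seqs_per_file=2,
                        outfile_prefix='chunk', working_dir='fasta_chunks')
# chunk_fps would look like:
# ['fasta_chunks/chunk.0.fasta', 'fasta_chunks/chunk.1.fasta']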
Example #28
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        out_fileprefix: string used to create output filepath - output filepaths
         are <out_prefix>.<i>.fasta where i runs from 0 to number of output files
        working_dir: directory to prepend to temp filepaths (defaults to
         empty string -- files written to cwd)

        List of output filepaths is returned.

    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    for seq_id, seq in parse_fasta(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' \
                % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    if not current_out_file.closed:
        current_out_file.close()

    return out_files
Example #29
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    output_dir = opts.output_dir
    try:
        create_dir(output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o/--output_dir option.")

    otu_table_fp = opts.otu_table_fp
    table = load_table(otu_table_fp)

    estimator = ObservationRichnessEstimator(table,
                                             Chao1MultinomialPointEstimator)
    results = estimator(opts.min, opts.max, opts.num_steps,
                        opts.confidence_level)

    out_fp = join(output_dir, 'estimates_table.txt')
    with open(out_fp, 'w') as out_f:
        results.toTable(out_f)
Example #30
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error(
                "To use --attempt_read_reorientation, one must "
                "supply a mapping file that contains both LinkerPrimerSequence "
                "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error(
                "To use input_type of barcode_paired_end, "
                "a second fastq file must be specified with --fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1,
                     opts.rev_comp_bc2, opts.char_delineator,
                     opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
Example #31
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(
            open(opts.distance_matrix_fp, 'U'))
    except:
        option_parser.error("This does not look like a valid distance matrix "
                            "file. Please supply a valid distance matrix file using the -d "
                            "option.")
    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(
            open(opts.mapping_fp, 'U'))
    except QiimeParseError:
        option_parser.error("This does not look like a valid metadata mapping "
                            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    comparison_field_states = opts.comparison_groups
    comparison_field_states = map(strip, comparison_field_states.split(','))
    comparison_field_states = [field_state.strip('"').strip("'")
                               for field_state in comparison_field_states]
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(dist_matrix_header,
                                                       dist_matrix, mapping_header, mapping, field,
                                                       comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        try:
            num_x = float(x)
            num_y = float(y)
            return int(num_x - num_y)
        except:
            if x < y:
                return -1
            elif x > y:
                return 1
            else:
                return 0

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except:
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(
                comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB())
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = grouped_distributions(
        opts.plot_type, plot_data, x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors, x_label=plot_x_label,
        y_label=plot_y_label, title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation, y_min=y_min,
        y_max=y_max, whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width, figure_width=width,
        figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(opts.output_dir, "%s_Distance_Comparisons.%s" %
                          (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), 'w')

        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' % (data_point_label,
                                                      comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(sig_tests_labels, sig_tests_data,
                                             tail_type=opts.tail_type,
                                             num_permutations=opts.num_permutations)
        sig_tests_f.write(sig_tests_results)
        sig_tests_f.close()

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\
            "labels does not match the number of points along the x-axis."
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        raw_data_f = open(raw_data_fp, 'w')

        raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
        for label, data in zip(plot_x_axis_labels, plot_data):
            assert (len(comparison_field_states) == len(data)), "The " +\
                "number of specified comparison groups does not match " +\
                "the number of groups found at the current point along " +\
                "the x-axis."
            for comp_field_state, comp_grp_data in zip(comparison_field_states, data):
                raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                 "\t".join(map(str, comp_grp_data)) + "\n")
        raw_data_f.close()
Example #32
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error(
            "Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'),
            valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(
            output_dir,
            'core_otus_%s.txt' %
            fraction_for_core_str)
        output_table_fp = join(
            output_dir,
            'core_table_%s.biom' %
            fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table,
                                              sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write(
                "# No OTUs present in %s %% of samples." %
                fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write(
                "# Core OTUs across %s %% of samples.\n" %
                fraction_for_core_str)
        else:
            output_f.write(
                "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel(
        "Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
Example #33
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = opts.seqs_per_sample

    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    #if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?"
                          % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    create_dir(output_dir, fail_on_exist=not opts.force)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_beta_diversity_through_plots(otu_table_fp=otu_table_fp,
                                     mapping_fp=mapping_fp,
                                     output_dir=output_dir,
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     color_by_interesting_fields_only=not opts.color_by_all_fields,
                                     sampling_depth=seqs_per_sample,
                                     tree_fp=tree_fp,
                                     parallel=parallel,
                                     suppress_emperor_plots=opts.suppress_emperor_plots,
                                     status_update_callback=status_update_callback)
Example #34
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        `[70]`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        `[50]`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the first
        database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the second
        database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    db_a = open(interest_fp, 'U')
    db_b = open(other_fp, 'U')

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts

    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both databases"
                               " should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases
    total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                    interest_alg_lens)
    parse_second_database(db_b, best_hits, other_pcts,
                          other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens,
                              other_pcts, other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(['interest db (%s)' %
                                     basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):

            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
Example #35
0
    def _get_job_commands(self,
                          input_fp,
                          output_dir,
                          params,
                          job_prefix,
                          working_dir,
                          command_prefix='/bin/bash; ',
                          command_suffix='; exit'):
        """Generate beta diversity to split single OTU table to multiple jobs

        full_tree=True is faster: beta_diversity.py -f will make things
        go faster, but be sure you already have the correct minimal tree.
        """
        commands = []
        result_filepaths = []

        sids = load_table(input_fp).ids()

        if params['full_tree']:
            full_tree_str = '-f'
        else:
            full_tree_str = ''

        if params['tree_path']:
            tree_str = '-t %s' % params['tree_path']
        else:
            tree_str = ''

        metrics = params['metrics']

        # this is a little bit of an abuse of _merge_to_n_commands, so may
        # be worth generalizing that method - this determines the correct
        # number of samples to process in each command
        sample_id_groups = self._merge_to_n_commands(sids,
                                                     params['jobs_to_start'],
                                                     delimiter=',',
                                                     command_prefix='',
                                                     command_suffix='')

        for i, sample_id_group in enumerate(sample_id_groups):
            working_dir_i = join(working_dir, str(i))
            create_dir(working_dir_i)
            output_dir_i = join(output_dir, str(i))
            create_dir(output_dir_i)
            result_filepaths.append(output_dir_i)
            input_dir, input_fn = split(input_fp)
            input_basename, input_ext = splitext(input_fn)
            sample_id_desc = sample_id_group.replace(',', '_')
            output_fns = ['%s_%s.txt' % (metric, input_basename)
                          for metric in metrics.split(',')]
            rename_command, current_result_filepaths = self._get_rename_command(
                output_fns, working_dir_i, output_dir_i)

            result_filepaths += current_result_filepaths

            bdiv_command = '%s -i %s -o %s %s -m %s %s -r %s' %\
                (self._script_name,
                 input_fp,
                 working_dir_i,
                 tree_str,
                 params['metrics'],
                 full_tree_str,
                 sample_id_group)

            shell_script_fp = '%s/%s%d.sh' % (working_dir_i, job_prefix, i)
            shell_script_commands = [bdiv_command] + rename_command.split(';')
            self._commands_to_shell_script(shell_script_commands,
                                           shell_script_fp)
            commands.append('bash %s' % shell_script_fp)

        commands = self._merge_to_n_commands(commands,
                                             params['jobs_to_start'],
                                             command_prefix=command_prefix,
                                             command_suffix=command_suffix)

        return commands, result_filepaths
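
The grouping above is delegated to self._merge_to_n_commands; a standalone sketch of the same idea (split the sample ids into at most n comma-joined groups, one beta diversity command per group) could look like the following. This is an illustration under that assumption, not the actual QIIME implementation:

def group_sample_ids(sample_ids, n, delimiter=','):
    # split sample_ids into at most n groups and join each group with the
    # delimiter; one beta diversity command is then issued per group
    n = max(1, min(n, len(sample_ids)))
    step = -(-len(sample_ids) // n)  # ceiling division
    return [delimiter.join(sample_ids[i:i + step])
            for i in range(0, len(sample_ids), step)]

# group_sample_ids(['S1', 'S2', 'S3', 'S4', 'S5'], 2)
# -> ['S1,S2,S3', 'S4,S5']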
Example #36
0
        try:
            interest_taxonomy = sequences_from_query(open(tax_fp, 'U'), query)
        except (PlatypusValueError, PlatypusParseError), e:
            raise BadParameter(e.message)

        if len(interest_taxonomy) == 0:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        interest_taxonomy = {
            l.strip().split('\t')[0].strip(): ''
            for l in open(split_fp, 'U')
        }
        if not interest_taxonomy:
            raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    interest_fp = open(join(output_fp, 'interest.fna'), 'w')
    rest_fp = open(join(output_fp, 'rest.fna'), 'w')

    for record in read(seqs_fp, format='fasta'):
        full_name = record.id
        seq = record.sequence

        name = full_name.strip().split(' ')[0].strip()

        if name in interest_taxonomy:
            interest_fp.write(">%s\n%s\n" % (full_name, seq))
        else:
            rest_fp.write(">%s\n%s\n" % (full_name, seq))
Example #37
0
def main():
    args = handle_program_options()

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    metagenomes = []
    if args.metagenome_id is not None:
        metagenomes.append(args.metagenome_id)
    elif args.metagenome_file is not None:
        metagenomes.extend(parse_metagenome_file(args.metagenome_file))

    if args.verbose:
        msg = 'Processing requested for {} metagenome(s) found in: {}'
        print(msg.format(len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derepp_rsp = mgapi.mgrast_request('download',
                                          mg_id, {'file': derep_passed},
                                          auth_key=args.auth_key)
        derepp_sc = SequenceCollection.read(StringIO(derepp_rsp.text),
                                            format='fastq',
                                            variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(derepp_sc)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screenp_rsp = mgapi.mgrast_request('download',
                                           mg_id, {'file': screen_passed},
                                           auth_key=args.auth_key)
        screenp_ids = extract_seq_ids(screenp_rsp.text,
                                      fmt='fastq',
                                      variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screenp_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derepp_sc, screenp_ids)
        if args.verbose:
            nsp = len(screenp_ids)
            print(
                '\tRemoved {} sequences from Dereplication Passed'.format(nsp))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
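
filter_seqs and extract_seq_ids are helpers defined elsewhere in this script; a minimal sketch of the filtering idea (keep only the dereplication-passed records whose ids did not also pass the screen), assuming a plain dict of id -> sequence instead of a scikit-bio SequenceCollection:

def filter_failed_screen(derep_passed, screen_passed_ids):
    # derep_passed: dict mapping sequence id -> sequence string
    # screen_passed_ids: set of ids that passed the screening stage
    # returns only the records that failed the screen
    return {seq_id: seq for seq_id, seq in derep_passed.items()
            if seq_id not in screen_passed_ids}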
Example #38
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file; the reasoning is to
    # allow argument lists that span hundreds of samples, which would be
    # prohibitive to pass directly on the command line
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if the data are not multiplexed (i.e., no barcode reads were provided),
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error("If providing --sample_ids (because "
                                "your data is not multiplexed), must provide the same number "
                                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
                            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # demultiplexed fastq or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {str(DNA(k).rc()): v for k, v in
                                    barcode_to_sample_id.iteritems()}

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay "
                                    "codes. Do they need to be reverse complemented? If these "
                                    "are not golay barcodes pass --barcode_type 12 to disable "
                                    "barcode error correction, or pass --barcode_type # if "
                                    "the barcodes are not 12 base pairs, where # is the size "
                                    "of the barcodes. Invalid codes:\n\t%s" %
                                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp,
                         safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f, barcode_read_f, barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N, start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f, histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f, sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp, seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f, histogram_f=histogram_f,
                phred_offset=phred_offset)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
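
The qual_writer/fastq_writer closures above use a small pattern worth noting: bind either a real writer or a no-op before the main loop so the per-read code never has to re-check the output options. A stripped-down sketch of that pattern (the names here are illustrative, not part of the original script):

def make_writer(store, out_f):
    # return a real writer when output is being stored, otherwise a no-op,
    # so the hot loop can call writer(...) unconditionally
    if store:
        def writer(header, data):
            out_f.write('>%s\n%s\n' % (header, data))
    else:
        def writer(header, data):
            pass
    return writer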
Example #39
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants to
    # provide a smaller reference collection, or to use the input reference
    # collection when running in iterative mode (rather than an iteration's
    # new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp, _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp, parallel, params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failure sequences is greater than the threshold,
    # continue to steps 2, 3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp), percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                     new_ref_set_id, denovo_otu_picking_method,
                                     params, logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp, parallel,
                                        params, logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

        index_links.append((
            'Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
            merged_otu_map_fp, _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))
        ])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp, _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be:
    # the reference set needs to be a superset of the input reference set,
    # while the repset needs to contain only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences', final_repset_fp,
                        _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)', new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 2 and step 4 to the final representative set and the new reference'
            + ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp, _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size, pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(('OTU taxonomic assignments', taxonomy_fp,
                                _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree', rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
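
subsample_fasta (qiime.util) is used above to retain roughly percent_subsample of the step 1 failures before de novo clustering. A self-contained sketch of that kind of per-record random subsampling, under the assumption that each record is kept independently with the given probability:

from random import random

def subsample_fasta_sketch(input_fp, output_fp, fraction):
    # keep each FASTA record independently with probability `fraction`,
    # so roughly fraction * N of the records survive
    with open(input_fp) as inp, open(output_fp, 'w') as out:
        keep = False
        for line in inp:
            if line.startswith('>'):
                keep = random() < fraction
            if keep:
                out.write(line)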
Example #40
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
Example #42
0
        try:
            interest_taxonomy = sequences_from_query(open(tax_fp, 'U'),
                                                     query)
        except (PlatypusValueError, PlatypusParseError), e:
            raise BadParameter(e.message)

        if len(interest_taxonomy) == 0:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        interest_taxonomy = {l.strip().split('\t')[0].strip(): ''
                             for l in open(split_fp, 'U')}
        if not interest_taxonomy:
            raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    interest_fp = open(join(output_fp, 'interest.fna'), 'w')
    rest_fp = open(join(output_fp, 'rest.fna'), 'w')

    for record in read(seqs_fp, format='fasta'):
        full_name = record.id
        seq = record.sequence

        name = full_name.strip().split(' ')[0].strip()

        if name in interest_taxonomy:
            interest_fp.write(">%s\n%s\n" % (full_name, seq))
        else:
            rest_fp.write(">%s\n%s\n" % (full_name, seq))
Example #43
0
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if(opts.checkpoint_fp):
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error(
                'Specified checkpoint file does not exist: %s' %
                bp_fp)

    # peek into sff.txt files to make sure they are parseable
    # cat_sff_files is lazy and only reads the header
    flowgrams, header = cat_sff_files(map(open, opts.sff_fps))

    if(opts.split and opts.preprocess_fp):
        parser.error('Options --split and --preprocess_fp are exclusive')

    if(opts.preprocess_fp):
        pp_fp = opts.preprocess_fp
        if not exists(opts.preprocess_fp):
            parser.error(
                'Specified preprocess directory does not exist: %s' %
                opts.preprocess_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: ' +
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error(
            'Specified error profile %s does not exist' %
            opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
        create_dir(tmpoutdir, not opts.force)
    else:
        # make random dir in current dir
        tmpoutdir = mkdtemp(dir="", prefix="denoiser_", suffix="/")

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(
            opts.sff_fps, opts.fasta_fp, tmpoutdir, opts.cluster,
            opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
            opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
            opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter,
            opts.titanium)
    else:
        denoise_seqs(
            opts.sff_fps, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster,
            opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer,
            opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory,
            opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium,
            opts.checkpoint_fp)
Пример #44
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c

    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        sequence_output_f = open(sequence_output_fp, 'w')
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir,
                                                      input_basename)
        barcode_output_f = open(barcode_output_fp, 'w')
        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual =\
                iseq_to_qseq_fields(
                    line,
                    barcode_in_header,
                    barcode_length,
                    barcode_qual_c)

            sequence_s, pass_filter_s = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7],
                 sequence, sequence_qual))

            barcode_s, pass_filter_b = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7],
                 barcode, barcode_qual), barcode_length)
            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)
        sequence_output_f.close()
        barcode_output_f.close()
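
The gzip branch above strips two extensions so that a gzipped and an uncompressed input yield the same basename; a quick illustration of that path handling (file names are hypothetical):

from os.path import split, splitext

# Illustrative only: mirrors the basename handling in the loop above.
split(splitext(splitext('data/run1.fastq.gz')[0])[0])[1]  # -> 'run1'
split(splitext('data/run1.fastq')[0])[1]                  # -> 'run1'
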
Пример #45
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = opts.seqs_per_sample

    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    #if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError(
                "Can't open parameters file (%s). Does it exist? Do you have read access?"
                % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params, jobs_to_start,
                                   default_jobs_to_start, parallel,
                                   option_parser)

    create_dir(output_dir, fail_on_exist=not opts.force)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_beta_diversity_through_plots(
        otu_table_fp=otu_table_fp,
        mapping_fp=mapping_fp,
        output_dir=output_dir,
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config,
        color_by_interesting_fields_only=not opts.color_by_all_fields,
        sampling_depth=seqs_per_sample,
        tree_fp=tree_fp,
        parallel=parallel,
        suppress_emperor_plots=opts.suppress_emperor_plots,
        status_update_callback=status_update_callback)
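
For context, parse_qiime_parameters above consumes lines of the form 'script_name:parameter_name value'. The sketch below is hedged: the exact whitespace handling is up to parse_qiime_parameters, and the values shown are examples rather than recommended settings.

# Hypothetical parameter lines; passing a list of lines works the same way as
# passing an open file, as the empty-list call above suggests.
example_lines = ['beta_diversity:metrics bray_curtis,weighted_unifrac\n',
                 'make_emperor:ignore_missing_samples True\n']
params = parse_qiime_parameters(example_lines)
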
Пример #46
0
def main():

    spreadsheet_key = None

    with open('/data/input/AppSession.json', 'U') as fd_json:
        app = json.load(fd_json)

    # get command attributes, etc
    for item in app['Properties']['Items']:
        if item['Name'] == 'Input.Projects':
            project_id = item['Items'][0]['Id']
        if item['Name'] == 'Input.spreadsheet-key':
            spreadsheet_key = item['Content']
        if item['Name'] == 'Input.app-result-id':
            results_id = item['Content']['Id']
        if item['Name'] == 'Input.rarefaction-depth':
            depth = item['Content']
        if item['Name'] == 'Input.number-of-jobs':
            jobs = item['Content']


    # from BaseSpace's documentation
    input_dir = '/data/input/appresults/'
    base = join('/data/output/appresults/', project_id)

    create_dir(base)

    # OTU picking
    input_dir = join(input_dir, results_id)

    mapping_fp = join(base, 'mapping-file.txt')
    cmd = ("load_remote_mapping_file.py "
           "-k {spreadsheet_key} -o {mapping_fp}")
    params = {'spreadsheet_key': spreadsheet_key, 'mapping_fp': mapping_fp}

    system_call(cmd.format(**params))

    biom_fp = join(input_dir, 'otu_table.biom')
    tree_fp = glob(join(input_dir, '*.tree'))[0]
    output_dir = join(base, 'corediv-out')

    bt = load_table(biom_fp)

    if bt.is_empty():
        logging.error('BIOM table is empty, cannot perform diversity '
                      'analyses.')
        return 11

    params_fp = join(base, 'alpha-params.txt')
    with open(params_fp, 'w') as alpha_fp:
        alpha_fp.write('alpha_diversity:metrics shannon,PD_whole_tree,'
                       'chao1,observed_species')

    cmd = ("core_diversity_analyses.py "
           "-i {biom_fp} -o {output_dir} -m {mapping_fp} -e {depth} "
           "-t {tree_fp} -p {params_fp}")
    params = {'biom_fp': biom_fp, 'output_dir': output_dir,
              'mapping_fp': mapping_fp, 'depth': depth, 'jobs': jobs,
              'tree_fp': tree_fp, 'params_fp': params_fp}

    # see https://github.com/biocore/qiime/issues/2034
    if jobs != '1':
        cmd += ' -a -O {jobs}'

    system_call(cmd.format(**params))

    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    return 0
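
The property loop above only touches a handful of fields in AppSession.json; a hypothetical minimal payload it would accept (all values invented) looks like this:

# Hypothetical AppSession.json fragment; only the fields read above are shown.
app = {'Properties': {'Items': [
    {'Name': 'Input.Projects', 'Items': [{'Id': '12345'}]},
    {'Name': 'Input.spreadsheet-key', 'Content': 'abcDEF123'},
    {'Name': 'Input.app-result-id', 'Content': {'Id': '67890'}},
    {'Name': 'Input.rarefaction-depth', 'Content': '1000'},
    {'Name': 'Input.number-of-jobs', 'Content': '4'},
]}}
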
Пример #47
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file; the reasoning is to
    # allow argument lists that span hundreds of samples, which would be
    # prohibitive to pass directly on the command line
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if the data are not barcoded (i.e., the run is not multiplexed),
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # demultiplexed fastq or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                str(DNA(k).rc()): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_offset=phred_offset)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
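
One detail worth calling out in the example above is the writer-closure pattern: qual_writer and fastq_writer are defined once, as real writers or as no-ops, so the per-read loop never re-checks the store_* flags. A minimal, QIIME-independent sketch of the same idea:

def make_fasta_writer(fh):
    # Illustrative only: return a real writer, or a no-op when fh is None.
    if fh is None:
        return lambda header, seq: None
    return lambda header, seq: fh.write('>%s\n%s\n' % (header, seq))
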
Пример #48
0
def compare(interest_fp,
            other_fp,
            output_dir='blast-results-compare',
            interest_pcts=None,
            interest_alg_lens=None,
            other_pcts=None,
            other_alg_lens=None,
            hits_to_first=False,
            hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None is passed, it defaults to
        `[70]`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        `[50]`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the first
        database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the second
        database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    db_a = open(interest_fp, 'U')
    db_b = open(other_fp, 'U')

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts

    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both databases"
                               " should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases
    total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                    interest_alg_lens)
    parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(
                ['interest db (%s)' % basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):

            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(
                    ['%s\t%d' % (k, v) for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(
            ['\t'.join(item) for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
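
A minimal usage sketch for compare(), assuming two BLAST tabular result files are already on disk (the file names and thresholds here are hypothetical):

compare('interest_blast.txt', 'other_blast.txt',
        output_dir='blast-results-compare',
        interest_pcts=[70, 90], interest_alg_lens=[50],
        hits_to_first=True)
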
Пример #49
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # use the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback,
                                                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
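
As the iteration loop above shows, each input file gets its own numbered output directory and the new_refseqs.fna written there becomes the reference set for the next iteration; roughly (paths are illustrative):

# Illustrative only: how the reference set chains across iterations above.
refseqs_fp = 'initial_refseqs.fna'
for i, input_fp in enumerate(['seqs_run1.fna', 'seqs_run2.fna']):
    iteration_output_dir = '%s/%d/' % ('otus_iterative', i)
    # pick_subsampled_open_reference_otus(...) runs here against refseqs_fp
    refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
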
Пример #50
0
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if (opts.checkpoint_fp):
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' %
                         bp_fp)

    # peek into sff.txt files to make sure they are parseable
    # cat_sff_files is lazy and only reads the header
    flowgrams, header = cat_sff_files(map(open, opts.sff_fps))

    if (opts.split and opts.preprocess_fp):
        parser.error('Options --split and --preprocess_fp are exclusive')

    if (opts.preprocess_fp):
        pp_fp = opts.preprocess_fp
        if not exists(opts.preprocess_fp):
            parser.error('Specified preprocess directory does not exist: %s' %
                         opts.preprocess_fp)
        if not files_exist(
                '%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' %
            (pp_fp, pp_fp)):
            parser.error(
                'Specified preprocess directory does not contain expected files: '
                + 'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' %
                     opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
        create_dir(tmpoutdir, not opts.force)
    else:
        # make random dir in current dir
        tmpoutdir = mkdtemp(dir="", prefix="denoiser_", suffix="/")

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(opts.sff_fps, opts.fasta_fp, tmpoutdir,
                           opts.cluster, opts.num_cpus, opts.squeeze,
                           opts.percent_id, opts.bail, opts.primer,
                           opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile,
                           opts.max_num_iter, opts.titanium)
    else:
        denoise_seqs(opts.sff_fps, opts.fasta_fp, tmpoutdir,
                     opts.preprocess_fp, opts.cluster, opts.num_cpus,
                     opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp,
                     opts.low_memory, opts.verbose, opts.error_profile,
                     opts.max_num_iter, opts.titanium, opts.checkpoint_fp)
Пример #51
0
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(('Run summary data',
                            log_fp,
                            _index_headers['run_summary']))
    else:
        close_logger_on_success = False


    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp),
                                                    abspath(
                                                        step2_input_fasta_fp),
                                                    percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

        index_links.append(
            ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
             merged_otu_map_fp,
             _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                            step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifiers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []


    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
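
For reference, the PyNAST-failure filtering step near the end of this workflow can be exercised on its own. The sketch below reuses the same helpers called above (load_table, get_seq_ids_from_fasta_file, filter_otus_from_otu_table, write_biom_table); the import locations and file names are assumptions, not part of the original example.

from numpy import inf
from biom import load_table
# Assumed QIIME 1.9 module locations for the helpers used above.
from qiime.util import write_biom_table
from qiime.filter import filter_otus_from_otu_table, get_seq_ids_from_fasta_file

# Hypothetical inputs: an OTU table and the FASTA of PyNAST alignment failures.
table = load_table('otu_table_mc2_w_tax.biom')
pynast_failures = get_seq_ids_from_fasta_file(
    open('pynast_aligned/rep_set_failures.fasta', 'U'))

# Keep OTUs of any abundance, but drop the observation ids listed in
# pynast_failures (negate_ids_to_keep=True removes them instead of keeping them).
filtered = filter_otus_from_otu_table(table, pynast_failures,
                                      0, inf, 0, inf, negate_ids_to_keep=True)
write_biom_table(filtered, 'otu_table_mc2_w_tax_no_pynast_failures.biom')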
Example #52
0
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query must be an exact match (no wildcards); it may contain spaces
        and is case insensitive.
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        Path to a tab-delimited file where each line is a different sequence
        and the first column is the sequence id; used to select the sequences
        of interest when no query is given.

    Raises
    ------
    BadParameter
        If the taxonomy file is empty or cannot be parsed.
        If the query retrieved no results.
        If the split_fp file is empty.
    """

    if query is not None:
        # query the taxonomy file for the required sequence identifiers
        try:
            interest_taxonomy = sequences_from_query(open(tax_fp, 'U'), query)
        except (PlatypusValueError, PlatypusParseError) as e:
            raise BadParameter(e.message)

        if len(interest_taxonomy) == 0:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        interest_taxonomy = {
            l.strip().split('\t')[0].strip(): ''
            for l in open(split_fp, 'U')
        }
        if not interest_taxonomy:
            raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    interest_fp = open(join(output_fp, 'interest.fna'), 'w')
    rest_fp = open(join(output_fp, 'rest.fna'), 'w')

    for record in read(seqs_fp, format='fasta'):
        full_name = record.id
        seq = record.sequence

        name = full_name.strip().split(' ')[0].strip()

        if name in interest_taxonomy:
            interest_fp.write(">%s\n%s\n" % (full_name, seq))
        else:
            rest_fp.write(">%s\n%s\n" % (full_name, seq))

    interest_fp.close()
    rest_fp.close()
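
A minimal usage sketch for split_db as defined above; the file names and query are hypothetical.

# Hypothetical inputs; split_db is the function defined above.
split_db(tax_fp='gg_taxonomy.txt',
         seqs_fp='gg_sequences.fna',
         query='salmonella',
         output_fp='salmonella-split',
         split_fp=None)
# salmonella-split/interest.fna now holds the records whose taxonomy matched
# the query, and salmonella-split/rest.fna holds everything else.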
Example #53
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field
    results = make_distance_boxplots(
        dm_f,
        map_f,
        fields,
        width=opts.width,
        height=opts.height,
        suppress_all_within=opts.suppress_all_within,
        suppress_all_between=opts.suppress_all_between,
        suppress_individual_within=opts.suppress_individual_within,
        suppress_individual_between=opts.suppress_individual_between,
        y_min=opts.y_min,
        y_max=opts.y_max,
        whisker_length=opts.whisker_length,
        box_width=opts.box_width,
        box_color=opts.box_color,
        color_individual_within_by_field=color_individual_within_by_field,
        sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir,
                              "%s_Distances.%s" % (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp,
                            format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w')
            sig_tests_results = all_pairs_t_test(
                plot_labels,
                plot_data,
                tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            raw_data_f = open(raw_data_fp, 'w')

            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
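
The same plots can also be produced without the option parser by calling make_distance_boxplots directly, as the main() above does. A minimal sketch, where the import path, input file names, and field name are assumptions; only the required arguments are passed, so every other keyword keeps its default.

# Assumed module path for the library function used by the script above.
from qiime.make_distance_boxplots import make_distance_boxplots

dm_f = open('unweighted_unifrac_dm.txt', 'U')  # hypothetical distance matrix
map_f = open('mapping.txt', 'U')               # hypothetical mapping file

results = make_distance_boxplots(dm_f, map_f, ['Treatment'])
for field, plot_figure, plot_data, plot_labels, plot_colors in results:
    plot_figure.savefig('%s_Distances.pdf' % field, format='pdf')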
Example #54
0
def main():

    spreadsheet_key = None

    with open('/data/input/AppSession.json', 'U') as fd_json:
        app = json.load(fd_json)

    # get command attributes, etc
    for item in app['Properties']['Items']:
        if item['Name'] == 'Input.Projects':
            project_id = item['Items'][0]['Id']
        if item['Name'] == 'Input.spreadsheet-key':
            spreadsheet_key = item['Content']
        if item['Name'] == 'Input.app-result-id':
            results_id = item['Content']['Id']
        if item['Name'] == 'Input.rarefaction-depth':
            depth = item['Content']
        if item['Name'] == 'Input.number-of-jobs':
            jobs = item['Content']

    # from BaseSpace's documentation
    input_dir = '/data/input/appresults/'
    base = join('/data/output/appresults/', project_id)

    create_dir(base)

    # OTU picking
    input_dir = join(input_dir, results_id)

    mapping_fp = join(base, 'mapping-file.txt')
    cmd = ("load_remote_mapping_file.py "
           "-k {spreadsheet_key} -o {mapping_fp}")
    params = {'spreadsheet_key': spreadsheet_key, 'mapping_fp': mapping_fp}

    system_call(cmd.format(**params))

    biom_fp = join(input_dir, 'otu_table.biom')
    tree_fp = glob(join(input_dir, '*.tree'))[0]
    output_dir = join(base, 'corediv-out')

    bt = load_table(biom_fp)

    if bt.is_empty():
        logging.error('BIOM table is empty, cannot perform diversity '
                      'analyses.')
        return 11

    params_fp = join(base, 'alpha-params.txt')
    with open(params_fp, 'w') as alpha_fp:
        alpha_fp.write('alpha_diversity:metrics shannon,PD_whole_tree,'
                       'chao1,observed_species')

    cmd = ("core_diversity_analyses.py "
           "-i {biom_fp} -o {output_dir} -m {mapping_fp} -e {depth} "
           "-t {tree_fp} -p {params_fp}")
    params = {
        'biom_fp': biom_fp,
        'output_dir': output_dir,
        'mapping_fp': mapping_fp,
        'depth': depth,
        'jobs': jobs,
        'tree_fp': tree_fp,
        'params_fp': params_fp
    }

    # see https://github.com/biocore/qiime/issues/2034
    if jobs != '1':
        cmd += ' -a -O {jobs}'

    system_call(cmd.format(**params))

    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    return 0
Example #55
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(
            open(opts.distance_matrix_fp, 'U'))
    except:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")
    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(
            open(opts.mapping_fp, 'U'))
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    comparison_field_states = opts.comparison_groups
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")
    comparison_field_states = map(strip, comparison_field_states.split(','))
    comparison_field_states = [
        field_state.strip('"').strip("'")
        for field_state in comparison_field_states
    ]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        # Compare numerically when both values parse as floats, otherwise
        # fall back to a lexical comparison.
        try:
            return cmp(float(x), float(y))
        except ValueError:
            return cmp(x, y)

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except:
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(
                comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [
            matplotlib_rgb_color(data_colors[color].toRGB())
            for color in data_color_order
        ]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = grouped_distributions(
        opts.plot_type,
        plot_data,
        x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors,
        x_label=plot_x_label,
        y_label=plot_y_label,
        title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min,
        y_max=y_max,
        whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width,
        figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(
        opts.output_dir,
        "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp,
                        format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), 'w')

        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' %
                                        (data_point_label, comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels,
            sig_tests_data,
            tail_type=opts.tail_type,
            num_permutations=opts.num_permutations)
        sig_tests_f.write(sig_tests_results)
        sig_tests_f.close()

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\
            "labels does not match the number of points along the x-axis."
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        raw_data_f = open(raw_data_fp, 'w')

        raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
        for label, data in zip(plot_x_axis_labels, plot_data):
            assert (len(comparison_field_states) == len(data)), "The " +\
                "number of specified comparison groups does not match " +\
                "the number of groups found at the current point along " +\
                "the x-axis."
            for comp_field_state, comp_grp_data in zip(comparison_field_states,
                                                       data):
                raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                 "\t".join(map(str, comp_grp_data)) + "\n")
        raw_data_f.close()
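
For completeness, a small sketch of reading the raw-data file written by the block above back into memory; the file name is hypothetical and the column layout follows the header written by the script.

# Parse a '<field>_Distance_Comparisons.txt' file into a dict keyed by
# (comparison group, field state); values are the lists of distances.
distances = {}
with open('Treatment_Distance_Comparisons.txt', 'U') as raw_f:
    for line in raw_f:
        if line.startswith('#') or not line.strip():
            continue
        cols = line.rstrip('\n').split('\t')
        comp_group, field_state = cols[0], cols[1]
        distances[(comp_group, field_state)] = map(float, cols[2:])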