Example #1
def generate_alpha_rarefaction_data_from_point_in_omega(
        biom_object, metrics, sequences, iterations, tree_object=None):
    """generate alpha rarefaction data from a biom table and mapping file

	Inputs:
	biom_object: OTU table to be rarefied and used to compute alpha diversity
	metrics: list of metrics, phylogenetic or non phylogenetic
	sequences: maximum number of sequences for the rarefaction plots
	iterations: number of repetitions per rarefaction
	tree_object: tree to perform the phylogenetic operations, default is None

	Output:
	alpha_rarefaction_data: dictionary where the keys are alpha diversity
	metrics and the values are tuples; in these tuples the first element is a
	list of column headers for an alpha diversity file, the second element is a
	list of row headers for an alpha diversity file and the third element is a
	list of lists containing the alpha diversity data computed at multiple
	rarefaction depths and as many iterations as specified.
	"""
    # the minimum rarefaction depth is the maximum depth divided by the
    # number of steps; float() guards against integer division in Python 2
    steps = 4
    min_depth = int(ceil(sequences / float(steps)))

    # get a rarefied biom with the proper identifiers
    rarefied_bioms_list = get_rarefactions(biom_object, min_depth, sequences,
                                           iterations, steps)

    alpha_rs = {}
    alpha_filenames = []
    # rarefy all the biom objects and get the alpha diversity values
    for rarefied_biom in rarefied_bioms_list:
        # this tag contains data about the iteration and the depth
        identifier = 'alpha_rare_%s_%s' % (str(
            rarefied_biom[0]), str(rarefied_biom[1]))
        alpha_values = single_object_alpha(rarefied_biom[2], metrics,
                                           tree_object)
        alpha_rs[identifier] = (rarefied_biom[0], rarefied_biom[1],
                                alpha_values.split('\n'))
        alpha_filenames.append(identifier)

    # use the rarefaction with the fewest sequences per sample as the reference
    ref_rare = single_object_alpha(rarefied_bioms_list[0][2], metrics,
                                   tree_object=tree_object).split('\n')
    all_metrics, all_samples, example_data = parse_matrix(ref_rare)

    # build a dictionary with the data for each of the metrics specified
    metrics_data = {}
    for metric in all_metrics:
        per_metric_data = []
        for filename in alpha_filenames:
            f_metrics, f_samples, f_data = parse_matrix(alpha_rs[filename][2])
            per_metric_data.append(make_output_row(
                f_metrics, metric, f_samples, f_data, filename,
                len(all_samples), all_samples))
        metrics_data[metric] = per_metric_data

    # now format the dictionary to make it compatible with make_averages
    alpha_rarefaction_data = _format_rarefactions(metrics_data, all_samples)

    return alpha_rarefaction_data
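A minimal usage sketch for the function above. The file names and the metric list are hypothetical, and the parser imports (parse_biom_table from biom-format 1.x, DndParser from PyCogent) are the ones QIIME-era code typically relies on; adjust them to your environment.

from biom.parse import parse_biom_table
from cogent.parse.tree import DndParser

# hypothetical inputs: any rarefiable OTU table with a matching tree
biom_table = parse_biom_table(open('otu_table.biom', 'U'))
tree = DndParser(open('rep_set.tre', 'U').read())

data = generate_alpha_rarefaction_data_from_point_in_omega(
    biom_table, ['chao1', 'PD_whole_tree'], sequences=1000, iterations=10,
    tree_object=tree)

# each value is (column headers, row headers, rows of alpha diversity data)
for metric, (col_headers, row_headers, rows) in data.items():
    print metric, len(rows)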
Example #2
def generate_alpha_rarefaction_data_from_point_in_omega(biom_object, metrics,
													sequences, iterations,
													tree_object=None):
	"""generate alpha rarefaction data from a biom table and mapping file

	Inputs:
	biom_object: OTU table to be rarefied and used to compute alpha diversity
	metrics: list of metrics, phylogenetic or non phylogenetic
	sequences: maximum number of sequences for the rarefaction plots
	iterations: number of repetitions per rarefaction
	tree_object: tree to perform the phylogenetic operations, default is None

	Output:
	alpha_rarefaction_data: dictionary where the keys are alpha diversity
	metrics and the values are tuples; in these tuples the first element is a
	list of column headers for an alpha diversity file, the second element is a
	list of row headers for an alpha diversity file and the third element is a
	list of lists containing the alpha diversity data computed at multiple
	rarefaction depths and as many iterations as specified.
	"""
	# the minimum rarefaction depth is the maximum depth divided by the
	# number of steps; float() guards against integer division in Python 2
	steps = 4
	min_depth = int(ceil(sequences / float(steps)))

	# get a rarefied biom with the proper identifiers
	rarefied_bioms_list = get_rarefactions(biom_object, min_depth, sequences,\
		iterations, steps)

	alpha_rs = {}
	alpha_filenames = []
	# rarefy all the biom objects and get the alpha diversity values
	for rarefied_biom in rarefied_bioms_list:
		# this tag contains data about the iteration and the depth
		identifier = 'alpha_rare_%s_%s' % (str(rarefied_biom[0]), str(rarefied_biom[1]))
		alpha_values = single_object_alpha(rarefied_biom[2], metrics, tree_object)
		alpha_rs[identifier] = (rarefied_biom[0], rarefied_biom[1], alpha_values.split('\n'))
		alpha_filenames.append(identifier)

	# use the rarefaction with the fewest sequences per sample as the reference
	ref_rare = single_object_alpha(rarefied_bioms_list[0][2], metrics,\
		tree_object=tree_object).split('\n')
	all_metrics, all_samples, example_data = parse_matrix(ref_rare)

	# build a dictionary with the data for each of the metrics specified
	metrics_data = {}
	for metric in all_metrics:
		per_metric_data = []
		for filename in alpha_filenames:
			f_metrics, f_samples, f_data = parse_matrix(alpha_rs[filename][2])
			per_metric_data.append(make_output_row(f_metrics, metric,\
				f_samples, f_data, filename, len(all_samples), all_samples))
		metrics_data[metric] = per_metric_data

	# now format the dictionary to make it compatible with make_averages
	alpha_rarefaction_data = _format_rarefactions(metrics_data, all_samples)

	return alpha_rarefaction_data
Example #3
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(args) != 0:
        parser.error("Positional argument detected.  make sure all" +
                     ' parameters are identified.' +
                     '\ne.g.: include the \"-m\" in \"-m MINIMUM_LENGTH\"')

    input_dir = opts.input_path
    output_dir = opts.output_path
    example_filepath = opts.example_path

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file_names = os.listdir(input_dir)
    file_names = [fname for fname in file_names if not fname.startswith('.')]

    if example_filepath is None:
        # table row is base_name, seqs_per_sam, iters, ext
        file_name_table = map(parse_rarefaction_fname, file_names)
        # sort on seqs/sam
        sorted_fname_table = sorted(
            file_name_table,
            key=operator.itemgetter(1))
        # now map back to file name
        example_fname = file_names[
            file_name_table.index(sorted_fname_table[0])]
        example_filepath = os.path.join(input_dir, example_fname)
    f = open(example_filepath, 'U')
    all_metrics, all_samples, example_data = parse_matrix(f)
    num_cols = len(all_samples)
    f.close()

    # make the table 1 row at a time
    # we're building a rarefaction by sample mtx from
    # a sample by metric matrix
    # each metric is one output file
    for metric in all_metrics:
        metric_file_data = []
        for fname in file_names:
            # f_ here refers to the input file currently being processed
            # to distinguish from the output file we're building
            f = open(os.path.join(input_dir, fname), 'U')
            f_metrics, f_samples, f_data = parse_matrix(f)
            f.close()
            metric_file_data.append(
                make_output_row(f_metrics, metric, f_samples,
                                f_data, fname, num_cols, all_samples))

        write_output_file(metric_file_data, output_dir, metric, all_samples)
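The example-file selection above picks the file with the fewest sequences per sample by sorting the parsed filename tuples on their second field. A self-contained sketch of that pattern, with a stand-in parser since parse_rarefaction_fname (from QIIME's parse module) is not shown here:

import operator

# stand-in for parse_rarefaction_fname: returns
# (base_name, seqs_per_sam, iteration, extension)
def parse_fname(fname):
    base, depth, iteration = fname.rsplit('.', 1)[0].rsplit('_', 2)
    return base, int(depth), int(iteration), '.' + fname.rsplit('.', 1)[1]

file_names = ['alpha_rarefaction_400_0.txt', 'alpha_rarefaction_100_2.txt']
file_name_table = map(parse_fname, file_names)  # a list in Python 2
smallest = sorted(file_name_table, key=operator.itemgetter(1))[0]
example_fname = file_names[file_name_table.index(smallest)]
# example_fname == 'alpha_rarefaction_100_2.txt'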
Example #4
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth

    # make sure the number of bins is an integer
    try:
        number_of_bins = int(opts.number_of_bins)
    except ValueError:
        raise ValueError('The number of bins must be an integer, not %s'
                         % opts.number_of_bins)

    # a rarefaction depth is only given when the input is collated alpha
    # diversity data
    if depth is not None:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict, depth)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error('A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(open(
                alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(
            metrics, alpha_sample_ids, alpha_data, mapping_file_headers,
            mapping_file_data, number_of_bins, binning_method,
            missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
Example #5
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    alpha_fps = opts.alpha_fps
    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    binning_method = opts.binning_method
    missing_value_name = opts.missing_value_name
    depth = opts.depth
    number_of_bins = opts.number_of_bins
    collated_input = opts.collated_input

    # if using collated data, make sure they specify a depth
    if collated_input:
        alpha_dict = {}

        # build up a dictionary with the filenames as keys and lines as values
        for single_alpha_fp in alpha_fps:
            alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
                single_alpha_fp, 'U').readlines()

        # format the collated data
        try:
            metrics, alpha_sample_ids, alpha_data = mean_alpha(
                alpha_dict, depth)
        except ValueError as e:  # see mean_alpha for the possible exceptions
            option_parser.error(e.message)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                'A comma-separated list of files should only be'
                ' passed with the --alpha_fps option when using collated alpha '
                'diversity data and also selecting a rarefaction depth with the'
                ' --depth option.')
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(
                open(alpha_fps[0], 'U'))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
        open(mapping_fp, 'U'))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = \
        add_alpha_diversity_values_to_mapping_file(
            metrics, alpha_sample_ids, alpha_data, mapping_file_headers,
            mapping_file_data, number_of_bins, binning_method,
            missing_value_name)

    # format the new data and write it down
    lines = format_mapping_file(out_mapping_file_headers,
                                out_mapping_file_data)
    fd_out = open(output_mapping_fp, 'w')
    fd_out.writelines(lines)
    fd_out.close()
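The dictionary keys built in the collated branch above are just the file names with directory and extension stripped; the same idiom in isolation (hypothetical paths):

from os.path import basename, splitext

alpha_fps = ['collated_alpha/PD_whole_tree.txt', 'collated_alpha/chao1.txt']
keys = [splitext(basename(fp))[0] for fp in alpha_fps]
# keys == ['PD_whole_tree', 'chao1'], one entry per collated metric file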
Example #6
    def single_object_beta(self,
                           otu_table,
                           metric,
                           tree_string,
                           missing_sams=None):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []

        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings(
            'ignore', 'dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'unifrac had no information for\
 sample M*')

        # self.files_to_remove.extend([input_path,tree_path])
        # self.folders_to_remove.append(output_dir)
        # os.mkdir(output_dir+'/ft/')

        for metric in metrics:
            # note: this loop shadows the `metric` parameter; every known
            # metric is exercised below
            beta_out = single_object_beta(otu_table,
                                          metric,
                                          tree_string,
                                          rowids=None,
                                          full_tree=False)

            sams, dmtx = parse_distmat(beta_out)

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]
                # row_outname = output_dir + '/' + metric + '_' +\
                # in_fname
                r_out = single_object_beta(otu_table,
                                           metric,
                                           tree_string,
                                           rowids=rows,
                                           full_tree=False)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # full tree run:
            if 'full_tree' in str(metric).lower():
                continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]

                #~ row_outname = output_dir + '/ft/' + metric + '_' +\
                #~ in_fname
                r_out = single_object_beta(otu_table,
                                           metric,
                                           tree_string,
                                           rowids=None,
                                           full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # do it with full tree
            r_out = single_object_beta(otu_table,
                                       metric,
                                       tree_string,
                                       rowids=None,
                                       full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(r_out)
            self.assertEqual(sams_ft, sams)
            npt.assert_almost_equal(dmtx_ft, dmtx)
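Every assertion loop above checks the same invariant: a per-row run must reproduce the corresponding rows of the full distance matrix. The indexing idea in isolation, with a tiny hand-made matrix (plain numpy, no QIIME required):

import numpy as np
import numpy.testing as npt

sams = ['s1', 's2', 's3']
dmtx = np.array([[0.0, 0.3, 0.5],
                 [0.3, 0.0, 0.2],
                 [0.5, 0.2, 0.0]])

# a --rows run for 's2' should reproduce row 1 of the full matrix
row_sams, col_sams = ['s2'], sams
row_dmtx = dmtx[[sams.index(s) for s in row_sams], :]

for j in range(len(row_sams)):
    for k in range(len(col_sams)):
        npt.assert_almost_equal(
            row_dmtx[j, k],
            dmtx[sams.index(row_sams[j]), sams.index(col_sams[k])])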
Example #7
    def single_file_beta(self,
                         otu_table_string,
                         tree_string,
                         missing_sams=None,
                         use_metric_list=False):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []
        # setup
        fd, input_path = mkstemp(suffix='.txt')
        os.close(fd)
        in_fname = os.path.split(input_path)[1]
        f = open(input_path, 'w')
        f.write(otu_table_string)
        f.close()
        fd, tree_path = mkstemp(suffix='.tre')
        os.close(fd)
        f = open(tree_path, 'w')
        f.write(tree_string)
        f.close()
        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())
        output_dir = mkdtemp()

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings(
            'ignore', 'dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings(
            'ignore', 'dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'unifrac had no information for\
 sample M*')

        self.files_to_remove.extend([input_path, tree_path])
        self.folders_to_remove.append(output_dir)
        os.mkdir(output_dir + '/ft/')

        for metric in metrics:
            # do it
            if use_metric_list:
                single_file_beta(input_path, [metric],
                                 tree_path,
                                 output_dir,
                                 rowids=None)
            else:
                single_file_beta(input_path,
                                 metric,
                                 tree_path,
                                 output_dir,
                                 rowids=None)
            sams, dmtx = parse_distmat(
                open(output_dir + '/' + metric + '_' + in_fname))

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]
                row_outname = output_dir + '/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric],
                                     tree_path,
                                     output_dir,
                                     rowids=rows)
                else:
                    single_file_beta(input_path,
                                     metric,
                                     tree_path,
                                     output_dir,
                                     rowids=rows)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # full tree run:
            if 'full_tree' in str(metric).lower():
                continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]

                row_outname = output_dir + '/ft/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric],
                                     tree_path,
                                     output_dir + '/ft/',
                                     rowids=rows,
                                     full_tree=True)
                else:
                    single_file_beta(input_path,
                                     metric,
                                     tree_path,
                                     output_dir + '/ft/',
                                     rowids=rows,
                                     full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape,
                                 (len(rows.split(',')), len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        npt.assert_almost_equal(row_v1, full_v1)

            # do it with full tree
            if use_metric_list:
                single_file_beta(input_path, [metric],
                                 tree_path,
                                 output_dir + '/ft/',
                                 rowids=None,
                                 full_tree=True)
            else:
                single_file_beta(input_path,
                                 metric,
                                 tree_path,
                                 output_dir + '/ft/',
                                 rowids=None,
                                 full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(
                open(output_dir + '/ft/' + metric + '_' + in_fname))
            self.assertEqual(sams_ft, sams)
            npt.assert_almost_equal(dmtx_ft, dmtx)
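The fixture setup above follows the standard mkstemp pattern: close the OS-level handle immediately, then reopen the path by name to write the test data. A minimal standalone sketch of that pattern:

import os
from tempfile import mkstemp

def write_tmp(contents, suffix):
    # mkstemp returns an already-open low-level handle plus a path;
    # close the handle and reopen by name to write the fixture data
    fd, path = mkstemp(suffix=suffix)
    os.close(fd)
    f = open(path, 'w')
    f.write(contents)
    f.close()
    return path

input_path = write_tmp('#OTU table ...', '.txt')
tree_path = write_tmp('(a:0.1,b:0.2);', '.tre')
# ... exercise the code under test, then clean up:
for p in (input_path, tree_path):
    os.remove(p)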
Example #8
    def single_object_beta(self, otu_table, metric, tree_string,
                           missing_sams=None):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []
        # setup
        #input_path = get_tmp_filename()
        #in_fname = os.path.split(input_path)[1]
        #f = open(input_path,'w')
        #f.write(otu_table_string)
        #f.close()
        #tree_path = get_tmp_filename()
        #f = open(tree_path,'w')
        #f.write(tree_string)
        #f.close()
        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())
        #output_dir = get_tmp_filename(suffix = '')
        #os.mkdir(output_dir)

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings('ignore','dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore','dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')  
        warnings.filterwarnings('ignore','dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')     
        warnings.filterwarnings('ignore','dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')  
        warnings.filterwarnings('ignore','unifrac had no information for\
 sample M*')

        #self.files_to_remove.extend([input_path,tree_path])
        #self.folders_to_remove.append(output_dir)
        #os.mkdir(output_dir+'/ft/')

        for metric in metrics:
            # do it
            beta_out = single_object_beta(otu_table, metric,
                                          tree_string, rowids=None,
                                          full_tree=False)

            sams, dmtx = parse_distmat(beta_out)

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams: continue
                rows = sams[i]
                #row_outname = output_dir + '/' + metric + '_' +\
                    #in_fname
                r_out = single_object_beta(otu_table, metric,
                                           tree_string, rowids=rows,
                                           full_tree=False)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                    len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j,k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                sams.index(col_sams[k])]
                        self.assertFloatEqual(row_v1, full_v1)


            ### full tree run:
            if 'full_tree' in str(metric).lower(): continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams: continue
                rows = sams[i]
                
                #~ row_outname = output_dir + '/ft/' + metric + '_' +\
                    #~ in_fname
                r_out = single_object_beta(otu_table, metric,
                                           tree_string, rowids=None,
                                           full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(r_out)

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                    len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j,k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                sams.index(col_sams[k])]
                        self.assertFloatEqual(row_v1, full_v1)

            # do it with full tree
            r_out = single_object_beta(otu_table, metric,
                                       tree_string, rowids=None,
                                       full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(r_out)
            self.assertEqual(sams_ft, sams)
            self.assertFloatEqual(dmtx_ft, dmtx)
Example #9
        try:
            metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict, depth)
        except ValueError, e:  # see mean_alpha for the possible exceptions
            option_parser.error(e.message)

    # when not using collated data, the user can only specify one input file
    else:
        if len(alpha_fps) > 1:
            option_parser.error(
                "A comma-separated list of files should only be"
                " passed with the --alpha_fps option when using collated alpha "
                "diversity data and also selecting a rarefaction depth with the"
                " --depth option."
            )
        else:
            metrics, alpha_sample_ids, alpha_data = parse_matrix(open(alpha_fps[0], "U"))

    # parse the data from the files
    mapping_file_data, mapping_file_headers, comments = parse_mapping_file(open(mapping_fp, "U"))

    # add the alpha diversity data to the mapping file
    out_mapping_file_data, out_mapping_file_headers = add_alpha_diversity_values_to_mapping_file(
        metrics,
        alpha_sample_ids,
        alpha_data,
        mapping_file_headers,
        mapping_file_data,
        number_of_bins,
        binning_method,
        missing_value_name,
    )
Example #10
    def single_file_beta(
            self, otu_table_string, tree_string, missing_sams=None,
            use_metric_list=False):
        """ running single_file_beta should give same result using --rows"""
        if missing_sams is None:
            missing_sams = []
        # setup
        fd, input_path = mkstemp(suffix='.txt')
        close(fd)
        in_fname = os.path.split(input_path)[1]
        f = open(input_path, 'w')
        f.write(otu_table_string)
        f.close()
        fd, tree_path = mkstemp(suffix='.tre')
        close(fd)
        f = open(tree_path, 'w')
        f.write(tree_string)
        f.close()
        metrics = list_known_nonphylogenetic_metrics()
        metrics.extend(list_known_phylogenetic_metrics())
        output_dir = mkdtemp()

        # new metrics that don't trivially parallelize must be dealt with
        # carefully
        warnings.filterwarnings('ignore', 'dissimilarity binary_dist_chisq is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'dissimilarity dist_chisq is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'dissimilarity dist_gower is not\
 parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'dissimilarity dist_hellinger is\
 not parallelized, calculating the whole matrix...')
        warnings.filterwarnings('ignore', 'unifrac had no information for\
 sample M*')

        self.files_to_remove.extend([input_path, tree_path])
        self.folders_to_remove.append(output_dir)
        os.mkdir(output_dir + '/ft/')

        for metric in metrics:
            # do it
            if use_metric_list:
                single_file_beta(input_path, [metric], tree_path, output_dir,
                                 rowids=None)
            else:
                single_file_beta(input_path, metric, tree_path, output_dir,
                                 rowids=None)
            sams, dmtx = parse_distmat(open(output_dir + '/' +
                                            metric + '_' + in_fname))

            # do it by rows
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]
                row_outname = output_dir + '/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric], tree_path,
                                     output_dir, rowids=rows)
                else:
                    single_file_beta(input_path, metric, tree_path, output_dir,
                                     rowids=rows)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                                                  len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        assert_almost_equal(row_v1, full_v1)

            # full tree run:
            if 'full_tree' in str(metric).lower():
                continue
            # do it by rows with full tree
            for i in range(len(sams)):
                if sams[i] in missing_sams:
                    continue
                rows = sams[i]

                row_outname = output_dir + '/ft/' + metric + '_' +\
                    in_fname
                if use_metric_list:
                    single_file_beta(input_path, [metric], tree_path,
                                     output_dir + '/ft/', rowids=rows, full_tree=True)
                else:
                    single_file_beta(input_path, metric, tree_path,
                                     output_dir + '/ft/', rowids=rows, full_tree=True)
                col_sams, row_sams, row_dmtx = parse_matrix(open(row_outname))

                self.assertEqual(row_dmtx.shape, (len(rows.split(',')),
                                                  len(sams)))

                # make sure rows same as full
                for j in range(len(rows.split(','))):
                    for k in range(len(sams)):
                        row_v1 = row_dmtx[j, k]
                        full_v1 =\
                            dmtx[sams.index(row_sams[j]),
                                 sams.index(col_sams[k])]
                        assert_almost_equal(row_v1, full_v1)

            # do it with full tree
            if use_metric_list:
                single_file_beta(input_path, [metric], tree_path,
                                 output_dir + '/ft/', rowids=None, full_tree=True)
            else:
                single_file_beta(input_path, metric, tree_path,
                                 output_dir + '/ft/', rowids=None, full_tree=True)
            sams_ft, dmtx_ft = parse_distmat(open(output_dir + '/ft/' +
                                                  metric + '_' + in_fname))
            self.assertEqual(sams_ft, sams)
            assert_almost_equal(dmtx_ft, dmtx)