示例#1
0
def write_output_file(metric_file_data, output_dir, metric, all_samples):
    """Write one metric's collated table to <output_dir>/<metric>.txt.

    metric_file_data: list of rows. Element 0 of each row is the row name;
        the remaining elements line up with the column header built below
        ('sequences per sample', 'iteration', then one per sample).
    output_dir: directory the output file is written into.
    metric: metric name, used as the base name of the output file.
    all_samples: list of sample ids, used as the trailing column names.

    The caller's rows are not modified: the name column is split off into
    new lists instead of being popped from the rows in place.
    """
    # Order rows by the values at index 1 and 2 (the first two data
    # columns); the key is evaluated with the row name still at index 0.
    sorted_rows = sorted(metric_file_data, key=operator.itemgetter(1, 2))
    row_names = [row[0] for row in sorted_rows]
    # Numpy shows weird behaviour when converting this data to an array
    # (it truncates some values), so stay with a straight list of lists;
    # format_matrix() takes 2d lists as well as arrays.
    data_rows = [row[1:] for row in sorted_rows]
    col_names = ['sequences per sample', 'iteration'] + all_samples
    out_str = format_matrix(data_rows, row_names, col_names)
    # The context manager closes the file even if the write fails.
    with open(os.path.join(output_dir, metric + '.txt'), 'w') as f:
        f.write(out_str)
示例#2
0
def write_output_file(metric_file_data, output_dir, metric, all_samples):
    """Write one metric's collated table to <output_dir>/<metric>.txt.

    metric_file_data: list of rows. Element 0 of each row is the row name;
        the remaining elements line up with the column header built below
        ('sequences per sample', 'iteration', then one per sample).
    output_dir: directory the output file is written into.
    metric: metric name, used as the base name of the output file.
    all_samples: list of sample ids, used as the trailing column names.

    The caller's rows are not modified: the name column is split off into
    new lists instead of being popped from the rows in place.
    """
    # Order rows by the values at index 1 and 2 (the first two data
    # columns); the key is evaluated with the row name still at index 0.
    sorted_rows = sorted(metric_file_data, key=operator.itemgetter(1, 2))
    row_names = [row[0] for row in sorted_rows]
    # Numpy shows weird behaviour when converting this data to an array
    # (it truncates some values), so stay with a straight list of lists;
    # format_matrix() takes 2d lists as well as arrays.
    data_rows = [row[1:] for row in sorted_rows]
    col_names = ['sequences per sample', 'iteration'] + all_samples
    out_str = format_matrix(data_rows, row_names, col_names)
    # The context manager closes the file even if the write fails.
    with open(os.path.join(output_dir, metric + '.txt'), 'w') as f:
        f.write(out_str)
示例#3
0
    def test_format_matrix(self):
        """format_matrix should return tab-delimited mat"""
        a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        row_labels = ["a", "b", "c"]
        col_labels = [11, 22, 33]
        expected = "\t11\t22\t33\na\t1\t2\t3\nb\t4\t5\t6\nc\t7\t8\t9"

        # test as list
        res = format_matrix(a, row_labels, col_labels)
        self.assertEqual(res, expected)
        # too few row labels, or no data at all, must raise ValueError
        self.assertRaises(ValueError, format_matrix, a, row_labels[:2], col_labels)
        self.assertRaises(ValueError, format_matrix, None, row_labels, col_labels)

        # test as array: call format_matrix again so the array input is
        # really formatted, rather than re-checking the list-based result
        a = array(a)
        res = format_matrix(a, row_labels, col_labels)
        self.assertEqual(res, expected)
        self.assertRaises(ValueError, format_matrix, a, row_labels[:2], col_labels)
        self.assertRaises(ValueError, format_matrix, None, row_labels, col_labels)
示例#4
0
    def test_format_matrix(self):
        """format_matrix should return tab-delimited mat"""
        a = [[1,2,3], [4,5,6], [7,8,9]]
        row_labels = ['a','b','c']
        col_labels = [11,22,33]
        expected = '\t11\t22\t33\na\t1\t2\t3\nb\t4\t5\t6\nc\t7\t8\t9'

        # test as list
        res = format_matrix(a, row_labels, col_labels)
        self.assertEqual(res, expected)
        # too few row labels, or no data at all, must raise ValueError
        self.assertRaises(ValueError, format_matrix, a, row_labels[:2], col_labels)
        self.assertRaises(ValueError, format_matrix, None, row_labels, col_labels)

        # test as array: call format_matrix again so the array input is
        # really formatted, rather than re-checking the list-based result
        a = array(a)
        res = format_matrix(a, row_labels, col_labels)
        self.assertEqual(res, expected)
        self.assertRaises(ValueError, format_matrix, a, row_labels[:2], col_labels)
        self.assertRaises(ValueError, format_matrix, None, row_labels, col_labels)
示例#5
0
def single_object_beta(otu_table, metrics, tr, rowids=None, full_tree=False):
    """Compute beta diversity for an in-memory otu table; return the text.

    Variant of single_file_beta that receives the table and the tree as
    objects and returns the formatted result instead of writing a file.
    Assumes the tree is already trimmed to the otus present in otu_table;
    getSubTree() is not called here.

    inputs:
        otu_table -- an otu table object (biom format)
        metrics -- metric name(s) as a str, comma delimited if more than one
        tr -- tree object passed to phylogenetic metrics; a falsy value is
              treated as "no tree"
        rowids -- None, or a comma separated str of sample ids
        full_tree -- when True, metrics are called with make_subtree=False

    returns:
        rowids is None -- the format_distance_matrix() output split on
            newlines
        otherwise -- the format_matrix() output with one row per rowid

    NOTE: the function returns from inside the loop over the metrics, so
    only the first entry of `metrics` ever produces a result.

    If a metric is unknown, or needs a tree and none was given, a message
    is written to stderr and exit(1) is called.
    """
    # rows of otumtx are addressed by sample position further down
    if not isinstance(otu_table, DenseTable):
        otumtx = asarray(list(otu_table.iterSampleData()))
    else:
        otumtx = otu_table._data.T

    tree = tr if tr else None

    # metrics for which the whole matrix is computed, even for single rows
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics.split(','):
        # try the non-phylogenetic lookup first, then the phylogenetic one
        try:
            metric_f = get_nonphylogenetic_metric(metric)
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)
        else:
            is_phylogenetic = False

        if rowids is not None:
            # only calc d(rowid, *) for each requested rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                if metric_f.__name__ in whole_matrix_metrics:
                    # not computable pair by pair: take one row of the
                    # complete matrix
                    warnings.warn(
                        'dissimilarity ' + metric_f.__name__ +
                        ' is not parallelized, calculating the whole matrix...'
                    )
                    row_dissims.append(metric_f(otumtx)[rowidx])
                    continue

                try:
                    row_metric = get_phylogenetic_row_metric(metric)
                except AttributeError:
                    # no row-wise implementation: go element by element,
                    # computing a 2x2 matrix per pair and keeping [0, 1]
                    dissims = []
                    for i, other_id in enumerate(otu_table.SampleIds):
                        pair_mtx = otumtx[[rowidx, i], :]
                        if is_phylogenetic:
                            pair_dm = metric_f(
                                pair_mtx,
                                otu_table.ObservationIds,
                                tree,
                                [otu_table.SampleIds[rowidx], other_id],
                                make_subtree=(not full_tree))
                        else:
                            pair_dm = metric_f(pair_mtx)
                        dissims.append(pair_dm[0, 1])
                    row_dissims.append(dissims)
                else:
                    # do the whole row at once
                    row_dissims.append(
                        row_metric(otumtx,
                                   otu_table.ObservationIds,
                                   tree,
                                   otu_table.SampleIds,
                                   rowid,
                                   make_subtree=(not full_tree)))

            return format_matrix(row_dissims, rowids_list, otu_table.SampleIds)

        # standard, full way
        if is_phylogenetic:
            dissims = metric_f(otumtx,
                               otu_table.ObservationIds,
                               tree,
                               otu_table.SampleIds,
                               make_subtree=(not full_tree))
        else:
            dissims = metric_f(otumtx)

        formatted = format_distance_matrix(otu_table.SampleIds, dissims)
        return formatted.split('\n')
示例#6
0
def single_file_beta(input_path,
                     metrics,
                     tree_path,
                     output_dir,
                     rowids=None,
                     full_tree=False):
    """Compute beta diversity for one otu table file and write the results.

    One output file is written per metric, named
    <metric>_<input file name without extension>.txt inside output_dir.
    Assumes the input tree is already trimmed to contain only otus present
    in the otu table; getSubTree() is not called here.

    inputs:
     input_path (str) -- path of the biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str) -- newick tree path; a falsy value means no tree
     output_dir (str) -- directory receiving the output files
     rowids (comma separated str) -- when given, only the distances from
        each listed sample to all samples are computed and written
     full_tree (bool) -- when True, phylogenetic metrics are called with
        make_subtree=False

    If a metric is unknown, or needs a tree and none was given, a message
    is written to stderr and exit(1) is called.
    """
    # metrics may already be a list, in which case it has no split()
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # Context managers release the input handles even when parsing fails;
    # the handles used to be left open.
    # NOTE(review): relies on parse_biom_table reading the handle fully
    # before returning -- confirm against the biom-format version in use.
    with open(input_path, 'U') as otu_table_f:
        otu_table = parse_biom_table(otu_table_f)

    # rows of otumtx are addressed by sample position further down
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    # metrics for which the whole matrix is computed, even for single rows
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    input_basename = os.path.splitext(os.path.split(input_path)[1])[0]
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        # try the non-phylogenetic lookup first, then the phylogenetic one
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx,
                                   otu_table.ObservationIds,
                                   tree,
                                   otu_table.SampleIds,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as f:
                f.write(format_distance_matrix(otu_table.SampleIds, dissims))
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                # first test if the dissim is a fn of only the pair;
                # if not, just calc the whole matrix
                if metric_f.__name__ in whole_matrix_metrics:
                    warnings.warn(
                        'dissimilarity ' + metric_f.__name__ +
                        ' is not parallelized, calculating the whole matrix...'
                    )
                    row_dissims.append(metric_f(otumtx)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element: a 2x2 matrix per pair,
                        # of which entry [0, 1] is kept
                        dissims = []
                        for i in range(len(otu_table.SampleIds)):
                            if is_phylogenetic:
                                dissim = metric_f(
                                    otumtx[[rowidx, i], :],
                                    otu_table.ObservationIds,
                                    tree, [
                                        otu_table.SampleIds[rowidx],
                                        otu_table.SampleIds[i]
                                    ],
                                    make_subtree=(not full_tree))[0, 1]
                            else:
                                dissim = metric_f(otumtx[[rowidx, i], :])[0, 1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx,
                                             otu_table.ObservationIds,
                                             tree,
                                             otu_table.SampleIds,
                                             rowid,
                                             make_subtree=(not full_tree))
                        row_dissims.append(dissims)

            with open(outfilepath, 'w') as f:
                f.write(
                    format_matrix(row_dissims, rowids_list,
                                  otu_table.SampleIds))
示例#7
0
def single_object_beta(otu_table, metrics, tr, rowids=None,
    full_tree=False):
    """mod of single_file_beta to receive and return otu obj, tree str

    Returns the formatted result instead of writing it to a file.
    Assumes the input tree is already trimmed to contain only otus present
    in otu_table; getSubTree() is not called here.

    inputs:
        otu_table -- an otu_table in the biom format
        metrics -- metrics (str, comma delimited if more than 1 metric)
        tr -- a phylonode cogent tree object if needed by the chosen beta
              diversity metric; a falsy value is treated as "no tree"
        rowids -- None, or a comma separated string of sample ids
        full_tree -- when True, metrics are called with make_subtree=False

    returns:
        rowids is None -- the format_distance_matrix() output split on
            newlines
        otherwise -- the format_matrix() output with one row per rowid

    NOTE: the function returns from inside the loop over the metrics, so
    only the first entry of `metrics` ever produces a result.

    If a metric is unknown, or needs a tree and none was given, a message
    is written to stderr and exit(1) is called.
    """
    # rows of otumtx are addressed by sample position further down
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tr:
        tree = tr
    else:
        tree = None

    metrics_list = metrics.split(',')

    for metric in metrics_list:
        # try the non-phylogenetic lookup first, then the phylogenetic one
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: '== None' would go through the tree
                # object's own comparison methods
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                    otu_table.SampleIds, make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)

            return format_distance_matrix(otu_table.SampleIds, dissims).split('\n')
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                # first test if the dissim is a fn of only the pair;
                # if not, just calc the whole matrix
                if metric_f.__name__ == 'dist_chisq' or \
                    metric_f.__name__ == 'dist_gower' or \
                    metric_f.__name__ == 'dist_hellinger' or\
                    metric_f.__name__ == 'binary_dist_chisq':
                    warnings.warn('dissimilarity '+metric_f.__name__+\
                      ' is not parallelized, calculating the whole matrix...')
                    row_dissims.append(metric_f(otumtx)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element: a 2x2 matrix per pair,
                        # of which entry [0, 1] is kept
                        dissims = []
                        for i in range(len(otu_table.SampleIds)):
                            if is_phylogenetic:
                                dissim = metric_f(otumtx[[rowidx,i],:],
                                    otu_table.ObservationIds, tree,
                                    [otu_table.SampleIds[rowidx],
                                    otu_table.SampleIds[i]],
                                    make_subtree=(not full_tree))[0,1]
                            else:
                                dissim = metric_f(otumtx[[rowidx,i],:])[0,1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx,
                                    otu_table.ObservationIds, tree,
                                    otu_table.SampleIds, rowid,
                                    make_subtree=(not full_tree))
                        row_dissims.append(dissims)

            return format_matrix(row_dissims,rowids_list,otu_table.SampleIds)
示例#8
0
def single_file_beta(input_path, metrics, tree_path, output_dir,
    rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    One output file is written per metric, named
    <metric>_<input file name without extension>.txt inside output_dir.
    Assumes the input tree is already trimmed to contain only otus present
    in the otu table; getSubTree() is not called here.

    inputs:
     input_path (str) -- path of the biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str) -- newick tree path; a falsy value means no tree
     output_dir (str) -- directory receiving the output files
     rowids (comma separated str) -- when given, only the distances from
        each listed sample to all samples are computed and written
     full_tree (bool) -- when True, phylogenetic metrics are called with
        make_subtree=False

    If a metric is unknown, or needs a tree and none was given, a message
    is written to stderr and exit(1) is called.
    """
    # metrics may already be a list, in which case it has no split()
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # Context managers release the input handles even when parsing fails;
    # the handles used to be left open.
    # NOTE(review): relies on parse_biom_table reading the handle fully
    # before returning -- confirm against the biom-format version in use.
    with open(input_path, 'U') as otu_table_f:
        otu_table = parse_biom_table(otu_table_f)

    # rows of otumtx are addressed by sample position further down
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    input_basename = os.path.splitext(os.path.split(input_path)[1])[0]
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        # try the non-phylogenetic lookup first, then the phylogenetic one
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: '== None' would go through the tree
                # object's own comparison methods
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds,
                    tree, otu_table.SampleIds, make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as f:
                f.write(format_distance_matrix(otu_table.SampleIds, dissims))
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = otu_table.SampleIds.index(rowid)

                # first test if the dissim is a fn of only the pair;
                # if not, just calc the whole matrix
                if metric_f.__name__ == 'dist_chisq' or \
                    metric_f.__name__ == 'dist_gower' or \
                    metric_f.__name__ == 'dist_hellinger' or\
                    metric_f.__name__ == 'binary_dist_chisq':
                    warnings.warn('dissimilarity '+metric_f.__name__+\
                      ' is not parallelized, calculating the whole matrix...')
                    row_dissims.append(metric_f(otumtx)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element: a 2x2 matrix per pair,
                        # of which entry [0, 1] is kept
                        dissims = []
                        for i in range(len(otu_table.SampleIds)):
                            if is_phylogenetic:
                                dissim = metric_f(otumtx[[rowidx,i],:],
                                    otu_table.ObservationIds, tree,
                                    [otu_table.SampleIds[rowidx],
                                    otu_table.SampleIds[i]],
                                    make_subtree=(not full_tree))[0,1]
                            else:
                                dissim = metric_f(otumtx[[rowidx,i],:])[0,1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx,
                                    otu_table.ObservationIds, tree,
                                    otu_table.SampleIds, rowid,
                                    make_subtree=(not full_tree))
                        row_dissims.append(dissims)

            with open(outfilepath, 'w') as f:
                f.write(format_matrix(row_dissims, rowids_list,
                                      otu_table.SampleIds))
示例#9
0
 def formatResult(self, result):
     """Return the format_matrix() rendering of a result triple.

     result is a 3-item sequence (data, sample_names, calc_names); the
     items are passed to format_matrix in that order.
     """
     matrix, row_labels, col_labels = result
     return format_matrix(matrix, row_labels, col_labels)
示例#10
0
 def formatResult(self, result):
     """Return the format_matrix() rendering of a result triple.

     result is a 3-item sequence (data, sample_names, calc_names); the
     items are passed to format_matrix in that order.
     """
     matrix, row_labels, col_labels = result
     return format_matrix(matrix, row_labels, col_labels)
示例#11
0
def single_file_beta(input_path,
                     metrics,
                     tree_path,
                     output_dir,
                     rowids=None,
                     full_tree=False):
    """ does beta diversity calc on a single otu table

    One output file is written per metric, named
    <metric>_<input file name> inside output_dir.

    inputs:
     input_path (str) -- path of the otu table file
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str) -- newick tree path; a falsy value means no tree
     output_dir (str) -- directory receiving the output files
     rowids (comma separated str) -- when given, only the distances from
        each listed sample to all samples are computed and written
     full_tree (bool) -- when False, the parsed tree is reduced with
        getSubTree(otuids, ignore_missing=True); when True, the tree is
        used as parsed

    If a metric is unknown, or needs a tree and none was given, a message
    is written to stderr and exit(1) is called.
    """
    # Context managers close the input files even when parsing raises.
    with open(input_path, 'U') as f:
        # otu mtx is otus by samples
        samids, otuids, otumtx, lineages = parse_otu_table(f)

    tree = None
    if tree_path:
        with open(tree_path, 'U') as f:
            tree = parse_newick(f, PhyloNode)
        if not full_tree:
            tree = tree.getSubTree(otuids, ignore_missing=True)

    metrics_list = metrics.split(',')
    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + os.path.split(input_path)[1])
        # try the non-phylogenetic lookup first, then the phylogenetic one
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: '== None' would go through the tree
                # object's own comparison methods
                if tree is None:
                    stderr.write("metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)
        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx.T, otuids, tree, samids)
            else:
                dissims = metric_f(otumtx.T)

            with open(outfilepath, 'w') as f:
                f.write(format_distance_matrix(samids, dissims))
        else:
            # only calc d(rowid1, *) for each rowid
            rowids_list = rowids.split(',')
            row_dissims = []  # same order as rowids_list
            for rowid in rowids_list:
                rowidx = samids.index(rowid)

                # first test if the dissim is a fn of only the pair;
                # if not, just calc the whole matrix
                if metric_f.__name__ == 'dist_chisq' or \
                    metric_f.__name__ == 'dist_gower' or \
                    metric_f.__name__ == 'dist_hellinger' or\
                    metric_f.__name__ == 'binary_dist_chisq':
                    row_dissims.append(metric_f(otumtx.T)[rowidx])
                else:
                    try:
                        row_metric = get_phylogenetic_row_metric(metric)
                    except AttributeError:
                        # do element by element: a 2x2 matrix per pair,
                        # of which entry [0, 1] is kept
                        dissims = []
                        for i in range(len(samids)):
                            if is_phylogenetic:
                                dissim = metric_f(
                                    otumtx.T[[rowidx, i], :], otuids, tree,
                                    [samids[rowidx], samids[i]])[0, 1]
                            else:
                                dissim = metric_f(
                                    otumtx.T[[rowidx, i], :])[0, 1]
                            dissims.append(dissim)
                        row_dissims.append(dissims)
                    else:
                        # do whole row at once
                        dissims = row_metric(otumtx.T, otuids, tree, samids,
                                             rowid)
                        row_dissims.append(dissims)

            with open(outfilepath, 'w') as f:
                f.write(format_matrix(row_dissims, rowids_list, samids))