Example #1
0
# Assumed imports for this excerpt (QIIME 1 / PyCogent style; not shown in
# the original snippet):
import numpy
import cogent.cluster.metric_scaling as ms
from qiime.parse import parse_distmat
from qiime.format import format_coords


def pcoa(file):
    samples, distmtx = parse_distmat(file)
    # coords: each row is an axis
    coords, eigvals = ms.principal_coordinates_analysis(distmtx)

    pcnts = (numpy.abs(eigvals) / sum(numpy.abs(eigvals))) * 100
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]
    eigvals = eigvals[idxs_descending]
    pcnts = pcnts[idxs_descending]

    return format_coords(samples, coords.T, eigvals, pcnts)
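
A self-contained sketch of the descending-percent-variation sort this snippet relies on, with made-up eigenvalues and coordinates standing in for the principal_coordinates_analysis output:

import numpy as np

# stand-ins for PCoA output: each row of coords is one axis,
# one column per sample (3 samples here)
coords = np.array([[0.1, -0.2, 0.1],
                   [0.5, 0.4, -0.9],
                   [-0.1, 0.2, -0.1]])
eigvals = np.array([0.3, 1.2, -0.1])

pcnts = np.abs(eigvals) / np.abs(eigvals).sum() * 100
idxs_descending = pcnts.argsort()[::-1]
print(coords[idxs_descending])  # axes reordered, largest variation first
print(pcnts[idxs_descending])   # [75.   18.75  6.25]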
Example #2
0
def pcoa(file):
    samples, distmtx = parse_distmat(file)
    # coords, each row is an axis
    coords, eigvals = ms.principal_coordinates_analysis(distmtx)

    pcnts = (numpy.abs(eigvals) / sum(numpy.abs(eigvals))) * 100
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]
    eigvals = eigvals[idxs_descending]
    pcnts = pcnts[idxs_descending]

    return format_coords(samples, coords.T, eigvals, pcnts)
Example #3
0
    def test_principal_coordinate_analysis(self):
        """principal_coordinates_analysis returns array of principal coords"""
        # I took the example in the book (see intro info), did the principal
        # coordinates analysis, plotted the data, and it looked right
        matrix = self.real_matrix
        pcs, eigvals = principal_coordinates_analysis(matrix)
        bigfirstorder = eigvals.argsort()[::-1]
        pcs = pcs[bigfirstorder]
        eigvals = eigvals[bigfirstorder]
        self.assertEqual(len(pcs), 14)
        # abs() because the sign of each eigenvector is arbitrary
        self.assertFloatEqual(abs(pcs[0, 0]), 0.240788133045)
        self.assertFloatEqual(abs(pcs[1, 0]), 0.233677162)
Example #4
0
    def test_principal_coordinate_analysis(self):
        """principal_coordinates_analysis returns array of principal coords"""
        # I took the example in the book (see intro info), did the principal
        # coordinates analysis, plotted the data, and it looked right
        matrix = self.real_matrix
        pcs, eigvals = principal_coordinates_analysis(matrix)
        bigfirstorder = eigvals.argsort()[::-1]
        pcs = pcs[bigfirstorder]
        eigvals = eigvals[bigfirstorder]
        self.assertEqual(len(pcs), 14)
        # abs() because the sign of each eigenvector is arbitrary
        self.assertFloatEqual(abs(pcs[0, 0]), 0.240788133045)
        self.assertFloatEqual(abs(pcs[1, 0]), 0.233677162)
Example #5
0
def hmp_pcoa(biom_path, map_path, distance="hellinger"):
    """Run PCoA on HMP data.

    @biom_path: path to the BIOM table
    @map_path: path to the sample mapping file
    @distance: name of the distance metric (a dist_* function in
               distance_transform)
    """
    data, labn, labs, classes = load_data(biom_path, map_path)

    # look up the metric function by name, e.g. dist_hellinger
    dist_mtrx_fcn = getattr(distance_transform, 'dist_' + distance)
    dist_mtrx = dist_mtrx_fcn(data)
    coords, eigvals = ms.principal_coordinates_analysis(dist_mtrx)

    pcnts = (np.abs(eigvals) / sum(np.abs(eigvals))) * 100
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]
    eigvals = eigvals[idxs_descending]
    pcnts = pcnts[idxs_descending]
    return coords, eigvals, pcnts
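
A minimal, runnable illustration of the getattr-based metric dispatch used above; dist_euclidean here is a stand-in, since the snippet's distance_transform module and its dist_* functions are not shown:

import sys
import numpy as np

def dist_euclidean(data):
    """Pairwise Euclidean distances between rows of data."""
    diff = data[:, None, :] - data[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1))

# look up a dist_* function by name, mirroring
# getattr(distance_transform, 'dist_' + distance)
metric = "euclidean"
dist_fn = getattr(sys.modules[__name__], "dist_" + metric)
print(dist_fn(np.array([[0.0, 0.0], [3.0, 4.0]])))
# [[0. 5.]
#  [5. 0.]]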
Example #6
0
    def __init__(self,
                 dissimilarity_mtx,
                 initial_pts="pcoa",
                 dimension=2,
                 rand_seed=None,
                 optimization_method=1,
                 verbosity=1,
                 max_iterations=50,
                 setup_only=False,
                 min_rel_improvement=1e-3,
                 min_abs_stress=1e-5):
        """    
        Arguments:
        - dissimilarity_mtx: an n by n numpy float array representing the 
        pairwise dissimilarity of items.  0 on diagonals, symmetric under 
        (i,j) -> (j,i)
        - initial_pts: "random" => random starting points, "pcoa" => 
        pts from pcoa, or a numpy 2d array, ncols = dimension
        - dimension: the desired dimension k of the constructed 
        - rand_seed: used for testing
        - optimization_method: used when points are adjusted to minimize stress:
        0 => justin k's ad hoc method of steepest descent
        1 => cogent's scipy_optimize fmin_bfgs
        """
        self.min_rel_improvement = min_rel_improvement
        self.min_abs_stress = min_abs_stress

        if dimension >= len(dissimilarity_mtx) - 1:
            raise RuntimeError(
                "NMDS requires fewer than N-1 dimensions, where N is the "
                "number of samples (rows in the dissim matrix); got %s rows "
                "for a %s-dimension NMDS" % (len(dissimilarity_mtx), dimension))

        if rand_seed is not None:
            seed(rand_seed)

        self.verbosity = verbosity
        num_points = len(dissimilarity_mtx)
        point_range = list(range(num_points))
        self.dimension = dimension
        self.optimization_method = optimization_method

        self._calc_dissim_order(dissimilarity_mtx, point_range)
        # sets self.order
        # note that in the rest of the code, only the order matters, the values
        # of the dissimilarity matrix aren't used

        if initial_pts == "random":
            self.points = self._get_initial_pts(dimension, point_range)
        elif initial_pts == "pcoa":
            pcoa_pts, pcoa_eigs = principal_coordinates_analysis(
                dissimilarity_mtx)
            order = argsort(pcoa_eigs)[::-1]  # pos to small/neg
            pcoa_pts = pcoa_pts[order].T
            self.points = pcoa_pts[:, :dimension]
        else:
            self.points = initial_pts
        self.points = self._center(self.points)

        self._rescale()
        self._calc_distances()
        # dists relates to points, not to input data

        self._update_dhats()
        # dhats are constrained to be monotonic

        self._calc_stress()
        # self.stress is calculated from dists and dhats

        self.stresses = [self.stress]
        # stress is the metric of badness of fit used in this code
        # index 0 is the initial stress, with an initial set of
        # datapoints. index 1 corresponds to iteration 0 of the loop below

        if setup_only:
            return

        for i in range(max_iterations):
            if self.verbosity >= 1:
                print(("nonmetric broad iteration, stress: ", i,
                       self.stresses[-1]))

            if self.stresses[-1] < self.min_abs_stress:
                if self.verbosity >= 1:
                    print("stress below cutoff, done")
                break
            self._move_points()
            self._calc_distances()
            self._update_dhats()
            self._calc_stress()
            self.stresses.append(self.stress)

            if ((self.stresses[-2] - self.stresses[-1]) / self.stresses[-2]
                    < self.min_rel_improvement):
                if self.verbosity >= 1:
                    print("iteration improvement minimal. converged.")
                break

        # center and rotate the points, since pos, rotation is arbitrary
        # rotation is to align to principal axes of self.points
        self.points = self._center(self.points)
        u, s, vh = svd(self.points, full_matrices=False)
        S = diag(s)
        self.points = dot(u, S)
        # normalize the scaling, which should not change the stress
        self._rescale()
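
A minimal usage sketch, assuming the enclosing class is PyCogent-style NMDS (the class name is not shown in this excerpt) and its helper methods are available:

import numpy as np

# toy symmetric dissimilarity matrix for 4 items, zeros on the diagonal
dissim = np.array([[0.0, 0.3, 0.8, 0.6],
                   [0.3, 0.0, 0.5, 0.4],
                   [0.8, 0.5, 0.0, 0.7],
                   [0.6, 0.4, 0.7, 0.0]])
# dimension must be at most N - 2 for N samples, per the check in __init__
nmds = NMDS(dissim, initial_pts="pcoa", dimension=2, rand_seed=0, verbosity=0)
print(nmds.points)        # 4 x 2 embedded coordinates
print(nmds.stresses[-1])  # final stress after convergence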
Example #7
0
    # walk backwards so deletions don't shift upcoming indices
    for i in range(len(distance_matrix) - 1, -1, -1):
        if distance_matrix[i][0] == '#':
            del distance_matrix[i]
    # split each line by tabs
    distance_matrix = [i.split('\t') for i in distance_matrix]
    # convert each element to a number
    distance_matrix = array([[float(i) for i in j] for j in distance_matrix])
    print(distance_matrix)
else:
    #create distance matrix
    distance_matrix = dist_functions[dist_metric](ptmtx)
o = open("distmtx.txt", 'w')
o.write(distance_matrix)
o.close();
print("1")
aa = pcoa.principal_coordinates_analysis(distance_matrix)
sample_coords = aa[0].transpose()
sp_coords = species_coords(aa[0], ptmtx, dims=len(sample_coords[0])) * 3
print("1")

evals = aa[1] / sum(aa[1])

# scale axes by the normalized eigenvalues
sp_coords = sp_coords * array([list(evals)] * len(sp_coords))
sample_coords = sample_coords * array([list(evals)] * len(sample_coords))

o = open('sample_coords.txt', 'w')
for i in sample_coords:
    for j in i:
        o.write(str(j) + '\t')
    o.write('\n')
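
The eigenvalue scaling above builds a full matrix of repeated rows; numpy broadcasting gives the same result more directly, as this toy-shapes sketch (not part of the original script) shows:

import numpy as np

coords = np.arange(6.0).reshape(3, 2)  # 3 samples x 2 axes
evals = np.array([0.7, 0.3])           # one normalized eigenvalue per axis

manual = coords * np.array([list(evals)] * len(coords))  # original idiom
broadcast = coords * evals                               # equivalent
assert np.array_equal(manual, broadcast)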
Example #8
0
    # walk backwards so deletions don't shift upcoming indices
    for i in range(len(distance_matrix) - 1, -1, -1):
        if distance_matrix[i][0] == '#':
            del distance_matrix[i]
    # split each line by tabs
    distance_matrix = [i.split('\t') for i in distance_matrix]
    # convert each element to a number
    distance_matrix = array([[float(i) for i in j] for j in distance_matrix])
    print(distance_matrix)
else:
    #create distance matrix
    distance_matrix = dist_functions[dist_metric](ptmtx)
o = open("distmtx.txt", 'w')
o.write(distance_matrix)
o.close()
print("1")
aa = pcoa.principal_coordinates_analysis(distance_matrix)
sample_coords = aa[0].transpose()
sp_coords = species_coords(aa[0], ptmtx, dims=len(sample_coords[0])) * 3
print("1")

evals = aa[1] / sum(aa[1])

# scale axes by the normalized eigenvalues
sp_coords = sp_coords * array([list(evals)] * len(sp_coords))
sample_coords = sample_coords * array([list(evals)] * len(sample_coords))

o = open('sample_coords.txt', 'w')
for i in sample_coords:
    for j in i:
        o.write(str(j) + '\t')
    o.write('\n')
Example #9
0
    def __init__(self, dissimilarity_mtx, initial_pts="pcoa",
                 dimension=2, rand_seed=None, optimization_method=1,
                 verbosity=1, max_iterations=50, setup_only=False,
                 min_rel_improvement=1e-3, min_abs_stress=1e-5):
        """    
        Arguments:
        - dissimilarity_mtx: an n by n numpy float array representing the 
        pairwise dissimilarity of items.  0 on diagonals, symmetric under 
        (i,j) -> (j,i)
        - initial_pts: "random" => random starting points, "pcoa" => 
        pts from pcoa, or a numpy 2d array, ncols = dimension
        - dimension: the desired dimension k of the constructed 
        - rand_seed: used for testing
        - optimization_method: used when points are adjusted to minimize stress:
        0 => justin k's ad hoc method of steepest descent
        1 => cogent's scipy_optimize fmin_bfgs
        """
        self.min_rel_improvement = min_rel_improvement
        self.min_abs_stress = min_abs_stress

        if dimension >= len(dissimilarity_mtx) - 1:
            raise RuntimeError(
                "NMDS requires fewer than N-1 dimensions, where N is the "
                "number of samples (rows in the dissim matrix); got %s rows "
                "for a %s-dimension NMDS" % (len(dissimilarity_mtx), dimension))

        if rand_seed is not None:
            seed(rand_seed)
        
        self.verbosity = verbosity
        num_points = len(dissimilarity_mtx)
        point_range = list(range(num_points))
        self.dimension = dimension
        self.optimization_method = optimization_method
        
        self._calc_dissim_order(dissimilarity_mtx, point_range)
        # sets self.order
        # note that in the rest of the code, only the order matters, the values
        # of the dissimilarity matrix aren't used
        
        if initial_pts == "random":
            self.points = self._get_initial_pts(dimension, point_range)
        elif initial_pts == "pcoa":
            pcoa_pts, pcoa_eigs = principal_coordinates_analysis(
                dissimilarity_mtx)
            order = argsort(pcoa_eigs)[::-1]  # pos to small/neg
            pcoa_pts = pcoa_pts[order].T
            self.points = pcoa_pts[:, :dimension]
        else:
            self.points = initial_pts
        self.points = self._center(self.points)
        
        self._rescale()
        self._calc_distances() 
        # dists relates to points, not to input data
        
        self._update_dhats()
        # dhats are constrained to be monotonic
        
        self._calc_stress()
        # self.stress is calculated from dists and dhats
        
        self.stresses = [self.stress]
        # stress is the metric of badness of fit used in this code
        # index 0 is the initial stress, with an initial set of
        # datapoints. index 1 corresponds to iteration 0 of the loop below
        
        if setup_only:
            return

        for i in range(max_iterations):
            if self.verbosity >= 1:
                print(("nonmetric broad iteration, stress: ", i,
                self.stresses[-1]))

            if self.stresses[-1] < self.min_abs_stress:
                if self.verbosity >= 1:
                    print("stress below cutoff, done") 
                break
            self._move_points()
            self._calc_distances()
            self._update_dhats()
            self._calc_stress()
            self.stresses.append(self.stress)

            if ((self.stresses[-2] - self.stresses[-1]) / self.stresses[-2]
                    < self.min_rel_improvement):
                if self.verbosity >= 1:
                    print("iteration improvement minimal. converged.")
                break

        # center and rotate the points, since pos, rotation is arbitrary
        # rotation is to align to principal axes of self.points
        self.points = self._center(self.points)
        u, s, vh = svd(self.points, full_matrices=False)
        S = diag(s)
        self.points = dot(u, S)
        # normalize the scaling, which should not change the stress
        self._rescale()
Example #10
0
def generate_pcoa_file(distmtx, m_n_sample_ids, n_sample_ids, filepath):
    """Make PCoA-related file for D3.js drawings. Generates CSV file.

    :param distmtx: Numpy array matrix of distances
    :param m_n_sample_ids: List of strings containing m and n sample IDs
    :param n_sample_ids: List of strings containing user (or n) sample IDs
    :param filepath: User directory path to which the file will be written
    """
    coords, eigvals = ms.principal_coordinates_analysis(distmtx)
    pcnts = (np.abs(eigvals) / float(sum(np.abs(eigvals)))) * 100
    idxs_descending = pcnts.argsort()[::-1]
    coords = coords[idxs_descending]

    print('m_n_sample_ids' + str(len(m_n_sample_ids)), m_n_sample_ids)
    print("n_sample_ids" + str(len(n_sample_ids)), n_sample_ids)

    # from google10c
    colormap = [
        "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6",
        "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99",
        "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262",
        "#5574a6", "#3b3eac"
    ]

    tooltip_html = """
    Sample: {}<br>
    Ecosystem: {}<br>
    Envo ID: {}<br>
    Envo Term: {}<br>
    Study: {} <br>
    Study Source: {}
    """
    """Okay messy indexing coming up! eco_samples_idx holds a list of sample
    indices for each ecosystem that is queried (along with color) e.g. {
        ("Biofilm", "grey"): [1,12,...],
        ("Soil", "gold"): [3,16,...]
    }.
    envo_samples_idx holds a list of sample indices for each envo that is
    queried (along with color) e.g. {
        ("ENVO:00009003", "blue"): [1,12,...],
        ("ENVO:00000073", "red"): [3,16,...]
    }.
    tooltip_htmls contains the HTML-formatted string of the metadata for each
    sample. Metadata is a tuple of (title, ontology_ids, ontology_terms,
    ecosystem, study_source, color)
    """
    m_sample_ids = [
        sample_id for sample_id in m_n_sample_ids
        if sample_id not in n_sample_ids
    ]
    user_key = ("User Samples", "red")
    eco_samples_idx = {user_key: []}
    envo_samples_idx = {user_key: []}
    tooltip_htmls = [""] * len(m_n_sample_ids)

    # all other samples excluding user samples
    for sample_id in m_sample_ids:
        metadata = query_pcoa_metadata(sample_id)
        tooltip_htmls[m_n_sample_ids.index(sample_id)] = tooltip_html.format(
            sample_id, metadata[3], ", ".join(metadata[1]),
            ", ".join(metadata[2]), metadata[0], metadata[4])

        color_id = len(envo_samples_idx)
        eco_term = metadata[3]
        eco_color = metadata[5]
        envo_term = metadata[1][0]
        eco_key = (eco_term, eco_color)
        # if the envo has been encountered before, reuse its color rather
        # than assigning a new one (keys are (term, color) tuples)
        if envo_term in (term for term, _ in envo_samples_idx):
            envo_color = next(color for term, color in envo_samples_idx
                              if term == envo_term)
            envo_key = (envo_term, envo_color)
        else:
            envo_key = (envo_term, colormap[color_id % len(colormap)])

        if eco_key in eco_samples_idx:
            eco_samples_idx[eco_key].append(m_n_sample_ids.index(sample_id))
        else:
            eco_samples_idx[eco_key] = [m_n_sample_ids.index(sample_id)]

        if envo_key in envo_samples_idx:
            envo_samples_idx[envo_key].append(m_n_sample_ids.index(sample_id))
        else:
            envo_samples_idx[envo_key] = [m_n_sample_ids.index(sample_id)]

    # add user samples to eco_samples_idx, envo_samples_idx and html_tooltips
    for sample_id_j in n_sample_ids:
        envo_samples_idx[user_key].append(m_n_sample_ids.index(sample_id_j))
        eco_samples_idx[user_key].append(m_n_sample_ids.index(sample_id_j))
        tooltip_htmls[m_n_sample_ids.index(sample_id_j)] = tooltip_html.format(
            sample_id_j, user_key[0], user_key[0], user_key[0], "", "")
    """ PLOTTING """

    plots = {}
    # loop through groupings
    for group in ["ecosystem", "envo"]:
        # loop through each top 3 pairs of principal coordinates
        for pc1, pc2 in itertools.combinations(range(3), 2):
            # start the plot
            fig, ax = plt.subplots()
            fig.set_figwidth(11)

            # plot all the points except for the users samples, they go last
            group_samples_idx = eco_samples_idx
            if group == "envo":  # '==', not 'is': identity on str literals is unreliable
                group_samples_idx = envo_samples_idx
            non_user_group_samples_idx = [
                e for e in group_samples_idx if e != user_key
            ]

            # scatter the ecosystem-labelled points
            # remember that the keys are in the format
            # ("Ecosystem/Envo", "Color")
            for key in non_user_group_samples_idx:
                ax.scatter(coords.T[group_samples_idx[key], pc1],
                           coords.T[group_samples_idx[key], pc2],
                           marker="o",
                           label=key[0],
                           color=key[1],
                           alpha=1)

            # plot user samples
            ax.scatter(coords.T[group_samples_idx[user_key], pc1],
                       coords.T[group_samples_idx[user_key], pc2],
                       marker="*",
                       s=96,
                       label=user_key[0],
                       color=user_key[1],
                       alpha=1)

            # draw PC axis labels
            ax.set_xlabel("PC%d" % (pc1 + 1))
            ax.set_ylabel("PC%d" % (pc2 + 1))
            ax.set_title("PCoA Plot grouped by %s" % (group.capitalize()))

            # adjust the plot a bit to make room for the sample legend
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
            """INTERACTIVITY"""

            # make interactive legends for sample groupings
            if group is "ecosystem":
                handles, legend_labels = ax.get_legend_handles_labels()
                interactive_legend = InteractiveLegendPlugin(
                    zip(handles, ax.collections),
                    legend_labels,
                    alpha_unsel=0.3,
                    alpha_over=1,
                    start_visible=False)

                mpld3.plugins.connect(fig, interactive_legend)

            # make interactive html labels for non-user samples first, since
            # they are now ordered
            html_labels = np.array(tooltip_htmls)
            for i, key in enumerate(non_user_group_samples_idx):
                tooltip = PointHTMLTooltip(
                    ax.collections[i],
                    labels=list(html_labels[group_samples_idx[key]]))
                mpld3.plugins.connect(fig, tooltip)

            # make interactive html labels for user samples
            tooltip = PointHTMLTooltip(
                ax.collections[-1],
                labels=list(html_labels[group_samples_idx[user_key]]))
            mpld3.plugins.connect(fig, tooltip)

            plot_name = (pc1 + 1, pc2 + 1, group.capitalize())
            plots["PC%d%d%s" % plot_name] = mpld3.fig_to_dict(fig)
            svgfile = "%s_PC%s%s_%s.svg" % (
                (os.path.splitext(filepath)[0], ) + plot_name)
            print "Saving PCoA in %s" % svgfile
            plt.savefig(svgfile)
    """ FINISH! """
    with open(filepath, "w") as f_pcoa:
        json.dump(plots, f_pcoa)
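
A hypothetical invocation sketch; the distance matrix and sample IDs below are made up, and the function's own dependencies (query_pcoa_metadata, mpld3, matplotlib) must already be importable:

import numpy as np

sample_ids = ["SRS001", "SRS002", "SRS003", "USER01"]  # made-up IDs
dm = np.array([[0.0, 0.4, 0.7, 0.6],
               [0.4, 0.0, 0.5, 0.3],
               [0.7, 0.5, 0.0, 0.8],
               [0.6, 0.3, 0.8, 0.0]])
# plots PC1/PC2/PC3 scatter pairs and writes mpld3 JSON to pcoa_plots.json
generate_pcoa_file(dm, sample_ids, n_sample_ids=["USER01"],
                   filepath="pcoa_plots.json")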