# Assumed imports for this example: json and numpy are used directly, and
# dt is taken to be cogent.maths.distance_transform (see the explicit
# import in Example #7 below).
import json
import numpy as np

def distance(args):

    # Load the abundance matrix from the JSON file named on the command line.
    with open(args.matrix_file) as json_data:
        data = json.load(json_data)

    # Column ids name the datasets (samples).
    datasets = [col['id'] for col in data['columns']]

    # Columns of the JSON matrix are datasets; transpose so each row of dm
    # is one dataset, as the distance functions expect.
    z = np.array(data['data'])
    dm = np.transpose(z)

    if args.metric == 'bray_curtis':
        distance_matrix = dt.dist_bray_curtis(dm)
    elif args.metric == 'morisita_horn':
        distance_matrix = dt.dist_morisita_horn(dm)
    elif args.metric == 'canberra':
        distance_matrix = dt.dist_canberra(dm)
    elif args.metric == 'chisq':
        distance_matrix = dt.dist_chisq(dm)
    elif args.metric == 'chord':
        distance_matrix = dt.dist_chord(dm)
    elif args.metric == 'euclidean':
        distance_matrix = dt.dist_euclidean(dm)
    elif args.metric == 'gower':
        distance_matrix = dt.dist_gower(dm)
    elif args.metric == 'hellinger':
        distance_matrix = dt.dist_hellinger(dm)
    elif args.metric == 'kulczynski':
        distance_matrix = dt.dist_kulczynski(dm)
    elif args.metric == 'manhattan':
        distance_matrix = dt.dist_manhattan(dm)
    elif args.metric == 'abund_jaccard':
        distance_matrix = dt.dist_abund_jaccard(dm)
    elif args.metric == 'binary_jaccard':
        distance_matrix = dt.binary_dist_jaccard(dm)
    elif args.metric == 'pearson':
        distance_matrix = dt.dist_pearson(dm)
    elif args.metric == 'soergel':
        distance_matrix = dt.dist_soergel(dm)
    elif args.metric == 'spearman':
        distance_matrix = dt.dist_spearman_approx(dm)
    else:  # default
        distance_matrix = dt.dist_bray_curtis(dm)

    # Keep a single copy of each pairwise distance, keyed by dataset-id pair.
    dist = {}
    for i, row in enumerate(distance_matrix):
        for n, d in enumerate(row):
            if i < n:  # upper triangle only; the matrix is symmetric
                dist[(datasets[i], datasets[n])] = d

    #np.savetxt(os.path.join(args.output_dir, args.file_prefix+'_distance.mtx'), distance_matrix)
    if args.to_output == 'distance':
        print(distance_matrix)
    return dist
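The fifteen-branch elif ladder above maps metric names onto functions one by one; the same selection reads more directly as a dispatch table. A sketch under the same assumption (dt is cogent.maths.distance_transform), keeping the Bray-Curtis fallback for unrecognized names:

# Metric name -> distance function, mirroring the elif chain above.
METRIC_FUNCS = {
    'bray_curtis': dt.dist_bray_curtis,
    'morisita_horn': dt.dist_morisita_horn,
    'canberra': dt.dist_canberra,
    'chisq': dt.dist_chisq,
    'chord': dt.dist_chord,
    'euclidean': dt.dist_euclidean,
    'gower': dt.dist_gower,
    'hellinger': dt.dist_hellinger,
    'kulczynski': dt.dist_kulczynski,
    'manhattan': dt.dist_manhattan,
    'abund_jaccard': dt.dist_abund_jaccard,
    'binary_jaccard': dt.binary_dist_jaccard,
    'pearson': dt.dist_pearson,
    'soergel': dt.dist_soergel,
    'spearman': dt.dist_spearman_approx,
}

def pick_metric(name):
    # Unknown names fall back to Bray-Curtis, exactly as the else branch does.
    return METRIC_FUNCS.get(name, dt.dist_bray_curtis)

distance_matrix = pick_metric(args.metric)(dm)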
Example #3
# Assumed imports: dt is cogent.maths.distance_transform and distance is
# scipy.spatial.distance.
def get_dist(metric, mtx):
    if metric == 'bray_curtis':
        dtvar = dt.dist_bray_curtis(mtx, strict=False)
    elif metric == 'morisita_horn':
        dtvar = dt.dist_morisita_horn(mtx, strict=False)
    elif metric == 'canberra':
        dtvar = dt.dist_canberra(mtx, strict=False)
    elif metric == 'jaccard':
        dtvar = dt.binary_dist_jaccard(mtx, strict=False)
    elif metric == 'kulczynski':
        dtvar = dt.dist_kulczynski(mtx, strict=False)
    else:  # default
        dtvar = dt.dist_bray_curtis(mtx, strict=False)

    # dt returns a full square matrix; squareform collapses it to the
    # condensed vector of n*(n-1)/2 pairwise distances.
    dist = distance.squareform(dtvar)
    return dist
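A minimal usage sketch for get_dist, assuming the legacy PyCogent and SciPy imports noted above (the toy matrix is arbitrary):

import numpy as np
from scipy.spatial import distance
from cogent.maths import distance_transform as dt  # legacy PyCogent

mtx = np.array([[10, 0, 3],
                [ 8, 2, 1],
                [ 0, 5, 5]])            # 3 samples x 3 species
condensed = get_dist('bray_curtis', mtx)
print(condensed.shape)                  # (3,): pairs (0,1), (0,2), (1,2)
print(distance.squareform(condensed))   # expand back to the 3 x 3 square form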
# Assumed provenance (legacy QIIME 1 / PyCogent / biom-format stack):
# parse_biom_table, get_taxonomy_collapser, dist_bray_curtis,
# format_distance_matrix, pcoa, and procrustes_monte_carlo come from that
# stack; asarray is numpy's.
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999):
    """ Compute Procrustes M^2 and p-values for a set of results

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to principal
         coordinate matrices, for the expected result coordinate matrices
        taxonomy_level: taxonomic level at which to compute results
    """
    ### Start code copied ALMOST* directly from compute_prfs -- some
    ### refactoring for reuse is in order here. *ALMOST refers to changes to
    ### the parser and variable names, since expected is a pc matrix here.

    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## parse the expected table (unless taxonomy_level is specified, this should be
        ## collapsed on level 6 taxonomy)
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)."
                           % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            actual_table = parse_biom_table(open(actual_table_fp))
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Next block of code, how do I hate thee? Let me count the ways...
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file"
        #     multiple times, so we actually *need* the pcs that get passed
        #     in to be lists of lines
        dm = dist_bray_curtis(asarray([v for v in actual_table.iterSampleData()]))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        expected_pc = list(open(expected_pc_fp))

        ## run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
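compute_procrustes is a generator, so a caller consumes it row by row. A sketch of such a driver (every id and path below is a placeholder, not taken from the original project):

result_tables = [
    # [dataset_id, reference_database_id, method_id,
    #  parameter_combination_id, table_fp]
    ['mock1', 'gg_13_8', 'uclust', 'default', 'results/mock1/table.biom'],
]
expected_pc_lookup = {
    'mock1': {'gg_13_8': 'expected/mock1/gg_13_8/expected_pc.txt'},
}

for (dataset_id, reference_id, method_id, params,
     m_squared, p_value) in compute_procrustes(result_tables,
                                               expected_pc_lookup):
    print('%s / %s / %s / %s: M^2 = %.3f, p = %.3f'
          % (dataset_id, reference_id, method_id, params,
             m_squared, p_value))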
Example #6
# Assumed imports for this example: json, csv, sys, numpy as np,
# dt as cogent.maths.distance_transform, linkage/dendrogram from
# scipy.cluster.hierarchy, squareform from scipy.spatial.distance,
# and matplotlib.pyplot as plt.
def calculate_distance(args):

    if args.file_format == 'json':
        try:
            json_data = open('./tmp/' + args.in_file)
        except IOError:
            json_data = open(args.in_file)
        except Exception:
            print("NO FILE FOUND ERROR")
            sys.exit()

        data = json.load(json_data)
        json_data.close()
    else:  # csv file
        # CSV input is not implemented yet; 'data' is only populated for
        # JSON input, so this branch cannot feed the code below.
        with open('./tmp/' + args.in_file, newline='') as csvfile:
            csv_data = csv.reader(csvfile, delimiter=',', quotechar='"')
            for row in csv_data:
                pass

    # Column names identify the datasets.
    datasets = [col['name'] for col in data['columns']]

    # Transpose so each row of dm is one dataset.
    z = np.array(data['data'])
    dm = np.transpose(z)

    if args.metric == 'bray_curtis':
        dist = dt.dist_bray_curtis(dm)
    elif args.metric == 'morisita_horn':
        dist = dt.dist_morisita_horn(dm)
    elif args.metric == 'canberra':
        dist = dt.dist_canberra(dm)
    elif args.metric == 'chisq':
        dist = dt.dist_chisq(dm)
    elif args.metric == 'chord':
        dist = dt.dist_chord(dm)
    elif args.metric == 'euclidean':
        dist = dt.dist_euclidean(dm)
    elif args.metric == 'gower':
        dist = dt.dist_gower(dm)
    elif args.metric == 'hellinger':
        dist = dt.dist_hellinger(dm)
    elif args.metric == 'kulczynski':
        dist = dt.dist_kulczynski(dm)
    elif args.metric == 'manhattan':
        dist = dt.dist_manhattan(dm)
    elif args.metric == 'abund_jaccard':
        dist = dt.dist_abund_jaccard(dm)
    elif args.metric == 'binary_jaccard':
        dist = dt.binary_dist_jaccard(dm)
    elif args.metric == 'pearson':
        dist = dt.dist_pearson(dm)
    elif args.metric == 'soergel':
        dist = dt.dist_soergel(dm)
    elif args.metric == 'spearman':
        dist = dt.dist_spearman_approx(dm)
    else:  # default
        dist = dt.dist_bray_curtis(dm)

    distance_matrix1 = {}   # nested dict: name -> name -> distance
    distance_matrix2 = {}   # flat dict keyed by (name, name) pairs
    out_fp = open(args.out_file, 'w')

    # Leading empty field keeps the header aligned with the row-label
    # column written in each data row below.
    file_header_line = ',' + ','.join(x['name'] for x in data['columns']) + '\n'
    out_fp.write(file_header_line)

    for row, line in enumerate(data['columns']):
        name = line['name']
        distance_matrix1[name] = {}
        file_data_line = name + ','
        for col, d in enumerate(dist[row]):
            file_data_line += str(dist[row][col]) + ','
            distance_matrix1[name][data['columns'][col]['name']] = dist[row][col]
            distance_matrix2[(name, data['columns'][col]['name'])] = dist[row][col]
        # Swap the trailing comma for a newline.
        file_data_line = file_data_line[:-1] + '\n'
        out_fp.write(file_data_line)

    out_fp.close()
    print(json.dumps(distance_matrix1))

    # Rebuild the square matrix from the nested dict (dicts preserve
    # insertion order in Python 3.7+, so row/column order matches above).
    arr = []
    for ds1 in distance_matrix1:
        arr.append([distance_matrix1[ds1][ds2] for ds2 in distance_matrix1[ds1]])

    # scipy's linkage expects a condensed distance matrix (or raw
    # observation vectors), so convert the square matrix with squareform.
    linkage_matrix = linkage(squareform(np.asarray(arr)), 'single')
    dendrogram(linkage_matrix, color_threshold=1, show_leaf_counts=True)
    image_file = 'public/tmp_images/' + args.prefix + '.png'
    plt.savefig(image_file)
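One refinement worth noting, as a sketch under the same assumed imports ('datasets' is the name list built earlier in the function): dendrogram labels leaves with integer indices by default, so passing the dataset names makes the saved image readable, and a non-interactive backend avoids needing a display on a server.

import matplotlib
matplotlib.use('Agg')              # select headless backend before pyplot
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

linkage_matrix = linkage(squareform(dist), 'single')  # dist: square matrix from dt
dendrogram(linkage_matrix, color_threshold=1, show_leaf_counts=True,
           labels=datasets)        # leaf labels = dataset names
plt.savefig('public/tmp_images/example.png')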
Example #7
    print "Removing species with less than two occurrences..."
    sp_io = np.where(~(sp > 0), sp, 1)
    column_sums = np.sum(sp_io, 0)
    to_remove = np.where(column_sums < 2)
    sp = np.delete(sp, to_remove, 1)
    colnames = np.delete(colnames, to_remove)
    
    print "Removing plots with less than two species..."
    pl_io = np.where(~(sp > 0), sp, 1)
    row_sums = np.sum(pl_io, 1)
    to_remove = np.where(row_sums < 2)
    sp = np.delete(sp, to_remove, 0)
    rownames = np.delete(rownames, to_remove)

    #print sp.shape, len(rownames)
    #print sp.shape, len(colnames)

    print "Normalizing species coverage data with McCune logarithm..."
    sp = log_mccune(sp)

    from cogent.cluster.nmds import NMDS, metaNMDS
    from cogent.maths.distance_transform import dist_bray_curtis
    
    print "Calculating distance matrix..."
    distmtx = dist_bray_curtis(sp)
    
    nmds = NMDS(distmtx, dimension = 3)
    print nmds.getPoints()
    print nmds.getStress()

    #nmds = NMDS()
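The fragment calls log_mccune without defining it. A plausible stand-in, assuming it implements the generalized log transform of McCune & Grace (2002) that the message suggests (an assumption; the script's actual helper may differ):

import numpy as np

def log_mccune(x):
    # Generalized log transform (McCune & Grace 2002), one common
    # formulation: b = log10(x + d) - c, with c the order of magnitude of
    # the smallest nonzero value and d = 10**c. Zeros map exactly to zero.
    x = np.asarray(x, dtype=float)
    min_nonzero = x[x > 0].min()
    c = int(np.floor(np.log10(min_nonzero)))
    d = 10.0 ** c
    return np.log10(x + d) - c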