# NOTE(review): indentation was lost in the visible source. The chunk_*
# accumulations below presumably run once per chunk inside a loop whose
# header is outside this view -- confirm before relying on this layout.

# Add this chunk's core "rev" difference/opportunity counts to the running totals
core_rev_difference_matrix += chunk_core_rev_difference_matrix
core_rev_opportunity_matrix += chunk_core_rev_opportunity_matrix

# Add all
snp_mut_difference_matrix += chunk_snp_mut_difference_matrix
snp_mut_opportunity_matrix += chunk_snp_mut_opportunity_matrix
snp_rev_difference_matrix += chunk_snp_rev_difference_matrix
snp_rev_opportunity_matrix += chunk_snp_rev_opportunity_matrix

# From here on the SNP sample list is whatever dummy_samples holds
snp_samples = dummy_samples

# Now calculate gene differences
# Load gene coverage information for species_name, restricted to the SNP
# samples and with shared_pangenome_genes passed as disallowed genes
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples, disallowed_genes=shared_pangenome_genes)
sys.stderr.write("Done! Loaded %d genes\n" % len(gene_names))

# Keep the gene samples both as a list and as a set
gene_sample_list = list(gene_samples)
gene_sample_set = set(gene_samples)

# Calculate matrix of number of genes that differ
# (the helper returns separate gain, loss and opportunity matrices)
sys.stderr.write("Calculating matrix of gene differences...\n")
gene_gain_matrix, gene_loss_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
    gene_reads_matrix, gene_depth_matrix, marker_coverages)

# Mask of samples whose marker coverage is at least min_coverage
good_marker_coverages = (marker_coverages >= min_coverage)
# NOTE(review): the statement below is cut off in the visible source; its
# remainder is outside this view and is deliberately not reconstructed here.
gene_gain_matrix = gene_gain_matrix * good_marker_coverages[:,
# Only plot samples above a certain depth threshold. Those samples are split
# on pi at 1e-03: the "haploids" (pi <= 1e-03) and the rest (pi > 1e-03).
has_sufficient_coverage = (median_coverages >= min_coverage)
low_pi_snp_samples = samples[has_sufficient_coverage * (pis <= 1e-03)]
high_pi_snp_samples = samples[has_sufficient_coverage * (pis > 1e-03)]

# Calculate which pairs of idxs belong to the same sample, which to the same subject
# and which to different subjects (currently disabled)
#snp_same_sample_idxs, snp_same_subject_idxs, snp_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(subject_sample_map, snp_samples)

####################################################
# Load gene coverage information for species_name,
# once for the low-pi samples and once for the high-pi samples
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)

(low_pi_gene_samples,
 low_pi_gene_names,
 low_pi_gene_presence_matrix,
 low_pi_gene_depth_matrix,
 low_pi_marker_coverages,
 low_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=low_pi_snp_samples)

(high_pi_gene_samples,
 high_pi_gene_names,
 high_pi_gene_presence_matrix,
 high_pi_gene_depth_matrix,
 high_pi_marker_coverages,
 high_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=high_pi_snp_samples)

sys.stderr.write("Done!\n")

# This represents all gene names regardless of prevalences.
# The loader's result is unpacked into (gene names, new species names).
gene_names, new_species_names = parse_midas_data.load_pangenome_genes(species_name)
# Convert format of gene names from set to list
gene_names = list(gene_names)

###############################################
# Load kegg information
##############################################
print len(metaphlan2_genes), "metaphlan2 genes" metaphlan2_gene_coverages = [] for gene in metaphlan2_genes: if gene in gene_coverages: metaphlan2_gene_coverages.append( gene_coverages[gene] ) metaphlan2_gene_coverages = numpy.array(metaphlan2_gene_coverages) median_metaphlan2_coverages = numpy.median(metaphlan2_gene_coverages,axis=0) #mean_metaphlan2_coverages = metaphlan2_gene_coverages.mean(axis=0) mean_metaphlan2_coverages = (metaphlan2_gene_coverages*(metaphlan2_gene_coverages>=1)).sum(axis=0)/((metaphlan2_gene_coverages>=1).sum(axis=0)) # Load gene coverage information for species_name sys.stderr.write("Loading pangenome data for %s...\n" % species_name) gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name) sys.stderr.write("Done!\n") midas_marker_coverage_map = {} for i in xrange(0,len(gene_samples)): midas_marker_coverage_map[gene_samples[i]] = marker_coverages[i] pylab.figure(1,figsize=(22,2)) marker_gene_coverages = parse_midas_data.parse_marker_gene_coverage_distribution(species_name) max_coverages = [] median_coverages = [] marker_genes = sorted(marker_gene_coverages[marker_gene_coverages.keys()[0]].keys())
sys.stderr.write("Done!\n") # if snp_difference_matrix.shape[0]==0: snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix)*1.0 snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)*1.0 # snp_difference_matrix += chunk_snp_difference_matrix snp_opportunity_matrix += chunk_snp_opportunity_matrix snp_substitution_rate = snp_difference_matrix*1.0/(snp_opportunity_matrix+(snp_opportunity_matrix==0)) sys.stderr.write("Done!\n") # Load gene coverage information for species_name sys.stderr.write("Loading pangenome data for %s...\n" % species_name) gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name,allowed_samples=snp_samples,convert_centroid_names=False) sys.stderr.write("Done!\n") prevalence_idxs = (parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples))*(marker_coverages>=min_coverage) prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(gene_depth_matrix[:,prevalence_idxs], marker_coverages[prevalence_idxs]) pangenome_prevalences = numpy.array(prevalences,copy=True) pangenome_prevalences.sort() # Calculate matrix of number of genes that differ sys.stderr.write("Calculating matrix of gene differences...\n") gene_difference_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(gene_depth_matrix, marker_coverages, min_log2_fold_change=4) ############################################################## # Now need to make the gene samples and snp samples match up #