# Load the pangenome (gene coverage) data for species_name, restricted to the
# samples passed as allowed_samples (snp_samples).
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(
    gene_samples,
    gene_names,
    gene_presence_matrix,
    gene_depth_matrix,
    marker_coverages,
    gene_reads_matrix,
) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")
sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

# Gene depth divided by marker coverage. Wherever the marker coverage is
# zero the divisor is bumped up by one, so the division never sees a zero.
nonzero_marker_coverages = marker_coverages + (marker_coverages == 0)
gene_copynum_matrix = gene_depth_matrix * 1.0 / nonzero_marker_coverages

# Sample selector for the prevalence estimate: the result of
# calculate_unique_samples() multiplied elementwise by the
# "marker coverage >= min_coverage" condition.
# NOTE(review): presumably calculate_unique_samples() returns a boolean
# per-sample mask, making the product a logical AND -- confirm.
unique_sample_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, gene_samples)
prevalence_idxs = unique_sample_idxs * (marker_coverages >= min_coverage)

prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs],
    marker_coverages[prevalence_idxs])

# Keep `prevalences` in its original order; sort an independent copy.
pangenome_prevalences = numpy.array(prevalences, copy=True)
pangenome_prevalences.sort()

# Calculate matrix of number of genes that differ
sys.stderr.write("Calculating matrix of gene differences...\n")
(
    gene_gain_matrix,
    gene_loss_matrix,
    gene_opportunity_matrix,
) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
    gene_depth_matrix,
    marker_coverages,
    min_log2_fold_change=4,
    include_high_copynum=include_high_copynum)

# A difference is counted for either a gain or a loss.
gene_difference_matrix = gene_gain_matrix + gene_loss_matrix

# Now need to make the gene samples and snp samples match up
# Diagnostic output: for each marker gene, print its name followed by whether
# it occurs among the pangenome gene names.
pangenome_genes = set(gene_names)
for marker_gene in marker_genes:
    # Single pre-formatted string: prints the same "<name> <bool>" text as a
    # two-item print statement, and is also valid under print-as-function.
    print("%s %s" % (marker_gene, marker_gene in pangenome_genes))

# One boolean entry per element of gene_names, flagging membership in each
# of the three gene collections.
reference_gene_idxs = numpy.array(
    [gene_name in reference_genes for gene_name in gene_names])
metaphlan2_gene_idxs = numpy.array(
    [gene_name in metaphlan2_genes for gene_name in gene_names])
marker_gene_idxs = numpy.array(
    [gene_name in marker_genes for gene_name in gene_names])

print(marker_genes)
print(marker_gene_idxs.sum())

# Sample selector: the result of calculate_unique_samples() multiplied
# elementwise by the "marker coverage >= min_coverage" condition.
# NOTE(review): presumably the first factor is a boolean per-sample mask,
# making the product a logical AND -- confirm.
unique_sample_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, gene_samples)
sample_idxs = unique_sample_idxs * (marker_coverages >= min_coverage)

# Prevalences over the selected samples, computed with min_copynum=0.3.
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, sample_idxs],
    marker_coverages[sample_idxs],
    min_copynum=0.3)

# Subsets of the prevalences for each gene collection.
reference_prevalences = prevalences[reference_gene_idxs]
metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs]
marker_prevalences = prevalences[marker_gene_idxs]

print(marker_prevalences)

# Unnormalized survival curves on [0, 1] for all genes and for each subset.
survival_from_vector = stats_utils.calculate_unnormalized_survival_from_vector
pangenome_xs, pangenome_survivals = survival_from_vector(
    prevalences, min_x=0, max_x=1)
reference_xs, reference_survivals = survival_from_vector(
    reference_prevalences, min_x=0, max_x=1)
metaphlan2_xs, metaphlan2_survivals = survival_from_vector(
    metaphlan2_prevalences, min_x=0, max_x=1)
marker_xs, marker_survivals = survival_from_vector(
    marker_prevalences, min_x=0, max_x=1)