# Build the pairwise gene-content difference matrix and split its entries
# into within-subject (timepoint) and between-subject comparisons.
sys.stderr.write("Calculate gene hamming matrix...\n")

# Either: use every gene in the pan-genome.
# NOTE(review): min_log2_fold_change=4 is forwarded unchanged to the helper;
# its exact meaning is defined in gene_diversity_utils -- confirm there.
gene_hamming_matrix, num_opportunities = (
    gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
        gene_depth_matrix,
        marker_coverages,
        min_log2_fold_change=4,
    )
)
#
# Or: just the subset from the MIDAS reference genome
#gene_hamming_matrix = diversity_utils.calculate_coverage_based_gene_hamming_matrix(gene_depth_matrix[reference_gene_idxs,:], marker_coverages, min_log2_fold_change=4)
#

# Map used below to translate indices expressed relative to
# high_coverage_samples into indices relative to samples.
sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    high_coverage_samples, samples)

# Work out which index pairs belong to the same sample, which to the same
# subject, and which to different subjects (in high_coverage_samples order).
high_coverage_pair_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, high_coverage_samples)
(high_coverage_same_sample_idxs,
 high_coverage_same_subject_idxs,
 high_coverage_diff_subject_idxs) = high_coverage_pair_idxs

# Re-express each pair set through sample_idx_map, in the same order as the
# unpacking on the left-hand side.
same_sample_idxs, same_subject_idxs, diff_subject_idxs = [
    parse_midas_data.apply_sample_index_map_to_indices(
        sample_idx_map, pair_idxs)
    for pair_idxs in (
        high_coverage_same_sample_idxs,
        high_coverage_same_subject_idxs,
        high_coverage_diff_subject_idxs,
    )
]

# Within-subject gene differences, sorted in place before being handed to
# the survival-function helper.
hamming_timepoints = gene_hamming_matrix[same_subject_idxs]
hamming_timepoints.sort()
hamming_timepoints_dns, hamming_timepoints_survivals = (
    stats_utils.calculate_unnormalized_survival_from_vector(
        hamming_timepoints, min_x=0.1, max_x=1e05)
)
# Rescale in place so the first survival value becomes the unit reference.
# NOTE(review): nothing here guards against a zero first entry -- confirm
# the helper cannot return one (e.g. when there are no same-subject pairs).
hamming_timepoints_survivals /= hamming_timepoints_survivals[0]

# Between-subject gene differences.
hamming_between = gene_hamming_matrix[diff_subject_idxs]
# Pool the core mutation and reversion matrices into one difference count
# and one opportunity count per sample pair.
difference_matrix = core_mut_difference_matrix + core_rev_difference_matrix
opportunity_matrix = core_mut_opportunity_matrix + core_rev_opportunity_matrix

# Differences per opportunity.  Adding the (opportunity_matrix == 0) mask
# raises zero denominators to one so the division never divides by zero, and
# the * 1.0 forces floating-point division.
# NOTE(review): assumes these are numpy arrays -- confirm against the caller.
nonzero_opportunity_matrix = opportunity_matrix + (opportunity_matrix == 0)
substitution_rates = difference_matrix * 1.0 / nonzero_opportunity_matrix

# Set up the figure: log-scaled x axis with a dotted black baseline at y = 0.
pylab.figure()
pylab.xlabel('Synonymous divergence, $d_S$')
pylab.ylabel('Private SNV sharing')
pylab.semilogx([1e-07, 1e-01], [0, 0], 'k:')
pylab.xlim([3e-07, 1e-01])
pylab.ylim([-0.05, 1.05])

# NOTE(review): the original comment at this point read "Add records to
# output", but no output records are written in the visible code.

# Work out which index pairs belong to the same sample, which to the same
# subject, and which to different subjects.
same_sample_idxs, same_subject_idxs, diff_subject_idxs = (
    parse_midas_data.calculate_subject_pairs(subject_sample_map, snp_samples)
)

# Only the between-subject pairs are visited; the one-element list leaves
# room to add other pair sets.
for idxs in [diff_subject_idxs]:
    for sample_pair_idx in xrange(len(idxs[0])):
        first_idx = idxs[0][sample_pair_idx]
        second_idx = idxs[1][sample_pair_idx]

        # Visit each pair in both orders: (i, j) and then (j, i).
        for i, j in ((first_idx, second_idx), (second_idx, first_idx)):
            # Skip pairs with no substitution opportunities or with a
            # doubleton opportunity count below 0.5.
            if (opportunity_matrix[i, j] == 0
                    or core_doubleton_opportunity_matrix[i, j] < 0.5):
                continue

            # Floor the divergence at 1e-06.
            # NOTE(review): presumably so it stays on the log-scaled x axis.
            ds = max(substitution_rates[i, j], 1e-06)
            df = doubleton_fractions[i, j]
# Load the pangenome tables for this species, restricted to the samples that
# are already present in the SNP data set.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
pangenome_data = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples)
(gene_samples,
 gene_names,
 gene_presence_matrix,
 gene_depth_matrix,
 marker_coverages,
 gene_reads_matrix) = pangenome_data
sys.stderr.write("Done!\n")

# Pairwise count of differing genes, plus the matching opportunity counts.
# NOTE(review): min_log2_fold_change=4 is forwarded unchanged to the helper;
# its exact meaning is defined in gene_diversity_utils -- confirm there.
sys.stderr.write("Calculating matrix of gene differences...\n")
gene_difference_matrix, gene_opportunity_matrix = (
    gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
        gene_depth_matrix,
        marker_coverages,
        min_log2_fold_change=4,
    )
)

# The gene samples and SNP samples must be made to line up: keep only the
# gene samples whose marker coverage exceeds min_coverage.
desired_samples = gene_samples[marker_coverages > min_coverage]

# Work out which index pairs belong to the same sample, which to the same
# subject, and which to different subjects (in desired_samples order).
desired_pair_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, desired_samples)
(desired_same_sample_idxs,
 desired_same_subject_idxs,
 desired_diff_subject_idxs) = desired_pair_idxs

# One index map per target ordering: SNP samples first, then gene samples.
snp_sample_idx_map, gene_sample_idx_map = [
    parse_midas_data.calculate_sample_idx_map(desired_samples, target_samples)
    for target_samples in (snp_samples, gene_samples)
]

# Translate the same-sample pairs into SNP-matrix and gene-matrix indices.
same_sample_snp_idxs, same_sample_gene_idxs = [
    parse_midas_data.apply_sample_index_map_to_indices(
        idx_map, desired_same_sample_idxs)
    for idx_map in (snp_sample_idx_map, gene_sample_idx_map)
]

# Translate the same-subject pairs into SNP-matrix and gene-matrix indices.
same_subject_snp_idxs, same_subject_gene_idxs = [
    parse_midas_data.apply_sample_index_map_to_indices(
        idx_map, desired_same_subject_idxs)
    for idx_map in (snp_sample_idx_map, gene_sample_idx_map)
]