示例#1
0
# Script fragment: build the gene hamming matrix, then pull out the entries
# for same-subject and different-subject sample pairs.
#
# Calculate matrix of number of genes that differ
sys.stderr.write("Calculate gene hamming matrix...\n")
# Either: for all genes in pan-genome
# The helper returns two values; the second is bound to num_opportunities and
# is not read again in the lines visible here.
gene_hamming_matrix, num_opportunities = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix, marker_coverages, min_log2_fold_change=4)
#
# Or: just the subset from the MIDAS reference genome
# NOTE(review): the disabled alternative below calls `diversity_utils` (not
# `gene_diversity_utils`) and binds a single name, whereas the active call
# above unpacks two values -- reconcile both before re-enabling it.
#gene_hamming_matrix = diversity_utils.calculate_coverage_based_gene_hamming_matrix(gene_depth_matrix[reference_gene_idxs,:], marker_coverages, min_log2_fold_change=4)
#

# Index map built from high_coverage_samples and samples. It is applied below
# to pair indices that were computed on high_coverage_samples, and the results
# index gene_hamming_matrix -- presumably that matrix is ordered like
# `samples`; TODO confirm against parse_midas_data.
sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    high_coverage_samples, samples)

# Calculate which pairs of idxs belong to the same sample, which to the same subject
# and which to different subjects
# (these indices refer to positions within high_coverage_samples)
high_coverage_same_sample_idxs, high_coverage_same_subject_idxs, high_coverage_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, high_coverage_samples)

# Translate each group of pair indices through sample_idx_map so they can be
# used to index gene_hamming_matrix.
# same_sample_idxs is computed here but not used in the visible lines.
same_sample_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_sample_idxs)
same_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_same_subject_idxs)
diff_subject_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map, high_coverage_diff_subject_idxs)

# Hamming values for same-subject pairs, sorted in place before being handed
# to the survival-curve helper.
hamming_timepoints = gene_hamming_matrix[same_subject_idxs]
hamming_timepoints.sort()
hamming_timepoints_dns, hamming_timepoints_survivals = stats_utils.calculate_unnormalized_survival_from_vector(
    hamming_timepoints, min_x=0.1, max_x=1e05)
# Rescale in place so the first survival value becomes 1.
hamming_timepoints_survivals /= hamming_timepoints_survivals[0]

# Hamming values for pairs of samples from different subjects.
hamming_between = gene_hamming_matrix[diff_subject_idxs]
    # Fragment from inside a larger function (its header is not visible here).
    # Combine the "mut" and "rev" core matrices into total differences and
    # total opportunities.
    difference_matrix = core_mut_difference_matrix+core_rev_difference_matrix
    opportunity_matrix = core_mut_opportunity_matrix + core_rev_opportunity_matrix
        
    # `*1.0` forces float division (this code targets Python 2, see xrange
    # below). `(opportunity_matrix==0)` adds 1 to every zero denominator so the
    # division never divides by zero (assumes numpy arrays, where True counts
    # as 1 -- TODO confirm).
    substitution_rates = difference_matrix*1.0/(opportunity_matrix+(opportunity_matrix==0))

    # New figure with a log-scaled x axis; the dotted black line marks y = 0
    # between x = 1e-07 and x = 1e-01.
    pylab.figure()
    pylab.xlabel('Synonymous divergence, $d_S$')
    pylab.ylabel('Private SNV sharing')
    pylab.semilogx([1e-07,1e-01],[0,0],'k:')
    pylab.xlim([3e-07,1e-01])
    pylab.ylim([-0.05,1.05])
        
    # Add records to output
    # Calculate which pairs of idxs belong to the same sample, which to the same subject
    # and which to different subjects
    # (computed on snp_samples; only diff_subject_idxs is used below)
    same_sample_idxs, same_subject_idxs, diff_subject_idxs = parse_midas_data.calculate_subject_pairs(subject_sample_map, snp_samples)

    # Only the different-subject pairs are iterated; the single-element list
    # keeps the loop shape so other index groups could be added.
    for idxs in [diff_subject_idxs]:
        # idxs[0] and idxs[1] hold the two members of each pair at matching
        # positions. xrange is the Python 2 builtin.
        for sample_pair_idx in xrange(0,len(idxs[0])):
                
            # Visit each pair in both orders: (i, j) and then (j, i)
            for i,j in [(idxs[0][sample_pair_idx], idxs[1][sample_pair_idx]), (idxs[1][sample_pair_idx], idxs[0][sample_pair_idx])]:
                
                # Skip pairs with no opportunities at all
                if opportunity_matrix[i,j]==0:
                    continue
                        
                # Skip pairs whose doubleton opportunity count is below 0.5
                if core_doubleton_opportunity_matrix[i,j]<0.5:
                    continue
                
                # Floor the substitution rate at 1e-06 (presumably so it can be
                # placed on the log-scaled x axis -- TODO confirm); how ds and
                # df are used lies beyond the visible lines.
                ds = max([substitution_rates[i,j],1e-06])
                df = doubleton_fractions[i,j]
# Script fragment: load pangenome data for one species, compute the gene
# difference matrix, and line up sample-pair indices between the SNP sample
# list and the gene sample list.
#
# The load is restricted to snp_samples via allowed_samples.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")

# Calculate matrix of number of genes that differ
# (the helper returns a difference matrix and an opportunity matrix)
sys.stderr.write("Calculating matrix of gene differences...\n")
gene_difference_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix, marker_coverages, min_log2_fold_change=4)

# Now need to make the gene samples and snp samples match up
# Keep only the gene samples whose marker coverage exceeds min_coverage.
# The mask indexing assumes gene_samples and marker_coverages are numpy
# arrays of matching length -- TODO confirm.
desired_samples = gene_samples[marker_coverages > min_coverage]

# Calculate which pairs of idxs belong to the same sample, which to the same subject
# and which to different subjects
# (these indices refer to positions within desired_samples)
desired_same_sample_idxs, desired_same_subject_idxs, desired_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, desired_samples)

# Two index maps built from desired_samples: one against snp_samples and one
# against gene_samples. They are applied below to the desired_* pair indices.
snp_sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    desired_samples, snp_samples)
gene_sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    desired_samples, gene_samples)

# Same-sample pairs, mapped through the SNP map and the gene map respectively.
same_sample_snp_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    snp_sample_idx_map, desired_same_sample_idxs)
same_sample_gene_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    gene_sample_idx_map, desired_same_sample_idxs)

# Same-subject pairs, mapped through the SNP map and the gene map respectively.
same_subject_snp_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    snp_sample_idx_map, desired_same_subject_idxs)
same_subject_gene_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    gene_sample_idx_map, desired_same_subject_idxs)