Exemplo n.º 1
0
sys.stderr.write("Done! (%d genes)\n" % len(metaphlan2_genes))

# Load the reference genes for this species
sys.stderr.write("Loading reference genes...\n")
reference_genes = set(parse_midas_data.load_reference_genes(species_name))
# Boolean mask aligned with gene_names: True where the gene is a reference gene
reference_gene_idxs = numpy.array(
    [gene_name in reference_genes for gene_name in gene_names])
sys.stderr.write("Done! (%d genes)\n" % len(reference_genes))

# Debug output: show the first few entries of each collection.
# reference_genes is a set, and sets cannot be sliced (TypeError), so it is
# sorted into a list first; sorting also makes the preview deterministic.
# The single-argument parenthesized print works under both Python 2 and 3.
print(sorted(reference_genes)[0:10])
print(gene_names[0:10])

# Build the matrix of the number of genes that differ
sys.stderr.write("Calculate gene hamming matrix...\n")
# Option 1 (active): use all genes in the pan-genome
(gene_hamming_matrix,
 num_opportunities) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix,
    marker_coverages,
    min_log2_fold_change=4)
#
# Option 2 (disabled, kept for reference): only the genes in the MIDAS reference genome
#gene_hamming_matrix = diversity_utils.calculate_coverage_based_gene_hamming_matrix(gene_depth_matrix[reference_gene_idxs,:], marker_coverages, min_log2_fold_change=4)
#

# Index map between high_coverage_samples and samples
# (presumably positions of the former within the latter -- confirm in parse_midas_data)
sample_idx_map = parse_midas_data.calculate_sample_idx_map(
    high_coverage_samples,
    samples)

# Split the index pairs into three groups: same sample, same subject,
# and different subjects
(high_coverage_same_sample_idxs,
 high_coverage_same_subject_idxs,
 high_coverage_diff_subject_idxs) = parse_midas_data.calculate_subject_pairs(
    subject_sample_map,
    high_coverage_samples)

# Re-express the same-sample pairs through sample_idx_map
same_sample_idxs = parse_midas_data.apply_sample_index_map_to_indices(
    sample_idx_map,
    high_coverage_same_sample_idxs)
                                       1e-04)
# Collect every sample index that appears on either side of a selected
# different-subject pair; the set union removes duplicates.
low_divergence_sample_idxs = list(
    set(diff_subject_idxs[0][low_divergence_diff_sample_idx_idxs]).union(
        set(diff_subject_idxs[1][low_divergence_diff_sample_idx_idxs])))

# Look up the corresponding entries of snp_samples
low_divergence_samples = snp_samples[low_divergence_sample_idxs]

# Read the pangenome (gene coverage) data for species_name, restricted to
# the low-divergence samples
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(gene_samples,
 gene_names,
 gene_presence_matrix,
 gene_depth_matrix,
 marker_coverages,
 gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name,
    allowed_samples=low_divergence_samples)
sys.stderr.write("Done!\n")

# Build the matrix of the number of genes that differ, together with the
# matching opportunity matrix
sys.stderr.write("Calculating matrix of gene differences...\n")
(gene_difference_matrix,
 gene_opportunity_matrix) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix(
    gene_depth_matrix,
    marker_coverages)

# Now need to make the gene samples and snp samples match up:
# keep only the gene samples whose marker coverage exceeds min_coverage
desired_samples = gene_samples[marker_coverages > min_coverage]

# Calculate which pairs of idxs belong to the same sample, which to the same subject
# and which to different subjects.
# (This call was previously repeated verbatim with identical arguments; one
# evaluation is enough because the repeat only overwrote the same three names.)
desired_same_sample_idxs, desired_same_subject_idxs, desired_diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, desired_samples)

snp_sample_idx_map = parse_midas_data.calculate_sample_idx_map(