parser.add_argument("--species", help="Name of specific species to run code on", default="Bacteroides_vulgatus_57955")
    args = parser.parse_args()

    debug = args.debug
    chunk_size = args.chunk_size
    species_name=args.species

    # Load subject and sample metadata
    sys.stderr.write("Loading sample metadata...\n")
    subject_sample_map = parse_HMP_data.parse_subject_sample_map()
    sys.stderr.write("Done!\n")
    
    # Only plot samples above a certain depth threshold that are "haploids"
    snp_samples = diversity_utils.calculate_haploid_samples(species_name, debug=debug)
    # Only consider one sample per person
    snp_samples =     snp_samples[parse_midas_data.calculate_unique_samples(subject_sample_map, sample_list=snp_samples)]
    sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples))
        
    if len(snp_samples) < min_sample_size:
        sys.stderr.write("Not enough haploid samples!\n")
        sys.exit(1)
                
    sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples))

    sys.stderr.write("Loading core genes...\n")
    core_genes = core_gene_utils.parse_core_genes(species_name)
    non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species_name)
    shared_pangenome_genes = core_gene_utils.parse_shared_genes(species_name)
    sys.stderr.write("Done! Core genome consists of %d genes\n" % len(core_genes))
    sys.stderr.write("%d shared genes and %d non-shared genes\n" % (len(shared_pangenome_genes), len(non_shared_genes)))
sys.stderr.write("Done!\n")

sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

# Entries of marker_coverages that are exactly zero. Both normalisations
# below add a small constant at these positions so no denominator is zero.
zero_marker_coverage = (marker_coverages == 0)

# Copy number = gene depth / marker coverage; a zero coverage is replaced by
# 1 in the denominator (the mask contributes 1 exactly where coverage is 0).
gene_copynum_matrix = (
    gene_depth_matrix * 1.0 / (marker_coverages + zero_marker_coverage))

# Same ratio with depths clipped into [0.1, 1e09] and a zero coverage
# replaced by 0.1 in the denominator.
clipped_gene_copynum_matrix = (
    numpy.clip(gene_depth_matrix, 0.1, 1e09) /
    (marker_coverages + 0.1 * zero_marker_coverage))

# Copy-number masks: "low" is <= 3, "good" is within [0.5, 3].
# NOTE (NRG): open question from the original author - why is the upper
# bound 3 rather than 2?
low_copynum_matrix = (gene_copynum_matrix <= 3)
good_copynum_matrix = (gene_copynum_matrix >= 0.5) * low_copynum_matrix

# Samples used for prevalence: selected by calculate_unique_samples AND with
# marker coverage of at least min_coverage.
unique_gene_sample_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, gene_samples)
prevalence_idxs = unique_gene_sample_idxs * (marker_coverages >= min_coverage)

prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(
    gene_depth_matrix[:, prevalence_idxs],
    marker_coverages[prevalence_idxs])

# Sorted copy of the prevalences; `prevalences` itself keeps its order.
pangenome_prevalences = numpy.sort(numpy.array(prevalences))

# Calculate matrix of number of genes that differ
sys.stderr.write("Calculating matrix of gene differences...\n")
(gene_gain_matrix,
 gene_loss_matrix,
 gene_opportunity_matrix) = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss(
     gene_reads_matrix, gene_depth_matrix, marker_coverages)

# Total differences are the sum of the gain and loss matrices.
gene_difference_matrix = gene_gain_matrix + gene_loss_matrix

# Now need to make the gene samples and snp samples match up
# Example #3
# 0
# First pass: keep the entries of `samples` whose median coverage is at least
# min_coverage and whose `pis` value is at most 1e-03.
# NOTE(review): `pis` is defined outside this excerpt - presumably a
# per-sample diversity estimate; confirm.
has_enough_coverage = (median_coverages >= min_coverage)
has_low_pi = (pis <= 1e-03)
desired_samples = samples[has_enough_coverage * has_low_pi]

###

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
(samples,
 allele_counts_map,
 passed_sites_map,
 final_line_number) = parse_midas_data.parse_snps(
     species_name, debug=debug, allowed_samples=desired_samples)
sys.stderr.write("Done!\n")

# Pi matrix restricted to the '4D' variant type.
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Flag samples whose diagonal entry of the averaged pi matrix is below 1e-03.
within_sample_pis = numpy.diag(avg_pi_matrix_syn)
low_diversity_samples = (within_sample_pis < 1e-03)

unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# `desired_samples` is rebound here: it now holds the element-wise product of
# the two per-sample selections instead of the subset of `samples` built above.
desired_samples = unique_samples * low_diversity_samples

# Distance bin edges for the LD computations: 20 logarithmically spaced edges
# from 10^0 = 1 to 10^4.
distance_bins = numpy.logspace(0, 4, num=20)

# Snapshot every edge except the last one *before* the outer edges are
# widened below, so the first location stays at 1 rather than 0.5.
distance_bin_locations = distance_bins[:-1].copy()

# Widen the outermost edges: 0.5 at the bottom to avoid edge effects, and a
# very large value at the top to catch anything beyond 10^4.
distance_bins[0], distance_bins[-1] = 0.5, 1e09

# One zero-initialised accumulator per bin location.
binned_rsquared_numerators = numpy.zeros_like(distance_bin_locations)
# Example #4
# 0
# Median coverage of every entry in `samples`, looked up in
# sample_coverage_map and kept in the same order as `samples`.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

###############################################################
# Indexes for SNP samples that have high coverage #
###############################################################

# Only plot samples above a certain depth threshold that are "haploids"
# Samples with median coverage >= min_coverage are split at pis = 1e-03 into
# a low-pi group (<= 1e-03) and a high-pi group (> 1e-03).
# NOTE(review): `pis` is defined outside this excerpt - presumably a
# per-sample diversity estimate; confirm.
low_pi_snp_samples = samples[(median_coverages >= min_coverage) *
                             (pis <= 1e-03)]
high_pi_snp_samples = samples[(median_coverages >= min_coverage) *
                              (pis > 1e-03)]

# Calculate which idxs belong to unique samples. Remove any samples that are
# duplicates (i.e. multiple time pts). Each group is filtered with the
# selection computed for that same group.
unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, low_pi_snp_samples)
low_pi_snp_samples = low_pi_snp_samples[unique_idxs]

unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, high_pi_snp_samples)
# Fixed: this previously indexed low_pi_snp_samples with the selection computed
# for the high-pi group, which picks the wrong samples (or fails on a length
# mismatch).
high_pi_snp_samples = high_pi_snp_samples[unique_idxs]

####################################################
# Load gene coverage information for species_name
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)

# Pangenome data restricted to the low-pi SNP samples.
(low_pi_gene_samples,
 low_pi_gene_names,
 low_pi_gene_presence_matrix,
 low_pi_gene_depth_matrix,
 low_pi_marker_coverages,
 low_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
     species_name, allowed_samples=low_pi_snp_samples)

# Same call, restricted to the high-pi SNP samples.
(high_pi_gene_samples,
 high_pi_gene_names,
 high_pi_gene_presence_matrix,
 high_pi_gene_depth_matrix,
 high_pi_marker_coverages,
 high_pi_gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
     species_name, allowed_samples=high_pi_snp_samples)