예제 #1
0
 def get_single_subject_idxs(self):
     """Return indices into ``self.good_samples`` keeping one sample per subject.

     The subject of each sample is looked up in the sample -> subject map
     built from ``parse_HMP_data.parse_subject_sample_map()``. Walking
     ``self.good_samples`` in order, the position of the first sample seen
     for each subject is kept; later samples of that subject are skipped.

     Samples missing from the map resolve to ``None`` through ``.get`` and
     are therefore all treated as belonging to one and the same subject
     (assumes the map is dict-like -- confirm against sample_utils).

     Returns:
         A 1-D integer ``numpy`` array of the kept positions, in increasing
         order. It is empty (but still integer-typed) when
         ``self.good_samples`` is empty.
     """
     sub_sam_map = parse_HMP_data.parse_subject_sample_map()
     sam_sub_map = sample_utils.calculate_sample_subject_map(sub_sam_map)

     seen = set()
     good_idxs = []
     for i, sample in enumerate(self.good_samples):
         sub = sam_sub_map.get(sample)
         if sub not in seen:
             seen.add(sub)
             good_idxs.append(i)

     # Force an integer dtype: np.array([]) would otherwise default to
     # float64, which NumPy refuses to accept as an index array.
     return np.array(good_idxs, dtype=int)
    other_species_str = ""

################################################################################

# Analysis parameters for this script. Only the assignments are visible in
# this section.
# NOTE(review): the per-name meanings marked "presumably" are inferred from the
# variable names; confirm them against the code that consumes each value.
min_coverage = config.min_median_coverage  # cutoff comes from `config` rather than being hard-coded here
alpha = 0.5  # Confidence interval range for rate estimates
low_pi_threshold = 1e-03  # presumably a cutoff on within-sample diversity (pi) -- TODO confirm
clade_divergence_threshold = 1e-02  # presumably a divergence cutoff for clade-level differences -- TODO confirm
modification_divergence_threshold = 1e-03  # presumably a divergence cutoff for "modification" events -- TODO confirm
min_change = 0.8  # presumably a minimum frequency change -- TODO confirm
include_high_copynum = False
# Alternative setting, left commented out (currently inactive):
#include_high_copynum = True

# Load subject and sample metadata
# Three lookups are read through parse_HMP_data; start/finish progress
# messages are written to stderr.
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_HMP_data.parse_subject_sample_map()  # presumably subject -> samples; verify against parse_HMP_data
sample_country_map = parse_HMP_data.parse_sample_country_map()  # presumably sample -> country; verify against parse_HMP_data
sample_order_map = parse_HMP_data.parse_sample_order_map()  # presumably sample -> ordering information; verify against parse_HMP_data
sys.stderr.write("Done!\n")

# Only plot samples above a certain depth threshold that are involved in timecourse
# (the selection itself is delegated to diversity_utils.calculate_temporal_samples;
# species_name is defined outside this section)
snp_samples = diversity_utils.calculate_temporal_samples(species_name)

# The subset of samples that are haploid, wrapped in a set
haploid_samples = set(diversity_utils.calculate_haploid_samples(species_name))

# Optional restriction to the subset from North America -- currently DISABLED:
# the statement below is commented out, so snp_samples is NOT filtered by
# country at this point.
# Original rationale: the only temporal samples are from there, so it is best
# not to contaminate the between-subject comparisons with out-of-sample effects.
#snp_samples = snp_samples[parse_HMP_data.calculate_country_samples(sample_country_map, sample_list=snp_samples, allowed_countries=set(["United States"]))]