def get_core_gene_vector(species_name):
    """Return the core genes of a species ordered by their numeric suffix.

    The gene names come from ``core_gene_utils.parse_core_genes``. Each name
    is split on ``'.'`` and the last field is parsed as an integer; the
    names are then arranged in ascending order of that integer.

    Args:
        species_name: Species identifier forwarded unchanged to
            ``core_gene_utils.parse_core_genes``.

    Returns:
        A NumPy array holding the gene names in sorted order.
    """
    gene_names = np.array(list(core_gene_utils.parse_core_genes(species_name)))
    # The trailing dot-separated field of each name is the sort key.
    numeric_suffixes = np.array(
        [int(gene.split('.')[-1]) for gene in gene_names])
    ordering = np.argsort(numeric_suffixes)
    return gene_names[ordering]
def main(between_host):
    """Parse SNPs for every desired species and pickle the results.

    For each entry of the module-level ``desired_species`` the function
    looks up the qualifying samples, runs ``parse_snps`` restricted to the
    species' core genes and the ``'4D'`` variant type, and writes three
    pickle files (``allele_counts_map.pickle``, ``found_samples.pickle`` and
    ``passed_sites_map.pickle``) into a per-species checkpoint directory.

    A species is skipped when it has no qualifying samples or when its
    checkpoint directory already exists.

    Args:
        between_host: When truthy, checkpoints go under
            ``between_hosts_checkpoints``; otherwise under
            ``within_hosts_checkpoints``. The flag is also forwarded to
            ``get_desired_samples``.
    """
    # Parse and save all the snps between QP hosts
    start_time = time.time()
    checkpoint_dirname = ('between_hosts_checkpoints' if between_host
                          else 'within_hosts_checkpoints')
    intermediate_file_path = os.path.join(config.analysis_directory,
                                          checkpoint_dirname)

    for species_name in desired_species:
        print("Start processing {}".format(species_name))

        desired_samples = get_desired_samples(species_name,
                                              between_host=between_host)
        if desired_samples is None or len(desired_samples) == 0:
            print("{} has no qualified samples".format(species_name))
            continue

        pickle_path = os.path.join(intermediate_file_path, species_name)
        if os.path.exists(pickle_path):
            print('{} already processed'.format(species_name))
            continue
        print('{} has not been processed'.format(species_name))
        # makedirs also creates the parent checkpoint directory on a first
        # run, where a plain mkdir would raise.
        # NOTE(review): the directory is created before parsing, so a failed
        # parse leaves an empty directory that a later run treats as
        # "already processed" -- confirm whether that is intended.
        os.makedirs(pickle_path)

        # Core genes are only needed once we know the species will be parsed.
        core_genes = core_gene_utils.parse_core_genes(species_name)
        found_samples, allele_counts_map, passed_sites_map, _ = parse_snps(
            species_name,
            allowed_samples=desired_samples,
            allowed_genes=core_genes,
            allowed_variant_types=['4D'])

        outputs = (
            ('allele_counts_map.pickle', allele_counts_map),
            ('found_samples.pickle', found_samples),
            ('passed_sites_map.pickle', passed_sites_map),
        )
        for filename, payload in outputs:
            # The context manager flushes and closes each file promptly.
            with open(os.path.join(pickle_path, filename), 'wb') as handle:
                pickle.dump(payload, handle)

        print("Done processing {} at {} min".format(
            species_name, (time.time() - start_time) / 60))
debug=debug) # Only consider one sample per person snp_samples = snp_samples[parse_midas_data.calculate_unique_samples( subject_sample_map, sample_list=snp_samples)] sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples)) if len(snp_samples) < min_sample_size: sys.stderr.write("Not enough haploid samples!\n") continue sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples)) sys.stderr.write("Loading core genes...\n") core_genes = core_gene_utils.parse_core_genes(species_name) non_shared_genes = core_gene_utils.parse_non_shared_reference_genes( species_name) shared_pangenome_genes = core_gene_utils.parse_shared_genes( species_name) sys.stderr.write("Done! Core genome consists of %d genes\n" % len(core_genes)) sys.stderr.write("%d shared genes and %d non-shared genes\n" % (len(shared_pangenome_genes), len(non_shared_genes))) sys.stderr.write( "Loading pre-computed substitution rates for %s...\n" % species_name) substitution_rate_map = calculate_substitution_rates.load_substitution_rate_map( species_name) sys.stderr.write("Calculating matrix...\n")