def load_GO_annotations_mapper(args):
    # Load the Gene Ontology -> Pfam annotation table
    logger.verbose(
        "Loading Pfam to GO Biological Process annotation mappings")
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])
    GO_to_pfam_mapping = file_utilities.load_json(args["go_to_pfam_filename"])
    logger.verbose(
        "Loading of Pfam to GO Biological Process annotation mappings done")
    return pfam_families, GO_to_pfam_mapping

def import_gene_coord_data(organism_name, args):
    coordinates_file, gene_to_chr_mapping_file = \
        file_utilities.get_coord_file_from_id(
            organism_name,
            args["parsed_coordinates_filename_suffix"],
            args["gene_locations_filename_suffix"],
            args["out_dir"])
    gene_coordinates = import_parsed_coordinates(coordinates_file)
    gene_to_chr_mapping = file_utilities.load_json(gene_to_chr_mapping_file)
    return gene_coordinates, gene_to_chr_mapping

def parse_coordinates(args, strand=False):
    logger.info("Started parsing assembly gene coordinates files")
    # Create a dictionary for the sizes of each genome.
    # If one is already present, import it and append any new items.
    if os.path.exists(args["genome_sizes_filename"]) and \
            args["force"] is False:
        try:
            genome_sizes_dictionary = manager.dict(
                file_utilities.load_json(args["genome_sizes_filename"]))
        except EOFError:
            genome_sizes_dictionary = manager.dict()
    else:
        genome_sizes_dictionary = manager.dict()

    # If every assembly is already covered, there is nothing left to parse
    if len(genome_sizes_dictionary) >= len(args["gbk_files_list"]):
        del args["gbk_files_list"]
        return genome_sizes_dictionary
    else:
        # Checks which files have already been parsed
        args["parsed_coordinates_file_list"] = file_utilities.get_file_list(
            args["parsed_coordinates_out_dir"],
            args["parsed_coordinates_filename_suffix"],
            verbose=True,
            error=False)

    # If there are files to process, start parsing them
    if len(args["gbk_files_list"]) > 0:
        parsed_coordinates_file_list = manager.list()
        multi = MyMultiProcess(
            threads=args["threads"],
            target=create_parsed_coordinate_dictionary,
            input=args["gbk_files_list"],
            args=[genome_sizes_dictionary, parsed_coordinates_file_list,
                  strand, args])
        multi.run()
        logger.info(
            "Parsed gene coordinates files for %s assemblies generated "
            "and saved in folder %s",
            len(args["gbk_files_list"]),
            args["parsed_coordinates_out_dir"])
        logger.info("Individual DNA molecule sizes saved in %s",
                    args["genome_sizes_filename"])
        args["parsed_coordinates_file_list"] = list(
            parsed_coordinates_file_list)
    return genome_sizes_dictionary

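# Illustrative sketch (not part of the pipeline): the shared-dictionary
# pattern that parse_coordinates() relies on. MyMultiProcess is this
# project's wrapper around multiprocessing; the example below reproduces
# the same idea with the standard library only, and the worker is a
# hypothetical stand-in for create_parsed_coordinate_dictionary.
def _example_shared_dict_worker(item, shared_sizes):
    shared_sizes[item] = len(item)  # stand-in for per-assembly sizes


def _example_shared_dict_pattern(items):
    import multiprocessing
    with multiprocessing.Manager() as mgr:
        shared_sizes = mgr.dict()  # proxy dict, visible to all workers
        with multiprocessing.Pool(processes=2) as pool:
            pool.starmap(_example_shared_dict_worker,
                         [(item, shared_sizes) for item in items])
        return dict(shared_sizes)  # copy out before the manager shuts down
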
def load_all_organisms_coordinates(args):
    logger.info("Loading domain coordinates for all organisms")
    bar = Bar("Loading organism data",
              max=len(args["pfam_annotations_file_list"]))
    all_gene_coordinates = {}
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])
    for pfam_annotations_file in args["pfam_annotations_file_list"]:
        organism_name = file_utilities.get_organism_from_file(
            pfam_annotations_file, args["pfam_annotations_filename_suffix"])
        genome_id, genome_size = args["genome_sizes"][organism_name][0]
        gene_coordinates, gene_to_chr_mapping = import_gene_coord_data(
            organism_name, args)
        gene_hits = import_pfam_hits(pfam_annotations_file,
                                     gene_to_chr_mapping,
                                     args["evalue"],
                                     pfam_families,
                                     genome_id,
                                     domain_centric=False)
        # Gene start/end coordinates, and each gene weighted by its share
        # of the total number of Pfam hits in the genome
        coords = np.zeros([len(gene_hits), 2], dtype=int)
        probs = np.zeros(len(gene_hits), dtype=float)
        total_prob = sum(len(hits) for hits in gene_hits.values())
        for index, gene in enumerate(gene_hits.keys()):
            coords[index] = np.array(
                [gene_coordinates[gene][0], gene_coordinates[gene][1]])
            probs[index] = len(gene_hits[gene]) / total_prob
        all_gene_coordinates[organism_name] = [
            compute_uniform_probability(float(genome_size) / 2),
            float(genome_size), coords, probs]
        bar.next()
    bar.finish()
    logger.info("Domain coordinates for all organisms loaded")
    return all_gene_coordinates

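# A small worked example (illustrative only) of the per-gene weighting used
# in load_all_organisms_coordinates(): each gene's sampling probability is
# its share of all Pfam hits. The gene names and hit lists are hypothetical.
def _example_gene_hit_weights():
    gene_hits = {"geneA": ["PF1", "PF2", "PF3"], "geneB": ["PF4"]}
    total = sum(len(hits) for hits in gene_hits.values())  # 4 hits in total
    return {gene: len(hits) / total for gene, hits in gene_hits.items()}
    # -> {"geneA": 0.75, "geneB": 0.25}
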
def compute_organisms_probabilities(args):
    """Aggregator that computes the exponential and uniform likelihoods
    of all single genomes, using multiprocessing."""
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])
    pfam_annotations_file_list = args["pfam_annotations_file_list"]
    n_organisms = len(pfam_annotations_file_list)
    if not os.path.exists(args["uniform_probabilities_filename"]) or \
            args["force"] is True:
        uniform_probabilities = manager.dict()
    else:
        uniform_probabilities = manager.dict(
            probabilities_files.load_uniform_probabilities(
                args["uniform_probabilities_filename"]))
    number_of_pairs = indexing_domain_pairs.index_all_pairs(
        pfam_annotations_file_list, pfam_families, args)
    logger.info("Started computing organism-specific clustering "
                "probabilities of Pfam domains")
    multi = MyMultiProcess(threads=args["threads"],
                           target=organism_domain_clustering_probabilities,
                           input=pfam_annotations_file_list,
                           args=[uniform_probabilities, pfam_families, args])
    multi.run()
    logger.info("Computation of organism-specific clustering probabilities "
                "of Pfam domains for %s organisms done", n_organisms)
    logger.info("All individual organism probabilities were saved in "
                "directory %s", args["individual_probabilities_dir"])
    return uniform_probabilities, number_of_pairs

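# Hedged usage sketch for compute_organisms_probabilities(). The `args`
# dictionary is normally assembled by init_scripts.create_arguments_dict();
# the literal values below are hypothetical placeholders showing only the
# keys this function reads directly:
#
#   args = {
#       "pfam_families_filename": "data/pfam_families.json",
#       "pfam_annotations_file_list": ["data/org1_pfam.txt"],
#       "uniform_probabilities_filename": "out/uniform.json",
#       "individual_probabilities_dir": "out/individual",
#       "force": False,
#       "threads": 4,
#       ...  # plus the keys used by the per-organism worker function
#   }
#   uniform_probs, n_pairs = compute_organisms_probabilities(args)
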
def load_uniform_probabilities(uniform_probability_filename):
    return file_utilities.load_json(uniform_probability_filename)

def import_parsed_coordinates(coord_file):
    """Import the ordered dictionary with the parsed genomic coordinates
    of each gene."""
    return file_utilities.load_json(coord_file)

def organism_domain_clustering_probabilities(pfam_annotations_file,
                                             uniform_probabilities,
                                             pfam_families, args):
    """Compute the exponential and uniform likelihoods for all domain
    pairs of a single genome."""
    organism_name = file_utilities.get_organism_from_file(
        pfam_annotations_file, args["pfam_annotations_filename_suffix"])
    # Output file
    probabilities_filename = os.path.join(
        args["individual_probabilities_dir"],
        organism_name + args["probabilities_filename_suffix"] + ".json")
    if not os.path.exists(probabilities_filename) or args["force"] is True:
        individual_indexes_filename = os.path.join(
            args["pairs_indexes_dir"],
            args["pairs_indexes_individual_template"].format(organism_name))
        all_domain_pair_indexes = file_utilities.load_json(
            individual_indexes_filename)
        # Copy step necessary because the dictionary is shared among
        # threads, and the following checks slow the script down when
        # run in multithread:
        genome_sizes_org = [args["genome_sizes"][organism_name][0]]
        genome_numeric_id = 0
        for genome_molecule in genome_sizes_org:
            genome_key = file_utilities.get_organism_id_from_name(
                organism_name, genome_numeric_id)
            logger.verbose("Started computing individual clustering "
                           "probabilities from file %s",
                           pfam_annotations_file)
            organism_probabilities = {}
            genome_size, genome_id = genome_molecule[1], genome_molecule[0]
            try:
                # Loaded once, then reused for the following DNA molecules
                gene_coordinates
            except NameError:
                # Loads gene coordinates for this chromosome
                gene_coordinates, gene_to_chr_mapping = \
                    load_genome_annotations.import_gene_coord_data(
                        organism_name, args)
            # Import Pfam annotations for this chromosome
            domain_hits = load_genome_annotations.import_pfam_hits(
                pfam_annotations_file, gene_to_chr_mapping, args["evalue"],
                pfam_families, genome_id)
            if len(domain_hits) > 0:
                # Saves the exponential probabilities for this chromosome
                organism_probabilities[genome_key] = compute_probabilities(
                    domain_hits, all_domain_pair_indexes, gene_coordinates,
                    genome_id, genome_size, pfam_annotations_file, args)
                # Computes the uniform probability for this chromosome
                uniform_probabilities[genome_key] = \
                    compute_uniform_probability(float(genome_size) / 2)
            genome_numeric_id += 1
            file_utilities.save_json(organism_probabilities,
                                     probabilities_filename)
            probabilities_files.save_uniform_probabilities(
                uniform_probabilities,
                args["uniform_probabilities_filename"])
            logger.verbose("Individual clustering probabilities from "
                           "file %s, DNA molecule %s computed",
                           pfam_annotations_file, genome_id)
        del all_domain_pair_indexes
    else:
        logger.verbose("Individual clustering probabilities from file %s "
                       "already present", pfam_annotations_file)

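# Note on the try/except NameError above: on the first DNA molecule the
# local name `gene_coordinates` is still unbound, so referencing it raises
# UnboundLocalError (a subclass of NameError) and the coordinates are
# loaded; later iterations reuse them. A more explicit sentinel-based
# equivalent of the same caching idiom (illustrative only, with a
# hypothetical loader callable):
def _example_lazy_load(molecules, load_once):
    gene_coordinates = None
    for molecule in molecules:
        if gene_coordinates is None:
            gene_coordinates = load_once()  # loaded once, reused afterwards
        yield molecule, gene_coordinates
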
def get_gene_pairs_distances(coordinates_file, gene_to_chr_mapping_file,
                             pfam_annotations_file, GO_to_pfam_mapping,
                             pfam_families, genome_id, genome_size, args,
                             return_annotations=False):
    # Name of the output file for pairwise distances of GO pairs (optional)
    if args["save_dist"] is True:
        GO_pairs_distances_file_name = file_utilities.new_suffix_file(
            pfam_annotations_file,
            args["pfam_annotations_filename_suffix"],
            "_GO" + args["pfam_annotations_filename_suffix"],
            args["single_distances_dir"],
            file_extension="")
        repeat_file = not os.path.exists(GO_pairs_distances_file_name)
    else:
        GO_pairs_distances_file_name = None
        repeat_file = True

    # If this file hasn't already been generated in a previous run:
    if repeat_file is True or args["force"] is True:
        logger.verbose("Started computing chromosomal distances of "
                       "GO-related Pfam domain pairs from file %s",
                       pfam_annotations_file)
        # Load the Pfam annotations for the genes in this genome, and
        # organise them according to their GO Biological Process annotations
        gene_coordinates = load_genome_annotations.import_parsed_coordinates(
            coordinates_file)
        domain_hits = load_genome_annotations.import_pfam_hits(
            pfam_annotations_file,
            file_utilities.load_json(gene_to_chr_mapping_file),
            args["evalue"], pfam_families, genome_id)
        # Get the list of all gene pairs in the genome with shared GO
        # annotations
        if return_annotations is False:
            GO_pairs = get_GO_pairs_set(domain_hits, GO_to_pfam_mapping)
        else:
            GO_pairs, GO_pairs_annotations = get_GO_pairs_set(
                domain_hits, GO_to_pfam_mapping, return_annotations)
        # Compute pairwise distances for gene pairs with shared GO
        # annotations
        GO_pairs_distances = genomic_pairwise_distances.pairwise_distances(
            GO_pairs, gene_coordinates, genome_size)
        # If we also want to look at non-GO pairs
        if args["non_go"] is True:
            all_pfam_genes = set(itertools.chain(*domain_hits.values()))
            all_pairs = itertools.product(all_pfam_genes, all_pfam_genes)
            other_pairs = list(set(all_pairs) - set(GO_pairs))
            other_pairs_distances = \
                genomic_pairwise_distances.pairwise_distances(
                    other_pairs, gene_coordinates, genome_size)
        if len(GO_pairs_distances) == 0:
            logger.verbose("No GO-related domain pairs for file %s "
                           "available", pfam_annotations_file)
        else:
            # Save results
            if args["save_dist"] is True:
                save_pairwise_distances_file(GO_pairs_distances_file_name,
                                             GO_pairs_distances)
                if args["non_go"] is True:
                    args["non_go_pairs_distances_file_name"] = \
                        GO_pairs_distances_file_name.replace("_GO",
                                                             "_not_GO")
                    save_pairwise_distances_file(
                        args["non_go_pairs_distances_file_name"],
                        other_pairs_distances)
            logger.verbose("Chromosomal distances of GO-related Pfam "
                           "domain pairs for file %s computed",
                           pfam_annotations_file)
    else:
        # The distances were computed in a previous run: reload them
        GO_pairs_distances = np.load(GO_pairs_distances_file_name)
        logger.verbose("Chromosomal distances of GO-related Pfam domain "
                       "pairs for file %s already computed",
                       pfam_annotations_file)

    if args["non_go"] is True:
        if return_annotations is True:
            return GO_pairs_distances, other_pairs_distances, \
                GO_pairs_annotations
        return GO_pairs_distances, other_pairs_distances
    if return_annotations is True:
        return GO_pairs_distances, GO_pairs_annotations
    return (GO_pairs_distances,)

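# Illustrative sketch of how get_gene_pairs_distances() derives non-GO
# pairs. Note that itertools.product(s, s) also yields self-pairs and both
# orderings of each pair; everything not in GO_pairs survives the set
# difference. The gene names and GO pairs below are hypothetical.
def _example_non_go_pairs():
    import itertools
    all_pfam_genes = {"g1", "g2", "g3"}
    GO_pairs = {("g1", "g2"), ("g2", "g1")}
    all_pairs = itertools.product(all_pfam_genes, all_pfam_genes)
    return sorted(set(all_pairs) - GO_pairs)
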
def compute_conserved_probabilities(probabilities_files_list,
                                    organisms_uniform_probabilities,
                                    n_pairs, tree, subclade_id,
                                    subclade_final_probabilities_filename,
                                    args):
    """Compute the final conserved clustering probabilities of all
    organisms whose probabilities are listed in probabilities_files_list."""
    if subclade_id == 0:
        subclade_id = "root"
    if args["verbose"] is False:
        logger.setLevel(logging.INFO)
    logger.verbose("Computing final conserved clustering probabilities "
                   "from subclade %s", subclade_id)
    # Initialise empty arrays for the results
    domain_pairs_probabilities = {
        'exp_prob': np.zeros(n_pairs),
        'uni_prob': np.zeros(n_pairs)
    }
    domain_pairs_data = {
        'occurrences': np.zeros(n_pairs, dtype=np.uint32),
        'n_genes1': np.zeros(n_pairs, dtype=np.uint32),
        'n_genes2': np.zeros(n_pairs, dtype=np.uint32),
        'fusions': np.zeros(n_pairs, dtype=np.uint16),
        'n_organisms': np.zeros(n_pairs, dtype=np.uint16)
    }
    if args["weight"] is True:
        organism_weights = GSC.GSC_normalised(tree)
        GSC.save_weights(organism_weights, args["weights_file"],
                         subclade_id, "a")
    for probabilities_filename in probabilities_files_list:
        logger.verbose("Adding individual clustering probabilities "
                       "from file %s", probabilities_filename)
        organism_name = file_utilities.get_organism_from_file(
            probabilities_filename, args["probabilities_filename_suffix"])
        organism_has_pair = np.zeros(n_pairs, dtype=np.uint8)
        if args["weight"] is True:
            organism_weight = organism_weights[organism_name]
        else:
            organism_weight = 1
        organism_probabilities = file_utilities.load_json(
            probabilities_filename)
        for genome_key in organism_probabilities.keys():
            add_organism_probabilities(
                organism_probabilities[genome_key],
                organisms_uniform_probabilities[genome_key],
                organism_weight, domain_pairs_probabilities,
                domain_pairs_data, organism_has_pair)
        domain_pairs_data["n_organisms"] = \
            domain_pairs_data["n_organisms"] + organism_has_pair
        del organism_has_pair
        logger.verbose("Individual clustering probabilities from file %s "
                       "added", probabilities_filename)
    domain_pairs_data["prob"] = compute_final_probabilities(
        domain_pairs_probabilities, args["phi"])
    del domain_pairs_probabilities
    logger.verbose("Saving final conserved clustering probabilities "
                   "from subclade %s", subclade_id)
    prob_files_utilities.save_conserved_probabilities(
        domain_pairs_data, subclade_final_probabilities_filename)
    logger.verbose("Final conserved clustering probabilities from "
                   "subclade %s saved in file %s", subclade_id,
                   subclade_final_probabilities_filename)

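# Illustrative sketch of the weighted accumulation pattern used in
# compute_conserved_probabilities(). The internals of
# add_organism_probabilities() are not shown in this module, so this is an
# assumption-laden reduction: per-organism probability vectors (two
# hypothetical domain pairs here) are added into shared totals, each scaled
# by its organism's phylogenetic weight.
def _example_weighted_accumulation():
    import numpy as np
    totals = np.zeros(2)
    per_organism = {"orgA": (np.array([0.5, 0.0]), 2.0),   # (probs, weight)
                    "orgB": (np.array([0.1, 0.3]), 1.0)}
    for probs, weight in per_organism.values():
        totals += weight * probs  # each organism contributes by its weight
    return totals  # array([1.1, 0.3])
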
def main(args):
    config_file = "global_config.conf"
    args = init_scripts.create_arguments_dict(args, config_file)
    if args["verbose"] is True:
        logger.setLevel(logging.VERBOSE)
    else:
        logger.setLevel(logging.INFO)

    # Check that the input directory exists
    data_dir = os.path.abspath(args["data_dir"])
    if not os.path.exists(data_dir):
        logger.error("Input directory %s does not exist", data_dir)
        raise ValueError("Input directory {} does not exist".format(data_dir))
    args["data_dir"] = data_dir
    logger.info("Input directory: %s", data_dir)
    init_scripts.check_input_files(args)

    # Check that at least one run mode was selected
    run_modes = ["fit_model", "predict", "permutations", "bootstrap",
                 "sensitivity"]
    parameters = [el for el in run_modes if args[el] is True]
    if len(parameters) == 0:
        logger.error("Select at least one of these options: --%s",
                     ", --".join(run_modes))
        raise ValueError("At least one of the --{} options has to be "
                         "selected".format(", --".join(run_modes)))
    logger.info("Clustering analysis will be run in the following "
                "mode(s): %s", ", ".join(parameters))
    logger.info("Number of threads: %s", args["threads"])

    # Create output directories
    init_scripts.create_output_directories(args)

    if args["weight"] is True or args["subclades"] is True:
        if not os.path.exists(args["tree_file"]):
            logger.error("Input phylogenetic tree file %s not found",
                         args["tree_file"])
            raise ValueError("Input phylogenetic tree file {} not found"
                             .format(args["tree_file"]))
        logger.info("Input phylogenetic tree file: %s", args["tree_file"])
        if args["weight"] is True:
            logger.info("Conserved clustering probabilities will be "
                        "weighted using phylogenetic distances")
        if args["subclades"] is True:
            logger.info("Conserved clustering probabilities will be "
                        "computed for each subclade")
        else:
            logger.info("Conserved clustering probabilities will be "
                        "computed just for the root node")

    # Check if the parsed coordinates files already exist;
    # if not, create them from the GenBank (gbk) files
    if args["fit_model"] is True or args["predict"] is True:
        args["genome_sizes"] = gbk_to_fast_coordinates.parse_coordinates(args)
    else:
        args["genome_sizes"] = load_json(args["genome_sizes_filename"])

    if args["fit_model"] is True:
        args["lambd"], args["phi"] = \
            fit_clustering_model.fit_clustering_model(args)
    elif os.path.exists(args["general_parameters_filename"]):
        args["lambd"], args["phi"] = import_export_parameters \
            .import_parameters(args["general_parameters_filename"])

    if args["bootstrap"] is True:
        bootstrap.bootstrap_parameters(args)
    if args["sensitivity"] is True:
        bootstrap.sensitivity_analysis(args)

    if args["predict"] is True:
        if args["fit_model"] is False:
            logger.info("Imported global mean clustering estimated "
                        "parameter values are lambda={:.3g}, phi={:.3g}"
                        .format(args["lambd"], args["phi"]))
        else:
            logger.info("Using fitted global mean clustering estimated "
                        "parameter values lambda={:.3g}, phi={:.3g}"
                        .format(args["lambd"], args["phi"]))
        predict.predict_clustering_pairs(args)

    if args["permutations"] is True:
        if os.path.exists(args["subclades_dir"]):
            permute.predict_permuted_clustering_pairs(args)
        else:
            logger.error("Permutations are accepted only for "
                         "subclade-specific clustering analysis.")

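# Hedged usage sketch: main() expects an argument mapping that
# create_arguments_dict() merges with global_config.conf. A typical entry
# point might look like the following (the argument-parsing details are
# hypothetical; only flags actually read by main() are shown):
#
#   if __name__ == "__main__":
#       import argparse
#       parser = argparse.ArgumentParser()
#       parser.add_argument("--data_dir", required=True)
#       parser.add_argument("--fit_model", action="store_true")
#       parser.add_argument("--predict", action="store_true")
#       ...
#       main(vars(parser.parse_args()))
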
def import_genome_sizes(genome_sizes_file):
    return file_utilities.load_json(genome_sizes_file)