def filter_samples_by_attributes(root_folder_name, input_file, output_file, filter_specs,
                                 atts_and_variations, log_frequency=100000):
    """
    Utility to filter NCBI biosamples by attribute names and/or attribute values
    :param root_folder_name: name of the root folder used to resolve the base path
    :param input_file: path to the XML (or .gz) file with the original biosamples
    :param output_file: path of the file where the filtered samples will be saved
    :param filter_specs: specification of the attribute names/values to filter by
    :param atts_and_variations: known attribute names and their variations
    :param log_frequency: number of samples between progress messages
    :return:
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        relevant_atts_and_variations = filter_atts_and_variations(filter_specs, atts_and_variations)
        filter_samples(input_file, output_file, True, True, relevant_atts_and_variations, log_frequency)

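# utils.confirm is the project's console prompt helper, used throughout to guard
# against overwriting existing output files. Its actual implementation lives in
# the utils module; a minimal sketch of the assumed behavior looks like this
# (hypothetical implementation, shown for readability only):
def confirm_sketch(prompt):
    """Ask a y/n question on stdin and return True only on an explicit yes."""
    answer = input(prompt).strip().lower()
    return answer in ('y', 'yes')
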
def export_samples_to_csv(root_folder_name, input_file, output_file, filter_specs,
                          atts_and_variations, log_frequency=1000):
    """
    Generates a simplified version of the samples in CSV and saves them to a file
    :param root_folder_name: name of the root folder used to resolve the base path
    :param input_file: path to the file with the samples, in BioSample's XML format
    :param output_file: path of the CSV file to be generated
    :param filter_specs: specification of the attributes to be exported
    :param atts_and_variations: known attribute names and their variations
    :param log_frequency: number of samples between progress messages
    :return:
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        # Attribute names and variations of the attributes to be exported. We need this to aggregate
        # the different variations of an attribute so that its values are shown under the same
        # column header
        relevant_atts_and_variations = filter_utils.filter_atts_and_variations(filter_specs,
                                                                               atts_and_variations)
        # Read and export samples
        exported_samples = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Attributes to be exported: ' + str(filter_specs))
        print('Processing NCBI samples...')
        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)
        processed_samples_count = 0
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))
                exported_samples.append(sample_to_json(node_xml, relevant_atts_and_variations))
        utils.save_json_to_csv(exported_samples, output_file)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(exported_samples)))

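# export_samples_to_csv relies on utils.save_json_to_csv to flatten the list of
# sample dictionaries into rows. A minimal sketch of that helper, assuming each
# exported sample is a flat dict whose keys become column headers (hypothetical
# implementation, not the project's actual code):
import csv

def save_json_to_csv_sketch(dicts, output_file):
    """Write a list of flat dictionaries to a CSV file, one dict per row."""
    if not dicts:
        return
    # Collect the union of keys so rows with missing attributes still align;
    # DictWriter fills absent fields with its default restval ('').
    fieldnames = []
    for d in dicts:
        for key in d:
            if key not in fieldnames:
                fieldnames.append(key)
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(dicts)
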
def main():
    constants.BASE_FOLDER = utils.get_base_folder(constants.ROOT_FOLDER_NAME)
    execute = True
    if os.path.exists(OUTPUT_FILE):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        filter_utils.filter_samples(INPUT_FILE, OUTPUT_FILE, True, False)

def export_samples_to_json(root_folder_name, input_file, output_file, log_frequency=1000):
    """
    Generates a direct translation of the samples from BioSample's XML to JSON and saves them to a file
    :param root_folder_name: name of the root folder used to resolve the base path
    :param input_file: path to the XML (or .gz) file with the samples
    :param output_file: path of the JSON file to be generated
    :param log_frequency: number of samples between progress messages
    :return: It saves the samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        # Array of sample dictionaries
        samples_dct = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Processing NCBI samples...')
        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)
        processed_samples_count = 0
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                sample_dct = xmltodict.parse(node_xml)
                samples_dct.append(sample_dct)
                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))
        with open(output_file, 'w') as f:
            json.dump(samples_dct, f)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(samples_dct)))

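# Both exporters stream samples through utils.read_xml_or_gz_file. Judging by
# the 'START_ELEMENT' events and expandNode calls above, it wraps
# xml.dom.pulldom, which lets us parse multi-gigabyte BioSample dumps one
# <BioSample> element at a time instead of loading the whole tree. A plausible
# sketch, assuming gzip detection by file extension (hypothetical
# implementation; the real helper lives in utils):
import gzip
from xml.dom import pulldom

def read_xml_or_gz_file_sketch(file_path):
    """Return a pulldom event stream for a plain or gzip-compressed XML file."""
    if file_path.endswith('.gz'):
        return pulldom.parse(gzip.open(file_path, 'rb'))
    return pulldom.parse(file_path)
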
def main():
    if DOWNLOAD_SAMPLES:
        print('== Downloading BioSamples ==')
        print('Source URL: ' + constants.NCBI_DOWNLOAD_URL)
        if not os.path.exists(constants.NCBI_SAMPLES_FOLDER_DEST):
            os.makedirs(constants.NCBI_SAMPLES_FOLDER_DEST)
        dest_path = os.path.join(constants.NCBI_SAMPLES_FOLDER_DEST, constants.NCBI_SAMPLES_FILE_DEST)
        print('Destination file: ' + dest_path)
        if os.path.exists(dest_path):
            if utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
                urllib.request.urlretrieve(constants.NCBI_DOWNLOAD_URL, dest_path,
                                           reporthook=utils.log_progress)
        else:
            urllib.request.urlretrieve(constants.NCBI_DOWNLOAD_URL, dest_path,
                                       reporthook=utils.log_progress)
    if DOWNLOAD_PROJECTS:
        print('== Downloading BioProject ==')
        print('Source URL: ' + constants.BIOPROJECT_DOWNLOAD_URL)
        if not os.path.exists(constants.BIOPROJECT_FOLDER_DEST):
            os.makedirs(constants.BIOPROJECT_FOLDER_DEST)
        dest_path = os.path.join(constants.BIOPROJECT_FOLDER_DEST, constants.BIOPROJECT_FILE_DEST)
        print('Destination file: ' + dest_path)
        if os.path.exists(dest_path):
            if utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
                urllib.request.urlretrieve(constants.BIOPROJECT_DOWNLOAD_URL, dest_path,
                                           reporthook=utils.log_progress)
        else:
            urllib.request.urlretrieve(constants.BIOPROJECT_DOWNLOAD_URL, dest_path,
                                       reporthook=utils.log_progress)

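# urllib.request.urlretrieve calls its reporthook with
# (block_num, block_size, total_size), so utils.log_progress presumably
# follows that contract. A minimal sketch of such a hook (hypothetical
# implementation; the project's real helper lives in utils):
import sys

def log_progress_sketch(block_num, block_size, total_size):
    """Print download progress; total_size is -1 when the server omits it."""
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(100.0, downloaded * 100.0 / total_size)
        sys.stdout.write('\rDownloaded: {:.1f}%'.format(percent))
    else:
        sys.stdout.write('\rDownloaded: {} bytes'.format(downloaded))
    sys.stdout.flush()
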
def transform_and_export_projects_to_json(input_file, output_file, generate_dictionary,
                                          output_file_dictionary, log_frequency=100000):
    """
    Parses an XML file with multiple NCBI bioprojects and exports them to JSON
    :param input_file: path to the XML file with the bioprojects
    :param output_file: path of the JSON file to be generated
    :param generate_dictionary: generates a dictionary where the keys are the bioproject accessions
    and exports it to JSON as well
    :param output_file_dictionary: path of the JSON dictionary file to be generated
    :param log_frequency: number of projects between progress messages
    :return:
    """
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        projects = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Processing projects...')
        processed_project_count = 0
        # Read projects from XML file
        context = ET.iterparse(input_file, events=("start", "end"))
        # Turn it into an iterator
        context = iter(context)
        event, root = context.__next__()
        for event, elem in context:
            if event == "end" and elem.tag == "Package":
                processed_project_count = processed_project_count + 1
                project = BioProject()
                details_node = elem.find('Project')
                project_node = details_node.find('Project')
                submission_node = details_node.find('Submission')
                submission_description_node = submission_node.find('Description')
                project_id = project_node.find('ProjectID')
                archive_id = project_id.find('ArchiveID')
                ncbi_bio_project_accession = None
                project_description_node = project_node.find('ProjectDescr')
                if archive_id.get('accession'):
                    ncbi_bio_project_accession = archive_id.get('accession')
                else:
                    print('No accession available')
                if ncbi_bio_project_accession is not None:
                    project.bioprojectAccession = ncbi_bio_project_accession
                    project_name = project_description_node.find('Name')
                    project_title = project_description_node.find('Title')
                    if project_name is not None:
                        project.projectName = project_name.text
                    if project_title is not None:
                        project.projectTitle = project_title.text
                    # Don't export the description. It's too long and we won't use it.
                    # project_description = project_description_node.find('Description')
                    # if project_description is not None:
                    #     project.description = project_description.text
                    project_organizations_nodes = submission_description_node.findall('Organization')
                    project_organizations = []
                    if project_organizations_nodes is not None:
                        for organization_node in project_organizations_nodes:
                            organization = BioProjectOrganization()
                            organization_name = organization_node.find('Name')
                            organization_role = organization_node.get('role')
                            organization_type = organization_node.get('type')
                            organization_url = organization_node.get('url')
                            if organization_name is not None:
                                organization.name = organization_name.text
                            if organization_role is not None:
                                organization.role = organization_role
                            if organization_type is not None:
                                organization.type = organization_type
                            if organization_url is not None:
                                organization.url = organization_url
                            project_organizations.append(organization)
                        project.organizations = project_organizations
                    project_grants = project_description_node.findall('Grant')
                    project_pis = {}
                    if project_grants is not None:
                        for grant_node in project_grants:
                            pi_node = grant_node.find('PI')
                            if pi_node is not None:
                                pi_id = pi_node.get('userid')
                                if pi_id in project_pis:
                                    # Same PI seen in a previous grant; extend their affiliations
                                    pi = project_pis.get(pi_id)
                                    affiliations = pi.affiliation
                                else:
                                    pi = BioProjectPI()
                                    pi.id = pi_id
                                    affiliations = []
                                pi_affiliation = pi_node.get('affil')
                                pi_first = pi_node.find('First')
                                pi_last = pi_node.find('Last')
                                pi_middle = pi_node.find('Middle')
                                pi_given = pi_node.find('Given')
                                if pi_affiliation is not None:
                                    affiliations.append(pi_affiliation)
                                pi.affiliation = affiliations
                                if pi_first is not None:
                                    pi.first = pi_first.text
                                if pi_last is not None:
                                    pi.last = pi_last.text
                                if pi_middle is not None:
                                    pi.middle = pi_middle.text
                                if pi_given is not None:
                                    pi.given = pi_given.text
                                project_pis[pi_id] = pi
                    project.pis = list(project_pis.values())
                    projects.append(project)
                if processed_project_count % log_frequency == 0:
                    print('Processed projects: ' + str(processed_project_count))
                # Free the processed subtree to keep memory usage flat
                root.clear()
        with open(output_file, 'w') as f:
            json.dump(projects, f, default=obj_dict)
        if generate_dictionary:
            projects_dict = {p.bioprojectAccession: p for p in projects}
            with open(output_file_dictionary, 'w') as f:
                json.dump(projects_dict, f, default=obj_dict)
        print('Finished processing projects')
        print('- Total projects processed: ' + str(processed_project_count))
        print('- Total projects exported: ' + str(len(projects)))

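# json.dump cannot serialize BioProject/BioProjectPI/BioProjectOrganization
# instances directly; the default=obj_dict hook converts them. The conventional
# implementation of such a hook, assumed here since obj_dict is defined
# elsewhere in the project, simply exposes the instance's attribute dictionary:
def obj_dict_sketch(obj):
    """Serialize an arbitrary object as the dict of its instance attributes."""
    return obj.__dict__
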
def transform_and_export_samples_to_json(root_folder_name, input_file, output_file,
                                         insert_bioproject_info, projects_file, log_frequency=1000):
    """
    Parses an XML file with multiple NCBI biosamples and exports them to JSON. Optionally, inserts
    additional BioProject info.
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        biosamples = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        if insert_bioproject_info:
            print('Bioprojects input file: ' + projects_file)
        print('Processing NCBI samples...')
        processed_samples_count = 0
        # Read biosamples from XML file
        tree = ET.parse(input_file)
        root = tree.getroot()
        num_biosamples = len(list(root))
        projects = {}
        if insert_bioproject_info:
            # Read bioprojects from JSON file
            with open(projects_file) as f:
                projects = json.load(f)
        print('Extracting all samples from file (no. samples: ' + str(num_biosamples) + ')')
        for child in root:
            biosample = NcbiBiosample()
            description_node = child.find('Description')
            attributes_node = child.find('Attributes')
            # sample identifiers
            sample_ids = child.find('Ids')
            for sample_id in sample_ids:
                value = sample_id.text
                if sample_id.get('db') == 'BioSample':
                    biosample.biosampleAccession = value
            # sample name
            for sample_id in sample_ids:
                if sample_id.get('db_label') == 'Sample name':
                    value = sample_id.text
                    biosample.sampleName = value
            # sample title
            if description_node is not None and description_node.find('Title') is not None:
                value = description_node.find('Title').text
                biosample.sampleTitle = value
            # bioproject accession
            links = child.find('Links')
            if links is not None:
                for link in links:
                    if link.get('target') == 'bioproject':
                        prj_accession = link.get('label')
                        if prj_accession in projects:
                            biosample.bioprojectAccession = prj_accession
                            biosample.bioproject = copy.deepcopy(projects.get(prj_accession))
                        # else:
                        #     print('Bioproject not found: ' + prj_accession)
            # organism
            if description_node is not None:
                organism_node = description_node.find('Organism')
                if organism_node is not None and organism_node.get('taxonomy_name') is not None:
                    value = organism_node.get('taxonomy_name')
                    biosample.organism = value
            # attributes
            biosample_attributes = []
            if attributes_node is not None:
                for att in attributes_node:
                    biosample_attribute = NcbiBiosampleAttribute()
                    if att.get('display_name') is not None:
                        att_name = att.get('display_name')
                    else:
                        att_name = att.get('attribute_name')
                    biosample_attribute.attributeName = att_name
                    biosample_attribute.attributeValue = att.text
                    biosample_attributes.append(biosample_attribute)
            biosample.attributes = biosample_attributes
            biosamples.append(biosample)
            processed_samples_count = processed_samples_count + 1
            if processed_samples_count % log_frequency == 0:
                print('Processed samples: ' + str(processed_samples_count))
        with open(output_file, 'w') as f:
            json.dump(biosamples, f, default=obj_dict)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(biosamples)))

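# The NcbiBiosample and NcbiBiosampleAttribute models referenced above are
# simple attribute containers defined elsewhere in the project. Sketches of
# their assumed shapes, inferred from the attribute assignments in the
# exporter (hypothetical; shown only so the function reads in isolation):
class NcbiBiosampleSketch:
    def __init__(self):
        self.biosampleAccession = None
        self.sampleName = None
        self.sampleTitle = None
        self.bioprojectAccession = None
        self.bioproject = None
        self.organism = None
        self.attributes = []

class NcbiBiosampleAttributeSketch:
    def __init__(self):
        self.attributeName = None
        self.attributeValue = None
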
def evaluate_annotations(annotation_evaluation_info_file_reviewed, annotation_evaluation_results_file):
    """
    Evaluates the annotations generated
    :param annotation_evaluation_info_file_reviewed: file with info about the correctness of the annotations
    :param annotation_evaluation_results_file: file where the evaluation results will be saved
    :return:
    """
    print('Input file (file with info about the correctness of annotations): '
          + annotation_evaluation_info_file_reviewed)
    print('Output file (evaluation results): ' + annotation_evaluation_results_file)
    run = True
    if os.path.exists(annotation_evaluation_results_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            run = False
    if run:
        if not os.path.exists(os.path.dirname(annotation_evaluation_results_file)):
            os.makedirs(os.path.dirname(annotation_evaluation_results_file))
        with open(annotation_evaluation_info_file_reviewed) as f:
            outcomes = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}
            results = {
                "ALL": copy.deepcopy(outcomes),
                "disease": copy.deepcopy(outcomes),
                "tissue": copy.deepcopy(outcomes),
                "cell type": copy.deepcopy(outcomes),
                "cell line": copy.deepcopy(outcomes),
                "sex": copy.deepcopy(outcomes)
            }
            annotations_info = json.load(f)
            for att_name in annotations_info['att-values']:
                for att_value in annotations_info['att-values'][att_name]:
                    count = annotations_info['att-values'][att_name][att_value]['count']
                    outcome = annotations_info['att-values'][att_name][att_value]['is-correct']
                    if outcome:
                        results[att_name][outcome] = results[att_name][outcome] + count
                        results['ALL'][outcome] = results['ALL'][outcome] + count
                        outcome = None
                    # else:
                    #     print("Error: the outcome cannot be null. The execution has been stopped.")
                    #     sys.exit(1)  # Error
        # Save evaluation results
        with open(annotation_evaluation_results_file, 'w') as results_file:
            print('GENERAL INFO:', file=results_file)
            print('- Current date: ' + str(datetime.datetime.now()), file=results_file)
            print('- Input file: ' + os.path.abspath(annotation_evaluation_info_file_reviewed),
                  file=results_file)
            print("\nANNOTATION RESULTS: ", file=results_file)
            for results_item in results:
                print("\nResults for: " + results_item, file=results_file)
                for outcome in outcomes:
                    print("  " + outcome + ": " + str(results[results_item][outcome]), file=results_file)
                tp = results[results_item]["TP"]
                fp = results[results_item]["FP"]
                tn = results[results_item]["TN"]
                fn = results[results_item]["FN"]
                if tp == 0:
                    # No true positives: precision and recall are 0 by convention
                    # (this also avoids division by zero)
                    precision = 0
                    recall = 0
                    f_measure = 0
                else:
                    precision = tp / (tp + fp)
                    recall = tp / (tp + fn)
                    f_measure = (2 * precision * recall) / (precision + recall)
                print("  - Precision: " + '{:,.2f}'.format(precision), file=results_file)
                print("  - Recall: " + '{:,.2f}'.format(recall), file=results_file)
                print("  - F-Measure: " + '{:,.2f}'.format(f_measure), file=results_file)

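# Quick sanity check of the metric formulas used in evaluate_annotations
# (pure arithmetic, no project dependencies): with TP=8, FP=2, FN=2 we expect
# precision 0.80, recall 0.80 and F-measure 0.80.
def _metrics_example():
    tp, fp, fn = 8, 2, 2
    precision = tp / (tp + fp)                                    # 8 / 10 = 0.80
    recall = tp / (tp + fn)                                       # 8 / 10 = 0.80
    f_measure = (2 * precision * recall) / (precision + recall)   # 1.28 / 1.6 = 0.80
    assert abs(f_measure - 0.80) < 1e-9
    return precision, recall, f_measure
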
def annotate_samples(input_file, output_file, prioritize_pref, use_any_ontology_if_no_results,
                     ignore_values, att_names_values_variations, annotation_filter_specs,
                     preferred_terms_for_att_names, preferred_ontologies_for_att_values,
                     preferred_ontologies_ordered, annotation_cache_file_path,
                     evaluation_output_file, regenerate_annotation_cache=False):
    """
    Annotates a list of BioSample samples in JSON format
    :param input_file: path to the JSON file with the original samples
    :param output_file: path of the JSON file where the annotated samples will be saved
    :param prioritize_pref: whether the preferred ontologies take priority when annotating
    :param use_any_ontology_if_no_results: fall back to any ontology when the preferred ones return nothing
    :param ignore_values: attribute values that should not be annotated
    :param att_names_values_variations: known attribute names and their variations
    :param annotation_filter_specs: specification of the attributes to be annotated
    :param preferred_terms_for_att_names: preferred ontology terms for attribute names
    :param preferred_ontologies_for_att_values: preferred ontologies for attribute values
    :param preferred_ontologies_ordered: preferred ontologies, in priority order
    :param annotation_cache_file_path: path of the annotation cache file
    :param evaluation_output_file: path of the file with annotation evaluation info
    :param regenerate_annotation_cache: force regeneration of the annotation cache
    :return:
    """
    print('Input file (original samples): ' + input_file)
    print('Output file (annotated samples): ' + output_file)
    relevant_atts_and_variations = filter_utils.filter_atts_and_variations(annotation_filter_specs,
                                                                           att_names_values_variations)
    run = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            run = False
    if run:
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        with open(input_file) as f:
            original_samples = json.load(f)
        # Generate the annotation cache if needed
        if regenerate_annotation_cache or not os.path.exists(annotation_cache_file_path):
            print('Generating annotation cache. Path: ' + annotation_cache_file_path)
            build_annotation_cache(original_samples, relevant_atts_and_variations,
                                   preferred_terms_for_att_names, preferred_ontologies_for_att_values,
                                   preferred_ontologies_ordered, prioritize_pref,
                                   use_any_ontology_if_no_results, ignore_values,
                                   annotation_cache_file_path, evaluation_output_file)
        # Read annotation cache
        with open(annotation_cache_file_path) as f:
            annotation_cache = json.load(f)
        annotated_samples = []
        for sample in original_samples:
            annotated_sample = annotate_sample(sample, annotation_cache, relevant_atts_and_variations)
            annotated_samples.append(annotated_sample)
        with open(output_file, 'w') as f:
            json.dump(annotated_samples, f)
        print('Finished annotating NCBI samples')
        print('- Total samples processed: ' + str(len(original_samples)))
        print('- Total samples annotated and saved to output file: ' + str(len(annotated_samples)))

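# Example invocation of annotate_samples. All paths and argument values below
# are hypothetical, a sketch of how a driver script might wire the annotation
# step together rather than the project's actual configuration:
#
#   annotate_samples(
#       input_file='workspace/samples_filtered.json',
#       output_file='workspace/samples_annotated.json',
#       prioritize_pref=True,
#       use_any_ontology_if_no_results=True,
#       ignore_values=['not applicable', 'missing'],
#       att_names_values_variations=att_names_values_variations,
#       annotation_filter_specs=annotation_filter_specs,
#       preferred_terms_for_att_names=preferred_terms_for_att_names,
#       preferred_ontologies_for_att_values=preferred_ontologies_for_att_values,
#       preferred_ontologies_ordered=['EFO', 'DOID'],
#       annotation_cache_file_path='workspace/annotation_cache.json',
#       evaluation_output_file='workspace/annotation_evaluation.json',
#       regenerate_annotation_cache=False)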