def filter_samples_by_attributes(root_folder_name,
                                 input_file,
                                 output_file,
                                 filter_specs,
                                 atts_and_variations,
                                 log_frequency=100000):
    """
    Utility to filter NCBI biosamples by attribute names and/or attribute values
    :param root_folder_name:
    :param input_file:
    :param output_file:
    :param filter_specs:
    :param atts_and_variations:
    :param log_frequency:
    :return:
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:
        relevant_atts_and_variations = filter_atts_and_variations(
            filter_specs, atts_and_variations)
        filter_samples(input_file, output_file, True, True,
                       relevant_atts_and_variations, log_frequency)
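
# Example usage (a minimal sketch, assuming filter_samples_by_attributes is in scope): the folder
# name and file paths below are hypothetical placeholders, filter_specs is left as an empty
# placeholder (see filter_atts_and_variations for the format it expects), and atts_and_variations
# is normally loaded from the project's attribute-variations configuration.
def _example_filter_samples_by_attributes():
    filter_samples_by_attributes(
        root_folder_name='ncbi_data',                    # hypothetical root folder
        input_file='data/biosample_set.xml.gz',          # hypothetical input path
        output_file='data/filtered_biosample_set.xml',   # hypothetical output path
        filter_specs=[],                                 # placeholder; use the project's filter specs
        atts_and_variations=[],                          # placeholder; use the project's config
        log_frequency=100000)
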
def export_samples_to_csv(root_folder_name,
                          input_file,
                          output_file,
                          filter_specs,
                          atts_and_variations,
                          log_frequency=1000):
    """
    Generates a simplified version of the samples in CSV and saves them to a file
    :param samples: samples in BioSamples's XML format
    :param attributes:
    :return:
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:
        # Attribute names and variations of the attributes to be exported. This filtering is needed
        # to aggregate the different variations of an attribute so that their values are shown under
        # the same column header.
        relevant_atts_and_variations = filter_utils.filter_atts_and_variations(
            filter_specs, atts_and_variations)

        # Read and export samples
        exported_samples = []

        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))

        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Attributes to be exported: ' + str(filter_specs))
        print('Processing NCBI samples...')
        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)

        processed_samples_count = 0

        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                processed_samples_count = processed_samples_count + 1

                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))

                exported_samples.append(
                    sample_to_json(node_xml, relevant_atts_and_variations))

        utils.save_json_to_csv(exported_samples, output_file)

        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(exported_samples)))
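
# Example usage (a minimal sketch, assuming export_samples_to_csv is in scope): all paths are
# hypothetical placeholders, and filter_specs / atts_and_variations are left as empty placeholders
# (their exact format is defined by filter_utils.filter_atts_and_variations, not shown here).
def _example_export_samples_to_csv():
    export_samples_to_csv(
        root_folder_name='ncbi_data',                    # hypothetical root folder
        input_file='data/filtered_biosample_set.xml',    # hypothetical input path
        output_file='data/biosamples.csv',               # hypothetical output path
        filter_specs=[],                                 # placeholder; attributes to export
        atts_and_variations=[],                          # placeholder; use the project's config
        log_frequency=1000)
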
def main():
    constants.BASE_FOLDER = utils.get_base_folder(constants.ROOT_FOLDER_NAME)
    execute = True
    if os.path.exists(OUTPUT_FILE):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:
        filter_utils.filter_samples(INPUT_FILE, OUTPUT_FILE, True, False)
def export_samples_to_json(root_folder_name,
                           input_file,
                           output_file,
                           log_frequency=1000):
    """
    Generates a direct translation of the samples from the BioSample's XML to JSON and saves them to a file
    :param root_folder_name:
    :param input_file:
    :param output_file:
    :param log_frequency:
    :return: It saves the samples to the output_file
    """

    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:

        # Array of sample dictionaries
        samples_dct = []

        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))

        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Processing NCBI samples...')

        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)

        processed_samples_count = 0

        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                sample_dct = xmltodict.parse(node_xml)
                samples_dct.append(sample_dct)

                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))

        with open(output_file, 'w') as f:
            json.dump(samples_dct, f)

        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(samples_dct)))
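
# Example usage (a minimal sketch, assuming export_samples_to_json is in scope; the folder name
# and paths are hypothetical placeholders):
def _example_export_samples_to_json():
    export_samples_to_json(
        root_folder_name='ncbi_data',                    # hypothetical root folder
        input_file='data/filtered_biosample_set.xml',    # hypothetical input path
        output_file='data/biosamples_raw.json',          # hypothetical output path
        log_frequency=1000)
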
def main():
    if DOWNLOAD_SAMPLES:
        print('== Downloading BioSamples ==')
        print('Source URL: ' + constants.NCBI_DOWNLOAD_URL)
        if not os.path.exists(constants.NCBI_SAMPLES_FOLDER_DEST):
            os.makedirs(constants.NCBI_SAMPLES_FOLDER_DEST)
        dest_path = os.path.join(constants.NCBI_SAMPLES_FOLDER_DEST,
                                 constants.NCBI_SAMPLES_FILE_DEST)
        print('Destination file: ' + dest_path)
        if os.path.exists(dest_path):
            if utils.confirm(
                    "The destination file already exist. Do you want to overwrite it [y/n]? "
            ):
                urllib.request.urlretrieve(constants.NCBI_DOWNLOAD_URL,
                                           dest_path,
                                           reporthook=utils.log_progress)
        else:
            urllib.request.urlretrieve(constants.NCBI_DOWNLOAD_URL,
                                       dest_path,
                                       reporthook=utils.log_progress)

    if DOWNLOAD_PROJECTS:
        print('== Downloading BioProject ==')
        print('Source URL: ' + constants.BIOPROJECT_DOWNLOAD_URL)
        if not os.path.exists(constants.BIOPROJECT_FOLDER_DEST):
            os.makedirs(constants.BIOPROJECT_FOLDER_DEST)
        dest_path = os.path.join(constants.BIOPROJECT_FOLDER_DEST,
                                 constants.BIOPROJECT_FILE_DEST)
        print('Destination file: ' + dest_path)
        if os.path.exists(dest_path):
            if utils.confirm(
                    "The destination file already exist. Do you want to overwrite it [y/n]? "
            ):
                urllib.request.urlretrieve(constants.BIOPROJECT_DOWNLOAD_URL,
                                           dest_path,
                                           reporthook=utils.log_progress)
        else:
            urllib.request.urlretrieve(constants.BIOPROJECT_DOWNLOAD_URL,
                                       dest_path,
                                       reporthook=utils.log_progress)
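
# The confirm-then-download logic in main() is duplicated for BioSamples and BioProject. A possible
# helper to factor it out (a sketch, not part of the original code; it reuses the same os,
# urllib.request, utils.confirm and utils.log_progress already used above):
def download_if_confirmed(url, dest_path):
    """Downloads url to dest_path, asking for confirmation before overwriting an existing file."""
    if os.path.exists(dest_path) and not utils.confirm(
            'The destination file already exists. Do you want to overwrite it [y/n]? '):
        return
    urllib.request.urlretrieve(url, dest_path, reporthook=utils.log_progress)

# Hypothetical usage: download_if_confirmed(constants.NCBI_DOWNLOAD_URL, dest_path)
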
def transform_and_export_projects_to_json(input_file,
                                          output_file,
                                          generate_dictionary,
                                          output_file_dictionary,
                                          log_frequency=100000):
    """
    Parses an XML file with multiple NCBI bioprojects and exports them to JSON
    :param input_file:
    :param output_file:
    :param generate_dictionary: Generates a dictionary where the keys are the bioproject accessions and exports it to JSON as well
    :param log_frequency:
    :return:
    """
    """
    
    :param input_file:
    :param output_file:
    :param log_frequency:
    :return:
    """
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:
        projects = []

        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))

        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Processing projects...')

        processed_project_count = 0

        # Read projects from XML file
        context = ET.iterparse(input_file, events=("start", "end"))

        # Turn it into an iterator
        context = iter(context)
        event, root = context.__next__()

        for event, elem in context:
            if event == "end" and elem.tag == "Package":
                processed_project_count = processed_project_count + 1
                project = BioProject()

                details_node = elem.find('Project')
                project_node = details_node.find('Project')
                submission_node = details_node.find('Submission')
                submission_description_node = submission_node.find(
                    'Description')

                project_id = project_node.find('ProjectID')
                archive_id = project_id.find('ArchiveID')
                ncbi_bio_project_accession = None

                project_description_node = project_node.find('ProjectDescr')

                if archive_id.get('accession'):
                    ncbi_bio_project_accession = archive_id.get('accession')
                else:
                    print('No accession available')

                if ncbi_bio_project_accession is not None:
                    project.bioprojectAccession = ncbi_bio_project_accession

                    project_name = project_description_node.find('Name')
                    project_title = project_description_node.find('Title')
                    # project_description = project_description_node.find('Description')

                    if project_name is not None:
                        project.projectName = project_name.text

                    if project_title is not None:
                        project.projectTitle = project_title.text

                    # Don't export the description. It's too long and we won't use it.
                    # if project_description is not None:
                    #     project.description = project_description.text

                    project_organizations_nodes = submission_description_node.findall(
                        'Organization')
                    project_organizations = []

                    if project_organizations_nodes is not None:
                        for organization_node in project_organizations_nodes:
                            organization = BioProjectOrganization()
                            organization_name = organization_node.find('Name')
                            organization_role = organization_node.get('role')
                            organization_type = organization_node.get('type')
                            organization_url = organization_node.get('url')

                            if organization_name is not None:
                                organization.name = organization_name.text

                            if organization_role is not None:
                                organization.role = organization_role

                            if organization_type is not None:
                                organization.type = organization_type

                            if organization_url is not None:
                                organization.url = organization_url

                            project_organizations.append(organization)

                    project.organizations = project_organizations

                    project_grants = project_description_node.findall('Grant')
                    project_pis = {}

                    if project_grants is not None:
                        for grant_node in project_grants:
                            pi_node = grant_node.find('PI')

                            if pi_node is not None:
                                pi_id = pi_node.get('userid')

                                if pi_id in project_pis:
                                    pi = project_pis.get(pi_id)
                                    affiliations = pi.affiliation
                                else:
                                    pi = BioProjectPI()
                                    pi.id = pi_id
                                    affiliations = []

                                pi_affiliation = pi_node.get('affil')
                                pi_first = pi_node.find('First')
                                pi_last = pi_node.find('Last')
                                pi_middle = pi_node.find('Middle')
                                pi_given = pi_node.find('Given')

                                if pi_affiliation is not None:
                                    affiliations.append(pi_affiliation)
                                    pi.affiliation = affiliations

                                if pi_first is not None:
                                    pi.first = pi_first.text

                                if pi_last is not None:
                                    pi.last = pi_last.text

                                if pi_middle is not None:
                                    pi.middle = pi_middle.text

                                if pi_given is not None:
                                    pi.given = pi_given.text

                                project_pis[pi_id] = pi

                    project.pis = list(project_pis.values())

                    projects.append(project)

                if processed_project_count % log_frequency == 0:
                    print('Processed projects: ' +
                          str(processed_project_count))

                root.clear()

        print('- Total projects processed: ' + str(processed_project_count))

        with open(output_file, 'w') as f:
            # json_string = json.dumps(biosamples, default=obj_dict)
            # print
            json.dump(projects, f, default=obj_dict)

        if generate_dictionary:
            projects_dict = {p.bioprojectAccession: p for p in projects}
            with open(output_file_dictionary, 'w') as f:
                json.dump(projects_dict, f, default=obj_dict)

        print('Finished processing projects')
        print('- Total projects processed: ' + str(processed_project_count))
        print('- Total projects exported: ' + str(len(projects)))
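
# Example usage (a minimal sketch, assuming transform_and_export_projects_to_json is in scope;
# all paths are hypothetical placeholders). With generate_dictionary=True, a second JSON file keyed
# by bioproject accession is written, which is the format expected later by
# transform_and_export_samples_to_json.
def _example_transform_and_export_projects():
    transform_and_export_projects_to_json(
        input_file='data/bioproject.xml',                     # hypothetical input path
        output_file='data/bioprojects.json',                  # hypothetical output path
        generate_dictionary=True,
        output_file_dictionary='data/bioprojects_dict.json',  # hypothetical output path
        log_frequency=100000)
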
def transform_and_export_samples_to_json(root_folder_name,
                                         input_file,
                                         output_file,
                                         insert_bioproject_info,
                                         projects_file,
                                         log_frequency=1000):
    """
       Parses an XML file with multiple NCBI biosamples and exports them to JSON. Optionally, inserts additional BioProject info.

    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            execute = False
    if execute:

        biosamples = []

        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))

        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        if insert_bioproject_info:
            print('Bioprojects input file: ' + projects_file)
        print('Processing NCBI samples...')

        processed_samples_count = 0

        # Read biosamples from XML file
        tree = ET.parse(input_file)
        root = tree.getroot()
        num_biosamples = len(list(root))

        if insert_bioproject_info:
            # Read bioprojects from JSON file
            with open(projects_file) as f:
                projects = json.load(f)

        print('Extracting all samples from file (no. samples: ' +
              str(num_biosamples) + ')')
        for child in root:

            biosample = NcbiBiosample()

            description_node = child.find('Description')
            attributes_node = child.find('Attributes')

            # sample identifiers
            sample_ids = child.find('Ids')
            for sample_id in sample_ids:
                value = sample_id.text
                if sample_id.get('db') == 'BioSample':
                    biosample.biosampleAccession = value

            # sample name
            for sample_id in sample_ids:
                if sample_id.get('db_label') == 'Sample name':
                    value = sample_id.text
                    biosample.sampleName = value

            # sample title
            if description_node is not None and description_node.find(
                    'Title') is not None:
                value = description_node.find('Title').text
                biosample.sampleTitle = value

            # bioproject accession
            links = child.find('Links')
            # Only look up the bioproject if the projects dictionary was loaded above
            if insert_bioproject_info and links is not None:
                for link in links:
                    if link.get('target') == 'bioproject':
                        prj_accession = link.get('label')

                        if prj_accession in projects:
                            biosample.bioprojectAccession = prj_accession
                            biosample.bioproject = copy.deepcopy(
                                projects.get(prj_accession))
                        # else:
                        #     print('Bioproject not found: ' + prj_accession)

            # organism
            if description_node is not None:
                organism_node = description_node.find('Organism')
                if organism_node is not None and organism_node.get(
                        'taxonomy_name') is not None:
                    value = organism_node.get('taxonomy_name')
                    biosample.organism = value

            # attributes
            biosample_attributes = []

            for att in attributes_node:
                biosample_attribute = NcbiBiosampleAttribute()

                if att.get('display_name') is not None:
                    att_name = att.get('display_name')
                else:
                    att_name = att.get('attribute_name')

                biosample_attribute.attributeName = att_name
                biosample_attribute.attributeValue = att.text

                biosample_attributes.append(biosample_attribute)

            biosample.attributes = biosample_attributes
            biosamples.append(biosample)
            processed_samples_count = processed_samples_count + 1

            # from pprint import pprint
            # pprint(vars(biosample))

        with open(output_file, 'w') as f:
            # json_string = json.dumps(biosamples, default=obj_dict)
            # print
            json.dump(biosamples, f, default=obj_dict)

        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(biosamples)))
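
# Example usage (a minimal sketch, assuming transform_and_export_samples_to_json is in scope; all
# paths are hypothetical placeholders). projects_file is assumed to be the accession-keyed
# dictionary produced by transform_and_export_projects_to_json with generate_dictionary=True.
def _example_transform_and_export_samples():
    transform_and_export_samples_to_json(
        root_folder_name='ncbi_data',                     # hypothetical root folder
        input_file='data/filtered_biosample_set.xml',     # hypothetical input path
        output_file='data/biosamples.json',               # hypothetical output path
        insert_bioproject_info=True,
        projects_file='data/bioprojects_dict.json',       # hypothetical projects dictionary
        log_frequency=1000)
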
def evaluate_annotations(annotation_evaluation_info_file_reviewed,
                         annotation_evaluation_results_file):
    """
    Evaluates the annotations generated
    :param annotation_evaluation_info_file_reviewed:
    :param annotation_evaluation_results_file:
    :return:
    """
    print(
        'Input file (file with info about the correctness of annotations): ' +
        annotation_evaluation_info_file_reviewed)
    print('Output file (evaluation results): ' +
          annotation_evaluation_results_file)

    run = True
    if os.path.exists(annotation_evaluation_results_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            run = False
    if run:
        if not os.path.exists(
                os.path.dirname(annotation_evaluation_results_file)):
            os.makedirs(os.path.dirname(annotation_evaluation_results_file))

        with open(annotation_evaluation_info_file_reviewed) as f:

            outcomes = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}

            results = {
                "ALL": copy.deepcopy(outcomes),
                "disease": copy.deepcopy(outcomes),
                "tissue": copy.deepcopy(outcomes),
                "cell type": copy.deepcopy(outcomes),
                "cell line": copy.deepcopy(outcomes),
                "sex": copy.deepcopy(outcomes)
            }

            annotations_info = json.load(f)
            for att_name in annotations_info['att-values']:
                for att_value in annotations_info['att-values'][att_name]:

                    count = annotations_info['att-values'][att_name][
                        att_value]['count']
                    outcome = annotations_info['att-values'][att_name][
                        att_value]['is-correct']

                    if outcome:
                        results[att_name][outcome] += count
                        results['ALL'][outcome] += count
                    # else:
                    #     print("Error: the outcome cannot be null. The execution has been stopped.")
                    #     sys.exit(1)
                    # Error

        # Save evaluation results
        with open(annotation_evaluation_results_file, 'w') as results_file:

            print('GENERAL INFO:', file=results_file)
            print('- Current date: ' + str(datetime.datetime.now()),
                  file=results_file)
            print('- Input file: ' +
                  os.path.abspath(annotation_evaluation_info_file_reviewed),
                  file=results_file)

            print("\nANNOTATION RESULTS: ", file=results_file)

            for results_item in results:
                print("\nResults for: " + results_item, file=results_file)
                for outcome in outcomes:
                    print("  " + outcome + ": " +
                          str(results[results_item][outcome]),
                          file=results_file)

                tp = results[results_item]["TP"]
                fp = results[results_item]["FP"]
                tn = results[results_item]["TN"]
                fn = results[results_item]["FN"]

                if tp == 0:
                    precision = 0
                    recall = 0
                    f_measure = 0
                else:
                    precision = tp / (tp + fp)
                    recall = tp / (tp + fn)
                    f_measure = (2 * precision * recall) / (precision + recall)

                print("  - Precision: " + '{:,.2f}'.format(precision),
                      file=results_file)
                print("  - Recall: " + '{:,.2f}'.format(recall),
                      file=results_file)
                print("  - F-Measure: " + '{:,.2f}'.format(f_measure),
                      file=results_file)
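
# A minimal sketch of the reviewed input file expected by evaluate_annotations. The structure is
# inferred from the parsing code above; the attribute names, values and counts are hypothetical,
# and 'is-correct' is assumed to hold one of the outcome labels TP/TN/FP/FN used in `results`.
def _example_evaluate_annotations():
    reviewed_info = {
        'att-values': {
            'disease': {
                'hepatitis c': {'count': 12, 'is-correct': 'TP'},
                'unknown': {'count': 3, 'is-correct': 'FN'}
            }
        }
    }
    with open('reviewed_annotations.json', 'w') as f:      # hypothetical path
        json.dump(reviewed_info, f)
    evaluate_annotations('reviewed_annotations.json',      # hypothetical input path
                         'evaluation_results.txt')         # hypothetical output path
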
def annotate_samples(input_file,
                     output_file,
                     prioritize_pref,
                     use_any_ontology_if_no_results,
                     ignore_values,
                     att_names_values_variations,
                     annotation_filter_specs,
                     preferred_terms_for_att_names,
                     preferred_ontologies_for_att_values,
                     preferred_ontologies_ordered,
                     annotation_cache_file_path,
                     evaluation_output_file,
                     regenerate_annotation_cache=False):
    """
    Annotates a list of BioSample samples in JSON format
    :param input_file:
    :param output_file:
    :return:
    """
    print('Input file (original samples): ' + input_file)
    print('Output file (annotated samples): ' + output_file)

    relevant_atts_and_variations = filter_utils.filter_atts_and_variations(
        annotation_filter_specs, att_names_values_variations)
    run = True
    if os.path.exists(output_file):
        if not utils.confirm(
                'The destination file already exists. Do you want to overwrite it [y/n]? '
        ):
            run = False
    if run:
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))

        with open(input_file) as f:
            original_samples = json.load(f)

        # Generate it if needed
        if regenerate_annotation_cache or not os.path.exists(
                annotation_cache_file_path):
            print('Generating annotation cache. Path: ' +
                  annotation_cache_file_path)
            build_annotation_cache(
                original_samples, relevant_atts_and_variations,
                preferred_terms_for_att_names,
                preferred_ontologies_for_att_values,
                preferred_ontologies_ordered, prioritize_pref,
                use_any_ontology_if_no_results, ignore_values,
                annotation_cache_file_path, evaluation_output_file)

        # Read annotation cache
        with open(annotation_cache_file_path) as f:
            annotation_cache = json.load(f)

        annotated_samples = []
        for sample in original_samples:
            annotated_sample = annotate_sample(sample, annotation_cache,
                                               relevant_atts_and_variations)
            annotated_samples.append(annotated_sample)

        with open(output_file, 'w') as f:
            json.dump(annotated_samples, f)

        print('Finished annotating NCBI samples')
        print('- Total samples processed: ' + str(len(original_samples)))
        print('- Total samples annotated and saved to output file: ' +
              str(len(annotated_samples)))
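
# Example usage (a minimal sketch, assuming annotate_samples is in scope). Every path and
# configuration value below is a hypothetical placeholder; the variation, filter and
# preferred-ontology settings are normally taken from the project's annotation configuration.
def _example_annotate_samples():
    annotate_samples(
        input_file='data/biosamples.json',                          # hypothetical input path
        output_file='data/biosamples_annotated.json',               # hypothetical output path
        prioritize_pref=True,
        use_any_ontology_if_no_results=True,
        ignore_values=['na', 'n/a', 'none'],                        # hypothetical ignore list
        att_names_values_variations=[],                             # placeholder; project config
        annotation_filter_specs=[],                                 # placeholder; project config
        preferred_terms_for_att_names={},                           # placeholder; project config
        preferred_ontologies_for_att_values={},                     # placeholder; project config
        preferred_ontologies_ordered=[],                            # placeholder; project config
        annotation_cache_file_path='data/annotation_cache.json',    # hypothetical path
        evaluation_output_file='data/annotation_eval_info.json',    # hypothetical path
        regenerate_annotation_cache=False)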