def semantic_validation(self):
        """
        Validate the meaning of the metadata rather than its structure.

        Two checks are performed:

        * every distinct, non-empty ``Reference`` found in the ``Analysis`` rows must
          resolve to exactly one accession;
        * every distinct (``Tax Id``, ``Scientific Name``) pair found in the ``Sample``
          rows (rows without a ``Tax Id`` are ignored) must agree with the name returned
          by ``get_scientific_name_from_ensembl`` for that taxonomy.

        Nothing is returned: problems are appended to ``self.error_list``. Lookup
        failures (``ValueError`` / ``HTTPError``) are additionally reported through
        ``self.error``.
        """
        # Check if the references can be retrieved
        distinct_references = {
            row['Reference'] for row in self.metadata['Analysis'] if row['Reference']
        }
        for reference in distinct_references:
            accessions = retrieve_genbank_assembly_accessions_from_ncbi(reference)
            nb_accessions = len(accessions)
            if nb_accessions == 1:
                # Exactly one accession is the only acceptable outcome
                continue
            if nb_accessions == 0:
                self.error_list.append(f'In Analysis, Reference {reference} did not resolve to any accession')
            else:
                self.error_list.append(f'In Analysis, Reference {reference} resolve to more than one accession: {accessions}')

        # Check taxonomy scientific name pair
        distinct_taxonomy_pairs = {
            (row['Tax Id'], row['Scientific Name'])
            for row in self.metadata['Sample']
            if row['Tax Id']
        }
        for taxid, species in distinct_taxonomy_pairs:
            try:
                expected_name = get_scientific_name_from_ensembl(int(taxid))
            except (ValueError, HTTPError) as lookup_error:
                # Both failure modes are handled the same way: log and record the message
                message = str(lookup_error)
                self.error(message)
                self.error_list.append(message)
            else:
                if species != expected_name:
                    self.error_list.append(
                        f'In Samples, Taxonomy {taxid} and scientific name {species} are inconsistent')
    def detect_metadata_attributes(self):
        """
        Read the metadata spreadsheet and store analyses and taxonomy in the eload config.

        For every analysis row, the ``Reference`` text is resolved to assembly accessions
        with ``resolve_accession_from_text``. When several accessions are found, the last
        one in sorted order is used and a warning is logged. Analyses without a usable
        reference are reported through ``self.error`` and are not registered.

        Every file row whose ``File Type`` is ``vcf`` is attached to its analysis. A VCF
        row pointing to an analysis that was not registered is reported through
        ``self.error`` and skipped instead of raising ``KeyError``.

        The project ``Tax ID`` must be an int or a string of digits (surrounding
        whitespace is tolerated); it is stored as an int together with the scientific
        name returned by ``get_scientific_name_from_ensembl``. Any other value is
        reported through ``self.error``.
        """
        eva_metadata = EvaXlsxReader(
            self.eload_cfg.query('submission', 'metadata_spreadsheet'))
        analysis_reference = {}
        for analysis in eva_metadata.analysis:
            analysis_alias = analysis.get('Analysis Alias')
            reference_txt = analysis.get('Reference')
            assembly_accessions = resolve_accession_from_text(
                reference_txt) if reference_txt else None
            if not assembly_accessions:
                assembly_accession = None
            elif len(assembly_accessions) == 1:
                assembly_accession = assembly_accessions[0]
            else:
                self.warning(
                    f"Multiple assemblies found for {analysis_alias}: {', '.join(assembly_accessions)} "
                )
                # Sorting the accessions and taking the last one is the heuristic for "most recent"
                assembly_accession = sorted(assembly_accessions)[-1]
                self.warning(
                    f"Will use the most recent assembly: {assembly_accession}")

            if assembly_accession:
                analysis_reference[analysis_alias] = {
                    'assembly_accession': assembly_accession,
                    'vcf_files': []
                }
            else:
                self.error(
                    f"Reference is missing for Analysis {analysis_alias}"
                )

        for file_row in eva_metadata.files:
            if file_row.get("File Type") != 'vcf':
                continue
            analysis_alias = file_row.get("Analysis Alias")
            if analysis_alias not in analysis_reference:
                # The analysis is unknown or had no usable reference: report it rather
                # than crash with a KeyError.
                self.error(
                    f"VCF file {file_row.get('File Name')} refers to Analysis {analysis_alias} "
                    f"which is not defined or has no reference"
                )
                continue
            file_full = os.path.join(self.eload_dir,
                                     directory_structure['vcf'],
                                     file_row.get("File Name"))
            analysis_reference[analysis_alias]['vcf_files'].append(file_full)
        self.eload_cfg.set('submission', 'analyses', value=analysis_reference)

        taxonomy_id = eva_metadata.project.get('Tax ID')
        # Only call str methods on actual strings so other cell types (e.g. float)
        # end up in the "invalid" branch instead of raising AttributeError.
        is_valid_taxonomy = isinstance(taxonomy_id, int) or (
            isinstance(taxonomy_id, str) and taxonomy_id.strip().isdigit())
        if taxonomy_id and is_valid_taxonomy:
            taxonomy_id = int(taxonomy_id)
            self.eload_cfg.set('submission',
                               'taxonomy_id',
                               value=taxonomy_id)
            # Query with the same normalised int that is stored in the config
            scientific_name = get_scientific_name_from_ensembl(taxonomy_id)
            self.eload_cfg.set('submission',
                               'scientific_name',
                               value=scientific_name)
        else:
            if taxonomy_id:
                self.error('Taxonomy id %s is invalid:', taxonomy_id)
            else:
                self.error('Taxonomy id is missing for the submission')
# Example #3
0
def get_tax_asm_from_ensembl(tax_id):
    """
    Look up the assembly supported by Ensembl for a taxonomy.

    The species name is first fetched with ``get_scientific_name_from_ensembl``; any
    exception raised by that step is logged as a warning and ``None`` is returned.
    The species name is then passed to ``get_supported_asm_from_ensembl``.

    :param tax_id: taxonomy identifier passed to the Ensembl lookup
    :return: ``{'assembly': <assembly>, 'source': 'Ensembl'}`` or ``None`` when the
        species name or the assembly could not be found
    """
    try:
        logger.info(f'Query Ensembl for species name using taxonomy {tax_id}')
        sp_name = get_scientific_name_from_ensembl(tax_id)
    except Exception:
        logger.warning(
            f'Could not get species name for taxonomy {tax_id} in Ensembl')
        return None

    # Get assembly from Ensembl
    logger.info(
        f'Query Ensembl for supported assembly for taxonomy {tax_id} using species_name "{sp_name}"'
    )
    supported_assembly = get_supported_asm_from_ensembl(sp_name)

    # NOTE(review): "not found" is detected via the *string* 'None', not the None
    # object - presumably the helper stringifies its result; confirm against it.
    if supported_assembly == 'None':
        logger.warning(
            f'Could not find supported assembly for taxonomy_id {tax_id} using species_name "{sp_name}" in Ensembl'
        )
        return None

    return {'assembly': supported_assembly, 'source': 'Ensembl'}
def retrieve_species_name_from_taxid_ensembl(taxid):
    """
    Log the query and return the result of ``get_scientific_name_from_ensembl``.

    :param taxid: taxonomy identifier forwarded unchanged to the Ensembl lookup
    :return: whatever ``get_scientific_name_from_ensembl`` returns for ``taxid``
    """
    logger.info(f'Query Ensembl for taxonomy {taxid}')
    species_name = get_scientific_name_from_ensembl(taxid)
    return species_name
# Example #5
0
    def _create_project_xml(self):
        """
        This function reads the project row from the XLS parser then creates and populates an XML element
        following the ENA data model.

        Cells holding several comma-separated values (publications, collaborators, related
        projects, links) are split on commas; each entry is stripped of surrounding
        whitespace and empty entries are ignored, so ``"A, B,"`` yields ``A`` and ``B``.

        :return: The top XML element (``PROJECT_SET``)
        :raises ValueError: if a publication entry is not formatted as ``<database>:<identifier>``
        """
        project_row = self.reader.project

        def split_values(column):
            # Turn a comma-separated cell into its stripped, non-empty entries
            # (empty list when the column is absent or blank).
            raw_value = project_row.get(column)
            if not raw_value:
                return []
            stripped_values = (value.strip() for value in raw_value.split(','))
            return [value for value in stripped_values if value]

        root = Element('PROJECT_SET')

        project_elemt = add_element(
            root,
            'PROJECT',
            alias=project_row.get('Project Alias'),
            accession=project_row.get('Project Accession'),
            center_name=project_row.get('Center'))

        add_element(project_elemt,
                    'TITLE',
                    project_row.get('Project Title'),
                    content_required=True)
        add_element(project_elemt,
                    'DESCRIPTION',
                    project_row.get('Description'),
                    content_required=True)

        # Assuming format like PubMed:123456
        # TODO: requirement in the format
        # Parse every publication before touching the tree so a malformed entry
        # does not leave a half-built PUBLICATION element behind.
        publications = []
        for publication in split_values('Publication(s)'):
            pub_db, separator, pub_id = publication.partition(':')
            if not separator:
                raise ValueError(
                    f'Publication "{publication}" is not formatted as <database>:<identifier>')
            publications.append((pub_db.strip(), pub_id.strip()))

        if publications:
            publications_elemt = add_element(project_elemt, 'PUBLICATIONS')
            for pub_db, pub_id in publications:
                pub_elemt = add_element(publications_elemt, 'PUBLICATION')
                pub_links_elemt = add_element(pub_elemt, 'PUBLICATION_LINKS')
                pub_link_elemt = add_element(pub_links_elemt,
                                             'PUBLICATION_LINK')
                xref_link_elemt = add_element(pub_link_elemt, 'XREF_LINK')
                add_element(xref_link_elemt, 'DB', element_text=pub_db)
                add_element(xref_link_elemt, 'ID', element_text=pub_id)

        collaborators = split_values('Collaborator(s)')
        if collaborators:
            collaborators_elemt = add_element(project_elemt, 'COLLABORATORS')
            for collaborator in collaborators:
                add_element(collaborators_elemt,
                            'COLLABORATOR',
                            element_text=collaborator)

        sub_project_elemt = add_element(project_elemt, 'SUBMISSION_PROJECT')
        add_element(sub_project_elemt, 'SEQUENCING_PROJECT')

        if 'Tax ID' in project_row:
            tax_id = str(project_row.get('Tax ID')).strip()
            org_elemt = add_element(sub_project_elemt, 'ORGANISM')
            add_element(org_elemt, 'TAXON_ID', element_text=tax_id)
            scientific_name = get_scientific_name_from_ensembl(tax_id)
            add_element(org_elemt,
                        'SCIENTIFIC_NAME',
                        element_text=scientific_name)

            add_element(org_elemt,
                        'STRAIN',
                        element_text=project_row.get('Strain', ''),
                        content_required=True)
            add_element(org_elemt,
                        'BREED',
                        element_text=project_row.get('Breed', ''),
                        content_required=True)

        # Parent, child and peer projects share the same layout; only the column
        # read and the tag written differ. Order is preserved: parent, child, peer.
        related_projects = [
            (relation_tag, split_values(column))
            for column, relation_tag in (
                ('Parent Project(s)', 'PARENT_PROJECT'),
                ('Child Project(s)', 'CHILD_PROJECT'),
                ('Peer Project(s)', 'PEER_PROJECT'),
            )
        ]
        if any(accessions for _, accessions in related_projects):
            related_prjs_elemt = add_element(project_elemt, 'RELATED_PROJECTS')
            for relation_tag, accessions in related_projects:
                for related_accession in accessions:
                    related_prj_elemt = add_element(related_prjs_elemt,
                                                    'RELATED_PROJECT')
                    add_element(related_prj_elemt,
                                relation_tag,
                                accession=related_accession)

        project_links = split_values('Link(s)')
        if project_links:
            links_elemt = add_element(project_elemt, 'PROJECT_LINKS')
            add_links(links_elemt, project_links, link_type='PROJECT_LINK')

        # TODO: Is this still relevant because it is not documented in the metadata template
        add_attribute_elements(project_elemt,
                               project_row,
                               object_type='PROJECT')
        return root
# Example #6
0
def get_scientific_name_from_taxonomy(taxonomy):
    """
    Return the scientific name for a taxonomy, querying Ensembl only on a cache miss.

    Results are memoised in ``cache['scientific_name_from_taxonomy']``, keyed by the
    ``taxonomy`` value exactly as it was passed in.

    :param taxonomy: taxonomy identifier forwarded to ``get_scientific_name_from_ensembl``
    :return: the cached or freshly retrieved scientific name
    """
    names_by_taxonomy = cache['scientific_name_from_taxonomy']
    if taxonomy not in names_by_taxonomy:
        names_by_taxonomy[taxonomy] = get_scientific_name_from_ensembl(taxonomy)
    return names_by_taxonomy[taxonomy]