Example #1
    def discover_species(self):
        ensembl_division = (
            SurveyJobKeyValue
            .objects
            .get(survey_job_id=self.survey_job.id,
                 key__exact="ensembl_division")
            .value
        )

        logger.info("Surveying %s division of ensembl.",
                    ensembl_division,
                    survey_job=self.survey_job.id)

        # The main division has a different base URL for its REST API.
        if ensembl_division == "Ensembl":
            r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)

            # Yes I'm aware that specieses isn't a word. However I need to
            # distinguish between a singular species and multiple species.
            specieses = r.json()["species"]
        else:
            formatted_division_url = DIVISION_URL_TEMPLATE.format(
                division=ensembl_division)
            r = utils.requests_retry_session().get(formatted_division_url)
            specieses = r.json()

        try:
            organism_name = SurveyJobKeyValue.objects.get(survey_job_id=self.survey_job.id,
                                                          key__exact="organism_name").value
            organism_name = organism_name.lower().replace(' ', "_")
        except SurveyJobKeyValue.DoesNotExist:
            organism_name = None

        all_new_species = []
        if organism_name:
            for species in specieses:
                # This key varies based on whether the division is the
                # main one or not... why couldn't they just make them
                # consistent?
                if ('species' in species and species['species'] == organism_name) \
                   or ('name' in species and species['name'] == organism_name):
                    all_new_species.append(self._generate_files(species))
                    break
        else:
            for species in specieses:
                all_new_species.append(self._generate_files(species))

        if len(all_new_species) == 0:
            logger.error("Unable to find any species!",
                         ensembl_division=ensembl_division,
                         organism_name=organism_name)

        return all_new_species
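
Nearly every snippet on this page calls `utils.requests_retry_session()`, but the helper itself is never shown. A minimal sketch, assuming the standard requests/urllib3 retry pattern (the retry counts and status codes below are illustrative defaults, not refinebio's):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504)):
    """Return a requests.Session that retries transient HTTP failures."""
    session = requests.Session()
    retry = Retry(total=retries, connect=retries, read=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session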
Example #2
File: sra.py Project: arjunkrish/refinebio
    def gather_sample_metadata(metadata: Dict) -> None:
        formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
            metadata["sample_accession"])
        response = utils.requests_retry_session().get(formatted_metadata_URL)
        sample_xml = ET.fromstring(response.text)

        sample = sample_xml[0]

        if "center_name" in sample.attrib:
            metadata["sample_center_name"] = sample.attrib["center_name"]

        for child in sample:
            if child.tag == "TITLE":
                metadata["sample_title"] = child.text
            elif child.tag == "SAMPLE_NAME":
                for grandchild in child:
                    if grandchild.tag == "TAXON_ID":
                        metadata["organism_id"] = grandchild.text
                    elif grandchild.tag == "SCIENTIFIC_NAME":
                        metadata["organism_name"] = grandchild.text.upper()
            elif child.tag == "SAMPLE_ATTRIBUTES":
                for grandchild in child:
                    key, value = SraSurveyor.parse_attribute(
                        grandchild, "sample_")
                    metadata[key] = value
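
`SraSurveyor.parse_attribute` is used here but not shown. Judging from the SUBMISSION_ATTRIBUTES handling in Example #10, where each attribute element carries TAG and VALUE children, a plausible sketch (an inference, not necessarily refinebio's exact code) is:

import xml.etree.ElementTree as ET

def parse_attribute(attribute_element: ET.Element, prefix: str = ""):
    """Turn an ENA *_ATTRIBUTE element into a prefixed (key, value) pair."""
    key = prefix + attribute_element.find("TAG").text.lower()
    value_element = attribute_element.find("VALUE")
    value = value_element.text if value_element is not None else ""
    return key, value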
Example #3
    def __init__(self, species: Dict):
        """Species is a Dict containing parsed JSON from the Division API."""
        self.url_root = "ensemblgenomes.org/pub/release-{assembly_version}/{short_division}"
        self.division = species["division"]
        self.short_division = DIVISION_LOOKUP[species["division"]]

        mapping = get_strain_mapping_for_organism(species["name"])
        if mapping:
            self.assembly = mapping["assembly"]
            self.strain = mapping["strain"]
        else:
            self.assembly = species["assembly_name"].replace(" ", "_")
            self.strain = None

        assembly_response = utils.requests_retry_session().get(
            DIVISION_RELEASE_URL)
        self.assembly_version = assembly_response.json()["version"]
        self.species_sub_dir = species["name"]
        self.filename_species = species["name"].capitalize()

        # These fields aren't needed for the URL, but they vary between
        # the two REST APIs.
        self.scientific_name = species["name"].upper()

        # This field can be stored in multiple keys, but if
        # `species_taxonomy_id` is there it's the one we want because
        # it's not strain-specific.
        if "species_taxonomy_id" in species:
            self.taxonomy_id = species["species_taxonomy_id"]
        else:
            self.taxonomy_id = species["taxonomy_id"]

        # This field is only needed for EnsemblBacteria and EnsemblFungi.
        self.collection = ""
Example #4
    def test_ordering_mismatch(self):
        """Makes sure that the order samples' keys are in does not affect the title chosen.

        Related: https://github.com/AlexsLemonade/refinebio/pull/304
        """
        experiment_accession_code = "E-TABM-38"

        samples_endpoint = SAMPLES_URL.format(experiment_accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        json_samples = r.json()["experiment"]["sample"]
        flattened_json_samples = [
            utils.flatten(json_sample) for json_sample in json_samples
        ]

        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment_accession_code)
        parsed_samples = parse_sdrf(sdrf_url)

        title_field = determine_title_field(parsed_samples,
                                            flattened_json_samples)
        sdrf_samples = harmonize_all_samples(parsed_samples, title_field)
        json_titles = [
            extract_title(json_sample, title_field)
            for json_sample in flattened_json_samples
        ]

        # The titles won't match up if the order of the sample dicts
        # isn't corrected for, resulting in a KeyError being raised.
        # So if this doesn't raise a KeyError, then we're good.
        for title in json_titles:
            sdrf_samples[title]
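
`utils.flatten` is the other helper this test leans on. A generic sketch of such a flattener, assuming underscore-joined keys for nested dicts and lists (the separator and list handling are assumptions, not refinebio's documented behavior):

def flatten(data, parent_key="", sep="_"):
    """Collapse nested dicts/lists into a flat dict with joined keys."""
    if isinstance(data, dict):
        iterable = data.items()
    elif isinstance(data, list):
        iterable = ((str(i), v) for i, v in enumerate(data))
    else:
        return {parent_key: data}

    items = {}
    for key, value in iterable:
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, (dict, list)):
            items.update(flatten(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items

print(flatten({"source": {"name": "s1"}, "file": [{"name": "a.cel"}]}))
# {'source_name': 's1', 'file_0_name': 'a.cel'}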
Example #5
File: sra.py Project: arjunkrish/refinebio
    def gather_study_metadata(metadata: Dict) -> None:
        formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
            metadata["study_accession"])
        response = utils.requests_retry_session().get(formatted_metadata_URL)
        study_xml = ET.fromstring(response.text)

        study = study_xml[0]
        for child in study:
            if child.tag == "DESCRIPTOR":
                for grandchild in child:
                    # STUDY_TYPE is the only tag which uses attributes
                    # instead of the text for whatever reason
                    if grandchild.tag == "STUDY_TYPE":
                        metadata[grandchild.tag.lower()] = grandchild.attrib[
                            "existing_study_type"]
                    else:
                        metadata[grandchild.tag.lower()] = grandchild.text
            elif child.tag == "STUDY_ATTRIBUTES":
                for grandchild in child:
                    key, value = SraSurveyor.parse_attribute(
                        grandchild, "study_")
                    metadata[key] = value
            elif child.tag == "STUDY_LINKS":
                for grandchild in child:
                    for ggc in grandchild:
                        # Element.getchildren() was removed in Python 3.9;
                        # index the element directly instead.
                        if ggc[0].text == "pubmed":
                            metadata["pubmed_id"] = ggc[1].text
                            break
            elif child.tag == "IDENTIFIERS":
                for grandchild in child:
                    if (grandchild.tag == "EXTERNAL_ID" and
                            grandchild.attrib.get("namespace", "") == "GEO"):
                        metadata["external_id"] = grandchild.text
Example #6
    def __init__(self, species: Dict):
        """Species is a Dict containing parsed JSON from the Division API."""
        self.url_root = "ensemblgenomes.org/pub/release-{assembly_version}/{short_division}"
        self.short_division = DIVISION_LOOKUP[species["division"]]
        self.assembly = species["assembly_name"].replace(" ", "_")
        self.assembly_version = utils.requests_retry_session().get(
            DIVISION_RELEASE_URL).json()["version"]

        # Some species are nested within a collection directory. If
        # this is the case, then we need to add that extra directory
        # to the URL, and for whatever reason the filename is not
        # capitalized.
        COLLECTION_REGEX = r"^(.*_collection).*"
        match_object = re.search(COLLECTION_REGEX, species["dbname"])
        if match_object:
            self.species_sub_dir = match_object.group(1) + "/" + species["species"]
            self.filename_species = species["species"]
        else:
            self.species_sub_dir = species["species"]
            self.filename_species = species["species"].capitalize()

        # These fields aren't needed for the URL, but they vary between
        # the two REST APIs.
        self.scientific_name = species["name"].upper()
        self.taxonomy_id = species["taxonomy_id"]
Example #7
    def __init__(self, species: Dict):
        self.url_root = "ensembl.org/pub/release-{assembly_version}"
        self.short_division = None
        self.species_sub_dir = species["name"]
        self.filename_species = species["name"].capitalize()
        self.assembly = species["assembly"]
        self.assembly_version = utils.requests_retry_session().get(
            MAIN_RELEASE_URL).json()["release"]
        self.scientific_name = self.filename_species.replace("_", " ")
        self.taxonomy_id = species["taxon_id"]
Example #8
File: sra.py Project: arjunkrish/refinebio
    def gather_run_metadata(run_accession: str) -> Dict:
        """A run refers to a specific read in an experiment."""

        discoverable_accessions = [
            "study_accession", "sample_accession", "submission_accession"
        ]

        response = utils.requests_retry_session().get(
            ENA_METADATA_URL_TEMPLATE.format(run_accession))
        try:
            run_xml = ET.fromstring(response.text)
        except Exception:
            logger.exception("Unable to decode response",
                             response=response.text)
            return {}

        # Necessary because ERP000263 has only one ROOT element containing this error:
        # Entry: ERR15562 display type is either not supported or entry is not found.
        if len(run_xml) == 0:
            return {}

        run_item = run_xml[0]

        useful_attributes = [
            "center_name", "run_center", "run_date", "broker_name", "alias"
        ]
        metadata = {}
        for attribute in useful_attributes:
            if attribute in run_item.attrib:
                metadata[attribute] = run_item.attrib[attribute]
        metadata["run_accession"] = run_accession

        for child in run_item:
            if child.tag == "EXPERIMENT_REF":
                metadata["experiment_accession"] = child.attrib["accession"]
            elif child.tag == "RUN_LINKS":
                for grandchild in child:
                    key, value = SraSurveyor.parse_run_link(grandchild)
                    if value != "" and key in discoverable_accessions:
                        metadata[key] = value
            elif child.tag == "RUN_ATTRIBUTES":
                for grandchild in child:
                    key, value = SraSurveyor.parse_attribute(
                        grandchild, "run_")
                    metadata[key] = value

        return metadata
Example #9
    def gather_study_metadata(metadata: Dict) -> None:
        formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
            metadata["study_accession"])
        response = utils.requests_retry_session().get(formatted_metadata_URL)
        study_xml = ET.fromstring(response.text)

        study = study_xml[0]
        for child in study:
            if child.tag == "DESCRIPTOR":
                for grandchild in child:
                    # STUDY_TYPE is the only tag which uses attributes
                    # instead of the text for whatever reason
                    if grandchild.tag == "STUDY_TYPE":
                        metadata[grandchild.tag.lower()] = grandchild.attrib[
                            "existing_study_type"]
                    else:
                        metadata[grandchild.tag.lower()] = grandchild.text
            elif child.tag == "STUDY_ATTRIBUTES":
                for grandchild in child:
                    key, value = SraSurveyor.parse_attribute(
                        grandchild, "study_")
                    metadata[key] = value
            elif child.tag == "STUDY_LINKS":
                for grandchild in child:
                    for ggc in grandchild:
                        # Element.getchildren() was removed in Python 3.9;
                        # index the element directly instead.
                        if ggc[0].text == "pubmed":
                            metadata["pubmed_id"] = ggc[1].text
                            break
            elif child.tag == "IDENTIFIERS":
                for grandchild in child:
                    if (
                            # Check for GEO accessions. These live inside an
                            # EXTERNAL_ID tag with namespace GEO
                            grandchild.tag == "EXTERNAL_ID"
                            and grandchild.attrib.get("namespace", "") == "GEO"
                            and re.match(r"^GSE\d{2,6}", grandchild.text)
                    ) or (
                            # Check for ArrayExpress accessions. These live inside a
                            # SUBMITTER_ID tag, but the namespace is not standardized
                            grandchild.tag == "SUBMITTER_ID" and re.match(
                                r"^E-[A-Z]{4}-\d{2,6}", grandchild.text)):
                        metadata["external_id"] = grandchild.text
                        break
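
Because the nesting is hard to picture without data, here is a self-contained run of the IDENTIFIERS branch against a hand-written XML fragment (made up for illustration, not a real ENA response):

import re
import xml.etree.ElementTree as ET

study_xml = ET.fromstring("""
<STUDY_SET>
  <STUDY accession="SRP000001">
    <IDENTIFIERS>
      <PRIMARY_ID>SRP000001</PRIMARY_ID>
      <EXTERNAL_ID namespace="GEO">GSE12345</EXTERNAL_ID>
    </IDENTIFIERS>
  </STUDY>
</STUDY_SET>
""")

metadata = {}
study = study_xml[0]
for child in study:
    if child.tag == "IDENTIFIERS":
        for grandchild in child:
            if (grandchild.tag == "EXTERNAL_ID"
                    and grandchild.attrib.get("namespace", "") == "GEO"
                    and re.match(r"^GSE\d{2,6}", grandchild.text)):
                metadata["external_id"] = grandchild.text
                break

print(metadata)  # {'external_id': 'GSE12345'}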
Example #10
File: sra.py Project: Quiltomics/refinebio
    def gather_submission_metadata(metadata: Dict) -> None:

        formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(metadata["submission_accession"])
        response = utils.requests_retry_session().get(formatted_metadata_URL)
        submission_xml = ET.fromstring(response.text)[0]
        submission_metadata = submission_xml.attrib

        # We already have these
        submission_metadata.pop("accession", '')
        submission_metadata.pop("alias", '')

        metadata.update(submission_metadata)

        for child in submission_xml:
            if child.tag == "TITLE":
                metadata["submission_title"] = child.text
            elif child.tag == "SUBMISSION_ATTRIBUTES":
                for grandchild in child:
                    metadata[grandchild.find("TAG").text.lower()] = grandchild.find("VALUE").text
Example #11
def parse_sdrf(sdrf_url: str) -> List:
    """ Given a URL to an SDRF file, download parses it into JSON. """

    try:
        sdrf_response = requests_retry_session().get(sdrf_url, timeout=60)
    except Exception:
        logger.exception("Unable to fetch URL: " + sdrf_url)
        return []

    if sdrf_response.status_code != 200:
        logger.error("Unable to fetch URL: " + sdrf_url, response_code=sdrf_response.status_code)
        return []

    sdrf_text = sdrf_response.text

    samples = []

    reader = csv.reader(StringIO(sdrf_text), delimiter="\t")
    for offset, line in enumerate(reader):

        # Get the keys
        if offset == 0:
            keys = line
            continue

        sample_values = line

        # Skip malformed lines
        if len(sample_values) != len(keys):
            continue

        sample = {}
        for col, value in enumerate(sample_values):
            key = keys[col]
            sample[key] = value
        samples.append(sample)

    return samples
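
To see the row handling in isolation, the same csv-based parsing can be run over an inline tab-separated string (the sample data is made up):

import csv
from io import StringIO

sdrf_text = "Source Name\tCharacteristics[organism]\nsample_1\tHomo sapiens\n"
rows = list(csv.reader(StringIO(sdrf_text), delimiter="\t"))
keys = rows[0]
samples = [dict(zip(keys, values)) for values in rows[1:]
           if len(values) == len(keys)]
print(samples)
# [{'Source Name': 'sample_1', 'Characteristics[organism]': 'Homo sapiens'}]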
Example #12
File: sra.py Project: arjunkrish/refinebio
    def gather_experiment_metadata(metadata: Dict) -> None:
        formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
            metadata["experiment_accession"])
        response = utils.requests_retry_session().get(formatted_metadata_URL)
        experiment_xml = ET.fromstring(response.text)

        experiment = experiment_xml[0]
        for child in experiment:
            if child.tag == "TITLE":
                metadata["experiment_title"] = child.text
            elif child.tag == "DESIGN":
                for grandchild in child:
                    if grandchild.tag == "DESIGN_DESCRIPTION":
                        metadata["experiment_design_description"] = grandchild.text
                    elif grandchild.tag == "LIBRARY_DESCRIPTOR":
                        SraSurveyor.gather_library_metadata(
                            metadata, grandchild)
                    elif grandchild.tag == "SPOT_DESCRIPTOR":
                        SraSurveyor.gather_spot_metadata(metadata, grandchild)
            elif child.tag == "PLATFORM":
                # This structure is extraneously nested.
                metadata["platform_instrument_model"] = child[0][0].text
Example #13
    def gather_file_report(run_accession: str) -> List[Dict]:
        """Get stats about files and check for unmated reads.

        This endpoint returns a weird format, so some custom parsing is required:
        run_accession	fastq_ftp	fastq_bytes	fastq_md5	submitted_ftp	submitted_bytes	submitted_md5	sra_ftp	sra_bytes	sra_md5
        SRR7353755	ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz	25176;2856704;3140575	7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f	"""
        response = utils.requests_retry_session().get(
            ENA_FILE_REPORT_URL_TEMPLATE.format(accession=run_accession))

        lines = response.text.split("\n")
        split_lines = [line.split("\t") for line in lines]
        header_row = split_lines[0]
        sample_row = split_lines[1]

        file_info = []
        for i, key in enumerate(header_row):
            if key in ["fastq_ftp", "fastq_bytes", "fastq_md5"]:
                # Use a separate index so the outer loop variable isn't shadowed.
                for j, value in enumerate(sample_row[i].split(";")):
                    if j >= len(file_info):
                        file_info.append({key: value})
                    else:
                        file_info[j][key] = value

        return file_info
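
Applied to the header and sample row quoted in the docstring, the loop pivots the semicolon-separated columns into one dict per FASTQ file, roughly (URLs and checksums truncated here for readability):

# [
#     {"fastq_ftp": ".../SRR7353755.fastq.gz",   "fastq_bytes": "25176",   "fastq_md5": "7ef1ba01..."},
#     {"fastq_ftp": ".../SRR7353755_1.fastq.gz", "fastq_bytes": "2856704", "fastq_md5": "6bc5651b..."},
#     {"fastq_ftp": ".../SRR7353755_2.fastq.gz", "fastq_bytes": "3140575", "fastq_md5": "3856c141..."},
# ]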
Example #14
    def create_experiment_from_api(
            self, experiment_accession_code: str) -> Tuple[Experiment, Dict]:
        """Given an experiment accession code, create an Experiment object.

        Also returns a dictionary of additional information about the
        platform discovered for the experiment.

        Will raise an UnsupportedPlatformException if this experiment was
        conducted using a platform which we don't support.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
        """
        request_url = EXPERIMENTS_URL + experiment_accession_code
        experiment_request = utils.requests_retry_session().get(request_url,
                                                                timeout=60)

        try:
            parsed_json = experiment_request.json()["experiments"]["experiment"][0]
        except KeyError:
            logger.error("Remote experiment has no Experiment data!",
                         experiment_accession_code=experiment_accession_code,
                         survey_job=self.survey_job.id)
            raise

        experiment = {}
        experiment["name"] = parsed_json["name"]
        experiment["experiment_accession_code"] = experiment_accession_code

        # This experiment has no platform at all, and is therefore useless.
        if 'arraydesign' not in parsed_json or len(parsed_json["arraydesign"]) == 0:
            logger.warning("Remote experiment has no arraydesign listed.",
                           experiment_accession_code=experiment_accession_code,
                           survey_job=self.survey_job.id)
            raise UnsupportedPlatformException
        # If there is more than one arraydesign listed in the experiment
        # then there is no other way to determine which array was used
        # for which sample other than looking at the header of the CEL
        # file. That obviously cannot happen until the CEL file has been
        # downloaded so we can just mark it as UNKNOWN and let the
        # downloader inspect the downloaded file to determine the
        # array then.
        elif len(parsed_json["arraydesign"]
                 ) != 1 or "accession" not in parsed_json["arraydesign"][0]:
            experiment["platform_accession_code"] = UNKNOWN
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN
        else:
            external_accession = parsed_json["arraydesign"][0]["accession"]
            for platform in get_supported_microarray_platforms():
                if platform["external_accession"] == external_accession:
                    experiment["platform_accession_code"] = get_normalized_platform(
                        platform["platform_accession"])

                    # Illumina appears in the accession codes for
                    # platforms manufactured by Illumina.
                    if "ILLUMINA" in experiment["platform_accession_code"].upper():
                        experiment["manufacturer"] = "ILLUMINA"
                        experiment["platform_accession_name"] = platform[
                            "platform_accession"]
                    else:
                        # It's not Illumina; the only other supported microarray
                        # platform is Affymetrix. As our list of supported
                        # platforms grows this logic will need to get more
                        # sophisticated.
                        experiment["manufacturer"] = "AFFYMETRIX"
                        platform_mapping = get_readable_affymetrix_names()
                        experiment["platform_accession_name"] = platform_mapping[
                            platform["platform_accession"]]

            if "platform_accession_code" not in experiment:
                # We don't know what platform this accession corresponds to.
                experiment["platform_accession_code"] = external_accession
                experiment["platform_accession_name"] = UNKNOWN
                experiment["manufacturer"] = UNKNOWN

        experiment["release_date"] = parsed_json["releasedate"]

        if "lastupdatedate" in parsed_json:
            experiment["last_update_date"] = parsed_json["lastupdatedate"]
        else:
            experiment["last_update_date"] = parsed_json["releasedate"]

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            # We aren't sure these fields will be populated, or how many there will be.
            # Try to join them all together, or set a sensible default.
            experiment_descripton = ""
            if "description" in parsed_json and len(
                    parsed_json["description"]) > 0:
                for description_item in parsed_json["description"]:
                    if "text" in description_item:
                        experiment_descripton = experiment_descripton + description_item[
                            "text"] + "\n"

            if experiment_descripton == "":
                experiment_descripton = "Description not available.\n"

            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = request_url
            experiment_object.source_database = "ARRAY_EXPRESS"
            experiment_object.title = parsed_json["name"]
            # This will need to be updated if we ever use Array
            # Express to get other kinds of data.
            experiment_object.technology = "MICROARRAY"
            experiment_object.description = experiment_description
            experiment_object.source_first_published = parse_datetime(
                experiment["release_date"])
            experiment_object.source_last_modified = parse_datetime(
                experiment["last_update_date"])
            experiment_object.save()

            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = parsed_json
            json_xa.is_ccdl = False
            json_xa.save()

            ## Fetch and parse the IDF/SDRF file for any other fields
            IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
            idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
            idf_text = utils.requests_retry_session().get(idf_url,
                                                          timeout=60).text

            lines = idf_text.split('\n')
            idf_dict = {}
            for line in lines:
                keyval = line.strip().split('\t')
                if len(keyval) == 2:
                    idf_dict[keyval[0]] = keyval[1]
                elif len(keyval) > 2:
                    idf_dict[keyval[0]] = keyval[1:]

            idf_xa = ExperimentAnnotation()
            idf_xa.data = idf_dict
            idf_xa.experiment = experiment_object
            idf_xa.is_ccdl = False
            idf_xa.save()

            if 'Investigation Title' in idf_dict:
                experiment_object.title = idf_dict['Investigation Title']
            if 'Person Affiliation' in idf_dict:
                # This is very rare, ex: E-MEXP-32
                if isinstance(idf_dict['Person Affiliation'], list):
                    unique_people = list(set(idf_dict['Person Affiliation']))
                    experiment_object.submitter_institution = ", ".join(
                        unique_people)[:255]
                else:
                    experiment_object.submitter_institution = idf_dict[
                        'Person Affiliation']

            # Get protocol_description from "<experiment_url>/protocols"
            # instead of from idf_dict, because the former provides more
            # details.
            protocol_url = request_url + '/protocols'
            protocol_request = utils.requests_retry_session().get(protocol_url,
                                                                  timeout=60)
            try:
                experiment_object.protocol_description = protocol_request.json()[
                    'protocols']
            except KeyError:
                logger.warning(
                    "Remote experiment has no protocol data!",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)

            if 'Publication Title' in idf_dict:
                # This will happen for some superseries.
                # Ex: E-GEOD-29536
                # Assume most recent is "best:, store the rest in experiment annotation.
                if isinstance(idf_dict['Publication Title'], list):
                    experiment_object.publication_title = "; ".join(
                        idf_dict['Publication Title'])
                else:
                    experiment_object.publication_title = idf_dict[
                        'Publication Title']
                experiment_object.has_publication = True
            if 'Publication DOI' in idf_dict:
                if isinstance(idf_dict['Publication DOI'], list):
                    experiment_object.publication_doi = ", ".join(
                        idf_dict['Publication DOI'])
                else:
                    experiment_object.publication_doi = idf_dict[
                        'Publication DOI']
                experiment_object.has_publication = True
            if 'PubMed ID' in idf_dict:
                if isinstance(idf_dict['PubMed ID'], list):
                    experiment_object.pubmed_id = ", ".join(
                        idf_dict['PubMed ID'])
                else:
                    experiment_object.pubmed_id = idf_dict['PubMed ID']
                experiment_object.has_publication = True

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

        platform_dict = {}
        for k in ('platform_accession_code', 'platform_accession_name',
                  'manufacturer'):
            platform_dict[k] = experiment[k]

        return experiment_object, platform_dict
Example #15
    def handle(self, *args, **options):
        """Refreshes the metadata for all experiments, or experiments from a specific database
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            experiments = Experiment.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            experiments = Experiment.objects.filter(
                source_database=source_database)
        else:
            logger.error(
                'Invalid source database "{}"\nPossible source databases: {}'.format(
                    options["source_database"], ", ".join(possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                logger.debug("Refreshing metadata for an experiment.",
                             experiment=experiment.accession_code)
                try:
                    if experiment.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            experiment.samples.first().accession_code)
                        SraSurveyor._apply_metadata_to_experiment(
                            experiment, metadata)

                    elif experiment.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            experiment.accession_code,
                            destdir="/tmp/management",
                            silent=True,
                        )

                        GeoSurveyor._apply_metadata_to_experiment(
                            experiment, gse)

                    elif experiment.source_database == "ARRAY_EXPRESS":
                        request_url = EXPERIMENTS_URL + experiment.accession_code
                        experiment_request = utils.requests_retry_session().get(
                            request_url, timeout=60)
                        try:
                            parsed_json = experiment_request.json()[
                                "experiments"]["experiment"][0]
                        except KeyError:
                            # Note: unlike the surveyor methods, this management
                            # command has no self.survey_job to log.
                            logger.error(
                                "Remote experiment has no Experiment data!",
                                experiment_accession_code=experiment.accession_code,
                            )
                            continue
                        ArrayExpressSurveyor._apply_metadata_to_experiment(
                            experiment, parsed_json)

                    experiment.save()

                # If there are any errors, just continue. It's likely that it's
                # just a problem with this experiment.
                except Exception:
                    logger.exception(
                        "exception caught while updating metadata for {}".format(
                            experiment.accession_code))

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)
Example #16
    def discover_species(self):
        ensembl_division = SurveyJobKeyValue.objects.get(
            survey_job_id=self.survey_job.id,
            key__exact="ensembl_division").value

        logger.info(
            "Surveying %s division of ensembl.",
            ensembl_division,
            survey_job=self.survey_job.id,
        )

        try:
            organism_name = SurveyJobKeyValue.objects.get(
                survey_job_id=self.survey_job.id,
                key__exact="organism_name").value
            organism_name = organism_name.lower().replace(" ", "_")
        except SurveyJobKeyValue.DoesNotExist:
            organism_name = None

        strain_mapping = None
        if ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]:
            if organism_name is None:
                logger.error(
                    "Organism name must be specified for Fungi and Bacteria divisions.",
                    ensembl_division=ensembl_division,
                    organism_name=organism_name,
                )
                return []
            else:
                strain_mapping = get_strain_mapping_for_organism(organism_name)
                if strain_mapping is None:
                    logger.error(
                        ("Organism name must be listed in config/organism_strain_"
                         "mappings.csv for Fungi and Bacteria divisions."),
                        ensembl_division=ensembl_division,
                        organism_name=organism_name,
                    )
                    return []

        # The main division has a different base URL for its REST API.
        if ensembl_division == "Ensembl":
            r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)

            # Yes I'm aware that specieses isn't a word. However I need to
            # distinguish between a singular species and multiple species.
            specieses = r.json()["species"]
        else:
            formatted_division_url = DIVISION_URL_TEMPLATE.format(
                division=ensembl_division)
            r = utils.requests_retry_session().get(formatted_division_url)
            specieses = r.json()

        all_new_species = []
        if organism_name:
            if strain_mapping:
                organism_name = organism_name + "_" + strain_mapping[
                    "strain"].lower()

            for species in specieses:
                if (ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]
                        and organism_name in species["name"]):
                    # Fungi and Bacteria have a strain identifier in their
                    # names. This is different than everything else,
                    # so we're going to handle this special case by
                    # just overwriting this. This is okay because we
                    # just have to discover one species for the
                    # organism, and then our strain mapping will make
                    # sure we use the correct strain and assembly.
                    species["name"] = organism_name

                    all_new_species.append(self._generate_files(species))
                    break
                elif "name" in species and organism_name == species["name"]:
                    all_new_species.append(self._generate_files(species))
                    break
        else:
            for species in specieses:
                all_new_species.append(self._generate_files(species))

        if len(all_new_species) == 0:
            logger.error(
                "Unable to find any species!",
                ensembl_division=ensembl_division,
                organism_name=organism_name,
            )

        return all_new_species
Example #17
File: sra.py Project: arjunkrish/refinebio
    def discover_experiment_and_samples(self):
        """Returns an experiment and a list of samples for an SRA accession"""
        survey_job = SurveyJob.objects.get(id=self.survey_job.id)
        survey_job_properties = survey_job.get_properties()
        accession = survey_job_properties["experiment_accession_code"]

        # SRA Surveyor is mainly designed for SRRs, this handles SRPs
        if "SRP" in accession or "ERP" in accession or "DRP" in accession:
            response = utils.requests_retry_session().get(
                ENA_METADATA_URL_TEMPLATE.format(accession))
            experiment_xml = ET.fromstring(response.text)[0]
            study_links = experiment_xml[2]  # STUDY_LINKS

            accessions_to_run = []
            for child in study_links:
                if child[0][0].text == "ENA-RUN":

                    all_runs = child[0][1].text

                    # Ranges can be disjoint, separated by commas
                    run_segments = all_runs.split(",")
                    for segment in run_segments:
                        if "-" in segment:
                            start, end = segment.split("-")
                        else:
                            start = segment
                            end = segment
                        start_id = start[3:]
                        end_id = end[3:]

                        for run_id in range(int(start_id), int(end_id) + 1):
                            run_id = str(run_id).zfill(len(start_id))
                            accessions_to_run.append(accession[0] + "RR" + run_id)
                    break

            experiment = None
            all_samples = []
            for run_id in accessions_to_run:
                logger.debug(
                    "Surveying SRA Run Accession %s for Experiment %s",
                    run_id,
                    accession,
                    survey_job=self.survey_job.id,
                )

                returned_experiment, samples = self._generate_experiment_and_samples(
                    run_id, accession)

                # Some runs may return (None, None). If this happens
                # we don't want to set experiment to None.
                if returned_experiment:
                    experiment = returned_experiment

                if samples:
                    all_samples += samples

            # So we prevent duplicate downloads, ex for SRP111553
            all_samples = list(set(all_samples))

            # Experiment will always be the same
            return experiment, all_samples

        else:
            logger.debug("Surveying SRA Run Accession %s",
                         accession,
                         survey_job=self.survey_job.id)
            return self._generate_experiment_and_samples(accession)
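
The range expansion is the subtle part of this example: an ENA-RUN value can contain comma-separated segments, each either a single accession or a zero-padded range. Isolated with made-up accessions, it behaves like this:

accession = "SRP000001"                        # hypothetical study accession
all_runs = "SRR0000001-SRR0000003,SRR0000007"  # hypothetical ENA-RUN value

accessions_to_run = []
for segment in all_runs.split(","):
    if "-" in segment:
        start, end = segment.split("-")
    else:
        start = end = segment
    start_id, end_id = start[3:], end[3:]
    for run_id in range(int(start_id), int(end_id) + 1):
        # Zero-pad back to the original width, then rebuild e.g. "SRR0000002".
        accessions_to_run.append(accession[0] + "RR" + str(run_id).zfill(len(start_id)))

print(accessions_to_run)
# ['SRR0000001', 'SRR0000002', 'SRR0000003', 'SRR0000007']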
Example #18
    def create_samples_from_api(self, experiment: Experiment,
                                platform_dict: Dict) -> List[Sample]:
        """Generates a Sample item for each sample in an AE experiment.

        There are many possible data situations for a sample:

            - If the sample only has raw data available:
                - If it is on a platform that we support:
                    Download this raw data and process it
                - If it is not on a platform we support:
                    Don't download anything, don't process anything
            - If the sample has both raw and derived data:
                - If the raw data is on a platform we support:
                    Download the raw data and process it, abandon the derived data
                - If the raw data is not on a platform we support
                    Download the derived data and no-op it, abandon the raw data
            - If the sample only has derived data:
                Download the derived data and no-op it.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

        """

        created_samples = []

        samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # The SDRF is the complete metadata record on a sample/property basis.
        # We run this through our harmonizer and then attach the properties
        # to our created samples.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
        sdrf_samples = harmony.parse_sdrf(sdrf_url)
        harmonized_samples = harmony.harmonize(sdrf_samples)

        # An experiment can have many samples
        for sample_data in samples:

            # For some reason, this sample has no files associated with it.
            if "file" not in sample_data or len(sample_data['file']) == 0:
                continue

            # Each sample is given a title that is unique within the experiment.
            flat_sample = utils.flatten(sample_data)
            title = harmony.extract_title(flat_sample)

            # A sample may actually have many sub files.
            # If there is raw data, take that.
            # If not, take the derived.
            has_raw = False
            for sub_file in sample_data['file']:

                # For ex: E-GEOD-15645
                if isinstance(sub_file['comment'], list):
                    sub_file_mod = sub_file
                    sub_file_mod['comment'] = sub_file['comment'][0]
                else:
                    sub_file_mod = sub_file

                # Some have the 'data' field, but not the actual data
                # Ex: E-GEOD-9656
                if sub_file_mod['type'] == "data" and sub_file_mod[
                        'comment'].get('value', None) != None:
                    has_raw = True
                if 'raw' in sub_file_mod['comment'].get('value', ''):
                    has_raw = True

            skip_sample = False
            for sub_file in sample_data['file']:

                # Don't get the raw data if it's only a 1-color sample.
                if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                    has_raw = False

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file['type']:
                    continue

                download_url = None
                filename = sub_file["name"]

                # sub_file["comment"] is only a list if there's
                # more than one comment...
                comments = sub_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it so we need to
                    # mark this job as an error. Therefore don't catch
                    # the potential exception where download_url
                    # doesn't get defined.
                    for comment in comments:
                        if "FTP file" in comment["name"]:
                            download_url = comment["value"]
                            break
                else:
                    download_url = comments["value"]

                if not download_url:
                    logger.error(
                        "Sample %s did not specify a download url, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

                if not filename:
                    logger.error(
                        "Sample %s did not specify a filename, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

            if skip_sample:
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample_data["source"].get("name", "")
            sample_assay_name = sample_data["assay"].get("name", "")
            sample_accession_code = self.determine_sample_accession(
                experiment.accession_code, sample_source_name,
                sample_assay_name, filename)

            # Figure out the Organism for this sample
            organism_name = UNKNOWN
            for characteristic in sample_data["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == UNKNOWN:
                logger.error(
                    "Sample %s did not specify the organism name.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
                organism = None
                continue
            else:
                organism = Organism.get_object_for_name(organism_name)

            # Create the sample object
            try:
                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)

                # If input experiment includes new protocol information,
                # update sample's protocol_info.
                existing_protocols = sample_object.protocol_info
                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols, experiment.protocol_description,
                    experiment.source_url + '/protocols')
                if is_updated:
                    sample_object.protocol_info = protocol_info
                    sample_object.save()

                logger.debug(
                    "Sample %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
            except Sample.DoesNotExist:
                sample_object = Sample()

                # The basics
                sample_object.source_database = "ARRAY_EXPRESS"
                sample_object.title = title
                sample_object.accession_code = sample_accession_code
                sample_object.source_archive_url = samples_endpoint
                sample_object.organism = organism
                sample_object.platform_name = platform_dict[
                    "platform_accession_name"]
                sample_object.platform_accession_code = platform_dict[
                    "platform_accession_code"]
                sample_object.manufacturer = platform_dict["manufacturer"]
                sample_object.technology = "MICROARRAY"

                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols=[],
                    experiment_protocol=experiment.protocol_description,
                    protocol_url=experiment.source_url + '/protocols')
                # Do not check is_updated the first time because we must
                # save a list so we can append to it later.
                sample_object.protocol_info = protocol_info

                sample_object.save()

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)
                sample_object.save()

                sample_annotation = SampleAnnotation()
                sample_annotation.data = sample_data
                sample_annotation.sample = sample_object
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                original_file = OriginalFile()
                original_file.filename = filename
                original_file.source_filename = filename
                original_file.source_url = download_url
                original_file.is_downloaded = False
                original_file.is_archive = True
                original_file.has_raw = has_raw
                original_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.original_file = original_file
                original_file_sample_association.sample = sample_object
                original_file_sample_association.save()

                created_samples.append(sample_object)

                logger.debug(
                    "Created " + str(sample_object),
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sample=sample_object.id)

            # Create associations if they don't already exist
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment, organism=organism)

        return created_samples