def _apply_metadata_to_experiment(experiment: Experiment, metadata: dict):
    """Populate the fields of an SRA experiment from its gathered metadata.

    The experiment is modified in place and is NOT saved by this function.

    Args:
        experiment: The Experiment to update. Its ``accession_code`` must
            already be set because the source URL is built from it.
        metadata: Dictionary of metadata for the study. Every key read
            here is optional, and a key may be present with a ``None``
            value.
    """
    experiment.source_url = ENA_URL_TEMPLATE.format(experiment.accession_code)
    experiment.source_database = "SRA"
    experiment.technology = "RNA-SEQ"

    # NOTE: platform_accession_code is intentionally left untouched; we
    # don't get this value from the API, unfortunately.

    if not experiment.description:
        experiment.description = "No description."

    if "study_title" in metadata:
        experiment.title = metadata["study_title"]
    if "study_abstract" in metadata:
        experiment.description = metadata["study_abstract"]
    if "lab_name" in metadata:
        experiment.submitter_institution = metadata["lab_name"]
    if "experiment_design_description" in metadata:
        experiment.protocol_description = metadata[
            "experiment_design_description"]
    if "pubmed_id" in metadata:
        experiment.pubmed_id = metadata["pubmed_id"]
        experiment.has_publication = True
    if "study_ena_first_public" in metadata:
        experiment.source_first_published = parse_date(
            metadata["study_ena_first_public"])
    if "study_ena_last_update" in metadata:
        experiment.source_last_modified = parse_date(
            metadata["study_ena_last_update"])

    # We only want GEO alternate accessions for SRA samples.
    # `or ""` (rather than a .get() default) because the key can be
    # present with a None value, and re.match() raises TypeError on None.
    external_id = metadata.get("external_id") or ""
    if re.match(r"^GSE\d{2,6}", external_id) is not None:
        experiment.alternate_accession_code = external_id

    # Rare, but it happens.
    if not experiment.protocol_description:
        # A plain metadata.get(key, default) doesn't work here because
        # sometimes the key is present but its value is None, in which
        # case None is returned, causing our database constraint to be
        # violated. `or` falls back for missing, None and empty values.
        experiment.protocol_description = (
            metadata.get("library_construction_protocol")
            or "Protocol was never provided.")

    # Scrape publication title and authorship from Pubmed
    if experiment.pubmed_id:
        pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
            experiment.pubmed_id)
        experiment.publication_title = pubmed_metadata[0]
        experiment.publication_authors = pubmed_metadata[1]
def create_experiment_from_api(
        self, experiment_accession_code: str) -> (Experiment, Dict):
    """Given an experiment accession code, create an Experiment object.

    If an Experiment with this accession code already exists it is
    returned as-is; otherwise a new one is created, annotated with the
    raw API JSON and the parsed IDF file, and saved.

    Also returns a dictionary of additional information about the
    platform discovered for the experiment, with the keys
    'platform_accession_code', 'platform_accession_name' and
    'manufacturer'.

    Will raise an UnsupportedPlatformException if the remote experiment
    lists no array design at all. Re-raises KeyError/IndexError if the
    API response contains no experiment data.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
    """
    request_url = EXPERIMENTS_URL + experiment_accession_code
    experiment_request = utils.requests_retry_session().get(request_url,
                                                            timeout=60)

    try:
        parsed_json = experiment_request.json(
        )["experiments"]["experiment"][0]
    except (KeyError, IndexError):
        # IndexError covers an "experiment" list that is present but empty.
        logger.error("Remote experiment has no Experiment data!",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
        raise

    experiment = {}
    experiment["name"] = parsed_json["name"]
    experiment["experiment_accession_code"] = experiment_accession_code

    array_designs = parsed_json.get("arraydesign")

    # This experiment has no platform at all, and is therefore useless.
    if not array_designs:
        logger.warning("Remote experiment has no arraydesign listed.",
                       experiment_accession_code=experiment_accession_code,
                       survey_job=self.survey_job.id)
        raise UnsupportedPlatformException
    # If there is more than one arraydesign listed in the experiment
    # then there is no other way to determine which array was used
    # for which sample other than looking at the header of the CEL
    # file. That obviously cannot happen until the CEL file has been
    # downloaded so we can just mark it as UNKNOWN and let the
    # downloader inspect the downloaded file to determine the
    # array then.
    elif len(array_designs) != 1 or "accession" not in array_designs[0]:
        experiment["platform_accession_code"] = UNKNOWN
        experiment["platform_accession_name"] = UNKNOWN
        experiment["manufacturer"] = UNKNOWN
    else:
        external_accession = array_designs[0]["accession"]
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                experiment[
                    "platform_accession_code"] = get_normalized_platform(
                        platform["platform_accession"])

                # Illumina appears in the accession codes for
                # platforms manufactured by Illumina
                if "ILLUMINA" in experiment[
                        "platform_accession_code"].upper():
                    experiment["manufacturer"] = "ILLUMINA"
                    experiment["platform_accession_name"] = platform[
                        "platform_accession"]
                else:
                    # It's not Illumina, the only other supported Microarray platform is
                    # Affy. As our list of supported platforms grows this logic will
                    # need to get more sophisticated.
                    experiment["manufacturer"] = "AFFYMETRIX"
                    platform_mapping = get_readable_affymetrix_names()
                    experiment[
                        "platform_accession_name"] = platform_mapping[
                            platform["platform_accession"]]

        if "platform_accession_code" not in experiment:
            # We don't know what platform this accession corresponds to.
            experiment["platform_accession_code"] = external_accession
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN

    experiment["release_date"] = parsed_json["releasedate"]
    # Fall back to the release date when no update date is reported.
    experiment["last_update_date"] = parsed_json.get(
        "lastupdatedate", parsed_json["releasedate"])

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        # We aren't sure these fields will be populated, or how many there will be.
        # Try to join them all together, or set a sensible default.
        description_texts = [
            description_item["text"]
            for description_item in (parsed_json.get("description") or [])
            if "text" in description_item
        ]
        experiment_description = "".join(
            text + "\n" for text in description_texts)
        if experiment_description == "":
            experiment_description = "Description not available.\n"

        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = request_url
        experiment_object.source_database = "ARRAY_EXPRESS"
        experiment_object.title = parsed_json["name"]
        # This will need to be updated if we ever use Array
        # Express to get other kinds of data.
        experiment_object.technology = "MICROARRAY"
        experiment_object.description = experiment_description

        experiment_object.source_first_published = parse_datetime(
            experiment["release_date"])
        experiment_object.source_last_modified = parse_datetime(
            experiment["last_update_date"])
        # Saved now so the annotations below can reference it.
        experiment_object.save()

        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = parsed_json
        json_xa.is_ccdl = False
        json_xa.save()

        ## Fetch and parse the IDF/SDRF file for any other fields
        IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
        idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
        idf_text = utils.requests_retry_session().get(idf_url,
                                                      timeout=60).text

        # Each IDF line is tab separated: a key followed by one or more
        # values. Single values are stored as strings, multiple values
        # as lists; lines without any value are ignored.
        idf_dict = {}
        for line in idf_text.split('\n'):
            keyval = line.strip().split('\t')
            if len(keyval) == 2:
                idf_dict[keyval[0]] = keyval[1]
            elif len(keyval) > 2:
                idf_dict[keyval[0]] = keyval[1:]

        idf_xa = ExperimentAnnotation()
        idf_xa.data = idf_dict
        idf_xa.experiment = experiment_object
        idf_xa.is_ccdl = False
        idf_xa.save()

        if 'Investigation Title' in idf_dict:
            experiment_object.title = idf_dict['Investigation Title']
        if 'Person Affiliation' in idf_dict:
            # This is very rare, ex: E-MEXP-32
            if isinstance(idf_dict['Person Affiliation'], list):
                # De-duplicate while keeping first-seen order so the
                # stored value is deterministic (a set is not ordered).
                unique_people = list(
                    dict.fromkeys(idf_dict['Person Affiliation']))
                experiment_object.submitter_institution = ", ".join(
                    unique_people)[:255]
            else:
                experiment_object.submitter_institution = idf_dict[
                    'Person Affiliation']

        # Get protocol_description from "<experiment_url>/protocols"
        # instead of from idf_dict, because the former provides more
        # details.
        protocol_url = request_url + '/protocols'
        protocol_request = utils.requests_retry_session().get(protocol_url,
                                                              timeout=60)
        try:
            experiment_object.protocol_description = protocol_request.json(
            )['protocols']
        except KeyError:
            logger.warning(
                "Remote experiment has no protocol data!",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)

        if 'Publication Title' in idf_dict:
            # This will happen for some superseries.
            # Ex: E-GEOD-29536
            # Multiple titles are all kept, joined with "; ".
            if isinstance(idf_dict['Publication Title'], list):
                experiment_object.publication_title = "; ".join(
                    idf_dict['Publication Title'])
            else:
                experiment_object.publication_title = idf_dict[
                    'Publication Title']
            experiment_object.has_publication = True
        if 'Publication DOI' in idf_dict:
            if isinstance(idf_dict['Publication DOI'], list):
                experiment_object.publication_doi = ", ".join(
                    idf_dict['Publication DOI'])
            else:
                experiment_object.publication_doi = idf_dict[
                    'Publication DOI']
            experiment_object.has_publication = True
        if 'PubMed ID' in idf_dict:
            if isinstance(idf_dict['PubMed ID'], list):
                experiment_object.pubmed_id = ", ".join(
                    idf_dict['PubMed ID'])
            else:
                experiment_object.pubmed_id = idf_dict['PubMed ID']
            experiment_object.has_publication = True

        # Scrape publication title and authorship from Pubmed.
        # When available this overrides the title taken from the IDF.
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

    platform_dict = {
        key: experiment[key]
        for key in ('platform_accession_code', 'platform_accession_name',
                    'manufacturer')
    }

    return experiment_object, platform_dict
def _generate_experiment_and_samples(
        self, run_accession: str,
        study_accession: str = None) -> (Experiment, List[Sample]):
    """Generates Experiments and Samples for the provided run_accession.

    Args:
        run_accession: Accession code of the run to survey.
        study_accession: Optional accession code of the study the run
            belongs to; only used to enrich the error log when no
            metadata can be found.

    Returns:
        A tuple of the Experiment and a one-element list holding the
        Sample for the run, or ``(None, None)`` when no metadata or no
        organism name could be discovered.
    """
    metadata = SraSurveyor.gather_all_metadata(run_accession)

    if not metadata:
        if study_accession:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession,
                         study_accession=study_accession)
        else:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession)
        return (None, None)  # This will cascade properly

    if DOWNLOAD_SOURCE == "ENA":
        # Paired-end runs are split across two files on ENA.
        if metadata["library_layout"] == "PAIRED":
            files_urls = [
                SraSurveyor._build_ena_file_url(run_accession, "_1"),
                SraSurveyor._build_ena_file_url(run_accession, "_2")
            ]
        else:
            files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
    else:
        files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

    # Figure out the Organism for this sample
    organism_name = metadata.pop("organism_name", None)
    if not organism_name:
        logger.error("Could not discover organism type for run.",
                     accession=run_accession)
        return (None, None)  # This will cascade properly

    organism_name = organism_name.upper()
    organism = Organism.get_object_for_name(organism_name)

    ##
    # Experiment
    ##

    experiment_accession_code = metadata.get('study_accession')
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = ENA_URL_TEMPLATE.format(
            experiment_accession_code)
        experiment_object.source_database = "SRA"
        experiment_object.technology = "RNA-SEQ"

        # NOTE: platform_accession_code is not set on the experiment; we
        # don't get this value from the API, unfortunately.

        if not experiment_object.description:
            experiment_object.description = "No description."

        if "study_title" in metadata:
            experiment_object.title = metadata["study_title"]
        if "study_abstract" in metadata:
            experiment_object.description = metadata["study_abstract"]
        if "lab_name" in metadata:
            experiment_object.submitter_institution = metadata["lab_name"]
        if "experiment_design_description" in metadata:
            experiment_object.protocol_description = metadata[
                "experiment_design_description"]
        if "pubmed_id" in metadata:
            experiment_object.pubmed_id = metadata["pubmed_id"]
            experiment_object.has_publication = True
        if "study_ena_first_public" in metadata:
            experiment_object.source_first_published = parse_datetime(
                metadata["study_ena_first_public"])
        if "study_ena_last_update" in metadata:
            experiment_object.source_last_modified = parse_datetime(
                metadata["study_ena_last_update"])

        # Rare, but it happens.
        if not experiment_object.protocol_description:
            # metadata.get(key, default) is not enough here: the key may
            # be present with a None value, which would be returned
            # instead of the default. `or` also covers that case.
            experiment_object.protocol_description = (
                metadata.get("library_construction_protocol")
                or "Protocol was never provided.")

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        ##
        # Experiment Metadata
        ##
        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = metadata
        json_xa.is_ccdl = False
        json_xa.save()

    ##
    # Samples
    ##

    sample_accession_code = metadata.pop('run_accession')
    # Create the sample object
    try:
        sample_object = Sample.objects.get(
            accession_code=sample_accession_code)
        # If current experiment includes new protocol information,
        # merge it into the sample's existing protocol_info.
        protocol_info, is_updated = self.update_sample_protocol_info(
            sample_object.protocol_info,
            experiment_object.protocol_description,
            experiment_object.source_url)
        if is_updated:
            sample_object.protocol_info = protocol_info
            sample_object.save()

        logger.debug(
            "Sample %s already exists, skipping object creation.",
            sample_accession_code,
            experiment_accession_code=experiment_object.accession_code,
            survey_job=self.survey_job.id)
    except Sample.DoesNotExist:
        sample_object = Sample()
        sample_object.source_database = "SRA"
        sample_object.accession_code = sample_accession_code
        sample_object.organism = organism

        sample_object.platform_name = metadata.get(
            "platform_instrument_model", "UNKNOWN")
        # The platform_name is human readable and contains spaces,
        # accession codes shouldn't have spaces though:
        sample_object.platform_accession_code = sample_object.platform_name.replace(
            " ", "")
        sample_object.technology = "RNA-SEQ"

        # Derive the manufacturer from the instrument model name.
        platform_name_upper = sample_object.platform_name.upper()
        if "ILLUMINA" in platform_name_upper \
                or "NEXTSEQ" in platform_name_upper:
            sample_object.manufacturer = "ILLUMINA"
        elif "ION TORRENT" in platform_name_upper:
            sample_object.manufacturer = "ION_TORRENT"
        else:
            sample_object.manufacturer = "UNKNOWN"

        # Directly apply the harmonized values
        sample_object.title = harmony.extract_title(metadata)
        harmonized_sample = harmony.harmonize([metadata])
        for key, value in harmonized_sample.items():
            setattr(sample_object, key, value)

        protocol_info, is_updated = self.update_sample_protocol_info(
            existing_protocols=[],
            experiment_protocol=experiment_object.protocol_description,
            experiment_url=experiment_object.source_url)
        # Do not check is_updated the first time because we must
        # save a list so we can append to it later.
        sample_object.protocol_info = protocol_info

        sample_object.save()

        # Register every raw file of the run and link it to the sample.
        for file_url in files_urls:
            original_file = OriginalFile.objects.get_or_create(
                source_url=file_url,
                source_filename=file_url.split('/')[-1],
                has_raw=True)[0]
            OriginalFileSampleAssociation.objects.get_or_create(
                original_file=original_file, sample=sample_object)

    # Create associations if they don't already exist
    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment_object, sample=sample_object)

    ExperimentOrganismAssociation.objects.get_or_create(
        experiment=experiment_object, organism=organism)

    return experiment_object, [sample_object]