Example #1
0
def _run_scan_upc(job_context: Dict) -> Dict:
    """Processes an input CEL file to an output PCL file.

    Does so using the SCAN.UPC package's SCANfast method via R.
    Expects job_context to contain the keys 'input_file_path',
    'output_file_path', and 'brainarray_package'.

    On an R runtime error, marks the job as failed (success=False,
    no_retry=True) and records the failure reason. Returns the
    (mutated) job_context either way.
    """
    cel_path = job_context["input_file_path"]

    try:
        # SCANfast calls into the foreach library without loading it
        # itself, so load it up front.
        ro.r("suppressMessages(library('foreach'))")

        # Prevents:
        # RRuntimeWarning: There were 50 or more warnings (use warnings()
        # to see the first 50)
        ro.r("options(warn=1)")

        # rpy2 surfaces every R message as a Python warning; ignore them
        # all to silence a lot of useless output.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            scan_fast = ro.r['::']('SCAN.UPC', 'SCANfast')
            job_context['time_start'] = timezone.now()

            brainarray_package = job_context["brainarray_package"]

            # Related: https://github.com/AlexsLemonade/refinebio/issues/64
            if brainarray_package:
                # If we've detected the platform using affy, then this
                # is the best source of truth we'll be able to get, so
                # update the samples to match it.
                accession = job_context["platform_accession_code"]
                readable_name = get_readable_affymetrix_names()[accession]

                for sample in job_context["samples"]:
                    sample.platform_accession_code = accession
                    sample.platform_name = readable_name
                    sample.save()

                scan_fast(cel_path,
                          job_context["output_file_path"],
                          probeSummaryPackage=brainarray_package)
            else:
                scan_fast(cel_path, job_context["output_file_path"])
            job_context['time_end'] = timezone.now()

    except RRuntimeError as e:
        error_message = (
            "Encountered error in R code while running AFFY_TO_PCL"
            " pipeline during processing of {0}: {1}").format(cel_path, str(e))
        logger.error(error_message, processor_job=job_context["job_id"])
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        job_context["job"].no_retry = True

    return job_context
Example #2
0
 def test_readable_affymetrix_names(self):
     """Test that the setting for Affymetrix accessions to
     human readable names is set correctly."""
     readable_platform_names = utils.get_readable_affymetrix_names()
     # Use assertEqual rather than assertTrue(a == b) so a failure
     # reports both values instead of just "False is not true".
     self.assertEqual(
         readable_platform_names["chigene10st"],
         "[ChiGene-1_0-st] Affymetrix Chicken Gene 1.0 ST Array",
     )
     self.assertEqual(
         readable_platform_names["xenopuslaevis"],
         "[Xenopus_laevis] Affymetrix Xenopus laevis Genome Array",
     )
Example #3
0
    def handle(self, *args, **options):
        """Main function for this command.

        Basically does what is described at the top of this file:
        finds GEO samples whose original files are actually Affymetrix
        .CEL data and corrects their platform/technology metadata.
        """
        # Create working dir
        LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR",
                                          "/home/user/data_store")
        work_dir = LOCAL_ROOT_DIR + "/affy_correction/"
        os.makedirs(work_dir, exist_ok=True)
        try:
            for sample in Sample.objects.filter(technology="RNA-SEQ",
                                                source_database="GEO"):
                for original_file in sample.original_files.all():
                    if not original_file.is_affy_data():
                        continue

                    input_file_path = work_dir + original_file.source_filename
                    download_success = _download_file(original_file.source_url,
                                                      input_file_path)

                    if download_success:
                        try:
                            brainarray_package = _determine_brainarray_package(
                                input_file_path)

                            if brainarray_package:
                                # Pass the package name as a lazy logging
                                # argument instead of concatenating it into
                                # the format string, so a '%' in the value
                                # can't break %-formatting.
                                logger.info(
                                    "Determined the package for sample %d is: %s",
                                    sample.id,
                                    brainarray_package,
                                )
                                # If we've detected the platform using affy, then this
                                # is the best source of truth we'll be able to get, so
                                # update the sample to match it.
                                platform_name = get_readable_affymetrix_names(
                                )[brainarray_package]

                                sample.platform_accession_code = brainarray_package
                                sample.platform_name = platform_name
                        except Exception:
                            # Best-effort detection: log and keep going.
                            # (Exception, not a bare except, so that
                            # KeyboardInterrupt/SystemExit still propagate.)
                            logger.exception(
                                "Failed to detect platform from downloaded file %s.",
                                input_file_path,
                            )

                    # Regardless of whether we could detect the
                    # platform successfully or not, we definitely know
                    # it's an Affymetrix Microarray because that's the
                    # only one that makes .CEL files.
                    sample.technology = "MICROARRAY"
                    sample.manufacturer = "AFFYMETRIX"
                    sample.save()

                    # If there's other original files associated with
                    # this sample, we don't need them because we
                    # already corrected the platform.
                    break
        finally:
            # Cleanup after ourselves even if something above raised, so
            # the working dir never leaks. ignore_errors keeps cleanup
            # failures from masking a real exception.
            shutil.rmtree(work_dir, ignore_errors=True)
Example #4
0
File: geo.py  Project: erflynn/refinebio
    def set_platform_properties(self, sample_object: Sample,
                                sample_metadata: Dict,
                                gse: GEOparse.GSM) -> Sample:
        """Sets platform-related properties on `sample_object`.

        Uses metadata from `gse` to populate platform_name,
        platform_accession_code, and technology on `sample_object`.

        NOTE(review): despite its name, the `gse` parameter is annotated
        as a GEOparse.GSM (a single sample record), not a series —
        confirm against callers. `sample_metadata` is not read here.

        Returns `sample_object`, which is also mutated in place.
        """

        # Determine platform information
        external_accession = get_normalized_platform(
            gse.metadata.get("platform_id", [UNKNOWN])[0])

        if external_accession == UNKNOWN:
            sample_object.platform_accession_code = UNKNOWN
            sample_object.platform_name = UNKNOWN
            sample_object.manufacturer = UNKNOWN
            # If this sample is Affy, we potentially can extract the
            # platform information from the .CEL file. If it's not we
            # can't do anything. Therefore assume the technology is
            # microarray when we have no platform information.
            sample_object.technology = "MICROARRAY"
            return sample_object

        platform_accession_code = UNKNOWN

        # Fetch the platform (GPL) record from GEO so we can use its
        # human-readable title below.
        gpl = GEOparse.get_GEO(external_accession,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

        # Check if this is a supported microarray platform.
        # (If multiple entries match, the last match wins.)
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                platform_accession_code = platform["platform_accession"]

        if platform_accession_code != UNKNOWN:
            # It's a supported microarray platform.

            # We are using the brain array package as the platform accession code,
            # so, for instance, GPL3213 becomes 'chicken'.
            sample_object.platform_accession_code = platform_accession_code
            sample_object.technology = "MICROARRAY"
            try:

                # Related: https://github.com/AlexsLemonade/refinebio/issues/354
                # If it's Affy we can get a readable name:
                sample_object.platform_name = get_readable_affymetrix_names(
                )[platform_accession_code]
                sample_object.manufacturer = "AFFYMETRIX"

                # Sometimes Affymetrix samples have weird channel
                # protocol metadata, so if we find that it's
                # Affymetrix return it now. Example: GSE113945
                return sample_object
            except KeyError:
                # Otherwise we'll use what we've got.
                sample_object.platform_name = platform_title

            # Determine manufacturer

            # Keyword search over the platform's display name; UNKNOWN
            # if no manufacturer keyword matches.
            platform = sample_object.pretty_platform.upper()
            if "AGILENT" in platform:
                sample_object.manufacturer = "AGILENT"
            elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
                sample_object.manufacturer = "ILLUMINA"
            elif "AFFYMETRIX" in platform:
                sample_object.manufacturer = "AFFYMETRIX"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

        # Check to see if this is a supported RNASeq technology:

        # GEO RNASeq platform titles often have organisms appended to
        # an otherwise recognizable platform. The list of supported
        # RNASeq platforms isn't long, so see if any of them are
        # contained within what GEO gave us.
        # Example: GSE69572 has a platform title of:
        # 'Illumina Genome Analyzer IIx (Glycine max)'
        # Which should really just be 'Illumina Genome Analyzer IIx'
        # because RNASeq platforms are organism agnostic.  However,
        # the platforms 'Illumina Genome Analyzer' and 'Illumina
        # Genome Analyzer II' would also be matched, so make sure that
        # the longest platform names are tested first:
        sorted_platform_list = get_supported_rnaseq_platforms().copy()
        sorted_platform_list.sort(key=len, reverse=True)

        for platform in sorted_platform_list:
            if platform.upper() in platform_title.upper():
                sample_object.technology = "RNA-SEQ"
                sample_object.platform_name = platform
                # We just use RNASeq platform titles as accessions
                sample_object.platform_accession_code = platform

                # NOTE(review): the ILLUMINA branch is checked first, so
                # the NEXTSEQ branch below is only reachable for titles
                # containing "NEXTSEQ" but not "ILLUMINA".
                if "ILLUMINA" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ILLUMINA"
                elif "NEXTSEQ" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "NEXTSEQ"
                elif "ION TORRENT" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ION_TORRENT"
                else:
                    sample_object.manufacturer = UNKNOWN

                return sample_object

        # If we've made it this far, we don't know what this platform
        # is, therefore we can't know what its technology is. What we
        # do know is what GEO said its platform's accession and
        # title are, and that it's unsupported.
        sample_object.platform_name = platform_title
        sample_object.platform_accession_code = external_accession
        sample_object.technology = UNKNOWN
        sample_object.manufacturer = UNKNOWN

        return sample_object
Example #5
0
    def create_experiment_from_api(
            self, experiment_accession_code: str) -> (Experiment, Dict):
        """Given an experiment accession code, create an Experiment object.

        Also returns a dictionary of additional information about the
        platform discovered for the experiment (keys:
        platform_accession_code, platform_accession_name, manufacturer).

        Will raise an UnsupportedPlatformException if this experiment was
        conducted using a platform which we don't support.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
        """
        request_url = EXPERIMENTS_URL + experiment_accession_code
        experiment_request = utils.requests_retry_session().get(request_url,
                                                                timeout=60)

        try:
            parsed_json = experiment_request.json(
            )["experiments"]["experiment"][0]
        except KeyError:
            logger.error("Remote experiment has no Experiment data!",
                         experiment_accession_code=experiment_accession_code,
                         survey_job=self.survey_job.id)
            raise

        experiment = {}
        experiment["name"] = parsed_json["name"]
        experiment["experiment_accession_code"] = experiment_accession_code

        # This experiment has no platform at all, and is therefore useless.
        if 'arraydesign' not in parsed_json or len(
                parsed_json["arraydesign"]) == 0:
            logger.warn("Remote experiment has no arraydesign listed.",
                        experiment_accession_code=experiment_accession_code,
                        survey_job=self.survey_job.id)
            raise UnsupportedPlatformException
        # If there is more than one arraydesign listed in the experiment
        # then there is no other way to determine which array was used
        # for which sample other than looking at the header of the CEL
        # file. That obviously cannot happen until the CEL file has been
        # downloaded so we can just mark it as UNKNOWN and let the
        # downloader inspect the downloaded file to determine the
        # array then.
        elif len(parsed_json["arraydesign"]
                 ) != 1 or "accession" not in parsed_json["arraydesign"][0]:
            experiment["platform_accession_code"] = UNKNOWN
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN
        else:
            # Exactly one arraydesign with an accession: resolve it
            # against our list of supported microarray platforms.
            external_accession = parsed_json["arraydesign"][0]["accession"]
            for platform in get_supported_microarray_platforms():
                if platform["external_accession"] == external_accession:
                    experiment[
                        "platform_accession_code"] = get_normalized_platform(
                            platform["platform_accession"])

                    # Illumina appears in the accession codes for
                    # platforms manufactured by Illumina
                    if "ILLUMINA" in experiment[
                            "platform_accession_code"].upper():
                        experiment["manufacturer"] = "ILLUMINA"
                        experiment["platform_accession_name"] = platform[
                            "platform_accession"]
                    else:
                        # It's not Illumina, the only other supported Microarray platform is
                        # Affy. As our list of supported platforms grows this logic will
                        # need to get more sophisticated.
                        experiment["manufacturer"] = "AFFYMETRIX"
                        platform_mapping = get_readable_affymetrix_names()
                        experiment[
                            "platform_accession_name"] = platform_mapping[
                                platform["platform_accession"]]

            if "platform_accession_code" not in experiment:
                # We don't know what platform this accession corresponds to.
                experiment["platform_accession_code"] = external_accession
                experiment["platform_accession_name"] = UNKNOWN
                experiment["manufacturer"] = UNKNOWN

        experiment["release_date"] = parsed_json["releasedate"]

        if "lastupdatedate" in parsed_json:
            experiment["last_update_date"] = parsed_json["lastupdatedate"]
        else:
            experiment["last_update_date"] = parsed_json["releasedate"]

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            # We aren't sure these fields will be populated, or how many there will be.
            # Try to join them all together, or set a sensible default.
            # NOTE(review): "descripton" is a pre-existing misspelling of
            # this local variable's name; left as-is in this doc pass.
            experiment_descripton = ""
            if "description" in parsed_json and len(
                    parsed_json["description"]) > 0:
                for description_item in parsed_json["description"]:
                    if "text" in description_item:
                        experiment_descripton = experiment_descripton + description_item[
                            "text"] + "\n"

            if experiment_descripton == "":
                experiment_descripton = "Description not available.\n"

            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = request_url
            experiment_object.source_database = "ARRAY_EXPRESS"
            experiment_object.title = parsed_json["name"]
            # This will need to be updated if we ever use Array
            # Express to get other kinds of data.
            experiment_object.technology = "MICROARRAY"
            experiment_object.description = experiment_descripton
            experiment_object.source_first_published = parse_datetime(
                experiment["release_date"])
            experiment_object.source_last_modified = parse_datetime(
                experiment["last_update_date"])
            experiment_object.save()

            # Store the raw API response as an annotation for posterity.
            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = parsed_json
            json_xa.is_ccdl = False
            json_xa.save()

            ## Fetch and parse the IDF/SDRF file for any other fields
            IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
            idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
            idf_text = utils.requests_retry_session().get(idf_url,
                                                          timeout=60).text

            # IDF is a tab-separated key/value file; a key with multiple
            # values becomes a list.
            lines = idf_text.split('\n')
            idf_dict = {}
            for line in lines:
                keyval = line.strip().split('\t')
                if len(keyval) == 2:
                    idf_dict[keyval[0]] = keyval[1]
                elif len(keyval) > 2:
                    idf_dict[keyval[0]] = keyval[1:]

            idf_xa = ExperimentAnnotation()
            idf_xa.data = idf_dict
            idf_xa.experiment = experiment_object
            idf_xa.is_ccdl = False
            idf_xa.save()

            if 'Investigation Title' in idf_dict:
                experiment_object.title = idf_dict['Investigation Title']
            if 'Person Affiliation' in idf_dict:
                # This is very rare, ex: E-MEXP-32
                if isinstance(idf_dict['Person Affiliation'], list):

                    # De-duplicate affiliations and truncate to the
                    # field's 255-char limit.
                    unique_people = list(set(idf_dict['Person Affiliation']))
                    experiment_object.submitter_institution = ", ".join(
                        unique_people)[:255]
                else:
                    experiment_object.submitter_institution = idf_dict[
                        'Person Affiliation']

            # Get protocol_description from "<experiment_url>/protocols"
            # instead of from idf_dict, because the former provides more
            # details.
            protocol_url = request_url + '/protocols'
            protocol_request = utils.requests_retry_session().get(protocol_url,
                                                                  timeout=60)
            try:
                experiment_object.protocol_description = protocol_request.json(
                )['protocols']
            except KeyError:
                logger.warning(
                    "Remote experiment has no protocol data!",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)

            if 'Publication Title' in idf_dict:
                # This will happen for some superseries.
                # Ex: E-GEOD-29536
                # Assume most recent is "best:, store the rest in experiment annotation.
                if isinstance(idf_dict['Publication Title'], list):
                    experiment_object.publication_title = "; ".join(
                        idf_dict['Publication Title'])
                else:
                    experiment_object.publication_title = idf_dict[
                        'Publication Title']
                experiment_object.has_publication = True
            if 'Publication DOI' in idf_dict:
                if isinstance(idf_dict['Publication DOI'], list):
                    experiment_object.publication_doi = ", ".join(
                        idf_dict['Publication DOI'])
                else:
                    experiment_object.publication_doi = idf_dict[
                        'Publication DOI']
                experiment_object.has_publication = True
            if 'PubMed ID' in idf_dict:
                if isinstance(idf_dict['PubMed ID'], list):
                    experiment_object.pubmed_id = ", ".join(
                        idf_dict['PubMed ID'])
                else:
                    experiment_object.pubmed_id = idf_dict['PubMed ID']
                experiment_object.has_publication = True

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

        platform_dict = {}
        for k in ('platform_accession_code', 'platform_accession_name',
                  'manufacturer'):
            platform_dict[k] = experiment[k]

        return experiment_object, platform_dict
Example #6
0
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the files representing
    samples relating to a single experiment stored in ArrayExpress,
    registers the extracted files, corrects platform metadata for raw
    .CEL files, and queues processor jobs on success.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    og_files = []
    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to filename to prevent multiple jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'], source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment, that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                        + og_file['filename'],
                        downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate
        # source of truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"
            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception:
                # Header parsing can fail for many reasons; treat the
                # platform as undeterminable and keep the metadata value.
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)
            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = ("Found a raw .CEL file with an unsupported platform: "
                                      + original_file.absolute_file_path + " ("
                                      + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

            # However, if the filename contains '.CEL' we know
            # it's an Affymetrix Microarray.
            # (Bug fix: this previously assigned a misspelled
            # 'manufacterer' attribute, which silently left the real
            # manufacturer field unchanged.)
            sample_object.technology = "MICROARRAY"
            sample_object.manufacturer = "AFFYMETRIX"
            sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)

        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)