示例#1
0
def _replace_dotsra_with_fastq_files(
        sample: Sample, downloader_job: DownloaderJob,
        original_file: OriginalFile) -> List[OriginalFile]:
    """Swaps a sample's .SRA file for its two ENA .fastq read files.

    Only call this for a sample which has unmated reads: the function
    assumes, without checking, that ENA holds at least two read files
    for the sample.

    The passed-in original_file is repointed at the first read file and
    saved. The second read file is fetched or created, then associated
    with both the sample and the downloader job.

    Returns a list holding the first and second read files, in that
    order.
    """
    accession = sample.accession_code
    first_read_url, second_read_url = (
        _build_ena_file_url(accession, read_suffix)
        for read_suffix in ("_1", "_2"))

    # Strictly speaking the first read is a new file, but reusing the
    # existing record keeps its associations intact instead of tearing
    # them down only to rebuild identical ones.
    original_file.source_url = first_read_url
    original_file.source_filename = first_read_url.split("/")[-1]
    original_file.save()

    second_read_file, _ = OriginalFile.objects.get_or_create(
        source_url=second_read_url,
        source_filename=second_read_url.split("/")[-1],
        has_raw=True)

    # The second read needs the same links the first one already has.
    OriginalFileSampleAssociation.objects.get_or_create(
        original_file=second_read_file, sample=sample)
    DownloaderJobOriginalFileAssociation.objects.get_or_create(
        original_file=second_read_file, downloader_job=downloader_job)

    return [original_file, second_read_file]
示例#2
0
def _has_unmated_reads(accession_code: str,
                       downloader_job: DownloaderJob) -> bool:
    """Checks if the SRA accession has unmated reads.

    The directory holding the accession's files on the ENA FTP server
    is listed; exactly three entries are taken to mean the two read
    files plus a file of unmated reads.

    Args:
        accession_code: The accession whose ENA directory is inspected.
        downloader_job: Not used by this function; kept so existing
            callers continue to work.

    Returns:
        True if the directory listing has exactly three entries. False
        otherwise, including when listing the directory fails with an
        FTP error.

    Raises:
        Errors raised while connecting, logging in, or changing into
        the sample directory are not caught here and propagate to the
        caller. The connection is still closed in that case.
    """
    full_ftp_link = _build_ena_file_url(accession_code)

    # Strip off the protocol code because we know it's FTP and the FTP
    # library doesn't want the protocol.
    no_protocol_link = full_ftp_link.split("://")[1]

    # We need to extract the server so we can login to it.
    split_link = no_protocol_link.split("/")
    ftp_server = split_link[0]

    # We need to get the FTP directory the file is in so we can check
    # how many other files are in it. Therefore we're looking to get
    # the path between the server and the filename itself.
    sample_directory = "/".join(split_link[1:-1])

    ftp = FTP(ftp_server)
    try:
        # login() and cwd() sit inside the try/finally so that the
        # connection is closed even when one of them fails. Their
        # errors are deliberately NOT swallowed, only cleaned up after.
        ftp.login()
        ftp.cwd(sample_directory)

        try:
            # If there's three files then there's unmated reads, because
            # there's the one read file, the other read file, and the
            # unmated reads.
            return len(ftp.nlst()) == 3
        except ftplib.all_errors:
            # If we can't find the sample on ENA's FTP server, then we
            # shouldn't try to download it from there.
            return False
    finally:
        ftp.close()
示例#3
0
    def _generate_experiment_and_samples(
            self,
            run_accession: str,
            study_accession: str = None) -> (Experiment, List[Sample]):
        """Builds the Experiment and Sample records for a single run.

        Metadata for run_accession is gathered first. The Experiment
        (looked up by the metadata's study accession) and the Sample
        (looked up by the metadata's run accession) are each created
        only if they do not exist yet. A newly created sample also gets
        its OriginalFile records and their sample associations. The
        experiment/sample and experiment/organism associations are
        ensured in every case.

        Returns a tuple of the experiment and a one-element list
        holding the sample, or (None, None) when either no metadata or
        no organism name could be found for the run.
        """
        metadata = SraSurveyor.gather_all_metadata(run_accession)

        if metadata == {}:
            # Only mention the study in the log when we were given one.
            error_context = {"accession": run_accession}
            if study_accession:
                error_context["study_accession"] = study_accession
            logger.error("Could not discover any metadata for run.",
                         **error_context)
            return (None, None)  # This will cascade properly

        # Decide which URLs the raw data will be downloaded from.
        if DOWNLOAD_SOURCE != "ENA":
            file_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]
        elif metadata["library_layout"] == "PAIRED":
            # Paired layouts get one file per read suffix.
            file_urls = [
                _build_ena_file_url(run_accession, read_suffix)
                for read_suffix in ("_1", "_2")
            ]
        else:
            file_urls = [_build_ena_file_url(run_accession)]

        # The organism name is removed from the metadata here, so it is
        # not part of what gets applied or stored further down.
        raw_organism_name = metadata.pop("organism_name", None)
        if not raw_organism_name:
            logger.error("Could not discover organism type for run.",
                         accession=run_accession)
            return (None, None)  # This will cascade properly

        organism = Organism.get_object_for_name(raw_organism_name.upper())

        ##
        # Experiment
        ##

        study_code = metadata.get("study_accession")
        try:
            experiment = Experiment.objects.get(accession_code=study_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=study_code,
                survey_job=self.survey_job.id,
            )
        except Experiment.DoesNotExist:
            experiment = Experiment()
            experiment.accession_code = study_code
            SraSurveyor._apply_metadata_to_experiment(experiment, metadata)
            experiment.save()

            # Keep the gathered metadata alongside the new experiment
            # as a non-CCDL annotation.
            annotation = ExperimentAnnotation()
            annotation.experiment = experiment
            annotation.data = metadata
            annotation.is_ccdl = False
            annotation.save()

        ##
        # Samples
        ##

        run_code = metadata.pop("run_accession")
        try:
            sample = Sample.objects.get(accession_code=run_code)

            # The sample is already known: fold in any protocol
            # information this experiment adds, saving only on change.
            merged_protocols, changed = self.update_sample_protocol_info(
                sample.protocol_info,
                experiment.protocol_description,
                experiment.source_url,
            )
            if changed:
                sample.protocol_info = merged_protocols
                sample.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                run_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
            )
        except Sample.DoesNotExist:
            sample = Sample()
            sample.source_database = "SRA"
            sample.technology = "RNA-SEQ"
            sample.accession_code = run_code
            sample.organism = organism

            sample.platform_name = metadata.get(
                "platform_instrument_model", "UNKNOWN")
            # Platform names are human readable and may contain spaces;
            # the accession code form of the name must not.
            sample.platform_accession_code = sample.platform_name.replace(
                " ", "")

            # Derive the manufacturer from the upper-cased platform name.
            platform_upper = sample.platform_name.upper()
            if any(brand in platform_upper
                   for brand in ("ILLUMINA", "NEXTSEQ")):
                sample.manufacturer = "ILLUMINA"
            elif "ION TORRENT" in platform_upper:
                sample.manufacturer = "ION_TORRENT"
            else:
                sample.manufacturer = "UNKNOWN"

            SraSurveyor._apply_harmonized_metadata_to_sample(
                sample, metadata)

            # The "updated" flag is ignored for a brand new sample: a
            # list has to be stored regardless so that later runs have
            # something to append to.
            initial_protocols, _ = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                experiment_url=experiment.source_url,
            )
            sample.protocol_info = initial_protocols

            sample.save()

            for url in file_urls:
                original_file, _ = OriginalFile.objects.get_or_create(
                    source_url=url,
                    source_filename=url.split("/")[-1],
                    has_raw=True)
                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample)

        # Make sure the experiment is linked to both the sample and the
        # organism, whether or not either object was just created.
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

        return experiment, [sample]