Example #1
    def __init__(self, dmp_file):
        self.files = FileRepository.all()
        self.dmp_file = dmp_file
        self.bam_path = dmp_file.file.path
        self.metadata = dmp_file.metadata
        self.mutations_extended = self._set_data_muts_txt()
        self.dmp_sample_name = self._set_dmp_sample_name()
Example #2
    def __init__(
        self,
        model,
        job_group_id=None,
        job_group_notifier_id=None,
        request_id=None,
        run_ids=None,
        pipeline=None,
        pairing=None,
        output_directory_prefix=None,
    ):
        if not isinstance(model, OperatorModel):
            raise Exception(
                "Must pass an instance of beagle_etl.models.Operator")

        self.model = model
        self.request_id = request_id
        self.job_group_id = job_group_id
        self.job_group_notifier_id = job_group_notifier_id
        self.run_ids = run_ids or []  # avoid sharing a mutable default list
        self.files = FileRepository.all()
        self.pairing = pairing
        # {"pairs": [{"tumor": "tumorSampleName", "normal": "normalSampleName"}]}
        self.output_directory_prefix = output_directory_prefix
        self._jobs = []
        self._pipeline = pipeline
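
A minimal construction sketch (not from the source): the enclosing class name Operator, the request id, and the run ids below are hypothetical, and a configured Django environment with beagle_etl loaded is assumed.

from beagle_etl.models import Operator as OperatorModel

model = OperatorModel.objects.first()  # any registered operator record
operator = Operator(  # hypothetical: assumes the class above is named Operator
    model,
    request_id="REQUEST_0001",  # hypothetical request id
    run_ids=["run-uuid-1"],  # hypothetical run uuid
)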
Example #3
    def __init__(self, sample_id):
        self.files = FileRepository.all()
        self.sample_id = sample_id
        self.patient_id, self.cmo_sample_name = self._get_sample_metadata()
        self.dmp_patient_id = self._get_dmp_patient_id()
        self.dmp_bams_tumor = self._find_dmp_bams("T")
        self.dmp_bams_normal = self._find_dmp_bams("N")
Example #4
    def _get_request_id(self):
        files = FileRepository.all()
        request_ids = set()
        for run_id in self.run_ids:
            run = Run.objects.filter(id=run_id)[0]
            sample_name = run.tags['sampleNameTumor']
            sample_files = FileRepository.filter(
                queryset=files, metadata={'cmoSampleName': sample_name})
            for f in sample_files:
                metadata = f.metadata
                if 'requestId' in metadata:
                    request_ids.add(metadata['requestId'])
        request_id = "_".join(sorted(request_ids))  # sort for a deterministic id
        return request_id
Example #5
    def _get_samples_data(self):
        files = FileRepository.all()
        f = FileRepository.filter(queryset=files,
                                  metadata={
                                      "cmoSampleName": self.tumor_sample_name,
                                      "igocomplete": True
                                  },
                                  filter_redact=True)
        sample = None
        if f:
            # retrieve metadata from first record (should only be one)
            meta = f[0].metadata
            sample_id = meta["sampleId"]
            sample = SampleData(sample_id)
        return sample
Example #6
def get_dmp_bam(patient_id, bait_set, tumor_type):
    """
    From a patient id and bait set, get matching dmp bam normal
    """
    file_objs = FileRepository.all()

    dmp_query = build_dmp_query(patient_id, bait_set)

    dmp_bam = FileRepository.filter(queryset=file_objs, q=dmp_query).order_by('file__file_name').first()

    if dmp_bam:
        sample = build_dmp_sample(dmp_bam, patient_id, bait_set, tumor_type)
        built_sample = build_sample([sample], ignore_sample_formatting=True)
        return built_sample
    return None
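
A hedged usage sketch for get_dmp_bam; the patient id, bait set, and the "T"/"N" tumor-type convention mirror Example #3 but are hypothetical values, and a configured beagle environment is assumed.

dmp_tumor_bam = get_dmp_bam("C-000001", "IMPACT468_BAITS", "T")  # hypothetical args
if dmp_tumor_bam is None:
    pass  # no matching DMP BAM for this patient and bait set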
Example #7
def get_pooled_normal_files(run_ids, preservation_types, bait_set):
    """
    Return pooled normal files matching the run ids and preservation types,
    along with the resolved descriptor (bait set/recipe) and sample name
    """
    pooled_normals = FileRepository.all()

    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)

    q = query & run_id_query & preservation_query

    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)

    pooled_normals, descriptor, sample_name = get_descriptor(bait_set, pooled_normals, preservation_types, run_ids)

    return pooled_normals, descriptor, sample_name
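
Because the helper returns a tuple, callers unpack all three values; the run id, preservation type, and bait set below are hypothetical.

pooled_normals, descriptor, sample_name = get_pooled_normal_files(
    ["RUN_0001"],  # hypothetical run id
    ["Frozen"],
    "IMPACT468_BAITS",  # hypothetical bait set
)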
Example #8
def get_dmp_normal(patient_id, bait_set):
    """
    From a patient id and bait set, get matching dmp bam normal
    """
    file_objs = FileRepository.all()

    dmp_query = build_dmp_query(patient_id, bait_set)

    dmp_bam = FileRepository.filter(
        queryset=file_objs, q=dmp_query).order_by("file__file_name").first()

    if dmp_bam:
        dmp_metadata = dmp_bam.metadata
        specimen_type = "DMP Normal"
        sample_name = dmp_metadata["external_id"]
        sequencingCenter = "MSKCC"
        platform = "Illumina"
        sample = dict()
        sample["id"] = dmp_bam.file.id
        sample["path"] = dmp_bam.file.path
        sample["file_name"] = dmp_bam.file.file_name
        sample["file_type"] = dmp_bam.file.file_type
        metadata = init_metadata()
        metadata["sampleId"] = sample_name
        metadata["sampleName"] = format_sample_name(sample_name, specimen_type)
        metadata["requestId"] = sample_name
        metadata["sequencingCenter"] = sequencingCenter
        metadata["platform"] = platform
        metadata["baitSet"] = bait_set
        metadata["recipe"] = bait_set
        metadata["run_id"] = ""
        metadata["preservation"] = ""
        metadata["libraryId"] = sample_name + "_1"
        metadata["R"] = "Not applicable"
        # because rgid depends on flowCellId and barcodeIndex, we will
        # spoof barcodeIndex so that pairing can work properly; see
        # build_sample in runner.operator.argos_operator.bin
        metadata["barcodeIndex"] = "DMP_BARCODEIDX"
        metadata["flowCellId"] = "DMP_FCID"
        metadata["tumorOrNormal"] = "Normal"
        metadata["patientId"] = patient_id
        metadata["specimenType"] = specimen_type
        sample["metadata"] = metadata
        built_sample = build_sample([sample], ignore_sample_formatting=True)
        return built_sample
    return None
Example #9
def get_oncotree_codes(request_id):
    oncotree_dh = OncotreeDataHandler()
    files = FileRepository.all()
    oncotree_codes_tmp = set(
        FileRepository.filter(queryset=files, metadata={"requestId": request_id}).values_list(
            "metadata__oncoTreeCode", flat=True
        )
    )
    oncotree_codes = [val for val in oncotree_codes_tmp if val]  # drop empty codes
    if not oncotree_codes:  # hack; if there are no oncotree codes, just say it's mixed
        return "mixed"
    shared_nodes = oncotree_dh.find_shared_nodes_by_code_list(oncotree_codes)
    common_anc = oncotree_dh.get_highest_level_shared_node(shared_nodes)
    if common_anc.code.lower() == "tissue":
        common_anc.code = "mixed"
    return common_anc.code.lower()
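
A call sketch with a hypothetical request id; the function returns a lower-cased oncotree code, or "mixed" when the request's samples share no common ancestor below the tissue root.

oncotree_code = get_oncotree_codes("REQUEST_0001")  # hypothetical request id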
Example #10
def get_request_pi(run_id_list):
    request_pis = set()
    files = FileRepository.all()
    all_request_ids = set()
    # reducing number of queries
    for run_id in run_id_list:
        argos_run = Run.objects.get(id=run_id)
        run_request_id = argos_run.tags["requestId"]
        all_request_ids.add(run_request_id)
    for request_id in all_request_ids:
        investigator_emails = FileRepository.filter(queryset=files, metadata={"requestId": request_id}).values_list(
            "metadata__investigatorEmail", flat=True
        )
        request_pis = request_pis.union(set(investigator_emails))
    request_pis_final = [format_msk_id(pi) for pi in request_pis if pi]
    return ",".join(request_pis_final)
Example #11
def get_samples_from_patient_id(patient_id):
    """
    Retrieves samples from the database based on the patient_id

    Only retrieve patients from LIMS file group
    """
    all_files = FileRepository.all()
    q_pid = Q(metadata__patientId=patient_id)
    q_fg = build_argos_file_groups_query()
    q = q_pid & q_fg
    files = FileRepository.filter(queryset=all_files, q=q, filter_redact=True)
    data = list()
    for current_file in files:
        sample = dict()
        sample["id"] = current_file.file.id
        sample["path"] = current_file.file.path
        sample["file_name"] = current_file.file.file_name
        sample["metadata"] = current_file.metadata
        data.append(sample)

    samples = list()
    # group by igoId
    igo_id_group = dict()
    for sample in data:
        igo_id = sample["metadata"]["sampleId"]
        if igo_id not in igo_id_group:
            igo_id_group[igo_id] = list()
        igo_id_group[igo_id].append(sample)

    for igo_id in igo_id_group:
        samples.append(build_sample(igo_id_group[igo_id]))
    samples, bad_samples = remove_with_caveats(samples)
    number_of_bad_samples = len(bad_samples)
    if number_of_bad_samples > 0:
        LOGGER.warning(
            "Patient query %s returned %i samples with invalid values",
            patient_id, number_of_bad_samples)
    return samples
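
A usage sketch with a hypothetical CMO patient id; each returned entry is the output of build_sample over the files grouped by igoId, and process() is a hypothetical downstream handler.

for sample in get_samples_from_patient_id("C-000001"):  # hypothetical patient id
    process(sample)  # hypothetical handler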
Example #12
    def get_jobs(self, pairing_override=None):
        logger.info("Operator JobGroupNotifer ID %s",
                    self.job_group_notifier_id)
        tmpdir = os.path.join(settings.BEAGLE_SHARED_TMPDIR, str(uuid.uuid4()))
        self.OUTPUT_DIR = tmpdir
        Path(self.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

        recipe_query = self.build_recipe_query()
        assay_query = self.build_assay_query()
        igocomplete_query = Q(metadata__igocomplete=True)
        missing_fields_query = self.filter_out_missing_fields_query()
        q = recipe_query & assay_query & igocomplete_query & missing_fields_query
        files = FileRepository.all()
        tempo_files = FileRepository.filter(queryset=files, q=q)

        self.send_message("""
            Querying database for the following recipes:
                {recipes}

            Querying database for the following assays/bait sets:
                {assays}
            """.format(recipes="\t\n".join(self.get_recipes()),
                       assays="\t\n".join(self.get_assays())))

        exclude_query = self.get_exclusions()
        if exclude_query:
            tempo_files = tempo_files.exclude(exclude_query)
        # replace with run operator logic, most recent pairing
        pre_pairing = self.load_pairing_file(
            PAIRING_FILE_LOCATION)  # pairing.tsv is not in repo
        if pairing_override:
            normal_samples = pairing_override['normal_samples']
            tumor_samples = pairing_override['tumor_samples']
            num_ns = len(normal_samples)
            num_ts = len(tumor_samples)
            if num_ns != num_ts:
                print("Number of tumors and normals not the same; can't pair")
            else:
                for tumor_id, normal_id in zip(tumor_samples, normal_samples):
                    pre_pairing[tumor_id] = normal_id
        patient_ids = set()
        patient_files = dict()
        no_patient_samples = list()
        for entry in tempo_files:
            patient_id = entry.metadata['patientId']
            if patient_id:
                patient_ids.add(patient_id)
                if patient_id not in patient_files:
                    patient_files[patient_id] = list()
                patient_files[patient_id].append(entry)
            else:
                no_patient_samples.append(entry)

        self.patients = dict()
        self.non_cmo_patients = dict()
        for patient_id in patient_files:
            if "C-" in patient_id[:2]:
                self.patients[patient_id] = patient_obj.Patient(
                    patient_id, patient_files[patient_id], pre_pairing)
            else:
                self.non_cmo_patients[patient_id] = patient_obj.Patient(
                    patient_id, patient_files[patient_id])

        input_json = dict()
        # output these strings to file
        input_json['conflict_data'] = self.create_conflict_samples_txt_file()
        input_json['unpaired_data'] = self.create_unpaired_txt_file()
        input_json['mapping_data'] = self.create_mapping_file()
        input_json['pairing_data'] = self.create_pairing_file()
        input_json['tracker_data'] = self.create_tracker_file()

        pickle_file = os.path.join(self.OUTPUT_DIR, "patients_data_pickle")
        with open(pickle_file, 'wb') as fh:
            pickle.dump(self.patients, fh)
        os.chmod(pickle_file, 0o777)
        self.register_tmp_file(pickle_file)

        input_json['pickle_data'] = {
            'class': 'File',
            'location': "juno://" + pickle_file
        }

        beagle_version = __version__
        run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")

        tags = {"beagle_version": beagle_version, "run_date": run_date}

        app = self.get_pipeline_id()
        pipeline = Pipeline.objects.get(id=app)
        pipeline_version = pipeline.version
        output_directory = pipeline.output_directory

        self.debug_json = input_json

        tempo_mpgen_outputs_job_data = {
            'app': app,
            'inputs': input_json,
            'name': "Tempo mpgen %s" % run_date,
            'tags': tags,
            'output_directory': output_directory
        }

        tempo_mpgen_outputs_job = [
            (APIRunCreateSerializer(data=tempo_mpgen_outputs_job_data),
             input_json)
        ]
        return tempo_mpgen_outputs_job
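
get_jobs returns (serializer, input_json) pairs, so a caller would validate each serializer before creating runs; this sketch assumes the standard DRF is_valid()/save() flow, which may differ from the project's actual run-creation code.

for serializer, input_json in operator.get_jobs():
    if serializer.is_valid():
        serializer.save()  # assumption: standard DRF create flow
    else:
        logger.error("Invalid job data: %s", serializer.errors)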
Example #13
def get_pooled_normals(run_ids, preservation_types, bait_set):
    """
    From a list of run_ids, preservation types, and bait sets, get all potential pooled normals
    """
    pooled_normals = FileRepository.all()

    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)

    q = query & run_id_query & preservation_query

    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)

    descriptor = get_descriptor(bait_set, pooled_normals)

    if descriptor:  # From returned pooled normals, we found the bait set/recipe we're looking for
        pooled_normals = FileRepository.filter(queryset=pooled_normals,
                                               metadata={'recipe': descriptor})

        # sample_name is FROZENPOOLEDNORMAL unless FFPE appears in preservation_types
        preservations_lower_case = {x.lower() for x in preservation_types}
        run_ids_suffix_list = [i for i in run_ids if i]  # drop empty values
        run_ids_suffix = "_".join(run_ids_suffix_list)
        sample_name = "FROZENPOOLEDNORMAL_" + run_ids_suffix
        if "ffpe" in preservations_lower_case:
            sample_name = "FFPEPOOLEDNORMAL_" + run_ids_suffix
    elif "impact505" in bait_set.lower():
        # We didn't find a pooled normal for IMPACT505; return "static" FROZEN or FFPE pool normal
        preservations_lower_case = {x.lower() for x in preservation_types}
        sample_name = "FROZENPOOLEDNORMAL_IMPACT505_V1"
        if "ffpe" in preservations_lower_case:
            sample_name = "FFPEPOOLEDNORMAL_IMPACT505_V1"
        q = query & Q(metadata__sampleName=sample_name)
        pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)
        if not pooled_normals:
            LOGGER.error("Could not find IMPACT505 pooled normal to pair %s",
                         sample_name)
            return None
    else:
        return None

    specimen_type = 'Pooled Normal'

    sample_files = list()

    if len(pooled_normals) > 0:
        for pooled_normal in pooled_normals:
            sample = dict()
            sample['id'] = pooled_normal.file.id
            sample['path'] = pooled_normal.file.path
            sample['file_name'] = pooled_normal.file.file_name
            metadata = init_metadata()
            metadata['sampleId'] = sample_name
            metadata['sampleName'] = sample_name
            metadata['requestId'] = sample_name
            metadata['sequencingCenter'] = "MSKCC"
            metadata['platform'] = "Illumina"
            metadata['baitSet'] = descriptor
            metadata['recipe'] = descriptor
            metadata['run_id'] = run_ids
            metadata['preservation'] = preservation_types
            metadata['libraryId'] = sample_name + "_1"
            # because rgid depends on flowCellId and barcodeIndex, we will
            # spoof barcodeIndex so that pairing can work properly; see
            # build_sample in runner.operator.argos_operator.bin
            metadata['R'] = get_r_orientation(pooled_normal.file.file_name)
            metadata['barcodeIndex'] = spoof_barcode(sample['file_name'],
                                                     metadata['R'])
            metadata['flowCellId'] = 'PN_FCID'
            metadata['tumorOrNormal'] = 'Normal'
            metadata['patientId'] = 'PN_PATIENT_ID'
            metadata['specimenType'] = specimen_type
            sample['metadata'] = metadata
            sample_files.append(sample)
        pooled_normal = build_sample(sample_files,
                                     ignore_sample_formatting=True)
        return pooled_normal
    return None
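
A call sketch mirroring the other helpers; the arguments are hypothetical, and the return value is either a built pooled-normal sample or None.

pooled_normal = get_pooled_normals(
    ["RUN_0001"], ["Frozen"], "IMPACT468_BAITS")  # hypothetical arguments
if pooled_normal is None:
    pass  # no pooled normal available for this run/bait set combination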
Example #14
def get_pooled_normals(run_ids, preservation_types, bait_set):
    """
    From a list of run_ids, preservation types, and bait sets, get all potential pooled normals
    """
    pooled_normals = FileRepository.all()

    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)

    q = query & run_id_query & preservation_query

    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)

    # 'descriptor' should be the same as bait set, but it's labeled
    # descriptor because in pooled normals it's called 'recipe'
    # TODO: change pooled normal field value 'recipe' -> bait_set/baitSet
    descriptor = get_descriptor(bait_set, pooled_normals)

    if not descriptor:  # i.e., no pooled normal
        return None

    pooled_normals = FileRepository.filter(queryset=pooled_normals,
                                           metadata={"recipe": descriptor})
    sample_files = list()

    # sample_name is FROZENPOOLEDNORMAL unless FFPE appears in preservation_types
    preservations_lower_case = {x.lower() for x in preservation_types}
    run_ids_suffix_list = [i for i in run_ids if i]  # drop empty values
    run_ids_suffix = "_".join(run_ids_suffix_list)
    sample_name = "FROZENPOOLEDNORMAL_" + run_ids_suffix
    if "ffpe" in preservations_lower_case:
        sample_name = "FFPEPOOLEDNORMAL_" + run_ids_suffix

    specimen_type = "Pooled Normal"
    num_of_pooled_normals = len(pooled_normals)
    if num_of_pooled_normals > 0:
        for pooled_normal in pooled_normals:
            sample = dict()
            sample["id"] = pooled_normal.file.id
            sample["path"] = pooled_normal.file.path
            sample["file_name"] = pooled_normal.file.file_name
            metadata = init_metadata()
            metadata["sampleId"] = sample_name
            metadata["sampleName"] = sample_name
            metadata["requestId"] = sample_name
            metadata["sequencingCenter"] = "MSKCC"
            metadata["platform"] = "Illumina"
            metadata["baitSet"] = descriptor
            metadata["recipe"] = descriptor
            metadata["run_id"] = run_ids
            metadata["preservation"] = preservation_types
            metadata["libraryId"] = sample_name + "_1"
            # because rgid depends on flowCellId and barcodeIndex, we will
            # spoof barcodeIndex so that pairing can work properly; see
            # build_sample in runner.operator.argos_operator.bin
            metadata["R"] = get_r_orientation(pooled_normal.file.file_name)
            metadata["barcodeIndex"] = spoof_barcode(sample["file_name"],
                                                     metadata["R"])
            metadata["flowCellId"] = "PN_FCID"
            metadata["tumorOrNormal"] = "Normal"
            metadata["patientId"] = "PN_PATIENT_ID"
            metadata["specimenType"] = specimen_type
            sample["metadata"] = metadata
            sample_files.append(sample)
        pooled_normal = build_sample(sample_files,
                                     ignore_sample_formatting=True)
        return pooled_normal
    return None
Example #15
    def get_jobs(self, pairing_override=None):
        logger.info("Operator JobGroupNotifer ID %s",
                    self.job_group_notifier_id)
        app = self.get_pipeline_id()
        pipeline = Pipeline.objects.get(id=app)
        pipeline_version = pipeline.version
        output_directory = pipeline.output_directory
        self.OUTPUT_DIR = output_directory

        recipe_query = self.build_recipe_query()
        assay_query = self.build_assay_query()
        igocomplete_query = Q(metadata__igocomplete=True)
        missing_fields_query = self.filter_out_missing_fields_query()
        q = recipe_query & assay_query & igocomplete_query & missing_fields_query
        files = FileRepository.all()
        tempo_files = FileRepository.filter(queryset=files, q=q, filter_redact=True)

        self.send_message("""
            Querying database for the following recipes:
                {recipes}

            Querying database for the following assays/bait sets:
                {assays}
            """.format(recipes="\t\n".join(self.get_recipes()),
                       assays="\t\n".join(self.get_assays())))

        exclude_query = self.get_exclusions()
        if exclude_query:
            tempo_files = tempo_files.exclude(exclude_query)
        # replace with run operator logic, most recent pairing
        pre_pairing = self.load_pairing_file(
            PAIRING_FILE_LOCATION)  # pairing.tsv is not in repo
        if pairing_override:
            normal_samples = pairing_override["normal_samples"]
            tumor_samples = pairing_override["tumor_samples"]
            num_ns = len(normal_samples)
            num_ts = len(tumor_samples)
            if num_ns != num_ts:
                print("Number of tumors and normals not the same; can't pair")
            else:
                for tumor_id, normal_id in zip(tumor_samples, normal_samples):
                    pre_pairing[tumor_id] = normal_id
        patient_ids = set()
        patient_files = dict()
        no_patient_samples = list()
        for entry in tempo_files:
            patient_id = entry.metadata["patientId"]
            if patient_id:
                patient_ids.add(patient_id)
                if patient_id not in patient_files:
                    patient_files[patient_id] = list()
                patient_files[patient_id].append(entry)
            else:
                no_patient_samples.append(entry)

        self.patients = dict()
        self.non_cmo_patients = dict()
        for patient_id in patient_files:
            if "C-" in patient_id[:2]:
                self.patients[patient_id] = patient_obj.Patient(
                    patient_id, patient_files[patient_id], pre_pairing)
            else:
                self.non_cmo_patients[patient_id] = patient_obj.Patient(
                    patient_id, patient_files[patient_id])

        input_json = dict()
        # output these strings to file
        input_json["conflict_data"] = self.create_conflict_samples_txt_file()
        input_json["unpaired_data"] = self.create_unpaired_txt_file()
        input_json["mapping_data"] = self.create_mapping_file()
        input_json["pairing_data"] = self.create_pairing_file()
        input_json["tracker_data"] = self.create_tracker_file()

        pickle_file = os.path.join(self.OUTPUT_DIR, "patients_data_pickle")
        fh = open(pickle_file, "wb")
        pickle.dump(self.patients, fh)
        os.chmod(pickle_file, 0o777)
        self.register_tmp_file(pickle_file)

        input_json["pickle_data"] = {
            "class": "File",
            "location": "juno://" + pickle_file
        }

        beagle_version = __version__
        run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")

        tags = {"beagle_version": beagle_version, "run_date": run_date}

        self.send_message("""
            Writing files to {file_path}.

            Run Date: {run_date}
            Beagle Version: {beagle_version}
            """.format(file_path=self.OUTPUT_DIR,
                       run_date=run_date,
                       beagle_version=beagle_version))

        return []
Example #16
def get_file(fpath):
    files = FileRepository.all()
    data = FileRepository.filter(queryset=files, path=fpath)
    if data:
        return data[0]
    return None
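
A final usage sketch with a hypothetical path; get_file returns the first matching file record or None, and the .file/.metadata attributes match those used in Example #11.

f = get_file("/hypothetical/path/sample_R1.fastq.gz")
if f is not None:
    print(f.file.file_name, f.metadata)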