Пример #1
0
def _resolve_output_target(outpath, json_format):
    """
    Work out the export file format and the extension-less output path.

    :param outpath: User-supplied output path; may be empty/None and may carry a file extension.
    :param json_format: When truthy, forces the 'json' format regardless of any extension.
    :return: Tuple ``(file_format, base_path)`` where file_format is 'json' or 'csv'.
    """
    import os  # local import: the file's top-level import block is not visible from this block

    if outpath:
        # splitext only inspects the final path component, so directory parts
        # such as './' or 'out.d/' survive intact (a plain split('.') would
        # truncate './results.csv' to an empty string).
        base_path, extension = os.path.splitext(outpath)
        extension = extension[1:]   # drop the leading dot
    else:
        base_path, extension = './results', ''

    if json_format:
        file_format = 'json'
    elif extension in ('json', 'csv'):
        file_format = extension
    else:
        # unknown or missing extension falls back to csv
        file_format = 'csv'

    return file_format, base_path


def match(args):
    """
    Matches all trials in database to patients

    :param args: Parsed command-line namespace. Attributes read here:
        - mongo_uri: passed to get_db and to export_results
        - daemon: Boolean flag; when true, the matchengine is re-run once per 24 hours
          and no results are exported
        - json_format: when truthy, results are exported as json
        - outpath: optional output path; a 'json' or 'csv' extension selects the format
          (unless json_format is set) and any extension is stripped before exporting
    """

    db = get_db(args.mongo_uri)

    while True:
        me = MatchEngine(db)
        me.find_trial_matches()

        # exit if it is not set to run as a nightly automated daemon, otherwise sleep for a day
        if args.daemon:
            time.sleep(86400)   # sleep for 24 hours
            continue

        # one-shot run: export results and stop
        file_format, outpath = _resolve_output_target(args.outpath, args.json_format)
        export_results(args.mongo_uri, file_format, outpath)
        break
Пример #2
0
    def setUp(self):
        """
        Reset the test database and build in-memory fixtures for ten patients.

        Descriptions of test patients

        1: >18, Adrenal Gland, Female, BRAF F346R Mutation
        2: >18, Melanoma, Female, EGFR L858R Mutation
        3: >18, Melanoma, Female, EGFR F346A Mutation
        4: >18, Melanoma, Female, EGFR F346B Mutation
        5: >18, Melanoma, Female, EGFR F000F Mutation
        6: >0.5 && <18, Melanoma, Male, EGFR SV
        7: >0.5 && <18, Glioblastoma, Male, EGFR CNV Hetero del
        8: >0.5 && <18, Glioblastoma, Male, EGFR CNV Gain
        9: >0.5 && <18, Glioblastoma, Male, EGFR CNV Homo del
        10: <0.5, Glioblastoma, Male, EGFR CNV High amp

        Descriptions of test trials
        00-001.yml: dose: EGFR L858R && >=18/_SOLID_
        00-002.yml: arm: EGFR L858R && >=18/_SOLID_
        00-003.yml: step: EGFR L858R && >=18/_SOLID_
        00-004.yml dose: EGFR L858R && >=18/_SOLID_
        00-005.yml 2 doses: EGFR L858R && >=18/_SOLID_
        00-006.yml exon: !13
        """

        self.db = get_db(None)

        # Start every test from empty collections. Each name needs its own
        # trailing comma: adjacent string literals are silently concatenated,
        # which previously turned "oplog" and "response" into "oplogresponse"
        # and left both real collections undropped.
        collections = (
            "clinical", "dashboard", "filter", "genomic", "hipaa", "match", "normalize", "oplog",
            "response", "statistics", "status", "team", "trial", "trial_match", "user",
        )
        for res in collections:
            self.db.drop_collection(res)

        self.me = MatchEngine(self.db)

        self.trials = {}

        # The first patient has fixed identifiers; the remaining nine are generated.
        self.clinical_id = ObjectId()
        self.mrn = 'TCGA-BH-A1FR'
        self.sample_id = 'TCGA-OR-A5J1'
        self.mrns = [self.mrn] + [self.__random_id() for _ in range(9)]
        self.sample_ids = [self.sample_id] + [self.__random_id() for _ in range(9)]
        self.clinical_ids = [self.clinical_id] + [ObjectId() for _ in range(9)]
        self.static_date = dt.datetime.today()

        # clinical collection
        self.oncotree_diagnoses = ['Adrenal Gland'] + ['Melanoma'] * 5 + ['Glioblastoma'] * 4
        self.genders = ['Female'] * 5 + ['Male'] * 5

        # ages (birth dates relative to static_date)
        adult = self.static_date - dt.timedelta(days=365*19)
        child = self.static_date - dt.timedelta(days=365*10)
        infant = self.static_date - dt.timedelta(days=30*4)
        self.ages = [adult] * 5 + [child] * 4 + [infant]

        # one clinical document per patient, built by zipping the parallel lists above
        self.clinical = [{
            '_id': clinical_id,
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': diagnosis,
            'SAMPLE_ID': sample_id,
            'VITAL_STATUS': 'alive',
            'MRN': mrn,
            'REPORT_DATE': self.static_date,
            'BIRTH_DATE': age,
            'GENDER': gender
        } for diagnosis, gender, age, clinical_id, sample_id, mrn in zip(
            self.oncotree_diagnoses, self.genders, self.ages, self.clinical_ids, self.sample_ids, self.mrns)]

        # genomic collection
        self.genes = ['BRAF'] + ['EGFR'] * 9
        self.protein_changes = ['p.F346R', 'p.L858R', 'p.F346A', 'p.F346B', 'p.F000F', None, None, None, None, None]
        self.variant_categories = ['MUTATION'] * 5 + ['SV', 'CNV', 'CNV', 'CNV', 'CNV']
        self.wildtypes = [False] * 10
        self.cnv_calls = [None, None, None, None, None, None,
                          'Heterozygous deletion', 'Gain', 'Homozygous deletion', 'High level amplification']

        # one genomic document per patient, linked to its clinical document via CLINICAL_ID / SAMPLE_ID
        self.genomic = [{
            'TRUE_VARIANT_CLASSIFICATION': 'In_Frame_Del',
            'TRUE_PROTEIN_CHANGE': protein_change,
            'VARIANT_CATEGORY': variant_category,
            'CHROMOSOME': 'chr3',
            'POSITION': 178952085,
            'TRUE_STRAND': '+',
            'WILDTYPE': wildtype,
            'CLINICAL_ID': _id,
            'CNV_CALL': cnv_call,
            'TRUE_HUGO_SYMBOL': gene,
            'SAMPLE_ID': sample_id,
            'TRUE_TRANSCRIPT_EXON': 19
        } for protein_change, variant_category, wildtype, cnv_call, gene, _id, sample_id in zip(
            self.protein_changes, self.variant_categories, self.wildtypes,
            self.cnv_calls, self.genes, self.clinical_ids, self.sample_ids
        )]

        # test trials
        self.test_trials = ['00-001', '00-002', '00-003']

        # demo match results
        pnos = ['00-001', '00-001', '00-001', '00-002', '00-002', '00-002']
        mlevels = ['arm', 'arm', 'arm', 'dose', 'dose', 'dose']
        iids = ['1', '2', '3', '4', '5', '6']
        galts = ['Alt1', 'Alt2', 'Alt2', 'Alt3', 'Alt3', 'Alt3']
        self.matches = [{
            'mrn': 'SAMPLE1',
            'sample_id': 'SAMPLE1-ID',
            'protocol_no': protocol_no,
            'match_level': match_level,
            'internal_id': internal_id,
            'genomic_alteration': genomic_alteration
        } for protocol_no, match_level, internal_id, genomic_alteration in zip(
            pnos, mlevels, iids, galts
        )]
Пример #3
0
def load(args):
    """
    Sets up MongoDB for matching

    :param args: Parsed command-line namespace. Attributes read here:

        mongo_uri: passed to get_db.

        trials: Path to the trial data; when set it is handed to the Trial loader
            selected by ``trial_format``.

        patient_format: selects the Patient loader used for the clinical/genomic files.

        clinical: Path to file containing clinical data. Required fields are:
            - MRN (Unique patient identifier)
            - SAMPLE_ID (Unique sample identifier)
            - ONCOTREE_PRIMARY_DIAGNOSIS_NAME (Disease diagnosis)
            - BIRTH_DATE (Date of birth, expected in format 'YYYY-MM-DD')

            Suggested additional fields:
            - ORD_PHYSICIAN_NAME
            - ORD_PHYSICIAN_EMAIL
            - REPORT_DATE (same format as BIRTH_DATE)
            - VITAL_STATUS (alive or deceased)
            - FIRST_LAST (Patient's first and last name)
            - GENDER (Male or Female)

        genomic: Path to file containing genomic data. The following fields are used in matching:
            - SAMPLE_ID (Unique sample identifier)
            - TRUE_HUGO_SYMBOL (Gene name)
            - TRUE_PROTEIN_CHANGE (Specific variant)
            - TRUE_VARIANT_CLASSIFICATION (Variant type)
            - VARIANT_CATEGORY (CNV, MUTATION, or SV)
            - TRUE_TRANSCRIPT_EXON (Exon number <integer>)
            - CNV_CALL (Heterozygous deletion, Homozygous deletion, Gain, High Level amplification, or null)
            - WILDTYPE (True or False)

            Suggested additional fields:
            - CHROMOSOME (Chromosome number in format 'chr01')
            - POSITION <integer>
            - TRUE_CDNA_CHANGE
            - REFERENCE_ALLELE
            - CANONICAL_STRAND (- or +)
            - ALLELE_FRACTION <float>
            - TIER <integer>

    Exits the process with status 1 when only one of ``clinical`` / ``genomic`` is given.
    """

    db = get_db(args.mongo_uri)
    t = Trial(db)
    p = Patient(db)

    # Add trials to mongo
    if args.trials:
        logging.info('Adding trials to mongo...')
        t.load_dict[args.trial_format](args.trials)

    # Add patient data to mongo
    if args.clinical and args.genomic:
        logging.info('Reading data into pandas...')
        is_bson = p.load_dict[args.patient_format](args.clinical, args.genomic)

        # When the loader reports bson input the pandas -> mongo conversion below is skipped.
        if not is_bson:

            # reformatting: normalise 'YYYY-MM-DD' to 'YYYY-MM-DD HH:MM:SS' so the
            # values can be parsed back into datetimes after the json round trip
            for col in ['BIRTH_DATE', 'REPORT_DATE']:
                # REPORT_DATE is only a suggested field; skip it when absent instead of
                # raising KeyError. BIRTH_DATE is required and still fails loudly.
                if col == 'REPORT_DATE' and col not in p.clinical_df.columns:
                    continue

                try:
                    p.clinical_df[col] = p.clinical_df[col].apply(lambda x: str(dt.datetime.strptime(x, '%Y-%m-%d')))
                except ValueError as exc:
                    # best effort: the column is left untouched; only birth dates are reported
                    if col == 'BIRTH_DATE':
                        print('## WARNING ## Birth dates should be formatted %Y-%m-%d to be properly stored in MongoDB.')
                        print('##         ## Birth dates may be malformed in the database and will therefore not match')
                        print('##         ## trial age restrictions properly.')
                        print('##         ## System error: \n%s' % exc)

            # exon numbers become ints; empty strings and nulls are passed through unchanged
            p.genomic_df['TRUE_TRANSCRIPT_EXON'] = p.genomic_df['TRUE_TRANSCRIPT_EXON'].apply(
                lambda x: int(x) if x != '' and pd.notnull(x) else x)

            # Add clinical data to mongo
            logging.info('Adding clinical data to mongo...')
            # list(...) so a real list is handed to insert() on interpreters where values() is a view
            clinical_json = list(json.loads(p.clinical_df.T.to_json()).values())
            for item in clinical_json:
                for col in ['BIRTH_DATE', 'REPORT_DATE']:
                    if col in item:
                        item[col] = dt.datetime.strptime(str(item[col]), '%Y-%m-%d %X')

            db.clinical.insert(clinical_json)

            # Get clinical ids from mongo
            logging.info('Adding clinical ids to genomic data...')
            clinical_doc = list(db.clinical.find({}, {"_id": 1, "SAMPLE_ID": 1}))
            clinical_dict = {doc['SAMPLE_ID']: doc['_id'] for doc in clinical_doc}

            # pd -> json
            # The genomic frame is produced by the *patient* loader, so the pickle
            # special case keys off patient_format (the original tested trial_format).
            if args.patient_format == 'pkl':
                genomic_json = json.loads(p.genomic_df.to_json(orient='records'))
            else:
                genomic_json = list(json.loads(p.genomic_df.T.to_json()).values())

            # Map clinical ids to genomic data; samples without a clinical record get None
            for item in genomic_json:
                item["CLINICAL_ID"] = clinical_dict.get(item['SAMPLE_ID'])

            # Add genomic data to mongo
            logging.info('Adding genomic data to mongo...')
            db.genomic.insert(genomic_json)

        # Create index
        logging.info('Creating index...')
        db.genomic.create_index([("TRUE_HUGO_SYMBOL", ASCENDING), ("WILDTYPE", ASCENDING)])

    elif args.clinical or args.genomic:
        # exactly one of the two patient inputs was supplied
        logging.error('If loading patient information, please provide both clinical and genomic data.')
        sys.exit(1)