def test_create_and_find(self):
    """
    Test create genomic file

    Persists two genomic files linked to the first sequencing experiment,
    then reloads each one by kf_id and checks that every input property is
    readable from the reloaded object after ``merge_indexd()`` has run
    against a mocked indexd response.
    """
    # Dependent entities must exist before genomic files can be created
    self._create_save_dependents()
    self.assertEqual(Participant.query.count(), 1)
    self.assertEqual(Biospecimen.query.count(), 2)
    seq_exp = SequencingExperiment.query.all()[0]

    # Expected properties for each genomic file, keyed on kf_id
    expected_by_id = {}
    for idx in range(2):
        props = {
            'external_id': 'genomic_file_{}'.format(idx),
            'file_name': 'file_{}'.format(idx),
            'data_type': 'submitted aligned reads',
            'file_format': '.cram',
            'urls': ['s3://file_{}'.format(idx)],
            'hashes': {
                'md5': str(uuid.uuid4())
            },
            'controlled_access': True,
            'is_harmonized': True,
            'reference_genome': 'Test01',
            'paired_end': 1,
            'availability': 'Immediate Download',
            'sequencing_experiment_id': seq_exp.kf_id
        }
        created = GenomicFile(**props)
        db.session.add(created)
        # Flush so that kf_id is available before the final commit
        db.session.flush()
        props['kf_id'] = created.kf_id
        expected_by_id[props['kf_id']] = props
    db.session.commit()

    # Compare every input value with the persisted value of each file
    self.indexd.Session().get.side_effect = None
    for kf_id, props in expected_by_id.items():
        # The mocked indexd GET answers with this file's own fields
        indexd_doc = {
            field: props[field]
            for field in ('file_name', 'urls', 'hashes')
        }
        self.indexd.Session().get.return_value = MockResp(resp=indexd_doc)
        found = GenomicFile.query.get(kf_id)
        found.merge_indexd()
        for field, expected in props.items():
            self.assertEqual(getattr(found, field), expected)
def _create_genomic_files(self, total):
    """
    Creates genomic files with sequencing experiments

    Builds ``total`` GenomicFile objects with randomised size, data type,
    file format and access flag, each pointing at a randomly chosen
    existing sequencing experiment. The objects are returned without being
    added to the session by this method.

    :param total: number of genomic files to build
    :returns: list of GenomicFile objects
    """
    max_size_mb = 5000
    min_size_mb = 1000

    # The set of sequencing experiments does not change while the files
    # are being built, so query it once instead of once per file.
    sequencing_experiments = SequencingExperiment.query.all()

    gf_list = []
    for i in range(total):
        kwargs = {
            'file_name': 'file_{}'.format(i),
            'size': (random.randint(min_size_mb, max_size_mb) *
                     MB_TO_BYTES),
            'data_type': random.choice(self.data_type_list),
            'file_format': random.choice(self.file_format_list),
            'urls': ['s3://file_{}'.format(i)],
            'controlled_access': random.choice(
                self.controlled_access_list),
            'hashes': {
                'md5': str(uuid.uuid4()).replace('-', '')
            }
        }
        se = random.choice(sequencing_experiments)
        gf_list.append(
            GenomicFile(**kwargs, sequencing_experiment_id=se.kf_id))
    return gf_list
def test_post(self):
    """
    Test create a new read_group_genomic_file

    Posts a link between a freshly saved read group and genomic file and
    expects a 201 response carrying a kf_id, plus exactly one
    ReadGroupGenomicFile row afterwards.
    """
    # Persist the two entities the new link will reference
    genomic_file = GenomicFile(external_id='gf0')
    read_group = ReadGroup(external_id='rg0')
    db.session.add_all([genomic_file, read_group])
    db.session.commit()

    payload = json.dumps({
        'read_group_id': read_group.kf_id,
        'genomic_file_id': genomic_file.kf_id,
        'external_id': 'rg0-gf0'
    })

    # Send post request
    http_response = self.client.post(url_for(RG_GF_LIST_URL),
                                     data=payload,
                                     headers=self._api_headers())

    # Check response status code
    self.assertEqual(http_response.status_code, 201)

    # Check response content
    body = json.loads(http_response.data.decode('utf-8'))
    assert body['results']['kf_id']
    self.assertEqual(1, ReadGroupGenomicFile.query.count())
def _create_save_to_db(self):
    """
    Make all entities

    Creates two genomic files and two read groups and links every genomic
    file to every read group through ReadGroupGenomicFile, then commits.

    :returns: tuple of (read groups, genomic files)
    """
    rgs, gfs = [], []
    for idx in range(2):
        gfs.append(GenomicFile(external_id='gf{}'.format(idx)))
        rgs.append(ReadGroup(external_id='rg{}'.format(idx)))

    # (genomic file index, read group index, external id of the link)
    links = (
        (0, 0, 'rg0-gf0'),
        (0, 1, 'rg1-gf0'),
        (1, 0, 'rg0-gf1'),
        (1, 1, 'rg1-gf1'),
    )
    for gf_idx, rg_idx, link_id in links:
        link = ReadGroupGenomicFile(genomic_file=gfs[gf_idx],
                                    read_group=rgs[rg_idx],
                                    external_id=link_id)
        db.session.add(link)

    db.session.commit()
    return rgs, gfs
def test_update(self, mock):
    """
    Test update task

    Re-points one existing task/genomic-file link at a newly created
    genomic file and checks that the task's genomic files reflect the
    change while the total row counts match the expected values.
    """
    # Route the mocked session's HTTP verbs to the fake indexd
    indexd = MockIndexd()
    for verb in ('post', 'get', 'put'):
        setattr(mock.Session(), verb, getattr(indexd, verb))

    # Create and save tasks and dependents (return values are not needed)
    _participants, _tasks = self._create_and_save_tasks()

    # Create new genomic_file on Fred's first biospecimen
    fred = Participant.query.filter_by(external_id='Fred').one()
    new_file = GenomicFile(data_type='slide_image',
                           file_name='slide_image1')
    fred.biospecimens[0].genomic_files.append(new_file)
    db.session.commit()

    # Unlink task from a genomic file and link to a new one
    link = TaskGenomicFile.query.first()
    task_id = link.task_id
    old_file_id = link.genomic_file_id
    link.genomic_file_id = new_file.kf_id
    db.session.commit()

    # Check database
    task = Task.query.get(task_id)
    old_file = GenomicFile.query.get(old_file_id)
    self.assertNotIn(old_file, task.genomic_files)
    self.assertIn(new_file, task.genomic_files)
    self.assertEqual(9, GenomicFile.query.count())
    self.assertEqual(16, TaskGenomicFile.query.count())
def test_post(self):
    """
    Test create a new sequencing_experiment_genomic_file

    Posts a link between a freshly saved sequencing experiment and genomic
    file and expects a 201 response carrying a kf_id, plus exactly one
    SequencingExperimentGenomicFile row afterwards.
    """
    # Persist the entities the new link will reference
    genomic_file = GenomicFile(external_id='gf0')
    center = SequencingCenter(name='sc')
    seq_exp = SequencingExperiment(external_id='se0',
                                   experiment_strategy='WGS',
                                   is_paired_end=True,
                                   platform='platform',
                                   sequencing_center=center)
    db.session.add_all([genomic_file, seq_exp])
    db.session.commit()

    payload = json.dumps({
        'sequencing_experiment_id': seq_exp.kf_id,
        'genomic_file_id': genomic_file.kf_id,
        'external_id': 'se0-gf0'
    })

    # Send post request
    http_response = self.client.post(url_for(SE_GF_LIST_URL),
                                     data=payload,
                                     headers=self._api_headers())

    # Check response status code
    self.assertEqual(http_response.status_code, 201)

    # Check response content
    body = json.loads(http_response.data.decode('utf-8'))
    assert body['results']['kf_id']
    self.assertEqual(1, SequencingExperimentGenomicFile.query.count())
def _create_save_genomic_files(self):
    """
    Create and save genomic files to database

    Attaches two genomic files to the first biospecimen returned by the
    query and commits them.

    :returns: dict mapping each genomic file's kf_id to the properties it
        was created with (including the kf_id itself)
    """
    # Genomic files need their dependent entities to exist first
    self._create_save_dependents()
    first_biospecimen = Biospecimen.query.all()[0]

    created_props = {}
    for idx in range(2):
        props = {
            'external_id': 'genomic_file_{}'.format(idx),
            'file_name': 'file_{}'.format(idx),
            'size': (random.randint(MIN_SIZE_MB, MAX_SIZE_MB) *
                     MB_TO_BYTES),
            'data_type': 'submitted aligned reads',
            'file_format': '.cram',
            'urls': ['s3://file_{}'.format(idx)],
            'controlled_access': True,
            'is_harmonized': True,
            'paired_end': 1,
            'reference_genome': 'Test01',
            'hashes': {'md5': str(uuid.uuid4())},
            'availability': 'Immediate Download'
        }
        genomic_file = GenomicFile(**props)
        # Add genomic file to the biospecimen's list and to the session
        first_biospecimen.genomic_files.append(genomic_file)
        db.session.add(genomic_file)
        # Flush so that kf_id can be read before the final commit
        db.session.flush()
        file_id = genomic_file.kf_id
        props['kf_id'] = file_id
        created_props[file_id] = props

    db.session.commit()
    return created_props
def test_update(self, mock):
    """
    Test update cavatica_task

    Re-points one existing cavatica-task/genomic-file link at a newly
    created genomic file and checks that the task's genomic files reflect
    the change while the total row counts match the expected values.
    """
    # Route the mocked session's HTTP verbs to the fake indexd
    indexd = MockIndexd()
    for verb in ('post', 'get', 'put'):
        setattr(mock.Session(), verb, getattr(indexd, verb))

    # Create and save cavatica_tasks and dependents (results not needed)
    _participants, _tasks = self._create_and_save_cavatica_tasks()
    seq_exp = SequencingExperiment.query.all()[0]

    # Create new genomic_file on Fred's first biospecimen
    fred = Participant.query.filter_by(external_id='Fred').one()
    new_file = GenomicFile(data_type='slide_image',
                           file_name='slide_image1',
                           sequencing_experiment_id=seq_exp.kf_id)
    fred.biospecimens[0].genomic_files.append(new_file)
    db.session.commit()

    # Unlink cavatica_task from a genomic file and link to a new one
    link = CavaticaTaskGenomicFile.query.first()
    task_id = link.cavatica_task_id
    old_file_id = link.genomic_file_id
    link.genomic_file_id = new_file.kf_id
    db.session.commit()

    # Check database
    task = CavaticaTask.query.get(task_id)
    old_file = GenomicFile.query.get(old_file_id)
    self.assertNotIn(old_file, task.genomic_files)
    self.assertIn(new_file, task.genomic_files)
    self.assertEqual(9, GenomicFile.query.count())
    self.assertEqual(16, CavaticaTaskGenomicFile.query.count())
def _create_save_to_db(self):
    """
    Make all entities

    Creates two genomic files and two sequencing experiments (sharing the
    "Baylor" sequencing center) and links every genomic file to every
    sequencing experiment through SequencingExperimentGenomicFile.

    :returns: tuple of (sequencing experiments, genomic files)
    """
    # Reuse the "Baylor" sequencing center when present, else create it
    center = SequencingCenter.query.filter_by(name="Baylor").one_or_none()
    if center is None:
        center = SequencingCenter(name="Baylor")
        db.session.add(center)
        db.session.commit()

    # Properties shared by both sequencing experiments
    se_props = {
        'experiment_strategy': 'WXS',
        'library_name': 'library',
        'library_strand': 'Unstranded',
        'is_paired_end': False,
        'platform': 'platform',
        'instrument_model': '454 GS FLX Titanium',
        'max_insert_size': 600,
        'mean_insert_size': 500,
        'mean_depth': 40,
        'total_reads': 800,
        'mean_read_length': 200
    }

    ses, gfs = [], []
    for idx in range(2):
        gfs.append(GenomicFile(external_id='gf{}'.format(idx)))
        ses.append(SequencingExperiment(**se_props,
                                        sequencing_center=center,
                                        external_id='se{}'.format(idx)))

    # (genomic file index, sequencing experiment index, link external id)
    links = (
        (0, 0, 'se0-gf0'),
        (0, 1, 'se1-gf0'),
        (1, 0, 'se0-gf1'),
        (1, 1, 'se1-gf1'),
    )
    for gf_idx, se_idx, link_id in links:
        db.session.add(SequencingExperimentGenomicFile(
            genomic_file=gfs[gf_idx],
            sequencing_experiment=ses[se_idx],
            external_id=link_id))

    db.session.commit()
    return ses, gfs
def _create_all_entities(self):
    """
    Create 2 studies with genomic files and sequencing experiments

    Each study gets one participant with three biospecimens, and each
    biospecimen gets one genomic file. Two sequencing experiments are
    created per study: the first holds the first two genomic files, the
    second holds the first and the last genomic file.

    :returns: tuple ``(ses, gfs, studies)`` where ``ses`` and ``gfs`` are
        dicts keyed by ``'study<j>'`` holding that study's sequencing
        experiments / genomic files, and ``studies`` is the list of
        Study objects
    """
    sc = SequencingCenter(name='sc')
    studies = []
    ses = {}
    gfs = {}
    for j in range(2):
        s = Study(external_id='s{}'.format(j))
        p = Participant(external_id='p{}'.format(j))
        s.participants.append(p)

        study_gfs = gfs.setdefault('study{}'.format(j), [])
        for i in range(3):
            b = Biospecimen(external_sample_id='b{}'.format(i),
                            analyte_type='DNA',
                            sequencing_center=sc,
                            participant=p)
            gf = GenomicFile(
                external_id='study{}-gf{}'.format(j, i),
                urls=['s3://mybucket/key'],
                hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
            study_gfs.append(gf)
            b.genomic_files.append(gf)

        study_ses = ses.setdefault('study{}'.format(j), [])
        # Take the current time directly in UTC. Stamping a naive local
        # time with a UTC tzinfo would record the wrong instant on any
        # machine whose local timezone is not UTC.
        dt = datetime.now(tz.tzutc())
        kwargs = {
            'experiment_date': str(dt),
            'experiment_strategy': 'WXS',
            'library_name': 'Test_library_name_1',
            'library_strand': 'Unstranded',
            'is_paired_end': False,
            'platform': 'Illumina',
            'instrument_model': '454 GS FLX Titanium',
            'max_insert_size': 600,
            'mean_insert_size': 500,
            'mean_depth': 40,
            'total_reads': 800,
            'mean_read_length': 200
        }
        se0 = SequencingExperiment(**kwargs,
                                   sequencing_center=sc,
                                   external_id='study{}-se0'.format(j))
        se0.genomic_files.extend(study_gfs[0:2])
        se1 = SequencingExperiment(**kwargs,
                                   sequencing_center=sc,
                                   external_id='study{}-se1'.format(j))
        # The first genomic file is shared by both experiments
        se1.genomic_files.extend([study_gfs[0], study_gfs[-1]])

        study_ses.extend([se0, se1])
        studies.append(s)

    db.session.add_all(studies)
    db.session.commit()

    return ses, gfs, studies
def _create_all_entities():
    """
    Create 2 studies with genomic files, read groups and sequencing
    experiments

    Each study gets one participant with three biospecimens, one genomic
    file per biospecimen, two read groups and two sequencing experiments.
    In both pairs the first member holds the first two genomic files and
    the second member holds the first and the last genomic file.

    :returns: tuple ``(ses, rgs, gfs, studies)``; the first three are
        dicts keyed by ``'study<j>'``, the last is the list of studies
    """
    center = SequencingCenter(name='sc')
    studies = []
    ses, rgs, gfs = {}, {}, {}

    for j in range(2):
        study_key = 'study{}'.format(j)
        study = Study(external_id='s{}'.format(j))
        participant = Participant(external_id='p{}'.format(j))
        study.participants.append(participant)

        # One biospecimen and one genomic file per iteration
        study_gfs = gfs.setdefault(study_key, [])
        for i in range(3):
            biospecimen = Biospecimen(
                external_sample_id='b{}'.format(i),
                analyte_type='DNA',
                sequencing_center=center,
                participant=participant)
            genomic_file = GenomicFile(
                external_id='study{}-gf{}'.format(j, i),
                urls=['s3://mybucket/key',
                      'https://gen3.something.com/did'],
                hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
            study_gfs.append(genomic_file)
            biospecimen.genomic_files.append(genomic_file)

        # The two file selections shared by read groups and experiments
        first_two = study_gfs[0:2]
        first_and_last = [study_gfs[0], study_gfs[-1]]

        study_rgs = rgs.setdefault(study_key, [])
        rg0 = ReadGroup(external_id='study{}-rg0'.format(j))
        rg0.genomic_files.extend(first_two)
        rg1 = ReadGroup(external_id='study{}-rg1'.format(j))
        rg1.genomic_files.extend(first_and_last)

        study_ses = ses.setdefault(study_key, [])
        se_props = {
            'experiment_strategy': 'WGS',
            'is_paired_end': True,
            'platform': 'platform',
            'sequencing_center': center
        }
        se0 = SequencingExperiment(external_id='study{}-se0'.format(j),
                                   **se_props)
        se0.genomic_files.extend(first_two)
        se1 = SequencingExperiment(external_id='study{}-se1'.format(j),
                                   **se_props)
        se1.genomic_files.extend(first_and_last)

        study_rgs.extend([rg0, rg1])
        study_ses.extend([se0, se1])
        studies.append(study)

    db.session.add_all(studies)
    db.session.commit()

    return ses, rgs, gfs, studies
def _create_entities(self):
    """
    Create participant with required entities

    Builds one study with one participant and one biospecimen, one
    sequencing experiment and four genomic files (attached to both the
    biospecimen and the sequencing experiment), plus one cavatica task.
    Only the cavatica task and the study are added to the session
    explicitly before the commit.
    """
    # Sequencing center: reuse "Baylor" when present, else create it
    sc = SequencingCenter.query.filter_by(name="Baylor").one_or_none()
    if sc is None:
        sc = SequencingCenter(name="Baylor")
        db.session.add(sc)
        db.session.commit()

    # Create study
    study = Study(external_id='phs001')

    # Participants
    p = Participant(external_id='p0', is_proband=True, study=study)

    # Biospecimen
    bs = Biospecimen(analyte_type='dna',
                     sequencing_center=sc,
                     participant=p)

    # SequencingExperiment
    se = SequencingExperiment(external_id='se',
                              experiment_strategy='wgs',
                              is_paired_end=True,
                              platform='platform',
                              sequencing_center=sc)

    # Genomic Files
    for i in range(4):
        gf = GenomicFile(
            file_name='gf_{}'.format(i),
            data_type='submitted aligned read',
            file_format='.cram',
            urls=['s3://file_{}'.format(i)],
            hashes={'md5': str(uuid.uuid4())},
            # Odd-numbered files are harmonized, even-numbered are not
            is_harmonized=bool(i % 2))
        bs.genomic_files.append(gf)
        se.genomic_files.append(gf)

    ct = self._create_cavatica_task('ct1')
    db.session.add(ct)
    db.session.add(study)
    db.session.commit()
def genomic_files(client, entities):
    """
    Add EXPECTED_TOTAL - ENTITY_TOTAL identical genomic files.

    The new rows are committed and the session is then emptied with
    ``expunge_all()``. ``client`` and ``entities`` are not used in the
    body; presumably they are fixtures that must run first -- confirm
    against the test module.
    """
    shared_props = {
        'external_id': 'genomic_file_0',
        'file_name': 'hg38.bam',
        'data_type': 'Aligned Reads',
        'file_format': 'bam'
    }
    missing = EXPECTED_TOTAL - ENTITY_TOTAL

    new_files = []
    for _ in range(missing):
        new_files.append(GenomicFile(**shared_props))

    db.session.add_all(new_files)
    db.session.commit()
    db.session.expunge_all()
def _create_genomic_file(self, _id, data_type='submitted aligned read',
                         sequencing_experiment_id=None, biospec_id=None):
    """
    Create genomic file

    :param _id: value interpolated into the file name and the s3 url
    :param data_type: data type stored on the genomic file
    :param sequencing_experiment_id: id stored on the genomic file
    :param biospec_id: accepted but not used by this method
    :returns: a new GenomicFile; this method does not add it to the
        session
    """
    return GenomicFile(
        file_name='file_{}'.format(_id),
        data_type=data_type,
        file_format='.cram',
        urls=['s3://file_{}'.format(_id)],
        hashes={'md5': str(uuid.uuid4())},
        sequencing_experiment_id=sequencing_experiment_id)
def _create_entities(self):
    """
    Create one participant with a biospecimen and four genomic files.

    The genomic files are split between two sequencing experiments:
    odd-numbered files go to ``se1`` and even-numbered files to ``se2``.
    Only the participant is added to the session explicitly before the
    commit.
    """
    # Create study and participant
    study = Study(external_id='phs001')
    participant = Participant(external_id='p1', is_proband=True,
                              study=study)

    # Create sequencing_center, reusing "Baylor" when it already exists
    center = SequencingCenter.query.filter_by(name="Baylor").one_or_none()
    if center is None:
        center = SequencingCenter(name="Baylor")
        db.session.add(center)
        db.session.commit()

    # Create sequencing experiments
    se1 = SequencingExperiment(**self._make_seq_exp('se1'),
                               sequencing_center_id=center.kf_id)
    se2 = SequencingExperiment(**self._make_seq_exp('se2'),
                               sequencing_center_id=center.kf_id)

    # Create biospecimen
    # NOTE(review): the participant has not been added or flushed yet, so
    # participant.kf_id may still be unset here -- confirm.
    biospecimen = Biospecimen(external_sample_id='bio1',
                              analyte_type='dna',
                              participant_id=participant.kf_id,
                              sequencing_center_id=center.kf_id)

    # Create genomic files
    files = []
    for idx in range(4):
        props = {
            'file_name': 'file_{}'.format(idx),
            'data_type': 'submitted aligned read',
            'file_format': '.cram',
            'urls': ['s3://file_{}'.format(idx)],
            'hashes': {'md5': str(uuid.uuid4())},
            'controlled_access': True,
            'is_harmonized': True,
            'reference_genome': 'Test01'
        }
        genomic_file = GenomicFile(**props,
                                   sequencing_experiment_id=se1.kf_id)
        # Odd indices go to se1, even indices to se2
        owner = se1 if idx % 2 else se2
        owner.genomic_files.append(genomic_file)
        files.append(genomic_file)

    biospecimen.genomic_files = files
    participant.biospecimens = [biospecimen]
    db.session.add(participant)
    db.session.commit()
def _create_all_entities(self):
    """
    Create 2 studies with genomic files and read groups

    Each study gets one participant with three biospecimens, one genomic
    file per biospecimen, and two read groups: the first holds the first
    two genomic files, the second holds the first and the last one.

    :returns: tuple ``(rgs, gfs, studies)``; the first two are dicts keyed
        by ``'study<j>'``, the last is the list of Study objects
    """
    center = SequencingCenter(name='sc')
    studies = []
    rgs, gfs = {}, {}

    for j in range(2):
        study_key = 'study{}'.format(j)
        study = Study(external_id='s{}'.format(j))
        participant = Participant(external_id='p{}'.format(j))
        study.participants.append(participant)

        # One biospecimen and one genomic file per iteration
        study_gfs = gfs.setdefault(study_key, [])
        for i in range(3):
            biospecimen = Biospecimen(
                external_sample_id='b{}'.format(i),
                analyte_type='DNA',
                sequencing_center=center,
                participant=participant)
            genomic_file = GenomicFile(
                external_id='study{}-gf{}'.format(j, i),
                urls=['s3://mybucket/key'],
                hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
            study_gfs.append(genomic_file)
            biospecimen.genomic_files.append(genomic_file)

        study_rgs = rgs.setdefault(study_key, [])
        first_group = ReadGroup(external_id='study{}-rg0'.format(j))
        first_group.genomic_files.extend(study_gfs[0:2])
        second_group = ReadGroup(external_id='study{}-rg1'.format(j))
        # The first genomic file belongs to both read groups
        second_group.genomic_files.extend([study_gfs[0], study_gfs[-1]])

        study_rgs.extend([first_group, second_group])
        studies.append(study)

    db.session.add_all(studies)
    db.session.commit()

    return rgs, gfs, studies
def participants(client):
    """
    Fill the database with more than 100 rows of many entity types.

    Creates 101 studies, 101 cavatica apps, 101 study files (all on
    'Study_0'), 102 investigators (each linked to 'Study_0' and
    'Study_1'), 101 families and 102 participants. Each participant gets
    one diagnosis, outcome, phenotype, biospecimen, sequencing experiment,
    genomic file, read group and cavatica task. A family relationship is
    then added for each pair yielded by ``iterate_pairwise`` and
    everything is committed.

    ``client`` is not used in the body; presumably it is a fixture that
    must run first -- confirm against the test module. Nothing is
    returned.
    """
    # Add a bunch of studies for pagination
    for i in range(101):
        s = Study(external_id='Study_{}'.format(i))
        db.session.add(s)

    # After this loop `ca` refers to the last app created; every cavatica
    # task built further down is attached to that one app
    for i in range(101):
        ca = CavaticaApp(name='app', revision=0)
        db.session.add(ca)

    # Add a bunch of study files
    s0 = Study.query.filter_by(external_id='Study_0').one()
    s1 = Study.query.filter_by(external_id='Study_1').one()
    for i in range(101):
        sf = StudyFile(file_name='blah', study_id=s0.kf_id)
        db.session.add(sf)

    # Add a bunch of investigators
    for _ in range(102):
        inv = Investigator(name='test')
        inv.studies.extend([s0, s1])
        db.session.add(inv)

    # Add a bunch of families
    families = []
    for i in range(101):
        families.append(Family(external_id='Family_{}'.format(i)))
    db.session.add_all(families)
    db.session.flush()

    participants = []
    f0 = Family.query.filter_by(external_id='Family_0').one()
    f1 = Family.query.filter_by(external_id='Family_1').one()
    seq_cen = None
    for i in range(102):
        # First 50 participants go to Family_0/Study_0, the rest to
        # Family_1/Study_1
        f = f0 if i < 50 else f1
        s = s0 if i < 50 else s1
        data = {
            'external_id': "test",
            'is_proband': True,
            'race': 'Asian',
            'ethnicity': 'Hispanic or Latino',
            'diagnosis_category': 'Cancer',
            'gender': 'Male'
        }
        p = Participant(**data, study_id=s.kf_id, family_id=f.kf_id)
        diag = Diagnosis()
        p.diagnoses = [diag]
        outcome = Outcome()
        p.outcomes = [outcome]
        phen = Phenotype()
        p.phenotypes = [phen]
        participants.append(p)
        db.session.add(p)
        db.session.flush()

        seq_data = {
            'external_id': 'Seq_0',
            'experiment_strategy': 'WXS',
            'library_name': 'Test_library_name_1',
            'library_strand': 'Unstranded',
            'is_paired_end': False,
            'platform': 'Test_platform_name_1'
        }
        gf_kwargs = {
            'external_id': 'gf_0',
            'file_name': 'hg38.fq',
            'data_type': 'Aligned Reads',
            'file_format': 'fastq',
            'size': 1000,
            'urls': ['s3://bucket/key'],
            'hashes': {
                'md5': str(uuid.uuid4())
            },
            'controlled_access': False
        }
        # Look up the "Baylor" sequencing center; create it only when the
        # lookup finds nothing
        seq_cen = SequencingCenter.query.filter_by(name="Baylor")\
            .one_or_none()
        if seq_cen is None:
            seq_cen = SequencingCenter(external_id='SC_0', name="Baylor")
            db.session.add(seq_cen)
            db.session.flush()
        seq_exp = SequencingExperiment(**seq_data,
                                       sequencing_center_id=seq_cen.kf_id)
        db.session.add(seq_exp)
        samp = Biospecimen(analyte_type='an analyte',
                           sequencing_center_id=seq_cen.kf_id,
                           participant=p)
        db.session.add(samp)
        p.biospecimens = [samp]

        # NOTE(review): seq_exp has been added but not flushed at this
        # point, so seq_exp.kf_id may still be unset -- confirm.
        gf = GenomicFile(**gf_kwargs,
                         sequencing_experiment_id=seq_exp.kf_id)
        db.session.add(gf)
        samp.genomic_files.append(gf)
        samp.diagnoses.append(diag)

        db.session.flush()

        # NOTE(review): rg and ct are never passed to db.session.add()
        # here; presumably they are saved through relationship cascades
        # from gf / ca -- confirm against the models.
        rg = ReadGroup(lane_number=4, flow_cell='FL0123')
        rg.genomic_files.append(gf)

        ct = CavaticaTask(name='task_{}'.format(i))
        ct.genomic_files.append(gf)
        ca.cavatica_tasks.append(ct)

    # Family relationships
    for participant1, participant2 in iterate_pairwise(participants):
        gender = participant1.gender
        rel = 'mother'
        # NOTE(review): participants are created with gender 'Male' while
        # this compares against lowercase 'male'; unless the model
        # normalises case, rel always stays 'mother' -- confirm intent.
        if gender == 'male':
            rel = 'father'
        r = FamilyRelationship(participant1=participant1,
                               participant2=participant2,
                               participant1_to_participant2_relation=rel)
        db.session.add(r)

    db.session.commit()
def _create_all_entities(self):
    """
    Create 2 studies with same content
    Content: 3 participants, 4 biospecimens, 4 diagnoses

    Per study: participant 0 has two biospecimens and three diagnoses,
    participant 1 has one biospecimen and one diagnosis, participant 2
    has one biospecimen and no diagnosis. After the studies are committed,
    BiospecimenDiagnosis links are created for the first study only
    (``studies[0]``) and committed separately. Nothing is returned.
    """
    # Create entities
    # Reuse the 'sc' sequencing center when one already exists
    sc = SequencingCenter.query.filter_by(name='sc').first()
    if not sc:
        sc = SequencingCenter(name='sc')
    studies = []

    # Two studies
    for j in range(2):
        s = Study(external_id='s{}'.format(j))
        p0 = Participant(external_id='study{}-p0'.format(j))
        p1 = Participant(external_id='study{}-p1'.format(j))
        p2 = Participant(external_id='study{}-p2'.format(j))

        # Participant 0
        # Has 2 Biospecimens
        # This first genomic file is appended to both biospecimens below
        gf = GenomicFile(
            external_id='study{}-b0-gf0'.format(j),
            urls=['s3://mybucket/key'],
            hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
        for i in range(2):
            b = Biospecimen(external_sample_id='study{}-p0-b{}'.format(
                j, i), analyte_type='DNA', sequencing_center=sc)
            b.genomic_files.append(gf)

            # Biospecimen b0 has 2 diagnoses
            if i == 0:
                for k in range(2):
                    d = Diagnosis(
                        external_id='study{}-p0-d{}'.format(j, k))
                    p0.diagnoses.append(d)
            # Biospecimen b1 has 1 diagnosis
            else:
                # `k` is still bound to its last value (1) from the inner
                # loop that ran when i == 0, so k + 1 is 2 here
                d = Diagnosis(
                    external_id='study{}-p0-d{}'.format(j, k + 1))
                gf = GenomicFile(
                    external_id='study{}-b0-gf{}'.format(j, k + 1),
                    urls=['s3://mybucket/key'],
                    hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
                b.genomic_files.append(gf)
                p0.diagnoses.append(d)
            p0.biospecimens.append(b)

        # Participant 1
        # Has 1 biospecimen, 1 diagnosis
        b = Biospecimen(external_sample_id='study{}-p1-b0'.format(j),
                        analyte_type='DNA',
                        sequencing_center=sc)
        d = Diagnosis(external_id='study{}-p1-d0'.format(j))
        gf = GenomicFile(
            external_id='study{}-b1-gf0'.format(j),
            urls=['s3://mybucket/key'],
            hashes={'md5': 'd418219b883fce3a085b1b7f38b01e37'})
        b.genomic_files.append(gf)
        p1.biospecimens.append(b)
        p1.diagnoses.append(d)

        # Participant 2
        # Has 1 biospecimen
        b = Biospecimen(external_sample_id='study{}-p2-b0'.format(j),
                        analyte_type='DNA',
                        sequencing_center=sc)
        p2.biospecimens.append(b)

        s.participants.extend([p0, p1, p2])
        studies.append(s)

    db.session.add_all(studies)
    db.session.commit()

    # Create links between bios and diags
    # Only the first study is linked. The indexing below assumes the
    # relationship collections keep insertion order -- TODO confirm.
    bs_dgs = []

    # Participant 0
    p0 = studies[0].participants[0]
    # b0-d0
    bs_dgs.append(
        BiospecimenDiagnosis(biospecimen_id=p0.biospecimens[0].kf_id,
                             diagnosis_id=p0.diagnoses[0].kf_id))
    # b0-d1
    bs_dgs.append(
        BiospecimenDiagnosis(biospecimen_id=p0.biospecimens[0].kf_id,
                             diagnosis_id=p0.diagnoses[1].kf_id))
    # b1-d2
    bs_dgs.append(
        BiospecimenDiagnosis(biospecimen_id=p0.biospecimens[1].kf_id,
                             diagnosis_id=p0.diagnoses[2].kf_id))
    # b0-d2
    bs_dgs.append(
        BiospecimenDiagnosis(biospecimen_id=p0.biospecimens[0].kf_id,
                             diagnosis_id=p0.diagnoses[2].kf_id))

    # Participant 1
    p1 = studies[0].participants[1]
    # b0-d0
    bs_dgs.append(
        BiospecimenDiagnosis(biospecimen_id=p1.biospecimens[0].kf_id,
                             diagnosis_id=p1.diagnoses[0].kf_id))

    db.session.add_all(bs_dgs)
    db.session.commit()