def update_package_id_from_manifest_result_file(genomic_set_id, csv_file):
    """Bulk-update package ids from a manifest result file.

    The file is parsed as comma-delimited first. If that yields exactly one
    header column, the file is rewound and parsed again as tab-delimited.
    Rows with an empty client id (``CsvColumns.VALUE``) or an empty package id
    are skipped; the remaining pairs are handed to
    ``GenomicSetMemberDao.bulk_update_package_id`` in file order.

    Args:
        genomic_set_id: forwarded unchanged to ``bulk_update_package_id``.
        csv_file: seekable file-like object containing the manifest result.

    Raises:
        DataError: if any column of ``CsvColumns.ALL`` is absent from the
            header (an empty file has no header columns at all), or if a
            ``ValueError`` is raised while reading rows or updating.
    """
    required_cols = set(CsvColumns.ALL)

    csv_reader = csv.DictReader(csv_file, delimiter=',')
    # DictReader.fieldnames is None for an empty file; treat that as "no
    # columns" so the caller gets a DataError instead of a TypeError.
    fieldnames = csv_reader.fieldnames or []
    if len(fieldnames) == 1:
        # A single column usually means the delimiter guess was wrong: rewind
        # and retry with tabs.
        csv_file.seek(0, 0)
        csv_reader = csv.DictReader(csv_file, delimiter='\t')
        fieldnames = csv_reader.fieldnames or []

    missing_cols = required_cols - set(fieldnames)
    if missing_cols:
        raise DataError('CSV is missing columns %s, had columns %s.' % (missing_cols, fieldnames))

    ClientIdPackageIdPair = collections.namedtuple('ClientIdPackageIdPair', [
        'client_id',
        'package_id',
    ])
    dao = GenomicSetMemberDao()

    try:
        update_queue = collections.deque(
            ClientIdPackageIdPair(row[CsvColumns.VALUE], row[CsvColumns.PACKAGE_ID])
            for row in csv_reader
            if row[CsvColumns.VALUE] and row[CsvColumns.PACKAGE_ID])
        dao.bulk_update_package_id(genomic_set_id, update_queue)
    except ValueError as e:
        raise DataError(e)
def update_sample_info_from_genotyping_manifest_file(csv_file):
    """Bulk-update genotyping sample data from a comma-delimited manifest.

    Rows where any of biobank id, sample id, sample type or test name is empty
    are skipped. A leading ``BIOBANK_ID_PREFIX`` is stripped from the biobank
    id. The resulting records are passed to
    ``GenomicSetMemberDao.bulk_update_genotyping_sample_manifest_data`` in
    file order.

    Args:
        csv_file: file-like object containing the genotyping manifest.

    Raises:
        DataError: if any column of ``CsvColumns.REQUIRED_COLS`` is absent
            from the header (an empty file has no header columns at all), or
            if a ``ValueError`` is raised while reading rows or updating.
    """
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    # DictReader.fieldnames is None for an empty file; treat that as "no
    # columns" so the caller gets a DataError instead of a TypeError.
    fieldnames = csv_reader.fieldnames or []
    # Report only the columns that are actually absent, not the full
    # required list.
    missing_cols = set(CsvColumns.REQUIRED_COLS) - set(fieldnames)
    if missing_cols:
        raise DataError('CSV is missing columns %s, had columns %s.' % (missing_cols, fieldnames))

    GenotypingData = collections.namedtuple('genotypingData', [
        'biobank_id',
        'genome_type',
        'sample_id',
        'sample_type',
    ])
    update_queue = collections.deque()
    dao = GenomicSetMemberDao()

    try:
        for row in csv_reader:
            raw_biobank_id = row[CsvColumns.BIOBANK_ID]
            if not (raw_biobank_id and row[CsvColumns.SAMPLE_ID]
                    and row[CsvColumns.SAMPLE_TYPE] and row[CsvColumns.TEST_NAME]):
                continue

            if raw_biobank_id.startswith(BIOBANK_ID_PREFIX):
                biobank_id = raw_biobank_id[len(BIOBANK_ID_PREFIX):]
            else:
                biobank_id = raw_biobank_id

            # The manifest's test name is stored in the genome_type field.
            update_queue.append(GenotypingData(biobank_id,
                                               row[CsvColumns.TEST_NAME],
                                               row[CsvColumns.SAMPLE_ID],
                                               row[CsvColumns.SAMPLE_TYPE]))

        dao.bulk_update_genotyping_sample_manifest_data(update_queue)
    except ValueError as e:
        raise DataError(e)
def setUp(self, with_data=True, use_mysql=False):
    """Run the base-class setup, create the DAOs, then call ``setup_data``."""
    base = super(GenomicSetValidationBaseTestCase, self)
    base.setUp(with_data=with_data, use_mysql=use_mysql)

    # DAO handles used by the test helpers.
    self.participant_dao = ParticipantDao()
    self.summary_dao = ParticipantSummaryDao()
    self.genomic_set_dao = GenomicSetDao()
    self.genomic_member_dao = GenomicSetMemberDao()

    # Counter starts at zero before any fixture data is created.
    self._participant_i = 0

    self.setup_data()
def _save_genomic_set_from_csv(csv_reader, csv_filename, timestamp):
    """Inserts GenomicSet and GenomicSetMember from a csv.DictReader.

    The first data row supplies the genomic set (via
    ``_insert_genomic_set_from_row``). Every row, including the first, becomes
    a member of that set. Members are upserted in batches of ``_BATCH_SIZE``.

    Args:
        csv_reader: ``csv.DictReader`` positioned at the start of the upload.
        csv_filename: forwarded to ``_insert_genomic_set_from_row``.
        timestamp: forwarded to ``_insert_genomic_set_from_row``.

    Returns:
        The id of the inserted genomic set, or ``None`` when the CSV has a
        valid header but no data rows.

    Raises:
        DataError: if any column of ``CsvColumns.ALL`` is absent from the
            header (an empty file has no header columns at all), if the first
            row lacks a genomic set name or criteria, or if a ``ValueError``
            is raised while processing rows.
    """
    # DictReader.fieldnames is None for an empty file; treat that as "no
    # columns" so the caller gets a DataError instead of a TypeError.
    fieldnames = csv_reader.fieldnames or []
    missing_cols = set(CsvColumns.ALL) - set(fieldnames)
    if missing_cols:
        raise DataError(
            'CSV is missing columns %s, had columns %s.' % (missing_cols, fieldnames))

    member_dao = GenomicSetMemberDao()
    genomic_set_id = None
    try:
        members = []
        # Materialize all rows up front so that the whole file is parsed
        # before the first database write happens.
        rows = list(csv_reader)
        for i, row in enumerate(rows):
            if i == 0:
                if not (row[CsvColumns.GENOMIC_SET_NAME]
                        and row[CsvColumns.GENOMIC_SET_CRITERIA]):
                    raise DataError(
                        'CSV is missing columns genomic_set_name or genomic_set_criteria')
                genomic_set = _insert_genomic_set_from_row(row, csv_filename, timestamp)
                genomic_set_id = genomic_set.id

            members.append(_create_genomic_set_member_from_row(genomic_set_id, row))
            if len(members) >= _BATCH_SIZE:
                member_dao.upsert_all(members)
                members = []

        # Flush the final, partially filled batch.
        if members:
            member_dao.upsert_all(members)

        # NOTE(review): with zero data rows this is called with None --
        # confirm that update_biobank_id tolerates that.
        member_dao.update_biobank_id(genomic_set_id)
        return genomic_set_id
    except ValueError as e:
        raise DataError(e)
def test_read_from_csv_file(self):
    """Load a genomic set CSV from the bucket and check the stored set and members."""
    # (participant id, biobank id, biobank order id, order client id)
    fixtures = (
        (123, 1234, '123', u'12345678'),
        (124, 1235, '124', u'12345679'),
        (125, 1236, '125', u'12345680'),
    )
    for participant_id, biobank_id, order_id, client_id in fixtures:
        participant = self.participant_dao.insert(
            Participant(participantId=participant_id, biobankId=biobank_id))
        self.summary_dao.insert(self.participant_summary(participant))
        order = self._make_biobank_order(
            participantId=participant.participantId,
            biobankOrderId=order_id,
            identifiers=[BiobankOrderIdentifier(system=u'https://www.pmi-ops.org',
                                                value=client_id)])
        BiobankOrderDao().insert(order)

    samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-1.csv')
    central_now = self._naive_utc_to_naive_central(clock.CLOCK.now())
    time_suffix = central_now.strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)
    input_filename = 'cloud%s.csv' % time_suffix
    self._write_cloud_csv(input_filename, samples_file)

    genomic_set_file_handler.read_genomic_set_from_bucket()

    # The first stored set must carry the name/criteria/version of the upload.
    genomic_set = GenomicSetDao().get_all()[0]
    self.assertEqual(genomic_set.genomicSetName, 'name_xxx')
    self.assertEqual(genomic_set.genomicSetCriteria, 'criteria_xxx')
    self.assertEqual(genomic_set.genomicSetVersion, 1)

    # Every stored member must only use values from the fixtures above.
    for member in GenomicSetMemberDao().get_all():
        self.assertIn(member.participantId, [123, 124, 125])
        self.assertIn(member.biobankOrderId, ['123', '124', '125'])
        self.assertIn(member.biobankId, ['1234', '1235', '1236'])
        self.assertIn(member.biobankOrderClientId, ['12345678', '12345679', '12345680'])
        self.assertEqual(member.genomicSetId, 1)
        self.assertIn(member.genomeType, ['aou_wgs', 'aou_array'])
        self.assertIn(member.nyFlag, [0, 1])
        self.assertIn(member.sexAtBirth, ['F', 'M'])
def test_over_24hours_genomic_set_file_case(self):
    """A set file timestamped 25 hours in the past must produce no members."""
    samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-3.csv')

    stale_utc = clock.CLOCK.now() - datetime.timedelta(hours=25)
    stale_central = self._naive_utc_to_naive_central(stale_utc)
    time_suffix = stale_central.strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)
    input_filename = 'Genomic-Test-Set-v1%s.csv' % time_suffix
    self._write_cloud_csv(input_filename, samples_file)

    genomic_pipeline.process_genomic_water_line()

    stored_members = GenomicSetMemberDao().get_all()
    self.assertEqual(len(stored_members), 0)
def _create_fake_genomic_member(self, genomic_set_id, participant_id, biobank_order_id,
                                validation_status=GenomicValidationStatus.VALID,
                                sex_at_birth='F', genome_type='aou_array', ny_flag='Y'):
    """Build a GenomicSetMember from the arguments and insert it.

    ``ny_flag`` is given as a string; ``'Y'`` is stored as 1 and any other
    value as 0.
    """
    # Convert the textual flag to the integer stored on the member.
    if ny_flag == 'Y':
        ny_flag_value = 1
    else:
        ny_flag_value = 0

    member = GenomicSetMember()
    member.genomicSetId = genomic_set_id
    member.validationStatus = validation_status
    member.participantId = participant_id
    member.sexAtBirth = sex_at_birth
    member.genomeType = genome_type
    member.nyFlag = ny_flag_value
    member.biobankOrderId = biobank_order_id

    GenomicSetMemberDao().insert(member)
def _create_fake_genomic_member(self, genomic_set_id, participant_id, biobank_order_id,
                                biobank_id, biobank_order_client_id,
                                validation_status=GenomicSetMemberStatus.VALID,
                                validation_flags=None, sex_at_birth='F',
                                genome_type='aou_array', ny_flag='Y'):
    """Build a GenomicSetMember from the arguments and insert it.

    ``created`` and ``modified`` are both set to the current clock time.
    ``ny_flag`` is given as a string; ``'Y'`` is stored as 1 and any other
    value as 0.
    """
    timestamp = clock.CLOCK.now()

    # Convert the textual flag to the integer stored on the member.
    if ny_flag == 'Y':
        ny_flag_value = 1
    else:
        ny_flag_value = 0

    member = GenomicSetMember()
    member.genomicSetId = genomic_set_id
    member.created = timestamp
    member.modified = timestamp
    member.validationStatus = validation_status
    member.validationFlags = validation_flags
    member.participantId = participant_id
    member.sexAtBirth = sex_at_birth
    member.genomeType = genome_type
    member.nyFlag = ny_flag_value
    member.biobankOrderId = biobank_order_id
    member.biobankId = biobank_id
    member.biobankOrderClientId = biobank_order_client_id

    GenomicSetMemberDao().insert(member)
class GenomicSetValidationBaseTestCase(SqlTestBase):
    """Base test case with factory helpers for participants, summaries,
    genomic sets and genomic set members.

    Each ``make_*`` helper builds an object from a set of default field
    values, applies the caller's overrides on top, inserts it through the
    matching DAO and returns it.
    """

    def setUp(self, with_data=True, use_mysql=False):
        """Run the base-class setup, create the DAOs, then call ``setup_data``."""
        base = super(GenomicSetValidationBaseTestCase, self)
        base.setUp(with_data=with_data, use_mysql=use_mysql)

        self.participant_dao = ParticipantDao()
        self.summary_dao = ParticipantSummaryDao()
        self.genomic_set_dao = GenomicSetDao()
        self.genomic_member_dao = GenomicSetMemberDao()

        # Source of the ids handed out by make_participant().
        self._participant_i = 0

        self.setup_data()

    def setup_data(self):
        """Hook called at the end of ``setUp``; does nothing here."""
        pass

    def make_participant(self, **kwargs):
        """Insert and return a participant.

        The next counter value is used for both ``participantId`` and
        ``biobankId``; ``kwargs`` are passed on to ``Participant``.
        """
        next_id = self._participant_i
        self._participant_i = next_id + 1

        new_participant = Participant(participantId=next_id, biobankId=next_id, **kwargs)
        self.participant_dao.insert(new_participant)
        return new_participant

    def make_summary(self, participant, **override_kwargs):
        """Insert and return a summary for ``participant``.

        Keyword arguments replace the default field values.
        """
        fields = {
            'participantId': participant.participantId,
            'biobankId': participant.biobankId,
            'withdrawalStatus': participant.withdrawalStatus,
            'dateOfBirth': datetime.datetime(2000, 1, 1),
            'firstName': 'foo',
            'lastName': 'bar',
            'zipCode': '12345',
            'sampleStatus1ED04': SampleStatus.RECEIVED,
            'sampleStatus1SAL2': SampleStatus.RECEIVED,
            'samplesToIsolateDNA': SampleStatus.RECEIVED,
            'consentForStudyEnrollmentTime': datetime.datetime(2019, 1, 1),
        }
        fields.update(override_kwargs)

        new_summary = self._participant_summary_with_defaults(**fields)
        self.summary_dao.insert(new_summary)
        return new_summary

    def make_genomic_set(self, **override_kwargs):
        """Insert and return a genomic set.

        Keyword arguments replace the default field values.
        """
        fields = {
            'genomicSetName': 'foo',
            'genomicSetCriteria': 'something',
            'genomicSetVersion': 1,
            'genomicSetStatus': GenomicSetStatus.UNSET,
        }
        fields.update(override_kwargs)

        new_set = GenomicSet(**fields)
        self.genomic_set_dao.insert(new_set)
        return new_set

    def make_genomic_member(self, genomic_set, participant, **override_kwargs):
        """Insert and return a member linking ``participant`` to ``genomic_set``.

        Keyword arguments replace the default field values.
        """
        fields = {
            'genomicSetId': genomic_set.id,
            'participantId': participant.participantId,
            'sexAtBirth': 'F',
            'biobankId': participant.biobankId,
            'biobankOrderClientId': '12345678',
        }
        fields.update(override_kwargs)

        new_member = GenomicSetMember(**fields)
        self.genomic_member_dao.insert(new_member)
        return new_member
def test_end_to_end_valid_case(self):
    """Run the genomic pipeline on valid input and check every output file.

    Covers the validation result file, the manifest file, the manifest result
    file, and the package ids stored on the genomic set members.
    """

    class ResultCsvColumns(object):
        """Column names expected in the validation result CSV."""
        GENOMIC_SET_NAME = 'genomic_set_name'
        GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
        PID = 'pid'
        BIOBANK_ORDER_ID = 'biobank_order_id'
        NY_FLAG = 'ny_flag'
        SEX_AT_BIRTH = 'sex_at_birth'
        GENOME_TYPE = 'genome_type'
        STATUS = 'status'
        INVALID_REASON = 'invalid_reason'

        ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID, BIOBANK_ORDER_ID, NY_FLAG,
               SEX_AT_BIRTH, GENOME_TYPE, STATUS, INVALID_REASON)

    class ExpectedCsvColumns(object):
        """Column names expected in the manifest and manifest result CSVs."""
        VALUE = 'value'
        BIOBANK_ID = 'biobank_id'
        SEX_AT_BIRTH = 'sex_at_birth'
        GENOME_TYPE = 'genome_type'
        NY_FLAG = 'ny_flag'
        REQUEST_ID = 'request_id'
        PACKAGE_ID = 'package_id'

        ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID, PACKAGE_ID)

    def timestamped(filename_template):
        # Fill the template with the current central time in the handler's format.
        central_now = self._naive_utc_to_naive_central(clock.CLOCK.now())
        return filename_template % central_now.strftime(
            genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)

    def read_latest_csv(bucket, name_fragment):
        # Open the newest matching CSV in the bucket as a DictReader.
        latest_path = self._find_latest_genomic_set_csv(bucket, name_fragment)
        return csv.DictReader(cloudstorage_api.open(latest_path), delimiter=',')

    def assert_no_missing_columns(reader, required_columns):
        absent = set(required_columns) - set(reader.fieldnames)
        self.assertEqual(len(absent), 0)

    def assert_rows_equal(actual_rows, columns, expected_rows):
        # Index explicitly so that a short file fails instead of passing silently.
        for index, expected_values in enumerate(expected_rows):
            actual = actual_rows[index]
            for column, expected in zip(columns, expected_values):
                self.assertEqual(actual[column], expected)

    # Three participants, each with a summary and one biobank order.
    for client_id in (u'12345678', u'12345679', u'12345680'):
        participant = self._make_participant()
        self._make_summary(participant)
        self._make_biobank_order(
            participantId=participant.participantId,
            biobankOrderId=participant.participantId,
            identifiers=[BiobankOrderIdentifier(system=u'https://www.pmi-ops.org',
                                                value=client_id)])

    # Upload the genomic set file and the manifest result file.
    samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-2.csv')
    input_filename = timestamped('Genomic-Test-Set-v1%s.csv')
    self._write_cloud_csv(input_filename, samples_file)

    manifest_result_file = test_data.open_genomic_set_file('Genomic-Manifest-Result-test.csv')
    manifest_result_filename = timestamped('Genomic-Manifest-Result-AoU-1-v1%s.csv')
    self._write_cloud_csv(manifest_result_filename, manifest_result_file,
                          bucket=_FAKE_BIOBANK_SAMPLE_BUCKET,
                          folder=_FAKE_BUCKET_RESULT_FOLDER)

    genomic_pipeline.process_genomic_water_line()

    # verify result file
    bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
    result_reader = read_latest_csv(bucket_name, 'Validation-Result')
    assert_no_missing_columns(result_reader, ResultCsvColumns.ALL)
    result_rows = list(result_reader)
    self.assertEqual(len(result_rows), 3)
    assert_rows_equal(
        result_rows,
        (ResultCsvColumns.GENOMIC_SET_NAME, ResultCsvColumns.GENOMIC_SET_CRITERIA,
         ResultCsvColumns.STATUS, ResultCsvColumns.INVALID_REASON, ResultCsvColumns.PID,
         ResultCsvColumns.BIOBANK_ORDER_ID, ResultCsvColumns.NY_FLAG,
         ResultCsvColumns.GENOME_TYPE, ResultCsvColumns.SEX_AT_BIRTH),
        (
            ('name_xxx', 'criteria_xxx', 'valid', '', '1', '1', 'Y', 'aou_wgs', 'M'),
            ('name_xxx', 'criteria_xxx', 'valid', '', '2', '2', 'N', 'aou_array', 'F'),
            ('name_xxx', 'criteria_xxx', 'valid', '', '3', '3', 'N', 'aou_array', 'M'),
        ))

    # verify manifest files
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    manifest_reader = read_latest_csv(bucket_name, 'Manifest')
    assert_no_missing_columns(manifest_reader, ExpectedCsvColumns.ALL)
    manifest_rows = list(manifest_reader)
    assert_rows_equal(
        manifest_rows,
        (ExpectedCsvColumns.VALUE, ExpectedCsvColumns.BIOBANK_ID,
         ExpectedCsvColumns.SEX_AT_BIRTH, ExpectedCsvColumns.GENOME_TYPE,
         ExpectedCsvColumns.NY_FLAG),
        (
            ('12345678', '1', 'M', 'aou_wgs', 'Y'),
            ('12345679', '2', 'F', 'aou_array', 'N'),
            ('12345680', '3', 'M', 'aou_array', 'N'),
        ))

    # verify manifest result files
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    manifest_result_reader = read_latest_csv(bucket_name, 'Manifest-Result')
    assert_no_missing_columns(manifest_result_reader, ExpectedCsvColumns.ALL)
    manifest_result_rows = list(manifest_result_reader)
    assert_rows_equal(
        manifest_result_rows,
        (ExpectedCsvColumns.VALUE, ExpectedCsvColumns.BIOBANK_ID,
         ExpectedCsvColumns.SEX_AT_BIRTH, ExpectedCsvColumns.GENOME_TYPE,
         ExpectedCsvColumns.NY_FLAG, ExpectedCsvColumns.PACKAGE_ID),
        (
            ('12345678', '1', 'M', 'aou_wgs', 'Y', 'PKG-XXXX-XXXX1'),
            ('12345679', '2', 'F', 'aou_array', 'N', 'PKG-XXXX-XXXX2'),
            ('12345680', '3', 'M', 'aou_array', 'N', 'PKG-XXXX-XXXX3'),
        ))

    # verify package id in database
    for member in GenomicSetMemberDao().get_all():
        self.assertIn(member.packageId,
                      ['PKG-XXXX-XXXX1', 'PKG-XXXX-XXXX2', 'PKG-XXXX-XXXX3'])