def generate_samples(fraction_missing):
    """Creates fake sample CSV data in GCS.

    Writes a tab-delimited file named ``fake_<timestamp>.csv`` into the bucket
    configured under ``config.BIOBANK_SAMPLES_BUCKET_NAME``. The file holds a
    header row (``_HEADERS``) followed by two groups of rows:

    * one row per (biobank_id, collected_time, test) returned by
      ``BiobankOrderDao.get_ordered_samples_sample``; rows with no collected
      time are skipped with a warning;
    * "orphan" rows for participants returned by
      ``ParticipantDao.get_biobank_ids_sample``, each with a random non-empty
      subset of ``BIOBANK_TESTS``.

    Sample IDs are consecutive integers starting at a random offset.

    Args:
      fraction_missing: This many samples which exist as BiobankStoredSamples
          will not have rows generated in the fake CSV;
          ``1 - fraction_missing`` is passed to the DAO as the sampling
          fraction.
    """
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    now = clock.CLOCK.now()
    file_name = '/%s/fake_%s.csv' % (bucket_name,
                                     now.strftime(INPUT_CSV_TIME_FORMAT))
    num_rows = 0
    sample_id_start = random.randint(1000000, 10000000)
    with cloudstorage_api.open(file_name, mode='w') as dest:
        writer = csv.writer(dest, delimiter="\t")
        writer.writerow(_HEADERS)

        # Samples that correspond to existing biobank orders.
        biobank_order_dao = BiobankOrderDao()
        with biobank_order_dao.session() as session:
            rows = biobank_order_dao.get_ordered_samples_sample(
                session, 1 - fraction_missing, _BATCH_SIZE)
            for biobank_id, collected_time, test in rows:
                if collected_time is None:
                    # Without a collection time there is nothing to offset
                    # the confirmed time from.
                    logging.warning(
                        'biobank_id=%s test=%s skipped (collected=%s)',
                        biobank_id, test, collected_time)
                    continue
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED)
                confirmed_time = collected_time + datetime.timedelta(
                    minutes=minutes_delta)
                confirmed_str = confirmed_time.strftime(_TIME_FORMAT)
                writer.writerow([
                    sample_id_start + num_rows,
                    None,  # no parent
                    confirmed_str,
                    to_client_biobank_id(biobank_id),
                    test,
                    confirmed_str,  # reuse confirmed time as created time
                ])
                num_rows += 1

        # Orphan samples: rows for participants that have no matching order.
        participant_dao = ParticipantDao()
        with participant_dao.session() as session:
            rows = participant_dao.get_biobank_ids_sample(
                session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
            for biobank_id, sign_up_time in rows:
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED)
                confirmed_time = sign_up_time + datetime.timedelta(
                    minutes=minutes_delta)
                confirmed_str = confirmed_time.strftime(_TIME_FORMAT)
                tests = random.sample(BIOBANK_TESTS,
                                      random.randint(1, len(BIOBANK_TESTS)))
                for test in tests:
                    # Emit the same columns as the ordered-sample rows above
                    # (including created time) so every data row in the file
                    # has the same width.
                    writer.writerow([
                        sample_id_start + num_rows,
                        None,  # no parent
                        confirmed_str,
                        to_client_biobank_id(biobank_id),
                        test,
                        confirmed_str,  # reuse confirmed time as created time
                    ])
                    num_rows += 1
    logging.info("Generated %d samples in %s.", num_rows, file_name)
def to_client_json(self, model):
    """Convert a participant summary model into its client-facing JSON dict.

    Starts from ``model.asdict()``. When the participant's withdrawal status
    is NO_USE and the withdrawal time is older than
    ``WITHDRAWN_PARTICIPANT_VISIBILITY_TIME``, only the keys listed in
    ``WITHDRAWN_PARTICIPANT_FIELDS`` are kept. IDs are converted to their
    client form, ``ageRange`` is derived from ``dateOfBirth`` (``UNSET`` when
    absent), HPO/date/code/enum fields are formatted in place, and entries
    whose value is None are dropped from the returned dict.
    """
    summary = model.asdict()

    withdrawn = model.withdrawalStatus == WithdrawalStatus.NO_USE
    if withdrawn:
        # Once the visibility window has passed, expose only the fields
        # allowed for withdrawn participants.
        if (model.withdrawalTime <
                clock.CLOCK.now() - WITHDRAWN_PARTICIPANT_VISIBILITY_TIME):
            summary = dict((name, summary.get(name))
                           for name in WITHDRAWN_PARTICIPANT_FIELDS)

    summary['participantId'] = to_client_participant_id(model.participantId)

    raw_biobank_id = summary.get('biobankId')
    if raw_biobank_id:
        summary['biobankId'] = to_client_biobank_id(raw_biobank_id)

    birth_date = summary.get('dateOfBirth')
    summary['ageRange'] = (get_bucketed_age(birth_date, clock.CLOCK.now())
                           if birth_date else UNSET)

    format_json_hpo(summary, self.hpo_dao, 'hpoId')
    _initialize_field_type_sets()
    for date_field in _DATE_FIELDS:
        format_json_date(summary, date_field)
    for code_field in _CODE_FIELDS:
        format_json_code(summary, self.code_dao, code_field)
    for enum_field in _ENUM_FIELDS:
        format_json_enum(summary, enum_field)

    # Withdrawn or no-contact participants always report NO_CONTACT.
    if withdrawn or model.suspensionStatus == SuspensionStatus.NO_CONTACT:
        summary['recontactMethod'] = 'NO_CONTACT'

    # Strip None values.
    return dict((key, value) for key, value in summary.iteritems()
                if value is not None)
def open_biobank_samples(biobank_id1, biobank_id2, biobank_id3,
                         test1=None, test2=None, test3=None):
    """Returns a readable stream for the biobank samples CSV.

    Reads the ``biobank_samples_1.csv`` template from the test data directory
    and fills in its ``%(...)s`` placeholders with the three client-formatted
    biobank IDs and three test codes. Any test code that is not supplied (or
    is falsy) is replaced by a random pick from ``BIOBANK_TESTS``.
    """
    with open(data_path('biobank_samples_1.csv')) as template_file:
        template = template_file.read()

    substitutions = dict(
        biobank_id1=to_client_biobank_id(biobank_id1),
        biobank_id2=to_client_biobank_id(biobank_id2),
        biobank_id3=to_client_biobank_id(biobank_id3),
        test1=test1 or random.choice(BIOBANK_TESTS),
        test2=test2 or random.choice(BIOBANK_TESTS),
        test3=test3 or random.choice(BIOBANK_TESTS))
    return StringIO.StringIO(template % substitutions)
def to_client_json(self, model):
    """Build the client-facing JSON dict for a participant model.

    IDs are converted to their client form, ``lastModified`` and
    ``signUpTime`` are rendered with ``isoformat()``, ``providerLink`` is
    decoded from its stored JSON string, and the withdrawal/suspension status
    and time fields are formatted in place by ``format_json_enum`` and
    ``format_json_date``.
    """
    payload = dict(
        participantId=to_client_participant_id(model.participantId),
        biobankId=to_client_biobank_id(model.biobankId),
        lastModified=model.lastModified.isoformat(),
        signUpTime=model.signUpTime.isoformat(),
        providerLink=json.loads(model.providerLink),
        withdrawalStatus=model.withdrawalStatus,
        withdrawalTime=model.withdrawalTime,
        suspensionStatus=model.suspensionStatus,
        suspensionTime=model.suspensionTime)

    for enum_field in ('withdrawalStatus', 'suspensionStatus'):
        format_json_enum(payload, enum_field)
    for date_field in ('withdrawalTime', 'suspensionTime'):
        format_json_date(payload, date_field)
    return payload
def test_reconciliation_query(self):
    """Checks the four reports written by _query_and_write_reports.

    Inserts participants, orders and samples covering on-time, late, missing,
    unordered ("extra"), withdrawn and repeated-order cases, both recent and
    from 10 days ago, then runs
    biobank_samples_pipeline._query_and_write_reports into an
    InMemorySqlExporter and asserts on the rows of the received, late, missing
    and withdrawals CSVs.
    """
    self.setup_codes([RACE_QUESTION_CODE], CodeType.QUESTION)
    self.setup_codes([RACE_AIAN_CODE, RACE_WHITE_CODE], CodeType.ANSWER)
    self._questionnaire_id = self.create_questionnaire(
        'questionnaire3.json')
    # MySQL and Python sub-second rounding differs, so trim micros from generated times.
    order_time = clock.CLOCK.now().replace(microsecond=0)
    old_order_time = order_time - datetime.timedelta(days=10)
    within_24_hours = order_time + datetime.timedelta(hours=23)
    old_within_24_hours = old_order_time + datetime.timedelta(hours=23)
    late_time = order_time + datetime.timedelta(hours=25)
    old_late_time = old_order_time + datetime.timedelta(hours=25)
    # The report is generated as of 23h59m after the recent order time.
    file_time = order_time + datetime.timedelta(
        hours=23) + datetime.timedelta(minutes=59)
    two_days_ago = file_time - datetime.timedelta(days=2)

    # On time, recent order and samples; shows up in rx
    p_on_time = self._insert_participant()
    # Extra samples ordered now aren't considered missing or late.
    self._insert_order(p_on_time,
                       'GoodOrder',
                       BIOBANK_TESTS[:4],
                       order_time,
                       finalized_tests=BIOBANK_TESTS[:3],
                       kit_id='kit1',
                       tracking_number='t1')
    self._insert_samples(p_on_time, BIOBANK_TESTS[:2],
                         ['GoodSample1', 'GoodSample2'], within_24_hours,
                         within_24_hours - datetime.timedelta(hours=1))

    # On time order and samples from 10 days ago; shows up in rx
    p_old_on_time = self._insert_participant(race_codes=[RACE_AIAN_CODE])
    # Old missing samples from 10 days ago don't show up in missing or late.
    self._insert_order(p_old_on_time,
                       'OldGoodOrder',
                       BIOBANK_TESTS[:3],
                       old_order_time,
                       kit_id='kit2')
    self._insert_samples(p_old_on_time, BIOBANK_TESTS[:2],
                         ['OldGoodSample1', 'OldGoodSample2'],
                         old_within_24_hours,
                         old_within_24_hours - datetime.timedelta(hours=1))

    # Late, recent order and samples; shows up in rx and late. (But not missing, as it hasn't been
    # 24 hours since the order.)
    p_late_and_missing = self._insert_participant()
    # Extra missing sample doesn't show up as missing as it hasn't been 24 hours yet.
    o_late_and_missing = self._insert_order(p_late_and_missing, 'SlowOrder',
                                            BIOBANK_TESTS[:3], order_time)
    self._insert_samples(p_late_and_missing, [BIOBANK_TESTS[0]],
                         ['LateSample'], late_time,
                         late_time - datetime.timedelta(minutes=59))

    # Late order and samples from 10 days ago; shows up in rx (but not missing, as it was too
    # long ago).
    p_old_late_and_missing = self._insert_participant()
    self._insert_order(p_old_late_and_missing, 'OldSlowOrder',
                       BIOBANK_TESTS[:2], old_order_time)
    self._insert_samples(p_old_late_and_missing, [BIOBANK_TESTS[0]],
                         ['OldLateSample'], old_late_time,
                         old_late_time - datetime.timedelta(minutes=59))

    # Order with missing sample from 2 days ago; shows up in rx and missing.
    p_two_days_missing = self._insert_participant()
    # The third test doesn't wind up in missing, as it was never finalized.
    self._insert_order(p_two_days_missing,
                       'TwoDaysMissingOrder',
                       BIOBANK_TESTS[:3],
                       two_days_ago,
                       finalized_tests=BIOBANK_TESTS[:2])

    # Recent samples with no matching order; shows up in missing.
    p_extra = self._insert_participant(race_codes=[RACE_WHITE_CODE])
    self._insert_samples(p_extra, [BIOBANK_TESTS[-1]],
                         ['NobodyOrderedThisSample'], order_time,
                         order_time - datetime.timedelta(minutes=59))

    # Old samples with no matching order; shows up in rx.
    p_old_extra = self._insert_participant(race_codes=[RACE_AIAN_CODE])
    self._insert_samples(p_old_extra, [BIOBANK_TESTS[-1]],
                         ['OldNobodyOrderedThisSample'], old_order_time,
                         old_order_time - datetime.timedelta(hours=1))

    # Withdrawn participants don't show up in any reports except withdrawal report.
    p_withdrawn_old_on_time = self._insert_participant(
        race_codes=[RACE_AIAN_CODE])
    # This updates the version of the participant and its HPO ID, so the participant is
    # re-fetched from the DAO below before inserting samples and withdrawing.
    self._insert_order(p_withdrawn_old_on_time, 'OldWithdrawnGoodOrder',
                       BIOBANK_TESTS[:2], old_order_time)
    p_withdrawn_old_on_time = self.participant_dao.get(
        p_withdrawn_old_on_time.participantId)
    self._insert_samples(
        p_withdrawn_old_on_time, BIOBANK_TESTS[:2],
        ['OldWithdrawnGoodSample1', 'OldWithdrawnGoodSample2'],
        old_within_24_hours,
        old_within_24_hours - datetime.timedelta(hours=1))
    self._withdraw(p_withdrawn_old_on_time, within_24_hours)

    # Withdrawn counterpart of the recent late-and-missing case.
    p_withdrawn_late_and_missing = self._insert_participant()
    self._insert_order(p_withdrawn_late_and_missing, 'WithdrawnSlowOrder',
                       BIOBANK_TESTS[:2], order_time)
    self._insert_samples(p_withdrawn_late_and_missing, [BIOBANK_TESTS[0]],
                         ['WithdrawnLateSample'], late_time,
                         late_time - datetime.timedelta(minutes=59))
    p_withdrawn_late_and_missing = (self.participant_dao.get(
        p_withdrawn_late_and_missing.participantId))
    self._withdraw(p_withdrawn_late_and_missing, within_24_hours)

    # Withdrawn counterpart of the old late-and-missing case; this is the only participant
    # withdrawn at an old time (old_late_time) rather than within_24_hours.
    p_withdrawn_old_late_and_missing = self._insert_participant()
    self._insert_order(p_withdrawn_old_late_and_missing,
                       'WithdrawnOldSlowOrder', BIOBANK_TESTS[:2],
                       old_order_time)
    self._insert_samples(p_withdrawn_old_late_and_missing,
                         [BIOBANK_TESTS[0]], ['WithdrawnOldLateSample'],
                         old_late_time,
                         old_late_time - datetime.timedelta(minutes=59))
    p_withdrawn_old_late_and_missing = (self.participant_dao.get(
        p_withdrawn_old_late_and_missing.participantId))
    self._withdraw(p_withdrawn_old_late_and_missing, old_late_time)

    # Withdrawn participants with samples that nobody ordered (recent and old).
    p_withdrawn_extra = self._insert_participant(
        race_codes=[RACE_WHITE_CODE])
    self._insert_samples(p_withdrawn_extra, [BIOBANK_TESTS[-1]],
                         ['WithdrawnNobodyOrderedThisSample'], order_time,
                         order_time - datetime.timedelta(hours=1))
    self._withdraw(p_withdrawn_extra, within_24_hours)

    p_withdrawn_old_extra = self._insert_participant(
        race_codes=[RACE_AIAN_CODE])
    self._insert_samples(p_withdrawn_old_extra, [BIOBANK_TESTS[-1]],
                         ['WithdrawnOldNobodyOrderedThisSample'],
                         old_order_time,
                         old_order_time - datetime.timedelta(hours=1))
    self._withdraw(p_withdrawn_old_extra, within_24_hours)

    # Participant inserted with the AIAN race code who then submits a questionnaire response
    # with the white race code before withdrawing.
    p_withdrawn_race_change = self._insert_participant(
        race_codes=[RACE_AIAN_CODE])
    p_withdrawn_race_change_id = to_client_participant_id(
        p_withdrawn_race_change.participantId)
    self._submit_race_questionnaire_response(p_withdrawn_race_change_id,
                                             [RACE_WHITE_CODE])
    self._withdraw(p_withdrawn_race_change, within_24_hours)

    # for the same participant/test, 3 orders sent and only 2 samples received. Shows up in both
    # missing (we are missing one sample) and late (the two samples that were received were after
    # 24 hours.)
    p_repeated = self._insert_participant()
    for repetition in xrange(3):
        self._insert_order(
            p_repeated, 'RepeatedOrder%d' % repetition, [BIOBANK_TESTS[0]],
            two_days_ago + datetime.timedelta(hours=repetition))
        # Only the first two orders get a matching sample.
        if repetition != 2:
            self._insert_samples(
                p_repeated, [BIOBANK_TESTS[0]],
                ['RepeatedSample%d' % repetition],
                within_24_hours + datetime.timedelta(hours=repetition),
                within_24_hours + datetime.timedelta(hours=repetition - 1))

    # Run the report queries as of file_time and capture the four output files in memory.
    received, late, missing, withdrawals = 'rx.csv', 'late.csv', 'missing.csv', 'withdrawals.csv'
    exporter = InMemorySqlExporter(self)
    biobank_samples_pipeline._query_and_write_reports(
        exporter, file_time, received, late, missing, withdrawals)

    exporter.assertFilesEqual((received, late, missing, withdrawals))

    # sent-and-received: 4 on-time, 2 late, none of the missing/extra/repeated ones;
    # includes orders/samples from more than 7 days ago
    exporter.assertRowCount(received, 6)
    exporter.assertColumnNamesEqual(received, _CSV_COLUMN_NAMES)
    row = exporter.assertHasRow(
        received, {
            'biobank_id': to_client_biobank_id(p_on_time.biobankId),
            'sent_test': BIOBANK_TESTS[0],
            'received_test': BIOBANK_TESTS[0]
        })
    # Also check the values of all remaining fields on one row.
    self.assertEquals(row['source_site_name'],
                      'Monroeville Urgent Care Center')
    self.assertEquals(row['source_site_consortium'], 'Pittsburgh')
    self.assertEquals(row['source_site_mayolink_client_number'], '7035769')
    self.assertEquals(row['source_site_hpo'], 'PITT')
    self.assertEquals(row['source_site_hpo_type'], 'HPO')
    self.assertEquals(row['finalized_site_name'],
                      'Monroeville Urgent Care Center')
    self.assertEquals(row['finalized_site_consortium'], 'Pittsburgh')
    self.assertEquals(row['finalized_site_mayolink_client_number'],
                      '7035769')
    self.assertEquals(row['finalized_site_hpo'], 'PITT')
    self.assertEquals(row['finalized_site_hpo_type'], 'HPO')
    self.assertEquals(row['finalized_username'], '*****@*****.**')
    self.assertEquals(row['sent_finalized_time'],
                      database_utils.format_datetime(order_time))
    self.assertEquals(row['sent_collection_time'],
                      database_utils.format_datetime(order_time))
    self.assertEquals(row['sent_processed_time'],
                      database_utils.format_datetime(order_time))
    self.assertEquals(row['received_time'],
                      database_utils.format_datetime(within_24_hours))
    self.assertEquals(
        row['Sample Family Create Date'],
        database_utils.format_datetime(within_24_hours -
                                       datetime.timedelta(hours=1)))
    self.assertEquals(row['sent_count'], '1')
    self.assertEquals(row['received_count'], '1')
    self.assertEquals(row['sent_order_id'], 'OGoodOrder')
    self.assertEquals(row['received_sample_id'], 'GoodSample1')
    self.assertEquals(row['biospecimen_kit_id'], 'kit1')
    self.assertEquals(row['fedex_tracking_number'], 't1')
    # the other sent-and-received rows
    exporter.assertHasRow(
        received, {
            'biobank_id': to_client_biobank_id(p_on_time.biobankId),
            'sent_test': BIOBANK_TESTS[1]
        })
    exporter.assertHasRow(
        received, {
            'biobank_id': to_client_biobank_id(
                p_late_and_missing.biobankId),
            'sent_test': BIOBANK_TESTS[0]
        })
    exporter.assertHasRow(
        received, {
            'biobank_id': to_client_biobank_id(p_old_on_time.biobankId),
            'sent_test': BIOBANK_TESTS[0]
        })
    exporter.assertHasRow(
        received, {
            'biobank_id': to_client_biobank_id(p_old_on_time.biobankId),
            'sent_test': BIOBANK_TESTS[1]
        })
    exporter.assertHasRow(
        received, {
            'biobank_id':
            to_client_biobank_id(p_old_late_and_missing.biobankId),
            'sent_test': BIOBANK_TESTS[0]
        })

    # sent-and-received: 2 late; don't include orders/samples from more than 7 days ago
    exporter.assertRowCount(late, 2)
    exporter.assertColumnNamesEqual(late, _CSV_COLUMN_NAMES)
    exporter.assertHasRow(
        late, {
            'biobank_id': to_client_biobank_id(
                p_late_and_missing.biobankId),
            'sent_order_id': 'O%s' % o_late_and_missing.biobankOrderId,
            'elapsed_hours': '24'
        })
    exporter.assertHasRow(
        late, {
            'biobank_id': to_client_biobank_id(p_repeated.biobankId),
            'elapsed_hours': '45'
        })

    # orders/samples where something went wrong; don't include orders/samples from more than 7
    # days ago, or where 24 hours hasn't elapsed yet.
    exporter.assertRowCount(missing, 4)
    exporter.assertColumnNamesEqual(missing, _CSV_COLUMN_NAMES)
    # sample received, nothing ordered
    exporter.assertHasRow(
        missing, {
            'biobank_id': to_client_biobank_id(p_extra.biobankId),
            'sent_order_id': ''
        })
    # order received, no sample
    exporter.assertHasRow(
        missing, {
            'biobank_id': to_client_biobank_id(
                p_two_days_missing.biobankId),
            'sent_order_id': 'OTwoDaysMissingOrder',
            'sent_test': BIOBANK_TESTS[0]
        })
    exporter.assertHasRow(
        missing, {
            'biobank_id': to_client_biobank_id(
                p_two_days_missing.biobankId),
            'sent_order_id': 'OTwoDaysMissingOrder',
            'sent_test': BIOBANK_TESTS[1]
        })

    # 3 orders sent, only 2 received
    multi_sample_row = exporter.assertHasRow(
        missing, {
            'biobank_id': to_client_biobank_id(p_repeated.biobankId),
            'sent_count': '3',
            'received_count': '2'
        })

    # Also verify the comma-joined fields of the row with multiple orders/samples.
    self.assertItemsEqual(
        multi_sample_row['sent_order_id'].split(','),
        ['ORepeatedOrder1', 'ORepeatedOrder0', 'ORepeatedOrder2'])
    self.assertItemsEqual(
        multi_sample_row['received_sample_id'].split(','),
        ['RepeatedSample0', 'RepeatedSample1'])

    # We don't include the old withdrawal.
    exporter.assertRowCount(withdrawals, 5)
    exporter.assertHasRow(
        withdrawals, {
            'biobank_id':
            to_client_biobank_id(p_withdrawn_old_on_time.biobankId),
            'withdrawal_time':
            database_utils.format_datetime(within_24_hours),
            'is_native_american': 'Y'
        })
    exporter.assertHasRow(
        withdrawals, {
            'biobank_id':
            to_client_biobank_id(p_withdrawn_late_and_missing.biobankId),
            'withdrawal_time':
            database_utils.format_datetime(within_24_hours),
            'is_native_american': 'N'
        })
    exporter.assertHasRow(
        withdrawals, {
            'biobank_id': to_client_biobank_id(
                p_withdrawn_extra.biobankId),
            'withdrawal_time':
            database_utils.format_datetime(within_24_hours),
            'is_native_american': 'N'
        })
    exporter.assertHasRow(
        withdrawals, {
            'biobank_id': to_client_biobank_id(
                p_withdrawn_old_extra.biobankId),
            'withdrawal_time':
            database_utils.format_datetime(within_24_hours),
            'is_native_american': 'Y'
        })
    # The participant whose race answer changed to white before withdrawal is reported as 'N'.
    exporter.assertHasRow(
        withdrawals, {
            'biobank_id':
            to_client_biobank_id(p_withdrawn_race_change.biobankId),
            'withdrawal_time':
            database_utils.format_datetime(within_24_hours),
            'is_native_american': 'N'
        })