def generate_samples(fraction_missing):
    """Creates fake sample CSV data in GCS.

  Args:
    fraction_missing: This many samples which exist as BiobankStoredSamples will not have rows
        generated in the fake CSV.
  """
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    now = clock.CLOCK.now()
    file_name = '/%s/fake_%s.csv' % (bucket_name,
                                     now.strftime(INPUT_CSV_TIME_FORMAT))
    num_rows = 0
    sample_id_start = random.randint(1000000, 10000000)
    with cloudstorage_api.open(file_name, mode='w') as dest:
        writer = csv.writer(dest, delimiter="\t")
        writer.writerow(_HEADERS)
        biobank_order_dao = BiobankOrderDao()
        with biobank_order_dao.session() as session:
            rows = biobank_order_dao.get_ordered_samples_sample(
                session, 1 - fraction_missing, _BATCH_SIZE)
            for biobank_id, collected_time, test in rows:
                if collected_time is None:
                    logging.warning(
                        'biobank_id=%s test=%s skipped (collected=%s)',
                        biobank_id, test, collected_time)
                    continue
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED)
                confirmed_time = collected_time + datetime.timedelta(
                    minutes=minutes_delta)
                writer.writerow([
                    sample_id_start + num_rows,
                    None,  # no parent
                    confirmed_time.strftime(_TIME_FORMAT),
                    to_client_biobank_id(biobank_id),
                    test,
                    confirmed_time.strftime(_TIME_FORMAT)
                ])  # reuse confirmed time as created time
                num_rows += 1
        participant_dao = ParticipantDao()
        with participant_dao.session() as session:
            rows = participant_dao.get_biobank_ids_sample(
                session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
            for biobank_id, sign_up_time in rows:
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED)
                confirmed_time = sign_up_time + datetime.timedelta(
                    minutes=minutes_delta)
                tests = random.sample(BIOBANK_TESTS,
                                      random.randint(1, len(BIOBANK_TESTS)))
                for test in tests:
                    writer.writerow([
                        sample_id_start + num_rows, None,
                        confirmed_time.strftime(_TIME_FORMAT),
                        to_client_biobank_id(biobank_id), test
                    ])
                    num_rows += 1
    logging.info("Generated %d samples in %s.", num_rows, file_name)
    def to_client_json(self, model):
        result = model.asdict()
        # Participants that withdrew more than 48 hours ago should have fields other than
        # WITHDRAWN_PARTICIPANT_FIELDS cleared.
        if (model.withdrawalStatus == WithdrawalStatus.NO_USE
                and model.withdrawalTime <
                clock.CLOCK.now() - WITHDRAWN_PARTICIPANT_VISIBILITY_TIME):
            result = {k: result.get(k) for k in WITHDRAWN_PARTICIPANT_FIELDS}

        result['participantId'] = to_client_participant_id(model.participantId)
        biobank_id = result.get('biobankId')
        if biobank_id:
            result['biobankId'] = to_client_biobank_id(biobank_id)
        date_of_birth = result.get('dateOfBirth')
        if date_of_birth:
            result['ageRange'] = get_bucketed_age(date_of_birth,
                                                  clock.CLOCK.now())
        else:
            result['ageRange'] = UNSET
        format_json_hpo(result, self.hpo_dao, 'hpoId')
        _initialize_field_type_sets()
        for fieldname in _DATE_FIELDS:
            format_json_date(result, fieldname)
        for fieldname in _CODE_FIELDS:
            format_json_code(result, self.code_dao, fieldname)
        for fieldname in _ENUM_FIELDS:
            format_json_enum(result, fieldname)
        if (model.withdrawalStatus == WithdrawalStatus.NO_USE
                or model.suspensionStatus == SuspensionStatus.NO_CONTACT):
            result['recontactMethod'] = 'NO_CONTACT'
        # Strip None values.
        result = {k: v for k, v in result.iteritems() if v is not None}

        return result
示例#3
0
def open_biobank_samples(biobank_id1,
                         biobank_id2,
                         biobank_id3,
                         test1=None,
                         test2=None,
                         test3=None):
    """Returns an readable stream for the biobank samples CSV."""
    with open(data_path('biobank_samples_1.csv')) as f:
        csv_str = f.read() % {
            'biobank_id1': to_client_biobank_id(biobank_id1),
            'biobank_id2': to_client_biobank_id(biobank_id2),
            'biobank_id3': to_client_biobank_id(biobank_id3),
            'test1': test1 or random.choice(BIOBANK_TESTS),
            'test2': test2 or random.choice(BIOBANK_TESTS),
            'test3': test3 or random.choice(BIOBANK_TESTS),
        }
    return StringIO.StringIO(csv_str)
 def to_client_json(self, model):
   client_json = {
       'participantId': to_client_participant_id(model.participantId),
       'biobankId': to_client_biobank_id(model.biobankId),
       'lastModified': model.lastModified.isoformat(),
       'signUpTime': model.signUpTime.isoformat(),
       'providerLink': json.loads(model.providerLink),
       'withdrawalStatus': model.withdrawalStatus,
       'withdrawalTime': model.withdrawalTime,
       'suspensionStatus': model.suspensionStatus,
       'suspensionTime': model.suspensionTime
   }
   format_json_enum(client_json, 'withdrawalStatus')
   format_json_enum(client_json, 'suspensionStatus')
   format_json_date(client_json, 'withdrawalTime')
   format_json_date(client_json, 'suspensionTime')
   return client_json
示例#5
0
    def test_reconciliation_query(self):
        self.setup_codes([RACE_QUESTION_CODE], CodeType.QUESTION)
        self.setup_codes([RACE_AIAN_CODE, RACE_WHITE_CODE], CodeType.ANSWER)
        self._questionnaire_id = self.create_questionnaire(
            'questionnaire3.json')
        # MySQL and Python sub-second rounding differs, so trim micros from generated times.
        order_time = clock.CLOCK.now().replace(microsecond=0)
        old_order_time = order_time - datetime.timedelta(days=10)
        within_24_hours = order_time + datetime.timedelta(hours=23)
        old_within_24_hours = old_order_time + datetime.timedelta(hours=23)
        late_time = order_time + datetime.timedelta(hours=25)
        old_late_time = old_order_time + datetime.timedelta(hours=25)
        file_time = order_time + datetime.timedelta(
            hours=23) + datetime.timedelta(minutes=59)
        two_days_ago = file_time - datetime.timedelta(days=2)

        # On time, recent order and samples; shows up in rx
        p_on_time = self._insert_participant()
        # Extra samples ordered now aren't considered missing or late.
        self._insert_order(p_on_time,
                           'GoodOrder',
                           BIOBANK_TESTS[:4],
                           order_time,
                           finalized_tests=BIOBANK_TESTS[:3],
                           kit_id='kit1',
                           tracking_number='t1')
        self._insert_samples(p_on_time, BIOBANK_TESTS[:2],
                             ['GoodSample1', 'GoodSample2'], within_24_hours,
                             within_24_hours - datetime.timedelta(hours=1))

        # On time order and samples from 10 days ago; shows up in rx
        p_old_on_time = self._insert_participant(race_codes=[RACE_AIAN_CODE])
        # Old missing samples from 10 days ago don't show up in missing or late.
        self._insert_order(p_old_on_time,
                           'OldGoodOrder',
                           BIOBANK_TESTS[:3],
                           old_order_time,
                           kit_id='kit2')
        self._insert_samples(p_old_on_time, BIOBANK_TESTS[:2],
                             ['OldGoodSample1', 'OldGoodSample2'],
                             old_within_24_hours,
                             old_within_24_hours - datetime.timedelta(hours=1))

        # Late, recent order and samples; shows up in rx and late. (But not missing, as it hasn't been
        # 24 hours since the order.)
        p_late_and_missing = self._insert_participant()
        # Extra missing sample doesn't show up as missing as it hasn't been 24 hours yet.
        o_late_and_missing = self._insert_order(p_late_and_missing,
                                                'SlowOrder', BIOBANK_TESTS[:3],
                                                order_time)
        self._insert_samples(p_late_and_missing, [BIOBANK_TESTS[0]],
                             ['LateSample'], late_time,
                             late_time - datetime.timedelta(minutes=59))

        # Late order and samples from 10 days ago; shows up in rx (but not missing, as it was too
        # long ago.
        p_old_late_and_missing = self._insert_participant()
        self._insert_order(p_old_late_and_missing, 'OldSlowOrder',
                           BIOBANK_TESTS[:2], old_order_time)
        self._insert_samples(p_old_late_and_missing, [BIOBANK_TESTS[0]],
                             ['OldLateSample'], old_late_time,
                             old_late_time - datetime.timedelta(minutes=59))

        # Order with missing sample from 2 days ago; shows up in rx and missing.
        p_two_days_missing = self._insert_participant()
        # The third test doesn't wind up in missing, as it was never finalized.
        self._insert_order(p_two_days_missing,
                           'TwoDaysMissingOrder',
                           BIOBANK_TESTS[:3],
                           two_days_ago,
                           finalized_tests=BIOBANK_TESTS[:2])

        # Recent samples with no matching order; shows up in missing.
        p_extra = self._insert_participant(race_codes=[RACE_WHITE_CODE])
        self._insert_samples(p_extra, [BIOBANK_TESTS[-1]],
                             ['NobodyOrderedThisSample'], order_time,
                             order_time - datetime.timedelta(minutes=59))

        # Old samples with no matching order; shows up in rx.
        p_old_extra = self._insert_participant(race_codes=[RACE_AIAN_CODE])
        self._insert_samples(p_old_extra, [BIOBANK_TESTS[-1]],
                             ['OldNobodyOrderedThisSample'], old_order_time,
                             old_order_time - datetime.timedelta(hours=1))

        # Withdrawn participants don't show up in any reports except withdrawal report.

        p_withdrawn_old_on_time = self._insert_participant(
            race_codes=[RACE_AIAN_CODE])
        # This updates the version of the participant and its HPO ID.
        self._insert_order(p_withdrawn_old_on_time, 'OldWithdrawnGoodOrder',
                           BIOBANK_TESTS[:2], old_order_time)
        p_withdrawn_old_on_time = self.participant_dao.get(
            p_withdrawn_old_on_time.participantId)
        self._insert_samples(
            p_withdrawn_old_on_time, BIOBANK_TESTS[:2],
            ['OldWithdrawnGoodSample1', 'OldWithdrawnGoodSample2'],
            old_within_24_hours,
            old_within_24_hours - datetime.timedelta(hours=1))
        self._withdraw(p_withdrawn_old_on_time, within_24_hours)

        p_withdrawn_late_and_missing = self._insert_participant()
        self._insert_order(p_withdrawn_late_and_missing, 'WithdrawnSlowOrder',
                           BIOBANK_TESTS[:2], order_time)
        self._insert_samples(p_withdrawn_late_and_missing, [BIOBANK_TESTS[0]],
                             ['WithdrawnLateSample'], late_time,
                             late_time - datetime.timedelta(minutes=59))
        p_withdrawn_late_and_missing = (self.participant_dao.get(
            p_withdrawn_late_and_missing.participantId))
        self._withdraw(p_withdrawn_late_and_missing, within_24_hours)

        p_withdrawn_old_late_and_missing = self._insert_participant()
        self._insert_order(p_withdrawn_old_late_and_missing,
                           'WithdrawnOldSlowOrder', BIOBANK_TESTS[:2],
                           old_order_time)
        self._insert_samples(p_withdrawn_old_late_and_missing,
                             [BIOBANK_TESTS[0]], ['WithdrawnOldLateSample'],
                             old_late_time,
                             old_late_time - datetime.timedelta(minutes=59))
        p_withdrawn_old_late_and_missing = (self.participant_dao.get(
            p_withdrawn_old_late_and_missing.participantId))
        self._withdraw(p_withdrawn_old_late_and_missing, old_late_time)

        p_withdrawn_extra = self._insert_participant(
            race_codes=[RACE_WHITE_CODE])
        self._insert_samples(p_withdrawn_extra, [BIOBANK_TESTS[-1]],
                             ['WithdrawnNobodyOrderedThisSample'], order_time,
                             order_time - datetime.timedelta(hours=1))
        self._withdraw(p_withdrawn_extra, within_24_hours)

        p_withdrawn_old_extra = self._insert_participant(
            race_codes=[RACE_AIAN_CODE])
        self._insert_samples(p_withdrawn_old_extra, [BIOBANK_TESTS[-1]],
                             ['WithdrawnOldNobodyOrderedThisSample'],
                             old_order_time,
                             old_order_time - datetime.timedelta(hours=1))
        self._withdraw(p_withdrawn_old_extra, within_24_hours)

        p_withdrawn_race_change = self._insert_participant(
            race_codes=[RACE_AIAN_CODE])
        p_withdrawn_race_change_id = to_client_participant_id(
            p_withdrawn_race_change.participantId)
        self._submit_race_questionnaire_response(p_withdrawn_race_change_id,
                                                 [RACE_WHITE_CODE])
        self._withdraw(p_withdrawn_race_change, within_24_hours)

        # for the same participant/test, 3 orders sent and only 2 samples received. Shows up in both
        # missing (we are missing one sample) and late (the two samples that were received were after
        # 24 hours.)
        p_repeated = self._insert_participant()
        for repetition in xrange(3):
            self._insert_order(
                p_repeated, 'RepeatedOrder%d' % repetition, [BIOBANK_TESTS[0]],
                two_days_ago + datetime.timedelta(hours=repetition))
            if repetition != 2:
                self._insert_samples(
                    p_repeated, [BIOBANK_TESTS[0]],
                    ['RepeatedSample%d' % repetition],
                    within_24_hours + datetime.timedelta(hours=repetition),
                    within_24_hours + datetime.timedelta(hours=repetition - 1))

        received, late, missing, withdrawals = 'rx.csv', 'late.csv', 'missing.csv', 'withdrawals.csv'
        exporter = InMemorySqlExporter(self)
        biobank_samples_pipeline._query_and_write_reports(
            exporter, file_time, received, late, missing, withdrawals)

        exporter.assertFilesEqual((received, late, missing, withdrawals))

        # sent-and-received: 4 on-time, 2 late, none of the missing/extra/repeated ones;
        # includes orders/samples from more than 7 days ago
        exporter.assertRowCount(received, 6)
        exporter.assertColumnNamesEqual(received, _CSV_COLUMN_NAMES)
        row = exporter.assertHasRow(
            received, {
                'biobank_id': to_client_biobank_id(p_on_time.biobankId),
                'sent_test': BIOBANK_TESTS[0],
                'received_test': BIOBANK_TESTS[0]
            })
        # Also check the values of all remaining fields on one row.
        self.assertEquals(row['source_site_name'],
                          'Monroeville Urgent Care Center')
        self.assertEquals(row['source_site_consortium'], 'Pittsburgh')
        self.assertEquals(row['source_site_mayolink_client_number'], '7035769')
        self.assertEquals(row['source_site_hpo'], 'PITT')
        self.assertEquals(row['source_site_hpo_type'], 'HPO')
        self.assertEquals(row['finalized_site_name'],
                          'Monroeville Urgent Care Center')
        self.assertEquals(row['finalized_site_consortium'], 'Pittsburgh')
        self.assertEquals(row['finalized_site_mayolink_client_number'],
                          '7035769')
        self.assertEquals(row['finalized_site_hpo'], 'PITT')
        self.assertEquals(row['finalized_site_hpo_type'], 'HPO')
        self.assertEquals(row['finalized_username'], '*****@*****.**')
        self.assertEquals(row['sent_finalized_time'],
                          database_utils.format_datetime(order_time))
        self.assertEquals(row['sent_collection_time'],
                          database_utils.format_datetime(order_time))
        self.assertEquals(row['sent_processed_time'],
                          database_utils.format_datetime(order_time))
        self.assertEquals(row['received_time'],
                          database_utils.format_datetime(within_24_hours))
        self.assertEquals(
            row['Sample Family Create Date'],
            database_utils.format_datetime(within_24_hours -
                                           datetime.timedelta(hours=1)))
        self.assertEquals(row['sent_count'], '1')
        self.assertEquals(row['received_count'], '1')
        self.assertEquals(row['sent_order_id'], 'OGoodOrder')
        self.assertEquals(row['received_sample_id'], 'GoodSample1')
        self.assertEquals(row['biospecimen_kit_id'], 'kit1')
        self.assertEquals(row['fedex_tracking_number'], 't1')
        # the other sent-and-received rows
        exporter.assertHasRow(
            received, {
                'biobank_id': to_client_biobank_id(p_on_time.biobankId),
                'sent_test': BIOBANK_TESTS[1]
            })
        exporter.assertHasRow(
            received, {
                'biobank_id': to_client_biobank_id(
                    p_late_and_missing.biobankId),
                'sent_test': BIOBANK_TESTS[0]
            })
        exporter.assertHasRow(
            received, {
                'biobank_id': to_client_biobank_id(p_old_on_time.biobankId),
                'sent_test': BIOBANK_TESTS[0]
            })
        exporter.assertHasRow(
            received, {
                'biobank_id': to_client_biobank_id(p_old_on_time.biobankId),
                'sent_test': BIOBANK_TESTS[1]
            })
        exporter.assertHasRow(
            received, {
                'biobank_id':
                to_client_biobank_id(p_old_late_and_missing.biobankId),
                'sent_test':
                BIOBANK_TESTS[0]
            })

        # sent-and-received: 2 late; don't include orders/samples from more than 7 days ago
        exporter.assertRowCount(late, 2)
        exporter.assertColumnNamesEqual(late, _CSV_COLUMN_NAMES)
        exporter.assertHasRow(
            late, {
                'biobank_id': to_client_biobank_id(
                    p_late_and_missing.biobankId),
                'sent_order_id': 'O%s' % o_late_and_missing.biobankOrderId,
                'elapsed_hours': '24'
            })
        exporter.assertHasRow(
            late, {
                'biobank_id': to_client_biobank_id(p_repeated.biobankId),
                'elapsed_hours': '45'
            })

        # orders/samples where something went wrong; don't include orders/samples from more than 7
        # days ago, or where 24 hours hasn't elapsed yet.
        exporter.assertRowCount(missing, 4)
        exporter.assertColumnNamesEqual(missing, _CSV_COLUMN_NAMES)
        # sample received, nothing ordered
        exporter.assertHasRow(
            missing, {
                'biobank_id': to_client_biobank_id(p_extra.biobankId),
                'sent_order_id': ''
            })
        # order received, no sample
        exporter.assertHasRow(
            missing, {
                'biobank_id': to_client_biobank_id(
                    p_two_days_missing.biobankId),
                'sent_order_id': 'OTwoDaysMissingOrder',
                'sent_test': BIOBANK_TESTS[0]
            })
        exporter.assertHasRow(
            missing, {
                'biobank_id': to_client_biobank_id(
                    p_two_days_missing.biobankId),
                'sent_order_id': 'OTwoDaysMissingOrder',
                'sent_test': BIOBANK_TESTS[1]
            })

        # 3 orders sent, only 2 received
        multi_sample_row = exporter.assertHasRow(
            missing, {
                'biobank_id': to_client_biobank_id(p_repeated.biobankId),
                'sent_count': '3',
                'received_count': '2'
            })

        # Also verify the comma-joined fields of the row with multiple orders/samples.
        self.assertItemsEqual(
            multi_sample_row['sent_order_id'].split(','),
            ['ORepeatedOrder1', 'ORepeatedOrder0', 'ORepeatedOrder2'])
        self.assertItemsEqual(
            multi_sample_row['received_sample_id'].split(','),
            ['RepeatedSample0', 'RepeatedSample1'])

        # We don't include the old withdrawal.
        exporter.assertRowCount(withdrawals, 5)
        exporter.assertHasRow(
            withdrawals, {
                'biobank_id':
                to_client_biobank_id(p_withdrawn_old_on_time.biobankId),
                'withdrawal_time':
                database_utils.format_datetime(within_24_hours),
                'is_native_american':
                'Y'
            })
        exporter.assertHasRow(
            withdrawals, {
                'biobank_id':
                to_client_biobank_id(p_withdrawn_late_and_missing.biobankId),
                'withdrawal_time':
                database_utils.format_datetime(within_24_hours),
                'is_native_american':
                'N'
            })
        exporter.assertHasRow(
            withdrawals, {
                'biobank_id': to_client_biobank_id(
                    p_withdrawn_extra.biobankId),
                'withdrawal_time':
                database_utils.format_datetime(within_24_hours),
                'is_native_american': 'N'
            })
        exporter.assertHasRow(
            withdrawals, {
                'biobank_id': to_client_biobank_id(
                    p_withdrawn_old_extra.biobankId),
                'withdrawal_time':
                database_utils.format_datetime(within_24_hours),
                'is_native_american': 'Y'
            })
        exporter.assertHasRow(
            withdrawals, {
                'biobank_id':
                to_client_biobank_id(p_withdrawn_race_change.biobankId),
                'withdrawal_time':
                database_utils.format_datetime(within_24_hours),
                'is_native_american':
                'N'
            })