Example #1
    def test_multi_key_sorts(self):
        expected_ids =\
            ['a_1', 'a_2', 'a_4', 'a_6', 'a_8', 'a_11', 'a_3', 'a_5', 'a_7', 'a_9', 'a_10']
        expected_pids =\
            ['p_1', 'p_1', 'p_1', 'p_1', 'p_1', 'p_1', 'p_2', 'p_2', 'p_2', 'p_2', 'p_2']
        expected_vals1 =\
            ['100', '101', '101', '102', '102', '104', '101', '102', '102', '103', '104']
        expected_vals2 =\
            ['100', '102', '101', '102', '104', '104', '101', '102', '103', '105', '105']

        ds1 = dataset.Dataset(io.StringIO(sorting_dataset), verbose=False)
        ds1.sort('created_at')
        ds1.sort('patient_id')
        self.assertListEqual([0, 1, 3, 5, 7, 10, 2, 4, 6, 8, 9], ds1.index_)
        self.assertListEqual(expected_ids, ds1.field_by_name('id'))
        self.assertListEqual(expected_pids, ds1.field_by_name('patient_id'))
        self.assertListEqual(expected_vals1, ds1.field_by_name('created_at'))
        self.assertListEqual(expected_vals2, ds1.field_by_name('updated_at'))
        # for i in range(ds1.row_count()):
        #     utils.print_diagnostic_row("{}".format(i), ds1, i, ds1.names_)

        ds2 = dataset.Dataset(io.StringIO(sorting_dataset), verbose=False)
        ds2.sort(('patient_id', 'created_at'))
        self.assertListEqual([0, 1, 3, 5, 7, 10, 2, 4, 6, 8, 9], ds2.index_)
        self.assertListEqual(expected_ids, ds2.field_by_name('id'))
        self.assertListEqual(expected_pids, ds2.field_by_name('patient_id'))
        self.assertListEqual(expected_vals1, ds2.field_by_name('created_at'))
        self.assertListEqual(expected_vals2, ds2.field_by_name('updated_at'))
Example #2
    def test_single_key_sorts(self):
        ds1 = dataset.Dataset(io.StringIO(sorting_dataset), verbose=False)
        ds1.sort('patient_id')
        self.assertListEqual([0, 1, 3, 5, 7, 10, 2, 4, 6, 8, 9], ds1.index_)

        ds2 = dataset.Dataset(io.StringIO(sorting_dataset), verbose=False)
        ds2.sort(('patient_id', ))
        self.assertListEqual([0, 1, 3, 5, 7, 10, 2, 4, 6, 8, 9], ds2.index_)
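
The two tests above rely on the same behaviour: sort accepts either a single field name or a tuple of key names, the underlying rows stay where they are, index_ records the resulting row permutation, and field_by_name returns values in the sorted order. A minimal standalone sketch of that behaviour on a hypothetical two-column CSV (the csv_text below is invented for illustration and is not the sorting_dataset fixture used by the tests):

import io
from exetera.core import dataset

# hypothetical input, not the sorting_dataset fixture
csv_text = ("id,patient_id\n"
            "a_1,p_2\n"
            "a_2,p_1\n"
            "a_3,p_1\n")

ds = dataset.Dataset(io.StringIO(csv_text), verbose=False)
ds.sort('patient_id')                 # a single key can be passed as a plain string
print(ds.index_)                      # permutation of the original row indices
print(ds.field_by_name('id'))         # field values come back in sorted order

ds2 = dataset.Dataset(io.StringIO(csv_text), verbose=False)
ds2.sort(('patient_id', 'id'))        # multiple keys are passed as a tuple
print(ds2.index_)
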
Example #3
    def test_construction_with_early_filter(self):
        s = io.StringIO(small_dataset)
        ds = dataset.Dataset(s,
                             early_filter=('bar', lambda x: x in ('a', )),
                             verbose=False)

        # field names and fields must match in length
        self.assertEqual(len(ds.names_), len(ds.fields_))

        self.assertEqual(ds.row_count(), 2)

        self.assertEqual(ds.names_, ['id', 'patient_id', 'foo', 'bar'])

        expected_values = [(0, [
            '0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
            '11111111111111111111111111111111', '', 'a'
        ]),
                           (2, [
                               '02222222222222222222222222222222',
                               '11111111111111111111111111111111', 'False', 'a'
                           ])]

        # value works as expected
        for row in range(len(expected_values)):
            for col in range(len(expected_values[0][1])):
                self.assertEqual(ds.value(row, col),
                                 expected_values[row][1][col])

        # value_from_fieldname works as expected
        sorted_names = sorted(ds.names_)
        for n in sorted_names:
            index = ds.names_.index(n)
            for row in range(len(expected_values)):
                self.assertEqual(ds.value_from_fieldname(row, n),
                                 expected_values[row][1][index])
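
The early_filter argument used above is a (column_name, predicate) pair that is applied while the file is being read, so rows that fail the predicate never enter the dataset. A minimal sketch with a hypothetical CSV (csv_text and its columns are invented, not the small_dataset fixture):

import io
from exetera.core import dataset

# hypothetical input; only rows whose 'bar' value is 'a' should survive the load
csv_text = ("id,bar\n"
            "1,a\n"
            "2,b\n"
            "3,a\n")

ds = dataset.Dataset(io.StringIO(csv_text),
                     early_filter=('bar', lambda x: x == 'a'),
                     verbose=False)
print(ds.row_count())            # expected: 2, the row with bar == 'b' is dropped
print(ds.field_by_name('id'))    # expected: ['1', '3'] (values are read as strings)
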
Example #4
    def __repr__(self):
        return f"{self.active}"


def sort_by_test_index_count(test_indices_by_patient):
    # order (patient, test-index) pairs so that patients with the most
    # test rows come first
    sorted_patient_test_index_pairs = sorted(
        test_indices_by_patient.items(),
        key=lambda t: len(t[1].indices),
        reverse=True)
    return sorted_patient_test_index_pairs


# start
with open(t_file_name) as f:
    t_ds = dataset.Dataset(f)
t_dtss = t_ds.field_by_name('date_taken_specific')
t_patients = group_new_test_indices_by_patient(t_ds)

# get stats and print delta for old tests
# ---------------------------------------

# a_keys = ('id', 'patient_id', 'country_code', 'created_at', 'updated_at', 'version', 'had_covid_test', 'tested_covid_positive')
a_keys = ('patient_id', 'updated_at', 'had_covid_test',
          'tested_covid_positive')
with open(a_file_name) as f:
    a_ds = dataset.Dataset(f, keys=a_keys, show_progress_every=5000000)
    # show_progress_every=5000000, stop_after=1000000)
print('sorting')
a_ds.sort(keys='updated_at')
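
sort_by_test_index_count above only requires that each value in the dictionary exposes an indices collection, so it can be exercised in isolation. A small sketch with a stand-in record type (the TestIndices class and the sample data are invented for illustration):

from dataclasses import dataclass

@dataclass
class TestIndices:
    # stand-in for whatever group_new_test_indices_by_patient stores per patient
    indices: list

by_patient = {
    'p_1': TestIndices(indices=[0, 3]),
    'p_2': TestIndices(indices=[1, 2, 4]),
    'p_3': TestIndices(indices=[5]),
}

# patients with the most test rows come first
for pid, entry in sort_by_test_index_count(by_patient):
    print(pid, len(entry.indices))
# expected output: p_2 3, p_1 2, p_3 1
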
Example #5
def split_data(patient_data,
               assessment_data,
               bucket_size=500000,
               territories=None):

    with open(patient_data) as f:
        p_ds = dataset.Dataset(f,
                               keys=('id', 'created_at'),
                               show_progress_every=500000)
        # show_progress_every=500000, stop_after=500000)
        p_ds.sort(('created_at', 'id'))
        p_ids = p_ds.field_by_name('id')
        p_dts = p_ds.field_by_name('created_at')

    # put patient ids into buckets of bucket_size consecutive sorted rows
    buckets = dict()
    bucket_index = 0
    bucket_count = 0
    for i_r in range(p_ds.row_count()):
        if bucket_index == bucket_size:
            bucket_index = 0
            bucket_count += 1
        buckets[p_ids[i_r]] = bucket_count
        bucket_index += 1

    filenames = list()
    for b in range(bucket_count + 1):
        destination_filename = patient_data[:-4] + f"_{b:04d}" + ".csv"
        filenames.append(destination_filename)
    print(filenames)
    sorted_indices = p_ds.index_
    del p_ds

    patient_splitter(patient_data, filenames, sorted_indices, bucket_size)

    print('rows in final bucket:', bucket_index)
    with open(assessment_data) as f:
        a_ds = dataset.Dataset(f,
                               keys=('patient_id', 'other_symptoms'),
                               show_progress_every=500000)

    print(utils.build_histogram(buckets.values()))

    print('associating assessments with patients')
    orphaned_assessments = 0
    a_buckets = list()
    a_pids = a_ds.field_by_name('patient_id')
    a_os = a_ds.field_by_name('other_symptoms')
    for i_r in range(a_ds.row_count()):
        if a_pids[i_r] in buckets:
            a_buckets.append(buckets[a_pids[i_r]])
        else:
            orphaned_assessments += 1
            a_buckets.append(-1)

    del a_ds
    print('orphaned_assessments:', orphaned_assessments)

    print(f'{bucket_count + 1} buckets')
    for i in range(bucket_count + 1):
        print('bucket', i)
        destination_filename = assessment_data[:-4] + f"_{i:04d}" + ".csv"
        print(destination_filename)
        # with open(assessment_data) as f:
        #     a_ds = dataset.Dataset(f, filter_fn=lambda j: a_buckets[j] == i, show_progress_every=500000)
        #
        # del a_ds
        assessment_splitter(assessment_data, destination_filename, a_buckets,
                            i)

    print('done!')
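
The bucket assignment loop in split_data fills each bucket with bucket_size consecutive rows of the patient table after it has been sorted by ('created_at', 'id'), so the i-th sorted row lands in bucket i // bucket_size. A sketch of that equivalence (assign_buckets is an illustrative helper, not part of the module):

def assign_buckets(p_ids, bucket_size):
    # same mapping as the bucket_index / bucket_count loop in split_data
    return {pid: i // bucket_size for i, pid in enumerate(p_ids)}

# with bucket_size=2, six ids land in buckets 0, 0, 1, 1, 2, 2
print(assign_buckets(['a', 'b', 'c', 'd', 'e', 'f'], 2))
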
Example #6
    def test_sort(self):
        s = io.StringIO(small_dataset)
        ds = dataset.Dataset(s, verbose=False)

        ds.sort(('patient_id', 'id'))
        row_permutations = [2, 0, 1]
Example #7
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from exetera.core import dataset, utils

# fn1 = '/home/ben/covid/patients_export_geocodes_20200406050002.csv'
# fn2 = '/home/ben/covid/patients_export_geocodes_20200413050002.csv'

fn1 = '/home/ben/covid/patients_export_geocodes_20200413050002.csv'
fn2 = '/home/ben/covid/patients_export_geocodes_20200416050002.csv'

print('loading file 1')
with open(fn1) as f1:
    ds1 = dataset.Dataset(f1)
ds1.sort(('id', ))
print('done')

print()
print('loading file 2')
with open(fn2) as f2:
    ds2 = dataset.Dataset(f2)
ds2.sort(('id', ))
print('done')

fields = ('id', 'updated_at', 'year_of_birth', 'height_cm', 'zipcode',
          'outward_postcode')
i = 0
j = 0
matches = 0
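
The excerpt stops just before the comparison itself; the i, j and matches counters suggest a two-cursor walk over the two id columns, which both datasets have just been sorted by. A hypothetical sketch of such a loop (an assumption, not the original continuation):

# hypothetical continuation: count ids common to both sorted exports
ids1 = ds1.field_by_name('id')
ids2 = ds2.field_by_name('id')
while i < len(ids1) and j < len(ids2):
    if ids1[i] == ids2[j]:
        matches += 1
        i += 1
        j += 1
    elif ids1[i] < ids2[j]:
        i += 1
    else:
        j += 1
print('matches:', matches)
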
Example #8
def import_with_schema(timestamp, dest_file_name, schema_file, files,
                       overwrite):
    print(timestamp)
    print(schema_file)
    print(files)

    with open(schema_file) as sf:
        schema = load_schema(sf)

    any_parts_present = False
    for sk in schema.keys():
        if sk in files:
            any_parts_present = True
    if not any_parts_present:
        raise ValueError(
            "none of the data sources in 'files' contain relevant data to the schema"
        )

    stop_after = {}
    reserved_column_names = ('j_valid_from', 'j_valid_to')
    datastore = per.DataStore()

    if overwrite:
        mode = 'w'
    else:
        mode = 'r+'

    with h5py.File(dest_file_name, mode) as hf:
        for sk in schema.keys():
            if sk in reserved_column_names:
                msg = "{} is a reserved column name: reserved names are {}"
                raise ValueError(msg.format(sk, reserved_column_names))

            if sk not in files:
                continue

            fields = schema[sk].fields

            with open(files[sk]) as f:
                ds = dataset.Dataset(f, stop_after=1)
            names = set(ds.names_)
            missing_names = names.difference(fields.keys())
            if len(missing_names) > 0:
                msg = "The following fields are present in {} but not part of the schema: {}"
                print("Warning:", msg.format(files[sk], missing_names))
                # raise ValueError(msg.format(files[sk], missing_names))

        for sk in schema.keys():
            if sk not in files:
                continue

            fields = schema[sk].fields
            show_every = 100000

            with open(files[sk]) as f:
                ds = dataset.Dataset(f, stop_after=1)
            names = set(ds.names_)
            missing_names = names.difference(fields.keys())

            DatasetImporter(datastore,
                            files[sk],
                            hf,
                            sk,
                            schema[sk],
                            timestamp,
                            stop_after=stop_after.get(sk, None),
                            show_progress_every=show_every)

            print(sk, hf.keys())
            table = hf[sk]
            ids = datastore.get_reader(table[list(table.keys())[0]])
            jvf = datastore.get_timestamp_writer(table, 'j_valid_from')
            ftimestamp = utils.string_to_datetime(timestamp).timestamp()
            valid_froms = np.full(len(ids), ftimestamp)
            jvf.write(valid_froms)
            jvt = datastore.get_timestamp_writer(table, 'j_valid_to')
            valid_tos = np.full(len(ids), ops.MAX_DATETIME.timestamp())
            jvt.write(valid_tos)

        print(hf.keys())
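
The stop_after=1 pattern used above reads just enough of each CSV to learn its column names, which keeps the schema check cheap before the full import runs. A minimal sketch of that check on its own (schema_fields stands in for schema[sk].fields and the CSV content is invented):

import io
from exetera.core import dataset

# stand-in for schema[sk].fields: field name -> field definition
schema_fields = {'id': None, 'patient_id': None, 'foo': None}

csv_text = "id,patient_id,bar\n1,p_1,x\n"
ds = dataset.Dataset(io.StringIO(csv_text), stop_after=1)

# columns present in the file but unknown to the schema
missing_names = set(ds.names_).difference(schema_fields.keys())
print(missing_names)    # expected: {'bar'}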