Example #1
    def test_upload_model(self):
        client = Client('dmd')

        DMDProduct.objects.create(
            dmdid=327368008,
            bnf_code='0803042A0AAABAB',
            vpid=327368008,
            name='Bicalutamide 150mg tablets',
        )

        DMDVmpp.objects.create(
            vppid=1206011000001108,
            nm='Bicalutamide 150mg tablets 28 tablet',
            vpid=327368008,
        )

        NCSOConcession.objects.create(
            drug='Bicalutamide 150mg tablets',
            pack_size=28,
            price_concession_pence=499,
            vmpp_id=1206011000001108,
            date='2017-11-01',
        )

        TariffPrice.objects.create(
            vmpp_id=1206011000001108,
            product_id=327368008,
            price_pence=422,
            tariff_category_id=11,
            date='2017-11-01',
        )

        client.upload_model(DMDProduct)
        client.upload_model(DMDVmpp)
        client.upload_model(NCSOConcession)
        client.upload_model(TariffPrice)

        table = client.get_table('product')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['name'], 'Bicalutamide 150mg tablets')

        table = client.get_table('vmpp')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['nm'], 'Bicalutamide 150mg tablets 28 tablet')

        table = client.get_table('ncsoconcession')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['drug'], 'Bicalutamide 150mg tablets')

        table = client.get_table('tariffprice')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['price_pence'], 422)
Example #2
    def fill_views(self):
        client = Client('hscic')

        pool = Pool(processes=len(self.view_paths))
        tables = []

        prescribing_date = ImportLog.objects.latest_in_category(
            'prescribing').current_at.strftime('%Y-%m-%d')

        for path in self.view_paths:
            table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
            table = client.get_table(table_name)
            tables.append(table)

            with open(path) as f:
                sql = f.read()

            substitutions = {'this_month': prescribing_date}
            args = [table.table_id, sql, substitutions]
            pool.apply_async(query_and_export, args)

        pool.close()
        pool.join()  # wait for all worker processes to exit

        for table in tables:
            self.download_and_import(table)
            self.log("-------------")

        with connection.cursor() as cursor:
            for view_id in self.materialized_views:
                # This is quite slow: it can take up to 10 minutes.
                cursor.execute("REFRESH MATERIALIZED VIEW %s" % view_id)
Example #3
    def fill_views(self):
        client = Client('hscic')

        pool = Pool(processes=len(self.view_paths))
        tables = []

        prescribing_date = ImportLog.objects.latest_in_category(
            'prescribing').current_at.strftime('%Y-%m-%d')

        for path in self.view_paths:
            table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
            table = client.get_table(table_name)
            tables.append(table)

            with open(path) as f:
                sql = f.read()

            substitutions = {'this_month': prescribing_date}
            args = [table.name, sql, substitutions]
            pool.apply_async(query_and_export, args)

        pool.close()
        pool.join()  # wait for all worker processes to exit

        for table in tables:
            self.download_and_import(table)
            self.log("-------------")
Example #4
def write_zero_prescribing_codes_table(level):
    """Given a BNF `level` (`section`, `chapter`, `paragraph`, etc), write
    a table in bigquery listing all such levels that have zero prescribing.

    Returns a bigquery Table.

    """
    logger.info("Scanning %s to see if it has zero prescribing" % level)
    sql = """
    SELECT
      bnf.%s
    FROM
      {hscic}.normalised_prescribing_standard AS prescribing
    RIGHT JOIN
      {hscic}.bnf bnf
    ON
      prescribing.bnf_code = bnf.presentation_code
    WHERE (
        bnf.presentation_code NOT LIKE '2%%'  -- appliances, etc
    )
    GROUP BY
      bnf.%s
    HAVING
      COUNT(prescribing.bnf_code) = 0
    """ % (level, level)
    client = Client('tmp_eu')
    table = client.get_table('unused_codes_%s' % level)
    table.insert_rows_from_query(sql)
    return table
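
A minimal driver sketch (hypothetical, not part of the source) showing how the function above could be applied across several BNF levels; the level names are taken from the docstring:

def write_all_zero_prescribing_tables():
    # Hypothetical helper: build one unused-codes table per BNF level.
    levels = ['chapter', 'section', 'paragraph']
    return [write_zero_prescribing_codes_table(level) for level in levels]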
Example #5
def query_and_export(table_name, sql, substitutions):
    try:
        client = Client('hscic')
        table = client.get_table(table_name)

        storage_prefix = 'hscic/views/{}-'.format(table_name)
        logger.info("Generating view %s and saving to %s" %
                    (table_name, storage_prefix))

        logger.info("Running SQL for %s: %s" % (table_name, sql))
        table.insert_rows_from_query(sql, substitutions=substitutions)

        exporter = TableExporter(table, storage_prefix)

        logger.info('Deleting existing data in storage at %s' % storage_prefix)
        exporter.delete_from_storage()

        logger.info('Exporting data to storage at %s' % storage_prefix)
        exporter.export_to_storage()

        logger.info("View generation complete for %s" % table_name)
    except Exception:
        # Log the formatted error, because the multiprocessing pool
        # this is called from only shows the error message (with no
        # traceback)
        logger.error(traceback.format_exc())
        raise
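
As the fill_views examples above show, query_and_export is meant to be dispatched from a multiprocessing pool. A condensed sketch under the same assumptions (the table names and SQL strings here are placeholders):

from multiprocessing import Pool

pool = Pool(processes=2)
for table_name, sql in [('vw__spending', 'SELECT ...'),
                        ('vw__items', 'SELECT ...')]:
    pool.apply_async(query_and_export,
                     [table_name, sql, {'this_month': '2017-11-01'}])
pool.close()
pool.join()  # wait for all exports to finish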
Example #6
    def backup_table(self, table_name):
        client = Client("hscic")
        sql = "SELECT max(month) FROM {hscic}.%s" % table_name
        latest_date = client.query(sql).rows[0][0]
        latest_year_and_month = latest_date.strftime("%Y_%m")
        table = client.get_table(table_name)

        storage_client = StorageClient()
        bucket = storage_client.bucket()
        year_and_months = set()

        prefix_base = "backups/{}/".format(table_name)

        for blob in bucket.list_blobs(prefix=prefix_base):
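            # Every blob under this prefix is assumed to follow the
            # backups/<table>/<YYYY_MM>/... layout, so the pattern matches.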
            match = re.search(r"/(\d{4}_\d{2})/", blob.name)
            year_and_months.add(match.groups()[0])

        if latest_year_and_month in year_and_months:
            print("{} table already backed up for {}".format(
                table_name, latest_year_and_month))
            return

        storage_prefix = "{}/{}/{}-".format(prefix_base, latest_year_and_month,
                                            table_name)
        exporter = TableExporter(table, storage_prefix)
        exporter.export_to_storage()
Example #7
    def upload_to_bq(self):
        client = Client("dmd")
        for model in apps.get_app_config("dmd").get_models():
            client.upload_model(model)

        for model in [VMP, AMP, VMPP, AMPP]:
            table_id = "{}_full".format(model.obj_type)
            table = client.get_table(table_id)
            sql = self.sql_for_cls(model)
            table.insert_rows_from_query(sql)
Example #8
def update_bnf_table():
    """Update `bnf` table from cloud-stored CSV"""
    storage_client = StorageClient()
    bucket = storage_client.get_bucket()
    blobs = bucket.list_blobs(prefix="hscic/bnf_codes/")
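    # Pick the most recent upload: blob names under this prefix are assumed
    # to sort chronologically, so a reverse sort puts the latest file first.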
    blobs = sorted(blobs, key=lambda blob: blob.name, reverse=True)
    blob = blobs[0]

    bq_client = BQClient("hscic")
    table = bq_client.get_table("bnf")
    table.insert_rows_from_storage(blob.name, skip_leading_rows=1)
Example #9
    def handle(self, *args, measure_ids, **kwargs):
        base_path = os.path.join(settings.APPS_ROOT, "bq_public_tables")

        client = Client("public")

        with open(os.path.join(base_path, "_measure_template.sql")) as f:
            measure_template_sql = f.read()

        if measure_ids:
            measures = Measure.objects.filter(id__in=measure_ids)
        else:
            measures = Measure.objects.all()

        for measure in measures:
            table_name = "measure_" + measure.id
            print(table_name)
            table = client.get_table(table_name)

            numerator_sql = """
            SELECT
                CAST(month AS DATE) AS month,
                practice AS practice_id,
                {numerator_columns}
            FROM {numerator_from}
            WHERE {numerator_where}
            GROUP BY month, practice_id
            """.format(
                numerator_columns=measure.numerator_columns,
                numerator_from=measure.numerator_from,
                numerator_where=measure.numerator_where,
            )

            denominator_sql = """
            SELECT
                CAST(month AS DATE) AS month,
                practice AS practice_id,
                {denominator_columns}
            FROM {denominator_from}
            WHERE {denominator_where}
            GROUP BY month, practice_id
            """.format(
                denominator_columns=measure.denominator_columns,
                denominator_from=measure.denominator_from,
                denominator_where=measure.denominator_where,
            )

            sql = partially_format(
                measure_template_sql,
                numerator_sql=numerator_sql,
                denominator_sql=denominator_sql,
            )

            table.insert_rows_from_query(sql)
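
partially_format is a project-specific helper; a plausible sketch (an assumption, not the project's actual implementation) is a format() variant that fills only the supplied placeholders and leaves others, such as {hscic}, for a later substitution pass:

import re

def partially_format(template, **substitutions):
    # Hypothetical implementation: swap in {name} only when `name` was
    # supplied; leave every other {...} placeholder untouched.
    def replace(match):
        key = match.group(1)
        return str(substitutions[key]) if key in substitutions else match.group(0)
    return re.sub(r"\{(\w+)\}", replace, template)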
Example #10
    def handle(self, *args, **kwargs):
        base_path = os.path.join(settings.APPS_ROOT, "bq_public_tables")

        client = Client("public")

        for path in glob.glob(os.path.join(base_path, "*.sql")):
            table_name = os.path.splitext(os.path.basename(path))[0]
            table = client.get_table(table_name)

            with open(path) as f:
                sql = f.read()

            table.insert_rows_from_query(sql)
Example #11
    def test_upload_model(self):
        client = Client('dmd')
        client.upload_model(VMPP)
        table = client.get_table('vmpp')
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 4)

        for row in rows:
            if row['id'] != 1079211000001106:
                continue

            for k, v in {
                    'invalid': False,
                    'nm': 'Chlortalidone 50mg tablets 28 tablet',
                    'vmp': 317935006,
                    'qtyval': Decimal('28.00'),
                    'qty_uom': 428673006,
                    'combpack': None,
                    'bnf_code': '0202010F0AAAAAA',
            }.items():
                self.assertEqual(row[k], v)
Example #12
    def test_upload_model(self):
        client = Client("dmd")
        client.upload_model(VMPP)
        table = client.get_table("vmpp")
        rows = list(table.get_rows_as_dicts())
        self.assertEqual(len(rows), 4)

        for row in rows:
            if row["id"] != 1079211000001106:
                continue

            for k, v in {
                    "invalid": False,
                    "nm": "Chlortalidone 50mg tablets 28 tablet",
                    "vmp": 317935006,
                    "qtyval": Decimal("28.00"),
                    "qty_uom": 428673006,
                    "combpack": None,
                    "bnf_code": "0202010F0AAAAAA",
            }.items():
                self.assertEqual(row[k], v)
Example #13
    def get_table(self, table_name):
        client = Client("measures")
        return client.get_table(table_name)
Example #14
    def handle(self, *args, **options):
        path = options["filename"]
        head, filename = os.path.split(path)
        converted_path = "{}_formatted.CSV".format(os.path.splitext(path)[0])
        _, year_and_month = os.path.split(head)

        logger.info("path: %s", path)
        logger.info("converted_path: %s", converted_path)
        logger.info("year_and_month: %s", year_and_month)

        date = year_and_month + "_01"
        try:
            datetime.datetime.strptime(date, "%Y_%m_%d")
        except ValueError:
            message = ("The file path must have a YYYY_MM "
                       "date component in the containing directory: ")
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client("hscic")
        tmp_dataset_client = Client("tmp_eu")

        # Check that we haven't already processed data for this month
        sql = """SELECT COUNT(*)
        FROM {dataset}.prescribing
        WHERE month = TIMESTAMP('{date}')""".format(
            dataset=hscic_dataset_client.dataset_id,
            date=date.replace("_", "-"))
        try:
            results = hscic_dataset_client.query(sql)
            assert results.rows[0][0] == 0
        except NotFound:
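            # NotFound here means the prescribing table doesn't exist yet,
            # in which case there is nothing to guard against.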
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
        gcs_path = "hscic/prescribing/{}/{}".format(year_and_month, filename)

        logger.info("raw_data_table_name: %s", raw_data_table_name)
        logger.info("gcs_path: %s", gcs_path)

        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name, RAW_PRESCRIBING_SCHEMA, gcs_path)

        # Append aggregated data to prescribing table
        sql = """
         SELECT
          Area_Team_Code AS sha,
          LEFT(PCO_Code, 3) AS pct,
          Practice_Code AS practice,
          BNF_Code AS bnf_code,
          BNF_Description AS bnf_name,
          SUM(Items) AS items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        """ % (
            date.replace("_", "-"),
            raw_data_table.qualified_name,
        )

        logger.info("sql: %s", sql)

        prescribing_table = hscic_dataset_client.get_table("prescribing")
        prescribing_table.insert_rows_from_query(
            sql, legacy=True, write_disposition="WRITE_APPEND")

        # Write aggregated data to new table, for download
        sql = """
         SELECT
          LEFT(PCO_Code, 3) AS pct_id,
          Practice_Code AS practice_code,
          BNF_Code AS presentation_code,
          SUM(Items) AS total_items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          '%s' AS processing_date,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           presentation_code, pct_id, practice_code
        """ % (
            date,
            raw_data_table.qualified_name,
        )

        fmtd_data_table_name = "formatted_prescribing_%s" % year_and_month

        logger.info("sql: %s", sql)
        logger.info("fmtd_data_table_name: %s", fmtd_data_table_name)

        fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
        fmtd_data_table.insert_rows_from_query(sql, legacy=True)

        # Export new table to storage, and download
        exporter = TableExporter(fmtd_data_table, gcs_path + "_formatted-")
        exporter.export_to_storage(print_header=False)

        with tempfile.NamedTemporaryFile(dir=head) as f:
            exporter.download_from_storage_and_unzip(f)

            # Sort the output.
            #
            # Why? Because this is equivalent to CLUSTERing the table on
            # loading, but less resource-intensive than doing it in
            # Postgres. And the table is too big to sort within BigQuery.
            subprocess.call(
                "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" %
                (f.name, converted_path),
                shell=True,
            )
Example #15
    def handle(self, *args, **kwargs):
        update_bnf_table()

        client = BQClient("hscic")

        table = client.get_table("practices")
        table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

        table = client.get_table("presentation")
        table.insert_rows_from_pg(
            models.Presentation,
            schemas.PRESENTATION_SCHEMA,
            transformer=schemas.presentation_transform,
        )

        table = client.get_table("practice_statistics")
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = "date"
        columns[-1] = "practice_id"
        table.insert_rows_from_pg(
            models.PracticeStatistics,
            schema=schemas.PRACTICE_STATISTICS_SCHEMA,
            columns=columns,
            transformer=schemas.statistics_transform,
        )

        sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table("practice_statistics_all_years")
        sql = """SELECT *
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
        substitutions = {"date": last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition="WRITE_APPEND",
                                     substitutions=substitutions)

        table = client.get_table("pcns")
        table.insert_rows_from_pg(models.PCN, schemas.PCN_SCHEMA)

        table = client.get_table("ccgs")
        table.insert_rows_from_pg(models.PCT,
                                  schemas.CCG_SCHEMA,
                                  transformer=schemas.ccgs_transform)

        table = client.get_table("stps")
        table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

        table = client.get_table("regional_teams")
        table.insert_rows_from_pg(models.RegionalTeam,
                                  schemas.REGIONAL_TEAM_SCHEMA)

        date = models.ImportLog.objects.latest_in_category(
            "prescribing").current_at
        table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
        sql = """SELECT * FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('{date}')"""
        substitutions = {"date": date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
Example #16
    def handle(self, *args, **kwargs):
        # Make sure that PracticeStatistics and Prescription tables both have
        # latest data.
        latest_practice_statistic_date = models.PracticeStatistics.objects.aggregate(
            Max("date"))["date__max"]
        latest_prescription_date = models.Prescription.objects.aggregate(
            Max("processing_date"))["processing_date__max"]

        if latest_practice_statistic_date != latest_prescription_date:
            msg = ("Latest PracticeStatistics object has date {}, "
                   "while latest Prescription object has processing_date {}".
                   format(latest_practice_statistic_date,
                          latest_prescription_date))
            raise CommandError(msg)

        date = latest_prescription_date

        update_bnf_table()

        client = BQClient("hscic")

        table = client.get_table("practices")
        table.insert_rows_from_pg(models.Practice, schemas.PRACTICE_SCHEMA)

        table = client.get_table("presentation")
        table.insert_rows_from_pg(
            models.Presentation,
            schemas.PRESENTATION_SCHEMA,
            transformer=schemas.presentation_transform,
        )

        table = client.get_table("practice_statistics")
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = "date"
        columns[-1] = "practice_id"
        table.insert_rows_from_pg(
            models.PracticeStatistics,
            schema=schemas.PRACTICE_STATISTICS_SCHEMA,
            columns=columns,
            transformer=schemas.statistics_transform,
        )

        sql = "SELECT MAX(month) FROM {hscic}.practice_statistics_all_years"
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table("practice_statistics_all_years")
        sql = """SELECT *
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')"""
        substitutions = {"date": last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition="WRITE_APPEND",
                                     substitutions=substitutions)

        table = client.get_table("ccgs")
        table.insert_rows_from_pg(models.PCT,
                                  schemas.CCG_SCHEMA,
                                  transformer=schemas.ccgs_transform)

        table = client.get_table("stps")
        table.insert_rows_from_pg(models.STP, schemas.STP_SCHEMA)

        table = client.get_table("regional_teams")
        table.insert_rows_from_pg(models.RegionalTeam,
                                  schemas.REGIONAL_TEAM_SCHEMA)

        table = client.get_table("prescribing_" + date.strftime("%Y_%m"))
        sql = """SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')"""
        substitutions = {"date": date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
Example #17
    def handle(self, *args, **options):
        path = options["filename"]
        head, filename = os.path.split(path)
        _, year_and_month = os.path.split(head)

        logger.info("path: %s", path)
        logger.info("year_and_month: %s", year_and_month)

        date = year_and_month + "_01"
        try:
            datetime.datetime.strptime(date, "%Y_%m_%d")
        except ValueError:
            message = ("The file path must have a YYYY_MM "
                       "date component in the containing directory: ")
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client("hscic")
        tmp_dataset_client = Client("tmp_eu")

        # Check that we haven't already processed data for this month
        sql = """SELECT COUNT(*)
        FROM {hscic}.prescribing_v2
        WHERE month = TIMESTAMP('{date}')"""

        try:
            results = hscic_dataset_client.query(
                sql, substitutions={"date": date.replace("_", "-")})
            assert results.rows[0][0] == 0
        except NotFound:
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = "raw_prescribing_data_{}".format(year_and_month)
        gcs_path = "hscic/prescribing_v2/{}/{}".format(year_and_month,
                                                       filename)

        logger.info("raw_data_table_name: %s", raw_data_table_name)
        logger.info("gcs_path: %s", gcs_path)

        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name, RAW_PRESCRIBING_SCHEMA_V2, gcs_path)

        # Append aggregated data to prescribing table
        sql = """
         SELECT
          AREA_TEAM_CODE AS sha,
          LEFT(PCO_CODE, 3) AS pct,
          PRACTICE_CODE AS practice,
          BNF_CODE AS bnf_code,
          BNF_DESCRIPTION AS bnf_name,
          SUM(ITEMS) AS items,
          SUM(NIC) AS net_cost,
          SUM(ACTUAL_COST) AS actual_cost,
          SUM(TOTAL_QUANTITY) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE PRACTICE_CODE NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        """ % (
            date.replace("_", "-"),
            raw_data_table.qualified_name,
        )

        logger.info("sql: %s", sql)

        prescribing_table = hscic_dataset_client.get_table("prescribing_v2")
        prescribing_table.insert_rows_from_query(
            sql, legacy=True, write_disposition="WRITE_APPEND")

        ImportLog.objects.create(current_at=date.replace("_", "-"),
                                 filename=filename,
                                 category="prescribing")
Example #18
    def handle(self, *args, **kwargs):
        # Make sure that PracticeStatistics and Prescription tables both have
        # latest data.
        latest_practice_statistic_date = models.PracticeStatistics.objects\
            .aggregate(Max('date'))['date__max']
        latest_prescription_date = models.Prescription.objects\
            .aggregate(Max('processing_date'))['processing_date__max']

        if latest_practice_statistic_date != latest_prescription_date:
            msg = 'Latest PracticeStatistics object has date {}, '\
                'while latest Prescription object has processing_date {}'\
                .format(latest_practice_statistic_date, latest_prescription_date)
            raise CommandError(msg)

        date = latest_prescription_date

        update_bnf_table()

        client = BQClient('hscic')

        table = client.get_table('practices')
        columns = [field.name for field in schemas.PRACTICE_SCHEMA]
        table.insert_rows_from_pg(models.Practice, columns)

        table = client.get_table('presentation')
        columns = [field.name for field in schemas.PRESENTATION_SCHEMA]
        table.insert_rows_from_pg(models.Presentation, columns,
                                  schemas.presentation_transform)

        table = client.get_table('practice_statistics')
        columns = [field.name for field in schemas.PRACTICE_STATISTICS_SCHEMA]
        columns[0] = 'date'
        columns[-1] = 'practice_id'
        table.insert_rows_from_pg(models.PracticeStatistics, columns,
                                  schemas.statistics_transform)

        sql = 'SELECT MAX(month) FROM {hscic}.practice_statistics_all_years'
        results = client.query(sql)
        if results.rows[0][0] is None:
            last_uploaded_practice_statistics_date = datetime.date(1900, 1, 1)
        else:
            last_uploaded_practice_statistics_date = results.rows[0][0].date()

        table = client.get_table('practice_statistics_all_years')
        sql = '''SELECT
                    month,
                    pct_id,
                    practice,
                    male_0_4,
                    female_0_4,
                    male_5_14,
                    female_5_14,
                    male_15_24,
                    female_15_24,
                    male_25_34,
                    female_25_34,
                    male_35_44,
                    female_35_44,
                    male_45_54,
                    female_45_54,
                    male_55_64,
                    female_55_64,
                    male_65_74,
                    female_65_74,
                    male_75_plus,
                    female_75_plus,
                    total_list_size
        FROM {hscic}.practice_statistics
        WHERE month > TIMESTAMP('{date}')'''
        substitutions = {'date': last_uploaded_practice_statistics_date}
        table.insert_rows_from_query(sql,
                                     write_disposition='WRITE_APPEND',
                                     substitutions=substitutions)

        table = client.get_table('ccgs')
        columns = [field.name for field in schemas.CCG_SCHEMA]
        table.insert_rows_from_pg(models.PCT, columns, schemas.ccgs_transform)

        table = client.get_table('ppu_savings')
        columns = [field.name for field in PPU_SAVING_SCHEMA]
        table.insert_rows_from_pg(PPUSaving, columns, ppu_savings_transform)

        table = client.get_table('prescribing_' + date.strftime('%Y_%m'))
        sql = '''SELECT * FROM {hscic}.prescribing
        WHERE month = TIMESTAMP('{date}')'''
        substitutions = {'date': date}
        table.insert_rows_from_query(sql, substitutions=substitutions)
Example #19
    def test_the_lot(self):
        client = Client('test')

        schema = build_schema(
            ('a', 'INTEGER'),
            ('b', 'STRING'),
        )

        headers = ['a', 'b']
        rows = [
            (1, 'apple'),
            (2, 'banana'),
            (3, 'coconut'),
        ]

        t1 = client.get_or_create_table('t1', schema)
        t1_qname = t1.qualified_name

        # Test Table.insert_rows_from_csv
        t1.insert_rows_from_csv('gcutils/tests/test_table.csv')

        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.insert_rows_from_query
        t2 = client.get_table('t2')

        sql = 'SELECT * FROM {} WHERE a > 1'.format(t1_qname)
        t2.insert_rows_from_query(sql)

        self.assertEqual(sorted(t2.get_rows()), rows[1:])

        # Test Client.query
        sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
        results = client.query(sql)

        self.assertEqual(sorted(results.rows), rows[2:])

        # Test Client.query_into_dataframe
        sql = 'SELECT * FROM {} WHERE a > 2'.format(t1_qname)
        df = client.query_into_dataframe(sql)

        self.assertEqual(df.values.tolist(), [list(rows[2])])

        # Test TableExporter.export_to_storage and
        # TableExporter.download_from_storage_and_unzip
        t1_exporter = TableExporter(t1, self.storage_prefix + 'test_table-')
        t1_exporter.export_to_storage()

        with tempfile.NamedTemporaryFile(mode='r+') as f:
            t1_exporter.download_from_storage_and_unzip(f)
            f.seek(0)
            reader = csv.reader(f)
            data = [next(reader)] + sorted(reader)

        self.assertEqual(data,
                         [list(map(str, row)) for row in [headers] + rows])

        # Test Table.insert_rows_from_storage
        storage_path = self.storage_prefix + 'test_table.csv'
        self.upload_to_storage('gcutils/tests/test_table.csv', storage_path)

        t2.insert_rows_from_storage(storage_path)

        self.assertEqual(sorted(t2.get_rows()), rows)

        # Test Client.create_storage_backed_table
        storage_path = self.storage_prefix + 'test_table_headers.csv'
        self.upload_to_storage(
            'gcutils/tests/test_table_headers.csv',
            storage_path
        )

        schema = [
            {'name': 'a', 'type': 'integer'},
            {'name': 'b', 'type': 'string'},
        ]

        t3 = client.create_storage_backed_table(
            't3',
            schema,
            storage_path
        )

        results = client.query('SELECT * FROM {}'.format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows)

        self.upload_to_storage(
            'gcutils/tests/test_table_headers_2.csv',
            storage_path
        )

        results = client.query('SELECT * FROM {}'.format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows + [(4, u'damson')])

        # Test Client.create_table_with_view
        sql = 'SELECT * FROM {{project}}.{} WHERE a > 1'.format(t1_qname)

        t4 = client.create_table_with_view('t4', sql, False)

        results = client.query('SELECT * FROM {}'.format(t4.qualified_name))

        self.assertEqual(sorted(results.rows), rows[1:])

        # Test Client.insert_rows_from_pg
        PCT.objects.create(code='ABC', name='CCG 1')
        PCT.objects.create(code='XYZ', name='CCG 2')

        def transformer(row):
            return [ord(row[0][0]), row[1]]
        t1.insert_rows_from_pg(PCT, ['code', 'name'], transformer)

        self.assertEqual(sorted(t1.get_rows()), [(65, 'CCG 1'), (88, 'CCG 2')])

        # Test Table.delete_all_rows
        t1.delete_all_rows()

        self.assertEqual(list(t1.get_rows()), [])
Example #20
    def test_the_lot(self):
        client = Client("test")
        archive_client = Client("archive")

        orig_schema = build_schema(("a", "STRING"), ("b", "INTEGER"))

        schema = build_schema(("a", "INTEGER"), ("b", "STRING"))

        headers = ["a", "b"]
        rows = [(1, "apple"), (2, "banana"), (3, "coconut")]

        t1 = client.get_or_create_table("t1", orig_schema)
        t1_qname = t1.qualified_name

        # Test Table.insert_rows_from_csv
        t1.insert_rows_from_csv("gcutils/tests/test_table.csv", schema)

        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.insert_rows_from_query
        t2 = client.get_table("t2")

        sql = "SELECT * FROM {} WHERE a > 1".format(t1_qname)
        t2.insert_rows_from_query(sql)

        self.assertEqual(sorted(t2.get_rows()), rows[1:])

        # Test Client.query
        sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
        results = client.query(sql)

        self.assertEqual(sorted(results.rows), rows[2:])

        # Test Client.query_into_dataframe
        sql = "SELECT * FROM {} WHERE a > 2".format(t1_qname)
        df = client.query_into_dataframe(sql)

        self.assertEqual(df.values.tolist(), [list(rows[2])])

        # Test TableExporter.export_to_storage and
        # TableExporter.download_from_storage_and_unzip
        t1_exporter = TableExporter(t1, self.storage_prefix + "test_table-")
        t1_exporter.export_to_storage()

        with tempfile.NamedTemporaryFile(mode="r+") as f:
            t1_exporter.download_from_storage_and_unzip(f)
            f.seek(0)
            reader = csv.reader(f)
            data = [next(reader)] + sorted(reader)

        self.assertEqual(data,
                         [list(map(str, row)) for row in [headers] + rows])

        # Test Table.insert_rows_from_storage
        storage_path = self.storage_prefix + "test_table.csv"
        self.upload_to_storage("gcutils/tests/test_table.csv", storage_path)

        t2.insert_rows_from_storage(storage_path)

        self.assertEqual(sorted(t2.get_rows()), rows)

        # Test Client.create_storage_backed_table
        storage_path = self.storage_prefix + "test_table_headers.csv"
        self.upload_to_storage("gcutils/tests/test_table_headers.csv",
                               storage_path)

        schema = build_schema(("a", "INTEGER"), ("b", "STRING"))

        t3 = client.create_storage_backed_table("t3", schema, storage_path)

        results = client.query("SELECT * FROM {}".format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows)

        self.upload_to_storage("gcutils/tests/test_table_headers_2.csv",
                               storage_path)

        results = client.query("SELECT * FROM {}".format(t3.qualified_name))

        self.assertEqual(sorted(results.rows), rows + [(4, "damson")])

        # Test Client.create_table_with_view
        sql = "SELECT * FROM {{project}}.{} WHERE a > 1".format(t1_qname)

        t4 = client.create_table_with_view("t4", sql, False)

        results = client.query("SELECT * FROM {}".format(t4.qualified_name))

        self.assertEqual(sorted(results.rows), rows[1:])

        # Test Table.copy_to_new_dataset
        t1.copy_to_new_dataset("archive")
        t1_archived = archive_client.get_table("t1")
        self.assertEqual(sorted(t1_archived.get_rows()), rows)
        self.assertEqual(sorted(t1.get_rows()), rows)

        # Test Table.move_to_new_dataset
        t2.move_to_new_dataset("archive")
        t2_archived = archive_client.get_table("t2")
        self.assertEqual(sorted(t2_archived.get_rows()), rows)
        with self.assertRaises(NotFound):
            list(t2.get_rows())

        # Test Client.insert_rows_from_pg
        PCT.objects.create(code="ABC", name="CCG 1")
        PCT.objects.create(code="XYZ", name="CCG 2")

        def transformer(row):
            return [ord(row[0][0]), row[1]]

        t1.insert_rows_from_pg(
            PCT,
            build_schema(("code", "INTEGER"), ("name", "STRING")),
            transformer=transformer,
        )

        self.assertEqual(sorted(t1.get_rows()), [(65, "CCG 1"), (88, "CCG 2")])

        # Test Table.delete_all_rows
        t1.delete_all_rows()

        self.assertEqual(list(t1.get_rows()), [])
Example #21
def make_merged_table_for_month(month):
    """Create a new BigQuery table that includes code substitutions, off
    which our savings can be computed.

    What are code substitutions?

    Because (for example) Tramadol tablets and capsules can
    almost always be substituted, we consider them the same chemical
    for the purposes of our analysis.

    Therefore, wherever Tramadol capsules appear in the source data,
    we treat them as Tramadol tablets (for example).

    The mapping of what we consider equivalent is stored in a Google
    Sheet, currently at
    https://docs.google.com/spreadsheets/d/1SvMGCKrmqsNkZYuGW18Sf0wTluXyV4bhyZQaVLcO41c/edit

    The process of updating this spreadsheet (which should be done
    periodically) is documented
    [here](https://github.com/ebmdatalab/price-per-dose/issues/11)

    """
    cases = []
    seen = set()
    df = pd.read_csv(SUBSTITUTIONS_SPREADSHEET)
    df = df[df['Really equivalent?'] == 'Y']
    for row in df.iterrows():
        data = row[1]
        source_code = data[1].strip()
        code_to_merge = data[10].strip()
        if source_code not in seen and code_to_merge not in seen:
            cases.append((code_to_merge, source_code))
        seen.add(source_code)
        seen.add(code_to_merge)
    prescribing_table = 'normalised_prescribing_standard'
    sql = """
      SELECT
        practice,
        pct,
      CASE bnf_code
        %s
        ELSE bnf_code
      END AS bnf_code,
        month,
        actual_cost,
        net_cost,
        quantity
      FROM
        {hscic}.%s
      WHERE month = TIMESTAMP('%s')
    """ % (' '.join([
        "WHEN '%s' THEN '%s'" % (when_code, then_code)
        for (when_code, then_code) in cases
    ]), prescribing_table, month)
    target_table_name = ('prescribing_with_merged_codes_%s' %
                         month.strftime('%Y_%m'))

    client = Client('hscic')
    table = client.get_table(target_table_name)
    table.insert_rows_from_query(sql)
    return target_table_name
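
A hypothetical invocation; per the code above, the month argument is assumed to be a date whose strftime('%Y_%m') names the target table:

import datetime

table_name = make_merged_table_for_month(datetime.date(2017, 11, 1))
# -> 'prescribing_with_merged_codes_2017_11'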
Example #22
    def handle(self, *args, **options):
        path = options['filename']
        head, filename = os.path.split(path)
        converted_path = '{}_formatted.CSV'.format(os.path.splitext(path)[0])
        _, year_and_month = os.path.split(head)

        logger.info('path: %s', path)
        logger.info('converted_path: %s', converted_path)
        logger.info('year_and_month: %s', year_and_month)

        date = year_and_month + '_01'
        try:
            datetime.datetime.strptime(date, '%Y_%m_%d')
        except ValueError:
            message = ('The file path must have a YYYY_MM '
                       'date component in the containing directory: ')
            message += path
            raise CommandError(message)

        hscic_dataset_client = Client('hscic')
        tmp_dataset_client = Client('tmp_eu')

        # Check that we haven't already processed data for this month
        sql = '''SELECT COUNT(*)
        FROM {dataset}.prescribing
        WHERE month = TIMESTAMP('{date}')'''.format(
            dataset=hscic_dataset_client.dataset_id,
            date=date.replace('_', '-'),
        )
        try:
            results = hscic_dataset_client.query(sql)
            assert results.rows[0][0] == 0
        except NotFound:
            pass

        # Create BQ table backed by uploaded source CSV file
        raw_data_table_name = 'raw_prescribing_data_{}'.format(year_and_month)
        gcs_path = 'hscic/prescribing/{}/{}'.format(year_and_month, filename)

        logger.info('raw_data_table_name: %s', raw_data_table_name)
        logger.info('gcs_path: %s', gcs_path)

        schema = [
            {'name': 'Regional_Office_Name', 'type': 'string'},
            {'name': 'Regional_Office_Code', 'type': 'string'},
            {'name': 'Area_Team_Name', 'type': 'string'},
            {'name': 'Area_Team_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'PCO_Name', 'type': 'string'},
            {'name': 'PCO_Code', 'type': 'string'},
            {'name': 'Practice_Name', 'type': 'string'},
            {'name': 'Practice_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'BNF_Code', 'type': 'string', 'mode': 'required'},
            {'name': 'BNF_Description', 'type': 'string', 'mode': 'required'},
            {'name': 'Items', 'type': 'integer', 'mode': 'required'},
            {'name': 'Quantity', 'type': 'integer', 'mode': 'required'},
            {'name': 'ADQ_Usage', 'type': 'float'},
            {'name': 'NIC', 'type': 'float', 'mode': 'required'},
            {'name': 'Actual_Cost', 'type': 'float', 'mode': 'required'},
        ]
        raw_data_table = tmp_dataset_client.create_storage_backed_table(
            raw_data_table_name,
            schema,
            gcs_path
        )

        # Append aggregated data to prescribing table
        sql = '''
         SELECT
          Area_Team_Code AS sha,
          LEFT(PCO_Code, 3) AS pct,
          Practice_Code AS practice,
          BNF_Code AS bnf_code,
          BNF_Description AS bnf_name,
          SUM(Items) AS items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          TIMESTAMP('%s') AS month,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           bnf_code, bnf_name, pct,
           practice, sha
        ''' % (date.replace('_', '-'), raw_data_table.qualified_name)

        logger.info('sql: %s', sql)

        prescribing_table = hscic_dataset_client.get_table('prescribing')
        prescribing_table.insert_rows_from_query(
            sql,
            legacy=True,
            write_disposition='WRITE_APPEND'
        )

        # Write aggregated data to new table, for download
        sql = '''
         SELECT
          LEFT(PCO_Code, 3) AS pct_id,
          Practice_Code AS practice_code,
          BNF_Code AS presentation_code,
          SUM(Items) AS total_items,
          SUM(NIC) AS net_cost,
          SUM(Actual_Cost) AS actual_cost,
          SUM(Quantity * Items) AS quantity,
          '%s' AS processing_date,
         FROM %s
         WHERE Practice_Code NOT LIKE '%%998'  -- see issue #349
         GROUP BY
           presentation_code, pct_id, practice_code
        ''' % (date, raw_data_table.qualified_name)

        fmtd_data_table_name = 'formatted_prescribing_%s' % year_and_month

        logger.info('sql: %s', sql)
        logger.info('fmtd_data_table_name: %s', fmtd_data_table_name)

        fmtd_data_table = tmp_dataset_client.get_table(fmtd_data_table_name)
        fmtd_data_table.insert_rows_from_query(sql, legacy=True)

        # Export new table to storage, and download
        exporter = TableExporter(fmtd_data_table, gcs_path + '_formatted-')
        exporter.export_to_storage(print_header=False)

        with tempfile.NamedTemporaryFile(dir=head) as f:
            exporter.download_from_storage_and_unzip(f)

            # Sort the output.
            #
            # Why? Because this is equivalent to CLUSTERing the table on
            # loading, but less resource-intensive than doing it in
            # Postgres. And the table is too big to sort within BigQuery.
            subprocess.call(
                "ionice -c 2 nice -n 10 sort -k3,3 -k1,1 -k2,2 -t, %s > %s" % (
                    f.name, converted_path),
                shell=True)