def handle(self, *args, **kwargs):
        """Import NCSO concession data and report the outcome.

        Loads VMPP names/ids for matching, imports concessions from the
        archive and the current page, uploads the result to BigQuery, and
        reports the match counts both to the log and to Slack.
        """
        self.vmpps = DMDVmpp.objects.values('nm', 'vppid')
        self.counter = {
            'new-and-matched': 0,
            'new-and-unmatched': 0,
            'changed': 0,
            'unchanged': 0,
        }
        self.import_from_archive()
        self.import_from_current()

        # Build the report lines once, so the log output and the Slack
        # message cannot drift apart.
        summary_lines = [
            'New and matched: %s' % self.counter['new-and-matched'],
            'New and unmatched: %s' % self.counter['new-and-unmatched'],
            'Changed: %s' % self.counter['changed'],
            'Unchanged: %s' % self.counter['unchanged'],
        ]
        for line in summary_lines:
            logger.info(line)

        Client('dmd').upload_model(NCSOConcession)

        notify_slack('\n'.join(['Imported NCSO concessions'] + summary_lines))
# ---- Example 2 ----
def run_task(task, year, month, **kwargs):
    """Run `task` for `year`/`month`, recording the outcome in TaskLog.

    If a successful TaskLog entry already exists for this (year, month,
    task) combination, the task is skipped entirely.  On failure, the
    traceback is logged, Slack is notified, and the exception re-raised.
    """
    already_succeeded = TaskLog.objects.filter(
        year=year,
        month=month,
        task_name=task.name,
        status=TaskLog.SUCCESSFUL,
    ).exists()
    if already_succeeded:
        # Nothing to do: this task has already run to completion.
        return

    task_log = TaskLog.objects.create(
        year=year,
        month=month,
        task_name=task.name,
    )

    try:
        task.run(year, month, **kwargs)
        task_log.mark_succeeded()
    except:
        # Deliberately a *bare* except: we must record the failure even for
        # exceptions that `except Exception` would miss (such as
        # `KeyboardInterrupt`).  The exception is always re-raised below.
        import traceback
        task_log.mark_failed(formatted_tb=traceback.format_exc())
        notify_slack(
            'Importing data for {}_{} has failed when running {}.'.format(
                year, month, task.name))
        raise
    def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA page and import new months.

        Finds every Part VIIIA spreadsheet linked from the page, works out
        which month it covers from the filename, imports any month not yet
        recorded in ImportLog, uploads the result to BigQuery, and reports
        what happened to Slack.
        """
        url = "https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/"
        rsp = requests.get(url)
        doc = bs4.BeautifulSoup(rsp.content, "html.parser")

        month_abbrs = [x.lower() for x in calendar.month_abbr]

        imported_months = []

        for a in doc.findAll("a", href=re.compile(r"Part%20VIIIA.+\.xlsx$")):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            base_filename = unquote(
                os.path.splitext(os.path.basename(a.attrs["href"]))[0])

            if base_filename == "Part VIIIA Nov 20 updated":
                # November 2020 has a different filename.  In general we want to be
                # warned (through the scraper crashing) about updates (because we have
                # to delete all records for the month in question, and reimport) so
                # special-casing is appropriate here.
                year, month = "2020", 11

            else:
                words = re.split(r"[ -]+", base_filename)
                month_name, year = words[-2:]

                # We have seen the last token in `words` be "19_0".  The year is
                # reported to us via Slack, so if we pull out some nonsense here we
                # *should* notice.
                # Raw string: "\d" is an invalid escape in a plain literal.
                year = re.match(r"\d+", year).group()
                if len(year) == 2:
                    year = "20" + year

                # We have seen the month be `September`, `Sept`, and `Sep`.
                month_abbr = month_name.lower()[:3]
                month = month_abbrs.index(month_abbr)

            date = datetime.date(int(year), month, 1)
            if ImportLog.objects.filter(category="tariff",
                                        current_at=date).exists():
                # This month has already been imported.
                continue

            xls_url = urljoin(url, a.attrs["href"])
            xls_file = BytesIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client("dmd")
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = "Imported Drug Tariff for %s_%s" % (year, month)
                notify_slack(msg)
        else:
            msg = "Found no new tariff data to import"
            notify_slack(msg)
# ---- Example 4 ----
def run_all(year, month, under_test=False):
    """Run the complete data import pipeline for `year`/`month`.

    When `under_test` is true, the fetch stages, the numbers.json cleanup
    and the final Slack notification are skipped so an end-to-end test can
    drive the pipeline against canned data.
    """
    tasks = load_tasks()

    if not under_test:
        for task in tasks.by_type("manual_fetch"):
            run_task(task, year, month)

        for task in tasks.by_type("auto_fetch"):
            run_task(task, year, month)

    upload_all_to_storage(tasks)

    for task in tasks.by_type("convert").ordered():
        run_task(task, year, month)

    for task in tasks.by_type("import").ordered():
        run_task(task, year, month)

    # Work out the most recently imported month (YYYY_MM) from the path of
    # the last converted prescribing file.
    prescribing_path = tasks["convert_hscic_prescribing"].imported_paths()[-1]
    last_imported = re.findall(r"/(\d{4}_\d{2})/", prescribing_path)[0]

    for task in tasks.by_type("post_process").ordered():
        run_task(task, year, month, last_imported=last_imported)

    if not under_test:
        # Remove numbers.json files.  These are created by check_numbers, and we want to
        # remove them after each import, since all numbers on the site will change with
        # the new data.  We only want to remove these files after importing real data,
        # and not during an end-to-end run.
        check_numbers_glob = os.path.join(
            settings.CHECK_NUMBERS_BASE_PATH, "*", "numbers.json"
        )
        for path in glob.glob(check_numbers_glob):
            os.remove(path)

    TaskLog.objects.create(
        year=year, month=month, task_name="fetch_and_import", status=TaskLog.SUCCESSFUL
    )

    activity = random.choice(
        ["put the kettle on", "have a glass of wine", "get up and stretch"]
    )

    msg = """
Importing data for {}_{} complete!

You should now:

* tweet about it
* run `sudo systemctl restart app.openprescribing.*.service`
* {}

(Details: https://github.com/ebmdatalab/openprescribing/wiki/Importing-data)
    """.strip().format(
        year, month, activity
    )

    if not under_test:
        notify_slack(msg)
# ---- Example 5 ----
    def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA page and import new months.

        (Python 2 era variant: uses urllib.unquote and StringIO.)  Months
        already recorded in ImportLog are skipped; imported prices are
        uploaded to BigQuery and a summary is sent to Slack.
        """
        url = 'https://www.nhsbsa.nhs.uk/pharmacies-gp-practices-and-appliance-contractors/drug-tariff/drug-tariff-part-viii/'
        rsp = requests.get(url)
        doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

        month_names = [x.lower() for x in calendar.month_name]
        month_abbrs = [x.lower() for x in calendar.month_abbr]

        imported_months = []

        for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            words = re.split(
                r'[ -]+',
                urllib.unquote(os.path.splitext(
                    os.path.basename(a.attrs['href']))[0]))
            month_name, year = words[-2:]

            # We have seen the last token in `words` be "19_0".  The year is
            # reported to us via Slack, so if we pull out some nonsense here we
            # *should* notice.
            # Raw string: '\d' is an invalid escape in a plain literal.
            year = re.match(r'\d+', year).group()
            if len(year) == 2:
                year = "20" + year

            # Try full month names first, then three-letter abbreviations.
            try:
                month = month_names.index(month_name.lower())
            except ValueError:
                month = month_abbrs.index(month_name.lower())

            date = datetime.date(int(year), month, 1)

            if ImportLog.objects.filter(
                category='tariff',
                current_at=date
            ).exists():
                # This month has already been imported.
                continue

            xls_url = urljoin(url, a.attrs['href'])
            xls_file = StringIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client('dmd')
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = 'Imported Drug Tariff for %s_%s' % (year, month)
                notify_slack(msg)
        else:
            msg = 'Found no new tariff data to import'
            notify_slack(msg)
# ---- Example 6 ----
def run_all(year, month, under_test=False):
    """Run the complete fetch/convert/import pipeline for `year`/`month`.

    When `under_test` is true the fetch stages, any smoketest post-process
    tasks, and the final Slack notification are skipped.
    """
    tasks = load_tasks()

    if not under_test:
        for task in tasks.by_type('manual_fetch'):
            run_task(task, year, month)

        for task in tasks.by_type('auto_fetch'):
            run_task(task, year, month)

    upload_all_to_storage(tasks)

    for task in tasks.by_type('convert').ordered():
        run_task(task, year, month)

    for task in tasks.by_type('import').ordered():
        run_task(task, year, month)

    # Work out the most recently imported month (YYYY_MM) from the path of
    # the last imported prescribing file.
    prescribing_path = tasks['import_hscic_prescribing'].imported_paths()[-1]
    last_imported = re.findall(r'/(\d{4}_\d{2})/', prescribing_path)[0]

    for task in tasks.by_type('post_process').ordered():
        if under_test and 'smoketest' in task.name:
            # Smoketests run against live site, so we should skip when running
            # under test
            continue
        run_task(task, year, month, last_imported=last_imported)

    TaskLog.objects.create(
        year=year,
        month=month,
        task_name='fetch_and_import',
        status=TaskLog.SUCCESSFUL,
    )

    activity = random.choice([
        'Put the kettle on',
        'Have a glass of wine',
        'Get up and stretch',
    ])

    msg = '''
Importing data for {}_{} complete!

You should now:
* Tweet about it
* Commit the changes to the smoke tests
* {}

(Details: https://github.com/ebmdatalab/openprescribing/wiki/Importing-data)
    '''.strip().format(year, month, activity)

    if not under_test:
        notify_slack(msg)
# ---- Example 7 ----
    def handle(self, *args, **kwargs):
        """Run the end-to-end pipeline test, reporting the result to Slack."""
        settings_module = os.environ['DJANGO_SETTINGS_MODULE']
        if settings_module != 'openprescribing.settings.e2etest':
            raise CommandError('Command must run with e2etest settings')

        try:
            run_end_to_end()
        except:
            # Report the failure (with traceback) to Slack, then propagate.
            import traceback
            notify_slack('End-to-end test failed:\n\n' + traceback.format_exc())
            raise

        notify_slack('Pipeline tests ran to completion')
# ---- Example 8 ----
    def handle_case_c(self, ccg):
        # The CCG is still open but some practices have moved to another CCG
        name = self.ccg_to_name[ccg]

        if self.dry_run:
            self.stdout.write(
                "Some practices have left CCG and some currently in CCG are not active"
            )
        else:
            msg = """
Practices have left CCG {} ({}) and some remaining practices are not active.
Check whether these inactive practices should have moved.
See instructions in handle_orphan_practices.py.
            """.format(ccg, name).strip()
            notify_slack(msg)
# ---- Example 9 ----
    def handle(self, *args, **kwargs):
        """Run the end-to-end test suite, reporting the outcome to Slack."""
        settings_module = os.environ['DJANGO_SETTINGS_MODULE']
        if settings_module != 'openprescribing.settings.e2etest':
            raise CommandError('Command must run with e2etest settings')

        try:
            run_end_to_end()
        except Exception:
            # Tell Slack where to look, then let the failure propagate.
            notify_slack(
                'End-to-end test failed (seed: %s)\n\n' % settings.BQ_NONCE
                + 'Check logs in /tmp/')
            raise

        notify_slack(
            'Pipeline tests ran to completion (seed: %s)' % settings.BQ_NONCE)
    def handle(self, *args, **kwargs):
        """Run the end-to-end pipeline under e2etest settings; notify Slack."""
        settings_module = os.environ["DJANGO_SETTINGS_MODULE"]
        if settings_module != "openprescribing.settings.e2etest":
            raise CommandError("Command must run with e2etest settings")

        try:
            run_end_to_end()
        except Exception:
            failure = "End-to-end test failed (seed: %s)\n\n" % settings.BQ_NONCE
            failure += "Check logs in /tmp/"
            notify_slack(failure)
            raise

        notify_slack(
            "Pipeline tests ran to completion (seed: %s)" % settings.BQ_NONCE)
# ---- Example 11 ----
    def handle(self, *args, **kwargs):
        """Scrape the NHSBSA Drug Tariff Part VIIIA index and import new months.

        Each linked spreadsheet's filename encodes the month it covers;
        months already present in ImportLog are skipped.  After importing,
        prices are uploaded to BigQuery and a summary is sent to Slack.
        """
        host = 'https://www.nhsbsa.nhs.uk'
        index = (host + '/pharmacies-gp-practices-and-appliance-contractors/'
                 'drug-tariff/drug-tariff-part-viii/')
        rsp = requests.get(index)
        doc = bs4.BeautifulSoup(rsp.content, 'html.parser')

        # calendar.month_name yields '' at index 0, then full month names,
        # so .index() below maps a name to its 1-based month number.
        month_names = [x.lower() for x in calendar.month_name]

        imported_months = []

        for a in doc.findAll('a', href=re.compile('Part%20VIIIA')):
            # a.attrs['href'] typically has a filename part like
            # Part%20VIIIA%20September%202017.xlsx
            #
            # We split that into ['Part', 'VIIIA', 'September', '2017']
            words = re.split(
                r'[ -]+',
                urllib.unquote(
                    os.path.splitext(os.path.basename(a.attrs['href']))[0]))
            month_name, year = words[-2:]
            if len(year) == 2:
                # Two-digit years are assumed to be 20xx; sanity-check the
                # result.  NOTE(review): four-digit years are not validated.
                year = "20" + year
                assert "2000" <= year <= str(datetime.date.today().year)
            # list.index raises ValueError if the filename uses anything
            # other than the full month name (e.g. an abbreviation).
            month = month_names.index(month_name.lower())
            date = datetime.date(int(year), month, 1)

            # Skip months that have already been imported.
            if ImportLog.objects.filter(category='tariff',
                                        current_at=date).exists():
                continue

            xls_url = urljoin(index, a.attrs['href'])
            xls_file = StringIO(requests.get(xls_url).content)

            import_month(xls_file, date)
            imported_months.append((year, month))

        if imported_months:
            client = Client('dmd')
            client.upload_model(TariffPrice)

            for year, month in imported_months:
                msg = 'Imported Drug Tariff for %s_%s' % (year, month)
                notify_slack(msg)
        else:
            msg = 'Found no new tariff data to import'
            notify_slack(msg)
def import_month(xls_file, date):
    """Import one month's Drug Tariff prices from a Part VIIIA spreadsheet.

    Creates a TariffPrice row per data row (skipping blank rows and rows
    with no price, which are reported to Slack), then records the import
    in ImportLog.  Everything happens in a single transaction.
    """
    workbook = load_workbook(xls_file)
    rows = workbook.active.rows

    # The first row is a title, and the second is empty
    next(rows)
    next(rows)

    # Normalise the heading cells (lower-case, whitespace stripped) and
    # check the sheet has the layout we expect.
    headers = [
        "".join((cell.value or "?").lower().split()) for cell in next(rows)
    ]
    expected_headers = [
        "medicine",
        "packsize",
        "?",
        "vmppsnomedcode",
        "drugtariffcategory",
        "basicprice",
    ]
    assert headers == expected_headers

    with transaction.atomic():
        for row in rows:
            values = [cell.value for cell in row]
            if not any(value is not None for value in values):
                # Entirely blank row; ignore it.
                continue

            record = dict(zip(headers, values))

            if record["basicprice"] is None:
                msg = "Missing price for {} Drug Tariff for {}".format(
                    record["medicine"], date)
                notify_slack(msg)
                continue

            TariffPrice.objects.get_or_create(
                date=date,
                vmpp_id=record["vmppsnomedcode"],
                tariff_category_id=get_tariff_cat_id(
                    record["drugtariffcategory"]),
                price_pence=int(record["basicprice"]),
            )

        ImportLog.objects.create(category="tariff",
                                 current_at=date,
                                 filename="none")
# ---- Example 13 ----
    def handle(self, *args, **kwargs):
        """Import NCSO concessions, reconcile them against VMPPs, and report.

        Uploads the resulting table to BigQuery and sends a Slack summary
        of how many concessions were imported and how many are unmatched.
        """
        self.vmpps = VMPP.objects.values('id', 'nm')
        num_before = NCSOConcession.objects.count()
        self.import_from_archive()
        self.import_from_current()
        self.reconcile()
        num_after = NCSOConcession.objects.count()
        num_unmatched = NCSOConcession.objects.filter(
            vmpp__isnull=True).count()

        Client('dmd').upload_model(NCSOConcession)

        num_new = num_after - num_before
        if num_new:
            msg = 'Imported %s new concessions' % num_new
        else:
            msg = 'Found no new concessions to import'
        if num_unmatched:
            msg += '\nThere are %s unmatched concessions' % num_unmatched
        notify_slack(msg)
# ---- Example 14 ----
    def handle_case_b(self, ccg, new_ccgs):
        # The CCG has closed and its practices have moved to multiple CCGs
        name = self.ccg_to_name[ccg]

        if self.dry_run:
            self.stdout.write(
                "All practices currently in CCG are closed or dormant")
            self.stdout.write(
                "Active practices previously in CCG are now in multiple CCGs:")
            for new_ccg, count in new_ccgs.most_common():
                new_ccg_name = self.ccg_to_name[new_ccg]
                self.stdout.write("{:3} {} ({})".format(
                    count, new_ccg, new_ccg_name))

        else:
            msg = """
All active practices previously in CCG {} ({}) are now in multiple CCGs:
Check whether inactive practices remaining in CCG should have moved.
See instructions in handle_orphan_practices.py.
            """.format(ccg, name).strip()
            for new_ccg, count in new_ccgs.most_common():
                new_ccg_name = self.ccg_to_name[new_ccg]
                msg += "\n{:3} {} ({})".format(count, new_ccg, new_ccg_name)
            notify_slack(msg)
    def import_from_html(self, heading):
        """Import one month's concessions from the table following `heading`.

        `heading` is an element whose text is e.g. "September 2017"; the
        concession table is the next <table> element in the document.
        Months before August 2014 are ignored.
        """
        month_name, year = heading.text.strip().split()
        month_names = list(calendar.month_name)
        month = month_names.index(month_name)

        date = datetime.date(int(year), month, 1)

        if date < datetime.date(2014, 8, 1):
            return

        table = heading.find_next('table')
        trs = table.find_all('tr')
        records = [[td.text.strip() for td in tr.find_all('td')] for tr in trs]

        # Make sure the first row contains expected headers.
        # Unfortunately, the header names are not consistent.
        assert 'drug' in records[0][0].lower()
        assert 'pack' in records[0][1].lower()
        assert 'price' in records[0][2].lower()

        for record in records[1:]:
            drug, pack_size, price_concession = record
            drug = drug.replace('(new)', '').strip()
            # Prices look like "£1.23".  Backslashes are escaped explicitly
            # ('\d' is an invalid escape sequence in a plain literal).
            match = re.match(u'£(\\d+)\\.(\\d\\d)', price_concession)
            pounds, pence = match.groups()
            price_concession_pence = 100 * int(pounds) + int(pence)
            self.import_record(date, drug, pack_size, price_concession_pence)

        if NCSOConcession.objects.filter(date=date).count() >= len(records):
            # If there are more records in the database than we have imported,
            # then there was previously a record in the source that is no
            # longer present.  I have seen this once, with a record whose
            # spelling was corrected.  If we see this frequently, we should
            # automate dealing with it; for now we can deal with it manually.
            msg = 'NCSO concession(s) removed from source for {}'.format(date)
            notify_slack(msg)
# ---- Example 16 ----
    def handle(self, *args, **options):
        """Scrape key pages and compare their numbers with the previous run.

        Extracts the numbers shown on each page of interest using a headless
        Firefox, writes them under CHECK_NUMBERS_BASE_PATH, and notifies
        Slack of any pages whose numbers differ from the previous snapshot.
        Does nothing while an import is in progress.
        """
        if import_in_progress():
            notify_slack("Not checking numbers: import in progress")
            return

        previous_log_path = get_previous_log_path()

        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        log_path = os.path.join(settings.CHECK_NUMBERS_BASE_PATH, timestamp)
        mkdir_p(log_path)

        numbers = {}
        # Named `browser_options` so it doesn't shadow the command's
        # **options argument.
        browser_options = Options()
        browser_options.headless = True
        with webdriver.Firefox(options=browser_options) as browser:
            browser.set_page_load_timeout(60)

            for name, path in paths_to_scrape():
                source = get_page_source(browser, path, name, log_path)
                numbers_list = extract_numbers(source)
                numbers[name] = {"path": path, "numbers": numbers_list}

        write_numbers(numbers, log_path)

        if previous_log_path is None:
            msg = "Not checking numbers: this is the first deploy since last import"
            notify_slack(msg)
            return

        previous_numbers = load_previous_numbers(previous_log_path)

        differences = compare_numbers(previous_numbers, numbers)

        if differences:
            msg = "The following pages have changed:\n\n"
            msg += "\n".join(differences)
            msg += "\n\nNext step: compare {} and {}".format(
                previous_log_path, log_path
            )
            notify_slack(msg)
# ---- Example 17 ----
 def notify_slack(self):
     """Announce on Slack which dm+d release was imported."""
     notify_slack("Imported dm+d data, release " + self.release)