Пример #1
0
class MonthlyExtract(object):
    """ Download the DECC monthly extract CSV and hold its rows.

        After a successful get_data() call, `records` contains one
        DeccRecord per data row of the CSV found at URL.
    """

    URL = "https://restats.decc.gov.uk/app/reporting/decc/monthlyextract/style/csv/csvwhich/reporting.decc.monthlyextract"

    def __init__(self):
        self.web = HttpsWithCookies()
        self.records = []

    def __len__(self):
        """ Return the number of records collected so far. """
        return len(self.records)

    def get_data(self):
        """ Fetch the CSV at URL and append a DeccRecord for every data row.

            The row whose first cell is 'Reference' is treated as the
            header: it is stored in the module level RECORD_FIELDS and is
            not turned into a record. Blank lines are ignored.

            Returns False when there is no response or the response code
            is not 200, otherwise True.
        """
        global RECORD_FIELDS
        resp = self.web.open(self.URL)
        if resp is None or resp.code != 200:
            return False

        if sys.version_info >= (3, 0):
            # The response yields bytes; the Python 3 csv module wants
            # text, so decode the lines lazily as they are read.
            csvfile = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        else:
            csvfile = csv.reader(resp)

        for row in csvfile:
            if not row:
                # csv.reader returns [] for a blank line, which would make
                # row[0] raise IndexError.
                continue
            if row[0] == 'Reference':
                RECORD_FIELDS = row
                continue
            self.records.append(DeccRecord(row))
        return True
Пример #2
0
 def __init__(self, endpoint):
     """ Start with empty form state, then call get_form(endpoint). """
     self.web = HttpsWithCookies()
     # Neither the form action nor any downloaded data is known yet.
     self.action = self.data = None
     # Two separate dicts: the fields and their labels.
     self.fields, self.field_labels = {}, {}
     self.get_form(endpoint)
Пример #3
0
class MonthlyExtract(object):
    """ Download the DECC monthly extract CSV and hold its rows.

        Related page:
        https://www.gov.uk/government/collections/renewables-statistics

        After a successful get_data() call, `records` contains one
        DeccRecord per data row of the CSV found at URL.
    """
    #https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/434482/Public_Database_-_May_2015.xlsx

    URL = "https://restats.decc.gov.uk/app/reporting/decc/monthlyextract/style/csv/csvwhich/reporting.decc.monthlyextract"

    def __init__(self):
        self.web = HttpsWithCookies()
        self.records = []

    def __len__(self):
        """ Return the number of records collected so far. """
        return len(self.records)

    def get_data(self):
        """ Fetch the CSV at URL and append a DeccRecord for every data row.

            The row whose first cell is 'Reference' is treated as the
            header: it is stored in the module level RECORD_FIELDS and is
            not turned into a record. Blank lines are ignored.

            Returns False when there is no response or the response code
            is not 200, otherwise True.
        """
        global RECORD_FIELDS
        resp = self.web.open(self.URL)
        if resp is None or resp.code != 200:
            return False

        if sys.version_info >= (3, 0):
            # The response yields bytes; the Python 3 csv module wants
            # text, so decode the lines lazily as they are read.
            csvfile = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        else:
            csvfile = csv.reader(resp)

        for row in csvfile:
            if not row:
                # csv.reader returns [] for a blank line, which would make
                # row[0] raise IndexError.
                continue
            if row[0] == 'Reference':
                RECORD_FIELDS = row
                continue
            self.records.append(DeccRecord(row))
        return True
Пример #4
0
 def __init__(self):
     """ Create an HttpsWithCookies instance and an empty record list. """
     self.web, self.records = HttpsWithCookies(), []
Пример #5
0
class OfgemForm(object):
    """ Class that represents a form from the Renewables & CHP Ofgem
        website.
        The url to the form should be supplied when creating the object.

        Instance attributes:
          web          - HttpsWithCookies object used for every request
          action       - URL the form is posted to (None until get_form()
                         has read it from the page)
          fields       - dict of field name -> OfgemField
          field_labels - dict of field name -> label text from the page
          data         - report content stored by get_data()
    """

    SITE_URL = 'https://www.renewablesandchp.ofgem.gov.uk/Public/'

    def __init__(self, endpoint):
        self.web = HttpsWithCookies()
        self.action = None
        self.fields = {}
        self.field_labels = {}
        self.data = None
        self.get_form(endpoint)

    def _get_field_labels(self, root):
        """ Attempt to get the labels for the various fields on the
            webform. These are in table rows with a parameter of
            isparameterrow set to true. The format is usually for
            5 columns, with the contents
            label : field : spacer : label : field

            Labels are stored in self.field_labels keyed by the field id
            with '_' replaced by '$' and a '$ctl00' suffix where the id
            does not already end in 'ctl00'.
        """
        for row in root.xpath("//tr[@isparameterrow='true']"):
            tds = row.xpath("td")
            # Step of 3 skips from one label cell to the next one.
            for i in range(0, len(tds), 3):
                label = tds[i].xpath('span')[0].text
                if label is None:
                    label = tds[i].xpath('span//font')[0].text
                _id = tds[i + 1].xpath('span')[0].get('id')
                if _id is None:
                    _id = tds[i + 1].xpath('span//font')[0].getchildren()[0].get('id')
                if _id is not None:
                    if not _id.endswith('ctl00'):
                        _id += '$ctl00'
                    self.field_labels[_id.replace('_', '$')] = label

    def get_form(self, _url):
        """ get_form() is used to request the initial form. Subsequent
            calls are made as required using the update_form() function.

            Populates self.action, self.field_labels and self.fields from
            the page at _url.

            Raises Exception when no document or no <form> is obtained.
        """
        root = self._get_form_document(_url)
        if root is None:
            # No url or no response: report it the same way as a page
            # without a form rather than failing on None.xpath().
            raise Exception("Failed to get the form")
        form = root.xpath("//form")
        if len(form) == 0:
            raise Exception("Failed to get the form")

        get_and_set_from_xml(self, form[0], ["action", "method"])
        if not self.action.startswith('http'):
            self.action = self.SITE_URL + self.action

        self._get_field_labels(form[0])

        for inp in form[0].xpath("//input"):
            if inp.get('type', '') in ['', 'image']:
                continue

            of = OfgemField(inp)

            if len(of.id) > 30:
                # is it likely to be a multi value field input choice?
                if of.id[25:30] == "ctl03":
                    # multi value field input choice...
                    lbls = form[0].xpath("//label[@for='%s']" % of.id)
                    if len(lbls) > 0:
                        of.label = lbls[0].text
                        # An empty <label> has text None.
                        if of.label is not None and of.label.isdigit():
                            of.label = int(of.label)
                    parent = of.id[:25].replace('_', '$') + 'ctl00'
                    if parent in self.fields:
                        self.fields[parent].options.append(of)
                    else:
                        print("Unknown parent...", parent)

            else:
                # Only the first radio input seen for a name is kept.
                if of.type != 'radio' or of.name not in self.fields:
                    self.fields[of.name] = of

        selects = form[0].xpath("//select")
        for s in selects:
            of = OfgemField(s)
            self.fields[of.name] = of
            for opt in s.xpath("option"):
                oo = OfgemField(opt)
                oo.label = opt.text
                # An empty <option> has text None.
                if oo.label is not None and oo.label.isdigit():
                    oo.label = int(oo.label)
                of.options.append(oo)

        for fld in self.fields.values():
            fld.set_postback_flag()

    def _get_or_create_field(self, name):
        """ Return the field called name, adding a hidden input field
            with an empty value when it does not exist yet.
        """
        if name in self.fields:
            return self.fields[name]
        node = etree.Element("input", value="", type="hidden", name=name)
        of = OfgemField(node)
        self.fields[of.name] = of
        return of

    def set_value(self, fld_lbl, val):
        """ Set the field whose label contains fld_lbl to val.
            Returns False when no such field is found, otherwise True.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False

        fld.set_value(val)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def set_value_by_label(self, fld_lbl, opt):
        """ Select the option labelled opt on the field whose label
            contains fld_lbl.
            Returns False when no such field is found, otherwise True.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False
        fld.set_value_by_label(opt)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def get_options(self, fld_lbl):
        """ Return a dict of option label -> option value for the field
            whose label contains fld_lbl ({} when there is no such field).
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return {}
        return dict((opt.label, opt.value) for opt in fld.options)

    def set_options_by_label(self, lbl, vals):
        """ Pass vals to set_values() of the field whose label contains
            lbl.
            Returns False when no such field is found, otherwise True.
        """
        fld = self._find_field_by_label(lbl)
        if fld is None:
            return False
        fld.set_values(vals)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def as_post_data(self):
        """ Return a dict of name -> value built from the as_post_data()
            entries of every field.
        """
        post_data = {}
        for v in self.fields.values():
            for fld in v.as_post_data():
                post_data[fld['name']] = fld['value']
        return post_data

    def set_output_type(self, what):
        """ Select the option of the 'ReportViewer$ctl01$ctl05$ctl00'
            field whose value equals what (case insensitive).

            Returns False when the form has no such field. Returns True
            otherwise, even when no option matched.
        """
        try:
            fld = self.fields['ReportViewer$ctl01$ctl05$ctl00']
        except KeyError:
            return False

        for opt in fld.options:
            if opt.value.lower() == what.lower():
                fld.set_value(opt.value)
                self.update_validation(fld.name)
                break
        return True

    def update_validation(self, name):
        """ Post the form with __EVENTTARGET set to name and copy the
            __EVENTVALIDATION and __VIEWSTATE values found in the
            response into the matching fields.

            Returns False when no document was obtained, otherwise True.
        """
        self._get_or_create_field('__EVENTTARGET').value = name
        root = self._get_form_document()
        if root is None:
            return False
        for hidden in ('__EVENTVALIDATION', '__VIEWSTATE'):
            # '//' searches the whole document. A bare "input[...]" path
            # only looks at direct children of the root element, so it
            # could never find inputs nested inside the form.
            found = root.xpath("//input[@name='%s']" % hidden)
            if len(found):
                self._get_or_create_field(hidden).value = found[0].get('value')
        return True

    def _get_form_document(self, url=None):
        """ Fetch and parse a page, returning its root element.

            While self.action is not set the page at url is requested
            (relative urls are prefixed with SITE_URL). Once it is set,
            the current field values are posted to self.action instead.

            Returns None when there is nothing to request or web.open()
            returns None.
        """
        if self.action is None:
            if url is None:
                return None
            if not url.startswith('http'):
                url = self.SITE_URL + url
            resp = self.web.open(url)
        else:
            # urlencode lives in urllib.parse on Python 3 and in urllib
            # on Python 2.
            try:
                from urllib.parse import urlencode
            except ImportError:
                from urllib import urlencode
            resp = self.web.open(self.action, urlencode(self.as_post_data()))
        if resp is None:
            return None
        document = html5lib.parse(resp,
                                  treebuilder="lxml",
                                  namespaceHTMLElements=False)

        return document.getroot()

    def get_data(self):
        """ Submit the form and download the resulting report into
            self.data.

            The data URL is the second to last comma separated argument
            of the "new RSToolbar(...)" call found in the returned page's
            scripts. A URL that does not start with 'http' is prefixed
            with SITE_URL and suffixed with the option value of the
            'ReportViewer$ctl01$ctl05$ctl00' field.

            Returns True when non-empty data was stored, False on any
            failure.
        """
        self._get_or_create_field('__EVENTTARGET').value = 'ReportViewer$ctl00$ctl05'
        root = self._get_form_document()
        if root is None:
            return False

        data_url = None
        for script in root.xpath("//script"):
            if script.text is None:
                continue
            if "RSToolbar(" in script.text:
                ck = re.search(r"new RSToolbar\((.*)\);", script.text)
                if ck is None:
                    return False

                data_url = ck.group(1).split(',')[-2].replace('"', '').strip()
                if not data_url.startswith('http'):
                    data_url = self.SITE_URL + data_url + \
                               self.fields['ReportViewer$ctl01$ctl05$ctl00'].option_value()
                break
        if data_url is None:
            return False

        docresp = self.web.open(data_url)
        if docresp is None or docresp.code != 200:
            return False

        self.data = docresp.read()
        if len(self.data) == 0:
            return False

        if docresp.headers['content-type'] == 'text/plain':
            # data is sent as utf-16, so convert to utf-8
            self.data = self.data.decode('utf-16').encode('utf-8')

        return True

    def _find_field_by_label(self, lbl):
        """ Return the field for the first label that contains lbl
            (case insensitive), or None.

            None is also returned when the first matching label has no
            entry in self.fields; later labels are not tried.
        """
        wanted = lbl.lower()
        for k, v in self.field_labels.items():
            if v is None:
                # _get_field_labels() can store a label of None.
                continue
            if wanted in v.lower():
                return self.fields.get(k)
        return None

    def add_filter(self, what, val):
        """ Attempt to add a filter.

            A field without options simply gets val as its value; a field
            with options is passed to filter_values(val). The form is
            posted back when the filter was applied.

            Returns the result of applying the filter (False when the
            field is not found).
        """
        rv = False
        fld = self._find_field_by_label(what)
        if fld is not None:
            if len(fld.options) == 0:
                fld.value = val
                rv = True
            else:
                rv = fld.filter_values(val)
            if rv:
                self.update_validation(fld.name)
        return rv

    def set_text_value(self, fld_lbl, txt):
        """ Set the text of the field whose label contains fld_lbl.
            Returns False when no such field is found, otherwise True.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False

        poss = fld.name[:-2] + '01'
        # Some text fields have a $ctl01 which is a checkbox for
        # NULL, so check for it
        if poss in self.fields:
            self.fields[poss].set_value(False)
            self.update_validation(poss)

        fld.set_text(txt)
        self.update_validation(fld.name)
        return True

    def dump(self):
        """ Print every field (by label when one is known, otherwise by
            name) followed by its options.
        """
        for k, v in self.fields.items():
            if k in self.field_labels:
                print(self.field_labels[k])
            else:
                print(k)
            for opts in v.options:
                print("      %50s : %s" % (opts.label, opts.raw_value()))

    def dump_post_data(self):
        """ Print the name and value of every entry of as_post_data(). """
        for k, v in self.as_post_data().items():
            print("%-30s: %s" % (k, v))
Пример #6
0
 def __init__(self):
     """ Create an HttpsWithCookies instance and an empty record list. """
     self.web, self.records = HttpsWithCookies(), []