class MonthlyExtract(object):
    """Download and hold the records of the DECC monthly extract CSV.

    The CSV is fetched from ``URL``. Every data row is wrapped in a
    ``DeccRecord`` and kept in ``self.records``; the header row is stored in
    the module level ``RECORD_FIELDS`` global.
    """

    URL = "https://restats.decc.gov.uk/app/reporting/decc/monthlyextract/style/csv/csvwhich/reporting.decc.monthlyextract"

    def __init__(self):
        self.web = HttpsWithCookies()
        self.records = []

    def __len__(self):
        """Return the number of records collected so far."""
        return len(self.records)

    def get_data(self):
        """Fetch the CSV and append one record per data row.

        :returns: False when there is no response or its status code is not
                  200, True once every row has been processed.
        """
        global RECORD_FIELDS

        resp = self.web.open(self.URL)
        if resp is None or resp.code != 200:
            return False

        if sys.version_info >= (3, 0):
            # The csv module needs text on Python 3, so decode on the fly.
            csvfile = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        else:
            csvfile = csv.reader(resp)

        for row in csvfile:
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise
                # IndexError on them.
                continue
            if row[0] == 'Reference':
                # Header row: remember the column names, it is not a record.
                RECORD_FIELDS = row
                continue
            self.records.append(DeccRecord(row))
        return True
def __init__(self, endpoint):
    """Set up empty form state, then request the form at ``endpoint``.

    :param endpoint: passed unchanged to ``self.get_form()``.
    """
    self.web = HttpsWithCookies()
    # Nothing is known about the form until get_form() has run.
    self.action = self.data = None
    self.fields, self.field_labels = {}, {}
    self.get_form(endpoint)
class MonthlyExtract(object):
    """Download and hold the records of the DECC monthly extract CSV.

    See https://www.gov.uk/government/collections/renewables-statistics

    The CSV is fetched from ``URL``. Every data row is wrapped in a
    ``DeccRecord`` and kept in ``self.records``; the header row is stored in
    the module level ``RECORD_FIELDS`` global.
    """
    #https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/434482/Public_Database_-_May_2015.xlsx

    URL = "https://restats.decc.gov.uk/app/reporting/decc/monthlyextract/style/csv/csvwhich/reporting.decc.monthlyextract"

    def __init__(self):
        self.web = HttpsWithCookies()
        self.records = []

    def __len__(self):
        """Return the number of records collected so far."""
        return len(self.records)

    def get_data(self):
        """Fetch the CSV and append one record per data row.

        :returns: False when there is no response or its status code is not
                  200, True once every row has been processed.
        """
        global RECORD_FIELDS

        resp = self.web.open(self.URL)
        if resp is None or resp.code != 200:
            return False

        if sys.version_info >= (3, 0):
            # The csv module needs text on Python 3, so decode on the fly.
            csvfile = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        else:
            csvfile = csv.reader(resp)

        for row in csvfile:
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise
                # IndexError on them.
                continue
            if row[0] == 'Reference':
                # Header row: remember the column names, it is not a record.
                RECORD_FIELDS = row
                continue
            self.records.append(DeccRecord(row))
        return True
def __init__(self):
    """Create the HTTPS helper and start with an empty record list."""
    self.web = HttpsWithCookies()
    self.records = list()
class OfgemForm(object):
    """ Class that represents a form from the Renewables & CHP Ofgem website.

    The url to the form should be supplied when creating the object.
    """

    SITE_URL = 'https://www.renewablesandchp.ofgem.gov.uk/Public/'

    def __init__(self, endpoint):
        self.web = HttpsWithCookies()
        self.action = None
        self.fields = {}
        self.field_labels = {}
        self.data = None
        self.get_form(endpoint)

    def _get_field_labels(self, root):
        """ Attempt to get the labels for the various fields on the webform.

        These are in table rows with a parameter of isparameterrow set to
        true. The format is usually for 5 columns, with the contents
            label : field : spacer : label : field
        """
        for row in root.xpath("//tr[@isparameterrow='true']"):
            tds = row.xpath("td")
            for i in range(0, len(tds), 3):
                if i + 1 >= len(tds):
                    # A label cell without a following field cell.
                    break
                label = tds[i].xpath('span')[0].text
                if label is None:
                    label = tds[i].xpath('span//font')[0].text
                _id = tds[i + 1].xpath('span')[0].get('id')
                if _id is None:
                    # Fall back to the id of the first child of the <font>.
                    _id = tds[i + 1].xpath('span//font')[0][0].get('id')
                if _id is not None:
                    if not _id.endswith('ctl00'):
                        _id += '$ctl00'
                    # Labels are keyed by the '$' separated form of the id,
                    # which is what self.fields is looked up with.
                    self.field_labels[_id.replace('_', '$')] = label

    def get_form(self, _url):
        """ get_form() is used to request the initial form.

        Subsequent calls are made as required using the update_form()
        function.

        :raises Exception: if no document or no <form> could be obtained.
        """
        root = self._get_form_document(_url)
        if root is None:
            raise Exception("Failed to get the form")
        form = root.xpath("//form")
        if len(form) == 0:
            raise Exception("Failed to get the form")

        # presumably copies the form's action/method attributes onto self;
        # verify against get_and_set_from_xml
        get_and_set_from_xml(self, form[0], ["action", "method"])
        if not self.action.startswith('http'):
            self.action = self.SITE_URL + self.action

        self._get_field_labels(form[0])

        for inp in form[0].xpath("//input"):
            if inp.get('type', '') in ['', 'image']:
                continue
            of = OfgemField(inp)
            if len(of.id) > 30:
                # is it likely to be a multi value field input choice?
                if of.id[25:30] == "ctl03":
                    # multi value field input choice...
                    lbls = form[0].xpath("//label[@for='%s']" % of.id)
                    if len(lbls) > 0:
                        of.label = lbls[0].text
                        # An empty <label> has text None.
                        if of.label is not None and of.label.isdigit():
                            of.label = int(of.label)
                    parent = of.id[:25].replace('_', '$') + 'ctl00'
                    if parent in self.fields:
                        self.fields[parent].options.append(of)
                    else:
                        print("Unknown parent...", parent)
            else:
                # Only the first radio input of a given name is kept.
                if of.type != 'radio' or of.name not in self.fields:
                    self.fields[of.name] = of

        for s in form[0].xpath("//select"):
            of = OfgemField(s)
            self.fields[of.name] = of
            for opt in s.xpath("option"):
                oo = OfgemField(opt)
                oo.label = opt.text
                # An empty <option> has text None.
                if oo.label is not None and oo.label.isdigit():
                    oo.label = int(oo.label)
                of.options.append(oo)

        for fld in self.fields.values():
            fld.set_postback_flag()

    def _get_or_create_field(self, name):
        """ Return the field called name, adding a hidden input if needed. """
        if name in self.fields:
            return self.fields[name]
        node = etree.Element("input", value="", type="hidden", name=name)
        of = OfgemField(node)
        self.fields[of.name] = of
        return of

    def set_value(self, fld_lbl, val):
        """ Set the value of the field whose label contains fld_lbl.

        :returns: False if no such field was found, True otherwise.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False
        fld.set_value(val)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def set_value_by_label(self, fld_lbl, opt):
        """ Select the option labelled opt on the field labelled fld_lbl.

        :returns: False if no such field was found, True otherwise.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False
        fld.set_value_by_label(opt)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def get_options(self, fld_lbl):
        """ Return a dict of option label -> option value for a field.

        An empty dict is returned if the field was not found.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return {}
        opts = {}
        for opt in fld.options:
            opts[opt.label] = opt.value
        return opts

    def set_options_by_label(self, lbl, vals):
        """ Pass vals to set_values() of the field labelled lbl.

        :returns: False if no such field was found, True otherwise.
        """
        fld = self._find_field_by_label(lbl)
        if fld is None:
            return False
        fld.set_values(vals)
        if fld.postback:
            self.update_validation(fld.name)
        return True

    def as_post_data(self):
        """ Return a dict of name -> value built from every field. """
        post_data = {}
        for v in self.fields.values():
            for fld in v.as_post_data():
                post_data[fld['name']] = fld['value']
        return post_data

    def set_output_type(self, what):
        """ Choose the output format option whose value matches what.

        The comparison is case insensitive.

        :returns: False if the output format field does not exist, True
                  otherwise (even when no option matched).
        """
        try:
            fld = self.fields['ReportViewer$ctl01$ctl05$ctl00']
        except KeyError:
            return False
        wanted = what.lower()
        for opt in fld.options:
            if opt.value.lower() == wanted:
                fld.set_value(opt.value)
                self.update_validation(fld.name)
                break
        return True

    def update_validation(self, name):
        """ Post the form with name as the event target and store the
        __EVENTVALIDATION and __VIEWSTATE values found in the reply.

        :returns: False if no document could be obtained, True otherwise.
        """
        self._get_or_create_field('__EVENTTARGET').value = name
        root = self._get_form_document()
        if root is None:
            return False

        # The hidden inputs are nested inside the document, so search the
        # whole tree; a relative "input[...]" path only looks at the direct
        # children of the root element.
        for hidden in ('__EVENTVALIDATION', '__VIEWSTATE'):
            found = root.xpath("//input[@name='%s']" % hidden)
            if len(found):
                self._get_or_create_field(hidden).value = found[0].get('value')
        return True

    def _get_form_document(self, url = None):
        """ Fetch a page and return the root element of the parsed document.

        Until self.action has been set the supplied url is requested;
        afterwards the current field values are posted to self.action.

        :returns: the root element, or None if there is nothing to request
                  or no response was received.
        """
        if self.action is None:
            if url is None:
                return None
            if not url.startswith('http'):
                url = self.SITE_URL + url
            resp = self.web.open(url)
        else:
            # urlencode moved to urllib.parse in Python 3.
            try:
                from urllib.parse import urlencode
            except ImportError:
                from urllib import urlencode
            resp = self.web.open(self.action, urlencode(self.as_post_data()))

        if resp is None:
            return None

        document = html5lib.parse(resp, treebuilder="lxml",
                                  namespaceHTMLElements=False)
        return document.getroot()

    def get_data(self):
        """ Submit the form and download the report it refers to.

        The download URL is taken from the arguments of the
        "new RSToolbar(...)" call found in one of the page scripts. The
        downloaded content is stored in self.data.

        :returns: True if data was downloaded, False otherwise.
        """
        self._get_or_create_field('__EVENTTARGET').value = 'ReportViewer$ctl00$ctl05'
        root = self._get_form_document()
        if root is None:
            return False

        data_url = None
        for script in root.xpath("//script"):
            if script.text is None:
                continue
            if "RSToolbar(" in script.text:
                ck = re.search(r"new RSToolbar\((.*)\);", script.text)
                if ck is None:
                    return False
                # The url is the second to last argument of the call.
                data_url = ck.group(1).split(',')[-2].replace('"', '').strip()
                if not data_url.startswith('http'):
                    data_url = self.SITE_URL + data_url + \
                        self.fields['ReportViewer$ctl01$ctl05$ctl00'].option_value()
                break

        if data_url is None:
            return False

        docresp = self.web.open(data_url)
        if docresp is None or docresp.code != 200:
            return False

        self.data = docresp.read()
        if len(self.data) == 0:
            return False

        if docresp.headers['content-type'] == 'text/plain':
            # data is sent as utf-16, so convert to utf-8
            self.data = self.data.decode('utf-16').encode('utf-8')
        return True

    def _find_field_by_label(self, lbl):
        """ Return the field for the first label that contains lbl.

        The comparison is case insensitive. None is returned if no label
        matches or the first matching label has no entry in self.fields.
        """
        wanted = lbl.lower()
        for k, v in self.field_labels.items():
            if v is None:
                # Rows whose label text could not be found are stored as None.
                continue
            if wanted in v.lower():
                if k in self.fields:
                    return self.fields[k]
                return None
        return None

    def add_filter(self, what, val):
        """ Attempt to add a filter.

        A field without options simply has its value set; otherwise val is
        passed to the field's filter_values().

        :returns: True if the filter was applied, False otherwise.
        """
        rv = False
        fld = self._find_field_by_label(what)
        if fld is not None:
            if len(fld.options) == 0:
                fld.value = val
                rv = True
            else:
                rv = fld.filter_values(val)
            if rv:
                self.update_validation(fld.name)
        return rv

    def set_text_value(self, fld_lbl, txt):
        """ Set the text of the field whose label contains fld_lbl.

        :returns: False if no such field was found, True otherwise.
        """
        fld = self._find_field_by_label(fld_lbl)
        if fld is None:
            return False
        poss = fld.name[:-2] + '01'
        # Some text fields have a $ctl01 which is a checkbox for
        # NULL, so check for it
        if poss in self.fields:
            self.fields[poss].set_value(False)
            self.update_validation(poss)
        fld.set_text(txt)
        self.update_validation(fld.name)
        return True

    def dump(self):
        """ Print every field (by label when one is known) and its options. """
        for k, v in self.fields.items():
            if k in self.field_labels:
                print(self.field_labels[k])
            else:
                print(k)
            for opts in v.options:
                print(" %50s : %s" % (opts.label, opts.raw_value()))

    def dump_post_data(self):
        """ Print the name and value of everything that would be posted. """
        for k, v in self.as_post_data().items():
            print("%-30s: %s" % (k,v))