def get_id_batch(self, date_from, date_to):
    this_dt = date_from
    final_result = []
    while this_dt <= date_to:
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())
        fields = {}
        fields[self._date_from_field] = this_dt.strftime(self._request_date_format)
        fields[self._date_to_field] = this_dt.strftime(self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
        interim_result = []
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and len(interim_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                for rec in result['records']:
                    rec[self._date_type] = fields[self._date_to_field]
                self._clean_ids(result['records'])
                interim_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(interim_result) >= max_recs:
                break
            try:
                result = scrapemark.scrape(self._scrape_next_submit, html)
                next_submit = result['next_submit']
                scrapeutils.setup_form(self.br, self._search_form)
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br, next_submit)
                html = response.read()
            except:
                # failure to find next page link at end of page sequence here
                self.logger.debug("No next form link after %d pages", page_count)
                break
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
        final_result.extend(interim_result)
        this_dt += timedelta(days=1)
    return final_result
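# A minimal usage sketch (not part of the scraper classes above), assuming a
# concrete scraper instance that exposes the get_id_batch(date_from, date_to)
# method; the helper name harvest_last_week and the 7-day window are
# illustrative assumptions only.
from datetime import date, timedelta

def harvest_last_week(scraper):
    # gather one week of application ids ending today and de-duplicate on 'uid'
    date_to = date.today()
    date_from = date_to - timedelta(days=7)
    records = scraper.get_id_batch(date_from, date_to)
    unique = {}
    for rec in records:
        unique.setdefault(rec['uid'], rec)  # keep the first record seen per uid
    return list(unique.values())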
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    #self.logger.debug("ID detail start html: %s", response.read())
    fields = self._applic_fields
    fields[self._ref_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    sub_html = self._BADCHARS_REGEX.sub(' ', html)
    #self.logger.debug("detail page html: %s", sub_html)
    expired = scrapemark.scrape(self._scrape_expired, sub_html)
    while expired:
        response = self.br.reload()
        html = response.read()
        sub_html = self._BADCHARS_REGEX.sub(' ', html)
        expired = scrapemark.scrape(self._scrape_expired, sub_html)
    url = response.geturl()
    result = scrapemark.scrape(self._scrape_ids, sub_html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for r in result['records']:
            if r.get('uid', '') == uid and r.get('url'):
                self.logger.debug("Scraped url: %s", r['url'])
                return self.get_html_from_url(r['url'])
    return None, None
def get_id_batch(self, date_from, date_to):
    final_result = []
    for case in self._case_prefixes:
        interim_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())
        fields = {self._ref_field: case}
        fields[self._date_from_field] = date_from.strftime(self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                interim_result.extend(result['records'])
            elif not interim_result:  # is it a single record?
                single_result = scrapemark.scrape(self._scrape_one_id, html, url)
                if single_result:
                    self._clean_record(single_result)
                    interim_result = [single_result]
                break
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                result = scrapemark.scrape(self._scrape_next_link, html, url)
                response = self.br.open(result['next_link'])
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
        final_result.extend(interim_result)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    id_list = []
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            # add IDs one by one and test for duplicates
            for r in result['records']:
                if r['uid'] not in id_list:
                    final_result.append(r)
                    id_list.append(r['uid'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            response = self.br.follow_link(text=self._next_link)
            html = response.read()
            #self.logger.debug("ID next page html: %s", html)
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s" % response.read())
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0  # note max recs is in the footer which is omitted if only one page of results
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        if max_recs == 0:  # one page, no footer
            result = scrapemark.scrape(self._scrape_ids_no_foot, html, url)
        else:
            result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if max_recs == 0 or len(final_result) >= max_recs:
            break
        try:
            result = scrapemark.scrape(self._scrape_next_submit, html)
            scrapeutils.setup_form(self.br, self._search_form)
            #self.logger.debug("Next page form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br, result['next_submit'])
            html = response.read()
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    page_count = 0
    new_date_from = date_from - timedelta(days=1)  # NB from date is exclusive
    dfrom = new_date_from.strftime(self._request_date_format)
    dto = date_to.strftime(self._request_date_format)
    url = self._search_url + '?' + self._page_params % (dfrom, dto, page_count * 10)
    self.logger.debug("Start URL: %s", url)
    response = self.br.open(url)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs1, html)
        max_recs = int(result['max_recs'])
    except:
        try:
            result = scrapemark.scrape(self._scrape_max_recs2, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            url = self._search_url + '?' + self._page_params % (dfrom, dto, page_count * 10)
            self.logger.debug("Next URL: %s", url)
            response = self.br.open(url)
            html = response.read()
        except:
            # failure to find next page link at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    fields[self._uid_field] = ''
    self.logger.debug("Fields: %s", str(fields))
    url = self._search_url + '?' + urllib.urlencode(fields)
    response = self.br.open(url)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            nextp = (page_count * 20) + 1
            fields['is_NextRow'] = str(nextp)
            self.logger.debug("Next fields: %s", str(fields))
            url = self._search_url + '?' + urllib.urlencode(fields)
            response = self.br.open(url)
            html = response.read()
            #self.logger.debug("ID next page html: %s", html)
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    new_date_from = date_from - timedelta(days=1)  # start date is exclusive, decrement start date by one day
    date_to = date_to + timedelta(days=1)  # end date is exclusive, increment end date by one day
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields[self._date_from_field] = new_date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            result = scrapemark.scrape(self._scrape_next, html, url)
            response = self.br.open(result['next_link'])
            html = response.read()
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to): final_result = [] response = self.br.open(self._search_url) self._adjust_response(response) #self.logger.debug("First page html: %s", response.read()) self.logger.debug(scrapeutils.list_forms(self.br)) fields = {} fields[self._date_from_field] = date_from.strftime( self._request_date_format) fields[self._date_to_field] = date_to.strftime( self._request_date_format) scrapeutils.setup_form(self.br, self._search_form, fields) action = self.br.form.action self.br.form.action = action.replace('https://', 'http://') self.logger.debug("ID batch form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br) page_count = 0 max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop while response and page_count < max_pages: html = response.read() url = response.geturl() #self.logger.debug("ID batch page html: %s", html) result = scrapemark.scrape(self._scrape_ids, html, url) if result and result.get('records'): page_count += 1 self._clean_ids(result['records']) final_result.extend(result['records']) else: self.logger.debug("Empty result after %d pages", page_count) break try: if 'next_link' in self._scrape_next: result = scrapemark.scrape(self._scrape_next, html, url) response = self.br.open(result['next_link']) else: scrapeutils.setup_form(self.br, self._scrape_next) action = self.br.form.action self.br.form.action = action.replace('https://', 'http://') self.logger.debug("ID next form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br) except: # normal failure to find next page form at end of page sequence here self.logger.debug("No next form/link after %d pages", page_count) break if page_count >= max_pages: self.logger.warning( "Too many page requests - %d - probable run away loop" % page_count) return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    response = self.br.open(self._search_url + '?' + urllib.urlencode(fields))
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        elif not final_result:  # is it a single record?
            single_result = scrapemark.scrape(self._scrape_id, html, url)
            if single_result:
                single_result['url'] = url
                self._clean_record(single_result)
                return [single_result]
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            response = self.br.follow_link(text=self._link_next)
            html = response.read()
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_records(self, request_from, request_to, max_recs):
    if not request_from or not request_to or not max_recs:
        return [], None, None  # if any parameter invalid - try again next time
    from_rec = int(request_from)
    to_rec = int(request_to)
    num_recs = int(max_recs)
    if from_rec < 1:
        if to_rec < 1:  # both too small
            return [], None, None
        from_rec = 1
    if to_rec > num_recs:
        if from_rec > num_recs:  # both too large
            return [], None, None
        to_rec = num_recs
    final_result = []
    rfrom = None
    rto = None
    n = to_rec - from_rec + 1
    if self.over_sequence(to_rec):  # at max sequence and gathering forward
        ii, yy, from_rec = self.split_sequence(from_rec, True)
    else:
        ii, yy, from_rec = self.split_sequence(from_rec, False)
    to_rec = from_rec + n - 1
    in_current_year = False
    this_year = date.today().year
    for i in range(from_rec, to_rec + 1):
        index, year, new_seq = self.split_sequence(i)
        if year == this_year and index > 0:
            in_current_year = True
        if rfrom is None:
            rfrom = i
        rto = new_seq
        found = False
        for prefix in self._prefixes:
            uid = prefix + self.get_uid(index, year)
            html, url = self.get_html_from_uid(uid)
            result = scrapemark.scrape(self._scrape_min_data, html)
            if result and result.get('reference'):
                final_result.append({'url': url, 'uid': uid})
                found = True
                break
        if not found:
            result = scrapemark.scrape(self._scrape_invalid_format, html)
            if result and result.get('invalid_format'):
                self.logger.debug("No valid record for uid ?/%s/%s" % (str(year), str(index).zfill(self._index_digits)))
            else:
                return [], None, None  # not recognised as bad data - something is wrong - exit
    if not in_current_year or final_result:
        return final_result, rfrom, rto
    else:
        return [], None, None  # empty result is invalid if any of the results are in the current year
def get_id_batch(self, date_from, date_to):
    new_date_to = date_to + timedelta(days=1)  # increment end date by one day
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("ID batch start html: %s", response.read())
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = new_date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            for res in result['records']:
                if res.get('uid'):  # one uid on 1 dec 2015 is empty
                    final_result.append(res)
        elif not final_result:  # is it a single record?
            single_result = scrapemark.scrape(self._scrape_one_id, html, url)
            if single_result:
                self._clean_record(single_result)
                final_result = [single_result]
            break
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            result = scrapemark.scrape(self._scrape_next_link, html, url)
            response = self.br.open(result['next_link'])
        except:
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_period(self, this_date):
    final_result = []
    from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
    do_dt = to_dt  # the date being tested - can change
    rurl = urlparse.urljoin(self._search_url, self._results_page)
    for i in range(5):  # note works backwards through all 5 possible week days as some lists are not published exactly on a Friday
        fields = {}
        fields.update(self._query_fields)
        fields[self._date_field] = do_dt.strftime(self._request_date_format)
        response = self.br.open(rurl + '?' + urllib.urlencode(fields))
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                new_html = self._junk_regex.sub('', html)  # remove internal junk characters
                result = scrapemark.scrape(self._scrape_next_link, new_html, url)
                next_link = self._space_regex.sub('', result['next_link'])  # remove all spaces
                response = self.br.open(next_link)
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break
        do_dt = do_dt - timedelta(days=1)  # try again with a different date
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
def max_sequence(self):
    response = self.br.open(self._search_url)  # one fixed page of records
    html = response.read()
    url = response.geturl()
    result1 = scrapemark.scrape(self._scrape_ids, html, url)
    result2 = scrapemark.scrape(self._scrape_ids_withdrawn, html, url)  # no longer listed
    total = 0
    if result1 and result1.get('records'):
        total += len(result1['records'])
    if result2 and result2.get('records'):
        total += len(result2['records'])
    return total if total else None
def get_id_batch (self, date_from, date_to): final_result = [] response = self.br.open(self._search_url) scrapeutils.setup_form(self.br, self._search_form) self.logger.debug("Start form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br) fields = {} fields.update(self._search_fields) fields[self._date_from_field] = date_from.strftime(self._request_date_format) fields[self._date_to_field] = date_to.strftime(self._request_date_format) scrapeutils.setup_form(self.br, self._search_form, fields) self.logger.debug("ID batch form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br, self._search_submit) html = response.read() runaway_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop try: result = scrapemark.scrape(self._scrape_max_pages, html) max_pages = int(result['max_pages']) except: max_pages = runaway_pages page_count = 0 while html and page_count < max_pages: url = response.geturl() #self.logger.debug("Batch html: %s" % html) result = scrapemark.scrape(self._scrape_ids, html, url) if result and result.get('records'): page_count += 1 self._clean_ids(result['records']) final_result.extend(result['records']) else: self.logger.debug("Empty result after %d pages", page_count) break if page_count >= max_pages: break try: scrapeutils.setup_form(self.br, self._search_form) self.logger.debug("ID next form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br, self._next_submit) html = response.read() except: # normal failure to find next page form at end of page sequence here self.logger.debug("No next form after %d pages", page_count) break if page_count >= runaway_pages: self.logger.warning("Too many page requests - %d - probable run away loop" % page_count) return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    self.logger.debug("Fields: %s", str(fields))
    query = urllib.urlencode(fields)
    url = urlparse.urljoin(self._search_url, self._results_page) + '?' + query
    response = self.br.open(url)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            fields = {'__EVENTTARGET': self._next_target}
            fields['__EVENTARGUMENT'] = 'Page$' + str(page_count + 1)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            self.logger.debug("Next page form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
            html = response.read()
            #self.logger.debug("ID next page html: %s", html)
        except:
            # normal failure to find next page link at end of page sequence here
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to): final_result = [] response = self.br.open(self._search_url) self._adjust_response(response) #self.logger.debug("Start html: %s", response.read()) fields = self._search_fields fields[self._date_from_field] = date_from.strftime( self._request_date_format) fields[self._date_to_field] = date_to.strftime( self._request_date_format) scrapeutils.setup_form(self.br, self._search_form, fields) #self.logger.debug("ID batch form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br, self._search_submit) page_count = 0 max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop while response and page_count < max_pages: html = response.read() url = response.geturl() sub_html = self._BADCHARS_REGEX.sub(' ', html) ##self.logger.debug("ID batch page html: %s", sub_html) result = scrapemark.scrape(self._scrape_ids, sub_html, url) if result and result.get('records'): page_count += 1 self._clean_ids(result['records']) final_result.extend(result['records']) else: self.logger.debug("Empty result after %d pages", page_count) break try: result = scrapemark.scrape(self._scrape_next_link, sub_html, url) #print result next_url = myutils.GAPS_REGEX.sub('', result['next_link']) self.logger.debug("ID next url: %s", next_url) response = self.br.open(next_url) self._adjust_response(response) except: # normal failure to find next page link at end of page sequence here self.logger.debug("No next link after %d pages", page_count) break if page_count >= max_pages: self.logger.warning( "Too many page requests - %d - probable run away loop" % page_count) return final_result
def _get_results_pages(self, result_param, final_page=None):
    # note: returns a list of records, each annotated with its page number and record number
    final_result = []
    response = self.br.open(self._results_url + '?' + result_param)
    html = response.read()
    #self.logger.debug("Batch html: %s" % html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (4 * self.min_id_goal / 10) + 20  # guard against infinite loop
    if not final_page:
        end_page = max_pages
    else:
        end_page = final_page
    while html and len(final_result) < max_recs and page_count < end_page:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            recno = 0
            for r in result['records']:
                recno += 1
                r['pageno'] = page_count
                r['recno'] = recno
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            result = scrapemark.scrape(self._scrape_next_submit, html, url)
            scrapeutils.setup_form(self.br, self._result_form)
            #self.logger.debug("Next form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br, result['next_submit'])
            html = response.read()
        except:
            # note this should never happen as we know the max_recs value
            self.logger.error("No next button after %d pages", page_count)
            return []
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - possible run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to): final_result = [] response = self.br.open(self._search_url) #self.logger.debug("Start html: %s", response.read()) fields = {} fields.update(self._search_fields) fields[self._date_from_field] = date_from.strftime( self._request_date_format) fields[self._date_to_field] = date_to.strftime( self._request_date_format) scrapeutils.setup_form(self.br, self._search_form, fields) self.logger.debug("ID batch form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br) html = response.read() #self.logger.debug("ID batch page html: %s", html) result = scrapemark.scrape(self._scrape_max_pages, html) try: page_list = result['max_pages'].split() max_pages = len(page_list) except: max_pages = 1 page_count = 0 while response and page_count < max_pages: url = response.geturl() result = scrapemark.scrape(self._scrape_ids, html, url) if result and result.get('records'): page_count += 1 self._clean_ids(result['records']) final_result.extend(result['records']) else: self.logger.debug("Empty result after %d pages", page_count) break if page_count >= max_pages: break try: next_url = re.sub(r'pageno=\d*&', 'pageno=' + str(page_count + 1) + '&', url) self.logger.debug("ID next url: %s", next_url) response = self.br.open(next_url) html = response.read() except: # normal failure to find next page form at end of page sequence here self.logger.debug("No next url after %d pages", page_count) break return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    fields = {}
    fields.update(self._search_fields)
    date_from = date_from.strftime(self._request_date_format)
    date_parts = date_from.split('/')
    #fields[self._date_from_field['day']] = [ date_parts[0] ]
    #fields[self._date_from_field['month']] = [ date_parts[1] ]
    #fields[self._date_from_field['year']] = [ date_parts[2] ]
    fields[self._date_from_field['day']] = date_parts[0]
    fields[self._date_from_field['month']] = date_parts[1]
    fields[self._date_from_field['year']] = date_parts[2]
    date_to = date_to.strftime(self._request_date_format)
    date_parts = date_to.split('/')
    #fields[self._date_to_field['day']] = [ date_parts[0] ]
    #fields[self._date_to_field['month']] = [ date_parts[1] ]
    #fields[self._date_to_field['year']] = [ date_parts[2] ]
    fields[self._date_to_field['day']] = date_parts[0]
    fields[self._date_to_field['month']] = date_parts[1]
    fields[self._date_to_field['year']] = date_parts[2]
    #scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("ID batch form: %s", str(self.br.form))
    #response = scrapeutils.submit_form(self.br, self._search_submit)
    response = self.br.open(self._search_url, urllib.urlencode(fields))
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        url = response.geturl()
        html = response.read()
        self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            scrapeutils.setup_form(self.br, self._next_form, self._next_fields)
            self.logger.debug("ID next form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    new_date_to = date_to + timedelta(days=1)  # end date is exclusive, increment end date by one day
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = new_date_to.strftime(self._request_date_format)
    self.logger.debug("Fields: %s", str(fields))
    query = urllib.urlencode(fields)
    url = urlparse.urljoin(self._iframe_url, self._results_page)
    response = self.br.open(url, query)
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            for res in result['records']:
                if res.get('uid'):  # sometimes there are blank uids eg 18/3/15
                    final_result.append(res)
    return final_result
def get_id_records(self, request_from, request_to, max_recs):
    if not request_from or not request_to or not max_recs:
        return [], None, None  # if any parameter invalid - try again next time
    final_result = []
    from_rec = int(request_from)
    to_rec = int(request_to)
    num_recs = int(max_recs)
    if from_rec < 1:
        if to_rec < 1:  # both too small
            return [], None, None
        from_rec = 1
    if to_rec > num_recs:
        if from_rec > num_recs:  # both too large
            return [], None, None
        to_rec = num_recs
    rfrom = None
    rto = None
    for i in range(from_rec, to_rec + 1):
        uid = str(i)
        html, url = self.get_html_from_uid(uid)
        result = scrapemark.scrape(self._scrape_min_data, html)
        if result and result.get('reference'):
            final_result.append({'url': url, 'uid': uid})
            if rfrom is None:
                rfrom = i
            rto = i
        else:
            self.logger.debug("No valid record for uid %s", uid)
    if final_result:
        return final_result, rfrom, rto
    else:
        return [], None, None  # list scraper - so empty result is always invalid
def get_id_period(self, this_date):
    final_result = []
    #response = self.br.open(self._weekly_url)
    from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
    date_to = to_dt.strftime('X%d/X%m/%Y').replace('X0', 'X').replace('X', '')
    fields = {}
    fields.update(self._query_fields)
    fields[self._date_field] = date_to
    response = self.br.open(self._results_url, urllib.urlencode(fields))
    #scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("ID batch form: %s", str(self.br.form))
    #response = scrapeutils.submit_form(self.br)
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
    else:
        return [], None, None
    return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
def _get_details(self, html, this_url):
    """ Scrapes detailed information for one record given html and url
    - this is an optional hook to allow data from multiple linked pages to be merged """
    result = self._get_detail(html, this_url)
    if 'scrape_error' in result:
        return result
    try:
        temp_result = scrapemark.scrape(self._scrape_dates_link, html, this_url)
        dates_url = temp_result['dates_link']
        self.logger.debug("Dates url: %s", dates_url)
        response = self.br.open(dates_url)
        html, url = self._get_html(response)
    except:
        self.logger.warning("No link to dates page found")
    else:
        #self.logger.debug("Html obtained from dates url: %s", html)
        result2 = self._get_detail(html, url, self._scrape_dates_block, self._scrape_min_dates, self._scrape_optional_dates)
        if 'scrape_error' not in result2:
            result.update(result2)
        else:
            self.logger.warning("No information found on dates page")
    return result
def max_sequence(self):
    max_recs = None
    response = self.br.open(self._search_url)
    to_date = date.today() - timedelta(days=14)
    fields = {
        self._ref_field: '',
        self._date_field: to_date.strftime(self._request_date_format)
    }
    scrapeutils.setup_form(self.br, self._search_form, fields)
    response = scrapeutils.submit_form(self.br)
    html, url = self._get_html(response)
    result = scrapemark.scrape(self._scrape_ids, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        num_recs = 0
        for i in result['records']:
            try:
                num = int(i['uid'])
                if num > num_recs:
                    num_recs = num
            except:
                pass
        self.logger.debug('Number of records %d' % num_recs)
        if num_recs > 0:
            max_recs = num_recs
    return max_recs
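# A minimal driver sketch, assuming a scraper object that exposes max_sequence()
# and get_id_records(request_from, request_to, max_recs) in the forms shown
# above; the helper name walk_sequence and the batch size are illustrative
# assumptions, not part of the real scraper API.
def walk_sequence(scraper, batch_size=20):
    # walk the full record sequence in fixed-size batches
    max_recs = scraper.max_sequence()
    if not max_recs:
        return []
    collected = []
    start = 1
    while start <= max_recs:
        end = min(start + batch_size - 1, max_recs)
        records, rfrom, rto = scraper.get_id_records(start, end, max_recs)
        if not records:
            break  # an empty/invalid batch ends the walk
        collected.extend(records)
        start = end + 1
    return collected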
def get_html_from_uid(self, uid):
    if self._start_url:
        response = self.br.open(self._start_url)
    response = self.br.open(self._search_url)
    #self.logger.debug("search html: %s", response.read())
    #self.logger.debug(scrapeutils.list_forms(self.br))
    fields = {}
    fields.update(self._search_fields)
    if uid.isdigit():
        fields[self._ref_field] = uid
    else:
        fields[self._alt_ref_field] = uid
    scrapeutils.setup_form(self.br, self._ref_search_form, fields)
    #self.logger.debug("Uid form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._ref_search_submit)
    html, url = self._get_html(response)
    # note return here can be a single uid match page OR list of multiple matches
    result = scrapemark.scrape(self._scrape_ids, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        if len(result['records']) >= 1:
            fields = {}
            fields.update(self._search_fields)
            scrapeutils.setup_form(self.br, self._ref_search_form, fields)
            #self.logger.debug("Uid form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br, self._result_submit)
            return self._get_html(response)
        return None, None
    else:
        return html, url
def get_id_batch(self, date_from, date_to):
    final_result = []
    #response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s" % response.read())
    # fix buggy option list
    #html = response.get_data()
    #html = html.replace('<option value="7">8</option>', '<option value="7">7</option> <option value="8">8</option>')
    #response.set_data(html)
    #self.br.set_response(response)
    fields = {}
    fields.update(self._search_fields)
    dfrom = date_from.strftime('X%d/%B/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dfrom.split('/')
    fields[self._date_from_field['day']] = date_parts[0]
    fields[self._date_from_field['month']] = date_parts[1]
    fields[self._date_from_field['year']] = date_parts[2]
    dto = date_to.strftime('X%d/%B/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dto.split('/')
    fields[self._date_to_field['day']] = date_parts[0]
    fields[self._date_to_field['month']] = date_parts[1]
    fields[self._date_to_field['year']] = date_parts[2]
    #scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("ID batch form: %s", str(self.br.form))
    #response = scrapeutils.submit_form(self.br)
    url = self._results_url + '?' + urllib.urlencode(fields)
    #self.logger.debug("Result url: %s" % url)
    response = self.br.open(url)
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            response = self.br.follow_link(text=self._link_next)
        except:
            # normal failure to find next page form at end of page sequence here
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    if self._first_search:
        # launch the search facility page - this button appears only on the first opening of this url
        scrapeutils.setup_form(self.br)
        response = scrapeutils.submit_form(self.br)
        self._first_search = False
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._ref_search)
    fields.update(self._ref_page)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("Choose search form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    fields = {}
    fields.update(self._ref_page)
    fields[self._appno_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("Appno form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._submit_control)
    html, url = self._get_html(response)
    #self.logger.debug("Result html: %s", html)
    result = scrapemark.scrape(self._scrape_ids_ref, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for r in result['records']:
            if r.get('uid', '') == uid and r.get('control'):
                self.logger.debug("Scraped control: %s", r['control'])
                fields = {r['control']: uid}
                scrapeutils.setup_form(self.br, self._search_form, fields)
                #self.logger.debug("Detail form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
                return self._get_html(response)
    return None, None
def get_id_batch(self, date_from, date_to): final_result = [] response = self.br.open(self._search_url) #self.logger.debug("Start html: %s", response.read()) fields = {} fields.update(self._search_fields) fields[self._date_from_field] = date_from.strftime( self._request_date_format) fields[self._date_to_field] = date_to.strftime( self._request_date_format) scrapeutils.setup_form(self.br, self._search_form, fields) self.logger.debug("ID batch form: %s", str(self.br.form)) response = scrapeutils.submit_form(self.br) if response: html = response.read() url = response.geturl() #self.logger.debug("ID batch page html: %s", html) result = scrapemark.scrape(self._scrape_ids, html, url) if result and result.get('records'): self._clean_ids(result['records']) final_result.extend(result['records']) return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    fields = {}
    fields.update(self._search_fields)
    date_from = date_from.strftime(self._request_date_format)
    date_parts = date_from.split('/')
    fields[self._date_from_field['day']] = date_parts[0]
    fields[self._date_from_field['month']] = date_parts[1]
    fields[self._date_from_field['year']] = date_parts[2]
    date_to = date_to.strftime(self._request_date_format)
    date_parts = date_to.split('/')
    fields[self._date_to_field['day']] = date_parts[0]
    fields[self._date_to_field['month']] = date_parts[1]
    fields[self._date_to_field['year']] = date_parts[2]
    self.logger.debug("Fields: %s", str(fields))
    query = urllib.urlencode(fields)
    url = self._result_url + '?' + query
    response = self.br.open(url)
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
    return final_result
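# A condensed sketch of the paging pattern the get_id_batch variants above
# share, assuming caller-supplied callables fetch_first_page, fetch_next_page
# and scrape_ids; all names in this helper are illustrative assumptions, not
# part of the real scraper API.
def collect_paged_ids(fetch_first_page, fetch_next_page, scrape_ids, min_id_goal):
    final_result = []
    max_pages = (2 * min_id_goal // 10) + 20  # guard against an infinite paging loop
    page_count = 0
    html, url = fetch_first_page()
    while html and page_count < max_pages:
        records = scrape_ids(html, url)
        if not records:
            break  # an empty page ends the sequence
        page_count += 1
        final_result.extend(records)
        try:
            html, url = fetch_next_page(page_count)
        except Exception:
            break  # no next page link/form - the normal end of the sequence
    return final_result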