def get_html_from_uid(self, uid):
    if self._start_url:
        response = self.br.open(self._start_url)
    response = self.br.open(self._search_url)
    #self.logger.debug("search html: %s", response.read())
    #self.logger.debug(scrapeutils.list_forms(self.br))
    fields = {}
    fields.update(self._search_fields)
    if uid.isdigit():
        fields[self._ref_field] = uid
    else:
        fields[self._alt_ref_field] = uid
    scrapeutils.setup_form(self.br, self._ref_search_form, fields)
    #self.logger.debug("Uid form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._ref_search_submit)
    html, url = self._get_html(response)
    # note return here can be a single uid match page OR list of multiple matches
    result = scrapemark.scrape(self._scrape_ids, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        if len(result['records']) >= 1:
            fields = {}
            fields.update(self._search_fields)
            scrapeutils.setup_form(self.br, self._ref_search_form, fields)
            #self.logger.debug("Uid form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br, self._result_submit)
            return self._get_html(response)
        return None, None
    else:
        return html, url
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    self._adjust_response(response)
    #self.logger.debug("Start page html: %s", response.read())
    # get first brief application page
    fields = {}
    fields.update(self._search_fields_applic)
    fields['ctl00$sideBar$sdcAppSearch$ddlCaseType'] = uid[0:2]
    fields['ctl00$sideBar$sdcAppSearch$ddlCaseYear'] = uid[3:5]
    fields['ctl00$sideBar$sdcAppSearch$txtCaseNo'] = uid[6:11]
    scrapeutils.setup_form(self.br, self._search_form, fields)
    for control in self.br.form.controls:
        if control.type == "submit" or control.type == "image":
            control.disabled = True
    self.logger.debug("First applic form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    self._adjust_response(response)
    #self.logger.debug("First page: %s", response.read())
    # get second detailed application page
    scrapeutils.setup_form(self.br, self._search_form, self._detail_fields)
    for control in self.br.form.controls:
        if control.type == "submit" or control.type == "image":
            control.disabled = True
    self.logger.debug("Detail applic form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    self._adjust_response(response)
    html, url = self._get_html(response)
    #self.logger.debug("Detail page: %s", html)
    return html, url
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    if self._first_search:
        # launch the search facility page via a button that appears only on the first opening of this url
        scrapeutils.setup_form(self.br)
        response = scrapeutils.submit_form(self.br)
        self._first_search = False
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._ref_search)
    fields.update(self._ref_page)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("Choose search form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    fields = {}
    fields.update(self._ref_page)
    fields[self._appno_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("Appno form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._submit_control)
    html, url = self._get_html(response)
    #self.logger.debug("Result html: %s", html)
    result = scrapemark.scrape(self._scrape_ids_ref, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for r in result['records']:
            if r.get('uid', '') == uid and r.get('control'):
                self.logger.debug("Scraped control: %s", r['control'])
                fields = {r['control']: uid}
                scrapeutils.setup_form(self.br, self._search_form, fields)
                #self.logger.debug("Detail form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
                return self._get_html(response)
    return None, None
def get_id_batch(self, date_from, date_to):
    this_dt = date_from
    final_result = []
    while this_dt <= date_to:
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())
        fields = {}
        fields[self._date_from_field] = this_dt.strftime(self._request_date_format)
        fields[self._date_to_field] = this_dt.strftime(self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
        interim_result = []
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and len(interim_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                for rec in result['records']:
                    rec[self._date_type] = fields[self._date_to_field]
                self._clean_ids(result['records'])
                interim_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(interim_result) >= max_recs:
                break
            try:
                result = scrapemark.scrape(self._scrape_next_submit, html)
                next_submit = result['next_submit']
                scrapeutils.setup_form(self.br, self._search_form)
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br, next_submit)
                html = response.read()
            except:
                # normal failure to find the next page link at the end of the page sequence
                self.logger.debug("No next form link after %d pages", page_count)
                break
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
        final_result.extend(interim_result)
        this_dt += timedelta(days=1)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s" % response.read())
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    # note max recs is in the footer, which is omitted if there is only one page of results
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        if max_recs == 0:  # one page, no footer
            result = scrapemark.scrape(self._scrape_ids_no_foot, html, url)
        else:
            result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if max_recs == 0 or len(final_result) >= max_recs:
            break
        try:
            result = scrapemark.scrape(self._scrape_next_submit, html)
            scrapeutils.setup_form(self.br, self._search_form)
            #self.logger.debug("Next page form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br, result['next_submit'])
            html = response.read()
        except:
            # normal failure to find the next page form at the end of the page sequence
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url + '?advanced_search=true')
    scrapeutils.setup_form(self.br, self._search_form)
    #response = scrapeutils.submit_form(self.br, self._advanced_submit)
    #self.logger.debug("Start html: %s" % response.read())
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    #html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    #try:
    #    result = scrapemark.scrape(self._scrape_max_recs, html)
    #    max_recs = int(result['max_recs'])
    #except:
    #    max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    #while response and len(final_result) < max_recs and page_count < max_pages:
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        #if len(final_result) >= max_recs: break
        try:
            scrapeutils.setup_form(self.br, self._search_form, self._next_fields)
            #self.logger.debug("Next page form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
        except:
            # normal failure to find the next page form at the end of the page sequence
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    self._adjust_response(response)
    #self.logger.debug("First page html: %s", response.read())
    self.logger.debug(scrapeutils.list_forms(self.br))
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    action = self.br.form.action
    self.br.form.action = action.replace('https://', 'http://')
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            if 'next_link' in self._scrape_next:
                result = scrapemark.scrape(self._scrape_next, html, url)
                response = self.br.open(result['next_link'])
            else:
                scrapeutils.setup_form(self.br, self._scrape_next)
                action = self.br.form.action
                self.br.form.action = action.replace('https://', 'http://')
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
        except:
            # normal failure to find the next page form/link at the end of the page sequence
            self.logger.debug("No next form/link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("ID batch start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    fields['p_object_name'] = self._form_name + self._form_object_suffix
    fields[self._form_name + self._date_from_field_suffix] = date_from.strftime(self._request_date_format)
    fields[self._form_name + self._date_to_field_suffix] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s" % str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            scrapeutils.setup_form(self.br, self._next_form, self._next_page_fields)
            self.logger.debug("Next form: %s" % str(self.br.form))
            response = scrapeutils.submit_form(self.br)
        except:
            # normal failure to find the next page form at the end of the page sequence
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def max_sequence(self):
    max_recs = None
    response = self.br.open(self._search_url)
    to_date = date.today() - timedelta(days=14)
    fields = {
        self._ref_field: '',
        self._date_field: to_date.strftime(self._request_date_format)
    }
    scrapeutils.setup_form(self.br, self._search_form, fields)
    response = scrapeutils.submit_form(self.br)
    html, url = self._get_html(response)
    result = scrapemark.scrape(self._scrape_ids, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        num_recs = 0
        for i in result['records']:
            try:
                num = int(i['uid'])
                if num > num_recs:
                    num_recs = num
            except:
                pass
        self.logger.debug('Number of records %d' % num_recs)
        if num_recs > 0:
            max_recs = num_recs
    return max_recs
def get_html_from_uid(self, uid):
    if self._uid_match.match(uid):  # all numbers or /
        #fields = { self._uid_field: uid }
        url = self._applic_url + urllib.quote_plus(uid)
        return self.get_html_from_url(url)
    else:
        fields = {self._ref_field: uid}
        response = self.br.open(self._search_url)
        #self.logger.debug("ID detail start html: %s", response.read())
        scrapeutils.setup_form(self.br, self._search_form, fields)
        response = scrapeutils.submit_form(self.br)
        html, url = self._get_html(response)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            if len(result['records']) == 1 and result['records'][0].get('url'):
                url = result['records'][0]['url']
                self.logger.debug("Scraped url: %s", url)
                return self.get_html_from_url(url)
            else:
                for r in result['records']:
                    if r.get('uid', '') == uid and r.get('url'):
                        url = r['url']
                        self.logger.debug("Scraped url: %s", url)
                        return self.get_html_from_url(url)
        return None, None
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    dfrom = date_from.strftime('X%d/%b/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dfrom.split('/')
    fields[self._date_from_field['day']] = [date_parts[0]]
    fields[self._date_from_field['month']] = [date_parts[1]]
    fields[self._date_from_field['year']] = [date_parts[2]]
    dto = date_to.strftime('X%d/%b/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dto.split('/')
    fields[self._date_to_field['day']] = [date_parts[0]]
    fields[self._date_to_field['month']] = [date_parts[1]]
    fields[self._date_to_field['year']] = [date_parts[2]]
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
    return final_result
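# Hedged, self-contained illustration of the leading-zero-stripping date trick used in
# get_id_batch above: the 'X%d/...' format plus the two replace() calls removes a leading
# zero from the day field while leaving the rest of the date untouched. The helper name
# below is an assumption for illustration only; it is not part of the scraper classes.
from datetime import date

def _strip_leading_zero_day(d):
    # hypothetical helper: date(2015, 12, 7) -> '7/Dec/2015', date(2015, 12, 17) -> '17/Dec/2015'
    return d.strftime('X%d/%b/%Y').replace('X0', 'X').replace('X', '')

if __name__ == '__main__':
    print(_strip_leading_zero_day(date(2015, 12, 7)))   # 7/Dec/2015
    print(_strip_leading_zero_day(date(2015, 12, 17)))  # 17/Dec/2015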
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    fields = {}
    fields.update(self._search_fields)
    date_from = date_from.strftime(self._request_date_format)
    date_parts = date_from.split('/')
    #fields[self._date_from_field['day']] = [ date_parts[0] ]
    #fields[self._date_from_field['month']] = [ date_parts[1] ]
    #fields[self._date_from_field['year']] = [ date_parts[2] ]
    fields[self._date_from_field['day']] = date_parts[0]
    fields[self._date_from_field['month']] = date_parts[1]
    fields[self._date_from_field['year']] = date_parts[2]
    date_to = date_to.strftime(self._request_date_format)
    date_parts = date_to.split('/')
    #fields[self._date_to_field['day']] = [ date_parts[0] ]
    #fields[self._date_to_field['month']] = [ date_parts[1] ]
    #fields[self._date_to_field['year']] = [ date_parts[2] ]
    fields[self._date_to_field['day']] = date_parts[0]
    fields[self._date_to_field['month']] = date_parts[1]
    fields[self._date_to_field['year']] = date_parts[2]
    #scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("ID batch form: %s", str(self.br.form))
    #response = scrapeutils.submit_form(self.br, self._search_submit)
    response = self.br.open(self._search_url, urllib.urlencode(fields))
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        url = response.geturl()
        html = response.read()
        self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            scrapeutils.setup_form(self.br, self._next_form, self._next_fields)
            self.logger.debug("ID next form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
        except:
            # normal failure to find the next page form at the end of the page sequence
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_html_from_uid(self, uid):
    # note: gives a 500 error if the Referer header is not handled explicitly as below (also see S. Lanark)
    response = self.br.open(self._search_url)
    self._adjust_response(response)
    #self.logger.debug("ID detail start html: %s", response.read())
    fields = self._applic_fields
    fields[self._ref_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html, url = self._get_html(response)
    sub_html = self._BADCHARS_REGEX.sub(' ', html)
    #self.logger.debug("Detail page html: %s", sub_html)
    result = scrapemark.scrape(self._scrape_ids, sub_html, url)
    #print result
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for r in result['records']:
            if r.get('uid', '') == uid and r.get('url'):
                self.logger.debug("Scraped url: %s", r['url'])
                headers = {}
                headers.update(self._headers)
                headers['Referer'] = url
                self.br.addheaders = headers.items()
                return self.get_html_from_url(r['url'])
    return None, None
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    #self.logger.debug("ID detail start html: %s", response.read())
    fields = self._applic_fields
    fields[self._ref_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    sub_html = self._BADCHARS_REGEX.sub(' ', html)
    #self.logger.debug("detail page html: %s", sub_html)
    expired = scrapemark.scrape(self._scrape_expired, sub_html)
    while expired:
        response = self.br.reload()
        html = response.read()
        sub_html = self._BADCHARS_REGEX.sub(' ', html)
        expired = scrapemark.scrape(self._scrape_expired, sub_html)
    url = response.geturl()
    result = scrapemark.scrape(self._scrape_ids, sub_html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for r in result['records']:
            if r.get('uid', '') == uid and r.get('url'):
                self.logger.debug("Scraped url: %s", r['url'])
                return self.get_html_from_url(r['url'])
    return None, None
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    for case in self._case_prefixes:
        interim_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())
        fields = {self._ref_field: case}
        fields[self._date_from_field] = date_from.strftime(self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                interim_result.extend(result['records'])
            elif not interim_result:  # is it a single record?
                single_result = scrapemark.scrape(self._scrape_one_id, html, url)
                if single_result:
                    self._clean_record(single_result)
                    interim_result = [single_result]
                break
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                result = scrapemark.scrape(self._scrape_next_link, html, url)
                response = self.br.open(result['next_link'])
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
        final_result.extend(interim_result)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    for control in self.br.form.controls:
        if control.name == "dateaprecv_date:FROM:DATE":
            control.disabled = True
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    #result = scrapemark.scrape(self._scrape_max_pages, html)
    #try:
    #    page_list = result['max_pages'].split()
    #    max_pages = len(page_list)
    #except:
    #    max_pages = 1
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    page_count = 0
    while response and page_count < max_pages:
        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if page_count >= max_pages:
            break
        try:
            next_url = re.sub(r'pageno=\d*&', 'pageno=' + str(page_count + 1) + '&', url)
            self.logger.debug("ID next url: %s", next_url)
            response = self.br.open(next_url)
            #html = response.read()
        except:
            # normal failure to find the next page url at the end of the page sequence
            self.logger.debug("No next url after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
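# Hedged, self-contained illustration of the pageno URL rewrite used in get_id_batch above:
# re.sub swaps the existing pageno query value for the next page number. The sample URL and
# helper name are invented for demonstration only.
import re

def _next_page_url(url, next_page):
    # hypothetical helper mirroring the in-line re.sub call above
    return re.sub(r'pageno=\d*&', 'pageno=' + str(next_page) + '&', url)

if __name__ == '__main__':
    sample = 'http://example.org/results?pageno=1&other=x'  # hypothetical URL
    print(_next_page_url(sample, 2))  # http://example.org/results?pageno=2&other=x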
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    #self.logger.debug("ID detail start html: %s", response.read())
    fields = {self._ref_field: uid}
    scrapeutils.setup_form(self.br, self._search_form, fields)
    #self.logger.debug("Get UID form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    return self._get_html(response)
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    id_list = []
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            # add IDs one by one and test for duplicates
            for r in result['records']:
                if r['uid'] not in id_list:
                    final_result.append(r)
                    id_list.append(r['uid'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            response = self.br.follow_link(text=self._next_link)
            html = response.read()
            #self.logger.debug("ID next page html: %s", html)
        except:
            # normal failure to find the next page link at the end of the page sequence
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_html_from_uid(self, uid):
    response = self.br.open(self._disclaimer_url)
    scrapeutils.setup_form(self.br, self._search_form)
    #self.logger.debug("Disclaimer form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    url = urlparse.urljoin(self._search_url, self._detail_page) + '?AppNo=' + urllib.quote_plus(uid)
    return self.get_html_from_url(url)
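# Hedged, self-contained illustration of the detail URL construction used in get_html_from_uid
# above: urlparse.urljoin resolves the detail page against the search URL and urllib.quote_plus
# escapes the application number for the query string. The URLs and page name are invented for
# demonstration only (Python 2 stdlib, matching the surrounding code).
import urllib
import urlparse

if __name__ == '__main__':
    search_url = 'http://example.org/planning/search.aspx'  # hypothetical
    detail_page = 'detail.aspx'                             # hypothetical
    uid = 'AB/15/01234'
    url = urlparse.urljoin(search_url, detail_page) + '?AppNo=' + urllib.quote_plus(uid)
    print(url)  # http://example.org/planning/detail.aspx?AppNo=AB%2F15%2F01234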
def get_html_from_uid(self, uid):
    response = self.br.open(self._search_url)
    scrapeutils.setup_form(self.br, self._search_form)
    self.logger.debug("Start form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #self.logger.debug("ID detail start html: %s", response.read())
    fields = {self._ref_field: uid}
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("Get UID form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._ref_submit)
    html, url = self._get_html(response)
    result = scrapemark.scrape(self._scrape_ids, html, url)
    if result and result.get('records'):
        self._clean_ids(result['records'])
        for rr in result['records']:
            if rr.get('uid', '') == uid and rr.get('url'):
                return self.get_html_from_url(rr['url'])
    return None, None
def get_id_batch(self, date_from, date_to):
    final_result = []
    new_date_from = date_from - timedelta(days=1)  # start date is exclusive, decrement start date by one day
    date_to = date_to + timedelta(days=1)  # end date is exclusive, increment end date by one day
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s", response.read())
    fields = {}
    fields[self._date_from_field] = new_date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br, self._search_submit)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            result = scrapemark.scrape(self._scrape_next, html, url)
            response = self.br.open(result['next_link'])
            html = response.read()
        except:
            # normal failure to find the next page link at the end of the page sequence
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
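# Hedged, self-contained illustration of the exclusive date-bound adjustment used in
# get_id_batch above: this site treats both bounds as exclusive, so the start date is
# decremented and the end date incremented by one day before being put into the form.
# The helper name is an assumption for illustration only.
from datetime import date, timedelta

def _exclusive_bounds(date_from, date_to):
    # hypothetical helper mirroring the timedelta adjustments above
    return date_from - timedelta(days=1), date_to + timedelta(days=1)

if __name__ == '__main__':
    print(_exclusive_bounds(date(2015, 12, 1), date(2015, 12, 7)))
    # (datetime.date(2015, 11, 30), datetime.date(2015, 12, 8))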
def _get_exact_html_from_uid(self, uid):
    if self._start_url:
        response = self.br.open(self._start_url)
    response = self.br.open(self._search_url)
    #self.logger.debug("ID detail start html: %s", response.read())
    self.logger.debug(scrapeutils.list_forms(self.br))
    fields = {}
    fields.update(self._search_fields)
    fields[self._ref_field] = uid
    if self._ref_form:
        scrapeutils.setup_form(self.br, self._ref_form, fields)
    else:
        scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("Uid form: %s", str(self.br.form))
    if self._ref_submit:
        response = scrapeutils.submit_form(self.br, self._ref_submit)
    else:
        response = scrapeutils.submit_form(self.br, self._search_submit)
    return self._get_html(response)
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._search_url)
    fields = {}
    fields.update(self._search_fields)
    dfrom = date_from.strftime('X%d/%B/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dfrom.split('/')
    fields[self._date_from_field['day']] = [date_parts[0]]
    fields[self._date_from_field['month']] = [date_parts[1]]
    fields[self._date_from_field['year']] = [date_parts[2]]
    dto = date_to.strftime('X%d/%B/%Y').replace('X0', 'X').replace('X', '')
    date_parts = dto.split('/')
    fields[self._date_to_field['day']] = [date_parts[0]]
    fields[self._date_to_field['month']] = [date_parts[1]]
    fields[self._date_to_field['year']] = [date_parts[2]]
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    #try:
    #    result = scrapemark.scrape(self._scrape_max_recs, html)
    #    max_recs = int(result['max_recs'])
    #except:
    #    max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    #while response and len(final_result) < max_recs and page_count < max_pages:
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        #if len(final_result) >= max_recs: break
        try:
            response = self.br.follow_link(text=self._link_next)
        except:
            # normal failure to find the next page link at the end of the page sequence
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    new_date_to = date_to + timedelta(days=1)  # increment end date by one day
    final_result = []
    response = self.br.open(self._search_url)
    #self.logger.debug("ID batch start html: %s", response.read())
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = new_date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and page_count < max_pages:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            for res in result['records']:
                if res.get('uid'):  # one uid on 1 Dec 2015 is empty
                    final_result.append(res)
        elif not final_result:  # is it a single record?
            single_result = scrapemark.scrape(self._scrape_one_id, html, url)
            if single_result:
                self._clean_record(single_result)
                final_result = [single_result]
            break
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        try:
            result = scrapemark.scrape(self._scrape_next_link, html, url)
            response = self.br.open(result['next_link'])
        except:
            self.logger.debug("No next link after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    # note end date is exclusive
    final_result = []
    response = self.br.open(self._search_url)
    self._adjust_response(response)
    #self.logger.debug("ID batch start html: %s", response.read())
    new_date_to = date_to + timedelta(days=1)  # end date is exclusive, increment end date by one day
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = new_date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #try:
    #    result = scrapemark.scrape(self._scrape_max_recs, html)
    #    max_recs = int(result['max_recs'])
    #except:
    #    max_recs = 1
    #self.logger.debug("Max recs: %d", max_recs)
    page_count = 0
    #max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    #while response and len(final_result) < max_recs and page_count < max_pages:
    if response:
        html = response.read()
        url = response.geturl()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            #page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        #else:
        #    self.logger.debug("Empty result after %d pages", page_count)
        #    break
        #if len(final_result) >= max_recs: break
        #try:
        #    result = scrapemark.scrape(self._scrape_next_link, html, url)
        #    response = self.br.open(result['next_link'])
        #    html = response.read()
        #except:
        #    self.logger.debug("No next link after %d pages", page_count)
        #    break
    #if page_count >= max_pages:
    #    self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
def get_id_batch(self, date_from, date_to):
    final_result = []
    response = self.br.open(self._start_url)
    scrapeutils.setup_form(self.br, self._search_form, self._start_fields)
    self.logger.debug("ID start form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #response = self.br.open(self._search_url)
    fields = {}
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID batch form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #response = self.br.open(self._direct_url, urllib.urlencode(fields))
    if response:
        html = response.read()
        self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
        url = response.geturl()
        #self.logger.debug("Batch html: %s" % html)
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
    return final_result
def _get_html_from_uid(self, uid):
    # note: the return here can be a single uid match page OR no match;
    # a list of multiple matches, however, is an error
    response = self.br.open(self._search_url)
    #self.logger.debug("Start html: %s" % response.read())
    fields = {}
    fields.update(self._search_fields)
    fields[self._ref_field] = uid
    scrapeutils.setup_form(self.br, self._search_form, fields)
    self.logger.debug("ID ref form: %s", str(self.br.form))
    #response = scrapeutils.submit_form(self.br, self._ref_submit)
    response = scrapeutils.submit_form(self.br)
    return self._get_html(response)
def get_id_batch(self, date_from, date_to):
    final_result = []
    fields = {}
    fields.update(self._search_fields)
    fields[self._date_from_field] = date_from.strftime(self._request_date_format)
    fields[self._date_to_field] = date_to.strftime(self._request_date_format)
    self.logger.debug("Fields: %s", str(fields))
    query = urllib.urlencode(fields)
    url = urlparse.urljoin(self._search_url, self._results_page) + '?' + query
    response = self.br.open(url)
    html = response.read()
    #self.logger.debug("ID batch page html: %s", html)
    try:
        result = scrapemark.scrape(self._scrape_max_recs, html)
        max_recs = int(result['max_recs'])
    except:
        max_recs = 0
    page_count = 0
    max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
    while response and len(final_result) < max_recs and page_count < max_pages:
        url = response.geturl()
        result = scrapemark.scrape(self._scrape_ids, html, url)
        if result and result.get('records'):
            page_count += 1
            self._clean_ids(result['records'])
            final_result.extend(result['records'])
        else:
            self.logger.debug("Empty result after %d pages", page_count)
            break
        if len(final_result) >= max_recs:
            break
        try:
            fields = {'__EVENTTARGET': self._next_target}
            fields['__EVENTARGUMENT'] = 'Page$' + str(page_count + 1)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            self.logger.debug("Next page form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
            html = response.read()
            #self.logger.debug("ID next page html: %s", html)
        except:
            # normal failure to find the next page link at the end of the page sequence
            self.logger.debug("No next form after %d pages", page_count)
            break
    if page_count >= max_pages:
        self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
    return final_result
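# Hedged sketch of the ASP.NET paging postback built in get_id_batch above: the next page of
# results is requested by setting __EVENTTARGET to the paging control and __EVENTARGUMENT to
# 'Page$<n>'. The helper name and the control name used in the demo are assumptions for
# illustration only; the scraper takes the real target from self._next_target.
def _build_next_page_fields(next_target, next_page):
    # hypothetical helper mirroring the field construction above
    fields = {'__EVENTTARGET': next_target}
    fields['__EVENTARGUMENT'] = 'Page$' + str(next_page)
    return fields

if __name__ == '__main__':
    print(_build_next_page_fields('ctl00$grdResults', 2))  # hypothetical control name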
def get_id_records(self, request_from, request_to, max_recs):
    if not request_from or not request_to or not max_recs:
        return [], None, None  # if any parameter is invalid - try again next time
    final_result = []
    from_rec = int(request_from)
    to_rec = int(request_to)
    num_recs = int(max_recs)
    if from_rec < 1:
        if to_rec < 1:  # both too small
            return [], None, None
        from_rec = 1
    if to_rec > num_recs:
        if from_rec > num_recs:  # both too large
            return [], None, None
        to_rec = num_recs
    response = self.br.open(self._disclaimer_url)
    scrapeutils.setup_form(self.br, self._disclaimer_form)
    #self.logger.debug("Disclaimer form: %s", str(self.br.form))
    response = scrapeutils.submit_form(self.br)
    #page_zero = self._get_results_pages('OpenForConsultation=True')
    #n_old = max_recs - len(page_zero)
    #print 'n_old', n_old
    #if current_result and from_rec > n_old:
    #    cfrom_rec = from_rec - n_old
    #    cto_rec = to_rec - n_old
    #    return current_result[cfrom_rec-1:cto_rec], from_rec, to_rec
    max_page, min_rec = self._find_max_pages(from_rec)
    #print 'mp', max_page, min_rec
    for d in self._districts:
        interim_result = self._get_results_pages('District=' + d, max_page)
        if interim_result:
            #print d, len(interim_result)
            final_result.extend(interim_result)
        else:
            #print 'Empty'
            return [], None, None  # list scraper - so an individual empty result is also invalid
    if final_result:
        #print 'x', len(final_result)
        fret = sorted(final_result, key=lambda k: (k['pageno'], k['recno'], k['uid']), reverse=True)
        #self.logger.debug("From: %d To: %d" % (from_rec, to_rec))
        new_fret = fret[from_rec-min_rec:to_rec-min_rec+1]
        for f in new_fret:
            del f['pageno']
            del f['recno']
        return new_fret, from_rec, to_rec
    else:
        return [], None, None  # list scraper - so an empty result is always invalid