def loop(self, sync, message, accountlist):
    """Drive the page-fetch/scrape loop for one bank session.

    Repeatedly fetches pages from the bank site with mechanize and hands
    each page to the ScraperController, following the controller's
    instructions (next URL, method, POST data) until it signals the end
    of the flow or reports a non-'good' status.

    Args:
        sync: truthy to synchronise transactions against ``accountlist``
            via ``synch_accounts``; falsy to enumerate accounts via
            ``get_accounts``.
        message: credential records; ``message[0]['bankId']`` selects the
            bank to scrape.
        accountlist: accounts to sync (only used when ``sync`` is truthy).

    Returns:
        The last controller response dict; its ``'message'`` key holds the
        final status.
    """
    # Get the bank id for this session and look up its start URL.
    bank_id = message[0]['bankId']
    all_scrapers = AllScrapers()
    self.startUrl = all_scrapers.getStartUrl(bank_id)

    self.b = mechanize.Browser(factory=mechanize.RobustFactory())
    # BUGFIX: set_handle_robots takes a boolean; was passing None.
    self.b.set_handle_robots(False)
    self.b.set_debug_redirects(True)
    # Log HTTP response bodies (ie. the HTML, most of the time).
    self.b.set_debug_responses(True)
    # Print HTTP headers.
    self.b.set_debug_http(True)
    # Don't handle Refresh redirections.
    self.b.set_handle_refresh(False)

    logging.info("Start URL = " + self.startUrl)

    mXacts = []
    controller = ScraperController()
    bank_url = self.startUrl
    next_step = 1
    method = 'GET'
    post = ''
    post_data = None
    do_loop = True
    request = {}

    while do_loop:
        # Open the bank page (post_data is None for GET requests).
        self.b.open(bank_url, post_data)
        # Get raw page data.
        raw = self.b.response().get_data()
        # Write out the page for any debugging.
        self.output_page(str(next_step) + "_page.html", raw.decode('utf-8'))

        # Build up a page scrape request to pass to the bank scraper.
        # 'body' starts as a placeholder so the debug log below does not
        # dump the full hex-encoded page; the real body is set after.
        request['body'] = 'tbd'
        request['status'] = 200
        request['bankurl'] = self.ByteToHex(self.startUrl)
        request['headers'] = []
        request['step'] = next_step
        request['credentials'] = message
        logging.debug(str(request))
        request['body'] = self.ByteToHex(raw)

        # Call the controller with this page.
        if sync:
            response = controller.synch_accounts(request, accountlist)
        else:
            response = controller.get_accounts(request)

        logging.debug('>>> -------------------------')
        logging.debug('>>> -------------------------------------------->')
        logging.debug('>>> -------------------------')

        # Decypher what went on in the parsing.
        status = response['message']
        if status == 'good':
            # All went well in the scraper: set up the next fetch from
            # the controller's instructions (URL/data are hex-encoded).
            next_request = response['request']
            logging.debug(next_request)
            method = next_request['method']
            bank_url = self.HexToByte(next_request['url'])
            next_step = next_request['step']
            post = self.HexToByte(next_request['data'])
            logging.debug("METHOD: " + method)
            logging.debug("URL: " + bank_url)
            logging.debug("STEP:" + str(next_step))
            logging.debug("DATA: " + post)
            post_data = None
            if method == 'POST':
                post_data = post

            if "accountlist" in next_request:
                aclist = next_request["accountlist"]
                accountid = ""
                accountpath = []
                if "accountid" in next_request:
                    accountid = next_request["accountid"]
                if "accountpath" in next_request:
                    accountpath = next_request["accountpath"]
                request["accountlist"] = aclist
                request["accountid"] = accountid
                request["accountpath"] = accountpath
                if "bankxact" in response:
                    mXacts.append(response["bankxact"])
                else:
                    # BUGFIX: logging.warn is deprecated; use warning().
                    logging.warning("No bankxact")
            elif sync:
                # Only expect accountlist on the transaction sync.
                logging.warning("no accounts")

        # Controller signalled end of flow, or the scrape failed.
        if method == 'END':
            do_loop = False
        if status != 'good':
            do_loop = False

    return response
def loop(self, sync, message, accountlist):
    """Drive the page-fetch/scrape loop for one bank session.

    NOTE(review): this method is defined twice in this file with
    near-identical bodies; this later definition shadows the earlier
    one — consider deleting one copy.

    Repeatedly fetches pages from the bank site with mechanize and hands
    each page to the ScraperController, following the controller's
    instructions (next URL, method, POST data) until it signals the end
    of the flow or reports a non-'good' status.

    Args:
        sync: truthy to synchronise transactions against ``accountlist``
            via ``synch_accounts``; falsy to enumerate accounts via
            ``get_accounts``.
        message: credential records; ``message[0]['bankId']`` selects the
            bank to scrape.
        accountlist: accounts to sync (only used when ``sync`` is truthy).

    Returns:
        The last controller response dict; its ``'message'`` key holds the
        final status.
    """
    # Get the bank id for this session and look up its start URL.
    bank_id = message[0]['bankId']
    all_scrapers = AllScrapers()
    self.startUrl = all_scrapers.getStartUrl(bank_id)

    self.b = mechanize.Browser(factory=mechanize.RobustFactory())
    # BUGFIX: set_handle_robots takes a boolean; was passing None.
    self.b.set_handle_robots(False)
    self.b.set_debug_redirects(True)
    # Log HTTP response bodies (ie. the HTML, most of the time).
    self.b.set_debug_responses(True)
    # Print HTTP headers.
    self.b.set_debug_http(True)
    # Don't handle Refresh redirections.
    self.b.set_handle_refresh(False)

    logging.info("Start URL = " + self.startUrl)

    mXacts = []
    controller = ScraperController()
    bank_url = self.startUrl
    next_step = 1
    method = 'GET'
    post = ''
    post_data = None
    do_loop = True
    request = {}

    while do_loop:
        # Open the bank page (post_data is None for GET requests).
        self.b.open(bank_url, post_data)
        # Get raw page data.
        raw = self.b.response().get_data()
        # Write out the page for any debugging.
        self.output_page(str(next_step) + "_page.html", raw.decode('utf-8'))

        # Build up a page scrape request to pass to the bank scraper.
        # 'body' starts as a placeholder so the debug log below does not
        # dump the full hex-encoded page; the real body is set after.
        request['body'] = 'tbd'
        request['status'] = 200
        request['bankurl'] = self.ByteToHex(self.startUrl)
        request['headers'] = []
        request['step'] = next_step
        request['credentials'] = message
        logging.debug(str(request))
        request['body'] = self.ByteToHex(raw)

        # Call the controller with this page.
        if sync:
            response = controller.synch_accounts(request, accountlist)
        else:
            response = controller.get_accounts(request)

        logging.debug('>>> -------------------------')
        logging.debug('>>> -------------------------------------------->')
        logging.debug('>>> -------------------------')

        # Decypher what went on in the parsing.
        status = response['message']
        if status == 'good':
            # All went well in the scraper: set up the next fetch from
            # the controller's instructions (URL/data are hex-encoded).
            next_request = response['request']
            logging.debug(next_request)
            method = next_request['method']
            bank_url = self.HexToByte(next_request['url'])
            next_step = next_request['step']
            post = self.HexToByte(next_request['data'])
            logging.debug("METHOD: " + method)
            logging.debug("URL: " + bank_url)
            logging.debug("STEP:" + str(next_step))
            logging.debug("DATA: " + post)
            post_data = None
            if method == 'POST':
                post_data = post

            if "accountlist" in next_request:
                aclist = next_request["accountlist"]
                accountid = ""
                accountpath = []
                if "accountid" in next_request:
                    accountid = next_request["accountid"]
                if "accountpath" in next_request:
                    accountpath = next_request["accountpath"]
                request["accountlist"] = aclist
                request["accountid"] = accountid
                request["accountpath"] = accountpath
                if "bankxact" in response:
                    mXacts.append(response["bankxact"])
                else:
                    # BUGFIX: logging.warn is deprecated; use warning().
                    logging.warning("No bankxact")
            elif sync:
                # Only expect accountlist on the transaction sync.
                logging.warning("no accounts")

        # Controller signalled end of flow, or the scrape failed.
        if method == 'END':
            do_loop = False
        if status != 'good':
            do_loop = False

    return response
def getScraper(self, bankId, credentials):
    """Return the scraper implementation for *bankId*, bound to *credentials*."""
    # TODO - check do we need to know about proxy_grab
    return AllScrapers().getScraper(bankId, credentials)