def loop(self, sync, message, accountlist):
    """Drive the page-fetch/scrape loop for one bank session.

    Repeatedly fetches pages from the bank site with mechanize and hands
    each page to the ScraperController, following the controller's
    instructions (next URL, method, POST data) until it signals the end
    of the flow or reports a non-'good' status.

    Args:
        sync: truthy to synchronise transactions against ``accountlist``
            via ``synch_accounts``; falsy to enumerate accounts via
            ``get_accounts``.
        message: credential records; ``message[0]['bankId']`` selects the
            bank to scrape.
        accountlist: accounts to sync (only used when ``sync`` is truthy).

    Returns:
        The last controller response dict; its ``'message'`` key holds the
        final status.
    """
    # Get the bank id for this session and look up its start URL.
    bank_id = message[0]['bankId']
    all_scrapers = AllScrapers()
    self.startUrl = all_scrapers.getStartUrl(bank_id)

    self.b = mechanize.Browser(factory=mechanize.RobustFactory())
    # BUGFIX: set_handle_robots takes a boolean; was passing None.
    self.b.set_handle_robots(False)
    self.b.set_debug_redirects(True)
    # Log HTTP response bodies (ie. the HTML, most of the time).
    self.b.set_debug_responses(True)
    # Print HTTP headers.
    self.b.set_debug_http(True)
    # Don't handle Refresh redirections.
    self.b.set_handle_refresh(False)

    logging.info("Start URL = " + self.startUrl)

    mXacts = []
    controller = ScraperController()
    bank_url = self.startUrl
    next_step = 1
    method = 'GET'
    post = ''
    post_data = None
    do_loop = True
    request = {}

    while do_loop:
        # Open the bank page (post_data is None for GET requests).
        self.b.open(bank_url, post_data)
        # Get raw page data.
        raw = self.b.response().get_data()
        # Write out the page for any debugging.
        self.output_page(str(next_step) + "_page.html", raw.decode('utf-8'))

        # Build up a page scrape request to pass to the bank scraper.
        # 'body' starts as a placeholder so the debug log below does not
        # dump the full hex-encoded page; the real body is set after.
        request['body'] = 'tbd'
        request['status'] = 200
        request['bankurl'] = self.ByteToHex(self.startUrl)
        request['headers'] = []
        request['step'] = next_step
        request['credentials'] = message
        logging.debug(str(request))
        request['body'] = self.ByteToHex(raw)

        # Call the controller with this page.
        if sync:
            response = controller.synch_accounts(request, accountlist)
        else:
            response = controller.get_accounts(request)

        logging.debug('>>> -------------------------')
        logging.debug('>>> -------------------------------------------->')
        logging.debug('>>> -------------------------')

        # Decypher what went on in the parsing.
        status = response['message']
        if status == 'good':
            # All went well in the scraper: set up the next fetch from
            # the controller's instructions (URL/data are hex-encoded).
            next_request = response['request']
            logging.debug(next_request)
            method = next_request['method']
            bank_url = self.HexToByte(next_request['url'])
            next_step = next_request['step']
            post = self.HexToByte(next_request['data'])
            logging.debug("METHOD: " + method)
            logging.debug("URL: " + bank_url)
            logging.debug("STEP:" + str(next_step))
            logging.debug("DATA: " + post)
            post_data = None
            if method == 'POST':
                post_data = post

            if "accountlist" in next_request:
                aclist = next_request["accountlist"]
                accountid = ""
                accountpath = []
                if "accountid" in next_request:
                    accountid = next_request["accountid"]
                if "accountpath" in next_request:
                    accountpath = next_request["accountpath"]
                request["accountlist"] = aclist
                request["accountid"] = accountid
                request["accountpath"] = accountpath
                if "bankxact" in response:
                    mXacts.append(response["bankxact"])
                else:
                    # BUGFIX: logging.warn is deprecated; use warning().
                    logging.warning("No bankxact")
            elif sync:
                # Only expect accountlist on the transaction sync.
                logging.warning("no accounts")

        # Controller signalled end of flow, or the scrape failed.
        if method == 'END':
            do_loop = False
        if status != 'good':
            do_loop = False

    return response
def loop(self, sync, message, accountlist):
    """Drive the page-fetch/scrape loop for one bank session.

    NOTE(review): this method is defined twice in this file with
    near-identical bodies; this later definition shadows the earlier
    one — consider deleting one copy.

    Repeatedly fetches pages from the bank site with mechanize and hands
    each page to the ScraperController, following the controller's
    instructions (next URL, method, POST data) until it signals the end
    of the flow or reports a non-'good' status.

    Args:
        sync: truthy to synchronise transactions against ``accountlist``
            via ``synch_accounts``; falsy to enumerate accounts via
            ``get_accounts``.
        message: credential records; ``message[0]['bankId']`` selects the
            bank to scrape.
        accountlist: accounts to sync (only used when ``sync`` is truthy).

    Returns:
        The last controller response dict; its ``'message'`` key holds the
        final status.
    """
    # Get the bank id for this session and look up its start URL.
    bank_id = message[0]['bankId']
    all_scrapers = AllScrapers()
    self.startUrl = all_scrapers.getStartUrl(bank_id)

    self.b = mechanize.Browser(factory=mechanize.RobustFactory())
    # BUGFIX: set_handle_robots takes a boolean; was passing None.
    self.b.set_handle_robots(False)
    self.b.set_debug_redirects(True)
    # Log HTTP response bodies (ie. the HTML, most of the time).
    self.b.set_debug_responses(True)
    # Print HTTP headers.
    self.b.set_debug_http(True)
    # Don't handle Refresh redirections.
    self.b.set_handle_refresh(False)

    logging.info("Start URL = " + self.startUrl)

    mXacts = []
    controller = ScraperController()
    bank_url = self.startUrl
    next_step = 1
    method = 'GET'
    post = ''
    post_data = None
    do_loop = True
    request = {}

    while do_loop:
        # Open the bank page (post_data is None for GET requests).
        self.b.open(bank_url, post_data)
        # Get raw page data.
        raw = self.b.response().get_data()
        # Write out the page for any debugging.
        self.output_page(str(next_step) + "_page.html", raw.decode('utf-8'))

        # Build up a page scrape request to pass to the bank scraper.
        # 'body' starts as a placeholder so the debug log below does not
        # dump the full hex-encoded page; the real body is set after.
        request['body'] = 'tbd'
        request['status'] = 200
        request['bankurl'] = self.ByteToHex(self.startUrl)
        request['headers'] = []
        request['step'] = next_step
        request['credentials'] = message
        logging.debug(str(request))
        request['body'] = self.ByteToHex(raw)

        # Call the controller with this page.
        if sync:
            response = controller.synch_accounts(request, accountlist)
        else:
            response = controller.get_accounts(request)

        logging.debug('>>> -------------------------')
        logging.debug('>>> -------------------------------------------->')
        logging.debug('>>> -------------------------')

        # Decypher what went on in the parsing.
        status = response['message']
        if status == 'good':
            # All went well in the scraper: set up the next fetch from
            # the controller's instructions (URL/data are hex-encoded).
            next_request = response['request']
            logging.debug(next_request)
            method = next_request['method']
            bank_url = self.HexToByte(next_request['url'])
            next_step = next_request['step']
            post = self.HexToByte(next_request['data'])
            logging.debug("METHOD: " + method)
            logging.debug("URL: " + bank_url)
            logging.debug("STEP:" + str(next_step))
            logging.debug("DATA: " + post)
            post_data = None
            if method == 'POST':
                post_data = post

            if "accountlist" in next_request:
                aclist = next_request["accountlist"]
                accountid = ""
                accountpath = []
                if "accountid" in next_request:
                    accountid = next_request["accountid"]
                if "accountpath" in next_request:
                    accountpath = next_request["accountpath"]
                request["accountlist"] = aclist
                request["accountid"] = accountid
                request["accountpath"] = accountpath
                if "bankxact" in response:
                    mXacts.append(response["bankxact"])
                else:
                    # BUGFIX: logging.warn is deprecated; use warning().
                    logging.warning("No bankxact")
            elif sync:
                # Only expect accountlist on the transaction sync.
                logging.warning("no accounts")

        # Controller signalled end of flow, or the scrape failed.
        if method == 'END':
            do_loop = False
        if status != 'good':
            do_loop = False

    return response
def getScraper(self, bankId, credentials):
    """Return the scraper implementation for *bankId*, bound to *credentials*."""
    # TODO - check do we need to know about proxy_grab
    return AllScrapers().getScraper(bankId, credentials)