Example #1
    def doAllLink(self, page):

        soup = BeautifulSoup(page)

        self.output_page("natwest-xactlist-all-look.html", page)

        #<a href="/StatementsFixedPeriod.aspx?id=B7879D8CABBF283B38AE447E07A4EA5D8DA9A859&amp;persist=%2fwEPBQ1BY2NvdW50TnVtYmVyBQg4ODU3MjIxOA%3d%3d%7c%2fwEPBQhGcm9tRGF0ZQUTMTgvMDUvMjAxMSAwMDowMDowMA%3d%3d%7c%2fwEPBQhTb3J0Q29kZQUGNjAxNzIx%7c%2fwEPBQZUb0RhdGUFEzE4LzA3LzIwMTEgMDA6MDA6MDA%3d%7c%2fwEWBh4JU1MyQUNDRERBDwUUU0lOSEEgTS9TVFUgODg1NzIyMTgFKEI3ODc5RDhDQUJCRjI4M0IzOEFFNDQ3RTA3QTRFQTVEOERBOUE4NTkeCFNTMlNQRERBDxBkZBYBAgNnHgZTUzJXTEEPAgFo&amp;showall=1" title="Show all items on a single page">All</a>

        logging.debug('NatWest checking for all links')

        # find any all link
        links = soup.findAll('a')

        link = None
        for a in links:
            # detect our link
            try:
                if re.search(".tatements.ixed.eriod", a['href']):
                    logging.debug("natwest - got a statement  link")
                    if re.search(".ll", a.text):  # the one that says all
                        link = self.composeLink(a['href'][:])
                        logging.debug("natwest - got an All statement link")
                        break  # only need the first one so break the for loop
            except Exception, e:
                logging.debug('NatWest a link error missing href - ' + str(e))
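The dotted patterns above (".tatements.ixed.eriod", ".ll") appear to be a trick for matching the href and the link text regardless of letter case. A minimal standalone sketch of the same lookup with an explicit re.IGNORECASE flag (assuming bs4 / BeautifulSoup 4 and a made-up one-line page; composeLink and output_page belong to the scraper class and are not shown):

import re
from bs4 import BeautifulSoup

html = '<a href="/StatementsFixedPeriod.aspx?showall=1" title="Show all items on a single page">All</a>'
soup = BeautifulSoup(html, 'html.parser')

link = None
for a in soup.find_all('a', href=True):  # href=True skips anchors without an href
    is_statement = re.search('statementsfixedperiod', a['href'], re.IGNORECASE)
    says_all = re.search(r'\ball\b', a.get_text(), re.IGNORECASE)
    if is_statement and says_all:
        link = a['href']
        break  # first match is enough, as in the method above

print(link)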
Example #2
 def __init__( self ):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
     showIndex = BeautifulSoup(source)
     vidInfo = {'tvshowtitle': showIndex.find('div',id='showDashboard').find('span',{'class':'blueText'}).string, 'studio': 'FOX'}
     seasonsListing = showIndex.findAll('div',{'class':re.compile('dashPageHolder'),'id':re.compile('^fullEp')})
     print len(seasonsListing)
     for season in seasonsListing:
         episodesListing = season.findAll('div',{'class':'episodeListing'})
         for episode in episodesListing:
             listitem = xbmcgui.ListItem(episode.find('h3').find('a').string)
             listitem.setThumbnailImage(episode.find('img',id=re.compile('^epThumb'))['src'])
             episodeLink = episode.find('a',{'class':'thumbnailLink'})
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL,'')
             airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'})))
             seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'})))
             episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'})))
             vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2))
             vidInfo['season'] = int(seasonNum.group(1))
             vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             vidInfo['duration'] = episodeNumAndDuration.group(2)
             vidInfo['title'] = episode.find('h3').find('a').string
             vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
             print vidInfo
             listitem.setInfo("video",vidInfo)
             xbmcplugin.addDirectoryItem(handle=int( sys.argv[ 1 ] ),listitem=listitem,url="%s?episodeUrl=%s" % ( sys.argv[ 0 ], quote_plus(episodeUrl)))
     xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ), succeeded=1 )
Example #3
    def _parseComment(self, communityId, liveInfoFilePath, commentFilePath):
        chatList = []
        if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)):
            return chatList

        infoParser = BeautifulSoup(open(liveInfoFilePath, u'r'))
        if not infoParser.find(u'communityid').renderContents() == communityId:
            return chatList

        commentParser = BeautifulSoup(open(commentFilePath, u'r'))
        chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'})
        for chatTag in chatTagList:
            communityId = communityId.decode(u'utf-8')
            liveId = infoParser.find(u'liveid').renderContents().decode()
            userId = chatTag.get(u'user').decode(u'utf-8')
            name = chatTag.get(u'nickname').decode(u'utf-8')
            message = chatTag.renderContents().decode(u'utf-8')
            option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') else None
            date = re.sub(
                ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
                lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(int(match.group(1)), int(match.group(2)), int(match.group(3)), int(match.group(4)), int(match.group(5)), int(match.group(6))),
                chatTag.get(u'date')
            ).decode(u'utf-8')
            chatList.append((communityId, liveId, userId, name, message, option, date))

        return chatList
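The date handling above is the interesting piece: re.sub with a callable turns the log's "YYYY/M/D h:m:s" timestamps into zero-padded "YYYY-MM-DD hh:mm:ss". A self-contained sketch of just that rewrite, using a made-up sample string:

import re

raw_date = '2014/7/9 5:03:07'
normalised = re.sub(
    r'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
    lambda m: '{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(*map(int, m.groups())),
    raw_date)
print(normalised)  # 2014-07-09 05:03:07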
Example #4
    def doAllLink(self, page):
       
        soup = BeautifulSoup(page)
        
        self.output_page("RBS-xactlist-all-look.html", page)

        #<a href="/StatementsFixedPeriod.aspx?id=B7879D8CABBF283B38AE447E07A4EA5D8DA9A859&amp;persist=%2fwEPBQ1BY2NvdW50TnVtYmVyBQg4ODU3MjIxOA%3d%3d%7c%2fwEPBQhGcm9tRGF0ZQUTMTgvMDUvMjAxMSAwMDowMDowMA%3d%3d%7c%2fwEPBQhTb3J0Q29kZQUGNjAxNzIx%7c%2fwEPBQZUb0RhdGUFEzE4LzA3LzIwMTEgMDA6MDA6MDA%3d%7c%2fwEWBh4JU1MyQUNDRERBDwUUU0lOSEEgTS9TVFUgODg1NzIyMTgFKEI3ODc5RDhDQUJCRjI4M0IzOEFFNDQ3RTA3QTRFQTVEOERBOUE4NTkeCFNTMlNQRERBDxBkZBYBAgNnHgZTUzJXTEEPAgFo&amp;showall=1" title="Show all items on a single page">All</a>

        logging.debug('RBS checking for all links')

        # find any all link
        links=soup.findAll('a')

        link = None
        for a in links:
            # detect our link
            try:
                if re.search(".tatements.ixed.eriod", a['href']):
                    logging.debug("RBS - got a statement  link")
                    if re.search(".ll", a.text):                        # the one that says all
                        link = self.composeLink(a['href'][:])
                        logging.debug("RBS - got an All statement link")
                        break                                                   # only need the first one so break the for loop
            except Exception, e:
                logging.debug('RBS a link error missing href - ' + str(e))
Example #5
 def __init__(self):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['showUrl']))
     showIndex = BeautifulSoup(source)
     vidInfo = {
         'tvshowtitle':
         showIndex.find('div',
                        id='showDashboard').find('span', {
                            'class': 'blueText'
                        }).string,
         'studio':
         'FOX'
     }
     seasonsListing = showIndex.findAll(
         'div', {
             'class': re.compile('dashPageHolder'),
             'id': re.compile('^fullEp')
         })
     print len(seasonsListing)
     for season in seasonsListing:
         episodesListing = season.findAll('div',
                                          {'class': 'episodeListing'})
         for episode in episodesListing:
             listitem = xbmcgui.ListItem(
                 episode.find('h3').find('a').string)
             listitem.setThumbnailImage(
                 episode.find('img', id=re.compile('^epThumb'))['src'])
             episodeLink = episode.find('a', {'class': 'thumbnailLink'})
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
             airedDateAndPlot = re.search(
                 'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',
                 str(episode.find('div', {'class': 'episodeInfo'})))
             seasonNum = re.search(
                 'Season\s+([0-9]+?)[\s:]',
                 str(episode.find('p', {'class': 'seasonNum'})))
             episodeNumAndDuration = re.search(
                 'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',
                 str(episode.find('p', {'class': 'episodeNumLine'})))
             vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),
                                              airedDateAndPlot.group(1),
                                              airedDateAndPlot.group(2))
             vidInfo['season'] = int(seasonNum.group(1))
             vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             vidInfo['duration'] = episodeNumAndDuration.group(2)
             vidInfo['title'] = episode.find('h3').find('a').string
             vidInfo['plot'] = decode_htmlentities(
                 airedDateAndPlot.group(4))
             print vidInfo
             listitem.setInfo("video", vidInfo)
             xbmcplugin.addDirectoryItem(
                 handle=int(sys.argv[1]),
                 listitem=listitem,
                 url="%s?episodeUrl=%s" %
                 (sys.argv[0], quote_plus(episodeUrl)))
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #6
    def rewrite_html(self, guid, html=None, ajax_url=None):
        """if we are not using ajax, then html is IGNORED and we go by the
		cached copy.  html is sometimes used to see if there should be a
		cached copy at all, or if something goes wrong and we just need to
		return unaltered html
		"""

        guid = str(guid)
        cache_dir = os.path.join(self._store_location, guid_hash(guid))
        mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

        if not os.path.isfile(mapping_file):
            # quick and dirty check.  are there images?  if not, plain
            # html is fine
            if html.lower().find('<img') >= 0:
                #logging.warning("Should be downloaded images, but couldn't open mapping.  Recaching")
                self.cache_html(guid, html)
            return html

        try:
            mapping = open(mapping_file, 'r')
            rewrite_hash = pickle.load(mapping)
            non_ajax_html = pickle.load(mapping)
            mapping.close()
        except:
            logging.error("error opening cache pickle for guid %s %s" %
                          (guid, mapping_file))
            logging.error(
                "If you have upgraded penguintv, you might need to delete your image cache"
            )
            return html

        if ajax_url is None:
            return non_ajax_html

        #else, rewrite on the fly
        soup = BeautifulSoup(html)
        img_tags = soup.findAll('img')

        if len(img_tags) == 0:
            return html

        for result in img_tags:
            # believe it or not, some img tags don't have a src, they have an id
            # that points to CSS.  At least I think that's what's going on
            if result.has_key('src'):
                if rewrite_hash.has_key(result['src']):
                    if rewrite_hash[result['src']][1] == UrlCacher.DOWNLOADED:
                        #if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])):
                        result['src'] = ajax_url + "/cache/" + rewrite_hash[
                            result['src']][0]
                        #else:
                        #	logging.warning("file not found, not replacing")
                        #	logging.debug("(should we attempt to recache here?")

        return soup.prettify()
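The "rewrite on the fly" branch boils down to swapping each downloaded image's src for a cache URL before prettifying. A cut-down sketch of just that substitution (assuming bs4; rewrite_hash maps the original src to a (local_name, status) pair as above, and DOWNLOADED is only a placeholder for the real UrlCacher.DOWNLOADED constant):

from bs4 import BeautifulSoup

DOWNLOADED = 2  # stand-in value; the real constant lives in UrlCacher
ajax_url = 'http://127.0.0.1:8000'
rewrite_hash = {'http://example.com/a.png': ('cached-a.png', DOWNLOADED)}

soup = BeautifulSoup('<p><img src="http://example.com/a.png"></p>', 'html.parser')
for img in soup.find_all('img', src=True):  # skip img tags that have no src at all
    entry = rewrite_hash.get(img['src'])
    if entry and entry[1] == DOWNLOADED:
        img['src'] = ajax_url + '/cache/' + entry[0]

print(soup.prettify())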
Example #7
    def _parseNatWestLinks(self, raw):
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})

        # got some account details now so all good
        if len(accountBLock) == 0:
            logging.warning('NatWest no accounts after continue form')
            return 'account problem'

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # now the accnum list - to get the pair data, cos cant get it from link
            row = ac_link.parent.parent
            try:
                # find the account number span
                acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
                acnum = acnumSpan.text
                acnum = acnum.replace(' ', '')

                # find the sort code span
                sortSpan = row.find('span', attrs={'class': 'SortCode'})
                sortc = sortSpan.text
                sortc = sortc.replace(' ', '')
                sortc = sortc.replace('-', '')
            except Exception, e:
                logging.exception('NatWest form error - ' + str(e))
                return 'bank error'

            #combine the two - to be our matching number
            num = sortc + "-" + acnum

            actype = 'Cheque'
            # might be a credit card
            if len(acnum) > 14:
                actype = 'Credit'

            # now get balances...
            balance = 0
            baltr = ac_link.parent.parent
            baltds = baltr.findAll('td')
            if len(baltds) > 2:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {
                'name': ac_link.text,
                'num': num,
                'type': actype,
                'bal': balance
            }

            self.myAccounts.append(acpair)
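The ac_link.parent.parent hop works because each account link sits in a cell of the account row, so two steps up is the <tr> that also holds the account-number and sort-code spans. A small sketch of that navigation against made-up row markup (assuming bs4):

from bs4 import BeautifulSoup

html = '''
<table><tr>
  <td><a class="accountNameExpand">Current Account</a></td>
  <td><span class="SortCode">60-17-21</span> <span class="AccountNumber">1234 5678</span></td>
</tr></table>
'''
soup = BeautifulSoup(html, 'html.parser')
link = soup.find('a', attrs={'class': 'accountNameExpand'})
row = link.parent.parent  # <a> -> <td> -> <tr>
acnum = row.find('span', attrs={'class': 'AccountNumber'}).text.replace(' ', '')
sortc = row.find('span', attrs={'class': 'SortCode'}).text.replace(' ', '').replace('-', '')
print(sortc + '-' + acnum)  # 601721-12345678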
Example #8
    def doStep3(self, allofit, page):

        scrape_result = "good"

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-security.html", page)

        scrape_result = 'good'
        logging.info("NatWest security page2")

        # check if we got returned
        # check for the password input ctl00$mainContent$LI5TABA$DBID_edit - if it is still present we didn't move on
        errorDiv = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'})

        if len(errorDiv) != 0:
            logging.info("NatWest security page1 still - customer number bad")
            return 'credentials incorrect'  # if we get here then the form was found hence creds must be wrong

        # find our form
        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no security form')
            return 'bank error'

        values = self.parseForm(loginform)

        # define some variables that would only otherwise exist in a try catch block scope
        # the label text split on spaces
        which1arr = ""
        which2arr = ""
        which3arr = ""

        # the challenges
        firstDigit = ""
        secondDigit = ""
        thirdDigit = ""

        #>>>>>>> The first set of Pin fields
        #-------------------- get the questions --------------#

        #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label>
        useNewTab = False
        try:
            which1 = soup.find('label',
                               attrs={
                                   'for': 'ctl00_mainContent_LI6PPEA_edit'
                               }).text
        except Exception, e:
            useNewTab = True
Example #9
	def rewrite_html(self, guid, html=None, ajax_url=None):
		"""if we are not using ajax, then html is IGNORED and we go by the
		cached copy.  html is sometimes used to see if there should be a
		cached copy at all, or if something goes wrong and we just need to
		return unaltered html
		"""

		guid = str(guid)
		cache_dir = os.path.join(self._store_location, guid_hash(guid))
		mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

		if not os.path.isfile(mapping_file):
			# quick and dirty check.  are there images?  if not, plain
			# html is fine
			if html.lower().find('<img') >= 0:
				#logging.warning("Should be downloaded images, but couldn't open mapping.  Recaching")
				self.cache_html(guid, html)
			return html

		try:
			mapping = open(mapping_file, 'r')
			rewrite_hash = pickle.load(mapping)
			non_ajax_html = pickle.load(mapping)
			mapping.close()
		except:
			logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file))
			logging.error("If you have upgraded penguintv, you might need to delete your image cache")
			return html

		if ajax_url is None:
			return non_ajax_html

		#else, rewrite on the fly
		soup = BeautifulSoup(html)
		img_tags = soup.findAll('img')

		if len(img_tags) == 0:
			return html

		for result in img_tags:
			# believe it or not, some img tags don't have a src, they have an id
			# that points to CSS.  At least I think that's what's going on
			if result.has_key('src'):
				if rewrite_hash.has_key(result['src']):
					if rewrite_hash[result['src']][1] == UrlCacher.DOWNLOADED:
						#if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])):
						result['src'] = ajax_url + "/cache/" + rewrite_hash[result['src']][0]
						#else:
						#	logging.warning("file not found, not replacing")
						#	logging.debug("(should we attempt to recache here?")

		return soup.prettify()
Example #10
    def _parseRBSLinks(self, raw):
        soup = BeautifulSoup(raw)
        accountBLock=soup.findAll('a', attrs={'class' : 'accountNameExpand'})

        # got some account details now so all good
        if len(accountBLock) == 0:
            logging.warning('RBS no accounts after continue form')
            return 'account problem'

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)


            # now the accnum list - to get the pair data, cos cant get it from link
            row = ac_link.parent.parent
            try:
                # find the account number span
                acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
                acnum = acnumSpan.text
                acnum = acnum.replace(' ', '')

                # find the sort code span
                sortSpan = row.find('span', attrs={'class': 'SortCode'})
                sortc = sortSpan.text
                sortc = sortc.replace(' ', '')
                sortc = sortc.replace('-', '')
            except Exception, e:
                logging.exception('RBS form error - ' + str(e))
                return 'bank error'

            #combine the two - to be our matching number
            num = sortc + "-" + acnum

            actype =  'Cheque'
            # might be a credit card
            if len(acnum) > 14:
                actype =  'Credit'

            # now get balances...
            balance = 0
            baltr = ac_link.parent.parent
            baltds = baltr.findAll('td')
            if len(baltds) > 2:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {'name': ac_link.text, 'num': num, 'type': actype, 'bal': balance}

            self.myAccounts.append(acpair)
Example #11
    def doStep3(self, allofit, page):
        
        scrape_result = "good"
        
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("RBS-security.html", page)
        
        scrape_result = 'good'
        logging.info("RBS security page2")

        # check if we got returned
        # check for the password input ctl00$mainContent$LI5TABA$DBID_edit - if it is still present we didn't move on
        errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI5TABA$DBID_edit'})

        if len(errorDiv) != 0:
            logging.info("RBS security page1 still - customer number bad")
            return  'credentials incorrect'   # if we get here then the form was found hence creds must be wrong
        
        
        # find our form
        loginform=soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('RBS no security form')
            return 'bank error'

        values = self.parseForm(loginform)

        # define some variables that would only otherwise exist in a try catch block scope
        # the label text split on spaces
        which1arr = ""
        which2arr = ""
        which3arr = ""

        # the challenges
        firstDigit  = ""
        secondDigit = ""
        thirdDigit  = ""

        #>>>>>>> The first set of Pin fields
        #-------------------- get the questions --------------#

        #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label>
        useNewTab = False
        try:
            which1=soup.find('label', attrs={'for' : 'ctl00_mainContent_LI6PPEA_edit'}).text
        except Exception, e:
            useNewTab = True
Example #12
 def get(self):
     bookname  = self.request.get('search')
     req = isbndbpy.Request('books', 'combined', str(bookname))
     resp = req.send().read()
     #print resp
     soup = BS(str(resp))
     books = soup.findAll('bookdata')
     self.response.out.write('<html><body>')
     for bookdata in books:
         self.response.out.write('<br/>Title: '+ str(bookdata.find('title').string ))
         self.response.out.write('<br/>ISBN: ' +str(bookdata.get('isbn13')))
         self.response.out.write('<br/>AUTHOR :'+str( bookdata.find('authorstext').string))
         self.response.out.write('<br/>PUBLISHER: '+str(bookdata.find('publishertext').string))
         self.response.out.write('<br/> "***********')
Example #13
    def _parseNatWestLinks(self, raw):
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll("a", attrs={"class": "accountNameExpand"})

        # got some account details now so all good
        if len(accountBLock) == 0:
            logging.warning("NatWest no accounts after continue form")
            return "account problem"

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # now the accnum list - to get the pair data, cos cant get it from link
            row = ac_link.parent.parent
            try:
                # find the account number span
                acnumSpan = row.find("span", attrs={"class": "AccountNumber"})
                acnum = acnumSpan.text
                acnum = acnum.replace(" ", "")

                # find the sort code span
                sortSpan = row.find("span", attrs={"class": "SortCode"})
                sortc = sortSpan.text
                sortc = sortc.replace(" ", "")
                sortc = sortc.replace("-", "")
            except Exception, e:
                logging.exception("NatWest form error - " + str(e))
                return "bank error"

            # combine the two - to be our matching number
            num = sortc + "-" + acnum

            actype = "Cheque"
            # might be a credit card
            if len(acnum) > 14:
                actype = "Credit"

            # now get balances...
            balance = 0
            baltr = ac_link.parent.parent
            baltds = baltr.findAll("td")
            if len(baltds) > 2:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {"name": ac_link.text, "num": num, "type": actype, "bal": balance}

            self.myAccounts.append(acpair)
Example #14
    def doStep12(self, page):
       
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)
        
        self.output_page("RBS-xactlist-cc-poss.html", page)
    
        rightButtons=soup.findAll('a', attrs={'class' : 'link-button-right'})

        # any buttons?
        if len(rightButtons) == 0:
            logging.error('RBS no cc accountbuttons')
            return 'bank error'

        # RBS is not dynamic -so this static list is fine (unlike Smile)
        acLink = None
        for a in rightButtons:
            # filter out the account detail buttons matching just the statement buttons
            # Bloody hope this regex finds shit in the right order
            if re.search(".ard.tatement.etail", a['href']):
                acLink = a['href'][:]

        if acLink == None:
            logging.debug('RBS no cc detail link')
            return 'bank error'

        # action = self.urlBase + '/' + loginform['action']

        action = acLink
        try:
            logging.debug("checking link - " + acLink)
            urls = urlparse(acLink)

            # if it parses properly good else
        except Exception, e:
            logging.error('RBS cc link error - ' + str(e))
            action = self.urlBase + '/' + acLink
Example #15
    def doStep12(self, page):

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-xactlist-cc-poss.html", page)

        rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})

        # any buttons?
        if len(rightButtons) == 0:
            logging.error('NatWest no cc accountbuttons')
            return 'bank error'

        # natWest is not dynamic -so this static list is fine (unlike Smile)
        acLink = None
        for a in rightButtons:
            # filter out the account detail buttons matching just the statement buttons
            # Bloody hope this regex finds shit in the right order
            if re.search(".ard.tatement.etail", a['href']):
                acLink = a['href'][:]

        if acLink == None:
            logging.debug('NatWest no cc detail link')
            return 'bank error'

        # action = self.urlBase + '/' + loginform['action']

        action = acLink
        try:
            logging.debug("checking link - " + acLink)
            urls = urlparse(acLink)

            # if it parses properly good else

        except Exception, e:
            logging.error('NatWest cc link error - ' + str(e))
            action = self.urlBase + '/' + acLink
Example #16
    def doStep4(self, allofit, page):
        
        scrape_result = "good"
        
        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)
        
        # write out the start page
        self.output_page("RBS-pos-accounts.html", page)
        
        scrape_result = 'good'
        logging.info("RBS message or bad cred check ")

        # if we still have the input then def bad credentials 
        errorDiv=soup.findAll('input', attrs={'name' : 'ctl00$mainContent$LI6PPEA_edit'})

        if len(errorDiv) != 0:
            logging.info("RBS defiantely bad credentials")
            return 'credentials incorrect' 


        accountBLock=soup.findAll('table', attrs={'class' : 'AccountTable'})
        # got some acount details so all good
        if len(accountBLock) > 0:
            logging.debug("RBS defiantely got some good accounts")
            return 'good';

        # find any link

        # if we find a link return it 

        # check for the normal continue button and fail all else - with credentials failure
        continueButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_FinishButton_button'})

        if(continueButton == None):
            logging.warning("RBS cant find finish button credentials incorrect")

            nextButton = soup.find('input', attrs={'id' : 'ctl00_mainContent_NextButton_button'})

            if(nextButton == None):
                logging.warning("RBS cant find next button either")
                return  'credentials incorrect'


        # now find the form that these buttons belong to
        loginform=soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('RBS no continue form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']
    
        # any hidden values etc        
        values = self.parseForm(loginform)
        

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 4
        
        return 'messages'
Example #17
    def doStep4(self, allofit, page):

        scrape_result = "good"

        # -------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-pos-accounts.html", page)

        scrape_result = "good"
        logging.info("NatWest message or bad cred check ")

        # if we still have the input then def bad credentials
        errorDiv = soup.findAll("input", attrs={"name": "ctl00$mainContent$LI6PPEA_edit"})

        if len(errorDiv) != 0:
            logging.info("NatWest defiantely bad credentials")
            return "credentials incorrect"

        accountBLock = soup.findAll("table", attrs={"class": "AccountTable"})
        # got some acount details so all good
        if len(accountBLock) > 0:
            logging.debug("NatWest defiantely got some good accounts")
            return "good"

        # find any link

        # if we find a link return it

        # check for the normal continue button and fail all else - with credentials failure
        continueButton = soup.find("input", attrs={"id": "ctl00_mainContent_FinishButton_button"})

        if continueButton == None:
            logging.warning("NatWest cant find finish button credentials incorrect")

            nextButton = soup.find("input", attrs={"id": "ctl00_mainContent_NextButton_button"})

            if nextButton == None:
                logging.warning("NatWest cant find next button either")
                return "credentials incorrect"

        # now find the form that these buttons belong to
        loginform = soup.find("form", attrs={"name": "aspnetForm"})

        if loginform == None:
            logging.debug("NatWest no continue form")
            return "bank error"
        else:
            logging.debug("found a continue form - so clicking it")
        action = self.urlBase + "/" + loginform["action"]

        # any hidden values etc
        values = self.parseForm(loginform)

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response["url"] = self.ByteToHex(action)
        self.response["data"] = self.ByteToHex(data)
        self.response["method"] = "POST"
        self.response["step"] = 4

        return "messages"
Example #18
    def doStep4(self, allofit, page):

        scrape_result = "good"

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-pos-accounts.html", page)

        scrape_result = 'good'
        logging.info("NatWest message or bad cred check ")

        # if we still have the input then def bad credentials
        errorDiv = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'})

        if len(errorDiv) != 0:
            logging.info("NatWest defiantely bad credentials")
            return 'credentials incorrect'

        accountBLock = soup.findAll('table', attrs={'class': 'AccountTable'})
        # got some acount details so all good
        if len(accountBLock) > 0:
            logging.debug("NatWest defiantely got some good accounts")
            return 'good'

        # find any link

        # if we find a link return it

        # check for the normal continue button and fail all else - with credentials failure
        continueButton = soup.find(
            'input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})

        if (continueButton == None):
            logging.warning(
                "NatWest cant find finish button credentials incorrect")

            nextButton = soup.find(
                'input', attrs={'id': 'ctl00_mainContent_NextButton_button'})

            if (nextButton == None):
                logging.warning("NatWest cant find next button either")
                return 'credentials incorrect'

        # now find the form that these buttons belong to
        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no continue form')
            return 'bank error'
        else:
            logging.debug('found a continue form - so clicking it')
        action = self.urlBase + '/' + loginform['action']

        # any hidden values etc
        values = self.parseForm(loginform)

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 4

        return 'messages'
Example #19
    def post(self):
        errors = []
        context = defaultdict(list)
        username = self.request.get("username")

        if username != "":
            bsusername = BeautifulSoup(username)
            username = ''.join(bsusername.findAll(text=True)).rstrip().strip()

            models.add_visit(username)
            github = Github(requests_per_second=1)

            try:
                logging.info("Grabbing repositories for user: %s" % username)
                repos = github.repos.list(username)
                user = models.User.gql('WHERE username = :1', username).get()

                projects_in_store = user.projects
                project = None
                project_detail_list = []
                if len(repos) > 0:
                    for repo in repos[:5]:
                        # don't hammer datastore if project list is empty
                        if len(projects_in_store) > 0:
                            logging.info("Grabbing project details from datastore key: %s" % (username + "_" + repo.name))
                            project = obj_to_dictionary(models.Project.get_by_key_name(username + "_" + repo.name))

                        # grab from github
                        if not project:
                            logging.info("Grabbing project details for: %s" % repo.name)
                            project = grab_project_details(repo)

                            context['projects'].append(dict_to_tuple(project))

                            project_detail_list.append(project)

                        # grab from datastore
                        else:
                            if project['graph_url'] == "":
                                logging.info("Grabbing graph url for project: %s" % repo.name)
                                project = grab_project_details(repo)
                                project_detail_list.append(project)
                            context['projects'].append(dict_to_tuple(project))
                        project = None
                    update_models(project_detail_list, user)
                else:
                    logging.error("User %s has no repositories." % username)
                    errors.append('User %s has no repositories.' % username)
                    context = {'errors': errors}

            except RuntimeError:
                logging.error("RuntimeError when grabbing repositories for username: %s" % username)
                errors.append("An error occured - are you sure you spelled the username correctly?")
                context = {'errors': errors}
            except ApplicationError:
                logging.error("Timeout when grabbing repositories, try again")
                errors.append("An error occured - are you sure you spelled the username correctly?")
                context = {'errors': errors}
            except DeadlineExceededError:
                self.response.clear()
                self.response.set_status(500)
                errors.append("App timed out probably because user has too many projects")
                context = {'errors': errors}
                logging.critical("Application timed out while querying for user: %s" % username)
        else:
            errors.append("Enter username")
            context = {'errors': errors}

        self.response.out.write(template.render('templates/index.html', context))
Example #20
    def processAccount(self, acCount, acName, account_path, allofit):

        page = self.HexToByte(allofit['body'])

        # save this page
        self.output_page("account" + str(acCount) + ".html", page)

        soup = BeautifulSoup(page)

        logging.debug('ac path - ' + str(account_path) + ' - end')

        if account_path != "":
            # delete existing current xactions

            logging.debug('Processing :) ')

            self.statementbuilder = StatementBuilder(self.facade, account_path,
                                                     self.token)

            # need to get last statement and make a new one every time
            self.statementbuilder.make_recent_dif_statement(
                'Fd-recent', 'Scraper', None)  #TODO change this

            isVisa = False
            loginform = soup.find(
                'input', attrs={'name': 'cmd_sort_referenceAscending'})
            if loginform != None:
                isVisa = True

                bal_tables = soup.findAll(
                    'table', attrs={'class': 'fdTableBackgroundOne'})
                balance_table = bal_tables[2]

                if balance_table != None:
                    vals = balance_table.findAll('td')

                    if vals:
                        bal = vals[1].text
                        data = bal.replace('&#163;', u'£')
                        data = data.strip(u'£')
                        if data[-1] == 'D':
                            data = data.replace('DB', '')
                            data = data.replace('D', '')
                            lastbal = int(float(data) * 100)
                            firstbal = 0 - lastbal
                        else:
                            data = data.replace('CR', '')
                            data = data.replace('C', '')
                            firstbal = int(float(data) * 100)

                        self.statementbuilder.set_current_balance(firstbal)

            logging.debug(
                "-----------------------------*******---------------------")
            if isVisa:
                logging.debug("found visa --")

            acTable = soup.find('table', attrs={'class': 'fdStatTable'})

            # if no table then no new data afaik
            if acTable != None:
                datarows = acTable.findAll('tr')

                next = False

                # build the post values up
                atts = {}

                isFirst = True
                firstbal = 0
                firstdate = ""

                lastbal = 0
                lastdate = ""

                doBalance = False

                dp = DateParser()

                for rows in datarows:
                    vals = rows.findAll('td')

                    if vals:
                        for i, val in enumerate(vals):

                            if val.text:
                                data = val.text.strip()
                                data = unescape(data)
                                data = unicode(data)

                            else:
                                data = ""

                            if data != "&nbsp;":
                                data = data.replace('&nbsp;', '')
                                if i == 0:
                                    if data != "":
                                        try:
                                            lastdate = dp.ymd_from_date(
                                                dp.date_from_dmy(data, '/'))
                                        except:
                                            logging.warn(
                                                "Invalid FD date format - probably no transactions"
                                            )
                                            return

                                        if firstdate == "":
                                            firstdate = lastdate

                                    atts['date'] = lastdate

                                if (i == 1 and not isVisa) or (i == 2
                                                               and isVisa):
                                    atts['display'] = data[0:19]
                                    atts['extradisplay'] = data[19:]

                                if (i == 2 and not isVisa) or (i == 3
                                                               and isVisa):
                                    if data != "":
                                        data = data.strip(u'£')
                                        data = data.strip(u'D')
                                        data = data.strip(u'B')
                                        if data == '':
                                            atts['amount'] = 0
                                        else:
                                            atts['amount'] = int(
                                                float(data) * 100)
                                        atts['type'] = 'Debit'

                                if (i == 3 and not isVisa) or (i == 4
                                                               and isVisa):
                                    if data != "":
                                        data = data.strip(u'£')
                                        data = data.strip(u'C')
                                        data = data.strip(u'R')
                                        if data == '':
                                            atts['amount'] = 0
                                        else:
                                            atts['amount'] = int(
                                                float(data) * 100)
                                        atts['type'] = 'Credit'

                                if not isVisa:
                                    if i == 4:
                                        data = data.strip(u'£')
                                        if data != "":
                                            lastbal = int(float(data) * 100)

                                            if isFirst:
                                                isFirst = False
                                                firstbal = lastbal
                                                doBalance = True

                                    if i == 5:
                                        if doBalance:
                                            doBalance = False
                                            if data == "D":
                                                firstbal = 0 - firstbal
                                            self.statementbuilder.set_current_balance(
                                                firstbal)

                        self.statementbuilder.make_xact(atts)

                self.statementbuilder.put_statement()
                self.current_statement = self.current_statement + 1
Example #21
 def _loadUserSetting(self, communityId, userSettingFilePath):
     parser = BeautifulSoup(open(userSettingFilePath, u'r'))
     nameTagList = parser.findAll(u'user', attrs={ u'community': communityId, u'name': True })
     return dict(map(lambda tag: (tag.renderContents(), tag.get(u'name')), nameTagList))
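# Aside: attrs={u'name': True} in the findAll call above matches tags that merely
# have a "name" attribute, whatever its value (the name keyword itself is reserved
# for the tag name in find/findAll, which is why it is passed through attrs).
# A tiny standalone sketch of that presence test, assuming bs4 and a made-up fragment:
from bs4 import BeautifulSoup

sample = '<users><user community="c1" name="alice">100</user><user community="c1">101</user></users>'
sample_parser = BeautifulSoup(sample, 'html.parser')
named = sample_parser.findAll('user', attrs={'community': 'c1', 'name': True})
print(dict((tag.get_text(), tag.get('name')) for tag in named))  # {'100': 'alice'}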
    def processAccount(self, acCount, acName, account_path, allofit):
        
        page = self.HexToByte( allofit['body'])
        
        # save this page
        self.output_page("account" + str(acCount) + ".html", page) 
        
        soup = BeautifulSoup(page)
            
        logging.debug('ac path - ' + str(account_path) + ' - end' )
        
        if account_path != "":
            # delete existing current xactions
            
            logging.debug('Processing :) ' )
            
            self.statementbuilder = StatementBuilder(self.facade, account_path, self.token)
           
            # need to get last statement and make a new one every time
            self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None) #TODO change this 
                        
            isVisa = False
            loginform=soup.find('input', attrs={'name' : 'cmd_sort_referenceAscending'})
            if loginform != None:
                isVisa = True
                
                bal_tables=soup.findAll('table', attrs={'class' : 'fdTableBackgroundOne'})
                balance_table = bal_tables[2]

                if balance_table != None:
                    vals = balance_table.findAll('td')

                    if vals:
                        bal = vals[1].text
                        data = bal.replace('&#163;', u'£')
                        data = data.strip(u'£')
                        if data[-1] == 'D':
                            data = data.replace('DB','')
                            data = data.replace('D','')
                            lastbal = int( float(data) * 100 )
                            firstbal = 0 - lastbal
                        else:
                            data = data.replace('CR','')
                            data = data.replace('C','')
                            firstbal = int( float(data) * 100 )
                        
                        self.statementbuilder.set_current_balance(firstbal)    
                   
            
            logging.debug("-----------------------------*******---------------------")
            if isVisa:
                logging.debug("found visa --")
            
            acTable=soup.find('table', attrs={'class' : 'fdStatTable'})
            
            # if no table then no new data afaik
            if acTable != None:
               datarows=acTable.findAll('tr')
               
               next = False
               
                
               # build the post values up
               atts = {}
               
               isFirst = True
               firstbal = 0
               firstdate = ""
               
               lastbal = 0
               lastdate = ""
               
               doBalance = False
               
               dp = DateParser()
                           
               for rows in datarows:
                   vals = rows.findAll('td')
                   
                   if vals:
                       for i, val in enumerate(vals):
                           
                           if val.text:
                               data = val.text.strip()
                               data = unescape(data)
                               data = unicode(data)
                               
                           else:
                               data = ""
                           
                           if data != "&nbsp;":
                               data = data.replace('&nbsp;','')
                               if i == 0:
                                   if data != "":
                                       try:
                                           lastdate = dp.ymd_from_date(dp.date_from_dmy(data,'/'))
                                       except:
                                           logging.warn("Invalid FD date format - probably no transactions")
                                           return
                                       
                                       if firstdate == "":
                                           firstdate = lastdate
                                       
                                   atts['date'] = lastdate
                                   
                               if (i == 1 and not isVisa) or (i == 2 and isVisa):
                                       atts['display'] = data[0:19]
                                       atts['extradisplay'] = data[19:]
                                   
                               if (i == 2 and not isVisa) or (i == 3 and isVisa):
                                   if data != "":
                                       data = data.strip(u'£')
                                       data = data.strip(u'D')
                                       data = data.strip(u'B')
                                       if data == '':
                                           atts['amount'] = 0
                                       else:
                                           atts['amount'] = int( float(data) * 100 )
                                       atts['type'] = 'Debit'
                                           
                               if (i == 3 and not isVisa) or (i == 4 and isVisa):
                                   if data != "":
                                       data = data.strip(u'£')
                                       data = data.strip(u'C')
                                       data = data.strip(u'R')
                                       if data == '':
                                           atts['amount'] = 0
                                       else:
                                           atts['amount'] = int( float(data) * 100 )
                                       atts['type'] = 'Credit'
                                       
                               if not isVisa:
                                   if i == 4:
                                       data = data.strip(u'£')
                                       if data != "":
                                           lastbal = int( float(data) * 100 )
                                           
                                           if isFirst:
                                               isFirst = False
                                               firstbal = lastbal
                                               doBalance = True
                                               
                                   if i == 5:
                                       if doBalance:
                                           doBalance = False
                                           if data == "D":
                                               firstbal = 0 - firstbal
                                           self.statementbuilder.set_current_balance(firstbal) 
                                       
                       self.statementbuilder.make_xact(atts)
           
               self.statementbuilder.put_statement()
               self.current_statement = self.current_statement + 1