def doAllLink(self, page):
    """Scan a NatWest statement page for the "All" (show everything) link.

    The page is dumped through ``self.output_page`` for debugging, then
    every anchor is examined in document order.  The first anchor whose
    ``href`` matches the fixed-period statement pattern and whose text
    matches ``.ll`` is handed to ``self.composeLink`` and the scan stops.

    NOTE(review): in the code visible here the composed ``link`` is neither
    returned nor stored on ``self`` - confirm whether the tail of this
    method is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("natwest-xactlist-all-look.html", page)

    # Example of the anchor being looked for:
    # <a href="/StatementsFixedPeriod.aspx?id=...&persist=...&showall=1"
    #    title="Show all items on a single page">All</a>
    logging.debug('NatWest checking for all links')

    link = None
    for anchor in soup.findAll('a'):
        try:
            href = anchor['href']
            if not re.search(".tatements.ixed.eriod", href):
                continue
            logging.debug("natwest - got a statement link")
            if not re.search(".ll", anchor.text):
                continue
            # This is the anchor labelled "All".
            link = self.composeLink(href[:])
            logging.debug("natwest - got an All statement link")
            # Only the first match is needed.
            break
        except Exception as e:
            # Typically an anchor without an href attribute.
            logging.debug('NatWest a link error missing href - ' + str(e))
def __init__(self):
    """Build the XBMC directory listing of full episodes for one show.

    The show page named by the ``showUrl`` plugin argument is fetched and
    parsed.  For every ``episodeListing`` div inside a season block a
    ``ListItem`` is created with thumbnail, air date, season/episode
    numbers, duration, title and plot, and registered with ``xbmcplugin``.
    The directory is closed once all seasons have been processed.

    No check is made that the three metadata regexes matched: an episode
    whose markup does not match raises ``AttributeError``.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    showIndex = BeautifulSoup(source)

    show_title = showIndex.find('div', id='showDashboard').find(
        'span', {'class': 'blueText'}).string
    # The same dict is refilled and handed to setInfo for every episode.
    vidInfo = {'tvshowtitle': show_title, 'studio': 'FOX'}

    season_blocks = showIndex.findAll(
        'div',
        {'class': re.compile('dashPageHolder'), 'id': re.compile('^fullEp')})
    print(len(season_blocks))

    aired_pattern = (r'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})'
                     r'\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$')
    season_pattern = r'Season\s+([0-9]+?)[\s:]'
    episode_pattern = r'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)'

    for season in season_blocks:
        for episode in season.findAll('div', {'class': 'episodeListing'}):
            title = episode.find('h3').find('a').string
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(
                episode.find('img', id=re.compile('^epThumb'))['src'])

            # Make the episode link relative to BASE_URL.
            href = episode.find('a', {'class': 'thumbnailLink'})['href']
            if href[0] == '/':
                episodeUrl = href[1:]
            else:
                episodeUrl = href.replace(self.BASE_URL, '')

            aired_match = re.search(
                aired_pattern,
                str(episode.find('div', {'class': 'episodeInfo'})))
            season_match = re.search(
                season_pattern,
                str(episode.find('p', {'class': 'seasonNum'})))
            number_match = re.search(
                episode_pattern,
                str(episode.find('p', {'class': 'episodeNumLine'})))

            # Groups are month, day, year, plot; stored as year-month-day.
            month, day, year, plot = aired_match.groups()
            vidInfo['aired'] = '%s-%s-%s' % (year, month, day)
            vidInfo['season'] = int(season_match.group(1))
            vidInfo['episode'] = int(number_match.group(1))
            vidInfo['duration'] = number_match.group(2)
            vidInfo['title'] = title
            vidInfo['plot'] = decode_htmlentities(plot)
            print(vidInfo)

            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s" % (sys.argv[0], quote_plus(episodeUrl)))

    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def _parseComment(self, communityId, liveInfoFilePath, commentFilePath):
    """Read the chat messages of one broadcast from its info/comment files.

    Args:
        communityId: UTF-8 encoded community id; the live-info file must
            carry the same value in its ``communityid`` element.
        liveInfoFilePath: Path of the markup file holding the
            ``communityid`` and ``liveid`` elements.
        commentFilePath: Path of the markup file holding the ``chat``
            elements.

    Returns:
        A list of ``(communityId, liveId, userId, name, message, option,
        date)`` tuples, one per ``chat`` element whose ``msgkind`` is
        ``message_msg``.  ``option`` is ``None`` when the ``mail``
        attribute is empty or absent.  The list is empty when either file
        is missing or the live-info file belongs to another community.
    """
    chatList = []
    if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)):
        return chatList

    # Parse inside ``with`` so the file handles are closed deterministically
    # (they used to be left for the garbage collector).
    with open(liveInfoFilePath, u'r') as infoFile:
        infoParser = BeautifulSoup(infoFile)
    if infoParser.find(u'communityid').renderContents() != communityId:
        return chatList

    with open(commentFilePath, u'r') as commentFile:
        commentParser = BeautifulSoup(commentFile)
    chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'})
    if not chatTagList:
        return chatList

    # Both values are identical for every chat entry, so decode them once.
    # Decoding communityId inside the loop re-decoded an already decoded
    # value from the second entry onwards.
    communityId = communityId.decode(u'utf-8')
    liveId = infoParser.find(u'liveid').renderContents().decode()

    datePattern = re.compile(
        r'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})')

    def normaliseDate(match):
        # "2011/7/1 9:05:03" -> "2011-07-01 09:05:03"
        parts = [int(part) for part in match.groups()]
        return u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(*parts)

    for chatTag in chatTagList:
        userId = chatTag.get(u'user').decode(u'utf-8')
        name = chatTag.get(u'nickname').decode(u'utf-8')
        message = chatTag.renderContents().decode(u'utf-8')
        # A missing ``mail`` attribute is treated like an empty one.
        mail = chatTag.get(u'mail')
        option = mail.decode(u'utf-8') if mail else None
        date = datePattern.sub(normaliseDate, chatTag.get(u'date')).decode(u'utf-8')
        chatList.append((communityId, liveId, userId, name, message, option, date))

    return chatList
def doAllLink(self, page):
    """Scan an RBS statement page for the "All" (show everything) link.

    The page is dumped through ``self.output_page`` for debugging, then
    every anchor is examined in document order.  The first anchor whose
    ``href`` matches the fixed-period statement pattern and whose text
    matches ``.ll`` is handed to ``self.composeLink`` and the scan stops.

    NOTE(review): in the code visible here the composed ``link`` is neither
    returned nor stored on ``self`` - confirm whether the tail of this
    method is missing.
    """
    soup = BeautifulSoup(page)
    self.output_page("RBS-xactlist-all-look.html", page)

    # Example of the anchor being looked for:
    # <a href="/StatementsFixedPeriod.aspx?id=...&persist=...&showall=1"
    #    title="Show all items on a single page">All</a>
    logging.debug('RBS checking for all links')

    link = None
    for anchor in soup.findAll('a'):
        try:
            href = anchor['href']
            if not re.search(".tatements.ixed.eriod", href):
                continue
            logging.debug("RBS - got a statement link")
            if not re.search(".ll", anchor.text):
                continue
            # This is the anchor labelled "All".
            link = self.composeLink(href[:])
            logging.debug("RBS - got an All statement link")
            # Only the first match is needed.
            break
        except Exception as e:
            # Typically an anchor without an href attribute.
            logging.debug('RBS a link error missing href - ' + str(e))
def __init__(self):
    """Build the XBMC directory listing of full episodes for one show.

    The show page named by the ``showUrl`` plugin argument is fetched and
    parsed.  For every ``episodeListing`` div inside a season block a
    ``ListItem`` is created with thumbnail, air date, season/episode
    numbers, duration, title and plot, and registered with ``xbmcplugin``.
    The directory is closed once all seasons have been processed.

    No check is made that the three metadata regexes matched: an episode
    whose markup does not match raises ``AttributeError``.
    """
    params = self._parse_argv()
    source = self._fetch_url(self.BASE_URL + unquote_plus(params['showUrl']))
    showIndex = BeautifulSoup(source)

    show_title = showIndex.find('div', id='showDashboard').find(
        'span', {'class': 'blueText'}).string
    # The same dict is refilled and handed to setInfo for every episode.
    vidInfo = {'tvshowtitle': show_title, 'studio': 'FOX'}

    season_blocks = showIndex.findAll(
        'div',
        {'class': re.compile('dashPageHolder'), 'id': re.compile('^fullEp')})
    print(len(season_blocks))

    aired_pattern = (r'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})'
                     r'\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$')
    season_pattern = r'Season\s+([0-9]+?)[\s:]'
    episode_pattern = r'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)'

    for season in season_blocks:
        for episode in season.findAll('div', {'class': 'episodeListing'}):
            title = episode.find('h3').find('a').string
            listitem = xbmcgui.ListItem(title)
            listitem.setThumbnailImage(
                episode.find('img', id=re.compile('^epThumb'))['src'])

            # Make the episode link relative to BASE_URL.
            href = episode.find('a', {'class': 'thumbnailLink'})['href']
            if href[0] == '/':
                episodeUrl = href[1:]
            else:
                episodeUrl = href.replace(self.BASE_URL, '')

            aired_match = re.search(
                aired_pattern,
                str(episode.find('div', {'class': 'episodeInfo'})))
            season_match = re.search(
                season_pattern,
                str(episode.find('p', {'class': 'seasonNum'})))
            number_match = re.search(
                episode_pattern,
                str(episode.find('p', {'class': 'episodeNumLine'})))

            # Groups are month, day, year, plot; stored as year-month-day.
            month, day, year, plot = aired_match.groups()
            vidInfo['aired'] = '%s-%s-%s' % (year, month, day)
            vidInfo['season'] = int(season_match.group(1))
            vidInfo['episode'] = int(number_match.group(1))
            vidInfo['duration'] = number_match.group(2)
            vidInfo['title'] = title
            vidInfo['plot'] = decode_htmlentities(plot)
            print(vidInfo)

            listitem.setInfo("video", vidInfo)
            xbmcplugin.addDirectoryItem(
                handle=int(sys.argv[1]),
                listitem=listitem,
                url="%s?episodeUrl=%s" % (sys.argv[0], quote_plus(episodeUrl)))

    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
def rewrite_html(self, guid, html=None, ajax_url=None):
    """Return entry markup with image URLs pointing at the local cache.

    If we are not using ajax (``ajax_url`` is None), ``html`` is ignored
    and the cached, already rewritten copy is returned.  ``html`` is still
    used to decide whether a cached copy should exist at all, and is
    returned unaltered when something goes wrong.

    Args:
        guid: Entry identifier; converted with ``str()`` and used to locate
            the ``<guid>-mapping.pickle`` file below ``self._store_location``.
        html: Original markup of the entry, or None.
        ajax_url: Base URL under which cached files are served, or None.

    Returns:
        * ``html`` unchanged when no mapping file exists (a caching run is
          started first if the markup contains an ``<img`` tag), when the
          mapping cannot be loaded, or when the markup has no img tags;
        * the stored non-ajax markup when ``ajax_url`` is None;
        * otherwise the prettified markup in which every ``src`` recorded
          as downloaded is replaced by ``ajax_url + "/cache/" + <file>``.
    """
    guid = str(guid)
    cache_dir = os.path.join(self._store_location, guid_hash(guid))
    mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

    if not os.path.isfile(mapping_file):
        # Quick and dirty check: are there images?  If not, plain html is
        # fine.  ``html`` defaults to None, so guard before using it.
        if html is not None and html.lower().find('<img') >= 0:
            self.cache_html(guid, html)
        return html

    try:
        # NOTE(review): pickle executes arbitrary code on load; this is only
        # safe while the cache directory is written by this program alone.
        # NOTE(review): opened in text mode as before - confirm the writer
        # uses the same mode before switching to 'rb'.
        with open(mapping_file, 'r') as mapping:
            rewrite_hash = pickle.load(mapping)
            non_ajax_html = pickle.load(mapping)
    except Exception:
        logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file))
        logging.error("If you have upgraded penguintv, you might need to delete your image cache")
        return html

    if ajax_url is None:
        return non_ajax_html

    # Otherwise rewrite on the fly.
    soup = BeautifulSoup(html)
    img_tags = soup.findAll('img')
    if not img_tags:
        return html

    for img in img_tags:
        # Believe it or not, some img tags don't have a src; they have an id
        # that points to CSS.  (has_key is the tag's attribute test.)
        if not img.has_key('src'):
            continue
        src = img['src']
        if src not in rewrite_hash:
            continue
        cached = rewrite_hash[src]
        # cached[0] is the cached file name, cached[1] its download state.
        if cached[1] == UrlCacher.DOWNLOADED:
            img['src'] = ajax_url + "/cache/" + cached[0]

    return soup.prettify()
def _parseNatWestLinks(self, raw):
    """Collect the accounts listed on the NatWest accounts page.

    Every ``<a class="accountNameExpand">`` is appended to
    ``self.accountLinks`` and a summary dict with the keys ``name``,
    ``num`` (sort code + "-" + account number), ``type`` and ``bal`` is
    appended to ``self.myAccounts``.

    Args:
        raw: Markup of the accounts page.

    Returns:
        ``'account problem'`` when the page has no account links,
        ``'bank error'`` when an account number or sort code span cannot be
        read; the visible code has no explicit return on success (None).
    """
    soup = BeautifulSoup(raw)
    accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})
    if len(accountBLock) == 0:
        logging.warning('NatWest no accounts after continue form')
        return 'account problem'

    for ac_link in accountBLock:
        ac_link.string = ac_link.text
        self.accountLinks.append(ac_link)

        # The number, sort code and balance cannot be taken from the link
        # itself; they sit in the element two levels above it.
        row = ac_link.parent.parent
        try:
            acnum = row.find('span', attrs={'class': 'AccountNumber'}).text
            acnum = acnum.replace(' ', '')
            sortc = row.find('span', attrs={'class': 'SortCode'}).text
            sortc = sortc.replace(' ', '').replace('-', '')
        except Exception as e:
            logging.exception('NatWest form error - ' + str(e))
            return 'bank error'

        # Combine the two to form our matching number.
        num = sortc + "-" + acnum

        # A long account number is taken to be a credit card.
        actype = 'Credit' if len(acnum) > 14 else 'Cheque'

        balance = 0
        baltds = row.findAll('td')
        # The balance is read from the fourth cell, so four cells must
        # exist (the previous "> 2" test raised IndexError on 3-cell rows).
        if len(baltds) > 3:
            baltext = self.tidy_text(baltds[3].text)
            balance = self.normalise_ammount(baltext)

        self.myAccounts.append({
            'name': ac_link.text,
            'num': num,
            'type': actype,
            'bal': balance,
        })
def doStep3(self, allofit, page):
    """Inspect the page returned after the NatWest customer-number step.

    Returns ``'credentials incorrect'`` when the customer-number input is
    still present (we never left the first page) and ``'bank error'`` when
    the ``aspnetForm`` form is missing.  Otherwise the form is parsed and
    the label of the first PIN challenge is looked up.

    NOTE(review): the visible code stops after that lookup; the locals
    prepared below are not read here and are presumably used by a
    continuation of this method - confirm before removing any of them.
    """
    scrape_result = 'good'

    # Grab the form values and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("natwest-security.html", page)
    logging.info("NatWest security page2")

    # If the customer-number input came back, the first page was rejected.
    if soup.findAll('input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'}):
        logging.info("NatWest security page1 still - customer number bad")
        return 'credentials incorrect'

    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('NatWest no security form')
        return 'bank error'

    values = self.parseForm(loginform)

    # Defined up front so they exist outside any try block:
    # the label texts split on spaces ...
    which1arr = which2arr = which3arr = ""
    # ... and the challenge digits.
    firstDigit = secondDigit = thirdDigit = ""

    # First set of PIN fields.  Example of the label being read:
    # <label for="ctl00_mainContent_Tab1_LI6PPEA_edit" ...>Enter the 2nd number</label>
    useNewTab = False
    try:
        which1 = soup.find('label', attrs={'for': 'ctl00_mainContent_LI6PPEA_edit'}).text
    except Exception:
        # Label not found under the old id.
        useNewTab = True
def rewrite_html(self, guid, html=None, ajax_url=None):
    """Return entry markup with image URLs pointing at the local cache.

    If we are not using ajax (``ajax_url`` is None), ``html`` is ignored
    and the cached, already rewritten copy is returned.  ``html`` is still
    used to decide whether a cached copy should exist at all, and is
    returned unaltered when something goes wrong.

    Args:
        guid: Entry identifier; converted with ``str()`` and used to locate
            the ``<guid>-mapping.pickle`` file below ``self._store_location``.
        html: Original markup of the entry, or None.
        ajax_url: Base URL under which cached files are served, or None.

    Returns:
        * ``html`` unchanged when no mapping file exists (a caching run is
          started first if the markup contains an ``<img`` tag), when the
          mapping cannot be loaded, or when the markup has no img tags;
        * the stored non-ajax markup when ``ajax_url`` is None;
        * otherwise the prettified markup in which every ``src`` recorded
          as downloaded is replaced by ``ajax_url + "/cache/" + <file>``.
    """
    guid = str(guid)
    cache_dir = os.path.join(self._store_location, guid_hash(guid))
    mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

    if not os.path.isfile(mapping_file):
        # Quick and dirty check: are there images?  If not, plain html is
        # fine.  ``html`` defaults to None, so guard before using it.
        if html is not None and html.lower().find('<img') >= 0:
            self.cache_html(guid, html)
        return html

    try:
        # NOTE(review): pickle executes arbitrary code on load; this is only
        # safe while the cache directory is written by this program alone.
        # NOTE(review): opened in text mode as before - confirm the writer
        # uses the same mode before switching to 'rb'.
        with open(mapping_file, 'r') as mapping:
            rewrite_hash = pickle.load(mapping)
            non_ajax_html = pickle.load(mapping)
    except Exception:
        logging.error("error opening cache pickle for guid %s %s" % (guid, mapping_file))
        logging.error("If you have upgraded penguintv, you might need to delete your image cache")
        return html

    if ajax_url is None:
        return non_ajax_html

    # Otherwise rewrite on the fly.
    soup = BeautifulSoup(html)
    img_tags = soup.findAll('img')
    if not img_tags:
        return html

    for img in img_tags:
        # Believe it or not, some img tags don't have a src; they have an id
        # that points to CSS.  (has_key is the tag's attribute test.)
        if not img.has_key('src'):
            continue
        src = img['src']
        if src not in rewrite_hash:
            continue
        cached = rewrite_hash[src]
        # cached[0] is the cached file name, cached[1] its download state.
        if cached[1] == UrlCacher.DOWNLOADED:
            img['src'] = ajax_url + "/cache/" + cached[0]

    return soup.prettify()
def _parseRBSLinks(self, raw):
    """Collect the accounts listed on the RBS accounts page.

    Every ``<a class="accountNameExpand">`` is appended to
    ``self.accountLinks`` and a summary dict with the keys ``name``,
    ``num`` (sort code + "-" + account number), ``type`` and ``bal`` is
    appended to ``self.myAccounts``.

    Args:
        raw: Markup of the accounts page.

    Returns:
        ``'account problem'`` when the page has no account links,
        ``'bank error'`` when an account number or sort code span cannot be
        read; the visible code has no explicit return on success (None).
    """
    soup = BeautifulSoup(raw)
    accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})
    if len(accountBLock) == 0:
        logging.warning('RBS no accounts after continue form')
        return 'account problem'

    for ac_link in accountBLock:
        ac_link.string = ac_link.text
        self.accountLinks.append(ac_link)

        # The number, sort code and balance cannot be taken from the link
        # itself; they sit in the element two levels above it.
        row = ac_link.parent.parent
        try:
            acnum = row.find('span', attrs={'class': 'AccountNumber'}).text
            acnum = acnum.replace(' ', '')
            sortc = row.find('span', attrs={'class': 'SortCode'}).text
            sortc = sortc.replace(' ', '').replace('-', '')
        except Exception as e:
            logging.exception('RBS form error - ' + str(e))
            return 'bank error'

        # Combine the two to form our matching number.
        num = sortc + "-" + acnum

        # A long account number is taken to be a credit card.
        actype = 'Credit' if len(acnum) > 14 else 'Cheque'

        balance = 0
        baltds = row.findAll('td')
        # The balance is read from the fourth cell, so four cells must
        # exist (the previous "> 2" test raised IndexError on 3-cell rows).
        if len(baltds) > 3:
            baltext = self.tidy_text(baltds[3].text)
            balance = self.normalise_ammount(baltext)

        self.myAccounts.append({
            'name': ac_link.text,
            'num': num,
            'type': actype,
            'bal': balance,
        })
def doStep3(self, allofit, page):
    """Inspect the page returned after the RBS customer-number step.

    Returns ``'credentials incorrect'`` when the customer-number input is
    still present (we never left the first page) and ``'bank error'`` when
    the ``aspnetForm`` form is missing.  Otherwise the form is parsed and
    the label of the first PIN challenge is looked up.

    NOTE(review): the visible code stops after that lookup; the locals
    prepared below are not read here and are presumably used by a
    continuation of this method - confirm before removing any of them.
    """
    scrape_result = 'good'

    # Grab the form values and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("RBS-security.html", page)
    logging.info("RBS security page2")

    # If the customer-number input came back, the first page was rejected.
    if soup.findAll('input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'}):
        logging.info("RBS security page1 still - customer number bad")
        return 'credentials incorrect'

    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('RBS no security form')
        return 'bank error'

    values = self.parseForm(loginform)

    # Defined up front so they exist outside any try block:
    # the label texts split on spaces ...
    which1arr = which2arr = which3arr = ""
    # ... and the challenge digits.
    firstDigit = secondDigit = thirdDigit = ""

    # First set of PIN fields.  Example of the label being read:
    # <label for="ctl00_mainContent_Tab1_LI6PPEA_edit" ...>Enter the 2nd number</label>
    useNewTab = False
    try:
        which1 = soup.find('label', attrs={'for': 'ctl00_mainContent_LI6PPEA_edit'}).text
    except Exception:
        # Label not found under the old id.
        useNewTab = True
def get(self):
    """Write an HTML list of ISBNdb matches for the ``search`` parameter.

    The ``search`` request value is sent as a combined book query; for each
    ``bookdata`` element of the reply the title, ISBN-13, authors and
    publisher are written to the response.

    Text taken from the reply is HTML-escaped and formatted as unicode, so
    markup characters and non-ASCII titles no longer break the page.  A
    missing element is rendered as ``None``, as an empty one already was.

    NOTE(review): ``str(bookname)`` still fails for a non-ASCII search
    term; what encoding ``isbndbpy.Request`` expects is not visible here.
    """
    # Imported locally so the block does not depend on a file-level import.
    from xml.sax.saxutils import escape

    bookname = self.request.get('search')
    req = isbndbpy.Request('books', 'combined', str(bookname))
    resp = req.send().read()
    soup = BS(str(resp))

    def text_of(tag):
        # Escaped text of a tag; 'None' when the tag or its string is absent.
        value = tag.string if tag is not None else None
        return escape(u'%s' % (value,))

    out = self.response.out
    out.write('<html><body>')
    for bookdata in soup.findAll('bookdata'):
        out.write(u'<br/>Title: ' + text_of(bookdata.find('title')))
        out.write(u'<br/>ISBN: ' + escape(u'%s' % (bookdata.get('isbn13'),)))
        out.write(u'<br/>AUTHOR :' + text_of(bookdata.find('authorstext')))
        out.write(u'<br/>PUBLISHER: ' + text_of(bookdata.find('publishertext')))
        out.write('<br/> "***********')
    # Close the document opened above.
    out.write('</body></html>')
def _parseNatWestLinks(self, raw):
    """Collect the accounts listed on the NatWest accounts page.

    Every ``<a class="accountNameExpand">`` is appended to
    ``self.accountLinks`` and a summary dict with the keys ``name``,
    ``num`` (sort code + "-" + account number), ``type`` and ``bal`` is
    appended to ``self.myAccounts``.

    Args:
        raw: Markup of the accounts page.

    Returns:
        ``"account problem"`` when the page has no account links,
        ``"bank error"`` when an account number or sort code span cannot be
        read; the visible code has no explicit return on success (None).
    """
    soup = BeautifulSoup(raw)
    accountBLock = soup.findAll("a", attrs={"class": "accountNameExpand"})
    if len(accountBLock) == 0:
        logging.warning("NatWest no accounts after continue form")
        return "account problem"

    for ac_link in accountBLock:
        ac_link.string = ac_link.text
        self.accountLinks.append(ac_link)

        # The number, sort code and balance cannot be taken from the link
        # itself; they sit in the element two levels above it.
        row = ac_link.parent.parent
        try:
            acnum = row.find("span", attrs={"class": "AccountNumber"}).text
            acnum = acnum.replace(" ", "")
            sortc = row.find("span", attrs={"class": "SortCode"}).text
            sortc = sortc.replace(" ", "").replace("-", "")
        except Exception as e:
            logging.exception("NatWest form error - " + str(e))
            return "bank error"

        # Combine the two to form our matching number.
        num = sortc + "-" + acnum

        # A long account number is taken to be a credit card.
        actype = "Credit" if len(acnum) > 14 else "Cheque"

        balance = 0
        baltds = row.findAll("td")
        # The balance is read from the fourth cell, so four cells must
        # exist (the previous "> 2" test raised IndexError on 3-cell rows).
        if len(baltds) > 3:
            baltext = self.tidy_text(baltds[3].text)
            balance = self.normalise_ammount(baltext)

        self.myAccounts.append({
            "name": ac_link.text,
            "num": num,
            "type": actype,
            "bal": balance,
        })
def doStep12(self, page):
    """Find the credit-card statement detail link on an RBS page.

    Returns ``'bank error'`` when the page has no ``link-button-right``
    anchors or none of them points at a card statement detail page.
    Otherwise the last matching ``href`` is selected and checked with
    ``urlparse``; if that raises, ``self.urlBase`` is prefixed instead.

    NOTE(review): ``action`` is not used or returned in the code visible
    here - confirm whether the tail of this method is missing.
    """
    # Grab the candidate buttons and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("RBS-xactlist-cc-poss.html", page)

    rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
    if not rightButtons:
        logging.error('RBS no cc accountbuttons')
        return 'bank error'

    # RBS is not dynamic, so this static list is fine (unlike Smile).
    # Keep only the statement buttons; when several match, the last wins.
    detailLinks = [button['href'] for button in rightButtons
                   if re.search(".ard.tatement.etail", button['href'])]
    acLink = detailLinks[-1][:] if detailLinks else None

    if acLink is None:
        logging.debug('RBS no cc detail link')
        return 'bank error'

    action = acLink
    try:
        logging.debug("checking link - " + acLink)
        # If it parses properly the link is used as it stands ...
        urls = urlparse(acLink)
    except Exception as e:
        # ... otherwise it is treated as relative to the site base.
        logging.error('RBS cc link error - ' + str(e))
        action = self.urlBase + '/' + acLink
def doStep12(self, page):
    """Find the credit-card statement detail link on a NatWest page.

    Returns ``'bank error'`` when the page has no ``link-button-right``
    anchors or none of them points at a card statement detail page.
    Otherwise the last matching ``href`` is selected and checked with
    ``urlparse``; if that raises, ``self.urlBase`` is prefixed instead.

    NOTE(review): ``action`` is not used or returned in the code visible
    here - confirm whether the tail of this method is missing.
    """
    # Grab the candidate buttons and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("natwest-xactlist-cc-poss.html", page)

    rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})
    if not rightButtons:
        logging.error('NatWest no cc accountbuttons')
        return 'bank error'

    # NatWest is not dynamic, so this static list is fine (unlike Smile).
    # Keep only the statement buttons; when several match, the last wins.
    detailLinks = [button['href'] for button in rightButtons
                   if re.search(".ard.tatement.etail", button['href'])]
    acLink = detailLinks[-1][:] if detailLinks else None

    if acLink is None:
        logging.debug('NatWest no cc detail link')
        return 'bank error'

    action = acLink
    try:
        logging.debug("checking link - " + acLink)
        # If it parses properly the link is used as it stands ...
        urls = urlparse(acLink)
    except Exception as e:
        # ... otherwise it is treated as relative to the site base.
        logging.error('NatWest cc link error - ' + str(e))
        action = self.urlBase + '/' + acLink
def doStep4(self, allofit, page):
    """Classify the page that follows the RBS PIN/password step.

    Returns:
        ``'credentials incorrect'`` when the PIN input is still present, or
        when neither the finish nor the next button can be found;
        ``'good'`` when an account table is present;
        ``'bank error'`` when the continue form is missing;
        ``'messages'`` when a continue form was found.  In that case
        ``self.response`` is replaced by a dict holding the hex-encoded
        target ``url`` and form ``data``, ``method`` ``'POST'`` and
        ``step`` 4.
    """
    # Grab the form values and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("RBS-pos-accounts.html", page)
    logging.info("RBS message or bad cred check ")

    # The PIN input coming back means the credentials were rejected.
    if soup.findAll('input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'}):
        logging.info("RBS defiantely bad credentials")
        return 'credentials incorrect'

    # An account table means we are already logged in.
    if soup.findAll('table', attrs={'class': 'AccountTable'}):
        logging.debug("RBS defiantely got some good accounts")
        return 'good'

    # Look for the normal continue button, then the next button; without
    # either the login is treated as failed.
    finishButton = soup.find('input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})
    if finishButton is None:
        logging.warning("RBS cant find finish button credentials incorrect")
        nextButton = soup.find('input', attrs={'id': 'ctl00_mainContent_NextButton_button'})
        if nextButton is None:
            logging.warning("RBS cant find next button either")
            return 'credentials incorrect'

    # The form these buttons belong to.
    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('RBS no continue form')
        return 'bank error'

    action = self.urlBase + '/' + loginform['action']
    # Hidden values etc., encoded as the request body.
    data = urllib.urlencode(self.parseForm(loginform))

    reply = self.response = {}
    reply['url'] = self.ByteToHex(action)
    reply['data'] = self.ByteToHex(data)
    reply['method'] = 'POST'
    reply['step'] = 4
    return 'messages'
def doStep4(self, allofit, page):
    """Classify the page that follows the NatWest PIN/password step.

    Returns:
        ``"credentials incorrect"`` when the PIN input is still present, or
        when neither the finish nor the next button can be found;
        ``"good"`` when an account table is present;
        ``"bank error"`` when the continue form is missing;
        ``"messages"`` when a continue form was found.  In that case
        ``self.response`` is replaced by a dict holding the hex-encoded
        target ``url`` and form ``data``, ``method`` ``"POST"`` and
        ``step`` 4.
    """
    # Grab the form values and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("natwest-pos-accounts.html", page)
    logging.info("NatWest message or bad cred check ")

    # The PIN input coming back means the credentials were rejected.
    if soup.findAll("input", attrs={"name": "ctl00$mainContent$LI6PPEA_edit"}):
        logging.info("NatWest defiantely bad credentials")
        return "credentials incorrect"

    # An account table means we are already logged in.
    if soup.findAll("table", attrs={"class": "AccountTable"}):
        logging.debug("NatWest defiantely got some good accounts")
        return "good"

    # Look for the normal continue button, then the next button; without
    # either the login is treated as failed.
    finishButton = soup.find("input", attrs={"id": "ctl00_mainContent_FinishButton_button"})
    if finishButton is None:
        logging.warning("NatWest cant find finish button credentials incorrect")
        nextButton = soup.find("input", attrs={"id": "ctl00_mainContent_NextButton_button"})
        if nextButton is None:
            logging.warning("NatWest cant find next button either")
            return "credentials incorrect"

    # The form these buttons belong to.
    loginform = soup.find("form", attrs={"name": "aspnetForm"})
    if loginform is None:
        logging.debug("NatWest no continue form")
        return "bank error"
    logging.debug("found a continue form - so clicking it")

    action = self.urlBase + "/" + loginform["action"]
    # Hidden values etc., encoded as the request body.
    data = urllib.urlencode(self.parseForm(loginform))

    reply = self.response = {}
    reply["url"] = self.ByteToHex(action)
    reply["data"] = self.ByteToHex(data)
    reply["method"] = "POST"
    reply["step"] = 4
    return "messages"
def doStep4(self, allofit, page):
    """Classify the page that follows the NatWest PIN/password step.

    Returns:
        ``'credentials incorrect'`` when the PIN input is still present, or
        when neither the finish nor the next button can be found;
        ``'good'`` when an account table is present;
        ``'bank error'`` when the continue form is missing;
        ``'messages'`` when a continue form was found.  In that case
        ``self.response`` is replaced by a dict holding the hex-encoded
        target ``url`` and form ``data``, ``method`` ``'POST'`` and
        ``step`` 4.
    """
    # Grab the form values and keep a copy of the page for debugging.
    soup = BeautifulSoup(page)
    self.output_page("natwest-pos-accounts.html", page)
    logging.info("NatWest message or bad cred check ")

    # The PIN input coming back means the credentials were rejected.
    if soup.findAll('input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'}):
        logging.info("NatWest defiantely bad credentials")
        return 'credentials incorrect'

    # An account table means we are already logged in.
    if soup.findAll('table', attrs={'class': 'AccountTable'}):
        logging.debug("NatWest defiantely got some good accounts")
        return 'good'

    # Look for the normal continue button, then the next button; without
    # either the login is treated as failed.
    finishButton = soup.find('input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})
    if finishButton is None:
        logging.warning("NatWest cant find finish button credentials incorrect")
        nextButton = soup.find('input', attrs={'id': 'ctl00_mainContent_NextButton_button'})
        if nextButton is None:
            logging.warning("NatWest cant find next button either")
            return 'credentials incorrect'

    # The form these buttons belong to.
    loginform = soup.find('form', attrs={'name': 'aspnetForm'})
    if loginform is None:
        logging.debug('NatWest no continue form')
        return 'bank error'
    logging.debug('found a continue form - so clicking it')

    action = self.urlBase + '/' + loginform['action']
    # Hidden values etc., encoded as the request body.
    data = urllib.urlencode(self.parseForm(loginform))

    reply = self.response = {}
    reply['url'] = self.ByteToHex(action)
    reply['data'] = self.ByteToHex(data)
    reply['method'] = 'POST'
    reply['step'] = 4
    return 'messages'
def post(self):
    """Handle the username form and render the user's project list.

    The submitted ``username`` is reduced to its text content, a visit is
    recorded, and up to five of the user's GitHub repositories are looked
    up.  Project details come from the datastore when the user already has
    stored projects, otherwise (or when the stored entry has no graph URL)
    from ``grab_project_details``.  The collected projects are passed to
    ``update_models`` and ``templates/index.html`` is rendered.

    On any handled failure, or when no username was given, the template is
    rendered with ``{'errors': [...]}`` instead.
    """
    errors = []
    context = defaultdict(list)

    username = self.request.get("username")
    if username != "":
        # Keep only the text of the submitted value, dropping any markup.
        bsusername = BeautifulSoup(username)
        username = ''.join(bsusername.findAll(text=True)).strip()
        models.add_visit(username)
        github = Github(requests_per_second=1)
        try:
            logging.info("Grabbing repositories for user: %s" % username)
            repos = github.repos.list(username)
            user = models.User.gql('WHERE username = :1', username).get()
            projects_in_store = user.projects
            project_detail_list = []
            if len(repos) > 0:
                # Iterate the repositories themselves: the former
                # enumerate() wrapper produced (index, repo) tuples, so
                # every ``repo.name`` access raised AttributeError.
                for repo in repos[:5]:
                    project = None
                    # Don't hammer the datastore if the project list is empty.
                    if len(projects_in_store) > 0:
                        logging.info("Grabbing project details from datastore key: %s" % (username + "_" + repo.name))
                        project = obj_to_dictionary(models.Project.get_by_key_name(username + "_" + repo.name))

                    if not project:
                        # Not stored yet: grab from github.
                        logging.info("Grabbing project details for: %s" % repo.name)
                        project = grab_project_details(repo)
                        context['projects'].append(dict_to_tuple(project))
                        project_detail_list.append(project)
                    else:
                        # Stored: refresh only when the graph URL is missing.
                        if project['graph_url'] == "":
                            logging.info("Grabbing graph url for project: %s" % repo.name)
                            project = grab_project_details(repo)
                            project_detail_list.append(project)
                        context['projects'].append(dict_to_tuple(project))
                update_models(project_detail_list, user)
            else:
                logging.error("User %s has no repositories." % username)
                errors.append('User %s has no repositories.' % username)
                context = {'errors': errors}
        except RuntimeError:
            logging.error("RuntimeError when grabbing repositories for username: %s" % username)
            errors.append("An error occured - are you sure you spelled the username correctly?")
            context = {'errors': errors}
        except ApplicationError:
            logging.error("Timeout when grabbing repositories, try again")
            errors.append("An error occured - are you sure you spelled the username correctly?")
            context = {'errors': errors}
        except DeadlineExceededError:
            self.response.clear()
            self.response.set_status(500)
            errors.append("App timed out probably because user has too many projects")
            context = {'errors': errors}
            logging.critical("Application timed out while querying for user: %s" % username)
    else:
        errors.append("Enter username")
        context = {'errors': errors}

    self.response.out.write(template.render('templates/index.html', context))
# processAccount: parse one scraped account page and feed its balance and
# transactions to a StatementBuilder.
#
# Parameters, as used by the code below:
#   acCount      - only used to name the debug dump "account<acCount>.html".
#   acName       - not referenced in this method.
#   account_path - compared against "" before processing and passed to
#                  StatementBuilder together with self.facade and self.token.
#   allofit      - mapping whose 'body' entry is decoded with self.HexToByte
#                  to obtain the page markup.
#
# What the code does:
#   * dumps the page via self.output_page and parses it with BeautifulSoup;
#   * creates self.statementbuilder and calls
#     make_recent_dif_statement('Fd-recent', 'Scraper', None);
#   * sets isVisa when an input named cmd_sort_referenceAscending exists;
#   * reads a balance from the second cell of the third table with class
#     fdTableBackgroundOne; a value ending in 'D' is negated;
#   * walks the rows of the table with class fdStatTable.  Per cell index:
#     0 = date (d/m/y, converted through DateParser), 1 (2 for visa) =
#     description, split at 19 characters into 'display'/'extradisplay',
#     2 (3) = debit amount, 3 (4) = credit amount; for non-visa pages
#     4 = running balance and 5 = its "D" marker;
#   * hands the collected attributes to make_xact, calls put_statement and
#     increments self.current_statement;
#   * returns early, without a value, when a date cell cannot be parsed.
#
# NOTE(review): this definition arrived with its line structure collapsed,
# so the nesting of the statements cannot be confirmed from here; restore
# the original indentation before changing any logic.
# NOTE(review): int(float(data) * 100) truncates, so binary float error can
# lose a penny (int(float('0.29') * 100) == 28); consider round() or Decimal.
# NOTE(review): bal.replace('£', u'£') is a no-op as written; presumably the
# first argument was once an HTML entity - confirm against the page source.
# NOTE(review): `<>` and `unicode` make this Python 2 only; the bare
# `except:` also hides errors other than a bad date; `next` shadows the
# builtin and is never read.
def processAccount(self, acCount, acName, account_path, allofit): page = self.HexToByte(allofit['body']) # save this page self.output_page("account" + str(acCount) + ".html", page) soup = BeautifulSoup(page) logging.debug('ac path - ' + str(account_path) + ' - end') if account_path != "": # delete existing current xactions logging.debug('Processing :) ') self.statementbuilder = StatementBuilder(self.facade, account_path, self.token) # need to get last statement and make a new one every time self.statementbuilder.make_recent_dif_statement( 'Fd-recent', 'Scraper', None) #TODO change this isVisa = False loginform = soup.find( 'input', attrs={'name': 'cmd_sort_referenceAscending'}) if loginform != None: isVisa = True bal_tables = soup.findAll( 'table', attrs={'class': 'fdTableBackgroundOne'}) balance_table = bal_tables[2] if balance_table <> None: vals = balance_table.findAll('td') if vals: bal = vals[1].text data = bal.replace('£', u'£') data = data.strip(u'£') if data[-1] == 'D': data = data.replace('DB', '') data = data.replace('D', '') lastbal = int(float(data) * 100) firstbal = 0 - lastbal else: data = data.replace('CR', '') data = data.replace('C', '') firstbal = int(float(data) * 100) self.statementbuilder.set_current_balance(firstbal) logging.debug( "-----------------------------*******---------------------") if isVisa: logging.debug("found visa --") acTable = soup.find('table', attrs={'class': 'fdStatTable'}) # if no table then no new data afaik if acTable != None: datarows = acTable.findAll('tr') next = False # build the post values up atts = {} isFirst = True firstbal = 0 firstdate = "" lastbal = 0 lastdate = "" doBalance = False dp = DateParser() for rows in datarows: vals = rows.findAll('td') if vals: for i, val in enumerate(vals): if val.text: data = val.text.strip() data = unescape(data) data = unicode(data) else: data = "" if data != " ": data = data.replace(' ', '') if i == 0: if data != "": try: lastdate = dp.ymd_from_date( dp.date_from_dmy(data, 
'/')) except: logging.warn( "Invalid FD date format - probably no transactions" ) return if firstdate == "": firstdate = lastdate atts['date'] = lastdate if (i == 1 and not isVisa) or (i == 2 and isVisa): atts['display'] = data[0:19] atts['extradisplay'] = data[19:] if (i == 2 and not isVisa) or (i == 3 and isVisa): if data != "": data = data.strip(u'£') data = data.strip(u'D') data = data.strip(u'B') if data == '': atts['amount'] = 0 else: atts['amount'] = int( float(data) * 100) atts['type'] = 'Debit' if (i == 3 and not isVisa) or (i == 4 and isVisa): if data != "": data = data.strip(u'£') data = data.strip(u'C') data = data.strip(u'R') if data == '': atts['amount'] = 0 else: atts['amount'] = int( float(data) * 100) atts['type'] = 'Credit' if not isVisa: if i == 4: data = data.strip(u'£') if data != "": lastbal = int(float(data) * 100) if isFirst: isFirst = False firstbal = lastbal doBalance = True if i == 5: if doBalance: doBalance = False if data == "D": firstbal = 0 - firstbal self.statementbuilder.set_current_balance( firstbal) self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1
def _loadUserSetting(self, communityId, userSettingFilePath):
    """Load the per-user names configured for one community.

    Args:
        communityId: Value the ``community`` attribute of a ``user``
            element must have to be included.
        userSettingFilePath: Path of the user-setting markup file.

    Returns:
        A dict mapping the rendered contents of each matching ``user``
        element (presumably the user id - confirm against the file format)
        to the value of its ``name`` attribute.  Elements without a
        ``name`` attribute are skipped.
    """
    # Parse inside ``with`` so the file handle is closed deterministically
    # (it used to be left for the garbage collector).
    with open(userSettingFilePath, u'r') as settingFile:
        parser = BeautifulSoup(settingFile)

    nameTagList = parser.findAll(
        u'user', attrs={u'community': communityId, u'name': True})
    return dict((tag.renderContents(), tag.get(u'name')) for tag in nameTagList)
# processAccount: parse one scraped account page and feed its balance and
# transactions to a StatementBuilder.
#
# Parameters, as used by the code below:
#   acCount      - only used to name the debug dump "account<acCount>.html".
#   acName       - not referenced in this method.
#   account_path - compared against "" before processing and passed to
#                  StatementBuilder together with self.facade and self.token.
#   allofit      - mapping whose 'body' entry is decoded with self.HexToByte
#                  to obtain the page markup.
#
# What the code does:
#   * dumps the page via self.output_page and parses it with BeautifulSoup;
#   * creates self.statementbuilder and calls
#     make_recent_dif_statement('Fd-recent', 'Scraper', None);
#   * sets isVisa when an input named cmd_sort_referenceAscending exists;
#   * reads a balance from the second cell of the third table with class
#     fdTableBackgroundOne; a value ending in 'D' is negated;
#   * walks the rows of the table with class fdStatTable.  Per cell index:
#     0 = date (d/m/y, converted through DateParser), 1 (2 for visa) =
#     description, split at 19 characters into 'display'/'extradisplay',
#     2 (3) = debit amount, 3 (4) = credit amount; for non-visa pages
#     4 = running balance and 5 = its "D" marker;
#   * hands the collected attributes to make_xact, calls put_statement and
#     increments self.current_statement;
#   * returns early, without a value, when a date cell cannot be parsed.
#
# NOTE(review): this definition arrived with its line structure collapsed,
# so the nesting of the statements cannot be confirmed from here; restore
# the original indentation before changing any logic.
# NOTE(review): int(float(data) * 100) truncates, so binary float error can
# lose a penny (int(float('0.29') * 100) == 28); consider round() or Decimal.
# NOTE(review): bal.replace('£', u'£') is a no-op as written; presumably the
# first argument was once an HTML entity - confirm against the page source.
# NOTE(review): `<>` and `unicode` make this Python 2 only; the bare
# `except:` also hides errors other than a bad date; `next` shadows the
# builtin and is never read.
def processAccount(self, acCount, acName, account_path, allofit): page = self.HexToByte( allofit['body']) # save this page self.output_page("account" + str(acCount) + ".html", page) soup = BeautifulSoup(page) logging.debug('ac path - ' + str(account_path) + ' - end' ) if account_path != "": # delete existing current xactions logging.debug('Processing :) ' ) self.statementbuilder = StatementBuilder(self.facade, account_path, self.token) # need to get last statement and make a new one every time self.statementbuilder.make_recent_dif_statement('Fd-recent', 'Scraper', None) #TODO change this isVisa = False loginform=soup.find('input', attrs={'name' : 'cmd_sort_referenceAscending'}) if loginform != None: isVisa = True bal_tables=soup.findAll('table', attrs={'class' : 'fdTableBackgroundOne'}) balance_table = bal_tables[2] if balance_table <> None: vals = balance_table.findAll('td') if vals: bal = vals[1].text data = bal.replace('£', u'£'); data = data.strip(u'£') if data[-1] == 'D': data = data.replace('DB','') data = data.replace('D','') lastbal = int( float(data) * 100 ) firstbal = 0 - lastbal else: data = data.replace('CR','') data = data.replace('C','') firstbal = int( float(data) * 100 ) self.statementbuilder.set_current_balance(firstbal) logging.debug("-----------------------------*******---------------------") if isVisa: logging.debug("found visa --") acTable=soup.find('table', attrs={'class' : 'fdStatTable'}) # if no table then no new data afaik if acTable != None: datarows=acTable.findAll('tr') next = False # build the post values up atts = {} isFirst = True firstbal = 0 firstdate = "" lastbal = 0 lastdate = "" doBalance = False dp = DateParser() for rows in datarows: vals = rows.findAll('td') if vals: for i, val in enumerate(vals): if val.text: data = val.text.strip() data = unescape(data) data = unicode(data) else: data = "" if data != " ": data = data.replace(' ','') if i == 0: if data != "": try: lastdate = dp.ymd_from_date(dp.date_from_dmy(data,'/')) 
except: logging.warn("Invalid FD date format - probably no transactions") return if firstdate == "": firstdate = lastdate atts['date'] = lastdate if (i == 1 and not isVisa) or (i == 2 and isVisa): atts['display'] = data[0:19] atts['extradisplay'] = data[19:] if (i == 2 and not isVisa) or (i == 3 and isVisa): if data != "": data = data.strip(u'£') data = data.strip(u'D') data = data.strip(u'B') if data == '': atts['amount'] = 0 else: atts['amount'] = int( float(data) * 100 ) atts['type'] = 'Debit' if (i == 3 and not isVisa) or (i == 4 and isVisa): if data != "": data = data.strip(u'£') data = data.strip(u'C') data = data.strip(u'R') if data == '': atts['amount'] = 0 else: atts['amount'] = int( float(data) * 100 ) atts['type'] = 'Credit' if not isVisa: if i == 4: data = data.strip(u'£') if data != "": lastbal = int( float(data) * 100 ) if isFirst: isFirst = False firstbal = lastbal doBalance = True if i == 5: if doBalance: doBalance = False if data == "D": firstbal = 0 - firstbal self.statementbuilder.set_current_balance(firstbal) self.statementbuilder.make_xact(atts) self.statementbuilder.put_statement() self.current_statement = self.current_statement + 1