def doRequest(self):
    d = Download(self.Url)
    if d.doRequest():
        return 1

    self.recs = d.getSOURCE()
    return 0

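# --- usage sketch: the return convention shared by the wrappers in this
# file (a truthy result from Download.doRequest() means failure, 0 means
# success), exercised directly against Download. The URL is a placeholder.
from Download import Download

if __name__ == '__main__':
    d = Download('http://example.com/')
    if d.doRequest():
        print 'ERROR: request failed'
    else:
        print 'OK, got', len(d.getSOURCE()), 'bytes'
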
def isGoogleSearch(schema, ip):
    d = Download(schema + '://' + ip)
    if d.doRequest():
        return False

    if Utility.containsGoogle(d.getSOURCE()):
        return True

    return False

def run(self):
    url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(self.PageNumber) + self.BASE_URL_PART_5
    d = Download(url)
    if d.doRequest():  # request failed
        print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
    else:
        utfstr2file(d.getSOURCE(), './data/' + self.SeasonId + '-' + str(self.PageNumber) + '.raw')

    return url

def requestHtml(self):
    url = self.BaseUrl + self.ISBN
    # print url, self.User_Agent
    d = Download(url, self.User_Agent)
    if d.doRequest():
        return 1

    self.HTML = d.getSOURCE()
    return 0

def request(self):
    baseUrl = 'http://shaishufang.com/index.php/site/detail/uid/'
    postFix = '/status//category/none/friend/false'
    url = baseUrl + self.UID + '/ubid/' + self.BID + postFix
    d = Download(url, self.Cookie, self.Proxy)
    if d.doRequest():
        return False

    self.HTML = d.getSOURCE()
    return True

def doRequest(self):
    # assumes: from json import loads, dumps
    playerId = str(self.PlayerId)
    seasonType = self.SeasonType.replace(" ", "+")
    url = self.Url + "PlayerId=" + playerId + "&SeasonType=" + seasonType + "&League=" + self.LeagueId
    d = Download(url)
    if d.doRequest() == 1:
        return 1

    # round-trip through loads/dumps to verify the response is valid JSON
    self.recs = dumps(loads(d.getSOURCE()))
    return 0

def request(self): baseUrl = "http://shaishufang.com/index.php/site/main/uid/" postFix = "/friend/false/category//status//type//page/" url = baseUrl + self.UID + postFix + str(self.Page) d = Download(url, self.Cookie, self.Proxy) if d.doRequest(): return False self.HTML = d.getSOURCE() return True
def run(self):
    while True:
        print 'INFO: ........................................ START'
        stats = self.dbm.getStats()
        print 'INFO: deadLinks-', stats[0], ' unvisitedLinks-', stats[1], ' visitedLinks-', stats[2]

        # get a url from unvisitedLinks
        url = self.dbm.retrieveUnvisitedLink()
        if url == False:
            print 'DEBUG: DONE -- retrieveUnvisitedLink return False'
            break

        print 'DEBUG: Processing ', url
        if not self.urlFilter.isPlainText(url):
            print 'DEBUG: NotPlainTextURL ', url
            continue

        if not self.domainFilter.isInDomain(url):
            print 'DEBUG: NOT IN DOMAIN ', url
            continue

        # request the url
        d = Download(url)
        if d.doRequest() == 1:
            if not self.dbm.createDeadLink(url):
                print 'DEBUG: deadLinks already contain ', url
            else:
                print 'DEBUG: Add To deadLinks ', url
        else:
            if self.dbm.createVisitedLink(url):
                print 'DEBUG: Add To visitedLinks ', url
            else:
                print 'DEBUG: Failed Add To visitedLinks ', url

        # extract urls from the source
        u = URLExtractor(d.getSOURCE(), url)
        tmpUrls = u.getUrls()
        if tmpUrls:
            for newUrl in tmpUrls:  # renamed so it does not shadow the outer url
                if self.dbm.isInDeadLink(newUrl):
                    continue
                elif self.dbm.isInVisitedLink(newUrl):
                    continue
                elif self.dbm.isInUnvisitedLink(newUrl):
                    continue
                else:
                    print 'DEBUG: Add To unvisitedLink ', newUrl
                    self.dbm.createUnvisitedLink(newUrl)

        print 'INFO: ........................................ END'

def getStats(self):
    # assumes: from json import loads
    d = Download(self.API)
    if d.doRequest():
        return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][1]['rowSet']:
        res.append(item[1:])

    if len(res) == 0:
        return False
    else:
        return res

def worker(appids, isbns, appidsCycle):
    # appidsCycle = cycle(appids)
    for isbn in isbns:
        url = 'http://' + appidsCycle.next() + '.appspot.com/url?url=' + 'http://book.douban.com/isbn/' + str(isbn)
        # print 'DEBUG: ', url
        d = Download(url)
        if d.doRequest():
            print isbn, 'network error'
            continue

        j = json.loads(d.getSOURCE())
        print isbn, j['status_code']

    return

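# --- usage sketch: driving worker() with a round-robin over App Engine app
# ids, as the commented-out cycle(appids) line above suggests. The app ids
# and ISBN below are placeholders.
from itertools import cycle

if __name__ == '__main__':
    appids = ['csrgxtu01', 'csrgxtu02']  # placeholder app ids
    isbns = [9787508653594]              # placeholder ISBN list
    worker(appids, isbns, cycle(appids))
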
def run(self, processName='MainProcess'):
    for isbn in self.ISBNS:
        url = 'http://www.amazon.cn/s/ref=nb_sb_noss?field-keywords=' + isbn
        d = Download(url)
        if d.doRequest():
            print 'ERROR[' + processName + ']: ', isbn, 'NERR'
            appendstr2file(isbn, './NERR.txt')
            continue

        asin = ASINParser(d.getSOURCE())
        if asin.getAsin():
            print 'INFO[' + processName + ']: ', isbn, asin.getAsin()
            appendstr2file(isbn + ',' + asin.getAsin(), './OK.txt')
        else:
            print 'WARN[' + processName + ']: ', isbn, 'NOER'
            appendstr2file(isbn, './NOER.txt')

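# --- usage sketch (hypothetical): fanning the ISBN -> ASIN lookup above out
# over worker processes. 'lookupClass' stands in for the class that owns
# run(); only the ISBNS attribute and the processName parameter come from
# the code above.
from multiprocessing import Process

def startWorkers(lookupClass, isbnChunks):
    procs = []
    for i, chunk in enumerate(isbnChunks):
        job = lookupClass()
        job.ISBNS = chunk
        p = Process(target=job.run, args=('Process-%d' % i,))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
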
def Google_Web_Search_Helper(q, hl='en', start=0):
    Google_Web_Search_URL = 'https://www.google.com/search?'
    if not q:
        return {}

    Google_Web_Search_URL = Google_Web_Search_URL + 'q=' + q
    Google_Web_Search_URL = Google_Web_Search_URL + '&hl=' + hl
    Google_Web_Search_URL = Google_Web_Search_URL + '&start=' + str(start)

    d = Download(Google_Web_Search_URL)
    if d.doRequest():
        return {}
    else:
        g = GoogleSearchResultParser(d.getSOURCE())
        return g.getJson()

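# --- usage sketch: calling the helper above. The shape of the result
# depends on GoogleSearchResultParser.getJson(), which is not shown here;
# the query is a placeholder.
if __name__ == '__main__':
    results = Google_Web_Search_Helper('python web crawler')
    if results:
        print results
    else:
        print 'WARN: empty query or request failed'
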
def run(self, processName='MainProcess'):
    for asin in self.ASINS:
        url = 'http://www.amazon.cn/dp/' + asin
        d = Download(url)
        if d.doRequest():
            print 'ERROR[' + processName + ']: ', asin, 'NERR'
            appendstr2file(asin, './NERRBasicInfo.txt')
            continue

        b = BasicInfoParser(d.getSOURCE())
        jsonRes = b.basicInfo()
        if json.loads(jsonRes):
            print 'INFO[' + processName + ']: ', asin
            appendstr2file(jsonRes, './OKBasicInfo.txt')
        else:
            print 'WARN[' + processName + ']: ', asin, 'NOER'
            appendstr2file(asin, './NOERBasicInfo.txt')

def walker(self):
    while True:
        urls = self.dbm.retrieveUnvisitedLinks(0, 100)
        urls = self.urlFilter.getFilteredUrls(urls)
        if len(urls) == 0:
            break

        for url in urls:
            print 'INFO: Processing ', url
            d = Download(url)
            if d.doRequest() == 1:
                self.dbm.createDeadLink(url)
            else:
                self.dbm.createVisitedLink(url)

            u = URLExtractor(d.getSOURCE(), url)
            tmpUrls = u.getUrls()
            if tmpUrls:
                self.dbm.createUnvisitedLinks(list(set(tmpUrls)))

    return True

def getStats(self):
    d = Download(self.API)
    if d.doRequest():
        return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][0]['rowSet']:
        name = item[3]
        pos = item[5]
        # normalize the API's missing-value markers for height/weight
        if item[6] == 'null':
            height = 'None'
        else:
            height = item[6]
        if item[7] == " ":
            weight = 'None'
        else:
            weight = item[7]
        age = item[9]
        # rookies ('R') and unknown values count as 0 years of experience
        if item[10] == 'R' or item[10] == 'None' or item[10] is None:
            exp = 0
        else:
            exp = item[10]
        res.append([name, pos, height, weight, age, exp])

    if len(res) == 0:
        return False
    else:
        return res

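# --- usage sketch (hypothetical): consuming the normalized roster rows the
# method above returns. 'rosterClient' stands in for whatever object owns
# getStats(); the [name, pos, height, weight, age, exp] row layout comes
# from the code above.
def printRoster(rosterClient):
    stats = rosterClient.getStats()
    if not stats:
        print 'ERROR: stats request failed'
        return
    for name, pos, height, weight, age, exp in stats:
        print name, pos, height, weight, age, exp
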
#!/usr/bin/env python
#coding=utf-8
#
# Author: Archer Reilly
# Date: 11/Aug/2014
# File: PlayerInfoParserTest.py
# Description: test the PlayerInfoParser class
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
from PlayerInfoParser import PlayerInfoParser
from Download import Download

URL = "http://sports.qq.com/d/f_players/3/2890/"
player = Download(URL)
if player.doRequest() != 0:
    print "Download Cant Do Request"
else:
    print "Successfully Do Request"

playerParser = PlayerInfoParser(player.getSOURCE())

def doRequest(self, url):
    d = Download(url)
    # Download.doRequest() signals failure with a truthy return (see the
    # other wrappers in this file), so the original '== None' check could
    # never fire; test the return value directly instead
    if d.doRequest():
        return None
    else:
        return d.getSOURCE()

from Parser import Parser
from TeamInfoParser import TeamInfoParser
from Download import Download

"""
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
print page.text
parser = Parser(page.text)
#print parser.getBuyers()
"""

URL = "http://sports.qq.com/d/f_teams/1/42/"
soccer = Download(URL)
if soccer.doRequest() == 0:
    print "Successfully do request"
else:
    print "Failed do request"

html = soccer.getSOURCE()
parser = TeamInfoParser(html)

name = parser.getTeamName()
print "name:", unicode(name).encode('utf8')
name_cn = parser.getTeamNameCN()
print "name_cn:", unicode(name_cn).encode('utf8')
logo = parser.getTeamLogo()
print "logo:", logo
city = parser.getTeamCity()
print "city:", city
league = parser.getTeamLeague()
print "league:", league
found_time = parser.getTeamFoundTime()
print "found_time:", found_time
home_court_cn = parser.getTeamHomeCourtCN()
print "home_court_cn:", home_court_cn

#!/usr/bin/env python
#
# Usage: python crawlerapitester.py 10
#
from Download import Download
import json
import sys

url = 'http://csrgxtu01.appspot.com/url?url=http://book.douban.com/isbn/9787508653594'
for i in range(int(sys.argv[1])):
    d = Download(url)
    if d.doRequest():
        print i, 'cant doRequest'
        continue

    j = json.loads(d.getSOURCE())
    print i, j['err'], j['status_code']