def __init__(self, parent=None):
    SiteComments.__init__(self)
    self.r = RegexUtility()
    # self.basicstorage = BaseInfoStorage()
    # self.commentstorage = CommentsStorage()
    if parent:
        self.website = parent.website
def __download__(self, downloaderlist):
    valid_json_files = []
    for impl in downloaderlist:
        json_files = impl.download()
        for dfile in json_files:
            for ufile in self.upload_file_list.keys():
                if RegexUtility.match(Downloader.DOWNLOAD_FORMAT1.format(file=ufile), dfile):
                    self.upload_file_list.pop(ufile)
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                elif RegexUtility.match(Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile):
                    value = RegexUtility.parse(Downloader.DOWNLOAD_FORMAT2.format(file=ufile), dfile)[0]
                    if FileUtility.exists(dfile):
                        valid_json_files.append(dfile)
                        Logger.getlogging().info('downloadedjsonfile\t' + dfile)
                    if value[0] == value[1]:
                        self.upload_file_list.pop(ufile)
        retransmissionfiles = impl.outtimefiles()
        for fl in retransmissionfiles.keys():
            # download failed or timed out
            if fl not in self.all_retransmissionfiles:
                self.all_retransmissionfiles[fl] = retransmissionfiles[fl]
            self.all_retransmissionfiles[fl].retrans_num += 1
            self.all_retransmissionfiles[fl].taskinfo = impl
            self.retransmissionfiles[fl] = self.all_retransmissionfiles[fl]
            if self.retransmissionfiles[fl].retrans_num <= self.retransmissionlimitnum:
                # The download failed, but treat it as downloaded, so remove it from upload_file_list
                self.upload_file_list.pop(fl)
                Logger.getlogging().debug(
                    'download fail file {fl}:{num}th fail'.format(
                        fl=fl, num=self.all_retransmissionfiles[fl].retrans_num))
            else:
                # The download failed, but treat it as downloaded and remove it from upload_file_list;
                # the retry limit is exceeded, so also drop it from the retransmission list
                self.upload_file_list.pop(fl)
                self.retransmissionfiles.pop(fl)
                Logger.getlogging().debug(
                    'download fail file {fl}:more than {num}th fail'.format(
                        fl=fl, num=self.all_retransmissionfiles[fl].retrans_num - 1))
    return valid_json_files
def __init__(self, post_url, parent=None):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = post_url
    self.r = RegexUtility()
    # self.post_url = BBSS2PostQuery.POST_URL
    self.post_url = post_url
    self.queryinfo = ''
    if parent:
        self.website = parent.website
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, sdch'
    }
def get(self, url):
    saveJson = {}
    try:
        Logger.getlogging().debug('Downloading: {url}'.format(url=url))
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request, timeout=self.timeout)
        code = response.getcode()
        info = response.info()
        # Check the response code; return None if it is not 200
        if code == 200:
            html = response.read()
            if ("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip"):
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            Logger.getlogging().debug('Request Succeeded: {url}'.format(url=url))
        else:
            Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
            Logger.getlogging().error('Request Failed: {url}'.format(url=url))
            return None
    except:
        Logger.getlogging().error('Request Failed: {url}'.format(url=url))
        Logger.printexception()
        return None
    charset = RegexUtility.getid('charset', html)
    html = Common.trydecode(html, charset)
    saveJson['foundin'] = Common.urlenc(url)
    saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
    saveJson['crawler_time'] = int(time.time())
    jsonStr = json.dumps(saveJson)
    return jsonStr
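# A minimal, self-contained sketch of the gzip handling used in get() above:
# zlib.decompress(data, 16 + zlib.MAX_WBITS) accepts a gzip-wrapped stream, which is what the
# server returns when the response carries "Content-Encoding: gzip". The gzip/StringIO
# round-trip below is only for illustration and is not part of the crawler.
import zlib
import gzip
import StringIO

def _gzip_roundtrip_demo(text):
    buf = StringIO.StringIO()
    gz = gzip.GzipFile(fileobj=buf, mode='wb')
    gz.write(text)
    gz.close()
    compressed = buf.getvalue()
    # 16 + zlib.MAX_WBITS tells zlib to expect a gzip header, matching get() above
    return zlib.decompress(compressed, 16 + zlib.MAX_WBITS)

# _gzip_roundtrip_demo('<html>hello</html>') == '<html>hello</html>'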
def __init__(self, parent=None):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = 'https://donghua.dmzj.com/'
    self.r = RegexUtility()
    if parent:
        self.website = parent.website
def analysis(self, line, post=False):
    param = ProcessParam()
    js = json.loads(line)
    param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
    param.url = Common.urldec(js['foundin'])
    param.content = js['html']
    if post:
        param.data = js['data']
    if js['html'][:3] == constant.GZIP_CODE:
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    if 'property' in js:
        for property in js['property']:
            if not property.has_key('result'):
                continue
            if property['property_name'] == u'page_body':
                param.page_body = Common.trydecode(Common.urldec(property['result'][0]['text']),
                                                   constant.CHARSET_GBK)
            elif property['property_name'] == u'page_title':
                param.page_title = Common.trydecode(Common.urldec(property['result'][0]['text']),
                                                    constant.CHARSET_GBK)
            elif property['property_name'] == u'html_time':
                param.html_time = TimeUtility.getuniformtime2(property['result'][0]['text'])
    return param
def analysis(self, line, method):
    try:
        js = json.loads(line)
        param = ProcessParam()
        param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
        param.url = Common.urldec(js['foundin'])
        param.content = js['html']
        if method == constant.REQUEST_TYPE_POST:
            param.data = js['data']
        if js['html'][:3] == constant.GZIP_CODE:
            param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
        # decode
        content = Common.urldec(param.content)
        charset = RegexUtility.getid('charset', content)
        content = Common.trydecode(content, charset)
        param.content = content
        return param
    except:
        line = line.replace('\n', '').strip()
        if not line or line[0] == '#':
            return
        Logger.getlogging().debug(line)
        param = ProcessParam()
        param.url = line
        if method == constant.REQUEST_TYPE_POST:
            js = json.loads(line)
            param.url = js['url']
            param.data = js['data']
        param.content = HttpCacher.getcontent(line, method)
        if param.content is None:
            return
        return param
def updateurlfilecontext(self, filename, urlfilecontext):
    for key in self.urlsfilemap.keys():
        if RegexUtility.match(key + '.*', filename):
            self.urlsfilemap[key] = urlfilecontext
            break
    else:
        self.urlsfilemap[filename] = urlfilecontext
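# updateurlfilecontext() above relies on Python's for/else: the else branch runs only when the
# loop completes without break, i.e. when no existing key matches the filename. A minimal,
# self-contained sketch of that fallback (startswith stands in for the RegexUtility prefix match,
# which is an assumption made only for this illustration):
def _update_demo(mapping, filename, value):
    for key in mapping.keys():
        if filename.startswith(key):
            mapping[key] = value   # an existing key matched: update it
            break
    else:
        mapping[filename] = value  # no key matched: add the filename itself
    return mapping

# _update_demo({'urls_a': 1}, 'urls_a_001.txt', 2) -> {'urls_a': 2}
# _update_demo({'urls_a': 1}, 'urls_b_001.txt', 2) -> {'urls_a': 1, 'urls_b_001.txt': 2}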
def analysis(line):
    param = ProcessParam()
    js = json.loads(line)
    param.url = js['foundin']
    param.content = js['html']
    if js['html'][:3] == constant.GZIP_CODE:
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    return param
def process(self, params):
    type = SinaComments.COMMON_PATTERN_TYPE
    for key in SinaComments.REGEX_PATTERNS.keys():
        if RegexUtility.match(SinaComments.REGEX_PATTERNS[key], params.originalurl):
            type = key
    Logger.getlogging().debug('{url}:{key}'.format(url=params.originalurl, key=type))
    if type == SinaComments.VIDEO_PATTERN_TYPE:
        CommonComments(self).process(params)
    elif type == SinaComments.BLOG_PATTERN_TYPE:
        BlogComments(self).process(params)
    elif type == SinaComments.MANHUA_PATTERN_TYPE:
        self.manhuaprocess(params)
    elif type == SinaComments.STGP_PATTERN_TYPE:
        CommonComments(self).process(params)
    else:
        CommonComments(self).process(params)
def referurl(self, params):
    res = True
    for pattern in ETLController.REFER_URLS_PATTERN:
        if RegexUtility.match(pattern, params.url):
            html = etree.HTML(params.content)
            oriurl = html.xpath('//*[@rel="canonical"]/@href')
            if oriurl:
                Logger.getlogging().debug('inurl:' + params.url)
                params.url = oriurl[0]
                params.originalurl = params.url
                Logger.getlogging().debug('outurl:' + params.url)
            else:
                res = False
                Logger.getlogging().warning('nocanonical:' + params.url)
            break
    return res
def get(self, url):
    saveJson = {}
    try:
        Logger.getlogging().debug('Downloading: {url}'.format(url=url))
        self.driver.get(url)
        time.sleep(self.waitsec)
        html = self.driver.page_source
        Logger.getlogging().debug('Request Succeeded: {url}'.format(url=url))
    except:
        Logger.getlogging().error('Request Failed: {url}'.format(url=url))
        Logger.printexception()
        return None
    charset = RegexUtility.getid('charset', html)
    html = Common.trydecode(html, charset)
    saveJson['foundin'] = Common.urlenc(url)
    saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
    saveJson['crawler_time'] = int(time.time())
    jsonStr = json.dumps(saveJson)
    return jsonStr
def str2cmtnum(value, key):
    value = value.replace(',', '')
    multiplier = 1
    for unit in SiteBasicInfo.UNITS.keys():
        if unit in value:
            multiplier = SiteBasicInfo.UNITS[unit]
    values = re.findall(r'\d+[.]?\d*', value)
    res = -1
    if len(values) == 1:
        res = float(values[0]) * multiplier
    elif len(values) > 1:
        # Normalize full-width punctuation before matching the number formats
        value = SiteBasicInfo.strip(value, '').replace(u'（', '(').replace(u'）', ')').replace(u'：', ':')
        for format in SiteBasicInfo.NUMBER_FORMATS[key]:
            str = RegexUtility.search(format, value)
            if str:
                res, c = SiteBasicInfo.str2num(str.group(0))
                break
        else:
            res = float(values[0]) * multiplier
    return int(res), len(values)
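# A minimal sketch of the unit handling in str2cmtnum() above. SiteBasicInfo.UNITS is not shown
# in this snippet, so the mapping below is an assumption about what it might contain (Chinese
# count units); the real table may differ.
import re

_UNITS_GUESS = {u'万': 10000, u'亿': 100000000}

def _str2num_demo(value):
    value = value.replace(',', '')
    multiplier = 1
    for unit, factor in _UNITS_GUESS.items():
        if unit in value:
            multiplier = factor
    nums = re.findall(r'\d+[.]?\d*', value)
    if len(nums) == 1:
        return int(float(nums[0]) * multiplier)
    return -1

# _str2num_demo(u'1.2万') -> 12000
# _str2num_demo(u'3,456') -> 3456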
def __getcontent(self, url, method):
    database = bsddb.btopen(self.file, 'c')
    if database.has_key(Common.md5(url)):
        content = Common.urldec(database[Common.md5(url)]).decode(CHARSET_DEFAULT)
        database.close()
        return content
    if method == constant.REQUEST_TYPE_POST:
        js = json.loads(url)
        content = HttpUtility().post(js['url'], js['data'])
    elif method == constant.REQUEST_TYPE_WEBKIT:
        content = HttpUtility().wget(url)
    elif method == constant.REQUEST_TYPE_IMG:
        content = HttpUtility().get(url)
        content = binascii.b2a_hex(content)
    else:
        content = HttpUtility().get(url)
    if content is None:
        database.close()
        return None
    charset = RegexUtility().getid('charset', content)
    unic = Common.trydecode(content, charset)
    utf8str = unic.encode(CHARSET_UTF8)
    charset = CHARSET_UTF8
    self.urlmap[Common.md5(url)] = unic
    # content = content.encode('utf8')
    line = {
        "md5": Common.md5(url),
        "charset": charset,
        "html": Common.urlenc(utf8str),
        "url": Common.urlenc(url)
    }
    if len(utf8str) > 2000:
        database = bsddb.btopen(self.file, 'c')
        database[Common.md5(url)] = Common.urlenc(utf8str)
        database.close()
    # FileUtility.writeline(self.file, json.dumps(line))
    return utf8str.decode(CHARSET_UTF8)
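# A minimal, self-contained sketch of the md5-keyed caching pattern in __getcontent() above.
# A plain dict stands in for the bsddb btree file and hashlib.md5 stands in for Common.md5;
# both substitutions are assumptions made only for this illustration.
import hashlib

_demo_cache = {}

def _cached_get_demo(url, fetch):
    key = hashlib.md5(url).hexdigest()
    if key in _demo_cache:            # cache hit: skip the network request
        return _demo_cache[key]
    content = fetch(url)              # e.g. HttpUtility().get(url) in the real code
    if content is not None:
        _demo_cache[key] = content    # cache miss: store the content for the next call
    return content

# _cached_get_demo('http://example.com/', lambda u: '<html/>')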
def __init__(self, parent):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = 'https://tieba.baidu.com'
    self.r = RegexUtility()
    self.website = parent.website
def __init__(self):
    SiteS2Query.__init__(self)
    self.r = RegexUtility()
    self.fakeoriginalurl = 'http://cartoon.pptv.com/'
def match(self, url):
    for pt in self.patterns:
        if RegexUtility.match(pt, url):
            return True
    return False
def __init__(self):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = 'http://www.52tian.net/'
    self.querylastdays = SpiderConfigure.getinstance().getlastdays()
    self.r = RegexUtility()
class tian52S2Query(SiteS2Query):
    TIAN52_QUERY_TEMPLATE = 'http://www.52tian.net/-----------/{q}/'
    TIAN52_QUERY_P_TEMPLATE = 'http://www.52tian.net/-----------{p}/{q}/'
    TIAN52_S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    TIAN52_S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'
    TIAN52_S2QUERY_EACH_PAGE_CMP = 'S2QUERY_EACH_PAGE_CMP'

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:Liyanrui
    # @date:2016/11/24
    # @note: Constructor of the 52tian.net S2 query class; initializes internal variables
    ##############################################################################################
    def __init__(self):
        # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://www.52tian.net/'
        self.querylastdays = SpiderConfigure.getinstance().getlastdays()
        self.r = RegexUtility()

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query, S2 query
    ################################################################################################################
    def query(self, info):
        q = Common.urlenc(info)
        urls = [tian52S2Query.TIAN52_QUERY_TEMPLATE.format(q=q)]
        self.__storeqeuryurllist__(urls, self.TIAN52_S2QUERY_FIRST_PAGE, {'query': q})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        try:
            # Build the search-result page URLs from the result count found on the first search page
            if params.step == tian52S2Query.TIAN52_S2QUERY_FIRST_PAGE:
                self.step1(params)  # get the first-page url parameters
            elif params.step == tian52S2Query.TIAN52_S2QUERY_EACH_PAGE:
                #print '########',params.content
                if self.r.search(u'/v7/404.asp', params.content):
                    Logger.getlogging().warning(
                        '{url}:40000. HttpRespond:404 Maybe no search results'.format(url=params.url))
                    return
                if re.findall('^http[s]{0,1}://www\.52tian\.net/[(qingbao)|(tupian)|(yinyue)].*',
                              params.originalurl):
                    self.step3(params)
                else:
                    self.step2(params)
        except:
            Logger.printexception()

    #----------------------------------------------------------------------
    def step1(self, params):
        """Build the list of query-result page URLs."""
        q = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        pageobj = soup.select('.pages > a')
        if pageobj:
            pages = int(pageobj[-3].get_text())
        else:
            pages = 1
        # All page URLs to request
        querylist = []
        # Build the query list from the total page count
        for page in range(1, pages + 1, 1):
            url = tian52S2Query.TIAN52_QUERY_P_TEMPLATE.format(p=page, q=params.customized['query'])
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, tian52S2Query.TIAN52_S2QUERY_EACH_PAGE, {'query': q})

    #----------------------------------------------------------------------
    def step2(self, params):
        """Collect video URLs from a search-result page."""
        key = Common.urldec(params.customized['query'])
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('.imagelist2 > ul > li')
        if lis:
            urllist = []
            for li in lis:
                title = li.select_one('a').get_text()
                if key not in title:
                    continue
                url = li.select_one('a').get('href')
                urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

    #----------------------------------------------------------------------
    # The news part is not implemented yet
    def step3(self, params):
        """Collect news URLs from a search-result page."""
        key = Common.urldec(params.customized['query'])
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('.wzlist > ul > li.wztitle')
        if lis:
            urllist = []
            for li in lis:
                title = li.select_one('a').get_text()
                # if key not in title:
                if not Common.checktitle(key, title):
                    continue
                pubtime = li.select_one('span').get_text()
                url = 'http://www.52tian.net' + li.select_one('a').get('href')
                if compareNow(getuniformtime(pubtime), self.querylastdays):
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
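# A minimal sketch of the page-count extraction used in step1() above: the number of result
# pages is read from the third-from-last link inside ".pages". The inline HTML is a made-up
# example; the real markup on www.52tian.net may differ.
from bs4 import BeautifulSoup

_demo_html = '''
<div class="pages">
  <a>&lt;</a><a>1</a><a>2</a><a>3</a><a>&gt;</a><a>end</a>
</div>
'''

def _result_pages_demo(html):
    soup = BeautifulSoup(html, 'html5lib')
    pageobj = soup.select('.pages > a')
    if pageobj:
        return int(pageobj[-3].get_text())
    return 1

# _result_pages_demo(_demo_html) -> 3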
def __init__(self):
    SiteComments.__init__(self)
    self.r = RegexUtility()
    self.basicstorage = BaseInfoStorage()
    self.commentstorage = CommentsStorage()
class MofangS2Query(SiteS2Query):
    QUERY_TEMPLATE = 'http://www.mofang.com/index.php?m=search&a=json_init&q={key}&type=video&page={pageno}&pagesize={pagesize}'
    QUERY_TEMPLATE_BBS = 'http://bbs.mofang.com/searchThread?keyword={key}&p={pageno}&pagesize={pagesize}'
    DEFAULT_PAGE_SIZE = 20
    S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'

    ################################################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @note:MofangS2Query, initializes internal variables
    ################################################################################################################
    def __init__(self):
        # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
        SiteS2Query.__init__(self)
        self.r = RegexUtility()
        self.fakeoriginalurl = 'http://www.mofang.com/'

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query, S2 query
    ################################################################################################################
    def query(self, info):
        Logger.getlogging().info("query")
        keyvalue = Common.urlenc(info)
        #keyvalue = info
        # Step1: build the first query url from the key:
        # http://www.mofang.com/index.php?m=search&a=json_init&q={key}&type=video&page={pageno}&pagesize=1
        # Video S2
        urls = [MofangS2Query.QUERY_TEMPLATE.format(key=keyvalue, pageno=1, pagesize=1)]
        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {'query': info})
        # 2016/12/20: testing found that the forum search had switched to Discuz and fetches data via POST,
        # so forum S2 is disabled.
        # Forum S2
        # urls = [MofangS2Query.QUERY_TEMPLATE_BBS.format(key=keyvalue, pageno=1, pagesize=20)]
        # Logger.getlogging().debug(urls[0])
        # self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {'query':info})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        if self.r.match(r'^http://www\.mofang\.com/.*', params.url):
            self.processVideo(params)
        # else:
        #     self.processBBS(params)

    ################################################################################################################
    # @functions:processVideo
    # @params: see WebSite.processVideo
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def processVideo(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: read the total video count from the returned json (comments['totalnums']);
            # each json page holds 20 items, so divide the total by 20 to get the page count,
            # then write it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['totalnums']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # Return if the count could not be obtained
            if int(comments_count) == 0:
                return
            page_count = int(math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # Build all search-result urls (latest week) from page_count
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE.format(key=keyvalue, pageno=page,
                                                              pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist, MofangS2Query.S2QUERY_EACH_PAGE, {'query': info})
        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the json returned in Step2, read:
            #   title:        comments['data'][0..19]['title']
            #   url:          comments['data'][0..19]['url']
            #   publish time: comments['data'][0..19]['inputtime'] (truncate to the first 10 chars; compare dates only)
            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # Current date (as a date object)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)
            urllist = []
            for index in range(0, len(searchresult), 1):
                #print searchresult[index]['title']
                #print searchresult[index]['inputtime']
                if searchresult[index]['title'] is not None:
                    # Keep the url when the title contains the query keyword
                    # if searchresult[index]['title'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['title']):
                        if searchresult[index]['inputtime'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['inputtime'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            pubtime = getuniformtime(str(searchresult[index]['inputtime']))
                            if compareNow(pubtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['url'])
                        else:
                            # If no publish time is available, assume it is within the period
                            urllist.append(searchresult[index]['url'])
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

    ################################################################################################################
    # @functions:processBBS
    # @params: see WebSite.processBBS
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def processBBS(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: read the total thread count from the returned json (comments['data']['total']);
            # each json page holds 20 items, so divide the total by 20 to get the page count,
            # then write it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['data']['total']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # Return if the count could not be obtained
            if int(comments_count) == 0:
                return
            page_count = int(math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # Build all search-result urls (latest week) from page_count
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE_BBS.format(key=keyvalue, pageno=page,
                                                                  pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist, MofangS2Query.S2QUERY_EACH_PAGE, {'query': info})
        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the json returned in Step2, read:
            #   title:        comments['data']['threads'][0..19]['subject']
            #   url:          comments['data']['threads'][0..19]['link_url']
            #   publish time: comments['data']['threads'][0..19]['create_time'] (truncate to the first 10 chars; compare dates only)
            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']['threads']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # Current date (as a date object)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)
            urllist = []
            for index in range(0, len(searchresult), 1):
                if searchresult[index]['subject'] is not None:
                    # Keep the url when the title contains the query keyword
                    # if searchresult[index]['subject'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['subject']):
                        if searchresult[index]['create_time'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['create_time'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            #urllist.append(searchresult[index]['link_url'])
                            inputtime = getuniformtime(str(searchresult[index]['create_time']))
                            if compareNow(inputtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['link_url'])
                        else:
                            # If no publish time is available, assume it is within the period
                            urllist.append(searchresult[index]['link_url'])
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
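# A minimal sketch of the paging arithmetic shared by processVideo() and processBBS() above:
# the total result count from the first json page is divided by DEFAULT_PAGE_SIZE (20) and
# rounded up to get the number of query pages to request.
import math

def _query_pages_demo(total_count, pagesize=20):
    return int(math.ceil(float(total_count) / pagesize))

# _query_pages_demo(45) -> 3, _query_pages_demo(20) -> 1, _query_pages_demo(0) -> 0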
def __init__(self):
    SiteComments.__init__(self)
    self.r = RegexUtility()
    self.client_id = 'cytaCBUri'
def __init__(self):
    SiteS2Query.__init__(self)
    self.r = RegexUtility()
    self.fakeoriginalurl = 'http://www.acfun.cn/v/'
    self.querylastdays = SpiderConfigure.getinstance().getlastdays()
class NarutomS2Query(SiteS2Query):
    QUERY_TEMPLATE = 'http://search.narutom.com/cse/search?q={key}&p={pageno}&s=7660238846226745217&entry=1'
    FIRST_PAGE = 'http://search.narutom.com/cse/search?s=7660238846226745217&entry=1&q={key}'
    DEFAULT_PAGE_SIZE = 10
    MAX_COUNT = 750
    S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'

    ################################################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @note: Constructor of NarutomS2Query; initializes internal variables
    ################################################################################################################
    def __init__(self):
        # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://www.narutom.com/'
        self.r = RegexUtility()

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query, S2 query
    ################################################################################################################
    def query(self, info):
        Logger.getlogging().info("query")
        keyvalue = Common.urlenc(info)
        # Step1: build the url below from the key (the site cannot filter by "latest" or "within a week")
        # http://search.narutom.com/cse/search?s=7660238846226745217&entry=1&ie=gbk&q=<urlencoded key>
        url = NarutomS2Query.FIRST_PAGE.format(key=keyvalue)
        urls = [url]
        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {'query': info})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        if params.step == NarutomS2Query.S2QUERY_FIRST_PAGE:
            # Step2: read the total result count via xpath //*[@id="results"]/span,
            # then build the search-result page urls and store them
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/span')
            # Return if the count node could not be found
            if len(nodes) == 0:
                return
            # Total result count (e.g. "为您找到相关结果1,307个")
            count = 0
            totalstr = nodes[0].text.replace(',', '')
            if self.r.search(u'\d+', totalstr):
                countstr = self.r.parse(u'(\d+)', totalstr)[0]
                count = int(countstr)
                # The site returns at most 750 results
                if count > self.MAX_COUNT:
                    count = self.MAX_COUNT
            else:
                return
            # Build all search-result urls from count
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            page_count = float(count) / self.DEFAULT_PAGE_SIZE
            firstpage = NarutomS2Query.FIRST_PAGE.format(key=keyvalue)
            querylist = []
            querylist.append(firstpage)
            if count > 10:
                # The second page has pageno=1, the third page 2, ...;
                # the pageno range is 1-74 (i.e. page 2 through page 75)
                for page in range(1, int(math.ceil(page_count)), 1):
                    url = NarutomS2Query.QUERY_TEMPLATE.format(key=keyvalue, pageno=page)
                    querylist.append(url)
            self.__storeqeuryurllist__(querylist, NarutomS2Query.S2QUERY_EACH_PAGE, {'query': info})
        elif params.step == NarutomS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the Step2 result, read the result urls via xpath //*[@id="results"]/div/h3/a/@href
            # and write them out
            info = params.customized['query']
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/div/h3/a/@href')
            #titles = html.xpath('//*[@id="results"]/div/h3/a')
            pubtimestr = html.xpath('//*[@class="c-showurl"]')
            datecheck = False
            if len(pubtimestr) == len(nodes):
                datecheck = True
            urllist = []
            for index in range(0, len(nodes), 1):
                # if titles[index] is not None and titles[index].find(info) > -1:
                # if titles[index] is not None and Common.checktitle(info, titles[index]):
                # Keep the url when the title contains the query keyword
                if datecheck:
                    # If the xpath returned a string containing a date, check the date
                    if self.r.search('(\d+-\d+-\d+)', pubtimestr[index].text):
                        pubtime = getuniformtime(self.r.parse('(\d+-\d+-\d+)', pubtimestr[index].text)[0])
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(nodes[index])
                else:
                    urllist.append(nodes[index])
            '''
            urllist = []
            for node in nodes:
                urllist.append(node)
            '''
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
class YoukuComments(SiteComments):
    COMMENTS_URL = 'http://p.comments.youku.com/ycp/comment/pc/commentList?objectId=%s&app=100-DDwODVkv' \
                   '&currentPage=%d&pageSize=%d&listType=0&sign=%s&time=%s'
    PLAYINFO_URL = 'http://v.youku.com/action/getVideoPlayInfo?vid={vid}&param%5B%5D=updown&callback=data'
    PAGE_SIZE = 30
    STEP_1 = None
    STEP_2 = 2
    STEP_3 = 3

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:QW_Liang
    # @date:2017/09/07
    # @note: Constructor of YoukuComments; initializes internal variables
    ##############################################################################################
    def __init__(self):
        SiteComments.__init__(self)
        self.r = RegexUtility()

    ##############################################################################################
    # @functions:process
    # @param: parameters passed in by the common module (object url, original url, current step, custom params)
    # @return:Step1: url of the first comment page
    #         Step2: urls of all comment pages
    #         Step3: extracted comments and the creation time of the newest comment
    # @author:QW_Liang
    # @date:2017/09/07
    # @note:Step1: extract oid from the html passed in by the common module, build the url that
    #              returns the total number of comment pages, and hand it back to the common module
    #       Step2: read the total number of comment pages from the returned html, build the comment
    #              urls, and hand them back to the common module
    #       Step3: extract the comments and the creation time of the newest comment from the
    #              returned html and hand them back to the common module
    ##############################################################################################
    def process(self, params):
        try:
            if params.step is YoukuComments.STEP_1:
                # Extract from the page the parameters needed to build the comment url
                objectId = self.r.getid('videoId', params.content, '\s*:\s*"')
                pTime = str(int(time.mktime(datetime.datetime.timetuple(datetime.datetime.now())) * 1000))
                # Random/timestamp part of the sign parameter
                sign = MD5().m('100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                # Build the url of the first comment page
                comments_url = YoukuComments.COMMENTS_URL % (objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
                # Ask the download platform to fetch the first comment page
                self.storeurl(comments_url, params.originalurl, YoukuComments.STEP_2, {'objectId': objectId})
                # Play count for laifeng.com live pages
                if self.r.search(r'^http://v\.laifeng\.com/\d+', params.originalurl):
                    clicknum = int(self.r.getid('onlineNum', params.content))
                    NewsStorage.setclicknum(params.originalurl, clicknum)
                if objectId:
                    playinfo_url = YoukuComments.PLAYINFO_URL.format(vid=objectId)
                    self.storeurl(playinfo_url, params.originalurl, YoukuComments.STEP_2, {'objectId': objectId})
            # Fetch the first comment page, then loop to build all comment urls
            elif params.step == YoukuComments.STEP_2:
                if re.findall('getVideoPlayInfo\?vid', params.url):
                    playinfo = json.loads((params.content)[20:-2])
                    clicknum = int(playinfo['data']['stat']['vv'].replace(',', ''))
                    votenum = int(playinfo['data']['updown']['up'].replace(',', ''))
                    NewsStorage.setclicknum(params.originalurl, clicknum)
                    NewsStorage.setvotenum(params.originalurl, votenum)
                else:
                    objectId = params.customized['objectId']
                    pTime = str(int(time.mktime(datetime.datetime.timetuple(datetime.datetime.now())) * 1000))
                    # Random/timestamp part of the sign parameter
                    sign = MD5().m('100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
                    # Parse the returned comment json
                    comments = json.loads(params.content)
                    # Compare the stored comment count with the count fetched this time
                    if not comments.has_key('data'):
                        Logger.getlogging().warning("{url}:30000 No comments!".format(url=params.originalurl))
                        return
                    if not comments['data']:
                        Logger.getlogging().warning("{url}:30000 No comments!".format(url=params.originalurl))
                        return
                    # Incremental check
                    comments_count = comments['data']['totalSize']
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if int(comments_count) <= cmtnum:
                        return
                    NewsStorage.setcmtnum(params.originalurl, comments_count)
                    # Total number of comment pages
                    comments_pages = int(comments['data']['totalPage'])
                    if comments_pages == 0:
                        return
                    # If there are too many comments, only take the first maxpages pages
                    if comments_pages >= self.maxpages:
                        comments_pages = self.maxpages
                    lasttime = CMTStorage.getlastpublish(params.originalurl, True)
                    # Build the comment url for each page and hand it to the download platform
                    for page in range(0, comments_pages + 1, 1):
                        commentUrl = YoukuComments.COMMENTS_URL % (objectId, page + 1, YoukuComments.PAGE_SIZE,
                                                                   sign, pTime)
                        self.storeurl(commentUrl, params.originalurl, YoukuComments.STEP_3,
                                      {'objectId': objectId})
                    NewsStorage.setcmtnum(params.originalurl, int(comments['data']['totalSize']))
            # Parse the comment data
            elif params.step == YoukuComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']['comment']:
                    content = str(comment['content'])
                    curtime = TimeUtility.getuniformtime(int(comment['createTime']))
                    nick = comment['user']['userName']
                    # Detect new comments
                    # if curtime > lasttime:
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        except:
            Logger.printexception()
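# A minimal sketch of how the comment-list url parameters are derived in process() above.
# The sign is an md5 over '<app>&<secret>&<millisecond timestamp>'; hashlib.md5 is assumed to be
# equivalent to the repo's MD5().m wrapper, and time.time() * 1000 stands in for the
# time.mktime(...) * 1000 expression used above. Both substitutions are assumptions made only
# for this illustration.
import time
import hashlib

def _youku_sign_demo(app='100-DDwODVkv', secret='6c4aa6af6560efff5df3c16c704b49f1'):
    ptime = str(int(time.time() * 1000))  # millisecond timestamp
    sign = hashlib.md5(app + '&' + secret + '&' + ptime).hexdigest()
    return sign, ptime

# sign, ptime = _youku_sign_demo()
# comments_url = YoukuComments.COMMENTS_URL % (object_id, 1, YoukuComments.PAGE_SIZE, sign, ptime)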
def download(self):
    """
    On the platform the download has two steps, while on Windows the data is requested directly
    and only step2 (download()) runs:
    step1: download data from the platform into ./data/platform
    step2: copy the data from ./data/platform to ./data/temp/done, then store the parsed json
           data into ./data/temp/json
    """
    files = []
    if self.completed():
        return files
    Logger.getlogging().debug(self.download_path)
    srclist = FileUtility.getfilelist(self.download_path, [])
    for donefile in srclist:
        filename = FileUtility.getfilename(donefile)
        if donefile.endswith('done') and filename not in self.download_file_list:
            self.download_file_list.append(filename)
            self.download_time = time.time()
            for upfile in self.upload_file_list.keys():
                if filename.startswith(upfile):
                    FileUtility.copy(donefile, self.cache_path)
                    binfile = self.cache_path + FileUtility.getfilename(donefile)
                    if FileUtility.getfilesize(donefile) == FileUtility.getfilesize(binfile):
                        Logger.getlogging().info('Remove {file}'.format(file=donefile))
                        FileUtility.remove(donefile)
                        if FileUtility.exists(donefile):
                            Logger.getlogging().error('Remove {file} failed'.format(file=donefile))
                    else:
                        Logger.getlogging().error('File not equal {file}'.format(file=donefile))
                    jsonfile = self.bin2json(binfile)
                    files.append(jsonfile)
                    uploadtime = self.uploadfile_retranslist[upfile].start_time
                    if RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT1.format(file=upfile), filename):
                        self.upload_file_list.pop(upfile)
                        self.uploadfile_retranslist.pop(upfile)
                    elif RegexUtility.match(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename):
                        value = \
                            RegexUtility.parse(TencentDownloader.DOWNLOAD_FORMAT2.format(file=upfile), filename)[0]
                        if value[0] == value[1]:
                            self.upload_file_list.pop(upfile)
                            self.uploadfile_retranslist.pop(upfile)
                    if not FileUtility.exists(jsonfile):
                        Logger.getlogging().error(
                            'no json file generate from done file:{done}'.format(done=binfile))
                        os.mknod(jsonfile)
                    # update upload time
                    keys = self.sortkeys()
                    for fl in keys:
                        if self.uploadfile_retranslist[fl].start_time >= uploadtime:
                            self.uploadfile_retranslist[fl].start_time = time.time()
                            time.sleep(0.1)
                            break
    return files
def __init__(self):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = 'http://www.narutom.com/'
    self.r = RegexUtility()
def __init__(self):
    # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
    SiteS2Query.__init__(self)
    self.fakeoriginalurl = 'http://bbs.laohu.com/'
    self.r = RegexUtility()
    self.inputtime = self.querylastdays
def __init__(self):
    SiteComments.__init__(self)
    self.r = RegexUtility()
class LaohuS2Query(SiteS2Query):
    #LAOHU_QUERY_TEMPLATE = 'http://bbs.laohu.com/plugin.php?id=esearch&mymod=search&myac=thread&word={KEY}&page={pn}'
    LAOHU_QUERY_TEMPLATE = 'http://bbs.laohu.com/plugin.php?id=esearch&mymod=search&myac=thread&word={KEY}&page={pn}&srchfrom={time}'
    LAOHU_S2QUERY_FIRST_PAGE = 'S2QUERY_FIRST_PAGE'
    LAOHU_S2QUERY_EACH_PAGE = 'S2QUERY_EACH_PAGE'
    LAOHU_MAIN_DOMAIN = 'http://bbs.laohu.com/'
    LAOHU_LINK = 'http://bbs.laohu.com/thread-{tid}-1-1.html'
    DEFAULT_TIME = 86400
    tids = []

    ##############################################################################################
    # @functions:__init__
    # @param: none
    # @return:none
    # @author:HuBorui
    # @date:2016/11/28
    # @note: Constructor of the laohu.com forum S2 query class; initializes internal variables
    ##############################################################################################
    def __init__(self):
        # This URL identifies the class receiving posted-back S2 query results; the main-site URL is recommended
        SiteS2Query.__init__(self)
        self.fakeoriginalurl = 'http://bbs.laohu.com/'
        self.r = RegexUtility()
        self.inputtime = self.querylastdays

    def preprocess(self, mid_url):
        if self.r.search('tid=\d+', mid_url):
            tid = self.r.parse('tid=(\d+)', mid_url)[0]
            if len(self.tids) == 0:
                self.tids.append(tid)
                newurl = self.LAOHU_LINK.format(tid=tid)
            else:
                if tid not in self.tids:
                    self.tids.append(tid)
                    newurl = self.LAOHU_LINK.format(tid=tid)
                else:
                    newurl = None
        else:
            newurl = self.LAOHU_MAIN_DOMAIN + mid_url
        return newurl

    ################################################################################################################
    # @functions:pageprocess
    # @info: query condition
    # @return:none
    # @note:SiteS2Query, S2 query
    ################################################################################################################
    def pageprocess(self, params):
        # Parse the page content
        xparser = XPathUtility(params.content)
        # All hyperlinks on this page
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)
        # All publish times on this page
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            publicTimes.append(str(str(middle).split(' ')[0]) + ' ' + str(str(middle).split(' ')[1]))
        # All titles on this page
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # The query keyword
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # Pattern used to match the title
        titlePatten = KEY
        # Date self.inputtime days ago
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]
        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # Check whether the title matches
            # mid_value = re.compile(titlePatten)
            # flg = mid_value.search(str(titles[number]))
            flg = Common.checktitle(titlePatten, str(titles[number]))
            # Keep posts published within the period whose title matches
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1
        # Store the final url list
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)

    ################################################################################################################
    # @functions:query
    # @info: query condition
    # @return:none
    # @note:SiteS2Query, S2 query
    ################################################################################################################
    def query(self, info):
        urlkey = Common.urlenc(info)
        time = self.querylastdays * LaohuS2Query.DEFAULT_TIME
        urls = [LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(KEY=urlkey, pn=1, time=time)]
        self.__storeqeuryurllist__(urls, self.LAOHU_S2QUERY_FIRST_PAGE, {'KEY': urlkey, 'time': time})

    ################################################################################################################
    # @functions:process
    # @params: see WebSite.process
    # @return:none
    # @note:SiteS2Query, process the S2 query result, usually a list of matched URLs
    ################################################################################################################
    def process(self, params):
        # Build the search-result page URLs from the result count found on the first search page
        if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
            # Parameters from the first page
            KEY = params.customized['KEY']
            time = params.customized['time']
            # Total number of result pages
            xparser = XPathUtility(params.content)
            pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
            if len(pageCounts) > 0:
                page = str(pageCounts[0]).split('/')[1]
                # Process the first result page
                self.pageprocess(params)
                if int(page) > 1:
                    if int(page) >= self.maxpages:
                        page = self.maxpages
                    querylist = []
                    # Build the query list from the total page count
                    # (the first page was handled above, so start from page 2)
                    for pages in range(2, int(page) + 1, 1):
                        url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(KEY=KEY, pn=pages, time=time)
                        querylist.append(url)
                    self.__storeqeuryurllist__(querylist, LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE, {'KEY': KEY})
            else:
                Logger.getlogging().debug('Sorry, no posts related to "' + KEY + '" were found')
        # Collect URLs from each search-result page
        elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
            self.pageprocess(params)