def analysis(self, line, post=False):
    """Parse one crawler-output JSON line into a ProcessParam.

    :param line: a JSON string with at least 'crawler_time', 'foundin'
        and 'html' keys; may also carry 'data' and a 'property' list.
    :param post: when True, also copy the POST payload ``js['data']``
        onto the returned param.
    :returns: a populated ProcessParam.
    """
    param = ProcessParam()
    js = json.loads(line)
    param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
    param.url = Common.urldec(js['foundin'])
    param.content = js['html']
    if post:
        param.data = js['data']
    if js['html'][:3] == constant.GZIP_CODE:
        # payload is gzip-compressed; wbits = 16 + MAX_WBITS tells zlib
        # to expect a gzip header/trailer
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode: un-escape the URL-encoded body, then decode with the charset
    # declared in the page itself
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    if 'property' in js:
        # `prop` instead of `property` (avoids shadowing the builtin);
        # `in` instead of the deprecated dict.has_key()
        for prop in js['property']:
            # skip properties the extractor produced no result for
            if 'result' not in prop:
                continue
            if prop['property_name'] == u'page_body':
                param.page_body = Common.trydecode(
                    Common.urldec(prop['result'][0]['text']),
                    constant.CHARSET_GBK)
            elif prop['property_name'] == u'page_title':
                param.page_title = Common.trydecode(
                    Common.urldec(prop['result'][0]['text']),
                    constant.CHARSET_GBK)
            elif prop['property_name'] == u'html_time':
                param.html_time = TimeUtility.getuniformtime2(
                    prop['result'][0]['text'])
    return param
def process(self, params):
    """Drive the multi-step comment crawl for one article URL.

    STEP_1: extract the article id (aid) from the URL and queue the aid page.
    STEP_2: pull the cms id out of the page body and queue the key-id page.
    STEP_3: read the comment session id (sid) and queue the first comment page.
    STEP_4: store comments newer than the stored update time, queue next page.
    """
    try:
        # step constants are compared with `==`: the original used `is`,
        # which is identity comparison and only worked by interning accident
        if params.step == AllComments.STEP_1:
            # article id is the digit run in the last path segment
            aid = re.findall(r"\d+", params.url.split("/")[-1])[0]
            aid_url = AllComments.AID_URL % (aid)
            self.storeurl(aid_url, params.originalurl, AllComments.STEP_2,
                          {'aid': aid})
        elif params.step == AllComments.STEP_2:
            # cms id is embedded in a JS literal: appidArr = ["cms|<id>", ...
            cms_id = re.findall(r'appidArr \= \[\"cms\|(.+?)",',
                                str(params.content))[0]
            cms_url = AllComments.KEYID_URL % (
                cms_id, params.customized['aid'], params.originalurl)
            self.storeurl(cms_url, params.originalurl, AllComments.STEP_3,
                          {'aid': params.customized['aid'],
                           'cmsid': cms_id})
        elif params.step == AllComments.STEP_3:
            comments = json.loads(params.content)
            sid = comments['data']['_id']
            comment_url = AllComments.COMMENTS_URL % (
                sid, '1', params.customized['cmsid'])
            self.storeurl(comment_url, params.originalurl, AllComments.STEP_4,
                          {'sid': sid,
                           'page': '1',
                           'cmsid': params.customized['cmsid']})
        elif params.step == AllComments.STEP_4:
            comments = json.loads(params.content)
            try:
                comment = []
                for item in comments['data']:
                    ctime = TimeUtility.getuniformtime2(item['ctime'])
                    # only keep comments whose timestamp is newer than the
                    # stored update time for this url
                    if URLStorage.storeupdatetime(params.originalurl,
                                                  str(ctime)):
                        cmti = CommentInfo()
                        cmti.content = item['content']
                        comment.append(cmti)
                self.commentstorage.store(params.originalurl, comment)
                next_page = str(int(params.customized['page']) + 1)
                comment_url = AllComments.COMMENTS_URL % (
                    params.customized['sid'], next_page,
                    params.customized['cmsid'])
                self.storeurl(comment_url, params.originalurl,
                              AllComments.STEP_4,
                              {'sid': params.customized['sid'],
                               'page': next_page,
                               'cmsid': params.customized['cmsid']})
            except Exception:
                # was a bare `except:` — kept as deliberate best-effort
                # paging, but narrowed so KeyboardInterrupt/SystemExit are
                # no longer swallowed
                return
    except Exception as e:
        traceback.print_exc()
        # str(e) instead of e.message: .message is deprecated since 2.6
        # and missing on many exception types
        Logger.getlogging().error(str(e))
def parseinfofromjson(self, jsondata):
    """Populate crawl time, page title and page body from a crawler
    JSON record (expects 'crawler_time' and a 'property' list)."""
    # page crawl time
    self.crawler_time = TimeUtility.getuniformtime2(
        int(jsondata['crawler_time']))
    # pick title/body out of the extracted property list
    for item in jsondata['property']:
        name = item['property_name']
        if name == 'page_title':
            self.page_title = item['result'][0]['text']
        elif name == 'page_body':
            self.page_body = item['result'][0]['text']
def processVideo(self, params):
    """Drive the three-step Mofang video comment crawl.

    STEP_1: scrape the cms id from the page, ask for a first comment batch.
    STEP_2: re-request the comment list sized to the full total.
    STEP_3: store every comment newer than the stored update time.
    """
    try:
        # `==` instead of `is` for step constants (value, not identity)
        if params.step == MofangComments.STEP_1:
            # cms id lives in a data-flag attribute; bail out when absent
            if not self.r.search('data-flag=\"(.*?)\">', params.content):
                return
            cmsid = self.r.parse('data-flag=\"(.*?)\">', params.content)[0]
            comments_url = MofangComments.COMMENTS_URL % (cmsid, '4')
            self.storeurl(comments_url, params.originalurl,
                          MofangComments.STEP_2,
                          {'cmsid': cmsid, 'pagesize': '4'})
        elif params.step == MofangComments.STEP_2:
            comments = json.loads(params.content)
            pagesize = comments['data']['total']
            comments_url = MofangComments.COMMENTS_URL % (
                params.customized['cmsid'], pagesize)
            self.storeurl(comments_url, params.originalurl,
                          MofangComments.STEP_3,
                          {'cmsid': params.customized['cmsid'],
                           'pagesize': pagesize})
        elif params.step == MofangComments.STEP_3:
            # renamed from `comments` so the JSON payload is not shadowed
            # by the comment list built below
            data = json.loads(params.content)
            # `!=` replaces `<>`, the removed Python 2 inequality operator
            if params.customized['pagesize'] != '0':
                ptime = []
                pcontent = []
                for key in range(int(params.customized['pagesize'])):
                    entry = data['data']['list'][key]
                    ptime.append(
                        TimeUtility.getuniformtime2(entry['create_time']))
                    pcontent.append(entry['html_content'])
                if ptime != []:
                    newcomments = []
                    for index, text in enumerate(pcontent):
                        # only comments in the new (incremental) time window
                        # are stored; the loop stops at the first stale one
                        # (presumably the list is newest-first — TODO confirm)
                        if URLStorage.storeupdatetime(params.originalurl,
                                                      str(ptime[index])):
                            cmti = CommentInfo()
                            cmti.content = text
                            newcomments.append(cmti)
                        else:
                            # storeupdatetime already refreshed the DB time
                            break
                    self.commentstorage.store(params.originalurl, newcomments)
    except Exception:
        Logger.printexception()
# NOTE(review): this span begins inside an `if info.clicknum > 10:` branch
# whose opening line is outside the visible chunk; the dangling `-= 10` /
# `else:` pair below belongs to that conditional. Each counter is decayed
# by 10, clamped at 0.
    info.clicknum -= 10
else:
    info.clicknum = 0
if info.cmtnum > 10:
    info.cmtnum -= 10
else:
    info.cmtnum = 0
if info.votenum > 10:
    info.votenum -= 10
else:
    info.votenum = 0
if info.fansnum > 10:
    info.fansnum -= 10
else:
    info.fansnum = 0
if info.realnum > 10:
    info.realnum -= 10
else:
    info.realnum = 0
if info.updatetime > TimeUtility.getuniformtime2(0):
    # normalize to the 19-char "YYYY-MM-DD HH:MM:SS" form before parsing
    if len(info.updatetime) != 19:
        info.updatetime = getuniformtime(info.updatetime)
    dt = datetime.datetime.strptime(info.updatetime,
                                    TimeUtility.TIME_FORMAT_DEFAULT)
    # step the stored update time back by one day
    info.updatetime = TimeUtility.getuniformtime(
        str(dt - datetime.timedelta(days=int(1))))
# NOTE(review): original whitespace is mangled — the three statements below
# may belong inside the `if` above; placed at fragment level as the more
# likely reading (every record is printed and written back). Confirm against
# the original file.
print info.tostring()
db.put(key, info.tostring())
db.flush()