def base_init():
    LogGo.init(Configs())
    RequestHelper.init(Configs())
    SMTPServer.init(Configs())
    Download(Configs())
    RequestHelperClassVer.init(Configs())
    ProxyHelper.init(Configs())
    MysqlHelper.init(Configs())
    BaseStrategy.init()
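# Usage sketch, assuming Configs() is constructible with no arguments (as the
# calls above suggest): run base_init() once at process start, before any
# ruler issues a request, since the helpers above hold shared configuration.
if __name__ == '__main__':
    base_init()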
def scan_list(self, target, exists): """请求参数""" par = (['flag', 'true'], ['uuid', target.extra0]) """抓取关键字""" keys = [ 'title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount', 'publicTime', 'summary' ] list = [] result_list = [] try: raw = RequestHelper.post(NewrankRuler.url, par, file_cookie=Configs.newrank_cookie_file) except Exception as e: import traceback msg = traceback.format_exc() # print(msg) LogGo.warning(msg) return (-1, (target, None, None, None)) try: list = ExtraJSON.extra_newrank_wechat_list(raw, keys) except: return (-1, (target, None, None, None)) if len(list) > 0: list.reverse() for item in list: if exists.count(item['title']) < 1: result_list.append(item) LogGo.debug('newrank list length:' + str(len(result_list))) if len(result_list) > 0: return (1, (target, list, None, None)) return (-1, (target, None, None, None))
def send_request(self, result_dic):
    json_dic = dict()
    json_dic['date'] = DateGo.get_current_date()
    json_dic['targetId'] = 'No Target ID!'
    json_dic['rowList'] = [result_dic]
    try:
        LogGo.info("Ready to Post!")
        raw = RequestHelper.post(Configs.fish_data_post_url, json=json_dic)

        # Log a preview copy with the bulky text fields stubbed out.
        preview_dic = result_dic.copy()
        preview_dic['text_not_format_clob'] = 'DUMMY CONTENT'
        preview_dic['text_blob'] = 'DUMMY CONTENT'
        json_dic['rowList'] = [preview_dic]
        json_str = json.dumps(json_dic)

        LogGo.info("POST CONTENT: " + json_str)
        LogGo.info("POST RESPONSE: " + str(raw))
    except Exception:
        E.out_err()
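# A hedged sketch of the JSON shape send_request() posts. The row fields shown
# are the two stubbed in the preview above plus a hypothetical 'title'; real
# rows come from the rulers' scan_detail() dictionaries.
def example_fish_payload():
    return {
        'date': DateGo.get_current_date(),
        'targetId': 'No Target ID!',
        'rowList': [{
            'title': 'example',                      # hypothetical field
            'text_not_format_clob': 'DUMMY CONTENT',
            'text_blob': 'DUMMY CONTENT',
        }],
    }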
class Plantform(BaseRuler):
    req = RequestHelper()

    mgtv_base = 'http://vc.mgtv.com/v2/dynamicinfo?cid={0}'
    sohu_base = 'http://count.vrs.sohu.com/count/query_Album.action?albumId={0}'
    qq_base = 'https://m.v.qq.com/play.html?cid={0}'
    letv_base = 'http://v.stat.letv.com/vplay/queryMmsTotalPCount?pid={0}&vid={1}'
    iqiyi_base = 'http://mixer.video.iqiyi.com/jp/albums/{0}'
    cntv_base = 'http://www.soku.com/detail/show/{0}'
    youku_base = 'http://list.youku.com/show/{0}.html'
    youku_prefix = 'id_{0}'

    """
    Youku: use the PC show-description page URL,
        e.g. http://list.youku.com/show/id_z0f2233c722ec11e6bdbb.html,
        or the id in the URL (id_z0f2233c722ec11e6bdbb), or the trimmed id (z0f2233c722ec11e6bdbb).
    Mango TV (mgtv): use the cover-page id from the PC URL,
        e.g. 295541 from http://www.mgtv.com/h/295541.html?fpa=se.
    Tencent (qq): use the alphanumeric id from the PC show-description page link
        (the scrape itself may go through the mobile site),
        e.g. dhzimk1qzznf301 from https://v.qq.com/x/cover/dhzimk1qzznf301/l0024si3r7q.html,
        or 45yhivg8n755kh1 from http://v.qq.com/detail/4/45yhivg8n755kh1.html.
    iQiyi: use an id-like value found via the mobile site's developer tools
        (for films it may be tvId, aId, referenceId or albumId, sometimes inside a
        request whose name starts with content_config),
        e.g. 204446001 from 204446001?callback=Zepto1499852260800
        for http://m.iqiyi.com/v_19rrax9nq4.html#vfrm=13-0-0-1.
    Sohu: use pid, or albumId (possibly films only), the number in the mobile URL,
        e.g. 9344732 from http://m.film.sohu.com/album/9344732.html,
        or pid from the PC detail page's developer tools,
        e.g. pid 9347799 from v?id=3879082&pid=9347799&pageNum=1&pageSize=50&isgbk=true&var=video_similar_search_result
        for http://tv.sohu.com/s2017/dnwshylxt/.
    Letv: use pid and vid from the mobile play page's developer tools,
        e.g. pid and vid from queryMmsTotalPCount?pid=10036184&vid=29037420&rnd=1499915428741&callback=jsonp4
        for http://m.le.com/vplay_29037420.html.
    CNTV: play counts come from Youku's soku platform; use the id from the
        description-page URL,
        e.g. XMTI1NDY1Ng from http://www.soku.com/detail/show/XMTI1NDY1Ng.
    """

    # @Annoations.exe_time
    def scan(self, target, order):
        result = []
        type = self.td(target)
        url = target.extra0  # e.g. 'http://ent.people.com.cn/GB/81374/index1.html'
        cap = None
        ruler = None

        if type == 'i':
            cap = ['var tvInfoJs=', '']
            url = self.iqiyi_base.format(url)
            ruler = 'keywords:contentKeyword;latestOrder:latestOrder;name:name;playCount:playCount;score:score;videoCount:videoCount'
        elif type == 'l':
            ruler = 'score:plist_score;comments:pcommon_count;bullets:pdm_count;like:up;hate:down;playCount:plist_play_count'
            url = self.letv_base.format(url, target.extra1)
        elif type == 't':
            cap = ["tlux.dispatch('$cover',", ");"]
            ruler = 'score:score->score;playCount:view_all_count;videoCount:episode_all;latestOrder:episode_updatedd'
            url = self.qq_base.format(url)
        elif type == 'm':
            url = self.mgtv_base.format(url)
            cap = ['"data":', ',"msg"']
            ruler = 'playCount:all;like:like;hate:unlike'
        elif type == 'y':
            ruler = 'playCount:li [总播放数];comments:li [评论];like:li [顶];score:span class=star-num'
            if not s.is_url(url):
                if not url.startswith('id'):
                    url = self.youku_prefix.format(url)
                url = self.youku_base.format(url)
        elif type == 's':
            url = self.sohu_base.format(url)
        elif type == 'c':
            url = self.cntv_base.format(url)
            ruler = 'playCount:^label [播放次数]'

        try:
            encode = ExtraHtml.get_page_encode(url)
            if type == 'y' or type == 'c':
                result = self.looper_html(url, ruler, encode, target)
            else:
                raw = RequestHelper.get(url, encode=encode)
                if type == 's':
                    result = self.finder_sohu(raw)
                else:
                    result = self.looper_js(raw, ruler, cap)
        except AttributeError:
            pass
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(repr(e))

        if len(result) > 0:
            result = self.build_base_dic(target, result, order)
            if len(result) > 0:
                return result[0]
        return None

    def union(self):
        """Merge per-platform counts into per-program groups."""
        result = []
        dao = SoapDao()
        rows = dao.get_new_count()
        pro_list = []
        while len(rows) > 0:
            row = rows.pop(0)
            tmp_list = [row]
            for i in range(len(rows) - 1, -1, -1):
                if rows[i][TBSoap.program.key] == row[TBSoap.program.key]:
                    tmp_list.append(rows.pop(i))
            pro_list.append(tmp_list)
        if len(pro_list) > 0:
            result = self.build_count_dic(pro_list)
        return result

    def looper_js(self, raw, ruler, cap):
        # e.g. iqiyi cap: ['var tvInfoJs=', '']
        return ExtraJSON.extra_any_json_dic(raw, ruler, cap=cap)

    def td(self, type):
        """Normalize a platform name (or a Target's soap_type) to a one-letter code."""
        if isinstance(type, Target):
            type = type.soap_type
        if type == 'iqiyi' or type == 'i':
            return 'i'
        elif type == 'letv' or type == 'l':
            return 'l'
        elif type == 'qq' or type == 'q' or type == 't':
            return 't'
        elif type == 'mgtv' or type == 'm':
            return 'm'
        elif type == 'youku' or type == 'y':
            return 'y'
        elif type == 'sohu' or type == 's':
            return 's'
        elif type == 'cntv' or type == 'c':
            return 'c'
        else:
            return None

    def finder_sohu(self, raw):
        try:
            count = s.cut_tail(raw.split('=')[1], ';')
            return {'playCount': count}
        except Exception:
            # Return an empty dict so the caller's len() check still works.
            return {}

    def looper_html(self, url, ruler, encode, target):
        content = ExtraHtml.web_extra_content(url, ruler, encode)
        if self.td(target) == 'y':
            # Youku values look like '总播放数:1,234,567'; strip the label and commas.
            try:
                content['comments'] = int(''.join(content['comments'].split(':')[1].split(',')))
                content['like'] = int(''.join(content['like'].split(':')[1].split(',')))
                content['playCount'] = int(''.join(content['playCount'].split(':')[1].split(',')))
            except Exception:
                pass
        elif self.td(target) == 'c':
            # CNTV play counts only need the thousands separators removed.
            try:
                content['playCount'] = int(''.join(content['playCount'].split(',')))
            except Exception:
                pass
        # try:
        #     content['score'] = int(content['score'])
        # except Exception:
        #     pass
        return content

    def build_base_dic(self, target, result, order):
        soap = TBSoap()
        program_dao = ProgramDao()
        soap_result = []
        try:
            if Configs.show_utf:
                try:
                    name = target.data_key
                except Exception:
                    name = '<<error>>'
                LogGo.info(">>> name: " + str(name) + "(" + str(result['playCount']) + ")")

            # Dict keys map to database column names, values to the data to store.
            dic = dict()
            try:
                dic[soap.play_count.key] = result['playCount']  # current play count
            except KeyError as e:
                raise BaseDateLackException(str(e))
            try:
                dic[soap.keywords.key] = result['keywords']  # keywords
            except Exception:
                pass
            try:
                dic[soap.bullet_count.key] = result['bullets']  # bullet-comment (danmu) count
            except Exception:
                pass
            try:
                dic[soap.hate_count.key] = result['hate']  # downvote count
            except Exception:
                pass
            try:
                dic[soap.like_count.key] = result['like']  # upvote count
            except Exception:
                pass
            try:
                dic[soap.latest_order.key] = result['latestOrder']  # latest episode
            except Exception:
                pass
            try:
                dic[soap.name.key] = result['name']  # show name
            except Exception:
                pass
            try:
                dic[soap.name.key] = program_dao.get_title_by_id(target.program_id)
            except Exception:
                pass
            try:
                dic[soap.score.key] = result['score']  # score
            except Exception:
                pass
            try:
                dic[soap.video_count.key] = result['videoCount']  # number of videos
            except Exception:
                pass
            try:
                dic[soap.program.key] = target.program_id  # program
                dic[soap.target.key] = target.id  # target
            except Exception:
                pass
            dic[soap.plantform.key] = target.soap_type
            order += 1
            dic[soap.order_code.key] = order  # sort code
            dic[soap.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
            dic[soap.valid.key] = 1
            soap_result.append(dic)
        except BaseDateLackException as e:
            LogGo.warning("Lacks important data(" + str(e) + ')')
        except DataFormatException:
            pass
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
        return soap_result

    def build_count_dic(self, pro_list):
        result = []
        try:
            LogGo.info(">>> count: " + str(len(pro_list)))
            for programs in pro_list:
                # Dict keys map to database column names, values to the data to store.
                dic = dict()
                total = 0
                for program in programs:
                    try:
                        dic[TBProgramPlayCount.program.key] = program[TBSoap.program.key]
                        plantform = program[TBSoap.plantform.key]
                        count = program[TBSoap.play_count.key]
                        total += count
                        if self.td(plantform) == 'i':
                            dic[TBProgramPlayCount.count1.key] = count
                        elif self.td(plantform) == 'l':
                            dic[TBProgramPlayCount.count2.key] = count
                        elif self.td(plantform) == 't':
                            dic[TBProgramPlayCount.count3.key] = count
                        elif self.td(plantform) == 'm':
                            dic[TBProgramPlayCount.count4.key] = count
                        elif self.td(plantform) == 'y':
                            dic[TBProgramPlayCount.count5.key] = count
                        elif self.td(plantform) == 's':
                            dic[TBProgramPlayCount.count6.key] = count
                    except Exception:
                        import traceback
                        LogGo.info(traceback.format_exc())
                dic[TBProgramPlayCount.total_count.key] = total
                dic[TBProgramPlayCount.create_time.key] = datetime.datetime.now().strftime('%Y-%m-%d')  # record creation time
                result.append(dic)
        except BaseDateLackException as e:
            LogGo.warning("Lacks important data(" + str(e) + ')')
        except DataFormatException:
            pass
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
        return result
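# The `ruler` strings above are a small 'dst:src;dst:src' mapping DSL consumed
# by ExtraJSON.extra_any_json_dic / ExtraHtml.web_extra_content. A minimal
# sketch of how such a string could be parsed, for illustration only: the real
# parsing lives in those helpers, and treating '->' as JSON nesting is an
# assumption based on 'score:score->score'.
def parse_ruler(ruler):
    mapping = {}
    for pair in ruler.split(';'):
        dst, src = pair.split(':', 1)
        # '->' appears to point into nested objects, e.g. 'score:score->score'.
        mapping[dst] = src.split('->')
    return mapping

# Example: parse_ruler('playCount:all;like:like;hate:unlike')
# -> {'playCount': ['all'], 'like': ['like'], 'hate': ['unlike']}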
class WeiboRuler(BaseRuler):
    req = RequestHelper()

    request_login = '******'
    url_login = "******"
    # request_getindex = 'http://m.weibo.cn/container/getIndex'
    request_getindex = 'https://m.weibo.cn/api/container/getIndex'
    url_status = 'http://m.weibo.cn/status/'

    limited_attitude_count = 0
    limited_forward_count = 0
    exist_program = []

    """Request parameters"""
    par = (['username', Configs.weibo_username],
           ['password', Configs.weibo_password],
           ['savestate', 1],
           ['ec', 0],
           ['entry', 'mweibo'])

    """Scrape keys"""
    keys = ['title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount', 'publicTime']

    """Request headers"""
    header = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Mobile Safari/537.36",
        "Referer": "https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F%3Fjumpfrom%3Dwapv4%26tip%3D1",
        "Origin": "https://passport.weibo.cn",
        "Host": "passport.weibo.cn",
        "DNT": "1",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4",
        "Accept-Encoding": "gzip, deflate, br",
    }

    def scan_list(self, target, exists):
        self.limited_forward_count = target.limited_forward_count
        self.limited_attitude_count = target.limited_attitude_count

        list = []
        result_list = []

        """Simulated login (currently stubbed out)"""
        status = 'you got it'

        """If login succeeded"""
        if status != '':
            self.loops(target, exists, list)
            if len(list) < 1:
                return (0, (target, None, None, None))
        else:
            LogGo.warning("Weibo: Loop scan failed!")
            return (-1, (target, None, None, None))

        if len(list) > 0:
            list = self.purify(list)
            list.reverse()
            for item in list:
                if exists.count(item['id']) < 1:
                    result_list.append(item)
            LogGo.debug('weibo list length: ' + str(len(result_list)))
            if len(result_list) > 0:
                return (1, (target, list, None, None))
        return (-1, (target, None, None, None))

    def scan_detail(self, target, detail_page_bundle, order, content_ruler, encode):
        self.limited_forward_count = target.limited_forward_count
        self.limited_attitude_count = target.limited_attitude_count
        if detail_page_bundle is not None:
            return self.build_single_page_dic(target, detail_page_bundle, order, content_ruler, encode)
        else:
            return None

    # @Annoations.exe_time
    def loops(self, target, exists, result):
        try:
            base_url = target.extra0
            for i in range(0, Configs().length_weibo):
                print("page: " + str(i))
                list = self.build_and_request(WeiboRuler.keys, base_url, WeiboRuler.request_getindex, i)
                if len(list) == 0:
                    break
                for item in list:
                    """Duplicate check for routine scans"""
                    if exists.count(item['id']) < 1:
                        result.append(item)
                    else:
                        return
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(repr(e))
            LogGo.warning("Scan Failed!")
        return

    """
    Drop duplicate posts by id (a re-check during scraping: Weibo updates so
    quickly that items can shift position while paging). Walk from index 0 and
    append non-duplicates to result; scanning result backwards is more
    efficient here.
    20180306: added filters: repost count (minimum), like count (minimum),
    and a known-program-name check (drop posts mentioning more than 3).
    """
    # @Annoations.exe_time
    def purify(self, list):
        if len(list) < 1:
            return []
        result = []
        for i in list:
            try:
                flag = True
                id = i['id']
                text = i['text']
                limited_attitude_count = i['attitudes_count']
                limited_forward_count = i['reposts_count']
                program_count = 0

                # Pass 1: drop duplicate ids.
                for seq in result[::-1]:
                    if id == seq['id']:
                        flag = False
                        break

                # Pass 2: program-name heuristic (may also drop non-program posts).
                if flag and text.count('《') < 1:
                    flag = False

                # Pass 3: threshold filters.
                if flag and self.limited_attitude_count is not None and limited_attitude_count is not None:
                    if limited_attitude_count < self.limited_attitude_count:
                        flag = False
                if flag and self.limited_forward_count is not None and limited_forward_count is not None:
                    if limited_forward_count < self.limited_forward_count:
                        flag = False

                # Pass 4: drop posts mentioning too many known program names.
                if flag:
                    for program in self.exist_program:
                        if text.count(program) >= 1:
                            program_count = program_count + 1
                        if program_count > 3:
                            flag = False
                            break

                if flag:
                    result.append(i)
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())
        return result

    def build_single_page_dic(self, target, detail_page_bundle, order, content_ruler, encode):
        news = TBNews()
        article = TBArticle()
        result_dic = dict()
        try:
            LogGo.info(WeiboRuler.url_status + detail_page_bundle['id'])

            # Dict keys map to database column names, values to the data to store.
            news_dic = dict()
            article_dic = dict()

            """Sort code"""
            order += 2
            news_dic[news.order_code.key] = order

            sub_tim = detail_page_bundle['created_at']
            if sub_tim is not None:
                news_dic[news.subscribe_time.key] = sub_tim  # publication date
            else:
                LogGo.warning("no subscribe time!")

            news_dic[news.create_date.key] = DateGo.get_current_date()  # record creation time
            news_dic[news.status.key] = 1  # status
            news_dic[news.valid.key] = 1
            news_dic[news.title.key] = detail_page_bundle['text']
            news_dic[news.text_not_format.key] = detail_page_bundle['text']
            news_dic[news.text_blob.key] = detail_page_bundle['text']

            """Owning account"""
            try:
                user = detail_page_bundle['user']
                article_dic[article.company.key] = user['screen_name']
            except Exception:
                pass

            article_dic[article.vote_up_count.key] = detail_page_bundle['attitudes_count']  # like count
            article_dic[article.scrabble_type.key] = 'weibo'  # article type
            article_dic[article.is_scrabbled.key] = 1  # marks this row as scraped data
            article_dic[article.identifier.key] = detail_page_bundle['id']  # id in the source system
            article_dic[article.target_id.key] = target.id
            article_dic[article.content_url.key] = WeiboRuler.url_status + detail_page_bundle['id']  # content link
            article_dic[article.publishStatus.key] = 1
            # article_dic[article.messageType.key] = random.randint(0, 1)

            """For replies/quotes, record the id of the quoted post"""
            try:
                retweeted_status = detail_page_bundle['retweeted_status']
                article_dic[article.identifier_re.key] = retweeted_status['id']
            except Exception:
                pass

            result_dic.update(article_dic)
            result_dic.update(news_dic)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return None
        return result_dic

    """
    Build a paged request URL from the given home-page url, issue the request,
    and return the extracted response data.
    """
    def build_and_request(self, keys, base_url, url, page):
        we_par = base_url.split('?')[1]
        _we_pars = dict()
        for par in we_par.split('&'):
            tmp = par.split('=', 1)
            _we_pars[tmp[0]] = tmp[1]

        we_pars_dic = dict()
        we_pars_dic['uid'] = _we_pars['uid']
        we_pars_dic['luicode'] = _we_pars['luicode']
        we_pars_dic['type'] = 'uid'
        we_pars_dic['value'] = _we_pars['uid']
        we_pars_dic['lfid'] = _we_pars['lfid']
        we_pars_dic['containerid'] = '107603' + _we_pars['uid']
        # we_pars_dic['featurecode'] = '0'
        # we_pars_dic['retcode'] = '0'

        request_url = url + "?"

        """dest fixes the order in which parameters are concatenated"""
        dest = ['uid', 'luicode', 'lfid', 'type', 'value', 'containerid']
        # dest = ['uid', 'luicode', 'lfid', 'featurecode', 'retcode', 'type', 'value', 'containerid']
        for key in dest:
            request_url += key + "=" + str(we_pars_dic[key]) + "&"
        if int(page) > 1:
            request_url += 'page=' + str(page)

        """Fetch"""
        # raw = RequestHelper.get(request_url)
        raw = WeiboRuler.req._get(request_url)

        """Extract: start marker + scrape keys"""
        tup = ExtraJSON.extra_getindex_list(raw, keys)
        return tup
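# A hedged sketch of the same getIndex page-URL construction using the
# standard library instead of manual string concatenation. Parameter names
# mirror the ones build_and_request() extracts; whether the endpoint accepts
# re-encoded parameters in a different order has not been verified here.
from urllib.parse import urlencode, urlsplit, parse_qs

def build_getindex_url(base_url, page):
    src = {k: v[0] for k, v in parse_qs(urlsplit(base_url).query).items()}
    params = {
        'uid': src['uid'],
        'luicode': src['luicode'],
        'lfid': src['lfid'],
        'type': 'uid',
        'value': src['uid'],
        'containerid': '107603' + src['uid'],
    }
    if int(page) > 1:
        params['page'] = page
    return WeiboRuler.request_getindex + '?' + urlencode(params)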
def get_request():
    url = 'http://app.media-plus.cn/portal/search/updateIndex'
    RequestHelper.get(url)
class NewrankRuler(BaseRuler):
    jsons = ExtraJSON()
    req = RequestHelper()

    url = 'https://www.newrank.cn/xdnphb/detail/getAccountArticle'

    """Take the uuid (public account) from target, then extract."""
    """Returns a list of dicts, one dict per WeChat article."""
    def scan_list(self, target, exists):
        """Request parameters"""
        par = (['flag', 'true'], ['uuid', target.extra0])
        """Scrape keys"""
        keys = ['title', 'author', 'publicTime', 'url', 'clicksCount',
                'likeCount', 'publicTime', 'summary']

        list = []
        result_list = []
        try:
            raw = RequestHelper.post(NewrankRuler.url, par, file_cookie=Configs.newrank_cookie_file)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return (-1, (target, None, None, None))
        try:
            list = ExtraJSON.extra_newrank_wechat_list(raw, keys)
        except Exception:
            return (-1, (target, None, None, None))

        if len(list) > 0:
            list.reverse()
            for item in list:
                if exists.count(item['title']) < 1:
                    result_list.append(item)
            LogGo.debug('newrank list length: ' + str(len(result_list)))
            if len(result_list) > 0:
                return (1, (target, list, None, None))
        return (-1, (target, None, None, None))

    def scan_detail(self, target, detail_page_bundle, order, content_ruler, encode):
        news = TBNews()
        article = TBArticle()
        # picture_dao = PictureDao()
        result_dic = dict()
        try:
            info = self.ready_info(detail_page_bundle['title'], detail_page_bundle['url'])
            LogGo.info(info)
            try:
                # tup = ExtraJSON.wechat_extra_content(detail_page_bundle['url'])
                tup = self.jsons.wechat_extra_content(detail_page_bundle['url'])
            except HttpConnectionFailedException as e:
                LogGo.warning(repr(e))
                return (-3, None)
            except AttributeError:
                LogGo.warning("Maybe a deleted msg, complete the code to detect this error")
                return (-2, None)
            except Exception:
                LogGo.warning("Error when getting detail message!")
                return (-2, None)

            raw_content = tup[1]
            content = tup[2]
            picture = tup[3]

            # Dict keys map to database column names, values to the data to store.
            news_dic = dict()
            article_dic = dict()

            ############################## NEWS ###############################
            """List image id"""
            # if picture is not None:
            #     picture_id = picture_dao.save_data(picture)
            #     news_dic[news.main_pic_id.key] = picture_id
            news_dic[news.text_not_format.key] = content  # body text with tags stripped
            # news_dic[news.text_blob.key] = raw_content  # raw body with tags
            news_dic[news.subscribe_time.key] = detail_page_bundle['publicTime']  # publication date
            news_dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
            news_dic[news.subject.key] = detail_page_bundle['summary']  # abstract
            news_dic[news.valid.key] = 1
            news_dic[news.author.key] = detail_page_bundle['author']
            news_dic[news.title.key] = detail_page_bundle['title']  # article title
            news_dic[news.status.key] = 2
            order += 5
            news_dic[news.order_code.key] = order  # sort code

            ############################## ARTICLE ###############################
            article_dic[article.content_url.key] = detail_page_bundle['url']  # content link
            article_dic[article.fingerprint.key] = md5(detail_page_bundle['url'])  # fingerprint derived from the URL
            article_dic[article.company.key] = target.data_key  # owning account
            article_dic[article.target_id.key] = target.id
            article_dic[article.raw_click_count.key] = detail_page_bundle['clicksCount']  # read count
            article_dic[article.vote_up_count.key] = detail_page_bundle['likeCount']  # like count
            article_dic[article.scrabble_type.key] = 'wechat'  # article type, fixed to 'wechat'
            article_dic[article.is_scrabbled.key] = 1  # marks this row as scraped data
            article_dic[article.publishStatus.key] = 1
            # article_dic[article.messageType.key] = random.randint(0, 1)

            ############################## DIC ###############################
            result_dic.update(news_dic)
            result_dic.update(article_dic)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return (-1, None)
        return (1, result_dic)
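# The rulers share an informal (status, payload) protocol: scan_list() returns
# (1, ...) when fresh items were found, (0, ...) when the scan ran but found
# nothing new, and (-1, ...) on failure; scan_detail() additionally uses -2
# for apparently deleted pages and -3 for connection errors. A minimal caller
# sketch under that reading (ruler/target/exists are placeholders):
def run_list_scan(ruler, target, exists):
    status, (tgt, items, _, _) = ruler.scan_list(target, exists)
    if status == 1:
        return items   # fresh articles to process
    if status == 0:
        return []      # scan ran, nothing new
    raise RuntimeError('scan failed for target ' + str(tgt.id))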
class GsdataRuler(BaseRuler):
    jsons = ExtraJSON()
    req = RequestHelper()

    url = 'http://www.gsdata.cn/rank/toparc?wxname={0}&wx={1}&sort=-1'

    def __init__(self):
        self.news = NewsDao()

    """Take the uuid (public account) from target, then extract."""
    """Returns a list of dicts, one dict per WeChat article."""
    def looper_js(self, result, raw, exists, ruler, captup=None):
        cap = captup
        if captup is not None:
            if captup.count(' ') == 2:
                cap = Sh.str_to_tup(captup)
        list = ExtraJSON.extra_any_json(raw, ruler, cap=cap)
        if len(list) > 0:
            for item in list:
                """Duplicate check for routine scans (currently disabled)"""
                if 1 > 0:  # if exists.count(item['link']) < 1:
                    result.append(item)
                else:
                    break

    def sort(self, list):
        # Selection-style sort: newest date first; for equal dates, smaller 'top' first.
        for i in range(0, len(list)):
            for j in range(i + 1, len(list)):
                if list[i]['date'] < list[j]['date'] or \
                        (list[i]['date'] == list[j]['date'] and list[i]['top'] > list[j]['top']):
                    list[i], list[j] = list[j], list[i]
        return list

    def scan_list(self, target, exists):
        list = []
        result_list = []
        cap = 'data'
        ruler = 'author:author;title:title;date:posttime;img:picurl;link:url;top:top;click:readnum_newest;vote_up:likenum_newest;subject:content'
        url = self.url.format(target.extra0, target.wx_hao)
        header = {'X-Requested-With': 'XMLHttpRequest'}
        raw = RequestHelper.get(url, header=header, file_cookie=Configs.gsdata_cookie_file)
        try:
            self.looper_js(list, raw, exists, ruler, cap)
        except Exception as e:
            E.out_err(e)
            return (-1, (target, None, None, None))

        if len(list) > 0:
            list = self.sort(list)
            list.reverse()
            for item in list:
                if exists.count(item['title']) < 1:
                    result_list.append(item)
            LogGo.debug('gsdata list length: ' + str(len(result_list)))
            if len(result_list) > 0:
                return (1, (target, list, None, None))
        return (-1, (target, None, None, None))

    def scan_detail(self, target, detail_page_bundle, order, content_ruler, encode):
        news = TBNews()
        article = TBArticle()
        # picture_dao = PictureDao()
        result_dic = dict()
        try:
            """Fingerprint derived from the URL"""
            signature = md5(detail_page_bundle['link'])
            info = self.ready_info(detail_page_bundle['title'], detail_page_bundle['link'])
            LogGo.info(info)
            try:
                tup = self.jsons.wechat_extra_content(detail_page_bundle['link'])
            except HttpConnectionFailedException as e:
                LogGo.warning(repr(e))
                return (-3, None)
            except AttributeError:
                LogGo.warning("Maybe a deleted msg, complete the code to detect this error")
                return (-2, None)

            raw_content = tup[1]
            content = tup[2]

            # Dict keys map to database column names, values to the data to store.
            news_dic = dict()
            article_dic = dict()

            ############################## NEWS ###############################
            """List image"""
            picture = detail_page_bundle['img']
            """List image id"""
            # if picture is not None:
            #     picture_id = picture_dao.save_data(picture)
            #     news_dic[news.main_pic_id.key] = picture_id
            order = order + 2
            news_dic[news.order_code.key] = order  # sort code
            news_dic[news.subject.key] = detail_page_bundle['subject']  # abstract
            news_dic[news.valid.key] = 1
            news_dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
            news_dic[news.text_not_format.key] = content  # body text with tags stripped
            # news_dic[news.text_blob.key] = raw_content  # raw body with tags
            news_dic[news.title.key] = detail_page_bundle['title']  # article title
            news_dic[news.subscribe_time.key] = detail_page_bundle['date']  # publication date
            news_dic[news.status.key] = 2
            try:
                news_dic[news.author.key] = detail_page_bundle['author']
            except Exception:
                pass

            ############################## ARTICLE ###############################
            try:
                article_dic[article.raw_click_count.key] = int(detail_page_bundle['click'])  # read count
            except Exception:
                pass
            try:
                article_dic[article.vote_up_count.key] = int(detail_page_bundle['vote_up'])  # like count
            except Exception:
                pass
            article_dic[article.scrabble_type.key] = 'wechat'  # article type, fixed to 'wechat'
            article_dic[article.is_scrabbled.key] = 1  # marks this row as scraped data
            article_dic[article.fingerprint.key] = signature  # fingerprint derived from the URL
            article_dic[article.target_id.key] = target.id
            article_dic[article.company.key] = target.data_key  # owning account
            article_dic[article.content_url.key] = detail_page_bundle['link']  # content link
            article_dic[article.publishStatus.key] = 1
            # article_dic[article.messageType.key] = random.randint(0, 1)

            ############################## DIC ###############################
            result_dic.update(article_dic)
            result_dic.update(news_dic)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return (-1, None)
        return (1, result_dic)
class WechatRuler:
    req = RequestHelper()

    """Scrape keys"""
    keys = ['author', 'content_url', 'cover', 'digest', 'title', 'datetime', 'fileid']

    """Take the uuid (public account) from target, then extract."""
    """Returns a list of dicts, one dict per WeChat article."""
    @Annoations.exe_time
    def ExtraList(self, target, existsUrls, order):
        # order = ScrappdeDataDao.get_max_order_code()  # sort code from the database
        result = []
        url = str(target.extra0)
        next_index = ""

        """Fetch"""
        raw = WechatRuler.req._get(url)
        try:
            trup = ExtraJSON.extraWechatList(raw, 'msgList', WechatRuler.keys)
            list = trup[0]
            next_index = str(trup[1])
        except Exception as e:
            print(e)
            print("ERROR")
            return result

        while True:
            try:
                print('>>> scanning id: ' + next_index)
                LogGo.info('>>> scanning id: ' + next_index)
                tup = self.loopToFail(url, next_index)
                re_list = tup[0]
                next_index = str(tup[1])
                is_continue = tup[2]
                if len(re_list) > 0:
                    for item in re_list:
                        list.append(item)
                else:
                    break
                if is_continue != 1:
                    break
            except Exception as e:
                print(e)
                break

        print('>>> list scanning completed')
        print('>>>')
        list.reverse()
        print('>>> Start Build SQL')
        result = self.build_base_dic(target, list, existsUrls, order)
        print('>>> Build SQL Success')
        print('>>>')
        return result

    def build_base_dic(self, target, list, existsUrls, order):
        news = TBNews()
        article = TBArticle()
        picture_dao = PictureDao()
        result = []
        article_result = []

        """Scrape article bodies"""
        for i in list:
            try:
                i['content_url'] = UrlHelper.unify(i['content_url'])  # StringHelper.unescape(i['content_url'])
                if existsUrls.count(i['content_url']) < 1:
                    LogGo.info(">>> file id: " + str(i['fileid']))
                    LogGo.info(">>> url: " + str(i['content_url']))
                    try:
                        tup = ExtraJSON.wechat_extra_content(i['content_url'])
                    except Exception as e:
                        print(e)
                        print(">>> extra content error.")
                        LogGo.info("extra content error.")
                        LogGo.info("possibly a deleted msg")
                        # LogGo.info("url: " + i['content_url'])
                        continue

                    raw_content = tup[1]
                    content = tup[2]

                    # Dict keys map to database column names, values to the data to store.
                    dic = dict()
                    article_dic = dict()

                    order = order + 5
                    dic[news.order_code.key] = order  # sort code
                    dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # record creation time
                    dic[news.valid.key] = 1
                    dic[news.text_not_format.key] = content  # body text with tags stripped
                    # dic[news.text_blob.key] = raw_content  # raw body with tags
                    dic[news.subscribe_time.key] = i['datetime']  # publication date
                    dic[news.author.key] = i['author']  # author
                    dic[news.title.key] = i['title']  # article title
                    dic[news.subject.key] = i['digest']  # abstract
                    dic[news.status.key] = 2

                    picture_id = picture_dao.save_data(i['cover'])
                    dic[news.main_pic_id.key] = picture_id  # list image id

                    article_dic[article.fingerprint.key] = md5(i['content_url'])  # fingerprint derived from the URL
                    article_dic[article.target_id.key] = target.id
                    article_dic[article.company.key] = target.data_key  # owning account
                    article_dic[article.content_url.key] = i['content_url']  # content link
                    article_dic[article.scrabble_type.key] = 'wechat'  # article type, fixed to 'wechat'
                    article_dic[article.is_scrabbled.key] = 1  # marks this row as scraped data

                    result.append(dic)
                    article_result.append(article_dic)
            except Exception as e:
                import traceback
                msg = traceback.format_exc()
                print(msg)
                LogGo.warning(repr(e))
                continue
        return result, article_result

    """Build getmasssendmsg JS request URLs and keep requesting until it fails."""
    def loopToFail(self, url, index):
        """Split the request url and extract its parameters for the follow-up requests."""
        we_par_header = url.split('?')[0]
        we_par = url.split('?')[1]
        we_pars = we_par.split('&')

        we_pars_dic = dict()
        we_pars_dic['count'] = 10
        we_pars_dic['f'] = 'json'
        we_pars_dic['x5'] = 0
        we_pars_dic['frommsgid'] = str(index)
        we_pars_dic['wxtoken'] = ''
        for par in we_pars:
            tmp = par.split('=', 1)
            we_pars_dic[tmp[0]] = tmp[1]

        request_url = we_par_header + "?"
        dest = ['__biz', 'uin', 'key', 'f',
                'frommsgid', 'count', 'uin', 'key', 'pass_ticket',
                'wxtoken', 'x5']
        for key in dest:
            request_url += key + "=" + str(we_pars_dic[key]) + "&"

        """Fetch"""
        # raw = RequestHelper.get(request_url)
        raw = WechatRuler.req._get(request_url)

        """Extract: start marker + scrape keys"""
        tup = ExtraJSON.extraGetMassList(raw, WechatRuler.keys)
        return tup
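# A hedged sketch of loopToFail()'s URL assembly using urllib.parse. Note the
# original `dest` list repeats 'uin' and 'key', so those parameters are sent
# twice; whether the endpoint requires that duplication is unknown, and this
# sketch keeps single occurrences.
from urllib.parse import urlencode, parse_qs

def build_getmass_url(url, index):
    base, query = url.split('?', 1)
    src = {k: v[0] for k, v in parse_qs(query).items()}
    params = {
        '__biz': src['__biz'],
        'uin': src['uin'],
        'key': src['key'],
        'f': 'json',
        'frommsgid': str(index),
        'count': 10,
        'pass_ticket': src['pass_ticket'],
        'wxtoken': '',
        'x5': 0,
    }
    return base + '?' + urlencode(params)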