def homepage(self):
    """Return the text of the home page, fetching it at most once.

    The first call requests ``self.url_home`` through ``send_request`` and
    stores ``response.text`` on ``self._homepage``; later calls return the
    stored value without issuing another request.
    """
    cached = self._homepage
    if cached is not None:
        return cached

    page = send_request(
        'get',
        self.url_home,
        session=self.session,
        headers=HEADERS,
        retries=-1,
    )
    self._homepage = page.text
    return self._homepage
def wrapper(self, *args, **kwargs):
    """Send the request described by the wrapped function and log the outcome.

    The wrapped ``func`` must return a dict. Keys read from it:

    * ``params``: payload, sent as ``params`` for GET and ``data`` for POST.
    * ``api``: URL to call; falls back to the enclosing ``api`` when missing
      or ``None``.
    * ``extra_kwargs``: optional dict of extra keyword arguments forwarded
      to ``send_request`` (may override the payload key).
    * ``msg``: dict of key/value pairs; the call counts as successful when
      the response holds the same value under every key.
    * ``check_func``: callable given the response; consulted only when
      ``msg`` is falsy. A truthy result counts as success.
    * ``tips``: dict whose ``ok`` / ``fail`` entries are logged on success
      or failure.
    * ``callback``: plain function invoked as ``callback(self, response)``;
      when present its return value replaces the response.

    Returns:
        The callback result if a callback is given, otherwise the response.
        A falsy response is returned unchanged without any checks.

    Raises:
        KeyError: if the enclosing ``method`` is neither GET nor POST.
    """
    spec = func(self, *args, **kwargs)
    verb = method.lower()

    target = spec.get('api', api)
    if target is None:
        target = api

    payload_key = {
        'get': 'params',
        'post': 'data',
    }[verb]
    request_kwargs = {payload_key: spec.get('params')}
    # ``extra_kwargs`` is optional; dict.update(None) would raise TypeError.
    request_kwargs.update(spec.get('extra_kwargs') or {})

    response = send_request(verb, url=target, session=self.session,
                            **request_kwargs)
    if not response:
        return response

    msg = spec.get('msg')
    check_func = spec.get('check_func')
    tips = spec.get('tips', {})
    callback = spec.get('callback')

    if msg:
        succeeded = all(response.get(key) == expected
                        for key, expected in msg.items())
        logger.info(tips.get('ok') if succeeded else tips.get('fail'))
    elif check_func:
        logger.info(tips.get('ok') if check_func(response) else tips.get('fail'))

    if callback and isfunction(callback):
        return callback(self, response)
    return response
def search(self, keyword, count=COUNT_SEARCH, USER=False, VIDEO=False, ALL=True, MDB=None, strict=False):
    """Page through the search API for ``keyword`` and collect the items.

    Args:
        keyword: Search term passed to ``params_for_search``.
        count: Maximum number of items to collect; only enforced when
            ``ALL`` is false.
        USER: Search the user tab (tab 4). Takes precedence over ``VIDEO``.
        VIDEO: Search the video tab (tab 2). Without either flag the
            general tab (tab 1) is used.
        ALL: When true, keep paging until the API stops reporting more data.
        MDB: Optional storage. A truthy value that is not a ``Database`` is
            replaced by ``Database(MONGODB)``. Items whose ``id`` is already
            stored are skipped; the rest are saved.
        strict: Together with ``USER``, return the first item whose ``name``
            equals ``keyword`` as soon as it is seen.

    Returns:
        The list of collected items, or a single item dict when a strict
        user match is found.
    """
    if USER:
        tab = 4
    elif VIDEO:
        tab = 2
    else:
        tab = 1
    tab_kind = {1: '综合', 2: '视频', 4: '用户'}
    amount = 0
    offset = 0
    results = []
    retries = MAX_RETRY
    dbname = MONGODB['search']
    if MDB:
        if isinstance(MDB, Database) and not MDB.connected:
            MDB.connect()
        elif not isinstance(MDB, Database):
            MDB = Database(MONGODB)
            MDB.connect()
        MDB.use_db(dbname)

    # The table name depends only on the arguments, so build it once.
    tname = f'{keyword}-{tab_kind[tab]}'

    while True:
        params = params_for_search(keyword, tab=tab, offset=offset)
        # Request headers are deliberately not logged; they may carry
        # credentials.
        logger.debug(f'search request: {API_SEARCH} params={params}')
        response = send_request('get', API_SEARCH, params=params, JSON=True,
                                retries=retries, DATA=1,
                                headers=self.headers)
        data = response.get('data')
        if data:
            offset = response.get('offset')
            for item in data:
                if not ALL and amount >= count:
                    logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                    return results
                if MDB:
                    _id = item.get('id')
                    asks = MDB.select({'id': {"=": _id}}, tname=tname)
                    if asks:
                        # Already stored: neither saved again nor counted.
                        continue
                    MDB.save(item, tname=tname)
                if strict and USER and item.get('name') == keyword:
                    logger.info(f'[搜索匹配成功]Strict 模式下搜索到相关用户!')
                    return item
                results.append(item)
                amount += 1
            logger.info(f'此次已搜索:{keyword} {tab_kind[tab]}数据 {amount} 条.')

        if response.get('has_more') != 0:
            # Pages after the first are requested with retries=-1.
            retries = -1
        else:
            logger.info(
                f'搜索关键词:{keyword} {tab_kind[tab]}数据采集完毕. 此次采集总数:{amount}.')
            return results
def get_published(self, count=COUNT_NEWS, ALL=False, MDB=None, STRONG=True, MODE=ARTICLE, end_link=None, **kwargs):
    """Collect the items this user has published, newest first.

    Args:
        count: Maximum number of items to collect; only enforced when
            ``ALL`` is false.
        ALL: When true, keep paging until the API stops reporting more data.
        MDB: Accepted for interface compatibility; not used by this method.
        STRONG: Forwarded to ``send_request`` as ``DATA``.
        MODE: Kind of content to fetch. ``WEITT`` uses the shared
            ``W_PARAMS`` payload, any other value uses ``payload_for_get``.
        end_link: Full URL (``URL_HOST`` + item ``source_url``) at which
            collection stops; the matching item is not included.
        **kwargs: Optional overrides:
            ``API`` (endpoint, default ``API_USER_ARTICLE``),
            ``headers`` (default ``self.headers_article``),
            ``data_cb`` (callable run as ``data_cb(item, *cb_args)``; a
            truthy result skips the item),
            ``cb_args`` (extra positional arguments for ``data_cb``).

    Returns:
        The list of collected items.
    """
    hot_time = '0'
    amount = 0
    results = []
    retries = MAX_RETRY
    API = kwargs.get('API', API_USER_ARTICLE)
    headers = kwargs.get('headers', self.headers_article)
    callback = kwargs.get('data_cb')
    cb_args = kwargs.get('cb_args', ())

    while True:
        if MODE == WEITT:
            # NOTE: this updates the module-level W_PARAMS dict in place.
            W_PARAMS.update({
                'visit_user_id': self.id,
                'max_behot_time': hot_time,
            })
            params = W_PARAMS
        else:
            params = payload_for_get(self.id, MODE, hot_time)

        response = send_request('get', API, session=self.session,
                                params=params, JSON=True, retries=retries,
                                DATA=STRONG, headers=headers)
        all_data = response.get('data')
        if all_data:
            paging = response.get('next')
            hot_time = paging.get('max_behot_time')
            newest_first = sorted(all_data, key=lambda x: x['behot_time'],
                                  reverse=True)
            for item in newest_first:
                if callback and callable(callback):
                    if callback(item, *cb_args):
                        continue
                amount += 1

                # Build the URL only when a stop link was requested, and
                # tolerate items without 'source_url' (concatenating None
                # would raise TypeError).
                if end_link is not None:
                    source_url = item.get('source_url')
                    if source_url is not None and URL_HOST + source_url == end_link:
                        return results

                # amount was already incremented for this item, hence the
                # strict comparison: exactly `count` items are returned.
                if not ALL and amount > count:
                    logger.info(
                        f'[采集完毕] 已达到采集要求的{count}条{MODE_MAP[MODE]}数据.[OK]'
                    )
                    return results

                results.append(item)
            logger.info(
                f'此次已采集用户:{self.name} ID:{self.id} {MODE_MAP[MODE]}数据 {amount} 条.'
            )

        if response.get('has_more', False) is True:
            # Pages after the first are requested with retries=-1.
            retries = -1
        else:
            logger.info(
                f'用户:{self.name} ID:{self.id} 此次采集{MODE_MAP[MODE]}完毕. 此次采集总数:{amount}.'
            )
            return results
def wrapper(self, *args, **kwargs):
    """Page through a relation endpoint for this user and collect the items.

    The wrapped ``func`` returns a dict of options (``count``, ``MDB``,
    ``ALL``); a falsy return value short-circuits to an empty list. The
    endpoint is the enclosing ``api`` when given, otherwise ``APIS[option]``.
    Paging follows the ``cursor`` field of each response and stops once it
    is 0. When storage is enabled, items whose ``user_id`` is already stored
    are skipped and the rest are saved with ``f_cleaner`` as the format.

    Returns:
        The list of collected items.
    """
    endpoint = api if api is not None else APIS[option]
    options = func(self, *args, **kwargs)
    if not options:
        return []

    limit = options.get('count', 0)
    collected = []
    total = 0
    cursor = 0
    retries = MAX_RETRY
    MDB = options.get('MDB')
    ALL = options.get('ALL')
    dbname = MONGODB[option]

    if MDB:
        if not isinstance(MDB, Database):
            # Anything truthy that is not a Database means "use my own db".
            MDB = self.db
            if not MDB.connected:
                MDB.connect()
        elif not MDB.connected:
            MDB.connect()
        MDB.use_db(dbname)

    while True:
        query = payload_for_relation(self.id, cursor)
        login_headers = {'cookie': COOKIE}
        response = send_request(method, endpoint, params=query, JSON=True,
                                session=self.session, retries=retries,
                                DATA=1, headers=login_headers)
        page = response.get('data')
        if bool(page):
            cursor = response.get('cursor')
            for entry in page:
                if not ALL and total >= limit:
                    logger.info(f'[采集完毕] 已达到采集要求的{limit}条数据.[OK]')
                    return collected
                if MDB:
                    table = f'{self.name}-{self.id}'
                    existing = MDB.select(
                        {'user_id': {"=": entry.get('user_id')}},
                        tname=table,
                    )
                    if existing:
                        continue
                    MDB.save(entry, tname=table, format=f_cleaner)
                collected.append(entry)
                total += 1
            logger.info(
                f'此次已采集用户:{self.name} ID:{self.id} {option}数据 {total} 条.'
            )

        if response.get('cursor') == 0:
            logger.info(
                f'用户:{self.name} ID:{self.id} 此次采集{option}完毕. 此次采集总数:{total}.'
            )
            return collected
        # More pages remain: later requests use retries=-1.
        retries = -1
def wrapper(self, *args, **kwargs):
    """Generic paginated fetch driven by the dict the wrapped ``func`` returns.

    Keys read from that dict:

    * ``params_func``: builds the request payload; called with the current
      paging values plus ``extra_args`` and ``res_args`` as keywords.
    * ``var``: mapping of paging variable name -> initial value; after each
      page the values are re-read from the response under the same names
      (inside ``response[var_outer]`` when ``var_outer`` is set).
    * ``more`` / ``more_out``: key (default ``'has_more'``) and optional
      enclosing key of the "more data" flag.
    * ``request_kwargs``: extra keyword arguments for ``send_request``.
    * ``res_args``: when non-empty, gets the latest response stored under
      ``'response'``.
    * ``data_wrap`` / ``data_out`` / ``item_out``: where the item list lives
      inside the response.
    * ``item_callback``: plain function called as ``(self, item)``. A truthy
      non-tuple result skips the item; a tuple ending in 200 replaces the
      item with the tuple's first element.
    * ``condition_handle``: mapping of item key -> sequence whose first
      element is a reference value and last element a predicate; the item
      is skipped when every predicate is truthy.
    * ``db_setup`` (``db``, ``tname``, ``ticket``) and ``cleaner``: storage
      settings used when the caller passes ``MDB``.

    Caller keyword arguments read: ``count`` (default ``COUNT_HOTNEWS``),
    ``MDB`` and ``ALL``.

    Returns:
        The list of collected items.
    """
    spec = func(self, *args, **kwargs)
    params_func = spec.get('params_func')
    more = spec.get('more', 'has_more')
    more_out = spec.get('more_out')
    variables = spec.get('var', {})
    handler = spec.get('condition_handle', {})
    req_kwargs = spec.get('request_kwargs', {})
    extra_args = spec.get('extra_args', {})
    res_args = spec.get('res_args', {})
    db_setup = spec.get('db_setup', {})
    var_outer = spec.get('var_outer')
    cleaner = spec.get('cleaner')
    data_out = spec.get('data_out')
    item_out = spec.get('item_out')
    item_callback = spec.get('item_callback')
    data_wrap = spec.get('data_wrap', True)

    count = kwargs.get('count', COUNT_HOTNEWS)
    MDB = kwargs.get('MDB')
    ALL = kwargs.get('ALL')

    var_keys = list(variables.keys())
    var_values = list(variables.values())
    retries = MAX_RETRY
    amount = 0
    results = []

    while True:
        params = params_func(*var_values, **extra_args, **res_args)

        # The payload goes under exactly one of 'data' (POST) or 'params'.
        if method.lower() == 'post':
            req_kwargs['data'] = params
            req_kwargs.pop('params', None)
        else:
            req_kwargs['params'] = params
            req_kwargs.pop('data', None)

        response = send_request(method, api, retries=retries, **req_kwargs)

        if not data_wrap:
            data = response
        elif data_out:
            data = response.get(data_out).get('data')
        else:
            data = response.get('data')

        if bool(data):
            source = response.get(var_outer) if var_outer else response
            var_values = [source.get(key) for key in var_keys]
            if res_args:
                res_args['response'] = response

            raw_data = data.get(item_out) if item_out else data
            if not raw_data:
                logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
                return results

            for item in raw_data:
                if item_callback and isfunction(item_callback):
                    cb_res = item_callback(self, item)
                    if isinstance(cb_res, tuple):
                        if cb_res[-1] == 200:
                            item = cb_res[0]
                    elif cb_res:
                        continue

                if not ALL and amount >= count:
                    logger.info(f'[采集完毕] 已达到搜索要求的{count}条数据.[OK]')
                    return results

                if handler:
                    # Every predicate is evaluated (no short-circuit).
                    verdicts = [
                        1 if rule[-1](rule[0], item.get(key)) else 0
                        for key, rule in handler.items()
                    ]
                    if all(verdicts):
                        logger.info(f'未满足抓取条件,略过,标识:{item.get(db_setup["ticket"])}')
                        continue

                if MDB:
                    if not isinstance(MDB, Database):
                        MDB = Database(MONGODB)
                        MDB.connect()
                    elif not MDB.connected:
                        MDB.connect()
                    MDB.use_db(db_setup['db'])
                    if cleaner and callable(cleaner):
                        item = cleaner(item)
                    ticket = db_setup['ticket']
                    found = MDB.select({ticket: {"=": item.get(ticket)}},
                                       tname=db_setup['tname'])
                    if found:
                        continue
                    MDB.save(item, tname=db_setup['tname'])

                results.append(item)
                amount += 1

            if MDB:
                tip = f'此次抓取 存入数据库:{db_setup.get("db")} 数据 {amount} 条.表:{db_setup.get("tname")}'
            else:
                tip = f'此次抓取 数据 {amount} 条.'
            logger.info(tip)

        flag_holder = response.get(more_out) if more_out else response
        if not flag_holder.get(more):
            logger.info(f'数据抓取完毕. 此次采集总数:{amount}.')
            return results
        # NOTE(review): sibling paginators set retries to -1 here; this one
        # increments it. Kept as-is - confirm which is intended.
        retries += 1
def craw_per_item(self, account_obj, art_obj, last_topic):
    """Fetch and store the topics of one group that are newer than ``last_topic``.

    Pages backwards through ``ZSXQ_API_GET_TOPICS`` (newest first), saving
    ``q&a`` and ``talk`` topics that carry text, until it reaches a topic
    that is not newer than ``last_topic.createdate``, a page with at most
    one topic, or an unsuccessful response.

    Args:
        account_obj: Object whose ``id`` is stored as the topic's account id.
        art_obj: Object whose ``id`` is stored as the article id and whose
            ``link`` is used as the group id in the request URL.
        last_topic: Most recent stored topic (its ``createdate`` is the stop
            point), or ``None`` to fetch without a stop point.

    Returns:
        The number of topics saved.
    """
    account_id = account_obj.id
    art_id = art_obj.id
    group_id = art_obj.link
    last_topic_createdate = (
        last_topic.createdate if last_topic is not None else None)

    # Unquoted paging cursor. It is quoted only while building the URL so
    # that a page which does not move the cursor never re-quotes an already
    # quoted value.
    end_time_str = ''
    result_num = 0

    # Names eval() needs to resolve JSON literals; without them it raises
    # NameError. 'null' is included so null fields no longer crash.
    json_literals = {'true': True, 'false': False, 'null': None}

    while True:
        get_url = ZSXQ_API_GET_TOPICS.format(
            group_id=group_id,
            end_time_str=urllib.parse.quote(end_time_str))
        response = send_request('get', get_url, session=self.session,
                                headers=HEADERS, retries=-1, verify=False)
        content = response.content.decode('utf8')
        content = content.replace("\n", "\\n").encode(
            'utf8', 'ignore').decode('utf8')
        # SECURITY NOTE(review): eval() on an HTTP response body executes
        # whatever expression the server sends. Builtins are removed here,
        # but that is not a real sandbox; migrate to json.loads() once the
        # effect on stored text (escape and surrogate handling) is verified.
        json_obj = eval(content, {'__builtins__': {}}, json_literals)

        if not json_obj['succeeded']:
            return result_num

        topics = sorted(json_obj['resp_data']['topics'],
                        key=lambda x: x['create_time'], reverse=True)
        if len(topics) <= 1:
            return result_num

        for t in topics:
            topicid = t['topic_id']
            topic_type = t['type']
            digested = 1 if t['digested'] is True else 0
            create_time = topic.zsxq_datetime_to_db(t['create_time'])

            # Move the cursor just before every topic seen, including ones
            # skipped below; otherwise a page made only of skipped topics
            # would be requested again forever.
            end_time_str = topic.to_zsxq_datetime(
                create_time - datetime.timedelta(milliseconds=1))

            question_content = None
            answer_content = None
            if topic_type == 'q&a':
                if (not t['answered']
                        or t['question'].get('text') is None
                        or t['answer'].get('text') is None):
                    continue
                # The encode/decode round trip drops characters that cannot
                # be encoded as UTF-8.
                question_content = t['question']['text'].encode(
                    'utf8', 'ignore').decode('utf8')
                answer_content = t['answer']['text'].encode(
                    'utf8', 'ignore').decode('utf8')
            elif topic_type == 'talk':
                if t['talk'].get('text') is None:
                    continue
                # Talk-type topic: its text is stored as the question.
                question_content = t['talk']['text'].encode(
                    'utf8', 'ignore').decode('utf8')

            if (last_topic_createdate is not None
                    and not create_time > last_topic_createdate):
                # End: reached the already-stored point in time.
                return result_num

            topic_obj = Topic(type=topic_type,
                              question=question_content,
                              anwser=answer_content,
                              artid=art_id,
                              accountid=account_id,
                              createdate=create_time,
                              digested=digested,
                              topicid=topicid)
            topic.save(topic_obj)
            result_num += 1