def article(paramsss, headersss, codess, urls):
    # Fetch a Twitter user profile via the GraphQL UserByScreenNameWithoutResults endpoint
    # and store the account metadata; on failure, switch to another account via login().
    try:
        response = requests.get(
            'https://twitter.com/i/api/graphql/Vf8si2dfZ1zmah8ePYPjDQ/UserByScreenNameWithoutResults',
            headers=headersss, params=paramsss)
        if response.status_code != 200:
            print("switching account")
            login(paramsss, codess, headersss, urls)
        else:
            content = response.content
            getobj = GetValue2(content)
            contenttext = getobj.get_values('legacy')
            if contenttext is None:
                print("switching account")
                login(paramsss, codess, headersss, urls)
            else:
                accountLevel = 0
                accountClassification = 0
                contenttexts = GetValue2(contenttext)
                establishTime = contenttexts.get_values('created_at')
                description = contenttexts.get_values('description')
                followCount = contenttexts.get_values('friends_count')
                fansCount = contenttexts.get_values('normal_followers_count')
                authorName = contenttexts.get_values('name')
                headPortrait = contenttexts.get_values('profile_image_url_https')
                bindingAddress = contenttexts.get_values('display_url')
                location = contenttexts.get_values('location')
                # Normalize missing fields so every stored document has the same shape.
                if location is None:
                    location = ""
                if followCount is None:
                    followCount = 0
                if description is None:
                    description = ""
                if fansCount is None:
                    fansCount = 0
                if bindingAddress is None:
                    bindingAddress = ""
                data = []
                data.append(InsertOne({
                    "account_level": accountLevel,
                    "account_classification": accountClassification,
                    "establish_time": establishTime,
                    "description": description,
                    "follow_count": followCount,
                    "fans_count": fansCount,
                    "author_name": authorName,
                    "head_portrait": headPortrait,
                    "binding_address": bindingAddress,
                    "authorUrl": urls,
                    "location": location,
                }))
                insertdb(data)
    except Exception:
        import traceback
        traceback.print_exc()
def article(pss, headersss):
    # Earlier variant of the profile fetch: same endpoint, but it only prints the
    # extracted binding address instead of writing anything to the database.
    try:
        response = requests.get(
            'https://twitter.com/i/api/graphql/Vf8si2dfZ1zmah8ePYPjDQ/UserByScreenNameWithoutResults',
            headers=headersss, params=pss)
        content = response.content
        getobj = GetValue2(content)
        contenttext = getobj.get_values('legacy')
        accountLevel = 0
        accountClassification = 0
        contenttexts = GetValue2(contenttext)
        establishTime = contenttexts.get_values('created_at')
        description = contenttexts.get_values('description')
        followCount = contenttexts.get_values('friends_count')
        fansCount = contenttexts.get_values('normal_followers_count')
        authorName = contenttexts.get_values('name')
        headPortrait = contenttexts.get_values('profile_image_url_https')
        bindingAddress = contenttexts.get_values('display_url')
        print(bindingAddress)
    except Exception:
        import traceback
        traceback.print_exc()
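# The helpers shared by the crawlers in this file (GetValue2, insertdb, login, the ss
# session and the headers/params/cookies globals) are defined elsewhere in the project.
# The class below is a minimal, assumed sketch of GetValue2 only, reconstructed from how
# it is called here: it parses a JSON response and returns the first value found under a
# given key anywhere in the nested structure. The real implementation may differ.
import json


class GetValue2:
    def __init__(self, content):
        # Accept raw bytes/str (an HTTP body) or an already-parsed dict/list.
        if isinstance(content, (bytes, str)):
            self.data = json.loads(content)
        else:
            self.data = content

    def get_values(self, key):
        # Depth-first search for the first occurrence of `key`; None when absent.
        return self._find(self.data, key)

    def _find(self, node, key):
        if isinstance(node, dict):
            if key in node:
                return node[key]
            for value in node.values():
                found = self._find(value, key)
                if found is not None:
                    return found
        elif isinstance(node, list):
            for item in node:
                found = self._find(item, key)
                if found is not None:
                    return found
        return None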
def article(args, par, codess):
    # Pull tweets from the adaptive search API and store each one as an article document.
    response = ss.get('https://twitter.com/i/api/2/search/adaptive.json',
                      headers=args, params=par)
    if response.status_code == 429:
        print("switching account")
        login(codess)
    else:
        content = response.content
        getobj = GetValue2(content)
        contenttext = getobj.get_values('tweets')
        if contenttext is None:
            pass
        else:
            lists = list(contenttext.values())
            for li in lists:
                imgs = ''
                contenttext = dict(li)
                contenttexts = GetValue2(contenttext)
                contenttextss = contenttexts.get_values('entities')
                videourl = re.compile("'video/mp4', 'url': '(.*?)'").findall(str(li))
                if videourl == []:
                    # No video attached: collect any image URLs from the entities block.
                    videourl = ''
                    if contenttextss is None:
                        print("request failed")
                    else:
                        contenttextss = GetValue2(contenttextss)
                        imgurl = contenttextss.get_values('media_url_https')
                        if imgurl is None:
                            print("error")
                        else:
                            if isinstance(imgurl, list):
                                for im in imgurl:
                                    imgs += "<br><img src='" + im + "'></img>"
                            else:
                                imgs += "<br><img src='" + imgurl + "'></img>"
                else:
                    videourl = "<br><video src='" + videourl[0] + "' controls></video>"
                contenttextss = contenttexts.get_values('full_text')
                user = contenttexts.get_values('conversation_id_str')
                times = contenttexts.get_values('created_at')
                onlyId = contenttexts.get_values('user_id')
                articleUrl = "https://twitter.com/" + str(onlyId) + "/status/" + str(user)
                print(user)
                # Parse Twitter's UTC timestamp and shift it to UTC+8 before storing.
                tempTime = time.strptime(times, '%a %b %d %H:%M:%S +0000 %Y')
                resTime = time.strftime('%Y-%m-%d %H:%M:%S', tempTime)
                startTime = datetime.datetime.strptime(resTime, "%Y-%m-%d %H:%M:%S")
                startTime2 = (startTime + datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                contentText = contenttextss + videourl + imgs
                downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                site = "twitter"
                siteId = 1044518
                data = []
                articleStatue = 0
                data.append(InsertOne({
                    "url": articleUrl,
                    "title": contenttextss,
                    "pub_time": startTime2,
                    "content": contentText,
                    "download_time": downloadTime,
                    "site": site,
                    "site_id": siteId,
                    "aid": user,
                    "only_id": onlyId,
                    "push_state": articleStatue,
                }))
                insertdb(data)
def article(args, par, codess):
    # Search-API variant that also extracts engagement counts and pairs each tweet with
    # its author (name + screen name) taken from the "users" section of the response.
    try:
        response = ss.get('https://twitter.com/i/api/2/search/adaptive.json',
                          headers=args, params=par)
        if response.status_code == 429 or response.status_code == 403:
            print("switching account")
            login(codess)
        else:
            content = response.content
            getobj = GetValue2(content)
            contenttext = getobj.get_values('tweets')
            usertext = getobj.get_values('users')
            # Match each tweet's user_id_str against the users block to build "name screen_name" pairs.
            userId = re.compile("'user_id_str': '(.*?)',").findall(str(contenttext))
            authorName = []
            for us in userId:
                authorss = re.compile(
                    " 'id_str': '" + us + "', 'name': '(.*?)', 'screen_name': '(.*?)',"
                ).findall(str(usertext))
                for au, auId in authorss:
                    aa = au + " " + auId
                    authorName.append(aa)
            if contenttext is None:
                pass
            else:
                lists = list(contenttext.values())
                for li, auName in zip(lists, authorName):
                    imgs = ''
                    contenttext = dict(li)
                    contenttexts = GetValue2(contenttext)
                    contenttextss = contenttexts.get_values('entities')
                    videourl = re.compile("'video/mp4', 'url': '(.*?)'").findall(str(li))
                    if videourl == []:
                        videourl = ''
                        if contenttextss is None:
                            print("request failed")
                        else:
                            contenttextss = GetValue2(contenttextss)
                            imgurl = contenttextss.get_values('media_url_https')
                            if imgurl is None:
                                print("error")
                            else:
                                if isinstance(imgurl, list):
                                    for im in imgurl:
                                        imgs += "<br><img src='" + im + "'></img>"
                                else:
                                    imgs += "<br><img src='" + imgurl + "'></img>"
                    else:
                        videourl = "<br><video src='" + videourl[0] + "' controls></video>"
                    contenttextss = contenttexts.get_values('full_text')
                    user = contenttexts.get_values('id_str')
                    times = contenttexts.get_values('created_at')
                    onlyId = contenttexts.get_values('user_id')
                    likeCount = contenttexts.get_values('favorite_count')
                    forwardCount = contenttexts.get_values('retweet_count')
                    replyCount = contenttexts.get_values('reply_count')
                    original = contenttexts.get_values('quoted_status_id')
                    if likeCount is None:
                        likeCount = 0
                    if forwardCount is None:
                        forwardCount = 0
                    if replyCount is None:
                        replyCount = 0
                    # A quoted_status_id means this tweet quotes another one.
                    originalState = 0 if original is None else 1
                    articleUrl = "https://twitter.com/" + str(onlyId) + "/status/" + str(user)
                    # Parse the UTC timestamp and shift it to UTC+8 before storing.
                    tempTime = time.strptime(times, '%a %b %d %H:%M:%S +0000 %Y')
                    resTime = time.strftime('%Y-%m-%d %H:%M:%S', tempTime)
                    startTime = datetime.datetime.strptime(resTime, "%Y-%m-%d %H:%M:%S")
                    startTime2 = (startTime + datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                    contentText = contenttextss + videourl + imgs
                    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    site = "twitter"
                    siteId = 1044518
                    data = []
                    datas = []
                    articleStatue = 0
                    authorStatue = 0
                    data.append(InsertOne({
                        "url": articleUrl, "title": contenttextss, "pub_time": startTime2,
                        "content": contentText, "download_time": downloadTime,
                        "site": site, "site_id": siteId, "aid": user, "only_id": onlyId,
                        "push_state": articleStatue, "like_counts": likeCount,
                        "forward_counts": forwardCount, "reply_counts": replyCount,
                        "original_state": originalState, "author_name": auName,
                    }))
                    datas.append(InsertOne({"auName": auName, "authorStatue": authorStatue}))
                    insertdb(data)
                    insertdbs(datas)
    except Exception:
        import traceback
        traceback.print_exc()
def article(responses, args, par):
    # Paginated search crawl: keep requesting the adaptive search endpoint, following the
    # "cursor" value from each page, until no tweets come back or too many duplicate
    # batches have been seen.
    try:
        if responses.status_code == 429 or responses.status_code == 403:
            print("switching account")
        else:
            content = responses.content
            fenye = re.compile('{"operation":{"cursor":{"value":"(.*?)",').findall(str(content))
            ids = 0
            for fy in range(1, 100000):
                print("iteration " + str(fy))
                response = ss.get('https://twitter.com/i/api/2/search/adaptive.json',
                                  headers=args, params=par)
                content = response.content
                getobj = GetValue2(content)
                contenttext = getobj.get_values('tweets')
                if contenttext is None:
                    break
                if ids >= 20:
                    # Too many duplicate batches in a row: assume we have caught up.
                    break
                else:
                    ids = 0
                usertext = getobj.get_values('users')
                # Pair each tweet's user_id_str with "name&~~&screen_name" from the users block.
                userId = re.compile("'user_id_str': '(.*?)',").findall(str(contenttext))
                authorName = []
                for us in userId:
                    authorss = re.compile(
                        " 'id_str': '" + us + "', 'name': '(.*?)', 'screen_name': '(.*?)',"
                    ).findall(str(usertext))
                    for au, auId in authorss:
                        aa = au + "&~~&" + auId
                        authorName.append(aa)
                lists = list(contenttext.values())
                for li, auName in zip(lists, authorName):
                    imgs = ''
                    contenttext = dict(li)
                    contenttexts = GetValue2(contenttext)
                    contenttextss = contenttexts.get_values('entities')
                    videourl = re.compile("'video/mp4', 'url': '(.*?)'").findall(str(li))
                    if videourl == []:
                        videourl = ''
                        if contenttextss is None:
                            print("request failed")
                        else:
                            contenttextss = GetValue2(contenttextss)
                            imgurl = contenttextss.get_values('media_url_https')
                            if imgurl is None:
                                print("error")
                            else:
                                if isinstance(imgurl, list):
                                    for im in imgurl:
                                        imgs += "<br><img src='" + im + "'></img>"
                                else:
                                    imgs += "<br><img src='" + imgurl + "'></img>"
                    else:
                        videourl = "<br><video src='" + videourl[0] + "' controls></video>"
                    contenttextss = contenttexts.get_values('full_text')
                    user = contenttexts.get_values('id_str')
                    times = contenttexts.get_values('created_at')
                    onlyId = contenttexts.get_values('user_id')
                    likeCount = contenttexts.get_values('favorite_count')
                    forwardCount = contenttexts.get_values('retweet_count')
                    replyCount = contenttexts.get_values('reply_count')
                    original = contenttexts.get_values('quoted_status_id')
                    if likeCount is None:
                        likeCount = 0
                    if forwardCount is None:
                        forwardCount = 0
                    if replyCount is None:
                        replyCount = 0
                    originalState = 0 if original is None else 1
                    articleUrl = "https://twitter.com/" + str(onlyId) + "/status/" + str(user)
                    # Parse the UTC timestamp and shift it to UTC+8.
                    tempTime = time.strptime(times, '%a %b %d %H:%M:%S +0000 %Y')
                    resTime = time.strftime('%Y-%m-%d %H:%M:%S', tempTime)
                    startTime = datetime.datetime.strptime(resTime, "%Y-%m-%d %H:%M:%S")
                    startTime2 = (startTime + datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                    # Delta against a fixed cut-off date; the count-by-date logic that used
                    # it is currently disabled.
                    duibiTime1 = datetime.datetime.strptime(startTime2, "%Y-%m-%d %H:%M:%S")
                    duibiTime2 = datetime.datetime.strptime('2021-06-24 00:00:00', "%Y-%m-%d %H:%M:%S")
                    delta = duibiTime1 - duibiTime2
                    contentText = contenttextss + videourl + imgs
                    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    site = "twitter"
                    siteId = 1044518
                    data = []
                    datas = []
                    articleStatue = 0
                    authorStatue = 0
                    data.append(InsertOne({
                        "url": articleUrl, "title": contenttextss, "pub_time": startTime2,
                        "content": contentText, "download_time": downloadTime,
                        "site": site, "site_id": siteId, "aid": user, "only_id": onlyId,
                        "push_state": articleStatue, "like_num": likeCount,
                        "share_num": forwardCount, "cmt_num": replyCount,
                        "original_state": originalState, "author": auName,
                    }))
                    datas.append(InsertOne({"auName": auName, "authorStatue": authorStatue}))
                    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    try:
                        # Write directly to the collection; a duplicate key counts toward ids.
                        collections.bulk_write(data)
                        print("insert finished")
                        print(downloadTime)
                    except Exception:
                        ids += 1
                        print("duplicate document")
                        print(downloadTime)
                    insertdbs(datas)
                # Follow the second cursor value on this page to request the next batch.
                fenye = re.compile('{"operation":{"cursor":{"value":"(.*?)",').findall(str(content))
                par = dict(par)
                par['cursor'] = fenye[1]
                print(fenye[1])
    except Exception:
        import traceback
        traceback.print_exc()
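# insertdb/insertdbs are not shown in this file. Below is a minimal sketch for the
# Twitter crawlers above, assuming a MongoDB backend via pymongo with a unique index on
# "url" so re-crawled articles are rejected as duplicates. The connection string and the
# database/collection names are placeholders, not taken from the original source; the
# Xueqiu and dutenews scripts later in the file use their own insertdb variants.
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient('mongodb://localhost:27017')        # assumed connection string
collections = client['spider']['twitter_article']        # assumed db/collection names
author_collection = client['spider']['twitter_author']   # assumed author collection


def insertdb(data):
    # data is a list of pymongo InsertOne operations built by the crawlers above.
    try:
        collections.bulk_write(data, ordered=False)
        print("insert finished")
    except BulkWriteError:
        print("duplicate skipped")


def insertdbs(datas):
    try:
        author_collection.bulk_write(datas, ordered=False)
    except BulkWriteError:
        print("duplicate author skipped")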
def my_job():
    # Crawl the dutenews "Pengyouquan" feed: list pages expose post ids via getUrl(...),
    # then each post's comments API returns the post content, images and video.
    for iu in range(126):
        iu = iu + 1
        paramss = dict(params)
        paramss['page'] = iu
        response = requests.get('https://m.dutenews.com/index/ajax/content',
                                headers=headers, params=paramss, cookies=cookies)
        content = response.content.decode('unicode-escape')
        contents = str(content)
        url = re.compile(r'getUrl\((.*?),\)').findall(contents)
        url = list(set(url))
        for i in url:
            try:
                urls = 'https://plus.dutenews.com/api/v2/feeds/' + i + '/comments'
                articleUrl = 'https://page.dutenews.com/H5/sns/#/imgText?id=' + i
                response = requests.get(urls)
                content = response.content.decode('utf-8')
                title = re.compile('"feed_content":"(.*?)",').findall(content)
                if len(title) == 0:
                    # Fall back to the "title" field when the post has no feed_content.
                    title = re.compile('"title":"(.*?)",').findall(str(content))
                titles = title[0].encode('utf-8').decode('unicode_escape')
                pubTime = re.compile('"created_at":"(.*?)",').findall(str(content))
                fmt = '%Y-%m-%dT%H:%M:%SZ'
                t = datetime.datetime.strptime(pubTime[0], fmt)
                # Shift to UTC+8.
                t += datetime.timedelta(hours=8)
                getobj = GetValue2(content)
                contenttext = getobj.get_values('images')
                videotext = getobj.get_values('video')
                articlecontent = ''
                if contenttext is not None:
                    imgs = ''
                    contenttexts = re.compile("'url': '(.*?)',").findall(str(contenttext))
                    for im in contenttexts:
                        imgs += "<br><img src='" + im + "'></img>"
                    articlecontent += titles + imgs
                if videotext is not None:
                    videos = ''
                    aa = GetValue2(videotext['resource'])
                    videourl = aa.get_values('url')
                    videos += "<br><video src='" + videourl + "' controls></video>"
                    articlecontent += titles + videos
                site = "读特-鹏友圈"
                siteId = 1048212
                data = []
                articleStatue = 0
                downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                data.append(InsertOne({
                    "url": articleUrl,
                    "title": titles,
                    "pub_time": t,
                    "content": articlecontent,
                    "download_time": downloadTime,
                    "site": site,
                    "site_id": siteId,
                    "aid": i,
                    "push_state": articleStatue,
                }))
                insertdb(data)
            except Exception:
                import traceback
                traceback.print_exc()
def my_job():
    # Crawl Xueqiu's hot list: for each original status, fetch the detail page for the
    # article itself, then page through its comments endpoint.
    response = ss.get('https://xueqiu.com/statuses/hot/listV2.json',
                      headers=headers, params=params, cookies=cookies)
    content = response.content.decode("utf-8")
    nianTime = datetime.datetime.now().strftime('%Y')
    url = re.compile('original_status":{"id":(.*?),"user_id":(.*?),"').findall(str(content))
    for ids in url:
        try:
            # Each match is an (id, user_id) tuple; strip the stray quote/space/paren characters.
            idss = str(ids).split(",")
            idd = idss[0].replace('\'', '')
            userid = idss[1].replace('\'', '')
            userid = userid.replace(" ", '')
            idd = idd.replace('(', '')
            userid = userid.replace(')', '')
            articleUrl = "https://xueqiu.com/" + userid + "/" + idd
            res = ss.get(articleUrl, headers=headers, cookies=cookies)
            contents = res.content.decode('utf-8')
            title = re.compile('<h1 class="status-title">(.*?)</h1>').findall(str(contents))
            # data-created_at is a millisecond epoch timestamp.
            shijianchuo = re.compile('data-created_at="(.*?)"').findall(str(contents))
            shijianchuo = int(shijianchuo[0])
            timeArray = time.localtime(shijianchuo / 1000)
            pubTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            articleText = re.compile('<div class="detail">(.*?)</div>').findall(str(contents))
            downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            onlyId = pubTime + "==" + idd
            insertdb(articleUrl, title[0], articleText[0], pubTime, downloadTime, idd, onlyId)
            # Page through the first five pages of comments for this status.
            fenyeUrl = ("https://xueqiu.com/statuses/comments.json?id=" + idd +
                        "&count=20&reply=true&asc=false&type=status&split=true")
            for ic in range(5):
                paramss = dict(paramsss)
                paramss['page'] = ic + 1
                fenyeRes = ss.get(fenyeUrl, headers=headers, cookies=cookies, params=paramss)
                fenyeContent = fenyeRes.content.decode('utf-8')
                datas = GetValue2(fenyeContent)
                commentText = datas.get_values('text')
                commentshijianchuo = datas.get_values('created_at')
                for commentTexts, commentshijianchuos in zip(commentText, commentshijianchuo):
                    timeArray = time.localtime(commentshijianchuos / 1000)
                    pubTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                    onlyId = pubTime + "==" + idd
                    insertdb(articleUrl, commentTexts, commentTexts, pubTime,
                             downloadTime, idd, onlyId)
        except Exception:
            import traceback
            traceback.print_exc()
def my_job():
    # Crawl Xiaohongshu home-feed categories: each category needs its matching pre-computed
    # X-Sign header value, then each note's detail page is fetched for text, images or video.
    params = {
        'category': '',
        'cursorScore': '',
        'geo': '',
        'page': '1',
        'pageSize': '20',
        'needGifCover': 'true',
    }
    sign = [
        'X5d78198d5d0cef0645770b2f36149fd3', 'X4a0e1b9cf3414aeba6c81ad21337c2b9',
        'X03343d8c8df2a30947ee7e42bf468ca0', 'X786d4f4ef4137adde81ee9394ec35a08',
        'Xe7f347761e83063a9d0e0ce4304d0d24', 'Xdabd969b6f1c5cd81e9bbe721b28cb8d',
        'X8cd165a614bca8b544143663b8cd21e6', 'X36a2b464223e4b844d08b32a651ae1ec',
        'X64bdf7ff5245d15254546197fb3050d9', 'Xd6f40de8649a2e56e61f4f6fb24a00ff',
        'Xdba75a472e1b8cacaa59575e61202466', 'X788007cccd927043edf01388e329d4a0',
        'X0e58ba9e7a452d89f71fe11e29867309', 'X6153230db274448e36b824352679e8ed',
        'Xc7d153c4590525c52db4fc27aa61c234', 'Xc51ba1a7039a7bf08d8f8ef3f2c144a3',
        'X78620a448dede01bade8cb9838f0b683', 'X09e67c58c2aa560eeb639de8c3e149ef',
        'X4c5ff33985471392765e3da35770b091',
    ]
    category = [
        'recommend_v2', 'homefeed.travel_v2', 'homefeed.mens_fashion_v2',
        'homefeed.fitness_v2', 'homefeed.movies_v2', 'homefeed.car_v2',
        'homefeed.digital_v2', 'homefeed.home_v2', 'homefeed.books_v2',
        'homefeed.food_v2', 'homefeed.skincare_v2', 'homefeed.music_v2',
        'homefeed.celebrities_v2', 'homefeed.fashion_v2', 'homefeed.pets_v2',
        'homefeed.baby_v2', 'homefeed.maternity_v2', 'homefeed.weddings_v2',
        'homefeed.cosmetics_v2',
    ]
    for sig, cat in zip(sign, category):
        headers['X-Sign'] = sig
        params['category'] = cat
        response = ss.get(
            'https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/homefeed/personalNotes',
            headers=headers, params=params)
        content = response.content.decode('utf-8')
        datas = GetValue2(content)
        id = datas.get_values('id')
        title = datas.get_values('title')
        type = datas.get_values('type')
        likes = datas.get_values('likes')
        for a, b, c, d in zip(id, title, type, likes):
            # A like count such as "1.2万" means 1.2 x 10,000; convert it to an integer.
            if '万' in str(d):
                num = d.replace('万', '')
                d = float(num) * 10000
                d = int(d)
            try:
                imgText = ''
                videoText = ''
                url = 'https://www.xiaohongshu.com/discovery/item/' + str(a)
                response = requests.get(url, headers=headerss)
                content = response.content.decode('utf-8')
                articleContent = re.compile('"description": "(.*?)",').findall(str(content))
                times = re.compile(
                    r'"time":"(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2})","type"'
                ).findall(str(content))
                pubTime = datetime.datetime.strptime(
                    times[0], '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S')
                if c == 'normal':
                    # Image note: rebuild absolute image URLs from the page body.
                    contentText = re.compile(r'<body>([\s\S]*?.)<h1').findall(str(content))
                    contentImg = re.compile(
                        r'<img src="//ci.xiaohongshu.com(.*?)\?').findall(str(contentText))
                    for i in contentImg:
                        imgText += "<img src='https://ci.xiaohongshu.com/" + str(i) + "'></br>"
                else:
                    # Video note: extract the video URL and unescape HTML-encoded ampersands.
                    content = response.content.decode('utf-8')
                    content = re.compile('<video src="(.*?)"').findall(str(content))
                    ac = content[0].encode('utf-8').decode('unicode_escape')
                    aa = str(ac)
                    aa = aa.replace('&amp;', '&')
                    videoText += '<video src="' + aa + '" controls="controls"></br>'
                contentText = imgText + videoText + articleContent[0]
                site = "小红书"
                siteId = 1048926
                data = []
                articleStatue = 0
                downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                data.append(InsertOne({
                    "url": url,
                    "title": b,
                    "pub_time": pubTime,
                    "content": contentText,
                    "download_time": downloadTime,
                    "site": site,
                    "site_id": siteId,
                    "aid": a,
                    "push_state": articleStatue,
                    "like_num": d,
                }))
                insertdb(data)
            except Exception:
                import traceback
                traceback.print_exc()
def article(number, pa, aa):
    # Crawl a user's profile timeline, store each tweet's text, then follow the next-page
    # cursor recursively; the ac counter appears intended as a pagination guard.
    ac = 0
    try:
        url = 'https://twitter.com/i/api/2/timeline/profile/' + number + '.json'
        time.sleep(5)
        daili()  # rotate the proxy before requesting
        response = ss.get(url, headers=headers, params=pa)
        content = response.content
        getobj = GetValue2(content)
        contenttext = getobj.get_values('tweets')
        if contenttext is None:
            pass
        else:
            lists = list(contenttext.values())
            for li in lists:
                imgs = ''
                contenttext = dict(li)
                contenttexts = GetValue2(contenttext)
                contenttextss = contenttexts.get_values('entities')
                videourl = re.compile("'video/mp4', 'url': '(.*?)'").findall(str(li))
                if videourl == []:
                    videourl = ''
                    if contenttextss is None:
                        print("error")
                    else:
                        contenttextss = GetValue2(contenttextss)
                        imgurl = contenttextss.get_values('media_url_https')
                        if imgurl is None:
                            print("error")
                        else:
                            if isinstance(imgurl, list):
                                for im in imgurl:
                                    imgs += "<br><img src='" + im + "'></img>"
                            else:
                                imgs += "<br><img src='" + imgurl + "'></img>"
                else:
                    videourl = "<br><video src='" + videourl[0] + "' controls></video>"
                contenttextss = contenttexts.get_values('full_text')
                user = contenttexts.get_values('conversation_id_str')
                times = contenttexts.get_values('created_at')
                print(user)
                tempTime = time.strptime(times, '%a %b %d %H:%M:%S +0000 %Y')
                resTime = time.strftime('%Y-%m-%d %H:%M:%S', tempTime)
                print(resTime)
                contentText = contenttextss + videourl + imgs
                print(contentText)
                data = []
                data.append(InsertOne({"title": contenttextss}))
                insertdb(data)
                ac = ac + 1
            if ac <= 5:
                # Fetch the page again to read the next cursor, then recurse with it.
                url = 'https://twitter.com/i/api/2/timeline/profile/' + number + '.json'
                response = ss.get(url, headers=headerss, params=pa)
                content = response.content
                page = re.compile('"value":"(.*?)",').findall(str(content))
                try:
                    page = page[1]
                except Exception:
                    import traceback
                    traceback.print_exc()
                par = dict(params)
                par['cursor'] = page
                article(number, par, aa)
    except Exception:
        import traceback
        traceback.print_exc()