def posts_index():
    # Open the Facebook group, scroll repeatedly to load posts, then save the page source.
    driver = driver_facebook()
    driver.get("https://www.facebook.com/groups/southmongoliasupport/?ref=direct")
    time.sleep(2)
    execute_times(driver, 2000)
    posts_html = driver.page_source
    with open("posts_index.html", "w", encoding='utf-8') as f:
        f.write(posts_html)
    log('posts_html written to file')
    return posts_html
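# A minimal sketch of what execute_times is assumed to do: judging from the
# call above, it scrolls the page `times` times so Facebook lazy-loads more
# posts. The body below is an illustration under that assumption, not the
# project's actual helper.
def execute_times(driver, times):
    for _ in range(times):
        # Scroll to the bottom so the next batch of posts is requested.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)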
def personal_data(index_html):
    # Gender is inferred from the pronoun ("他"/"她") in the add-friend/follow snippet.
    data_sex = re.findall(r'"addFriendText".*?<', index_html) or re.findall(
        r'<span class="FollowLink">.*?</span>', index_html)
    log('data_sex', data_sex)
    post = PostData()
    if len(data_sex) != 0:
        if '他' in data_sex[0]:
            post.sex = 'man'
        if "她" in data_sex[0]:
            post.sex = "woman"
    profile = re.findall(r'<div id="intro_container_id">.*?</ul></div>', index_html)
    if not profile:
        global error_count
        error_count += 1
        log("error no. {}: profile not found".format(error_count))
        return post  # nothing more to parse without the intro block
    e = pq(profile[0])
    all_profile = e.text()
    list_profile = all_profile.split("\n")
    log('list_profile', list_profile)
    for item in list_profile:
        if ("曾经" in item or '就读于' in item) and post.degree == '':
            post.degree = item       # education
        elif "所在地" in item:
            post.location = item     # current city
        elif "来自" in item:
            post.come_form = item    # hometown
        elif "粉丝" in item:
            post.followers = item    # follower count
        elif "-" in item and post.job == '' and '曾经' not in item:
            post.job = item          # job, e.g. "Engineer - SomeCompany"
    log('post', post)
    return post
def run(self):
    # self.set_name()
    # while True:
    account_list = [
        '晚聊伴夜', '氢氪财经', '菲迪克智慧工程企业管理平台', '山西同乡群', '筱猫影视',
        '沈阳南动车运用所', '潇湘茶', '众智睿赢企业管理咨询有限公司', '微景相册', '书悦堂',
        '分享好宝贝', '民艺旅舍', '女王Dcup', '轻松定位美丽', '乐清市红辣椒越剧艺苑',
        '畅舞馆', '人禾健康产业', '常州格物斯坦机器人创客中心', '千秋妃子', '崇左航博'
    ]
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()
        backpack_list = []
        entity = None
        for page_count, url in enumerate(urls_article):
            # if page_count < 35:
            #     continue
            article = Article()
            article.create(url, self.name)
            log('article title:', article.title)
            log("item {}".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            import pymongo
            conn = pymongo.MongoClient('mongo')  # created but not used below
            # Upload to the database.
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (article.url, datetime.datetime.now(), entity.account,
                      entity.account_id, entity.author, entity.id, entity.title)
            uploads_mysql(config_mysql, sql, _tuple)
            # if page_count == 5:
            #     break
        log("sending payload")
        if entity:
            entity.uploads(backpack_list)
def run(self):
    # self.set_name()
    # while True:
    account_list = ['有看投', ]
    entity = None
    backpack_list = []
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()
        # account.account_id = 126774646
        for page_count, url in enumerate(urls_article):
            # if page_count < 35:
            #     continue
            article = Article()
            article.create(url, account)
            log('article title:', article.title)
            log("item {}".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            import pymongo
            conn = pymongo.MongoClient('mongo')  # created but not used below
            # Upload to the database.
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (
                article.url, datetime.datetime.now(), entity.account, entity.account_id,
                entity.author, entity.id, entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
            if page_count == 4:
                break
    log("sending payload")
    if entity:
        # entity.uploads(backpack_list)
        # entity.uploads_datacenter_relay(backpack_list)
        entity.uploads_datacenter_unity(backpack_list)
    print('end')
def run(self):
    # self.set_name()
    # while True:
    account_list = global_account_list or ['刀口谈兵', ]
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        # account.get_account_id()
        account.account_id = global_account_id or 126776905
        entity = None
        backpack_list = []
        for page_count, url in enumerate(urls_article):
            if page_count == 0:  # skip the first link
                continue
            article = Article()
            article.create(url, self.name)
            log('article title:', article.title)
            log("item {}".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Upload to the database.
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (article.url, datetime.datetime.now(), entity.account,
                      entity.account_id, entity.author, entity.id, entity.title)
            uploads_mysql(config_mysql, sql, _tuple)
            if page_count == 5:
                break
        log("sending payload")
        if entity:
            entity.uploads(backpack_list)
            # entity.uploads_datacenter(backpack_list)
    print('end')
def account_homepage(self):
    # Search for the official account and open its homepage.
    search_url = self.url.format(self.name)
    resp_search = self.s.get(search_url, headers=self.headers, cookies=self.cookies)
    if 'class="b404-box" id="noresult_part1_container"' in resp_search.text:
        log("official account not found: {}".format(self.name))
        return
    e = pq(resp_search.text)
    if e(".tit").eq(0).text() == self.name:
        account_link = e(".tit").find('a').attr('href')
    elif len(e(".tit").eq(0).text()) > 1:
        log("could not match the exact official account: {}".format(self.name))
        return
    else:
        # Handle the captcha page.
        self.crack_sougou(search_url)
        print("verification done")
        # TODO: decide whether skipped accounts should be re-crawled (happens ~4 times)
        return
    # "微信号:" is the "WeChat ID:" label on the search page.
    account_match = re.search(r'微信号:\w*', e.text())
    account_search = account_match.group().replace('微信号:', '') if account_match else ''
    homepage = self.s.get(account_link, cookies=self.cookies)
    if '<title>请输入验证码 </title>' in homepage.text:  # captcha page title
        print("captcha appeared")
        from verification_code import captch_upload_image
        print('------ handling WeChat captcha ------')
        cert = random.random()
        image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
        response = self.s.get(image_url)
        captch_input = captch_upload_image(response.content)
        print('------ captcha: {} ------'.format(captch_input))
        data = {
            'cert': cert,
            'input': captch_input
        }
        response = self.s.post(image_url, data=data, cookies=self.cookies)
        cookies = requests.utils.dict_from_cookiejar(response.cookies)
        print('cookies after captcha:', cookies)
        homepage = self.s.get(account_link, cookies=self.cookies)
        print('after cracking the captcha')
    # The account appears on both the search page and the homepage; use whichever was found.
    account = pq(homepage.text)('.profile_account').text().replace('微信号: ', '')
    return homepage.text, account or account_search
def parse_posts_html(posts_html):
    log("begin parse_posts_html")
    e = pq(posts_html)
    results = e(".userContentWrapper")
    for count, item in enumerate(results):
        item = pq(item)
        post = PostData()
        post.account_name = item(".fwb").text()
        post.content = item(".userContent").text()
        post.time = item(".timestampContent").text().split(" ")[0]
        post.praise = item('._ipn')("._3t54").text().split("\n")[-1][:1]
        post.share = item(".UFIShareLink").text()
        # Timestamps look like "2017年6月3日", or "6月3日" with the year implied.
        if '年' in post.time:
            post.year = post.time.split("年")[0]
        else:
            post.year = 2018
        if '月' in post.time and '年' in post.time:
            post.month = post.time.split("年")[-1].split('月')[0]
        elif '月' in post.time:
            post.month = post.time.split('月')[0]
        post.day = post.time.split("月")[-1]
        post_dict = post.obj_to_dict()
        log("result parse_posts_html{}".format(post_dict))
        urun['post'].insert(post_dict)
        log('insert {} success'.format(post.account_name))
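# Worked example of the date split above (values exactly as the code leaves them):
#   post.time == '2017年6月3日'  ->  year '2017', month '6', day '3日'
#   post.time == '6月3日'        ->  year 2018 (default), month '6', day '3日'
# Note the day field keeps the trailing '日' character.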
def parse_url(url_dict):
    driver = driver_facebook()
    for count, u in enumerate(url_dict):
        # if count <= 100:
        #     continue
        try:
            link = u.get('link')
            name = u.get('name')
            log("begin name {}".format(name))
            driver.get(link)
            time.sleep(1)
            index_html = driver.page_source
            post = personal_data(index_html)
            post.account_name = name
            post.home_page = link
            urun['test'].insert({
                "account_name": post.account_name,
                'home_page': post.home_page,
                'location': post.location,
                'come_form': post.come_form,
                "job": post.job,
                'followers': post.followers,
                "degree": post.degree,
                "sex": post.sex,
                "is_get": True
            })
            log("insert {} successful".format(post.account_name))
            time.sleep(randint(2, 5))
            if count >= 10:
                break
        except Exception as e:
            log(count, e)
            continue
def run(self):
    name_list = self.get_name()
    for name in name_list:
        # name = '大鼎豫剧'
        log('start {}'.format(name))
        self.name = name
        _tuple = self.account_homepage(name)
        # Skip accounts the search cannot find.
        if _tuple:
            html, _account = _tuple
        else:
            log('not find {}'.format(self.name))
            continue
        # All article links.
        items = re.findall('"content_url":".*?,"copyright_stat"', html)
        entity = None
        backpack_list = []
        for page_count, item in enumerate(items):
            url_last = item[15:-18].replace('amp;', '')
            url = 'https://mp.weixin.qq.com' + url_last
            article = Article()
            article.create(url)
            if article.is_share is True:
                continue
            log("catch {}".format(article.title))
            account = Account()
            # Account data read from file differs from what source search returns.
            account.name = article.author
            account.account = article.account
            account.get_account_id()
            entity = JsonEntity(article, account)
            backpack = Backpack()
            # The article may be a share/repost.
            # try:
            backpack.create(entity)
            # except Exception as e:
            #     log(e)
            #     continue
            backpack_list.append(backpack.create_backpack())
            # Upload to the database.
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (
                article.url, datetime.datetime.now(), entity.account, entity.account_id,
                entity.author, entity.id, entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
            # if page_count == 30:
            #     break
        log('catch {} successful, {} articles in total'.format(self.name, page_count))
        log("sending payload")
        if entity:
            entity.uploads(backpack_list)
            log("uploads successful")
def run(self):
    # self.set_name()
    # while True:
    account_list = ['大数据发布', '上海港湾集团', '绿盟365', '酌梦录', '瞄了个喵', '豪德通讯',
                    '魔都娱乐1', '大侠的小宇宙', '澳洲梦', '盛世路跑', '佛系金融女', '中卫今日热点',
                    '金华社区居委会', '昕说法', '华农海洋研会', '尘埃一生', '革镇堡街道普法',
                    '速度车行', '七分钟高清视频', '摘星少女酱', '青海省格尔木市健桥医院', '乐用好车',
                    '最强省钱喵喵君', '石柱港航', '荣盛物业长沙花语馨苑客服中心', '汕头超声集团',
                    '中奥吴郡半岛', '隽永人生', '飞鸿影视传媒', 'RGSE义乌雨具遮阳及防护用品展']
    articles = []
    for name in account_list:
        if len(name) == 0:
            continue
        self.name = name
        ID = hash_md5(self.name)  # key for this account's articles
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()
        backpack_list = []
        entity = None
        for page_count, url in enumerate(urls_article):
            # if page_count < 35:
            #     continue
            article = Article()
            article.create(url, self.name)
            log('article title:', article.title)
            log("item {}".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Collect every article, keyed by the account hash.
            article_info = backpack.to_dict()
            articles.append({ID: article_info})
            # Upload to the database.
            import pymongo
            conn = pymongo.MongoClient('120.78.237.213', 27017)  # created but not used below
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (
                article.url, datetime.datetime.now(), entity.account, entity.account_id,
                entity.author, entity.id, entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
            # if page_count == 5:
            #     break
        log("sending payload")
        if entity:
            entity.uploads(backpack_list)
    'db': MYSQL_DATABASE,
}
db = pymysql.connect(**config_mysql)
cursor = db.cursor()
cursor_save = db.cursor()
cursor.execute('select * FROM imagefail_header_url')
count = 0
urls = cursor.fetchmany(5776)  # first batch is discarded, presumably rows already downloaded
urls = cursor.fetchmany(2000)  # then take the next 2000
proxies = {"https": "http://localhost:1080", }
for index, url_tuple in enumerate(urls):
    numb, post_id, site, url = url_tuple
    id = post_id
    try:
        if not url:
            continue
        print(url)
        resp = requests.get(url, proxies=proxies)
        with open('img_header/{}.png'.format(post_id), 'wb') as f:
            f.write(resp.content)
        test['save_img_mysql'].insert({'id': numb, 'header_url': url, 'blogger_id': id})
        # print('save {}'.format(id))
        log('round {}: saved {} {}'.format(index, numb, id))
    except Exception as e:
        log(e)
        log('=============')
        log(index, post_id)
from setting import log

logging = log()


class InfoBulider:
    def __init__(self, postion, city):
        self.__postion = postion
        self.__city = city

    def urlbulider(self):
        # `city` is expected to already be a query-string fragment, e.g. "city=北京".
        url = 'https://www.lagou.com/jobs/positionAjax.json?{}&needAd' \
              'dtionalResult=false'.format(self.__city)
        return url

    def headersbulider(self):
        headers = {
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&s'
                       'uginput='.format(self.__postion),
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like'
                          ' Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': None,
            'X-Requested-With': 'XMLHttpRequest'
        }
        return headers

    def databulider(self, page):
        # Incomplete: only logs for now.
        logging.info('data bulider working')
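# A minimal usage sketch for the builder above, assuming databulider is
# eventually meant to return Lagou's Ajax form payload. The form fields
# first/pn/kd (first-page flag, page number, keyword) and the example
# arguments are assumptions, not confirmed by this file.
import requests

builder = InfoBulider('python', 'city=%E5%8C%97%E4%BA%AC')  # hypothetical arguments
resp = requests.post(builder.urlbulider(),
                     headers=builder.headersbulider(),
                     data={'first': 'true', 'pn': 1, 'kd': 'python'})  # assumed payload
print(resp.json())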
def account_homepage(self):
    # Search for the official account and open its homepage.
    count = 0
    while True:
        search_url = self.url.format(self.name)
        resp_search = self.s.get(search_url, headers=self.headers, cookies=self.cookies)
        if '相关的官方认证订阅号' in resp_search.text:
            log("official account not found: {}".format(self.name))
            return
        e = pq(resp_search.text)
        if self.name in e(".tit").eq(0).text():
            account_link = e(".tit").find('a').attr('href')
        elif len(e(".tit").eq(0).text()) > 1:
            log("could not match the exact official account: {}".format(self.name))
            return
        else:
            log(search_url)
            # log(resp_search.text)
            log('cookies before verification', self.cookies)
            try_count = 0
            while True:
                try_count += 1
                self.crack_sougou(search_url)
                if '搜公众号' in self.driver.page_source:  # search box is back, page is normal
                    log('------ updating cookies ------')
                    cookies = self.driver.get_cookies()
                    new_cookie = {}
                    for items in cookies:
                        new_cookie[items.get('name')] = items.get('value')
                    self.cookies = new_cookie
                    log('------ cookies updated ------', self.cookies)
                    break
                elif try_count > 6:
                    log("browser verification failed")
                    break
            log("verification done")
            time.sleep(2)
            # TODO: decide whether skipped accounts should be re-crawled (happens ~4 times)
            continue
        # "微信号:" is the "WeChat ID:" label on the search page.
        account_match = re.search(r'微信号:\w*', e.text())
        account_search = account_match.group().replace('微信号:', '') if account_match else ''
        homepage = self.s.get(account_link, cookies=self.cookies)
        if '<title>请输入验证码 </title>' in homepage.text:  # captcha page title
            print("captcha appeared")
            print('------ handling WeChat captcha ------')
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            response = self.s.get(image_url)
            captch_input = captch_upload_image(response.content)
            print('------ captcha: {} ------'.format(captch_input))
            data = {
                'cert': cert,
                'input': captch_input
            }
            response = self.s.post(image_url, data=data, cookies=self.cookies)
            cookies = requests.utils.dict_from_cookiejar(response.cookies)
            print('cookies after captcha:', cookies)
            homepage = self.s.get(account_link, cookies=self.cookies)
            print('after cracking the captcha')
        # The account appears on both the search page and the homepage; use whichever was found.
        account = pq(homepage.text)('.profile_account').text().replace('微信号: ', '')
        return homepage.text, account or account_search
def crack_sougou(self, url):
    log('------ handling failed URL: {} ------'.format(url))
    if re.search(r'weixin\.sogou\.com', url):
        log('------ handling Sogou captcha ------')
        self.driver.get(url)
        time.sleep(2)
        if '搜公众号' in self.driver.page_source:
            for i in range(30):
                self.driver.get(url)
                log('browser page looks normal')
                if '搜公众号' not in self.driver.page_source:
                    break
        try:
            img = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeImage')))
            log('------ captcha page appeared ------')
            # Crop the captcha image out of a full-page screenshot.
            location = img.location
            size = img.size
            left = location['x']
            top = location['y']
            right = location['x'] + size['width']
            bottom = location['y'] + size['height']
            screenshot = self.driver.get_screenshot_as_png()
            screenshot = Image.open(BytesIO(screenshot))
            captcha = screenshot.crop((left, top, right, bottom))
            captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
            captcha.save(captcha_path)
            with open(captcha_path, "rb") as f:
                filebytes = f.read()
            captch_input = captch_upload_image(filebytes)
            log('------ captcha: {} ------'.format(captch_input))
            if captch_input:
                input_text = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeInput')))
                input_text.clear()
                input_text.send_keys(captch_input)
                submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
                submit.click()
                time.sleep(2)
                try:
                    if '搜公众号' not in self.driver.page_source:
                        log('verification failed')
                        return
                    log('------ captcha correct ------')
                except:
                    log('------ captcha input wrong ------')
        except Exception as e:
            log('------ no captcha page; landed on the homepage, ignoring ------')
    elif re.search(r'mp\.weixin\.qq\.com', url):
        log('------ handling WeChat captcha ------')
        cert = random.random()
        image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
        response = self.s.get(image_url, cookies=self.cookies)
        captch_input = captch_upload_image(response.content)
        log('------ captcha: {} ------'.format(captch_input))
        data = {
            'cert': cert,
            'input': captch_input
        }
        self.s.post(image_url, cookies=self.cookies, data=data)
        log('------ cookies updated ------')
    return config_mysql


if __name__ == '__main__':
    mysql_params = mysql_localhost()
    import pymysql
    db = pymysql.connect(**mysql_params)
    cursor = db.cursor()
    cursor.execute("SELECT * FROM `twfb_copy` where Url like '%facebook%';")
    # 9-19 1145 UID Newmacau https://zh-hk.facebook.com/Newmacau/
    items = cursor.fetchmany(1114)  # first batch is discarded, presumably rows already handled
    items = cursor.fetchall()
    driver = webdriver.Chrome()
    for count, item in enumerate(items):
        log('round {}, '.format(count))
        print(item)
        uid = item[0]
        url = item[2]
        if ('https' not in url) or ('小麗民主教室' in url):
            continue
        # url = 'https://www.facebook.com/pg/dokul1988/about/?ref=page_internal'
        driver.get(url)
        choice = ''
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'inputtext')))
        try:
            # Group page, e.g. https://www.facebook.com/theeclecticsbasement/
            driver.find_element_by_css_selector('._64-f')
            info = group(driver)
cursor.execute('select * FROM imagefail_header_url_copy')
count = 0
urls = cursor.fetchmany(475)  # first batch is discarded, presumably rows already downloaded
urls = cursor.fetchmany(500)
proxies = {
    "https": "http://localhost:1080",
}
for index, url_tuple in enumerate(urls):
    numb, post_id, site, url = url_tuple
    id = post_id
    try:
        if not url:
            continue
        print(url)
        resp = requests.get(url, proxies=proxies)
        with open('img_mysql/{}.png'.format(post_id), 'wb') as f:
            f.write(resp.content)
        test['save_img_mysql'].insert({
            'id': numb,
            'header_url': url,
            'blogger_id': id
        })
        # print('save {}'.format(id))
        log('{} save {}'.format(index, id))
    except Exception as e:
        log(e)
        log('=============')
        log(index, post_id)
def uploads(self, backpack_list):
    if backpack_list:
        sever1 = 'http://115.231.251.252:26016/'
        sever2 = 'http://60.190.238.168:38015/'
        body = json.dumps(backpack_list)
        # Retry up to three rounds to make sure the batch is delivered.
        count = 0
        while True:
            if count > 2:
                break
            try:
                log('start uploads')
                r = requests.post(sever1, data=body)
                if r.status_code == 200:
                    log('uploads server1 successful')
            except Exception as e:
                log('uploads http error1', e)
            try:
                r2 = requests.post(sever2, data=body)
                if r2.status_code == 200:
                    log('uploads server2 successful')
                    break
            except Exception as e:
                log('uploads http error2', e)
            count += 1
        log('uploads over')
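# Usage sketch (names as used in the run() methods above): one dict per
# article is collected, then the whole batch is sent in a single POST.
# Note the retry loop only exits early when server2 answers 200; a round
# where only server1 succeeds is still retried.
#
#     backpack_list.append(backpack.create_backpack())
#     ...
#     entity.uploads(backpack_list)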
import requests

from setting import test, log

# proxies = { 'https': "http//localhost:1080"}
proxies = {
    "https": "http://localhost:1080",
}
for index, item in enumerate(test['img_herder_merge'].find()):
    # blogger_id = item.get('url').split('com/')[-1]
    try:
        blogger_id = item.get('blogger_id')
        url = item.get('header_url')
        id = item.get('id')
        # if int(id) < 5976:
        #     continue
        # print(id, url)
        log(id, url)
        if not url:
            test['save'].insert({'id': id, 'header_url': ''})
            continue
        resp = requests.get(url, proxies=proxies)
        with open('img_header_6000/{}.png'.format(blogger_id), 'wb') as f:
            f.write(resp.content)
        test['save_img'].insert({'id': id, 'header_url': url})
        # print('save {}'.format(id))
        log('{} save {}'.format(index, id))
    except Exception as e:
        log(e)
        log('=============')
        log(id, blogger_id)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from setting import log

driver = webdriver.Chrome()

# Repeatedly load the same post, read its author, and count the rounds.
count = 0
while True:
    url = 'https://www.facebook.com/permalink.php?story_fbid=2163264660369593&id=188533647842714'
    driver.get(url)
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "fwn")))
    Author = driver.find_element_by_class_name('fwb').text
    count += 1
    print(count)
    log(count)
header_url_person = ''
header_url_group = ''
try:
    try:
        # Personal profile avatar.
        header_url_person_div = driver.find_element_by_css_selector(
            '.scaledImageFitWidth.img')
        header_url_person = header_url_person_div.get_attribute('src')
    except Exception as e:
        pass
    try:
        # Group page avatar.
        header_url_group_div = driver.find_element_by_css_selector(
            '._4jhq.img')
        header_url_group = header_url_group_div.get_attribute('src')
    except Exception as e:
        pass
except Exception as e:
    log(e)
    log('not find')
try:
    header_url = header_url_person or header_url_group
    d = {
        'id': numb,
        'url': url,
        'header_url': header_url,
        'post_id': post_id,
    }
    test['img_header_url_10000'].insert(d)
    count += 1
    log('{} saved'.format(numb))
except Exception as e:
def parse_members_url(url_dict):
    driver = driver_facebook()
    error_count = 0
    for count, u in enumerate(url_dict):
        # if count <= 10:
        #     log("skip {} {}".format(count, u.get('name')))
        #     continue
        link = u.get('url')
        name = u.get('name')
        log("begin {} : {}".format(count, name))
        try:
            driver.get(link)
            time.sleep(2)
            post = MembersData()
            index_html = driver.page_source
            # Gender is inferred from the pronoun in the add-friend/follow snippet.
            data_sex = re.findall(
                r'"addFriendText".*?<', index_html) or re.findall(
                r'<span class="FollowLink">.*?</span>', index_html)
            log(data_sex)
            if data_sex != []:
                if '他' in data_sex[0]:
                    post.sex = 'man'
                if "她" in data_sex[0]:
                    post.sex = "woman"
            profile = re.findall(
                r'<div id="intro_container_id">.*?</ul></div>', index_html)
            if not profile:
                error_count += 1
                log("error {} : {} {}".format(error_count, count, link))
                continue  # nothing to parse without the intro block
            e = pq(profile[0])
            all_profile = e.text()
            log(all_profile)
            list_profile = all_profile.split("\n")
            for item in list_profile:
                if ("曾经" in item or '就读于' in item) and post.degree == '':
                    post.degree = item       # education
                elif "所在地" in item:
                    post.location = item     # current city
                elif "来自" in item:
                    post.come_form = item    # hometown
                elif "粉丝" in item:
                    post.followers = item    # follower count
                elif "-" in item and post.job == '' and '曾经' not in item:
                    post.job = item          # job
            post.account_name = name
            post.home_page = link
            log("post", post)
            urun['test'].insert({
                "account_name": post.account_name,
                'home_page': post.home_page,
                'location': post.location,
                'come_form': post.come_form,
                "job": post.job,
                'followers': post.followers,
                "degree": post.degree,
                "sex": post.sex,
                "is_get": True,
            })
            log("insert {} successful".format(name))
            # if count >= 20:
            #     break
        except Exception as e:
            log(count, name, e)
            continue