def members_html(url):
    """Load *url* in a Facebook driver and return the resulting page source.

    A driver is obtained from ``driver_facebook()``, pointed at ``url`` and
    handed to ``execute_times(driver, 2000)`` (presumably scrolls the page so
    more members are loaded -- confirm against that helper) before the HTML
    is captured.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` after ``execute_times`` finished.
    """
    driver = driver_facebook()
    try:
        driver.get(url)
        execute_times(driver, 2000)
        html = driver.page_source
        # Debug aid: dump the captured page for offline inspection.
        # with open("group_members.html", "w", encoding="utf-8") as f:
        #     f.write(html)
    finally:
        # Always release the browser window, even when loading fails.
        driver.close()
    return html
def posts_index():
    """Capture the group's posts page, save it to disk and return its HTML.

    The page is loaded in a driver obtained from ``driver_facebook()`` and
    handed to ``execute_times(driver, 2000)`` (presumably scrolls to load
    more posts -- confirm against that helper). The captured source is
    written to ``posts_index.html`` in the current working directory.

    Returns:
        The page source that was written to ``posts_index.html``.
    """
    driver = driver_facebook()
    try:
        driver.get(
            "https://www.facebook.com/groups/southmongoliasupport//?ref=direct")
        time.sleep(2)
        execute_times(driver, 2000)
        posts_html = driver.page_source
    finally:
        # The original never released the driver; close it the same way
        # members_html does so browser windows do not pile up.
        driver.close()

    with open("posts_index.html", "w", encoding='utf-8') as f:
        f.write(posts_html)
    log('posts_html 写入文件夹')
    return posts_html
def main():
    """Capture one profile's friends page, save it and pass it to ``parse_index``.

    The page source is written to ``friends_all.html`` in the current working
    directory and then handed to ``parse_index``. The value returned by
    ``parse_index`` is not used here.
    """
    driver = driver_facebook()
    try:
        time.sleep(2)
        driver.get(
            'https://www.facebook.com/profile.php?id=100018160331338&lst=100005036989194%3A100018160331338%3A1529916881&sk=friends&source_ref=pb_friends_tl'
        )
        time.sleep(2)
        execute_times(driver, 70)
        html = driver.page_source
    finally:
        # The original never released the driver; close it once the page
        # source has been captured (or loading failed).
        driver.close()

    with open("friends_all.html", 'w', encoding='utf-8') as f:
        f.write(html)
    # Debug aid: re-use a previously saved capture instead of hitting the site.
    # with open("friends_all.html", 'r', encoding='utf-8') as f:
    #     html = f.read()

    # NOTE(review): the parsed result was bound to an unused local before;
    # the call is kept in case parse_index has side effects.
    parse_index(html)
def parse_url(url_dict):
    """Visit each profile link and store the extracted data in ``urun['test']``.

    For every entry the page is loaded, its source is passed to
    ``personal_data`` and the resulting fields are inserted as one document.
    Processing stops after the first successful insert whose index is 10 or
    higher. A failure on one entry is logged and the loop moves on.

    Args:
        url_dict: Iterable of mappings providing ``'link'`` and ``'name'``
            via ``.get`` (despite the name, it is iterated like a sequence).
    """
    driver = driver_facebook()
    try:
        for count, u in enumerate(url_dict):
            # To resume an interrupted run, skip processed entries here,
            # e.g. ``if count <= 100: continue``.
            try:
                link = u.get('link')
                name = u.get('name')
                log("begin name{}".format(name))
                driver.get(link)
                time.sleep(1)
                index_html = driver.page_source
                post = personal_data(index_html)
                post.account_name = name
                post.home_page = link
                urun['test'].insert(
                    {
                        "account_name": post.account_name,
                        'home_page': post.home_page,
                        'location': post.location,
                        'come_form': post.come_form,
                        "job": post.job,
                        'followers': post.followers,
                        "degree": post.degree,
                        "sex": post.sex,
                        "is_get": True
                    }
                )
                log("insert {} sucessful".format(post.account_name))
                # Random pause between profiles.
                time.sleep(randint(2, 5))
                # Only reached after a successful insert, so a failing entry
                # at index >= 10 does not end the run.
                if count >= 10:
                    break
            except Exception as e:
                # Best effort: log the failing index and carry on.
                log(count, e)
    finally:
        # The original never released the driver; close it on every exit
        # path, including the early break.
        driver.close()
def parse_members_url(url_dict):
    """Visit each member page and store the parsed profile in ``urun['test']``.

    For every entry the page is loaded, the pronoun in the add-friend /
    follow text is used to set ``sex``, and the lines of the intro block
    (``<div id="intro_container_id">``) are matched by keyword to fill
    ``degree``, ``location``, ``come_form``, ``followers`` and ``job``.
    Entries without an intro block are counted, logged and skipped. Any
    other failure on one entry is logged and the loop moves on.

    Args:
        url_dict: Iterable of mappings providing ``'url'`` and ``'name'``
            via ``.get`` (despite the name, it is iterated like a sequence).
    """
    driver = driver_facebook()
    error_count = 0
    try:
        for count, u in enumerate(url_dict):
            # To resume an interrupted run, skip processed entries here,
            # e.g. ``if count <= 10: continue``.
            link = u.get('url')
            name = u.get('name')
            # Fixed: the placeholders were never filled in before.
            log("begin {} : {}".format(count, name))
            try:
                driver.get(link)
                time.sleep(2)
                post = MembersData()
                index_html = driver.page_source

                # Prefer the add-friend text, fall back to the follow link.
                data_sex = re.findall(
                    r'"addFriendText".*?<', index_html) or re.findall(
                    r'<span class="FollowLink">.*?</span>', index_html)
                log(data_sex)
                if data_sex:
                    if '他' in data_sex[0]:
                        post.sex = 'man'
                    if "她" in data_sex[0]:
                        post.sex = "woman"

                profile = re.findall(
                    r'<div id="intro_container_id">.*?</ul></div>', index_html)
                if not profile:
                    error_count += 1
                    log("error {} : {} {}".format(error_count, count, link))
                    # Skip explicitly instead of relying on the IndexError
                    # that profile[0] used to raise.
                    continue

                intro = pq(profile[0])
                all_profile = intro.text()
                log(all_profile)
                for item in all_profile.split("\n"):
                    # Checks against '' assume MembersData starts these
                    # fields as empty strings -- TODO confirm.
                    if ("曾经" in item or '就读于' in item) and post.degree == '':
                        post.degree = item
                    elif "所在地" in item:
                        post.location = item
                    elif "来自" in item:
                        post.come_form = item
                    elif "粉丝" in item:
                        post.followers = item
                    elif "-" in item and post.job == '' and '曾经' not in item:
                        post.job = item

                post.account_name = name
                post.home_page = link
                log("post", post)
                urun['test'].insert({
                    "account_name": post.account_name,
                    'home_page': post.home_page,
                    'location': post.location,
                    'come_form': post.come_form,
                    "job": post.job,
                    'followers': post.followers,
                    "degree": post.degree,
                    "sex": post.sex,
                    "is_get": True,
                })
                log("insert {} sucessful".format(name))
            except Exception as e:
                # Best effort: log the failing entry and carry on.
                log(count, name, e)
    finally:
        # The original never released the driver; close it on every exit path.
        driver.close()