class TWitter(Base, twython.Twython):
    """Twitter crawler built on twython/tweepy; persists tweets and user profiles via Base."""

    def __init__(self, args={}):
        # Base.__init__()
        super(TWitter, self).__init__()
        # The commented call and the super() call above are the two ways of invoking the base-class constructor.
        self.__consumer_key = 'c58jPuNxqLex5QttLkoVF621T'
        self.__consumer_secret = "qU2EfulVxZ9a9mSPVm0bww4HXDyC8qk4a2gQrq7bgy4dKOqfup"
        self.__access_token = "930249938012798978-BJCWSdIgciyVZ0IUKLXVXLlc1A3D2my"
        self.__access_secret = "HjDrf1nvRDZIT5NSXioGVeOeZoev26Ibi08hCBQMhMof4"
        # Initialise twython.Twython (the class after Base in the MRO) with the credentials.
        super(Base, self).__init__(self.__consumer_key, self.__consumer_secret,
                                   self.__access_token, self.__access_secret)
        auth = tweepy.OAuthHandler(self.__consumer_key, self.__consumer_secret)
        auth.set_access_token(self.__access_token, self.__access_secret)
        self.__flag = 'twitter'
        self.api = tweepy.API(auth)
        self.args = args
        # self.crawler_list_queue = RedisQueue(name='twitter_list', redis_config=redis_config)
        self.crawler_tweets_err_queue = RedisQueue(
            name='twitter_error', redis_config=self.app_config['redis_config'])
        self.crawler_replay_queue = RedisQueue(
            name='twitter_replay', redis_config=self.app_config['redis_config'])
        self.crawler_tweets_queue = RedisQueue(
            name='twitter', redis_config=self.app_config['redis_config'])
        self.twitter_users_queue = RedisQueue(
            name='twitter_users', redis_config=self.app_config['redis_config'])

    def fetch_user_tweets(self, user_id=None, deadline=None, current_max_id=None, bucket="timelines"):
        if not user_id:
            raise Exception("user_timeline: user_id cannot be None")
        prev_max_id = -1
        if not current_max_id:
            current_max_id = 0
        last_lowest_id = current_max_id  # used to work around users who have fewer than 200 tweets; one loop is enough
        cnt = 0
        retry_cnt = 5
        timeline = []
        while current_max_id != prev_max_id and retry_cnt > 1:
            try:
                if current_max_id > 0:
                    tweets = self.get_user_timeline(user_id=user_id, max_id=current_max_id - 1, count=20)
                else:
                    tweets = self.get_user_timeline(user_id=user_id, count=20)
                prev_max_id = current_max_id  # if no new tweets are found, prev_max_id will equal current_max_id
                # crawler_replay_list = []
                for tweet in tweets:
                    # print(tweet)
                    if deadline:
                        date = datetime.datetime.strptime(
                            tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
                        print(date)
                        deadline_panduan = datetime.datetime.strptime(
                            '%s +0000' % deadline, '%Y-%m-%d %z')
                        # print(deadline_panduan)
                        if (date - deadline_panduan).days <= 0:
                            break
                    # list = self.crawler_list_count(tweet['user']['screen_name'])
                    # tweet['list_num'] = list
                    tweet['site'] = 'twitter'
                    tweet['latest'] = 'true'
                    tweet['update_status'] = False
                    tweet['update_time'] = datetime.datetime.today()
                    # print('saving to mongo')
                    object_id = self.save(tweet)
                    # crawler_replay_list.append("https://twitter.com/%s/status/%s" % (tweet['user']['screen_name'], tweet['id_str']))
                    print('save %s ==> successfully' % object_id)
                    time_line = re.search(
                        r'\w{3}\sOct\s\d{2}\s\d{2}:\d{2}:\d{2}\s\+\d{4}\s2017',
                        tweet['created_at'])
                    if current_max_id == 0 or current_max_id > int(tweet['id']):
                        current_max_id = int(tweet['id'])
                # if len(crawler_replay_list) > 0:
                #     print(crawler_replay_list)
                #     self.crawler_replay_queue.put(crawler_replay_list)
                #     print("pushed %s items successfully" % len(crawler_replay_list))
                time.sleep(1)
                # no new tweets found
                if (prev_max_id == current_max_id):
                    print('finished crawling tweets for user %s' % user_id)
                    break
            except Exception as e:
                # print('<%s re-queued to the tweets queue>' % user_id)
                # self.crawler_tweets_err_queue.lput({"user_id": user_id, "current_max_id": current_max_id})
                # posts = self.get_mongod_client()
                # deleteObj = posts.delete_many({'id_str': user_id})
                # print('<cleared all tweets for user %s, count: %s>' % (user_id, deleteObj.deleted_count))
                # break
                print(e)
                raise e

    def crawler_list_count(self, user_screen_name=None, user_id=None):
        try:
            response = self.asynchronous_request("https://twitter.com/%s" % user_screen_name)
            _ = pq(response[0]['content'])
            tweet_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--tweets span.ProfileNav-value'
            ).attr('data-count')
            flowing_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--following span.ProfileNav-value'
            ).attr('data-count')
            followers_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--followers span.ProfileNav-value'
            ).attr('data-count')
            favorites_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--favorites span.ProfileNav-value'
            ).attr('data-count')
            list_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--lists span.ProfileNav-value'
            ).text()
            moment_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--moments span.ProfileNav-value'
            ).text()
            # print((tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count))
            list_count = list_count if list_count else 0
            moment_count = moment_count if moment_count else 0
            flowing_count = flowing_count if flowing_count else 0
            tweet_count = tweet_count if tweet_count else 0
            favorites_count = favorites_count if favorites_count else 0
            followers_count = followers_count if followers_count else 0
            # print(list_count)
            if (tweet_count, followers_count, flowing_count, favorites_count,
                    list_count, moment_count) == (0, 0, 0, 0, 0, 0):
                if _('.errorpage-body-content>h1').text():
                    print('error page, nothing to crawl')
                    return (0, 0, 0, 0, 0, 0)
                print('re-adding user to the queue')
                self.twitter_users_queue.lput(user_id)
            return (tweet_count, flowing_count, followers_count,
                    favorites_count, list_count, moment_count)
        except aiohttp.ClientError as e:
            print('re-adding user to the queue')
            self.twitter_users_queue.lput(user_id)
            return (None, None, None, None, None, None)
            # raise e
            # print(e)
            # return None, None

    def crawler_replay_num(self, urls):
        try:
            response = self.asynchronous_request(urls)
            result_list = []
            if response:
                for item in response:
                    # print(item)
                    try:
                        _ = pq(item['content'])
                        replay = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--reply.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        retweet = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--retweet.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        like = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--favorite.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        content = _(
                            'p.TweetTextSize.TweetTextSize--jumbo.js-tweet-text.tweet-text'
                        ).text().replace(
                            r'%s' % _('a.twitter-timeline-link.u-hidden').text(), '')
                        result_list.append({
                            "url": item['url'],
                            "reply_count": replay if replay else 0,
                            "retweet_count": retweet if retweet else 0,
                            "favorite_count": like if like else 0,
                            'content': content
                        })
                    except Exception as e:
                        print(e)
                        result_list.append({
                            "url": item['url'],
                            "reply_count": None,
                            "retweet_count": None,
                            "favorite_count": None,
                            'content': None
                        })
            return result_list
        except Exception as e:
            raise e
        # tweet['reply_count'] = reply_count
        # print(tweet['created_at'])

    def search_users(self, keyword=[], typeIndex=1):
        try:
            def handle(y):
                y = y._json
                if int(typeIndex) == 2:
                    y['searchBy'] = 'EnglishName'
                else:
                    y['searchBy'] = 'ChineseName'
                y['bySheet'] = self.args.sheet
                y['keywords'] = keyword[int(typeIndex)]
                # if (len(keyword) > 1):
                #     y['chinaName'] = keyword[1]
                #     y['englishName'] = keyword[2]
                return y

            userList = self.api.search_users(keyword[int(typeIndex)])
            users = list(map(handle, userList))
            if users:
                for somebody in users:
                    print(somebody)
                    id = super().save(somebody)
                    if (id):
                        print('save %s==> ok' % id)
            else:
                print('no data provided')
                # super().saveAsExcel(users, self.__flag, keyword)
        except Exception as e:
            logging.exception(e)

    def get_user_info(self, screen_name=None):
        user_info = self.show_user(screen_name=screen_name)
        id = self.save_user(doc=user_info, dbName='Twitter', collectionName='twitter')
        print('[=== %s stored successfully ===]' % id)
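
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): how a worker
# might drive the TWitter crawler above. It assumes valid API credentials and
# a reachable MongoDB/Redis backend configured by Base; the user id and
# screen name below are placeholders, not real accounts.
#
#   crawler = TWitter()
#   # Walk one account's timeline backwards, 20 tweets per request, stopping
#   # at tweets older than the deadline date (YYYY-MM-DD, compared in UTC).
#   crawler.fetch_user_tweets(user_id='1234567890', deadline='2017-01-01')
#   # Fetch and persist the account's profile document.
#   crawler.get_user_info(screen_name='example_screen_name')
# ---------------------------------------------------------------------------
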
class FaceBook(Base):
    """Facebook crawler: scrapes page timelines, reactions, and profile info; persists via Base."""

    def __init__(self, args={}):
        # Base.__init__()
        super(FaceBook, self).__init__()
        # The commented call and the super() call above are the two ways of invoking the base-class constructor.
        self.__username = "******"
        self.__password = "******"
        self.__access_token = "EAACEdEose0cBAAkdhoyXkFejburMPqbr7b773AxZCs7b1BORK7V2gUxVlmKkYydZCZBuyy4UcZA0QxThf7ii0tbDnsiCSzwFJ9DZAeGTcUCsGHQPTk7hPamWAZA2mN6IBjNXDsDQwwzrwet4h1piWTP5fuBnKjZCGm8ZCyXjCEWS7apZCoo1ZAuO5OBfoc9IDCgjSDfvc3pWKWGEPcICelHO456OUnZAxeDpLUZD"
        self.__flag = 'facebook'
        self.args = args
        self.crawler_tweets_err_queue = RedisQueue(name='facebook_error', redis_config=self.app_config['redis_config'])
        self.crawler_reactions_queue = RedisQueue(name='facebook_reactions', redis_config=self.app_config['redis_config'])
        self.crawler_tweets_queue = RedisQueue(name='facebook', redis_config=self.app_config['redis_config'])
        self.facebook_users_queue = RedisQueue(name='facebook_users', redis_config=self.app_config['redis_config'])

    def __reactions_handler(self, responseText=[]):
        # print(responseText)
        if not responseText or (len(responseText) <= 0):
            return None
        result_list = []
        for item in responseText:
            try:
                bs = bs4.BeautifulSoup(item['content'], 'html.parser')
                if not bs:
                    continue
                html = bs.select('script')
                share = re.search(r'sharecount:\d{1,}', str(html)).group() if re.search(r'sharecount:\d{1,}', str(html)) else "sharecount:0"
                likes = re.search(r'likecount:\d{1,}', str(html)).group() if re.search(r'likecount:\d{1,}', str(html)) else "likecount:0"
                comment = re.search(r'commentcount:\d{1,}', str(html)).group() if re.search(r'commentcount:\d{1,}', str(html)) else "commentcount:0"
                # print(str1)
                # comment = re.search(r'count:\d{1,}', str1).group()
                # print(share, likes, comment)
                share_count = re.search(r'\d{1,}', share).group() if re.search(r'\d{1,}', share) else 0
                likes_count = re.search(r'\d{1,}', likes).group() if re.search(r'\d{1,}', likes) else 0
                comment_count = re.search(r'\d{1,}', comment).group() if re.search(r'\d{1,}', comment) else 0
                result_list.append({
                    "url": item["url"],
                    "reactions": {
                        "share_count": share_count,
                        "likes_count": likes_count,
                        "comment_count": comment_count
                    }
                })
            except Exception as e:
                # raise e
                result_list.append({
                    "url": item['url'],
                    "reactions": []
                })
        return result_list

    def make_next_page_url(self, url, page_id, next_time, back_end=False):
        default_next_page_ma = '09223372036854775788'
        if back_end == 1:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%22profile_id%22%3A{3}%2C%22start%22%3A0%2C%22end%22%3A1517471999%2C%22query_type%22%3A36%2C%22filter%22%3A1%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=9&dpr=2&__user=0&__a=1&__req=j&__be=-1&__pc=EXP1:home_page_pkg&__rev=3574843".format(page_id, next_time, int(default_next_page_ma) - 9, page_id)
        elif back_end == 2:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%22profile_id%22%3A{3}%2C%22start%22%3A1483257600%2C%22end%22%3A1514793599%2C%22query_type%22%3A8%2C%22filter%22%3A1%2C%22filter_after_timestamp%22%3A1487694945%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=8&dpr=2&__user=0&__a=1&__dyn=5V8WXBzamaUmgDxKS5o9FE9XGiWGey8jrWo466ES2N6xucxu13wFG2LzEjyR88xK5WAAzoOuVWxeUPwExnBg4bzojDx6aCyVeFFUkgmxGUO2S1iyECQ3e4oqyU9ooxqqVEgyk3GEtgWrwJxqawLh42ui2G262iu4rGUpCx65aBy9EixO12y9E9oKfzUy5uazrDwFxCibUK8Lz-icK8Cx6789E-8HgoUhwKl4ykby8cUSmh2osBK&__req=22&__be=-1&__pc=EXP1%3Ahome_page_pkg&__rev=3576820".format(page_id, next_time, int(default_next_page_ma) - 9, page_id)
        elif back_end == 0:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=9&dpr=2&__user=0&__a=1&__req=j&__be=-1&__pc=EXP1:home_page_pkg&__rev=3574843".format(page_id, next_time, int(default_next_page_ma) - 9)

    def crawler_reactions_nums(self, url):
        try:
            content = self.asynchronous_request(url)
            return self.__reactions_handler(content)
        except Exception as e:
            raise e

    def crawler_user_likes(self, url, user_id=None):
        try:
            content = self.asynchronous_request(url)
            return_list = []
            for item in content:
                # print(item['content'])
                user_community = pq(item['content'])('._3xom').text()
                print(user_community)
                if user_community == '0':
                    return_list.append({
                        "url": item['url'],
                        "like_count": user_community,
                        "fan_count": user_community
                    })
                elif user_community == '':
                    return_list.append({
                        "url": item['url'],
                        "isLoginStatus": True,
                        "like_count": '0',
                        "fan_count": '0'
                    })
                else:
                    if (len(user_community)) > 1:
                        if re.search(r'\s万', user_community):
                            # '万' means 10,000: expand abbreviated counts before splitting.
                            likes_count, fan_count = tuple(user_community.replace(' 万', '0000').split(' '))
                        else:
                            likes_count, fan_count = tuple(user_community.split(' '))
                        return_list.append({
                            "url": item['url'],
                            "isLoginStatus": True,
                            "like_count": likes_count,
                            "fan_count": fan_count
                        })
                    else:
                        # likes_count, fan_count = tuple(user_community.split(' '))
                        return_list.append({
                            "url": item['url'],
                            "isLoginStatus": True,
                            "like_count": user_community,
                            "fan_count": 0
                        })
            return return_list
        except aiohttp.ClientError as e:
            print('re-adding user to the queue')
            self.facebook_users_queue.lput(user_id)
            return_list = []
            # if likes_count:
            #     people_likes_num = re.search(r'\d+,\d+,\d+', likes_count) if re.search(r'\d+,\d+,\d+', likes_count) else 0
            # else:
            #     people_likes_num = 0
            # print(people_likes_num)
            # print(likes_count)
            return return_list

    def timestamp_to_strtime(self, timestamp):
        local_str_time = datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%dT%H:%M:%S.000Z')
        return local_str_time

    def fetch_user_tweets(self, id=None, deadline='2017-01-01', urls=[]):
        flag = True
        back = 0
        while True:
            try:
                content = self.asynchronous_request(urls)
                if re.search(r'(/posts)', urls):
                    origin_html = content[0]['content']
                else:
                    origin = json.loads(content[0]['content'].decode()[9:])['domops']
                    origin_html = list(filter(lambda x: type(x) == dict, origin[0]))
                    origin_html = origin_html[0]['__html']

                def scrape(i, e):
                    return {
                        "name": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5 a').text(),
                        "create_at": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a>abbr').attr('data-utime'),
                        "last_untime": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a>abbr').attr('data-utime'),
                        "permalink_url": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a').attr('href'),
                        "message": pq(e)('div.userContent p').text() + pq(e)('div.mtm div.mbs>a').text()
                    }

                _ = pq(origin_html)
                tweets = list(_('div._4-u2._4-u8').map(scrape))
                if (len(tweets) == 0):
                    print('no data: tweets is empty')
                    break
                # print(tweets)
                tweet3 = []
                printFlag = True
                for x in filter(lambda x: x['create_at'], tweets):
                    # x['create_at'] = re.sub(r'[年月日\(\)金木水火土]', ' ', x['create_at'])
                    # if printFlag:
                    #     print(x['create_at'])
                    #     printFlag = False
                    # thisTime = x['create_at']
                    # thisTime = thisTime.replace(',', '')
                    # thisTime = thisTime.replace('at', '')
                    # if 'am' in thisTime:
                    #     thisTime = thisTime.replace('am', ' AM')
                    # if 'pm' in thisTime:
                    #     thisTime = thisTime.replace('pm', ' PM')
                    # if 'Surday' in thisTime:
                    #     thisTime = thisTime.replace('Surday', 'Saturday')
                    # x['create_at'] = datetime.strptime(thisTime, '%A %B %d %Y %H:%M %p').strftime('%Y-%m-%d %H:%M')
                    # x['create_at'] = datetime.strptime(thisTime, '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M')  # latest change
                    # x['create_at'] = datetime.strptime(x['create_at'], '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M')  # when running locally
                    x['create_at'] = self.timestamp_to_strtime(int(x['create_at']))
                    # print(x['create_at'])
                    tweet3.append(x)

                def dedupe(items, key=None):
                    seen = set()
                    for item in items:
                        val = item if key is None else key(item)
                        if val not in seen:
                            yield item
                            seen.add(val)

                tweet3 = list(dedupe(tweet3, key=lambda d: (d['name'], d['create_at'], d['last_untime'], d['permalink_url'], d['message'])))
                if len(tweet3) <= 1:
                    back = back + 1
                    urls = self.make_next_page_url(urls, id, tweet3[-1]['last_untime'], back_end=back)
                # crawler_reactions_list = []
                for item in tweet3:
                    # print(item)
                    item['site'] = 'facebook'
                    item['latest'] = 'true'
                    item['update_status'] = False
                    item['update_time'] = datetime.today()
                    item['user_id'] = id
                    item['permalink_url'] = 'https://facebook.com%s' % item['permalink_url']
                    if deadline and tweet3.index(item) != 0:
                        date = datetime.strptime(item['create_at'], '%Y-%m-%dT%H:%M:%S.000Z')
                        print(date)
                        deadline_panduan = datetime.strptime('%s' % deadline, '%Y-%m-%d')
                        print((date - deadline_panduan).days)
                        if (date - deadline_panduan).days <= 0:
                            flag = False
                            break
                    item['create_at'] = datetime.strptime(item['create_at'], '%Y-%m-%dT%H:%M:%S.000Z')
                    object_id = self.save(item)
                    # crawler_reactions_list.append({'url': item['permalink_url'], 'id': str(object_id)})
                    print('save %s ==> successfully' % object_id)
                # self.crawler_reactions_queue.put(crawler_reactions_list)
                print('fetched document count: %s' % len(tweet3))
                if not flag:
                    print("finished crawling this user's posts")
                    back = 0
                    break
            except Exception as e:
                # print('<%s re-queued to the posts queue>' % id)
                # self.crawler_tweets_err_queue.lput({'id': id, 'url': urls})
                # # posts = self.get_mongod_client()
                # # deleteObj = posts.delete_many({'user_id': id})
                # # print('<cleared all posts for user %s, count: %s>' % (id, deleteObj.deleted_count))
                # break
                raise e

    def searchUserInfo(self, keyword=[], typeIndex=1):
        print(keyword[typeIndex])
        self.__graph = facebook.GraphAPI(access_token=self.__access_token, version='2.10')
        kwInfo = self.__graph.search(type='page', q=keyword[int(typeIndex)])
        kInfos = kwInfo['data']
        if len(kInfos):
            for item in kInfos:
                res = self.__graph.get_object(item['id'], fields="name,id,current_location,birthday,category,fan_count,emails,hometown,link,location,website,likes.limit(3),new_like_count,about,description,verification_status")
                # friends = self.__graph.get_connections(id=item['id'],
                #                                        connection_name='friends')
                print(res['id'])
                res['keywords'] = keyword[int(typeIndex)]
                # if int(typeIndex) == 2:
                #     res['searchBy'] = 'EnglishName'
                # else:
                #     res['searchBy'] = 'ChineseName'
                res['bySheet'] = self.args.sheet
                # print(super().save(res))
                id = super().save(res)
                if (id):
                    print('save %s==> ok' % id)
        else:
            print('no data')
            # super().saveAsExcel([], self.__flag, kw)

    def login(self):
        try:
            driver = webdriver.Firefox(executable_path="/Users/suohailong/phantomjs/geckodriver")
            driver.get('https://www.facebook.com')
            driver.find_element_by_id('email').send_keys(self.__username)
            driver.find_element_by_id('pass').send_keys(self.__password)
            driver.find_element_by_id('login_form').submit()
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, "q"))
            )
            return driver
        except Exception as e:
            return False

    def getToken(self):
        facebookApiUrl = "https://developers.facebook.com/tools/explorer/145634995501895/?method=GET&path=168597536563870&version=v2.11"
        driver = self.login()
        if driver:
            driver.get(facebookApiUrl)
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="facebook"]/body/div[2]/div[2]/div/div/div/div[2]/div/div[2]/a'))
            )
            actions = action_chains.ActionChains(driver)
            actions.click(element).perform()
            # menu = driver.find_element_by_xpath('//div[@class="uiContextualLayer uiContextualLayerBelowLeft"]/div/div/ul')
            getUserTokenItem = driver.find_element_by_xpath('//div[@class="uiContextualLayer uiContextualLayerBelowLeft"]/div/div/ul/li[1]/a')
            getUserTokenItem.click()
            tokenButton = driver.find_element_by_xpath('//*[@id="facebook"]/body/div[8]/div[2]/div/div/div/div/div[3]/div/div/div[2]/div/div/button[1]')
            tokenButton.click()
            tokenIput = driver.find_element_by_xpath('//*[@id="facebook"]/body/div[2]/div[2]/div/div/div/div[2]/div/div[1]/label/input')
            self.__access_token = tokenIput.get_attribute('value')
            print(self.__access_token)
            driver.quit()
            return True
        else:
            return False

    def getPagePosts(self):
        pass

    def search_users(self, keyword='', typeIndex=1):
        try:
            print('current keyword: %s' % keyword)
            self.searchUserInfo(keyword, typeIndex)
        except Exception as e:
            if e.code == 190:
                print('access token has expired =====> re-fetching token!')
                while self.getToken():
                    self.searchUserInfo(keyword, typeIndex)
                    break
            # logging.exception(e)

    def get_user_info(self, url):
        content = self.asynchronous_request(url)
        origin_html = content[0]['content']
        # print(content)
        _ = pq(origin_html)
        # print(_('#content_container div.clearfix').text())
        id = re.search(r'PagesProfileAboutInfoPagelet_\d+', origin_html.decode())
        id = id.group()
        # The about page is rendered in a Chinese locale, so CJK characters are
        # stripped from the title and field labels are matched with Chinese patterns below.
        name = re.sub(
            r"[\u4E00-\u9FA5]|[\u3040-\u30FF\u31F0-\u31FF]|[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]|[-,.?:;\'\"!`]|(-{2})|(\.{3})|(\(\))|(\[\])|({})",
            '', _('#pageTitle').text())
        birthday = _('#content_container div.clearfix').text()
        website = _('#content_container div.clearfix').text()
        origin_str = _('#content_container div.clearfix').text()
        # print(origin_str)
        if re.search(r'(\d+)年(\d+)月(\d+)日', birthday):
            birthday = re.search(r'(\d+)年(\d+)月(\d+)日', birthday).group()
            birthday = re.sub(r'(\d+)年(\d+)月(\d+)日', r'\1-\2-\3', birthday)
            birthday = re.sub(
                r"[\u4E00-\u9FA5]|[\u3040-\u30FF\u31F0-\u31FF]|[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]|\s|[.?:;\'\"!`]|(-{2})|(\.{3})|(\(\))|(\[\])|({})",
                '', birthday)
        else:
            birthday = ''
        # Matches the escaped banner text "Facebook 已确认这是公众人物、媒体公司或品牌的真实主页"
        # ("Facebook has confirmed this is the authentic Page of this public figure, media company or brand").
        verified = re.search(
            r'Facebook \\u5df2\\u786e\\u8ba4\\u8fd9\\u662f\\u516c\\u4f17\\u4eba\\u7269\\u3001\\u5a92\\u4f53\\u516c\\u53f8\\u6216\\u54c1\\u724c\\u7684\\u771f\\u5b9e\\u4e3b\\u9875',
            origin_html.decode())
        if verified:
            verified = True
        else:
            verified = False
        item = self.crawler_user_likes(url.replace('/about', '') + '/community/')
        if re.search(r'((http|https)://)[\w1-9]+.[\w1-9]+.*', website):
            website = re.search(r'((http|https)://)[\w1-9]+.[\w1-9]+.[\w]+', website).group()
        else:
            website = ''
        user_id = self.save_user(doc={
            "id": re.search(r'\d+', id).group(),
            "birthday": birthday,
            "link": url.replace('/about', ''),
            'website': website,
            # 'about': re.search(r'简介 (\w+\s)+.', origin_str).group().replace('简介', '') if re.search(r'简介 (\w+\s)+.', origin_str) else '',
            'about': _('div.text_exposed_root').text(),
            'hometown': re.search(r'来自 (\S+\s)+简介', origin_str).group().replace('来自', '').replace('简介', '') if re.search(r'来自 (\S+\s)+简介', origin_str) else '',
            'name': name.replace("Facebook", "").replace('|', ''),
            'gender': re.search(r'性别 \S', origin_str).group().replace('性别', '') if re.search(r'性别 \S', origin_str) else '',
            'PoliticalViews': re.search(r'政治观点 \S+\s', origin_str).group().replace('政治观点', '') if re.search(r'政治观点 \S+\s', origin_str) else '',
            'ReligiousBeliefs': re.search(r'宗教信仰 \S+\s', origin_str).group().replace('宗教信仰', '') if re.search(r'宗教信仰 \S+\s', origin_str) else '',
            'category': re.search(r'categories \S+\s', origin_str).group().replace('categories', '') if re.search(r'categories \S+\s', origin_str) else '',
            'fan_count': item[0].get('fan_count', 0),
            'likes_num': item[0].get('like_count', 0),
            'verified': verified
        }, dbName='FaceBook', collectionName='facebook')
        print("[=== %s stored successfully ===]" % user_id)
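
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes the
# module-level imports used above and network access to Facebook; the page
# URL and page id are placeholders, not real pages.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    fb = FaceBook()
    # Scrape profile fields from a page's "about" tab and store them via Base.save_user().
    fb.get_user_info('https://www.facebook.com/examplepage/about')
    # Crawl the page's timeline back to the deadline date; `urls` is the page's
    # /posts URL for the first request, later requests are paginated internally
    # through make_next_page_url().
    fb.fetch_user_tweets(id='1234567890', deadline='2017-01-01',
                         urls='https://www.facebook.com/examplepage/posts/')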