class Mention():
    """Scrape the posts that mention the logged-in account.

    ``Launcher`` performs the login and supplies both the browser driver and
    the list of post URLs.  ``get_mention`` visits every URL and builds one
    record per ``contentArea`` container; ``save`` forwards records to the
    ``Es_fb`` client.
    """

    # Each tuple lists the XPath for the primary page layout first, followed
    # by the fallback that is tried when the primary one fails.
    _AUTHOR_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a',
        './div/div/div/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a',
    )
    _PICTURE_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/a/div/img',
        './div/div/div/div/div/div/div[2]/div/div[2]/div/div/a/div/img',
    )
    _TIME_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/span/a/abbr',
        './div/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr',
    )
    _CONTENT_XPATH = './div/div/div/div/div/div/div[2]/div/div[2]/div[2]/p'

    # Compiled once; the raw string avoids the invalid "\d" escape sequence.
    _USER_ID_RE = re.compile(r'id=(\d+)')

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of mention URLs."""
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.mention_list = self.launcher.get_mention_list()
        self.es = Es_fb()
        self.list = []

    @staticmethod
    def _first_value(node, xpaths, extract):
        """Return ``extract(element)`` for the first XPath that works.

        Both locating the element and extracting the value count as an
        attempt, so a failing extraction moves on to the next XPath.  When
        every attempt fails the last error is re-raised.  Only ``Exception``
        is caught, so Ctrl-C and interpreter shutdown are never mistaken for
        "layout not found".
        """
        last_error = None
        for xpath in xpaths:
            try:
                return extract(node.find_element_by_xpath(xpath))
            except Exception as error:
                last_error = error
        raise last_error

    @classmethod
    def _user_id(cls, anchor):
        """Join every ``id=<digits>`` match found in the anchor's hovercard."""
        return ''.join(
            cls._USER_ID_RE.findall(anchor.get_attribute('data-hovercard')))

    def get_mention(self):
        """Visit every mention URL and append one record per post container.

        Returns:
            ``self.list``, a list of dicts with the keys ``nick_name``,
            ``uid``, ``photo_url``, ``timestamp`` and ``text``.

        Raises:
            Whatever the fallback lookup raised when neither layout matches
            for the author, picture or timestamp of a post.
        """
        for url in self.mention_list:
            print(url)
            self.driver.get(url)
            for each in self.driver.find_elements_by_xpath(
                    '//div[@id="contentArea"]'):
                author_name = self._first_value(
                    each, self._AUTHOR_XPATHS, lambda anchor: anchor.text)
                author_id = self._first_value(
                    each, self._AUTHOR_XPATHS, self._user_id)
                pic_url = self._first_value(
                    each, self._PICTURE_XPATHS,
                    lambda img: img.get_attribute('src'))
                ti = self._first_value(
                    each, self._TIME_XPATHS,
                    lambda abbr: int(abbr.get_attribute('data-utime')))
                # The post text is optional; the string 'None' is the
                # placeholder the stored records already use.
                try:
                    content = each.find_element_by_xpath(
                        self._CONTENT_XPATH).text
                except Exception:
                    content = 'None'
                self.list.append({
                    'nick_name': author_name,
                    'uid': author_id,
                    'photo_url': pic_url,
                    'timestamp': ti,
                    'text': content,
                })
        return self.list

    def save(self, indexName, typeName, list):
        """Forward ``list`` to ``executeES`` under the given index and type."""
        self.es.executeES(indexName, typeName, list)
# NOTE(review): this line is a truncated fragment -- it starts in the middle of
# an expression (the closing `).text` of a lookup whose beginning is not
# visible), so the enclosing loop/method header cannot be seen from here.
# What is visible: the fragment reads the author id, picture URL, post time
# and post text from `each`, falling back to the string 'None' for the text
# only; `save` forwards its arguments to `executeES` on a module-level `es`;
# and the __main__ guard builds a Launcher, an es_twitter client and a Mention.
# NOTE(review): `time = ...` rebinds the name `time` inside the fragment --
# confirm it does not hide the `time` module where this code runs.
# NOTE(review): Mention() is called without arguments here, while the Mention
# classes elsewhere in this file take (username, password) -- confirm.
# NOTE(review): the two string literals passed to Launcher look like account
# credentials -- confirm, and load them from configuration instead of source.
).text author_id = ''.join( re.findall( re.compile('id=(\d+)'), each.find_element_by_xpath( './div/div[3]/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/a' ).get_attribute('data-hovercard'))) pic_url = each.find_element_by_xpath( './div/div[3]/div/div/div/div/div/div[2]/div/div/a/div/img' ).get_attribute('src') time = each.find_element_by_xpath( './div/div[3]/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr' ).get_attribute('data-utime') try: content = each.find_element_by_xpath( './div/div[3]/div/div/div/div[2]/div/div[2]/div[2]/div/div/p' ).text except Exception as e: content = 'None' def save(self, indexName, typeName, item): es.executeES(indexName, typeName, item) if __name__ == '__main__': fb = Launcher('18538728360', 'zyxing,0513') es = es_twitter() mention_list = fb.get_mention_list() mention = Mention() mention.get_mention()
class Mention():
    """Scrape the posts that mention the logged-in account.

    ``Launcher`` performs the login and supplies both the browser driver and
    the list of post URLs.  ``get_mention`` visits every URL and builds one
    record per ``contentArea`` container; ``save`` forwards records to the
    ``Es_fb`` client.
    """

    # Seconds to wait after each navigation before reading the page.
    # Override on the class or on an instance to tune it.
    PAGE_LOAD_DELAY = 1

    _POPUP_XPATH = '//div[@class="_n8 _3qx uiLayer _3qw"]'

    # Each tuple lists the XPath for the primary page layout first, followed
    # by the fallback that is tried when the primary one fails.
    _AUTHOR_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a',
        './div/div/div/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a',
    )
    _PICTURE_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/a/div/img',
        './div/div/div/div/div/div/div[2]/div/div[2]/div/div/a/div/img',
    )
    _TIME_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/span/a/abbr',
        './div/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr',
    )
    _PERMALINK_XPATHS = (
        './div/div/div[3]/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/span/a',
        './div/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a',
    )
    _CONTENT_XPATHS = (
        './div/div/div/div/div/div/div[2]/div/div[2]/div[2]/p',
    )

    # Compiled once; raw strings avoid the invalid "\d" escape sequence.
    _USER_ID_RE = re.compile(r'id=(\d+)')
    _POST_ID_RE = re.compile(r'/(\d+)')

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of mention URLs."""
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.mention_list = self.launcher.get_mention_list()
        self.es = Es_fb()
        self.list = []
        # One shared timestamp marks every record produced by this instance.
        self.update_time = int(time.time())

    @staticmethod
    def _first_value(node, xpaths, extract, default='None'):
        """Return ``extract(element)`` for the first XPath that works.

        Both locating the element and extracting the value count as an
        attempt, so a failing extraction moves on to the next XPath.  When
        every attempt fails ``default`` is returned; the string ``'None'`` is
        the placeholder the stored records already use.  Only ``Exception``
        is caught, so Ctrl-C and interpreter shutdown still propagate.
        """
        for xpath in xpaths:
            try:
                return extract(node.find_element_by_xpath(xpath))
            except Exception:
                continue
        return default

    def _dismiss_popup(self):
        """Click away the notification pop-up when one is present."""
        try:
            self.driver.find_element_by_xpath(self._POPUP_XPATH).click()
        except Exception:
            # Best effort: most pages show no pop-up, and a failed click must
            # not stop the scrape.
            pass

    def _scrape_post(self, each):
        """Build the record dict for one post container element."""
        user_id_re = self._USER_ID_RE
        post_id_re = self._POST_ID_RE
        return {
            'uid': self._first_value(
                each, self._AUTHOR_XPATHS,
                lambda anchor: ''.join(user_id_re.findall(
                    anchor.get_attribute('data-hovercard')))),
            'photo_url': self._first_value(
                each, self._PICTURE_XPATHS,
                lambda img: img.get_attribute('src')),
            'nick_name': self._first_value(
                each, self._AUTHOR_XPATHS, lambda anchor: anchor.text),
            'mid': self._first_value(
                each, self._PERMALINK_XPATHS,
                lambda anchor: ''.join(post_id_re.findall(
                    anchor.get_attribute('href')))),
            'timestamp': self._first_value(
                each, self._TIME_XPATHS,
                lambda abbr: int(abbr.get_attribute('data-utime'))),
            'text': self._first_value(
                each, self._CONTENT_XPATHS, lambda para: para.text),
            'update_time': self.update_time,
        }

    def get_mention(self):
        """Visit every mention URL and append one record per post container.

        The driver window is closed when scraping ends, whether it finished
        or raised.

        Returns:
            ``self.list``, a list of dicts with the keys ``uid``,
            ``photo_url``, ``nick_name``, ``mid``, ``timestamp``, ``text``
            and ``update_time``.  A field that cannot be read holds the
            string ``'None'``.
        """
        try:
            for url in self.mention_list:
                self.driver.get(url)
                time.sleep(self.PAGE_LOAD_DELAY)
                # Dismiss the notification pop-up so the page can be reached.
                self._dismiss_popup()
                for each in self.driver.find_elements_by_xpath(
                        '//div[@id="contentArea"]'):
                    self.list.append(self._scrape_post(each))
        finally:
            self.driver.close()
        return self.list

    def save(self, indexName, typeName, list):
        """Forward ``list`` to ``executeES`` under the given index and type."""
        self.es.executeES(indexName, typeName, list)
class Mention():
    """Scrape the posts that mention the logged-in account (mobile pages).

    ``Launcher.get_mention_list`` supplies both the list of post URLs and the
    browser driver.  ``get_mention`` builds one record per URL and then adds
    each author's profile picture; ``save`` forwards records to the
    ``Es_fb`` client.
    """

    _AUTHOR_XPATH = '//div[@id="root"]/div[1]/div[1]/div/div[1]/div[1]/table/tbody/tr/td[2]/div/h3/strong/a'
    _TIME_XPATH = '//div[@id="root"]/div[1]/div[1]/div/div[2]/div/abbr'
    _TEXT_XPATH = '//div[@id="root"]/div[1]/div[1]/div/div[1]/div[2]'
    # Profile-picture locations, tried in this order.
    _PHOTO_XPATHS = (
        '//div[@id="m-timeline-cover-section"]/div[1]/div[2]/div[1]/div/a/img',
        '//div[@id="m-timeline-cover-section"]/div[2]/div/div[1]/div[1]/a/img',
        '//div[@id="m-timeline-cover-section"]/div[2]/div/div[1]/a/img',
    )
    _PROFILE_URL = 'https://m.facebook.com/profile.php?id='

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the mention URLs and driver."""
        self.launcher = Launcher(username, password)
        self.mention_list, self.driver = self.launcher.get_mention_list()
        self.es = Es_fb()
        self.list = []
        # One shared timestamp marks every record produced by this instance.
        self.update_time = int(time.time())

    def date2timestamp(self, date):
        """Convert a Chinese-locale date label to a Unix timestamp.

        Handled forms (spaces are ignored):

        * ``刚刚`` -> now
        * ``N分钟...`` / ``N小时...`` -> now minus N minutes / hours
        * ``[YYYY年]M月D日`` optionally followed by ``上午H:MM`` or
          ``下午H:MM``; a missing year means the current year.

        Args:
            date: the label text.

        Returns:
            Seconds since the epoch as an ``int`` (local time zone).

        Raises:
            ValueError: when the label matches none of the forms above.
        """
        date = date.replace(u'月', '-').replace(u'日', '').replace(' ', '')
        if date == u'刚刚':
            return int(time.time())

        # Convert the 12-hour clock to 24-hour.  The hour is parsed with
        # int(), never eval(): the text comes from a web page, and eval()
        # also rejects zero-padded hours such as "08".
        for marker, is_pm in ((u'上午', False), (u'下午', True)):
            if marker not in date:
                continue
            day_part, clock = date.split(marker, 1)
            hour_text, _, minute_text = clock.partition(':')
            hour = int(hour_text)
            if is_pm and hour < 12:
                hour += 12
            elif not is_pm and hour == 12:
                hour = 0  # 12 AM is midnight, not noon
            date = u'%s %d:%s' % (day_part, hour, minute_text)

        is_relative = u'分钟' in date or u'小时' in date
        if not is_relative:
            if u'年' in date:
                date = date.replace(u'年', '-')
            else:
                # No year on the label: assume the current one.
                date = time.strftime('%Y') + '-' + date

        if u'分钟' in date:
            minutes = int(re.search(r'(\d+)', date).group(1))
            return int(time.time()) - minutes * 60
        if u'小时' in date:
            hours = int(re.search(r'(\d+)', date).group(1))
            return int(time.time()) - hours * 60 * 60

        try:
            parsed = time.strptime(date, '%Y-%m-%d')
        except ValueError:
            parsed = time.strptime(date, '%Y-%m-%d %H:%M')
        return int(time.mktime(parsed))

    def _read(self, xpath, extract, default):
        """Return ``extract(element)`` for ``xpath``, or ``default`` on failure."""
        try:
            return extract(self.driver.find_element_by_xpath(xpath))
        except Exception:
            return default

    def _profile_photo(self, uid):
        """Open the profile page of ``uid`` and return its picture URL.

        Returns ``''`` when there is no uid to look up or when none of the
        known page layouts matches.
        """
        if not uid:
            return ''
        self.driver.get(self._PROFILE_URL + str(uid))
        for xpath in self._PHOTO_XPATHS:
            try:
                return self.driver.find_element_by_xpath(
                    xpath).get_attribute('src')
            except Exception:
                continue
        return ''

    def get_mention(self):
        """Visit every mention URL and collect one record per post.

        Each extracted field is printed as it is read.  After all posts are
        collected, every record in ``self.list`` gets a ``photo_url`` from
        the author's profile page.  The driver is quit when scraping ends,
        whether it finished or raised.

        Returns:
            ``self.list``, a list of dicts with the keys ``uid``,
            ``nick_name``, ``mid``, ``timestamp``, ``text``, ``update_time``
            and ``photo_url``.  Unreadable text fields hold ``''`` and an
            unreadable timestamp holds ``0``.
        """
        try:
            for url in self.mention_list:
                self.driver.get(url)
                time.sleep(1)

                nick_name = self._read(
                    self._AUTHOR_XPATH, lambda anchor: anchor.text, '')
                print(nick_name)

                uid = self._read(
                    self._AUTHOR_XPATH,
                    lambda anchor: re.findall(
                        r'id=(\d+)', anchor.get_attribute('href'))[0],
                    '')
                print(uid)

                timestamp = self._read(
                    self._TIME_XPATH,
                    lambda abbr: self.date2timestamp(abbr.text),
                    0)
                print(timestamp)

                text = self._read(
                    self._TEXT_XPATH, lambda node: node.text, '')
                print(text)

                # The post id is carried in the URL itself.
                try:
                    mid = ''.join(re.findall(r'fbid%3D(\d+)', url))
                except Exception:
                    mid = ''
                print(mid)

                self.list.append({
                    'uid': uid,
                    'nick_name': nick_name,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'update_time': self.update_time,
                })

            for record in self.list:
                record['photo_url'] = self._profile_photo(record['uid'])
        finally:
            self.driver.quit()
        return self.list

    def save(self, indexName, typeName, mention_list):
        """Forward ``mention_list`` to ``executeES`` under the index and type."""
        self.es.executeES(indexName, typeName, mention_list)