Exemplo n.º 1
0
class Comment():
    """Collect the comments rendered on a list of post pages.

    The pages are presumably Facebook posts (the store is ``Es_fb`` and the
    markup uses ``data-hovercard`` / ``data-utime``) -- confirm against
    ``Launcher``.  The DOM is addressed through positional XPaths; for most
    fields two alternative layouts are known and are tried in order.
    """

    # Every comment container on a post page; the aria-label is the Chinese
    # UI string for "comment".
    _COMMENT_XPATH = '//div[@aria-label="评论"]'
    # Paths below are relative to one comment container.
    _AUTHOR_LINK_XPATHS = (
        './div/div/div/div[2]/div/div/div/div/div/span/span[1]/a',
        './div/div/div/div[2]/div/div/div/span/span[1]/a',
    )
    _AVATAR_XPATH = './div/div/div/div[1]/a/img'
    _CONTENT_XPATHS = (
        './div/div/div/div[2]/div/div/div/div/div/span/span[2]/span/span/span/span',
        './div/div/div/div[2]/div/div/div/span/span[2]/span/span/span/span',
    )
    _TIME_XPATHS = (
        './div/div/div/div[2]/div/div/div[2]/span[4]/a/abbr',
        './div/div/div/div[2]/div/div/div[2]/span[5]/a/abbr',
    )

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of URLs to scrape.

        :param username: account name handed to ``Launcher``.
        :param password: account password handed to ``Launcher``.
        """
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.es = Es_fb()
        self.comment_list = self.launcher.get_comment_list()
        # Accumulates one dict per scraped comment across get_comment() calls.
        self.list = []

    def _extract_first(self, node, xpaths, extract):
        """Return ``extract(element)`` for the first XPath in *xpaths* that works.

        Every path except the last is attempted best-effort; a failure of the
        last path propagates to the caller, exactly like the hand-written
        ``try``/``except`` fallbacks this helper replaces.
        """
        for xpath in xpaths[:-1]:
            try:
                return extract(node.find_element_by_xpath(xpath))
            except Exception:
                # Layout did not match (or the attribute was missing); try the
                # next known layout.  ``Exception`` rather than a bare except
                # so Ctrl-C / SystemExit still stop the crawl.
                continue
        return extract(node.find_element_by_xpath(xpaths[-1]))

    def get_comment(self):
        """Visit every URL in ``self.comment_list`` and scrape its comments.

        Each comment is appended to ``self.list`` as a dict with the keys
        ``nick_name``, ``uid``, ``photo_url``, ``text`` and ``timestamp``
        (an ``int`` parsed from the ``data-utime`` attribute).

        The root post's text and time were looked up by earlier revisions but
        never used; those lookups are gone, so a page without a recognisable
        root post no longer aborts the crawl.

        :return: ``self.list`` (the same list object on every call).
        :raises Exception: whatever the driver raises when a comment matches
            none of the known layouts for a field.
        """
        # Compiled once instead of once per comment.
        id_pattern = re.compile(r'id=(\d+)')

        def text_of(element):
            return element.text

        def author_id_of(link):
            return ''.join(
                id_pattern.findall(link.get_attribute('data-hovercard')))

        def unix_time_of(abbr):
            return int(abbr.get_attribute('data-utime'))

        for url in self.comment_list:
            print(url)
            self.driver.get(url)
            # Fixed pause, presumably to let the page finish rendering.
            time.sleep(1)
            for each in self.driver.find_elements_by_xpath(
                    self._COMMENT_XPATH):
                author_name = self._extract_first(
                    each, self._AUTHOR_LINK_XPATHS, text_of)
                author_id = self._extract_first(
                    each, self._AUTHOR_LINK_XPATHS, author_id_of)
                pic_url = each.find_element_by_xpath(
                    self._AVATAR_XPATH).get_attribute('src')
                content = self._extract_first(
                    each, self._CONTENT_XPATHS, text_of)
                ti = self._extract_first(
                    each, self._TIME_XPATHS, unix_time_of)
                self.list.append({
                    'nick_name': author_name,
                    'uid': author_id,
                    'photo_url': pic_url,
                    'text': content,
                    'timestamp': ti
                })
        return self.list

    def save(self, indexName, typeName, list):
        """Forward the scraped records to ``self.es.executeES``.

        :param indexName: passed through as the first argument.
        :param typeName: passed through as the second argument.
        :param list: records to store (name kept for keyword callers even
            though it shadows the builtin inside this method).
        """
        self.es.executeES(indexName, typeName, list)
Exemplo n.º 2
0
                        get_attribute('data-hovercard')))
                pic_url = each.find_element_by_xpath(
                    './div/div/div/div[1]/a/img').get_attribute('src')
                content = each.find_element_by_xpath(
                    './div/div/div/div[2]/div/div/div/span/span[2]/span/span/span/span'
                ).text
                time = each.find_element_by_xpath(
                    './div/div/div/div[2]/div/div/div[2]/span[4]/a/abbr'
                ).get_attribute('data-utime')
                self.list.append({
                    'author_name': author_name,
                    'author_id': author_id,
                    'pic_url': pic_url,
                    'content': content,
                    'time': time
                })
        return self.list

    def save(self, indexName, typeName, item):
        """Pass ``item`` to ``es.executeES`` together with the index and type names.

        NOTE(review): ``es`` is not an attribute of the instance; it is looked
        up as a module-level global at call time.  In this file it is only
        assigned under the ``__main__`` guard, so the method presumably fails
        with ``NameError`` when the module is imported rather than run --
        confirm.
        """
        es.executeES(indexName, typeName, item)


if __name__ == '__main__':

    # Script entry point: log in, build the ES client, scrape, then store.
    # NOTE(review): account credentials are hard-coded in source; they should
    # presumably come from configuration or the environment.
    fb = Launcher('18538728360', 'zyxing,0513')
    # Module-level name; save() reads it as a global.
    es = es_twitter()
    # NOTE(review): comment_list is assigned here but not used below.
    comment_list = fb.get_comment_list()
    # NOTE(review): Comment is constructed without arguments and neither `fb`
    # nor `comment_list` is handed to it -- confirm the constructor signature.
    comment = Comment()
    # NOTE(review): this rebinds the builtin name `list` at module level.
    list = comment.get_comment()
    # NOTE(review): save() is declared as save(self, indexName, typeName, item);
    # passing a single argument looks like a TypeError -- confirm.
    comment.save(list)
Exemplo n.º 3
0
class Comment():
    """Scrape post pages for their comments plus the root post's text and id.

    The pages are presumably Facebook posts (``Es_fb``, ``story_fbid``) --
    confirm against ``Launcher``.  Elements are located with positional
    XPaths; where two page layouts are known, both are tried in order.
    """

    # Sentinel meaning "no default: let the last lookup error propagate".
    _NO_DEFAULT = object()

    # Overlay that is clicked away before scraping, when present.
    _POPUP_XPATH = '//div[@class="_n8 _3qx uiLayer _3qw"]'
    _ROOT_TEXT_XPATHS = (
        '//div[@role="feed"]/div[1]/div[1]/div[2]/div[1]/div[2]/div[2]',
        '//div[@role="feed"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[2]',
    )
    _ROOT_LINK_XPATHS = (
        '//div[@role="feed"]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/div/span[3]/span/a',
        '//div[@role="feed"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/div/span[3]/span/a',
    )
    # Every comment container; the aria-label is the Chinese UI string for
    # "comment".  The remaining paths are relative to one container.
    _COMMENT_XPATH = '//div[@aria-label="评论"]'
    _AUTHOR_LINK_XPATHS = (
        './div/div/div/div[2]/div/div/div/div/div/span/span[1]/a',
        './div/div/div/div[2]/div/div/div/span/span[1]/a',
    )
    _AVATAR_XPATHS = ('./div/div/div/div[1]/a/img',)
    _CONTENT_XPATHS = (
        './div/div/div/div[2]/div/div/div/div/div/span/span[2]/span/span/span/span',
        './div/div/div/div[2]/div/div/div/span/span[2]/span/span/span/span',
    )
    _TIME_XPATHS = (
        './div/div/div/div[2]/div/div/div[2]/span[4]/a/abbr',
        './div/div/div/div[2]/div/div/div[2]/span[5]/a/abbr',
    )

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of URLs to scrape.

        :param username: account name handed to ``Launcher``.
        :param password: account password handed to ``Launcher``.
        """
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.es = Es_fb()
        self.comment_list = self.launcher.get_comment_list()
        # Accumulates one dict per scraped comment.
        self.list = []
        # Stamped once at construction and copied into every record.
        self.update_time = int(time.time())

    def _extract_first(self, node, xpaths, extract, default=_NO_DEFAULT):
        """Return ``extract(element)`` for the first XPath in *xpaths* that works.

        :param node: driver or element to search from.
        :param xpaths: candidate paths, tried in order.
        :param extract: callable turning the found element into a value.
        :param default: value returned when every path fails; when omitted
            the failure of the last path propagates instead.
        """
        for xpath in xpaths[:-1]:
            try:
                return extract(node.find_element_by_xpath(xpath))
            except Exception:
                # This layout did not match; fall through to the next one.
                continue
        try:
            return extract(node.find_element_by_xpath(xpaths[-1]))
        except Exception:
            if default is self._NO_DEFAULT:
                raise
            return default

    def get_comment(self):
        """Scrape every URL in ``self.comment_list`` and close the driver.

        Each comment is appended to ``self.list`` as a dict with the keys
        ``uid``, ``photo_url``, ``nick_name``, ``mid``, ``timestamp``,
        ``text``, ``update_time``, ``root_text`` and ``root_mid``.  Root and
        author fields and the avatar URL fall back to the string ``'None'``
        when no layout matches; comment text and timestamp have no fallback,
        so a comment matching neither layout aborts the crawl.

        The driver window is closed in all cases, so this method can be
        called only once per instance.

        :return: ``self.list``.
        :raises Exception: propagated from the driver (after it is closed)
            when a comment's text or timestamp cannot be located.
        """
        # Compiled once instead of once per page / per comment.
        author_id_re = re.compile(r'id=(\d+)')
        story_id_re = re.compile(r'story_fbid=(\d+)')

        def text_of(element):
            return element.text

        def src_of(image):
            return image.get_attribute('src')

        def author_id_of(link):
            return ''.join(
                author_id_re.findall(link.get_attribute('data-hovercard')))

        def story_id_of(link):
            return ''.join(
                story_id_re.findall(link.get_attribute('href')))

        def unix_time_of(abbr):
            return int(abbr.get_attribute('data-utime'))

        try:
            for url in self.comment_list:
                print(url)
                self.driver.get(url)
                # Fixed pause, presumably to let the page finish rendering.
                time.sleep(1)
                # Dismiss the notification pop-up so the page is reachable.
                try:
                    self.driver.find_element_by_xpath(
                        self._POPUP_XPATH).click()
                except Exception:
                    # Best effort: no pop-up (or it is not clickable).
                    pass

                root_text = self._extract_first(
                    self.driver, self._ROOT_TEXT_XPATHS, text_of,
                    default='None')
                root_mid = self._extract_first(
                    self.driver, self._ROOT_LINK_XPATHS, story_id_of,
                    default='None')

                for each in self.driver.find_elements_by_xpath(
                        self._COMMENT_XPATH):
                    author_name = self._extract_first(
                        each, self._AUTHOR_LINK_XPATHS, text_of,
                        default='None')
                    author_id = self._extract_first(
                        each, self._AUTHOR_LINK_XPATHS, author_id_of,
                        default='None')
                    pic_url = self._extract_first(
                        each, self._AVATAR_XPATHS, src_of, default='None')
                    content = self._extract_first(
                        each, self._CONTENT_XPATHS, text_of)
                    ti = self._extract_first(
                        each, self._TIME_XPATHS, unix_time_of)
                    self.list.append({
                        'uid': author_id,
                        'photo_url': pic_url,
                        'nick_name': author_name,
                        # NOTE(review): 'mid' carries the root post id, same
                        # as 'root_mid' -- confirm this is intended.
                        'mid': root_mid,
                        'timestamp': ti,
                        'text': content,
                        'update_time': self.update_time,
                        'root_text': root_text,
                        'root_mid': root_mid
                    })
        finally:
            self.driver.close()
        return self.list

    def save(self, indexName, typeName, list):
        """Forward the scraped records to ``self.es.executeES``.

        :param indexName: passed through as the first argument.
        :param typeName: passed through as the second argument.
        :param list: records to store (name kept for keyword callers even
            though it shadows the builtin inside this method).
        """
        self.es.executeES(indexName, typeName, list)
Exemplo n.º 4
0
class Comment():
    def __init__(self, username, password):
        """Create the launcher and ES client, then fetch URLs and driver.

        ``Launcher.get_comment_list()`` is expected to return a
        ``(comment_list, driver)`` pair, which is unpacked onto the instance.

        :param username: account name handed to ``Launcher``.
        :param password: account password handed to ``Launcher``.
        """
        launcher = Launcher(username, password)
        self.launcher = launcher
        self.es = Es_fb()
        urls, driver = launcher.get_comment_list()
        self.comment_list = urls
        self.driver = driver
        # One dict per scraped comment is appended here.
        self.list = []
        # Taken after the URL list is fetched; copied into every record.
        created_at = time.time()
        self.update_time = int(created_at)

    def date2timestamp(self, date):
        """Convert a Chinese-locale date label into a Unix timestamp.

        Shapes handled (spaces are stripped first):

        * ``刚刚`` ("just now") -> the current time;
        * ``N分钟`` / ``N小时`` (N minutes / hours ago) -> now minus that span;
        * ``[YYYY年]M月D日`` optionally followed by ``上午`` (AM) or ``下午``
          (PM) and ``H:MM``.  A label without ``年`` is taken to be in the
          current year.

        NOTE(review): ``上午12:xx`` is parsed as 12:xx, not 00:xx -- confirm
        whether the source ever renders midnight that way.

        :param date: label text as shown on the page (unicode).
        :return: ``int`` seconds since the epoch, interpreted in local time.
        :raises ValueError: if the label matches none of the shapes above.
        """
        date = date.replace(u'月', '-').replace(u'日', '').replace(' ', '')
        # Unicode literal so the comparison also holds on Python 2, where a
        # byte string never equals non-ASCII unicode text.
        if date == u'刚刚':
            return int(time.time())

        if u'上午' in date:
            date = date.replace(u'上午', ' ')
        if u'下午' in date:
            pieces = date.split(u'下午')
            clock = pieces[1].split(':')
            # int() instead of eval(): the text is scraped from a web page,
            # and eval() also rejects zero-padded hours such as "08".
            hour = int(clock[0])
            if hour == 12:
                # 12 PM is already the 24-hour value.
                date = date.replace(u'下午', ' ')
            elif hour < 12:
                date = pieces[0] + ' ' + str(hour + 12) + ':' + clock[1]

        # Relative labels carry no calendar date, so leave them untouched.
        if u'分钟' not in date and u'小时' not in date:
            if u'年' in date:
                date = date.replace(u'年', '-')
            else:
                date = time.strftime('%Y') + '-' + date

        if u'分钟' in date:
            minutes = int(re.search(r'(\d+)', date).group(1))
            return int(time.time()) - minutes * 60
        if u'小时' in date:
            hours = int(re.search(r'(\d+)', date).group(1))
            return int(time.time()) - hours * 60 * 60

        try:
            return int(time.mktime(time.strptime(date, '%Y-%m-%d')))
        except ValueError:
            # Not a bare date; it must then carry a clock time.
            return int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M')))

    def get_comment(self):
        print 'comment_list', self.comment_list
        for url in self.comment_list:
            self.driver.get(url)
            time.sleep(1)

            try:
                root_text = self.driver.find_element_by_xpath(
                    '//div[@id="m_story_permalink_view"]/div[1]/div/div[1]/div[2]'
                ).text
            except BaseException, e:
                root_text = ''
            print root_text

            try:
                root_mid = ''.join(
                    re.search(re.compile('fbid%3D(\d+)%'), url).group(1))
                print 'root_mid', root_mid
            except BaseException, e:
                print "get_comment Position44444444", e
                root_mid = ''
            print root_mid

            for each in self.driver.find_elements_by_xpath(
                    '//div[@id="m_story_permalink_view"]/div[2]/div/div[4]/div'
            ):
                if u' 查看更多评论' in each.text:
                    break

                try:
                    author_name = each.find_element_by_xpath('./div/h3/a').text
                except BaseException, e:
                    print "get_comment Position66666666", e
                    try:
                        author_name = each.find_element_by_xpath(
                            './div[1]/div/div/div[2]/div/div/div/div[1]/div/span/span[1]/a'
                        ).text
                    except:
                        author_name = ''
                print author_name

                try:
                    author_id = ''.join(
                        re.findall(
                            re.compile('id=(\d+)'),
                            each.find_element_by_xpath(
                                './div/h3/a').get_attribute('href')))
                except:
                    author_id = ''

                #try:
                #	print 7777777777
                #	pic_url = each.find_element_by_xpath('./div/div/div/div[1]/a/img').get_attribute('src')
                #except:
                #	pic_url = 'None'
                try:
                    content = each.find_element_by_xpath('./div/div[1]').text
                except:
                    try:
                        content = each.find_element_by_xpath(
                            './div/div[1]/span/span').text
                    except:
                        content = 'Emoji'
                print content

                try:
                    ti = self.date2timestamp(
                        str(each.text.replace(' ', '').split('·')[5]))
                except:
                    try:
                        ti = self.date2timestamp(
                            str(each.text.replace(' ', '').split('·')[4]))
                    except:
                        ti = 0

                try:
                    if re.findall(
                            r'id=(\d+)&',
                            each.find_element_by_xpath(
                                './div/h3/a').get_attribute('href')):
                        comment_type = 'receive'
                        text = content
                    else:
                        comment_type = 'make'
                        text = content
                except:
                    comment_type = 'unknown'
                    text = ''

                self.list.append({
                    'uid': author_id,
                    'nick_name': author_name,
                    'mid': root_mid,
                    'timestamp': ti,
                    'text': content,
                    'update_time': self.update_time,
                    'root_text': root_text,
                    'root_mid': root_mid,
                    'comment_type': comment_type
                })