# -*- coding: utf-8 -*-
# NOTE: this spider is Python 2 code (str/unicode handling and the behaviour of
# filter() below assume Python 2). Imports reconstructed for this snippet; the
# project-local import paths (weibo_login, items) are assumptions.
import os
import re
import json
import datetime
import logging

import requests
from bs4 import BeautifulSoup
from scrapy import Spider, Request

from weibo_login import WeiboLogin
from items import WeiboItem, WeiboSocialConnection, WeiboUserInfoItem

logger = logging.getLogger(__name__)


class UserInfoCrawl(Spider):
    name = "weibo_user_info"
    # allowed_domains = ["weibo.cn"]

    def __init__(self, name="*****@*****.**", password="******",
                 uid="09424248189", *args, **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # do not crawl the logged-in account's own weibo
        self.first_flag_home = True  # the account's own profile is handled differently from other accounts
        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
            # Set the Sina Weibo cookie
            self.cookie = {
                'ALF': cookiejar['ALF'],
                'sso_info': cookiejar['sso_info'],
                'SUB': cookiejar['SUB'],
                'SUBP': cookiejar['SUBP'],
                'SUE': cookiejar['SUE'],
                'SUHB': cookiejar['SUHB'],
                'SUP': cookiejar['SUP'],
                'SUS': cookiejar['SUS']
            }
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        # Parse the weibo homepage
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url, cookies=self.cookie,
                      callback=self._parse_homepage, errback=self.parse_error)

    def _parse_homepage(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # follower count (get_fans_count and the two helpers below are defined
        # elsewhere in this spider)
        fans_count, uid = self.get_fans_count(soup)
        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)
        # followees
        follow_count, follow_url = self.get_follows(soup)
        # weibo posts; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item
        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social
        # profile details
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url, cookies=self.cookie,
                          callback=self.parse_info, errback=self.parse_error,
                          priority=1)
        if follow_url:
            yield Request(url=follow_url, cookies=self.cookie,
                          callback=self.parse_follow, errback=self.parse_error)

    def parse_error(self, failure):
        # Scrapy errbacks receive a Failure, not a Response; the original read
        # response.url, which would itself raise here.
        logger.error("request failed: %s" % failure.request.url)

    def parse_info(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [u"昵称", u"性别", u"地区",
                                                  u"生日", u"简介"]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":    # nickname
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":  # gender
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":  # region: "province city" or just "city"
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":  # birthday; pad a missing year with 2050
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":  # bio; strip whitespace and a few emoji byte sequences
                            # info_item is already UTF-8 encoded; re-encoding it
                            # (as the original did) raises on non-ASCII bytes in Python 2
                            user_info["abstract"] = info_item.replace(" ", "") \
                                .replace("\n", "").replace("\xc2\xa0", "") \
                                .replace("\xF0\x9F\x91\x8A", "") \
                                .replace("\xF0\x9F\x91\xBC", "") \
                                .replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url, cookies=self.cookie,
                          callback=self._parse_homepage, errback=self.parse_error)

    def get_uid_from_response(self, response):
        url = response if isinstance(response, str) else response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        return int(res[0]) if res else 0

    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        contexts = soup.find_all("div", class_="c")
        for item in contexts:
            try:
                context = item.find("span", class_="ctt")
                if not context:
                    continue
                # strip whitespace, non-breaking spaces and a few emoji byte sequences
                weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "") \
                    .replace("\n", "").replace("\xc2\xa0", "") \
                    .replace("\xF0\x9F\x91\x8A", "").replace("\xF0\x9F\x91\xBC", "") \
                    .replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                parent_ele = context.parent.parent
                like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))       # likes
                relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))    # reposts
                comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))  # comments
                issue_time_ele = parent_ele.find("span", class_="ct")
                issue_time = issue_time_ele.text.encode("utf-8")
                # the timestamp reads "<time> 来自 <device>"; normalize relative
                # times ("N minutes ago", "today HH:MM") to absolute ones
                issue = issue_time.split("来自")
                issue_datetime = ""
                if len(issue) > 0:
                    if "分钟" in issue[0]:  # "N minutes ago"
                        minutes = filter(str.isdigit, issue[0])
                        t = datetime.datetime.now() - datetime.timedelta(minutes=int(minutes))
                        issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                    elif "今天" in issue[0]:  # "today HH:MM"
                        clock = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                        issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + clock
                    else:  # "MM月DD日" style or a full date
                        issue_datetime = issue[0].replace("月", "-").replace("日", "") \
                            .replace("\xc2\xa0", "")
                        if issue[0].count("-") < 2:
                            issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                issue_device = issue[1] if len(issue) > 1 else None
                weibo_info["context"] = weibo_text
                weibo_info["user_id"] = uid
                weibo_info["issue_time"] = issue_datetime.strip()
                weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                weibo_info["device"] = issue_device
                return weibo_info  # only the first weibo is crawled
            except Exception as e:  # "except Exception, e" is Python 2-only syntax
                logger.error(e)
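# A minimal sketch (not part of the original file) of driving this spider from
# a plain script instead of "scrapy crawl weibo_user_info". It assumes a
# Scrapy >= 1.0 install; the uid below is purely illustrative.
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UserInfoCrawl, uid="1234567890")  # kwargs reach __init__
    process.start()  # blocks until the crawl finishes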
# coding: utf-8
import urllib2

import post_encode
import get_weibo
from weibo_login import WeiboLogin

if __name__ == '__main__':
    login = WeiboLogin('17089368196', 'tttt5555')
    if login.login():
        print "login succeeded"
        # loop over `page` here to crawl more than one result page
        # (the keyword "周扬青" is percent-encoded twice in this URL)
        html = urllib2.urlopen("http://s.weibo.com/weibo/%25E5%2591%25A8%25E6%2589%25AC%25E9%259D%2592&page=3").read()
        # call the function that parses the html content
        get_weibo.write_all_info(html)
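# A minimal sketch (not part of the original script) of building that search
# URL instead of hard-coding it: s.weibo.com expects the keyword
# percent-encoded twice, so the UTF-8 keyword is quoted two times.
# `build_search_url` is a hypothetical helper name.
import urllib

def build_search_url(keyword, page=1):
    quoted = urllib.quote(urllib.quote(keyword))  # double percent-encoding
    return "http://s.weibo.com/weibo/%s&page=%d" % (quoted, page)

# build_search_url("周扬青", page=3) reproduces the hard-coded URL above.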
# Imports reconstructed for this snippet (Python 3). WeiboLogin,
# WeiboCommentsCrawler, verify_user and the constants USER_NAME, PASSWD and
# search_domain are project-local, so their exact import paths below are
# assumptions; json_util is assumed to come from pymongo's bson package.
import os
import re
import sys
import json
import time
import codecs
import random
import logging
import datetime
import urllib.parse

from bs4 import BeautifulSoup
from bson import json_util
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from weibo_login import WeiboLogin
from weibo_comments_crawler import WeiboCommentsCrawler
from verification import verify_user
from config import USER_NAME, PASSWD, search_domain


class WeiboCrawler():
    '''
    crawl weibo using keywords
    '''

    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # log in to Sina Weibo
        self.driver = webdriver.PhantomJS()
        # the interface for authorization
        self.wl = WeiboLogin(user_name, passwd, self.driver)
        if self.wl.login():
            logging.info('login succeeded')
        else:
            logging.error('login failed')
            sys.exit(1)
        self.sk = search_key.strip()

    def __del__(self):
        self.driver.quit()

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords
        page_count: how many pages will be crawled
        '''
        self.results = []
        # visit the result pages in random order to look less robotic
        pages = list(range(1, page_count + 1))
        random.shuffle(pages)
        for t in ('hot', 'time'):
            for i in pages:
                # the original dropped t and always searched 'hot'; pass it on
                url_to_crawl = self.get_search_url(i, t)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait for the page to load its content
                try:
                    WebDriverWait(self.driver, 5).until(
                        lambda x: x.find_elements_by_class_name('feed_list'))
                except TimeoutException:
                    logging.info('there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file('./screenshot/error.png')
                    # let the user input the verification code
                    verify_user(self.driver, 'search')
                    # break
                # mid is used to crawl the original weibo content in batch mode
                weibo_list = self.get_weibo_list(self.driver.page_source)
                self.results.extend(weibo_list)
                # sleep some time to avoid hitting the site too hard
                # time.sleep(1)
            else:
                continue
            break
        logging.info('total result {}'.format(len(self.results)))
        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url from the page number and the weibo sort type
        '''
        url = 'http://' + search_domain + '/wb'
        url += urllib.parse.quote('/' + self.sk)
        url += '&'
        url += urllib.parse.urlencode([('page', page), ('xsort', w_type)])
        return url

    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result
        return: a list of weibo objects
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse a weibo object from html
        t: the tag object that holds the weibo content
        return: a weibo dict, or None on a parse error
        '''
        weibo = {}
        try:
            weibo['keywords'] = self.sk.split(' ')  # keywords is a list of words
            weibo['mid'] = t['mid']
            # the user name and profile link
            weibo['screen_name'] = t.find(name='dt', class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt', class_='face').find('a').get('href')
            # the content of the weibo
            weibo['text'] = t.find(name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a', class_='date').get('href').strip()
            logging.info(weibo['source_url'])
            # meta data: the `date` attribute holds a millisecond timestamp, so
            # keep only as many leading digits as an epoch-seconds value has
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'a', rel='nofollow').string.strip()
            pop_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'span').get_text().strip().replace('\n', '')
            pop_type = {
                # key: source representation, value: attr
                '赞': 'like_count',
                '转发': 'repost_count',
                '评论': 'comment_count'
            }
            for key in pop_type:
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    weibo[pop_type[key]] = 0
        except Exception as e:
            logging.info(e)
            return None
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = '_'.join(w['keywords']) + w['mid'] + '.txt'
            # close the file handle properly (the original leaked it)
            with codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8') as f:
                json.dump(w, f, ensure_ascii=False,
                          default=json_util.default, indent=2)
            logging.info('wrote to file {}'.format(file_name))

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results and
        update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid=w['mid'])
                r = crawler.crawl()
                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('authorization failed, cannot fetch the comment list')
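# A minimal usage sketch for WeiboCrawler (not part of the original file); the
# keyword and page count are illustrative, and USER_NAME/PASSWD are expected to
# come from the config module above.
if __name__ == '__main__':
    wc = WeiboCrawler('data mining')         # space-separated search keyword(s)
    wc.crawl(page_count=2, comments=False)   # fetch two result pages per sort order
    wc.save()                                # one JSON document per weibo under ./result/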