Example #1
    def __init__(self, sid):
        self._sid = sid
        self._login = WeiboLogin(settings.ACCOUNT, settings.PASSWORD)

        # Save the crawled content in a directory named after the blogger's ID
        self._dir = 'resource/' + sid
        self._parser = WeiboParser(self._dir)
Example #2
def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])
    
    loginer = AccountLogin(opener, username, passwd)
    ret = loginer.login()

    return ret
Example #3
def main():
    # RedisCookies.clean()
    weiboLogin = WeiboLogin()
    success = []
    failed = []
    for account in ACCOUNTS:
        try:
            LOGGER.info('get cookies for %s' % str(account))
            cookies = weiboLogin.login_by_selenium(account['user'], account['password'])
            if cookies is not None and 'SSOLoginState' in cookies and 'SUBP' in cookies and 'SUHB' in cookies:
                success.append(account)
                RedisCookies.save_cookies(account['user'], cookies)
            else:
                failed.append(account)
        except Exception:
            LOGGER.error("get cookies failed")
            traceback.print_exc()
            failed.append(account)
    LOGGER.info("%d accounts login success" % len(success))
    LOGGER.info("%d accounts login failed" % len(failed))
Example #4
def refresh_cookies(self, ck_dir):
    """
    Refresh the cookie db.
    """
    # NOTE: cookie_dir (the output directory for the saved cookie files) is
    # assumed to be defined elsewhere, e.g. as a module-level constant.
    idx = 0
    # delete all existing cookies
    if os.path.isdir(cookie_dir):
        shutil.rmtree(cookie_dir)
    os.mkdir(cookie_dir)
    # read the accounts (tab-separated username/password pairs) from ck_dir
    accounts = []
    for root, dirs, files in os.walk(ck_dir):
        for filespath in files:
            full_name = os.path.join(root, filespath)
            with open(full_name) as f:
                for line in f.readlines():
                    if line.strip():
                        u, p = line.split('\t')
                        if u and p:
                            accounts.append((u.strip(), p.strip()))
    # log in with each account and save its cookies
    for u, p in accounts:
        opener = MechanizeOpener(
            user_agent='Baiduspider+(+http://www.baidu.com/search/spider.htm)',
            timeout=10)
        opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
        lm = WeiboLogin(opener, u, p)
        try:
            status = lm.login()
        except Exception as ex:
            self.logger.warn("login error:%s" % u)
            self.logger.error(ex)
            continue
        if status:
            idx += 1
            opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                           ignore_discard=True,
                           ignore_expires=True)
            self.validated.append("%s\t%s\r\n" % (u, p))
        opener.close()
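
The cookie files written by opener.cj.save() above can be loaded back later. A minimal sketch, not part of the original project, assuming the jar was saved in Mozilla/Netscape format (use LWPCookieJar instead if it is LWP format); under Python 2 the module is cookielib rather than http.cookiejar:

# Hypothetical reload of one cookie file produced by refresh_cookies;
# 'cookies/1.txt' stands in for os.path.join(cookie_dir, '1.txt').
from http.cookiejar import MozillaCookieJar

cj = MozillaCookieJar()
cj.load('cookies/1.txt', ignore_discard=True, ignore_expires=True)
for cookie in cj:
    print(cookie.name, cookie.value)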
Example #5
def crawl_author():
    mongo_url = "localhost"
    mongo_db = "weibo"
    mongo_coll = "author"

    account = input("enter your weibo account:\n")
    pwd = input("enter your password:\n")
    login = WeiboLogin(account, pwd)
    sp = AuthorSpider(login)
    with AuthorPipeline(mongo_url, mongo_db, mongo_coll) as pipe:
        for item in sp.crawl():
            pipe.save(item)
Example #6
def crawl_weibo(start=0, width=300):
    with open("weibo_ids.csv") as f:
        ids = f.readlines()
    ids = [s.strip().strip('"') for s in ids[1:]]
    
    account = input("enter your weibo account:\n")
    pwd = input("enter your password:\n")
    lg = WeiboLogin(account, pwd)
    sp = WeiboSpider2(lg)
    
    mongo_url = "localhost"
    mongo_db = "weibo"
    mongo_coll = "posts_newfields"
    
    id_ = "1006062557129567"
    with WeiboPipeline(mongo_url, mongo_db, mongo_coll) as pipe:
        for i, id_ in enumerate(ids[:start+width]):
            print(i)
            if i < start:
                continue           
            for item in sp.crawl(id_, "2015-01-01 00:00"):
                pipe.update(item)
Example #7
# -*- coding: utf-8 -*-

from crawler import WeiboCrawler
from login import WeiboLogin

if __name__ == '__main__':

    username = '******'
    pwd = 'Cc19900201'
    keywords = ['#中国人寿#', '#中国人寿保险#', '#中国人寿保险公司#']

    WeiboLogin(username, pwd).login()
    WeiboCrawler(
        isConnectMySQL=True,
        htmlOutputDir='/Users/cchen224/Downloads/China Life').search(keywords)
    print('Finally!!')
Example #8
def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()
Example #9

if __name__ == '__main__':
    base_url = "https://www.weibo.com/u/%s?is_all=1"
    wb_url = "https://www.weibo.com/p/%s/home?pids=Pl_Official_MyProfileFeed__22&is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=%s#feedtop"
    ajax_url = "https://www.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=%s&is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=%s&pagebar=%s&pl_name=Pl_Official_MyProfileFeed__22&id=%s&pre_page=%s"
    # url = ajax_url % (domain_id, page, page_bar, page_id)
    sleep_time = 0.1
    page_limit = 50
    max_pages = 100

    # S1: Get cookie
    print('Get logged-in cookie...')
    username = '******'
    password = '******'
    dl = WeiboLogin(username=username, password=password)
    with open('login.cookie', 'rb') as f:
        dl.session.cookies.update(pickle.load(f))

    # S2: Create spider
    print('Initialized the Spider...')
    spider = WeiboSpider(dl)
    log = spider.IOHandle['log']

    # S3: Process uids
    for uid in spider.uids:
        print("start to process uid:", uid)
        log.write("start to process uid: %s\n" % uid)
        spider.uid = uid
        # 3.0
        spider.make_output_handles()
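
The block above loads previously saved cookies from login.cookie into dl.session, but the example does not show how that file was produced. A minimal sketch of one way to create it, assuming WeiboLogin exposes a login() method and a requests-style session attribute (both are assumptions; the real API may differ):

# Hypothetical one-time login that pickles the session cookies for later reuse.
import pickle

dl = WeiboLogin(username=username, password=password)
if dl.login():                               # assumed login method
    with open('login.cookie', 'wb') as f:
        pickle.dump(dl.session.cookies, f)   # requests cookie jars are picklable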
Example #10
class Spider:
    def __init__(self, sid):
        self._sid = sid
        self._login = WeiboLogin(settings.ACCOUNT, settings.PASSWORD)

        # Save the crawled content in a directory named after the blogger's ID
        self._dir = 'resource/' + sid
        self._parser = WeiboParser(self._dir)

    def __spider(self, url):
        cookies = self._login.cookies()
        # get the login cookies (log in first if there are none)
        if not cookies:
            if not self._login.login():
                print("==> login failed and exist!")
                exit(-1)

        headers = {'User-Agent': USER_AGENT}

        # request the URL and fetch the page, which contains the post content
        response = requests.get(url, cookies=cookies, headers=headers)
        if response.status_code != 200:
            print('network error')
            return

        # re.S makes '.' match any character including newlines, so line breaks will not cause the match to fail
        match = re.search('Sina Visitor System', response.text, re.S)
        if not match:
            # logged in successfully; start parsing the post content
            content = BeautifulSoup(response.text,
                                    'lxml',
                                    exclude_encodings='gbk')
            self._parser.parse(content)
            print("match")
        else:
            print('<Sina Visitor System> error')

    def running(self, start_page, end_page):
        print("*" * 50)
        print("*" + " Start crawling sina")
        print("*" + " Account: %s" % settings.ACCOUNT)
        print("*" + " Sid: %s" % self._sid)
        print("*" + " page: %d - %d" % (start_page, end_page))
        print("*" * 50)

        # create the save directory if it does not exist
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        for i in range(start_page, end_page + 1):
            _pager = WeiboPage(self._sid, i)
            print("===> START Page <%d --- 1>" % i)
            self.__spider(_pager.first())
            print("===> END Page <%d --- 1>" % i)

            print("===> START Page <%d --- 2>" % i)
            self.__spider(_pager.second())
            print("===> END Page <%d --- 2>" % i)

            print("===> START Page <%d --- 3>" % i)
            self.__spider(_pager.third())
            print("===> END Page <%d --- 3>" % i)

            if i != end_page:
                time_random = random.randint(10, 20)
                print("==> Sleep %d second" % (time_random + 10))
                time.sleep(time_random + 10)
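
A minimal usage sketch for the Spider class above; the blogger ID and page range are placeholders:

if __name__ == '__main__':
    spider = Spider('1234567890')   # placeholder blogger ID (sid)
    spider.running(1, 3)            # crawl pages 1 through 3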
Example #11
#coding=utf-8

from login import WeiboLogin
import collectData

if __name__ == '__main__':
    uid = '18640376585'
    psw = '89364013'

    WeiboLogin(uid, psw)
    collectData.main()
Example #12
def login_hook(opener, **kw):
    username = str(kw["username"])
    passwd = str(kw["password"])

    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()
Example #13
File: __init__.py, Project: Andelfin/cola
    """
    # The new SMS verification login flow does not support multiple threads logging in at once;
    # that triggers a "verification SMS requested too frequently" error, so a single-threaded
    # implementation is used for now.
    uname = str(user_config['job']['login'][0]['username'])
    passwd = str(user_config['job']['login'][0]['password'])
    
    user_agent = """
    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36"""
    cookie = """SINAGLOBAL=122.234.236.211_1449673121.694525; Apache=122.234.236.211_1449673121.694527;
    SUB=_2AkMhNM6Vf8NhqwJRmPoUxW_naItzygjEiebDAH_sJxJjHlEO7FBtRgGyzabhoI02ECY9_U0P29jX;
    SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWu0sYw.Q-ey7_1U9OXyjuM; ULOGIN_IMG=gz-fcc5afecc7602110ffffd666df9d024f0051"""

    cookies_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), cookies_filename) 
    opener = MechanizeOpener(user_agent=user_agent, cookie_filename=cookies_filename)
    #opener.browser.addheaders = [('User-agent',user_agent),('Connection','keep-alive'),('Cookie',cookie)]
    opener.browser.addheaders = [('User-agent',user_agent),('Connection','keep-alive')]
    loginer = WeiboLogin(opener, uname, passwd)
    # TODO: try reusing the cookies from the previous session directly instead of logging in again...
    #is_need_login = False
    is_need_login = True
    if not is_need_login or loginer.login() == True:
        msg = """Successfully logged in to Weibo. Keep using the opener object to visit
    other Weibo pages, for example:\n
    response = opener.open('URL', 'data to submit, encoded with urllib.urlencode first')
        """
        try:
            from IPython import embed
            embed(banner2=msg)
        except ImportError:
            import code
            code.interact(msg, local=globals())
    else:
        print('Login failed; only 4-5 verification SMS messages can be sent per day.')