Example #1
    def testMechanizeOpener(self):
        test_url = 'http://www.baidu.com'
        opener = MechanizeOpener()

        # open() returns the fetched page body as a string
        assert 'baidu' in opener.open(test_url)

        # browse_open() returns a mechanize browser pointed at the same URL
        br = opener.browse_open(test_url)
        assert u'百度' in br.title()
        assert 'baidu' in br.response().read()
Example #2
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)

        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(
            r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(
            ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
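The two timestamp patterns above are easier to read against concrete inputs. A minimal standalone check, with sample strings invented for illustration (Python 2 syntax, matching the ur'' literal above):

# -*- coding: utf-8 -*-
import re

en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')

# hypothetical revision timestamps shaped like wiki page footers
assert en_time_reg.search(u'3 January 2013 at 10:05')
assert zh_time_reg.search(u'2013年1月3日 (四) 10:05')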
Example #3
    def setUp(self):
        self.test_uid = '1784725941'
        self.bundle = WeiboUserBundle(self.test_uid)
        self.opener = MechanizeOpener()

        self.conn = Connection()
        self.db = self.conn[getattr(user_config.job, 'db')]
        self.collection = self.db.weibo_user

        assert len(user_config.job['login']) > 0

        login_hook(self.opener, **user_config.job['login'][0])
Example #4
    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(DoubanMovieParser, self).__init__(opener=opener,
                                                url=url,
                                                **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()

        self.url = url

        self.opener.set_default_timeout(TIMEOUT)

        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='douban_parser')
Example #5
    def setUp(self):
        self.test_uid = '1667486960'
        self.bundle = WeiboUserBundle(self.test_uid)
        self.opener = MechanizeOpener()

        self.conn = MongoClient()
        self.db = self.conn[getattr(user_config.job, 'db')]
        self.users_collection = self.db.weibo_user
        self.weibos_collection = self.db.micro_blog

        # assert len(user_config.job['login']) > 0

        login_hook(self.opener, **user_config.job['login'][0])
Example #6
    def refresh_cookies(self, ck_dir):
        """
        Refresh the cookie store: wipe ``cookie_dir`` (defined outside this
        method), re-login every account listed in the files under ``ck_dir``,
        and save one cookie file per successful login.
        """
        idx = 0
        # delete all existing cookie files
        if os.path.isdir(cookie_dir):
            shutil.rmtree(cookie_dir)
        os.mkdir(cookie_dir)
        # collect (username, password) pairs from the tab-separated
        # account files under ck_dir, skipping blank lines
        accounts = []
        for root, dirs, files in os.walk(ck_dir):
            for filespath in files:
                full_name = os.path.join(root, filespath)
                with open(full_name) as f:
                    for line in f:
                        if line.strip():
                            u, p = line.split('\t')
                            if u and p:
                                accounts.append((u.strip(), p.strip()))
        # log in with each account and save the resulting cookies
        for u, p in accounts:
            opener = MechanizeOpener(
                user_agent='Baiduspider+(+http://www.baidu.com/search/spider.htm)',
                timeout=10)
            opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
            lm = WeiboLogin(opener, u, p)
            try:
                status = lm.login()
            except Exception as ex:
                self.logger.warn("login error: %s" % u)
                self.logger.error(ex)
                continue
            if status:
                idx += 1
                opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                               ignore_discard=True,
                               ignore_expires=True)
                self.validated.append("%s\t%s\r\n" % (u, p))
            opener.close()
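The saved files can be consumed later by reloading the jar the same way it was written. A minimal sketch, assuming opener.cj is a cookielib file-backed cookie jar (the example above only exercises its save() method):

import os

from cola.core.opener import MechanizeOpener

opener = MechanizeOpener()
# load() is the counterpart of the save() call above; the same flags apply
opener.cj.load(os.path.join(cookie_dir, '1.txt'),  # cookie_dir as above
               ignore_discard=True, ignore_expires=True)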
Example #7
    def parse(self, url=None):
        url = url or self.url
        # 'click' URLs are fetched at most once; everything else 2-4 times
        if 'click' not in url:
            times = random.randrange(2, 5)
        else:
            times = 1
        i = 0

        self.opener = MechanizeOpener(user_agent=random_user_agent())
        odds = random.randint(0, 100)
        if 'click' not in url or odds <= 5:
            # swap in a fresh proxy when one is available
            p_ = get_ip_proxy()
            if p_:
                self.opener.remove_proxy()
                self.opener.add_proxy(p_)
            while i < times:
                self.opener.open(url)
                i += 1
                time.sleep(.1)

        return url
Example #8
    def setUp(self):
        self.base_url = 'http://zhidao.baidu.com'
        self.url = 'http://zhidao.baidu.com/question/559110619.html'
        self.html = MechanizeOpener().open(self.url)
Example #9
import re

from cola.core.opener import MechanizeOpener


for i in range(100):
    # open() returns the page body as a string, not a browser object
    html = MechanizeOpener().open('https://google.com')
    # re.search() returns None when nothing matches, so guard before .group()
    match = re.search('<input .+?>', html)
    if match:
        print(match.group())
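Since browse_open() returns a mechanize browser (see Example #1), an alternative sketch is to let mechanize parse the forms instead of regexing the raw HTML; this assumes the target page serves plain, mechanize-parsable HTML forms:

from cola.core.opener import MechanizeOpener

br = MechanizeOpener().browse_open('https://google.com')
for form in br.forms():
    # mechanize parses <form> and <input> elements itself
    print(form)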