def testMechanizeOpener(self):
    """Smoke-test MechanizeOpener against a live site (requires network)."""
    url = 'http://www.baidu.com'
    opener = MechanizeOpener()
    # A plain fetch should contain the host name somewhere in the HTML.
    assert 'baidu' in opener.open(url)
    # The browser-style fetch exposes both the page title and raw body.
    browser = opener.browse_open(url)
    assert u'百度' in browser.title()
    assert 'baidu' in browser.response().read()
def __init__(self, opener=None, url=None, **kw): super(WikiParser, self).__init__(opener=opener, url=url, **kw) if self.opener is None: self.opener = MechanizeOpener() self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL) self.en_time_reg = re.compile( r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}') self.zh_time_reg = re.compile( ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
def setUp(self):
    """Prepare a live-Weibo fixture: bundle, opener, Mongo collection, login."""
    self.test_uid = '1784725941'
    self.bundle = WeiboUserBundle(self.test_uid)
    self.opener = MechanizeOpener()
    self.conn = Connection()
    self.db = self.conn[getattr(user_config.job, 'db')]
    self.collection = self.db.weibo_user

    # At least one configured login account is required for these tests.
    assert user_config.job['login']
    account = user_config.job['login'][0]
    login_hook(self.opener, **account)
def __init__(self, opener=None, url=None, bundle=None, **kwargs):
    """Create a Douban movie parser with a default opener, timeout and logger."""
    super(DoubanMovieParser, self).__init__(opener=opener, url=url, **kwargs)
    if self.opener is None:
        self.opener = MechanizeOpener()
    self.url = url
    self.opener.set_default_timeout(TIMEOUT)
    # Equivalent to: not hasattr(self, 'logger') or self.logger is None.
    if getattr(self, 'logger', None) is None:
        self.logger = get_logger(name='douban_parser')
def setUp(self):
    """Prepare a live-Weibo fixture: bundle, opener, Mongo collections, login."""
    self.test_uid = '1667486960'
    self.bundle = WeiboUserBundle(self.test_uid)
    self.opener = MechanizeOpener()
    self.conn = MongoClient()
    self.db = self.conn[getattr(user_config.job, 'db')]
    self.users_collection = self.db.weibo_user
    self.weibos_collection = self.db.micro_blog

    # NOTE(review): login config is assumed non-empty; an empty
    # 'login' list raises IndexError here.
    account = user_config.job['login'][0]
    login_hook(self.opener, **account)
def refresh_cookies(self, ck_dir):
    """Rebuild the cookie store from account files under *ck_dir*.

    Wipes the global ``cookie_dir``, reads tab-separated
    ``user<TAB>password`` lines from every file below *ck_dir*, logs in
    with each account through a proxied MechanizeOpener, and saves the
    cookies of each successful login as a numbered file in
    ``cookie_dir``.  Successful credentials are appended to
    ``self.validated``.

    :param ck_dir: directory tree containing account credential files.
    """
    idx = 0

    # Start from a clean cookie directory.
    if os.path.isdir(cookie_dir):
        shutil.rmtree(cookie_dir)
    os.mkdir(cookie_dir)

    # Collect (user, password) pairs from the account files.
    accounts = []
    for root, dirs, files in os.walk(ck_dir):
        for filespath in files:
            full_name = os.path.join(root, filespath)
            with open(full_name) as f:
                for line in f.readlines():
                    if not line.strip():
                        continue
                    # Fix: the original did ``u, p = line.split('\t')``,
                    # which raises ValueError on any line without exactly
                    # one tab; skip malformed lines instead.
                    parts = line.split('\t')
                    if len(parts) != 2:
                        continue
                    u, p = parts
                    if u and p:
                        accounts.append((u.strip(), p.strip()))

    # Log in with each account and persist its cookies.
    for u, p in accounts:
        opener = MechanizeOpener(
            user_agent=
            'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
            timeout=10)
        try:
            opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
            lm = WeiboLogin(opener, u, p)
            try:
                status = lm.login()
            except Exception as ex:
                self.logger.warn("login error:%s" % u)
                self.logger.error(ex)
                continue
            if status:
                idx += 1
                opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                               ignore_discard=True,
                               ignore_expires=True)
                self.validated.append("%s\t%s\r\n" % (u, p))
        finally:
            # Fix: the original leaked the opener on login failure --
            # ``continue`` skipped ``opener.close()``.
            opener.close()
def parse(self, url=None):
    """Fetch *url* one or more times through a fresh randomized opener.

    Non-'click' URLs are fetched 2-4 times; 'click' URLs once.  A proxy
    is attached for every non-'click' URL and, with ~5% probability,
    for 'click' URLs too.  Returns the URL that was fetched.
    """
    url = url or self.url
    # 'click' URLs get exactly one hit; everything else 2-4 hits.
    times = 1 if 'click' in url else random.randrange(2, 5)

    self.opener = MechanizeOpener(user_agent=random_user_agent())
    odds = random.randint(0, 100)
    if 'click' not in url or odds <= 5:
        # Route the requests through a rotating proxy when available.
        proxy = get_ip_proxy()
        if proxy:
            self.opener.remove_proxy()
            self.opener.add_proxy(proxy)

    for _ in range(times):
        self.opener.open(url)
        time.sleep(.1)
    return url
def setUp(self):
    """Fetch the zhidao.baidu.com fixture page once (requires network)."""
    self.base_url = 'http://zhidao.baidu.com'
    self.url = 'http://zhidao.baidu.com/question/559110619.html'
    opener = MechanizeOpener()
    self.html = opener.open(self.url)
from cola.core.opener import MechanizeOpener
import re

# Compile once instead of re-parsing the pattern on every iteration.
INPUT_TAG_RE = re.compile('<input .+?>')

for i in range(100):
    # Fix: the original never closed the opener, leaking one per loop.
    opener = MechanizeOpener()
    try:
        html = opener.open('https://google.com')
    finally:
        opener.close()
    # Fix: the original called .group() directly on re.search(), which
    # raises AttributeError when the page has no <input> tag.
    match = INPUT_TAG_RE.search(html)
    if match is None:
        print('no <input> tag found')
        continue
    print(match.group())