示例#1
0
文件: ganji.py 项目: 61--/crawler
class Ganji(object):
    """Crawler for listing pages on bj.ganji.com.

    All HTTP traffic goes through ``self.web_page`` (a ``WebPage`` instance).
    """

    # Listing-page URL; ``{0}`` is replaced by the 1-based page number.
    LIST_URL_TEMPLATE = r'http://bj.ganji.com/fang1/haidian/a1o{0}m1/'

    def __init__(self):
        self.web_page = WebPage()

    def view_person_all(self, page=30):
        """Walk the first ``page`` listing pages and show the useful posts.

        For every listing page the post URLs are extracted with
        ``module.get_post_urls``; each post is fetched and, when
        ``module.check_useful`` accepts its text, passed to
        ``module.show_link``.

        Args:
            page: number of listing pages to visit, starting at page 1.
        """
        for page_index in range(page):
            page_url = self.LIST_URL_TEMPLATE.format(page_index + 1)
            listing = self._get_page(page_url)
            urls = module.get_post_urls(listing.text)
            # Single-argument parenthesised print behaves the same on
            # Python 2 and Python 3.
            print(urls)
            for url in urls:
                post = self._get_page(url)
                if module.check_useful(post.text):
                    module.show_link(url)

    def _get_page(self, url):
        """Fetch ``url`` through ``self.web_page`` and return the response.

        NOTE(review): the original code had an empty branch for responses
        whose final URL contains 'confirm' (presumably a confirmation /
        anti-crawler page). No handling was ever implemented, so the
        response is returned unchanged.
        """
        return self.web_page.get(url)
示例#2
0
文件: douban.py 项目: 61--/crawler
class Douban(object):
    """Small Douban client: log in and list discussion posts of a group.

    All HTTP traffic goes through ``self.web_page`` (a ``WebPage`` instance).
    """

    def __init__(self):
        self.web_page = WebPage()

    def login(self, email, password):
        """Log in to Douban with ``email`` and ``password``.

        The login page is fetched first. When it contains a captcha image,
        the image is downloaded and handed to ``module.captcha_input``; the
        answer is submitted together with the captcha id found in the page.

        Returns:
            True when the URL of the final response does not contain
            'login'; False otherwise, or when no captcha answer was given.
        """
        print('do douban login')
        login_url = r'http://www.douban.com/accounts/login'

        r = self.web_page.get(login_url)
        data = {
            'source': 'simple',
            'redir': 'http://www.douban.com',
            'form_email': email,
            'form_password': password,
            'remember': 'on',
            'user_login': u'登录',
        }
        captcha_url = self._get_captcha_img_url(r.text)
        if captcha_url:
            print(u'登录需要验证码')
            img_r = self.web_page.get(captcha_url)
            captcha_solution = module.captcha_input(img_r.content)
            if not captcha_solution:
                print('input captcha code error')
                return False

            data.update({
                'captcha-id': self._get_captcha_id(r.text),
                'captcha-solution': captcha_solution,
            })

        # The form is posted back to the same URL it was loaded from.
        r = self.web_page.post(login_url, data)
        return self._check_log_success(r)

    def visit_group(self, group_url, max_page=15):
        """Show the useful posts from the first ``max_page`` discussion pages.

        Args:
            group_url: base URL of the group, with or without a trailing
                slash.
            max_page: number of discussion pages to visit.
        """
        # Without the separator the path would become '<group>discussion'.
        if not group_url.endswith('/'):
            group_url += '/'
        for i in range(max_page):
            # The start offset advances by 25 per page (presumably the
            # number of topics Douban lists on one page).
            url = group_url + r'discussion?start={0}'.format(i * 25)
            print(url)
            page_r = self.web_page.get(url)
            post_urls = module.get_useful_post_url(page_r.text)
            module.show_link(post_urls)

    def _find_tag_attr(self, page_text, marker, attr_name):
        """Return attribute ``attr_name`` of the first tag containing ``marker``.

        The search is confined to a single tag (no '<' or '>' may appear
        between the tag's brackets), and the attribute value is matched
        non-greedily so that later attributes are not swallowed.

        Returns:
            The attribute value, or None when no such tag or attribute
            exists.
        """
        tag = re.search(
            r'<([^<>]*)' + re.escape(marker) + r'([^<>]*)>', page_text)
        if not tag:
            return None
        attr_pattern = re.escape(attr_name) + r'="(.*?)"'
        # The attribute may sit before or after the marker inside the tag.
        for part in tag.groups():
            found = re.search(attr_pattern, part)
            if found:
                return found.group(1)
        return None

    def _get_captcha_img_url(self, page_text):
        """Return the ``src`` of the captcha image tag, or None if absent."""
        return self._find_tag_attr(
            page_text, 'class="captcha_image"', 'src')

    def _get_captcha_id(self, page_text):
        """Return the ``value`` of the captcha-id field, or None if absent."""
        return self._find_tag_attr(page_text, 'name="captcha-id"', 'value')

    def _check_log_success(self, r):
        """Return True unless the response URL still contains 'login'."""
        return 'login' not in r.url
示例#3
0
文件: main.py 项目: 61--/crawler
def test_proxy():
    """Fetch the Baidu home page through a fixed proxy and print its text.

    Manual smoke check: it performs a real request through the hard-coded
    proxy address, so it only works while that proxy is reachable.
    """
    web_page = WebPage()
    web_page.set_proxy('204.12.223.173:7808')
    r = web_page.get(r'http://www.baidu.com/')
    # Single-argument parenthesised print behaves the same on Python 2
    # and Python 3.
    print(r.text)