Exemplo n.º 1
0
 def test_fn_with_kwargs(self):
     """fn() plugin called positionally, with its default, and by keyword."""
     # ``this`` is not defined in this block; presumably pyquery binds it
     # when the plugin is invoked -- confirm against the pyquery docs.
     pq.fn.test = lambda p=1: pq(this).eq(p)
     doc = pq(self.html)
     cases = (
         ((0,), {}, "Coffee"),
         ((), {}, "Tea"),
         ((), {"p": 2}, "Milk"),
     )
     for args, kwargs, wanted in cases:
         self.assertEqual(doc("li").test(*args, **kwargs).text(), wanted)
Exemplo n.º 2
0
    def test_map(self):
        """map() with a named callback, then with a lambda on an empty selection."""
        def last_id_char_minus_one(i, elem):
            # Last character of the element's id, as an int, minus one.
            node_id = self.klass(elem).attr('id')
            return int(node_id[-1]) - 1

        divs = self.klass('div', self.html)
        assert divs.map(last_id_char_minus_one) == [0, 1]

        # The fragment has no <strong>, so the selection is empty.
        fragment = pq('<p>Hello <b>warming</b> world</p>')
        unmatched = fragment('strong')
        self.assertEqual(unmatched.map(lambda i, el: pq(this).text()), [])  # NOQA
Exemplo n.º 3
0
 def test_fn_with_kwargs(self):
     """fn() plugin called positionally, with its default, and by keyword."""
     # ``this`` is not defined in this block; presumably pyquery binds it
     # when the plugin is invoked -- confirm against the pyquery docs.
     pq.fn.test = lambda p=1: pq(this).eq(p)  # NOQA
     doc = pq(self.html)
     cases = (
         ((0,), {}, 'Coffee'),
         ((), {}, 'Tea'),
         ((), {'p': 2}, 'Milk'),
     )
     for args, kwargs, wanted in cases:
         self.assertEqual(doc('li').test(*args, **kwargs).text(), wanted)
Exemplo n.º 4
0
 def test_fn(self):
     """Register ``listOuterHtml`` (example from the `PyQuery.Fn` docs) and use it."""
     # Both lambdas read ``this``, which is not defined in this block
     # (presumably bound by pyquery at call time).
     pq.fn.listOuterHtml = lambda: this.map(lambda i, el: pq(this).outerHtml())  # NOQA
     wanted = ['<li>Coffee</li>', '<li>Tea</li>', '<li>Milk</li>']
     items = pq(self.html)('li')
     self.assertEqual(items.listOuterHtml(), wanted)
Exemplo n.º 5
0
    def test_map(self):
        """map() with a named callback, then with a lambda on an empty selection."""
        def last_id_char_minus_one(i, elem):
            # Last character of the element's id, as an int, minus one.
            node_id = self.klass(elem).attr("id")
            return int(node_id[-1]) - 1

        divs = self.klass("div", self.html)
        assert divs.map(last_id_char_minus_one) == [0, 1]

        # The fragment has no <strong>, so the selection is empty.
        fragment = pq("<p>Hello <b>warming</b> world</p>")
        unmatched = fragment("strong")
        self.assertEqual(unmatched.map(lambda i, el: pq(this).text()), [])  # NOQA
Exemplo n.º 6
0
 def test_replaceWith_with_function(self):
     """replace_with() given a callback: each <a> is replaced by its inner HTML."""
     doc = pq(self.html)
     doc('a').replace_with(lambda i, e: pq(e).html())
     actual = doc.__html__()
     # The literal's whitespace is part of the expected markup.
     wanted = '''<div class="portlet">
   TestimageMy link text
   imageMy link text 2
   Behind you, a three-headed HTML&amp;dash;Entity!
 </div>'''
     assert actual == wanted, (repr(actual), repr(wanted))
Exemplo n.º 7
0
 def test_remove(self):
     """After removing every <img>, check the HTML left in the first and last <a>."""
     doc = pq(self.html)
     doc('img').remove()
     remaining = (
         ('a:first', 'Test My link text'),
         ('a:last', ' My link text 2'),
     )
     for selector, wanted in remaining:
         got = doc(selector).html()
         assert got == wanted, repr(got)
Exemplo n.º 8
0
    def test_filter(self):
        """filter() by class, by id and by a callback; each keeps exactly one div."""
        criteria = ('.node3', '#node2', lambda i: i == 0)
        for criterion in criteria:
            kept = self.klass('div', self.html).filter(criterion)
            assert len(kept) == 1

        # Filtering an empty selection (no <strong> in the fragment) stays empty.
        fragment = pq('<p>Hello <b>warming</b> world</p>')
        self.assertEqual(fragment('strong').filter(lambda el: True), [])
Exemplo n.º 9
0
    def test_filter(self):
        """filter() by class, by id and by a callback; each keeps exactly one div."""
        criteria = ('.node3', '#node2', lambda i: i == 0)
        for criterion in criteria:
            kept = self.klass('div', self.html).filter(criterion)
            assert len(kept) == 1

        # Filtering an empty selection (no <strong> in the fragment) stays empty.
        fragment = pq('<p>Hello <b>warming</b> world</p>')
        self.assertEqual(fragment('strong').filter(lambda el: True), [])
Exemplo n.º 10
0
 def test_soup_parser(self):
     """The 'soup' parser turns malformed markup into a well-formed document."""
     malformed = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
     wanted = ('<html><meta/><head><title>Hello</title></head>'
               '<body onload="crash()">Hi all<p/></body></html>')
     doc = pq(malformed, parser='soup')
     self.assertEqual(str(doc), wanted)
Exemplo n.º 11
0
 def test_selector(self):
     """Select a namespaced element from XML by passing a prefix-to-URI map."""
     doc = pq(b(self.xml), parser='xml')
     prefixes = {'bar': 'http://example.com/bar'}
     text = doc('bar|blah', namespaces=prefixes).text()
     self.assertEqual(repr(text), repr('What'))
Exemplo n.º 12
0
 def test_get(self):
     """Issue a GET with a query dict and inspect the returned page."""
     # NOTE(review): this targets an external URL, so the expected values
     # depend on that site's current content.
     query = {'q': 'inconsistency'}
     page = pq('http://www.theonion.com/search/', query, method='get')
     self.assertEqual(page('input[name=q]:last').val(), 'inconsistency')
     headline = page('.news-in-brief h3').text()
     self.assertEqual(headline, 'Slight Inconsistency Found In Bible')
Exemplo n.º 13
0
 def test_remove(self):
     """After removing every <img>, check the HTML left in the first and last <a>."""
     doc = pq(self.html)
     doc("img").remove()
     remaining = (
         ("a:first", "Test My link text"),
         ("a:last", " My link text 2"),
     )
     for selector, wanted in remaining:
         got = doc(selector).html()
         assert got == wanted, repr(got)
Exemplo n.º 14
0
 def test_serialize(self):
     """serialize() renders the form's fields as a URL-encoded query string."""
     wanted = ('spam=Spam%2Fspam&order=baked%0D%0Abeans&order=tomato&'
               'multiline=multiple%0D%0Alines%0D%0Aof%20text')
     form = pq(self.html4)('form')
     self.assertEqual(form.serialize(), wanted)
Exemplo n.º 15
0
 async def get_flag_text(self, data_url):
     """Open ``data_url`` in the seller-flag page and extract its ``tip`` value.

     Navigation is retried indefinitely, pausing five seconds after each
     ``errors.TimeoutError`` or ``errors.PageError``. Once the page has
     loaded, focus is handed back to ``self.page`` and the text of the
     ``<pre>`` element is searched for ``"tip":"..."}``.

     :param data_url: URL to load in ``self._page_seller_flag``.
     :return: the captured tip string, or ``None`` when the pattern is not
         found (the ``<pre>`` text is logged in that case).
     """
     page = self._page_seller_flag
     net_check()
     while True:
         try:
             await page.bringToFront()
             await page.goto(data_url)
         except (errors.TimeoutError, errors.PageError):
             # Await the pause: a bare ``sleep(5)`` inside a coroutine either
             # blocks the whole event loop (time.sleep) or, if it is a
             # coroutine function, never actually waits.
             await asyncio.sleep(5)
         else:
             break
     await asyncio.sleep(1)
     content = await page.content()
     await asyncio.sleep(2)
     await self.page.bringToFront()
     pre_text = pq(content)("pre").text()
     res = re.search(r'"tip":"(.*?)"}', pre_text)
     if res:
         return res.group(1)
     logger.info(pre_text)
     return None
Exemplo n.º 16
0
 def test_serialize_dict(self):
     """serialize_dict() maps field names to values, using a list for repeated names."""
     wanted = {
         'spam': 'Spam/spam',
         'order': ['baked\r\nbeans', 'tomato'],
         'multiline': 'multiple\r\nlines\r\nof text',
     }
     form = pq(self.html4)('form')
     self.assertEqual(form.serialize_dict(), wanted)
Exemplo n.º 17
0
    def test_filter(self):
        """filter() by class, by id and by a callback; each keeps exactly one div."""
        criteria = (".node3", "#node2", lambda i: i == 0)
        for criterion in criteria:
            kept = self.klass("div", self.html).filter(criterion)
            assert len(kept) == 1

        # Filtering an empty selection (no <strong> in the fragment) stays empty.
        fragment = pq("<p>Hello <b>warming</b> world</p>")
        self.assertEqual(fragment("strong").filter(lambda el: True), [])
Exemplo n.º 18
0
 def test_remove(self):
     """After removing every <img>, check the HTML left in the first and last <a>."""
     doc = pq(self.html)
     doc('img').remove()
     remaining = (
         ('a:first', 'Test My link text'),
         ('a:last', ' My link text 2'),
     )
     for selector, wanted in remaining:
         got = doc(selector).html()
         assert got == wanted, repr(got)
Exemplo n.º 19
0
 def test_val_for_textarea(self):
     """val() on a <textarea>: reading, writing, and the effect on text()."""
     doc = pq(self.html3)

     def textarea():
         # Re-select on every access, as each assertion needs a fresh query.
         return doc('textarea')

     self.assertEqual(textarea().val(), 'Spam')
     self.assertEqual(textarea().text(), 'Spam')

     textarea().val('42')

     self.assertEqual(textarea().val(), '42')
     # Note: jQuery still returns 'Spam' here.
     self.assertEqual(textarea().text(), '42')
Exemplo n.º 20
0
 def _login(self, email, password):
     """Fetch the login page, fill in its form fields, and POST the credentials.

     Every ``<input>`` inside ``#loginForm`` is copied into the payload by
     name, then ``email`` and ``password`` are set to the supplied values.
     """
     self._log("Logging in...")
     response = self._session.get('https://www.fitbit.com/login')
     login_form = pq(response.content).find('#loginForm')
     target = login_form.attr('action')
     payload = {field.name: field.value for field in login_form.find('input')}
     payload['email'] = email
     payload['password'] = password
     self._session.post(target, payload)
Exemplo n.º 21
0
 def test_val_for_textarea(self):
     """val() on a <textarea>: reading, writing, and the effect on text()."""
     doc = pq(self.html3)

     def textarea():
         # Re-select on every access, as each assertion needs a fresh query.
         return doc('textarea')

     self.assertEqual(textarea().val(), 'Spam')
     self.assertEqual(textarea().text(), 'Spam')

     textarea().val('42')

     self.assertEqual(textarea().val(), '42')
     # Note: jQuery still returns 'Spam' here.
     self.assertEqual(textarea().text(), '42')
Exemplo n.º 22
0
def get_url():
    """Return absolute URLs for each ``.sight_item_caption`` on the current page.

    Each caption's ``.name`` href is stringified and appended to the site
    prefix, so a missing href contributes the literal text ``None``.
    """
    captions = pq(driver.page_source).find('.sight_item_caption')
    return [
        'http://piao.qunar.com' + str(caption.find('.name').attr('href'))
        for caption in captions.items()
    ]
Exemplo n.º 23
0
 def test_unicode(self):
     """A non-ASCII document yields text-type HTML and the expected string forms."""
     xml = pq(u("<p>é</p>", 'utf-8'))
     self.assertEqual(type(xml.html()), text_type)
     if not PY3k:
         # Python 2 branch: unicode() keeps the character, str() uses an entity.
         self.assertEqual(unicode(xml), u("<p>é</p>", 'utf-8'))
         self.assertEqual(str(xml), '<p>&#233;</p>')
     else:
         self.assertEqual(str(xml), '<p>é</p>')
Exemplo n.º 24
0
 def test_selector_with_xml(self):
     """Pass a namespaced selector and the XML source to pq() in a single call."""
     matched = pq('bar|blah', b(self.xml), parser='xml',
                  namespaces=self.namespaces)
     text = matched.text()
     self.assertEqual(repr(text), repr('What'))
Exemplo n.º 25
0
 def test_serialize_array(self):
     """serialize_array() returns one name/value dict per submitted field."""
     pairs = (
         ('spam', 'Spam/spam'),
         ('order', 'baked\r\nbeans'),
         ('order', 'tomato'),
         ('multiline', 'multiple\r\nlines\r\nof text'),
     )
     wanted = [{'name': name, 'value': value} for name, value in pairs]
     form = pq(self.html4)('form')
     self.assertEqual(form.serialize_array(), wanted)
Exemplo n.º 26
0
 def test_unicode(self):
     """A non-ASCII document yields text-type HTML and the expected string forms."""
     xml = pq(u("<p>é</p>", 'utf-8'))
     self.assertEqual(type(xml.html()), text_type)
     if not PY3k:
         # Python 2 branch: unicode() keeps the character, str() uses an entity.
         self.assertEqual(unicode(xml), u("<p>é</p>", 'utf-8'))
         self.assertEqual(str(xml), '<p>&#233;</p>')
     else:
         self.assertEqual(str(xml), '<p>é</p>')
Exemplo n.º 27
0
 def test_get(self):
     """GET a page whose URL contains non-ASCII characters and check a nav link.

     When ``HAS_REQUEST`` is false the test is reported as skipped instead
     of returning early, which the runner would count as a pass.
     """
     if not HAS_REQUEST:
         self.skipTest('no requests library')
     d = pq(u('http://ru.wikipedia.org/wiki/Заглавная_страница', 'utf8'),
            method='get')
     print(d)
     self.assertEqual(
         d('#n-mainpage a').text(), u('Заглавная страница', 'utf8'))
Exemplo n.º 28
0
 def test_get(self):
     """GET a page whose URL contains non-ASCII characters and check a nav link.

     When ``HAS_REQUEST`` is false the test is reported as skipped instead
     of returning early, which the runner would count as a pass.
     """
     if not HAS_REQUEST:
         self.skipTest('no requests library')
     d = pq(u('http://ru.wikipedia.org/wiki/Заглавная_страница', 'utf8'),
            method='get')
     print(d)
     self.assertEqual(d('#n-mainpage a').text(),
                      u('Заглавная страница', 'utf8'))
Exemplo n.º 29
0
    def test_get_root(self):
        """``root`` is an element tree for the document and for a child selection."""
        source = b'<?xml version="1.0" encoding="UTF-8"?><root><p/></root>'
        doc = pq(source)
        self.assertIsInstance(doc.root, etree._ElementTree)
        self.assertEqual(doc.encoding, 'UTF-8')

        # A selection derived from the document keeps a parent reference.
        first_child = doc.children().eq(0)
        self.assertNotEqual(first_child._parent, no_default)
        self.assertIsInstance(first_child.root, etree._ElementTree)
Exemplo n.º 30
0
 def test_val_for_multiple_elements(self):
     """val() on several elements: reads the first one, writes to all of them."""
     doc = pq(self.html5)
     # "Get" returns *first* value.
     self.assertEqual(doc('div > *').val(), 'spam')
     # "Set" updates *every* value.
     doc('div > *').val('42')
     for element_id in ('#first', '#second', '#third'):
         self.assertEqual(doc(element_id).val(), '42')
Exemplo n.º 31
0
 def test_val_for_multiple_elements(self):
     """val() on several elements: reads the first one, writes to all of them."""
     doc = pq(self.html5)
     # "Get" returns *first* value.
     self.assertEqual(doc('div > *').val(), 'spam')
     # "Set" updates *every* value.
     doc('div > *').val('42')
     for element_id in ('#first', '#second', '#third'):
         self.assertEqual(doc(element_id).val(), '42')
Exemplo n.º 32
0
    def test_next_all(self):
        """next_all() without a filter, with a filter, and on an empty selection."""
        doc = pq(self.html2)

        counts = (
            ((), 6),        # without filter
            (('dd',), 5),   # with filter
        )
        for filter_args, wanted in counts:
            following = doc('#term-2').next_all(*filter_args)
            self.assertEqual(len(following), wanted)

        # when empty
        self.assertEqual(doc('#NOTHING').next_all(), [])
Exemplo n.º 33
0
def get_comments():
    """Print one dict (user, date, comment) per comment item on the current page."""
    comment_list = pq(driver.page_source).find('.mp-comments-list')
    for entry in comment_list.find('.mp-comments-item').items():
        print({
            'user': entry.find('.mp-comments-username').text(),
            'date': entry.find('.mp-comments-time').text(),
            'comment': entry.find('.mp-comments-desc').text(),
        })
Exemplo n.º 34
0
 def test_serialize_pairs_form_values(self):
     """serialize_pairs() returns (name, value) tuples for the form's fields."""
     wanted = [
         ('spam', 'Spam/spam'),
         ('order', 'baked\r\nbeans'),
         ('order', 'tomato'),
         ('multiline', 'multiple\r\nlines\r\nof text'),
     ]
     form = pq(self.html4)('form')
     self.assertEqual(form.serialize_pairs(), wanted)
Exemplo n.º 35
0
    def test_next_until(self):
        """next_until() without a filter, with a filter, and on an empty selection."""
        doc = pq(self.html2)

        counts = (
            (('dt',), 3),                     # without filter
            (('dt', ':not(.strange)'), 2),    # with filter
        )
        for until_args, wanted in counts:
            following = doc('#term-2').next_until(*until_args)
            self.assertEqual(len(following), wanted)

        # when empty
        self.assertEqual(doc('#NOTHING').next_until('*'), [])
Exemplo n.º 36
0
 def test_session(self):
     """A caller-supplied requests session's headers are sent with the request."""
     if not HAS_REQUEST:
         self.skipTest('no requests library')
     import requests
     session = requests.Session()
     session.headers.update({'X-FOO': 'bar'})
     response_doc = pq(url=self.application_url, data={'q': 'foo'},
                       method='get', session=session)
     self.assertIn('HTTP_X_FOO: bar', response_doc('p').text())
Exemplo n.º 37
0
 def test_serialize_pairs_form_id(self):
     """serialize_pairs() for three selections: '#div', '#dispersed' and '.no-id'."""
     doc = pq(self.html)
     dispersed_orders = ['spam', 'eggs', 'ham', 'tomato', 'baked beans']
     wanted_by_selector = (
         ('#div', []),
         ('#dispersed', [('order', value) for value in dispersed_orders]),
         ('.no-id', [('spam', 'Spam')]),
     )
     for selector, wanted in wanted_by_selector:
         self.assertEqual(doc(selector).serialize_pairs(), wanted)
Exemplo n.º 38
0
 def test_replaceWith(self):
     """replace_with() given a string: every <img> becomes the text 'image'."""
     doc = pq(self.html)
     doc('img').replace_with('image')
     actual = doc.__html__()
     # The literal's whitespace is part of the expected markup.
     wanted = '''<div class="portlet">
   <a href="/toto">TestimageMy link text</a>
   <a href="/toto2">imageMy link text 2</a>
   Behind you, a three-headed HTML&amp;dash;Entity!
 </div>'''
     assert actual == wanted, (repr(actual), repr(wanted))
Exemplo n.º 39
0
def get_comment_last(url):
    """Return the total number of comment pages for ``url``.

    Loads the page, waits for the "next" pager control to be present, then
    takes the second-to-last ``.mp-pager-item`` label as the page count.
    """
    driver.get(url)
    next_button = (By.CSS_SELECTOR, '.mp-pager-next.mp-pager-item')
    wait.until(EC.presence_of_element_located(next_button))
    pager = pq(driver.page_source).find('#pageContainer')
    labels = [entry.text() for entry in pager.find('.mp-pager-item').items()]
    return int(labels[-2])
Exemplo n.º 40
0
 def test_replaceWith(self):
     """replace_with() given a string: every <img> becomes the text 'image'."""
     doc = pq(self.html)
     doc('img').replace_with('image')
     actual = doc.__html__()
     # The literal's whitespace is part of the expected markup.
     wanted = '''<div class="portlet">
   <a href="/toto">TestimageMy link text</a>
   <a href="/toto2">imageMy link text 2</a>
   Behind you, a three-headed HTML&amp;dash;Entity!
 </div>'''
     assert actual == wanted, (repr(actual), repr(wanted))
Exemplo n.º 41
0
def get_securityfocus_url():
    """Use stored SecurityFocus references to discover further links.

    For every ``refs`` document whose host is ``www.securityfocus.com``, the
    page ``<ref>/references`` is fetched. Each link under ``#vulnerability``
    that is not yet stored for that CVE is appended to the CVE's
    ``cve_meta`` document and, when it looks like an absolute URL, inserted
    into ``refs`` as a new document.

    Connection errors are printed and the loop moves on to the next result.

    :return: None
    """
    # scheme://host/...; group(1) is the host. Compiled once, outside the loops.
    url_pattern = re.compile('[a-zA-Z]+://(.*?)/(.*?)')

    results = db.refs.find({'host': 'www.securityfocus.com'})
    # NOTE(review): Cursor.count() is only available in older PyMongo
    # releases -- confirm the installed version before upgrading.
    print('securityfocus:', results.count())
    for result in results:
        url = result['ref'] + '/references'
        headers = {
            'host':
            result['host'],
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        print(url)
        try:
            # The headers were previously built but never sent.
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                continue
            doc = pq(response.text)
            for item in doc('#vulnerability a').items():
                ref = item.attr('href')
                # Only handle links not yet recorded for this CVE.
                already_known = db.refs.find({
                    'ref': ref,
                    'cve': result['cve']
                }).count()
                if already_known != 0:
                    continue

                # Update cve_meta, then insert the new entry into refs.
                cve_meta = db.cve_meta.find_one({'_id': result['cve']})
                cve_meta['refs'].append({'url': ref})
                print(cve_meta)
                modified_count = db.cve_meta.update_one(
                    {'_id': result['cve']}, {'$set': cve_meta})
                print(modified_count)

                match = url_pattern.match(str(ref))
                if match is None:
                    continue
                ref = re.sub(r'\s', '', ref)
                parts = ref.split("/")
                refs = {
                    'ref': ref,
                    'host': str(match.group(1)),
                    # Last path segment, or the one before it when the URL
                    # ends with a slash.
                    'text': parts[-1] if parts[-1] != '' else parts[-2],
                    'cve': result['cve'],
                }
                print(refs)
                db.refs.insert_one(refs)

        except requests.ConnectionError as e:
            print('error', e.args)
Exemplo n.º 42
0
def get_view():
    """Save the name, level and text of each review item on the current page."""
    reviews = pq(broswer.page_source)('.rev-item.comment-item.clearfix')
    for review in reviews.items():
        name = review.find('.name').text()
        level = review.find('.level').text()
        txt = review.find('.rev-txt').text()
        save_to_mongo({'name': name, 'level': level, 'txt': txt})
Exemplo n.º 43
0
 def test_unicode(self):
     """Non-ASCII content survives parsing, stringifying and ``:contains`` selection."""
     xml = pq(u("<html><p>é</p></html>", "utf-8"))
     self.assertEqual(type(xml.html()), text_type)
     if not PY3k:
         # Python 2 branch: unicode() keeps the character, str() uses an entity.
         self.assertEqual(unicode(xml), u("<html><p>é</p></html>", "utf-8"))
         self.assertEqual(str(xml), "<html><p>&#233;</p></html>")
         selector = u('p:contains("é")', "utf8")
         self.assertEqual(str(xml(selector)), "<p>&#233;</p>")
         self.assertEqual(unicode(xml(selector)), u("<p>é</p>", "utf8"))
     else:
         self.assertEqual(str(xml), "<html><p>é</p></html>")
         self.assertEqual(str(xml('p:contains("é")')), "<p>é</p>")
Exemplo n.º 44
0
def get_page_url(content):
    """Return the href of every link inside the ``.paging`` element of ``content``.

    The text of the pager's ``<b>`` element and each href are printed as
    they are encountered.
    """
    paging = pq(content)('.paging')
    print(paging.find('b').text())
    hrefs = []
    for link in paging.find('a').items():
        href = link.attr('href')
        print(href)
        hrefs.append(href)
    return hrefs
Exemplo n.º 45
0
def HouseUrl(url):
    """Load ``url`` after a random pause and return the listing links found on it.

    Sleeps for a random 0-10 seconds, waits for a ``.curr`` element to be
    present, then collects the stringified ``.houseListTitle`` href from
    each ``.house-title`` element.
    """
    time.sleep(random.random() * 10)
    broswer.get(url)
    current_marker = (By.CSS_SELECTOR, '.curr')
    wait.until(EC.presence_of_element_located(current_marker))
    titles = pq(broswer.page_source).find('.house-title')
    return [
        str(title.find('.houseListTitle').attr('href'))
        for title in titles.items()
    ]
Exemplo n.º 46
0
    def test_html_replacement(self):
        """html(value) replaces only the selected element's contents."""
        replacement = 'New <em>Contents</em> New'
        original = '<div>Not Me<span>Replace Me</span>Not Me</div>'

        doc = pq(original)
        doc.find('span').html(replacement)
        result = doc.outerHtml()

        self.assertEqual(result, original.replace('Replace Me', replacement))
        self.assertIn(replacement, result)
Exemplo n.º 47
0
 def test_unicode(self):
     """Non-ASCII content survives parsing, stringifying and ``:contains`` selection."""
     xml = pq(u"<html><p>é</p></html>")
     self.assertEqual(type(xml.html()), text_type)
     if not PY3k:
         # Python 2 branch: text_type() keeps the character, str() uses an entity.
         self.assertEqual(text_type(xml), u"<html><p>é</p></html>")
         self.assertEqual(str(xml), '<html><p>&#233;</p></html>')
         selector = u'p:contains("é")'
         self.assertEqual(str(xml(selector)), '<p>&#233;</p>')
         self.assertEqual(text_type(xml(selector)), u'<p>é</p>')
     else:
         self.assertEqual(str(xml), '<html><p>é</p></html>')
         self.assertEqual(str(xml('p:contains("é")')), '<p>é</p>')
Exemplo n.º 48
0
 def test_val_for_select(self):
     """val() on <select>: initial values, valid updates and non-existing options."""
     doc = pq(self.html4)

     def check(expectations):
         # None must be returned as None; anything else is compared by value.
         for select_id, wanted in expectations:
             current = doc(select_id).val()
             if wanted is None:
                 self.assertIsNone(current)
             else:
                 self.assertEqual(current, wanted)

     check([('#first', None), ('#second', 'eggs'), ('#third', None)])

     doc('#first').val('spam')
     doc('#second').val('bacon')
     doc('#third').val('eggs')  # Selecting non-existing option.
     check([('#first', 'spam'), ('#second', 'bacon'), ('#third', None)])

     doc('#first').val('bacon')  # Selecting non-existing option.
     check([('#first', None)])
Exemplo n.º 49
0
 def test_val_for_inputs(self):
     """val() reads and writes text, checkbox and radio inputs."""
     doc = pq(self.html2)
     selectors = ('input[name="spam"]', 'input[name="eggs"]',
                  'input:checkbox', 'input:radio')
     initial = ('Spam', 'Eggs', 'Bacon', 'Ham')
     updated = ('42', '43', '44', '45')

     for selector, wanted in zip(selectors, initial):
         self.assertEqual(doc(selector).val(), wanted)
     for selector, value in zip(selectors, updated):
         doc(selector).val(value)
     for selector, wanted in zip(selectors, updated):
         self.assertEqual(doc(selector).val(), wanted)
Exemplo n.º 50
0
def test_from_url(url, timeout=1):
    """Collect ``ip:port`` candidates from a web page and test them.

    The page text is split on spaces and re-joined with ``:`` so that an
    address and port separated by a space become ``ip:port``; unique matches
    are then passed to ``test_from_list``.

    :param url: address of the page listing proxies
    :param timeout: second(s), default 1 second
    :return: whatever ``test_from_list`` returns for the candidates

    usage:
    url = 'http://www.ip84.com/pn'
    proxy_list = test_from_url(url)
    or like this: proxy_list = test_from_url(url , timeout=3)
    """
    patt_pp = re.compile(r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d]):\d{1,5}')
    page_text = pq(requests.get(url, verify=True).text).text()
    joined = ':'.join(page_text.split(' '))
    candidates = list(set(patt_pp.findall(joined)))
    return test_from_list(proxy_list=candidates, timeout=timeout)
Exemplo n.º 51
0
def getbyurl_zh(url):
    """Scrape book metadata from a GBK-encoded catalogue page.

    Every ``tr td`` cell whose first three characters match a known label
    has the text after its first ``:`` stored under the mapped key. An
    ``img`` cover URL is then built from the value stored under the
    standard-number key.

    :param url: address of the catalogue page.
    :return: dict of the extracted fields plus ``img``.
    :raises ValueError: if a labelled cell contains no ``:``.
    :raises KeyError: if no cell supplied the standard-number field.
    """
    import requests
    from pyquery.pyquery import PyQuery as pq

    # Three-character cell prefix -> output key. Two labels share one key,
    # so the later cell overwrites the earlier one.
    label_to_field = {
        u"题 名": 'title',
        u"页 码": u'载体形态',
        u"作 者": u'作者',
        u"出版项": u'出版社',
        "ISB": u'标准号',
        u"索取号": 'position',
        u"附注信": u'载体形态',
    }

    map_data = {}
    page = requests.get(url).content.decode('gbk')
    for cell in pq(page)("tr td"):
        # Render the cell once instead of re-parsing it for every use.
        text = pq(cell).text()
        label = text[:3]
        # Single-argument call form prints the same on Python 2 and 3.
        print(label)
        if label in label_to_field:
            map_data[label_to_field[label]] = text[text.index(":") + 1:]
    map_data['img'] = "http://book.bookday.cn/book/cover?isbn=%s&w=100&h=150" % map_data[u'标准号']
    return map_data
Exemplo n.º 52
0
 def test_soup_parser(self):
     """The 'soup' parser turns malformed markup into a well-formed document."""
     malformed = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
     wanted = ('<html><meta/><head><title>Hello</title></head>'
               '<body onload="crash()">Hi all<p/></body></html>')
     doc = pq(malformed, parser='soup')
     self.assertEqual(str(doc), wanted)
Exemplo n.º 53
0
    def handle(self, *args, **kwargs):
        """Crawl paginated resource listings and export them to .xls files.

        Steps, as implemented below:
        1. Ensure the crawl root, a per-day directory and an 'oupu'
           sub-directory exist.
        2. Look up (or create) the system PhoneUserProfile used as crawl owner.
        3. Walk up to 500 listing pages with PhantomJS, parsing table rows
           into ``all_results[provider_name][product_name]``.
        4. Write one workbook per provider (one sheet per product) and
           register each file with ``CrawlExcel``.

        NOTE(review): this is Python 2 code (print statements, iteritems).
        NOTE(review): ``url`` and ``url2`` are not defined in this method;
        presumably module-level constants -- confirm.
        """
        print '开始下载欧普钢网资源单...'
        driver = webdriver.PhantomJS()

        if not os.path.exists(settings.CRAWL_ROOT):
            os.mkdir(settings.CRAWL_ROOT)
            print '新建目录: %s' % settings.CRAWL_ROOT

        today = datetime.datetime.now().strftime('%Y_%m_%d')
        # yesday = datetime.datetime.now() + datetime.timedelta(days=-1)
        # yesday_str = yesday.strftime('%m-%d')
        # Despite its name, this holds *today's* date as MM-DD (the
        # yesterday computation above is commented out).
        yesday_str = datetime.datetime.now().strftime('%m-%d')
        # Matches a run of characters in the range U+4E00..U+9FA5.
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')

        print yesday_str
        today_dir = os.path.join(settings.CRAWL_ROOT, today)

        if not os.path.exists(today_dir):
            os.mkdir(today_dir)
            print '新建目录: %s' % today_dir

        gangyin_dir = os.path.join(today_dir, 'oupu')
        if not os.path.exists(gangyin_dir):
            os.mkdir(gangyin_dir)
            print '新建目录: %s' % gangyin_dir

        # Fetch the system profile that owns crawled files, creating the
        # backing user and profile on first run.
        try:
            profile = PhoneUserProfile.objects.get(nickname=u'欧普钢网资源单', status=2)
        except PhoneUserProfile.DoesNotExist:
            user = User.objects.create_user('__oupu', '__oupu')
            profile = PhoneUserProfile.objects.create(
                user=user,
                phone='-',
                qq='-',
                nickname=u'欧普钢网资源单',
                status=2
            )
            print '系统用户已生成'

        driver.get(url)
        time.sleep(2)
        # This parse result is only used by the commented-out page count
        # below; it is overwritten inside the page loop.
        q = pq(driver.page_source)
        # pages = int(pq(q('.z-end')).attr('id'))
        # Upper bound on pages; the loop may stop earlier via break_out.
        pages = 500
        print '一共%d页' % pages

        # provider_name -> product_name -> list of row dicts
        all_results={}

        for page in range(1, pages+1):
            driver.get(url2+'_%d.html'%page)
            print '第%d页' % page
            time.sleep(2)

            q = pq(driver.page_source)
            q = q('table tr')
            # import pdb
            # pdb.set_trace()
            break_out = False
            # Row indices 1..23 are examined on each page.
            for _ in range(1,24):
                try:
                    # Columns are addressed by position within the row's <td> list.
                    product_name = pq(pq(q('tr')[_])('td')[1]).text()
                    shop_sign = pq(pq(q('tr')[_])('td')[3]).text()
                    spec = pq(pq(q('tr')[_])('td')[2]).text()
                    weight = pq(pq(pq(q('tr')[_])('td')[6]).find('p')[0]).text()
                    price = pq(pq(q('tr')[_])('td')[5]).text()
                    provider_name = pq(pq(pq(q('tr')[_])('td')[9]).find('a')[0]).text()
                    release_time = pq(pq(q('tr')[_])('td')[8]).text()
                    print yesday_str
                    print release_time
                    print provider_name

                    if provider_name == '':
                        # No provider link text: fall back to fixed contact details.
                        provider_name = '欧浦商城'
                        contacts = '欧浦商城华东站热线'
                        phone = '021-60717078-8'

                    else:
                        # A release time with no character matched by
                        # zhPattern that differs from yesday_str ends the
                        # whole crawl (this row and all later pages).
                        match = zhPattern.search(release_time)
                        if match == None:
                            if release_time != yesday_str:

                                break_out = True
                                break
                        contacts = pq(pq(pq(q('tr')[_])('td')[9]).find('em')[0]).text()
                        phone = pq(pq(pq(q('tr')[_])('td')[9]).find('em')[1]).text()
                    warehouse_name = pq(pq(q('tr')[_])('td')[7]).text()
                    manufacturer =  pq(pq(q('tr')[_])('td')[4]).text()

                # NOTE(review): Exception already covers IndexError, so any
                # error while parsing a row silently skips that row.
                except (IndexError,Exception):
                    continue
                if provider_name not in all_results:
                    all_results[provider_name] = {}

                if product_name not in all_results[provider_name]:
                    all_results[provider_name][product_name] = []

                res = {}
                res['product_name'] = product_name
                res['shop_sign'] = shop_sign
                res['spec'] = spec
                res['weight'] = weight
                res['price'] = price
                res['provider_name'] = provider_name
                res['warehouse_name'] = warehouse_name
                res['manufacturer'] = manufacturer
                res['phone'] = phone
                res['contacts'] = contacts
                # Skip exact duplicate rows.
                if res not in all_results[provider_name][product_name]:
                    all_results[provider_name][product_name].append(res)

            if break_out:
                break

        # One workbook per provider, one sheet per product.
        for provider_name, data in all_results.iteritems():
            file_name = u'%s-欧普钢网资源单-%s.xls'%(provider_name, today)
            file_path = os.path.join(gangyin_dir, file_name)

            wb = xlwt.Workbook()
            for product_name, rows in data.iteritems():
                # Strip punctuation from the product name before using it
                # as the sheet name.
                r='[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
                product_name = re.sub(r,'',product_name)
                ws = wb.add_sheet(product_name)
                # Rows 0-1 hold contact details from the first row; row 2
                # is the column header; data rows follow.
                c = 2
                ws .write(0, 0 ,u'联系人:%s'%rows[0]['contacts'])
                ws.write(1, 0, u'电话:%s'%rows[0]['phone'])
                ws.write(c, 0, u'品名')
                ws.write(c, 1, u'牌号')
                ws.write(c, 2, u'规格')
                ws.write(c, 3, u'产地')
                ws.write(c, 4, u'仓库')
                ws.write(c, 5, u'重量')
                ws.write(c, 6, u'价格')
                # ws.write(c, 7, u'说明1')

                for row in rows:
                    c += 1
                    ws.write(c, 0, row['product_name'])
                    ws.write(c, 1, row['shop_sign'])
                    ws.write(c, 2, row['spec'])
                    ws.write(c, 3, row['manufacturer'])
                    ws.write(c, 4, row['warehouse_name'])
                    ws.write(c, 5, row['weight'])
                    ws.write(c, 6, row['price'])
                    # ws.write(c, 7, row['special'])

            wb.save(file_path)
            # Record the saved file, marked as not yet imported.
            CrawlExcel.objects.create(
            create_time=time.time(),
            crawl_user=profile.user,
            source=7,
            # source_id=excel_id,
            filepath=file_path,
            provider=provider_name,
            imported=False
                    )
            print provider_name, file_path
Exemplo n.º 54
0
 def test_selector_with_xml(self):
     """Pass a namespaced selector and the XML source to pq() in a single call."""
     prefixes = {'bar': 'http://example.com/bar'}
     matched = pq('bar|blah', b(self.xml), parser='xml', namespaces=prefixes)
     text = matched.text()
     self.assertEqual(repr(text), repr('What'))
Exemplo n.º 55
0
 def test_html_upper_element_name(self):
     """With the HTML parser, an upper-case tag matches in either letter case."""
     xml = pq('<X>foo</X>', parser='html')
     for tag in ('X', 'x'):
         self.assertEqual(len(xml(tag)), 1)
Exemplo n.º 56
0
 def test_post(self):
     """POST a data dict to the test application; check the echoed method and data."""
     payload = {'q': 'foo'}
     response_doc = pq(self.application_url, payload, method='post')
     for fragment in ('REQUEST_METHOD: POST', 'q=foo'):
         self.assertIn(fragment, response_doc('p').text())
Exemplo n.º 57
0
 def test_get(self):
     """GET the test application with a data dict; check the echoed method and data."""
     payload = {'q': 'foo'}
     response_doc = pq(self.application_url, payload, method='get')
     print(response_doc)
     for fragment in ('REQUEST_METHOD: GET', 'q=foo'):
         self.assertIn(fragment, response_doc('p').text())
Exemplo n.º 58
0
 def test_remove_namespaces(self):
     """After remove_namespaces(), the element is found by its bare tag name."""
     stripped = pq(b(self.xml), parser='xml').remove_namespaces()
     text = stripped('blah').text()
     self.assertEqual(repr(text), repr('What'))
Exemplo n.º 59
0
 def test_xhtml_namespace_html_parser(self):
     """XHTML parsed as HTML: after xhtml_to_html(), a plain 'div' selector matches."""
     doc = pq(self.xhtml, parser='html')
     doc.xhtml_to_html()
     text = doc('div').text()
     self.assertEqual(repr(text), repr('What'))
Exemplo n.º 60
0
 def test_selector_html(self):
     """Select 'blah' from the XML body (declaration stripped) using the HTML parser."""
     # Everything after the first '?>' is the document without its declaration.
     body = self.xml.split('?>', 1)[1]
     matched = pq('blah', body, parser='html')
     self.assertEqual(repr(matched.text()), repr('What'))