예제 #1
0
    def extract(self):
        """Build a ContentItem from the blog article markup in ``self.html``.

        Side effects: ``self.html`` is overwritten with a copy that has HTML
        comments removed, and ``self.title``, ``self.author``,
        ``self.content`` and ``self.release_time`` are set.

        Returns:
            The populated ContentItem. Its ``content`` is the article node's
            HTML with every literal ``%`` doubled and every ``<img>`` ``src``
            replaced by ``%s``; ``image_urls`` holds the original ``src``
            values of the images inside the article node.
        """
        # DOTALL lets "." cross newlines, so comments that span several lines
        # are stripped as well instead of being left in the markup.
        self.html = re.compile(r"<!--.*?-->", re.S).sub("", self.html)
        doc = PyQuery(self.html)
        content_node = doc("div#blog_article_content")

        # Double literal percent signs before the "%s" placeholders go in
        # (presumably the content is later filled in with %-formatting).
        content = re.sub("%", "%%", content_node.outerHtml())
        content_doc = PyQuery(content)
        content_doc("img").attr("src", "%s")

        item = ContentItem()
        item["title"] = self.title = doc("div.blog_main_left_content").find("h3").text()
        item["author"] = self.author = doc("div#common_person_blogtitle")("div#title01")("a").text()

        item["content"] = self.content = content_doc.outerHtml()

        self.release_time = doc("div.blog_main_time").find("p").text().strip()
        item["release_time"] = self.release_time

        item["source"] = u"凤凰网"
        item["pic_url"] = ""

        # Taken from the untouched article node, so these are the real URLs
        # rather than the "%s" placeholders written into content_doc.
        item["image_urls"] = [img.get("src") for img in content_node("img")]

        return item
 def render(self, edit=False):
     """Render the current layout with each mapped widget appended to its column.

     ``self.widget_map`` maps a column id to a sequence of widget ids. Every
     id that can be looked up on ``self`` is rendered and appended to the
     element whose id equals the column; ids that raise ``KeyError`` are
     skipped.

     :param edit: when true, widgets are wrapped in the editing markup
         (title head plus content); otherwise in the plain view markup.
     :return: the outer HTML of the resulting document.
     """
     if edit:
         template = """
         <div id="%(wid)s" class="widget">
           <div class="widget-head"><h3>%(title)s</h3></div>
           <div class="widget-content">%(content)s</div>
         </div>
         """
     else:
         template = """
         <div id="%(wid)s" class="view-widget">
           %(content)s
         </div>
         """

     page = PyQuery(self.current_layout.render())
     for column, widget_ids in self.widget_map.items():
         for widget_id in widget_ids:
             try:
                 widget = self[widget_id]
             except KeyError:
                 # Unknown widget id: nothing to render for it.
                 continue
             values = dict(
                 col=column,
                 wid=widget_id,
                 title=widget.title,
                 content=widget.render(),
                 url=widget.absolute_url(),
             )
             page('#%s' % column).append(template % values)
     return page.outerHtml()
예제 #3
0
 def render(self, edit=False):
     """Return the layout HTML with all known widgets placed in their columns.

     Widgets listed in ``self.widget_map`` (column id -> widget ids) are
     looked up on ``self``; missing ones (``KeyError``) are ignored. Each
     found widget is rendered into a wrapper and appended to the element
     whose id is the column id.

     :param edit: selects the editing wrapper (with a title bar) instead of
         the read-only wrapper.
     :return: outer HTML of the populated layout.
     """
     layout_html = self.current_layout.render()

     # Both wrappers are spelled out up front; their inner whitespace is part
     # of the emitted markup and is kept exactly as it was.
     view_wrapper = """
         <div id="%(wid)s" class="view-widget">
           %(content)s
         </div>
         """
     edit_wrapper = """
         <div id="%(wid)s" class="widget">
           <div class="widget-head"><h3>%(title)s</h3></div>
           <div class="widget-content">%(content)s</div>
         </div>
         """
     wrapper = view_wrapper if not edit else edit_wrapper

     document = PyQuery(layout_html)
     placements = (
         (column_id, wid)
         for column_id, wids in self.widget_map.items()
         for wid in wids
     )
     for column_id, wid in placements:
         try:
             found = self[wid]
         except KeyError:
             continue
         context = {
             'col': column_id,
             'wid': wid,
             'title': found.title,
             'content': found.render(),
             'url': found.absolute_url(),
         }
         document('#%s' % column_id).append(wrapper % context)
     return document.outerHtml()
예제 #4
0
def test_django_templatevar_conversion():
    """A ``{{ STATIC_URL }}`` variable in an href survives a PyQuery round trip.

    The markup is serialised through PyQuery and then passed to
    ``repair_django_tags``; the result must equal the input exactly.
    """
    producer = create_htmlproducer()
    original_markup = """<div class="something">
<div class="test"></div>
<a href="{{ STATIC_URL }}docson/widget.js">text</a>
</div>"""
    serialised = PyQuery(original_markup).outerHtml()
    assert original_markup == producer.repair_django_tags(serialised)
예제 #5
0
def test_django_templatetag_url_conversion():
    """A ``{% url ... %}`` tag in an href survives a PyQuery round trip.

    After serialising the markup with PyQuery, ``repair_django_tags`` must
    give back the original text unchanged.
    """
    html_producer = create_htmlproducer()
    source_html = """<div class="something">
<div class="test"></div>
<a href="{% url 'schema' release_name 'release_package' %}">text</a>
</div>"""
    round_tripped = PyQuery(source_html).outerHtml()
    repaired = html_producer.repair_django_tags(round_tripped)
    assert source_html == repaired
예제 #6
0
    def multi(self, url):
        """Fetch a listing page and return the products found on it.

        A non-200 response is logged and returned as an error payload
        (``successful=False``). Otherwise every ``section
        article.product-look`` element becomes a dict with ``url``, ``img``,
        ``name`` and ``price``, and the list is returned with
        ``successful=True``. Exceptions propagate to the caller unchanged.
        """
        resp = self.session.get(url)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        if status_code != 200:
            failure_log = json.dumps(dict(
                time=time.time(),
                title=pqhtml('title').text(),
                url=url,
            ))
            self.logger.info(failure_log)

            error_payload = tool.get_error(
                code=status_code,
                message='status_code Error',
                backUrl=resp.url,
                html=pqhtml.outerHtml(),
            )
            return tool.return_data(successful=False, data=error_payload)

        # One dict per product tile, fields read in a fixed order.
        plist = [
            {
                'url': tile('a:first').attr("href"),
                'img': tile('picture img:first').attr("srcset"),
                'name': tile('hgroup.look-name').text(),
                'price': tile("span.price").text(),
            }
            for tile in pqhtml("section article.product-look").items()
        ]

        success_log = json.dumps(dict(
            time=time.time(),
            count=len(plist),
            title=pqhtml('title').text(),
            url=url,
        ))
        self.logger.info(success_log)

        return tool.return_data(successful=True, data=plist)
 def convert_md_to_html(self, mdcontent, outer_menu_html, inner_menu_html):
     """Convert Markdown to the final page HTML and its rendered menu.

     The Markdown is converted with the footnotes, sane_lists and toc
     extensions and parsed with PyQuery. The TOC helper and the JSON/CSV
     include helpers are then run on the parsed document, the serialised
     HTML is passed through ``repair_django_tags``, and the result is
     rendered into ``main/menu_content.html`` with both menus.

     :return: ``(rendered_html, rendered_menu)``.
     """
     dom = PyQuery(
         markdown(mdcontent, extensions=['footnotes', 'sane_lists', 'toc']))
     rendered_menu = self.extract_toc_to_html(dom)
     self.insert_included_json(dom)
     self.insert_included_csv(dom)

     template_context = {
         'outer_menu': outer_menu_html,
         'inner_menu': inner_menu_html,
         'html_content': self.repair_django_tags(dom.outerHtml()),
     }
     rendered_html = render_to_string('main/menu_content.html', template_context)
     return rendered_html, rendered_menu
예제 #8
0
def clean_links(obj, url, html_raw_response=False):
    """
    Rewrite (internal) links to make them absolute.

    1. external links are not changed
    2. prepend URL to links that are just fragments (e.g. #section)
    3. prepend URL (without filename) to internal relative links

    Anchors that have no ``href`` attribute (e.g. ``<a name="top">``) are
    left untouched instead of raising ``KeyError``.

    :param obj: HTML content (anything ``PQ`` accepts) whose links are rewritten.
    :param url: URL of the document the content came from; when ``None``
        the parsed content is returned without any rewriting.
    :param html_raw_response: when true (and ``url`` is given) return the
        outer HTML string instead of the PyQuery object.
    :returns: the PyQuery object, or its outer HTML string.
    """

    # TODO: do not depend on PyQuery
    obj = PQ(obj)

    if url is None:
        return obj

    # The document URL is the same for every link, so parse it once.
    page_url = urlparse(url)
    page_prefix = page_url.geturl()

    if page_url.path.endswith('/'):
        # ``url`` already points at a directory (HTMLDir style).
        directory_prefix = page_prefix
    else:
        # internal relative link
        # href="../../another.html" and ``url`` is not HTMLDir
        # (e.g. /en/latest/deep/internal/section/page.html)
        # we want to remove the trailing filename (page.html) and use the rest as base URL
        # The resulting absolute link should be
        # https://slug.readthedocs.io/en/latest/deep/internal/section/../../another.html
        #
        # ``rpartition`` (unlike unpacking ``rsplit('/', 1)``) also copes with
        # an empty path such as "https://example.com".
        parent_path = page_url.path.rpartition('/')[0]
        directory_prefix = page_url._replace(path=parent_path + '/').geturl()

    for link in obj.find('a'):
        href = link.attrib.get('href')
        if href is None:
            # Named anchor or otherwise href-less <a>: nothing to rewrite.
            continue

        parsed_href = urlparse(href)
        if parsed_href.scheme or parsed_href.path.startswith('/'):
            # don't change external links
            continue

        if not parsed_href.path and parsed_href.fragment:
            # href="#section-link"
            link.attrib['href'] = page_prefix + href
            continue

        # append the value of href (../../another.html) to the base URL.
        link.attrib['href'] = directory_prefix + href

    if html_raw_response:
        return obj.outerHtml()

    return obj
 def convert_md_to_html(self, mdcontent, outer_menu_html, inner_menu_html):
     """Turn Markdown source into rendered page HTML plus its menu.

     Steps, in order: Markdown -> HTML (footnotes, sane_lists, toc), parse
     with PyQuery, run ``extract_toc_to_html``, ``insert_included_json`` and
     ``insert_included_csv`` on the parsed document, serialise, run
     ``repair_django_tags``, and render ``main/menu_content.html``.

     :return: tuple of the rendered template and the rendered menu.
     """
     markdown_extensions = ['footnotes', 'sane_lists', 'toc']
     document = PyQuery(markdown(mdcontent, extensions=markdown_extensions))

     menu = self.extract_toc_to_html(document)
     self.insert_included_json(document)
     self.insert_included_csv(document)

     body_html = self.repair_django_tags(document.outerHtml())
     page_html = render_to_string('main/menu_content.html', {
         'outer_menu': outer_menu_html,
         'inner_menu': inner_menu_html,
         'html_content': body_html,
     })
     return page_html, menu
예제 #10
0
def sanitise(text, markdown=False):
    """Strip unwanted links, tags and attributes from ``text``.

    ``javascript:`` links are replaced by their text, every selector in
    ``UNCLEAN_TAGS`` is removed and every attribute in ``UNCLEAN_ATTRS`` is
    dropped.

    :param markdown: when true, ``text`` is first converted with ``md`` and
        the cleaned HTML is converted back with ``HTML2Text``.
    :return: the cleaned HTML, or the ``HTML2Text`` output when ``markdown``
        is true.
    """
    if markdown:
        text = md(text)

    tree = PyQuery(text)

    # Keep the visible text of javascript: links, drop the link itself.
    for node in tree.find('a[href^="javascript:"]'):
        js_link = PyQuery(node)
        js_link.replaceWith(js_link.text())

    for selector in UNCLEAN_TAGS:
        tree.find(selector).remove()

    for attribute in UNCLEAN_ATTRS:
        tree.find('[%s]' % attribute).removeAttr(attribute)

    cleaned = tree.outerHtml()
    if not markdown:
        return cleaned

    converter = HTML2Text()
    return converter.handle(cleaned)
예제 #11
0
	def sanitise(self, text, markdown = True):
		"""Clean ``text`` of javascript: links, unclean tags and unclean attributes.

		With ``markdown`` true (the default) the input is converted with
		``md`` first and the cleaned HTML is passed through ``HTML2Text``
		before being returned; otherwise the cleaned HTML is returned as is.
		"""
		source = md(text) if markdown else text
		document = PyQuery(source)

		# javascript: links collapse to their plain text.
		for element in document.find('a[href^="javascript:"]'):
			anchor = PyQuery(element)
			anchor.replaceWith(anchor.text())

		for tag_selector in UNCLEAN_TAGS:
			document.find(tag_selector).remove()

		for attr_name in UNCLEAN_ATTRS:
			document.find('[%s]' % attr_name).removeAttr(attr_name)

		html = document.outerHtml()
		if markdown:
			return HTML2Text().handle(html)
		return html
예제 #12
0
def prepare_html(fileobj):
    """Prepare the html for wordpress pages.

    Header links and the first section heading are removed, a
    "Last update" footer carrying today's date is appended to
    ``div.content``, and internal ``.html`` links are rewritten to
    directory-style URLs.

    :param fileobj: iterable of lines, passed to ``strip_if_not_pre``.
    :return: ``(html, title)`` where ``title`` is the text of the first
        ``div.section h1`` or ``""`` when there is none.
    """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))

    pq("a.headerlink").remove()
    # Do we want title at all?
    headings = pq("div.section h1")
    if headings:
        title = headings[0].text
        pq("div.section h1:first").remove()
    else:
        title = ""

    # TODO: insert toc (??)

    out = PyQuery(pq("div.content").outerHtml())
    out.append("<p><small>Last update: %s</small></p>" % (
        datetime.datetime.now().strftime("%Y-%m-%d")))
    out = out.outerHtml()

    # Replace .html with / and index.html with a simple ./ ; order matters,
    # later rules work on the output of earlier ones. Raw strings and
    # escaped dots make each rule match a literal ".html" only.
    link_rewrites = (
        # dir/index.html -> dir/
        (r'(internal" href=".[^"]*)index\.html"', r'\1"'),
        # index.html -> ./  (the class attribute stays terminated)
        (r'internal" href="index\.html"', 'internal" href="./"'),
        # page.html -> page/
        (r'(internal" href="[^"]*)\.html"', r'\1/"'),
        # page.html#frag -> page/#frag
        (r'(internal" href="[^"]*)\.html#([^"]*)"', r'\1/#\2"'),
        # dir/index/#frag -> dir/#frag  (group 1 already ends with "/")
        (r'(internal" href="[^"]*/)index/#([^"]*)"', r'\1#\2"'),
    )
    for pattern, replacement in link_rewrites:
        out = re.sub(pattern, replacement, out)

    return (out, title)
예제 #13
0
def prepare_html(fileobj):
    """Prepare the html for wordpress pages.

    Header links and the first section heading are removed, the text of
    ``p.meta`` is appended to ``div.content`` as a small footer, and
    internal ``.html`` links are rewritten to directory-style URLs.

    :param fileobj: iterable of lines, passed to ``strip_if_not_pre``.
    :return: ``(html, title)`` where ``title`` is the text of the first
        ``div.section h1`` or ``""`` when there is none.
    """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))

    pq("a.headerlink").remove()
    # Do we want title at all?
    headings = pq("div.section h1")
    if headings:
        title = headings[0].text
        pq("div.section h1:first").remove()
    else:
        title = ""

    # TODO: insert toc (??)

    out = PyQuery(pq("div.content").outerHtml())
    out.append("<p><small>%s</small></p>" % pq("p.meta").text())
    out = out.outerHtml()

    # Replace .html with / and index.html with a simple ./ ; order matters,
    # later rules work on the output of earlier ones. Raw strings and
    # escaped dots make each rule match a literal ".html" only.
    link_rewrites = (
        # dir/index.html -> dir/
        (r'(internal" href=".[^"]*)index\.html"', r'\1"'),
        # index.html -> ./  (the class attribute stays terminated)
        (r'internal" href="index\.html"', 'internal" href="./"'),
        # page.html -> page/
        (r'(internal" href="[^"]*)\.html"', r'\1/"'),
        # page.html#frag -> page/#frag
        (r'(internal" href="[^"]*)\.html#([^"]*)"', r'\1/#\2"'),
        # dir/index/#frag -> dir/#frag  (group 1 already ends with "/")
        (r'(internal" href="[^"]*/)index/#([^"]*)"', r'\1#\2"'),
    )
    for pattern, replacement in link_rewrites:
        out = re.sub(pattern, replacement, out)

    return (out, title)
예제 #14
0
파일: newbalance.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape a product page and return its normalised detail data.

        Args:
            url: address of the product page.

        Returns:
            The value of ``tool.return_data``: ``successful=False`` with an
            off-shelf payload (HTTP 404, or no orderable variants) or an
            error payload (any other non-200 status); otherwise
            ``successful=True`` with the ``detail`` dict built below.

        Exceptions raised while fetching or parsing propagate unchanged.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        # Off the shelf: the page no longer exists.
        if status_code == 404:

            log_info = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))

            self.logger.info(log_info)

            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

            return tool.return_data(successful=False, data=data)

        # Any other HTTP error.
        if status_code != 200:

            log_info = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))

            self.logger.info(log_info)

            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())

            return tool.return_data(successful=False, data=data)

        # Preparation.
        area = pqhtml('#container')
        domain = tool.get_domain(url)
        pdata = self.get_pdata(pqhtml, domain)

        # Off the shelf: nothing can be ordered.
        if not pdata['hasOrderableVariants']:

            log_info = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))

            self.logger.info(log_info)

            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

            return tool.return_data(successful=False, data=data)

        detail = dict()

        # Brand
        brand = area('.product-meta').attr('data-brand')
        detail['brand'] = brand

        # Name
        detail['name'] = area('.product-meta').attr('data-productname')

        # Currency
        currency = re.search(r's\["currencyCode"\]="(\w{3})";',
                             pqhtml('script').text()).groups()[0]
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Price and sizes come from the parsed product data.
        price, sizes = self.get_info(pdata)

        # Price
        detail['price'] = price

        # List price: first run of digits/dots in the price note, falling
        # back to the sale price. The previous pattern had no capture group
        # (so ``.groups()[0]`` raised IndexError for every non-empty note)
        # and no quantifier (so it could only ever match two characters).
        ptxt = area('.pricenotebucket').text()
        list_price_match = re.search(r'\d[\d.]*', ptxt) if ptxt else None
        listPrice = list_price_match.group(0) if list_price_match else price

        detail['listPrice'] = listPrice

        # Color
        status, color, imgs = self.get_color(pdata)
        detail['color'] = color
        detail['colorId'] = dict((key, key) for key in color.keys())

        # Keys
        detail['keys'] = color.keys()

        # Images: a plain list yields its first entry, a per-color mapping
        # yields the first image of each color.
        detail['img'] = imgs[0] if isinstance(imgs, list) else dict(
            [(cId, imgArr[0]) for cId, imgArr in imgs.items()])
        detail['imgs'] = imgs

        # Product ID
        productId = area('.product-meta').attr('data-pid')
        detail['productId'] = productId

        # Sizes
        detail['sizes'] = sizes

        # Description
        detail['descr'] = area(
            'section.product-details .longdescription').text()

        # Details
        detail['detail'] = area('section.product-details').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = status

        # Final URL of the response
        detail['backUrl'] = resp.url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #15
0
    def detail(self,url):
        """Scrape a product page and return its normalised detail data.

        Fetches ``url`` with ``self.session``, parses the body with PyQuery
        and returns the value of ``tool.return_data``: ``successful=False``
        with an off-shelf payload (HTTP 404 or empty product data) or an
        error payload (other non-200 status), otherwise ``successful=True``
        with the ``detail`` dict built below. Any exception is re-raised
        unchanged.
        """
        try:

            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            # Off the shelf: the page no longer exists.
            if status_code == 404 :

                log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,message=self.cfg.SOLD_OUT,backUrl=resp.url, html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # Any other HTTP error.
            if status_code != 200 :

                log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get('SCERR','ERROR'), backUrl=resp.url, html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # Preparation: product data is extracted from the text of the
            # page's <script> elements.
            Jtxt = pqhtml('script').text()
            pdata = self.get_pdata(Jtxt)
            area = pqhtml('#detail-display-wrapper')

            # Off the shelf: no product data (this branch does not log).
            if not pdata :
                data = tool.get_off_shelf(code=status_code,message=self.cfg.SOLD_OUT,backUrl=resp.url, html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            detail = dict()

            # Name
            detail['name'] = pqhtml('h2.detail-title').text()

            # Brand
            detail['brand'] = self.get_brand(area)

            # Currency code and symbol
            currency = pqhtml('meta[itemprop="priceCurrency"]:first').attr('content')
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # Color
            detail['color'] = pqhtml('ul.detail-additional-info:first>li:last').text()

            # Price
            detail['price'] = pdata['Products']['Info']['BasePrice'].replace(',','')
            # NOTE(review): this local is never read; the stored list price
            # below comes from the page text instead - confirm which source
            # is intended.
            listPrice = pdata['Products']['Info']['OldPrice'].replace(',','')
            # [1:] drops the first character of the price text
            # (presumably the currency symbol - TODO confirm).
            detail['listPrice'] = (pqhtml('span.strokeText>span.price').text() or pqhtml('div#detail-display-info-wrapper span.price').text())[1:]

            # Images (raises IndexError when the icon list is empty)
            imgsTmp = [li.attr('data-zoom') for li in pqhtml('div#detail-display-icon ul').children('li').items()]
            detail['img'] = imgsTmp[0]
            detail['imgs'] = imgsTmp

            # Sizes
            detail['sizes'] = self.get_sizes(pdata,area)

            # Description
            detail['descr'] = area('p.detail-description:first').text()

            # Product ID (also used as the color ID)
            detail['productId'] = pdata['Products']['Info']['ParentProductId']
            detail['colorId'] = pdata['Products']['Info']['ParentProductId']

            # HTTP status code
            detail['status_code'] = status_code

            # Status
            detail['status'] = self.cfg.STATUS_SALE

            # Return URL: the requested url, not resp.url as in the
            # failure payloads above.
            detail['backUrl'] = url

            log_info = json.dumps(dict(time=time.time(), productId=detail['productId'], name=detail[
                                  'name'], currency=detail['currency'], price=detail['price'], listPrice=detail['listPrice'], url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception, e:
            raise
예제 #16
0
파일: hbx.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape a product page and return its normalised detail data.

        Returns the value of ``tool.return_data``: ``successful=False`` with
        an off-shelf payload (HTTP 404, a sold-out add button, or sizes
        reported as ``'sold out'``) or an error payload (other non-200
        status); otherwise ``successful=True`` with the ``detail`` dict.
        Exceptions propagate to the caller unchanged.
        """
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Shared by every early exit: timestamp, page title and URL.
            line = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))
            self.logger.info(line)

        if status_code == 404:
            log_page()
            payload = tool.get_off_shelf(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        if status_code != 200:
            log_page()
            payload = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # The script text is read (but not used afterwards) before scripts
        # and styles are removed from the document.
        script_text = pqhtml('script').text()
        pqhtml.remove('script').remove('style')

        area = pqhtml('div#product-summary')

        button_label = area('#product-form .add-button').text().lower()
        if u'售罄' in button_label or u'sold out' in button_label:
            log_page()
            payload = tool.get_off_shelf(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        detail = {}

        # All images
        images = self.get_imgs(pqhtml)
        detail['imgs'] = images
        detail['img'] = images[0]

        # Name: brand followed by the product name
        brand_text = area('h1.brand').text()
        detail['name'] = brand_text + ' ' + area('.name').text()

        # Currency: first whitespace-separated token of the regular price
        currency = area('span.regular-price').text().split()[0]
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        detail['price'], detail['listPrice'] = self.get_all_price(area)

        # Color and sizes
        detail['color'], detail['sizes'] = self.get_sizes(area)

        # Off the shelf: sizes reported as the string 'sold out'
        sizes = detail['sizes']
        if isinstance(sizes, basestring) and sizes == 'sold out':
            log_page()
            payload = tool.get_off_shelf(
                code=status_code,
                message=self.cfg.SOLD_OUT,
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Description, with a fallback to the product details section
        description = area('div#description').text()
        if not description:
            description = pqhtml(
                '#product-details .product-details-section').text()
        detail['descr'] = description

        # Brand
        detail['brand'] = area('h1.brand').text()

        # Product ID (also used as the color ID)
        product_id = area.attr('data-id')
        detail['productId'] = product_id
        detail['colorId'] = product_id

        # HTTP status code
        detail['status_code'] = status_code

        # Status
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response
        detail['backUrl'] = resp.url

        # IP and port of the responding peer, when available
        if resp.raw._original_response.peer:
            detail['ip_port'] = ':'.join(
                map(str, resp.raw._original_response.peer))

        summary = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))
        self.logger.info(summary)

        return tool.return_data(successful=True, data=detail)
예제 #17
0
class KNMISource(Source):
  """Weather source class"""

  __baseUrl = 'http://www.knmi.nl/klimatologie/daggegevens/index.cgi?station=370'
  __d = None     # PyQuery document fetched by the last get_weather call
  __date = None  # date requested by the last get_weather call
  __day = None   # `day` offset passed to the last get_weather call
  __url = None   # URL fetched by the last get_weather call


  def __init__(self):
    """Constructor"""


  def get_weather(self, day):
    """Fetch and parse the observations for a past day.

    day -- negative offset in days relative to now; values >= 0 are
           rejected with a message on stderr.

    Returns the dict built by __parse, or None when `day` is rejected or
    parsing fails (the page HTML and the traceback are then written to
    stderr).
    """
    if day >= 0:
      sys.stderr.write('`day` must be an integer and less than 0\n')
      return None

    date = datetime.datetime.now() + datetime.timedelta(days=day)
    url = self.__get_url(date)

    self.__date = date
    self.__day = day
    self.__url = url

    self.__d = PyQuery(url=url)

    data = None
    try:
      data = self.__parse()
    except Exception:
      # Best effort: report and return None. Catching Exception rather
      # than everything lets KeyboardInterrupt/SystemExit propagate.
      html = self.__d.outerHtml().encode('ascii', 'replace')
      sys.stderr.write(html)
      sys.stderr.write('\n\n')
      traceback.print_exc()
    return data


  def __parse(self):
    """Parse the HTML page"""
    rows = self.__d('#printable > table > tr')

    def cell(row, column):
      # Text of the td at index `column` inside table row `row`.
      return rows.eq(row).find('td').eq(column).text()

    numeric = self.__numeric
    w = {}

    w['url'] = self.__url
    w['date'] = self.__date.strftime("%Y-%m-%d")
    w['url_timestamp'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    w['day'] = self.__day

    w['temperature_average'] = numeric(cell(2, 1))
    w['temperature_maximum'] = numeric(cell(3, 1))
    w['temperature_minimum'] = numeric(cell(4, 1))

    # Leading '<' / '-' markers are stripped before conversion.
    w['rain_amount'] = numeric(cell(2, 6).lstrip('<-'))
    w['rain_duration'] = numeric(cell(3, 6).lstrip('-'))

    w['sunshine_duration'] = numeric(cell(7, 1))
    w['sunshine_relative'] = numeric(cell(8, 1))

    w['sky_coverage'] = numeric(cell(9, 1))
    # The key spelling is kept as is; callers may rely on it.
    w['sky_visibiliy'] = numeric(cell(11, 1).lstrip('<'))

    w['wind_speed_average'] = numeric(cell(7, 6))
    w['wind_speed_maximum_average'] = numeric(cell(8, 6))
    w['wind_speed_maximum'] = numeric(cell(9, 6))
    w['wind_direction'] = numeric(cell(11, 6))

    w['atmosphere_humidity'] = numeric(cell(14, 1))
    w['atmosphere_pressure'] = numeric(cell(14, 6))


    return w


  def __get_url(self, date):
    """Get the remote URL for fetching the weather"""

    params = {
      'year': date.year,
      'month': date.month,
      'day': date.day
    }

    url = self.__baseUrl
    # items() works on Python 2 and 3 alike, unlike iteritems().
    for key, value in params.items():
      url += '&' + key + '=' + str(value)

    return url


  def __numeric(self, x):
    """Convert cell text to a number.

    Trailing '-' characters are removed; an empty result gives 0, text
    containing '.' gives a float, anything else an int.
    """
    x = x.rstrip('-')
    if not x:
      return 0
    return float(x) if '.' in x else int(x)
예제 #18
0
파일: superdry.py 프로젝트: hellowac/drag
    def detail(self,url):
        """Scrape a product page and return its normalised detail data.

        Fetches ``url`` with ``self.session``, parses the body with PyQuery
        and returns the value of ``tool.return_data``: ``successful=False``
        with an off-shelf payload (HTTP 404, missing product area, or an
        out-of-stock notice) or an error payload (other non-200 status),
        otherwise ``successful=True`` with the ``detail`` dict built below.
        Any exception is re-raised unchanged.
        """
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            # Off the shelf: the page no longer exists.
            if status_code == 404 :

                log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,message=self.cfg.SOLD_OUT,backUrl=resp.url, html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # Any other HTTP error.
            if status_code != 200 :

                log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get('SCERR','ERROR'), backUrl=resp.url, html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            # Preparation
            area = pqhtml('.product-detail')
            detail_tab = pqhtml('#product-detail-tabs')
            img_tab = pqhtml('div.images')

            # NOTE(review): `domain` is assigned but not read again in this method.
            domain = tool.get_domain(url)
            pdata = self.get_pdata(pqhtml)

            # print area.outerHtml().encode('utf-8')
            # print json.dumps(pdata)
            # print detail_tab.outerHtml().encode('utf-8')
            # print img_tab.outerHtml().encode('utf-8')

            # exit()

            # Off the shelf: no product area, or an out-of-stock notice.
            if not area or 'out of stock' in area('.out-of-stock').text():

                log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

                self.logger.info(log_info)
                data = tool.get_off_shelf(code=status_code,message=self.cfg.SOLD_OUT,backUrl=resp.url, html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            video_prefix = 'http://image1.superdry.com/static/images/products/'

            detail = dict()

            detail['stock'] = pdata['product']['stock']     # total stock of this product.

            detail['video'] = video_prefix+pdata['product']['video']

            detail['gender'] = pdata['product']['gender']

            detail['season'] = pdata['product']['season']

            detail['category'] = pdata['product']['category']

            detail['productSku'] = pdata['product']['sku_code']

            detail['size_guide'] = pdata['product']['size_guide']

            detail['subcategory'] = pdata['product']['subcategory']

            detail['productCode'] = pdata['product']['sku_code']

            # Product ID
            productId = pdata['product']['id']
            detail['productId'] = productId

            # Brand (fixed for this site)
            brand = 'SUPERDRY'
            detail['brand'] = brand

            # Name
            detail['name'] = pdata['product']['name']

            # Currency
            currency = pdata['product']['currency']
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # Price
            detail['price'] = pdata['product']['unit_sale_price']
            detail['listPrice'] = pdata['product']['unit_price']

            # Description
            detail['descr'] =  pdata['product']['description']

            # Details
            detail['detail'] = detail_tab.text()

            # Returns and exchanges
            # NOTE(review): 'tab-page:last' has no leading '.' or '#', so it
            # selects <tab-page> elements - confirm a class selector was not
            # intended.
            detail['returns'] = detail_tab('tab-page:last').text()

            # Color (the color value doubles as the color ID)
            detail['color'] = pdata['product']['color']
            detail['colorId'] = pdata['product']['color']

            # Images: thumbnail URLs are rewritten to their zoom counterparts.
            imgs = [ ele.attr('src') for ele in img_tab('.scroller li img').items()]
            imgs = map(lambda x : x.replace('/productthumbs/','/zoom/'), imgs)
            detail['img'] = img_tab('.scroller li img:first').attr('src').replace('/productthumbs/','/zoom/')
            detail['imgs'] = imgs

            # Sizes
            detail['sizes'] = self.get_sizes(area)

            # HTTP status code
            detail['status_code'] = status_code

            # Status
            detail['status'] = self.cfg.STATUS_SALE

            # Final URL of the response
            detail['backUrl'] = resp.url

            # IP and port of the responding peer
            # (presumably `peer` is set by a patched HTTP layer - TODO confirm).
            if resp.raw._original_response.peer :
                detail['ip_port']=':'.join(map( lambda x:str(x),resp.raw._original_response.peer))

            log_info = json.dumps(dict(time=time.time(),
                                       productId=detail['productId'],
                                       name=detail['name'],
                                       currency=detail['currency'],
                                       price=detail['price'],
                                       listPrice=detail['listPrice'],
                                       url=url))

            self.logger.info(log_info)


            return tool.return_data(successful=True, data=detail)

        except Exception, e:
            raise
예제 #19
0
파일: yintai.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape a Yintai product page into a normalised ``detail`` dict.

        The page is fetched with a plain ``requests.get``.  A 404 response or
        a page without the "buy now" button yields an off-shelf payload; any
        other non-200 status yields an error payload.  Otherwise the fields
        are read from the page markup and from ``self.get_pdata(pqhtml)``.

        Failures whose message contains ``'get YinTai_TagData Fail'`` are
        retried by calling this method again, for at most 10 attempts counted
        in ``self._retry``.  Every other exception propagates unchanged.

        :param url: address of the product page.
        :returns: the value of ``tool.return_data(...)``; ``successful`` is
            True only when a complete ``detail`` dict was built.
        :raises ValueError: when the page does not describe exactly one
            product, or when the retry budget is exhausted.
        """
        max_retry = 10

        try:
            resp = requests.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            def log_rejected():
                # Record the title and url of a page that is being rejected.
                self.logger.info(json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url)))

            def off_shelf():
                # Log the page and build the "sold out" payload.
                log_rejected()
                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # Off the shelf: the page no longer exists.
            if status_code == 404:
                return off_shelf()

            # Any other unexpected HTTP status.
            if status_code != 200:
                log_rejected()
                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # Preparation.
            area = pqhtml('#bd .grid')
            pdata = self.get_pdata(pqhtml)

            # Off the shelf: the page has no "buy now" button.
            if not len(area('.p-buy #addCart .buynow')):
                return off_shelf()

            # Exactly one product is expected in the embedded page data.
            if len(pdata['prods']) != 1:
                raise ValueError(
                    'yintai product data length must be 1, got {0}'.format(
                        len(pdata['prods'])))

            product = pdata['prods'][0]
            detail = dict()

            # Brand
            detail['brand'] = area('h4.y-pro-cooper-name').text()

            # Name
            detail['name'] = product['name']

            # Currency
            currency = 'CNY'
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            price = product['price']

            # When the promotion banner contains the marker text, the price is
            # re-read from a POST to the same url instead of the page data.
            if u'直降' in area('#Y_ProBen').text():
                self.session.headers['Referer'] = url
                self.session.headers['X-Requested-With'] = 'XMLHttpRequest'
                self.session.headers['Origin'] = url

                subArea = PyQuery(
                    requests.post(url,
                                  data=dict(),
                                  headers=self.session.headers,
                                  cookies=resp.cookies).text)

                price = re.search(
                    r'(\d[\d\.]*)',
                    subArea('.marketPriceNum .yt-num').text()).groups()[0]
                price = price + subArea('.marketPriceNum .yt-num em').text()

            # Price (the original author marked this logic for later removal).
            detail['price'] = float(price)
            detail['listPrice'] = product['mPrice']

            # Colour: prefer the inert link, fall back to the selected swatch
            # (fallback added 2016-12-15).
            swatches = area('.productInfo .s-s-color').next()
            color = swatches('a[href="Javascript:void(0);"]').text()
            color = color or swatches('.selected a').text()
            detail['color'] = color
            detail['colorId'] = product['colorID']

            # Images
            imgs = self.get_imgs(area)
            detail['img'] = imgs[0]
            detail['imgs'] = imgs

            # Product id
            detail['productId'] = product['sku']

            # Sizes
            detail['sizes'] = self.get_sizes(area)

            # Description
            detail['descr'] = area('.yp-con-desc').text()

            # HTTP status code
            detail['status_code'] = status_code

            # Sale status
            detail['status'] = self.cfg.STATUS_SALE

            # Final url after redirects
            detail['backUrl'] = resp.url

            # Drop the cookies and the headers set above before the next call.
            self.session.cookies = RequestsCookieJar()
            self.session.headers = tool.get_one_header()

            self.logger.info(json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url)))

            return tool.return_data(successful=True, data=detail)

        except Exception as e:
            # Only the tag-data failure is retryable; anything else surfaces
            # unchanged rather than being reported as retry exhaustion.
            if 'get YinTai_TagData Fail' not in str(e):
                raise
            if self._retry < max_retry:
                self._retry += 1
                return self.detail(url)
            # NOTE(review): self._retry is never reset in this method, so the
            # budget is shared by every url handled by this instance -- confirm
            # whether the caller resets it.
            raise ValueError('yintai retry {0} times ,{1}'.format(
                max_retry, str(e)))
예제 #20
0
    def detail(self, url):
        """Fetch a product page and return its normalised detail record.

        Rejections are returned as unsuccessful ``tool.return_data`` payloads:
        a 404 or a product flagged as sold out / shipping-restricted gives an
        off-shelf payload; another non-200 status or an ``.error_message``
        element gives an error payload.  Otherwise the fields are read from
        the dict returned by ``self.get_pdata``.

        :param url: address of the product page.
        :returns: the value of ``tool.return_data(...)``.
        """
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_rejected():
            # Record the title and url of a page that is being rejected.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url)))

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            log_rejected()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Any other unexpected HTTP status.
        if status_code != 200:
            log_rejected()
            payload = tool.get_error(
                code=status_code,
                message='status_code:{0},need 200, message:{1}'.format(
                    status_code, self.cfg.GET_ERR.get('SCERR', 'ERROR')),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # The page itself carries an error message.
        if len(pqhtml('.error_message')) >= 1:
            log_rejected()
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SAKERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation.
        area = pqhtml('#pdp-content-area')
        pdata = self.get_pdata(area)

        # Off the shelf: sold out, or not shippable internationally.
        sold_out = pdata['sold_out_message']['enabled']
        if sold_out or pdata['intl_shipping_restriction']['enabled']:
            log_rejected()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        detail = {}

        # Brand: empty when the brand block is disabled.
        if pdata['brand_name']['enabled']:
            brand = pdata['brand_name']['label']
        else:
            brand = ''
        detail['brand'] = brand

        # Name
        detail['name'] = pdata['short_description']

        # Currency
        currency = pdata['price']['list_price']['local_currency_code']
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Product-level price; replaced further down by the per-size values.
        price, listPrice = self.get_all_price(pdata)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colours are keyed by the colour id (which may be 0) because the
        # per-colour prices below are distinguished by that same id.
        color = {}
        colorId = {}
        for swatch in pdata['colors']['colors']:
            swatch_id = str(swatch['id'])
            color[swatch_id] = swatch['label']
        for swatch in pdata['colors']['colors']:
            swatch_id = str(swatch['id'])
            colorId[swatch_id] = str(swatch['id'])
        detail['color'] = color or self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = colorId or self.cfg.DEFAULT_COLOR_SKU

        # Images: either one flat list, or a mapping of colour id -> list.
        imgs = self.get_imgs(pdata, area)
        if isinstance(imgs, list):
            cover = imgs[0]
        else:
            cover = dict(
                [(cid, shots[0]) for cid, shots in imgs.items()])
        detail['img'] = cover
        detail['imgs'] = imgs

        # Sizes, together with the per-colour price / list price.
        sprice, slistPrice, sizes = self.get_sizes(pdata)

        # Keys: taken from the sizes when present, otherwise from the colours.
        if sizes.keys():
            detail['keys'] = sizes.keys()
        elif color:
            detail['keys'] = color.keys()

        # Product id
        productId = pdata['product_code']
        detail['productId'] = productId

        detail['sizes'] = sizes
        detail['price'] = sprice
        detail['listPrice'] = slistPrice

        # Description (markup stripped)
        detail['descr'] = PyQuery(pdata['description']).text()

        # Returns / shipping statement
        detail['returns'] = pdata['simple_shipping_statement']['message']

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Requested url
        detail['backUrl'] = url

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #21
0
    def detail(self, url):
        """Scrape a COS product page into a normalised ``detail`` dict.

        ``self.domain`` is set from the url before the page is requested.
        A 404, a missing ``#productContainer`` or a disabled add-to-basket
        button yields an off-shelf payload; any other non-200 status yields
        an error payload.  Exceptions raised while scraping propagate to the
        caller unchanged.

        :param url: address of the product page.
        :returns: the value of ``tool.return_data(...)``.
        """
        self.domain = tool.get_domain(url)

        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_rejected():
            # Record the title and url of a page that is being rejected.
            self.logger.info(json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url)))

        def off_shelf():
            # Log the page and build the "sold out" payload.
            log_rejected()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            return off_shelf()

        # Any other unexpected HTTP status.
        if status_code != 200:
            log_rejected()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation.
        area = pqhtml('#content>#productContainer')
        pdata = self.get_pdata(pqhtml)

        # Off the shelf: no product container, or the basket button is the
        # disabled variant.
        if not area or area('.productButtons #disabledAddtobasket'):
            return off_shelf()

        detail = dict()

        # Brand
        detail['brand'] = 'COS'

        # Name
        detail['name'] = area('.productInfo h1:first').text()

        # Currency
        currency = pqhtml('meta[property="og:price:currency"]').attr(
            'content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Price
        price, listPrice = self.get_all_price(pqhtml, area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colour: every colour key doubles as its own id.
        color = self.get_color(area)
        detail['color'] = color
        detail['colorId'] = {key: key for key in color.keys()}

        # Images: either one flat list, or a mapping of colour id -> list.
        imgs = self.get_imgs(area)
        if isinstance(imgs, list):
            detail['img'] = imgs[0]
        else:
            detail['img'] = {cid: arr[0] for cid, arr in imgs.items()}
        detail['imgs'] = imgs

        # Keys
        detail['keys'] = color.keys()

        # Product id
        detail['productId'] = area(
            'input[data-product-identifier!=""]').attr(
                'data-product-identifier')

        # Sizes
        detail['sizes'] = self.get_sizes(area)

        # Description
        detail['descr'] = area('.productInfo>.infowrap>dl>dd:first').text()

        # Returns
        # NOTE(review): this is the same selector as 'descr' above, so both
        # fields carry the same text -- confirm the intended element.
        detail['returns'] = area(
            '.productInfo>.infowrap>dl>dd:first').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Requested url
        detail['backUrl'] = url

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #22
0
    def detail(self, url):
        """Fetch a product page and return its normalised detail record.

        A 404 or an empty result from ``self.get_pdata`` gives an off-shelf
        payload; any other non-200 status gives an error payload.  Otherwise
        the fields are read from the ``.productDetailSummary`` and
        ``#productInfo`` sections of the page.

        :param url: address of the product page; its fourth ``/``-separated
            segment is handed to ``self.get_imgs``.
        :returns: the value of ``tool.return_data(...)``.
        """
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_rejected():
            # Record the title and url of a page that is being rejected.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url)))

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            log_rejected()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Any other unexpected HTTP status.
        if status_code != 200:
            log_rejected()
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation.
        pdata = self.get_pdata(pqhtml)
        area = pqhtml('.productDetailSummary')
        pinfo = pqhtml('#productInfo')
        imgPath = url.split('/')[3]

        # Off the shelf: no product data could be read from the page.
        if not pdata:
            log_rejected()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        detail = {}

        # Name
        detail['name'] = pqhtml('.productName').text()

        # Brand
        detail['brand'] = pqhtml('.productName a').text()

        # Currency
        currency = area('span[itemprop="priceCurrency"]').text()
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Price
        price, listPrice = self.get_all_price(area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Images
        img, imgs = self.get_imgs(area, imgPath)
        detail['img'] = img
        detail['imgs'] = imgs

        # Video: only set when the scroller contains a video entry.
        has_video = len(area('.MagicScroll .productVideo')) > 0
        if has_video:
            video_link = area('.MagicScroll a.productVideo')
            detail['video'] = video_link.attr('data-video-url')

        # Colours and sizes
        colors, sizes = self.get_colors_sizes(area, pdata)
        detail['color'] = colors
        detail['sizes'] = sizes

        detail['keys'] = colors.keys()

        # Every colour key doubles as its own id.
        detail['colorId'] = dict([(cid, cid) for cid in colors.keys()])

        # Product id
        detail['productId'] = area('input#baseNo').attr('value')

        # Description
        detail['descr'] = pinfo('#overview').text()

        # Specification text
        detail['detail'] = pinfo('#specs').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Requested url
        detail['backUrl'] = url

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #23
0
파일: sixpm.py 프로젝트: hellowac/drag
    def detail(self,url):
        """Scrape a 6pm product page into a normalised ``detail`` dict.

        A 404 yields an off-shelf payload and any other non-200 status an
        error payload.  No page-level sold-out check is performed.  The
        product data comes from ``self.get_pdata`` applied to the page's
        ``script:gt(20)`` elements.  Exceptions raised while scraping
        propagate to the caller unchanged.

        :param url: address of the product page.
        :returns: the value of ``tool.return_data(...)``.
        :raises ValueError: when the price symbol is not ``'$'``.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_rejected():
            # Record the title and url of a page that is being rejected.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url)))

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            log_rejected()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other unexpected HTTP status.
        if status_code != 200:
            log_rejected()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation.
        area = pqhtml('#theater')
        pdata = self.get_pdata(pqhtml('script:gt(20)'))

        detail = dict()

        # Brand
        detail['brand'] = area('.brand').text()

        # Name
        detail['name'] = area('h1:first').text()

        currencySymbol, price, listPrice = self.get_price_info(pdata)

        # Only dollar prices are accepted.
        if currencySymbol != '$':
            raise ValueError('currencySymbol is not USD')

        # Currency
        currency = 'USD'
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Price
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colour: every colour key doubles as its own id.
        color = self.get_color(pdata)
        detail['color'] = color
        detail['colorId'] = {cid: cid for cid in color.keys()}

        # Images
        img, imgs = self.get_imgs(pdata)
        detail['img'] = img
        detail['imgs'] = imgs

        # Product id
        detail['productId'] = pqhtml('input[name="productId"]').attr('value')

        # Sizes
        sizes = self.get_sizes(pdata)
        detail['sizes'] = sizes

        # Description
        detail['descr'] = area('.description').text()

        # Keys: only the ids that have both an image and a size entry.
        # NOTE(review): this is a set, which json cannot serialise -- confirm
        # that the consumer of 'keys' accepts a set.
        detail['keys'] = set(img.keys()) & set(sizes.keys())

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Final url after redirects
        detail['backUrl'] = resp.url

        # Remote ip and port.
        # NOTE(review): 'peer' is read from a private attribute of the raw
        # response; presumably it is attached elsewhere in the project.
        peer = resp.raw._original_response.peer
        if peer:
            detail['ip_port'] = ':'.join(map(str, peer))

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #24
0
파일: ashford.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape an Ashford product page into a normalised ``detail`` dict.

        A 404 yields an off-shelf payload and any other non-200 status an
        error payload.  No page-level sold-out check is performed.  Product
        data comes from ``self.get_pdata`` applied to the text of the page's
        ``script`` elements; the product is reported with the default colour
        and a single default size.  Exceptions raised while scraping
        propagate to the caller unchanged.

        :param url: address of the product page.
        :returns: the value of ``tool.return_data(...)``.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')
        # Prefix for the image links read from the page further down.
        domain = tool.get_domain(url)

        def log_rejected():
            # Record the title and url of a page that is being rejected.
            self.logger.info(json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url)))

        # Off the shelf: the page no longer exists.
        if status_code == 404:
            log_rejected()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Any other unexpected HTTP status.
        if status_code != 200:
            log_rejected()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation.
        Jtxt = pqhtml('script').text()
        area = pqhtml('#container')
        pdata = self.get_pdata(Jtxt)
        product = pdata['product']

        detail = dict()

        # Images: links of the first alternative-image list, domain-prefixed.
        imgsTmp = [
            domain + a.attr('href')
            for a in area('form#addToCart ul.alt_imgs:first>li>a').items()
        ]
        detail['img'] = imgsTmp[0]
        detail['imgs'] = imgsTmp

        # Name
        detail['name'] = product['name']

        # Brand
        detail['brand'] = area('form#addToCart a#sameBrandProduct').text()

        # Price
        detail['price'] = product['unit_sale_price']
        detail['listPrice'] = product['unit_price']

        # Currency
        currency = product['currency']
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Product id
        productId = product['id']
        detail['productId'] = productId

        # Colour: the default colour, identified by the product id.
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = productId

        # Sizes: one default size carrying the product's stock.
        detail['sizes'] = [
            dict(name=self.cfg.DEFAULT_ONE_SIZE,
                 inventory=product['stock'],
                 sku=productId)
        ]

        # Description
        detail['descr'] = area('.prod_desc').text() + ' ' + area(
            'div#info_tabs>div.wrap>div#tab1_info').text()

        # Details
        detail['detail'] = area('#tab1_info').text()

        # Brand description
        detail['brandDescr'] = area('#tab2_info').text()

        # Warranty
        detail['note'] = area('#tab5_info').text()

        # Delivery
        detail['delivery'] = area('#shippingData').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Requested url
        detail['backUrl'] = url

        self.logger.info(json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #25
0
    def detail(self, url):
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')
            #下架
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            #其他错误
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            #前期准备
            area = pqhtml('div.primary-content')
            domain = tool.get_domain(url)

            # print area.outerHtml().encode('utf-8')
            # exit()

            #下架
            # if True :

            # log_info = json.dumps(dict(time=time.time(),title=pqhtml('title').text(),url=url))

            # self.logger.info(log_info)
            # data = tool.get_off_shelf(code=status_code,message=self.cfg.SOLD_OUT,backUrl=resp.url, html=pqhtml.outerHtml())

            # return tool.return_data(successful=False, data=data)

            detail = dict()

            #产品ID
            # productId = area('input.productId').attr('value')
            productId = pqhtml('span[itemprop="productID"]').attr('content')
            detail['productId'] = productId
            detail['productSku'] = productId
            detail['productCode'] = productId

            #品牌
            brand = pqhtml('span[itemprop="brand"]').attr('content')
            detail['brand'] = brand

            #名称
            detail['name'] = pqhtml('span[itemprop="name"]').attr('content')

            #货币
            currency = pqhtml('span[itemprop="priceCurrency"]').attr('content')
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            #价格
            price, listPrice = self.get_all_price(area)
            price = pqhtml('span[itemprop="price"]').attr('content')
            detail['price'] = price
            detail['listPrice'] = listPrice

            #一级分类
            detail['category'] = area('a[data-bigpopup="sizeChart"]').attr(
                'data-category')

            #二级分类
            detail['subcategory'] = area('a[data-bigpopup="sizeChart"]').attr(
                'data-sub-category')

            #描述
            detail['descr'] = pqhtml('span[itemprop="description"]').attr(
                'content')

            #详细
            detail['detail'] = area('#collapseOne').text()

            #退换货
            detail['returns'] = area('#collapseFive').text()

            #颜色
            # color = self.get_color(area)
            detail['color'] = pqhtml('span[itemprop="color"]').attr('content')
            detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

            #图片集
            imgs = [
                img.attr('src') for img in area(
                    '.product-image-carousel img.primary-image').items()
            ]
            detail['img'] = pqhtml('span[itemprop="image"]').attr('content')
            detail['imgs'] = imgs

            #规格
            detail['sizes'] = self.get_sizes(area)

            #HTTP状态码
            detail['status_code'] = status_code

            #状态
            detail['status'] = self.cfg.STATUS_SALE

            #返回链接
            detail['backUrl'] = resp.url

            #返回的IP和端口
            if resp.raw._original_response.peer:
                detail['ip_port'] = ':'.join(
                    map(lambda x: str(x), resp.raw._original_response.peer))

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception, e:
            raise
예제 #26
0
파일: kicksusa.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape a kicksusa product page into a normalised detail record.

        :param url: URL of the product page.
        :returns: whatever ``tool.return_data`` produces -- ``successful=True``
            with the ``detail`` dict when the product is on sale, otherwise
            ``successful=False`` with an off-shelf or error payload.
        """
        resp = self.session.get(url, verify=False)  # first load

        # Run the response through the anti-crawler verification.
        resp = self.resp_verify(resp)

        if 'window.location.reload(true);' in resp.text:
            # The page only asks the browser to reload itself: load it again.
            resp = self.session.get(url, verify=False)  # second load

        # The site sometimes answers with an empty body; start over.
        # detail() accepts only ``url`` -- the previous call also passed
        # ``verify=False`` and raised TypeError instead of retrying.
        # NOTE(review): the retry is unbounded; consider capping it.
        if not resp.text:
            return self.detail(url)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Log the page title so rejected pages can be traced afterwards.
            log_info = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))
            self.logger.info(log_info)

        def off_shelf():
            # Build the "sold out / removed" answer for this page.
            log_page()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Page is gone: product is off the shelf.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status is reported as an error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Page loads but the product is flagged as out of stock.
        if 'Out of stock' in pqhtml('.product-availability').text():
            return off_shelf()

        # Preparation: product container and the data extracted from it.
        area = pqhtml('div#product-view')
        pdata = self.get_pdata(area)

        detail = dict()

        # Brand: the part of the heading before the first '-'.
        heading = area('.panel-a h1:first').text()
        detail['brand'] = heading.split('-')[0].strip()

        # Name: the full heading.
        detail['name'] = heading

        # Currency
        currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Product ID
        productId = pdata['productId']
        detail['productId'] = productId

        # Prices, with thousands separators removed.
        detail['price'] = pdata['basePrice'].replace(',', '')
        detail['listPrice'] = pdata['oldPrice'].replace(',', '')

        # Colour: taken from the add-to-cart button, else the configured default.
        detail['color'] = area('button#product-addtocart-button').attr(
            'data-variant') or self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = productId

        # Image set
        imgs = [
            img.attr('data-src')
            for img in area('div#mobile-carousel-images a>img').items()
        ]
        detail['img'] = imgs[0]
        detail['imgs'] = imgs

        # Sizes
        detail['sizes'] = self.get_sizes(area, pdata)

        # Description: toggle block plus the first description section
        # found under the container's parent.
        detail['descr'] = area('div.tog-desc').text() + area.parent()(
            '.description-section:first').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # URL actually served (after redirects)
        detail['backUrl'] = resp.url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #27
0
class WeeronlineSource(Source):
  """Weather source that scrapes the weeronline.nl forecast page."""

  __d = None     # parsed forecast page (PyQuery), fetched on first use
  __date = None  # date the most recently requested forecast applies to
  __day = None   # day offset of the most recently requested forecast
  __url = 'http://www.weeronline.nl/Europa/Nederland/Eindhoven/4058591'


  def __init__(self):
    """Constructor"""


  def get_weather(self, day):
    """Return the forecast for `day` days from now, or None on failure.

    Offsets outside 0-13, and offsets above 3 (not yet supported), write a
    message to stderr and return None.  If parsing fails, the page HTML and
    the traceback are written to stderr and None is returned.
    """
    if day < 0 or day > 13:
      sys.stderr.write('`day` must be an integer between 0 and 13\n')
      return None
    if day > 3:
      sys.stderr.write('`day` >= 4 not yet supported\n')
      return None

    self.__date = datetime.datetime.now() + datetime.timedelta(days=day)
    self.__day = day

    # The page is downloaded once and reused by later calls.
    if not self.__d:
      self.__d = PyQuery(url=self.__url)

    try:
      return self.__parse()
    except Exception:
      # Best effort: report the failure but do not propagate it.  Catching
      # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
      # through.
      sys.stderr.write(self.__ascii(self.__d.outerHtml(), 'replace'))
      sys.stderr.write('\n\n')
      traceback.print_exc()
      return None


  @staticmethod
  def __ascii(text, errors='ignore'):
    """Return `text` reduced to ASCII, as text rather than bytes.

    The decode step keeps the result usable with str methods and
    stderr.write on Python 3 as well as Python 2.
    """
    return text.encode('ascii', errors).decode('ascii')


  def __parse(self):
    """Parse the HTML page"""
    container = self.__d('.weatherforecast.FiveDays')
    rows = container.find('.row_forecast')
    icon_rows = container.find('.row_weathericons')
    rating_rows = container.find('.row_weathernumbers')

    # Day N is read from table column N + 1 (presumably column 0 is a
    # row label -- TODO confirm against the page markup).
    index = self.__day + 1

    def cell(row_set, row):
      # The <td> for the requested day in the given row of a row set.
      return row_set.eq(row).find('td').eq(index)

    w = {}

    w['url'] = self.__url
    w['date'] = self.__date.strftime("%Y-%m-%d")
    w['url_timestamp'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    w['day'] = self.__day

    # The CSS class of each of the first three icon <div>s is stored as-is.
    icons = cell(icon_rows, 0).find('div')
    for i in range(3):
      w['icon_' + str(i + 1)] = icons.eq(i).attr('class')

    # Dropping non-ASCII characters strips the degree sign.
    w['temperature_minimum'] = self.__numeric(self.__ascii(cell(rows, 0).text()))
    w['temperature_maximum'] = self.__numeric(self.__ascii(cell(rows, 1).text()))

    wind_cell = cell(rows, 2)
    w['wind_force'] = self.__numeric(wind_cell.text().rstrip('/'))

    # The wind direction is encoded in the icon's CSS class name.
    val = wind_cell.find('.windImageDiv.darkImage > div').attr('class')
    val = val.replace('wind_icon_small_', '').replace('_xs darkImage', '')
    w['wind_direction'] = val

    w['rain_percentage'] = self.__numeric(cell(rows, 3).text().rstrip('%'))

    # rstrip('m') removes the trailing 'mm'.
    w['rain_amount'] = self.__numeric(cell(rows, 4).text().rstrip('m'))

    w['rating'] = self.__numeric(cell(rating_rows, 0).text())

    return w


  def __numeric(self, x):
    """Convert a numeric string to a number; an empty string gives 0.

    A decimal comma is accepted; the result is a float when a decimal
    separator is present and an int otherwise.
    """
    x = x.replace(',', '.')
    if not x:
      return 0
    return float(x) if '.' in x else int(x)
예제 #28
0
파일: iherb.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape an iHerb product page into a normalised detail record.

        :param url: URL of the product page.
        :returns: the result of ``tool.return_data`` -- ``successful=True``
            with the ``detail`` dict, or ``successful=False`` with an
            off-shelf / error payload.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Record time, page title and URL for a page that is rejected.
            self.logger.info(
                json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url)))

        def off_shelf():
            # Answer for a product that is sold out or no longer listed.
            log_page()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # 404: off the shelf.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status: error.
        if status_code != 200:
            log_page()
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation
        area = pqhtml('.product-detail-container')
        domain = tool.get_domain(url)  # not used below

        # Stock status says "out of stock": off the shelf.
        if u'缺货' in area('#stock-status').text():
            return off_shelf()

        detail = dict()

        # Brand
        detail['brand'] = (area('#brand:first span').text()
                           or area('#brand a').text())

        # Name
        detail['name'] = area('#name').text()

        # Currency
        currency = area('#price-currency').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_all_price(area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colour: configured defaults.
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

        # Image set: thumbnails first, then the zoom images, finally the
        # link of the summary image.
        gallery = [
            node.attr('data-large-img') for node in area(
                '.image-container  .thumbnail-container img').items()
        ]
        if not gallery:
            gallery = [
                node.attr('src')
                for node in area('#iherb-product-zoom img').items()
            ]
        if not gallery:
            gallery = [
                area('#product-image .product-summary-image a').attr('href')
            ]
        detail['img'] = gallery[0]
        detail['imgs'] = gallery

        # Product ID
        productId = area('input[name="pid"]').attr('value')
        detail['productId'] = productId

        # Sizes: a single default size; its inventory is the value of the
        # last quantity option when the page reports stock, else 0.
        stock_txt = area('#stock-status').text()
        if 'In Stock' in stock_txt or u'有库存' in stock_txt:
            inv = area('#ddlQty option:last').attr('value')
        else:
            inv = 0
        detail['sizes'] = [
            dict(name=self.cfg.DEFAULT_ONE_SIZE,
                 inventory=inv,
                 id=productId,
                 sku=productId)
        ]

        # Description
        detail['descr'] = area('#product-specs-list li').text()

        # Details
        detail['detail'] = pqhtml('div[itemprop="description"]').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Back link: the URL that was requested.
        detail['backUrl'] = url

        self.logger.info(
            json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #29
0
    def detail(self, url):
        """Scrape a product page into a normalised detail record.

        :param url: URL of the product page.
        :returns: the result of ``tool.return_data`` -- ``successful=True``
            with the ``detail`` dict, or ``successful=False`` with an
            off-shelf / error payload.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Record time, page title and URL for a page that is rejected.
            log_info = json.dumps(dict(time=time.time(),
                                       title=pqhtml('title').text(),
                                       url=url))
            self.logger.info(log_info)

        def off_shelf():
            # Answer for a product that is sold out or no longer listed.
            log_page()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # 404: off the shelf.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status: error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation
        area = pqhtml('#goodsInfo')

        # No product container on the page: off the shelf.
        if not area:
            return off_shelf()

        detail = dict()

        # Product ID
        productId = pqhtml('#goodsForm input#bskGodGodNo').attr('value')
        detail['productId'] = productId
        detail['productSku'] = productId
        detail['productCode'] = area('.prd-code').text()

        # Brand
        brand = pqhtml('#goodsForm input#brndNm').attr('value')
        detail['brand'] = brand

        # Name: brand followed by the goods name.
        detail['name'] = u'{0} {1}'.format(
            brand, pqhtml('#goodsForm input#godNm').attr('value'))

        # Currency and prices
        currency, price, listPrice = self.get_currency_prices(pqhtml, area)
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        detail['price'] = price
        detail['listPrice'] = listPrice

        # Description: attr() yields None when the meta tag is missing.
        meta_descr = pqhtml('meta[name="description"]').attr('content')
        detail['descr'] = meta_descr

        # Details: meta description plus the description area.  Fall back to
        # an empty string so a missing meta tag no longer raises TypeError
        # on the concatenation.
        detail['detail'] = (meta_descr or u'') + area('.desc-area').text()

        # Colour: configured defaults are stored.
        # NOTE(review): the result of get_color() is not used; the call is
        # kept in case it has effects other code relies on -- confirm and
        # drop it if not.
        color = self.get_color(area)
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

        # Image set
        imgs = [img.attr('src')
                for img in pqhtml('#prdImgWrap .prdImg ul>li>img').items()]
        detail['img'] = pqhtml(
            'meta[property="og:image"][name="og_image"]').attr('content')
        detail['imgs'] = imgs

        # Sizes
        detail['sizes'] = self.get_sizes(area)

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # URL actually served (after redirects)
        detail['backUrl'] = resp.url

        # IP and port the response came from.
        # NOTE(review): ``peer`` is read from the raw response object;
        # presumably it is attached elsewhere in the project -- confirm.
        peer = resp.raw._original_response.peer
        if peer:
            detail['ip_port'] = ':'.join(str(part) for part in peer)

        log_info = json.dumps(dict(time=time.time(),
                                   productId=detail['productId'],
                                   name=detail['name'],
                                   currency=detail['currency'],
                                   price=detail['price'],
                                   listPrice=detail['listPrice'],
                                   url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #30
0
파일: kithnyc.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape a Kith product page into a normalised detail record.

        :param url: URL of the product page.
        :returns: the result of ``tool.return_data`` -- ``successful=True``
            with the ``detail`` dict, or ``successful=False`` with an
            off-shelf / error payload.
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Record time, page title and URL for a page that is rejected.
            self.logger.info(
                json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url)))

        # 404: off the shelf.
        if status_code == 404:
            log_page()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Any other non-200 status: error.
        if status_code != 200:
            log_page()
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation (the container used to be '.caption-product').
        area = pqhtml('.product-single-section-main')
        slider = pqhtml('.slider')  # not used below
        domain = tool.get_domain(url)  # not used below
        pdata = self.get_pdata(pqhtml('head'))

        detail = dict()
        product = pdata['product']

        # Brand
        detail['brand'] = product['vendor']

        # Name
        detail['name'] = area('h1[itemprop="name"]').text()

        # Currency
        currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_all_price(area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Colour: configured default name, product id as colour id.
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = product['id']

        # Image set: every src is prefixed with 'http:'.
        gallery = []
        for node in area('.super-slider-main img').items():
            gallery.append('http:' + node.attr('src'))
        detail['img'] = gallery[0]
        detail['imgs'] = gallery

        # Product ID
        detail['productId'] = product['id']

        # Sizes
        detail['sizes'] = self.get_sizes(pdata, area)

        # Description
        detail['descr'] = area('.product-single-details-dropdown').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # URL actually served (after redirects)
        detail['backUrl'] = resp.url

        self.logger.info(
            json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url)))

        return tool.return_data(successful=True, data=detail)
예제 #31
0
    def detail(self, url):
        """Scrape a REISS product page into a normalised detail record.

        :param url: URL of the product page.
        :returns: the result of ``tool.return_data`` -- ``successful=True``
            with the ``detail`` dict, or ``successful=False`` with an
            off-shelf / error payload (404, non-200 status, missing product
            container, or no selectable sizes).
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Record time, page title and URL for a page that is rejected.
            log_info = json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)

        def off_shelf():
            # Answer for a product that is sold out or no longer listed.
            log_page()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # 404: off the shelf.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status: error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code,
                                  message='status_code Error',
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Product container (was '.product_schema_wrapper>.page_width').
        area = pqhtml('.container-full--small-only .grid')

        # No product container on the page: off the shelf.
        if not area:
            return off_shelf()

        detail = dict()

        # Name: title plus both description headings, space separated.
        detail['name'] = ' '.join([
            area('h1.product__title').text(),
            area('h2.product__desc').text(),
            area('span.product__desc').text(),
        ])

        # Colour
        detail['color'] = area('span[itemprop="color"]').text()

        # Image set: prefer the lazy-load URL, fall back to src.
        imgsTmp = [
            img.attr('data-lazy') or img.attr('src')
            for img in area('.product-gallery__imgholder a img').items()
        ]
        detail['img'] = imgsTmp[0]
        detail['imgs'] = imgsTmp

        # Currency
        currency = area('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Current price
        price = area('meta[itemprop="price"]').attr('content')
        detail['price'] = price

        # List price: the first number in the old-price block, with thousands
        # separators removed.  Falls back to the current price when the block
        # is absent or holds no number (the latter used to raise
        # AttributeError).
        listPriceBlock = area('span.product__price--old')
        old_price = None
        if len(listPriceBlock):
            old_price = re.search(r'(\d[\.\d,]*)', listPriceBlock.text())
        detail['listPrice'] = (old_price.group(1).replace(',', '')
                               if old_price else price)

        productInfo = area('#product-info')

        # Description
        detail['descr'] = productInfo('#design').text()

        # Brand
        detail['brand'] = 'REISS'

        # Product ID
        productId = area('span[itemprop="productID"]').text()
        detail['productId'] = productId

        # Colour ID
        detail['colorId'] = productId

        # Delivery and returns.
        # NOTE(review): both fields read '#delivery' -- confirm whether
        # returns has its own section.
        detail['delivery'] = productInfo('#delivery').text()
        detail['returns'] = productInfo('#delivery').text()

        # Design
        detail['designer'] = productInfo('#design').text()

        # Size and fit
        detail['sizeFit'] = productInfo('#size').text()

        # Fabric
        detail['fabric'] = productInfo('#care').text()

        # Sizes: only items that carry an <input>.  An item is out of stock
        # when 'size_not_available' is one of its classes.  The old test
        # compared the whole class attribute to that single token, which can
        # never match because every item also has 'product-sizes__item'.
        size_items = area(
            'form .product-attributes .product-sizes .product-sizes__item'
        ).items()
        detail['sizes'] = [
            dict(name=opt.text(),
                 sku=opt('input').attr('value'),
                 id=opt('input').attr('value'),
                 inventory=0
                 if 'size_not_available' in (opt.attr('class') or '').split()
                 else self.cfg.DEFAULT_STOCK_NUMBER)
            for opt in size_items if len(opt('input'))
        ]

        # No selectable sizes: off the shelf.
        if not detail['sizes']:
            return off_shelf()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # URL actually served (after redirects)
        detail['backUrl'] = resp.url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #32
0
    def detail(self, url):
        try:

            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            pqhtml = PyQuery(resp.text or 'nothing')

            #下架:
            if status_code == 404:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            #其他错误
            if status_code != 200:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())

                return tool.return_data(successful=False, data=data)

            #前期准备
            Jtxt = pqhtml('script').text()
            area = pqhtml('#itemContent')

            # print area.outerHtml().encode('utf-8')

            #下架
            if not area:

                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))

                self.logger.info(log_info)

                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            pdata = json.loads(
                re.search(r'jsInit.item.colorSizeJson =\s*(.*?\}\});\s*', Jtxt,
                          re.DOTALL).groups()[0])

            detail = dict()

            #名称:
            name = re.search(r'tc_vars\["product_title"\] =\s*"(.*?)";', Jtxt,
                             re.DOTALL).groups()[0]
            # name = json.loads(u'[{0}]'.format(HTMLParser().unescape(name)))[0]
            detail['name'] = area('#itemTitle').text()

            #品牌
            brand = re.search(r'tc_vars\["product_brand"\] =\s*"(.*?)";', Jtxt,
                              re.DOTALL).groups()[0]
            detail['brand'] = area(
                '#itemTitle span[itemprop="brand"]').text() or brand

            #货币符号
            currency = re.search(r'tc_vars\["nav_currency"\] =\s*"(.*?)";',
                                 Jtxt, re.DOTALL).groups()[0]
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            #价格
            detail['price'] = re.search(
                r'tc_vars\["product_discountprice"\] =\s*"(.*?)";', Jtxt,
                re.DOTALL).groups()[0]
            detail['listPrice'] = re.search(
                r'tc_vars\["product_price"\] =\s*"(.*?)";', Jtxt,
                re.DOTALL).groups()[0]

            #图片集
            imgsTmp = self.get_imgs(Jtxt, area, pdata)
            detail['img'] = dict([
                (cid, imgs[0]) for cid, imgs in imgsTmp.items()
            ]) if isinstance(imgsTmp, dict) else imgsTmp[0]
            detail['imgs'] = imgsTmp

            #规格
            detail['sizes'] = self.get_sizes(pdata)

            #产品ID
            detail['productId'] = dict([(color['Cod10'], color['Cod10'])
                                        for color in pdata['Colors']])

            #颜色
            detail['color'] = dict([(color['Cod10'], color['Name'])
                                    for color in pdata['Colors']])
            detail['colorId'] = dict([(color['Cod10'], color['Cod10'])
                                      for color in pdata['Colors']])

            #描述,2016-09-25 12:31:54 修改
            detail['descr'] = area('#item-infos li:first').remove(
                'script').text()
            # detail['descr'] = area('#itemInfoTab #tabs-1').remove('script').text()

            #构造物,2016-09-25 12:31:54 修改
            detail['fabric'] = area('#item-infos li:first').remove(
                'script').text()
            # detail['fabric'] = area('#item-infos #tabs-1').remove('script').text()

            #退换货,2016-09-25 12:31:54 修改
            detail['returns'] = area('#item-infos li:last').remove(
                'script').text()
            # detail['returns'] = area('#item-infos #tabs-3').remove('script').text()

            #设计者
            detail['designer'] = re.search(
                r'tc_vars\["product_author"\] =\s*"(.*?)";', Jtxt,
                re.DOTALL).groups()[0]

            #钥匙
            detail['keys'] = [color['Cod10'] for color in pdata['Colors']]

            #HTTP状态码
            detail['status_code'] = status_code

            #状态
            detail['status'] = self.cfg.STATUS_SALE

            #返回链接
            detail['backUrl'] = url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except Exception, e:
            raise
예제 #33
0
파일: fwrd.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Fetch a product page and build its detail record.

        The page is requested through ``self.session`` with TLS verification
        disabled and parsed with PyQuery.

        :param url: product page URL.
        :return: result of ``tool.return_data``. ``successful=False`` with an
            off-shelf payload when the status is 404 or the first
            ``.stock_info`` element contains "Sold Out"; ``successful=False``
            with an error payload for any other non-200 status; otherwise
            ``successful=True`` with the populated ``detail`` dict.
        :raises: any exception from the request, parsing or helper calls
            propagates unchanged (e.g. ``IndexError`` when no image is found).
        """
        resp = self.session.get(url, verify=False)

        status_code = resp.status_code
        # Substitute a placeholder document when the response body is empty.
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Log time, page title and URL for a page that yields no product.
            log_info = json.dumps(dict(time=time.time(), title=pqhtml('title').text(), url=url))
            self.logger.info(log_info)

        def off_shelf():
            # Log the page, then wrap it in an off-shelf (sold out) result.
            log_page()
            data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url, html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Off shelf: the page no longer exists.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status is reported as an error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                  backUrl=resp.url, html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Preparation: every selector below is scoped to the product area.
        area = pqhtml('.fwd_page .fwd_content')

        # Off shelf: the page is served but the stock marker says "Sold Out".
        if 'Sold Out' in area('.stock_info:first').text():
            return off_shelf()

        detail = dict()

        # Brand (two selectors are tried; the second is used when the first is empty).
        brand = (area('.product_info:first .designer_brand:first a:first').text()
                 or area('.product_info:first .product-titles__brand a:first').text())
        detail['brand'] = brand

        # Name: brand followed by the product name (h2 first, h1 as fallback).
        detail['name'] = brand + ' ' + (area('.product_info:first h2.product_name:first').text()
                                        or area('.product_info:first h1.product_name:first').text())

        # Currency
        currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, listPrice = self.get_all_price(area('.eagle .prices'))
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Product ID
        productId = area('button.addtobag').attr('data-code')
        detail['productId'] = productId

        # Color (the product ID doubles as the color ID)
        detail['color'] = area('.color_dd .one_sizeonly').text() or area('.color_dd option:first').text()
        detail['colorId'] = productId

        # Images; imgs[0] raises IndexError when the slideshow has no image.
        imgs = [a.attr('data-zoom-image')
                for a in area('.cycle-slideshow .product-detail-image-zoom img').items()]
        detail['img'] = imgs[0]
        detail['imgs'] = imgs

        # Sizes
        detail['sizes'] = self.get_sizes(area)

        # Description
        detail['descr'] = area('#details').text()

        # Brand description
        detail['brandDescr'] = area('#aboutdesigner').text()

        # Returns policy
        detail['returns'] = area('#free_ship_popup').text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response
        detail['backUrl'] = resp.url

        log_info = json.dumps(dict(time=time.time(),
                                   productId=detail['productId'],
                                   name=detail['name'],
                                   currency=detail['currency'],
                                   price=detail['price'],
                                   listPrice=detail['listPrice'],
                                   url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #34
0
파일: mrporter.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Fetch a product page and build its detail record.

        The page is requested through ``self.session`` with TLS verification
        disabled. As a side effect ``self.link_area`` is set to the two-letter
        code captured from the ``/en-xx/`` segment of ``url``.

        :param url: product page URL; it must contain an ``/en-xx/`` segment,
            otherwise the ``re.search`` result is ``None`` and an
            ``AttributeError`` is raised.
        :return: result of ``tool.return_data``. ``successful=False`` with an
            off-shelf payload for HTTP 404, when ``self.checkSoldOut`` is
            truthy, when ``self.get_sizes`` returns ``None`` or when the
            request raises ``TooManyRedirects``; ``successful=False`` with an
            error payload for any other non-200 status; otherwise
            ``successful=True`` with the populated ``detail`` dict.
        :raises: every exception other than ``TooManyRedirects`` propagates
            unchanged.
        """
        try:
            resp = self.session.get(url, verify=False)

            status_code = resp.status_code
            # Substitute a placeholder document when the response body is empty.
            pqhtml = PyQuery(resp.text or 'nothing')

            def log_page():
                # Log time, page title and URL for a page that yields no product.
                log_info = json.dumps(
                    dict(time=time.time(),
                         title=pqhtml('title').text(),
                         url=url))
                self.logger.info(log_info)

            def off_shelf():
                # Log the page, then wrap it in an off-shelf (sold out) result.
                log_page()
                data = tool.get_off_shelf(code=status_code,
                                          message=self.cfg.SOLD_OUT,
                                          backUrl=resp.url,
                                          html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            # Off shelf: the page no longer exists.
            if status_code == 404:
                return off_shelf()

            # Any other non-200 status is reported as an error.
            if status_code != 200:
                log_page()
                data = tool.get_error(code=status_code,
                                      message=self.cfg.GET_ERR.get(
                                          'SCERR', 'ERROR'),
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
                return tool.return_data(successful=False, data=data)

            area = pqhtml('#content')

            self.link_area = re.search(r'/en-(\w{2})/', url).groups()[0]

            if self.checkSoldOut(pqhtml):
                return off_shelf()

            pdata = self.get_pdata(area)

            detail = dict()

            # Brand
            brand = pdata['brand']['name']
            detail['brand'] = brand

            # Name: brand followed by the product name.
            detail['name'] = brand + ' ' + pdata['name']

            # Currency
            currency = pdata['price']['currency']
            detail['currency'] = currency
            detail['currencySymbol'] = tool.get_unit(currency)

            # Prices
            price, listPrice = self.get_all_price(pdata)
            detail['price'] = price
            detail['listPrice'] = listPrice

            # Images; imgsTmp[0] raises when get_imgs returns an empty sequence.
            imgsTmp = self.get_imgs(area)
            detail['img'] = imgsTmp[0]
            detail['imgs'] = imgsTmp

            # Sizes; a None result is treated as sold out.
            sizesTmp = self.get_sizes(pdata)

            if sizesTmp is None:
                return off_shelf()

            # Normalize a lone "one size" entry to the configured default name.
            if len(sizesTmp) == 1 and sizesTmp[0]['name'].lower() in [
                    'one size', 'onesize'
            ]:
                sizesTmp[0]['name'] = self.cfg.DEFAULT_ONE_SIZE

            detail['sizes'] = sizesTmp

            # Video (only when the product data carries a non-empty list)
            if 'videos' in pdata and pdata['videos']:
                detail['video'] = self.get_video(pdata)

            # Product note
            detail['note'] = area(
                'section.product-accordion--desktop>section:first').text()

            # Size and fit
            detail['sizeFit'] = area(
                'section.product-accordion--desktop>section:eq(1)').text()

            # Product details
            detail['detail'] = area(
                'section.product-accordion--desktop>section:eq(2)').text()

            # Delivery
            # NOTE(review): 'delivery' and 'returns' read the same
            # `section:last` selector -- confirm this is intended.
            detail['delivery'] = area(
                'section.product-accordion--desktop>section:last').text()

            # Returns
            detail['returns'] = area(
                'section.product-accordion--desktop>section:last').text()

            # Description
            detail['descr'] = self.get_descr(area)

            # Product ID
            detail['productId'] = pdata['id']

            # Color: first colourInfo entry, or the configured default.
            detail['color'] = pdata['colourInfo'][0]['colourName'] if pdata[
                'colourInfo'] else self.cfg.DEFAULT_ONE_COLOR

            # Color ID: first colourInfo id (or the default SKU when that id
            # is falsy); the product id when there is no colourInfo at all.
            detail['colorId'] = (pdata['colourInfo'][0]['colourId']
                                 or self.cfg.DEFAULT_COLOR_SKU
                                 ) if pdata['colourInfo'] else pdata['id']

            # HTTP status code
            detail['status_code'] = status_code

            # Sale status
            detail['status'] = self.cfg.STATUS_SALE

            # Final URL of the response
            detail['backUrl'] = resp.url

            log_info = json.dumps(
                dict(time=time.time(),
                     productId=detail['productId'],
                     name=detail['name'],
                     currency=detail['currency'],
                     price=detail['price'],
                     listPrice=detail['listPrice'],
                     url=url))

            self.logger.info(log_info)

            return tool.return_data(successful=True, data=detail)

        except TooManyRedirects as e:
            # A redirect loop is reported as an off-shelf product with code 0.
            self.logger.exception(e)

            data = tool.get_off_shelf(code=0,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=url,
                                      html=str(e))

            return tool.return_data(successful=False, data=data)
예제 #35
0
def simple_package(package_name):
    ''' Given a package name, returns all the versions for downloading
    that package.

    If the package doesn't exist, then it will call PyPi (CheeseShop).
    But if the package exists in the local path, then it will get all
    the versions for the local package.

    This will take into account if the egg is private or if it is a normal
    egg that was uploaded to PyPi. This is important to take into account
    the version of the eggs. For example, a project requires request==1.0.4
    and another package uses request==1.0.3. Then the installation of the
    second package will fail because it wasn't downloaded and the **request**
    folder only has the 1.0.4 version.

    To solve this problem, the system uses 2 different kinds of eggs:

    * private eggs: are the eggs that you uploaded to the private repo.
    * normal eggs: are the eggs that are downloaded from pypi.

    So the normal eggs will always get the simple page from the pypi repo,
    while the private eggs will always be read from the filesystem.


    :param package_name: the name of the egg package. This is only the
                          name of the package without the version or
                          anything else.

    :return: a template with all the links to download the packages.
    '''
    app.logger.debug('Requesting index for: %s', package_name)
    package_folder = get_package_path(package_name)
    if (is_private(package_name) or (
            exists(package_name) and app.config['SHOULD_USE_EXISTING'])):

        app.logger.debug('Found information of package: %s in local repository',
                         package_name)
        package_versions = []
        template_data = dict(
            source_letter=package_name[0],
            package_name=package_name,
            versions=package_versions
        )

        for filename in listdir(package_folder):
            if not filename.endswith('.md5'):
                # I only read .md5 files so I skip this egg (or tar,
                # or zip) file
                continue

            with open(join(package_folder, filename)) as md5_file:
                md5 = md5_file.read()

            # remove .md5 extension
            name = filename[:-4]
            package_versions.append(VersionData(name, md5))

        return render_template('simple_package.html', **template_data)

    app.logger.debug('Didnt found package: %s in local repository. '
                     'Using proxy.', package_name)
    url = app.config['PYPI_URL'] + 'simple/%s' % package_name
    response = get(url)
    if response.status_code != 200:
        # The two message parts need an explicit separator; adjacent string
        # literals are concatenated without any space.
        app.logger.warning('Error while getting proxy info for: %s. '
                           'Error details: %s', package_name,
                           response.text)
        abort(response.status_code)

    content = response.content
    p = PyQuery(content)
    external_links = set()
    for anchor in p("a"):
        panchor = PyQuery(anchor)
        href = panchor.attr('href')
        if not href:
            # an anchor without a target cannot be a package link, and
            # urlparse / startswith below would fail on a missing href
            panchor.remove()
            continue

        # robin-jarry: modified the href to ../../packages/
        # so that it works also for non-source packages (.egg, .exe and .msi)
        parsed = urlparse.urlparse(href)

        if parsed.hostname:
            # the link is to an external server.
            if parsed.hostname == 'pypi.python.org':
                # we remove the hostname to make the URL relative
                panchor.attr('href', parsed.path)
            else:
                if panchor.attr('rel') == 'download':
                    if url_is_egg_file(parsed.path):
                        # href points to a filename
                        external_links.add('<a href="%s">%s</a>' % (href, basename(parsed.path)))
                    else:
                        # href points to an external page where we will find
                        # links to package files
                        external_links.update(find_external_links(href))
                # whatever happens, we remove the link for now;
                # the collected external_links are re-inserted below
                panchor.remove()
        else:
            # local link to pypi.python.org
            if not href.startswith('../../packages/'):
                # ignore anything else than package links
                panchor.remove()

    # after collecting all external links, we insert them in the html page
    for link in external_links:
        plink = PyQuery(link)
        href = plink.attr('href')
        plink.attr('href', convert_to_internal_url(href, package_name, basename(href)))
        # NOTE(review): p('a') selects every remaining anchor, so the link
        # appears to be inserted after each of them -- confirm this is intended.
        p('a').after(plink)

    content = p.outerHtml()
    return content
예제 #36
0
파일: okini.py 프로젝트: hellowac/drag
    def detail(self, url):
        """Scrape one product page and return its detail record.

        The page is fetched through ``self.session`` with TLS verification
        disabled and parsed with PyQuery.

        :param url: product page URL.
        :return: result of ``tool.return_data``. ``successful=False`` with an
            off-shelf payload for HTTP 404, ``successful=False`` with an error
            payload for any other non-200 status, otherwise
            ``successful=True`` with the populated ``detail`` dict.
        """
        response = self.session.get(url, verify=False)

        http_status = response.status_code
        # Substitute a placeholder document when the response body is empty.
        page = PyQuery(response.text or 'nothing')

        def log_page():
            # Log time, page title and URL for a page that yields no product.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=page('title').text(), url=url)))

        # Off shelf: the page no longer exists.
        if http_status == 404:
            log_page()
            payload = tool.get_off_shelf(code=http_status,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=response.url,
                                         html=page.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Any other non-200 status is reported as an error.
        if http_status != 200:
            log_page()
            payload = tool.get_error(code=http_status,
                                     message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                     backUrl=response.url,
                                     html=page.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation: every selector below is scoped to the primary column.
        main = page('#main #primary')

        detail = dict()

        # Product ID (the '#' characters are stripped); reused as SKU and code.
        product_code = main('[itemprop="productID"]:first').text().replace('#', '')
        detail['productId'] = product_code
        detail['productSku'] = product_code
        detail['productCode'] = product_code

        # Brand
        brand_name = main('.brand-name:first').text()
        detail['brand'] = brand_name

        # Name: brand and product name joined by a single space.
        product_name = main('.product-name:first').text()
        detail['name'] = ' '.join((brand_name, product_name))

        # Prices
        sale_price, list_price, currency_code = self.get_all_price(main)
        detail['price'] = sale_price
        detail['listPrice'] = list_price

        # Currency
        detail['currency'] = currency_code
        detail['currencySymbol'] = tool.get_unit(currency_code)

        # Description
        detail['descr'] = main('#pdpMain .product-detail .product-information').text()

        # Color: this site always gets the configured defaults.
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

        # Images; the first one is the cover image.
        image_urls = []
        for image in main('#pdpMain #product-col-2 img').items():
            image_urls.append(image.attr('src'))
        detail['img'] = image_urls[0]
        detail['imgs'] = image_urls

        # Sizes
        detail['sizes'] = self.get_sizes(main)

        # HTTP status code
        detail['status_code'] = http_status

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response
        detail['backUrl'] = response.url

        # Peer address of the underlying connection, joined with ':'.
        peer = response.raw._original_response.peer
        if peer:
            detail['ip_port'] = ':'.join(str(part) for part in peer)

        summary = dict(time=time.time(),
                       productId=detail['productId'],
                       name=detail['name'],
                       currency=detail['currency'],
                       price=detail['price'],
                       listPrice=detail['listPrice'],
                       url=url)
        self.logger.info(json.dumps(summary))

        return tool.return_data(successful=True, data=detail)
예제 #37
0
    def detail(self, url):
        """Fetch a multi-look product page and build its detail record.

        The page is requested through ``self.session`` with the configured
        timeout. Product data is read from the text of the page's ``script``
        elements via ``self.get_pdata`` and ``self.get_skuJournal``; most
        values in the result are dicts keyed by ``productLookId``.

        :param url: product page URL.
        :return: result of ``tool.return_data``. ``successful=False`` with an
            off-shelf payload for HTTP 404 or when the script text lacks
            ``productDetails``; ``successful=False`` with an error payload for
            any other non-200 status; otherwise ``successful=True`` with the
            populated ``detail`` dict.
        :raises: any exception from the request, parsing or helper calls
            propagates unchanged.
        """
        resp = self.session.get(url, timeout=self.cfg.REQUEST_TIME_OUT)

        status_code = resp.status_code
        # Substitute a placeholder document when the response body is empty.
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Log time, page title and URL for a page that yields no product.
            log_info = json.dumps(
                dict(time=time.time(),
                     title=pqhtml('title').text(),
                     url=url))
            self.logger.info(log_info)

        def off_shelf():
            # Log the page, then wrap it in an off-shelf (sold out) result.
            log_page()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Off shelf: the page no longer exists.
        if status_code == 404:
            return off_shelf()

        # Any other non-200 status is reported as an error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        Jtxt = pqhtml('script').text()

        # Off shelf: the scripts carry no product data.
        if 'productDetails' not in Jtxt:
            return off_shelf()

        pdata = self.get_pdata(Jtxt)

        # Preparation
        product = pdata['product']
        allLooks = product['allLooks']
        skuJournal = self.get_skuJournal(Jtxt)
        # Products without size / color attributes get one default entry.
        if 'sizeAttribute' in product:
            sizeAttribute = product['sizeAttribute']
        else:
            sizeAttribute = {
                'values': [{
                    'id': 0,
                    'value': self.cfg.DEFAULT_ONE_SIZE
                }]
            }
        if 'colorAttribute' in product:
            colorAttribute = product['colorAttribute']
        else:
            colorAttribute = {
                'values': [{
                    'id': 0,
                    'value': self.cfg.DEFAULT_ONE_COLOR
                }]
            }

        # lookId -> image URLs
        lookId2ImgArr = dict(
            (look['productLookId'],
             ['http:' + img['retinaQuickViewLookUrl']
              for img in look['images']])
            for look in allLooks)
        # lookId -> current price (each color can have its own price)
        lookId2Price = dict(
            (look['productLookId'],
             look['pricing']['maxSkuSalePrice']['raw'])
            for look in allLooks)
        # lookId -> list price (each color can have its own price)
        lookId2ListPrice = dict(
            (look['productLookId'],
             look['pricing']['maxSkuMsrpPrice']['raw'])
            for look in allLooks)
        # lookId -> sku ids
        lookId2SkuArr = dict(
            (look['productLookId'],
             [sku['skuId'] for sku in look['skus']])
            for look in allLooks)
        # sizeId -> size name
        sizeId2Name = dict((size['id'], size['value'])
                           for size in sizeAttribute['values'])
        # colorId -> color name
        colorId2Name = dict((color['id'], color['value'])
                            for color in colorAttribute['values'])
        # sku -> inventory (in stock)
        sku2Inventory = self.get_sku2Inventory(skuJournal)

        entries = skuJournal['entries']

        # sku -> inventory (out of stock)
        # NOTE(review): `status` is compared for equality with the list
        # ['X', 'U']; if status is a single code this never matches and
        # `in` was probably intended -- confirm against the journal schema.
        sku2NoInventory = dict(
            (sku['skuId'], sku['numberUnitsForSale'])
            for sku in entries
            if sku['type'] == 'inventory' and sku['status'] == ['X', 'U'])
        # Merge the out-of-stock entries into the inventory dict.
        sku2Inventory.update(sku2NoInventory)
        # sku -> current price (each size can have its own price)
        sku2Price = dict((sku['skuId'], str(sku['salePrice']['raw']))
                         for sku in entries
                         if sku['type'] == 'pricing')
        # sku -> list price (each size can have its own price)
        sku2ListPrice = dict((sku['skuId'], str(sku['msrpPrice']['raw']))
                             for sku in entries
                             if sku['type'] == 'pricing')
        # skuId -> sizeId
        skuId2SizeId = dict(
            (sku['skuId'], sku['savId']) for sku in entries
            if sku['type'] == 'associate' and sku['attribute'] == 'Size')
        # skuId -> colorId
        skuId2ColorId = dict(
            (sku['skuId'], sku['savId']) for sku in entries
            if sku['type'] == 'associate' and sku['attribute'] == 'Color')
        # sku -> size name
        sku2SizeName = self.get_sku2SizeName(product, skuId2SizeId,
                                             sizeId2Name)
        # sku -> color name
        sku2ColorName = self.get_sku2ColorName(product, skuId2ColorId,
                                               colorId2Name)
        # lookId -> colorId
        lookId2ColorId = self.get_lookIe2ColorId(lookId2SkuArr,
                                                 skuId2ColorId)
        # lookId -> color name
        lookId2ColorName = self.get_lookIe2ColorName(
            lookId2SkuArr, sku2ColorName)
        # lookId -> sizes
        lookId2Sizes = self.get_lookId2Sizes(lookId2SkuArr, sku2SizeName,
                                             sku2Inventory, sku2Price,
                                             sku2ListPrice)

        detail = dict()

        # Keys: every look id; list() keeps this a list on Python 3 as well.
        detail['keys'] = list(lookId2SkuArr.keys())

        # Color
        detail['color'] = lookId2ColorName
        detail['colorId'] = lookId2ColorId

        # Product ID
        detail['productId'] = product['productId']

        # Images: the first image of each look is its cover image.
        detail['img'] = dict((lookId, imgArr[0])
                             for lookId, imgArr in lookId2ImgArr.items())
        detail['imgs'] = lookId2ImgArr

        # Sizes
        detail['sizes'] = lookId2Sizes

        # Prices
        detail['price'] = lookId2Price
        detail['listPrice'] = lookId2ListPrice

        # Brand
        brand = pdata['brand']['name']
        detail['brand'] = brand

        # Name: brand followed by the product name.
        detail['name'] = brand + ' ' + pdata['product']['name']

        # Currency
        currency = pdata['defaultLook']['pricing']['currencyCode']
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Returns policy
        detail['returns'] = pdata['returnPolicy']['description']

        # Description: the HTML description's text with <strong> elements removed.
        dtxt = PyQuery(pdata['product']['description'])
        dtxt.remove('strong')
        detail['descr'] = dtxt.text()

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response
        detail['backUrl'] = resp.url

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)
예제 #38
0
    def detail(self, url):
        """Download a product page and parse it into a detail dict.

        The page is fetched with ``self.session`` (certificate verification
        disabled) and parsed with PyQuery.

        Args:
            url: Product page URL. It must contain ``goods/<digits>``; the
                digits are used as product id, SKU and product code.

        Returns:
            The value of ``tool.return_data``:

            * ``successful=False`` with ``tool.get_off_shelf`` data when the
              response is a 404, the ``#buy`` element is missing, or the
              ``body .right`` area is missing;
            * ``successful=False`` with ``tool.get_error`` data for any other
              non-200 status;
            * ``successful=True`` with the parsed detail dict otherwise.

        Raises:
            AttributeError: If the page is parsed successfully but ``url``
                does not contain ``goods/<digits>`` (the regex search
                returns ``None``).
        """
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code

        # Fall back to a placeholder so PyQuery always gets a non-empty body.
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Failure paths log only the page title and the requested URL.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url)))

        def off_shelf():
            # Shared "product unavailable" response.
            log_page()
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Off the shelf: page not found, or no add-to-cart button.
        # NOTE(review): because this check runs first, a non-200/non-404
        # response without a "#buy" element is reported as off-shelf rather
        # than as an error -- confirm this is intended.
        if status_code == 404 or not pqhtml('#buy'):
            return off_shelf()

        # Any other HTTP error.
        if status_code != 200:
            log_page()
            data = tool.get_error(code=status_code,
                                  message=self.cfg.GET_ERR.get(
                                      'SCERR', 'ERROR'),
                                  backUrl=resp.url,
                                  html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=data)

        # Page regions used by the parsing below.
        img_area = pqhtml('body div.left')
        prod_area = pqhtml('body .right')

        # Off the shelf: the product area is missing.
        if not prod_area:
            return off_shelf()

        detail = dict()

        # Product id (also used as SKU and product code).
        productId = re.search(r'goods\/(\d+)[\/]?', url).groups()[0]
        detail['productId'] = productId
        detail['productSku'] = productId
        detail['productCode'] = productId

        # Brand: text of the last <p>, minus the "enter brand" link label.
        brand = prod_area('p:last').text().replace(u'进入品牌', '').strip()
        detail['brand'] = brand

        # Name: the element right before "#kuriosity_code".
        detail['name'] = prod_area('#kuriosity_code').prev().text()

        # Currency.
        currency = 'CNY'
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices.
        price, listPrice = self.get_all_price(prod_area)
        detail['price'] = price
        detail['listPrice'] = listPrice

        # Returns policy is intentionally left empty.
        detail['returns'] = ''

        # Description. The last <div> of the image area is emptied first so
        # its text (after-sales notes, per the original comment) is not
        # picked up by the 'div:first' text below.
        img_area('div:last').empty()
        detail['descr'] = prod_area('.text').text() + img_area(
            'div:first').text()

        # Colour: the configured single-colour defaults are used.
        detail['color'] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

        # Images: thumbnails carry site-relative paths. Tags without a
        # "src" are skipped instead of raising TypeError on concatenation.
        imgs = []
        for img in img_area('img.small').items():
            src = img.attr('src')
            if src:
                imgs.append('https://www.k11kuriosity.com' + src)
        # An empty gallery yields '' rather than an IndexError.
        detail['img'] = imgs[0] if imgs else ''
        detail['imgs'] = imgs

        # Sizes.
        detail['sizes'] = self.get_sizes(prod_area)

        # HTTP status code.
        detail['status_code'] = status_code

        # Sale status.
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response.
        detail['backUrl'] = resp.url

        # Remote IP and port. "peer" is not a standard attribute of the
        # underlying HTTP response (NOTE(review): presumably attached by a
        # project-side patch -- confirm), so read it defensively instead of
        # failing after the whole page has been parsed.
        peer = getattr(
            getattr(resp.raw, '_original_response', None), 'peer', None)
        if peer:
            detail['ip_port'] = ':'.join(map(str, peer))

        log_info = json.dumps(
            dict(time=time.time(),
                 productId=detail['productId'],
                 name=detail['name'],
                 currency=detail['currency'],
                 price=detail['price'],
                 listPrice=detail['listPrice'],
                 url=url))

        self.logger.info(log_info)

        return tool.return_data(successful=True, data=detail)