Example #1
    def parse_other_url(self, dom, params):
        """
        获取所有url, 并写入depfilter_task_queue
        :param dom:
        :param params:
        :return:
        """
        result_list = []
        channel = self.get_channel(dom)
        for e in dom.find('a'):
            sub_url = PyQuery(e).attr('href')
            if sub_url and sub_url.startswith("."):
                sub_url = self.link_analysis.url_join(params["info:url"],
                                                      sub_url)

            if self.link_analysis.url_legal(sub_url, self.allow_domains):
                if not self.link_filter(sub_url):
                    # push onto the Redis priority queue
                    _params = dict(
                        params.copy(), **{
                            "info:url": sub_url,
                            "info:channel": channel
                        })
                    result_list.extend(
                        [json.dumps(_params),
                         int(_params["info:priority"])])
        self.redis_action.priority_queue_push("dupfilter_task_queue",
                                              *result_list)
        return "parse urls"
Example #2
    def extract_detail_url(self, html):
        pq = PQ(html)
        div = pq("div[class='question-summary']")
        hrefs = []
        for a in div('h3 a'):
            href = PQ(a).attr('href')
            # attr() returns None when the attribute is missing
            if not href:
                continue
            if href.startswith('/'):
                href = self.BASE_URL + href
            hrefs.append(href)
        return hrefs
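
Examples #1 and #2 both absolutize relative hrefs by hand (prefix checks plus string concatenation). The standard library's urljoin handles the same cases more generally; here is a minimal self-contained sketch of that approach (the sample HTML and base URL are only illustrations):

from urllib.parse import urljoin

from pyquery import PyQuery

def absolute_links(html, base_url):
    # Resolve every <a href> on the page against the page's own URL.
    doc = PyQuery(html)
    return [urljoin(base_url, PyQuery(a).attr('href'))
            for a in doc('a') if PyQuery(a).attr('href')]

print(absolute_links('<a href="/q/1">q1</a> <a href="./q/2">q2</a>',
                     'https://example.com/questions/'))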
Example #3

    def parseProductDetails(self, product_page_content, product_info):
        doc = PyQuery(product_page_content)
        product_info['name'] = doc('h1#div_product_name').text()
        product_info['sku_id'] = doc('span#div_product_itemno').text()
        product_info['price'] = doc('span#div_product_price').text()
        product_info['label_price'] = doc('span#div_retail_price').text()
        product_info['img_url'] = self.merchant.filteruri(
            doc('img#target_img').attr('src'))
        # extract the review count
        product_info['reviews'] = '0'
        for item in doc('b'):
            text = PyQuery(item).text()
            if text.startswith("Customer Reviews"):
                product_info['reviews'] = extractNum(text)
                break
        # extract the category breadcrumb path
        nodeList = doc('a.nav-location')
        if nodeList and PyQuery(nodeList[0]).text().strip() == 'Home':
            nodeList = nodeList[1:]
        for i, node in enumerate(nodeList):
            product_info['level%d_category' % (i + 1)] = PyQuery(node).text().strip()
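
extractNum is referenced above but not defined in the snippet. A plausible minimal version, assuming it should pull the first run of digits out of text such as "Customer Reviews (1,234)"; the real helper's behaviour may differ.

import re

def extractNum(text):
    # Grab the first run of digits, tolerating thousands separators.
    match = re.search(r'\d+', text.replace(',', ''))
    return match.group(0) if match else '0'

print(extractNum('Customer Reviews (1,234)'))  # -> '1234'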
Example #4
    def parse_item(self, response):
        city = response.meta.get('item')
        # Parse once and reuse; the company block holds most of the fields.
        pq = PyQuery(response.body)
        company = pq.find('.terminal-company')
        name = pq.find('.company-name-t').find('a').text()
        size = company.find('li').eq(0).find('strong').text()
        nature = company.find('li').eq(1).find('strong').text()
        industry = company.find('li').eq(2).find('a').text()
        website = company.find('li').eq(-2).find('a').text()
        if not website.startswith('www'):
            website = None
        address = company.find('li').eq(-1).find('strong').text().strip()
        item = ZhilianItem(city=city,
                           name=name,
                           size=size,
                           nature=nature,
                           industry=industry,
                           website=website,
                           address=address)
        yield item
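
ZhilianItem is imported from elsewhere in the project and not shown. A minimal scrapy.Item sketch with exactly the fields parse_item fills in above; the real class may define more.

import scrapy

class ZhilianItem(scrapy.Item):
    city = scrapy.Field()
    name = scrapy.Field()
    size = scrapy.Field()
    nature = scrapy.Field()
    industry = scrapy.Field()
    website = scrapy.Field()
    address = scrapy.Field()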
Example #5
    def run(self):
        '''
        Parse the page source for one product URL.
        '''
        time.sleep(random.uniform(1.0, 3.6))
        try:
            pq = helper.get(self.url, myHeaders=self.headers)
            # model name
            name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()
            # colors and sizes
            # collect all available size options
            size_span_list = pq('div.product-sizes__options span.product-sizes__detail')
            size_price_list = []
            for size_span in size_span_list:
                size = PyQuery(size_span).find('span.product-sizes__size').text().strip()
                if 'K' in size.upper() or '-' in size or 'XS' in size:
                    continue
                size = re.sub(r'[WwYyCc\*]', '', size)
                # non-numeric apparel sizes also show up; skip them
                if size in ('S', 'M', 'L', 'XL', 'XXL', 'XXXL', 'OS', ''):
                    continue
                price = PyQuery(size_span).find('span.product-sizes__price').text().strip()
                if price.startswith('$'):
                    price = price.replace('$', '').replace(',', '')
                    size_price_list.append({
                        'size': size,
                        'price': float(price),
                        'isInStock': True
                    })
                else:
                    size_price_list.append({
                        'size': size,
                        'price': 0.0,
                        'isInStock': False
                    })
            if len(size_price_list) < 1:
                return
            # style number of the colorway
            number = ''
            # gender
            gender = 0
            # colorway description
            color_value = ''
            tr_list = pq('table#product-attribute-specs-table tr')
            for tr in tr_list:
                key = PyQuery(tr).find('th').text().strip()
                if key == 'Gender':
                    gender_txt = PyQuery(tr).find('td').text().strip()
                    if gender_txt == 'Mens':
                        gender = 1
                    elif gender_txt == 'Womens':
                        gender = 2
                elif key == 'Colorway':
                    color_value = PyQuery(tr).find('td').text().strip()
                elif key == 'Manufacturer Sku':
                    number = PyQuery(tr).find('td').text().strip()
            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)

            if not img_downloaded:
                img_url = pq('div.product-gallery-image > img')[0].get('src')
                # download the product image
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
                if result == 1:
                    # upload to Qiniu
                    qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number, './imgs/stadiumgoods/%s.jpg' % number)
                img_downloaded = True
            mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], gender, color_value, 'stadiumgoods', '5b8f484b299207efc1fb0904', self.crawl_counter, img_downloaded=img_downloaded)
        except Exception:
            global error_detail_url
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
            if error_counter < 3:
                self.q.put(self.url)
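
run() re-queues failed URLs via self.q, which implies a pool of workers draining a shared queue. A minimal sketch of such a driver, assuming one thread per worker and a seed list of detail-page URLs (the seed URL and worker count are illustrative, not from the snippet):

import queue
import threading

def worker(q):
    while True:
        try:
            url = q.get_nowait()
        except queue.Empty:
            return
        # Placeholder for constructing and running the parser above,
        # e.g. Parser(url, q).run() in the real project.
        print('would crawl %s' % url)
        q.task_done()

q = queue.Queue()
q.put('https://www.stadiumgoods.com/example-shoe')  # assumed seed URL
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()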
Example #6
# Download Pixelmator tutorial videos from vimeo.
# by twinsant
import requests
from pyquery import PyQuery

from savevideo import get_download_links
from savevideo import download_video

if __name__ == '__main__':
    # Get all vimeo urls in tutorial page
    session = requests.Session()
    r = session.get('http://www.pixelmator.com/tutorials/')
    d = PyQuery(r.text.encode('utf8'))
    hrefs = d('a')
    urls = set()
    for href in hrefs:
        a = PyQuery(href).attr.href
        if a and a.startswith('https://vimeo.com'):
            urls.add(a)
    for url in sorted(list(urls)):
        print('Get video links for %s' % url)
        # With help of savevideo.me
        links = get_download_links(url)
        for link in links:
            video_url, profile = link
            # Exclude HD and Mobile versions
            if '(MP4 format)' in profile:
                download_video(video_url)
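
get_download_links and download_video come from the local savevideo module and are not shown. A plausible stand-in for download_video, assuming it simply streams the resolved video URL to disk; the real helper may differ.

import os
import requests

def download_video(video_url, out_dir='videos'):
    # Stream the file to disk so large videos never sit fully in memory.
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, video_url.rsplit('/', 1)[-1] or 'video.mp4')
    with requests.get(video_url, stream=True) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f.write(chunk)
    return filename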