Пример #1
0
class Spider(object):
    """Download profile pictures listed on the mm.taobao.com top-list pages.

    The spider reads the JSON/HTML top-list page by page, extracts one entry
    per listed person (profile URL, thumbnail URL, display name), creates a
    directory named after the display name and stores every image found in
    the profile's content section inside it.

    NOTE(review): ``Taobao``, ``urllib2`` and ``urlopen`` are expected to be
    provided by the surrounding module; they are not defined in this class.
    """

    def __init__(self):
        self.tao=Taobao()
        self.tao.main()
        # Prefix of the top-list URL; the page number is appended by baseurl().
        self.u='http://mm.taobao.com/json/request_top_list.htm?page='
        # Groups: (profile url, thumbnail url, display name) of one list item.
        self.rule_1='<div class="list-item">.*?<div class="pic-word">.*?<a href="//(.*?)".*?<img src="//(.*?)".*?<a class="lady-name" .*?>(.*?)</a>'
        # Group: protocol-less image url of one <img> tag.
        self.rule_2='<img.*?src="//(.*?)"'
        # Group: inner HTML of the profile content section.
        self.rule_3='<div class="mm-aixiu-content".*?">(.*?)<!--'

    def baseurl(self,i):
        """Return the top-list URL for page number ``i``."""
        return self.u+str(i)

    def read(self,url):
        """Fetch ``url`` and return its body decoded from GBK."""
        req=urllib2.Request(url)
        resp=urlopen(req)
        try:
            return resp.read().decode('gbk')
        finally:
            # Always release the connection, even if read/decode fails.
            resp.close()

    def compile(self,rule):
        """Compile ``rule`` with ``re.S`` so that ``.`` also matches newlines."""
        return re.compile(rule,re.S)

    def mkdir(self,path):
        """Create directory ``path``.

        Returns True when the directory was created and False when it
        already existed.
        """
        path=path.strip()
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def savePicutre(self,temp,path):
        """Download every protocol-less URL in ``temp`` into directory ``path``.

        Files are named ``0.jpg``, ``1.jpg``, ... in iteration order.
        """
        path=path.strip()
        for k,i in enumerate(temp):
            print(i)
            resp=urlopen('http://'+i)
            try:
                data=resp.read()
            finally:
                resp.close()
            # os.path.join instead of a hard-coded backslash so the files
            # land inside ``path`` on every platform.
            with open(os.path.join(path,str(k)+'.jpg'),'wb') as f:
                f.write(data)

    def load(self,start,end):
        """Process top-list pages ``start`` (inclusive) to ``end`` (exclusive).

        Raises:
            TypeError: if ``start`` or ``end`` is not an ``int``.
            Exception: if the target directory of an entry already exists.
            ValueError: if a profile page has no content section.
        """
        if not (isinstance(start,int) and isinstance(end,int)):
            raise TypeError('start and end must be integers')
        # Compile once instead of on every loop iteration.
        item_re=self.compile(self.rule_1)
        img_re=self.compile(self.rule_2)
        content_re=self.compile(self.rule_3)
        for i in range(start,end):
            for temp in item_re.findall(self.read(self.baseurl(i))):
                if not self.mkdir(temp[2]):
                    raise Exception('directory already exists: %r' % (temp[2],))
                p=self.tao.getpage('http://'+temp[0])
                # Round trip kept from the original: it validates that the
                # page really is GBK and leaves ``p`` as a GBK byte string.
                p=p.decode('gbk')
                p=p.encode('gbk')
                picpart=content_re.search(p)
                if picpart is None:
                    raise ValueError('no content section found on %r' % (temp[0],))
                content=picpart.group(1)
                print(content)
                pic=img_re.findall(content)
                print(pic)
                self.savePicutre(pic,temp[2])
Пример #2
0
 def __init__(self):
     """Create the Taobao client, run its ``main()`` and store scraping settings.

     Sets the top-list URL prefix and the three regular-expression patterns
     used to pick apart list items, image tags and the profile content block.
     """
     client=Taobao()
     self.tao=client
     client.main()
     # URL prefix of the top list; a page number is appended by the caller.
     self.u='http://mm.taobao.com/json/request_top_list.htm?page='
     # One list item: (profile url, thumbnail url, display name).
     self.rule_1='<div class="list-item">.*?<div class="pic-word">.*?<a href="//(.*?)".*?<img src="//(.*?)".*?<a class="lady-name" .*?>(.*?)</a>'
     # One image tag: protocol-less source url.
     self.rule_2='<img.*?src="//(.*?)"'
     # Profile content block: inner HTML up to the next comment marker.
     self.rule_3='<div class="mm-aixiu-content".*?">(.*?)<!--'
Пример #3
0
    def __init__(self, ):
        """Set up the "taoke" logger, the Taobao API client and the OSS client.

        NOTE(review): the API key/secret and the OSS credentials are
        hard-coded; consider loading them from configuration instead.
        """
        from taobao import Taobao
        # Explicit name instead of ``import *``: a wildcard import inside a
        # function is a SyntaxWarning on Python 2 and a SyntaxError on
        # Python 3. Assumes OssAPI is defined in oss.oss_api (it was only
        # reachable through the wildcard before) - confirm.
        from oss.oss_api import OssAPI

        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")
Пример #4
0
class GetTaokDetailOld(object):
    """Task handler that fetches one Taobao item and forwards its details.

    NOTE(review): ``get_comments``, ``save_image_to_oss``,
    ``_parse_image_from_desc`` and ``save_topic_data`` are called on ``self``
    but are not defined in this class as shown; presumably they are provided
    elsewhere - confirm.
    """

    def __init__(self, ):
        # NOTE(review): API and OSS credentials are hard-coded; consider
        # moving them to configuration.
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Fetch the item referenced by ``url`` and publish its details.

        Returns None in every case. Returns early when the local file for
        ``local_url`` already exists, or when the API reports an "isv."
        error (the item id is then posted to the expired queue).

        Raises:
            ValueError: if ``url`` contains no ``/<7 or more digits>/`` part.
            Exception: any other error from the API call is re-raised as is.
        """
        local_abs_path = site.real_path(local_url)
        if os.path.isfile(local_abs_path):
            self.logger.info("the topic is exist in local:%s, local:%s" % (url, local_abs_path))
            return

        self.taobao.http = http
        id_match = re.search(r"/(\d{7,})/", url)
        if id_match is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("no numeric item id found in url: %r" % (url,))
        num_iid = id_match.group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)

        try:
            data = self.taobao.taobao_item_get(fields='detail_url,num_iid,title,nick,type,cid,seller_cids,props,input_pids,input_str,desc,pic_url,num,valid_thru,list_time,delist_time,stuff_status,location,price,post_fee,express_fee,ems_fee,has_discount,freight_payer,has_invoice,has_warranty,has_showcase,modified,increment,approve_status,postage_id,product_id,auction_point,property_alias,item_img,prop_img,sku,video,outer_id,is_virtual',
                num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # The exception class is matched by name; a missing or empty
            # sub_code must not raise inside the handler and hide the
            # original error.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        item['traderates'] = self.get_comments(num_iid, item['nick'])

        self.logger.info("start fetch main images...")
        index = 0
        main_image = [img['url'] for img in item['item_imgs']['item_img']]
        (main_image, index) = self.save_image_to_oss(main_image, index, http, site, num_iid)

        self.logger.info("start fetch desc images...")
        desc_images = self._parse_image_from_desc(item['desc'])
        (desc_images, index) = self.save_image_to_oss(desc_images, index, http, site, num_iid)

        item['main_images'] = main_image
        item['desc_images'] = desc_images

        # A set HUDSON_URL selects the local queue; otherwise the remote one.
        if os.environ.get('HUDSON_URL'):
            queue_url = "http://127.0.0.1:8924/queue/q/imported_taoke?format=json"
        else:
            queue_url = "http://data.deonwu84.com/queue/q/imported_taoke?format=json"
        details = json.dumps(item)
        http.post_data(queue_url, {'details': details, 'num_iid': num_iid}, {})
        http.post(url, site.real_path("log/%s/%s.txt" % (num_iid[-1:], num_iid)), {'data': details})
        self.save_topic_data(data, local_abs_path)
        self.logger.info("done process taoke, id:%s" % num_iid)
Пример #5
0
class GetTaokDetail(object):
    """Task handler that fetches one Taobao item with its properties and comments.

    NOTE(review): ``load_cid_props``, ``convert_props_tostr`` and
    ``get_comments`` are called on ``self`` but are not defined in this class
    as shown; presumably they are provided elsewhere - confirm.
    """

    def __init__(self, ):
        from taobao import Taobao
        # Explicit name instead of ``import *``: a wildcard import inside a
        # function is a SyntaxWarning on Python 2 and a SyntaxError on
        # Python 3. Assumes OssAPI is defined in oss.oss_api - confirm.
        from oss.oss_api import OssAPI

        # NOTE(review): API and OSS credentials are hard-coded; consider
        # moving them to configuration.
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Fetch the item referenced by ``url`` and enrich it with props and comments.

        Returns None. When the API reports an "isv." error the item id is
        posted to the expired queue and the call returns early. A failure
        while fetching comments is logged and otherwise ignored.

        Raises:
            ValueError: if ``url`` contains no ``/<7 or more digits>/`` part.
            Exception: any other error from the item API call is re-raised.
        """
        local_abs_path = site.real_path(local_url)

        self.taobao.http = http
        id_match = re.search(r"/(\d{7,})/", url)
        if id_match is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("no numeric item id found in url: %r" % (url,))
        num_iid = id_match.group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)

        try:
            data = self.taobao.taobao_item_get(fields='detail_url,num_iid,title,nick,type,cid,seller_cids,props,input_pids,input_str,desc,pic_url,num,valid_thru,list_time,delist_time,stuff_status,location,price,post_fee,express_fee,ems_fee,has_discount,freight_payer,has_invoice,has_warranty,has_showcase,modified,increment,approve_status,postage_id,product_id,auction_point,property_alias,item_img,prop_img,sku,video,outer_id,is_virtual',
                num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # The exception class is matched by name; a missing or empty
            # sub_code must not raise inside the handler and hide the
            # original error.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        self.load_cid_props(unicode(item['cid']))
        item['props_str'] = self.convert_props_tostr(item['props'])
        logging.info(u"prpos:%s" % item['props_str'])

        # Comments are best effort: a failure is logged and the item is kept.
        try:
            traderates = self.get_comments(num_iid, item['nick'])
            item['traderates'] = traderates.get("trade_rates", {}).get('trade_rate', [])
            item['traderates_count'] = traderates.get('total_results', 0)
            self.logger.info("traderates_count:%s" % item['traderates_count'])
        except Exception as e:
            self.logger.info("failed to get comments:%s" % e)
Пример #6
0
from taobao import Taobao

def test_get_detail(num_iid):
    """Placeholder with no implementation yet; accepts ``num_iid`` and does nothing."""
    pass
    
def test_get_cate_list(pid):
    """Placeholder with no implementation yet; accepts ``pid`` and does nothing."""
    pass
    
    
if __name__ == '__main__':
    # Manual smoke test: request one item from the API and print the response.
    # NOTE(review): the API key/secret are hard-coded here.
    api = Taobao('12395385', '53697d99eccd670191af0603d7256f77')
    # Alternative call, listing the top-level categories:
    #data = api.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=0)
    data = api.taobao_item_get(fields='detail_url,num_iid,title,nick,type,cid,seller_cids,props,input_pids,input_str,desc,pic_url,num,valid_thru,list_time,delist_time,stuff_status,location,price,post_fee,express_fee,ems_fee,has_discount,freight_payer,has_invoice,has_warranty,has_showcase,modified,increment,approve_status,postage_id,product_id,auction_point,property_alias,item_img,prop_img,sku,video,outer_id,is_virtual',
    num_iid='4735623930')
    # Function-call form prints the same text on Python 2 and also parses
    # on Python 3.
    print(data)
    
Пример #7
0
 def job_taobao(self):
     """Check in with every account listed under ``self.jobs['taobao']['account']``.

     An account whose ``login()`` returns a falsy value is skipped.
     """
     accounts = self.jobs['taobao']['account']
     for account in accounts:
         client = Taobao(account['username'], account['password'])
         if not client.login():
             continue
         client.checkin()
Пример #8
0
 def __init__(self):
     """Set up the "taoke" logger, the Taobao API client and the OSS client."""
     # NOTE(review): credentials are hard-coded in source.
     api_credentials = ('12395385', '53697d99eccd670191af0603d7256f77')
     oss_settings = ("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

     self.logger = logging.getLogger("taoke")
     self.taobao = Taobao(*api_credentials)
     self.oss = OssAPI(*oss_settings)
Пример #9
0
 def __init__(self):
     """Set up the "taoke" logger, the Taobao API client and the OSS client."""
     # NOTE(review): credentials are hard-coded in source.
     api_credentials = ('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
     oss_settings = ("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

     self.logger = logging.getLogger("taoke")
     self.taobao = Taobao(*api_credentials)
     self.oss = OssAPI(*oss_settings)
Пример #10
0
class GetTaokDetailOld(object):
    """Task handler that fetches one Taobao item and forwards its details.

    NOTE(review): ``get_comments``, ``save_image_to_oss``,
    ``_parse_image_from_desc`` and ``save_topic_data`` are called on ``self``
    but are not defined in this class as shown; presumably they are provided
    elsewhere - confirm.
    """

    def __init__(self, ):
        # NOTE(review): API and OSS credentials are hard-coded; consider
        # moving them to configuration.
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d",
                          "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Fetch the item referenced by ``url`` and publish its details.

        Returns None in every case. Returns early when the local file for
        ``local_url`` already exists, or when the API reports an "isv."
        error (the item id is then posted to the expired queue).

        Raises:
            ValueError: if ``url`` contains no ``/<7 or more digits>/`` part.
            Exception: any other error from the API call is re-raised as is.
        """
        local_abs_path = site.real_path(local_url)
        if os.path.isfile(local_abs_path):
            self.logger.info("the topic is exist in local:%s, local:%s" %
                             (url, local_abs_path))
            return

        self.taobao.http = http
        id_match = re.search(r"/(\d{7,})/", url)
        if id_match is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("no numeric item id found in url: %r" % (url,))
        num_iid = id_match.group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)

        try:
            data = self.taobao.taobao_item_get(
                fields=
                'detail_url,num_iid,title,nick,type,cid,seller_cids,props,input_pids,input_str,desc,pic_url,num,valid_thru,list_time,delist_time,stuff_status,location,price,post_fee,express_fee,ems_fee,has_discount,freight_payer,has_invoice,has_warranty,has_showcase,modified,increment,approve_status,postage_id,product_id,auction_point,property_alias,item_img,prop_img,sku,video,outer_id,is_virtual',
                num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # The exception class is matched by name; a missing or empty
            # sub_code must not raise inside the handler and hide the
            # original error.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        item['traderates'] = self.get_comments(num_iid, item['nick'])

        self.logger.info("start fetch main images...")
        index = 0
        main_image = [img['url'] for img in item['item_imgs']['item_img']]
        (main_image, index) = self.save_image_to_oss(main_image, index, http,
                                                     site, num_iid)

        self.logger.info("start fetch desc images...")
        desc_images = self._parse_image_from_desc(item['desc'])
        (desc_images, index) = self.save_image_to_oss(desc_images, index, http,
                                                      site, num_iid)

        item['main_images'] = main_image
        item['desc_images'] = desc_images

        # A set HUDSON_URL selects the local queue; otherwise the remote one.
        if os.environ.get('HUDSON_URL'):
            queue_url = "http://127.0.0.1:8924/queue/q/imported_taoke?format=json"
        else:
            queue_url = "http://data.deonwu84.com/queue/q/imported_taoke?format=json"
        details = json.dumps(item)
        http.post_data(queue_url, {
            'details': details,
            'num_iid': num_iid
        }, {})
        http.post(url,
                  site.real_path("log/%s/%s.txt" % (num_iid[-1:], num_iid)),
                  {'data': details})
        self.save_topic_data(data, local_abs_path)
        self.logger.info("done process taoke, id:%s" % num_iid)
Пример #11
0
from taobao import Taobao


def test_get_detail(num_iid):
    """Placeholder with no implementation yet; accepts ``num_iid`` and does nothing."""
    pass


def test_get_cate_list(pid):
    """Placeholder with no implementation yet; accepts ``pid`` and does nothing."""
    pass


if __name__ == '__main__':
    # Manual smoke test: request one item from the API and print the response.
    # NOTE(review): the API key/secret are hard-coded here.
    api = Taobao('12395385', '53697d99eccd670191af0603d7256f77')
    # Alternative call, listing the top-level categories:
    #data = api.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=0)
    data = api.taobao_item_get(
        fields=
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,input_pids,input_str,desc,pic_url,num,valid_thru,list_time,delist_time,stuff_status,location,price,post_fee,express_fee,ems_fee,has_discount,freight_payer,has_invoice,has_warranty,has_showcase,modified,increment,approve_status,postage_id,product_id,auction_point,property_alias,item_img,prop_img,sku,video,outer_id,is_virtual',
        num_iid='4735623930')
    # Function-call form prints the same text on Python 2 and also parses
    # on Python 3.
    print(data)
Пример #12
0
        # Back off after more than five 404 failures, then reset the counter.
        # NOTE(review): the printed message says 5 minutes but the argument is
        # 180; the unit of delay_time() is not visible here - confirm.
        # NOTE(review): ``suning`` is used here but only assigned at the end of
        # this fragment; presumably it is bound by an earlier pass or by code
        # outside this view - verify.
        if failed_404_times > 5:
            print("页面出现404错误,延迟5分钟")
            suning.delay_time(180)
            failed_404_times = 0

        #try:
        # On every start, run the window-closing program three times.
        k = 0
        for k in range(3):
            os.system('close.exe')

        # NOTE(review): time.clock() was removed in Python 3.8.
        start = time.clock()

        # Create the Taobao product object for this item id.
        taobao = Taobao(taobao_id)

        # Example product page: http://item.taobao.com/item.htm?id=584180190359
        taobao_product_url = taobao.get_taobao_product_url()
        taobao_mdskip_url = taobao.get_taobao_product_mdskip_url()
        headers = taobao.get_taobao_product_headers()

        # Taobao products by PVS: {color, size, price, stock, discount, photo}
        taobao_products = {}
        # Taobao colors
        taobao_colors = {}
        taobao_sizes = {}
        taobao_image_url_list = []

        # Create the Suning product object for this product code.
        suning = Suning(suning_productCode)
Пример #13
0
    def __init__(self, http_client):
        """Set up the "taoke" logger and a Taobao API client using ``http_client``.

        NOTE(review): the API key/secret are hard-coded; consider loading
        them from configuration instead.
        """
        from taobao import Taobao
        # Plain module import instead of ``from oss.oss_api import *``: a
        # wildcard import inside a function is a SyntaxWarning on Python 2
        # and a SyntaxError on Python 3. No name from that module is used
        # here; the import is kept only so the module is still loaded.
        import oss.oss_api  # noqa

        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9', http=http_client)
Пример #14
0
class GetTaokDetail(object):
    """Walk the Taobao category tree and dump each leaf category's properties.

    NOTE(review): ``_save_and_output`` writes to ``self.fd``, which is not
    assigned anywhere in this class as shown; presumably the caller opens it
    before calling ``generate_cate_tree`` - confirm.
    """

    # Seconds to pause after each category request (taobao call limit).
    CALL_INTERVAL = 3

    def __init__(self, http_client):
        from taobao import Taobao
        # Plain module import instead of ``from oss.oss_api import *``: a
        # wildcard import inside a function is a SyntaxWarning on Python 2
        # and a SyntaxError on Python 3. No name from that module is used
        # in this class; the import is kept only so the module is loaded.
        import oss.oss_api  # noqa

        # NOTE(review): the API key/secret are hard-coded.
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9', http=http_client)

    def generate_cate_tree(self, site, http, pid):
        """Traverse all categories below ``pid`` level by level.

        Every fetched category list is passed to ``_save_and_output``; the
        ids of categories flagged ``is_parent`` form the next level.
        ``site`` and ``http`` are accepted but not used.
        """
        import time

        parents_id = [pid]
        while parents_id:
            tmp_list, parents_id = parents_id, []
            for cid in tmp_list:
                sub_cate = self._get_cate_list(cid)
                parents_id += [unicode(c['cid']) for c in sub_cate if c['is_parent']]
                self._save_and_output(sub_cate)
                time.sleep(self.CALL_INTERVAL)

    def _get_cate_list(self, cid):
        """Return the child categories of ``cid`` from the itemcats API response."""
        cate = self.taobao.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=cid)
        return cate["itemcats_get_response"]['item_cats']['item_cat']

    def _get_cate_props(self, cid):
        """Return the property definitions of ``cid`` from the itemprops API response."""
        cate = self.taobao.taobao_itemprops_get(fields='pid,name,must,multi,prop_values', cid=cid)
        return cate["itemprops_get_response"]['item_props']['item_prop']

    def _save_props(self, cid, props):
        """Write the name table of ``props`` to ``props/<last char of cid>/<cid>.data``.

        The file is a JSON object mapping ``<pid>_name`` to the property name
        and ``<pid>_<vid>`` to each value name, encoded as UTF-8.
        """
        import codecs

        cid = unicode(cid)
        path = u"props/%s/%s.data" % (cid[-1], cid)

        values = {}
        for item in props:
            key = item['pid']
            values[u'%s_name' % key] = item['name']
            if 'prop_values' not in item:
                continue
            for v in item['prop_values']['prop_value']:
                values[u'%s_%s' % (key, v['vid'])] = v['name']

        directory = os.path.dirname(path)
        if not os.path.isdir(directory):
            os.makedirs(directory)

        logging.info(u"save props:%s" % path)
        # Serialize before opening so a failure cannot leave a truncated file;
        # ``with`` closes the handle even when the write fails.
        payload = json.dumps(values, encoding='utf-8', ensure_ascii=False)
        with codecs.open(path, "w", "utf-8") as fd:
            fd.write(payload)

    def _save_and_output(self, cate_list):
        """Record each category in ``self.fd`` and save props for non-parent ones.

        Errors while fetching or saving properties are logged and do not
        stop the loop.
        """
        for c in cate_list:
            self.fd.write(u"%s-->%s --> %s\n" % (c['cid'], c['name'], c['parent_cid']))
            print(u"%s -> %s" % (c['cid'], c['name']))
            if c['is_parent']:
                continue
            try:
                cid = unicode(c['cid'])
                props = self._get_cate_props(cid)
                self._save_props(cid, props)
            except Exception as e:
                logging.error("get cate props error:%s" % e)