class Spider(object):
    """Scrape the mm.taobao.com ranking list and download each profile's photos.

    The list pages are fetched anonymously; the individual profile pages are
    fetched through the ``Taobao`` helper (defined outside this block).
    """

    def __init__(self):
        """Create the Taobao helper, run its ``main()`` and set up URL/patterns."""
        self.tao = Taobao()
        self.tao.main()
        # Base URL of the ranking list; the page number is appended to it.
        self.u = 'http://mm.taobao.com/json/request_top_list.htm?page='
        # Three groups per list entry: profile URL, avatar URL, display name.
        self.rule_1 = '<div class="list-item">.*?<div class="pic-word">.*?<a href="//(.*?)".*?<img src="//(.*?)".*?<a class="lady-name" .*?>(.*?)</a>'
        # One group: the scheme-less source URL of an <img> tag.
        self.rule_2 = '<img.*?src="//(.*?)"'
        # One group: the inner HTML of the "mm-aixiu-content" block.
        self.rule_3 = '<div class="mm-aixiu-content".*?">(.*?)<!--'

    def baseurl(self, i):
        """Return the ranking-list URL for page ``i``."""
        return self.u + str(i)

    def read(self, url):
        """Fetch ``url`` and return the body decoded from GBK.

        The response is closed even when reading or decoding fails.
        """
        from contextlib import closing

        req = urllib2.Request(url)
        with closing(urlopen(req)) as resp:
            return resp.read().decode('gbk')

    def compile(self, rule):
        """Compile ``rule`` with ``re.S`` so that ``.`` also matches newlines."""
        return re.compile(rule, re.S)

    def mkdir(self, path):
        """Create ``path`` (stripped of surrounding whitespace).

        Returns True when the directory was created, False when the path
        already existed.
        """
        path = path.strip()
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def savePicutre(self, temp, path):
        """Download every URL in ``temp`` into ``path`` as 0.jpg, 1.jpg, ...

        ``temp`` holds scheme-less URLs; ``http://`` is prepended to each.
        """
        from contextlib import closing

        path = path.strip()
        for k, i in enumerate(temp):
            print(i)
            with closing(urlopen('http://' + i)) as resp:
                data = resp.read()
            # os.path.join instead of a hard-coded backslash so the file ends
            # up inside ``path`` on every platform.
            with open(os.path.join(path, str(k) + '.jpg'), 'wb') as f:
                f.write(data)

    def load(self, start, end):
        """Crawl list pages ``start`` .. ``end - 1`` and save every profile's photos.

        Raises:
            TypeError: if ``start`` or ``end`` is not an int.
            Exception: if the directory for a profile already exists.
        """
        if not (isinstance(start, int) and isinstance(end, int)):
            raise TypeError('start and end must be integers')

        # Compile each pattern once instead of once per page / per profile.
        list_re = self.compile(self.rule_1)
        content_re = self.compile(self.rule_3)
        image_re = self.compile(self.rule_2)

        for i in range(start, end):
            item = list_re.findall(self.read(self.baseurl(i)))
            for temp in item:
                # temp = (profile url, avatar url, name); the name doubles as
                # the output directory.
                if not self.mkdir(temp[2]):
                    # NOTE(review): the original aborts the whole crawl here;
                    # that behaviour is kept, only a message was added.
                    raise Exception('directory already exists: %r' % (temp[2],))
                p = self.tao.getpage('http://' + temp[0])
                # NOTE(review): GBK round trip kept from the original; it only
                # has an effect when the page is not valid GBK (it raises).
                p = p.decode('gbk')
                p = p.encode('gbk')
                picpart = content_re.search(p)
                if picpart is None:
                    # The page has no content block; skip it instead of
                    # crashing on ``None.group``.
                    print('no picture section found for %r' % (temp[0],))
                    continue
                print(picpart.group(1))
                pic = image_re.findall(picpart.group(1))
                print(pic)
                self.savePicutre(pic, temp[2])
def __init__(self):
    """Create the Taobao helper, run its ``main()`` and store the scraping settings."""
    client = Taobao()
    self.tao = client
    client.main()

    # Ranking-list URL; callers append the page number.
    self.u = 'http://mm.taobao.com/json/request_top_list.htm?page='

    # Patterns used on the list page (rule_1), on image tags (rule_2) and on
    # the profile content block (rule_3).
    self.rule_1 = (
        '<div class="list-item">.*?<div class="pic-word">.*?<a href="//(.*?)".*?<img src="//(.*?)".*?<a class="lady-name" .*?>(.*?)</a>'
    )
    self.rule_2 = '<img.*?src="//(.*?)"'
    self.rule_3 = '<div class="mm-aixiu-content".*?">(.*?)<!--'
def __init__(self):
    """Set up the logger, the Taobao API client and the OSS storage client.

    NOTE(review): the API and storage credentials are hard-coded below;
    consider loading them from configuration.
    """
    from taobao import Taobao
    # Explicit name instead of ``import *``: a wildcard import inside a
    # function is a SyntaxWarning on Python 2 and a SyntaxError on Python 3.
    from oss.oss_api import OssAPI

    self.logger = logging.getLogger("taoke")
    self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
    self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")
class GetTaokDetailOld(object):
    """Task callable: fetch one Taobao item, store its images and publish the result.

    ``get_comments``, ``save_image_to_oss``, ``_parse_image_from_desc`` and
    ``save_topic_data`` are used here but are not defined in this block.
    """

    # Fields requested from ``taobao_item_get``.
    _ITEM_FIELDS = (
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,'
        'input_pids,input_str,desc,pic_url,num,valid_thru,list_time,'
        'delist_time,stuff_status,location,price,post_fee,express_fee,'
        'ems_fee,has_discount,freight_payer,has_invoice,has_warranty,'
        'has_showcase,modified,increment,approve_status,postage_id,'
        'product_id,auction_point,property_alias,item_img,prop_img,sku,'
        'video,outer_id,is_virtual'
    )

    def __init__(self):
        """Set up the logger, the Taobao API client and the OSS storage client.

        NOTE(review): credentials are hard-coded; consider moving them to
        configuration.
        """
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Process the item referenced by ``url``.

        Does nothing when ``site.real_path(local_url)`` already exists as a
        file.  Otherwise the item is fetched, its comments and images are
        attached, and the result is posted to the import queue, posted to
        ``url`` and saved locally.  If the API fails with a
        ``TaobaoException`` whose ``sub_code`` contains ``'isv.'``, the item
        is reported to the site's ``expired_taoke`` queue and the call
        returns; any other error is re-raised.
        """
        local_abs_path = site.real_path(local_url)
        if os.path.isfile(local_abs_path):
            self.logger.info("the topic is exist in local:%s, local:%s" % (url, local_abs_path))
            return

        self.taobao.http = http
        # The item id is the first run of 7+ digits between slashes.
        num_iid = re.search(r"/(\d{7,})/", url).group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)
        try:
            data = self.taobao.taobao_item_get(fields=self._ITEM_FIELDS, num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # A missing or None sub_code must not raise inside the handler,
            # otherwise the original API error would be masked.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                # e.g. 'isv.item-is-delete:invalid-numIid-or-iid'
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        item['traderates'] = self.get_comments(num_iid, item['nick'])

        # ``index`` is threaded through both uploads so it keeps counting
        # across main and description images.
        self.logger.info("start fetch main images...")
        index = 0
        main_image = [e['url'] for e in item['item_imgs']['item_img']]
        (main_image, index) = self.save_image_to_oss(main_image, index, http, site, num_iid)

        self.logger.info("start fetch desc images...")
        desc_images = self._parse_image_from_desc(item['desc'])
        (desc_images, index) = self.save_image_to_oss(desc_images, index, http, site, num_iid)

        item['main_images'] = main_image
        item['desc_images'] = desc_images

        # Serialise once and reuse for both posts; the item is not modified
        # in between.
        details = json.dumps(item)
        if os.environ.get('HUDSON_URL'):
            queue_url = "http://127.0.0.1:8924/queue/q/imported_taoke?format=json"
        else:
            queue_url = "http://data.deonwu84.com/queue/q/imported_taoke?format=json"
        http.post_data(queue_url, {'details': details, 'num_iid': num_iid}, {})

        http.post(url, site.real_path("log/%s/%s.txt" % (num_iid[-1:], num_iid)), {'data': details})
        self.save_topic_data(data, local_abs_path)
        self.logger.info("done process taoke, id:%s" % num_iid)
class GetTaokDetail(object):
    """Task callable: fetch one Taobao item and enrich it with props and comments.

    ``load_cid_props``, ``convert_props_tostr`` and ``get_comments`` are used
    here but are not defined in this block.
    """

    # Fields requested from ``taobao_item_get``.
    _ITEM_FIELDS = (
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,'
        'input_pids,input_str,desc,pic_url,num,valid_thru,list_time,'
        'delist_time,stuff_status,location,price,post_fee,express_fee,'
        'ems_fee,has_discount,freight_payer,has_invoice,has_warranty,'
        'has_showcase,modified,increment,approve_status,postage_id,'
        'product_id,auction_point,property_alias,item_img,prop_img,sku,'
        'video,outer_id,is_virtual'
    )

    def __init__(self):
        """Set up the logger, the Taobao API client and the OSS storage client.

        NOTE(review): credentials are hard-coded; consider moving them to
        configuration.
        """
        from taobao import Taobao
        # Explicit name instead of ``import *``: a wildcard import inside a
        # function is a SyntaxWarning on Python 2 and a SyntaxError on Python 3.
        from oss.oss_api import OssAPI

        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Fetch the item referenced by ``url`` and attach props and trade rates.

        If the API fails with a ``TaobaoException`` whose ``sub_code``
        contains ``'isv.'``, the item is reported to the site's
        ``expired_taoke`` queue and the call returns; any other API error is
        re-raised.  A failure while fetching comments is only logged.
        """
        # Not used further in the visible part of this method; kept because
        # the call is part of the original behaviour.
        local_abs_path = site.real_path(local_url)

        self.taobao.http = http
        # The item id is the first run of 7+ digits between slashes.
        num_iid = re.search(r"/(\d{7,})/", url).group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)
        try:
            data = self.taobao.taobao_item_get(fields=self._ITEM_FIELDS, num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # A missing or None sub_code must not raise inside the handler,
            # otherwise the original API error would be masked.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                # e.g. 'isv.item-is-delete:invalid-numIid-or-iid'
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        self.load_cid_props(unicode(item['cid']))
        item['props_str'] = self.convert_props_tostr(item['props'])
        logging.info(u"prpos:%s" % item['props_str'])

        # Comments are best effort: a failure is logged and processing goes on.
        try:
            traderates = self.get_comments(num_iid, item['nick'])
            item['traderates'] = traderates.get("trade_rates", {}).get('trade_rate', [])
            item['traderates_count'] = traderates.get('total_results', 0)
            self.logger.info("traderates_count:%s" % item['traderates_count'])
        except Exception as e:
            self.logger.info("failed to get comments:%s" % e)
from taobao import Taobao


def test_get_detail(num_iid):
    """Placeholder; not implemented."""


def test_get_cate_list(pid):
    """Placeholder; not implemented."""


if __name__ == '__main__':
    # Manual smoke run: fetch one item and print the raw response.
    # NOTE(review): the credentials are hard-coded.
    api = Taobao('12395385', '53697d99eccd670191af0603d7256f77')
    #data = api.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=0)
    item_fields = (
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,'
        'input_pids,input_str,desc,pic_url,num,valid_thru,list_time,'
        'delist_time,stuff_status,location,price,post_fee,express_fee,'
        'ems_fee,has_discount,freight_payer,has_invoice,has_warranty,'
        'has_showcase,modified,increment,approve_status,postage_id,'
        'product_id,auction_point,property_alias,item_img,prop_img,sku,'
        'video,outer_id,is_virtual'
    )
    data = api.taobao_item_get(fields=item_fields, num_iid='4735623930')
    print(data)
def job_taobao(self):
    """Log in to every configured Taobao account and check in where login succeeds.

    Accounts are read from ``self.jobs['taobao']['account']``; each entry
    provides ``username`` and ``password``.
    """
    accounts = self.jobs['taobao']['account']
    for account in accounts:
        client = Taobao(account['username'], account['password'])
        if not client.login():
            # Login failed: nothing to check in for this account.
            continue
        client.checkin()
def __init__(self):
    """Set up the logger, the Taobao API client and the OSS storage client.

    NOTE(review): credentials are hard-coded; consider moving them to
    configuration.
    """
    # Presumably the application key and secret -- TODO confirm against Taobao.
    taobao_args = ('12395385', '53697d99eccd670191af0603d7256f77')
    oss_args = ("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    self.logger = logging.getLogger("taoke")
    self.taobao = Taobao(*taobao_args)
    self.oss = OssAPI(*oss_args)
def __init__(self):
    """Set up the logger, the Taobao API client and the OSS storage client.

    NOTE(review): credentials are hard-coded; consider moving them to
    configuration.
    """
    # Presumably the application key and secret -- TODO confirm against Taobao.
    taobao_args = ('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
    oss_args = ("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    self.logger = logging.getLogger("taoke")
    self.taobao = Taobao(*taobao_args)
    self.oss = OssAPI(*oss_args)
class GetTaokDetailOld(object):
    """Task callable: fetch one Taobao item, store its images and publish the result.

    ``get_comments``, ``save_image_to_oss``, ``_parse_image_from_desc`` and
    ``save_topic_data`` are used here but are not defined in this block.
    """

    # Fields requested from ``taobao_item_get``.
    _ITEM_FIELDS = (
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,'
        'input_pids,input_str,desc,pic_url,num,valid_thru,list_time,'
        'delist_time,stuff_status,location,price,post_fee,express_fee,'
        'ems_fee,has_discount,freight_payer,has_invoice,has_warranty,'
        'has_showcase,modified,increment,approve_status,postage_id,'
        'product_id,auction_point,property_alias,item_img,prop_img,sku,'
        'video,outer_id,is_virtual'
    )

    def __init__(self):
        """Set up the logger, the Taobao API client and the OSS storage client.

        NOTE(review): credentials are hard-coded; consider moving them to
        configuration.
        """
        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9')
        self.oss = OssAPI("storage.aliyun.com", "dcixul0kll5ubeisualf3q1d", "1fMUf01mRTfjXe/Ub4qEmLiu7tU=")

    def __call__(self, site, http, next_task, url, local_url, *args):
        """Process the item referenced by ``url``.

        Does nothing when ``site.real_path(local_url)`` already exists as a
        file.  Otherwise the item is fetched, its comments and images are
        attached, and the result is posted to the import queue, posted to
        ``url`` and saved locally.  If the API fails with a
        ``TaobaoException`` whose ``sub_code`` contains ``'isv.'``, the item
        is reported to the site's ``expired_taoke`` queue and the call
        returns; any other error is re-raised.
        """
        local_abs_path = site.real_path(local_url)
        if os.path.isfile(local_abs_path):
            self.logger.info("the topic is exist in local:%s, local:%s" % (url, local_abs_path))
            return

        self.taobao.http = http
        # The item id is the first run of 7+ digits between slashes.
        num_iid = re.search(r"/(\d{7,})/", url).group(1)
        self.logger.info("start fetch taoke details id:%s" % num_iid)
        try:
            data = self.taobao.taobao_item_get(fields=self._ITEM_FIELDS, num_iid=num_iid)
            data = data.get('item_get_response')
        except Exception as e:
            # A missing or None sub_code must not raise inside the handler,
            # otherwise the original API error would be masked.
            sub_code = getattr(e, 'sub_code', None) or ''
            if e.__class__.__name__ == 'TaobaoException' and 'isv.' in sub_code:
                # e.g. 'isv.item-is-delete:invalid-numIid-or-iid'
                self.logger.info("expired taoke id:%s" % num_iid)
                remove_url = "http://%s/queue/expired_taoke/" % site.hostname
                http.post_data(remove_url, {'num_iid': num_iid})
                return
            raise

        item = data['item']
        item['traderates'] = self.get_comments(num_iid, item['nick'])

        # ``index`` is threaded through both uploads so it keeps counting
        # across main and description images.
        self.logger.info("start fetch main images...")
        index = 0
        main_image = [e['url'] for e in item['item_imgs']['item_img']]
        (main_image, index) = self.save_image_to_oss(main_image, index, http, site, num_iid)

        self.logger.info("start fetch desc images...")
        desc_images = self._parse_image_from_desc(item['desc'])
        (desc_images, index) = self.save_image_to_oss(desc_images, index, http, site, num_iid)

        item['main_images'] = main_image
        item['desc_images'] = desc_images

        # Serialise once and reuse for both posts; the item is not modified
        # in between.
        details = json.dumps(item)
        if os.environ.get('HUDSON_URL'):
            queue_url = "http://127.0.0.1:8924/queue/q/imported_taoke?format=json"
        else:
            queue_url = "http://data.deonwu84.com/queue/q/imported_taoke?format=json"
        http.post_data(queue_url, {'details': details, 'num_iid': num_iid}, {})

        http.post(url, site.real_path("log/%s/%s.txt" % (num_iid[-1:], num_iid)), {'data': details})
        self.save_topic_data(data, local_abs_path)
        self.logger.info("done process taoke, id:%s" % num_iid)
from taobao import Taobao


def test_get_detail(num_iid):
    """Placeholder; not implemented."""


def test_get_cate_list(pid):
    """Placeholder; not implemented."""


if __name__ == '__main__':
    # Manual smoke run: fetch one item and print the raw response.
    # NOTE(review): the credentials are hard-coded.
    api = Taobao('12395385', '53697d99eccd670191af0603d7256f77')
    #data = api.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=0)
    item_fields = (
        'detail_url,num_iid,title,nick,type,cid,seller_cids,props,'
        'input_pids,input_str,desc,pic_url,num,valid_thru,list_time,'
        'delist_time,stuff_status,location,price,post_fee,express_fee,'
        'ems_fee,has_discount,freight_payer,has_invoice,has_warranty,'
        'has_showcase,modified,increment,approve_status,postage_id,'
        'product_id,auction_point,property_alias,item_img,prop_img,sku,'
        'video,outer_id,is_virtual'
    )
    data = api.taobao_item_get(fields=item_fields, num_iid='4735623930')
    print(data)
# Back off after more than five 404 failures, then reset the counter.
if failed_404_times > 5:
    print("页面出现404错误,延迟5分钟")
    # NOTE(review): the message announces a 5-minute delay but 180 is passed;
    # confirm the unit expected by delay_time.
    # NOTE(review): ``suning`` is used here before it is assigned further
    # down in this fragment; presumably it survives from an earlier pass.
    suning.delay_time(180)
    failed_404_times = 0

# On every start, run the window-closing program three times.
for k in range(3):
    os.system('close.exe')

start = time.clock()

# Create the Taobao product object.
taobao = Taobao(taobao_id)
# http://item.taobao.com/item.htm?id=584180190359
taobao_product_url = taobao.get_taobao_product_url()
taobao_mdskip_url = taobao.get_taobao_product_mdskip_url()
headers = taobao.get_taobao_product_headers()

# Taobao products by PVS: {color, size, price, stock, discount, photo}
taobao_products = {}
# Taobao colors and sizes
taobao_colors = {}
taobao_sizes = {}
taobao_image_url_list = []

# Create the Suning product object.
suning = Suning(suning_productCode)
def __init__(self, http_client):
    """Set up the logger and a Taobao API client that uses ``http_client``.

    NOTE(review): the API credentials are hard-coded; consider loading them
    from configuration.
    """
    from taobao import Taobao
    # The original used ``from oss.oss_api import *`` here, which is a
    # SyntaxWarning on Python 2 and a SyntaxError on Python 3 inside a
    # function.  None of the imported names are used in this method, so the
    # module is simply imported to keep it loaded as before.
    import oss.oss_api  # noqa: F401

    self.logger = logging.getLogger("taoke")
    self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9', http=http_client)
class GetTaokDetail(object):
    """Walk the Taobao category tree and store each leaf category's properties.

    ``self.fd`` is written to by ``_save_and_output`` but is not assigned in
    this block; it must be set to a writable text stream before
    ``generate_cate_tree`` is called.
    """

    def __init__(self, http_client):
        """Set up the logger and a Taobao API client that uses ``http_client``.

        NOTE(review): the API credentials are hard-coded; consider loading
        them from configuration.
        """
        from taobao import Taobao
        # The original used ``from oss.oss_api import *`` here, which is a
        # SyntaxWarning on Python 2 and a SyntaxError on Python 3 inside a
        # function.  None of the imported names are used in this method, so
        # the module is simply imported to keep it loaded as before.
        import oss.oss_api  # noqa: F401

        self.logger = logging.getLogger("taoke")
        self.taobao = Taobao('12570801', 'fbab4f2ded890ae889e876ae0eee90b9', http=http_client)

    def generate_cate_tree(self, site, http, pid):
        """Traverse the category tree level by level, starting below ``pid``.

        ``site`` and ``http`` are accepted but not used by this method.
        """
        import time

        parents_id = [pid]
        while parents_id:
            tmp_list, parents_id = parents_id, []
            for cid in tmp_list:
                sub_cate = self._get_cate_list(cid)
                # Only categories flagged as parents are expanded further.
                parents_id += [unicode(e['cid']) for e in sub_cate if e['is_parent']]
                self._save_and_output(sub_cate)
                time.sleep(3)  # taobao call limit.

    def _get_cate_list(self, cid):
        """Return the list of direct child categories of ``cid``."""
        cate = self.taobao.taobao_itemcats_get(fields='cid,parent_cid,name,is_parent', parent_cid=cid)
        return cate["itemcats_get_response"]['item_cats']['item_cat']

    def _get_cate_props(self, cid):
        """Return the list of properties defined for category ``cid``."""
        cate = self.taobao.taobao_itemprops_get(fields='pid,name,must,multi,prop_values', cid=cid)
        return cate["itemprops_get_response"]['item_props']['item_prop']

    def _save_props(self, cid, props):
        """Write the property and value names of ``cid`` to ``props/<d>/<cid>.data``.

        ``<d>`` is the last character of the category id.  The file holds a
        JSON object mapping ``<pid>_name`` to the property name and
        ``<pid>_<vid>`` to each value name.
        """
        import codecs

        cid = unicode(cid)
        path = u"props/%s/%s.data" % (cid[-1], cid)

        values = {}
        for item in props:
            key = item['pid']
            values[u'%s_name' % key] = item['name']
            if 'prop_values' not in item:
                continue
            for v in item['prop_values']['prop_value']:
                values[u'%s_%s' % (key, v['vid'])] = v['name']

        folder = os.path.dirname(path)
        if not os.path.isdir(folder):
            os.makedirs(folder)

        logging.info(u"save props:%s" % path)
        # ``with`` closes the file even when serialising or writing fails.
        with codecs.open(path, "w", "utf-8") as fd:
            fd.write(json.dumps(values, encoding='utf-8', ensure_ascii=False))

    def _save_and_output(self, cate_list):
        """Record every category in ``cate_list`` and save props of leaf categories.

        Errors while fetching or saving the properties of a leaf category are
        logged and do not stop the loop.
        """
        for c in cate_list:
            self.fd.write(u"%s-->%s --> %s\n" % (c['cid'], c['name'], c['parent_cid']))
            print(u"%s -> %s" % (c['cid'], c['name']))
            if c['is_parent']:
                continue
            try:
                cid = unicode(c['cid'])
                props = self._get_cate_props(cid)
                self._save_props(cid, props)
            except Exception as e:
                logging.error("get cate props error:%s" % e)