def get_shop_id(thread_name, queue, table):
    """Worker loop: look up the shop id of every queued SKU and store it.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. For each SKU
    taken from ``queue`` the page
    ``https://item.m.jd.com/product/<sku>.html`` is fetched, the shop id is
    parsed out of it, written to the SKU's row in ``table`` and inserted
    into the ``shop`` table when no row with that id exists yet.

    Args:
        thread_name: Label used as a prefix in progress output.
        queue: Queue of SKU strings to process.
        table: Name of the item table whose ``shop_id`` column is updated.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    sku = queue.get()
            if has_item:
                url = 'https://item.m.jd.com/product/' + sku + '.html'
                print(thread_name, url)
                html_data = jd_spider.Spider().get_html(url)
                # Parse only after a successful fetch. Previously a failed
                # fetch fell through to a ``result`` that was either unbound
                # or left over from the previous SKU, so a stale shop id
                # could be written to the wrong row.
                if html_data[0] != -1:
                    result = html_analysis.get_shop_id(html_data[1])
                    if result[0] != -1:
                        shop_id = result[1]
                        print("%s: shop_id %s" % (thread_name, shop_id))
                        sql = 'update ' + table + ' set shop_id=%s where sku=%s '
                        database_util.update_sql(sql, [shop_id, sku])
                        # Register the shop the first time it is seen.
                        count = list(
                            database_util.search_sql(
                                'select count(*) from shop where shop_id=%s',
                                shop_id)[1])[0][0]
                        if count == 0:
                            database_util.update_sql(
                                'insert into shop(shop_id) values(%s)',
                                shop_id)
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print(err)
        time.sleep(1)
def update_shop_info(thread_name, queue, table):
    """Worker loop: refresh follower count and name of every queued shop.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. For each shop
    id taken from ``queue`` the page ``https://shop.m.jd.com/?shopId=<id>``
    is fetched and parsed, and the matching row of the ``shop`` table gets a
    new ``update_time``, ``follow`` and ``shop_name``.

    Args:
        thread_name: Label used as a prefix in progress output.
        queue: Queue of shop id strings to process.
        table: Not used here; the statement always targets ``shop``.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    shop_id = queue.get()
            if has_item:
                url = 'https://shop.m.jd.com/?shopId=' + shop_id
                print(url)
                html_data = jd_spider.Spider().get_html(url)
                # Parse only after a successful fetch. Previously a failed
                # fetch fell through to a ``result`` that was either unbound
                # or left over from the previous shop, so stale data could
                # be written to the wrong row.
                if html_data[0] != -1:
                    result = html_analysis.get_shop_info(html_data[1])
                    if result[0] != -1:
                        follow = result[1]
                        shop_name = result[2]
                        print("%s: %s %d " % (thread_name, shop_name, follow))
                        sql = 'update shop set update_time=%s,follow=%s,shop_name=%s where shop_id=%s '
                        database_util.update_sql(
                            sql,
                            [datetime.datetime.now(), follow, shop_name,
                             shop_id])
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print('thread_queue update_shop_info err:' + str(err))
        time.sleep(1)
def _sku_from_url(url):
    """Return the SKU in a product URL like ``https://item.jd.com/123.html``.

    The SKU is the last path segment with a trailing ``.html`` removed.
    """
    page = url.strip().rstrip('/').rsplit('/', 1)[-1]
    if page.endswith('.html'):
        page = page[:-len('.html')]
    return page


def get_sku(table):
    """Fill the ``sku`` column of ``table`` from each row's ``url`` column.

    Does nothing when the initial query reports failure (status ``-1``).

    Args:
        table: Name of the item table to read ``url``/``id`` from and to
            update.
    """
    result = database_util.search_sql('select url,id from ' + table, None)
    if result[0] == -1:
        return
    update = 'update ' + table + ' set sku=%s where id=%s'
    for row in list(result[1]):
        # ``str.strip(chars)`` removes any of the given *characters* from
        # both ends, not a prefix/suffix, so the former
        # ``strip('https://item.jd.com/').strip('.html')`` could eat leading
        # or trailing SKU characters; extract the path segment instead.
        database_util.update_sql(update, [_sku_from_url(row[0]), row[1]])
def unify_brand(table):
    """Rewrite the brand value ``360手机`` to ``360`` for every matching row.

    Rows are located by brand, then updated one by one through their SKU;
    each processed SKU is printed. Nothing happens when the lookup reports
    failure (status ``-1``).

    Args:
        table: Name of the item table to normalise.
    """
    lookup = database_util.search_sql(
        'select sku,brand from ' + table + ' where brand=%s', '360手机')
    if lookup[0] == -1:
        return
    rename_sql = 'update ' + table + ' set brand=%s where sku=%s'
    for row in list(lookup[1]):
        database_util.update_sql(rename_sql, ['360', row[0]])
        print(row[0])
def get_param(table):
    """Prune items of small shops, then queue un-crawled items for workers.

    First deletes every row of ``table`` whose shop has fewer than 10000
    followers. Then collects the ``url`` of every row whose ``update_time``
    is NULL, loads them into the work queue and hands the ``get_param`` job
    to ``thread_queue.use_threading``.

    Args:
        table: Name of the item table to prune and crawl.
    """
    # Delete the items that belong to shops with few followers.
    # NOTE(review): the extra derived table ``(...) a`` presumably works
    # around MySQL refusing to select from the table being deleted from.
    prune_sql = ('delete from ' + table + ' where sku in (select a.sku from '
                 '(select a.sku from ' + table + ' a,shop b '
                 'where a.shop_id=b.shop_id and b.follow<10000) a)')
    database_util.update_sql(prune_sql, None)

    result = database_util.search_sql(
        'SELECT url FROM ' + table + ' where update_time is null', None)
    # Check the status like the sibling functions do instead of indexing
    # the rows blindly; a failed query now simply queues nothing.
    url_list = []
    if result[0] != -1:
        url_list = [row[0] for row in result[1]]
    thread_queue.fill_queue(url_list)
    thread_queue.use_threading(['get_param', table])
def temp(table):
    """Insert into ``shop`` every shop id used in ``computer`` but missing.

    Does nothing when the initial query reports failure (status ``-1``).

    Args:
        table: Not used; the source table is hard-coded to ``computer``.
    """
    result = database_util.search_sql(
        'select shop_id from computer where shop_id is not null', None)
    # Check the status like the sibling functions do before touching rows.
    if result[0] == -1:
        return
    for row in result[1]:
        shop_id = row[0]
        count = list(
            database_util.search_sql(
                'select count(*) from shop where shop_id=%s',
                shop_id)[1])[0][0]
        if count == 0:
            database_util.update_sql('insert into shop(shop_id) values(%s)',
                                     shop_id)
def update_img(table):
    """Switch every stored image URL from the ``n5/s54x54_jfs`` variant to
    the ``n7/jfs`` variant.

    Example of the rewrite:
        https://img11.360buyimg.com/n5/s54x54_jfs/t5773/143/1465870132/216483/4bbce005/592692d8Nbcc8f248.jpg
        https://img10.360buyimg.com/n7/jfs/t18772/89/1863054684/170815/d28ecae1/5adca3deN76bb61cb.jpg

    Every row is written back, and both the old and the new URL are
    printed. Nothing happens when the query reports failure (status ``-1``).

    Args:
        table: Name of the item table holding the ``img`` and ``sku`` columns.
    """
    rows = database_util.search_sql('select img,sku from ' + table, None)
    if rows[0] == -1:
        return
    update_stmt = 'update ' + table + ' set img=%s where sku=%s'
    for row in list(rows[1]):
        old_img, sku = row[0], row[1]
        print(old_img)
        new_img = old_img.replace('n5/s54x54_jfs', 'n7/jfs')
        print(new_img + '\n')
        database_util.update_sql(update_stmt, [new_img, sku])
def insert_url(thread_name, queue, table):
    """Worker loop: insert every queued URL into ``table`` unless present.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy.

    Args:
        thread_name: Not used by this worker.
        queue: Queue of URL strings to insert.
        table: Name of the table that receives the URLs.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    url = queue.get()
            if has_item:
                count = list(
                    database_util.search_sql(
                        'select count(*) url from ' + table +
                        ' where url=%s', url)[1])[0][0]
                if count == 0:
                    database_util.update_sql(
                        'insert into ' + table + ' set url=%s', url)
        except Exception as err:
            # The message used to say "update_price" (copy-paste slip).
            print('thread_queue insert_url err:' + str(err))
        time.sleep(1)
def update_shop_info(table):
    """Refresh shops not updated today, then recompute ``brand_hot``.

    Step 1: every ``shop_id`` whose ``update_time`` is at least one day old
    is put on the work queue and the ``update_shop_info`` job is handed to
    ``thread_queue.use_threading``.
    Step 2: ``brand`` and ``follow`` are read from ``table`` grouped by brand,
    and each brand's rows get ``brand_hot`` set to that ``follow`` value.

    Args:
        table: Name of the item table whose ``brand_hot`` column is updated.
    """
    stale = database_util.search_sql(
        'SELECT shop_id FROM shop where TO_DAYS(NOW()) - TO_DAYS(update_time) >=1',
        None)
    shop_ids = []
    if stale[0] != -1:
        shop_ids = [row[0] for row in list(stale[1])]
    thread_queue.fill_queue(shop_ids)
    thread_queue.use_threading(['update_shop_info', table])

    per_brand = database_util.search_sql(
        'select brand,follow from ' + table + ' group by brand order by follow',
        None)
    if per_brand[0] == -1:
        return
    for row in list(per_brand[1]):
        hot_sql = 'update ' + table + ' set brand_hot=%s where brand=%s'
        database_util.update_sql(hot_sql, [row[1], row[0]])
def get_comment(queue, table, page_no):
    """Worker loop: crawl comments of every queued SKU and stamp the row.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. For each SKU
    taken from ``queue`` the spider's ``get_comment`` is invoked; when it
    reports success (status other than ``-1``) the row's
    ``update_comment_time`` is set to the current time.

    Args:
        queue: Queue of SKU values to process.
        table: Name of the item table, also forwarded to the spider.
        page_no: Forwarded unchanged to ``Spider.get_comment``.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    sku = queue.get()
            if has_item:
                result = jd_spider.Spider().get_comment(table, sku, page_no)
                if result[0] != -1:
                    sql = 'update ' + table + ' set update_comment_time=%s where sku=%s '
                    database_util.update_sql(
                        sql, [datetime.datetime.now(), sku])
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print('thread_queue get_comment err:' + str(err))
        time.sleep(1)
def update_price(thread_name, queue, table):
    """Worker loop: fetch the current price of queued items and update stats.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. Each queue
    item is a mapping with the keys ``sku``, ``max_price``, ``min_price``,
    ``avg_price`` and ``price_times``. When the spider returns a price, the
    maximum, minimum, running average (rounded to two decimals) and sample
    count are recomputed and written back together with the current price.

    Args:
        thread_name: Label used as a prefix in progress output.
        queue: Queue of per-item price-history mappings.
        table: Name of the item table to update.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    data = queue.get()
            if has_item:
                sku = data['sku']
                max_price = data['max_price']
                min_price = data['min_price']
                avg_price = data['avg_price']
                price_times = data['price_times']
                price_result = jd_spider.Spider().get_price(sku)
                if price_result[0] != -1:
                    cur_price = price_result[1]
                    max_price = max(max_price, cur_price)
                    min_price = min(min_price, cur_price)
                    # Fold the new sample into the running average.
                    avg_price = round(
                        (avg_price * price_times + cur_price) /
                        (price_times + 1), 2)
                    price_times += 1
                    print("%s: %.2f, %.2f, %.2f, %.2f" %
                          (thread_name, max_price, min_price, avg_price,
                           cur_price))
                    sql = 'update ' + table + ' set update_price_time=%s,max_price=%s,min_price=%s,avg_price=%s,price=%s,price_times=%s where sku=%s '
                    params = [
                        datetime.datetime.now(), max_price, min_price,
                        avg_price, cur_price, price_times, sku
                    ]
                    database_util.update_sql(sql, params)
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print('thread_queue update_price err:' + str(err))
        time.sleep(1)
def update_score(thread_name, queue, table, para):
    """Worker loop: recompute the weighted score of every queued SKU.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. The score is
    ``rate*100*w_rate + follow*w_follow + comment*w_comment +
    sentiment*w_sentiment + brand_hot*w_brand`` rounded to two decimals.

    Args:
        thread_name: Not used by this worker.
        queue: Queue of SKU values to score.
        table: Name of the item table to read from and update.
        para: Mapping with the weights ``w_rate``, ``w_follow``,
            ``w_comment``, ``w_sentiment`` and ``w_brand``.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    sku = queue.get()
            if has_item:
                w_rate = para['w_rate']
                w_follow = para['w_follow']
                w_comment = para['w_comment']
                w_sentiment = para['w_sentiment']
                w_brand = para['w_brand']
                sql = 'select sku,rate,follow,comment,sentiment,brand_hot from ' + table + ' where sku=%s'
                result = database_util.search_sql(sql, sku)
                if result[0] != -1:
                    update = 'update ' + table + ' set score=%s where sku=%s'
                    for row in list(result[1]):
                        # ``rate`` is scaled by 100 before weighting.
                        rate = float(row[1]) * 100
                        follow = int(row[2])
                        comment = int(row[3])
                        sentiment = int(row[4])
                        brand_hot = int(row[5])
                        score = round(
                            (rate * w_rate + follow * w_follow +
                             comment * w_comment + sentiment * w_sentiment +
                             brand_hot * w_brand), 2)
                        database_util.update_sql(update, [score, row[0]])
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print('thread_queue update_score err:' + str(err))
        time.sleep(1)
def get_param(thread_name, queue, table):
    """Worker loop: crawl the detail page of every queued URL and store it.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. For each
    product URL taken from ``queue`` the page is fetched and parsed into an
    ``item.Item``, review and price data are added through the spider, and
    the row of ``table`` with the matching SKU is updated. URLs whose page
    cannot be fetched are skipped.

    Args:
        thread_name: Label used as a prefix in progress output.
        queue: Queue of product URLs such as ``https://item.jd.com/123.html``.
        table: Name of the item table to update.
    """
    while not EXIT_FLAG:
        try:
            # ``with`` releases the lock even when ``queue.get()`` raises;
            # the former acquire()/release() pairing left it held in that
            # case, which would block every other worker.
            # NOTE(review): emptiness is tested on WORK_QUEUE while the item
            # is taken from ``queue`` - presumably the same object; confirm.
            with QUEUE_LOCK:
                has_item = not WORK_QUEUE.empty()
                if has_item:
                    url = queue.get()
            if has_item:
                _item = item.Item()
                _spider = jd_spider.Spider()
                # Fetch the HTML of the product detail page.
                html_data = _spider.get_html(url)
                if html_data[0] == -1:
                    continue
                # The SKU is the last path segment without ``.html``.
                # ``str.strip(chars)`` removes a *set of characters*, not a
                # prefix/suffix, so the former
                # ``strip('https://item.jd.com/').strip('.html')`` could eat
                # leading or trailing SKU characters.
                page = url.strip().rstrip('/').rsplit('/', 1)[-1]
                sku = page[:-len('.html')] if page.endswith('.html') else page
                # General product parameters parsed from the page.
                _item = html_analysis.get_all_param(html_data[1], _item)
                # Review-related information.
                _item = _spider.get_rate(sku, _item)
                result = _spider.get_price(sku)
                if result[0] != -1:
                    _item.price = result[1]
                    _item.price_times = 1
                print(thread_name, url)
                # NOTE(review): the row is written whether or not the price
                # lookup succeeded; on failure the price columns carry
                # whatever ``item.Item`` initialises ``price`` to.
                sql = ('update ' + table + ' set description=%s,price=%s,img=%s,brand=%s,name=%s,update_time=%s,'
                       'comment=%s,rate=%s,max_price=%s,min_price=%s,avg_price=%s,price_times=%s,update_price_time=%s,update_rate_time=%s where sku=%s ')
                # First crawl: max, min and average all start at the
                # current price with a sample count of 1.
                data = [
                    _item.description, _item.price, _item.img, _item.brand,
                    _item.name, _item.update_time, _item.comment, _item.rate,
                    _item.price, _item.price, _item.price, 1,
                    _item.update_price_time, _item.update_rate_time, sku
                ]
                database_util.update_sql(sql, data)
        except Exception as err:
            # Best-effort worker: report the failure and keep polling.
            print('thread_queue get_param err:' + str(err))
        time.sleep(1)
def del_items(table):
    """Delete every row of ``table`` whose ``shop_name`` is NULL.

    Args:
        table: Name of the table to clean up.
    """
    statement = ''.join(('delete from ', table, ' where shop_name is null;'))
    database_util.update_sql(statement, None)