def _sync_item_with_taobao(db, item):
    """Bring one local item row in line with the title/picture Taobao reports.

    `item['req']` is the local row; it is read positionally as
    (id, num_id, shop_id, title, pic_url, local image file name).
    `item['resp']` is the Taobao response for that item; a falsy response
    means there is nothing to compare and the item is left untouched.
    """
    req = item['req']
    item_id = req[0]
    item_iid = req[1]
    shop_id = req[2]
    item_title = req[3]
    item_picurl = req[4]
    # The image file name stored in the database is reused as-is (never
    # regenerated), e.g. "18142957186_28924096.jpg".
    local_pic_url = req[5]

    resp = item['resp']
    if not resp:
        return

    taobao_title = resp['title']
    taobao_picurl = resp['pic_url']

    if FLAGS.forcibly:
        # Forced refresh: rewrite title and picture even when unchanged.
        is_title_update = True
        is_picurl_update = True
    else:
        is_title_update = item_title != taobao_title
        is_picurl_update = item_picurl != taobao_picurl

    if not (is_title_update or is_picurl_update):
        return

    if is_picurl_update:
        # The picture changed: fetch it again before recording the new
        # url and dimensions on the item row.
        width, height = download_image({
            'item_id': item_id,
            'num_id': item_iid,
            'shop_id': shop_id,
            'pic_url': taobao_picurl,
            'image_name': local_pic_url,
            'crawl_path': FLAGS.crawl_path,
        })

    if is_title_update and is_picurl_update:
        db.execute(
            "update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
            taobao_title, taobao_picurl, width, height, item_id)
        logger.info(
            "item %s num_id %s update title from %s to %s, pic_url from %s to %s",
            item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
    elif is_title_update:
        db.execute(
            "update item set modified=now(), title=%s where id=%s",
            taobao_title, item_id)
        logger.info("item %s update title from %s to %s",
                    item_id, item_title, taobao_title)
    else:
        db.execute(
            "update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
            taobao_picurl, width, height, item_id)
        logger.info("item %s num_id %s update pic_url from %s to %s",
                    item_id, item_iid, item_picurl, taobao_picurl)


def update_item(sql):
    """Refresh title and picture of the items selected by `sql` from Taobao.

    `sql` is executed on the engine returned by get_db_engine(); each row
    must expose num_id at index 1 (used to build the Taobao request) and
    the columns listed in _sync_item_with_taobao. A failure on one item is
    logged and does not stop the remaining items. The total elapsed time
    is logged in milliseconds at the end.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)
    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                _sync_item_with_taobao(db, item)
            except Exception:
                # Per-item best effort; KeyboardInterrupt/SystemExit are
                # deliberately allowed to propagate so the job can be stopped.
                logger.error("update failed %s", traceback.format_exc())

    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent * 1000)
def crawl_main():
    """Quick status/price check of the hottest items against Taobao.

    Selects up to FLAGS.limit active items (joined with item_hotest and
    active shops of type <= 2) from the read database, asks Taobao for
    their current state and then:

    * marks items whose approve_status is not 'onsale' as status=2;
    * counts items whose price moved by more than 20% or more than 2.0,
      and writes the new price only when FLAGS.commit_price is set.

    A failure on one item is logged and does not stop the run. A summary
    line is logged at the end.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs': FLAGS.xdbconnstrs})
    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit
    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # NOTE(review): never incremented in this function, so the summary
    # always reports 0 volume changes.
    vol_change_counter = 0
    total = rows.rowcount

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]
                resp = item['resp']
                if not resp:
                    continue

                if resp['approve_status'] != 'onsale':
                    logger.debug("Item %s/%s %s %s is offshelf",
                                 counter, total, item_id, item_iid)
                    off_counter += 1
                    write_db.execute(
                        "update item set status=2, modified=now() where id=%s" % item_id)
                    continue

                price = float(resp['price'])
                diff = abs(item_price - price)
                # The tiny epsilon keeps the relative test from dividing by
                # zero when the stored price is 0.
                if diff / (item_price + 0.0000001) > 0.2 or diff > 2.0:
                    change_counter += 1
                    logger.debug("Item %s/%s %s %s price %s -> %s",
                                 counter, total, item_id, item_iid,
                                 item_price, price)
                    if FLAGS.commit_price:
                        write_db.execute(
                            "update item set price=%s where id=%s" % (price, item_id))
                    # NOTE(review): the nesting level of this debug line was
                    # not recoverable from the source layout; it is assumed
                    # to belong to the price-change branch - confirm.
                    logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Per-item best effort; KeyboardInterrupt/SystemExit are
                # allowed to propagate so the job can be interrupted.
                logger.error("update failed %s", traceback.format_exc())

    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)
def buildPath(self, cid):
    """Save category `cid` and then, recursively, each of its ancestors.

    The category is looked up with get_taobao_itemcats(); when the lookup
    yields data, the first returned entry's name and parent_cid are stored
    through self.saveCategory(cid, parent_cid, name), and the method
    recurses into the parent while parent_cid is not 0.

    Returns the comma-joined results of the recursive calls.
    NOTE(review): only recursive results are ever collected (never the cid
    or name itself), so the returned string appears to always be empty -
    confirm whether any caller relies on it.
    """
    collected = []
    lookup = get_taobao_itemcats(get_top(), cid)
    if lookup:
        first_cat = lookup['item_cats']['item_cat'][0]
        name = first_cat['name']
        parent_cid = first_cat['parent_cid']
        self.saveCategory(cid, parent_cid, name)
        if parent_cid != 0:
            # Note: recursion - walk up to the parent category.
            parent_path = self.buildPath(parent_cid)
            collected.append(parent_path)
    return ",".join(collected)
def doCrawl(shop_id, numids_set):
    """Fetch Taobao details for the given num_ids and return the non-empty responses.

    get_taobao_items() takes rows shaped like a database query result, and
    the fn_join_iids callback used here reads the num_id from index 1 of
    each row. The (shop_id, num_id) tuples built below exist only to
    satisfy that shape; they carry no other meaning.

    Returns a list with one plain dict per item whose response is truthy.
    """
    num_iids = [(shop_id, num_id) for num_id in numids_set]
    results = get_taobao_items(
        get_top(), num_iids,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]))

    return_item_list = []
    for r in results:
        for _iid, item in r.items.items():
            if item['resp']:
                return_item_list.append(dict(item['resp']))
    return return_item_list
def crawl_main():
    """Compare the hottest active items with Taobao and record the differences.

    Up to FLAGS.limit rows (id, num_id, price, pic_url, volume) are read
    from the read database. For every item with a truthy Taobao response:

    * if approve_status is not 'onsale', the item is set to status=2;
    * otherwise, if the price differs by more than 20% or by more than
      2.0, the change is counted and - only when FLAGS.commit_price is
      set - written to the write database.

    Errors on a single item are logged and skipped. Totals are logged
    once all batches have been processed.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs': FLAGS.xdbconnstrs})
    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit
    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # NOTE(review): nothing in this function increments this counter; the
    # final summary therefore always shows 0 for "volume change".
    vol_change_counter = 0
    total = rows.rowcount

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]

                if item['resp']:
                    if item['resp']['approve_status'] != 'onsale':
                        logger.debug("Item %s/%s %s %s is offshelf",
                                     counter, total, item_id, item_iid)
                        off_counter += 1
                        write_db.execute(
                            "update item set status=2, modified=now() where id=%s" % item_id)
                    else:
                        price = float(item['resp']['price'])
                        delta = abs(item_price - price)
                        # Relative (> 20%) or absolute (> 2.0) movement; the
                        # epsilon avoids dividing by a stored price of 0.
                        if delta / (item_price + 0.0000001) > 0.2 or delta > 2.0:
                            change_counter += 1
                            logger.debug("Item %s/%s %s %s price %s -> %s",
                                         counter, total, item_id, item_iid,
                                         item_price, price)
                            if FLAGS.commit_price:
                                write_db.execute(
                                    "update item set price=%s where id=%s" % (price, item_id))
                            # NOTE(review): original nesting of this debug
                            # line could not be recovered; assumed to be part
                            # of the price-change branch - confirm.
                            logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Keep going on ordinary errors, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would.
                logger.error("update failed %s", traceback.format_exc())

    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)
# Lookup used to detect a report row that is already stored.
_REPORT_CHECK_SQL = (
    "select outer_code, commission_rate, item_title, seller_nick, num_iid, "
    "shop_title, app_key, commission, trade_id, pay_time, item_num, "
    "category_id, pay_price, real_pay_fee, category_name, create_time, "
    "confirm_time, status from taobao_report where trade_id=%s")

# Parameterized insert; values are bound by the driver instead of being
# pasted into the statement, so quotes or '%' in titles cannot break it.
_REPORT_INSERT_SQL = (
    "insert into taobao_report (outer_code, commission_rate, item_title, "
    "seller_nick, num_iid, shop_title, app_key, commission, trade_id, "
    "pay_time, item_num, category_id, pay_price, real_pay_fee, category_name) "
    "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")


def _report_member_values(m):
    """Return one report member's fields in taobao_report insert order."""
    return [
        m.get('outer_code', ''),
        m['commission_rate'],
        m['item_title'],
        m['seller_nick'],
        m['num_iid'],
        m['shop_title'],
        m['app_key'],
        m['commission'],
        m['trade_id'],
        m['pay_time'],
        m['item_num'],
        m['category_id'],
        m['pay_price'],
        m['real_pay_fee'],
        m.get('category_name', ''),
    ]


def _store_report_member(db, csv_w, d, m):
    """Insert report member `m` unless its trade_id is already stored, and mirror it to CSV.

    `db` may be None (dry run): the duplicate check and the insert are
    then skipped, but the row is still written to `csv_w` when one is given.
    """
    if db:
        existing = list(db.execute(_REPORT_CHECK_SQL, m['trade_id']))
        if existing:
            if existing[0][0] == m.get('outer_code', '') and existing[0][4] == m['num_iid']:
                logger.debug("already exists in db, skip %s vs %s", existing[0], m)
            else:
                logger.warn("same trade id, something wrong! %s %s" % (m, existing))
            # Either way the trade is already present: neither insert nor export.
            return

    values = _report_member_values(m)
    logger.debug("%s %s", _REPORT_INSERT_SQL, values)
    if db:
        try:
            db.execute(_REPORT_INSERT_SQL, *values)
        except Exception:
            logger.warn("insert failed sql %s params %s --> err %s",
                        _REPORT_INSERT_SQL, values, traceback.format_exc())
    if csv_w:
        writecsv(csv_w, [d] + values)


def _fetch_report_date(db, csv_w, d):
    """Page through the report for date `d` and store every member."""
    pageno = 1
    total = 100  # passed to get_report as the per-page size
    result_len = total
    got = 0
    # A page shorter than `total` is taken to be the last one.
    while result_len >= total:
        report = get_report(get_top(), d, FLAGS.sessionid, pageno, total)
        if not report:
            logger.info("result %s %s null", d, pageno)
            break
        logger.info("result %s %s", d, pageno)
        pageno += 1
        members = report['taobaoke_report']['taobaoke_report_members']['taobaoke_report_member']
        result_len = len(members)
        got += result_len
        logger.info("result %s %s -> %s %s", d, pageno, got, result_len)
        for m in members:
            try:
                _store_report_member(db, csv_w, d, m)
            except Exception:
                logger.error("Got error %s %s", m, traceback.format_exc())


def main():
    """Download the taobaoke report for each date and store it in the DB and/or a CSV file.

    Controlled by FLAGS: `dryrun` disables all database access, `csv`
    enables the CSV export to `csv_filename`, `limit` throttles the date
    loop through waitlimit(). An error on one date is logged and the next
    date is processed. The CSV file is closed when the run ends, including
    when it is interrupted.
    """
    if FLAGS.sessionid == "":
        logger.error(
            "Get SESSION from http://container.api.taobao.com/container?appkey=12525923")

    db = None
    csv_w = None
    csv_file = None
    if not FLAGS.dryrun:
        db = get_db_engine()

    try:
        if FLAGS.csv:
            csv_file = open(FLAGS.csv_filename, "wb")
            csv_w = csv.writer(csv_file,
                               delimiter=FLAGS.csv_split,
                               quotechar=FLAGS.csv_quote,
                               quoting=csv.QUOTE_NONNUMERIC)
            csv_w.writerow([
                "report_date", "outer_code", "commission_rate", "item_title",
                "seller_nick", "num_iid", "shop_title", "app_key",
                "commission", "trade_id", "pay_time", "item_num",
                "category_id", "pay_price", "real_pay_fee", "category_name",
            ])

        for d in waitlimit(FLAGS.limit, 60.0, dates()):
            logger.info("Fetching %s %s", d, FLAGS.sessionid)
            try:
                _fetch_report_date(db, csv_w, d)
            except Exception:
                logger.error("Got fatal error %s %s", d, traceback.format_exc())
    finally:
        if csv_file is not None:
            csv_file.close()
# Lookup used to detect a payment row that is already stored.
_PAYMENT_CHECK_SQL = (
    "select outer_code, commission_rate, item_title, seller_nick, num_iid, "
    "shop_title, app_key, commission, trade_id, pay_time, item_num, "
    "category_id, pay_price, real_pay_fee, category_name, create_time, "
    "confirm_time, status from taobao_report where trade_id=%s")

# Parameterized insert (create_time is set by the database); values are
# bound by the driver, so quotes or '%' in titles cannot break the statement.
_PAYMENT_INSERT_SQL = (
    "insert into taobao_report (outer_code, commission_rate, item_title, "
    "seller_nick, num_iid, shop_title, app_key, commission, trade_id, "
    "pay_time, item_num, category_id, pay_price, real_pay_fee, category_name, "
    "create_time) "
    "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())")


def _payment_values(m):
    """Return one payment's fields in taobao_report insert order."""
    return [
        m.get('outer_code', ''),
        m['commission_rate'],
        m['item_title'],
        m['seller_nick'],
        m['num_iid'],
        m['shop_title'],
        m['app_key'],
        m['commission'],
        m['trade_id'],
        m['pay_time'],
        m['item_num'],
        m['category_id'],
        m['pay_price'],
        m['real_pay_fee'],
        m.get('category_name', ''),
    ]


def _store_payment(db, csv_w, d, m):
    """Insert payment `m` unless its trade_id is already stored, and mirror it to CSV.

    `db` may be None (dry run): the duplicate check and the insert are
    then skipped, but the row is still written to `csv_w` when one is given.
    """
    if db:
        existing = list(db.execute(_PAYMENT_CHECK_SQL, m['trade_id']))
        if existing:
            if existing[0][0] == m.get('outer_code', '') and existing[0][4] == m['num_iid']:
                logger.debug("already exists in db, skip %s vs %s", existing[0], m)
            else:
                logger.warn("same trade id, something wrong! %s %s" % (m, existing))
            # Either way the trade is already present: neither insert nor export.
            return

    values = _payment_values(m)
    logger.debug("%s %s", _PAYMENT_INSERT_SQL, values)
    if db:
        try:
            db.execute(_PAYMENT_INSERT_SQL, *values)
        except Exception:
            logger.warn("insert failed sql %s params %s --> err %s",
                        _PAYMENT_INSERT_SQL, values, traceback.format_exc())
    if csv_w:
        writecsv(csv_w, [d] + values)


def _fetch_payments_for_date(db, csv_w, d):
    """Page through the payments report for date `d` and store every payment."""
    pageno = 1
    total = 100  # passed to get_report as the per-page size
    result_len = total
    got = 0
    # A page shorter than `total` is taken to be the last one.
    while result_len >= total:
        report = get_report(get_top(), d, FLAGS.sessionid, pageno, total)
        if not report:
            logger.info("result %s %s null", d, pageno)
            break
        logger.info("result %s %s", d, pageno)
        members = report['taobaoke_payments']['taobaoke_payment']
        result_len = len(members)
        got += result_len
        logger.info("result %s %s -> %s %s", d, pageno, got, result_len)
        for m in members:
            try:
                _store_payment(db, csv_w, d, m)
            except Exception:
                logger.error("Got error %s %s", m, traceback.format_exc())
        # Advance once per fetched page.
        pageno += 1


def main():
    """Download the taobaoke payments report for each date into the DB and/or a CSV file.

    Controlled by FLAGS: `dryrun` disables all database access, `csv`
    enables the CSV export to `csv_filename`, `limit` throttles the date
    loop through waitlimit(). An error on one date is logged and the next
    date is processed. The CSV file is closed when the run ends, including
    when it is interrupted.
    """
    if FLAGS.sessionid == "":
        logger.error("Get SESSION from http://container.api.taobao.com/container?appkey=12525923")

    db = None
    csv_w = None
    csv_file = None
    if not FLAGS.dryrun:
        db = get_db_engine()

    try:
        if FLAGS.csv:
            csv_file = open(FLAGS.csv_filename, "wb")
            csv_w = csv.writer(csv_file,
                               delimiter=FLAGS.csv_split,
                               quotechar=FLAGS.csv_quote,
                               quoting=csv.QUOTE_NONNUMERIC)
            csv_w.writerow([
                "report_date", "outer_code", "commission_rate", "item_title",
                "seller_nick", "num_iid", "shop_title", "app_key",
                "commission", "trade_id", "pay_time", "item_num",
                "category_id", "pay_price", "real_pay_fee", "category_name",
            ])

        for d in waitlimit(FLAGS.limit, 60.0, dates()):
            logger.info("Fetching %s %s", d, FLAGS.sessionid)
            try:
                _fetch_payments_for_date(db, csv_w, d)
            except Exception:
                logger.error("Got fatal error %s %s", d, traceback.format_exc())
    finally:
        if csv_file is not None:
            csv_file.close()