def crawl_all():
    login_params = {'emailaddress': '*****@*****.**',
                    'password': '******',
                    'type': 'undefined',
                    'wbid': '0',
                    'savestat': 'true'
                    # 'checkcode':'',
                    }
    req = urllib2.Request('http://www.meilishuo.com/users/ajax_logon?frm=undefined',
                          urllib.urlencode(login_params), headers)
    handle = urllib2.urlopen(req)
    logger.info("logged result %s", handle.read())

    if FLAGS.itemid:
        crawl_item(FLAGS.itemid)
    else:
        if FLAGS.group:
            start = FLAGS.group * 1000000
            end = (FLAGS.group + 1) * 1000000
        else:
            start = FLAGS.start
            end = FLAGS.end
        db = get_db_engine()
        for item_id in xrange(start, end, 1):
            try:
                results = db.execute("select item_id from crawl_html where item_id=%s" % item_id)
                if results.rowcount > 0:
                    continue
            except:
                db = get_db_engine()
            crawl_item(item_id)
def convert_main():
    db = get_db_engine()
    db_production = get_db_engine(connstr=FLAGS.production_connstr)
    all_nicks = db_production.execute("select nick from shop")
    all_nick_set = set([row[0] for row in all_nicks])
    result = db.execute("select url, name from shop_shop where is_voted=1 and is_cloth=1 and is_delete=0;")
    for row in result:
        if row[0].find("tmall.com") > 0:
            shop_type = 2
        else:
            shop_type = 1
        if row[1] not in all_nick_set:
            db_production.execute("insert into shop(nick, url, type, status) values(%s, %s, %s, 2)",
                                  row[1], row[0], shop_type)
        else:
            print row[1].encode('utf8'), " exists"
def crawl_shops(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))
    if not shops:
        logger.info("no shops to crawl.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # global, shared by all shops
    tb_category = TaobaoCategory(db)
    term_factory = TermFactory(db)
    logger.info("init category %s and term factory %s.",
                len(tb_category.categories_dict), len(term_factory.sub_terms))

    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, tb_category, term_factory, db)
def check_image():
    # The table is large, so check it in batches.
    db_limit = {1: 100000,
                100000: 200000,
                200000: 300000,
                300000: 400000,
                400000: 500000}
    n = 0
    for s, e in db_limit.items():
        sql = "select id, num_id, shop_id, pic_url, local_pic_url from item where status=1 limit %s,%s" % (s, e)
        db = get_db_engine()
        items = list(db.execute(sql))
        for item in items:
            item_id = item[0]
            item_iid = str(item[1])
            shop_id = item[2]
            pic_url = str(item[3])
            local_pic_url = str(item[4])
            validate_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/" + str(shop_id) + "/big/" + local_pic_url
            if not os.path.exists(validate_path):
                n += 1
                logger.error("item %s missing pic %s", item_id, validate_path)
                try:
                    download_image({'item_id': item_id,
                                    'num_id': item_iid,
                                    'shop_id': shop_id,
                                    'pic_url': pic_url,
                                    'image_name': local_pic_url,
                                    'crawl_path': FLAGS.crawl_path})
                except:
                    logger.error("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
                    continue
    logger.info("items missing image: %s", n)
def load_click_items(numid2volumeprice):
    logger.info("Loading click items")
    click_items = []
    paid_items = []
    click_item_type = namedtuple("ClickItemType",
        'click_hash item_id click_time click_ip area_code click_price click_volume item_price item_volume shop_nick taobao_report_id num_id')
    db = get_db_engine()
    where = "click_time>='%s' and click_time<='%s'" % (datestr(FLAGS.start), datestr(FLAGS.end))
    if FLAGS.limit > 0:
        where += " limit %s" % FLAGS.limit
    sql = "select outer_code,item_id,click_time,click_ip,click_area,click_price,click_volume,item.price,item.volume,shop.nick,click_item_log.taobao_report_id,item.num_id from click_item_log left join item on click_item_log.item_id=item.id left join shop on shop.id=item.shop_id where %s" % where
    logger.debug("fetching %s", sql)
    results = db.execute(sql)
    progress = 0
    item_matched = 0
    logger.info("Processing click items %s", results.rowcount)
    price_diffs = 0
    for line in results:
        progress += 1
        click_item = click_item_type(*line)
        if not click_item.num_id:
            logger.warn("no numid %s", click_item)
            continue
        click_items.append(click_item)
        if click_item.item_id > 0:
            item_matched += 1
        volume = click_item.item_volume
        if not volume or volume == 0:
            logger.warn("item %s abnormal %s", click_item.item_id, volume)
            volume = 0.2
        elif volume > 800:
            volume = 800
        price = click_item.click_price
        if click_item.item_price and price > click_item.item_price * 1.5:
            price = click_item.item_price
            price_diffs += 1
            logger.warn("Price diff paid? %s %s/%s too much %s - %s",
                        click_item.taobao_report_id, price_diffs, results.rowcount,
                        click_item.click_price, click_item.item_price)
        if price > 500.0:
            price = 500.0
        if not price or price < 0.5:
            logger.warn("price %s abnormal %s", click_item.item_id, price)
            price = 1.0
        numid2volumeprice[long(click_item.num_id)] = {'volume': volume, 'price': price}
        if click_item.taobao_report_id:
            paid_items.append(click_item.taobao_report_id)
    logger.info("Total click %s item matched %s", len(click_items), item_matched)
    return click_items, paid_items
def load_click_items(numid2volume):
    logger.info("Loading click items")
    db = get_db_engine()
    json_file = open(FLAGS.click_input)
    click_json = simplejson.load(json_file)
    click_item_type = namedtuple("ClickItemType",
        'click_hash source media_id holder_id site admember_id campaign_id adgroup_id creative_id click_time click_ip area_code lpid price pubcat_list user_attr_list score item_price item_volume')
    click_items = []
    creative_matched = 0
    outercode_matched = 0
    progress = 0
    creative2item_cache = {}
    logger.info("Processing click items")
    for line in click_json:
        progress += 1
        click_item = click_item_type(*line)
        click_items.append(click_item)
        if creative2item_cache.has_key(click_item.creative_id):
            rr = creative2item_cache[click_item.creative_id]
        else:
            # creative_id --> (num_id, shop_name), item_price, item_volume
            r = db.execute("select num_id, shop.nick from item,shop where item.shop_id=shop.id and item.uctrac_creative_id=%s" % click_item.creative_id)
            if not r.rowcount:
                logger.warn("creative not matched %s %s/%s", click_item.creative_id, progress, len(click_json))
                continue
            rr = creative2item_cache[click_item.creative_id] = list(r)
            creative_matched += 1
        num_id, seller_nick = rr[0]
        #import pdb; pdb.set_trace()
        numid2volume[long(num_id)] = click_item.item_volume
        click_hash = 'jn%s' % click_item.click_hash
        r2 = db.execute('select 1 from taobao_report where outer_code="%s"' % click_hash)
        if r2.rowcount:
            outercode_matched += 1
    logger.info("Total click %s creative matched %s outercode matched %s",
                len(click_items), creative_matched, outercode_matched)
    return click_items
def crawl_items(sql):
    db = get_db_engine()
    items = db.execute(sql)
    logger.info("crawling image total %s", items.rowcount)
    if not items.rowcount:
        return
    if FLAGS.parallel:
        mapper = SimpleMapReduce(crawl_item2, identity)
        results = mapper(transform_args(items))
        logger.info("crawl finished %s", len(results))
    else:
        for item in items:
            crawl_item2({'item': item,
                         'is_commit': FLAGS.commit,
                         'crawl_path': FLAGS.crawl_path,
                         'server_path': FLAGS.path,
                         'is_remove': FLAGS.removetmp,
                         'org_server_path': FLAGS.org_path,
                         'dbuser': FLAGS.dbuser,
                         'dbpasswd': FLAGS.dbpasswd,
                         'dbhost': FLAGS.dbhost,
                         'dbport': FLAGS.dbport,
                         'db': FLAGS.db,
                         'echosql': FLAGS.echosql,
                         'statshost': FLAGS.statshost,
                         'statsport': FLAGS.statsport})
def process_item(item, total, cur):
    try:
        id, shop_id, local_pic_url, pic_url, manual_set, manual_updated_columns, status, num_id, pic_height, pic_width = item
        big_path = "%s%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid2_path = "%s%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid_path = "%s%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
        sma_path = "%s%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
        if os.path.exists(big_path) and pic_width == 0:
            size = get_image_size(big_path)
            logger.debug("update %s size %s" % (id, size))
            db = get_db_engine()
            db.execute("update item set pic_width=%s,pic_height=%s where id=%s" % (size[0], size[1], id))
        if status in (2, 3) and not FLAGS.force:
            return
        if not os.path.exists(big_path):
            headers = {'Referer': "http://item.taobao.com/item.htm?id=%s" % id,
                       'User-Agent': DEFAULT_UA}
            data = crawl_page(num_id, pic_url, headers)
            # save to path
            logger.debug("crawling %s %s %s %s", cur, total, big_path, item)
            save_image(big_path, data)
        if not os.path.exists(mid2_path):
            logger.debug("thumbing %s %s %s %s", cur, total, mid2_path, item)
            imagemagick_resize(300, 300, big_path, mid2_path)
        if not os.path.exists(mid_path):
            logger.debug("thumbing %s %s", mid_path, item)
            imagemagick_resize(210, 210, big_path, mid_path)
        if not os.path.exists(sma_path):
            logger.debug("thumbing %s %s", sma_path, item)
            imagemagick_resize(60, 60, big_path, sma_path)
    except:
        logger.error("unknown error %s, %s", item, traceback.format_exc())
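# process_item() above relies on an imagemagick_resize(width, height, src, dst) helper that
# is not included in this section. The following is only a minimal sketch of what such a
# helper could look like, assuming the ImageMagick `convert` binary is installed on the host;
# the real helper may work differently (e.g. use a binding library or different flags).
import subprocess

def imagemagick_resize_sketch(width, height, src_path, dst_path):
    # Fit the image inside width x height while preserving the aspect ratio.
    cmd = ["convert", src_path, "-resize", "%dx%d" % (width, height), dst_path]
    subprocess.check_call(cmd)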
def update_item(sql):
    t = time.time()
    db = get_db_engine()
    item = db.execute(sql)
    results = get_taobao_items(get_top(), item,
                               fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
                               calllimit=60)
    for batch_item in results:
        for iid, item in batch_item.items.iteritems():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                local_pic_url = item['req'][5]  # reuse the filename already in the db, e.g. "18142957186_28924096.jpg"
                if item['resp']:
                    taobao_title = item['resp']['title']
                    taobao_picurl = item['resp']['pic_url']
                    # If item_picurl != taobao_picurl, re-fetch the image, store it in DFS and update the item:
                    # title, pic_url, pic_width, pic_height, modified
                    if FLAGS.forcibly:  # force the update
                        is_title_update = True
                        is_picurl_update = True
                        # local_pic_url = "%s_%s.%s" % (item_iid, str(id(item)), item_picurl.split('.')[-1].split('?')[0].split('/')[-1])
                    else:
                        if cmp(item_title, taobao_title):
                            is_title_update = True
                        else:
                            is_title_update = False
                        if cmp(item_picurl, taobao_picurl):
                            is_picurl_update = True
                        else:
                            is_picurl_update = False
                    if is_title_update:
                        if is_picurl_update:
                            width, height = download_image({'item_id': item_id,
                                                            'num_id': item_iid,
                                                            'shop_id': shop_id,
                                                            'pic_url': taobao_picurl,
                                                            'image_name': local_pic_url,
                                                            'crawl_path': FLAGS.crawl_path})
                            db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                                       taobao_title, taobao_picurl, width, height, item_id)
                            logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s",
                                        item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                        else:
                            db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
                            logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                    elif is_picurl_update:
                        width, height = download_image({'item_id': item_id,
                                                        'num_id': item_iid,
                                                        'shop_id': shop_id,
                                                        'pic_url': taobao_picurl,
                                                        'image_name': local_pic_url,
                                                        'crawl_path': FLAGS.crawl_path})
                        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                                   taobao_picurl, width, height, item_id)
                        logger.info("item %s num_id %s update pic_url from %s to %s",
                                    item_id, item_iid, item_picurl, taobao_picurl)
            except:
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent * 1000)
def mig_main():
    db = get_db_engine()
    result = db.execute("select id,name,status from wb_account;")
    for row in result:
        sql = "update wb_qq_account set qqid=%s where name='%s'" % (QQIDS[row[1]], row[1])
        print sql
        db.execute(sql)
def crawl_hotest():
    # Fetch all item_ids from the item_hotest table on bi-db1 (that table is refreshed hourly),
    # write them into the temp_item_hotest scratch table (clearing the old rows first),
    # then join item with temp_item_hotest and crawl comments, at most 20 pages per item.
    bi_db = get_db_engine(dbhost=FLAGS.bihost)
    itemid_list = list(bi_db.execute("select item_id from item_hotest"))
    db = get_db_engine()
    db.execute("TRUNCATE table temp_item_hotest")
    logger.debug("TRUNCATE table temp_item_hotest")
    db.execute("insert into temp_item_hotest values (%s)", itemid_list)
    logger.debug("insert temp_item_hotest")
    if FLAGS.force:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.id=temp_item_hotest.item_id")
    else:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.status=1 and item.id=temp_item_hotest.item_id order by item.id desc")
def get_data():
    sql = "select shop_id,local_pic_url from item where modified>'2013-12-09 09' order by shop_id desc"
    db = get_db_engine()
    items = list(db.execute(sql))
    for item in items:
        refreshCdnCache(item[0], item[1])
        time.sleep(1)
def clicklog_main():
    click_file_list = []
    for d in eachday(FLAGS.start, FLAGS.end):
        click_file_list.extend(glob("/space/log/filtered/click*/click-" + datestr(d) + "_00???"))
    # TODO: load from conversion db?
    ret = []
    if FLAGS.commit:
        db = get_db_engine()
    for fn in click_file_list:
        logger.debug("processing %s", fn)
        for line in open(fn, "r"):
            click = get_click(line)
            if not click:
                continue
            click_obj, click_ex_obj, score, why = click
            rec = get_record(click)
            #if rec[0] in written:
            #    continue  # already written in db.
            if rec:
                if FLAGS.commit:
                    insert_match(db, rec)
                else:
                    ret.append(rec)
    simplejson.dump(ret, open(FLAGS.out_file, "w"))
    return ret
def GET(self, id):
    db = get_db_engine()
    results = db.execute("select crawl_item_images.url, crawl_item_images.pos, crawl_item_images.type from crawl_html, crawl_item_images where crawl_item_images.item_id=crawl_html.item_id and crawl_html.item_id=%s;" % id)
    item_crawler = ItemCrawler(id, FLAGS.crawl_path)
    item_crawler.crawl(results, ((94, 94), (350, 350)), False)
    return render.crawlitem(id, item_crawler.results)
def db_validate():
    db = get_db_engine()
    items = list(db.execute("select shop_id,local_pic_url from item where status=1 and crawl_status=2 and created between CURDATE()-interval 7 day and CURDATE()"))
    i = 0
    for item in items:
        shop_id = item[0]
        local_pic_url = item[1]
        if shop_id == 5:
            small4_path = "/space/wwwroot/image.guang.j.cn/ROOT/images_1/%s/small4/%s" % (shop_id, local_pic_url)
        else:
            small4_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/%s/small4/%s" % (shop_id, local_pic_url)
        if not os.path.isfile(small4_path):
            i += 1
            try:
                big_path = small4_path.replace("/small4/", "/big/")
                image = Image.open(big_path)
                width, height = image.size
                convert_img(big_path, small4_path, width, height)
                logger.info("%s:%s", i, small4_path)
            except IOError, e:
                logger.error("Open image failed %s:%s %s", i, big_path, e.message)
                continue
def crawl_main():
    hosts = set()
    hosts_in_db = set()
    hosts_attr = {}
    db = get_db_engine()
    result = db.execute("select url from shop")
    for row in result:
        hosts_in_db.add(str(urlparse.urlparse(row[0]).netloc))
    #print hosts_in_db
    for line in open(FLAGS.path):
        url = line.split()[0]
        host = str(urlparse.urlparse(url).netloc)
        hosts.add(host)
        if url.find('tmall.com') > 0:
            shopname = " ".join(line.split()[1:])
        else:
            shopname = " ".join(line.split()[1:-1])
        hosts_attr[host] = shopname
    hosts_not_in_db = hosts - hosts_in_db
    print "hosts %s indb %s notindb %s" % (len(hosts), len(hosts_in_db), len(hosts_not_in_db))
    for host in hosts_not_in_db:
        print "http://%s/ %s" % (host, hosts_attr[host])
def img_update():
    sql = "select id,num_id,shop_id,pic_url,local_pic_url from item where pic_url like '%%q90.%%'"
    db = get_db_engine()
    items = db.execute(sql)
    tr = re.compile("(.+\.(jpg|png))[^$]*.jpg$")
    for item in items:
        taobao_picurl = item[3]
        taobao_picurl = tr.sub(r'\1', taobao_picurl)
        try:
            width, height = download_image({'item_id': item[0],
                                            'num_id': item[1],
                                            'shop_id': item[2],
                                            'pic_url': taobao_picurl,
                                            'image_name': item[4],
                                            'crawl_path': FLAGS.crawl_path})
            db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                       taobao_picurl, width, height, item[0])
            logger.info("item %s update image ok", item[0])
        except:
            logger.error("download %s:%s failed reason %s", item[0], taobao_picurl, traceback.format_exc())
            continue
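# A quick illustration of what the q90-stripping regex in img_update() does, using a
# made-up image URL (real Taobao CDN URLs differ in host and naming): it keeps everything
# up to the first ".jpg"/".png" and drops the trailing "_q90.jpg"-style thumbnail suffix.
import re

tr = re.compile(r"(.+\.(jpg|png))[^$]*.jpg$")
print tr.sub(r'\1', "http://img.example.com/bao/uploaded/pic.jpg_q90.jpg")
# -> http://img.example.com/bao/uploaded/pic.jpg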
def get_xks_tagmatch(xks):
    tagmatch = ''
    if xks:
        db = get_db_engine()
        rows = db.execute("SELECT tag_match FROM recommend_subscriber WHERE id = %s" % xks)
        if rows.rowcount > 0:
            tagmatch = convert_tagmatch(list(rows)[0][0])
    return tagmatch
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--", "—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s no discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s no discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("tmall shop %s:%s no discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
def do_all(fn):
    db = get_db_engine()
    where_sql = " %s" % (FLAGS.where)
    results = db.execute("select id from shop where type < 3 and %s" % where_sql)
    for result in results:
        fn(result[0], db)
        time.sleep(1.0)
def process_all_items():
    db = get_db_engine()
    last_time = 0
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)
def crawl_num():
    db = get_db_engine()
    if FLAGS.force:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,shop where item.shop_id=shop.id and shop.type!=3 and item.num_id=%s" % FLAGS.numid)
    else:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,shop where item.shop_id=shop.id and shop.type!=3 and item.status=1 and item.num_id=%s" % FLAGS.numid)
def crawl_shops(sql_filter):
    sql_template = '''
        select s.id as shop_id
             , s.type as shop_type
             , s.url as shop_url
             , i.id as first_item_id
             , h.id as item_html_id
             , h.html as item_html
        from (
            select max(i.id) as item_id
                 , i.shop_id
            from item i
            inner join crawl_html h
                on i.status = 1
                and i.crawl_status = 2
                and i.id = h.item_id
            group by i.shop_id
        ) sni
        inner join item i on sni.item_id = i.id
        inner join crawl_html h on h.item_id = i.id
        inner join shop s on i.shop_id = s.id
        where
    '''
    sql = sql_template + sql_filter + ';'

    db_shop = get_db_engine()
    shops = db_shop.execute(sql)
    if not shops.returns_rows:
        logger.info("no shops to be crawled.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    db = get_db_engine()
    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, db)
def update_shop_level(sql):
    db = get_db_engine()
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        process_shop(db, shop, failed)
    results = "Update shop level checked: total %s, failed %s, details %s" % (
        shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
def replace_main():
    now_time = datetime.datetime.now()
    front_time = now_time - datetime.timedelta(hours=1)
    number = FLAGS.number  # by default, take the first image of the detail page
    fdfs_client = Fdfs_client('/etc/fdfs/client.conf')
    db = get_db_engine()
    if FLAGS.itemid > 0:
        item_sql = "select id,shop_id,local_pic_url from item where id=%s and status=1" % FLAGS.itemid
    elif FLAGS.shopid > 0:
        item_sql = "select id,shop_id,local_pic_url from item where shop_id=%s and crawl_status=2 and status=1" % FLAGS.shopid
    elif FLAGS.all:
        for shop, num in SHOP_NUM.items():
            number = num
            item_sql = "select id,shop_id,local_pic_url from item where shop_id=%s and crawl_status=2 and status=1 and created>'%s'" % (shop, front_time)
    items = list(db.execute(item_sql))
    logger.info("replace main image total %s", len(items))
    i = 1
    for item in items:
        item_id = item[0]
        shop_id = item[1]
        local_pic_url = item[2]
        # must order by pos
        image_sql = "select item_id,fastdfs_filename,pos from item_images where type=2 and item_id=%s order by pos limit %s,1" % (item_id, number - 1)
        image_item = list(db.execute(image_sql))
        try:
            if len(image_item) > 0 and image_item[0][0] is not None:
                fastdfs_filename = str(image_item[0][1])
            else:
                fastdfs_filename = "http://image2.guang.j.cn/images/%s/big/%s" % (shop_id, local_pic_url)
            download_image(fastdfs_filename, shop_id, item_id, local_pic_url, fdfs_client)
        except:
            logger.error("download %s:%s failed reason: %s", item_id, fastdfs_filename, traceback.format_exc())
            continue
        try:
            refreshCdnCache(shop_id, local_pic_url)
        except:
            logger.error("refreshCdnCache %s:%s failed: %s", item_id, local_pic_url, traceback.format_exc())
            continue
        logger.info("%s/%s replace item %s main image success %s", i, len(items), item_id, local_pic_url)
        i += 1
def crawl_item(item_id):
    try:
        url = "http://www.meilishuo.com/share/%s" % item_id
        data = crawl_page(item_id, url, headers)
        if not data:
            return
        try:
            html_obj = etree.XML(data)
        except:
            try:
                html_obj = soupparser.fromstring(data.decode('utf8'))
            except:
                try:
                    html_obj = etree.HTML(data)
                except:
                    logger.warn("crawling %s len %s parse failed %s", item_id, len(data), traceback.format_exc(),
                                extra={'tags': ['crawlItemParseException', ]})
        #saved_data = etree.tostring(html_obj.xpath("//div[@id='main']/div/div/div")[0])
        detail_path = html_obj.xpath("//div[@id='main']/div/div/div")
        if not detail_path:
            logger.info("err parse %s len %s", item_id, len(data))
            return
        detail_obj = detail_path[0]
        results = {}
        results['user_url'] = get_obj(detail_obj, "div/dl/dt/a/@href")
        results['user_name'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['obj_date'] = get_obj(detail_obj, "div/dl/dd/span/text()")
        results['obj_url'] = get_obj(detail_obj, "div/div/div/p[1]/a/@href")
        results['obj_title'] = get_obj(detail_obj, "div/div/div/p[1]/a/text()")
        results['obj_img'] = get_obj(detail_obj, "div/div/a/img/@src")
        results['obj_fav_count'] = get_obj(detail_obj, "div/div/div/p[2]/a/b/text()")
        results['obj_org_img'] = get_obj(detail_obj, "div/div[@class='original_pic_ioc']/a/@href")
        results['obj_comment_count'] = get_obj(detail_obj, "div/div/div/a/b/text()")
        results['obj_price'] = get_obj(detail_obj, "div/div/div/div/p/text()")
        results['group_title'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['group_url'] = get_obj(detail_obj, "div/dl/dd[1]/a/@href")
        results['group_desc'] = get_obj(detail_obj, "div/dl/dd[1]/text()")
        logger.debug("results %s", results)
        #import pdb; pdb.set_trace()
        db = get_db_engine()
        db.execute("delete from crawl_html where item_id=%s" % item_id)
        db.execute("insert into crawl_html (item_id,html) values (%s, %s)", item_id, simplejson.dumps(results))
        logger.info("crawled %s len %s", url, len(data))
    except KeyboardInterrupt:
        raise
    except:
        logger.warn("crawl failed %s exception %s", url, traceback.format_exc())
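# crawl_item() uses a get_obj(node, xpath) helper that is not shown in this section.
# Judging from how it is called, it most likely returns the first XPath match (or a
# default when nothing matches). A plausible minimal sketch, clearly an assumption and
# not the original implementation:
def get_obj_sketch(node, xpath_expr, default=''):
    # Return the first XPath match under `node`, or `default` when nothing matches.
    matched = node.xpath(xpath_expr)
    if matched:
        return matched[0]
    return default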
def crawl():
    company_id = 19
    url = "https://efinance.cmbchinaucs.com/Handler/ActionPage.aspx?targetAction=GetProjectList_Index"
    headers = {'Host': "efinance.cmbchinaucs.com",
               'Connection': "keep-alive",
               'Content-Length': "33",
               'Cache-Control': "max-age=0",
               'Accept': "text/plain, */*",
               'Origin': "https://efinance.cmbchinaucs.com",
               'X-Requested-With': "XMLHttpRequest",
               'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36",
               'Content-Type': "application/x-www-form-urlencoded",
               'Referer': "https://efinance.cmbchinaucs.com/",
               'Accept-Encoding': "gzip,deflate",
               'Accept-Language': "zh-CN,zh;q=0.8,en;q=0.6",
               'Cookie': "ASP.NET_SessionId=woqbxpemqp3kk4syvfbkxtzw"}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = post(url, data={"targetAction": "GetProjectList_Index"}, headers=headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        print loans_json
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def update_shop_level(sql):
    db = get_db_engine()
    update_sql = "update shop set level = '%s' where id = %s"
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        #import pdb; pdb.set_trace()
        process_shop(db, shop, failed)
    results = "Update shop level checked: total %s, failed %s, details %s" % (
        shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
def main():
    db = get_db_engine()
    items = db.execute("select id, shop_id, local_pic_url, concat('/space/wwwroot/image.guang.j.cn/ROOT/images/', shop_id, '/mid2/', local_pic_url) as img_path from item where status=1 and %s order by id" % FLAGS.where)
    for item in items:
        img_path = item[3]
        if not os.path.exists(img_path) or img_path.endswith('.png'):
            logger.warn('skipping %s %s', item[0], item[3])
            continue
        try:
            logger.debug('processing %s %s', item[0], item[3])
            d = ','.join(map(str, pHash.image_digest(img_path, 1.0, 1.0, 180).coeffs))
            db.execute("insert ignore into item_image_digest (item_id, digest) values (%s, '%s')" % (item[0], d))
        except:
            pass
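# The digest stored in item_image_digest above is a comma-joined list of pHash radial-hash
# coefficients. The snippet below is only a sketch of how two stored digests could be
# compared, using a plain Pearson correlation in pure Python; it is an assumption, not the
# method the original code uses (pHash itself also provides a cross-correlation routine).
import math

def digest_similarity(digest_a, digest_b):
    # Parse the comma-joined coefficient strings and return their correlation
    # (values near 1.0 indicate visually similar images).
    a = [float(x) for x in digest_a.split(',')]
    b = [float(x) for x in digest_b.split(',')]
    n = min(len(a), len(b))
    a, b = a[:n], b[:n]
    mean_a = sum(a) / n
    mean_b = sum(b) / n
    cov = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
    norm_a = math.sqrt(sum((x - mean_a) ** 2 for x in a))
    norm_b = math.sqrt(sum((y - mean_b) ** 2 for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return cov / (norm_a * norm_b)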
def fix_items(sql):
    db = get_db_engine()
    items = db.execute(sql)
    logger.info("Fixing image total %s", items.rowcount)
    if not items.rowcount:
        return
    if FLAGS.parallel:
        mapper = SimpleMapReduce(fix_item2, identity)
        results = mapper(transform_args(items))
        logger.info("fix finished %s", len(results))
    else:
        for item in items:
            fix_item2({'item': item,
                       'crawl_path': FLAGS.crawl_path,
                       'server_path': FLAGS.path,
                       'is_remove': FLAGS.removetmp,
                       'org_server_path': FLAGS.org_path})
def rollback_shop(shop_id, db):
    if not db:
        db = get_db_engine()
    sql = "select id,num_id,null,null from item where shop_id = %s and detail_url like '%%%%s.click.taobao.com%%%%'" % shop_id
    if FLAGS.limit:
        sql += " limit " + str(FLAGS.limit)
    results = db.connect().execute(sql)
    for i, result in enumerate(results):
        new_url = "http://item.taobao.com/item.htm?id=%s" % result[1]
        sql = "update item set detail_url='%s' where id = %s" % (new_url, result[0])
        logger.debug("Run sql %s/%s: %s" % (i, results.rowcount, sql))
        if not FLAGS.dryrun:
            db.execute(sql)
def remove():
    db = get_db_engine()
    key = db.execute("select uniq_url from item_images where item_id=%s and fastdfs_filename=%s",
                     FLAGS.itemid, FLAGS.dfsimg)
    if not key.rowcount > 0:
        return
    else:
        key = list(key)
    result = db.execute("select id from item_images where uniq_url=%s and disabled=0", key[0])
    i = 0
    for r in result:
        sql = "update item_images set disabled=1 where id=%s" % r[0]
        print("deleting %s/%s %s" % (i, result.rowcount, sql))
        db.execute(sql)
        i += 1
def remove():
    db = get_db_engine()
    content = open(FLAGS.file)
    j = 0
    for l in content:
        key = l.split('\t')
        print("querying %s" % key[0])
        result = db.execute("select id from item_images where uniq_url=%s and disabled=0", key[0])
        i = 0
        j += 1
        for r in result:
            sql = "update item_images set disabled=1 where id=%s" % r[0]
            print("deleting %s %s/%s %s" % (j, i, result.rowcount, sql))
            db.execute(sql)
            i += 1
def save():
    db = get_db_engine()
    content = open(FLAGS.file)
    html = '<html><body>'
    for l in content:
        key = l.split('\t')
        result = db.execute("select fastdfs_filename from item_images where uniq_url=%s limit 10", key[0])
        if result.rowcount > 0:
            html += '<div>'
            #for r in result:
            #    html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1,5), r[0])
            html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1, 5), list(result)[0][0])
            html += '</div>'
    html += '</body></html>'
    print html
def main():
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    #import pdb; pdb.set_trace()
    results = simplejson.loads(download(url))
    db = get_db_engine()
    counts = []
    for doc in results['response']['docs']:
        item_id = doc['item_id']
        count = db.execute("select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;" % item_id)
        if count.rowcount:
            counts.append(list(count)[0][0])
        else:
            counts.append(0)
    cs = Series(counts)
    logger.info(cs.describe())
def check_shops(sql):
    db = get_db_engine()
    last_time = 0
    shops = db.execute(sql)
    logger.info("checking total %s", shops.rowcount)
    failed = []
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        check_one_shop(shop, failed)
    logger.info("Checked result, total %s failed %s", shops.rowcount, len(failed))
    for f in failed:
        logger.warn("%s %s", f['shopid'], f['err'])
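# crawl_shops() and check_shops() both repeat the same FLAGS.interval throttling pattern
# (sleep so that successive requests are at least `interval` milliseconds apart). A small
# reusable sketch of that pattern is below; IntervalThrottle is a hypothetical helper, not
# part of the original code.
import time

class IntervalThrottle(object):
    # Ensure at least `interval_ms` milliseconds pass between successive calls to wait().
    def __init__(self, interval_ms):
        self.interval_ms = interval_ms
        self.last_time = 0

    def wait(self):
        cur = time.time() * 1000
        if cur - self.last_time < self.interval_ms:
            time.sleep((self.interval_ms - (cur - self.last_time)) / 1000.0)
        self.last_time = time.time() * 1000

# Usage would mirror the loops above:
#   throttle = IntervalThrottle(FLAGS.interval)
#   for shop in shops:
#       throttle.wait()
#       check_one_shop(shop, failed)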