def run(self):
    """Compare stored product fingerprints with freshly generated ones.

    For every brand in ``self.brand_list`` (or every distinct ``brand_id``
    found in the ``products`` table when the list is empty) each product's
    stored ``fingerprint`` is compared with ``gen_fingerprint(brand, model)``.
    Each mismatch is appended to ``self.report``, printed unless
    ``self.silent`` is set, and written back to the database when
    ``self.update_fingerprint`` is set.

    ``self.tot`` receives the number of products to check and
    ``self.progress`` counts the products processed so far.

    Returns ``self.report`` when there is no brand to process; otherwise
    the method falls off the end and returns ``None`` (kept as in the
    original so existing callers are unaffected).

    Any exception raised while processing a brand rolls back that brand's
    transaction and is re-raised.
    """
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    # Release the connection on every exit path, including the early return
    # and propagated exceptions.
    try:
        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            self.brand_list = [
                int(val[0]) for val in rs.fetch_row(maxrows=0)
            ]
        brand_list = self.brand_list

        if not brand_list:
            # No brand at all: nothing to check.
            return self.report

        self.progress = 0
        # Total number of products that will be checked.
        self.tot = int(
            db.query(
                str.format(
                    'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})',
                    ','.join(str(tmp) for tmp in brand_list)))
            .fetch_row()[0][0])

        for brand in brand_list:
            if not self.silent:
                print(unicode.format(
                    u'\nPROCESSING {0} / {1}\n', brand,
                    info.brand_info()[brand]['brandname_e']))

            db.start_transaction()
            try:
                rows = db.query_match(
                    ['model', 'idproducts', 'fingerprint'], 'products',
                    {'brand_id': brand}).fetch_row(maxrows=0)
                for model, pid, fingerprint in rows:
                    self.progress += 1
                    new_fp = gen_fingerprint(brand, model)
                    if fingerprint == new_fp:
                        continue

                    self.report.append({
                        'model': model,
                        'idproducts': pid,
                        'fingerprint_db': fingerprint,
                        'fingerprint_gen': new_fp,
                        'brand_id': brand
                    })
                    if not self.silent:
                        print(unicode.format(
                            u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, '
                            u'fingerprints: {3} => {4}\n', model, pid, brand,
                            fingerprint, new_fp))
                    if self.update_fingerprint:
                        # Automatically overwrite the stored MD5 fingerprint.
                        db.update({'fingerprint': new_fp}, 'products',
                                  str.format('idproducts={0}', pid),
                                  timestamps=['update_time'])
            except:
                # Bare except is deliberate: roll back on any failure
                # (including interrupts) and re-raise unchanged.
                db.rollback()
                raise
            else:
                # Commit only when the whole brand was processed cleanly.
                db.commit()
    finally:
        db.close()
def currency_update(param_dict):
    """Refresh the currency exchange-rate information in ``region_info``.

    For every ``(iso_code, currency)`` row of ``region_info`` a quote for
    ``<currency>CNY`` is downloaded through ``cm.get_data``, parsed as CSV,
    and the row's ``rate`` and ``update_time`` columns are updated. All
    updates happen in one transaction: any failure rolls everything back and
    the exception is re-raised.

    @param param_dict: not used by this function.
    """
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    # Release the connection whether the update succeeds or fails.
    try:
        rs = db.query_match(['iso_code', 'currency'],
                            'region_info').fetch_row(maxrows=0)

        db.start_transaction()
        try:
            for code, currency in rs:
                print(str.format('Fetching for currency data for {0}...',
                                 currency))
                # NOTE(review): this Yahoo CSV quotes endpoint has reportedly
                # been discontinued -- confirm it still responds.
                data = cm.get_data(url=str.format(
                    'http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                    '&f=sl1d1t1ba&e=.json', currency))

                # Only the first CSV row is used; the code reads column 1 as
                # the rate and columns 2 and 3 as the quote date and time.
                rdr = csv.reader(StringIO(data['body']))
                line_data = list(rdr)[0]
                timestamp = datetime.datetime.strptime(
                    str.format('{0} {1}', line_data[2], line_data[3]),
                    '%m/%d/%Y %I:%M%p')

                db.update(
                    {
                        'rate': line_data[1],
                        'update_time': timestamp.strftime('%Y-%m-%d %H:%M:%S')
                    }, 'region_info', str.format('iso_code="{0}"', code))
            db.commit()
        except:
            # Bare except is deliberate: undo partial updates on any failure
            # and re-raise unchanged.
            db.rollback()
            raise
    finally:
        db.close()
def urlprocess_main():
    """Re-encode the ``url`` column of ``products`` rows, batch by batch.

    Command-line options (read from ``sys.argv``):
        -s N   starting offset passed to ``get_products`` (default 0)
        -c N   batch size passed to ``get_products`` (default 100)

    Batches are fetched until ``get_products`` returns an empty result. For
    each product the URL is passed through ``urlencode`` and, when the result
    is non-empty, written back to the ``products`` table. A failure to encode
    or update a single product is logged and the product is skipped; the run
    continues.
    """
    db_spec = {
        "host": "127.0.0.1",
        "port": 3306,
        "username": "******",
        "password": "******",
        "schema": "editor_stores"
    }

    idproducts_start = 0
    idproducts_count = 100

    # Parse options before connecting so a bad command line cannot leave an
    # open connection behind.
    opts, _ = getopt.getopt(sys.argv[1:], "s:c:")
    for opt, arg in opts:
        if opt == '-s':
            idproducts_start = int(arg)
        elif opt == '-c':
            idproducts_count = int(arg)

    db = RoseVisionDb()
    db.conn(db_spec)
    try:
        logger.info("Url process start")
        while True:
            products = get_products(db, idproducts_start, idproducts_count)
            if not products:
                logger.info("Url process end")
                break

            logger.info(
                str.format("Url process offset : {0} count : {1}",
                           idproducts_start, len(products)))
            idproducts_start += idproducts_count

            for product in products:
                origin_url = product['url']
                # ``except Exception`` (not a bare except) so Ctrl-C and
                # SystemExit still stop the otherwise endless loop.
                try:
                    url = urlencode(origin_url)
                except Exception:
                    logger.info(
                        str.format("Error: {0} encode {1} failed",
                                   product['idproducts'], origin_url))
                    continue

                if not url:
                    continue

                try:
                    db.update({'url': url}, 'products',
                              str.format('idproducts="{0}"',
                                         product['idproducts']))
                except Exception:
                    logger.info(
                        str.format("Error: {0} update {1} failed",
                                   product['idproducts'], url))
    finally:
        db.close()
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                        db_spider_spec=getattr(glob, 'SPIDER_SPEC'),
                        table='products',
                        extra_cond=None):
    """Fill in the ``tags`` column for records of the editor database.

    The tag mapping (``tag_name`` -> JSON list in ``mapping_list``) is read
    from ``products_tag_mapping`` using ``db_spider_spec``. Then, for every
    row of ``table`` in the ``db_spec`` database, all values found in the
    row's JSON ``extra`` column are looked up in that mapping and the
    de-duplicated mapped names are stored as a JSON list in ``tags``.

    @param db_spec: connection spec of the database holding ``table``.
    @param db_spider_spec: connection spec of the database holding
        ``products_tag_mapping``.
    @param table: name of the table whose rows are tagged.
    @param extra_cond: optional iterable of SQL condition fragments; they are
        AND-ed together and applied to BOTH queries. When empty or None every
        row is selected.

    An ``OperationalError`` during the update phase rolls the transaction
    back and is swallowed; other exceptions propagate.
    """
    # NOTE(review): assumes unicodify() returns a unicode string. The
    # original wrapped it in tuple(), which formatted as a tuple repr rather
    # than the SQL fragment.
    if extra_cond:
        where_clause = u' AND '.join(
            unicode.format(u'({0})', unicodify(v)) for v in extra_cond)
    else:
        where_clause = u'1'

    db = RoseVisionDb()
    db.conn(db_spider_spec)
    try:
        # Placeholder index fixed: the original used {1} with one argument,
        # which always raised IndexError.
        rs = db.query(
            unicode.format(
                u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {0}',
                where_clause).encode('utf-8'))
        # NOTE(review): keys come from the DB driver while the tags looked up
        # below come from json.loads (unicode); confirm non-ASCII tag names
        # compare equal under the connection's charset settings.
        mapping_rules = dict(rs.fetch_row(maxrows=0))
    finally:
        db.close()

    db.conn(db_spec)
    try:
        db.start_transaction()
        try:
            rs = db.query(
                unicode.format(u'SELECT * FROM {0} WHERE {1}', table,
                               where_clause).encode('utf-8'))
            for _ in xrange(rs.num_rows()):
                record = rs.fetch_row(how=1)[0]
                extra = json.loads(record['extra'])

                # Collect every distinct value listed under any key of extra.
                tags = set()
                for values in extra.values():
                    tags.update(values)

                tag_names = set()
                for tag in tags:
                    if tag in mapping_rules:
                        tag_names.update(json.loads(mapping_rules[tag]))

                # The table argument was missing in the original call, so the
                # condition string was passed where the table name belongs.
                db.update(
                    {'tags': json.dumps(list(tag_names), ensure_ascii=False)},
                    table,
                    str.format('idproducts={0}', record['idproducts']))
            db.commit()
        except OperationalError:
            db.rollback()
    finally:
        db.close()
def run(self):
    """Copy the products matching ``self.cond`` from the source to the destination DB.

    Connects to ``self.src_spec`` and ``self.dst_spec``, selects the
    ``idproducts`` values matching the AND-ed ``self.cond`` fragments and
    synchronises each product in its own destination transaction.
    ``self.tot`` receives the number of selected products and
    ``self.progress`` counts the products processed so far.

    Any exception rolls back the current product's transaction and is
    re-raised. Both connections are closed on every exit path.
    """
    db_src = RoseVisionDb()
    db_src.conn(self.src_spec)
    try:
        db_dst = RoseVisionDb()
        db_dst.conn(self.dst_spec)
        try:
            self._sync_all(db_src, db_dst)
        finally:
            db_dst.close()
    finally:
        db_src.close()


def _sync_all(self, db_src, db_dst):
    """Select the candidate products and sync them one transaction each."""
    # Candidate records.
    select_ids = unicode.format(
        u'SELECT idproducts FROM products WHERE {0}',
        u' AND '.join(self.cond)).encode('utf-8')
    idproducts_list = [
        int(val[0]) for val in db_src.query(select_ids).fetch_row(maxrows=0)
    ]

    self.tot = len(idproducts_list)
    self.progress = 0

    db_dst.execute('SET AUTOCOMMIT=0')

    for pid_src in idproducts_list:
        self.progress += 1
        record = db_src.query(
            str.format('SELECT * FROM products WHERE idproducts={0}',
                       pid_src)).fetch_row(how=1)[0]

        db_dst.start_transaction()
        try:
            self._sync_product(db_src, db_dst, record)
            db_dst.commit()
        except:
            # Bare except is deliberate: roll back on any failure (including
            # interrupts) and re-raise unchanged.
            db_dst.rollback()
            raise


def _sync_product(self, db_src, db_dst, record):
    """Upsert one source ``record`` into the destination, then sync its price and images."""
    # NOTE(review): values are interpolated straight into the SQL text; a
    # model or region containing a double quote would break these queries.
    lookup_sql = str.format(
        'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
        'AND region="{2}"', record['brand_id'], record['model'],
        record['region'])

    rs = db_dst.query(lookup_sql)
    pid_dst = int(rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None

    # The destination assigns its own primary key.
    entry = {k: record[k] for k in record if k != 'idproducts'}

    price = process_price(record['price'], record['region'])
    if price:
        entry['price_rev'] = price['price']
        entry['currency_rev'] = price['currency']

    # Pass every non-empty text column through process_text.
    for field in ('details', 'description', 'name', 'category', 'extra'):
        if entry[field]:
            entry[field] = self.process_text(unicodify(entry[field]))

    # ``is not None`` so an existing row with id 0 is updated, not duplicated.
    if pid_dst is not None:
        db_dst.update(entry, 'products',
                      str.format('idproducts={0}', pid_dst))
    else:
        db_dst.insert(entry, 'products')
        # Read back the id of the freshly inserted row.
        pid_dst = int(db_dst.query(lookup_sql).fetch_row()[0][0])

    # Record the price in the history table if needed.
    if price:
        self._sync_price_history(db_dst, pid_dst, record, price)

    # Handle the image information.
    self._sync_images(db_src, db_dst, record)


def _sync_price_history(self, db_dst, pid_dst, record, price):
    """Append a price-history row unless the latest one has the same price and currency."""
    record_price = db_dst.query(
        str.format(
            'SELECT price,currency FROM products_price_history '
            'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
            pid_dst)).fetch_row(how=1)

    unchanged = (record_price and
                 float(record_price[0]['price']) == price['price'] and
                 record_price[0]['currency'] == price['currency'])
    if unchanged:
        return

    db_dst.insert(
        {
            'idproducts': pid_dst,
            'date': record['update_time'],
            'brand_id': record['brand_id'],
            'region': record['region'],
            'model': record['model'],
            'price': price['price'],
            'currency': price['currency']
        }, 'products_price_history')


def _sync_images(self, db_src, db_dst, record):
    """Copy missing ``images_store`` rows and ``products_image`` links for the record's brand/model."""
    rows = db_src.query(
        str.format(
            'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
            'WHERE brand_id={0} AND model="{1}"', record['brand_id'],
            record['model'])).fetch_row(maxrows=0, how=1)
    image_record = {val['checksum']: val for val in rows}
    checksum_src = set(image_record)

    # Complete images_store: add every checksum that is not present yet.
    for checksum in checksum_src:
        already_stored = db_dst.query(
            str.format(
                'SELECT checksum FROM images_store WHERE checksum="{0}"',
                checksum)).num_rows() != 0
        if already_stored:
            continue
        image = image_record[checksum]
        db_dst.insert(
            {
                'checksum': checksum,
                'brand_id': image['brand_id'],
                'url': image['url'],
                'path': image['path'],
                'width': image['width'],
                'height': image['height'],
                'format': image['format']
            }, 'images_store')

    # Complete the destination products_image table with missing checksums.
    checksum_dst = {
        val[0]
        for val in db_dst.query(
            str.format(
                'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                record['brand_id'], record['model'])).fetch_row(maxrows=0)
    }
    for checksum in checksum_src - checksum_dst:
        db_dst.insert(
            {
                'checksum': checksum,
                'brand_id': record['brand_id'],
                'model': record['model']
            }, 'products_image')