class Fetchor(object): """ Fetch product by 1. merchant 2. pid limit: product last stock time """ db_config = DEJA_FASHION_MYSQL_CONFIG_PRODUCT def __init__(self): self.conn = MySQL(self.db_config) pass def fetch_by_pid(self, pid_list=[-1]): print json.dumps(pid_list)[1:-1] fetch_sql = u""" select p.*, m.bid as merchant from deja_product as p , merchant as m where p.spider_name = m.spider_name and pid in ({pid_list}) """.format( pid_list=json.dumps(pid_list)[1:-1] ) print fetch_sql return self.conn.fetch_rows(fetch_sql) def fetch_by_merchant(self, merchant_list=[-1]): print merchant_list print json.dumps(merchant_list)[1:-1] # huanan said that if a product doesn't have detail_images, I leave it alone. fetch_sql = u""" select p.*, m.bid as merchant from deja_product as p , merchant as m where p.spider_name = m.spider_name and detail_images is not NULL and detail_images not like '' and m.bid in ({merchant_list}) """.format( merchant_list=json.dumps(merchant_list)[1:-1] ) # the line above json.dumps(merchant_list) get a string, then slice it to get rid of "[" and "]" return self.conn.fetch_rows(fetch_sql) def valid_merchant_list(self): fetch_sql = u""" select id from brands where status = 1 """ conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST) return [result["id"] for result in conn.fetch_rows(fetch_sql)]
def _fetch_stock(self): pid = self.product_db['pid'] conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_PRODUCT) fetch_sql = ''' select * from deja_product_stock where pid = '{pid}' order by time desc limit 1 '''.format(pid=pid) results = conn.fetch_rows(fetch_sql) #get the most recently stock infomation, then generate stock. self.stock = Stock(results) pass
def valid_merchant_list(self): fetch_sql = u""" select id from brands where status = 1 """ conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST) return [result["id"] for result in conn.fetch_rows(fetch_sql)]
class Updator(object): def __init__(self): self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST) self.all_pid_in_algorithm_list = self.get_all_pid_in_algorithm_list() def get_all_pid_in_algorithm_list(self): pid_list = list() sql = 'select pid from algorithm.product' result = self.conn_algorithm.fetch_rows(sql) for row in result: pid_list.append(row['pid'].lower().strip()) return pid_list def is_exist(self, product): pid = product.format_dict['pid'] if pid.lower().strip() in self.all_pid_in_algorithm_list: return True else: return False def insert(self, product): INSERT_SQL = u'''insert into algorithm.product set pid = '{pid}' , product_code = '{product_code}' , merchant = {merchant} , description = '{description}', group_id = '{group_id}', stock_info = '{stock_info}' , original_price = {original_price}, price ={price} , currency = '{currency}' , natural_currency = '{natural_currency}', natural_original_price = '{natural_original_price}', natural_price = '{natural_price}', shop_url ='{shop_url}' , detail_images = '{detail_images}' , white_index = '{white_index}' , suitable_index = '{suitable_index}' , status = {status}, update_v = {update_v}, create_time = NOW();''' print INSERT_SQL.format(**product.format_dict) self.conn_algorithm.execute(INSERT_SQL.format(**product.format_dict)) def update(self, product): UPDATE_SQL = u'''update algorithm.product set product_code = '{product_code}' , merchant = {merchant} , description = '{description}', group_id = '{group_id}', stock_info = '{stock_info}' , original_price = {original_price}, price ={price} , currency = '{currency}' , natural_currency = '{natural_currency}', natural_original_price = '{natural_original_price}', natural_price = '{natural_price}', shop_url ='{shop_url}' , detail_images = '{detail_images}' , white_index = '{white_index}' , suitable_index = '{suitable_index}' , status = {status}, update_v = {update_v} where pid = '{pid}'; ''' print UPDATE_SQL.format(**product.format_dict) self.conn_algorithm.execute(UPDATE_SQL.format(**product.format_dict))
if result['update_time'] > valid_result['update_time']: valid_result = result return valid_result['id'] if __name__ == '__main__': conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST) product_code_sql = ''' select product_code,count(*) as c from product as p ,brands as b where tag_status in (1,2,10) and p.merchant = b.id and b.status = 1 and category in (17,18,19,20,21,585) and valid_status = 0 group by product_code having count(*)>1 ''' product_code_list = [result['product_code'] for result in conn.fetch_rows(product_code_sql)] sql = ''' select p.id,p.product_code,p.update_time,p.tag_status from product as p, brands as b where tag_status in (1,2,10) and p.merchant = b.id and b.status = 1 and product_code in ({product_code_list}) and valid_status = 0 and category in (17,18,19,20,21,585) '''.format(product_code_list=json.dumps(product_code_list)[1:-1]) results = conn.fetch_rows(sql) product_dict = dict() for result in results: if product_dict.get(result['product_code']) == None: product_dict[result['product_code']] = [result]
class Eliminate_Repeat(): def __init__(self): self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_ALGORITHM) self.conn_clawer = MySQL(DEJA_FASHION_MYSQL_CONFIG_CRAWLER) self.suitable_images_dic = dict() self.pids_need_to_hash_list = list() self.get_suitable_image_dic() self.get_results_pids_need_to_hash() self.email_new_synchronise_content = list() self.email_self_repeat_content = list() self.email_before_repeat_content = list() self.email_no_repeat_info = list() self.email_no_fingerprint_info = list() self.f_prefix = 'http://office.mozat.com/photos/product_images/' self.sql_update = ''' update algorithm.product set fingerprint = '{fp}' where pid = '{pid}' ''' self.sql_get_info = ''' SELECT b.id as bid, b.name as name, count(*) as num FROM algorithm.product as p, algorithm.brands as b where p.merchant = b.id and tag_status = {status} and p.update_time >= "{time}" group by merchant; ''' def get_results_pids_need_to_hash(self): results_pids_need_to_hash = self.conn_algorithm.fetch_rows(sql_get_pid_need_to_hash_algorithm) for row in results_pids_need_to_hash: self.pids_need_to_hash_list.append(row['pid']) def get_suitable_image_dic(self): result_suitable_images = self.conn_clawer.fetch_rows(sql_get_pid_suitable_images_dic_crawler) for row in result_suitable_images: self.suitable_images_dic[row['pid']] = row['suitable_images'] def start(self): self.get_new_synchronise_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.get_new_hash() self.get_no_fingerprint_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.eliminate_self_repeat() self.get_self_repeat_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.eliminate_with_before_repeat() self.get_before_repeate_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.change_status_for_tag() self.get_no_repeate_info() def get_new_synchronise_info(self): sql_new_synchronise = ''' SELECT b.id as bid, b.name as name, count(*) as num FROM algorithm.product as p, algorithm.brands as b where p.merchant = b.id and tag_status = -11 group by merchant; ''' result = self.conn_algorithm.fetch_rows(sql_new_synchronise) for row in result: self.email_new_synchronise_content.append([row['bid'], row['name'], row['num']]) def get_self_repeat_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-12, time=self.now)) for row in result: self.email_self_repeat_content.append([row['bid'], row['name'], row['num']]) def get_before_repeate_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-13, time=self.now)) for row in result: self.email_before_repeat_content.append([row['bid'], row['name'], row['num']]) def get_no_fingerprint_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-14, time=self.now)) for row in result: self.email_no_fingerprint_info.append([row['bid'], row['name'], row['num']]) def get_no_repeate_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=0, time=self.now)) for row in result: self.email_no_repeat_info.append([row['bid'], row['name'], row['num']]) def get_new_hash(self): sql_set_tag_status_minus_fourteen = ''' UPDATE algorithm.product set tag_status = -14 where pid = '{pid}'; ''' for pid_need_to_hash in self.pids_need_to_hash_list: if pid_need_to_hash in self.suitable_images_dic.keys(): f_suffix = self.suitable_images_dic[pid_need_to_hash] full_url = self.f_prefix + f_suffix hash = self.get_hash(full_url) fp = json.dumps([hash]) print "pid:", pid_need_to_hash, " hash:", hash, " url", full_url self.conn_algorithm.execute(self.sql_update.format(fp=fp, pid=pid_need_to_hash)) else: self.conn_algorithm.execute(sql_set_tag_status_minus_fourteen.format(pid=pid_need_to_hash)) def get_hash(self, url): fname = url.split('/')[-1] try: download_img(fname, url) except: pass image = Image.open(fname) hash = dhash(image, 12) os.remove(fname) return hash def eliminate_self_repeat(self): sql_get_fingerprint = ''' select pid, fingerprint from algorithm.product where tag_status = -11 ''' sql_change_tag_staus_to_minus_twelve = ''' update algorithm.product set tag_status = -12 where pid = '{pid}' ''' fingerprint_result = self.conn_algorithm.fetch_rows(sql_get_fingerprint) fingerprint_dic = dict() for fingerprint_row in fingerprint_result: fingerprint_dic[fingerprint_row['fingerprint']] = [] for fingerprint_row in fingerprint_result: fingerprint_dic[fingerprint_row['fingerprint']].append(fingerprint_row['pid']) counter = 0 for key in fingerprint_dic.keys(): if len(fingerprint_dic[key]) > 1: for pid in fingerprint_dic[key]: self.conn_algorithm.execute(sql_change_tag_staus_to_minus_twelve.format(pid=pid)) print key, fingerprint_dic[key] counter += len(fingerprint_dic[key]) print "self_repeat", counter def eliminate_with_before_repeat(self): sql_get_fingerprint_status_minus_eleven = ''' select pid, fingerprint from algorithm.product where tag_status = -11 ''' sql_get_fingerprint_before = ''' select pid, fingerprint from algorithm.product where valid_status = 0 and fingerprint not like '[]' and tag_status not in (-11, -12, -13) ''' sql_change_tag_staus_to_minus_third = ''' update algorithm.product set tag_status = -13 where pid = '{pid}' ''' fingerprint_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven) fingerprint_result_before = self.conn_algorithm.fetch_rows(sql_get_fingerprint_before) fingerprint_dic_status_minus_eleven = dict() for fingerprint_row in fingerprint_result_status_minus_eleven: fingerprint_dic_status_minus_eleven[json.loads(fingerprint_row['fingerprint'])[0]] = fingerprint_row['pid'] fingerprint_before_list = list() for row in fingerprint_result_before: fingerprint_before_list.append(json.loads(row['fingerprint'])[0]) counter = 0 for fingerprint in fingerprint_dic_status_minus_eleven.keys(): if fingerprint in fingerprint_before_list: counter += 1 self.conn_algorithm.execute(sql_change_tag_staus_to_minus_third.format(pid=fingerprint_dic_status_minus_eleven[fingerprint])) print fingerprint, fingerprint_dic_status_minus_eleven[fingerprint] print "repeat_before", counter def change_status_for_tag(self): sql_get_fingerprint_status_minus_eleven = ''' select pid from algorithm.product where tag_status = -14 ''' sql_update_tag_status_to_zero = ''' update algorithm.product set tag_status = 0 where pid = '{pid}' ''' pid_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven) pid_list_status_minus_eleven = list() for pid_row in pid_result_status_minus_eleven: pid_list_status_minus_eleven.append(pid_row['pid']) for pid in pid_list_status_minus_eleven: self.conn_algorithm.execute(sql_update_tag_status_to_zero.format(pid=pid))