pass results = product_dict[product_same] for result in results: if result['id'] != valid_id(results): delete_list.append(result['id']) delete_list = list(set(delete_list)) if len(delete_list) == 0: pass update_sql = ''' update product set valid_status = 1 ,update_time = CURRENT_TIMESTAMP ,update_v = unix_timestamp()*1000 where id in ({id_list}) '''.format(id_list = json.dumps(delete_list)[1:-1]) print update_sql conn.execute(update_sql) # sql1 = ''' # select id, product_code from product as p1 where valid_status=1 # ''' # sql2 = ''' # select product_code from product as p1 where valid_status=0 # ''' # results1 = conn.fetch_rows(sql1) # results2 = conn.fetch_rows(sql2) # product_code_list = [] # recovery_list = [] # for item in results2:
class Updator(object): def __init__(self): self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST) self.all_pid_in_algorithm_list = self.get_all_pid_in_algorithm_list() def get_all_pid_in_algorithm_list(self): pid_list = list() sql = 'select pid from algorithm.product' result = self.conn_algorithm.fetch_rows(sql) for row in result: pid_list.append(row['pid'].lower().strip()) return pid_list def is_exist(self, product): pid = product.format_dict['pid'] if pid.lower().strip() in self.all_pid_in_algorithm_list: return True else: return False def insert(self, product): INSERT_SQL = u'''insert into algorithm.product set pid = '{pid}' , product_code = '{product_code}' , merchant = {merchant} , description = '{description}', group_id = '{group_id}', stock_info = '{stock_info}' , original_price = {original_price}, price ={price} , currency = '{currency}' , natural_currency = '{natural_currency}', natural_original_price = '{natural_original_price}', natural_price = '{natural_price}', shop_url ='{shop_url}' , detail_images = '{detail_images}' , white_index = '{white_index}' , suitable_index = '{suitable_index}' , status = {status}, update_v = {update_v}, create_time = NOW();''' print INSERT_SQL.format(**product.format_dict) self.conn_algorithm.execute(INSERT_SQL.format(**product.format_dict)) def update(self, product): UPDATE_SQL = u'''update algorithm.product set product_code = '{product_code}' , merchant = {merchant} , description = '{description}', group_id = '{group_id}', stock_info = '{stock_info}' , original_price = {original_price}, price ={price} , currency = '{currency}' , natural_currency = '{natural_currency}', natural_original_price = '{natural_original_price}', natural_price = '{natural_price}', shop_url ='{shop_url}' , detail_images = '{detail_images}' , white_index = '{white_index}' , suitable_index = '{suitable_index}' , status = {status}, update_v = {update_v} where pid = '{pid}'; ''' print UPDATE_SQL.format(**product.format_dict) self.conn_algorithm.execute(UPDATE_SQL.format(**product.format_dict))
class Eliminate_Repeat(): def __init__(self): self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_ALGORITHM) self.conn_clawer = MySQL(DEJA_FASHION_MYSQL_CONFIG_CRAWLER) self.suitable_images_dic = dict() self.pids_need_to_hash_list = list() self.get_suitable_image_dic() self.get_results_pids_need_to_hash() self.email_new_synchronise_content = list() self.email_self_repeat_content = list() self.email_before_repeat_content = list() self.email_no_repeat_info = list() self.email_no_fingerprint_info = list() self.f_prefix = 'http://office.mozat.com/photos/product_images/' self.sql_update = ''' update algorithm.product set fingerprint = '{fp}' where pid = '{pid}' ''' self.sql_get_info = ''' SELECT b.id as bid, b.name as name, count(*) as num FROM algorithm.product as p, algorithm.brands as b where p.merchant = b.id and tag_status = {status} and p.update_time >= "{time}" group by merchant; ''' def get_results_pids_need_to_hash(self): results_pids_need_to_hash = self.conn_algorithm.fetch_rows(sql_get_pid_need_to_hash_algorithm) for row in results_pids_need_to_hash: self.pids_need_to_hash_list.append(row['pid']) def get_suitable_image_dic(self): result_suitable_images = self.conn_clawer.fetch_rows(sql_get_pid_suitable_images_dic_crawler) for row in result_suitable_images: self.suitable_images_dic[row['pid']] = row['suitable_images'] def start(self): self.get_new_synchronise_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.get_new_hash() self.get_no_fingerprint_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.eliminate_self_repeat() self.get_self_repeat_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.eliminate_with_before_repeat() self.get_before_repeate_info() self.now = strftime("%Y-%m-%d %H:%M:%S") self.change_status_for_tag() self.get_no_repeate_info() def get_new_synchronise_info(self): sql_new_synchronise = ''' SELECT b.id as bid, b.name as name, count(*) as num FROM algorithm.product as p, algorithm.brands as b where p.merchant = b.id and tag_status = -11 group by merchant; ''' result = self.conn_algorithm.fetch_rows(sql_new_synchronise) for row in result: self.email_new_synchronise_content.append([row['bid'], row['name'], row['num']]) def get_self_repeat_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-12, time=self.now)) for row in result: self.email_self_repeat_content.append([row['bid'], row['name'], row['num']]) def get_before_repeate_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-13, time=self.now)) for row in result: self.email_before_repeat_content.append([row['bid'], row['name'], row['num']]) def get_no_fingerprint_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-14, time=self.now)) for row in result: self.email_no_fingerprint_info.append([row['bid'], row['name'], row['num']]) def get_no_repeate_info(self): result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=0, time=self.now)) for row in result: self.email_no_repeat_info.append([row['bid'], row['name'], row['num']]) def get_new_hash(self): sql_set_tag_status_minus_fourteen = ''' UPDATE algorithm.product set tag_status = -14 where pid = '{pid}'; ''' for pid_need_to_hash in self.pids_need_to_hash_list: if pid_need_to_hash in self.suitable_images_dic.keys(): f_suffix = self.suitable_images_dic[pid_need_to_hash] full_url = self.f_prefix + f_suffix hash = self.get_hash(full_url) fp = json.dumps([hash]) print "pid:", pid_need_to_hash, " hash:", hash, " url", full_url self.conn_algorithm.execute(self.sql_update.format(fp=fp, pid=pid_need_to_hash)) else: self.conn_algorithm.execute(sql_set_tag_status_minus_fourteen.format(pid=pid_need_to_hash)) def get_hash(self, url): fname = url.split('/')[-1] try: download_img(fname, url) except: pass image = Image.open(fname) hash = dhash(image, 12) os.remove(fname) return hash def eliminate_self_repeat(self): sql_get_fingerprint = ''' select pid, fingerprint from algorithm.product where tag_status = -11 ''' sql_change_tag_staus_to_minus_twelve = ''' update algorithm.product set tag_status = -12 where pid = '{pid}' ''' fingerprint_result = self.conn_algorithm.fetch_rows(sql_get_fingerprint) fingerprint_dic = dict() for fingerprint_row in fingerprint_result: fingerprint_dic[fingerprint_row['fingerprint']] = [] for fingerprint_row in fingerprint_result: fingerprint_dic[fingerprint_row['fingerprint']].append(fingerprint_row['pid']) counter = 0 for key in fingerprint_dic.keys(): if len(fingerprint_dic[key]) > 1: for pid in fingerprint_dic[key]: self.conn_algorithm.execute(sql_change_tag_staus_to_minus_twelve.format(pid=pid)) print key, fingerprint_dic[key] counter += len(fingerprint_dic[key]) print "self_repeat", counter def eliminate_with_before_repeat(self): sql_get_fingerprint_status_minus_eleven = ''' select pid, fingerprint from algorithm.product where tag_status = -11 ''' sql_get_fingerprint_before = ''' select pid, fingerprint from algorithm.product where valid_status = 0 and fingerprint not like '[]' and tag_status not in (-11, -12, -13) ''' sql_change_tag_staus_to_minus_third = ''' update algorithm.product set tag_status = -13 where pid = '{pid}' ''' fingerprint_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven) fingerprint_result_before = self.conn_algorithm.fetch_rows(sql_get_fingerprint_before) fingerprint_dic_status_minus_eleven = dict() for fingerprint_row in fingerprint_result_status_minus_eleven: fingerprint_dic_status_minus_eleven[json.loads(fingerprint_row['fingerprint'])[0]] = fingerprint_row['pid'] fingerprint_before_list = list() for row in fingerprint_result_before: fingerprint_before_list.append(json.loads(row['fingerprint'])[0]) counter = 0 for fingerprint in fingerprint_dic_status_minus_eleven.keys(): if fingerprint in fingerprint_before_list: counter += 1 self.conn_algorithm.execute(sql_change_tag_staus_to_minus_third.format(pid=fingerprint_dic_status_minus_eleven[fingerprint])) print fingerprint, fingerprint_dic_status_minus_eleven[fingerprint] print "repeat_before", counter def change_status_for_tag(self): sql_get_fingerprint_status_minus_eleven = ''' select pid from algorithm.product where tag_status = -14 ''' sql_update_tag_status_to_zero = ''' update algorithm.product set tag_status = 0 where pid = '{pid}' ''' pid_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven) pid_list_status_minus_eleven = list() for pid_row in pid_result_status_minus_eleven: pid_list_status_minus_eleven.append(pid_row['pid']) for pid in pid_list_status_minus_eleven: self.conn_algorithm.execute(sql_update_tag_status_to_zero.format(pid=pid))