def __init__(self):
    """Create the logger, load configuration data, and open the ODPS connection."""
    # Logger and config must be assigned before get_odps_conn() runs,
    # because the connection helper reads both attributes.
    self.logging = LogFactory()
    self.config_data = Config().config_data
    self.conn = self.get_odps_conn()
def main():
    """Run every configured query against the configured texts, print and save the report."""
    config = Config()
    # Hoist the text corpus out of the loop; it does not depend on the query.
    texts = config.get_texts()
    # str.join avoids the quadratic cost of repeated string concatenation,
    # and str(...) is the idiomatic spelling of __str__().
    report = "".join(str(Finder(query, texts)) for query in config.get_queries())
    print(report)
    save(report, config.get_output())
def __init__(self):
    """Wire up collaborators and compute yesterday's partition date."""
    self.cuslabelcluster = CusLabelCluster()
    self.logging = LogFactory()
    self.config = Config().config_data
    self.odps = ODPSdb()
    # Partition key: yesterday's date formatted as YYYYMMDD.
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    self.bdp_date = yesterday.strftime("%Y%m%d")
class CusLabelDistance:
    """Compute per-distribution condensed cosine distance matrices over
    customer-label vectors."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """Prepare the raw label frame for distance computation.

        :param data: DataFrame with 'str_code', 'distrib_code' and numeric label columns
        :return: (row-normalized feature matrix, array of store codes)
        """
        index = data['str_code'].values
        data.drop(columns=['str_code', 'distrib_code'], axis=1, inplace=True)
        # Missing labels count as zero.
        data.fillna(0, inplace=True)
        # Row-wise centering. The original `data - m_row[:, None]` tuple-indexes
        # a pandas Series, which raises on modern pandas; subtracting along the
        # index axis is the supported equivalent.
        m_row = data.mean(axis=1)
        data = data.sub(m_row, axis=0)
        # L2-normalize each row so cosine distances are well scaled.
        scaler = Normalizer()
        x_train = scaler.fit_transform(data)
        return x_train, index

    def distance(self, x_train):
        """Return the condensed cosine distance matrix.

        For an m-row input, the distance of pair (i, j) sits at position
        m * i + j - ((i + 2) * (i + 1)) // 2 of the condensed vector.
        """
        return pdist(x_train, metric='cosine')

    def main(self):
        """Compute distance matrices separately per distribution partition.

        :return: (list of condensed matrices, list of index arrays, list of distrib codes)
        """
        bdp_date = (datetime.date.today() +
                    datetime.timedelta(days=-1)).strftime("%Y%m%d")
        sql_cmd = self.config.get('GET_CUSTOM_LABEL').format(bdp_date)
        data = self.odps.get_data(sql=sql_cmd)
        d_condensed, index, distrib = [], [], []
        for distrib_code in set(data.distrib_code):
            x_train, idx = self.preprocessing(
                data=data[data['distrib_code'] == distrib_code])
            d_condensed.append(self.distance(x_train))
            index.append(idx)
            distrib.append(distrib_code)
        return d_condensed, index, distrib
class DistribStyDistance():
    """Per-distribution cosine distances between stores based on their
    style-code text feature."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """Vectorize and L2-normalize the style-code lists.

        :param data: DataFrame with 'str_code' and 'sty_code_list' columns
        :return: (sparse normalized feature matrix, array of store codes)
        """
        store_codes = data['str_code'].values
        # TF-IDF vectorization of the style-code text.
        vectorizer = TfidfVectorizer()
        features = vectorizer.fit_transform(data['sty_code_list'])
        # Scale every row to unit L2 norm.
        features = Normalizer().fit_transform(features)
        return features, store_codes

    def distance(self, x_train):
        """Return the condensed cosine distance matrix.

        For an m-row input, pair (i, j) lives at position
        m * i + j - ((i + 2) * (i + 1)) // 2 of the condensed vector.
        """
        return pdist(x_train.toarray(), metric='cosine')

    def main(self):
        """Compute the distance matrix separately for every distribution code.

        :return: (list of condensed matrices, list of index arrays, list of distrib codes)
        """
        data = self.odps.get_data(sql=self.config.get('GET_STR_STY_LABEL'))
        distances, indexes, distribs = [], [], []
        for code in set(data.distrib_code):
            subset = data[data['distrib_code'] == code]
            features, store_codes = self.preprocessing(data=subset)
            distances.append(self.distance(features))
            indexes.append(store_codes)
            distribs.append(code)
        return distances, indexes, distribs
def __init__(self):
    """Initialize logging, configuration data, and the database connection."""
    self.logging = LogFactory()
    self.config_data = Config().config_data
    # The connection is created last: get_conn() reads config_data.
    self.conn = self.get_conn()
class PostGreSQL:
    """Thin wrapper around a psycopg2 connection configured from app config."""

    def __init__(self):
        self.logging = LogFactory()
        self.config_data = Config().config_data
        self.conn = self.get_conn()

    def get_conn(self):
        """Open a connection using the POSTGRESQL section of the config."""
        pg_config = self.config_data.get('POSTGRESQL')
        conn = psycopg2.connect(host=pg_config['HOST'],
                                port=pg_config['PORT'],
                                database=pg_config['DBNAME'],
                                user=pg_config['USER'],
                                password=pg_config['PASSWD'])
        return conn

    def get_pg_records(self, query, param=None):
        """Run a SELECT against the PostgreSQL database and return all rows.

        :param query: SQL query string
        :param param: optional query parameters
        :return: list of result tuples ([] on failure)
        """
        cursor = self.conn.cursor()
        try:
            if param is None:
                cursor.execute(query)
            else:
                cursor.execute(query, param)
            records = cursor.fetchall()
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        except Exception:
            records = []
            self.logging.error(traceback.format_exc())
        cursor.close()
        return records

    def update_pg(self, query, param=None):
        """Execute an INSERT/UPDATE/DELETE statement.

        :param query: SQL statement
        :param param: optional statement parameters
        :return: 1 on success (committed), 0 on failure (rolled back)
        """
        cursor = self.conn.cursor()
        try:
            if param is None:
                cursor.execute(query)
            else:
                cursor.execute(query, param)
            cursor.close()
            self.conn.commit()
            return 1
        except Exception:
            self.logging.error(traceback.format_exc())
            cursor.close()
            self.conn.rollback()
            return 0

    def pandas_readsql(self, sql, columns=None):
        """Read a query result into a pandas DataFrame.

        :param sql: query string
        :param columns: optional list of columns to select
        :return: DataFrame
        """
        return pd.read_sql(sql, con=self.conn, columns=columns)

    def close(self):
        """Close the underlying connection."""
        self.conn.close()
def __init__(self):
    """Set up configuration data, logging, and ODPS access."""
    self.config = Config().config_data
    self.logging = LogFactory()
    self.odps = ODPSdb()
class CusLabelCluster:
    """Cluster stores within each distribution by customer-label similarity."""

    def __init__(self):
        self.config = Config().config_data
        self.logging = LogFactory()
        self.odps = ODPSdb()

    def preprocessing(self, data):
        """Prepare raw label data: drop key columns, fill NaNs, row-center,
        L2-normalize.

        :param data: DataFrame with 'str_code', 'distrib_code' and numeric label columns
        :return: (row-normalized feature matrix, Series of store codes)
        """
        index = data['str_code']
        data.drop(columns=['str_code', 'distrib_code'], axis=1, inplace=True)
        # Missing labels count as zero.
        data.fillna(0, inplace=True)
        # Row-wise centering. The original `data - m_row[:, None]` tuple-indexes
        # a pandas Series, which raises on modern pandas; subtracting along the
        # index axis is the supported equivalent.
        m_row = data.mean(axis=1)
        data = data.sub(m_row, axis=0)
        # L2-normalize each row.
        scaler = Normalizer()
        x_train = scaler.fit_transform(data)
        return x_train, index

    def cluster(self, x_train):
        """Pick the best k-means clustering by silhouette score.

        The candidate cluster-count range is derived from the store count:
        roughly 30-200 stores per group, with 2-30 groups overall.

        :param x_train: feature matrix, one row per store
        :return: array of cluster labels per row, or None when no candidate
            beat a silhouette score of 0 (callers should be aware of this case)
        """
        silhouette_best = 0
        cluster_label = None
        cluster_num = None
        n = max(len(x_train) // 200, 2)
        m = min(len(x_train) // 30, 30) + 1
        self.logging.info("聚类数目最小值:{0},最大值:{1}".format(n, m))
        # Use a coarser step when the candidate range is wide.
        step = 1 if m < 11 else 2
        for i in range(n, m, step):
            clusterid, error, nfound = kcluster(x_train, nclusters=i,
                                                dist='u', npass=1000)
            silhouette_score = metrics.silhouette_score(x_train, clusterid,
                                                        metric='cosine')
            self.logging.info("聚类数目:{0},聚类得分:{1}".format(i, silhouette_score))
            self.logging.info("找到解的次数:{0}".format(nfound))
            if silhouette_best < silhouette_score:
                silhouette_best = silhouette_score
                cluster_label = clusterid
                cluster_num = i
        self.logging.info("最优聚类数目:{0},最优轮廓系数得分:{1}".format(
            cluster_num, silhouette_best))
        return cluster_label

    def main(self):
        """Cluster each distribution's stores.

        :return: DataFrame indexed by store code with a single 'label' column
        """
        bdp_date = (datetime.date.today() +
                    datetime.timedelta(days=-1)).strftime("%Y%m%d")
        sql_cmd = self.config.get('GET_CUSTOM_LABEL').format(bdp_date)
        data = self.odps.get_data(sql=sql_cmd)
        cluster_label, index = [], []
        for distrib_code in set(data.distrib_code):
            x_train, idx = self.preprocessing(
                data=data[data['distrib_code'] == distrib_code])
            cluster_label.extend(self.cluster(x_train=x_train))
            index.extend(idx.values)
        cluster_df = pd.DataFrame(cluster_label, index=index,
                                  columns=['label'], dtype=object)
        return cluster_df
class ODPSdb:
    """ODPS database access: connect, query into pandas, write back."""

    def __init__(self):
        """Create the logger, load configuration, and connect to ODPS."""
        self.logging = LogFactory()
        self.config_data = Config().config_data
        self.conn = self.get_odps_conn()

    def get_odps_conn(self):
        """Connect to ODPS using the ODPS section of the config.

        :return: ODPS connection
        :raises: re-raises any connection error after logging it
        """
        odps_config = self.config_data.get('ODPS')
        try:
            conn = ODPS(access_id=odps_config['USER'],
                        secret_access_key=odps_config['PASSWD'],
                        project=odps_config['DBNAME'],
                        endpoint=odps_config['URL'])
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted.
        except Exception:
            self.logging.error(traceback.format_exc())
            raise
        return conn

    def get_data(self, sql):
        """Run a SQL query and return the result as a pandas DataFrame.

        :param sql: query string
        :return: DataFrame (possibly empty), or None when the query fails
        """
        self.logging.info("查询数据:" + sql)
        try:
            with self.conn.execute_sql(sql).open_reader() as reader:
                data = reader.to_pandas()
                if len(data) == 0:
                    self.logging.error("数据为空!")
        except Exception:
            self.logging.error(traceback.format_exc())
            return None
        self.logging.info("read_data success!")
        return data

    def write_to_db(self, data, tablename, if_partition=1):
        """Persist a DataFrame into an ODPS table.

        :param data: DataFrame to write; a 'dw_date' timestamp column is added
        :param tablename: target table name
        :param if_partition: truthy to write into yesterday's ds= partition
        :return: 1 on success, 0 on failure
        """
        # Guard clause: refuse to write empty data.
        if data is None or data.empty:
            self.logging.error("{0} 写入数据库失败!数据为空!".format(tablename))
            return 0
        try:
            data['dw_date'] = datetime.datetime.now()
            if if_partition:
                bdp_date = (datetime.date.today() +
                            datetime.timedelta(days=-1)).strftime("%Y%m%d")
                DataFrame(data).persist(name=tablename, overwrite=True,
                                        partition="ds='{}'".format(bdp_date),
                                        create_partition=True,
                                        odps=self.conn, cast=True)
            else:
                DataFrame(data).persist(name=tablename, overwrite=True,
                                        odps=self.conn, cast=True)
        except Exception:
            self.logging.error(traceback.format_exc())
            return 0
        self.logging.info("{0} 成功写入数据库!".format(tablename))
        return 1
class NewStoreCluster:
    """Assign a cluster label to every store, filling in stores that the
    clustering step left unlabeled by matching on geographic attributes."""

    def __init__(self):
        # Collaborators: the label-clustering step, logging, config, ODPS.
        self.cuslabelcluster = CusLabelCluster()
        self.logging = LogFactory()
        self.config = Config().config_data
        self.odps = ODPSdb()
        # Partition key: yesterday's date as YYYYMMDD.
        self.bdp_date = (datetime.date.today() +
                         datetime.timedelta(days=-1)).strftime("%Y%m%d")

    def main(self):
        """Merge cluster labels onto the full store list and backfill missing
        labels via a geographic fallback chain.

        Fallback order for an unlabeled store: (1) CBD type + city,
        (2) city, (3) province region + city level, (4) province region.
        Each step picks the label with the highest store count.

        :return: full_store DataFrame with an integer 'label' column
        """
        cluster_df = self.cuslabelcluster.main()
        sql_cmd = self.config.get('GET_GEOGRAPHY_LABEL').format(self.bdp_date)
        full_store = self.odps.get_data(sql=sql_cmd)
        # Left-merge so stores without a cluster label get NaN in 'label'.
        full_store = pd.merge(full_store, cluster_df, how='left',
                              left_on=['str_code'], right_index=True)
        # Store counts per (group, attribute) combination — used to find the
        # most frequent label for each geographic attribute.
        # CBD type + city
        feature_matrix1 = full_store.groupby(
            by=['distrib_code', 'label', 'city_name', 'cbd_type_name'],
            as_index=False).agg(feature1=('str_code', 'count'))
        # city
        feature_matrix2 = full_store.groupby(
            by=['distrib_code', 'label', 'city_name'],
            as_index=False).agg(feature2=('str_code', 'count'))
        # province region + city level
        feature_matrix3 = full_store.groupby(
            by=['distrib_code', 'label', 'str_org4_name', 'city_level'],
            as_index=False).agg(feature3=('str_code', 'count'))
        # province region
        feature_matrix4 = full_store.groupby(
            by=['distrib_code', 'label', 'str_org4_name'],
            as_index=False).agg(feature4=('str_code', 'count'))
        # NOTE(review): the lookups below filter only on the geographic
        # attributes, not on the row's distrib_code — a label from another
        # distribution can be picked. Confirm this is intended.
        for i, item in full_store[full_store['label'].isnull()].iterrows():
            cbd_type_name = item['cbd_type_name']
            city_name = item['city_name']
            city_level = item['city_level']
            str_org4_name = item['str_org4_name']
            # Step 1: CBD type + city; take the label with the highest count
            # (sort ascending, then pick the last row).
            if cbd_type_name is not None:
                data = feature_matrix1[
                    (feature_matrix1['cbd_type_name'] == cbd_type_name)
                    & (feature_matrix1['city_name'] == city_name)].sort_values(
                        by='feature1')
                if len(data) > 0:
                    full_store.loc[i, 'label'] = data.iloc[-1]['label']
                    continue
            # Step 2: city only.
            if city_name is not None:
                data = feature_matrix2[feature_matrix2['city_name'] ==
                                       city_name].sort_values(by='feature2')
                if len(data) > 0:
                    full_store.loc[i, 'label'] = data.iloc[-1]['label']
                    continue
            # Step 3: province region + city level.
            if city_level is not None:
                data = feature_matrix3[
                    (feature_matrix3['city_level'] == city_level)
                    & (feature_matrix3['str_org4_name'] == str_org4_name
                       )].sort_values(by='feature3')
                if len(data) > 0:
                    full_store.loc[i, 'label'] = data.iloc[-1]['label']
                    continue
            # Step 4: province region only.
            if str_org4_name is not None:
                data = feature_matrix4[feature_matrix4['str_org4_name'] ==
                                       str_org4_name].sort_values(by='feature4')
                if len(data) > 0:
                    full_store.loc[i, 'label'] = data.iloc[-1]['label']
                    continue
            # No fallback matched — log and leave the label as NaN.
            # NOTE(review): the astype('int') below will fail if any label is
            # still NaN at this point — confirm unmatched stores cannot occur.
            self.logging.error('{0}未匹配到!'.format(item['str_code']))
        full_store['label'] = full_store['label'].astype('int')
        return full_store