def structure_rank_create(environment):
    """
    Create the structured rank mapping and optimize the rank.
    :param environment:
    :return:
    """
    structure_rank_create_time = time.time()
    sparkTask.structureMapRank_task(environment, rank_output_path)
    utils.rm_mkdir(current_rank_version + "/result", constant.local_sign)
    # Download the structured rank
    command = "hadoop fs -text " + rank_output_path + "/structureMapRank/part* > " + current_rank_version + constant.local_structure_rank_path
    utils.execute_command(command, shell=True)
    # Optimize the structured rank
    optimize_command = "java -Xms4096M -Xmx7096M -jar " + constant.java_jar_path + "structure-optimize-1.0-SNAPSHOT.jar " + current_rank_version + constant.local_structure_rank_path + " " + current_rank_version + constant.local_structure_optimize_path
    utils.execute_command(optimize_command, shell=True)
    # Upload the structured rank
    utils.rm_mkdir(rank_output_path + "/structureOptimizeRank/", constant.cluster_sign)
    upload_rank_structure_status_command = "hadoop fs -put " + current_rank_version + constant.local_structure_optimize_path + " " + rank_output_path + "/structureOptimizeRank/"
    utils.execute_command(upload_rank_structure_status_command, shell=True)
    logger.info("spark structure_rank_create finished,used time:%s s", str(time.time() - structure_rank_create_time))
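
# A minimal sketch of what utils.execute_command could look like, assuming it
# is a thin subprocess wrapper (the real implementation lives in utils and is
# not shown here). Raising on a non-zero exit code keeps a failed hadoop or
# java step from silently corrupting the downstream rank files. The name is
# hypothetical to avoid shadowing the real helper.
import subprocess

def execute_command_sketch(command, shell=True):
    process = subprocess.Popen(command, shell=shell,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        logger.error("command failed (exit %s): %s", process.returncode, stderr)
        raise RuntimeError("command failed: " + command)
    return stdout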
def main(environment='beta'):
    rank_begin_time = time.time()
    logger.info("rank work flow begin")
    time_version = datetime.datetime.now().strftime('%Y%m%d_%H:%M')
    global zeus_poi_path
    global zeus_myself_path
    global zeus_structure_path
    global zeus_polygon_path
    global rank_output_path
    global current_rank_version
    logger.info("current environment:" + environment)
    logger.info("zeus_poi_path:" + zeus_poi_path)
    logger.info("zeus_myself_path:" + zeus_myself_path)
    logger.info("zeus_structure_path:" + zeus_structure_path)
    logger.info("zeus_polygon_path:" + zeus_polygon_path)
    logger.info("rank_output_path:" + rank_output_path)
    current_rank_version = constant.rank_version_path + time_version
    utils.rm_mkdir(current_rank_version, constant.local_sign)
    parse_excel_upload()
    feature_poi_create(environment)
    download_feature_poi()
    rank_create()
    rank_combine_upload(environment)
    brand_rank_create(environment)
    structure_rank_create(environment)
    rank_optimization(environment)
    logger.info("rank work flow finished,total time:{time}s,environment:{environment}".format(
        time=str(time.time() - rank_begin_time), environment=environment))
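
# A sketch of a timing helper that could replace the repeated
# "begin = time.time(); ...; logger.info(used time)" bookkeeping found in each
# step function of this workflow. Hypothetical addition, not part of the
# original module; it assumes the module-level logger used above.
from contextlib import contextmanager

@contextmanager
def timed_step(name):
    begin = time.time()
    try:
        yield
    finally:
        logger.info("%s finished,used time:%s s", name, str(time.time() - begin))

# Usage sketch:
#     with timed_step("featurePoi download"):
#         utils.execute_command(command, shell=True)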
def brand_rank_create(environment):
    """
    Create the brand rank and upload it.
    :param environment:
    :return:
    """
    brand_rank_create_time = time.time()
    sparkTask.brandFeature_task(environment, rank_output_path)
    utils.rm_mkdir(current_rank_version + constant.local_brandfeaturePoi_path, constant.local_sign)
    command = "hadoop fs -get " + rank_output_path + "/brandFeatureValue/*-feature " + current_rank_version + constant.local_brandfeaturePoi_path
    utils.execute_command(command, shell=True)
    utils.rm_mkdir(current_rank_version + constant.brand_rank_path, constant.local_sign)
    # brand predict
    brandRankPrediction.files_rank_cluster(current_rank_version + constant.local_brandfeaturePoi_path,
                                           current_rank_version + constant.brand_rank_path)
    upload_brand_rank_command = "hadoop fs -put " + current_rank_version + constant.brand_rank_path + " " + rank_output_path
    utils.execute_command(upload_brand_rank_command, shell=True)
    logger.info("brand rank create and upload finished,used time:%s s", str(time.time() - brand_rank_create_time))
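
# Hypothetical retry wrapper for the "hadoop fs -put" uploads used above and
# elsewhere in this workflow; transient HDFS failures are common enough that a
# bounded retry is a typical mitigation. It assumes utils.execute_command
# raises on failure (its real behavior is not shown); max_attempts and the
# backoff are illustrative, not project config.
def upload_with_retry(local_path, cluster_path, max_attempts=3):
    command = "hadoop fs -put " + local_path + " " + cluster_path
    for attempt in range(1, max_attempts + 1):
        try:
            utils.execute_command(command, shell=True)
            return
        except Exception:
            logger.warning("upload attempt %s/%s failed: %s", attempt, max_attempts, command)
            if attempt == max_attempts:
                raise
            time.sleep(5 * attempt)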
def download_feature_poi():
    """
    Download feature POI files in two ways (by category, by city).
    :return:
    """
    # Download category feature value data
    download_begin_time = time.time()
    utils.rm_mkdir(current_rank_version + constant.local_featurePoi_path, constant.local_sign)
    command = "hadoop fs -get " + rank_output_path + "/featureValue/*-feature " + current_rank_version + constant.local_featurePoi_path
    utils.execute_command(command, shell=True)
    logger.info("featurePoi download finished,used time:%s s", str(time.time() - download_begin_time))
    # Download city feature value data
    download_begin_time = time.time()
    utils.rm_mkdir(current_rank_version + constant.local_city_featurePoi_path, constant.local_sign)
    command = "hadoop fs -get " + rank_output_path + "/cityFeatureValue/*-feature " + current_rank_version + constant.local_city_featurePoi_path
    utils.execute_command(command, shell=True)
    logger.info("cityfeaturePoi download finished,used time:%s s", str(time.time() - download_begin_time))
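
# A sketch factoring the duplicated mkdir/get/log pattern in
# download_feature_poi into a single helper. The directory names mirror the
# two calls above; the helper itself is illustrative and not part of the
# original module.
def download_feature_dir(cluster_dir, local_path):
    begin = time.time()
    utils.rm_mkdir(local_path, constant.local_sign)
    command = "hadoop fs -get " + rank_output_path + "/" + cluster_dir + "/*-feature " + local_path
    utils.execute_command(command, shell=True)
    logger.info("%s download finished,used time:%s s", cluster_dir, str(time.time() - begin))

# Usage sketch:
#     download_feature_dir("featureValue", current_rank_version + constant.local_featurePoi_path)
#     download_feature_dir("cityFeatureValue", current_rank_version + constant.local_city_featurePoi_path)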
def parse_excel_upload():
    """
    Parse the feature thresholds and upload them.
    :return:
    """
    parse_excel_time = time.time()
    download_rank_config_command = 'wget "http://svn.sogou-inc.com/svn/go2map/data/poi/edit/trunk/rank/poi-rank.xlsx" --user=svnsogoumap --password="******" -O /search/odin/taoyongbo/rank/input/poi-rank.xlsx'
    utils.execute_command(download_rank_config_command, shell=True)
    # Parse the excel file to generate the feature threshold and weight config files
    parse_command = "java -Xms800M -Xmx2g -jar " + constant.java_jar_path + "excelparse.jar"
    utils.execute_command(parse_command, shell=True)
    logger.info("parse_excel finished,used time:%s s", str(time.time() - parse_excel_time))
    utils.rm_mkdir(current_rank_version + "/config/", constant.local_sign)
    copy_config_rank_command = "cp /search/odin/taoyongbo/rank/result/poi-threshold.txt /search/odin/taoyongbo/rank/result/poi-weight.txt " + current_rank_version + "/config/"
    utils.execute_command(copy_config_rank_command, shell=True)
    upload_config_time = time.time()
    utils.rm_mkdir(rank_output_path, constant.cluster_sign)
    utils.rm_mkdir(rank_output_path + "/config/", constant.cluster_sign)
    # Upload the feature threshold file
    upload_threshold_command = "hadoop fs -put " + current_rank_version + constant.poi_threshold_path + " " + rank_output_path + "/config/poi-threshold.txt"
    utils.execute_command(upload_threshold_command, shell=True)
    # Upload the weight file
    upload_weight_command = "hadoop fs -put " + current_rank_version + constant.weight_path + " " + rank_output_path + "/config/poiWeight.txt"
    utils.execute_command(upload_weight_command, shell=True)
    logger.info("upload_threshold finished,used time:%s s", str(time.time() - upload_config_time))
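
# A sketch of a pre-upload sanity check for the generated config files.
# Verifying that excelparse.jar actually produced non-empty threshold and
# weight files would fail fast instead of uploading empty configs; the helper
# is hypothetical, and the paths in the usage sketch reuse the constants from
# parse_excel_upload.
import os

def validate_config_file(path):
    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        raise RuntimeError("config file missing or empty: " + path)

# Usage sketch (before the hadoop uploads):
#     validate_config_file(current_rank_version + constant.poi_threshold_path)
#     validate_config_file(current_rank_version + constant.weight_path)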
def rank_create():
    """
    Produce the rank and back it up.
    :return:
    """
    # Split the feature file
    splitFeatureFile.split_file(current_rank_version)
    utils.rm_mkdir(current_rank_version + constant.local_multi_path, constant.local_sign)
    utils.rm_mkdir(current_rank_version + constant.hotCount_single_rank_path, constant.local_sign)
    utils.rm_mkdir(current_rank_version + constant.hitCount_single_rank_path, constant.local_sign)
    # Cluster the multi-dimension feature value files
    cluster_begin_time = time.time()
    rankPrediction.files_rank_cluster(current_rank_version + constant.local_split_featurePoi_path,
                                      current_rank_version + constant.local_multi_path,
                                      current_rank_version + constant.weight_path)
    logger.info("multi featurePoi cluster finished,used time:%s s", str(time.time() - cluster_begin_time))
    hotcount_cluster_begin_time = time.time()
    # Cluster the single-field feature value files
    poiRankCluster.files_rank_cluster(current_rank_version + constant.local_city_featurePoi_path,
                                      current_rank_version + constant.hotCount_single_rank_path,
                                      "single", "hotCount")
    logger.info("hotCount featurePoi cluster finished,used time:%s s", str(time.time() - hotcount_cluster_begin_time))
    hitcount_cluster_begin_time = time.time()
    poiRankCluster.files_rank_cluster(current_rank_version + constant.local_city_featurePoi_path,
                                      current_rank_version + constant.hitCount_single_rank_path,
                                      "single", "hitCount")
    logger.info("hitCount featurePoi cluster finished,used time:%s s", str(time.time() - hitcount_cluster_begin_time))
    # Merge the clustered part files into single rank files
    multi_rank_command = "cat " + current_rank_version + constant.local_multi_path + "*-rank > " + current_rank_version + "/multiRank"
    utils.execute_command(multi_rank_command, shell=True)
    hot_count_rank_command = "cat " + current_rank_version + constant.hotCount_single_rank_path + "*_rank > " + current_rank_version + "/hotCountRank"
    utils.execute_command(hot_count_rank_command, shell=True)
    hit_count_rank_command = "cat " + current_rank_version + constant.hitCount_single_rank_path + "*_rank > " + current_rank_version + "/hitCountRank"
    utils.execute_command(hit_count_rank_command, shell=True)
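
# Example entry point, a sketch assuming this module is executed directly.
# The optional first CLI argument selects the environment, defaulting to
# 'beta' to match main()'s signature.
if __name__ == "__main__":
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else 'beta')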