def aggregate_brand_stats(self):
    self.brand_loss = self.load_brand_loss()
    self.brand_risk = self.compute_brand_risk()

    brand_groups = self.bigtable.groupby('brand')
    brand_df = brand_groups.agg(
        brandid=pd.NamedAgg(column='brandid', aggfunc='first'),
        mill_count=pd.NamedAgg(column='umlid', aggfunc='count'),
        # Count certified mills per brand. A direct value_counts() lookup
        # would raise KeyError for brands with no certified mills, so use a
        # boolean sum instead.
        rspo_mill_count=pd.NamedAgg(
            column='cert',
            aggfunc=lambda x: (x == 'RSPO Certified').sum()),
        unique_parent_co=pd.NamedAgg(
            column='prnt_comp', aggfunc=lambda x: len(x.unique())),
        unique_group_name=pd.NamedAgg(
            column='group_name', aggfunc=lambda x: len(x.unique()))
    )
    brand_df['nonrspo_mill_count'] = (brand_df['mill_count']
                                      - brand_df['rspo_mill_count'])

    brand_cols = ['brandid', 'brand', 'country', 'rspo_member_since',
                  'external_link', 'description_attribution', 'description']
    unique_brands_df = self.brands[brand_cols].drop_duplicates()

    brand_df1 = brand_df.merge(unique_brands_df, on='brandid', how='left')
    brand_df2 = self.brand_loss.merge(self.brand_risk, on='brandid',
                                      how='left')
    self.brand_df = brand_df1.merge(brand_df2, on='brandid', how='left')
    self.brand_df = self.brand_df.sort_values(by='mill_count',
                                              ascending=False)
    logger.info("Aggregated brands data shape: %s" % str(self.brand_df.shape))
def write_json(mills, path):
    try:
        with open(path, 'w') as f:
            f.write(json.dumps(mills))
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
def webhook():
    """
    Receive a git webhook payload and handle the default build type.
    :return:
    """
    ret = request.data.decode('utf-8')
    data = json.loads(ret)

    # Type of git event
    event = data['object_kind']
    logger.info("event: %s" % event)

    branch = data['ref']
    branch_name = branch if branch.find('/') < 0 else branch[branch.rfind('/') + 1:]
    logger.info('branch_name: %s' % branch_name)

    git_url = data['project']['url']
    pro_name = "CI__" \
        + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
        + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name

    if event == 'push':
        if branch_name.startswith('release-'):
            create_build(data)
        elif branch_name.startswith('develop') and (request.args.get('dev') == '1'
                                                    or request.args.get('dev') == 'true'):
            create_build(data)
        else:
            logger.error('Unhandled branch type: [%s]!' % branch_name)
    elif event == 'tag_push':
        create_build(data)
    else:
        logger.error("Unhandled event type: [%s]!" % event)

    logger.info('Finished creating project [%s]!' % pro_name + '\n'
                + 'Branch/tag: [%s]' % branch_name)
    return 'Hello Webhook!'
def build_uml_boundaries_data(output_file_path, input_file_path, radius, res):
    if os.path.exists(output_file_path):
        logger.info("Reading UML boundaries data from local geojson file.")
        uml_gdf = gpd.read_file(output_file_path)
    else:
        logger.info("Started reading mills data from json.")
        uml_df = pd.read_json(input_file_path, orient='index')

        # Convert to GeoDataFrame with point geometries built from lat/lon.
        uml_gdf = gpd.GeoDataFrame(
            uml_df[['umlid', 'latitude', 'longitude']],
            geometry=gpd.points_from_xy(uml_df.longitude, uml_df.latitude))

        # Set CRS initially to epsg:4326 (lat/lon in degrees).
        uml_gdf.set_crs(epsg=4326, inplace=True)

        # Convert to CRS epsg:3395 (a metric projection) so the buffer radius
        # is in meters, create the buffer, then convert back to epsg:4326.
        uml_gdf.to_crs('epsg:3395', inplace=True)
        uml_gdf['geometry'] = uml_gdf.buffer(radius, resolution=res)
        uml_gdf.to_crs('epsg:4326', inplace=True)

        # Write geodataframe out to geojson.
        write_geojson(uml_gdf, output_file_path)

    return uml_gdf
def build_uml_data(output_path, mills_api_url, request_params):
    res = {}
    if os.path.exists(output_path):
        logger.info("Reading UML mills from local file.")
        try:
            with open(output_path, 'r') as f:
                res = json.load(f)
        except Exception as e:
            logger.error("Failed to read UML file: %s" % e)
    else:
        try:
            mills_dict = {}

            # Request mills from opendata.arcgis.com
            req = requests.get(mills_api_url, params=request_params)
            res_json = json.loads(req.text)

            # Handle empty response or missing mills data. Return early so
            # the lookup below does not raise a KeyError.
            if 'features' not in res_json or len(res_json['features']) == 0:
                logger.error('Missing mills data in API response.')
                return pd.DataFrame()

            # Extract mills properties from response JSON
            mills = res_json['features']
            mills_dict = {
                x["properties"]["objectid"]: x["properties"] for x in mills
            }

            # Keep mills in the requested country and rename columns.
            column_mapper = {'Group_Name': 'group_name', 'id': 'umlid'}
            for k, v in mills_dict.items():
                if v['country'] in request_params['country']:
                    for old, new in column_mapper.items():
                        v[new] = v.pop(old)
                    res[k] = v

            write_json(res, output_path)
        except Exception as e:
            logger.error("Failed to read UML mills from API: %s" % e)

    return pd.DataFrame.from_dict(res, orient='index')
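# Illustrative usage sketch for build_uml_data(). The URL and parameters
# below are hypothetical placeholders (the real values live in the project
# configuration); they only document the expected shape of the arguments,
# in particular that request_params carries the 'country' filter used above.
# This helper is not called by the pipeline.
def _example_build_uml_data():
    mills_api_url = 'https://opendata.arcgis.com/datasets/<dataset-id>.geojson'  # hypothetical
    request_params = {'country': 'Indonesia'}  # hypothetical
    return build_uml_data('output/umls.json', mills_api_url, request_params)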
def create_build(data, project_type='default'):
    """
    Create a Jenkins job for the push payload and trigger a build.
    :param data: webhook payload
    :param project_type: 'default', 'jdk7', 'jdk8' or 'npm'
    :return:
    """
    git_url = data['project']['url']
    branch = data['ref']
    branch_name = branch if branch.find(
        '/') < 0 else branch[branch.rfind('/') + 1:]
    pro_name = "CI__" \
        + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
        + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name

    if project_type == 'jdk7':
        build_name = 'JDK_7u79'
        config = get_simple_maven_config(url=git_url, branch=branch,
                                         jdk_version=build_name)
    elif project_type == 'jdk8':
        build_name = 'JDK_8u112'
        config = get_simple_maven_config(url=git_url, branch=branch,
                                         jdk_version=build_name)
    elif project_type == 'npm':
        config = get_simple_npm_config(url=git_url, branch=branch)
    elif not project_type or project_type == 'default':
        config = get_simple_default_config(url=git_url, branch=branch)
    else:
        # Without a config there is nothing to create, so return here rather
        # than fail later with an undefined `config`.
        logger.error('Invalid project type: [%s]' % project_type)
        return

    # Create the job if it does not exist yet, retrying on Jenkins errors.
    s = 0
    while s < 20:
        s += 1
        try:
            if not server.get_job_name(pro_name):
                server.create_job(pro_name, config)
                logger.info('New project, created job [%s]!' % pro_name)
            else:
                logger.info('Job [%s] already exists, starting build!' % pro_name)
            break
        except jenkins.JenkinsException as e:
            continue

    # Trigger the build, retrying on errors.
    while s < 30:
        s += 1
        try:
            logger.info("Building project [%s]..." % pro_name)
            server.build_job(pro_name)
            logger.info("Project [%s] build triggered!" % pro_name)
            break
        except Exception as e:
            logger.error("Project [%s] build failed!\n%s"
                         % (pro_name, traceback.format_exc()))
            continue

    logger.info('Built project [%s]: ' % pro_name
                + 'branch/tag: [%s], ' % branch_name
                + 'retries: [%s]' % (s - 2))
def webhook_jdk_npm(j_n):
    """
    Receive a git webhook payload and handle jdk/npm project types based on
    the branch and url.
    :param j_n: project type, e.g. 'jdk7', 'jdk8' or 'npm'
    :return:
    """
    ret = request.data.decode('utf-8')
    data = json.loads(ret)

    # Type of git event
    event = data['object_kind']
    branch = data['ref']
    branch_name = branch if branch.find('/') < 0 else branch[branch.rfind('/') + 1:]
    git_url = data['project']['url']
    pro_name = "CI__" \
        + git_url[git_url.rfind(':') + 1:git_url.rfind("/")] + "__" \
        + git_url[git_url.rindex('/') + 1:git_url.rindex(".git")] + "__" + branch_name

    if event == 'push':
        if branch_name.startswith('release-'):
            create_build(data, j_n)
        elif branch_name.startswith('develop') and (request.args.get('dev') == '1'
                                                    or request.args.get('dev') == 'true'):
            create_build(data, j_n)
        else:
            logger.info('Unhandled branch type: [%s]!' % branch_name)
            raise Exception('Unhandled branch type: [%s]!' % branch_name)
    elif event == 'tag_push':
        if j_n.startswith('jdk') or j_n.startswith('npm'):
            create_build(data, j_n)
    else:
        logger.info("Unhandled event type: [%s]!" % event)
        raise Exception("Unhandled event type: [%s]!" % event)

    logger.info('Finished building project [%s]!' % pro_name + '\n'
                + 'Branch/tag: [%s]' % branch_name)
    return 'Hello Webhook!'
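# Wiring sketch (assumptions, not shown in this section): the handlers above
# rely on a module-level Flask `request`, a `logger`, and a python-jenkins
# `server` connection, and are presumably registered as routes roughly as
# below. The URL paths, Jenkins URL, and credentials are hypothetical; the
# real setup lives elsewhere in the project, so this is left commented out.
#
# from flask import Flask, request
# import jenkins
# import logging
#
# app = Flask(__name__)
# logger = logging.getLogger(__name__)
# server = jenkins.Jenkins('http://jenkins.example.com',
#                          username='admin', password='secret')
#
# app.add_url_rule('/webhook', view_func=webhook, methods=['POST'])
# app.add_url_rule('/webhook/<j_n>', view_func=webhook_jdk_npm,
#                  methods=['POST'])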
def write_geojson(gdf, path):
    try:
        gdf.to_file(path, driver='GeoJSON')
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
def write_df(df, path, index=False):
    try:
        df.to_csv(path, index=index)
        logger.info('Completed writing %s' % path)
    except Exception as e:
        logger.error('Failed writing %s: %s' % (path, e))
                         'risk_score_current', 'risk_score_future']]

def write_uniquebrands(self):
    self.brand_df.to_csv(self.out_brands, index=False)

def write_brand_mill_matches(self):
    self.brands[['brandid', 'umlid']].to_csv(self.out_matches, index=False)


if __name__ == '__main__':
    try:
        os.mkdir(OUTPUT_DIR)
    except FileExistsError:
        logger.info('Output directory already exists.')

    uml_df = load_uml_data()
    logger.info("UML data shape: %s" % str(uml_df.shape))

    brand_df = load_brand_data()
    logger.info("Brand data shape: %s" % str(brand_df.shape))

    ##
    # Input: umls.json, output: boundaries.geojson
    # This code should read the output/umls.json file, use EE to
    # calculate the polygon shapes (and probably water/land/intersection areas
    # when available), then write the output to geojson. Return geopandas df
    # with UML ID, lat/lon, and polygon shapes.
    uml_boundaries_geodf = load_uml_boundaries_data()
    logger.info("UML boundaries data shape: %s"
                % str(uml_boundaries_geodf.shape))
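# Sketch of how load_uml_boundaries_data() plausibly wraps
# build_uml_boundaries_data() above; the actual wrapper is defined elsewhere
# in the project, and the buffer radius and resolution values here are
# hypothetical placeholders, not the project's configured values, so this is
# left commented out.
#
# def load_uml_boundaries_data():
#     return build_uml_boundaries_data(
#         output_file_path=os.path.join(OUTPUT_DIR, 'boundaries.geojson'),
#         input_file_path=os.path.join(OUTPUT_DIR, 'umls.json'),
#         radius=50000,   # hypothetical: 50 km buffer, in meters (epsg:3395)
#         res=16)         # hypothetical: segments per quarter circle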
def build_brand_data(input_path, input_brand_path, input_new_matches_path,
                     output_path):
    res = None
    if os.path.exists(output_path):
        res = pd.read_csv(output_path)
        logger.info("Reading brand data from local CSV file.")
    else:
        logger.info("Started parsing brand data from TSV.")
        df = pd.read_csv(input_path, sep='\t')

        # Drop null rows and mills not in Indonesia.
        df = df[df['Country'].notnull()]
        df = df[df['Country'] == 'indonesia']

        # Keep wanted columns
        df = df[[
            'idx', 'UMLID', 'Consumer Company', 'Mill Name', 'Mill Company',
            'Parent Company', 'Province', 'District', 'RSPO'
        ]]

        # Rename columns
        mapper = {
            'idx': 'idx',
            'UMLID': 'umlid',
            'Consumer Company': 'brand',
            'Mill Name': 'mill_name',
            'Mill Company': 'group_name',
            'Parent Company': 'prnt_comp',
            'Province': 'state',
            'District': 'sub_state',
            'RSPO': 'rspo_model'
        }
        df = df.rename(columns=mapper)
        df.reset_index(drop=True, inplace=True)

        # Create df1 where each row has a company and mill idx
        df1 = df[df['brand'].notnull()].loc[:, ['idx', 'brand']]

        # Create df2 where each row has a uml and mill idx, mill info
        df2 = df[df['umlid'].notnull()]

        # Merge and filter unique id/company tuples
        dfm = df1.merge(df2, on='idx', how='left')
        dfm = dfm[(dfm['brand_x'].notnull()) & (dfm['umlid'].notnull())]

        # Clean up merged dataset
        dfm.reset_index(drop=True, inplace=True)
        dfm.drop_duplicates(subset=['brand_x', 'umlid'], inplace=True)
        dfm.drop(columns=['brand_y', 'idx'], inplace=True)
        dfm.rename(columns={'brand_x': 'brand'}, inplace=True)

        # Bring in new match dataset
        dfnew = pd.read_csv(input_new_matches_path)

        # Keep wanted columns
        dfnew = dfnew[[
            'UMLID', 'Consumer Company', 'Mill Name', 'Mill Company',
            'Parent Company', 'Province', 'District', 'RSPO'
        ]]

        # Rename columns
        del mapper['idx']
        dfnew = dfnew.rename(columns=mapper)
        dfnew.reset_index(drop=True, inplace=True)
        dfnew.drop_duplicates(subset=['brand', 'umlid'], inplace=True)

        # Concatenate datasets
        dfm = pd.concat([dfm, dfnew])
        dfm.drop_duplicates(subset=['brand', 'umlid'], inplace=True)

        # Rename brands
        brand_mapper = {
            'ferrero': 'Ferrero',
            'kellog': 'Kellogg Company',
            'pepsico': 'PepsiCo',
            'frieslandcampina': 'Royal FrieslandCampina N.V.',
            'johnson and johnson': 'Johnson & Johnson',
            'general mills': 'General Mills, Inc',
            'hershey': 'The Hershey Company',
            'loreal': "L'Oreal",
            'procter and gamble': 'The Procter & Gamble Company',
            'colgate palmolive': 'Colgate-Palmolive Company',
            'nestle': 'Nestlé',
            'mars': 'Mars, Incorporated',
            'unilever': 'Unilever'
        }
        for old, new in brand_mapper.items():
            dfm['brand'] = dfm['brand'].replace(old, new)

        # Merge brand info.
        df3 = pd.read_csv(input_brand_path)
        df3.rename(columns={'name': 'brand', 'id': 'brandid'}, inplace=True)
        dft = df3.merge(dfm, on='brand', how='right')

        res = dft
        write_df(res, output_path, index=False)

    return res
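# Toy illustration (not part of the pipeline) of the idx-merge step above: in
# the source TSV, a brand row and its mill rows share an 'idx' key, so
# brand-only rows (df1) are joined to mill rows (df2) on 'idx' to yield one
# row per (brand, umlid) pair. All values below are made up, and this helper
# is never called by the pipeline.
def _example_idx_merge():
    df = pd.DataFrame({
        'idx': [10, 10, 10],
        'brand': ['Acme Foods', None, None],        # brand appears on one row
        'umlid': [None, 'PO1000001', 'PO1000002']   # mills on the other rows
    })
    df1 = df[df['brand'].notnull()].loc[:, ['idx', 'brand']]
    df2 = df[df['umlid'].notnull()]
    dfm = df1.merge(df2, on='idx', how='left')
    # Two rows result: ('Acme Foods', 'PO1000001'), ('Acme Foods', 'PO1000002')
    return dfm[['brand_x', 'umlid']].rename(columns={'brand_x': 'brand'})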
def build_risk_data(input_file_path, output_file_path, id_col,
                    years=[2018, 2019]):
    risk_df = None
    if os.path.exists(output_file_path):
        risk_df = pd.read_csv(output_file_path)
        logger.info("Reading risk data from local csv file.")
    else:
        logger.info("Started reading loss data from csv.")
        loss_df = pd.read_csv(input_file_path)

        # Create a new column that is the z-score for the tree loss
        # proportion of forest.
        loss_df['past_risk_z'] = get_z(loss_df,
                                       'treeloss_sum_proportion_of_forest')

        # Create a new column that is the risk (1-5) associated with the
        # z-score of past tree loss.
        loss_df['risk_score_past'] = get_risk_from_z(loss_df, 'past_risk_z')

        # Create a new column that is the mean treeloss for specified years.
        mean_col = 'mean_loss_'
        for year in years:
            mean_col += str(year)
        col_list = ['treeloss_' + str(year) for year in years]
        loss_df[mean_col] = loss_df.loc[:, col_list].mean(axis=1)

        # Create a new column that is the mean treeloss as a proportion
        # of forest.
        mean_prop_sqrt_col = mean_col + '_proportion_sqrt'
        loss_df[mean_prop_sqrt_col] = np.sqrt(loss_df[mean_col]
                                              / loss_df['forest_area'])

        # Create a new column that is the z-score for the mean treeloss as a
        # proportion of forest.
        current_z_col = mean_prop_sqrt_col + "_z"
        loss_df[current_z_col] = get_z(loss_df, mean_prop_sqrt_col)

        # Convert z-score to risk (1-5)
        loss_df['risk_score_current'] = get_risk_from_z(loss_df,
                                                        current_z_col)

        # Create a new column that is the z-score for the remaining
        # tree cover proportion.
        loss_df['remaining_forest_z'] = get_z(
            loss_df, 'remaining_proportion_of_forest')

        # Create a new column that is 0.5*remaining proportion of forest
        # z-score, and 0.5*z-score for the mean current treeloss proportion
        # of forest.
        loss_df['future_risk_z'] = 0.5*loss_df['remaining_forest_z'] + \
            0.5*loss_df[current_z_col]

        # Create a new column that is the risk (1-5) associated with the
        # future risk z-score.
        loss_df['risk_score_future'] = get_risk_from_z(loss_df,
                                                       'future_risk_z')

        # risk_df includes UMLid and risk_score columns only
        risk_df = loss_df.loc[:, [
            id_col, 'risk_score_current', 'risk_score_past',
            'risk_score_future'
        ]]

        # Write out risk_df to CSV
        write_df(risk_df, output_file_path, index=False)

    return risk_df
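# Illustrative sketches of the helpers used above. The project's actual
# get_z()/get_risk_from_z() are defined elsewhere and may differ; these
# minimal versions only document the assumed contract (get_z() returns a
# column's z-score, get_risk_from_z() bins a z-score column into a 1-5 risk
# score; the bin edges below are hypothetical), so they are left commented
# out to avoid shadowing the real implementations.
#
# def get_z(df, col):
#     return (df[col] - df[col].mean()) / df[col].std()
#
# def get_risk_from_z(df, z_col):
#     bins = [-np.inf, -1.5, -0.5, 0.5, 1.5, np.inf]   # hypothetical edges
#     return pd.cut(df[z_col], bins=bins, labels=[1, 2, 3, 4, 5]).astype(int)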
def build_loss_data(input_file_path, output_file_path, GFC_DATASET_NAME,
                    id_col, credentials, area_factor=1):
    loss_data = None
    if os.path.exists(output_file_path):
        loss_data = pd.read_csv(output_file_path)
        logger.info("Reading loss data from {}.".format(output_file_path))
    else:
        # Earth Engine Initialization
        ee.Initialize(credentials)

        logger.info("Computing loss and area for geometries from {}.".format(
            input_file_path))
        logger.info("Loading GFC data.")

        # Load the Global Forest Change dataset as a GEE image
        gfc_img = ee.Image(GFC_DATASET_NAME)

        # Open geojson file and convert data to Earth Engine Feature
        # Collection.
        with open(input_file_path) as f:
            data = json.load(f)
        geoms = ee.FeatureCollection(data['features'])

        # Compute cumulative tree cover loss per geometry across **all**
        # lossyears.
        # NOTE: The resulting sum is a decimal number because a weighted
        # reduction is performed:
        # https://developers.google.com/earth-engine/guides/reducers_weighting.
        # The sum is a weighted aggregation of the bitmap property "loss,"
        # which is either 0 or 1. We then convert to an area using the
        # area_factor parameter.
        logger.info("Computing tree cover loss sum.")
        lossdict = reduce_sum(gfc_img, 'loss', geoms)

        # Store area info in a dataframe.
        column_names = [id_col, "treeloss_sum"]
        rows = []
        for area in lossdict:
            rows.append([
                area['properties'][id_col],
                area_factor * area['properties']['sum']
            ])
        loss_data = pd.DataFrame(columns=column_names, data=rows)

        # Compute land area within each geometric boundary and add a column
        # to the data frame. Compute histogram of datamask layer per mill
        # area.
        logger.info("Computing areas of land and forest.")
        datamask_bins = (1, 2, 1)  # 1 bin of [1,2)
        landTypedict = reduce_hist(gfc_img, 'datamask', geoms, datamask_bins)
        logger.info("Land finished.")

        # Extract land area for each mill and add to dataframe.
        land_areas = []
        for area in landTypedict:
            land_areas.append(area_factor
                              * area["properties"]['histogram'][0][1])
        loss_data['land_area'] = land_areas

        # Compute forested area for each area and add a column to dataframe.
        # Compute the area where treecover2000 is greater than or equal
        # to 30%.
        treecover_bins = (30, 101, 1)  # 1 bin of [30,101)
        treecoverdict = reduce_hist(gfc_img, 'treecover2000', geoms,
                                    treecover_bins)

        # Extract the area for each area boundary and add to dataframe.
        treecover2000_area = []
        for area in treecoverdict:
            treecover2000_area.append(area_factor
                                      * area["properties"]['histogram'][0][1])
        loss_data['forest_area'] = treecover2000_area

        # Compute cumulative tree cover loss area per area per year.
        # Add a column to the data frame for each year.
        logger.info("Computing yearly tree cover loss.")
        lossyears = list(range(1, 20))
        lossyear_bins = (1, 20, 19)  # 19 bins of 1 each from 1-19
        lossyeardict = reduce_hist(gfc_img, 'lossyear', geoms, lossyear_bins)
        for i, year in enumerate(lossyears):
            col_name = "treeloss_20" + str(year).zfill(2)
            loss = []
            for area in lossyeardict:
                loss.append(area_factor
                            * area['properties']['histogram'][i][1])
            loss_data[col_name] = loss
        logger.info("Yearly tree cover loss computation complete.")

        # Compute the total tree cover loss for each mill as a proportion of
        # land area and add to dataframe.
        loss_data['treeloss_sum_proportion_of_land'] = (
            loss_data['treeloss_sum'] / loss_data['land_area'])

        # Compute the total tree cover loss for each mill as a proportion of
        # forest in 2000 and add to dataframe.
        loss_data['treeloss_sum_proportion_of_forest'] = (
            loss_data['treeloss_sum'] / loss_data['forest_area'])

        # Compute the proportion of forest area that is remaining
        # (1 - proportion of forest lost).
        loss_data['remaining_proportion_of_forest'] = (
            1 - loss_data['treeloss_sum_proportion_of_forest'])

        logger.info("Writing tree cover loss data to file.")
        write_df(loss_data, output_file_path, index=False)

    return loss_data
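# Illustrative sketches of the Earth Engine reducer helpers used above. The
# project's actual reduce_sum()/reduce_hist() are defined elsewhere; these
# versions only document the assumed contract: each returns the list of
# GeoJSON-style features from reduceRegions(), so callers can read
# feature['properties']['sum'] or feature['properties']['histogram']. The
# 30 m scale matches the GFC dataset resolution but is an assumption here,
# so the sketches are left commented out to avoid shadowing the real helpers.
#
# def reduce_sum(img, band, geoms, scale=30):
#     reduced = img.select(band).reduceRegions(
#         collection=geoms, reducer=ee.Reducer.sum(), scale=scale)
#     return reduced.getInfo()['features']
#
# def reduce_hist(img, band, geoms, bins, scale=30):
#     # bins is (min, max, steps), matching ee.Reducer.fixedHistogram().
#     reduced = img.select(band).reduceRegions(
#         collection=geoms,
#         reducer=ee.Reducer.fixedHistogram(bins[0], bins[1], bins[2]),
#         scale=scale)
#     return reduced.getInfo()['features']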