def incidentsPerKm(dfAll):
    """Build a per-company table of incidents per 1000 km of regulated pipeline.

    Reads the pipeline-length spreadsheet, joins it to per-company incident
    counts derived from ``dfAll``, computes 'Incidents per 1000km', and
    appends a per-commodity average column for comparison.
    Returns the merged DataFrame.
    """
    km = pd.read_excel('./raw_data/pipeline_length.XLSX')
    # rows without a rounded KM figure cannot be normalized; drop them
    km = km[km['Regulated KM Rounded'].notnull()]
    km['Company Name'] = [name.strip() for name in km['Company Name']]
    km['Company Name'] = km['Company Name'].replace(company_rename())

    counts = (dfAll.groupby('Company')['Incident Number']
              .count()
              .reset_index()
              .rename(columns={'Incident Number': 'Incident Count'}))

    merged = counts.merge(km,
                          how='inner',
                          left_on='Company',
                          right_on='Company Name')
    for surplus in ['Company Name', 'Regulated KM', 'PipelineID']:
        del merged[surplus]

    merged['Incidents per 1000km'] = [
        round((count / length) * 1000, 0)
        for count, length in zip(merged['Incident Count'],
                                 merged['Regulated KM Rounded'])
    ]

    # average normalized rate within each commodity group
    averages = (merged.copy()
                .groupby(['Commodity'])['Incidents per 1000km']
                .mean()
                .reset_index()
                .rename(columns={'Incidents per 1000km': 'Avg per 1000km'}))

    merged = merged.merge(averages,
                          how='inner',
                          left_on='Commodity',
                          right_on='Commodity')
    merged['Avg per 1000km'] = merged['Avg per 1000km'].round(0)
    return merged
def process_french(df, fr):
    """Replace English condition text with French equivalents.

    Aligns the English dataset ``df`` with the French dataset ``fr`` on an
    instrument+condition key, derives EN->FR lookup tables via frenchSubsets,
    and applies them to the 'Location', 'Theme(s)' and 'Short Project Name'
    columns of ``df``. Returns the translated ``df``.
    """

    def _row_ids(frame):
        # "<instrument>_<condition>" key shared by both language files
        return [
            str(ins).strip() + '_' + str(cond).strip()
            for ins, cond in zip(frame['Instrument Number'],
                                 frame['Condition Number'])
        ]

    df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    en = df.copy()
    en = en[en['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    # map the French headers onto the English schema
    fr = fr.rename(
        columns={
            "Société": "Company",
            "Nom du projet": "Project Name",
            "Nom du projet court": "Short Project Name",
            "État du projet": "Project Status",
            "Instrument no": "Instrument Number",
            "Activité liée à l'instrument": "Instrument Activity",
            "Entrée en vigueur": "Effective Date",
            "Date de délivrance": "Issuance Date",
            "Date de réexamen": "Sunset Date",
            "État de l'instrument": "Instrument Status",
            "Lieu": "Location",
            "Condition No": "Condition Number",
            "Condition": "Condition",
            "État de condition": "Condition Status",
            "Étape de condition": "Condition Phase",
            "Type de Condition": "Condition Type",
            "Dépôt pour condition": "Condition Filing",
            "Thème(s)": "Theme(s)"
        })
    fr = fr[fr['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    fr['Company'] = fr['Company'].replace(company_rename())

    shared_cols = [
        'Location', 'Short Project Name', 'Theme(s)', 'Condition Number',
        'Instrument Number'
    ]
    en = normalize_text(en, shared_cols)
    fr = normalize_text(fr, shared_cols)

    fr['french id'] = _row_ids(fr)
    en['english id'] = _row_ids(en)
    fr = fr[['french id', 'Location', 'Short Project Name',
             'Theme(s)']].copy().reset_index(drop=True)

    joined = en.merge(fr,
                      how='inner',
                      left_on='english id',
                      right_on='french id',
                      suffixes=('_en', '_fr'))

    # EN -> FR replacement tables built from the joined rows
    project_lookup = frenchSubsets(joined, 'Short Project Name')
    theme_lookup = frenchSubsets(joined, 'Theme(s)')
    location_lookup = frenchSubsets(joined, 'Location')

    df['Location'] = df['Location'].replace(location_lookup)
    df['Theme(s)'] = df['Theme(s)'].replace(theme_lookup)
    df['Short Project Name'] = df['Short Project Name'].replace(project_lookup)
    return df
def process_incidents(remote=False, land=False, company_names=False,
                      companies=False, test=False, lang='en'):
    """Load, clean and export per-company incident data (EN or FR).

    Reads the incident dataset from the remote CER site, a test fixture, or a
    local file depending on the flags, then writes one JSON payload per
    company under ../incidents/company_data/<lang>/.

    Parameters:
        remote -- download the dataset from the CER website and cache it
        land -- accepted but unused in this version
        company_names -- print the distinct company names and continue
        companies -- optional list restricting which companies are exported
        test -- use test fixtures and skip writing JSON output
        lang -- 'en' or anything else for French
    Returns the (DataFrame, volume DataFrame, metadata) of the LAST company
    processed.
    """
    if remote:
        if lang == 'en':
            link = "https://www.cer-rec.gc.ca/en/safety-environment/industry-performance/interactive-pipeline/map/2020-12-31-incident-data.csv"
        else:
            link = "https://www.cer-rec.gc.ca/fr/securite-environnement/rendement-lindustrie/carte-interactive-pipelines/carte/2020-12-31-donnees-incidents.csv"
        print('downloading remote incidents file')
        df = pd.read_csv(link,
                         skiprows=1,
                         encoding="UTF-16",
                         error_bad_lines=False)
        df.to_csv("./raw_data/incidents.csv", index=False)
    elif test:
        print('reading test incidents file')
        if lang == 'en':
            df = pd.read_csv("./raw_data/test_data/incidents_en.csv",
                             skiprows=0,
                             encoding="UTF-8",
                             error_bad_lines=False)
            df = process_english(df)
        else:
            df = pd.read_csv("./raw_data/incidents_fr.csv",
                             skiprows=1,
                             encoding="UTF-16",
                             error_bad_lines=False)
            # NOTE(review): called with one argument, but the process_french
            # defined in this file takes (df, fr) — confirm which version of
            # process_french this build resolves to.
            df = process_french(df)
    else:
        print('reading local incidents file')
        if lang == 'en':
            print('starting english incidents...')
            df = pd.read_csv("./raw_data/incidents_en.csv",
                             skiprows=0,
                             encoding="UTF-8",
                             error_bad_lines=False)
            df = process_english(df)
        else:
            print('starting french incidents...')
            df = pd.read_csv("./raw_data/incidents_fr.csv",
                             skiprows=1,
                             encoding="UTF-16",
                             error_bad_lines=False)
            # NOTE(review): one-argument call — see note above.
            df = process_french(df)
    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())
    # non-numeric volume entries become NaN (treated below as "no release")
    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')
    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')
    # columns not needed by the front end
    for delete in [
            'Significant', 'Release Type', 'Nearest Populated Centre',
            'Reported Date'
    ]:
        del df[delete]
    if company_names:
        print(get_company_names(df['Company']))
    # industryTrend = changes(df, volume=True)
    # perKm = incidentsPerKm(df)
    perKm = None
    if companies:
        company_files = companies
    else:
        # NOTE(review): 'Trans-Northern Pipelines Inc.' appears twice in this
        # list, so that company is exported twice — confirm intentional.
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Enbridge Bakken Pipeline Company Inc.', 'Express Pipeline Ltd.',
            'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Plains Midstream Canada ULC', 'Genesis Pipeline Canada Ltd.',
            'Montreal Pipe Line Limited', 'Trans-Northern Pipelines Inc.',
            'Kingston Midstream Westspur Limited',
            'Many Islands Pipe Lines (Canada) Limited',
            'Vector Pipeline Limited Partnership',
            'Maritimes & Northeast Pipeline Management Ltd.'
        ]
    for company in company_files:
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        # keep only incidents with an actual release volume for the events list
        df_vol = df_c[~df_c['Approximate Volume Released'].isnull()].copy(
        ).reset_index(drop=True)
        thisCompanyData = {}
        if not df_vol.empty:
            # calculate metadata here, before non releases are filtered out
            meta = incidentMetaData(df, perKm, company, lang)
            # companyTrend = changes(df_vol, volume=False)
            # meta['trends'] = {"company": companyTrend, "industry": industryTrend}
            thisCompanyData['meta'] = meta
            del df_vol['Incident Types']
            del df_vol['Company']
            df_vol = optimizeJson(df_vol)
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            if not test:
                with open(
                        '../incidents/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
        else:
            # there are no product release incidents
            # NOTE(review): `meta` is not assigned in this branch, so if the
            # final company has no releases the returned meta is stale (or a
            # NameError on the very first company) — confirm callers' use.
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            thisCompanyData['meta'] = {"companyName": company}
            if not test:
                with open(
                        '../incidents/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
    return df_c, df_vol, meta
def process_english(df):
    """Clean and encode the English incident dataset.

    Normalizes substances into a fixed set of short product codes, converts
    provinces to region ids, encodes the multi-valued 'What Happened' /
    'Why It Happened' columns into lists of short ids (keeping a readable
    comma-joined copy), abbreviates status values, and applies the standard
    company renames and system ids. Returns the transformed DataFrame.
    """

    def _encode_multi(frame, column, codes):
        # split comma-separated entries, strip whitespace, map to short ids
        frame[column] = [
            [codes[part.strip()] for part in entry.split(",")]
            for entry in frame[column]
        ]
        return frame

    df = fix_columns(df)
    df['Substance'] = df['Substance'].replace(
        {'Butane': 'Natural Gas Liquids'})
    chosen_substances = [
        "Propane", "Natural Gas - Sweet", "Natural Gas - Sour", "Fuel Gas",
        "Lube Oil", "Crude Oil - Sweet", "Crude Oil - Synthetic",
        "Crude Oil - Sour", "Natural Gas Liquids", "Condensate",
        # "Sulphur Dioxide",
        "Diesel Fuel", "Gasoline"
    ]
    # anything outside the chosen set is bucketed as "Other"
    df['Substance'] = [
        substance if substance in chosen_substances else "Other"
        for substance in df['Substance']
    ]
    # custom codes for product
    df['Substance'] = df['Substance'].replace({
        'Propane': 'pro',
        'Natural Gas - Sweet': 'ngsweet',
        'Natural Gas - Sour': 'ngsour',
        'Fuel Gas': 'fgas',
        'Lube Oil': 'loil',
        'Crude Oil - Sweet': 'cosweet',
        'Crude Oil - Synthetic': 'sco',
        'Crude Oil - Sour': 'cosour',
        'Natural Gas Liquids': 'ngl',
        'Condensate': 'co',
        'Diesel Fuel': 'diesel',
        'Gasoline': 'gas'
    })
    df = idify(df, "Province", "region")

    what_codes = {
        "Defect and Deterioration": "dd",
        "Corrosion and Cracking": "cc",
        "Equipment Failure": "ef",
        "Incorrect Operation": "io",
        "External Interference": "ei",
        "Natural Force Damage": "nfd",
        "Other Causes": "oc",
        "To be determined": "tbd"
    }
    why_codes = {
        "Engineering and Planning": "ep",
        "Maintenance": "m",
        "Inadequate Procurement": "ip",
        "Tools and Equipment": "te",
        "Standards and Procedures": "sp",
        "Failure in communication": "fc",
        "Inadequate Supervision": "is",
        "Human Factors": "hf",
        "Natural or Environmental Forces": "ef",
        "To be determined": "tbd"
    }
    df = _encode_multi(df, "What Happened", what_codes)
    df = _encode_multi(df, "Why It Happened", why_codes)
    # readable comma-joined copies of the encoded lists
    df["what common"] = [", ".join(ids) for ids in df["What Happened"]]
    df["why common"] = [", ".join(ids) for ids in df["Why It Happened"]]

    df['Status'] = df['Status'].replace({
        "Closed": "c",
        "Initially Submitted": "is",
        "Submitted": "s"
    })
    df['Company'] = df['Company'].replace(company_rename())
    df = apply_system_id(df, "Company")
    return df
def process_remediation(sql=False, remote=True, companies=False, test=False,
                        save=True):
    """Load, clean and export per-company contaminated-site remediation data.

    Reads the contamination dataset (test fixture, remote CSV, or local
    cache), applies contaminant/land-use/status/activity id mappings,
    compacts the columns to short keys, and writes one JSON payload per
    company to ../data_output/remediation/.

    Parameters:
        sql -- passed through to get_data for the contaminant/pre-2018 queries
        remote -- download the dataset from the CER site and cache it locally
        companies -- optional list restricting which companies are exported
        test -- use the test fixture and skip writing output
        save -- write JSON output files when True (and not test)
    Returns (df, this_company_data) where this_company_data is the payload of
    the last company processed (empty dict if no companies were processed).
    """
    if test:
        print("reading test remediation test data")
        df = pd.read_csv(
            os.path.join(os.getcwd(), "raw_data", "test_data",
                         "remediation.csv"))
    elif remote:
        print("reading remote remediation file")
        df = pd.read_csv(
            "https://www.cer-rec.gc.ca/open/compliance/contamination.csv",
            encoding="latin-1",
            engine="python",
        )
        df.to_csv("./raw_data/remediation.csv")
    else:
        print("reading local remediation file")
        df = pd.read_csv("./raw_data/remediation.csv")

    contaminants = get_data(sql=sql,
                            script_loc=os.getcwd(),
                            query="remediationContaminants.sql",
                            db="dsql22cap")
    old = get_data(sql=sql,
                   script_loc=os.getcwd(),
                   query="remediation_pre_2018.sql",
                   db="dsql22cap")
    df = apply_contaminant_ids(df, contaminants)
    # FIX: identity comparison instead of "== None".
    # NOTE(review): float NaN is NOT caught by "is None" — confirm whether
    # apply_contaminant_ids leaves missing values as None or NaN.
    df["Contaminants at the Site"] = [["18"] if x is None else x
                                      for x in df["Contaminants at the Site"]]
    # FIX: direct boolean expression instead of "True if ... else False"
    df["Site Within 30 Meters Of Waterbody"] = [
        x == "Yes" for x in df["Site Within 30 Meters Of Waterbody"]
    ]
    df = normalize_text(df, [
        'Applicable Land Use', 'Site Status', 'Activity At Time Of Discovery',
        'Pipeline Name', 'Facility Name'
    ])

    # classify each site by whether a pipeline and/or facility is specified
    pipe_section = []
    na = "Not Specified"
    for pipe, section in zip(df['Pipeline Name'], df['Facility Name']):
        if pipe == na and section == na:
            pipe_section.append("ns")  # Not Specified
        elif pipe == na and section != na:
            pipe_section.append("f")  # Facility
        elif pipe != na and section == na:
            pipe_section.append("p")  # Pipeline
        elif pipe != na and section != na:
            pipe_section.append("pf")  # Pipeline and Facility
        else:
            # unreachable: the four branches above are exhaustive
            print("error here!")
    df["ps"] = pipe_section
    del df['Pipeline Name']
    del df['Facility Name']

    # add id's
    land_use_ids = {
        "developed land - industrial": "dli",
        "developed land - small commercial": "dls",
        "developed land - residential": "dlr",
        "barren land": "bl",
        "shrub land": "sl",
        "vegetative barren": "vb",
        "forests": "f",
        "Agricultural Cropland": "ac",
        "water / wetlands": "w",
        "Tundra / Native Prairie / Parks": "t",
        "agricultural land": "al",
        "protected area": "pa",
        "non-developed land": "ndl"
    }
    status_ids = {
        "monitored": "m",
        "post-remediation monitoring": "prm",
        "facility monitoring": "fm",
        "ongoing remediation": "or",
        "site assessment": "sa",
        "risk managed": "rm"
    }
    activity_ids = {
        "maintenance": "m",
        "operation": "o",
        "construction": "c",
        "abandonment": "a"
    }
    df = idify(df, "Applicable Land Use", land_use_ids)
    df = idify(df, "Province", "region")
    df = idify(df, "Site Status", status_ids)
    df = idify(df, "Activity At Time Of Discovery", activity_ids)
    df['Final Submission Date'] = pd.to_datetime(df['Final Submission Date'])
    df['y'] = df['Final Submission Date'].dt.year
    # normalize all missing values to np.nan so the identity checks below work
    df = df.fillna(value=np.nan)
    for ns in [
            'Applicable Land Use', 'Activity At Time Of Discovery',
            'Contaminants at the Site',
            'Initial Estimate of Contaminated Soil (m3)',
            'Site Within 30 Meters Of Waterbody', 'Site Status', 'Latitude',
            'Longitude'
    ]:
        # "in" uses identity first, so the np.nan singleton matches here
        df[ns] = [
            None if x in ["Not Specified", np.nan, "nan"] else x
            for x in df[ns]
        ]
    for numeric in [
            'Initial Estimate of Contaminated Soil (m3)', 'Latitude',
            'Longitude', 'y'
    ]:
        # -1 acts as the "missing" sentinel for numeric columns
        df[numeric] = df[numeric].replace(np.nan, int(-1))
    for int_numeric in ['y', 'Initial Estimate of Contaminated Soil (m3)']:
        df[int_numeric] = df[int_numeric].astype(int)
    df['loc'] = [[lat, long]
                 for lat, long in zip(df['Latitude'], df['Longitude'])]
    del df['Latitude']
    del df['Longitude']

    # compact column names for the JSON payload
    columns = {
        "Event ID": "id",
        "Site Status": "s",
        "Activity At Time Of Discovery": "a",
        "Province": "p",
        "Applicable Land Use": "use",
        "Contaminants at the Site": "c",
        "Initial Estimate of Contaminated Soil (m3)": "vol",
        "Site Within 30 Meters Of Waterbody": "w"
    }
    df = df.rename(columns=columns)
    for col in df:
        if col not in columns.values() and col not in [
                "Company Name", "Final Submission Date", "y", "ps", "loc"
        ]:
            del df[col]
    df['Company Name'] = df['Company Name'].replace(company_rename())
    df = apply_system_id(df, "Company Name")
    old["Company"] = old["Company"].replace(company_rename())
    old = apply_system_id(old, "Company")
    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    # FIX: defined up front so the return cannot NameError when
    # company_files is empty
    this_company_data = {}
    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company Name'] == company].copy().reset_index(
                drop=True)
            this_company_data = {}
            if not df_c.empty:
                this_company_data["meta"] = meta(df_c, company, old)
                this_company_data["build"] = True
                this_company_data["data"] = optimize_json(df_c)
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                this_company_data['data'] = df_c.to_dict(orient='records')
                this_company_data['meta'] = {"companyName": company}
                this_company_data["build"] = False
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        # FIX: narrow from a bare except; still re-raises after logging
        except Exception:
            print("remediation error: " + company)
            raise
    return df, this_company_data
def process_conditions(remote=False, nonStandard=True, company_names=False,
                       companies=False, test=False, lang='en', save=True):
    """Load, clean and export per-company project condition data (EN or FR).

    Reads the conditions dataset (remote, test fixture, or local), expands
    each condition row into one row per region in its 'Location' field, and
    writes one JSON payload per company (metadata + map regions) under
    ../conditions/company_data/<lang>/.

    Parameters:
        remote -- download conditions from the CER site
        nonStandard -- when False, filter out 'Standard' condition types
        company_names -- print distinct company names and continue
        companies -- optional list restricting which companies are exported
        test -- use test fixtures and skip writing output
        lang -- 'en' or anything else for French (merges the FR dataset)
        save -- write JSON output files when True (and not test)
    Returns the (df_c, shp, dfmeta, meta) of the LAST company processed.
    """
    if remote:
        print('downloading remote conditions file')
        if lang == 'en':
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            # French processing joins the EN file to the FR file
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            linkFR = 'https://www.cer-rec.gc.ca/ouvert/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            fr = pd.read_csv(linkFR,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = process_french(df, fr)
    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        if lang == 'en':
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            # English base file plus the French file for translation
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            fr = pd.read_csv('./raw_data/conditions_fr.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = process_french(df, fr)
    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])
    if not nonStandard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']
    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]
    for delete in delete_cols:
        del df[delete]
    # strip embedded newlines/quotes from company names before matching
    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())
    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    df['Theme(s)'] = df['Theme(s)'].replace({"nan": "No theme specified"})
    regions_map = import_simplified()
    df = add_links(df)
    if company_names:
        print(get_company_names(df['Company']))
    if companies:  # used to set one company for testing
        company_files = companies
    else:
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Express Pipeline Ltd.', 'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Many Islands Pipe Lines (Canada) Limited',
            'Maritimes & Northeast Pipeline Management Ltd.',
            'Vector Pipeline Limited Partnership',
            'Plains Midstream Canada ULC',
            'Enbridge Bakken Pipeline Company Inc.',
            'Genesis Pipeline Canada Ltd.', 'Montreal Pipe Line Limited',
            'Kingston Midstream Westspur Limited',
            'Aurora Pipeline Company Ltd'
        ]
    for company in company_files:
        thisCompanyData = {}
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        if not df_c.empty:
            # df_c = add_links(df_c, links)
            df_c['condition id'] = [
                str(ins) + '_' + str(cond) for ins, cond in zip(
                    df_c['Instrument Number'], df_c['Condition Number'])
            ]
            # expand each condition into one row per region listed in its
            # comma-separated 'Location' value
            expanded_locations = []
            for unique in df_c['condition id']:
                row = df_c[df_c['condition id'] == unique].copy().reset_index(
                    drop=True)
                locations = [x.split(',') for x in row['Location']]
                for region in locations[0]:
                    # each region looks like "<region> / <province>"
                    regionProvince = region.strip().split('/')
                    row['id'] = regionProvince[0].strip()
                    row['Flat Province'] = regionProvince[-1].strip()
                    expanded_locations.append(row.copy())
            df_all = pd.concat(expanded_locations,
                               axis=0,
                               sort=False,
                               ignore_index=True)
            # calculate metadata here
            dfmeta, meta = conditionMetaData(df_all, folder_name)
            meta["build"] = True
            thisCompanyData['meta'] = meta
            shp, mapMeta = conditions_on_map(dfmeta, regions_map, folder_name,
                                             lang)
            thisCompanyData['regions'] = shp.to_json()
            thisCompanyData['mapMeta'] = mapMeta.to_dict(orient='records')
            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)
        else:
            # NOTE(review): `shp` and `dfmeta` are not assigned in this
            # branch, so if NO company has data the return raises NameError —
            # confirm that at least one listed company always has conditions.
            meta = {"companyName": company}
            thisCompanyData = {
                'meta': {
                    "companyName": company,
                    "build": False
                },
                'regions': "{}",
                'mapMeta': []
            }
            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)
    # if not test:
    #     print('completed '+lang+' conditions: '+company)
    return df_c, shp, dfmeta, meta
def process_incidents(remote=False, land=False, company_names=False,
                      companies=False, test=False):
    """Load and clean the comprehensive incident dataset, export one JSON file.

    Reads the comprehensive incident CSV (remote, test fixture, or local),
    buckets companies outside `group1` into "Group 2", normalizes substances
    into four groups, trims to the columns the dashboard needs, and writes a
    single payload to ../incidents/incident_releases.json.

    Parameters:
        remote -- download the dataset from the CER site and cache it
        land -- accepted but unused in this version
        company_names -- print the distinct company names and continue
        companies -- accepted but unused in this version
        test -- use the test fixture and skip writing output
    Returns (allCompanyData, df).
    """
    if remote:
        link = "https://www.cer-rec.gc.ca/en/safety-environment/industry-performance/interactive-pipeline/map/2020-12-31-comprehensive-incident-data.csv"
        print('downloading remote incidents file')
        df = pd.read_csv(link,
                         skiprows=1,
                         encoding="UTF-16",
                         error_bad_lines=False)
        df.to_csv("./raw_data/incidents.csv", index=False)
    elif test:
        print('reading test incidents file')
        df = pd.read_csv(
            "./raw_data/test_data/comprehensive-incident-data.csv",
            skiprows=0,
            encoding="UTF-8",
            error_bad_lines=False)
    else:
        print('reading local incidents file')
        df = pd.read_csv("./raw_data/comprehensive-incident-data.csv",
                         skiprows=0,
                         encoding='latin-1',
                         error_bad_lines=True)

    # FIX: DataFrame.rename(columns=...) silently ignores labels that are
    # absent, so the old per-variant try/except loop (with a bare except) was
    # dead code. One rename handles whichever volume header the file uses.
    df = df.rename(
        columns={
            'Approximate Volume Released (m³)': 'Approximate Volume Released',
            'Approximate Volume Released (m3)': 'Approximate Volume Released'
        })

    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())
    # companies outside group1 are aggregated under "Group 2"
    df['Company'] = [x if x in group1 else "Group 2" for x in df['Company']]
    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')
    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')
    # collapse minor substances into "Other"; butane/propane count as NGL
    df['Substance'] = df['Substance'].replace({
        "Water": "Other",
        "Hydrogen Sulphide": "Other",
        "Amine": "Other",
        "Contaminated Water": "Other",
        "Potassium Hydroxide (caustic solution)": "Other",
        "Glycol": "Other",
        "Pulp slurry": "Other",
        "Sulphur": "Other",
        "Odourant": "Other",
        "Potassium Carbonate": "Other",
        "Waste Oil": "Other",
        "Produced Water": "Other",
        "Butane": "Natural Gas Liquids",
        "Mixed HVP Hydrocarbons": "Other",
        "Drilling Fluid": "Other",
        "Jet Fuel": "Other",
        "Gasoline": "Other",
        "Sulphur Dioxide": "Other",
        "Lube Oil": "Other",
        "Propane": "Natural Gas Liquids",
        "Fuel Gas": "Other",
        "Diesel Fuel": "Other"
    })
    if company_names:
        print(get_company_names(df['Company']))

    # restrict to the columns the dashboard consumes
    keep = [
        'Incident Number', 'Incident Types', 'Province', 'Company', 'Status',
        'Latitude', 'Longitude', 'Approximate Volume Released', 'Substance',
        'Year', 'What happened category', 'Why it happened category',
        'Activity being performed at time of incident',
        'How the incident was discovered', 'Incident type',
        'Residual effects on the environment', 'Number of fatalities',
        'Number of individuals injured', 'Off Company Property',
        'Was NEB Staff Deployed'
    ]
    for col in df.columns:
        if col not in keep:
            del df[col]
    df = df.rename(
        columns={
            'What happened category': 'What happened',
            'Why it happened category': 'Why it happened',
            'Activity being performed at time of incident':
            'Activity at time of incident',
            'How the incident was discovered': 'How was it discovered'
        })
    # df = df[~df['Approximate Volume Released'].isnull()].copy().reset_index(drop=True)
    # missing numeric counts mean "none reported"
    fillZero = [
        'Approximate Volume Released', 'Number of fatalities',
        'Number of individuals injured'
    ]
    for f in fillZero:
        df[f] = df[f].fillna(0)
    fillOther = ['How was it discovered']
    for f in fillOther:
        df[f] = df[f].fillna("Other")
    textCols = [
        'Incident Number', 'Incident Types', 'Province', 'Company', 'Status',
        'Substance', 'What happened', 'Why it happened',
        'Activity at time of incident', 'How was it discovered',
        'Incident type', 'Residual effects on the environment',
        'Off Company Property', 'Was NEB Staff Deployed'
    ]
    for t in textCols:
        df[t] = [str(x).strip() for x in df[t]]

    meta = {}
    allCompanyData = {}
    allCompanyData['meta'] = meta
    allCompanyData['events'] = df.to_dict(orient='records')
    if not test:
        with open('../incidents/incident_releases.json', 'w') as fp:
            json.dump(allCompanyData, fp)
    return allCompanyData, df
def process_conditions(remote=False, sql=False, non_standard=True,
                       company_names=False, companies=False, test=False,
                       save=True):
    """Load, clean and export per-company project condition data.

    Reads the conditions dataset (remote, test fixture, or local), normalizes
    text/company names, resolves region and project ids, and delegates the
    per-company export to process_company.

    Parameters:
        remote -- download conditions from the CER site
        sql -- passed through to add_links/idify_conditions data lookups
        non_standard -- when False, filter out 'Standard' condition types
        company_names -- print distinct company names and continue
        companies -- optional list restricting which companies are exported
        test -- use test fixtures and skip writing output
        save -- passed through to process_company
    Returns the (df_c, shp, dfmeta, meta) of the last company processed
    (all None if no companies were processed).
    """
    if remote:
        print('downloading remote conditions file')
        link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
        df = pd.read_csv(
            link,
            # sep='\t',
            # lineterminator='\r',
            encoding="latin-1",
            error_bad_lines=True)
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        df = pd.read_csv('./raw_data/conditions_en.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])
    if not non_standard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']
    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]
    for delete in delete_cols:
        del df[delete]
    # strip embedded newlines/quotes from company names before matching
    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())
    df = apply_system_id(df, "Company")
    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    df = add_links(df, sql)
    if company_names:
        print(get_company_names(df['Company']))
    df, region_replace, project_names = idify_conditions(df, sql)
    regions_map = import_simplified(region_replace)
    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    # FIX: defined up front so the return cannot NameError when
    # company_files is empty
    df_c, shp, dfmeta, meta = None, None, None, None
    for company in company_files:
        try:
            df_c, shp, dfmeta, meta = process_company(df, company,
                                                      project_names,
                                                      regions_map, test, save)
            print("completed: " + company)
        # FIX: narrow from a bare except; still re-raises after logging
        except Exception:
            print("conditions error: " + company)
            raise
    return df_c, shp, dfmeta, meta
def process_oandm(remote=False, companies=False, test=False):
    """Load, clean and export per-company operation & maintenance activities.

    Reads the O&M activity dataset (remote, test fixture, or local), drops a
    large set of unused columns, keeps activities from 2015 onward, and
    writes one JSON payload per company to ../data_output/oandm/.

    Parameters:
        remote -- download the dataset (via a safelinks-wrapped URL) and
                  cache it locally
        companies -- optional list restricting which companies are exported
        test -- use the test fixture and skip writing output
    Returns the payload dict of the last company processed.
    """
    # only an English dataset exists for this process
    lang = "en"
    if remote:
        link = "https://can01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.cer-rec.gc.ca%2Fopen%2Foperations%2Foperation-and-maintenance-activity.csv&data=04%7C01%7CMichelle.Shabits%40cer-rec.gc.ca%7Cbbc3fece7b3a439e253908d8f9ec4eab%7C56e9b8d38a3549abbdfc27de59608f01%7C0%7C0%7C637534140608125634%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=HvG6KtuvEzJiNy4CZ4OyplKnfx2Zk5sPjUNNutoohic%3D&reserved=0"
        print('downloading remote oandm file')
        df = pd.read_csv(link,
                         skiprows=0,
                         encoding="latin-1",
                         engine="python",
                         error_bad_lines=False)
        df.to_csv("./raw_data/oandm_" + lang + ".csv", index=False)
    elif test:
        print('reading test oandm file')
        path = "./raw_data/test_data/oandm_en.csv"
        df = pd.read_csv(path,
                         skiprows=0,
                         encoding="utf-8",
                         error_bad_lines=False)
    else:
        print('reading local oandm file')
        # NOTE(review): path/encoding are only bound when lang == 'en'; safe
        # today because lang is hard-coded above, but fragile if that changes.
        if lang == 'en':
            path = "./raw_data/oandm_en.csv"
            encoding = "utf-8"
        df = pd.read_csv(path,
                         skiprows=0,
                         encoding=encoding,
                         error_bad_lines=False)
    df = strip_cols(df)
    # normalize non-breaking spaces in header names
    df = df.rename(columns={x: x.replace("\xa0", " ") for x in df.columns})
    df = df.replace({"Yes": "y", "No": "n"})
    # Event Number and nearest populated center should be deleted later
    # New Land Area Needed is probably the total land
    for delete in [
            'Company City', 'Company Postal Code',
            'Company Province/Territory', 'Circumstance(s)',
            'Result Of A Class Location Change',
            'Distance To Closest Building', 'Event Creation Date',
            'Submission Date', 'Pipeline Name', 'Pipeline Outside Diameter',
            'Pipeline Length', 'Commodity Carried', 'Facility Name',
            'Facility Type', 'New Permanent Land Needed',
            'Activity Acquiring New Private Land',
            'Activity Acquiring New Land Under Compliance',
            'Land Within Critical Habitat', 'Activity Crossing Water Body',
            'New Temporary Land Needed', 'Vehicle Crossing Count',
            'Provincial and federal authorities been consulted',
            'Activity Using Navigable Water',
            'Activity Following DFO Fish Measures For In Stream Work',
            'Navigable Water Frozen Or Dry',
            'Activity Following DFO Fish Measures For Crossing',
            'Ground Disturbance Near Water Required',
            'Navigable Water Activity Meeting Transport Canada Minor Works And Waters Order'
    ]:
        del df[delete]
    # parse every column whose name contains "date"
    for date_col in df.columns:
        if "date" in date_col.lower():
            df[date_col] = pd.to_datetime(df[date_col])
    df['Company Name'] = df['Company Name'].replace(company_rename())
    df = apply_system_id(df, "Company Name")
    df = column_insights(df)
    df = df.rename(columns={
        "Species At Risk Present At Activity Site": "Species At Risk Present"
    })
    # keep only activities commencing in 2015 or later
    df = df[df['Commencement Date'].dt.year >= 2015].reset_index(drop=True)
    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")
    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company Name'] == company].copy().reset_index(
                drop=True)
            df_c = df_c.drop_duplicates(subset=['Event Number'])
            this_company_data = {}
            if not df_c.empty:
                this_company_data["meta"] = metadata(df_c, company, test)
                this_company_data["build"] = True
                this_company_data["data"] = optimize_json(df_c)
                if not test:
                    with open('../data_output/oandm/' + folder_name + '.json',
                              'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                # there are no o and m events
                this_company_data['data'] = df_c.to_dict(orient='records')
                this_company_data['meta'] = {"companyName": company}
                this_company_data["build"] = False
                if not test:
                    with open('../data_output/oandm/' + folder_name + '.json',
                              'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        # NOTE(review): bare except; it does re-raise, but narrowing to
        # "except Exception" would avoid catching KeyboardInterrupt here.
        except:
            print("o&m error: " + company)
            raise
    # NOTE(review): this_company_data is unbound if company_files is empty.
    return this_company_data
def process_incidents(remote=False, land=False, company_names=False,
                      companies=False, test=False):
    """Load, clean and export per-company incident data (English only).

    Reads the pipeline incident dataset (remote, test fixture, or local),
    runs it through process_english, and writes one JSON payload per company
    to ../incidents/company_data/.

    Parameters:
        remote -- download the dataset from the CER open-data site
        land -- accepted but unused in this version
        company_names -- print the distinct company names and continue
        companies -- optional list restricting which companies are exported
        test -- use the test fixture and skip writing output
    Returns (df_c, df_vol, meta) for the last company processed; meta is
    None when no companies were processed.
    """
    if remote:
        link = "https://www.cer-rec.gc.ca/open/incident/pipeline-incidents-data.csv"
        process_func = process_english
        print('downloading remote incidents file')
        df = pd.read_csv(link,
                         skiprows=0,
                         encoding="latin-1",
                         engine="python",
                         error_bad_lines=False)
        df = process_func(df)
        df.to_csv("./raw_data/incidents_" + "en" + ".csv", index=False)
    elif test:
        print('reading test incidents file')
        path = "./raw_data/test_data/incidents_en.csv"
        process_func = process_english
        df = pd.read_csv(path,
                         skiprows=0,
                         encoding="utf-8",
                         error_bad_lines=False)
        df = process_func(df)
    else:
        print('reading local incidents file')
        path = "./raw_data/incidents_en.csv"
        process_func = process_english
        encoding = "latin-1"
        df = pd.read_csv(path,
                         skiprows=0,
                         encoding=encoding,
                         error_bad_lines=False)
        df = process_func(df)

    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())
    # non-numeric volume entries become NaN (treated below as "no release")
    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')
    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')
    for delete in [
            'Significant', 'Release Type', 'Nearest Populated Centre',
            'Reported Date'
    ]:
        del df[delete]
    if company_names:
        print(get_company_names(df['Company']))
    perKm = None
    if companies:
        company_files = companies
    else:
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Enbridge Bakken Pipeline Company Inc.', 'Express Pipeline Ltd.',
            'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Plains Midstream Canada ULC', 'Genesis Pipeline Canada Ltd.',
            'Montreal Pipe Line Limited', 'Trans-Northern Pipelines Inc.',
            'Kingston Midstream Westspur Limited',
            'Many Islands Pipe Lines (Canada) Limited',
            'Vector Pipeline Limited Partnership',
            'Maritimes & Northeast Pipeline Management Ltd.',
            'Aurora Pipeline Company Ltd'
        ]

    # FIX: meta (and the per-company frames) were only assigned when a company
    # had volume releases, so the return could be stale or raise NameError.
    meta = None
    df_c = None
    df_vol = None
    for company in company_files:
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        # keep only incidents with an actual release volume for the events list
        df_vol = df_c[~df_c['Approximate Volume Released'].isnull()].copy(
        ).reset_index(drop=True)
        thisCompanyData = {}
        if not df_vol.empty:
            # calculate metadata here, before non releases are filtered out
            meta = incidentMetaData(df, perKm, company)
            thisCompanyData['meta'] = meta
            for delete in [
                    'Incident Types', 'Company', 'why common', 'what common'
            ]:
                del df_vol[delete]
            df_vol = optimizeJson(df_vol)
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            if not test:
                with open(
                        '../incidents/company_data/' + folder_name + '.json',
                        'w') as fp:
                    json.dump(thisCompanyData, fp)
        else:
            # there are no product release incidents
            # FIX: keep the local meta in sync with the JSON payload
            meta = {"companyName": company}
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            thisCompanyData['meta'] = meta
            if not test:
                with open(
                        '../incidents/company_data/' + folder_name + '.json',
                        'w') as fp:
                    json.dump(thisCompanyData, fp)
    return df_c, df_vol, meta