def regulatedfarelookup(lkupfilepath, df):
    """
    Import a csv file as a source of lookup data for regulated fares and
    left-join it onto the superfile.

    Only about 20% of rows will match; non-matches get NaN in the new fields.

    Parameters:
    lkupfilepath    - a string containing the filepath where the lookup text file is stored
    df              - a dataframe containing the combined TOC data

    Returns:
    df              - a dataframe with new fields (orig, dest, route, ticketa, flaga,
                      ticket, flag2) added on the right hand side of the dataframe.
    """
    # import fares lookup data (read_csv already returns a DataFrame, so no
    # extra pd.DataFrame(...) copy is needed)
    print("Regulated fares lookup being loaded\n")
    fareslkupvalues = pd.read_csv(lkupfilepath + 'regulated fares sort.csv')

    # data typing for fare lookup values: force the key columns to string
    print("Data typeing fare lookup values\n")
    for keycol in ['orig', 'dest', 'route', 'ticket']:
        fareslkupvalues[keycol] = fareslkupvalues[keycol].astype('str')

    # set string length by padding fields with leading zeros so they match
    # the fixed-width codes used in the superfile keys
    print("Amending lookupvalue keys\n")
    fareslkupvalues['orig'] = fareslkupvalues['orig'].str.zfill(4)
    fareslkupvalues['dest'] = fareslkupvalues['dest'].str.zfill(4)
    fareslkupvalues['route'] = fareslkupvalues['route'].str.zfill(5)

    # merging lookup values with superfile (left join keeps all superfile rows)
    print("Regulated fares lookup being performed.\n")
    df = pd.merge(left=df, right=fareslkupvalues, how='left',
                  left_on=['Origin Code', 'Destination Code', 'Route Code', 'Product Code'],
                  right_on=['orig', 'dest', 'route', 'ticket'])

    # drop lookup dataframe no longer needed
    del fareslkupvalues

    # amend datatypes to categorical
    df = applydatatypes(df, ['Product Code', 'Origin Code', 'Destination Code',
                             'Route Code', 'orig', 'dest', 'route', 'ticketa',
                             'flaga', 'ticket', 'flag2'])

    return df
def getcategorylookup(df,filepath, filename,destinationpath):
    """
    Load category lookup data from an excel file, conform the format of its
    fields and join it to the main file.

    Non-matches are categorised as 'Missing' and those rows are exported to a
    logging file.  Categorical datatyping is applied to the new fields.

    Parameters:
    df              - a dataframe containing the 'superfile'
    filepath        - a string containing the file path for the excel file
    filename        - a string containing the name of the excel file
    destinationpath - a string containing the file path for the output to be sent

    Returns:
    df              - a dataframe containing the 'superfile' with category information
    """
    print("Category Lookup Data being loaded \n")
    # FIX: the original call passed an invalid 'heading=' keyword to
    # read_excel (no such parameter), which raises TypeError in pandas;
    # the columns are renamed explicitly below instead.
    savtodf = pd.read_excel(filepath + filename, sheet_name='Sheet1')

    # define column names
    savtodf.columns = ['Product Code', 'Product Primary Code', 'Category']

    # force all categories to lower case
    savtodf['Category'] = savtodf['Category'].str.lower()

    print("Category Information being added\n")
    df = pd.merge(df, savtodf, how='left',
                  left_on=['Product Code', 'Product Primary Code'],
                  right_on=['Product Code', 'Product Primary Code'])

    # handle missing category information: flag it, then export the distinct
    # unmatched product code combinations for investigation
    df['Category'] = df['Category'].fillna('Missing')
    nonmatches = df[df['Category'] == 'Missing']
    unique_filtered_nonmatches = nonmatches[['Product Code', 'Product Primary Code']].copy().drop_duplicates()
    exportfile(unique_filtered_nonmatches, destinationpath, 'missing_categories')

    # apply datatyping
    df = applydatatypes(df, ['Product Code', 'Product Primary Code', 'Category'])

    # remove lookup dataframe no longer needed
    del savtodf

    return df
def generatedata(originpath,destinationpath,regulatedfarespath,categorypath):
    """
    This joins a number of CSV source files into a common dataframe; handles the cases
    of sub profit centres being used in place of profit centres; maps sectors,
    ticket_types, classes, regulated status and category information to the combined
    file.  This is then used to calculated advanced and non-advanced data.

    Parameters:
    originpath          - A string specifying the location of the individual TOC files are found
    destinationpath     - A string specifying the location of where output should be exported
    regulatedfarespath  - A string specifying the location of the lookup file for regulated fares file
    categorypath        - A string specifying the location of the lookup file for category

    Returns:
    superfile           - A dataframe consisting of the combined and mapped data.
    """
    # gather and combine all individual TOC source files
    list_of_tocs,filecount = getdata(originpath)
    joinedfile = combinefiles(list_of_tocs,filecount)

    # warn the user when the volume of files risks exhausting memory
    if filecount > 50:
        print("As you are processing a large number of files, this may possibly cause the PC to freeze or crash due to memory issues.\n")
        print("If this happens, restart the computer, then close down IE, Outlook and any other memory/resource hungry applications and try again.\n")

    superfile = joinedfile.copy()

    # drop where category_code not starting with 1 or 2
    superfile = superfile[superfile['Product Code'].str.contains('1[A-Z][A-Z][A-Z]|2[A-Z][A-Z][A-Z]',regex=True)]

    # fields to convert to categorical data type
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Product Code','Product Primary Code'])

    # mapping of lookups starts here
    # mapping of sectors: profit centre code -> sector label
    print("mapping sectors within superfile\n")
    sector_mapping = {'EK':'Lon SE','HO':'Lon SE','HQ':'Lon SE','HS':'Lon SE','HT':'Lon SE','HU':'Lon SE','HW':'Lon SE','HY':'Lon SE','HZ':'Lon SE','EX':'Lon SE',
                      'EA':'Regional','EI':'Regional','EJ':'Regional','HA':'Regional','HC':'Regional','HD':'Regional','HE':'Regional','HL':'Regional','ES':'Regional',
                      'EC':'Long D','EH':'Long D','HB':'Long D','HF':'Long D','HI':'Long D','HK':'Long D','HM':'Long D'}
    superfile = assignlookupvalues(superfile,'sectors', sector_mapping, "Carrier TOC / Third Party Code", 'sector' ,destinationpath)

    # mapping of tickettypes: product primary code -> Full/Reduced/Season
    # (non-matches default to 'Other')
    print("mapping ticket types within superfile\n")
    tickettypemapping = {'PG01':'Full','PG05':'Full','PG02':'Reduced','PG03':'Reduced','PG06':'Reduced','PG07':'Reduced','PG04':'Season','PG08':'Season'}
    superfile = assignlookupvalues(superfile,'ticket_type',tickettypemapping,"Product Primary Code",'ticket_type',destinationpath,'Other')

    # mapping of ticketclasses: first character of Product Code -> class
    # (non-matches default to class '2')
    # NOTE(review): this print repeats the ticket-type message — looks like a
    # copy/paste; it presumably should say "mapping ticket classes" — confirm
    print("mapping ticket types within superfile\n")
    classmapping = {'1':'1', '2':'2','9':'2'}
    superfile['classreference'] = superfile['Product Code'].str[0]
    superfile = assignlookupvalues(superfile,'class',classmapping,'classreference','class',destinationpath,'2')
    del superfile['classreference']

    print("mapping regulated status within superfile\n")
    # getting the regulated fares lookup to add flag_2 information for faretypes
    superfile = regulatedfarelookup(regulatedfarespath,superfile )
    # setting rows as regulated/unregulated fares here
    superfile = setregulatedfares(superfile,destinationpath)

    # mapping of categories
    print("mapping categories within superfile\n")
    superfile = getcategorylookup(superfile,categorypath,'Product_category_lookup_2020_v1.xlsx',destinationpath)

    # dropping lookup key columns no longer needed
    superfile = superfile.drop(['orig','dest','route'], axis=1)

    # apply final superfile datatyping
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Origin Code','Destination Code','Route Code','Product Code','sector','ticket_type','class','Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC','Regulated_Status','Category'])

    # export full superfile for later testing of regulated status setting, if needed
    exportfile(superfile,destinationpath,"superfile with full regulated data")

    # delete the surplus Regulated status columns
    superfile = superfile.drop(['Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC'], axis=1)

    # producing distinct list of product codes with their assigned regulated status
    regulatedcheck = superfile[['Product Code','Product Primary Code','Regulated_Status']].drop_duplicates()
    exportfile(regulatedcheck,destinationpath,"regulated products check")

    return superfile
def setregulatedfares(df,destinationpath):
    """
    Process the lookups of exceptional products, normal products and profit
    centres to create the fields needed to assign regulated/unregulated status
    to fares.

    Each lookup refines the previous intermediate field, and the chain ends in
    a single new 'Regulated_Status' field.  The working fields are removed at
    the end of processing.

    Parameters:
    df              - a dataframe containing the superfile
    destinationpath - a string with the file path for any error file with missing rows to be sent

    Returns:
    df              - a modified dataframe with a new field "Regulated_Status" and 7 fields removed.
    """
    # create new 'Regulated_Status_Start' column with initial value of "Not Assigned"
    df['Regulated_Status_Start'] = 'Not Assigned'

    # assign values based on TOC codes (keyed on the 'flag2' field from the
    # regulated fares lookup)
    toc = {'SCR':'Regulated_by_TOC','LER':'Regulated_by_TOC','GWR':'Regulated_by_TOC','GXR':'Regulated_by_TOC','IEC':'Regulated_by_TOC','IWC':'Regulated_by_TOC','LTS':'Regulated_by_TOC'
          ,'MML':'Regulated_by_TOC','NCH':'Regulated_by_TOC','NLR':'Regulated_by_TOC','NSC':'Regulated_by_TOC','NSE':'Regulated_by_TOC','SWT':'Regulated_by_TOC','NIW':'Regulated_by_TOC'
          ,'RCL':'Regulated_by_TOC','RMS':'Regulated_by_TOC','RWB':'Regulated_by_TOC','TPE':'Regulated_by_TOC','HUL':'Regulated_by_TOC','IXC':'Regulated_by_TOC','FCC':'Regulated_by_TOC','NTH':'Regulated_by_TOC'
          ,'GWA':'Regulated_by_TOC','GCR':'Regulated_by_TOC','RNE':'Regulated_by_TOC','RNW':'Regulated_by_TOC','PDR':'Regulated_by_TOC','PSV':'Regulated_by_TOC','PWS':'Regulated_by_TOC','PGM':'Regulated_by_TOC'
          ,'PMR':'Regulated_by_TOC','PSC':'Regulated_by_TOC','PSY':'Regulated_by_TOC','PTW':'Regulated_by_TOC','PWM':'Regulated_by_TOC','DBA':'Regulated_by_TOC'
          ,'NFD':'Regulated_by_TOC','LRC':'Regulated_by_TOC','LBR':'Regulated_by_TOC','SMR':'Regulated_by_TOC'
          ,'ANG':'Regulated_by_TOC', 'GER':'Regulated_by_TOC','NTT':'Regulated_by_TOC','RCV':'Regulated_by_TOC','RWW':'Regulated_by_TOC','TLK':'Regulated_by_TOC','WGN':'Regulated_by_TOC'}
    df = assignedregulatedlookupvalues(df,'Regulation by TOCs',toc,'flag2','Regulated_Status_toc','Regulated_Status_Start')

    # assign values based on Product Codes
    products = {'2MTA':'Regulated_by_Product','2MTB':'Regulated_by_Product','2MTE':'Regulated_by_Product',
                '2MTJ':'Regulated_by_Product','2MTK':'Regulated_by_Product','2MTN':'Regulated_by_Product','2VQA':'Regulated_by_Product',
                '2MQA':'Regulated_by_Product', '2MSA':'Regulated_by_Product', '2MSH':'Regulated_by_Product', '2MSL':'Regulated_by_Product',
                '2MSW':'Regulated_by_Product', '2MTH':'Regulated_by_Product', '2MTL':'Regulated_by_Product', '2MTW':'Regulated_by_Product',
                '2CGE':'Regulated_by_Product','2CGJ':'Regulated_by_Product', '2CGK':'Regulated_by_Product', '2CGN':'Regulated_by_Product',
                '2CGO':'Regulated_by_Product', '2CGS':'Regulated_by_Product', '2CGT':'Regulated_by_Product', '2OBC':'Regulated_by_Product',
                '2OBD':'Regulated_by_Product', '2OBE':'Regulated_by_Product', '2OBF':'Regulated_by_Product', '2OCH':'Regulated_by_Product',
                '2OCI':'Regulated_by_Product', '2OCJ':'Regulated_by_Product', '2OCL':'Regulated_by_Product', '2OCN':'Regulated_by_Product',
                '2OEH':'Regulated_by_Product', '2CIC':'Regulated_by_Product', '2CID':'Regulated_by_Product', '2CIE':'Regulated_by_Product',
                '2CIF':'Regulated_by_Product'}
    df = assignedregulatedlookupvalues(df,'Regulation by Product',products,'Product Code','Regulated_Status_Products','Regulated_Status_toc')

    # assign values based on an exceptional products mapping
    regulatedexception = {'2ADA':'Unregulated_by_Product','2BDY':'Unregulated_by_Product'}
    df = assignedregulatedlookupvalues(df,'Regulation by Exception',regulatedexception,'Product Code','Regulated_Status_exceptions','Regulated_Status_Products')

    # assign values based on class (first class is unregulated)
    unregulatedclass = {'1':'Unregulated_by_Class'}
    df = assignedregulatedlookupvalues(df,'Regulation by Class',unregulatedclass,'class','Regulated_Status_class','Regulated_Status_exceptions')

    # assign values based on Profit Centre Code
    regulatedprofitcentrecode = {'EC':'Unregulated_by_PCC','HM':'Unregulated_by_PCC'}
    df = assignedregulatedlookupvalues(df,'Regulation by Profit Centre',regulatedprofitcentrecode,'Carrier TOC / Third Party Code','Regulated_Status_PCC','Regulated_Status_class')

    # create final copy of regulated column, collapsing the intermediate
    # labels to the final Regulated/Unregulated labels in a single replace
    # call (FIX: the original issued six chained df[col].replace(...,
    # inplace=True) calls, a chained-assignment pattern that is deprecated
    # and warning-prone under pandas 2.x copy-on-write); values not listed
    # in the mapping are left unchanged, exactly as before
    df['Regulated_Status'] = df['Regulated_Status_PCC'].replace(
        {'Not Assigned': 'Unregulated',
         'Unregulated_by_Product': 'Unregulated',
         'Unregulated_by_Class': 'Unregulated',
         'Unregulated_by_PCC': 'Unregulated',
         'Regulated_by_TOC': 'Regulated',
         'Regulated_by_Product': 'Regulated'})

    # drop lookup working fields no longer required
    df = df.drop(columns=['ticketa', 'flaga', 'ticket', 'flag2'])

    # amend the relevant data types
    df = applydatatypes(df,['Regulated_Status_Start', 'Regulated_Status_toc', 'Regulated_Status_Products',
                            'Regulated_Status_exceptions', 'Regulated_Status_class', 'Regulated_Status'])

    return df
def add_lennon_fares_info(df, lookupdf, year, typeofjoin):
    """
    Merge the LENNON data with the superfile and add a year reference to the
    column title of any clashing LENNON columns.

    Parameters:
    df         - a dataframe containing the combined data which is to be joined against
    lookupdf   - a dataframe containing the relevant LENNON information
    year       - a string representing the year the data relates to
    typeofjoin - a string representing whether the join is for 'advanced' or 'non-advanced' data

    Returns:
    df         - a dataframe with LENNON data joined, or None if typeofjoin is not recognised
    """
    # apply appropriate merge type based on name of join
    if typeofjoin == 'non-advanced':
        # non-advanced data joins on the full fare key including Product Code
        joinkeys = ['Origin Code', 'Destination Code', 'Route Code', 'Product Code']
        df = pd.merge(left=df, right=lookupdf, how='left',
                      left_on=joinkeys, right_on=joinkeys,
                      suffixes=('', '_LENNON' + year))
        df = applydatatypes(df, joinkeys)
    elif typeofjoin == 'advanced':
        # advanced data joins on class rather than Product Code, so coerce the
        # key fields to str first to guarantee the join keys' types match
        # FIX: message previously said "non advanced" inside the advanced branch
        print("this is advanced datatyping")
        df['Origin Code'] = df['Origin Code'].astype(str)
        df['Destination Code'] = df['Destination Code'].astype(str)
        df['Route Code'] = df['Route Code'].astype(str)
        df['class'] = df['class'].astype(str)
        joinkeys = ['Origin Code', 'Destination Code', 'Route Code', 'class']
        df = pd.merge(left=df, right=lookupdf, how='left',
                      left_on=joinkeys, right_on=joinkeys,
                      suffixes=('', '_LENNON' + year))
        df = applydatatypes(df, joinkeys)
    else:
        print("Type of join not recognised")
        return None
    return df