def run(self):
    """Upload the cleaned rejected-loan dataset to Amazon S3 and, once the
    file is confirmed present remotely, create an empty results file for the
    later prediction step.

    Prompts the user for AWS credentials on stdin.
    """
    aws_access_key = input("Enter your aws access key: ").strip()
    aws_secret_key = input("Enter your aws secret key: ").strip()

    file_name = "cleaned_reject_loandata.csv"
    file_path = "Data/Cleaned/" + file_name

    # Upload only when the file is not already present in the bucket.
    exists = check_if_file_exists(aws_access_key, aws_secret_key,
                                  file_name, file_path)
    if not exists:
        amazon_upload(aws_access_key, aws_secret_key, file_path)

    # Re-check so the "just uploaded" case is covered too.  The original
    # code tested the stale pre-upload flag here, so a fresh upload never
    # created the results file; testing the re-checked flag fixes that.
    exists = check_if_file_exists(aws_access_key, aws_secret_key,
                                  file_name, file_path)
    if exists:
        create_directory("Data/Results")
        # Touch an empty results file for the downstream prediction step.
        open("Data/Results/resultsrejectloandata.csv", 'a').close()
def run(self):
    """Attach column headers to each yearly origination sample (1999-2016)
    and store the result as a CSV under cleaned/.

    Years whose cleaned file already exists are skipped.
    """
    create_directory("cleaned")
    source_dir = "downloads/"
    target_dir = "cleaned/"

    column_names = [
        "CREDIT SCORE",
        "FIRST PAYMENT DATE",
        "FIRST TIME HOMEBUYER FLAG",
        "MATURITY DATE",
        "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
        "MORTGAGE INSURANCE PERCENTAGE (MI %)",
        "NUMBER OF UNITS",
        "OCCUPANCY STATUS",
        "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
        "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
        "ORIGINAL UPB",
        "ORIGINAL LOAN-TO-VALUE (LTV)",
        "ORIGINAL INTEREST RATE",
        "CHANNEL",
        "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
        "PRODUCT TYPE",
        "PROPERTY STATE",
        "PROPERTY TYPE",
        "POSTAL CODE",
        "LOAN SEQUENCE NUMBER",
        "LOAN PURPOSE",
        "ORIGINAL LOAN TERM",
        "NUMBER OF BORROWERS",
        "SELLER NAME",
        "SERVICER NAME",
        "Super Conforming Flag",
    ]

    for year in range(1999, 2017):
        source_path = source_dir + "sample_orig_" + str(year) + ".txt"
        target_path = target_dir + "cleaned_sample_orig_" + str(year) + ".csv"
        if os.path.isfile(target_path):
            continue
        # Raw files are pipe-separated and headerless.
        frame = pd.read_csv(source_path, sep="|", header=None,
                            names=column_names)
        frame.to_csv(target_path, sep=',', index=False)

    print("cleaned origination files")
def run(self):
    """Clean each yearly performance (servicing) sample and write the
    result as a CSV under cleaned/.

    Years whose cleaned file already exists are skipped.
    """
    create_directory("cleaned")
    source_dir = "downloads/"
    target_dir = "cleaned/"

    column_names = [
        "LOAN SEQUENCE NUMBER",
        "MONTHLY REPORTING PERIOD",
        "CURRENT ACTUAL UPB",
        "CURRENT LOAN DELINQUENCY STATUS",
        "LOAN AGE",
        "REMAINING MONTHS TO LEGAL MATURITY",
        "REPURCHASE FLAG",
        "MODIFICATION FLAG",
        "ZERO BALANCE CODE",
        "ZERO BALANCE EFFECTIVE DATE",
        "CURRENT INTEREST RATE",
        "CURRENT DEFERRED UPB",
        "DUE DATE OF LAST PAID INSTALLMENT",
        "MI RECOVERIES",
        "NET SALES PROCEEDS",
        "NON MI RECOVERIES",
        "EXPENSES",
        "LEGAL COSTS",
        "MAINTENANCE AND PRESERVATION COSTS",
        "TAXES AND INSURANCE",
        "MISCELLANEOUS EXPENSES",
        "ACTUAL LOSS CALCULATION",
        "MODIFICATION COST",
    ]

    for year in range(1999, 2017):
        source_path = source_dir + "sample_svcg_" + str(year) + ".txt"
        target_path = target_dir + "cleaned_sample_svcg_" + str(year) + ".csv"
        if os.path.isfile(target_path):
            continue

        frame = pd.read_csv(source_path, sep="|", header=None,
                            names=column_names)

        # Column-specific cleaners, applied in the established order.
        # NOTE(review): the original carried a TODO that year 2000 may need
        # a dedicated conditional cleaner — still open.
        clean_monthly_reporting_period(frame)
        clean_loan_del_status(frame)
        clean_repurchase_flag(frame)
        clean_modification_flag(frame)
        clean_zero_balance_code(frame)
        clean_zero_balance_effective_date(frame)
        clean_ddlpi(frame)
        replace_all_other_NaNs_With_zero(frame)

        frame.to_csv(target_path, sep=',', index=False)

    print("cleaned performance files")
def run(self):
    """Prepare the summary/ directory before model building.

    The per-year summarisation of the cleaned origination files was disabled
    in the original source; only the directory setup and the status message
    remain active.
    """
    create_directory("summary")
    summary_dir = "summary/"
    cleaned_dir = "cleaned/"
    print("Building Prediction Model")
def run(self):
    """Load the iris dataset from Data/Downloads/ and persist it, with
    column headers attached, under Data/Summary/.

    NOTE(review): despite the file name, no actual summarisation happens —
    the frame is written out unchanged.  Also the directory created is
    "Summary" while the write path is "Data/Summary/"; presumably the
    latter already exists — confirm against the calling pipeline.
    """
    create_directory("Summary")
    source_dir = "Data/Downloads/"
    target_dir = "Data/Summary/"

    columns = [
        'sepal_length_in_cm',
        'sepal_width_in_cm',
        'petal_length_in_cm',
        'petal_width_in_cm',
        'iris_class',
    ]
    iris = pd.read_csv(source_dir + 'irisdataset.data', sep=",",
                       header=None, names=columns)

    iris.to_csv(target_dir + "summary_irisdataset.csv", sep=',', index=False)
def run(self):
    """Attach column headers to each yearly performance sample and write it
    to cleaned/ (no value cleaning happens in this variant).

    Years whose cleaned file already exists are skipped.
    """
    create_directory("cleaned")
    cleaned_dir = "cleaned/"
    downloads_dir = "downloads/"

    # Header names normalised to upper case so the output agrees with the
    # other performance cleaners/summarisers in this project, which use
    # e.g. "LEGAL COSTS" — the original mixed "Legal Costs" etc. into an
    # otherwise upper-case schema.
    names = [
        "LOAN SEQUENCE NUMBER",
        "MONTHLY REPORTING PERIOD",
        "CURRENT ACTUAL UPB",
        "CURRENT LOAN DELINQUENCY STATUS",
        "LOAN AGE",
        "REMAINING MONTHS TO LEGAL MATURITY",
        "REPURCHASE FLAG",
        "MODIFICATION FLAG",
        "ZERO BALANCE CODE",
        "ZERO BALANCE EFFECTIVE DATE",
        "CURRENT INTEREST RATE",
        "CURRENT DEFERRED UPB",
        "DUE DATE OF LAST PAID INSTALLMENT",
        "MI RECOVERIES",
        "NET SALES PROCEEDS",
        "NON MI RECOVERIES",
        "EXPENSES",
        "LEGAL COSTS",
        "MAINTENANCE AND PRESERVATION COSTS",
        "TAXES AND INSURANCE",
        "MISCELLANEOUS EXPENSES",
        "ACTUAL LOSS CALCULATION",
        "MODIFICATION COST",
    ]

    for i in range(1999, 2017):
        downloads_filePath = downloads_dir + "sample_svcg_" + str(i) + ".txt"
        cleaned_filePath = (cleaned_dir + "cleaned_sample_svcg_"
                            + str(i) + ".csv")
        if not os.path.isfile(cleaned_filePath):
            performance = pd.read_csv(downloads_filePath, sep="|",
                                      header=None, names=names)
            performance.to_csv(cleaned_filePath, sep=',', index=False)

    print("cleaned performance files")
def run(self):
    """Log in to Lending Club and download every loan-stats archive that is
    not already present locally, extracting the CSVs into
    Data/Downloads/LoanData/.

    Prompts for Lending Club credentials on stdin.
    """
    create_directory("Data")
    create_directory("Data/Downloads")
    create_directory("Data/Downloads/LoanData")
    downloads_dir = "Data/Downloads/"

    login_url = "https://www.lendingclub.com/account/gotoLogin.action"
    username = input("Enter your Lending Club username: ")
    password = input("Enter your Lending Club password: ")

    # NOTE(review): the original source was corrupted/redacted around the
    # login sequence; reconstructed following the other MechanicalSoup
    # downloaders in this project — verify against the live site.
    browser = mechanicalsoup.Browser()
    login_page = browser.get(login_url)
    login_form = login_page.soup.find("form", {"id": "member-login"})
    login_form.find("input", {"name": "login_email"})["value"] = username
    login_form.find("input", {"name": "login_password"})["value"] = password
    response = browser.submit(login_form, login_page.url)

    # A successful login redirects to the account page.
    if response.url == "https://www.lendingclub.com/account/myAccount.action":
        folder = "LoanData/"
        link = 'https://www.lendingclub.com/info/download-data.action'
        r = browser.get(link)
        soup = BeautifulSoup(r.text, "html.parser")

        # Pipe-separated list of archive names, plus the public URL prefix.
        loan_names = soup.find('div',
                               {'id': "loanStatsFileNamesJS"}).text.split('|')
        prefix = soup.find('div', {'id': "urlPublicPrefix"}).text

        for name in loan_names:
            if name.strip() == "":
                continue
            # Skip archives whose extracted CSV already exists.
            target_csv = (downloads_dir + folder
                          + name.split('.')[0] + '.csv')
            if os.path.isfile(target_csv):
                continue
            zf = browser.get(prefix + name)
            zipfile.ZipFile(io.BytesIO(zf.content)).extractall(
                path=downloads_dir + folder)
    else:
        print("Try again with correct Lending Club credentials")
def run(self):
    """Summarise the raw yearly origination and performance samples.

    For every column (except the loan id in the origination data) a CSV is
    created under summary/ that accumulates, per distinct value, the row
    count and the year it came from.
    """
    create_directory("summary")
    summary_dir = "summary/"
    downloads_dir = "downloads/"

    # ---------------- origination data ----------------
    headers_orig = [
        "CREDIT SCORE",
        "FIRST PAYMENT DATE",
        "FIRST TIME HOMEBUYER FLAG",
        "MATURITY DATE",
        "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
        "MORTGAGE INSURANCE PERCENTAGE (MI %)",
        "NUMBER OF UNITS",
        "OCCUPANCY STATUS",
        "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
        "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
        "ORIGINAL UPB",
        "ORIGINAL LOAN-TO-VALUE (LTV)",
        "ORIGINAL INTEREST RATE",
        "CHANNEL",
        "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
        "PRODUCT TYPE",
        "PROPERTY STATE",
        "PROPERTY TYPE",
        "POSTAL CODE",
        "LOAN SEQUENCE NUMBER",
        "LOAN PURPOSE",
        "ORIGINAL LOAN TERM",
        "NUMBER OF BORROWERS",
        "SELLER NAME",
        "SERVICER NAME",
        "SUPER CONFORMING FLAG",
    ]

    # Create each per-column summary file with a header row.
    # (The redundant f.close() inside the with-block was removed.)
    for head in headers_orig:
        if head == "LOAN SEQUENCE NUMBER":
            continue
        summary_filePath = (summary_dir + "summary_raw_sample_orig_"
                            + head + ".csv")
        with open(summary_filePath, 'a') as f:
            f.write(head)
            f.write(',COUNT,YEAR\n')

    # Append the per-year value counts to the files created above.
    for i in range(1999, 2017):
        downloads_filePath = downloads_dir + "sample_orig_" + str(i) + ".txt"
        orig_file = pd.read_csv(downloads_filePath, sep="|", header=None,
                                names=headers_orig)
        for head in headers_orig:
            if head == "LOAN SEQUENCE NUMBER":
                continue
            # Count rows per distinct value using the loan id as a tally.
            d = (orig_file.groupby(head)
                 .agg({"LOAN SEQUENCE NUMBER": len})
                 .rename(columns={'LOAN SEQUENCE NUMBER': 'COUNT'}))
            d['Year'] = str(i)
            summary_filePath = (summary_dir + "summary_raw_sample_orig_"
                                + head + ".csv")
            with open(summary_filePath, 'a') as f:
                d.to_csv(f, sep=',', header=False)
    print("SUMMARIZED RAW ORIGINATION FILES")

    # ---------------- performance data ----------------
    headers_svcg = [
        "LOAN SEQUENCE NUMBER",
        "MONTHLY REPORTING PERIOD",
        "CURRENT ACTUAL UPB",
        "CURRENT LOAN DELINQUENCY STATUS",
        "LOAN AGE",
        "REMAINING MONTHS TO LEGAL MATURITY",
        "REPURCHASE FLAG",
        "MODIFICATION FLAG",
        "ZERO BALANCE CODE",
        "ZERO BALANCE EFFECTIVE DATE",
        "CURRENT INTEREST RATE",
        "CURRENT DEFERRED UPB",
        "DUE DATE OF LAST PAID INSTALLMENT",
        "MI RECOVERIES",
        "NET SALES PROCEEDS",
        "NON MI RECOVERIES",
        "EXPENSES",
        "LEGAL COSTS",
        "MAINTENANCE AND PRESERVATION COSTS",
        "TAXES AND INSURANCE",
        "MISCELLANEOUS EXPENSES",
        "ACTUAL LOSS CALCULATION",
        "MODIFICATION COST",
    ]

    for head in headers_svcg:
        summary_filePath = (summary_dir + "summary_raw_sample_svcg_"
                            + head + ".csv")
        with open(summary_filePath, 'a') as f:
            f.write(head)
            f.write(',COUNT,YEAR\n')

    for i in range(1999, 2017):
        downloads_filePath = downloads_dir + "sample_svcg_" + str(i) + ".txt"
        svcg_file = pd.read_csv(downloads_filePath, sep="|", header=None,
                                names=headers_svcg)
        # Promote the row number to an 'index' column used purely for
        # counting (the loan id is not unique in performance data).
        svcg_file = svcg_file.reset_index()
        print(svcg_file[2:3])
        for head in headers_svcg:
            # A redundant whole-frame .count() computed and immediately
            # overwritten in the original was removed here.
            d = (svcg_file.groupby(head)
                 .agg({'index': len})
                 .rename(columns={'index': 'COUNT'}))
            d['Year'] = str(i)
            summary_filePath = (summary_dir + "summary_raw_sample_svcg_"
                                + head + ".csv")
            with open(summary_filePath, 'a') as f:
                d.to_csv(f, sep=',', header=False)
    print("SUMMARIZED RAW PERFORMANCE FILES")
def run(self):
    """Clean this quarter's and the previous quarter's performance files in
    500k-row chunks and append the results to CSVs under cleaned/.

    The quarter/year are read from a 5-digit marker file (e.g. "32005" =
    Q3 2005).  NOTE(review): the marker is globbed from cleaned/ — confirm
    the download step places it there rather than in downloads/.
    """
    create_directory("cleaned")
    cleaned_dir = "cleaned/"
    downloads_dir = "downloads/"

    marker = glob.glob(cleaned_dir + '[0-9][0-9][0-9][0-9][0-9]')[0]
    # Parse once instead of running the same regex twice.
    m = re.search(r'(\d)(\d{4})', marker)
    quarter = int(m.group(1))
    year = int(m.group(2))

    names = [
        "LOAN SEQUENCE NUMBER",
        "MONTHLY REPORTING PERIOD",
        "CURRENT ACTUAL UPB",
        "CURRENT LOAN DELINQUENCY STATUS",
        "LOAN AGE",
        "REMAINING MONTHS TO LEGAL MATURITY",
        "REPURCHASE FLAG",
        "MODIFICATION FLAG",
        "ZERO BALANCE CODE",
        "ZERO BALANCE EFFECTIVE DATE",
        "CURRENT INTEREST RATE",
        "CURRENT DEFERRED UPB",
        "DUE DATE OF LAST PAID INSTALLMENT",
        "MI RECOVERIES",
        "NET SALES PROCEEDS",
        "NON MI RECOVERIES",
        "EXPENSES",
        "LEGAL COSTS",
        "MAINTENANCE AND PRESERVATION COSTS",
        "TAXES AND INSURANCE",
        "MISCELLANEOUS EXPENSES",
        "ACTUAL LOSS CALCULATION",
        "MODIFICATION COST",
    ]

    # Process the previous quarter and the requested quarter.
    for q in range(quarter - 1, quarter + 1):
        downloads_filePath = (downloads_dir + "historical_data1_time_Q"
                              + str(q) + str(year) + ".txt")
        cleaned_filePath = (cleaned_dir + "cleaned_historical_data1_time_Q"
                            + str(q) + str(year) + ".csv")
        if os.path.isfile(cleaned_filePath):
            continue

        # Stream the large file in chunks to bound memory use.
        chunk = 500000
        for performance in pd.read_csv(downloads_filePath, sep="|",
                                       header=None, chunksize=chunk,
                                       iterator=True, index_col=False,
                                       names=names):
            # Column-specific cleaners, applied per chunk in order.
            # NOTE(review): original TODO — year 2000 may need a dedicated
            # conditional cleaner.
            clean_loan_seq_num(performance)
            clean_monthly_reporting_period(performance)
            clean_loan_del_status(performance)
            clean_repurchase_flag(performance)
            clean_modification_flag(performance)
            clean_zero_balance_code(performance)
            clean_zero_balance_effective_date(performance)
            clean_ddlpi(performance)
            replace_all_other_NaNs_With_zero(performance)

            # First chunk writes the header; later chunks append headerless.
            if not os.path.isfile(cleaned_filePath):
                performance.to_csv(cleaned_filePath, sep=',', index=False)
            else:
                with open(cleaned_filePath, 'a') as f:
                    performance.to_csv(f, sep=',', index=False, header=False)

    # Drop marker files once both quarters are cleaned.  `and` replaces the
    # original bitwise `&` on booleans (same result, idiomatic short-circuit).
    done_now = os.path.isfile(cleaned_dir + 'cleaned_historical_data1_time_Q'
                              + str(quarter) + str(year) + '.csv')
    done_prev = os.path.isfile(cleaned_dir + 'cleaned_historical_data1_time_Q'
                               + str(quarter - 1) + str(year) + '.csv')
    if done_now and done_prev:
        open(cleaned_dir + 'cleaned_perf.txt', 'w+').close()
        open(cleaned_dir + str(quarter) + str(year), 'w+').close()
        print("cleaned performance files")
        print("UNCOMMENT THE CODE WHEN SURE OF THIS")
def run(self):
    """Log in to the Freddie Mac loan-level dataset site and download every
    yearly *sample* archive missing locally, extracting into downloads/.

    NOTE(review): the credentials below are redacted placeholders — supply
    real ones, or use the interactive variant of this downloader.
    """
    url = "https://freddiemac.embs.com/FLoan/secure/login.php"
    myusername = '******'
    mypassword = '******'

    browser = mechanicalsoup.Browser()
    login_page = browser.get(url)
    login_form = login_page.soup.find('form', {"name": "loginform"})
    login_form.find("input", {"name": "username"})["value"] = myusername
    login_form.find("input", {"name": "password"})["value"] = mypassword
    response = browser.submit(login_form, login_page.url)
    termsPage = response.soup.find("html")

    # A successful login lands on the "Loan-Level Dataset" page.
    h2 = termsPage.find("h2")
    if not (h2.text == "Loan-Level Dataset"):
        print("Please check your credentials to login")
    else:
        # Accept the terms & conditions form to reach the file listing.
        termsForm = termsPage.find('form')
        termsForm.find("input", {"name": "accept"})["checked"] = True
        response = browser.submit(termsForm, response.url)
        dataPage = response.soup.find("html")
        table = dataPage.find("table", {"class": "table1"})

        files = []
        for row in table.findAll('tr'):
            data = row.findAll('td')
            # Header/spacer rows lack <td>/<a> cells; skip what won't parse.
            # (Narrowed from the original bare `except:`.)
            try:
                files.append([data[0].string, data[0].a['href'],
                              data[2].string])
            except (IndexError, TypeError, AttributeError, KeyError):
                pass
        files = pd.DataFrame(data=files,
                             columns=["fileName", "downloadURL", "fileSize"])

        # Archive links are relative to the listing page's directory.
        downloadURL = re.sub(r'download.php', "", response.url)
        create_directory("downloads")
        dir = "downloads/"
        for index, row in files.iterrows():
            if "sample_" not in row['fileName']:
                continue
            fileURL = downloadURL + row['downloadURL']
            year = re.search(r'sample_(.+?).zip', row['fileName']).group(1)
            filePath1 = dir + "sample_orig_" + year + ".txt"
            filePath2 = dir + "sample_svcg_" + year + ".txt"
            print(filePath1 + filePath2)
            # Download+extract when either half of the pair is missing
            # (`and` replaces the original bitwise `&` on booleans).
            if not (os.path.isfile(filePath1) and os.path.isfile(filePath2)):
                zf = browser.get(fileURL)
                zipfile.ZipFile(io.BytesIO(zf.content)).extractall(path=dir)
        # Moved inside the success branch: the original printed this even
        # when the login failed.
        print("Data downloaded")
def run(self):
    """Load (or rebuild) the combined Lending Club loan dataset, summarise
    the raw download, clean it, and persist cleaned + summary CSVs.

    The combined raw file is cached under Downloads as
    full_downloaded_loandata.xls (CSV content despite the extension).
    """
    create_directory("Data/Cleaned")
    create_directory("Data/Summary")
    downloads_dir_loan = "Data/Downloads/LoanData"
    cleaned_dir = "Data/Cleaned/"
    summary_dir = "Data/Summary/"

    combined_path = downloads_dir_loan + "/full_downloaded_loandata.xls"
    if os.path.isfile(combined_path):
        # Reuse the previously combined file.
        fullData = pd.read_csv(combined_path, sep=",",
                               encoding="ISO-8859-1", low_memory=False)
    else:
        # Combine every per-period CSV in the downloads folder.
        # Anchored pattern: the original r'.csv' also matched names like
        # "reportcsv.txt" since '.' matches any character.
        csv_pattern = re.compile(r'\.csv$')
        fullData = None
        for file in os.listdir(downloads_dir_loan):
            if not csv_pattern.search(file):
                continue
            filePath = downloads_dir_loan + "/" + file
            # First row of each export is a banner line, hence skiprows.
            data = pd.read_csv(filePath, sep=",", skiprows=[0],
                               encoding="ISO-8859-1", low_memory=False)
            # Drop rows that are fully empty or carry a single value.
            data.dropna(how='all', inplace=True)
            data.dropna(thresh=2, inplace=True)
            # Explicit first-iteration check replaces the old bare
            # try/except around pd.concat.
            fullData = data if fullData is None else pd.concat([fullData,
                                                                data])
            data = None  # release the chunk
        # Cache the combined raw dataset for future runs.
        fullData.to_csv(combined_path, sep=',', index=True)

    # Summarise the raw download.
    summary = fullData.describe()
    summary.to_csv(summary_dir + "summary_downloaded_loandata.csv",
                   sep=',', index=True)

    # Clean / preprocess in the established order.
    fullData = remove_columns(fullData)
    fullData = replace_by_median(fullData)
    fullData = replace_by_zero(fullData)
    fullData = add_derived_columns(fullData)
    fullData = add_dummy_variable(fullData)
    fullData = clean_text_columns(fullData)

    fullData.to_csv(cleaned_dir + "cleaned_loandata.csv", sep=',',
                    index=False)

    # Summarise the cleaned dataset too.
    summary = fullData.describe()
    summary.to_csv(summary_dir + "summary_cleaned_loandata.csv",
                   sep=',', index=True)
def run(self):
    """Clean every yearly origination sample (1999-2016) and store the
    result under cleaned/.

    Years whose cleaned file already exists are skipped.
    """
    create_directory("cleaned")
    source_dir = "downloads/"
    target_dir = "cleaned/"

    headers = [
        "CREDIT SCORE",
        "FIRST PAYMENT DATE",
        "FIRST TIME HOMEBUYER FLAG",
        "MATURITY DATE",
        "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
        "MORTGAGE INSURANCE PERCENTAGE (MI %)",
        "NUMBER OF UNITS",
        "OCCUPANCY STATUS",
        "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
        "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
        "ORIGINAL UPB",
        "ORIGINAL LOAN-TO-VALUE (LTV)",
        "ORIGINAL INTEREST RATE",
        "CHANNEL",
        "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
        "PRODUCT TYPE",
        "PROPERTY STATE",
        "PROPERTY TYPE",
        "POSTAL CODE",
        "LOAN SEQUENCE NUMBER",
        "LOAN PURPOSE",
        "ORIGINAL LOAN TERM",
        "NUMBER OF BORROWERS",
        "SELLER NAME",
        "SERVICER NAME",
        "SUPER CONFORMING FLAG",
    ]

    for year in range(1999, 2017):
        source_path = source_dir + "sample_orig_" + str(year) + ".txt"
        target_path = target_dir + "cleaned_sample_orig_" + str(year) + ".csv"
        if os.path.isfile(target_path):
            continue

        df = pd.read_csv(source_path, sep="|", header=None, names=headers)

        # The partial final year has a smaller sample.
        sample_size = 12500 if year == 2016 else 50000

        # Column cleaners, applied in the established order; three of them
        # also need the year and sample size.
        df = clean_credit_score(df, year, sample_size)
        for cleaner in (clean_first_payment_date,
                        clean_first_time_homebuyer_flag,
                        clean_maturity_date,
                        clean_msa_md,
                        clean_mi_percentage,
                        clean_number_of_units,
                        clean_occupancy_status,
                        clean_cltv,
                        clean_dti_ratio):
            df = cleaner(df)
        df = clean_original_upb(df, year, sample_size)
        df = clean_original_ltv(df, year, sample_size)
        for cleaner in (clean_original_interest,
                        clean_channel,
                        clean_ppm_flag,
                        clean_product_type,
                        clean_property_state,
                        clean_property_type,
                        clean_postal_code,
                        clean_loan_seq_num,
                        clean_loan_purpose,
                        clean_orig_loan_term,
                        clean_num_of_borrowers,
                        clean_seller_and_servicer_name,
                        clean_super_conf_flag):
            df = cleaner(df)

        df.to_csv(target_path, sep=',', index=False)

    print("cleaned origination files")
def run(self):
    """Interactively log in to Freddie Mac, ask for a year/quarter, and
    download the four quarterly archives the prediction model needs
    (origination + performance for the chosen and the previous quarter).
    """
    url = "https://freddiemac.embs.com/FLoan/secure/login.php"
    myusername = input("Enter your username: ")
    mypassword = input("Enter your password: ")

    # NOTE(review): the original source was corrupted/redacted around the
    # login sequence; reconstructed after the sibling downloader in this
    # project — verify against the live site.
    browser = mechanicalsoup.Browser()
    login_page = browser.get(url)
    login_form = login_page.soup.find('form', {"name": "loginform"})
    login_form.find("input", {"name": "username"})["value"] = myusername
    login_form.find("input", {"name": "password"})["value"] = mypassword
    response = browser.submit(login_form, login_page.url)
    termsPage = response.soup.find("html")

    h2 = termsPage.find("h2")
    if not (h2.text == "Loan-Level Dataset"):
        print("Please check your credentials to login and try again")
    else:
        termsForm = termsPage.find('form')
        termsForm.find("input", {"name": "accept"})["checked"] = True
        response = browser.submit(termsForm, response.url)
        dataPage = response.soup.find("html")
        table = dataPage.find("table", {"class": "table1"})

        files = []
        for row in table.findAll('tr'):
            data = row.findAll('td')
            # Header/spacer rows lack the expected cells; skip them.
            try:
                files.append([data[0].string, data[0].a['href'],
                              data[2].string])
            except (IndexError, TypeError, AttributeError, KeyError):
                pass

        # Ask until we get a valid year (1999-2016) and quarter (2-4).
        # Initialised first — the original referenced these in the while
        # condition before any assignment (NameError).
        year = 0
        quarter = 0
        while year not in range(1999, 2017) or quarter not in range(2, 5):
            year = input(
                "Please enter the year for which you want to run the model: "
            )
            quarter = input(
                "Please enter the quarter for which you want to predict (2nd,3rd or 4th): "
            )
            try:
                year = int(year)
                quarter = int(quarter)
            except ValueError:
                print("Please enter a valid year and quarter")
                year = 0
                quarter = 0

        files = pd.DataFrame(data=files,
                             columns=["fileName", "downloadURL", "fileSize"])
        # Archive links are relative to the listing page's directory.
        downloadURL = re.sub(r'download.php', "", response.url)
        create_directory("downloads")
        dir = "downloads/"

        filePath1 = "historical_data1_Q" + str(quarter - 1) + str(year)
        filePath2 = "historical_data1_time_Q" + str(quarter - 1) + str(year)
        filePath3 = "historical_data1_Q" + str(quarter) + str(year)
        filePath4 = "historical_data1_time_Q" + str(quarter) + str(year)
        fs = [filePath1, filePath2, filePath3, filePath4]

        for index, row in files.iterrows():
            for f in fs:
                if f in row['fileName'] and not os.path.isfile(
                        dir + f + '.txt'):
                    zf = browser.get(downloadURL + row['downloadURL'])
                    zipfile.ZipFile(io.BytesIO(zf.content)).extractall(
                        path=dir)

        # Marker files tell the cleaning step which quarter was fetched.
        if (os.path.isfile(dir + filePath1 + '.txt')
                and os.path.isfile(dir + filePath2 + '.txt')
                and os.path.isfile(dir + filePath3 + '.txt')
                and os.path.isfile(dir + filePath4 + '.txt')):
            open(dir + 'downloaded.txt', 'w+').close()
            open(dir + str(quarter) + str(year), 'w+').close()
        print("Data downloaded")
def run(self):
    """Build yearly and quarterly summaries (UPB, credit score, CLTV, LTV,
    interest rate, plus a first-time-homebuyer anomaly count) of the
    cleaned origination samples and write them to summary/.

    Each metric group is re-read with `usecols` to limit memory use.
    """
    create_directory("summary")
    summary_filePath = "summary/summary_sample_orig.csv"
    summary_filePath2 = "summary/summary_sample_orig_quarter.csv"
    cleaned_dir = "cleaned/"

    alls = None          # accumulated per-year summaries
    alls_quarter = None  # accumulated per-quarter summaries

    for year in range(1999, 2017):
        cleaned_path = (cleaned_dir + "cleaned_sample_orig_"
                        + str(year) + ".csv")
        # (A first read with unused columns, immediately overwritten in the
        # original, was removed, along with a dead `summary.sum(axis=0)`.)

        # --- UPB sums and averages ---
        summary = pd.read_csv(cleaned_path, usecols=[
            "ORIGINAL UPB", "ORIGINATION YEAR", "ORIGINATION QUARTER"])
        s1 = summary.groupby(["ORIGINATION YEAR"])["ORIGINAL UPB"] \
            .sum().reset_index(name="Sum of UPB per year")
        s3 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "ORIGINAL UPB"].sum().reset_index(
            name="Sum of UPB per year per quarter")
        s4 = summary.groupby(["ORIGINATION YEAR"])["ORIGINAL UPB"] \
            .mean().reset_index(name="Average of UPB per year")
        s6 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "ORIGINAL UPB"].mean().reset_index(
            name="Average of UPB per year per quarter")

        # --- Credit-score averages ---
        summary = pd.read_csv(cleaned_path, usecols=[
            "CREDIT SCORE", "ORIGINATION YEAR", "ORIGINATION QUARTER"])
        s7 = summary.groupby(["ORIGINATION YEAR"])["CREDIT SCORE"] \
            .mean().reset_index(name="Average of Credit Score per year")
        s9 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "CREDIT SCORE"].mean().reset_index(
            name="Average of Credit Score per year per quarter")

        all_year_summary = pd.merge(
            pd.merge(s1, s4, on=["ORIGINATION YEAR"]),
            s7, on=["ORIGINATION YEAR"])
        all_year_quarter_summary = pd.merge(
            pd.merge(s3, s6, on=["ORIGINATION YEAR", "ORIGINATION QUARTER"]),
            s9, on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

        # --- CLTV averages ---
        summary = pd.read_csv(cleaned_path, usecols=[
            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
            "ORIGINATION YEAR", "ORIGINATION QUARTER"])
        s10 = summary.groupby(["ORIGINATION YEAR"])[
            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)"].mean().reset_index(
            name="Average of CLTV per year")
        s12 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)"].mean().reset_index(
            name="Average of CLTV Score per year per quarter")
        all_year_summary = pd.merge(all_year_summary, s10,
                                    on=["ORIGINATION YEAR"])
        all_year_quarter_summary = pd.merge(
            all_year_quarter_summary, s12,
            on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

        # --- LTV averages ---
        summary = pd.read_csv(cleaned_path, usecols=[
            "ORIGINAL LOAN-TO-VALUE (LTV)",
            "ORIGINATION YEAR", "ORIGINATION QUARTER"])
        s13 = summary.groupby(["ORIGINATION YEAR"])[
            "ORIGINAL LOAN-TO-VALUE (LTV)"].mean().reset_index(
            name="Average of LTV per Year")
        s15 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "ORIGINAL LOAN-TO-VALUE (LTV)"].mean().reset_index(
            name="Average of LTV Score per year per quarter")
        # (Duplicate `x = x = pd.merge(...)` assignment fixed.)
        all_year_summary = pd.merge(all_year_summary, s13,
                                    on=["ORIGINATION YEAR"])
        all_year_quarter_summary = pd.merge(
            all_year_quarter_summary, s15,
            on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

        # --- Interest-rate averages ---
        summary = pd.read_csv(cleaned_path, usecols=[
            "ORIGINAL INTEREST RATE",
            "ORIGINATION YEAR", "ORIGINATION QUARTER"])
        s16 = summary.groupby(["ORIGINATION YEAR"])[
            "ORIGINAL INTEREST RATE"].mean().reset_index(
            name="Average of Interest Rate per Year")
        s18 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"])[
            "ORIGINAL INTEREST RATE"].mean().reset_index(
            name="Average of Interest rate per year per quarter")
        all_year_summary = pd.merge(all_year_summary, s16,
                                    on=["ORIGINATION YEAR"])
        all_year_quarter_summary = pd.merge(
            all_year_quarter_summary, s18,
            on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

        # --- Anomaly: first-time-homebuyer 'Y' that cannot be genuine ---
        # (investment/second home occupancy, or refinance loan purpose)
        summary = pd.read_csv(cleaned_path, usecols=[
            "LOAN SEQUENCE NUMBER", "FIRST TIME HOMEBUYER FLAG",
            "OCCUPANCY STATUS", "LOAN PURPOSE"])
        result = summary[(summary["FIRST TIME HOMEBUYER FLAG"] == 'Y')
                         & ((summary["OCCUPANCY STATUS"] == 'I')
                            | (summary["OCCUPANCY STATUS"] == 'S'))
                         & ((summary["LOAN PURPOSE"] == 'C')
                            | (summary["LOAN PURPOSE"] == 'N'))]
        s = int(result["LOAN SEQUENCE NUMBER"].count())
        anomaly = pd.DataFrame({
            "ORIGINATION YEAR": [year],
            "COUNT OF FALSE Y FLAG FOR FIRSTTIME HOMEBUYER": [s]
        })
        # Force a single year value so the anomaly row merges onto every
        # summary row for this file.
        all_year_summary.loc[all_year_summary["ORIGINATION YEAR"] >= 0,
                             'ORIGINATION YEAR'] = year
        all_year_summary = pd.merge(all_year_summary, anomaly,
                                    on=["ORIGINATION YEAR"])

        # Accumulate (explicit first-iteration check replaces the old bare
        # try/except).
        if alls is None:
            alls = all_year_summary
            alls_quarter = all_year_quarter_summary
        else:
            alls = pd.concat([alls, all_year_summary])
            alls_quarter = pd.concat([alls_quarter,
                                      all_year_quarter_summary])

    alls.to_csv(summary_filePath, sep=',', index=False)
    alls_quarter.to_csv(summary_filePath2, sep=',', index=False)
    print("files summarized")
def run(self):
    """Combine, normalise and clean the declined-loan CSVs, then write the
    cleaned data plus before/after summaries.

    The combined raw file is cached under Downloads as
    combined_downloaded_reject_loandata.xls (CSV content despite the
    extension).
    """
    create_directory("Data/Cleaned")
    create_directory("Data/Summary")
    downloads_dir_loan = "Data/Downloads/DeclinedLoanData"
    cleaned_dir = "Data/Cleaned/"
    summary_dir = "Data/Summary/"

    combined_path = (downloads_dir_loan
                     + "/combined_downloaded_reject_loandata.xls")
    if os.path.isfile(combined_path):
        fullData = pd.read_csv(combined_path, sep=",",
                               encoding="ISO-8859-1", low_memory=False)
    else:
        # Combine every per-period CSV in the downloads folder.
        # Anchored pattern: the original r'.csv' also matched names like
        # "reportcsv.txt" since '.' matches any character.
        csv_pattern = re.compile(r'\.csv$')
        fullData = None
        for file in os.listdir(downloads_dir_loan):
            if not csv_pattern.search(file):
                continue
            print(file)
            filePath = downloads_dir_loan + "/" + file
            # First row of each export is a banner line, hence skiprows.
            data = pd.read_csv(filePath, sep=",", skiprows=[0],
                               encoding="ISO-8859-1", low_memory=False)
            # Drop rows that are fully empty or carry a single value.
            data.dropna(how='all', inplace=True)
            data.dropna(thresh=2, inplace=True)
            # Explicit first-iteration check; the old bare try/except also
            # created unused fullData1/fullData2 aliases (removed).
            fullData = data if fullData is None else pd.concat([fullData,
                                                                data])
            data = None  # release the chunk

        # Normalise 'Employment Length' values such as "10+ years",
        # "< 1 year" and "n/a" down to a bare number of years.
        # regex=False makes '+' and '<' literal text — the implicit default
        # differs across pandas versions.
        col = fullData['Employment Length']
        col = col.str.replace('+', '', regex=False)
        col = col.str.replace('<', '', regex=False)
        col = col.str.replace('years', '', regex=False)
        col = col.str.replace('year', '', regex=False)
        col = col.str.replace('n/a', '0', regex=False)
        fullData['Employment Length'] = col.astype(np.int64)
        # Anonymised zip codes arrive as e.g. "021xx".
        fullData['Zip Code'] = fullData['Zip Code'].str.replace(
            'xx', '00', regex=False)

        # Cache the combined raw dataset for future runs.
        fullData.to_csv(combined_path, sep=',', index=False)

    # Summarise the raw download.
    summary = fullData.describe()
    summary.to_csv(summary_dir + "summary_downloaded_reject_loandata.csv",
                   sep=',', index=True)

    # Clean / preprocess.
    fullData = remove_columns(fullData)
    fullData = replace_by_na(fullData)
    fullData = separate_application_date(fullData)
    # Fixed: the original assigned this result to a misspelled 'fulldata',
    # silently discarding replace_by_default's output.
    fullData = replace_by_default(fullData)

    summary = fullData.describe()
    fullData.to_csv(cleaned_dir + "cleaned_reject_loandata.csv",
                    sep=',', index=False)
    summary.to_csv(summary_dir + "summary_cleaned_reject_loandata.csv",
                   sep=',', index=True)