def run(self):
        """Upload the cleaned reject-loan CSV to S3 and, on success, create the results placeholder.

        Reads AWS credentials interactively, uploads the cleaned file if it is
        not already on S3, re-checks S3 to verify the upload, and only then
        creates the empty results file.
        """
        # READ AMAZON KEYS FROM USER
        aws_access_read = input("Enter your aws access key: ")
        aws_secret_read = input("Enter your aws secret key: ")

        aws_access_key = aws_access_read.strip()
        aws_secret_key = aws_secret_read.strip()

        fileName2 = "cleaned_reject_loandata.csv"
        clean_dir = "Data/Cleaned/"
        filePath2 = clean_dir + fileName2

        # Upload only when the file is not already present on S3.
        r = check_if_file_exists(aws_access_key, aws_secret_key, fileName2,
                                 filePath2)
        if not r:
            amazon_upload(aws_access_key, aws_secret_key, filePath2)

        # Check if the upload was successful by re-querying S3 *after* the
        # upload attempt.
        r2 = check_if_file_exists(aws_access_key, aws_secret_key, fileName2,
                                  filePath2)
        # BUG FIX: the original tested the pre-upload flag `r`, so a freshly
        # uploaded file never triggered results-file creation.  Test `r2`.
        if r2:
            create_directory("Data/Results")
            # Create an empty placeholder for later results.
            open("Data/Results/resultsrejectloandata.csv", 'a').close()
  def run(self):
    """Attach headers to the raw origination samples (1999-2016) and store them as CSVs in cleaned/."""
    create_directory("cleaned")
    cleaned_dir = "cleaned/"
    downloads_dir = "downloads/"

    # Column headers for the pipe-delimited origination sample files.
    orig_columns = [
        "CREDIT SCORE",
        "FIRST PAYMENT DATE",
        "FIRST TIME HOMEBUYER FLAG",
        "MATURITY DATE",
        "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
        "MORTGAGE INSURANCE PERCENTAGE (MI %)",
        "NUMBER OF UNITS",
        "OCCUPANCY STATUS",
        "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
        "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
        "ORIGINAL UPB",
        "ORIGINAL LOAN-TO-VALUE (LTV)",
        "ORIGINAL INTEREST RATE",
        "CHANNEL",
        "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
        "PRODUCT TYPE",
        "PROPERTY STATE",
        "PROPERTY TYPE",
        "POSTAL CODE",
        "LOAN SEQUENCE NUMBER",
        "LOAN PURPOSE",
        "ORIGINAL LOAN TERM",
        "NUMBER OF BORROWERS",
        "SELLER NAME",
        "SERVICER NAME",
        "Super Conforming Flag",
    ]

    # Iterate through all the years; skip years whose cleaned file exists.
    for year in range(1999, 2017):
      raw_path = downloads_dir + "sample_orig_" + str(year) + ".txt"
      out_path = cleaned_dir + "cleaned_sample_orig_" + str(year) + ".csv"

      if os.path.isfile(out_path):
        continue

      # Load the raw pipe-delimited file and attach the headers.
      frame = pd.read_csv(raw_path, sep="|", header=None, names=orig_columns)

      # (No per-column cleaning implemented in the original.)

      # Save the dataframe to the cleaned directory.
      frame.to_csv(out_path, sep=',', index=False)

    print("cleaned origination files")
# ----- Example 3 (scraped sample separator) -----
    def run(self):
        """Clean the 1999-2016 performance (svcg) sample files into cleaned/."""
        create_directory("cleaned")
        cleaned_dir = "cleaned/"
        downloads_dir = "downloads/"

        # Column headers for the pipe-delimited performance files.
        perf_columns = [
            "LOAN SEQUENCE NUMBER", "MONTHLY REPORTING PERIOD",
            "CURRENT ACTUAL UPB", "CURRENT LOAN DELINQUENCY STATUS",
            "LOAN AGE", "REMAINING MONTHS TO LEGAL MATURITY",
            "REPURCHASE FLAG", "MODIFICATION FLAG", "ZERO BALANCE CODE",
            "ZERO BALANCE EFFECTIVE DATE", "CURRENT INTEREST RATE",
            "CURRENT DEFERRED UPB", "DUE DATE OF LAST PAID INSTALLMENT",
            "MI RECOVERIES", "NET SALES PROCEEDS", "NON MI RECOVERIES",
            "EXPENSES", "LEGAL COSTS", "MAINTENANCE AND PRESERVATION COSTS",
            "TAXES AND INSURANCE", "MISCELLANEOUS EXPENSES",
            "ACTUAL LOSS CALCULATION", "MODIFICATION COST"
        ]

        # Iterate through all the years; skip years already cleaned.
        for year in range(1999, 2017):
            raw_path = downloads_dir + "sample_svcg_" + str(year) + ".txt"
            out_path = (cleaned_dir + "cleaned_sample_svcg_" + str(year) +
                        ".csv")
            if os.path.isfile(out_path):
                continue

            # Load the pipe-delimited performance data with headers attached.
            performance = pd.read_csv(raw_path, sep="|", header=None,
                                      names=perf_columns)

            # Clean the performance file (each cleaner mutates in place).
            # NEEDS A CONDITIONAL FUNCTION FOR YEAR 2000
            for cleaner in (clean_monthly_reporting_period,
                            clean_loan_del_status,
                            clean_repurchase_flag,
                            clean_modification_flag,
                            clean_zero_balance_code,
                            clean_zero_balance_effective_date,
                            clean_ddlpi,
                            replace_all_other_NaNs_With_zero):
                cleaner(performance)

            # Save the cleaned dataframe.
            performance.to_csv(out_path, sep=',', index=False)

        print("cleaned performance files")


# __________________________________________________________________________________________________________________________________________
    def run(self):
        """Prepare the summary directory; the summarization itself is a stub."""
        create_directory("summary")
        summary_dir = "summary/"
        cleaned_dir = "cleaned/"

        # TODO: iterate over 1999-2016, load each cleaned origination file,
        # summarize it, and write summary_orig_<year>.csv.  The original left
        # this commented out / unimplemented.
        print("Building Prediction Model")
# ----- Example 5 (scraped sample separator) -----
    def run(self):
        """Load the raw iris dataset and write it back out as a summary CSV."""
        create_directory("Summary")
        downloads_dir = "Data/Downloads/"
        summary_dir = "Data/Summary/"

        # Read the comma-separated iris data, supplying column names.
        iris_columns = [
            'sepal_length_in_cm', 'sepal_width_in_cm',
            'petal_length_in_cm', 'petal_width_in_cm',
            'iris_class'
        ]
        irisData = pd.read_csv(downloads_dir + 'irisdataset.data',
                               sep=",", header=None, names=iris_columns)

        # (Summarization step not implemented in the original.)

        # Persist the dataframe.
        irisData.to_csv(summary_dir + "summary_irisdataset.csv",
                        sep=',', index=False)
# ----- Example 6 (scraped sample separator) -----
    def run(self):
        """Attach headers to the 1999-2016 performance (svcg) samples and save
        them as CSVs in cleaned/ (no per-column cleaning yet)."""
        create_directory("cleaned")
        cleaned_dir = "cleaned/"
        downloads_dir = "downloads/"

        svcg_columns = [
            "LOAN SEQUENCE NUMBER", "MONTHLY REPORTING PERIOD",
            "CURRENT ACTUAL UPB", "CURRENT LOAN DELINQUENCY STATUS",
            "LOAN AGE", "REMAINING MONTHS TO LEGAL MATURITY",
            "REPURCHASE FLAG", "MODIFICATION FLAG", "ZERO BALANCE CODE",
            "ZERO BALANCE EFFECTIVE DATE", "CURRENT INTEREST RATE",
            "CURRENT DEFERRED UPB", "DUE DATE OF LAST PAID INSTALLMENT",
            "MI RECOVERIES", "NET SALES PROCEEDS", "NON MI RECOVERIES",
            "EXPENSES", "Legal Costs", "Maintenance and Preservation Costs",
            "Taxes and Insurance", "Miscellaneous Expenses",
            "Actual Loss Calculation", "Modification Cost"
        ]

        # Iterate through all the years; skip years whose output exists.
        for year in range(1999, 2017):
            raw_path = downloads_dir + "sample_svcg_" + str(year) + ".txt"
            out_path = (cleaned_dir + "cleaned_sample_svcg_" + str(year) +
                        ".csv")
            if os.path.isfile(out_path):
                continue

            # Load the pipe-delimited performance data with headers attached.
            performance = pd.read_csv(raw_path, sep="|", header=None,
                                      names=svcg_columns)

            # (Cleaning step intentionally left empty in the original.)

            # Save the dataframe to the cleaned directory.
            performance.to_csv(out_path, sep=',', index=False)

        print("cleaned performance files")
# ----- Example 7 (scraped sample separator) -----
    def run(self):
        """Log in to Lending Club and download the loan-stats archives.

        Creates the Data/Downloads/LoanData directory tree, logs in with
        interactively-entered credentials, then downloads and extracts every
        loan-stats zip not already extracted locally.

        NOTE(review): this block was corrupted when scraped -- the line that
        reads the password also contains fragments of the login-form lookup
        (a '******' run replaced the original code), so `browser`,
        `login_page`, and `login_form` are never defined in what remains.
        The original mechanicalsoup login sequence must be restored before
        this method can run.
        """
        create_directory("Data")
        create_directory("Data/Downloads")
        create_directory("Data/Downloads/LoanData")
        # create_directory("Data/Downloads/DeclinedLoanData")
        downloads_dir = "Data/Downloads/"
        url = "https://www.lendingclub.com/account/gotoLogin.action"
        # login credentials (the "usenname" typo is part of the runtime prompt)
        username = input("Enter your Lending Club usenname: ")
        # Corrupted line -- see the NOTE in the docstring.
        password = input("Enter your Lending Club password: "******"id": "member-login"})
        # Fill the login form fields and submit.
        login_form.find("input", {"name": "login_email"})["value"] = username
        login_form.find("input",
                        {"name": "login_password"})["value"] = password
        response = browser.submit(login_form, login_page.url)

        # Landing on myAccount.action indicates a successful login.
        if (response.url ==
                "https://www.lendingclub.com/account/myAccount.action"):

            # CODE TO DOWNLOAD DATASET INTO THE DIRECTORY
            url = "https://www.lendingclub.com/info/download-data.action"

            folder1 = "LoanData/"
            # folder2 = "DeclinedLoanData/"
            link = 'https://www.lendingclub.com/info/download-data.action'
            r = browser.get(link)
            soup = BeautifulSoup(r.text, "html.parser")
            # Pipe-separated list of loan-stats file names embedded in the page.
            loan_namelist = (soup.find('div',
                                       {'id': "loanStatsFileNamesJS"})).text
            # NOTE(review): `parts` is an unused alias of the same list.
            loan_names = parts = loan_namelist.split('|')
            prefix = (soup.find('div', {'id': "urlPublicPrefix"})).text
            # Download each loan-stats archive.
            i = 0
            for name in loan_names:
                if (name.strip() != ""):
                    i = i + 1  # counter is incremented but never read afterwards
                    url_n = (prefix + name)
                    # Skip archives whose CSV has already been extracted.
                    if not (os.path.isfile(downloads_dir + folder1 +
                                           name.split('.')[0] + '.csv')):
                        zf = browser.get(url_n)
                        z = zipfile.ZipFile(io.BytesIO(zf.content))
                        z.extractall(path=downloads_dir + folder1)
        else:
            print("Try again with correct Lending Club credentials")
# ----- Example 8 (scraped sample separator) -----
    def run(self):
        """Summarize the raw origination and performance samples (1999-2016).

        For each column (the loan id is skipped on the origination side),
        appends per-value row counts tagged with the year to
        summary/summary_raw_sample_<orig|svcg>_<COLUMN>.csv.
        """
        create_directory("summary")
        summary_dir = "summary/"
        downloads_dir = "downloads/"

        # FOR ORIGINATION DATA
        headers_orig = [
            "CREDIT SCORE",
            "FIRST PAYMENT DATE",
            "FIRST TIME HOMEBUYER FLAG",
            "MATURITY DATE",
            "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
            "MORTGAGE INSURANCE PERCENTAGE (MI %)",
            "NUMBER OF UNITS",
            "OCCUPANCY STATUS",
            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
            "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
            "ORIGINAL UPB",
            "ORIGINAL LOAN-TO-VALUE (LTV)",
            "ORIGINAL INTEREST RATE",
            "CHANNEL",
            "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
            "PRODUCT TYPE",
            "PROPERTY STATE",
            "PROPERTY TYPE",
            "POSTAL CODE",
            "LOAN SEQUENCE NUMBER",
            "LOAN PURPOSE",
            "ORIGINAL LOAN TERM",
            "NUMBER OF BORROWERS",
            "SELLER NAME",
            "SERVICER NAME",
            "SUPER CONFORMING FLAG",
        ]

        # Create the per-column summary files and write their header rows.
        # (`with` already closes the file; the explicit close() the original
        # called inside the with-block was redundant and has been dropped.)
        for head in headers_orig:
            if head != "LOAN SEQUENCE NUMBER":
                summary_filePath = (summary_dir + "summary_raw_sample_orig_" +
                                    head + ".csv")
                with open(summary_filePath, 'a') as f:
                    f.write(head)
                    f.write(',COUNT,YEAR\n')

        # Append one year's worth of per-value counts to each file.
        for i in range(1999, 2017):
            downloads_filePath = (downloads_dir + "sample_orig_" + str(i) +
                                  ".txt")

            orig_file = pd.read_csv(downloads_filePath,
                                    sep="|",
                                    header=None,
                                    names=headers_orig)

            for head in headers_orig:
                if head != "LOAN SEQUENCE NUMBER":
                    # Count rows per distinct value; the loan id is used as an
                    # always-present column to count over.
                    d = orig_file.groupby(head) \
                                 .agg({"LOAN SEQUENCE NUMBER": len}) \
                                 .rename(columns={'LOAN SEQUENCE NUMBER': 'COUNT'})
                    d['Year'] = str(i)
                    summary_filePath = (summary_dir +
                                        "summary_raw_sample_orig_" + head +
                                        ".csv")
                    with open(summary_filePath, 'a') as f:
                        d.to_csv(f, sep=',', header=False)
        print("SUMMARIZED RAW ORIGINATION FILES")

        # FOR PERFORMANCE DATA
        headers_svcg = [
            "LOAN SEQUENCE NUMBER",
            "MONTHLY REPORTING PERIOD",
            "CURRENT ACTUAL UPB",
            "CURRENT LOAN DELINQUENCY STATUS",
            "LOAN AGE",
            "REMAINING MONTHS TO LEGAL MATURITY",
            "REPURCHASE FLAG",
            "MODIFICATION FLAG",
            "ZERO BALANCE CODE",
            "ZERO BALANCE EFFECTIVE DATE",
            "CURRENT INTEREST RATE",
            "CURRENT DEFERRED UPB",
            "DUE DATE OF LAST PAID INSTALLMENT",
            "MI RECOVERIES",
            "NET SALES PROCEEDS",
            "NON MI RECOVERIES",
            "EXPENSES",
            "LEGAL COSTS",
            "MAINTENANCE AND PRESERVATION COSTS",
            "TAXES AND INSURANCE",
            "MISCELLANEOUS EXPENSES",
            "ACTUAL LOSS CALCULATION",
            "MODIFICATION COST",
        ]

        # Create the per-column summary files (loan id included this time).
        for head in headers_svcg:
            summary_filePath = (summary_dir + "summary_raw_sample_svcg_" +
                                head + ".csv")
            with open(summary_filePath, 'a') as f:
                f.write(head)
                f.write(',COUNT,YEAR\n')

        # Append per-year counts for the performance files.
        for i in range(1999, 2017):
            downloads_filePath = (downloads_dir + "sample_svcg_" + str(i) +
                                  ".txt")

            svcg_file = pd.read_csv(downloads_filePath,
                                    sep="|",
                                    header=None,
                                    names=headers_svcg)
            # reset_index materializes an 'index' column used as the count
            # target below.
            svcg_file = svcg_file.reset_index()
            print(svcg_file[2:3])  # debug peek kept from the original
            for head in headers_svcg:
                # BUG FIX: the original first computed
                # `svcg_file.groupby(head).count()` and immediately discarded
                # the result -- one wasted full pass per column.  Only the agg
                # below is needed.
                d = svcg_file.groupby(head) \
                             .agg({'index': len}) \
                             .rename(columns={'index': 'COUNT'})
                d['Year'] = str(i)
                summary_filePath = (summary_dir + "summary_raw_sample_svcg_" +
                                    head + ".csv")
                with open(summary_filePath, 'a') as f:
                    d.to_csv(f, sep=',', header=False)
        print("SUMMARIZED RAW PERFORMANCE FILES")
    def run(self):
        """Clean the two latest quarters of historical performance data.

        A marker file named <q><yyyy> (one quarter digit + four year digits)
        in cleaned/ selects the quarter and year; quarters q-1 and q are then
        cleaned in 500k-row chunks and appended to CSVs in cleaned/.  Once
        both CSVs exist, marker files are written and success is reported.
        """
        create_directory("cleaned")
        cleaned_dir = "cleaned/"
        downloads_dir = "downloads/"

        # Locate the 5-digit marker file and split it into quarter + year.
        # (The original ran the same regex twice; searched once here.
        # An unused `perf = None` local was also removed.)
        quarterandyear = glob.glob(cleaned_dir +
                                   '[0-9][0-9][0-9][0-9][0-9]')[0]
        match = re.search(r'(\d)(\d{4})', quarterandyear)
        quarter = int(match.group(1))
        year = int(match.group(2))

        perf_columns = [
            "LOAN SEQUENCE NUMBER",
            "MONTHLY REPORTING PERIOD",
            "CURRENT ACTUAL UPB",
            "CURRENT LOAN DELINQUENCY STATUS",
            "LOAN AGE",
            "REMAINING MONTHS TO LEGAL MATURITY",
            "REPURCHASE FLAG",
            "MODIFICATION FLAG",
            "ZERO BALANCE CODE",
            "ZERO BALANCE EFFECTIVE DATE",
            "CURRENT INTEREST RATE",
            "CURRENT DEFERRED UPB",
            "DUE DATE OF LAST PAID INSTALLMENT",
            "MI RECOVERIES",
            "NET SALES PROCEEDS",
            "NON MI RECOVERIES",
            "EXPENSES",
            "LEGAL COSTS",
            "MAINTENANCE AND PRESERVATION COSTS",
            "TAXES AND INSURANCE",
            "MISCELLANEOUS EXPENSES",
            "ACTUAL LOSS CALCULATION",
            "MODIFICATION COST",
        ]

        # Clean the previous quarter and the current one.
        for q in range(quarter - 1, quarter + 1):
            downloads_filePath = (downloads_dir + "historical_data1_time_Q" +
                                  str(q) + str(year) + ".txt")
            cleaned_filePath = (cleaned_dir +
                                "cleaned_historical_data1_time_Q" + str(q) +
                                str(year) + ".csv")

            if not os.path.isfile(cleaned_filePath):
                # Stream the large file in chunks to bound memory use.
                chunk = 500000
                for performance in pd.read_csv(downloads_filePath,
                                               sep="|",
                                               header=None,
                                               chunksize=chunk,
                                               iterator=True,
                                               index_col=False,
                                               names=perf_columns):

                    # CLEAN THE PERFORMANCE FILE (each cleaner mutates the
                    # chunk in place).
                    # NEEDS A CONDITIONAL FUNCTION FOR YEAR 2000
                    clean_loan_seq_num(performance)
                    clean_monthly_reporting_period(performance)
                    clean_loan_del_status(performance)
                    clean_repurchase_flag(performance)
                    clean_modification_flag(performance)
                    clean_zero_balance_code(performance)
                    clean_zero_balance_effective_date(performance)
                    clean_ddlpi(performance)
                    replace_all_other_NaNs_With_zero(performance)

                    # First chunk writes the CSV with a header; subsequent
                    # chunks append without one.
                    if not os.path.isfile(cleaned_filePath):
                        performance.to_csv(cleaned_filePath,
                                           sep=',',
                                           index=False)
                    else:
                        with open(cleaned_filePath, 'a') as f:
                            performance.to_csv(f,
                                               sep=',',
                                               index=False,
                                               header=False)

        # Both quarters done -> create the marker files.
        # (Originally used bitwise `&`; `and` is equivalent on the booleans
        # returned by isfile and short-circuits.)
        if (os.path.isfile(cleaned_dir + 'cleaned_historical_data1_time_Q' +
                           str(quarter) + str(year) + '.csv')
                and os.path.isfile(cleaned_dir +
                                   'cleaned_historical_data1_time_Q' +
                                   str(quarter - 1) + str(year) + '.csv')):
            open(cleaned_dir + 'cleaned_perf.txt', 'w+').close()
            open(cleaned_dir + str(quarter) + str(year), 'w+').close()
            print("cleaned performance files")

        print("UNCOMMENT THE CODE WHEN SURE OF THIS")

# __________________________________________________________________________________________________________________________________________
# ----- Example 10 (scraped sample separator) -----
    def run(self):
        """Log in to the Freddie Mac loan-level dataset site and download the
        sample archives into downloads/, extracting each zip in place.

        NOTE(review): the credentials below look like redacted placeholders
        ('******') -- replace with real values or prompt the user.
        """
        url = "https://freddiemac.embs.com/FLoan/secure/login.php"

        myusername = '******'
        mypassword = '******'

        # Create the stateful browser used for the whole session.
        browser = mechanicalsoup.Browser()

        # Fill and submit the login form.
        login_page = browser.get(url)
        login_form = login_page.soup.find('form', {"name": "loginform"})
        login_form.find("input", {"name": "username"})["value"] = myusername
        login_form.find("input", {"name": "password"})["value"] = mypassword

        # Logging in
        response = browser.submit(login_form, login_page.url)
        termsPage = response.soup.find("html")

        # A successful login lands on the "Loan-Level Dataset" page.
        h2 = termsPage.find("h2")

        if not (h2.text == "Loan-Level Dataset"):
            print("Please check your credentials to login")
        else:
            # Accept the terms and conditions and submit that form.
            termsForm = termsPage.find('form')
            termsForm.find("input", {"name": "accept"})["checked"] = True
            response = browser.submit(termsForm, response.url)

            dataPage = response.soup.find("html")

            # Scrape the file table: (name, relative download URL, size).
            table = dataPage.find("table", {"class": "table1"})
            files = []
            for row in table.findAll('tr'):
                try:
                    data = row.findAll('td')
                    file = [data[0].string, data[0].a['href'], data[2].string]
                    files.append(file)
                # BUG FIX: was a bare `except:` -- only skip header/malformed
                # rows missing the expected cells or anchor, instead of
                # swallowing every possible error.
                except (IndexError, KeyError, TypeError, AttributeError):
                    pass

            files = pd.DataFrame(
                data=files, columns=["fileName", "downloadURL", "fileSize"])

            # Base URL for downloads = response URL minus the page name.
            pattern = r'download.php'
            downloadURL = re.sub(pattern, "", response.url)

            create_directory("downloads")
            # Renamed from `dir`, which shadowed the builtin.
            out_dir = "downloads/"
            for index, row in files.iterrows():
                # Only the sample_* archives are wanted.
                if ("sample_" in row['fileName']):
                    # Build the absolute url for this sample file.
                    fileURL = downloadURL + row['downloadURL']
                    year = re.search(r'sample_(.+?).zip',
                                     row['fileName']).group(1)
                    filePath1 = out_dir + "sample_orig_" + year + ".txt"
                    filePath2 = out_dir + "sample_svcg_" + year + ".txt"
                    print(filePath1 + filePath2)
                    # Skip years already extracted.  (Originally bitwise `&`;
                    # `and` is equivalent on booleans and short-circuits.)
                    if not (os.path.isfile(filePath1)
                            and os.path.isfile(filePath2)):
                        zf = browser.get(fileURL)
                        z = zipfile.ZipFile(io.BytesIO(zf.content))
                        z.extractall(path=out_dir)
        print("Data downloaded")
# ----- Example 11 (scraped sample separator) -----
    def run(self):
        """Combine the downloaded Lending Club loan CSVs, clean the result,
        and write cleaned + summary outputs under Data/.

        Reuses the previously combined raw file when it exists; otherwise
        concatenates every CSV in Data/Downloads/LoanData and caches the
        combined frame before cleaning.
        """
        create_directory("Data/Cleaned")
        create_directory("Data/Summary")
        downloads_dir_loan = "Data/Downloads/LoanData"
        cleaned_dir = "Data/Cleaned/"
        summary_dir = "Data/Summary/"

        # Reuse the previously combined raw dataset when available.
        if (os.path.isfile(downloads_dir_loan +
                           "/full_downloaded_loandata.xls")):
            fullData = pd.read_csv(downloads_dir_loan +
                                   "/full_downloaded_loandata.xls",
                                   sep=",",
                                   encoding="ISO-8859-1",
                                   low_memory=False)

        else:
            # Loop over the CSVs, loading and accumulating the data.
            # NOTE(review): the pattern r'.csv' also matches names like
            # "Xcsv" anywhere in the name because the dot is unescaped --
            # kept as-is for compatibility; compile hoisted out of the loop.
            regexp = re.compile(r'.csv')
            ls_dir = os.listdir(downloads_dir_loan)
            fullData = None
            for file in ls_dir:
                # Only process files matching the csv pattern.
                if (regexp.search(file)):

                    filePath = downloads_dir_loan + "/" + file
                    # Read the CSV, skipping the one-line preamble row.
                    data = pd.read_csv(filePath,
                                       sep=",",
                                       skiprows=[0],
                                       encoding="ISO-8859-1",
                                       low_memory=False)

                    # Drop rows that are entirely null or have values in
                    # fewer than 2 columns.
                    data.dropna(how='all', inplace=True)
                    data.dropna(thresh=2, inplace=True)

                    # BUG FIX: the original wrapped concat in a bare
                    # try/except purely to handle the first iteration, which
                    # also silently discarded accumulated data if concat ever
                    # failed for a real reason.  Test for None explicitly.
                    if fullData is None:
                        fullData = data
                    else:
                        fullData = pd.concat([fullData, data])

                    data = None  # release the chunk before the next read

            # Cache the combined raw dataset in Downloads -- also handy to view.
            fullData.to_csv(downloads_dir_loan +
                            "/full_downloaded_loandata.xls",
                            sep=',',
                            index=True)

        # Describe the downloaded dataset and save the summary.
        summary = fullData.describe()
        summary.to_csv(summary_dir + "summary_downloaded_loandata.csv",
                       sep=',',
                       index=True)

        # CLEAN DATA (each step returns a new frame).
        fullData = remove_columns(fullData)
        fullData = replace_by_median(fullData)
        fullData = replace_by_zero(fullData)
        fullData = add_derived_columns(fullData)
        fullData = add_dummy_variable(fullData)
        fullData = clean_text_columns(fullData)

        # Save the cleaned / preprocessed data.
        fullData.to_csv(cleaned_dir + "cleaned_loandata.csv",
                        sep=',',
                        index=False)

        # Summarize and save the cleaned dataset.
        summary = fullData.describe()
        summary.to_csv(summary_dir + "summary_cleaned_loandata.csv",
                       sep=',',
                       index=True)
    def run(self):
        """Clean the raw origination samples for 1999-2016 and store them as
        CSVs in cleaned/."""
        create_directory("cleaned")
        cleaned_dir = "cleaned/"
        downloads_dir = "downloads/"

        orig_columns = [
            "CREDIT SCORE",
            "FIRST PAYMENT DATE",
            "FIRST TIME HOMEBUYER FLAG",
            "MATURITY DATE",
            "METROPOLITAN STATISTICAL AREA (MSA) OR METROPOLITAN DIVISION",
            "MORTGAGE INSURANCE PERCENTAGE (MI %)",
            "NUMBER OF UNITS",
            "OCCUPANCY STATUS",
            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
            "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
            "ORIGINAL UPB",
            "ORIGINAL LOAN-TO-VALUE (LTV)",
            "ORIGINAL INTEREST RATE",
            "CHANNEL",
            "PREPAYMENT PENALTY MORTGAGE (PPM) FLAG",
            "PRODUCT TYPE",
            "PROPERTY STATE",
            "PROPERTY TYPE",
            "POSTAL CODE",
            "LOAN SEQUENCE NUMBER",
            "LOAN PURPOSE",
            "ORIGINAL LOAN TERM",
            "NUMBER OF BORROWERS",
            "SELLER NAME",
            "SERVICER NAME",
            "SUPER CONFORMING FLAG",
        ]

        # Iterate through all the years; skip years already cleaned.
        for year in range(1999, 2017):
            raw_path = downloads_dir + "sample_orig_" + str(year) + ".txt"
            out_path = (cleaned_dir + "cleaned_sample_orig_" + str(year) +
                        ".csv")

            if os.path.isfile(out_path):
                continue

            # Load the pipe-delimited origination data with headers attached.
            orig_file = pd.read_csv(raw_path, sep="|", header=None,
                                    names=orig_columns)

            # Size parameter handed to the year-aware cleaners: 50000 for
            # every year except 2016, which uses 12500.
            t = 12500 if year == 2016 else 50000

            # Column-cleaning pipeline, in the original order.  Three of the
            # cleaners take (frame, year, t); the rest take just the frame.
            orig_file = clean_credit_score(orig_file, year, t)
            for cleaner in (clean_first_payment_date,
                            clean_first_time_homebuyer_flag,
                            clean_maturity_date,
                            clean_msa_md,
                            clean_mi_percentage,
                            clean_number_of_units,
                            clean_occupancy_status,
                            clean_cltv,
                            clean_dti_ratio):
                orig_file = cleaner(orig_file)
            orig_file = clean_original_upb(orig_file, year, t)
            orig_file = clean_original_ltv(orig_file, year, t)
            for cleaner in (clean_original_interest,
                            clean_channel,
                            clean_ppm_flag,
                            clean_product_type,
                            clean_property_state,
                            clean_property_type,
                            clean_postal_code,
                            clean_loan_seq_num,
                            clean_loan_purpose,
                            clean_orig_loan_term,
                            clean_num_of_borrowers,
                            clean_seller_and_servicer_name,
                            clean_super_conf_flag):
                orig_file = cleaner(orig_file)

            # Save the cleaned dataframe.
            orig_file.to_csv(out_path, sep=',', index=False)

        print("cleaned origination files")
    def run(self):
        """Log in to the Freddie Mac loan-level dataset site, accept the
        terms, ask the user for a year/quarter, and download the four
        matching sample archives into downloads/.

        Side effects: prompts on stdin, network requests, files written
        under downloads/ (including 'downloaded.txt' marker files).
        """
        url = "https://freddiemac.embs.com/FLoan/secure/login.php"

        # READ CREDENTIALS FROM USER
        myusername = input("Enter your username: ")
        mypassword = input("Enter your password: ")

        # NOTE(review): the original source was corrupted by credential
        # redaction here; reconstructed assuming a mechanicalsoup.Browser
        # session (matches the browser.submit(...)/response.soup usage
        # below) -- confirm against the project's top-of-file imports.
        browser = mechanicalsoup.Browser()
        login_page = browser.get(url)
        login_form = login_page.soup.find("form", {"name": "loginform"})
        login_form.find("input", {"name": "username"})["value"] = myusername
        login_form.find("input", {"name": "password"})["value"] = mypassword

        # Logging in
        response = browser.submit(login_form, login_page.url)
        termsPage = response.soup.find("html")

        # Confirming login: on success the landing page's <h2> names the
        # dataset; anything else means the credentials were rejected.
        h2 = termsPage.find("h2")

        if not (h2.text == "Loan-Level Dataset"):
            print("Please check your credentials to login and try again")
        else:
            termsForm = termsPage.find('form')
            termsForm.find("input", {"name": "accept"})["checked"] = True

            # Submitting form on terms and conditions page
            response = browser.submit(termsForm, response.url)

            dataPage = response.soup.find("html")

            table = dataPage.find("table", {"class": "table1"})

            # Collect [fileName, downloadURL, fileSize] for each data row.
            files = []
            for row in table.findAll('tr'):
                try:
                    data = row.findAll('td')
                    file = [data[0].string, data[0].a['href'], data[2].string]
                    files.append(file)
                except (IndexError, TypeError, KeyError, AttributeError):
                    # Header / malformed rows lack <td> or <a>; skip them.
                    pass

            # TAKE YEAR AND QUARTER FROM USER
            # BUG FIX: year/quarter were read by the while-condition before
            # ever being assigned (NameError); start them out-of-range so
            # the loop runs at least once.
            year = 0
            quarter = 0
            while (year not in range(1999, 2017)
                   or quarter not in range(2, 5)):
                year = input(
                    "Please enter the year for which you want to run the model: "
                )
                quarter = input(
                    "Please enter the quarter for which you want to predict (2nd,3rd or 4th): "
                )

                try:
                    year = int(year)
                    quarter = int(quarter)
                except ValueError:
                    # Non-numeric input: reset so the loop re-prompts.
                    print("Please enter a valid year and quarter")
                    year = 0
                    quarter = 0

            files = pd.DataFrame(
                data=files, columns=["fileName", "downloadURL", "fileSize"])

            # Base URL for download links: strip the trailing page name.
            pattern = r'download.php'
            downloadURL = re.sub(pattern, "", response.url)

            create_directory("downloads")
            # Renamed from `dir` -- that shadowed the builtin.
            downloads_dir = "downloads/"
            # We need the chosen quarter and the one before it, in both the
            # origination and the monthly-performance ("time") variants.
            filePath1 = "historical_data1_Q" + str(quarter - 1) + str(year)
            filePath2 = "historical_data1_time_Q" + str(quarter - 1) + str(year)
            filePath3 = "historical_data1_Q" + str(quarter) + str(year)
            filePath4 = "historical_data1_time_Q" + str(quarter) + str(year)
            fs = [filePath1, filePath2, filePath3, filePath4]

            # Download + extract each wanted archive, unless already present.
            for index, row in files.iterrows():
                for f in fs:
                    if f in row['fileName']:
                        fileURL = downloadURL + row['downloadURL']
                        if not os.path.isfile(downloads_dir + f + '.txt'):
                            zf = browser.get(fileURL)
                            z = zipfile.ZipFile(io.BytesIO(zf.content))
                            z.extractall(path=downloads_dir)

            # Marker files tell later pipeline stages the download finished.
            # (Was `&` between booleans; `and` is the correct operator.)
            if (os.path.isfile(downloads_dir + filePath1 + '.txt')
                    and os.path.isfile(downloads_dir + filePath2 + '.txt')
                    and os.path.isfile(downloads_dir + filePath3 + '.txt')
                    and os.path.isfile(downloads_dir + filePath4 + '.txt')):
                open(downloads_dir + 'downloaded.txt', 'w+').close()
                open(downloads_dir + str(quarter) + str(year), 'w+').close()
                print("Data downloaded")
    def run(self):
        """Summarise the cleaned origination files (1999-2016).

        For each year, computes UPB sums/means, and mean credit score,
        CLTV, LTV and interest rate, per year and per year/quarter, plus a
        count of suspicious first-time-homebuyer 'Y' flags; writes the
        stacked results to summary/summary_sample_orig.csv and
        summary/summary_sample_orig_quarter.csv.
        """
        create_directory("summary")
        summary_filePath = "summary/summary_sample_orig.csv"
        summary_filePath2 = "summary/summary_sample_orig_quarter.csv"
        cleaned_dir = "cleaned/"

        def _means(frame, col, year_name, quarter_name):
            """Per-year and per-year/quarter mean of `col`, as two frames."""
            by_year = frame.groupby(
                ["ORIGINATION YEAR"])[col].mean().reset_index(name=year_name)
            by_quarter = frame.groupby(
                ["ORIGINATION YEAR",
                 "ORIGINATION QUARTER"])[col].mean().reset_index(
                     name=quarter_name)
            return by_year, by_quarter

        # BUG FIX: accumulation previously relied on a bare try/except
        # catching the NameError of an uninitialised variable; collect the
        # per-year frames in lists and concat once at the end instead.
        yearly_parts = []
        quarterly_parts = []

        for year in range(1999, 2017):
            # (A redundant initial read of LOAN SEQUENCE NUMBER / LOAN
            # PURPOSE / ORIGINAL LOAN TERM was removed: its result was
            # immediately overwritten and never used. Likewise the unused
            # `summary.sum(axis=0)` whole-frame sum.)
            cleaned_file = (cleaned_dir + "cleaned_sample_orig_" + str(year) +
                            ".csv")

            # --- Original UPB: sums and means ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "ORIGINAL UPB", "ORIGINATION YEAR",
                                      "ORIGINATION QUARTER"
                                  ])
            s1 = summary.groupby([
                "ORIGINATION YEAR"
            ])["ORIGINAL UPB"].sum().reset_index(name="Sum of UPB per year")
            s3 = summary.groupby(["ORIGINATION YEAR", "ORIGINATION QUARTER"
                                  ])["ORIGINAL UPB"].sum().reset_index(
                                      name="Sum of UPB per year per quarter")
            s4, s6 = _means(summary, "ORIGINAL UPB",
                            "Average of UPB per year",
                            "Average of UPB per year per quarter")

            # --- Credit score means ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "CREDIT SCORE", "ORIGINATION YEAR",
                                      "ORIGINATION QUARTER"
                                  ])
            s7, s9 = _means(summary, "CREDIT SCORE",
                            "Average of Credit Score per year",
                            "Average of Credit Score per year per quarter")

            all_year_summary = pd.merge(
                (pd.merge(s1, s4, on=["ORIGINATION YEAR"])),
                s7,
                on=["ORIGINATION YEAR"])

            all_year_quarter_summary = pd.merge(
                (pd.merge(
                    s3, s6, on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])),
                s9,
                on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

            # --- CLTV means ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
                                      "ORIGINATION YEAR", "ORIGINATION QUARTER"
                                  ])
            s10, s12 = _means(summary,
                              "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
                              "Average of CLTV per year",
                              "Average of CLTV Score per year per quarter")

            all_year_summary = pd.merge(all_year_summary,
                                        s10,
                                        on=["ORIGINATION YEAR"])
            all_year_quarter_summary = pd.merge(
                all_year_quarter_summary,
                s12,
                on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

            # --- LTV means ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "ORIGINAL LOAN-TO-VALUE (LTV)",
                                      "ORIGINATION YEAR", "ORIGINATION QUARTER"
                                  ])
            s13, s15 = _means(summary, "ORIGINAL LOAN-TO-VALUE (LTV)",
                              "Average of LTV per Year",
                              "Average of LTV Score per year per quarter")

            # (Was a duplicated `all_year_summary = all_year_summary = ...`.)
            all_year_summary = pd.merge(all_year_summary,
                                        s13,
                                        on=["ORIGINATION YEAR"])
            all_year_quarter_summary = pd.merge(
                all_year_quarter_summary,
                s15,
                on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

            # --- Interest rate means ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "ORIGINAL INTEREST RATE",
                                      "ORIGINATION YEAR", "ORIGINATION QUARTER"
                                  ])
            s16, s18 = _means(summary, "ORIGINAL INTEREST RATE",
                              "Average of Interest Rate per Year",
                              "Average of Interest rate per year per quarter")

            all_year_summary = pd.merge(all_year_summary,
                                        s16,
                                        on=["ORIGINATION YEAR"])
            all_year_quarter_summary = pd.merge(
                all_year_quarter_summary,
                s18,
                on=["ORIGINATION YEAR", "ORIGINATION QUARTER"])

            # --- Anomaly: first-time homebuyer flag 'Y' that contradicts
            # occupancy (I/S = investment/second home) or loan purpose
            # (C/N = refinance) ---
            summary = pd.read_csv(cleaned_file,
                                  usecols=[
                                      "LOAN SEQUENCE NUMBER",
                                      "FIRST TIME HOMEBUYER FLAG",
                                      "OCCUPANCY STATUS", "LOAN PURPOSE"
                                  ])
            result = summary[(summary["FIRST TIME HOMEBUYER FLAG"] == 'Y')
                             & summary["OCCUPANCY STATUS"].isin(['I', 'S'])
                             & summary["LOAN PURPOSE"].isin(['C', 'N'])]

            s = int(result["LOAN SEQUENCE NUMBER"].count())

            anomaly = pd.DataFrame({
                "ORIGINATION YEAR": [year],
                "COUNT OF FALSE Y FLAG FOR FIRSTTIME HOMEBUYER": [s]
            })

            # Force every row's year to the loop year so the single-row
            # anomaly frame merges onto every summary row for this year.
            all_year_summary.loc[all_year_summary["ORIGINATION YEAR"] >= 0,
                                 'ORIGINATION YEAR'] = year

            all_year_summary = pd.merge(all_year_summary,
                                        anomaly,
                                        on=["ORIGINATION YEAR"])

            yearly_parts.append(all_year_summary)
            quarterly_parts.append(all_year_quarter_summary)

        alls = pd.concat(yearly_parts)
        alls_quarter = pd.concat(quarterly_parts)
        alls.to_csv(summary_filePath, sep=',', index=False)
        alls_quarter.to_csv(summary_filePath2, sep=',', index=False)
        print("files summarized")
# 示例#15 / Example #15 -- scraper artifact (example separator and a "0"
# vote count captured from the source page); kept as a comment so the file
# stays syntactically valid.
    def run(self):
        """Combine and clean the declined-loan CSV downloads.

        If the combined raw file already exists it is simply loaded;
        otherwise every .csv under Data/Downloads/DeclinedLoanData is read,
        stacked, normalised, persisted, then cleaned and summarised into
        Data/Cleaned and Data/Summary.
        """
        create_directory("Data/Cleaned")
        create_directory("Data/Summary")
        downloads_dir_loan = "Data/Downloads/DeclinedLoanData"
        cleaned_dir = "Data/Cleaned/"
        summary_dir = "Data/Summary/"

        combined_path = (downloads_dir_loan +
                         "/combined_downloaded_reject_loandata.xls")
        if os.path.isfile(combined_path):
            # Combined file built on an earlier run: just load it.
            # NOTE(review): nothing further happens in this branch -- the
            # cleaning below only runs when the combined file is absent.
            # Preserved as-is; confirm this short-circuit is intended.
            fullData = pd.read_csv(combined_path,
                                   sep=",",
                                   encoding="ISO-8859-1",
                                   low_memory=False)
        else:
            # Loop over the raw downloads and stack them into one frame.
            fullData = None
            for file in os.listdir(downloads_dir_loan):
                # BUG FIX: was re.search(r'.csv', file) -- the unescaped,
                # unanchored dot matched names like 'Xcsv.txt' too.
                if not file.endswith('.csv'):
                    continue
                print(file)
                filePath = downloads_dir_loan + "/" + file
                # READ DATA INTO DATAFRAME (first row is a banner; skip it)
                data = pd.read_csv(filePath,
                                   sep=",",
                                   skiprows=[0],
                                   encoding="ISO-8859-1",
                                   low_memory=False)

                # Drop rows that are all-null or have fewer than 2 values.
                data.dropna(how='all', inplace=True)
                data.dropna(thresh=2, inplace=True)

                # BUG FIX: accumulation used a bare try/except (and dead
                # fullData1/fullData2 aliases); make the first-iteration
                # case explicit instead.
                if fullData is None:
                    fullData = data
                else:
                    fullData = pd.concat([fullData, data])

                data = None  # release per-file frame promptly

            # Normalise 'Employment Length' to a plain integer year count
            # ('10+ years' -> '10', '< 1 year' -> ' 1', 'n/a' -> '0').
            # regex=False: '+' and '<' are literal text, not regex syntax.
            emp = fullData['Employment Length']
            for pat, repl in (('+', ''), ('<', ''), ('years', ''),
                              ('year', ''), ('n/a', '0')):
                emp = emp.str.replace(pat, repl, regex=False)
            fullData['Zip Code'] = fullData['Zip Code'].str.replace(
                'xx', '00', regex=False)

            fullData['Employment Length'] = emp.astype(np.int64)

            # Persist the combined raw data so later runs skip the loop.
            fullData.to_csv(combined_path, sep=',', index=False)
            summary = fullData.describe()

            # SAVE summary of downloaded data to files
            summary.to_csv(summary_dir +
                           "summary_downloaded_reject_loandata.csv",
                           sep=',',
                           index=True)

            # CLEAN DATA
            fullData = remove_columns(fullData)

            fullData = replace_by_na(fullData)

            fullData = separate_application_date(fullData)

            # BUG FIX: was assigned to the misspelled name 'fulldata', so
            # the default-replacement step was silently discarded.
            fullData = replace_by_default(fullData)

            summary = fullData.describe()

            # SAVE Cleaned/Preprocessed Data
            fullData.to_csv(cleaned_dir + "cleaned_reject_loandata.csv",
                            sep=',',
                            index=False)

            # Summarize and save cleaned dataset
            summary.to_csv(summary_dir + "summary_cleaned_reject_loandata.csv",
                           sep=',',
                           index=True)