def main():
    Cust_Count = len(CCs)
    proc = 16
    iterator = xrange(proc)
    remainder = Cust_Count % proc
    ccount = Cust_Count / proc
    lenI = len(iterator)
    chk1Time = pyTimer.startTimer()

    func = partial(createTransData, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)

    liMaster = []
    for part in results:
        liMaster += part

    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round(((endLoopTime - chk1Time) / Cust_Count), 2)
    avgLoopTime = "{0:.1f}".format(avgLoopTime)
    pyTimer.writeRuntimeLog(
        "The average time to create 1 customer's transactions is: " +
        str(avgLoopTime) + ' seconds\n')

    # Open CSV file for writing
    chk2Time = pyTimer.startTimer()
##    lines = sc.parallelize(liMaster)
##    lines.saveAsTextFile("Transactions")
    with open('cc_trans.csv', 'w') as f1:
        writer = csv.writer(f1, delimiter='|', lineterminator='\n')
        # File header
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'MERCHANT_NAME', 'MERCHANT_CATEGORY_CODE',
            'MERCHANT_CATEGORY_DESC', 'MERCHANT_COUNTRY', 'POST_DATE',
            'TRANSACTION_DATE', 'TRANSACTION_TYPE', 'CREDIT_DEBIT',
            'CREDIT_LIMIT', 'AMOUNT', 'BALANCE', 'CREDITCARDNUMBER',
            'CC_TYPE', 'USE_CASE', 'CUST_NAME', 'NUM_CCS', 'CUST_CITY',
            'CUST_STATE', 'CUST_ZIP', 'CUST_COUNTRY', 'TRANS_DETAIL'])
        for row in liMaster:
            writer.writerow(row)

    endCSVTime = pyTimer.startTimer()
    endCSVTime = round((endCSVTime - chk2Time), 2)
    endCSVTime = "{0:.1f}".format(endCSVTime)
    pyTimer.writeRuntimeLog("It took: " + str(endCSVTime) +
                            ' seconds to write to file\n')
    pyTimer.endTimer(startTime, str(Cust_Count) + ' Transactions creation')
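## Reference sketch only: createTransData is defined elsewhere in this file and its
## real body is not shown here. Based on the partial() call above, a minimal
## hypothetical version (named *_sketch to avoid clashing with the real helper)
## might slice the customer list like this; the remainder handling is an assumption.
def createTransData_sketch(ccount, remainder, lenI, i):
    """Build the transaction rows for one worker's slice of customers (sketch)."""
    # Assume the last worker also picks up the Cust_Count % proc leftover customers
    count = ccount + remainder if i == lenI - 1 else ccount
    start = i * ccount
    rows = []
    for n in range(start, start + count):
        # Placeholder row; the real code builds the full transaction record
        rows.append([n, CCs[n]])
    return rows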
    nameLi.extend(extractNames(htmlLi))
    writeToLog("Removing Duplicates\n")
    nameLi = removeDuplicates(nameLi)
    writeToLog("Creating CSV\n")
    createCSV(nameLi, scrapeFile)
##*********************END MAIN FUNCTION*********************##
##*********************END FUNCTIONS*********************##

##*********************PROGRAM*********************##
## The if statement below makes this program standalone; it is not needed
## if another program imports and calls the functions above
if __name__ == "__main__":
    ## Create start time
    startTime = pyTimer.startTimer()
    ## Try to download NLTK packages
    try:
        punktDL = nltk.download('punkt')
        aptDL = nltk.download('averaged_perceptron_tagger')
    except:
        writeToLog('NLTK Punkt and Averaged_Perceptron_tagger need to be installed')
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    writeToLog('*****************************' + fileDate + '*****************************\n')
    fileName = '/var/www/html/' + fileDate + '_CRA_Scrape.csv'
    mainURL = 'http://www.cra-arc.gc.ca/convictions/'
    mainXPath = '//*[@class="module-menu-section span-3"]'
    linkXPath = '//*[@class="col-md-9 col-md-push-3"]'
    paraXPath = '//p'
    ## If the NLTK packages are downloaded, run the main program
def main(mainURLList):
    currDate = datetime.now()
    ## Make currDate yesterday's date
    currDate = currDate - timedelta(days=1)
    fileDate = currDate.strftime('%m%d%Y')
    currDate = currDate.strftime('%Y-%m-%d')
    writeToLog("*************************** " + currDate + " ***************************\n")

    ## Open a file, overwriting the existing file or creating a new one if needed
    fileName = '/var/www/html/' + fileDate + '_ScreenScrape.csv'
    with open(fileName, 'w') as scrapeFile:
        writer = csv.writer(scrapeFile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
        ## Add a header row
        writer.writerow(["PhoneNumber", "Email_Address", "Website", "BackPage_Link"])
        try:
            ## Loop through all URLs in the mainURLList
            for mainURL in mainURLList:
                liData = []
                writeToLog("\nMain scrape of: " + mainURL + "\n")
                startT = pyTimer.startTimer()
                startPage = 0
                endPage = 0
                increment = 1
                ## Step through up to 999 possible pages
                while increment < 1000:
                    ## If increment > 1, add the page string to the URL
                    ## HTTP-request the mainURL
                    if increment == 1:
                        mainRequest = requests.get(mainURL + "adult/")
                    else:
                        mainRequest = requests.get(mainURL + "adult/?page=" + str(increment))
                    ## Parse the request content into an HTML tree
                    mainContent = html.fromstring(mainRequest.content)
                    ## Use XPath to grab only HTML tags with the CSS class "date"
                    date = mainContent.xpath('//*[@class="date"]')
                    dateStr = ''
                    ## Loop through the dates on the page to check whether the current date appears
                    for dateStr in date:
                        dateStr = tostring(dateStr)
                        dateStr = re.search(r"\w{3}. \w{3}. \d{1,2}", dateStr)
                        dateStr = dateStr.group() + " - " + str(datetime.now().year)
                        dateStr = datetime.strptime(dateStr, '%a. %b. %d - %Y').date()
                        dateStr = dateStr.strftime('%Y-%m-%d')
                        if dateStr == currDate:
                            break
                    ## Compare the current date to the date on the web page
                    if dateStr == currDate:
                        if startPage == 0:
                            startPage = increment
                        ## Extend liData with anything from the main body of the postings
                        liData.extend(scrapeInfo(mainURL, mainContent, '/html/body/div//*[@href]'))
                        ## Extend liData with anything from the sponsorBoxContent
                        liData.extend(scrapeInfo(mainURL, mainContent, '//*[@class="sponsorBoxContent"]/a'))
                    ## If the date on the page is later than currDate (and currDate is not blank), go to the next page
                    elif currDate < dateStr and currDate != '':
                        increment = increment + 1
                        continue
                    else:
                        endPage = increment
                        writeToLog("Scraped pages: " + str(startPage) + " to " + str(endPage) + "\n")
                        writeToLog("Remove dups from scrape of: " + mainURL + "\n")
                        beforeDedup = len(liData)
                        ## Call function removeDuplicates
                        liData = removeDuplicates(liData)
                        writeToLog(str(len(liData)) + " records of " + str(beforeDedup) + " left after deduplication\n")
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write the scrape to CSV\n")
                ## Call writeToCSV to write the list data to the scrapeFile;
                ## it needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
                ## Sleep for 30 seconds, then request a different page so the traffic looks human
                time.sleep(30)
                requests.get("http://www.google.com")
        except:
            e = traceback.format_exc()
            writeToLog("Unexpected error:" + str(e) + "\n")
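## Reference sketch only: removeDuplicates is defined elsewhere in this file. An
## order-preserving de-duplication (named *_sketch to avoid clashing with the real
## helper) would be a reasonable reading, but the actual implementation may differ.
def removeDuplicates_sketch(li):
    """Return li with duplicate entries dropped, keeping first occurrences in order."""
    unique = []
    for item in li:
        # Membership test on the output list also works for unhashable items
        # (e.g. lists of scraped fields), unlike a set-based approach
        if item not in unique:
            unique.append(item)
    return unique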
# History | mmddyyyy | User     | Changes
#          | 01192016 | Ivana D. | Credit card model, code, ref lists, etc.
#          | 01202016 | Jeff K.  | Comments, ref lists, etc.
#          | 01202016 | Justin S | SSN distinct list
#------------------------------------------------------------------------------
# Reference data is located on the test-bmohb console: gs://newccdatav3

from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("Customers")
sc = SparkContext(conf=conf)

from random import randrange, choice, randint
from datetime import datetime
from barnum import gen_data
from faker import Faker
import csv, NAICS, zips, re, geo_data, pyTimer

startTime = pyTimer.startTimer()

##### Customer count wanted for the end file #####
cust_count = 10000
fake = Faker()

# Reference list for type of account
Related_Type = ['Primary', 'Secondary', 'Joint']
# Reference list for how the account was opened
Party_Type = ['Person', 'Non-Person']
# Reference list for a BMO customer
Party_Relation = ['Customer', 'Non-Customer']
# Weighted list for random flags
Yes_No_Cust_Flag = ['Yes'] + ['No'] * 2 + [''] * 392
# Closed-account flag
Clsd_flag = ['Yes'] + ['No'] * 98
# Weighted list for clients whose net worth is over $500K
HighNetWorth = ['Yes'] + ['No'] * 30
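## The skewed lists above act as cheap weighted sampling: random.choice() draws each
## element with equal probability, so 'Yes' comes up roughly once per len(list) draws.
## The variable names below are only an illustration, not fields from the generator.
example_cust_flag = choice(Yes_No_Cust_Flag)   # 'Yes' about 1 time in 395
example_closed = choice(Clsd_flag)             # 'Yes' about 1 time in 99 (~1% closed accounts)
example_high_net_worth = choice(HighNetWorth)  # 'Yes' about 1 time in 31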
def main():
    ##### Customer count wanted for the end file #####
    cust_count = 50000

    liSSNMaster = []
    liSSNMaster = createSSNs(liSSNMaster, cust_count)
    pyTimer.endTimer(
        startTime, 'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers')
    chk1Time = pyTimer.startTimer()

    # Fan customer creation out across a pool of worker processes
    cust_list = []
    proc = 16
    iterator = xrange(proc)
    remainder = cust_count % proc
    ccount = cust_count / proc
    lenI = len(iterator)
    func = partial(createCustData, liSSNMaster, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)

    # Concatenate the per-process result lists into one customer list
    for part in results:
        cust_list += part

    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round(((endLoopTime - chk1Time) / cust_count), 6)
    avgLoopTime = "{0:.6f}".format(avgLoopTime)
    pyTimer.writeRuntimeLog('The average time to create a customer is: ' +
                            str(avgLoopTime) + ' seconds\n')

    ## A commented-out variant appended the header row directly to cust_list and
    ## saved the data through Spark instead of the csv module:
##    cust_list = createCustData(cust_count)
##    lines = sc.parallelize(cust_list)
##    lines.saveAsTextFile("Customers")

    # Creates CSV
    with open('uber_custv3.csv', 'w') as f1:
        # Writer for CSV: pipe delimited, newline as the row terminator
        writer = csv.writer(f1, delimiter='|', lineterminator='\n')
        # Header row
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'ACCT_TYPE', 'NUM_CCS', 'NAME', 'M_NAME',
            'SSN', 'AUTHORIZED_NAME2', 'M_NAME2', 'SSN2', 'AUTHORIZED_NAME3',
            'M_NAME3', 'SSN3', 'AUTHORIZED_NAME4', 'M_NAME4', 'SSN4',
            'CREDITCARDNUMBER', 'CREDITCARDTYPE', 'EMPLOYER', 'CUSTEMAIL',
            'OCCUPATION', 'CITY', 'STATE', 'ZIP', 'COUNTRY', 'PREVIOUS_CITY',
            'PREVIOUS_STATE', 'PREVIOUS_ZIP', 'PREVIOUS_COUNTRY', 'DOB',
            'PEP', 'SAR', 'CLOSEDACCOUNT', 'RELATED_ACCT', 'RELATED_TYPE',
            'PARTY_TYPE', 'PARTY_RELATION', 'PARTY_STARTDATE', 'PARTY_ENDDATE',
            'LARGE_CASH_EXEMPT', 'DEMARKET_FLAG', 'DEMARKET_DATE',
            'PROB_DEFAULT_RISKR', 'OFFICIAL_LANG_PREF', 'CONSENT_SHARING',
            'PREFERRED_CHANNEL', 'PRIMARY_BRANCH_NO', 'DEPENDANTS_COUNT',
            'SEG_MODEL_ID', 'SEG_MODEL_TYPE', 'SEG_MODEL_NAME',
            'SEG_MODEL_GROUP', 'SEG_M_GRP_DESC', 'SEG_MODEL_SCORE',
            'ARMS_MANUFACTURER', 'AUCTION', 'CASHINTENSIVE_BUSINESS',
            'CASINO_GAMBLING', 'CHANNEL_ONBOARDING',
            'CHANNEL_ONGOING_TRANSACTIONS', 'CLIENT_NET_WORTH',
            'COMPLEX_HI_VEHICLE', 'DEALER_PRECIOUS_METAL',
            'DIGITAL_PM_OPERATOR', 'EMBASSY_CONSULATE', 'EXCHANGE_CURRENCY',
            'FOREIGN_FINANCIAL_INSTITUTION', 'FOREIGN_GOVERNMENT',
            'FOREIGN_NONBANK_FINANCIAL_INSTITUTION', 'INTERNET_GAMBLING',
            'MEDICAL_MARIJUANA_DISPENSARY', 'MONEY_SERVICE_BUSINESS',
            'NAICS_CODE', 'NONREGULATED_FINANCIAL_INSTITUTION', 'NOT_PROFIT',
            'PRIVATELY_ATM_OPERATOR', 'PRODUCTS', 'SALES_USED_VEHICLES',
            'SERVICES', 'SIC_CODE', 'STOCK_MARKET_LISTING',
            'THIRD_PARTY_PAYMENT_PROCESSOR', 'TRANSACTING_PROVIDER',
            'HIGH_NET_WORTH', 'HIGH_RISK', 'RISK_RATING', 'USE_CASE_SCENARIO'])
        for row in cust_list:
            writer.writerow(row)

    pyTimer.endTimer(startTime, str(cust_count) + ' Customer creation')
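## Reference sketch only: createSSNs is defined elsewhere in this file. Given the
## "SSN distinct list" history entry, a minimal hypothetical version (named *_sketch
## to avoid clashing with the real helper) could fill the list with unique random
## SSN-formatted strings; the exact format and ranges are assumptions.
def createSSNs_sketch(liSSN, count):
    """Fill liSSN with `count` unique, randomly generated SSN strings (sketch)."""
    seen = set(liSSN)
    while len(liSSN) < count:
        ssn = "{0:03d}-{1:02d}-{2:04d}".format(
            randint(1, 899), randint(1, 99), randint(1, 9999))
        if ssn not in seen:
            seen.add(ssn)
            liSSN.append(ssn)
    return liSSN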