Example #1
def main():
    Cust_Count = len(CCs)
    proc = 16
    iterator = xrange(proc)
    remainder = Cust_Count % proc
    ccount = Cust_Count / proc
    lenI = len(iterator)
    chk1Time = pyTimer.startTimer()
    func = partial(createTransData, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    #Flatten the 16 per-process result lists into one master list
    liMaster = [row for result in results for row in result]
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = "{0:.2f}".format((endLoopTime - chk1Time) / Cust_Count)
    pyTimer.writeRuntimeLog(
        "The average time to create 1 customer's transactions is: " +
        str(avgLoopTime) + ' seconds\n')
    #Open CSV file for writing
    chk2Time = pyTimer.startTimer()
    ##    lines=sc.parallelize(liMaster)
    ##    lines.saveAsTextFile("Transactions")
    with open('cc_trans.csv', 'w') as f1:
        writer = csv.writer(
            f1,
            delimiter='|',
            lineterminator='\n',
        )
        #File header
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'MERCHANT_NAME', 'MERCHANT_CATEGORY_CODE',
            'MERCHANT_CATEGORY_DESC', 'MERCHANT_COUNTRY', 'POST_DATE',
            'TRANSACTION_DATE', 'TRANSACTION_TYPE', 'CREDIT_DEBIT',
            'CREDIT_LIMIT', 'AMOUNT', 'BALANCE', 'CREDITCARDNUMBER', 'CC_TYPE',
            'USE_CASE', 'CUST_NAME', 'NUM_CCS', 'CUST_CITY', 'CUST_STATE',
            'CUST_ZIP', 'CUST_COUNTRY', 'TRANS_DETAIL'
        ])
        for row in liMaster:
            writer.writerow(row)
    endCSVTime = pyTimer.startTimer()
    endCSVTime = "{0:.2f}".format(endCSVTime - chk2Time)
    pyTimer.writeRuntimeLog("It took: " + str(endCSVTime) +
                            ' seconds to write to file\n')
    pyTimer.endTimer(startTime, str(Cust_Count) + ' Transactions creation')
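
The pyTimer helper these examples call is not part of the listing. A minimal sketch consistent with its call sites (startTimer returns a plain timestamp, endTimer logs a labeled elapsed time and returns it as a string, writeRuntimeLog appends to a log) could look like the following; the module body and the log path are assumptions, not the original implementation.

# pyTimer.py - hypothetical sketch of the timing helper used throughout.
import time

def startTimer():
    # Returns the current clock time; also used to grab end checkpoints.
    return time.time()

def writeRuntimeLog(message):
    # Log path is an assumption; the real helper may write elsewhere.
    with open('runtime.log', 'a') as log:
        log.write(message)

def endTimer(startTime, label=''):
    elapsed = "{0:.2f}".format(time.time() - startTime)
    writeRuntimeLog(label + ' took ' + elapsed + ' seconds\n')
    return elapsed + ' seconds'
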
Example #2
##  This if statement makes the program standalone
##  It is not needed if another program imports and calls the above functions
if __name__ == "__main__":
##  Create start time
    startTime = pyTimer.startTimer()
##  Try to download NLTK packages
    try:
        punktDL = nltk.download('punkt')
        aptDL = nltk.download('averaged_perceptron_tagger')
    except Exception:
        punktDL = aptDL = False
        writeToLog('NLTK punkt and averaged_perceptron_tagger could not be downloaded')
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    writeToLog('*****************************' + fileDate + '*****************************\n')
    fileName = '/var/www/html/' + fileDate + '_CRA_Scrape.csv'
    mainURL = 'http://www.cra-arc.gc.ca/convictions/'
    mainXPath = '//*[@class="module-menu-section span-3"]'
    linkXPath = '//*[@class="col-md-9 col-md-push-3"]'
    paraXPath = '//p'
## If the NLTK packages are downloaded, run the main program
    if punktDL and aptDL:
        main(mainURL, mainXPath, linkXPath, paraXPath, fileName)
    else:
        writeToLog('NLTK Punkt and Averaged_Perceptron_tagger need to be downloaded first.')
        writeToLog('Please sudo python and run nltk.download("punkt") and nltk.download("averaged_perceptron_tagger")')
##  Find total time in seconds of program run
    pName = os.path.basename(__file__)
    endTime = pyTimer.endTimer(startTime, pName)
    writeToLog("Program took " + endTime + " to complete.\n")
##*********************END PROGRAM*********************##
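
writeToLog is another helper whose definition sits outside the snippet. A plausible minimal version, assuming it simply appends each message to a dated log file (the path and naming here are guesses):

# Hypothetical sketch of writeToLog; the log path and file name are assumed.
from datetime import datetime

def writeToLog(message):
    logName = '/var/www/html/' + datetime.now().strftime('%m%d%Y') + '_scrape.log'
    with open(logName, 'a') as logFile:
        logFile.write(message)
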
Example #3
##  scrapeInfo takes the url, the page content, and 2 xpath variables
##  scrapeInfo returns a list when completed
        liData.extend(scrapeInfo(mainURL, mainContent, mainXPath, linkXPath))
##  Call function removeDuplicates
        beforeDedup = len(liData)
        liData = removeDuplicates(liData)
        writeToLog(str(len(liData)) + " records of " + str(beforeDedup) + " left after deduplication\n")
##  Call createCSV function to write the list data to the scrapeFile
##  createCSV needs a list and an open file to run
        createCSV(liData, scrapeFile)
##*********************END MAIN FUNCTION*********************##

##*********************END FUNCTIONS*********************##

##*********************PROGRAM*********************##
##  This if statement makes the program standalone
##  It is not needed if another program imports and calls the above functions
if __name__ == "__main__":
##  Create start time
    startTime = pyTimer.startTimer()
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    currDate = currDate.strftime('%Y-%m-%d')
    fileName = '/var/www/html/' + fileDate + '_Leafly_MMJScrape.csv'
    main('https://www.leafly.com/finder', '//*[@class="col-xs-6 col-md-4 spacer-bottom-xs"]', './/script', fileName)
##  Find total time in seconds of program run
    endTime = pyTimer.endTimer(startTime)
    writeToLog("Program took " + endTime + " to complete.\n")

##*********************END PROGRAM*********************##
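
removeDuplicates is called with a list and returns it deduplicated; its body is not shown in any of these snippets. An order-preserving sketch, assuming records are lists or other simple values:

# Hypothetical order-preserving dedup matching how removeDuplicates is called.
def removeDuplicates(liData):
    seen = set()
    deduped = []
    for item in liData:
        # Lists are unhashable, so use a tuple form as the dedup key.
        key = tuple(item) if isinstance(item, list) else item
        if key not in seen:
            seen.add(key)
            deduped.append(item)
    return deduped
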
Example #4
def main(mainURLList):
    currDate = datetime.now()
##  Set currDate to yesterday's date
    currDate = currDate - timedelta(days=1)
    fileDate = currDate.strftime('%m%d%Y')
    currDate = currDate.strftime('%Y-%m-%d')
    writeToLog("*************************** " + currDate + " ***************************\n")
##  Open a file and overwrite the existing file or create a new file if needed
    fileName = '/var/www/html/' + fileDate + '_ScreenScrape.csv'
    with open(fileName,'w') as scrapeFile:
        writer = csv.writer(scrapeFile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
##  Add a header row
        writer.writerow(["PhoneNumber","Email_Address","Website","BackPage_Link"])
        try:
##  Loop through all urls in the mainURLList
            for mainURL in mainURLList:
                liData = []
                writeToLog("\nMain scrape of: " + mainURL + "\n")
                startT = pyTimer.startTimer()
                startPage = 0
                endPage = 0
                increment = 1
##  Increment through 999 possible pages
                while increment < 1000:
##  If increment > 1 then add the page string to the URL
##  Http request the mainURL
                    if increment == 1:
                        mainRequest = requests.get(mainURL + "adult/")
                    else:
                        mainRequest = requests.get(mainURL + "adult/?page=" + str(increment))
##  Translate the request content to HTML
                    mainContent = html.fromstring(mainRequest.content)
##  Use xpath to only grab HTML tags with the CSS class "date"
                    date = mainContent.xpath('//*[@class="date"]')
                    dateStr = ''
##  Loop through dates on the page to make sure that the current date is on the page
                    for dateStr in date:
                        dateStr = tostring(dateStr)
                        dateStr = re.search(r"\w{3}\. \w{3}\. \d{1,2}", dateStr)
                        dateStr = dateStr.group() + " - " + str(datetime.now().year)
                        dateStr = datetime.strptime(dateStr, '%a. %b. %d - %Y').date()
                        dateStr = dateStr.strftime('%Y-%m-%d')
                        if dateStr == currDate:
                            break
##  Compare current date to date on webpage
                    if dateStr == currDate:
                        if startPage == 0:
                            startPage = increment
##  Extend liData to include anything from the main body of the postings
                        liData.extend(scrapeInfo(mainURL, mainContent, '/html/body/div//*[@href]'))
##  Extend liData to include anything from the sponsorBoxContent
                        liData.extend(scrapeInfo(mainURL, mainContent, '//*[@class="sponsorBoxContent"]/a'))
##  If the date on the page is later than currDate and currDate is not blank, go to the next page
                    elif currDate < dateStr and currDate != '':
                        increment = increment + 1
                        continue
                    else:
                        endPage = increment
                        writeToLog("Scraped pages: " + str(startPage) + " to " + str(endPage) + "\n")
                        writeToLog("Remove dups from scrape of: " + mainURL + "\n")
                        beforeDedup = len(liData)
##  Call function removeDuplicates
                        liData = removeDuplicates(liData)
                        writeToLog(str(len(liData)) + " records of " + str(beforeDedup) + " left after deduplication\n")
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write to scrape to CSV\n")
##  Call createCSV function to write the list data to the scrapeFile
##  createCSV needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
##  Sleep for 30 seconds and then request a different page to make it seem like a human is doing the surfing
                time.sleep(30)
                requests.get("http://www.google.com")
        except Exception:
            e = traceback.format_exc()
            writeToLog("Unexpected error: " + e + "\n")
Example #5
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write to scrape to CSV\n")
##  Call createCSV function to write the list data to the scrapeFile
##  createCSV needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
##  Sleep for 30 seconds and then request a different page to make it seem like a human is doing the surfing
                time.sleep(30)
                requests.get("http://www.google.com")
        except Exception:
            e = traceback.format_exc()
            writeToLog("Unexpected error: " + e + "\n")
##*********************END MAIN FUNCTION*********************##

##*********************END FUNCTIONS*********************##

##*********************PROGRAM*********************##
##  This if statement makes the program standalone
##  It is not needed if another program imports and calls the above functions
if __name__ == "__main__":
##  Create start time
    startTime = pyTimer.startTimer()
##  Create list of all Canadian Backpage links and US Backpage links to be used for main function
    mainURLList = ["http://alberta.backpage.com/", "http://britishcolumbia.backpage.com/", "http://manitoba.backpage.com/", "http://newbrunswick.backpage.com/", "http://stjohns.backpage.com/", "http://yellowknife.backpage.com/", "http://halifax.backpage.com/", "http://ontario.backpage.com/", "http://quebec.backpage.com/", "http://saskatchewan.backpage.com/", "http://whitehorse.backpage.com/", "http://alabama.backpage.com/", "http://alaska.backpage.com/", "http://arizona.backpage.com/", "http://arkansas.backpage.com/", "http://california.backpage.com/", "http://colorado.backpage.com/", "http://connecticut.backpage.com/", "http://delaware.backpage.com/", "http://florida.backpage.com/", "http://georgia.backpage.com/", "http://hawaii.backpage.com/", "http://idaho.backpage.com/", "http://illinois.backpage.com/", "http://indiana.backpage.com/", "http://iowa.backpage.com/", "http://kansas.backpage.com/", "http://kentucky.backpage.com/", "http://louisiana.backpage.com/", "http://maine.backpage.com/", "http://maryland.backpage.com/", "http://massachusetts.backpage.com/", "http://michigan.backpage.com/", "http://minnesota.backpage.com/", "http://mississippi.backpage.com/", "http://missouri.backpage.com/", "http://montana.backpage.com/", "http://nebraska.backpage.com/", "http://nevada.backpage.com/", "http://newhampshire.backpage.com/", "http://newjersey.backpage.com/", "http://newmexico.backpage.com/", "http://newyork.backpage.com/", "http://northcarolina.backpage.com/", "http://northdakota.backpage.com/", "http://ohio.backpage.com/", "http://oklahoma.backpage.com/", "http://oregon.backpage.com/", "http://pennsylvania.backpage.com/", "http://rhodeisland.backpage.com/", "http://southcarolina.backpage.com/", "http://southdakota.backpage.com/", "http://tennessee.backpage.com/", "http://texas.backpage.com/", "http://utah.backpage.com/", "http://vermont.backpage.com/", "http://virginia.backpage.com/", "http://washington.backpage.com/", "http://washingtondc.backpage.com/", "http://westvirginia.backpage.com/", "http://wisconsin.backpage.com/", "http://wyoming.backpage.com/"]
    main(mainURLList)
##  Find total time in seconds of program run
    endTime = pyTimer.endTimer(startTime)
    writeToLog("Program took " + endTime + " to complete.\n")
##*********************END PROGRAM*********************##
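
writeToCSV pairs the deduplicated list with the csv writer created from the open scrapeFile. Its body is not shown; given how it is called, it is presumably little more than a row loop. A minimal sketch:

# Hypothetical sketch of writeToCSV: one CSV row per scraped record.
def writeToCSV(liData, writer):
    for row in liData:
        writer.writerow(row)
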
Example #6
    for row in reader:  # read a row as {column1: value1, column2: value2,...}
        for (k, v) in row.items():  # go over each column name and value
            columns[k].append(v)  # append the value into the appropriate list
            # based on column name k
CCs = columns['CREDITCARDNUMBER']
ACCTs = columns['ACCOUNTID']
CCTypes = columns['CREDITCARDTYPE']
Holders = columns['NAME']
CCsCount = columns['NUM_CCS']
Cities = columns['CITY']
States = columns['STATE']
ZIPs = columns['ZIP']
Countries = columns['COUNTRY']
UseCase = columns['USE_CASE_SCENARIO']
ClsdFlags = columns['CLOSEDACCOUNT']
pyTimer.endTimer(startTime, '\n Reading in customer file')
trans_no = 0
maxCheckin = date(2000, 1, 1)
maxBook = date(2000, 1, 1)


def pop_transDetail(cat_desc, maxDate, j, maxBook, maxCheckin, randomrange,
                    randomchoice):
    checkin = date(2000, 1, 1)
    checkout = date(2000, 1, 1)
    booking = date(2000, 1, 1)
    transDetail = ''
    tmp2 = gen_data.create_name()
    addr = gen_data.create_city_state_zip()
    #Add details for Hotel Transactions
    if (cat_desc == 'Hotels/Motels/Inns/Resorts'
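
The reader and columns objects that the loop at the top of this example assumes are created outside the snippet. A minimal reconstruction with csv.DictReader and defaultdict follows; the input file name and delimiter are guesses based on the pipe-delimited customer file written in Example #8.

# Reconstruction of the setup the column loop assumes; the file name and
# delimiter are guesses based on the customer file written in Example #8.
import csv
from collections import defaultdict

columns = defaultdict(list)
with open('uber_custv3.csv', 'r') as f:
    reader = csv.DictReader(f, delimiter='|')
    for row in reader:  # read a row as {column1: value1, column2: value2,...}
        for (k, v) in row.items():
            columns[k].append(v)
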
Example #7
##['LARGE_CASH_EXEMPT']+['DEMARKET_FLAG']+['DEMARKET_DATE']+['PROB_DEFAULT_RISKR']+['OFFICIAL_LANG_PREF']+['CONSENT_SHARING']+\
##['PREFERRED_CHANNEL']+['PRIMARY_BRANCH_NO']+['DEPENDANTS_COUNT']+['SEG_MODEL_ID']+['SEG_MODEL_TYPE']+\
##['SEG_MODEL_NAME']+['SEG_MODEL_GROUP']+['SEG_M_GRP_DESC']+['SEG_MODEL_SCORE']+['ARMS_MANUFACTURER']+['AUCTION']+\
##['CASHINTENSIVE_BUSINESS']+['CASINO_GAMBLING']+['CHANNEL_ONBOARDING']+['CHANNEL_ONGOING_TRANSACTIONS']+['CLIENT_NET_WORTH']+\
##['COMPLEX_HI_VEHICLE']+['DEALER_PRECIOUS_METAL']+['DIGITAL_PM_OPERATOR']+['EMBASSY_CONSULATE']+['EXCHANGE_CURRENCY']+\
##['FOREIGN_FINANCIAL_INSTITUTION']+['FOREIGN_GOVERNMENT']+['FOREIGN_NONBANK_FINANCIAL_INSTITUTION']+['INTERNET_GAMBLING']+\
##['MEDICAL_MARIJUANA_DISPENSARY']+['MONEY_SERVICE_BUSINESS']+['NAICS_CODE']+['NONREGULATED_FINANCIAL_INSTITUTION']+\
##['NOT_PROFIT']+['PRIVATELY_ATM_OPERATOR']+['PRODUCTS']+['SALES_USED_VEHICLES']+['SERVICES']+\
##['SIC_CODE']+['STOCK_MARKET_LISTING']+['THIRD_PARTY_PAYMENT_PROCESSOR']+['TRANSACTING_PROVIDER']+['HIGH_NET_WORTH']+['HIGH_RISK']+['RISK_RATING']+['USE_CASE_SCENARIO'])
#Loop for number of accounts to generate
liMaster = []
start = 10786147
acct_list = []
liSSNMaster = []
liSSNMaster = createSSNs(liSSNMaster, cust_count)
pyTimer.endTimer(startTime,
                 'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers')
chk1Time = pyTimer.startTimer()
for i in xrange(cust_count):
    #Initiate High Risk Flags
    #Politically Exposed Person
    PEP = 'No'
    #Customer with a Suspicious Activity Report
    SAR = 'No'
    #Customer with a closed account
    #generate closed acct flag
    Clsd = choice(Clsd_flag)
    #High risk customer flag
    high_risk = 'No'
    #High Risk Rating
    hr_rating = ''
    #Customer that was demarketed by the bank
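
createSSNs fills a list with cust_count SSNs for the generated customers; its implementation is not in the listing. A sketch that produces unique SSN-formatted strings (the formatting and number ranges are assumptions):

# Hypothetical sketch of createSSNs: unique, SSN-formatted random strings.
from random import randint

def createSSNs(liSSNMaster, cust_count):
    seen = set()
    while len(seen) < cust_count:
        ssn = "%03d-%02d-%04d" % (randint(1, 899), randint(1, 99),
                                  randint(1, 9999))
        seen.add(ssn)
    liSSNMaster.extend(seen)
    return liSSNMaster
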
Example #8
def main():
    ##### Customer count wanted for the end file #####
    cust_count = 50000
    liSSNMaster = []
    liSSNMaster = createSSNs(liSSNMaster, cust_count)
    pyTimer.endTimer(
        startTime, 'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers')
    chk1Time = pyTimer.startTimer()
    cust_list = []
    proc = 16
    iterator = xrange(proc)
    remainder = cust_count % proc
    ccount = cust_count / proc
    lenI = len(iterator)
    func = partial(createCustData, liSSNMaster, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    #Flatten the 16 per-process result lists into one customer list
    cust_list = [row for result in results for row in result]
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = "{0:.6f}".format((endLoopTime - chk1Time) / cust_count)
    pyTimer.writeRuntimeLog('The average time to create a customer is: ' +
                            str(avgLoopTime) + ' seconds\n')
    ##        cust_list.append(['ROWNUM']+['ACCOUNTID']+['ACCT_TYPE']+['NUM_CCS']+['NAME']+['M_NAME']+['SSN']+['AUTHORIZED_NAME2']+['M_NAME2']+['SSN2']+\
    ##        ['AUTHORIZED_NAME3']+['M_NAME3']+['SSN3']+['AUTHORIZED_NAME4']+['M_NAME4']+['SSN4']+['CREDITCARDNUMBER']+['CREDITCARDTYPE']+['EMPLOYER']+['CUSTEMAIL']+\
    ##        ['OCCUPATION']+['CITY']+['STATE']+['ZIP']+['COUNTRY']+['PREVIOUS_CITY']+['PREVIOUS_STATE']+\
    ##        ['PREVIOUS_ZIP']+['PREVIOUS_COUNTRY']+['DOB']+['PEP']+['SAR']+['CLOSEDACCOUNT']+['RELATED_ACCT']+['RELATED_TYPE']+['PARTY_TYPE']+['PARTY_RELATION']+['PARTY_STARTDATE']+['PARTY_ENDDATE']+\
    ##        ['LARGE_CASH_EXEMPT']+['DEMARKET_FLAG']+['DEMARKET_DATE']+['PROB_DEFAULT_RISKR']+['OFFICIAL_LANG_PREF']+['CONSENT_SHARING']+\
    ##        ['PREFERRED_CHANNEL']+['PRIMARY_BRANCH_NO']+['DEPENDANTS_COUNT']+['SEG_MODEL_ID']+['SEG_MODEL_TYPE']+\
    ##        ['SEG_MODEL_NAME']+['SEG_MODEL_GROUP']+['SEG_M_GRP_DESC']+['SEG_MODEL_SCORE']+['ARMS_MANUFACTURER']+['AUCTION']+\
    ##        ['CASHINTENSIVE_BUSINESS']+['CASINO_GAMBLING']+['CHANNEL_ONBOARDING']+['CHANNEL_ONGOING_TRANSACTIONS']+['CLIENT_NET_WORTH']+\
    ##        ['COMPLEX_HI_VEHICLE']+['DEALER_PRECIOUS_METAL']+['DIGITAL_PM_OPERATOR']+['EMBASSY_CONSULATE']+['EXCHANGE_CURRENCY']+\
    ##        ['FOREIGN_FINANCIAL_INSTITUTION']+['FOREIGN_GOVERNMENT']+['FOREIGN_NONBANK_FINANCIAL_INSTITUTION']+['INTERNET_GAMBLING']+\
    ##        ['MEDICAL_MARIJUANA_DISPENSARY']+['MONEY_SERVICE_BUSINESS']+['NAICS_CODE']+['NONREGULATED_FINANCIAL_INSTITUTION']+\
    ##        ['NOT_PROFIT']+['PRIVATELY_ATM_OPERATOR']+['PRODUCTS']+['SALES_USED_VEHICLES']+['SERVICES']+\
    ##        ['SIC_CODE']+['STOCK_MARKET_LISTING']+['THIRD_PARTY_PAYMENT_PROCESSOR']+['TRANSACTING_PROVIDER']+['HIGH_NET_WORTH']+['HIGH_RISK']+['RISK_RATING']+['USE_CASE_SCENARIO'])
    ##        cust_list=createCustData(cust_count)
    ##        lines=sc.parallelize(cust_list)
    ##        lines.saveAsTextFile("Customers")
    #Creates CSV
    with open('uber_custv3.csv', 'w') as f1:
        #Writer for CSV...Pipe delimited...Return for a new line
        writer = csv.writer(
            f1,
            delimiter='|',
            lineterminator='\n',
        )
        #Header Row
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'ACCT_TYPE', 'NUM_CCS', 'NAME', 'M_NAME',
            'SSN', 'AUTHORIZED_NAME2', 'M_NAME2', 'SSN2', 'AUTHORIZED_NAME3',
            'M_NAME3', 'SSN3', 'AUTHORIZED_NAME4', 'M_NAME4', 'SSN4',
            'CREDITCARDNUMBER', 'CREDITCARDTYPE', 'EMPLOYER', 'CUSTEMAIL',
            'OCCUPATION', 'CITY', 'STATE', 'ZIP', 'COUNTRY', 'PREVIOUS_CITY',
            'PREVIOUS_STATE', 'PREVIOUS_ZIP', 'PREVIOUS_COUNTRY', 'DOB', 'PEP',
            'SAR', 'CLOSEDACCOUNT', 'RELATED_ACCT', 'RELATED_TYPE',
            'PARTY_TYPE', 'PARTY_RELATION', 'PARTY_STARTDATE', 'PARTY_ENDDATE',
            'LARGE_CASH_EXEMPT', 'DEMARKET_FLAG', 'DEMARKET_DATE',
            'PROB_DEFAULT_RISKR', 'OFFICIAL_LANG_PREF', 'CONSENT_SHARING',
            'PREFERRED_CHANNEL', 'PRIMARY_BRANCH_NO', 'DEPENDANTS_COUNT',
            'SEG_MODEL_ID', 'SEG_MODEL_TYPE', 'SEG_MODEL_NAME',
            'SEG_MODEL_GROUP', 'SEG_M_GRP_DESC', 'SEG_MODEL_SCORE',
            'ARMS_MANUFACTURER', 'AUCTION', 'CASHINTENSIVE_BUSINESS',
            'CASINO_GAMBLING', 'CHANNEL_ONBOARDING',
            'CHANNEL_ONGOING_TRANSACTIONS', 'CLIENT_NET_WORTH',
            'COMPLEX_HI_VEHICLE', 'DEALER_PRECIOUS_METAL',
            'DIGITAL_PM_OPERATOR', 'EMBASSY_CONSULATE', 'EXCHANGE_CURRENCY',
            'FOREIGN_FINANCIAL_INSTITUTION', 'FOREIGN_GOVERNMENT',
            'FOREIGN_NONBANK_FINANCIAL_INSTITUTION', 'INTERNET_GAMBLING',
            'MEDICAL_MARIJUANA_DISPENSARY', 'MONEY_SERVICE_BUSINESS',
            'NAICS_CODE', 'NONREGULATED_FINANCIAL_INSTITUTION', 'NOT_PROFIT',
            'PRIVATELY_ATM_OPERATOR', 'PRODUCTS', 'SALES_USED_VEHICLES',
            'SERVICES', 'SIC_CODE', 'STOCK_MARKET_LISTING',
            'THIRD_PARTY_PAYMENT_PROCESSOR', 'TRANSACTING_PROVIDER',
            'HIGH_NET_WORTH', 'HIGH_RISK', 'RISK_RATING', 'USE_CASE_SCENARIO'
        ])
        for row in cust_list:
            writer.writerow(row)
    pyTimer.endTimer(startTime, str(cust_count) + ' Customer creation')
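
Examples #1 and #8 share the same fan-out pattern: partial freezes the shared arguments, Pool.map runs one chunk per process, and the per-process lists are flattened afterwards, with the remainder rows handed to the last worker. A self-contained toy version of that pattern, where makeChunk is a hypothetical stand-in for createCustData/createTransData:

# Toy version of the partial + Pool.map + flatten pattern; makeChunk is a
# hypothetical stand-in for createCustData/createTransData.
from functools import partial
from multiprocessing import Pool

def makeChunk(ccount, remainder, lenI, i):
    # The last worker (i == lenI - 1) also takes the leftover rows.
    n = ccount + remainder if i == lenI - 1 else ccount
    return [(i, j) for j in range(n)]

if __name__ == '__main__':
    total, proc = 50, 4
    func = partial(makeChunk, total // proc, total % proc, proc)
    pool = Pool(processes=proc)
    results = pool.map(func, range(proc))
    pool.close()
    pool.join()
    flat = [row for chunk in results for row in chunk]
    print(len(flat))  # 50 rows across 4 workers
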