from time import sleep
import re
import pandas as pd
import sys
import os
import csv
import pickle
from random import shuffle
sys.path.insert(0, './Functions_scripts') # Allows setting a different path for the helper scripts imported below (only works if it branches off the root directory)
from downloaddate_function import longtime, downloaddate
from loggenerator import gen_log

pd.set_option('display.max_columns', 500)

#######Initialization#######
log_starttime = longtime() # Get the start time for the log file

projpath = './'
datapath = projpath + 'data_raw/'
input_path = datapath + 'Register_scrape/'
output_path = datapath + 'Scrape/'

if not os.path.exists(output_path): # If the path doesn't exist, make it
	os.makedirs(output_path)

#######Main program######
#Input file - read in char links
input_file = input_path + 'ca_char_list.csv'

# At the end of the run, the script reads the output file back in and checks it contains the same number of records as this input list.
with open(input_file, 'r', encoding='cp1252', errors='replace', newline='') as inputfile: # Read in the list of charity links produced by the register scrape
	df = pd.read_csv(inputfile)
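# Illustration (assumed filename and check, not in the original): the record-count check
# mentioned above could read the scraped output back in and compare it with this input list:
#   scraped = pd.read_csv(output_path + 'ca_char_data.csv')  # 'ca_char_data.csv' is a hypothetical output name
#   assert len(scraped) == len(df), 'Scraped output is missing records'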

Example #2

from time import sleep  # Needed for the pause between page grabs (see the sleeptime toggle below)
import pandas as pd
import sys
import os
import csv
sys.path.insert(
    0, './Functions_scripts'
)  # Allows setting a different path for the helper scripts imported below (only works if it branches off the root directory)
from downloaddate_function import longtime, downloaddate
from loggenerator import gen_log

#######Toggles#######
start_fresh = False  # If False, the scraper picks up where it left off after a failed run; if True, it overwrites any previous results and starts from page 1.
sleeptime = 5  # Seconds to sleep between each page grab - 5 seconds makes the whole scrape take about 5.5 hours.

#######Initialization#######
log_starttime = longtime()  # Get the start time for the log file

projpath = './'
datapath = projpath + 'data_raw/Register_scrape/'

if not os.path.exists(datapath):  # If the path doesn't exist, make it
    os.makedirs(datapath)

baseurl = 'https://apps.cra-arc.gc.ca/ebci/hacc/srch/pub/advncdSrch?dsrdPg='  # This is the URL for the search of all charities - it just needs a page number inserted between the base and the suffix
suffixurl = '&q.stts=0007&q.ordrClmn=NAME&q.ordrRnk=ASC'
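# Illustration: as the comment above notes, a full results-page URL is built by placing the
# page number between the base and the suffix, e.g. page 3 of the search would be requested as:
#   page_url = baseurl + str(3) + suffixurl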
#######Main program######

#Output file
output = datapath + 'ca_char_list.csv'

if not os.path.exists(output) or start_fresh:  # Assumed completion of the truncated condition: scrape from page 1 if no previous output exists or a fresh run was requested
    pass  # (the page-by-page scrape loop is not reproduced here)
Example #3

def get_proxies():  # Assumed function name - only the final lines of this proxy-fetching helper survive; the code that builds the list is missing
    proxies = []  # Placeholder for the missing proxy-gathering code
    print('New proxies grabbed.\n')
    return proxies


#######Main program#######
"""
# Fetch Dropbox authentication
dbtokenpath = 'C:/Users/mcdonndz-local/Desktop/admin/db_token.txt'
#dbtokenpath_pi = '/home/pi/admin/dp_token.txt'
dbtokenfile = open(dbtokenpath, "r")
dbapitoken = dbtokenfile.read()
print(dbapitoken)

dbx = dropbox.Dropbox(dbapitoken) # Create an object for accessing Dropbox API
"""
log_starttime = longtime()  # Get the current date

# Define paths
projpath = './'  # Location of syntax
localdatapath = projpath + 'temp/'  # The location of the input file changes based on the month it was downloaded - move it manually into this temp folder
if not os.path.exists(localdatapath):  # If the paths don't exist, make them
    os.makedirs(localdatapath)
downloadpath = 'ew_download/Trustee_scrape'
if not os.path.exists(downloadpath):
    os.makedirs(downloadpath)

#Set the filenames
inputfile = localdatapath + 'extract_main_charity.csv'  # This file is downloaded by ew_download.py
outputfilename = '/ew_trustee_data.csv'  # This is the main output data
outputfile = projpath + downloadpath + outputfilename
status = downloadpath + '/Status.txt'  # Status is a text file which records the last charity the scraper attempted to parse; it can be used for debugging failures. It is automatically removed when the script completes.
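# Illustration (assumed usage): Status.txt is overwritten with an identifier for the charity
# currently being processed, so a failed run shows where it stopped:
#   with open(status, 'w') as f:
#       f.write(str(regno))  # regno is a hypothetical charity register number variable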