def move_to_folder():
    """Move the classified PDF into its category/year folder and record the move.

    Relies on module-level state set up by the caller: ``dir_upload``,
    ``file_folder``, the per-category folder names (``general_purpose`` etc.),
    ``operating_system``, ``year``, ``pdf_name``, ``pdf``, ``file_id``,
    ``auditeename``, ``dir_pdfs`` and ``db``.
    """
    dest_filename = dir_upload
    # Map the classification label to its destination sub-folder.
    if file_folder == 'general_purpose':
        dest_filename += general_purpose
    elif file_folder == 'school_district':
        dest_filename += school_district
    elif file_folder == 'public_higher_education':
        dest_filename += public_higher_education
    elif file_folder == 'special_district':
        dest_filename += special_district
    elif file_folder == 'non_profit':
        dest_filename += non_profit
    elif file_folder == 'unclassified':
        dest_filename += unclassified
    # Fallback: classify by the auditee name for community colleges.
    # BUG FIX: raw string — '\s' in a plain string is an invalid escape
    # (DeprecationWarning today, SyntaxError in future Pythons).
    elif re.match(r'.*community\s*college.*', auditeename, re.IGNORECASE):
        dest_filename += community_college_district
    # Append the year folder with the separator for the configured OS.
    if operating_system == 'mac' or operating_system == 'linux':
        dest_filename += '/' + year + '/'
    elif operating_system == 'windows':
        dest_filename += '\\' + year + '\\'
    os.makedirs(dest_filename, exist_ok=True)
    dest_filename += pdf_name
    print(dest_filename)
    try:
        os.rename(dir_pdfs + pdf, dest_filename)
        db.saveFileStatus(id=file_id,
                          file_original_name=dest_filename,
                          file_status='Classified')
    except Exception as e:
        # Best-effort: report the failure but keep processing other files.
        print(e)
def main():
    """Crawl the Illinois audits FTP site, download and merge each entity's
    PDFs, then upload the result to file storage.

    Uses module-level state: ``url``, ``start_from``, ``year``, ``dir_in``,
    ``dir_pdfs``, ``illinois_entities_xlsx_file``, ``illinois_entities_sheet``,
    ``script_name`` and ``db``.
    """
    file_storage_connect()
    # Connect to the public FTP server (anonymous login).
    ftp = ftplib.FTP(url.netloc)
    ftp.login()
    print('login to ' + url.netloc)
    logging.info('login to ' + url.netloc)
    stack = [url.path]
    path = stack.pop()
    ftp.cwd(path)
    # Collect every child directory of the root path.
    children = ftp_dir(ftp)
    dirs = [
        posixpath.join(path, child[1]) for child in children if not child[0]
    ]
    # Skip directories until the configured start_from entry is reached.
    # BUG FIX: the old `while True` loop raised IndexError when start_from
    # was not present; an exhausted list now simply leaves nothing to do.
    while dirs and dirs[0].split('/')[-1] != start_from.strip():
        del dirs[0]
    # Load entity names and categories from the Illinois Entities workbook.
    print('Creating connection with ' + illinois_entities_xlsx_file)
    wbShort = openpyxl.load_workbook(dir_in + illinois_entities_xlsx_file.strip())
    # wb[name] replaces the long-deprecated get_sheet_by_name().
    sheetShort = wbShort[illinois_entities_sheet.strip()]
    excel_name = {}
    excel_category = {}
    row = 2
    while True:
        key = str(sheetShort['A' + str(row)].value)
        # Zero-pad entity keys to the canonical 8-character form.
        if len(key) == 6:
            key = '00' + key
        elif len(key) == 7:
            key = '0' + key
        excel_name[key] = sheetShort['B' + str(row)].value.strip()
        excel_category[key] = getGategory(sheetShort['J' + str(row)].value.strip())
        row += 1
        if sheetShort['A' + str(row)].value is None:
            break  # first empty row ends the parse of the entities sheet
    for udir in dirs:
        print('-' * 20)
        logging.info('-' * 20)
        print(udir)
        logging.info(udir)
        # example of path structure /LocGovAudits/FY2015/00100000
        parseddir = udir.split('/')[-1].strip()
        try:
            preparename = ('IL@#' + excel_category[parseddir] + '@#' +
                           excel_name[parseddir] + '@#' + year + '.pdf')
        except KeyError:
            # Unknown entity key: fall back to the raw directory name.
            preparename = parseddir + '.pdf'
        # Strip characters that are illegal in file names.
        preparename = preparename.replace('/', '').replace(':', '')
        ftp.cwd(udir)
        time.sleep(0.8)  # be polite to the public FTP server
        files = []
        try:
            files = ftp.nlst()
            files = [f for f in files if "Del" not in f]
            files.sort()
        except Exception as e:
            if str(e) == "550 No files found":
                print("No files in this directory")
                logging.info(udir + " No files in this directory")
            else:
                print(str(e))
                logging.info(udir + ' ' + str(e))
        for f in files:
            with open(dir_pdfs + f, 'wb') as fobj:
                ftp.retrbinary('RETR %s' % f, fobj.write)
            print('downloading ' + f)
            logging.info('downloading ' + f)
        # If more than one PDF was found, merge them into a single file.
        if len(files) > 1:
            pdfline = ' '.join(files)
            # NOTE(review): filenames come from the remote server and are
            # interpolated into a shell command — consider subprocess.run
            # with a list to avoid shell injection.
            if platform.system() == "Linux":
                command = 'pdftk ' + pdfline + ' cat output temp.pdf'
            if platform.system() == "Windows":
                command = 'pdftk.exe ' + pdfline + ' cat output temp.pdf'
            try:
                os.system(command)
                os.rename('temp.pdf', preparename)
                print(preparename + ' generated')
                logging.info(preparename + ' generated')
                bOK = True
            except Exception as e:
                print(udir + ' ' + pdfline + ' not generated pdf')
                print(str(e))
                logging.info(udir + ' ' + pdfline + ' not generated pdf')
                logging.info(str(e))
                bOK = False
        elif len(files) == 1:
            # Exactly one PDF: renaming it also moves it out of the way.
            try:
                os.rename(dir_pdfs + files[0].strip(), dir_pdfs + preparename)
            except Exception as e:
                logging.info(str(e))
                print(str(e))
            print(preparename + ' generated')
            logging.info(preparename + ' generated')
        else:
            print('no files in ' + udir)
            logging.info('no files in ' + udir)  # this most probably will never occur
        # Delete the original pdf files if more than one was merged
        # (with a single file the rename above already removed it).
        if len(files) > 1 and bOK:
            for f in files:
                os.remove(dir_pdfs + str(f).strip())
        if len(files) > 0:
            file_details = db.readFileStatus(file_original_name=preparename)
            if file_details is None:
                file_details = db.saveFileStatus(
                    script_name=script_name,
                    file_original_name=preparename,
                    file_status='Downloaded')
            upload_to_file_storage(preparename)
            os.remove(preparename)
def upload_to_file_storage(filename):
    """Upload a single downloaded PDF to Azure file storage, retrying up to
    three times, and record the outcome in the file-status DB.

    Relies on module-level state: ``dir_pdfs``, ``db``, ``script_name``,
    ``file_storage_share``, ``file_storage_dir``, ``file_service`` and
    ``overwrite_remote_files``.
    """
    global script_name
    old_filename = filename
    downloads_path = dir_pdfs
    fnm = FilenameManager()

    def _mark_uploaded(file_details, directory, filename):
        # Insert a new status row, or update the existing one, marking the
        # file as uploaded.
        if file_details is None:
            db.saveFileStatus(script_name=script_name,
                              file_original_name=old_filename,
                              file_upload_path=directory,
                              file_upload_name=filename,
                              file_status='Uploaded')
        else:
            db.saveFileStatus(id=file_details['id'],
                              script_name=script_name,
                              file_upload_path=directory,
                              file_upload_name=filename,
                              file_status='Uploaded')

    retries = 0
    while retries < 3:
        try:
            path = os.path.join(downloads_path, old_filename)
            file_details = db.readFileStatus(file_original_name=old_filename,
                                             file_status='Uploaded')
            if file_details is not None:
                print('File {} was already uploaded before'.format(old_filename))
                break
            file_details = db.readFileStatus(file_original_name=old_filename,
                                             file_status='Downloaded')
            print('Uploading {}'.format(path))
            remote_filename = _get_remote_filename(old_filename)
            directory = None
            # BUG FIX: year must be initialised here.  It is assigned in the
            # 3-tuple unpack below, which makes it function-local, so the
            # 2-tuple path previously raised UnboundLocalError at `if year:`
            # (swallowed by the outer except, burning all three retries).
            year = None
            if not remote_filename:
                return
            try:
                directory, filename, year = remote_filename
            except (ValueError, TypeError):
                # 2-tuple form: no year sub-directory for this file.
                directory, filename = remote_filename
            filename = fnm.azure_validate_filename(filename)
            if len(file_storage_dir) > 0:
                directory = file_storage_dir + '/' + directory
            if not file_service.exists(file_storage_share,
                                       directory_name=directory):
                file_service.create_directory(file_storage_share, directory)
            if year:
                directory += '/' + year
                if not file_service.exists(file_storage_share,
                                           directory_name=directory):
                    file_service.create_directory(file_storage_share, directory)
            if not overwrite_remote_files:
                print('Checking if {}/{} already exists'.format(
                    directory, filename))
                if file_service.exists(file_storage_share,
                                       directory_name=directory,
                                       file_name=filename):
                    print('{}/{} already exists'.format(directory, filename))
                    _mark_uploaded(file_details, directory, filename)
                    return
            file_service.create_file_from_path(
                file_storage_share,
                directory,
                filename,
                path,
                content_settings=ContentSettings(
                    content_type='application/pdf'))
            _mark_uploaded(file_details, directory, filename)
            print('{} uploaded'.format(path))
            break
        except Exception as e:
            print('Error uploading to Asure file storage,', str(e))
            filename = old_filename
            retries += 1
# --- Script bootstrap (module-level) -------------------------------------
# Reads run parameters from the DB ('illinois' props), falling back to the
# local IL_parms.txt JSON file for any missing value.
# NOTE(review): the `try:` opened below is closed by an except/finally that
# lies outside this chunk of the file.
# `global` statements at module level are no-ops, kept as in the original.
global dir_in
global dir_pdfs
global illinois_entities_xlsx_file
global illinois_entities_sheet
global script_name
start_time = datetime.utcnow()  # wall-clock start, presumably for run bookkeeping
script_name = "get_IL.py"
result = 1
error_message = ""
config_file = ""
config = configparser.ConfigParser()
config.read('conf.ini')
# NOTE(review): rebinding `db` to an instance shadows the imported `db`
# class used on this same line — intentional but fragile.
db = db(config)
try:
    # DB-stored properties win; IL_parms.txt supplies the fallbacks.
    dbparameters = db.readProps('illinois')
    config_file = str(dbparameters)
    with open('IL_parms.txt', 'r') as fp:
        dparameters = json.load(fp)
    ftpurl = dbparameters["url"] or dparameters["ftpurl"]
    url = urllib.parse.urlparse(ftpurl)
    start_from = dbparameters["start_from"] or dparameters["start_from"]
    year = dbparameters["year"] or dparameters["year"]
    dir_in = dbparameters["dir_in"] or dparameters["dir_in"]
    dir_pdfs = dbparameters["dir_pdfs"] or dparameters["dir_pdfs"]
    illinois_entities_xlsx_file = dbparameters[
        "illinois_entities_xlsx_file"] or dparameters[
            "illinois_entities_xlsx_file"]
    illinois_entities_sheet = dbparameters[
        "illinois_entities_sheet"] or dparameters["illinois_entities_sheet"]
    # if log file become large, you can change filemode='w' for logging only individual sessons
def upload_to_file_storage():
    """Upload every classified PDF found under ``dir_upload`` to Azure file
    storage, mirroring the local category/year folder layout, and record
    each upload in the file-status DB.

    Relies on module-level state: ``dir_upload``, ``operating_system``,
    ``dparameters``, ``db`` and ``script_name``.
    """
    # init file name manager
    fnm = FilenameManager()
    # Build a recursive glob template matching PDFs for this OS.
    template = dir_upload + "**"
    if operating_system == 'mac' or operating_system == 'linux':
        template += '/*.pdf'
    elif operating_system == 'windows':
        template += '\\*.pdf'
    lpdfs = glob.glob(template, recursive=True)
    lpdfs.sort()
    #os.chdir(dir_pdfs) # needed for ftp.storbinary('STOR command work not with paths but with filenames
    # connect to FTP server and upload files
    try:
        file_storage_url = dparameters['fs_server'].strip()
        file_storage_user = dparameters['fs_username'].strip()
        file_storage_pwd = dparameters['fs_password'].strip()
        file_storage_share = dparameters['fs_share'].strip()
        file_storage_dir = dparameters['fs_directory_prefix'].strip()
        file_service = FileService(account_name=file_storage_user,
                                   account_key=file_storage_pwd)
        try:
            if file_service.exists(file_storage_share):
                print(
                    'Connection to Azure file storage successfully established...'
                )
                if len(file_storage_dir) > 0 and not file_service.exists(
                        file_storage_share, directory_name=file_storage_dir):
                    file_service.create_directory(file_storage_share,
                                                  file_storage_dir)
                    print('Created directory:' + file_storage_dir)
            else:
                print(
                    'Failed to connect to Asure file storage, share does not exist: '
                    + file_storage_share)
        except Exception as ex:
            print('Error connecting to Azure file storage: ', ex)
        for pdffile in lpdfs:
            file_details = db.readFileStatus(file_original_name=pdffile,
                                             file_status='Uploaded')
            if file_details is None:
                file_id = None
                file_details = db.readFileStatus(
                    file_original_name=pdffile, file_status='Classified')
                if file_details is not None:
                    file_id = file_details["id"]
                # Renamed from `dir`, which shadowed the builtin.
                parent_dir, rpdffile = ntpath.split(pdffile)
                destinationdir = ''
                # BUG FIX: year must be reset per file.  Previously it was
                # unbound on the first Unclassified file (UnboundLocalError)
                # or stale from an earlier loop iteration.
                year = None
                if (parent_dir + '\\') == dir_upload or (parent_dir + '/') == dir_upload:
                    # File sits directly in dir_upload: no category folder.
                    destinationdir = 'Unclassified'
                else:
                    # Layout is <dir_upload>/<category>/<year>/<file>.pdf
                    parent_dir, year = ntpath.split(parent_dir)
                    parent_dir, destinationdir = ntpath.split(parent_dir)
                retries = 0
                while retries < 3:
                    try:
                        path = pdffile
                        print('Uploading {}'.format(path))
                        filename = pdffile
                        remote_filename = fnm.azure_validate_filename(rpdffile)
                        if not remote_filename:
                            # NOTE(review): this aborts the WHOLE upload loop,
                            # not just this file — confirm whether `continue`
                            # to the next file was intended.
                            return
                        if len(file_storage_dir) > 0:
                            directory = file_storage_dir + '/' + destinationdir
                        else:
                            directory = destinationdir
                        if not file_service.exists(file_storage_share,
                                                   directory_name=directory):
                            file_service.create_directory(file_storage_share,
                                                          directory)
                        # BUG FIX: only descend into a year folder when one
                        # was parsed from the local path.
                        if year:
                            directory += '/' + year
                            if not file_service.exists(file_storage_share,
                                                       directory_name=directory):
                                file_service.create_directory(
                                    file_storage_share, directory)
                        print('Checking if {}/{} already exists'.format(
                            directory, remote_filename))
                        if file_service.exists(file_storage_share,
                                               directory_name=directory,
                                               file_name=remote_filename):
                            print('{}/{} already exists'.format(
                                directory, remote_filename))
                            if file_id is None:
                                db.saveFileStatus(
                                    script_name=script_name,
                                    file_original_name=pdffile,
                                    file_upload_path=directory,
                                    file_upload_name=remote_filename,
                                    file_status='Uploaded')
                            else:
                                db.saveFileStatus(
                                    id=file_details["id"],
                                    file_upload_path=directory,
                                    file_upload_name=remote_filename,
                                    file_status='Uploaded')
                            os.remove(pdffile)
                            break
                        file_service.create_file_from_path(
                            file_storage_share,
                            directory,
                            remote_filename,
                            path,
                            content_settings=ContentSettings(
                                content_type='application/pdf'))
                        if file_id is None:
                            db.saveFileStatus(
                                script_name=script_name,
                                file_original_name=pdffile,
                                file_upload_path=directory,
                                file_upload_name=remote_filename,
                                file_status='Uploaded')
                        else:
                            db.saveFileStatus(
                                id=file_details["id"],
                                file_upload_path=directory,
                                file_upload_name=remote_filename,
                                file_status='Uploaded')
                        print('{}/{} uploaded'.format(directory,
                                                      remote_filename))
                        retries = 3
                        os.remove(pdffile)
                    except Exception as e:
                        print('Error uploading to Asure file storage,', str(e))
                        retries += 1
            else:
                print('File {} was uploaded before'.format(
                    file_details["file_original_name"]))
                os.remove(pdffile)
    except Exception as e:
        print(str(e))
        logging.critical(str(e))
def rename_and_move_files():
    """Classify each downloaded PDF, build a human-readable name and move it
    into its category folder.

    Uses module-level state: ``pdfs``, ``db``, ``script_name``,
    ``classify_file``, ``general``, ``codes``/``states``, ``dir_upload``,
    ``dir_pdfs``, ``operating_system`` and the per-category folder names.
    NOTE(review): regex patterns below are non-raw strings containing
    backslash escapes (DeprecationWarning on modern Pythons).
    """
    classify_doc()
    for pdf in pdfs:
        file_id = db.saveFileStatus(script_name=script_name,
                                    file_original_name=pdf,
                                    file_status='Downloaded')
        # File names look like <dbkey><19xx|20xx>....pdf; group(1) is the dbkey.
        if re.match('(\d+)(?:19|20)\d{2}\d*\.pdf', pdf):
            file_key = re.match('(\d+)(?:19|20)\d{2}\d*\.pdf', pdf).group(1)
            # Collect folder/record whose dbkey matches this file's key.
            file_folder = [
                classify_file['record_' + str(i)]['folder']
                for i, rec in enumerate(classify_file)
                if classify_file['record_' + str(i)]['dbkey'] == file_key
            ]
            pdf_record = [
                classify_file['record_' + str(i)]
                for i, rec in enumerate(classify_file)
                if classify_file['record_' + str(i)]['dbkey'] == file_key
            ]

            def move_to_folder():
                # Closure over the loop's file_folder / year / pdf_name /
                # pdf / file_id; moves the file and marks it Classified.
                dest_filename = dir_upload
                if file_folder == 'general_purpose':
                    dest_filename += general_purpose
                elif file_folder == 'school_district':
                    dest_filename += school_district
                elif file_folder == 'public_higher_education':
                    dest_filename += public_higher_education
                elif file_folder == 'special_district':
                    dest_filename += special_district
                elif file_folder == 'non_profit':
                    dest_filename += non_profit
                elif file_folder == 'unclassified':
                    dest_filename += unclassified
                # NOTE(review): auditeename is only bound in some branches
                # below; this regex fallback assumes it exists — confirm.
                elif re.match('.*community\s*college.*', auditeename,
                              re.IGNORECASE):
                    dest_filename += community_college_district
                # test for operating system
                if operating_system == 'mac' or operating_system == 'linux':
                    dest_filename += '/' + year + '/'
                elif operating_system == 'windows':
                    dest_filename += '\\' + year + '\\'
                os.makedirs(dest_filename, exist_ok=True)
                dest_filename += pdf_name
                print(dest_filename)
                try:
                    os.rename(dir_pdfs + pdf, dest_filename)
                    db.saveFileStatus(id=file_id,
                                      file_original_name=dest_filename,
                                      file_status='Classified')
                except Exception as e:
                    print(e)

            if file_folder != []:
                file_folder = file_folder[0]
                pdf_record = pdf_record[0]
                # Year is taken from the record's yearending path segment.
                year = re.match('.*\/((?:19|20)\d{2})',
                                pdf_record['yearending']).group(1)
                if file_key in general:
                    if file_key == '100':
                        pdf_name = pdf_record['state'] + ' ' + pdf_record[
                            'city'] + ' ' + 'County' + ' ' + year + '.pdf'
                        move_to_folder()
                    elif file_key == '200':
                        pdf_name = pdf_record['state'] + ' ' + pdf_record[
                            'city'] + ' ' + 'County' + ' ' + year + '.pdf'
                        move_to_folder()
                    elif file_key == '300':
                        pdf_name = pdf_record['state'] + ' ' + pdf_record[
                            'city'] + ' ' + 'Township' + ' ' + year + '.pdf'
                        move_to_folder()
                    # NOTE(review): pdf_name here is a stale value from a
                    # previous iteration/branch — this probably meant to test
                    # a field of pdf_record instead; confirm.  Also note this
                    # branch builds pdf_name but never calls move_to_folder().
                    elif re.match('.*financial.*', pdf_name, re.IGNORECASE):
                        num = codes.index(pdf_record['state'])
                        full_name = states[num]
                        pdf_name = pdf_record[
                            'state'] + ' ' + 'State of ' + full_name + ' ' + year + '.pdf'
                    else:
                        auditeename = pdf_record['auditeename'].replace(
                            '/', '-')
                        # Strip the entity-type prefix from the auditee name.
                        if 'CITY OF ' in auditeename:
                            auditeename = auditeename.replace('CITY OF ', '')
                        elif 'MUNICIPALITY OF ' in auditeename:
                            auditeename = auditeename.replace(
                                'MUNICIPALITY OF ', '')
                        elif 'MUNICIPIOS OF ' in auditeename:
                            auditeename = auditeename.replace(
                                'MUNICIPIOS OF ', '')
                        elif 'VILLAGE OF ' in auditeename:
                            auditeename = auditeename.replace(
                                'VILLAGE OF ', '')
                        elif 'TOWN OF ' in auditeename:
                            auditeename = auditeename.replace('TOWN OF ', '')
                        # Drop any trailing ", ..." qualifier.
                        if re.match('.*(,.*)', auditeename):
                            auditeename = auditeename.replace(
                                re.match('.*(,.*)', auditeename).group(1), '')
                        auditeename = auditeename.title()
                        pdf_name = pdf_record[
                            'state'] + ' ' + auditeename + ' ' + year + '.pdf'
                        move_to_folder()
                else:
                    # Key not in the `general` set: name after the auditee.
                    auditeename = pdf_record['auditeename'].replace('/', '-')
                    if 'CITY OF ' in auditeename:
                        auditeename = auditeename.replace('CITY OF ', '')
                    elif 'MUNICIPALITY OF ' in auditeename:
                        auditeename = auditeename.replace(
                            'MUNICIPALITY OF ', '')
                    elif 'MUNICIPIOS OF ' in auditeename:
                        auditeename = auditeename.replace(
                            'MUNICIPIOS OF ', '')
                    elif 'VILLAGE OF ' in auditeename:
                        auditeename = auditeename.replace('VILLAGE OF ', '')
                    elif 'TOWN OF ' in auditeename:
                        auditeename = auditeename.replace('TOWN OF ', '')
                    if re.match('.*(,.*)', auditeename):
                        auditeename = auditeename.replace(
                            re.match('.*(,.*)', auditeename).group(1), '')
                    auditeename = auditeename.title()
                    pdf_name = pdf_record[
                        'state'] + ' ' + auditeename + ' ' + year + '.pdf'
                    move_to_folder()
            else:
                # No classification record found for this key; skip the file.
                continue