示例#1
0
                def move_to_folder():
                    """Move the current PDF into its category/year folder.

                    Works entirely on names from the enclosing scope:
                    ``file_folder``, ``auditeename``, ``year``, ``pdf``,
                    ``pdf_name`` and ``file_id``, plus the configured category
                    folder names and ``dir_upload`` / ``dir_pdfs``.

                    The destination is ``dir_upload`` + category folder + a
                    ``year`` sub-folder + ``pdf_name``.  After a successful
                    move the DB record ``file_id`` is set to 'Classified'.  A
                    failed move is printed and otherwise ignored (best effort).
                    """
                    dest_filename = dir_upload

                    if file_folder == 'general_purpose':
                        dest_filename += general_purpose
                    elif file_folder == 'school_district':
                        dest_filename += school_district
                    elif file_folder == 'public_higher_education':
                        dest_filename += public_higher_education
                    elif file_folder == 'special_district':
                        dest_filename += special_district
                    elif file_folder == 'non_profit':
                        dest_filename += non_profit
                    elif file_folder == 'unclassified':
                        dest_filename += unclassified
                    # Fallback for folders not listed above: classify by the
                    # auditee name.  Raw string, because '\s' inside a plain
                    # literal is an invalid escape sequence.
                    elif re.match(r'.*community\s*college.*', auditeename,
                                  re.IGNORECASE):
                        dest_filename += community_college_district

                    # The path separator follows the configured operating
                    # system; for any other value no year folder is appended.
                    if operating_system in ('mac', 'linux'):
                        dest_filename += '/' + year + '/'
                    elif operating_system == 'windows':
                        dest_filename += '\\' + year + '\\'

                    os.makedirs(dest_filename, exist_ok=True)

                    dest_filename += pdf_name
                    print(dest_filename)
                    try:
                        os.rename(dir_pdfs + pdf, dest_filename)
                        # Only reached when the move succeeded.
                        db.saveFileStatus(id=file_id,
                                          file_original_name=dest_filename,
                                          file_status='Classified')
                    except Exception as e:
                        print(e)
示例#2
0
def main():
    """Download the audit PDFs from the public FTP site and upload one PDF per entity.

    Configuration comes from module-level names set elsewhere in the script
    (``url``, ``start_from``, ``year``, ``dir_in``, ``dir_pdfs``, ...).

    Steps:
      1. connect to the file storage and to the FTP server (anonymous login);
      2. list the sub-directories of ``url.path`` and drop every one listed
         before the ``start_from`` directory;
      3. load the entity name / category lookups from the Excel workbook;
      4. for each directory: download its files, merge or rename them into a
         single PDF, record it in the DB, upload it and delete the local copy.
    """
    file_storage_connect()
    # ftplib.FTP is a context manager, so the control connection is closed
    # even when one of the steps below raises.
    with ftplib.FTP(url.netloc) as ftp:
        ftp.login()
        print('login to ' + url.netloc)
        logging.info('login to ' + url.netloc)
        path = url.path
        ftp.cwd(path)

        # Keep the directory entries only (child[0] is presumably an
        # "is file" flag returned by ftp_dir -- TODO confirm).
        children = ftp_dir(ftp)
        dirs = [
            posixpath.join(path, child[1]) for child in children
            if not child[0]
        ]

        # Skip everything listed before the start_from directory.  The
        # `dirs and` guard stops cleanly when start_from does not exist
        # instead of raising IndexError on an exhausted list.
        first_wanted = start_from.strip()
        while dirs and dirs[0].split('/')[-1] != first_wanted:
            del dirs[0]
        if not dirs:
            print('start_from directory ' + first_wanted + ' not found in ' +
                  path)
            logging.info('start_from directory ' + first_wanted +
                         ' not found in ' + path)

        excel_name, excel_category = _load_entity_lookup()

        for udir in dirs:
            _process_directory(ftp, udir, excel_name, excel_category)


def _load_entity_lookup():
    """Read the entity-id -> name and entity-id -> category lookups from the workbook.

    Column A holds the id, column B the name and column J the raw category
    (mapped through ``getGategory``).  Reading starts at row 2 and stops at
    the first row whose column A is empty.
    """
    print('Creating connection with ' + illinois_entities_xlsx_file)
    workbook = openpyxl.load_workbook(dir_in +
                                      illinois_entities_xlsx_file.strip())
    # Workbook[...] replaces the deprecated get_sheet_by_name().
    sheet = workbook[illinois_entities_sheet.strip()]
    excel_name = {}
    excel_category = {}
    row = 2
    while True:
        key = str(sheet['A' + str(row)].value)
        # Ids of 6 or 7 characters are left-padded with zeros to 8, the
        # length of the FTP directory names (e.g. 00100000).
        if len(key) in (6, 7):
            key = key.rjust(8, '0')
        excel_name[key] = sheet['B' + str(row)].value.strip()
        excel_category[key] = getGategory(sheet['J' + str(row)].value.strip())

        row += 1
        if sheet['A' + str(row)].value is None:
            break  # first empty row ends the table
    return excel_name, excel_category


def _merge_pdfs(files, preparename, udir):
    """Concatenate the downloaded *files* into ``dir_pdfs + preparename`` with pdftk.

    Returns True when the merged file was produced, False otherwise (the
    failure is printed and logged).
    """
    import subprocess  # local import: only this step runs an external tool

    pdfline = ' '.join(files)
    pdftk = 'pdftk.exe' if platform.system() == "Windows" else 'pdftk'
    merged_tmp = dir_pdfs + 'temp.pdf'
    try:
        # Argument list instead of a shell string: the file names come from
        # the remote server and must not be interpreted by a shell.
        # check=True turns a non-zero pdftk exit status into an exception.
        subprocess.run([pdftk] + [dir_pdfs + f for f in files] +
                       ['cat', 'output', merged_tmp],
                       check=True)
        os.rename(merged_tmp, dir_pdfs + preparename)
    except Exception as e:
        print(udir + ' ' + pdfline + ' not generated pdf')
        print(str(e))
        logging.info(udir + ' ' + pdfline + ' not generated pdf')
        logging.info(str(e))
        return False
    print(preparename + ' generated')
    logging.info(preparename + ' generated')
    return True


def _process_directory(ftp, udir, excel_name, excel_category):
    """Download one FTP directory, build its single PDF, record and upload it."""
    print('-' * 20)
    logging.info('-' * 20)
    print(udir)
    logging.info(udir)
    # example of path structure /LocGovAudits/FY2015/00100000
    parseddir = udir.split('/')[-1].strip()
    try:
        preparename = ('IL@#' + excel_category[parseddir] + '@#' +
                       excel_name[parseddir] + '@#' + year + '.pdf')
    except (KeyError, TypeError):
        # Entity missing from the workbook (or a non-string category):
        # fall back to the bare directory name.
        preparename = parseddir + '.pdf'
    preparename = preparename.replace('/', '').replace(':', '')

    ftp.cwd(udir)
    time.sleep(0.8)
    files = []

    try:
        files = sorted(f for f in ftp.nlst() if "Del" not in f)
    except ftplib.all_errors as e:
        if str(e) == "550 No files found":
            print("No files in this directory")
            logging.info(udir + " No files in this directory")
        else:
            print(str(e))
            logging.info(udir + ' ' + str(e))

    for f in files:
        with open(dir_pdfs + f, 'wb') as fobj:
            ftp.retrbinary('RETR %s' % f, fobj.write)
            print('downloading ' + f)
            logging.info('downloading ' + f)

    if not files:
        print('no files in ' + udir)
        logging.info('no files in ' + udir)
        return

    if len(files) > 1:
        # several PDFs in the directory: merge them into one
        ready = _merge_pdfs(files, preparename, udir)
        if ready:
            # The originals are only deleted once the merged copy exists.
            for f in files:
                os.remove(dir_pdfs + str(f).strip())
    else:
        # a single PDF only needs renaming (which also removes the original)
        try:
            os.rename(dir_pdfs + files[0].strip(), dir_pdfs + preparename)
        except OSError as e:
            logging.info(str(e))
            print(str(e))
            ready = False
        else:
            print(preparename + ' generated')
            logging.info(preparename + ' generated')
            ready = True

    if not ready:
        # Nothing was produced, so there is nothing to record, upload or
        # delete; continuing would fail on the missing file.
        return

    file_details = db.readFileStatus(file_original_name=preparename)
    if file_details is None:
        db.saveFileStatus(script_name=script_name,
                          file_original_name=preparename,
                          file_status='Downloaded')
    upload_to_file_storage(preparename)
    # The upload reads the file from dir_pdfs, so remove it from there too.
    os.remove(dir_pdfs + preparename)
示例#3
0
def upload_to_file_storage(filename):
    """Upload one PDF from ``dir_pdfs`` to the Azure file share, trying up to 3 times.

    Args:
        filename: name of the local file inside ``dir_pdfs``; it is also the
            ``file_original_name`` key used for the DB status records.

    The remote location comes from ``_get_remote_filename``, which returns
    either ``(directory, filename)`` or ``(directory, filename, year)``; a
    falsy result aborts the upload.  Missing remote directories are created.
    When ``overwrite_remote_files`` is false and the remote file already
    exists, the file is only marked 'Uploaded' in the DB.  Every failed
    attempt is printed; after three failures the function gives up silently.
    Always returns None.
    """
    old_filename = filename
    fnm = FilenameManager()

    def ensure_directory(directory):
        """Create *directory* on the share unless it already exists."""
        if not file_service.exists(file_storage_share,
                                   directory_name=directory):
            file_service.create_directory(file_storage_share, directory)

    def mark_uploaded(file_details, directory, remote_name):
        """Set the DB status to 'Uploaded', updating the 'Downloaded' row if any."""
        if file_details is None:
            db.saveFileStatus(script_name=script_name,
                              file_original_name=old_filename,
                              file_upload_path=directory,
                              file_upload_name=remote_name,
                              file_status='Uploaded')
        else:
            db.saveFileStatus(id=file_details['id'],
                              script_name=script_name,
                              file_upload_path=directory,
                              file_upload_name=remote_name,
                              file_status='Uploaded')

    for _attempt in range(3):
        try:
            path = os.path.join(dir_pdfs, old_filename)
            already_uploaded = db.readFileStatus(
                file_original_name=old_filename, file_status='Uploaded')
            if already_uploaded is not None:
                print(
                    'File {} was already uploaded before'.format(old_filename))
                return
            file_details = db.readFileStatus(file_original_name=old_filename,
                                             file_status='Downloaded')
            print('Uploading {}'.format(path))
            remote_filename = _get_remote_filename(old_filename)
            if not remote_filename:
                return
            try:
                directory, remote_name, year = remote_filename
            except ValueError:
                # Two-element result: no year folder.  `year` must still be
                # bound, otherwise the check below raises UnboundLocalError
                # and the file can never be uploaded.
                directory, remote_name = remote_filename
                year = None
            remote_name = fnm.azure_validate_filename(remote_name)
            if len(file_storage_dir) > 0:
                directory = file_storage_dir + '/' + directory
            ensure_directory(directory)
            if year:
                directory += '/' + year
                ensure_directory(directory)
            if not overwrite_remote_files:
                print('Checking if {}/{} already exists'.format(
                    directory, remote_name))
                if file_service.exists(file_storage_share,
                                       directory_name=directory,
                                       file_name=remote_name):
                    print('{}/{} already exists'.format(directory,
                                                        remote_name))
                    mark_uploaded(file_details, directory, remote_name)
                    return
            file_service.create_file_from_path(
                file_storage_share,
                directory,
                remote_name,
                path,
                content_settings=ContentSettings(
                    content_type='application/pdf'))
            mark_uploaded(file_details, directory, remote_name)
            print('{} uploaded'.format(path))
            return
        except Exception as e:
            # Best effort: report the failure and retry.
            print('Error uploading to Asure file storage,', str(e))
示例#4
0
    global dir_in
    global dir_pdfs
    global illinois_entities_xlsx_file
    global illinois_entities_sheet
    global script_name
    start_time = datetime.utcnow()
    script_name = "get_IL.py"
    result = 1
    error_message = ""
    config_file = ""

    config = configparser.ConfigParser()
    config.read('conf.ini')
    db = db(config)
    try:
        dbparameters = db.readProps('illinois')
        config_file = str(dbparameters)
        with open('IL_parms.txt', 'r') as fp:
            dparameters = json.load(fp)
        ftpurl = dbparameters["url"] or dparameters["ftpurl"]
        url = urllib.parse.urlparse(ftpurl)
        start_from = dbparameters["start_from"] or dparameters["start_from"]
        year = dbparameters["year"] or dparameters["year"]
        dir_in = dbparameters["dir_in"] or dparameters["dir_in"]
        dir_pdfs = dbparameters["dir_pdfs"] or dparameters["dir_pdfs"]
        illinois_entities_xlsx_file = dbparameters[
            "illinois_entities_xlsx_file"] or dparameters[
                "illinois_entities_xlsx_file"]
        illinois_entities_sheet = dbparameters[
            "illinois_entities_sheet"] or dparameters["illinois_entities_sheet"]
        # if the log file becomes too large, change to filemode='w' to log only the current session
示例#5
0
    def upload_to_file_storage():
        """Upload every classified PDF below ``dir_upload`` to the Azure file share.

        PDFs are expected at ``dir_upload/<category>/<year>/<name>.pdf`` and
        are uploaded to ``[fs_directory_prefix/]<category>/<year>/``.  A PDF
        lying directly in ``dir_upload`` goes to ``Unclassified`` with no
        year sub-folder.  Each file gets up to three upload attempts; after a
        successful upload (or when the remote file already exists) the DB
        status is set to 'Uploaded' and the local file is deleted.  Files
        already recorded as 'Uploaded' are just deleted locally.  Any error
        outside the per-file retry loop is printed and logged as critical.
        """
        # init file manager
        fnm = FilenameManager()

        # collect all pdf files below dir_upload
        template = dir_upload + "**"
        if operating_system == 'mac' or operating_system == 'linux':
            template += '/*.pdf'
        elif operating_system == 'windows':
            template += '\\*.pdf'
        lpdfs = sorted(glob.glob(template, recursive=True))

        def ensure_directory(service, share, directory):
            """Create *directory* on the share unless it already exists."""
            if not service.exists(share, directory_name=directory):
                service.create_directory(share, directory)

        def mark_uploaded(file_id, pdffile, directory, remote_filename):
            """Set the DB status to 'Uploaded', updating the 'Classified' row if any."""
            if file_id is None:
                db.saveFileStatus(script_name=script_name,
                                  file_original_name=pdffile,
                                  file_upload_path=directory,
                                  file_upload_name=remote_filename,
                                  file_status='Uploaded')
            else:
                db.saveFileStatus(id=file_id,
                                  file_upload_path=directory,
                                  file_upload_name=remote_filename,
                                  file_status='Uploaded')

        # connect to the file storage and upload files
        try:
            # fs_server is read but not used below: FileService is built from
            # the account name and key only.
            file_storage_url = dparameters['fs_server'].strip()
            file_storage_user = dparameters['fs_username'].strip()
            file_storage_pwd = dparameters['fs_password'].strip()
            file_storage_share = dparameters['fs_share'].strip()
            file_storage_dir = dparameters['fs_directory_prefix'].strip()
            file_service = FileService(account_name=file_storage_user,
                                       account_key=file_storage_pwd)
            try:
                if file_service.exists(file_storage_share):
                    print(
                        'Connection to Azure file storage successfully established...'
                    )
                    if len(file_storage_dir) > 0 and not file_service.exists(
                            file_storage_share,
                            directory_name=file_storage_dir):
                        file_service.create_directory(file_storage_share,
                                                      file_storage_dir)
                        print('Created directory:' + file_storage_dir)
                else:
                    print(
                        'Failed to connect to Asure file storage, share does not exist: '
                        + file_storage_share)
            except Exception as ex:
                print('Error connecting to Azure file storage: ', ex)

            for pdffile in lpdfs:
                file_details = db.readFileStatus(file_original_name=pdffile,
                                                 file_status='Uploaded')
                if file_details is not None:
                    print('File {} was uploaded before'.format(
                        file_details["file_original_name"]))
                    os.remove(pdffile)
                    continue

                file_details = db.readFileStatus(file_original_name=pdffile,
                                                 file_status='Classified')
                file_id = None
                if file_details is not None:
                    file_id = file_details["id"]
                parent_dir, rpdffile = ntpath.split(pdffile)

                # `year` is reset for every file: a PDF lying directly in
                # dir_upload has no year folder and must not inherit the
                # previous file's year (or hit an unbound name).
                year = ''
                if (parent_dir + '\\') == dir_upload or (parent_dir +
                                                         '/') == dir_upload:
                    destinationdir = 'Unclassified'
                else:
                    parent_dir, year = ntpath.split(parent_dir)
                    parent_dir, destinationdir = ntpath.split(parent_dir)

                retries = 0
                while retries < 3:
                    try:
                        path = pdffile
                        print('Uploading {}'.format(path))
                        remote_filename = fnm.azure_validate_filename(
                            rpdffile)
                        if not remote_filename:
                            # NOTE(review): this aborts the whole run, not
                            # just this file -- confirm that is intended.
                            return
                        if len(file_storage_dir) > 0:
                            directory = file_storage_dir + '/' + destinationdir
                        else:
                            directory = destinationdir
                        ensure_directory(file_service, file_storage_share,
                                         directory)
                        if year:
                            directory += '/' + year
                            ensure_directory(file_service, file_storage_share,
                                             directory)
                        print('Checking if {}/{} already exists'.format(
                            directory, remote_filename))
                        if file_service.exists(file_storage_share,
                                               directory_name=directory,
                                               file_name=remote_filename):
                            print('{}/{} already exists'.format(
                                directory, remote_filename))
                            mark_uploaded(file_id, pdffile, directory,
                                          remote_filename)
                            os.remove(pdffile)
                            break
                        file_service.create_file_from_path(
                            file_storage_share,
                            directory,
                            remote_filename,
                            path,
                            content_settings=ContentSettings(
                                content_type='application/pdf'))
                        mark_uploaded(file_id, pdffile, directory,
                                      remote_filename)
                        print('{}/{} uploaded'.format(directory,
                                                      remote_filename))
                        # Set before deleting, so a failing delete does not
                        # trigger a re-upload.
                        retries = 3
                        os.remove(pdffile)
                    except Exception as e:
                        print('Error uploading to Asure file storage,',
                              str(e))
                        retries += 1
        except Exception as e:
            print(str(e))
            logging.critical(str(e))
示例#6
0
    def rename_and_move_files():
        """Rename every downloaded PDF after its classification record and file it.

        For each name in ``pdfs`` a 'Downloaded' DB row is saved.  Names of
        the form ``<digits><19xx|20xx><digits>.pdf`` are matched against the
        ``classify_file`` records (keys ``record_0``, ``record_1``, ...) via
        their ``dbkey``.  Files without a matching record are left alone.
        The new name is ``<state> <entity> <year>.pdf`` and the file is moved
        to ``dir_upload/<category folder>/<year>/``, after which its DB row
        is set to 'Classified'.
        """
        # presumably populates classify_file -- TODO confirm
        classify_doc()

        def clean_auditee_name(raw_name):
            """Return the auditee name without municipality prefix or ', ...' suffix, title-cased."""
            name = raw_name.replace('/', '-')
            # Only the first prefix found is removed.
            for prefix in ('CITY OF ', 'MUNICIPALITY OF ', 'MUNICIPIOS OF ',
                           'VILLAGE OF ', 'TOWN OF '):
                if prefix in name:
                    name = name.replace(prefix, '')
                    break
            # Drop everything from the last comma on.
            suffix = re.match(r'.*(,.*)', name)
            if suffix:
                name = name.replace(suffix.group(1), '')
            return name.title()

        def move_to_folder(pdf, file_id, file_folder, auditeename, year,
                           pdf_name):
            """Move ``dir_pdfs + pdf`` to its category/year folder as *pdf_name*."""
            dest_filename = dir_upload

            if file_folder == 'general_purpose':
                dest_filename += general_purpose
            elif file_folder == 'school_district':
                dest_filename += school_district
            elif file_folder == 'public_higher_education':
                dest_filename += public_higher_education
            elif file_folder == 'special_district':
                dest_filename += special_district
            elif file_folder == 'non_profit':
                dest_filename += non_profit
            elif file_folder == 'unclassified':
                dest_filename += unclassified
            # Fallback for any other folder value: classify by auditee name.
            elif re.match(r'.*community\s*college.*', auditeename,
                          re.IGNORECASE):
                dest_filename += community_college_district

            # The path separator follows the configured operating system.
            if operating_system == 'mac' or operating_system == 'linux':
                dest_filename += '/' + year + '/'
            elif operating_system == 'windows':
                dest_filename += '\\' + year + '\\'

            os.makedirs(dest_filename, exist_ok=True)

            dest_filename += pdf_name
            print(dest_filename)
            try:
                os.rename(dir_pdfs + pdf, dest_filename)
                db.saveFileStatus(id=file_id,
                                  file_original_name=dest_filename,
                                  file_status='Classified')
            except Exception as e:
                # best effort: report and go on with the next file
                print(e)

        # Bound up front so the 'financial' test below never reads an
        # unassigned name on the first general-purpose file.
        pdf_name = ''
        for pdf in pdfs:
            file_id = db.saveFileStatus(script_name=script_name,
                                        file_original_name=pdf,
                                        file_status='Downloaded')
            key_match = re.match(r'(\d+)(?:19|20)\d{2}\d*\.pdf', pdf)
            if not key_match:
                continue
            file_key = key_match.group(1)

            # First record whose dbkey equals the key taken from the name.
            pdf_record = next(
                (classify_file['record_' + str(i)]
                 for i in range(len(classify_file))
                 if classify_file['record_' + str(i)]['dbkey'] == file_key),
                None)
            if pdf_record is None:
                continue
            file_folder = pdf_record['folder']

            year_match = re.match(r'.*\/((?:19|20)\d{2})',
                                  pdf_record['yearending'])
            if not year_match:
                # Without a year neither the name nor the folder can be built.
                print('No year found in yearending of record for ' + pdf)
                continue
            year = year_match.group(1)

            # Computed for every record, so the community-college fallback in
            # move_to_folder always sees the current file's auditee.
            auditeename = clean_auditee_name(pdf_record['auditeename'])
            state = pdf_record['state']
            is_general = file_key in general

            if is_general and file_key in ('100', '200'):
                pdf_name = (state + ' ' + pdf_record['city'] + ' ' +
                            'County' + ' ' + year + '.pdf')
            elif is_general and file_key == '300':
                pdf_name = (state + ' ' + pdf_record['city'] + ' ' +
                            'Township' + ' ' + year + '.pdf')
            elif is_general and re.match(r'.*financial.*', pdf_name,
                                         re.IGNORECASE):
                # NOTE(review): pdf_name still holds the name built for the
                # previous file here, and this branch renames without moving
                # the file.  Behaviour kept as found -- confirm the intent.
                full_name = states[codes.index(state)]
                pdf_name = (state + ' ' + 'State of ' + full_name + ' ' +
                            year + '.pdf')
                continue
            else:
                pdf_name = state + ' ' + auditeename + ' ' + year + '.pdf'

            move_to_folder(pdf, file_id, file_folder, auditeename, year,
                           pdf_name)