示例#1
0
def download_models_from_remote_server(remote_server_config, models_config,
                                       output_dir):
    """Download configured model archives from the remote server, unpack
    them into *output_dir*, and load the pickled classifier (plus the
    optional vectorizer) for each model.

    Returns a list of (model_name, classifier, vectorizer) tuples — one
    entry per model whose classifier pickle could be loaded.
    """
    logging.info('Downloading models...')
    output_dir = sharedlib.abspath(output_dir)
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['trained_models_dir'])

    models_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['trained_models_dir'])

    models = []
    for model_config in models_config:
        model_name = model_config['name']
        archive_name = model_config['archive_name']
        local_archive_path = os.path.join(output_dir, archive_name)
        classifier = None
        vectorizer = None

        if archive_name in remote_files:
            sharedlib.download_file(
                sharedlib.join_remote_server_paths(models_base_uri,
                                                   archive_name),
                local_archive_path, True)
            sharedlib.unzip(local_archive_path, output_dir)
            pickle_file = os.path.join(output_dir, model_name + '.pickle')
            if os.path.exists(pickle_file):
                classifier = sharedlib.load_pickle(pickle_file)

        # The vectorizer is looked up unconditionally: it may already be
        # present locally from a previous extraction.
        vectorizer_pickle_file = os.path.join(
            output_dir, model_name + '.vectorizer.pickle')
        if os.path.exists(vectorizer_pickle_file):
            logging.info('Vectorizer pickle file: {}'.format(
                os.path.basename(vectorizer_pickle_file)))
            logging.info('Loading the pickled vectorizer...')
            vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
        else:
            logging.info(
                'No vectorizer (expected: {}) found for this model.'.format(
                    vectorizer_pickle_file))

        if classifier is None:
            logging.info(
                'Could not find pickled classifier in the package {} on the Remote Server'
                .format(archive_name))
        else:
            models.append((model_name, classifier, vectorizer))

    logging.info('{} MODELS LOADED'.format(len(models)))
    return models
示例#2
0
def classify_files(files_to_classify):
    """Classify every file in *files_to_classify* using the configured models.

    Downloads any model archive that is missing locally (or whose config
    sets always_download), loads each pickled classifier/vectorizer pair,
    then runs classify_file() on each input file.
    """
    output_dir = sharedlib.abspath(config.output_dir)
    trained_models_dir = sharedlib.abspath(config.trained_models_dir)

    start_time = datetime.datetime.now()
    process_log_first_line = 'MAUDE Classification Process Log. Computer: {}. OS: {} {}  Date/Time: {}. Python Version: {}\n'.format(platform.node(), platform.system(), platform.release(), start_time, sys.version)
    log(process_log_first_line)
    log('classifier::classify_files() starting at {}'.format(start_time))

    models_config = config.models
    models = []
    log('Checking if model(s) need to be downloaded...')

    models_on_remote_server = sharedlib.get_list_of_files_from_remote_server(config.remote_server['trained_models_dir'])
    for model_config in models_config:
        model_name = model_config['name']
        classifier_pickle_file = os.path.join(trained_models_dir, model_name + '.pickle')
        vectorizer_pickle_file = os.path.join(trained_models_dir, model_name + '.vectorizer.pickle')
        # Download when forced by config or when the classifier pickle is absent.
        if model_config['always_download'] or not os.path.exists(classifier_pickle_file):
            log('Model {} needs to be downloaded.'.format(model_name))

            if model_config['archive_name'] not in models_on_remote_server:
                log('Model archive {} not found on the remote server. This model will be skipped.'.format(model_config['archive_name']))
                continue

            download_zip_file_path = os.path.join(trained_models_dir, model_config['archive_name'])

            model_url = sharedlib.join_remote_server_paths(config.remote_server['base_uri'], config.remote_server['trained_models_dir'], model_config['archive_name'])

            sharedlib.download_file(model_url, download_zip_file_path, True)
            log('Extracting model archive...')
            sharedlib.unzip(download_zip_file_path, trained_models_dir)
            log('Model extracted.')

        log('Classifier pickle file: {}'.format(os.path.basename(classifier_pickle_file)))
        log('Loading the pickled classifier...')
        classifier = sharedlib.load_pickle(classifier_pickle_file)
        vectorizer = None

        if os.path.exists(vectorizer_pickle_file):
            log('Vectorizer pickle file: {}'.format(os.path.basename(vectorizer_pickle_file)))
            log('Loading the pickled vectorizer...')
            vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
        else:
            log('No vectorizer (expected: {}) found for this model.'.format(vectorizer_pickle_file))

        log('Model ({}) loaded.'.format(classifier))
        models.append((model_name, classifier, vectorizer))

    log('Total {} model(s) loaded.'.format(len(models)))

    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and a future SyntaxError).
    positive_signal_regexes_for_false_negative_check = [re.compile(r'\s{}\s'.format(p.strip()), re.IGNORECASE) for p in config.positive_signals_for_false_negative_check]

    for input_data_file in files_to_classify:
        classify_file(input_data_file, models, positive_signal_regexes_for_false_negative_check, True, config.target_file_max_num_records_to_classify)

    end_time = datetime.datetime.now()
    log('classifier::classify_files() completed at {}. Total duration: {}.'.format(end_time, end_time - start_time))
def generate_models_per_config(input_data_files):
    """Download (when needed) the labeled positive/negative record files
    for each configured input data file set, then generate models.

    NOTE(review): generate_models() is invoked INSIDE the per-file-set
    loop with the cumulative file lists, so models are regenerated after
    every file set is added. Confirm this is intended rather than a
    single call after the loop; behavior preserved here.
    """
    input_dir = sharedlib.abspath(config.input_dir)
    output_dir = sharedlib.abspath(config.output_dir)
    start_time = datetime.datetime.now()
    log('modeler::create_models() starting at {}'.format(start_time))

    positive_records_files = []
    negative_records_files = []
    log('Checking if labeled archive(s) need to be downloaded...')
    for input_data_file_set in input_data_files:
        positive_records_file = os.path.join(
            input_dir, input_data_file_set['positive_records_file'])
        negative_records_file = os.path.join(
            input_dir, input_data_file_set['negative_records_file'])
        # Download when forced by config or when either labeled file is missing.
        if (input_data_file_set['always_download']
                or not os.path.exists(positive_records_file)
                or not os.path.exists(negative_records_file)):
            log('Labeled archive for input data needs to be downloaded.')
            positive_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['positive_records_file'])
            negative_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['negative_records_file'])
            sharedlib.download_file(positive_records_file_uri,
                                    positive_records_file, True)
            sharedlib.download_file(negative_records_file_uri,
                                    negative_records_file, True)

        log('Positive records file: {}'.format(
            os.path.basename(positive_records_file)))
        log('Negative records file: {}'.format(
            os.path.basename(negative_records_file)))

        positive_records_files.append(positive_records_file)
        negative_records_files.append(negative_records_file)

        generate_models(positive_records_files, negative_records_files,
                        config.models,
                        config.duplicate_record_check_ignore_pattern,
                        output_dir, config.upload_output_to_remote_server)
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    """Download the auto-labeled record and bookkeeping files from the
    remote server's auto-labeled directory into the local output paths.

    The four download calls were previously duplicated stanzas; they are
    driven by a (blob config key, output file config key) table instead.
    """
    auto_labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_auto_labeled_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        auto_labeled_base_uri))

    remote_auto_labeled_files_from_config = remote_server_files[
        'auto_labeled_files']

    # True means "overwrite even if already present locally".
    overwrite_existing = not remote_auto_labeled_files_from_config[
        'skip_download_if_already_present']

    blob_to_file_keys = [
        ('autolabeled_positive_records_blob',
         'autolabeled_positive_records_file'),
        ('autolabeled_negative_records_blob',
         'autolabeled_negative_records_file'),
        ('input_file_total_lines_count_blob',
         'input_file_total_lines_count_file'),
        ('already_processed_record_numbers_blob',
         'already_processed_record_numbers_file'),
    ]
    for blob_key, file_key in blob_to_file_keys:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                auto_labeled_base_uri,
                remote_auto_labeled_files_from_config[blob_key]),
            sharedlib.abspath(output_files[file_key]), overwrite_existing)
def download_labeled_seed_files(remote_server_config, remote_server_files,
                                output_files):
    """Download the verified (human-labeled) seed record files from the
    remote server's verified-samples directory.

    NOTE(review): the verified blobs are written to the
    'autolabeled_*_records_file' output paths — presumably they seed the
    auto-labeling loop; confirm against callers.
    """
    verified_records_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        verified_records_base_uri))

    remote_seed_files_from_config = remote_server_files['labeled_seed_files']
    overwrite_existing = not remote_seed_files_from_config[
        'skip_download_if_already_present']

    positive_blob_uri = sharedlib.join_remote_server_paths(
        verified_records_base_uri,
        remote_seed_files_from_config['verified_positive_records_blob'])
    sharedlib.download_file(
        positive_blob_uri,
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        overwrite_existing)

    negative_blob_uri = sharedlib.join_remote_server_paths(
        verified_records_base_uri,
        remote_seed_files_from_config['verified_negative_records_blob'])
    sharedlib.download_file(
        negative_blob_uri,
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        overwrite_existing)
def _append_non_blank_records(source_path, destination):
    # Copy every non-blank line of the file at source_path into the
    # already-open destination file. The original code opened these
    # files without ever closing them; `with` fixes the handle leak.
    with open(source_path, encoding='utf-8', errors='ignore') as fin:
        for record in fin:
            if len(record.strip()) == 0:
                continue
            destination.write(record)


def build_potential_file_sets(input_files,
                              potential_positive_records_file_merged,
                              potential_negative_records_file_merged,
                              questionable_positive_records_file_merged,
                              questionable_negative_records_file_merged):
    """Merge each file set's labeling candidate files into four
    consolidated files (potential/questionable x positive/negative),
    downloading and extracting a file set's candidate archive first
    when forced by config or when the potential files are missing.
    """
    logging.info('Building potential positive and negative files...')

    input_dir = sharedlib.abspath(config.input_dir)

    with open(potential_positive_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_pos, \
         open(potential_negative_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_neg, \
         open(questionable_positive_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_questionable_pos, \
         open(questionable_negative_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_questionable_neg:
        for input_data_file_set in input_files:
            potential_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_positive_records_file'])
            potential_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_negative_records_file'])
            questionable_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_positive_records_file'])
            questionable_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_negative_records_file'])

            # Download when forced or when either potential file is missing.
            if (input_data_file_set['always_download']
                    or not os.path.exists(potential_positive_records_file)
                    or not os.path.exists(potential_negative_records_file)):
                logging.info(
                    'Labeling candidate archive for {} needs to be downloaded.'
                    .format(input_data_file_set['name']))

                labeling_candidates_file_url = sharedlib.join_remote_server_paths(
                    config.remote_server['base_uri'],
                    config.remote_server['labeling_candidates_dir'],
                    input_data_file_set['labeling_candidates_archive_name'])

                download_zip_file_path = os.path.join(
                    input_dir, input_data_file_set['name'] + '.zip')
                sharedlib.download_file(labeling_candidates_file_url,
                                        download_zip_file_path)
                logging.info('Extracting auto-labeled archive...')
                sharedlib.unzip(download_zip_file_path, input_dir)
                logging.info('Labeling candidate files extracted.')

            logging.info('Merging {} into {}...'.format(
                input_data_file_set['potential_positive_records_file'],
                potential_positive_records_file_merged))
            _append_non_blank_records(potential_positive_records_file,
                                      consolidated_pos)

            logging.info('Merging {} into {}...'.format(
                input_data_file_set['potential_negative_records_file'],
                potential_negative_records_file_merged))
            _append_non_blank_records(potential_negative_records_file,
                                      consolidated_neg)

            logging.info('Merging {} into {}...'.format(
                input_data_file_set['questionable_positive_records_file'],
                questionable_positive_records_file_merged))
            _append_non_blank_records(questionable_positive_records_file,
                                      consolidated_questionable_pos)

            logging.info('Merging {} into {}...'.format(
                input_data_file_set['questionable_negative_records_file'],
                questionable_negative_records_file_merged))
            _append_non_blank_records(questionable_negative_records_file,
                                      consolidated_questionable_neg)
示例#7
0
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    """Download the labeling working-set files (potential, questionable,
    verified, bookkeeping) plus any per-model accuracy JSON files from
    the remote server's verified-samples directory into the local paths
    named by *output_files*.
    """
    labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading cloud files from {}'.format(labeled_base_uri))

    # True means "overwrite even if already present locally".
    overwrite_existing = not remote_server_files[
        'skip_download_if_already_present']

    # Candidate files: re-download only when config allows overwriting.
    for blob_key, file_key in [
            ('potential_positive_records_blob',
             'potential_positive_records_file'),
            ('potential_negative_records_blob',
             'potential_negative_records_file'),
            ('questionable_positive_records_blob',
             'questionable_positive_records_file'),
            ('questionable_negative_records_blob',
             'questionable_negative_records_file'),
    ]:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                labeled_base_uri, remote_server_files[blob_key]),
            sharedlib.abspath(output_files[file_key]), overwrite_existing)

    # Verified record and bookkeeping files: always re-download.
    for blob_key, file_key in [
            ('verified_positive_records_blob',
             'verified_positive_records_file'),
            ('verified_negative_records_blob',
             'verified_negative_records_file'),
            ('already_processed_record_numbers_blob',
             'already_processed_record_numbers_file'),
    ]:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                labeled_base_uri, remote_server_files[blob_key]),
            sharedlib.abspath(output_files[file_key]), True)

    logging.info('Downloading model labeling accuracy files...')
    # Raw string with an escaped dot: the original '.*_accuracy.json' let
    # the unescaped '.' match any character before 'json'.
    accuracy_file_pattern = re.compile(r'.*_accuracy\.json')
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['labeling_verified_samples_dir'])
    accuracy_files = [
        file_name for file_name in remote_files
        if accuracy_file_pattern.search(file_name) is not None
    ]

    # Accuracy files land next to the already-processed-record-numbers file.
    local_accuracy_dir = os.path.dirname(
        output_files['already_processed_record_numbers_file'])
    for accuracy_file in accuracy_files:
        file_uri = sharedlib.join_remote_server_paths(labeled_base_uri,
                                                      accuracy_file)
        file_local_path = sharedlib.abspath(
            os.path.join(local_accuracy_dir, accuracy_file))
        sharedlib.download_file(file_uri, file_local_path, True)