import datetime
import logging
import os
import platform
import re
import sys

# sharedlib, config, log, classify_file, and generate_models are provided by
# the surrounding project (not shown in this section).


def classify_files(files_to_classify):
    output_dir = sharedlib.abspath(config.output_dir)
    trained_models_dir = sharedlib.abspath(config.trained_models_dir)
    start_time = datetime.datetime.now()
    process_log_first_line = 'MAUDE Classification Process Log. Computer: {}. OS: {} {}. Date/Time: {}. Python Version: {}\n'.format(
        platform.node(), platform.system(), platform.release(), start_time,
        sys.version)
    log(process_log_first_line)
    log('classifier::classify_files() starting at {}'.format(start_time))

    models_config = config.models
    models = []
    log('Checking if model(s) need to be downloaded...')
    models_on_remote_server = sharedlib.get_list_of_files_from_remote_server(
        config.remote_server['trained_models_dir'])
    for model_config in models_config:
        model_name = model_config['name']
        classifier_pickle_file = os.path.join(trained_models_dir,
                                              model_name + '.pickle')
        vectorizer_pickle_file = os.path.join(
            trained_models_dir, model_name + '.vectorizer.pickle')
        if model_config['always_download'] or not os.path.exists(
                classifier_pickle_file):
            log('Model {} needs to be downloaded.'.format(model_name))
            if model_config['archive_name'] not in models_on_remote_server:
                log('Model archive {} not found on the remote server. This model will be skipped.'
                    .format(model_config['archive_name']))
                continue
            download_zip_file_path = os.path.join(
                trained_models_dir, model_config['archive_name'])
            model_url = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                config.remote_server['trained_models_dir'],
                model_config['archive_name'])
            sharedlib.download_file(model_url, download_zip_file_path, True)
            log('Extracting model archive...')
            sharedlib.unzip(download_zip_file_path, trained_models_dir)
            log('Model extracted.')

        log('Classifier pickle file: {}'.format(
            os.path.basename(classifier_pickle_file)))
        log('Loading the pickled classifier...')
        classifier = sharedlib.load_pickle(classifier_pickle_file)

        vectorizer = None
        if os.path.exists(vectorizer_pickle_file):
            log('Vectorizer pickle file: {}'.format(
                os.path.basename(vectorizer_pickle_file)))
            log('Loading the pickled vectorizer...')
            vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
        else:
            log('No vectorizer (expected: {}) found for this model.'.format(
                vectorizer_pickle_file))

        log('Model ({}) loaded.'.format(classifier))
        models.append((model_name, classifier, vectorizer))

    log('Total {} model(s) loaded.'.format(len(models)))

    # Raw string avoids invalid escape-sequence warnings for \s in the pattern.
    positive_signal_regexes_for_false_negative_check = [
        re.compile(r'\s{}\s'.format(p.strip()), re.IGNORECASE)
        for p in config.positive_signals_for_false_negative_check
    ]
    for input_data_file in files_to_classify:
        classify_file(input_data_file, models,
                      positive_signal_regexes_for_false_negative_check, True,
                      config.target_file_max_num_records_to_classify)

    end_time = datetime.datetime.now()
    log('classifier::classify_files() completed at {}. Total duration: {}.'.
        format(end_time, end_time - start_time))
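
# A minimal (hypothetical) driver for classify_files(), assuming config.py
# also exposes a files_to_classify list alongside the settings used above;
# the names in this sketch are illustrative, not part of this module:
#
#   if __name__ == '__main__':
#       input_dir = sharedlib.abspath(config.input_dir)
#       classify_files(
#           [os.path.join(input_dir, name) for name in config.files_to_classify])
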
def download_models_from_remote_server(remote_server_config, models_config,
                                       output_dir):
    logging.info('Downloading models...')
    output_dir = sharedlib.abspath(output_dir)
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['trained_models_dir'])
    models_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['trained_models_dir'])
    models = []
    for model_config in models_config:
        model_name = model_config['name']
        archive_name = model_config['archive_name']
        local_zip_path = os.path.join(output_dir, archive_name)
        classifier = None
        vectorizer = None
        if archive_name in remote_files:
            sharedlib.download_file(
                sharedlib.join_remote_server_paths(models_base_uri,
                                                   archive_name),
                local_zip_path, True)
            sharedlib.unzip(local_zip_path, output_dir)
            pickle_file = os.path.join(output_dir, model_name + '.pickle')
            if os.path.exists(pickle_file):
                classifier = sharedlib.load_pickle(pickle_file)
            vectorizer_pickle_file = os.path.join(
                output_dir, model_name + '.vectorizer.pickle')
            if os.path.exists(vectorizer_pickle_file):
                logging.info('Vectorizer pickle file: {}'.format(
                    os.path.basename(vectorizer_pickle_file)))
                logging.info('Loading the pickled vectorizer...')
                vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
            else:
                logging.info(
                    'No vectorizer (expected: {}) found for this model.'.
                    format(vectorizer_pickle_file))
        if classifier is not None:
            models.append((model_name, classifier, vectorizer))
        else:
            logging.info(
                'Could not find pickled classifier in the package {} on the Remote Server'
                .format(archive_name))
    logging.info('{} MODELS LOADED'.format(len(models)))
    return models
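
# Example (hypothetical) call, assuming the config shapes used elsewhere in
# this section; config.trained_models_dir as the destination is an assumption:
#
#   models = download_models_from_remote_server(
#       config.remote_server, config.models, config.trained_models_dir)
#   for model_name, classifier, vectorizer in models:
#       logging.info('Ready: {}'.format(model_name))
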
def generate_models_per_config(input_data_files):
    input_dir = sharedlib.abspath(config.input_dir)
    output_dir = sharedlib.abspath(config.output_dir)
    start_time = datetime.datetime.now()
    log('modeler::create_models() starting at {}'.format(start_time))

    positive_records_files = []
    negative_records_files = []
    log('Checking if labeled archive(s) need to be downloaded...')
    for input_data_file_set in input_data_files:
        positive_records_file = os.path.join(
            input_dir, input_data_file_set['positive_records_file'])
        negative_records_file = os.path.join(
            input_dir, input_data_file_set['negative_records_file'])
        if (input_data_file_set['always_download']
                or not os.path.exists(positive_records_file)
                or not os.path.exists(negative_records_file)):
            log('Labeled archive for input data needs to be downloaded.')
            positive_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['positive_records_file'])
            negative_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['negative_records_file'])
            sharedlib.download_file(positive_records_file_uri,
                                    positive_records_file, True)
            sharedlib.download_file(negative_records_file_uri,
                                    negative_records_file, True)
        log('Positive records file: {}'.format(
            os.path.basename(positive_records_file)))
        log('Negative records file: {}'.format(
            os.path.basename(negative_records_file)))
        positive_records_files.append(positive_records_file)
        negative_records_files.append(negative_records_file)

    generate_models(positive_records_files, negative_records_files,
                    config.models,
                    config.duplicate_record_check_ignore_pattern, output_dir,
                    config.upload_output_to_remote_server)
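
# Hypothetical invocation, assuming config.input_data_files is a list of dicts
# shaped like the input_data_file_set entries consumed above:
#
#   generate_models_per_config(config.input_data_files)
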
def download_labeled_seed_files(remote_server_config, remote_server_files,
                                output_files):
    verified_records_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        verified_records_base_uri))
    remote_seed_files_from_config = remote_server_files['labeled_seed_files']
    force_download = not remote_seed_files_from_config[
        'skip_download_if_already_present']
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            verified_records_base_uri,
            remote_seed_files_from_config['verified_positive_records_blob']),
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            verified_records_base_uri,
            remote_seed_files_from_config['verified_negative_records_blob']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        force_download)
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    auto_labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_auto_labeled_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        auto_labeled_base_uri))
    remote_auto_labeled_files_from_config = remote_server_files[
        'auto_labeled_files']
    force_download = not remote_auto_labeled_files_from_config[
        'skip_download_if_already_present']
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'autolabeled_positive_records_blob']),
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'autolabeled_negative_records_blob']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'input_file_total_lines_count_blob']),
        sharedlib.abspath(output_files['input_file_total_lines_count_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'already_processed_record_numbers_blob']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']),
        force_download)
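
# The download calls in the two functions above follow one pattern: blob name
# -> local file -> force flag. A sketch of a table-driven helper that could
# replace the repetition; the helper name is hypothetical and nothing else in
# this section calls it:
def _download_blobs(base_uri, blob_to_file_pairs, force_download):
    # Each pair maps a remote blob name to the local file it should land in.
    for blob_name, local_file in blob_to_file_pairs:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(base_uri, blob_name),
            sharedlib.abspath(local_file), force_download)
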
def build_potential_file_sets(input_files,
                              potential_positive_records_file_merged,
                              potential_negative_records_file_merged,
                              questionable_positive_records_file_merged,
                              questionable_negative_records_file_merged):
    logging.info('Building potential positive and negative files...')
    input_dir = sharedlib.abspath(config.input_dir)
    with open(potential_positive_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_pos, \
         open(potential_negative_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_neg, \
         open(questionable_positive_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_questionable_pos, \
         open(questionable_negative_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_questionable_neg:
        for input_data_file_set in input_files:
            potential_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_positive_records_file'])
            potential_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_negative_records_file'])
            questionable_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_positive_records_file'])
            questionable_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_negative_records_file'])
            if (input_data_file_set['always_download']
                    or not os.path.exists(potential_positive_records_file)
                    or not os.path.exists(potential_negative_records_file)):
                logging.info(
                    'Labeling candidate archive for {} needs to be downloaded.'
                    .format(input_data_file_set['name']))
                labeling_candidates_file_url = sharedlib.join_remote_server_paths(
                    config.remote_server['base_uri'],
                    config.remote_server['labeling_candidates_dir'],
                    input_data_file_set['labeling_candidates_archive_name'])
                download_zip_file_path = os.path.join(
                    input_dir, input_data_file_set['name'] + '.zip')
                sharedlib.download_file(labeling_candidates_file_url,
                                        download_zip_file_path)
                logging.info('Extracting auto-labeled archive...')
                sharedlib.unzip(download_zip_file_path, input_dir)
                logging.info('Labeling candidate files extracted.')

            # The four merges are identical except for source and destination;
            # drive them from a table instead of repeating the loop body.
            merge_jobs = [
                (potential_positive_records_file,
                 potential_positive_records_file_merged, consolidated_pos),
                (potential_negative_records_file,
                 potential_negative_records_file_merged, consolidated_neg),
                (questionable_positive_records_file,
                 questionable_positive_records_file_merged,
                 consolidated_questionable_pos),
                (questionable_negative_records_file,
                 questionable_negative_records_file_merged,
                 consolidated_questionable_neg),
            ]
            for source_file, merged_file, out_handle in merge_jobs:
                logging.info('Merging {} into {}...'.format(
                    os.path.basename(source_file), merged_file))
                # Close each source promptly (the original left these handles
                # open) and skip blank records while merging.
                with open(source_file, encoding='utf-8',
                          errors='ignore') as fin:
                    for record in fin:
                        if len(record.strip()) == 0:
                            continue
                        out_handle.write(record)
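
# A hedged alternative to the four parallel `with` targets above: open all
# merged output files through contextlib.ExitStack (standard library), which
# scales cleanly if more record categories are ever added. _open_merged_outputs
# is a hypothetical helper, not used elsewhere in this section:
def _open_merged_outputs(paths):
    import contextlib  # local import keeps this sketch self-contained
    # Caller wraps the returned stack in `with stack:` so every handle closes.
    stack = contextlib.ExitStack()
    handles = [
        stack.enter_context(open(path, 'w', encoding='utf-8',
                                 errors='ignore')) for path in paths
    ]
    return stack, handles
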
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading cloud files from {}'.format(labeled_base_uri))
    force_download = not remote_server_files[
        'skip_download_if_already_present']
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['potential_positive_records_blob']),
        sharedlib.abspath(output_files['potential_positive_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['potential_negative_records_blob']),
        sharedlib.abspath(output_files['potential_negative_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['questionable_positive_records_blob']),
        sharedlib.abspath(output_files['questionable_positive_records_file']),
        force_download)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['questionable_negative_records_blob']),
        sharedlib.abspath(output_files['questionable_negative_records_file']),
        force_download)
    # Verified samples and the processed-record index are always re-downloaded.
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['verified_positive_records_blob']),
        sharedlib.abspath(output_files['verified_positive_records_file']),
        True)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['verified_negative_records_blob']),
        sharedlib.abspath(output_files['verified_negative_records_file']),
        True)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['already_processed_record_numbers_blob']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']), True)

    logging.info('Downloading model labeling accuracy files...')
    # Raw string with an escaped dot so the pattern matches a literal '.json'.
    accuracy_file_pattern = re.compile(r'.*_accuracy\.json')
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['labeling_verified_samples_dir'])
    accuracy_files = [
        file_name for file_name in remote_files
        if accuracy_file_pattern.search(file_name) is not None
    ]
    for accuracy_file in accuracy_files:
        file_uri = sharedlib.join_remote_server_paths(labeled_base_uri,
                                                      accuracy_file)
        file_local_path = sharedlib.abspath(
            os.path.join(
                os.path.dirname(
                    output_files['already_processed_record_numbers_file']),
                accuracy_file))
        sharedlib.download_file(file_uri, file_local_path, True)
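
# For reference, the accuracy-file filter above matches blob names such as
# 'svm_accuracy.json'. A quick illustrative check (file names hypothetical):
#
#   pattern = re.compile(r'.*_accuracy\.json')
#   assert pattern.search('svm_accuracy.json') is not None
#   assert pattern.search('svm_predictions.csv') is None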