def initialize():
    """Set up import paths, the session log file, and working directories."""
    script_dir = os.path.dirname(__file__)
    # Make the shared library and the local lib package importable.
    for extra in (os.path.join(script_dir, '..', '..', 'shared'),
                  os.path.join(script_dir, 'lib')):
        add_to_path(os.path.abspath(extra))
    global log_file_path
    log_file_path = os.path.join(script_dir, 'out', 'why_session.log')
    import config
    import sharedlib
    sharedlib.initialize(script_dir, log_file_path, config.remote_server)
    # Ensure the output and chunk directories exist before any processing.
    required = [os.path.join(script_dir, name) for name in ('out', 'file_chunks')]
    sharedlib.create_dirs([sharedlib.abspath(path) for path in required])
def get_labeling_accuracy(model_name, output_dir):
    """Return labeling accuracy for a model as a tuple:
    (<all time accuracy>, <accuracy over last 500>, <accuracy over last 100>).

    Accuracies are fractions in [0, 1]. Returns (0, 0, 0) when no accuracy
    history exists (missing file, null, or empty list).
    """
    accuracy_file_path = os.path.join(sharedlib.abspath(output_dir),
                                      model_name + '_accuracy.json')
    if not os.path.exists(accuracy_file_path):
        return (0, 0, 0)

    with open(accuracy_file_path, 'r') as f:
        accuracy_data = json.load(f)

    # Treat null OR an empty history as "no data". The original only checked
    # for None, so an empty list fell through and raised ZeroDivisionError.
    if not accuracy_data:
        return (0, 0, 0)

    last_100 = accuracy_data[-100:]
    last_500 = accuracy_data[-500:]

    def _accuracy(items):
        # Fraction of entries flagged correct; items is never empty here.
        return sum(1 for item in items if item['correct']) / len(items)

    return (_accuracy(accuracy_data), _accuracy(last_500), _accuracy(last_100))
def main(args=None):
    """Search the out/*.process.txt logs for a record number or text pattern.

    args[0] is the pattern to search for (case-insensitive regex); optional
    args[1] set to 'false' disables stopping at the first match.
    """
    initialize()
    if args is None:
        args = sys.argv[1:]
    if not args:
        logging.info(
            'Usage python why.py <Report or Text Record Number> [return_on_first_find=True]'
        )
        return
    search_term = args[0]
    return_on_first_find = len(args) > 1 and args[1].lower() != 'false'
    import sharedlib
    output_dir = 'out'
    if not os.path.isabs(output_dir):
        output_dir = sharedlib.abspath(
            os.path.join(os.path.dirname(__file__), output_dir))
    match_count = 0
    process_files = [name for name in os.listdir(output_dir)
                     if name.endswith('.process.txt')]
    for filename in process_files:
        logging.info('Looking for {} in: {}...'.format(search_term, filename))
        with open(os.path.join(output_dir, filename), 'r') as f:
            for line in f:
                if re.search(search_term, line, re.IGNORECASE) is None:
                    continue
                logging.info('{}=> {}.'.format(filename, line))
                match_count += 1
                if return_on_first_find:
                    return
    if match_count == 0:
        logging.info('Nothing found for: {}.'.format(search_term))
def initialize():
    """Set up import paths, a timestamped modeling log, and I/O directories."""
    script_dir = os.path.dirname(__file__)
    # Make the shared library and the local lib package importable.
    for extra in (os.path.join(script_dir, '..', '..', 'shared'),
                  os.path.join(script_dir, 'lib')):
        add_to_path(os.path.abspath(extra))
    global log_file_path
    timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H%M%S")
    log_file_path = os.path.join(script_dir, 'out',
                                 'modeling_{}.log'.format(timestamp))
    import config
    import sharedlib
    sharedlib.initialize(script_dir, log_file_path, config.remote_server)
    # Ensure the input and output directories exist before modeling starts.
    sharedlib.create_dirs([sharedlib.abspath(os.path.join(script_dir, name))
                           for name in ('in', 'out')])
def generate_models_per_config(input_data_files):
    """Download labeled record archives as needed, then generate the models.

    input_data_files: list of dicts, each describing a labeled file set
    ('positive_records_file', 'negative_records_file', 'remote_blob_dir',
    and an 'always_download' flag).
    """
    input_dir = sharedlib.abspath(config.input_dir)
    output_dir = sharedlib.abspath(config.output_dir)
    start_time = datetime.datetime.now()
    log('modeler::create_models() starting at {}'.format(start_time))

    positive_records_files = []
    negative_records_files = []
    log('Checking if labeled archive(s) need to be downloaded...')
    for input_data_file_set in input_data_files:
        positive_records_file = os.path.join(
            input_dir, input_data_file_set['positive_records_file'])
        negative_records_file = os.path.join(
            input_dir, input_data_file_set['negative_records_file'])
        # Download when forced by config or when either local copy is missing.
        # (Idiomatic truthiness instead of `== True` / `== False` comparisons.)
        if (input_data_file_set['always_download']
                or not os.path.exists(positive_records_file)
                or not os.path.exists(negative_records_file)):
            log('Labeled archive for input data needs to be downloaded.')
            positive_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['positive_records_file'])
            negative_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['negative_records_file'])
            sharedlib.download_file(positive_records_file_uri,
                                    positive_records_file, True)
            sharedlib.download_file(negative_records_file_uri,
                                    negative_records_file, True)
        log('Positive records file: {}'.format(
            os.path.basename(positive_records_file)))
        log('Negative records file: {}'.format(
            os.path.basename(negative_records_file)))
        positive_records_files.append(positive_records_file)
        negative_records_files.append(negative_records_file)

    generate_models(positive_records_files, negative_records_files,
                    config.models, config.duplicate_record_check_ignore_pattern,
                    output_dir, config.upload_output_to_remote_server)
def upload_output_to_remote_server(pattern_to_match=None):
    """Upload output files matching a suffix to the labeling-candidates dir.

    pattern_to_match: case-insensitive file-name suffix; defaults to '.zip'.
    """
    import config
    import sharedlib
    if pattern_to_match is None:
        pattern_to_match = '.zip'
    logging.info('Uploading output of the previous run(s) to the remote server...')
    output_dir = sharedlib.abspath(config.output_dir)
    files_to_upload = []
    for name in os.listdir(output_dir):
        if name.lower().endswith(pattern_to_match):
            files_to_upload.append(os.path.join(output_dir, name))
    sharedlib.upload_files_to_remote_server_with_prompt(
        files_to_upload, config.remote_server['labeling_candidates_dir'])
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    """Download the auto-labeled artifacts from the remote server.

    remote_server_config: dict with 'base_uri' and 'labeling_auto_labeled_dir'.
    remote_server_files: dict whose 'auto_labeled_files' entry maps blob names
        and carries 'skip_download_if_already_present'.
    output_files: dict mapping logical names to local file paths.
    """
    auto_labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_auto_labeled_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        auto_labeled_base_uri))
    remote_auto_labeled_files_from_config = remote_server_files[
        'auto_labeled_files']
    # Re-download unless config says an already-present file may be skipped.
    overwrite = not remote_auto_labeled_files_from_config[
        'skip_download_if_already_present']
    # (remote blob key, local output-file key) pairs — the download logic is
    # identical for each, so drive it from data instead of four copied stanzas.
    blob_to_file = [
        ('autolabeled_positive_records_blob', 'autolabeled_positive_records_file'),
        ('autolabeled_negative_records_blob', 'autolabeled_negative_records_file'),
        ('input_file_total_lines_count_blob', 'input_file_total_lines_count_file'),
        ('already_processed_record_numbers_blob',
         'already_processed_record_numbers_file'),
    ]
    for blob_key, file_key in blob_to_file:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                auto_labeled_base_uri,
                remote_auto_labeled_files_from_config[blob_key]),
            sharedlib.abspath(output_files[file_key]),
            overwrite)
def download_labeled_seed_files(remote_server_config, remote_server_files,
                                output_files):
    """Download verified seed record files into the auto-labeled output paths.

    Note: the verified positive/negative blobs are intentionally written to
    the 'autolabeled_*' output file locations — they seed the master set.
    """
    verified_records_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        verified_records_base_uri))
    remote_seed_files_from_config = remote_server_files['labeled_seed_files']
    overwrite = not remote_seed_files_from_config[
        'skip_download_if_already_present']
    seed_downloads = (
        ('verified_positive_records_blob', 'autolabeled_positive_records_file'),
        ('verified_negative_records_blob', 'autolabeled_negative_records_file'),
    )
    for blob_key, file_key in seed_downloads:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                verified_records_base_uri,
                remote_seed_files_from_config[blob_key]),
            sharedlib.abspath(output_files[file_key]),
            overwrite)
def upload_output_to_remote_server():
    """Upload the auto-labeling output files that exist locally.

    Missing files are silently skipped; the user is prompted before upload.
    """
    import config
    import sharedlib
    logging.info(
        'Uploading output of the previous run(s) to the remote server...')
    output_files = config.output_files
    files_to_upload = [
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        sharedlib.abspath(output_files['input_file_total_lines_count_file']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']),
    ]
    # Only upload files that actually exist on disk. (Removed the unused
    # `output_dir` local and the `== True` comparison from the original.)
    files_to_upload = [f for f in files_to_upload if os.path.exists(f)]
    sharedlib.upload_files_to_remote_server_with_prompt(
        files_to_upload, config.remote_server['labeling_auto_labeled_dir'])
def split_file(large_file, split_dir, max_records_per_file=50000):
    """Split a large text file into sequentially numbered chunk files.

    large_file: path to the input file (resolved via sharedlib.abspath).
    split_dir: directory that receives '<stem>.NN.txt' chunk files.
    max_records_per_file: maximum lines per chunk.
    Returns the list of chunk file paths (empty for an empty input file).
    """
    logging.info(
        'Splitting file {} into multiple files with max {} records each'.format(
            large_file, max_records_per_file))
    input_file_path = sharedlib.abspath(large_file)
    split_dir = sharedlib.abspath(split_dir)
    input_file_base_name = os.path.basename(input_file_path)
    input_file_name_without_ext = os.path.splitext(input_file_base_name)[0]
    chunk_number = 0
    line_number = 0
    output_file = None
    chunks = []
    try:
        with open(input_file_path, 'r', encoding='utf-8',
                  errors='ignore') as f:
            for line in f:
                if line_number == 0 or line_number == max_records_per_file:
                    # Start a new chunk: reset the per-chunk counter,
                    # close the previous chunk, and open the next one.
                    line_number = 0
                    chunk_number += 1
                    if output_file is not None:
                        output_file.close()
                    chunk_path = os.path.join(
                        split_dir,
                        input_file_name_without_ext +
                        '.{:02d}.txt'.format(chunk_number))
                    logging.info('Creating new file: {}...'.format(chunk_path))
                    output_file = open(chunk_path, 'w', encoding='utf-8',
                                       errors='ignore')
                    chunks.append(chunk_path)
                line_number += 1
                output_file.write(line)
    finally:
        # Guard against an empty input file (no chunk ever opened): the
        # original called output_file.close() unconditionally and crashed
        # with AttributeError on None. try/finally also closes on errors.
        if output_file is not None:
            output_file.close()
    logging.info('{} split into {} smaller files.'.format(
        input_file_base_name, len(chunks)))
    return chunks
def download_models_from_remote_server(remote_server_config, models_config,
                                       output_dir):
    """Download, unzip, and unpickle each configured model archive.

    remote_server_config: dict with 'base_uri' and 'trained_models_dir'.
    models_config: iterable of dicts with 'name' and 'archive_name'.
    output_dir: local directory for archives and extracted pickles.
    Returns a list of (model_name, classifier, vectorizer) tuples; a model is
    skipped when its classifier pickle is missing, and vectorizer is None
    when no '<name>.vectorizer.pickle' ships in the archive.
    """
    logging.info('Downloading models...')
    output_dir = sharedlib.abspath(output_dir)
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['trained_models_dir'])
    models_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['trained_models_dir'])
    models = []
    for model_config in models_config:
        # (model name, archive blob name, local archive path)
        name_zip_tuple = (model_config['name'], model_config['archive_name'],
                          os.path.join(output_dir,
                                       model_config['archive_name']))
        classifier = None
        vectorizer = None
        if name_zip_tuple[1] in remote_files:
            # Fetch and unpack the model archive into the output directory.
            sharedlib.download_file(
                sharedlib.join_remote_server_paths(
                    models_base_uri, model_config['archive_name']),
                name_zip_tuple[2], True)
            sharedlib.unzip(name_zip_tuple[2], output_dir)
            pickle_file = os.path.join(output_dir,
                                       name_zip_tuple[0] + '.pickle')
            if os.path.exists(pickle_file):
                classifier = sharedlib.load_pickle(pickle_file)
            # The vectorizer pickle is optional; absence is only logged.
            vectorizer_pickle_file = os.path.join(
                output_dir, name_zip_tuple[0] + '.vectorizer.pickle')
            if os.path.exists(vectorizer_pickle_file):
                logging.info('Vectorizer pickle file: {}'.format(
                    os.path.basename(vectorizer_pickle_file)))
                logging.info('Loading the pickled vectorizer...')
                vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
            else:
                logging.info(
                    'No vectorizer (expected: {}) found for this model.'.format(
                        vectorizer_pickle_file))
        if classifier is not None:
            models.append((name_zip_tuple[0], classifier, vectorizer))
        else:
            logging.info(
                'Could not find pickled classifier in the package {} on the Remote Server'
                .format(name_zip_tuple[1]))
    logging.info('{} MODELS LOADED'.format(len(models)))
    return models
def upload_output_to_remote_server(also_uplaod_merged_input_files):
    """Upload verified labeling outputs and per-model accuracy files.

    also_uplaod_merged_input_files: when truthy, the potential/questionable
        record files are uploaded too. (The parameter-name typo is kept for
        caller compatibility.)
    Missing files are silently skipped; the user is prompted before upload.
    """
    import config
    import sharedlib
    logging.info(
        'Uploading output of the previous run(s) to the remote server...')
    output_files = config.output_files
    files_to_upload = [
        sharedlib.abspath(output_files['verified_positive_records_file']),
        sharedlib.abspath(output_files['verified_negative_records_file']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']),
    ]
    # Include every per-model accuracy file found in the output directory.
    output_dir = sharedlib.abspath(config.output_dir)
    accuracy_file_pattern = re.compile('.*_accuracy.json')
    accuracy_files = [
        sharedlib.abspath(os.path.join(output_dir, file_name))
        for file_name in os.listdir(output_dir)
        if accuracy_file_pattern.search(file_name) is not None
    ]
    files_to_upload += accuracy_files
    if also_uplaod_merged_input_files:
        files_to_upload += [
            sharedlib.abspath(output_files['potential_positive_records_file']),
            sharedlib.abspath(output_files['potential_negative_records_file']),
            sharedlib.abspath(
                output_files['questionable_positive_records_file']),
            sharedlib.abspath(
                output_files['questionable_negative_records_file'])
        ]
    # Only upload files that actually exist on disk.
    files_to_upload = [f for f in files_to_upload if os.path.exists(f)]
    sharedlib.upload_files_to_remote_server_with_prompt(
        files_to_upload, config.remote_server['labeling_verified_samples_dir'])
def save_labeling_accuracy(model_name, output_dir, record_id, classification,
                           is_correct):
    """Append one labeling-accuracy entry to the model's accuracy JSON file.

    The file '<model_name>_accuracy.json' in output_dir holds a JSON list of
    {timestamp, recordid, classification, correct} entries; it is created on
    first use.
    """
    accuracy_file_path = os.path.join(sharedlib.abspath(output_dir),
                                      model_name + '_accuracy.json')
    history = None
    if os.path.exists(accuracy_file_path):
        with open(accuracy_file_path, 'r') as f:
            history = json.load(f)
    if history is None:
        # No previous accuracy info for this model — start a fresh history.
        history = []
    history.append({
        'timestamp': datetime.datetime.now().isoformat(),
        'recordid': record_id,
        'classification': classification,
        'correct': is_correct,
    })
    with open(accuracy_file_path, 'w') as f:
        json.dump(history, f, indent=4)
def merge_file_sets(file_base_name, out_dir, positive_files, negative_files,
                    maybe_positive_files, maybe_negative_files,
                    process_log_file_paths):
    """Merge each labeled file set into a single per-category output file.

    Returns a 5-tuple of output paths:
    (potential positive, potential negative, questionable positive,
     questionable negative, process log).
    """
    output_dir = sharedlib.abspath(out_dir)
    stem = os.path.splitext(file_base_name)[0]
    # (source file list, output suffix, log label) per category — the merge
    # procedure is identical for all five.
    merge_specs = [
        (positive_files, '.potential_pos.txt', 'positive labeled'),
        (negative_files, '.potential_neg.txt', 'negative labeled'),
        (maybe_positive_files, '.questionable_pos.txt',
         'maybe positive labeled'),
        (maybe_negative_files, '.questionable_neg.txt',
         'maybe negative labeled'),
        (process_log_file_paths, '.process.txt', 'process log'),
    ]
    output_paths = []
    for sources, suffix, label in merge_specs:
        target = os.path.join(output_dir, stem + suffix)
        logging.info('Merging {} {} files into: {}...'.format(
            len(sources), label, target))
        merge_files(sources, target)
        output_paths.append(target)
    return tuple(output_paths)
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
    """Download labeled sample files and model accuracy files.

    Potential/questionable record sets may be skipped when already present
    locally (per 'skip_download_if_already_present'); verified sets and the
    already-processed-record-numbers file are always re-downloaded.
    """
    labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading cloud files from {}'.format(labeled_base_uri))
    optional_overwrite = not remote_server_files[
        'skip_download_if_already_present']
    # (remote blob key, local output-file key, overwrite flag) — the download
    # logic is identical for each, so drive it from data instead of seven
    # copied stanzas.
    downloads = [
        ('potential_positive_records_blob',
         'potential_positive_records_file', optional_overwrite),
        ('potential_negative_records_blob',
         'potential_negative_records_file', optional_overwrite),
        ('questionable_positive_records_blob',
         'questionable_positive_records_file', optional_overwrite),
        ('questionable_negative_records_blob',
         'questionable_negative_records_file', optional_overwrite),
        ('verified_positive_records_blob',
         'verified_positive_records_file', True),
        ('verified_negative_records_blob',
         'verified_negative_records_file', True),
        ('already_processed_record_numbers_blob',
         'already_processed_record_numbers_file', True),
    ]
    for blob_key, file_key, overwrite in downloads:
        sharedlib.download_file(
            sharedlib.join_remote_server_paths(
                labeled_base_uri, remote_server_files[blob_key]),
            sharedlib.abspath(output_files[file_key]),
            overwrite)

    logging.info('Downloading model labeling accuracy files...')
    accuracy_file_pattern = re.compile('.*_accuracy.json')
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['labeling_verified_samples_dir'])
    accuracy_files = [
        file_name for file_name in remote_files
        if accuracy_file_pattern.search(file_name) is not None
    ]
    # Accuracy files land next to the already-processed-record-numbers file.
    local_dir = os.path.dirname(
        output_files['already_processed_record_numbers_file'])
    for accuracy_file in accuracy_files:
        file_uri = sharedlib.join_remote_server_paths(labeled_base_uri,
                                                      accuracy_file)
        file_local_path = sharedlib.abspath(
            os.path.join(local_dir, accuracy_file))
        sharedlib.download_file(file_uri, file_local_path, True)
def autolabel(mode, input_files, autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, already_processed_record_numbers_file_path, input_file_total_lines_count_file_path, model):
    """Interactively auto-label records from the input files.

    Flow: (1) QC the existing master labeled set until it passes or the user
    skips; (2) repeatedly rebuild a model, auto-label random records into
    pending-QC files until the regeneration threshold is reached, QC the
    pending records, and (on confirmation) merge them into the master sets.

    mode: optional input-file basename to restrict reading to a single file.
    model: NOTE(review) — appears unused; models are rebuilt internally.
    Side effects: rewrites the master and pending-QC label files plus the
    already-processed-record-numbers and total-line-count bookkeeping files.
    """
    # NOTE(review): these two basenames are computed but never used below.
    autolabled_positive_records_file_basename = os.path.basename(autolabeled_positive_records_file_path).lower()
    autolabled_negative_records_file_basename = os.path.basename(autolabeled_negative_records_file_path).lower()
    # Per-set tables of semantic-duplicate counts keyed by record hash.
    autolabled_pos_duplicates_table = remove_semantically_duplicate_records(autolabeled_positive_records_file_path, config.duplicate_record_check_ignore_pattern, config.max_semantic_duplicate_records_allowed)
    autolabled_neg_duplicates_table = remove_semantically_duplicate_records(autolabeled_negative_records_file_path, config.duplicate_record_check_ignore_pattern, config.max_semantic_duplicate_records_allowed)
    sharedlib.remove_duplicate_records([autolabeled_positive_records_file_path, autolabeled_negative_records_file_path])
    # Perform QC of the master labeled set
    while True:  # QC until it passes 100% or user skips
        new_model = __modeling_helper.rebuild_models(autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, already_processed_record_numbers_file_path, input_file_total_lines_count_file_path)[0]
        (qc_score, user_aborted) = perform_manual_qc(new_model, autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, 50, 20)
        if qc_score != 1 and user_aborted == False:
            logging.info('QC of the sampling of the entire dataset found at least one correction needed (QC Score: {:.2f}). Performing another round of QC...'.format(qc_score))
            continue
        logging.info('QC (Score: {:.2f}) passed or user skipped. Do you want to continue auto-labeling? [Y]es to continue; [N]o to quit, [R] to re-QC...'.format(qc_score))
        decision = None
        while (decision != 'y' and decision != 'n' and decision != 'r'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                # get_char_input may return bytes on some platforms; normalize.
                decision = bytes.decode(decision)
            decision = decision.lower()
        logging.info('Selected: {}'.format(decision))
        if decision == 'n':
            return;
        elif decision == 'r':
            continue
        else:
            break;  # QC of master set complete, exit the loop and continue with auto-labeling.
    total_new_records_labeled_this_session = 0
    autolabeled_positive_records_pending_qc_file_path = sharedlib.abspath(os.path.join(config.output_dir, 'positive_records_pending_qc.txt'))
    autolabeled_negative_records_pending_qc_file_path = sharedlib.abspath(os.path.join(config.output_dir, 'negative_records_pending_qc.txt'))
    # Create the model for auto labeling
    while True:
        logging.info('Confirm to continue with auto-labeling. [Y]es, [N]o: ')
        decision = None
        while (decision != 'y' and decision != 'n'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                decision = bytes.decode(decision)
            decision = decision.lower()
        logging.info('Selected: {}'.format(decision))
        if decision == 'n':
            break;
        logging.info('[Re]building model to be used in classification...')
        new_model = __modeling_helper.rebuild_models(autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, already_processed_record_numbers_file_path, input_file_total_lines_count_file_path)[0]
        min_required_model_score = config.min_model_score_for_auto_labeling
        model_score = new_model[3]
        if model_score < min_required_model_score:
            # NOTE(review): format string has one placeholder but two args —
            # the minimum-required score is silently dropped from the message.
            logging.info('Model accuracy score is {}, which is less than the minimum required for auto labeling. Quitting...'.format(model_score, min_required_model_score))
            break
        total_new_records_labeled_using_current_models = 0
        previous_qc_score = .8  # For the very first QC of the session, we assume 20% failure. This then gets updated with every QC performed.
        input_file_basename_to_full_path_map = {}
        for input_file in input_files:
            input_file_basename_to_full_path_map[os.path.basename(input_file).lower()] = sharedlib.abspath(input_file)
        # Load (or initialize) the per-file records of already-read line numbers.
        already_read_records = get_already_read_records(already_processed_record_numbers_file_path)
        if already_read_records is None or len(already_read_records) == 0:
            logging.info('Already read records data not found. Creating new...')
            already_read_records = {}
        for input_file in input_files:
            input_file_basename = os.path.basename(input_file).lower()
            if input_file_basename not in already_read_records:
                already_read_records[os.path.basename(input_file).lower()] = {}
        # Load (or compute) the total line count of each input file.
        total_available_records = get_total_available_records(input_file_total_lines_count_file_path)
        if total_available_records is None or len(total_available_records) == 0:
            logging.info('Input file total available records data not found. Creating new...')
            total_available_records = {}
        for input_file in input_files:
            input_file_basename = os.path.basename(input_file).lower()
            if input_file_basename not in total_available_records:
                logging.info('Creating new...')
                total_available_records[os.path.basename(input_file).lower()] = get_total_lines_count(input_file_basename_to_full_path_map[input_file_basename])
        save_already_read_records(already_processed_record_numbers_file_path, already_read_records)
        save_total_available_records(input_file_total_lines_count_file_path, total_available_records)
        total_autolabeled_positive_records = get_total_lines_count(autolabeled_positive_records_file_path) if os.path.exists(autolabeled_positive_records_file_path) else 0
        total_autolabeled_negative_records = get_total_lines_count(autolabeled_negative_records_file_path) if os.path.exists(autolabeled_negative_records_file_path) else 0
        # Pending-QC files are truncated ('w') for each model generation.
        autolabeled_positive_records_pending_qc_file = open(autolabeled_positive_records_pending_qc_file_path, 'w', encoding='utf-8', errors='ignore')
        autolabeled_negative_records_pending_qc_file = open(autolabeled_negative_records_pending_qc_file_path, 'w', encoding='utf-8', errors='ignore')
        total_autolabeled_positive_records_pending_qc = 0
        total_autolabeled_negative_records_pending_qc = 0
        # Basenames that correspond to one of the requested input files.
        input_file_basenames = [key for key in total_available_records for input_file in input_files if key in input_file.lower()]
        # Auto-label until enough new records warrant regenerating the model.
        while total_new_records_labeled_using_current_models <= config.models_auto_regen_records_threshold:
            logging.info('-------------------------------------------------------------------')
            # NOTE(review): next() on a list raises TypeError — this likely
            # needs a generator, e.g. next((f for f in ...), None); confirm
            # whether the `mode` path is ever exercised.
            file_to_read_basename = None if mode is None else next([file for file in input_file_basenames if file == mode.lower()], None)
            if file_to_read_basename == None:
                file_to_read_basename = random.choice(input_file_basenames)
            file_to_read = None
            aleady_read_record_numbers = already_read_records[file_to_read_basename]
            record_number_to_read = get_unique_random_record_number(total_available_records[file_to_read_basename], aleady_read_record_numbers)
            file_to_read = input_file_basename_to_full_path_map[file_to_read_basename]
            minibatch_labeled_records_count = 0
            minibatch_attempted_records_count = 0
            file_to_read_handle = open(file_to_read, 'r', encoding='utf-8', errors='ignore');
            line_number = 0
            # Advance to the record being read. A 'mini batch' will begin at that location until <minibatch_size> samples are found.
            logging.info('Locating the record {} in this file...'.format(record_number_to_read))
            while line_number < record_number_to_read:
                next(file_to_read_handle)
                line_number += 1
            configured_minibatch_size = config.minibatch_size
            logging.info('Entering minibatch loop for file {}, starting Record# {}. Looking for {} labeled records in this file...'.format(file_to_read_basename, record_number_to_read, configured_minibatch_size))
            while minibatch_labeled_records_count < configured_minibatch_size:
                if minibatch_attempted_records_count != 0:
                    # This means this pass is not the first in the minibatch loop. Advance the record number.
                    record_number_to_read += 1
                if record_number_to_read >= total_available_records[file_to_read_basename]:
                    # End of the file reached. Exit the minibatch loop to determine the next file and/or entry point
                    break;
                if total_new_records_labeled_using_current_models > config.models_auto_regen_records_threshold:
                    # Model re-generation is due
                    break;
                logging.info('So far pending QC => POS: {}, NEG: {}. Model accuracy {:.2f}. File: {} Record#: {}. Auto-labeled since last model generation: {}. Still looking for {} labeled in this minibatch.'.format(total_autolabeled_positive_records_pending_qc, total_autolabeled_negative_records_pending_qc, new_model[3], file_to_read_basename, record_number_to_read, total_new_records_labeled_using_current_models, (config.minibatch_size - minibatch_labeled_records_count)))
                line = file_to_read_handle.readline()
                minibatch_attempted_records_count +=1
                # Hash of the record with ignorable parts stripped — used for semantic-duplicate limiting.
                record_hash = hashlib.sha1(re.sub(config.duplicate_record_check_ignore_pattern, '', line).upper().encode(errors='ignore')).hexdigest()
                line_id = line[:40]
                (model_name, result) = __classification_helper.classify(line, [new_model])[0] # returns tuple: (name, (predicted_classification, positive_proba))
                pos_prob = result[1]
                neg_prob = 1 - pos_prob
                if pos_prob >= config.min_probability_for_auto_labeling:
                    if (total_autolabeled_positive_records + total_autolabeled_positive_records_pending_qc) > (total_autolabeled_negative_records + total_autolabeled_negative_records_pending_qc):
                        logging.info('This is a positive record, but the search is for a negative record to maintain positive/negative parity. Skipping...') # We maintain positive/negative count parity as we go
                        continue
                    # Do not allow more than n duplicates to prevent bias
                    if record_hash not in autolabled_pos_duplicates_table:
                        autolabled_pos_duplicates_table[record_hash] = 0 # Initialize the hash table entry
                    if autolabled_pos_duplicates_table[record_hash] >= config.max_semantic_duplicate_records_allowed:
                        logging.info('This is a technically unique but semantically duplicate record. There are already {} copies in the positive set. Skipping...'.format(autolabled_pos_duplicates_table[record_hash]))
                        continue
                    autolabled_pos_duplicates_table[record_hash] += 1
                    logging.info(line)
                    logging.info('Auto-Selected: Positive')
                    autolabeled_positive_records_pending_qc_file.write(line)
                    total_autolabeled_positive_records_pending_qc += 1
                    minibatch_labeled_records_count += 1
                    total_new_records_labeled_using_current_models += 1
                    total_new_records_labeled_this_session += 1
                    # NOTE(review): membership is tested against already_read_records
                    # (keyed by file basename) but the entry is created in
                    # aleady_read_record_numbers (keyed by record number) —
                    # confirm this mismatch is intended.
                    if not record_number_to_read in already_read_records:
                        aleady_read_record_numbers[record_number_to_read] = []
                    aleady_read_record_numbers[record_number_to_read].append({line_id: positive_class_str})
                elif neg_prob >= config.min_probability_for_auto_labeling:
                    if (total_autolabeled_negative_records + total_autolabeled_negative_records_pending_qc) > (total_autolabeled_positive_records + total_autolabeled_positive_records_pending_qc) :
                        logging.info('This is a negative record, but the search is for a positive record to maintain positive/negative parity. Skipping...') # We maintain positive/negative count parity as we go
                        continue
                    # Do not allow more than n duplicates to prevent bias
                    if record_hash not in autolabled_neg_duplicates_table:
                        autolabled_neg_duplicates_table[record_hash] = 0 # Initialize the hash table entry
                    if autolabled_neg_duplicates_table[record_hash] >= config.max_semantic_duplicate_records_allowed:
                        logging.info('This is a technically unique but semantically duplicate record. There are already {} copies in the negative set. Skipping...'.format(autolabled_neg_duplicates_table[record_hash]))
                        continue
                    autolabled_neg_duplicates_table[record_hash] += 1
                    logging.info(line)
                    logging.info('Auto-selected: Negative')
                    autolabeled_negative_records_pending_qc_file.write(line)
                    minibatch_labeled_records_count += 1
                    total_autolabeled_negative_records_pending_qc += 1
                    total_new_records_labeled_using_current_models += 1
                    total_new_records_labeled_this_session += 1
                    if not record_number_to_read in already_read_records:
                        aleady_read_record_numbers[record_number_to_read] = []
                    aleady_read_record_numbers[record_number_to_read].append({line_id: negative_class_str})
                else:
                    logging.info('This record (POS: {:.2f}, NEG: {:.2f}) is not strong enough (min required: {:.2f}) to be in the labeled set. Skipping...'.format(pos_prob, neg_prob, config.min_probability_for_auto_labeling))
                    continue;
            save_already_read_records(already_processed_record_numbers_file_path, already_read_records)
            file_to_read_handle.close()
        autolabeled_positive_records_pending_qc_file.close()
        autolabeled_negative_records_pending_qc_file.close()
        logging.info('{} records auto-labeled since the last model. These new records must be QCed...'.format(total_new_records_labeled_using_current_models))
        # QC the pending records, then (on user confirmation) merge them into the master set.
        while True:
            total_autolabeled_positive_records_pending_qc = sharedlib.get_total_lines_count(autolabeled_positive_records_pending_qc_file_path)
            total_autolabeled_negative_records_pending_qc = sharedlib.get_total_lines_count(autolabeled_negative_records_pending_qc_file_path)
            total_pending_qc_records_count = total_autolabeled_positive_records_pending_qc + total_autolabeled_negative_records_pending_qc
            # Sample size scales with the failure rate observed in the previous QC round.
            sample_size = math.ceil(total_pending_qc_records_count * ((1-previous_qc_score) * config.inaccuracy_to_qc_sample_size_multiplier))
            if sample_size < 1 or sample_size > total_pending_qc_records_count:
                sample_size = total_pending_qc_records_count
            (qc_score, user_aborted) = perform_manual_qc(new_model, autolabeled_positive_records_pending_qc_file_path, autolabeled_negative_records_pending_qc_file_path, total_new_records_labeled_using_current_models, sample_size)
            previous_qc_score = qc_score
            if qc_score != 1 and user_aborted == False:
                logging.info('QC found at least one correction needed (QC Score: {:.2f}). Additional QC will be needed...'.format(qc_score))
                continue
            logging.info('Model re-built and QC (Score: {:.2f}) passed or user skipped. Do you want to merge pending QC records to the master set? [Y]es to continue; [N]o to quit; [R] to re-run QC...'.format(qc_score))
            decision = None
            while (decision != 'y' and decision != 'n' and decision != 'r'):
                decision = sharedlib.get_char_input()
                if not isinstance(decision, str):
                    decision = bytes.decode(decision)
                decision = decision.lower()
            logging.info('Selected: {}'.format(decision))
            if decision == 'n':
                break;
            elif decision == 'r':
                continue
            else:
                # User chose 'y'. Proceed with the merge.
                logging.info('Merging pending QC files to positive/negative master set...')
                # Merge via a temp file, then replace the master file in place.
                tmp_merged_positive_file_path = autolabeled_positive_records_file_path + '.tmp'
                sharedlib.merge_files([autolabeled_positive_records_file_path, autolabeled_positive_records_pending_qc_file_path], tmp_merged_positive_file_path)
                shutil.move(tmp_merged_positive_file_path, autolabeled_positive_records_file_path)
                tmp_merged_negative_file_path = autolabeled_negative_records_file_path + '.tmp'
                sharedlib.merge_files([autolabeled_negative_records_file_path, autolabeled_negative_records_pending_qc_file_path], tmp_merged_negative_file_path)
                shutil.move(tmp_merged_negative_file_path, autolabeled_negative_records_file_path)
                total_autolabeled_positive_records = get_total_lines_count(autolabeled_positive_records_file_path) if os.path.exists(autolabeled_positive_records_file_path) else 0
                total_autolabeled_negative_records = get_total_lines_count(autolabeled_negative_records_file_path) if os.path.exists(autolabeled_negative_records_file_path) else 0
                logging.info('Files merged. Total {} positive and {} negative records in autolabeled master set.'.format(total_autolabeled_positive_records, total_autolabeled_negative_records))
                break;
def extract_records(input_files, output_dir, max_potential_records_to_extract=None, max_questionable_records_to_extract=None):
    """Extract potential positive/negative and questionable records from the
    given input files, using one worker process per file chunk.

    Each input file is split into chunks; a worker scans each chunk against
    the configured qualifying/disqualifying regex criteria and writes
    per-chunk output files, which are then merged, the chunks deleted, and
    the merged outputs optionally zipped and uploaded.

    Args:
        input_files: Paths of the raw data files to scan.
        output_dir: Directory receiving per-chunk and merged output files.
        max_potential_records_to_extract: Optional overall cap on potential
            records; divided evenly across chunks.
        max_questionable_records_to_extract: Optional overall cap on
            questionable records; divided evenly across chunks.
    """
    logging.info(
        'Extracting potential positive and negative records from {} file(s)...'
        .format(len(input_files)))
    output_dir = sharedlib.abspath(output_dir)
    # Dump the criteria used for this run alongside the outputs, for audit.
    sharedlib.dump_list_to_file(
        config.known_positive_records_qualifying_terms,
        os.path.join(output_dir, 'positive_qualifying_criteria.txt'))
    sharedlib.dump_list_to_file(
        config.known_positive_records_disqualifying_terms,
        os.path.join(output_dir, 'positive_disqualifying_criteria.txt'))
    total_positive_count = 0
    total_negative_count = 0
    max_records_per_file = config.file_split_lines_per_file
    # Compile the regexes once; the compiled lists are handed (read-only) to
    # every worker process.
    known_positive_records_qualifying_terms_regex_list = build_compiled_regex_list(
        config.known_positive_records_qualifying_terms)
    known_positive_records_disqualifying_terms_regex_list = build_compiled_regex_list(
        config.known_positive_records_disqualifying_terms)
    potential_positive_records_qualifying_terms_regex_list = build_compiled_regex_list(
        config.potential_positive_records_qualifying_terms)
    for file_name in input_files:
        positive_records_output_files = []
        negative_records_output_files = []
        maybe_positive_records_output_file_paths = []
        maybe_negative_records_output_file_paths = []
        process_log_file_paths = []
        # Split each file for parallelization
        chunks = split_file(file_name, config.file_split_dir,
                            max_records_per_file)
        chunk_max = None
        if max_potential_records_to_extract is not None:
            chunk_max = math.ceil(
                round(max_potential_records_to_extract / len(chunks), 0))
            if chunk_max <= 0:
                chunk_max = None
        chunk_questionable_records_max = None
        if max_questionable_records_to_extract is not None:
            chunk_questionable_records_max = math.ceil(
                round(max_questionable_records_to_extract / len(chunks), 0))
            if chunk_questionable_records_max <= 0:
                chunk_questionable_records_max = None
        processes = []
        # BUG FIX: the workers append their per-chunk (positive, negative)
        # counts to this list. A plain Python list is copied into each child
        # process, so appends made there never reached the parent and the
        # totals stayed at zero. A Manager-backed proxy list shares the
        # appends across processes.
        with multiprocessing.Manager() as manager:
            process_return_values = manager.list()
            is_first_chunk = True
            for chunk in chunks:
                logging.info(
                    'Extracting up to {} potential positive and negative records from {}...'
                    .format(chunk_max, chunk))
                chunk_name_without_ext = os.path.splitext(
                    os.path.basename(chunk))[0]
                positive_records_output_file_path = os.path.join(
                    output_dir, chunk_name_without_ext + '.potential_pos.txt')
                negative_records_output_file_path = os.path.join(
                    output_dir, chunk_name_without_ext + '.potential_neg.txt')
                maybe_positive_records_output_file_path = os.path.join(
                    output_dir,
                    chunk_name_without_ext + '.questionable_pos.txt')
                maybe_negative_records_output_file_path = os.path.join(
                    output_dir,
                    chunk_name_without_ext + '.questionable_neg.txt')
                process_log_file_path = os.path.join(
                    output_dir, chunk_name_without_ext + '.process.txt')
                positive_records_output_files.append(
                    positive_records_output_file_path)
                negative_records_output_files.append(
                    negative_records_output_file_path)
                maybe_positive_records_output_file_paths.append(
                    maybe_positive_records_output_file_path)
                maybe_negative_records_output_file_paths.append(
                    maybe_negative_records_output_file_path)
                process_log_file_paths.append(process_log_file_path)
                args = (chunk, positive_records_output_file_path,
                        negative_records_output_file_path,
                        maybe_positive_records_output_file_path,
                        maybe_negative_records_output_file_path,
                        process_log_file_path, is_positive, is_negative,
                        known_positive_records_qualifying_terms_regex_list,
                        known_positive_records_disqualifying_terms_regex_list,
                        potential_positive_records_qualifying_terms_regex_list,
                        process_return_values, chunk_max,
                        chunk_questionable_records_max, is_first_chunk)
                process = multiprocessing.Process(
                    name=chunk_name_without_ext,
                    target=extract_matching_records_from_file,
                    args=args)
                processes.append(process)
                is_first_chunk = False
            for process in processes:
                logging.info('Starting process: {}...'.format(process.name))
                process.start()
            for process in processes:
                process.join()
            # Snapshot the proxy list before the manager shuts down.
            per_chunk_counts = list(process_return_values)
        for (positive_count, negative_count) in per_chunk_counts:
            total_positive_count += positive_count
            total_negative_count += negative_count
        # Merge output files
        (positive_records_output_file_path,
         negative_records_output_file_path,
         maybe_positive_records_output_file_path,
         maybe_negative_records_output_file_path,
         process_log_file_path) = merge_file_sets(
             os.path.basename(file_name), output_dir,
             positive_records_output_files, negative_records_output_files,
             maybe_positive_records_output_file_paths,
             maybe_negative_records_output_file_paths, process_log_file_paths)
        logging.info('Deleting temprary chunked files..')
        for chunk in chunks:
            logging.info('Deleting {}...'.format(chunk))
            os.remove(chunk)
        # Upload merged files
        if config.upload_output_to_remote_server == True:
            list_of_files_to_upload = [
                positive_records_output_file_path,
                negative_records_output_file_path,
                maybe_positive_records_output_file_path,
                maybe_negative_records_output_file_path,
                process_log_file_path
            ]
            archive_path = os.path.join(
                os.path.dirname(positive_records_output_file_path),
                os.path.splitext(os.path.basename(file_name))[0] +
                '.labeling_candidates.zip')
            sharedlib.zip_files(list_of_files_to_upload, archive_path)
            sharedlib.upload_files_to_labeling_candidates_dir([archive_path])
def generate_models(positive_records_files, negative_records_files,
                    models_config, duplicate_record_check_ignore_pattern,
                    output_dir, upload_generated_models_to_remote_server):
    """Train, pickle and (optionally) upload one classifier per models_config entry.

    Merges the labeled positive/negative input files into combined training
    sets — an 'all records' set always, plus a de-duplicated set only when at
    least one model requests it — shuffles them, trains each configured model,
    pickles the classifier (and its vectorizer when present) and returns the
    generated model metadata.

    Args:
        positive_records_files: Paths of positively labeled record files.
        negative_records_files: Paths of negatively labeled record files.
        models_config: List of model configuration dicts. Each must contain
            'name' and 'ignore_duplicate_training_records'; 'archive_name' is
            read when uploads are enabled.
        duplicate_record_check_ignore_pattern: Pattern handed to
            sharedlib.merge_files when building the de-duplicated sets.
        output_dir: Directory receiving the merged sets and pickled models.
        upload_generated_models_to_remote_server: When True, zip each model's
            training files and pickles and upload the archive.

    Returns:
        List of (model_name, classifier_pickle_file, vectorizer_pickle_file,
        score) tuples; vectorizer_pickle_file is None for models that have no
        vectorizer (the nltk.naive_bayes path).

    Raises:
        ValueError: If a model name is neither an 'nltk.naive_bayes' nor an
            'sklearn' model.
    """
    output_dir = sharedlib.abspath(output_dir)
    start_time = datetime.datetime.now()
    log('modeler::generate_models() starting at {}'.format(start_time))
    process_log_first_line = 'MAUDE Modeling Process Log. Computer: {}. OS: {} {} Date/Time: {}. Python Version: {}\n'.format(
        platform.node(), platform.system(), platform.release(), start_time,
        sys.version)
    log(process_log_first_line)
    log('Merging all positive/negative labeled files. Two sets (all and one without duplicate records) will be produced...'
        )
    all_pos_records_file_path = os.path.join(output_dir,
                                             'positive_records_all.txt')
    all_neg_records_file_path = os.path.join(output_dir,
                                             'negative_records_all.txt')
    # Build the 'all records' training sets (no duplicate filtering).
    sharedlib.merge_files(
        [sharedlib.abspath(p) for p in negative_records_files],
        all_neg_records_file_path, False, None)
    sharedlib.merge_files(
        [sharedlib.abspath(p) for p in positive_records_files],
        all_pos_records_file_path, False, None)
    # Shuffle the merged sets before training.
    sharedlib.randomize_records(all_pos_records_file_path)
    sharedlib.randomize_records(all_neg_records_file_path)
    log('Combined (merged and randomized) positive labeled (all) file: {}'.
        format(all_pos_records_file_path))
    log('Combined (merged and randomized) negative labeled (all) file: {}'.
        format(all_neg_records_file_path))
    # Build the de-duplicated sets only when some model will train on them.
    at_least_one_no_dup_model = any(
        model_config['ignore_duplicate_training_records'] == True
        for model_config in models_config)
    if at_least_one_no_dup_model:
        nodups_pos_records_file_path = os.path.join(
            output_dir, 'positive_records_nodups.txt')
        nodups_neg_records_file_path = os.path.join(
            output_dir, 'negative_records_nodups.txt')
        sharedlib.merge_files(
            [sharedlib.abspath(p) for p in negative_records_files],
            nodups_neg_records_file_path, True,
            duplicate_record_check_ignore_pattern)
        sharedlib.merge_files(
            [sharedlib.abspath(p) for p in positive_records_files],
            nodups_pos_records_file_path, True,
            duplicate_record_check_ignore_pattern)
        sharedlib.randomize_records(nodups_pos_records_file_path)
        sharedlib.randomize_records(nodups_neg_records_file_path)
        log('Combined (merged and randomized) positive labeled (no-duplicates) file: {}'
            .format(nodups_pos_records_file_path))
        log('Combined (merged and randomized) negative labeled (no-duplicates) file: {}'
            .format(nodups_neg_records_file_path))
    generated_models = []
    for model_config in models_config:
        model_start_time = datetime.datetime.now()
        model_name = model_config['name']
        log('Starting model generation for: {} at {}...'.format(
            model_name, model_start_time))
        # Pick the training set this model was configured for.
        pos_labeled_file_path = None
        neg_labeled_file_path = None
        if 'nltk.naive_bayes' in model_name or 'sklearn' in model_name:
            if model_config['ignore_duplicate_training_records'] == True:
                pos_labeled_file_path = nodups_pos_records_file_path
                neg_labeled_file_path = nodups_neg_records_file_path
            else:
                pos_labeled_file_path = all_pos_records_file_path
                neg_labeled_file_path = all_neg_records_file_path
        else:
            raise ValueError('Unsupported model: {}'.format(model_name))
        classifier = None
        vectorizer = None
        score = None
        # nltk models return (classifier, score); sklearn models also return
        # the fitted vectorizer.
        if 'nltk.naive_bayes' in model_name:
            classifier, score = _nltk_naive_bayes.generate_model(
                pos_labeled_file_path, neg_labeled_file_path, model_config,
                output_dir)
        else:
            classifier, vectorizer, score = _sklearn.generate_model(
                pos_labeled_file_path, neg_labeled_file_path, model_config,
                output_dir)
        classifier_pickle_file = sharedlib.abspath(
            os.path.join(output_dir, model_name + '.pickle'))
        logging.info('Pickling the model as: {}...'.format(
            os.path.basename(classifier_pickle_file)))
        sharedlib.pickle_object(classifier, classifier_pickle_file)
        vectorizer_pickle_file = None
        if vectorizer is not None:
            vectorizer_pickle_file = sharedlib.abspath(
                os.path.join(output_dir, model_name + '.vectorizer.pickle'))
            logging.info('Pickling the Vectorizer as: {}...'.format(
                os.path.basename(vectorizer_pickle_file)))
            sharedlib.pickle_object(vectorizer, vectorizer_pickle_file)
        logging.info('Model pickled.')
        generated_models.append((model_name, classifier_pickle_file,
                                 vectorizer_pickle_file, score))
        if upload_generated_models_to_remote_server == True:
            model_archive_name = model_config['archive_name']
            files_to_zip = [
                pos_labeled_file_path, neg_labeled_file_path,
                classifier_pickle_file
            ]
            if vectorizer is not None:
                files_to_zip.append(vectorizer_pickle_file)
            zipped_file = sharedlib.zip_files(
                files_to_zip,
                sharedlib.abspath(os.path.join(output_dir,
                                               model_archive_name)))
            log('Uploading the pickled model ({}) to the Remote Server...'.
                format(model_archive_name))
            sharedlib.upload_files_to_trained_models_dir([zipped_file])
        model_end_time = datetime.datetime.now()
        log('Completed creating model for: {} at {}. Duration: {}...'.format(
            model_name, model_end_time, model_end_time - model_start_time))
    # NOTE(review): end_time is computed but never logged or returned.
    end_time = datetime.datetime.now()
    return generated_models
def _copy_nonblank_records(source_path, source_display_name, destination_file,
                           destination_display_path):
    """Append every non-blank line of source_path to the already-open
    destination_file, logging the merge with the original message format.

    BUG FIX: the original opened each source file with a bare open() and
    never closed it, leaking four file handles per input file set; the
    context manager guarantees closure.
    """
    logging.info('Merging {} into {}...'.format(source_display_name,
                                                destination_display_path))
    with open(source_path, encoding='utf-8', errors='ignore') as fin:
        for record in fin:
            if len(record.strip()) == 0:
                continue
            destination_file.write(record)


def build_potential_file_sets(input_files, potential_positive_records_file_merged, potential_negative_records_file_merged, questionable_positive_records_file_merged, questionable_negative_records_file_merged):
    """Consolidate per-source labeling-candidate files into four merged files.

    For each input data file set, downloads and extracts its labeling
    candidate archive when required (always_download, or a potential file is
    missing locally), then appends its non-blank potential/questionable
    positive/negative records to the corresponding merged output file.

    Args:
        input_files: List of input-data-file-set config dicts (keys used:
            'name', 'always_download', 'labeling_candidates_archive_name',
            and the four *_records_file names).
        potential_positive_records_file_merged: Merged output path.
        potential_negative_records_file_merged: Merged output path.
        questionable_positive_records_file_merged: Merged output path.
        questionable_negative_records_file_merged: Merged output path.
    """
    logging.info('Building potential positive and negative files...')
    input_dir = sharedlib.abspath(config.input_dir)
    with open(potential_positive_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_pos, \
         open(potential_negative_records_file_merged, 'w', encoding='utf-8',
              errors='ignore') as consolidated_neg, \
         open(questionable_positive_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_questionable_pos, \
         open(questionable_negative_records_file_merged, 'w',
              encoding='utf-8', errors='ignore') as consolidated_questionable_neg:
        for input_data_file_set in input_files:
            potential_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_positive_records_file'])
            potential_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['potential_negative_records_file'])
            questionable_positive_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_positive_records_file'])
            questionable_negative_records_file = os.path.join(
                input_dir,
                input_data_file_set['questionable_negative_records_file'])
            # Download (and unzip) the candidate archive when forced by the
            # config, or when either potential file is missing locally.
            if (input_data_file_set['always_download'] == True
                    or os.path.exists(potential_positive_records_file) == False
                    or os.path.exists(potential_negative_records_file) == False):
                logging.info(
                    'Labeling candidate archive for {} needs to be downloaded.'
                    .format(input_data_file_set['name']))
                labeling_candidates_file_url = sharedlib.join_remote_server_paths(
                    config.remote_server['base_uri'],
                    config.remote_server['labeling_candidates_dir'],
                    input_data_file_set['labeling_candidates_archive_name'])
                download_zip_file_path = os.path.join(
                    input_dir, input_data_file_set['name'] + '.zip')
                sharedlib.download_file(labeling_candidates_file_url,
                                        download_zip_file_path)
                logging.info('Extracting auto-labeled archive...')
                sharedlib.unzip(download_zip_file_path, input_dir)
                logging.info('Labeling candidate files extracted.')
            # Append this set's records to each of the four merged outputs.
            _copy_nonblank_records(
                potential_positive_records_file,
                input_data_file_set['potential_positive_records_file'],
                consolidated_pos, potential_positive_records_file_merged)
            _copy_nonblank_records(
                potential_negative_records_file,
                input_data_file_set['potential_negative_records_file'],
                consolidated_neg, potential_negative_records_file_merged)
            _copy_nonblank_records(
                questionable_positive_records_file,
                input_data_file_set['questionable_positive_records_file'],
                consolidated_questionable_pos,
                questionable_positive_records_file_merged)
            _copy_nonblank_records(
                questionable_negative_records_file,
                input_data_file_set['questionable_negative_records_file'],
                consolidated_questionable_neg,
                questionable_negative_records_file_merged)
def label_records(mode):
    """Run one labeling session end to end.

    Resolves the configured output file paths, resumes a work-in-progress
    set from the remote server when a complete one exists (otherwise builds
    fresh candidate files from the input data sets), downloads the current
    models, runs the interactive labeling loop, de-duplicates the verified
    output files, and finally offers to upload the results.
    """
    data_file_sets = config.input_data_file_sets
    logging.info(
        'Labeling known positive and negative records from {} file(s)...'.
        format(len(data_file_sets)))

    def _resolved(key):
        # Map a configured output-file key to its absolute path.
        return sharedlib.abspath(config.output_files[key])

    potential_positive_records_file = _resolved('potential_positive_records_file')
    questionable_positive_records_file = _resolved('questionable_positive_records_file')
    potential_negative_records_file = _resolved('potential_negative_records_file')
    questionable_negative_records_file = _resolved('questionable_negative_records_file')
    positive_records_output_file = _resolved('verified_positive_records_file')
    negative_records_output_file = _resolved('verified_negative_records_file')
    already_processed_record_numbers_file = _resolved('already_processed_record_numbers_file')

    existing_work_in_progress = __remote_server_helper.all_work_in_progress_files_present_on_remote_server(
        config.remote_server, config.remote_server_files)
    if existing_work_in_progress:
        # A complete in-progress set exists on the server; resume from it.
        __remote_server_helper.download_remote_server_files(
            config.remote_server, config.remote_server_files,
            config.output_files)
    else:
        # No cloud files or incomplete set. Create new using data files.
        build_potential_file_sets(data_file_sets,
                                  potential_positive_records_file,
                                  potential_negative_records_file,
                                  questionable_positive_records_file,
                                  questionable_negative_records_file)

    models = __remote_server_helper.download_models_from_remote_server(
        config.remote_server, config.models, config.models_output_dir)
    label(mode, potential_positive_records_file,
          potential_negative_records_file,
          questionable_positive_records_file,
          questionable_negative_records_file, positive_records_output_file,
          negative_records_output_file, already_processed_record_numbers_file,
          models)
    sharedlib.remove_duplicate_records(
        [positive_records_output_file, negative_records_output_file])

    if config.upload_output_to_remote_server != True:
        return
    logging.info('Upload output{}? [y/n] '.format(
        '' if existing_work_in_progress else ' (POTENTIALLY OVERWRITE CLOUD)'))
    answer = sharedlib.get_char_input()
    if not isinstance(answer, str):
        answer = bytes.decode(answer)
    if answer != 'y':
        return
    files_to_upload = [
        positive_records_output_file, negative_records_output_file,
        already_processed_record_numbers_file
    ]
    # Include every per-model accuracy file found in the output directory.
    accuracy_pattern = re.compile('.*_accuracy.json')
    accuracy_files = [
        sharedlib.abspath(os.path.join(config.output_dir, entry))
        for entry in os.listdir(config.output_dir)
        if accuracy_pattern.search(entry) is not None
    ]
    files_to_upload.extend(accuracy_files)
    if not existing_work_in_progress:
        # Fresh session: the candidate files themselves must go up too.
        files_to_upload.extend([
            potential_positive_records_file, potential_negative_records_file,
            questionable_positive_records_file,
            questionable_negative_records_file
        ])
    sharedlib.upload_files_to_remote_server(
        files_to_upload, config.remote_server['labeling_verified_samples_dir'])
def classify_file(input_data_file, models, positive_signal_regexes_for_false_negative_check, skip_first_record=True, max_records=None):
    """Classify every record of input_data_file with each loaded model.

    Writes, per model and overall: predicted-positive, predicted-negative and
    possible-false-negative record files, plus a pipe-delimited prediction
    summary; then optionally zips and uploads the outputs.

    Args:
        input_data_file: Path of the records file to classify (one record per
            line; first line skipped as a header when skip_first_record).
        models: Iterable of (name, classifier, vectorizer) tuples.
        positive_signal_regexes_for_false_negative_check: Patterns whose
            presence in a negatively-classified record flags it as a possible
            false negative.
        skip_first_record: Skip the first line when True.
        max_records: Optional cap on the number of data records processed.
    """
    start_time = datetime.datetime.now()
    log('classifier::classify_file() starting at {}'.format(start_time))
    file_base_name = os.path.basename(input_data_file)
    out_dir = sharedlib.abspath(config.output_dir)
    predicted_pos_file_ext = '.predicted.pos.txt'
    predicted_neg_file_ext = '.predicted.neg.txt'
    possible_false_neg_file_ext = '.possible.false.neg.txt'
    prediction_summary_file_ext = '.prediction.summary.txt'
    overall_predicted_pos_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, predicted_pos_file_ext))
    overall_predicted_neg_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, predicted_neg_file_ext))
    overall_possible_false_neg_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, possible_false_neg_file_ext))
    prediction_summary_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, prediction_summary_file_ext))
    log('Predicted positive records file (overall): {}'.format(overall_predicted_pos_records_file_path))
    log('Predicted negative records file (overall): {}'.format(overall_predicted_neg_records_file_path))
    log('Prediction summary file: {}'.format(prediction_summary_file_path))
    prediction_summary_file = open(prediction_summary_file_path, 'w', encoding='utf-8', errors='ignore')
    # NOTE(review): this header declares 12 columns, but each summary row
    # written below carries 7 fields — confirm what downstream consumers of
    # the summary file expect before changing either side.
    prediction_summary_file.write('MDR_REPORT_KEY|MDR_TEXT_KEY|TEXT_TYPE_CODE|PATIENT_SEQUENCE_NUMBER|DATE_REPORT|FOI_TEXT|MODEL_NAME|POS_PROB|NEG_PROB|CLASSIFICATION|HAS_POS_SIGNALS|POS_SIGNAL\n')
    classifiers_info = []
    for (name, classifier, vectorizer) in models:
        log('Building classifier parameters for {}...'.format(name))
        predicted_positive_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, predicted_pos_file_ext))
        predicted_negative_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, predicted_neg_file_ext))
        possible_false_negative_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, possible_false_neg_file_ext))
        log('Predicted positive records file for this classifier: {}'.format(predicted_positive_records_file_path))
        log('Predicted negative records file for this classifier: {}'.format(predicted_negative_records_file_path))
        # Tuple layout: (name, classifier, vectorizer,
        #                pos path, pos handle, neg path, neg handle,
        #                possible-false-neg path, possible-false-neg handle)
        classifiers_info.append((name, classifier, vectorizer,
                                 predicted_positive_records_file_path,
                                 open(predicted_positive_records_file_path, 'w', encoding='utf-8', errors='ignore'),
                                 predicted_negative_records_file_path,
                                 open(predicted_negative_records_file_path, 'w', encoding='utf-8', errors='ignore'),
                                 possible_false_negative_records_file_path,
                                 open(possible_false_negative_records_file_path, 'w', encoding='utf-8', errors='ignore')))
    file_to_classify = sharedlib.abspath(input_data_file)
    log('Total {} models loaded.'.format(len(classifiers_info)))
    log('File to classify: {}. Reading one record at a time...'.format(file_to_classify))
    total_records = 0
    total_data_records = 0
    total_positive = 0
    total_negative = 0
    positive_percent = 0
    negative_percent = 0
    total_possible_false_negative = 0
    possible_false_negative_percent_of_negatives = 0
    possible_false_negative_percent_of_all_records = 0
    overall_predicted_pos_records_file = open(overall_predicted_pos_records_file_path, 'w', encoding='utf-8', errors='ignore')
    overall_predicted_neg_records_file = open(overall_predicted_neg_records_file_path, 'w', encoding='utf-8', errors='ignore')
    overall_possible_false_neg_records_file = open(overall_possible_false_neg_records_file_path, 'w', encoding='utf-8', errors='ignore')
    fin = codecs.open(file_to_classify, mode='r+', encoding='utf-8', errors='ignore')
    for record in fin:
        total_records += 1
        # Single-line progress indicator (the trailing \r rewrites in place).
        sys.stdout.write('{} => POS: {}/{:.2f}% NEG: {}/{:.2f}% Possible FALSE NEG: {}/{:.2f}% of all NEG; {:.2f}% of all records. Next: {}...\r'.format(file_base_name, total_positive, positive_percent, total_negative, negative_percent, total_possible_false_negative, possible_false_negative_percent_of_negatives, possible_false_negative_percent_of_all_records, total_data_records))
        sys.stdout.flush()
        if total_records == 1 and skip_first_record == True:
            continue
        total_data_records += 1
        if max_records is not None and total_data_records > max_records:
            break
        # Compute if this record could be a possible false negative (i.e. has positive signals, but classified as negative)
        # Since this is global evaluation (irrespective of models), we will perform this check once, and here.
        # The outcome will be used later to determine if it is a possible false negative.
        has_positive_signals = False
        first_positive_signal_found = None
        for pattern in positive_signal_regexes_for_false_negative_check:
            match = re.search(pattern, record)
            if match is not None:
                has_positive_signals = True
                first_positive_signal_found = match.group()
                break
        classifications = []
        for (name, classifier, vectorizer, pos_file_path, pos_file, neg_file_path, neg_file, possible_false_neg_file_path, possible_false_neg_file) in classifiers_info:
            predicted_classification, positive_probability = classify(record, name, classifier, vectorizer, config.min_required_record_length, config.positive_probability_threshold)
            is_positive = predicted_classification == 'pos' and round(positive_probability, 2) >= config.positive_probability_threshold
            if config.verbose == True:
                log('Classification by {} is {}'.format(name, predicted_classification))
                # BUG FIX: the original referenced the undefined name
                # 'probabilities' here, raising NameError whenever verbose
                # logging was enabled. The negative probability is the
                # complement of the positive one (same computation the
                # summary row below uses for NEG_PROB).
                log('Probabilities: pos: {}, neg: {}'.format(positive_probability, 1 - positive_probability))
            if is_positive:
                classifications.append(('pos', positive_probability))
                pos_file.write(record.rstrip(os.linesep) + '\n')
            else:
                # For consistency, we store all probability in terms of positive
                classifications.append(('neg', positive_probability))
                neg_file.write(record.rstrip(os.linesep) + '\n')
                if has_positive_signals:
                    # Potential false negative.
                    possible_false_neg_file.write(record.rstrip(os.linesep) + '\n')
            prediction_summary_file.write('{}|{}|{:.2f}|{:.2f}|{}|{}|{}\n'.format(record[:40].strip(), name, positive_probability, 1-positive_probability, 'pos' if is_positive == True else 'neg', has_positive_signals, first_positive_signal_found))
            if total_data_records % 10000 == 0:
                # Periodic flush so progress survives a crash mid-run.
                pos_file.flush()
                neg_file.flush()
                possible_false_neg_file.flush()
                prediction_summary_file.flush()
                show_model_classification_stats(file_base_name, name, pos_file_path, neg_file_path, possible_false_neg_file_path)
        overall_classification, overall_positive_probability = get_overall_classification(classifications)
        if overall_classification is None:
            continue
        if overall_classification == 'pos':
            total_positive += 1
        else:
            total_negative += 1
            if has_positive_signals:
                total_possible_false_negative += 1
        positive_percent = (total_positive / total_data_records) * 100
        negative_percent = (total_negative / total_data_records) * 100
        possible_false_negative_percent_of_all_records = (total_possible_false_negative / total_data_records) * 100
        possible_false_negative_percent_of_negatives = 0 if total_negative == 0 else (total_possible_false_negative / total_negative) * 100
        # Overall files/summary are only meaningful when more than one model voted.
        if len(classifications) > 1:
            if overall_classification == 'pos':
                overall_predicted_pos_records_file.write(record.rstrip(os.linesep) + '\n')
            else:
                overall_predicted_neg_records_file.write(record.rstrip(os.linesep) + '\n')
                if has_positive_signals:
                    # Potential false negative.
                    overall_possible_false_neg_records_file.write(record.rstrip(os.linesep) + '\n')
            prediction_summary_file.write('{}|{}|{:.2f}|{:.2f}|{}|{}|{}\n'.format(record[:40].strip(), 'overall', overall_positive_probability, 1-overall_positive_probability, overall_classification, has_positive_signals, first_positive_signal_found))
    log('{}=> Overall {} POS records in total {} ({:.2f}%) with a probability of {} or higher. *Possible* false negatives {}/{:.2f}% of all NEG; {:.2f}% of all records.'.format(file_base_name, total_positive, total_data_records -1, positive_percent, config.positive_probability_threshold, total_possible_false_negative, possible_false_negative_percent_of_negatives, possible_false_negative_percent_of_all_records))
    fin.close()
    log('Closing output files...')
    overall_predicted_pos_records_file.close()
    overall_predicted_neg_records_file.close()
    overall_possible_false_neg_records_file.close()
    prediction_summary_file.close()
    files_to_zip = []
    for (name, classifier, vectorizer, pos_file_path, pos_file_handle, neg_file_path, neg_file_handle, possible_false_neg_file_path, possible_false_neg_file_handle) in classifiers_info:
        files_to_zip.append(pos_file_path)
        if config.upload_positive_files_only == False:
            files_to_zip.append(neg_file_path)
        if config.verbose == True:
            log('Closing {}...'.format(pos_file_path))
        pos_file_handle.close()
        if config.verbose == True:
            log('Closing {}...'.format(neg_file_path))
        neg_file_handle.close()
        if config.verbose == True:
            # BUG FIX: the original logged the file handle object here
            # instead of its path.
            log('Closing {}...'.format(possible_false_neg_file_path))
        possible_false_neg_file_handle.close()
        show_model_classification_stats(file_base_name, name, pos_file_path, neg_file_path, possible_false_neg_file_path)
    files_to_zip.append(prediction_summary_file_path)
    files_to_zip.append(overall_predicted_pos_records_file_path)
    if config.upload_positive_files_only == False:
        files_to_zip.append(overall_predicted_neg_records_file_path)
    if config.upload_output_to_remote_server == True:
        archive_name = os.path.splitext(file_base_name)[0]+'.zip'
        zip_file = sharedlib.zip_files(files_to_zip, os.path.join(out_dir, archive_name))
        log('Uploading the output files ({}) to the Remote Server...'.format(archive_name))
        sharedlib.upload_files_to_classified_dir([zip_file])
    end_time = datetime.datetime.now()
    log('classifier::classify_file() completed at {}. Total duration: {}.'.format(end_time, end_time - start_time))
def label(mode, potential_positive_records_file,
          potential_negative_records_file,
          questionable_positive_records_file,
          questionable_negative_records_file, positive_records_output_file,
          negative_records_output_file, already_processed_record_numbers_file,
          models):
    """Run an interactive labeling session over the candidate record files.

    Loop: pick a random not-yet-read record from one of the four input files,
    show per-model suggestions (with their historical accuracy), ask the
    operator for a single-character verdict — [P]ositive, [N]egative,
    [U]nknown, [R]ebuild models, [Q]uit — append the record to the verified
    positive/negative output files accordingly, record each model's
    suggestion accuracy, and persist the already-read state after every
    decision. When ``config.auto_regen_models`` is on, models are rebuilt
    once ``config.models_auto_regen_records_threshold`` records have been
    labeled with the current models (and on the very first iteration).

    Args:
        mode: 'pos', 'neg', 'pos?' or 'neg?' to always draw from one input
            file; None to pick an input file at random each iteration.
        potential_positive_records_file, potential_negative_records_file,
        questionable_positive_records_file, questionable_negative_records_file:
            Paths to the candidate input files (keyed internally by lowercase
            basename).
        positive_records_output_file, negative_records_output_file: Existing
            labeled output files; used only to seed the semantic-duplicate
            table via remove_semantically_duplicate_lines().
        already_processed_record_numbers_file: State file mapping input-file
            basename -> {record number: [...]}; read at start, rewritten via
            save_already_read_records() after each decision.
        models: list of (name, classifier, vectorizer, ...) tuples used for
            suggestions; replaced by rebuilt models during the session.

    Side effects: appends to the verified record files named in
    ``config.output_files``, writes per-model accuracy files, and rebuilds
    models on disk. Returns (None) when the operator presses 'q'.
    """
    # Input files are addressed by lowercase basename everywhere below.
    potential_positive_records_file_basename = os.path.basename(
        potential_positive_records_file).lower()
    potential_negative_records_file_basename = os.path.basename(
        potential_negative_records_file).lower()
    questionable_positive_records_file_basename = os.path.basename(
        questionable_positive_records_file).lower()
    questionable_negative_records_file_basename = os.path.basename(
        questionable_negative_records_file).lower()

    # basename -> full path, so a randomly chosen basename can be re-opened.
    input_file_basename_to_full_path_map = {}
    input_file_basename_to_full_path_map[
        potential_positive_records_file_basename] = potential_positive_records_file
    input_file_basename_to_full_path_map[
        potential_negative_records_file_basename] = potential_negative_records_file
    input_file_basename_to_full_path_map[
        questionable_positive_records_file_basename] = questionable_positive_records_file
    input_file_basename_to_full_path_map[
        questionable_negative_records_file_basename] = questionable_negative_records_file

    # Previously labeled record numbers, keyed by input-file basename.
    already_read_records = get_already_read_records(
        already_processed_record_numbers_file)
    if already_read_records is None or len(already_read_records) == 0:
        already_read_records = {}

    # Line counts per input file, used to bound the random record pick.
    total_available_records = {}
    total_available_records[
        potential_positive_records_file_basename] = get_total_lines_count(
            potential_positive_records_file)
    total_available_records[
        potential_negative_records_file_basename] = get_total_lines_count(
            potential_negative_records_file)
    total_available_records[
        questionable_positive_records_file_basename] = get_total_lines_count(
            questionable_positive_records_file)
    total_available_records[
        questionable_negative_records_file_basename] = get_total_lines_count(
            questionable_negative_records_file)

    # Ensure every input file has an entry in the already-read state.
    if not potential_positive_records_file_basename in already_read_records:
        already_read_records[potential_positive_records_file_basename] = {}
    if not potential_negative_records_file_basename in already_read_records:
        already_read_records[potential_negative_records_file_basename] = {}
    if not questionable_positive_records_file_basename in already_read_records:
        already_read_records[questionable_positive_records_file_basename] = {}
    if not questionable_negative_records_file_basename in already_read_records:
        already_read_records[questionable_negative_records_file_basename] = {}

    verified_positive_records_file_path = sharedlib.abspath(
        config.output_files['verified_positive_records_file'])
    verified_negative_records_file_path = sharedlib.abspath(
        config.output_files['verified_negative_records_file'])

    # Seed the duplicate table from the already-labeled output, and drop any
    # lines beyond the allowed duplicate count.
    semantic_duplicates_table = remove_semantically_duplicate_lines(
        [positive_records_output_file, negative_records_output_file],
        config.duplicate_record_check_ignore_pattern,
        config.max_semantic_duplicate_records_allowed)

    total_verified_positive_records = get_total_lines_count(
        verified_positive_records_file_path) if os.path.exists(
            verified_positive_records_file_path) else 0
    total_verified_negative_records = get_total_lines_count(
        verified_negative_records_file_path) if os.path.exists(
            verified_negative_records_file_path) else 0

    total_new_records_labeled_this_session = 0
    total_new_records_labeled_using_current_models = 0
    model_accuracy_counts = {}  # NOTE(review): appears unused in this function.

    verified_positive_records_file = open(verified_positive_records_file_path,
                                          'a+',
                                          encoding='utf-8',
                                          errors='ignore')
    verified_negative_records_file = open(verified_negative_records_file_path,
                                          'a+',
                                          encoding='utf-8',
                                          errors='ignore')
    new_models = None
    while True:
        # Auto-rebuild models on the first pass (new_models is None) and then
        # every models_auto_regen_records_threshold labeled records. The
        # output files are closed during the rebuild (the rebuild reads them)
        # and reopened afterwards.
        if config.auto_regen_models == True and (
                new_models is None or
                total_new_records_labeled_using_current_models >=
                config.models_auto_regen_records_threshold):
            logging.info(
                'Models need to re-regenerated because {} records have been labeled in this session without models regenerated.'
                .format(total_new_records_labeled_using_current_models))
            bulk_close_files([
                verified_positive_records_file, verified_negative_records_file
            ])
            new_models = __modeling_helper.rebuild_models(
                verified_positive_records_file_path,
                verified_negative_records_file_path,
                already_processed_record_numbers_file)
            for (name, classifier, vectorizer, score) in new_models:
                logging.info('{}-->{}'.format(name, score))
            output_files = bulk_open_files([
                verified_positive_records_file_path,
                verified_negative_records_file_path
            ], 'a+')
            verified_positive_records_file = output_files[0]
            verified_negative_records_file = output_files[1]
            if new_models is not None:
                models = new_models
                total_new_records_labeled_using_current_models = 0

        logging.info(
            '-------------------------------------------------------------------'
        )
        # Fixed mode pins the input file; otherwise pick one at random.
        file_to_read_basename = mode if mode is not None else random.choice(
            [key for key in already_read_records])
        if file_to_read_basename == 'pos':
            file_to_read_basename = potential_positive_records_file_basename
        elif file_to_read_basename == 'neg':
            file_to_read_basename = potential_negative_records_file_basename
        elif file_to_read_basename == 'pos?':
            file_to_read_basename = questionable_positive_records_file_basename
        elif file_to_read_basename == 'neg?':
            file_to_read_basename = questionable_negative_records_file_basename
        logging.info(
            'So far => POS: {}, NEG: {}. Next file to look at: {}. '
            'Number of records before models auto re-generated: {}'.format(
                total_verified_positive_records,
                total_verified_negative_records, file_to_read_basename,
                config.models_auto_regen_records_threshold -
                total_new_records_labeled_using_current_models))

        file_to_read = None
        # NOTE(review): 'aleady' is a pre-existing misspelling, kept as-is.
        aleady_read_record_numbers = already_read_records[
            file_to_read_basename]
        file_to_read = input_file_basename_to_full_path_map[
            file_to_read_basename]

        # Keep drawing random record numbers until one is found that is not a
        # semantic duplicate of records already in the verified set.
        record_number_to_read = None
        line = None
        line_id = None
        while record_number_to_read is None:
            record_number_to_read = get_unique_random_record_number(
                total_available_records[file_to_read_basename],
                aleady_read_record_numbers)
            logging.info('Input File: {}'.format(
                os.path.basename(file_to_read)))
            logging.info('Record Number: {}'.format(record_number_to_read))
            line = get_line(file_to_read, record_number_to_read)
            line_id = line[:40]
            # Hash of the record with the configured ignore-pattern stripped
            # and case folded, so near-identical records collide.
            line_hash = hashlib.sha1(
                re.sub(config.duplicate_record_check_ignore_pattern, '',
                       line).upper().encode(errors='ignore')).hexdigest()
            if line_hash not in semantic_duplicates_table:
                semantic_duplicates_table[line_hash] = 0
            if semantic_duplicates_table[
                    line_hash] >= config.max_semantic_duplicate_records_allowed:
                logging.info(
                    'This is a semantically duplicate record. There are already {} copies in the set. Skipping...'
                    .format(semantic_duplicates_table[line_hash]))
                record_number_to_read = None  # retry with another record
        logging.info(
            'Duplicates of this record in the verified set before this: {}'.
            format(semantic_duplicates_table[line_hash]))
        semantic_duplicates_table[line_hash] += 1

        logging.info('')
        logging.info(line)
        logging.info('')
        logging.info('SUGGESTIONS:')
        # First suggestion is derived from which file the record came from.
        suggestions = []
        suggestions.append(get_label_from_filename(file_to_read_basename))
        logging.info(' Per candidate extractor: {}'.format(suggestions[0]))
        classification_results = []
        overall_suggestion_model_name = 'overall.decision_support'
        overall_suggestion = None
        if len(models) > 0:
            classification_results = __classification_helper.classify(
                line, models
            )  # returns tuple: (name, (predicted_classification, positive_proba))
            for (model_name, result) in classification_results:
                suggestions.append(result[0])
                # accuracy is (all-time, last-500, last-100) fractions.
                accuracy = get_labeling_accuracy(
                    model_name, sharedlib.abspath(config.output_dir))
                logging.info(
                    ' Per {} (Past accuracy {:}%/{:}%/{:}%): {}'.format(
                        model_name, round(accuracy[0] * 100, 2),
                        round(accuracy[1] * 100, 2),
                        round(accuracy[2] * 100, 2), result[0].upper()))
        else:
            logging.info(
                ' No trained model available to provide a suggestion.')
        overall_suggestion_accuracy = get_labeling_accuracy(
            overall_suggestion_model_name, sharedlib.abspath(config.output_dir))
        # Consensus of the extractor suggestion plus all model suggestions.
        overall_suggestion = get_likely_suggestion(suggestions)
        logging.info('OVERALL (Past accuracy {:}%/{:}%/{:}%): {}'.format(
            round(overall_suggestion_accuracy[0] * 100, 2),
            round(overall_suggestion_accuracy[1] * 100, 2),
            round(overall_suggestion_accuracy[2] * 100, 2),
            overall_suggestion))
        logging.info('')
        logging.info(
            '[P]ositive, [N]egative, [U]nknown, [R]ebuild Models or [Q]uit? ')
        logging.info('')

        # Block until the operator presses one of the accepted keys.
        decision = None
        while (decision != 'q' and decision != 'r' and decision != 'p'
               and decision != 'n' and decision != 'u'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                # get_char_input may return bytes depending on platform.
                decision = bytes.decode(decision)
            decision = decision.lower()

        if decision == 'q':
            logging.info('Selected: Quit')
            break
        elif decision == 'r':
            # Manual model rebuild; same close/rebuild/reopen dance as the
            # auto-regen branch above (without the per-model score logging).
            logging.info('Selected: Rebuild models')
            bulk_close_files([
                verified_positive_records_file, verified_negative_records_file
            ])
            new_models = __modeling_helper.rebuild_models(
                verified_positive_records_file_path,
                verified_negative_records_file_path,
                already_processed_record_numbers_file)
            output_files = bulk_open_files([
                verified_positive_records_file_path,
                verified_negative_records_file_path
            ], 'a+')
            verified_positive_records_file = output_files[0]
            verified_negative_records_file = output_files[1]
            if new_models is not None:
                models = new_models
                total_new_records_labeled_using_current_models = 0
            continue
        elif decision == 'p':
            logging.info('Selected: Positive')
            verified_positive_records_file.write(line)
            total_verified_positive_records += 1
            total_new_records_labeled_using_current_models += 1
            total_new_records_labeled_this_session += 1
            # NOTE(review): this membership test is against the basename-keyed
            # dict, not aleady_read_record_numbers — likely a bug; confirm.
            if not record_number_to_read in already_read_records:
                aleady_read_record_numbers[record_number_to_read] = []
            aleady_read_record_numbers[record_number_to_read].append(
                {line_id: 'pos'})
        elif decision == 'n':
            logging.info('Selected: Negative')
            verified_negative_records_file.write(line)
            total_verified_negative_records += 1
            total_new_records_labeled_using_current_models += 1
            total_new_records_labeled_this_session += 1
            # NOTE(review): same wrong-dict membership test as the 'p' branch.
            if not record_number_to_read in already_read_records:
                aleady_read_record_numbers[record_number_to_read] = []
            aleady_read_record_numbers[record_number_to_read].append(
                {line_id: 'neg'})
        else:
            # Unknown: record is consumed (counts toward model regen) but is
            # not written to either verified file and does not bump the
            # per-session total.
            # NOTE(review): same wrong-dict membership test as above.
            if not record_number_to_read in already_read_records:
                aleady_read_record_numbers[record_number_to_read] = []
            aleady_read_record_numbers[record_number_to_read].append(
                {line_id: 'unk'})
            total_new_records_labeled_using_current_models += 1
            logging.info('Selected: Unknown')

        # Score every model's suggestion against the human decision.
        for (
                model_name, result
        ) in classification_results:  # result is a tuple: (predicted_classification, predicted_proba)
            is_correct = False
            if decision == 'p' and result[0].lower() == 'pos':
                is_correct = True
            elif decision == 'n' and result[0].lower() == 'neg':
                is_correct = True
            elif decision == 'u':
                # If the human's final decision is indeterminate, machine's feedback is correct no matter what that is.
                is_correct = True
            save_labeling_accuracy(
                model_name,
                os.path.dirname(verified_positive_records_file_path), line_id,
                result[0], is_correct)
        # Score the overall (consensus) suggestion; note 'u' counts as a miss
        # here, unlike the per-model scoring above.
        if overall_suggestion is not None:
            if (decision == 'p' and overall_suggestion.lower() == 'pos') or (
                    decision == 'n' and overall_suggestion.lower() == 'neg'):
                save_labeling_accuracy(overall_suggestion_model_name,
                                       sharedlib.abspath(config.output_dir),
                                       line_id, overall_suggestion, True)
            else:
                save_labeling_accuracy(overall_suggestion_model_name,
                                       sharedlib.abspath(config.output_dir),
                                       line_id, overall_suggestion, False)
        # Persist progress after every decision so a crash loses at most the
        # current record.
        save_already_read_records(already_processed_record_numbers_file,
                                  already_read_records)
    verified_positive_records_file.close()
    verified_negative_records_file.close()