Example #1
def initialize():
    base_path = os.path.dirname(__file__)
    add_to_path(os.path.abspath(os.path.join(base_path, '..', '..', 'shared')))
    add_to_path(os.path.abspath(os.path.join(base_path, 'lib')))

    global log_file_path
    log_file_path = os.path.join(base_path, 'out', 'why_session.log')

    import config
    import sharedlib
    sharedlib.initialize(base_path, log_file_path, config.remote_server)

    sharedlib.create_dirs([
        sharedlib.abspath(os.path.join(base_path, 'out')),
        sharedlib.abspath(os.path.join(base_path, 'file_chunks'))
    ])
def get_labeling_accuracy(model_name, output_dir):
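    # Read <model_name>_accuracy.json from output_dir and return the fraction of
    # correctly labeled records over all time, the last 500, and the last 100.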
    accuracy_file_path = os.path.join(sharedlib.abspath(output_dir),
                                      model_name + '_accuracy.json')
    if not os.path.exists(accuracy_file_path):
        return (
            0, 0, 0
        )  # Tuple structure is: (<all time accuracy>, <accuracy over last 500>, <accuracy over last 100>)

    with open(accuracy_file_path, 'r') as f:
        accuracy_data = json.load(f)

    if not accuracy_data:  # No previous (or empty) accuracy info for this model
        return (0, 0, 0)

    total_all_time = len(accuracy_data)
    last_100 = accuracy_data[-100:]
    last_500 = accuracy_data[-500:]
    total_last_100 = len(last_100)
    total_last_500 = len(last_500)

    correct_all_time = len(
        [item for item in accuracy_data if item['correct']])
    correct_last_100 = len(
        [item for item in last_100 if item['correct']])
    correct_last_500 = len(
        [item for item in last_500 if item['correct']])

    return (correct_all_time / total_all_time,
            correct_last_500 / total_last_500,
            correct_last_100 / total_last_100)
Example #3
def main(args=None):
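    # Search every *.process.txt file in the output directory for the report/record
    # number (treated as a case-insensitive regex) given as the first argument.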
    initialize()

    if args is None:
        args = sys.argv[1:]

    if len(args) == 0:
        logging.info(
            'Usage: python why.py <Report or Text Record Number> [return_on_first_find=True]'
        )
        return

    output_dir = 'out'
    return_on_first_find = len(args) > 1 and args[1].lower() != 'false'

    import sharedlib

    if not os.path.isabs(output_dir):
        output_dir = sharedlib.abspath(
            os.path.join(os.path.dirname(__file__), output_dir))

    match_count = 0
    for filename in os.listdir(output_dir):
        if filename.endswith('.process.txt'):
            logging.info('Looking for {} in: {}...'.format(args[0], filename))
            with open(os.path.join(output_dir, filename), 'r') as f:
                for line in f:
                    match = re.search(args[0], line, re.IGNORECASE)
                    if match is not None:
                        logging.info('{}=> {}.'.format(filename, line))
                        match_count += 1
                        if return_on_first_find:
                            return
    if match_count == 0:
        logging.info('Nothing found for: {}.'.format(args[0]))
Example #4
def initialize():
    base_path = os.path.dirname(__file__)
    add_to_path(os.path.abspath(os.path.join(base_path, '..', '..', 'shared')))
    add_to_path(os.path.abspath(os.path.join(base_path, 'lib')))

    global log_file_path
    log_file_path = os.path.join(
        base_path, 'out', 'modeling_{}.log'.format(
            datetime.datetime.now().strftime("%Y-%m-%dT%H%M%S")))

    import config
    import sharedlib
    sharedlib.initialize(base_path, log_file_path, config.remote_server)

    sharedlib.create_dirs([
        sharedlib.abspath(os.path.join(base_path, 'in')),
        sharedlib.abspath(os.path.join(base_path, 'out'))
    ])
def generate_models_per_config(input_data_files):
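    # Download each configured labeled archive from the remote server when missing
    # (or when always_download is set), then pass the accumulated positive/negative
    # files to generate_models().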
    input_dir = sharedlib.abspath(config.input_dir)
    output_dir = sharedlib.abspath(config.output_dir)
    start_time = datetime.datetime.now()
    log('modeler::generate_models_per_config() starting at {}'.format(
        start_time))

    positive_records_files = []
    negative_records_files = []
    log('Checking if labeled archive(s) need to be downloaded...')
    for input_data_file_set in input_data_files:
        positive_records_file = os.path.join(
            input_dir, input_data_file_set['positive_records_file'])
        negative_records_file = os.path.join(
            input_dir, input_data_file_set['negative_records_file'])
        if (input_data_file_set['always_download']
                or not os.path.exists(positive_records_file)
                or not os.path.exists(negative_records_file)):
            log('Labeled archive for input data needs to be downloaded.')
            positive_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['positive_records_file'])
            negative_records_file_uri = sharedlib.join_remote_server_paths(
                config.remote_server['base_uri'],
                input_data_file_set['remote_blob_dir'],
                input_data_file_set['negative_records_file'])
            sharedlib.download_file(positive_records_file_uri,
                                    positive_records_file, True)
            sharedlib.download_file(negative_records_file_uri,
                                    negative_records_file, True)

        log('Positive records file: {}'.format(
            os.path.basename(positive_records_file)))
        log('Negative records file: {}'.format(
            os.path.basename(negative_records_file)))

        positive_records_files.append(positive_records_file)
        negative_records_files.append(negative_records_file)

        generate_models(positive_records_files, negative_records_files,
                        config.models,
                        config.duplicate_record_check_ignore_pattern,
                        output_dir, config.upload_output_to_remote_server)
Example #6
def upload_output_to_remote_server(pattern_to_match=None):
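    # Upload every file in the output directory whose name ends with
    # pattern_to_match (default '.zip') to the remote labeling candidates directory.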
    import config
    import sharedlib

    if pattern_to_match is None:
        pattern_to_match = '.zip'

    logging.info('Uploading output of the previous run(s) to the remote server...')
    output_dir = sharedlib.abspath(config.output_dir)
    files_in_output_dir = os.listdir(output_dir)
    files_to_upload = [os.path.join(output_dir, f) for f in files_in_output_dir if f.lower().endswith(pattern_to_match)]
    sharedlib.upload_files_to_remote_server_with_prompt(files_to_upload, config.remote_server['labeling_candidates_dir'])
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
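    # Download the auto-labeled positive/negative record files, the input line-count
    # file, and the already-processed record numbers file from the remote server.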
    auto_labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_auto_labeled_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        auto_labeled_base_uri))

    remote_auto_labeled_files_from_config = remote_server_files[
        'auto_labeled_files']

    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'autolabeled_positive_records_blob']),
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        not remote_auto_labeled_files_from_config[
            'skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'autolabeled_negative_records_blob']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        not remote_auto_labeled_files_from_config[
            'skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'input_file_total_lines_count_blob']),
        sharedlib.abspath(output_files['input_file_total_lines_count_file']),
        not remote_auto_labeled_files_from_config[
            'skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            auto_labeled_base_uri, remote_auto_labeled_files_from_config[
                'already_processed_record_numbers_blob']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']),
        not remote_auto_labeled_files_from_config[
            'skip_download_if_already_present'])
def download_labeled_seed_files(remote_server_config, remote_server_files,
                                output_files):
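    # Seed the local auto-labeled positive/negative files with the verified records
    # downloaded from the remote verified-samples directory.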
    verified_records_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading remote server files from {}...'.format(
        verified_records_base_uri))

    remote_seed_files_from_config = remote_server_files['labeled_seed_files']

    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            verified_records_base_uri,
            remote_seed_files_from_config['verified_positive_records_blob']),
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        not remote_seed_files_from_config['skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            verified_records_base_uri,
            remote_seed_files_from_config['verified_negative_records_blob']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        not remote_seed_files_from_config['skip_download_if_already_present'])
Example #9
def upload_output_to_remote_server():
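    # Upload the auto-labeled output files that exist locally to the remote
    # auto-labeled directory.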
    import config
    import sharedlib

    logging.info(
        'Uploading output of the previous run(s) to the remote server...')

    output_files = config.output_files

    files_to_upload = [
        sharedlib.abspath(output_files['autolabeled_positive_records_file']),
        sharedlib.abspath(output_files['autolabeled_negative_records_file']),
        sharedlib.abspath(output_files['input_file_total_lines_count_file']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file'])
    ]

    output_dir = sharedlib.abspath(config.output_dir)

    files_to_upload = [f for f in files_to_upload if os.path.exists(f)]
    sharedlib.upload_files_to_remote_server_with_prompt(
        files_to_upload, config.remote_server['labeling_auto_labeled_dir'])
Example #10
def split_file(large_file, split_dir, max_records_per_file=50000):
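    # Split a large input file into numbered chunk files with at most
    # max_records_per_file lines each and return the list of chunk paths.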
    logging.info(
        'Splitting file {} into multiple files with max {} records each'.format(
            large_file, max_records_per_file))
    input_file_path = sharedlib.abspath(large_file)
    split_dir = sharedlib.abspath(split_dir)
    input_file_base_name = os.path.basename(input_file_path)

    chunk_number = 0
    line_number = 0
    output_file = None
    chunks = []
    with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            if line_number == 0 or line_number == max_records_per_file:
                # Reset
                line_number = 0
                chunk_number += 1
                if output_file is not None:
                    output_file.close()
                input_file_name_without_ext = os.path.splitext(
                    input_file_base_name)[0]
                chunk_path = os.path.join(
                    split_dir, input_file_name_without_ext +
                    '.{:02d}.txt'.format(chunk_number))
                logging.info('Creating new file: {}...'.format(chunk_path))
                output_file = open(chunk_path,
                                   'w',
                                   encoding='utf-8',
                                   errors='ignore')
                chunks.append(chunk_path)

            line_number += 1
            output_file.write(line)

    logging.info('{} split into {} smaller files.'.format(
        input_file_base_name, len(chunks)))
    if output_file is not None:
        output_file.close()
    return chunks
Example #11
def download_models_from_remote_server(remote_server_config, models_config,
                                       output_dir):
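    # Download each configured model archive found on the remote server, unzip it,
    # and load the pickled classifier (and vectorizer, when present).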
    logging.info('Downloading models...')
    output_dir = sharedlib.abspath(output_dir)
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['trained_models_dir'])

    models_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['trained_models_dir'])

    models = []
    for model_config in models_config:
        name_zip_tuple = (model_config['name'], model_config['archive_name'],
                          os.path.join(output_dir,
                                       model_config['archive_name']))
        classifier = None
        vectorizer = None
        if name_zip_tuple[1] in remote_files:
            sharedlib.download_file(
                sharedlib.join_remote_server_paths(
                    models_base_uri, model_config['archive_name']),
                name_zip_tuple[2], True)
            sharedlib.unzip(name_zip_tuple[2], output_dir)
            pickle_file = os.path.join(output_dir,
                                       name_zip_tuple[0] + '.pickle')
            if os.path.exists(pickle_file):
                classifier = sharedlib.load_pickle(pickle_file)

        vectorizer_pickle_file = os.path.join(
            output_dir, name_zip_tuple[0] + '.vectorizer.pickle')
        if os.path.exists(vectorizer_pickle_file):
            logging.info('Vectorizer pickle file: {}'.format(
                os.path.basename(vectorizer_pickle_file)))
            logging.info('Loading the pickled vectorizer...')
            vectorizer = sharedlib.load_pickle(vectorizer_pickle_file)
        else:
            logging.info(
                'No vectorizer (expected: {}) found for this model.'.format(
                    vectorizer_pickle_file))

        if classifier is not None:
            models.append((name_zip_tuple[0], classifier, vectorizer))
        else:
            logging.info(
                'Could not find pickled classifier in the package {} on the Remote Server'
                .format(name_zip_tuple[1]))

    logging.info('{} MODELS LOADED'.format(len(models)))
    return models
Example #12
def upload_output_to_remote_server(also_upload_merged_input_files):
    import config
    import sharedlib

    logging.info(
        'Uploading output of the previous run(s) to the remote server...')

    output_files = config.output_files

    files_to_upload = [
        sharedlib.abspath(output_files['verified_positive_records_file']),
        sharedlib.abspath(output_files['verified_negative_records_file']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file'])
    ]

    output_dir = sharedlib.abspath(config.output_dir)

    accuracy_file_pattern = re.compile(r'.*_accuracy\.json')
    accuracy_files = [
        sharedlib.abspath(os.path.join(output_dir, file_name))
        for file_name in os.listdir(output_dir)
        if re.search(accuracy_file_pattern, file_name) is not None
    ]

    files_to_upload += accuracy_files

    if also_upload_merged_input_files:
        files_to_upload += [
            sharedlib.abspath(output_files['potential_positive_records_file']),
            sharedlib.abspath(output_files['potential_negative_records_file']),
            sharedlib.abspath(
                output_files['questionable_positive_records_file']),
            sharedlib.abspath(
                output_files['questionable_negative_records_file'])
        ]

    files_to_upload = [f for f in files_to_upload if os.path.exists(f)]
    sharedlib.upload_files_to_remote_server_with_prompt(
        files_to_upload, config.remote_server['labeling_verified_samples_dir'])
def save_labeling_accuracy(model_name, output_dir, record_id, classification,
                           is_correct):
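    # Append one labeling outcome (record id, classification, correctness) to the
    # model's <model_name>_accuracy.json file, creating it if necessary.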
    accuracy_file_path = os.path.join(sharedlib.abspath(output_dir),
                                      model_name + '_accuracy.json')

    accuracy_data = None
    if os.path.exists(accuracy_file_path):
        with open(accuracy_file_path, 'r') as f:
            accuracy_data = json.load(f)

    if accuracy_data is None:  # No previous accuracy info for this model
        accuracy_data = []

    item = {}
    item['timestamp'] = datetime.datetime.now().isoformat()
    item['recordid'] = record_id
    item['classification'] = classification
    item['correct'] = is_correct

    accuracy_data.append(item)
    with open(accuracy_file_path, 'w') as f:
        json.dump(accuracy_data, f, indent=4)
Example #14
def merge_file_sets(file_base_name, out_dir, positive_files, negative_files,
                    maybe_positive_files, maybe_negative_files,
                    process_log_file_paths):
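    # Merge the per-chunk output files into one file per category (potential and
    # questionable positive/negative, plus the process log) and return the paths.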
    output_dir = sharedlib.abspath(out_dir)
    file_name_without_ext = os.path.splitext(file_base_name)[0]
    positive_records_output_file_path = os.path.join(
        output_dir, file_name_without_ext + '.potential_pos.txt')
    negative_records_output_file_path = os.path.join(
        output_dir, file_name_without_ext + '.potential_neg.txt')
    maybe_positive_records_output_file_path = os.path.join(
        output_dir, file_name_without_ext + '.questionable_pos.txt')
    maybe_negative_records_output_file_path = os.path.join(
        output_dir, file_name_without_ext + '.questionable_neg.txt')
    process_log_file_path = os.path.join(
        output_dir, file_name_without_ext + '.process.txt')

    logging.info('Merging {} positive labeled files into: {}...'.format(
        len(positive_files), positive_records_output_file_path))
    merge_files(positive_files, positive_records_output_file_path)
    logging.info('Merging {} negative labeled files into: {}...'.format(
        len(negative_files), negative_records_output_file_path))
    merge_files(negative_files, negative_records_output_file_path)
    logging.info('Merging {} maybe positive labeled files into: {}...'.format(
        len(maybe_positive_files), maybe_positive_records_output_file_path))
    merge_files(maybe_positive_files, maybe_positive_records_output_file_path)
    logging.info('Merging {} maybe negative labeled files into: {}...'.format(
        len(maybe_negative_files), maybe_negative_records_output_file_path))
    merge_files(maybe_negative_files, maybe_negative_records_output_file_path)
    logging.info('Merging {} process log files into: {}...'.format(
        len(process_log_file_paths), process_log_file_path))
    merge_files(process_log_file_paths, process_log_file_path)

    return (positive_records_output_file_path,
            negative_records_output_file_path,
            maybe_positive_records_output_file_path,
            maybe_negative_records_output_file_path, process_log_file_path)
Example #15
def download_remote_server_files(remote_server_config, remote_server_files,
                                 output_files):
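    # Download the potential, questionable, and verified record files, the
    # already-processed record numbers, and any per-model accuracy files.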
    labeled_base_uri = sharedlib.join_remote_server_paths(
        remote_server_config['base_uri'],
        remote_server_config['labeling_verified_samples_dir'])
    logging.info('Downloading cloud files from {}'.format(labeled_base_uri))

    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['potential_positive_records_blob']),
        sharedlib.abspath(output_files['potential_positive_records_file']),
        not remote_server_files['skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['potential_negative_records_blob']),
        sharedlib.abspath(output_files['potential_negative_records_file']),
        not remote_server_files['skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['questionable_positive_records_blob']),
        sharedlib.abspath(output_files['questionable_positive_records_file']),
        not remote_server_files['skip_download_if_already_present'])
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['questionable_negative_records_blob']),
        sharedlib.abspath(output_files['questionable_negative_records_file']),
        not remote_server_files['skip_download_if_already_present'])

    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['verified_positive_records_blob']),
        sharedlib.abspath(output_files['verified_positive_records_file']),
        True)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['verified_negative_records_blob']),
        sharedlib.abspath(output_files['verified_negative_records_file']),
        True)
    sharedlib.download_file(
        sharedlib.join_remote_server_paths(
            labeled_base_uri,
            remote_server_files['already_processed_record_numbers_blob']),
        sharedlib.abspath(
            output_files['already_processed_record_numbers_file']), True)

    logging.info('Downloading model labeling accuracy files...')
    accuracy_file_pattern = re.compile(r'.*_accuracy\.json')
    remote_files = sharedlib.get_list_of_files_from_remote_server(
        remote_server_config['labeling_verified_samples_dir'])
    accuracy_files = [
        file_name for file_name in remote_files
        if re.search(accuracy_file_pattern, file_name) is not None
    ]

    for accuracy_file in accuracy_files:
        file_uri = sharedlib.join_remote_server_paths(labeled_base_uri,
                                                      accuracy_file)
        file_local_path = sharedlib.abspath(
            os.path.join(
                os.path.dirname(
                    output_files['already_processed_record_numbers_file']),
                accuracy_file))
        sharedlib.download_file(file_uri, file_local_path, True)
Example #16
def autolabel(mode, 
         input_files,
         autolabeled_positive_records_file_path, 
         autolabeled_negative_records_file_path, 
         already_processed_record_numbers_file_path, 
         input_file_total_lines_count_file_path, model):
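    # Interactive auto-labeling loop: QC the master labeled set, rebuild the model,
    # classify randomly sampled records from the input files, keep only
    # high-confidence predictions (in positive/negative parity) in pending-QC files,
    # then QC the pending records and optionally merge them into the master set.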
        
    autolabeled_positive_records_file_basename = os.path.basename(autolabeled_positive_records_file_path).lower()
    autolabeled_negative_records_file_basename = os.path.basename(autolabeled_negative_records_file_path).lower()

    autolabled_pos_duplicates_table = remove_semantically_duplicate_records(autolabeled_positive_records_file_path, config.duplicate_record_check_ignore_pattern, config.max_semantic_duplicate_records_allowed)
    autolabled_neg_duplicates_table = remove_semantically_duplicate_records(autolabeled_negative_records_file_path, config.duplicate_record_check_ignore_pattern, config.max_semantic_duplicate_records_allowed)

    sharedlib.remove_duplicate_records([autolabeled_positive_records_file_path, autolabeled_negative_records_file_path])
    
    # Perform QC of the master labeled set    
    while True:
        # QC until it passes 100% or user skips
        new_model = __modeling_helper.rebuild_models(autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, already_processed_record_numbers_file_path, input_file_total_lines_count_file_path)[0]
        (qc_score, user_aborted)  = perform_manual_qc(new_model, autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, 50, 20)

        if qc_score != 1 and not user_aborted:
            logging.info('QC of the sampling of the entire dataset found at least one correction needed (QC Score: {:.2f}). Performing another round of QC...'.format(qc_score))
            continue

        logging.info('QC (Score: {:.2f}) passed or user skipped. Do you want to continue auto-labeling? [Y]es to continue; [N]o to quit, [R] to re-QC...'.format(qc_score))
        decision = None
        while (decision != 'y' and decision != 'n' and decision != 'r'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                decision = bytes.decode(decision)
            decision = decision.lower()

        logging.info('Selected: {}'.format(decision))
        if decision == 'n':
            return
        elif decision == 'r':
            continue
        else:
            break  # QC of master set complete, exit the loop and continue with auto-labeling.

    total_new_records_labeled_this_session = 0
    autolabeled_positive_records_pending_qc_file_path = sharedlib.abspath(os.path.join(config.output_dir, 'positive_records_pending_qc.txt'))
    autolabeled_negative_records_pending_qc_file_path = sharedlib.abspath(os.path.join(config.output_dir, 'negative_records_pending_qc.txt'))

    # Create the model for auto labeling
    while True:
        logging.info('Confirm to continue with auto-labeling. [Y]es, [N]o: ')
        decision = None
        while (decision != 'y' and decision != 'n'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                decision = bytes.decode(decision)
            decision = decision.lower()

        logging.info('Selected: {}'.format(decision))
        if decision == 'n':
            break

        logging.info('[Re]building model to be used in classification...')

        new_model = __modeling_helper.rebuild_models(autolabeled_positive_records_file_path, autolabeled_negative_records_file_path, already_processed_record_numbers_file_path, input_file_total_lines_count_file_path)[0]
        min_required_model_score = config.min_model_score_for_auto_labeling
        model_score = new_model[3]
        if model_score < min_required_model_score:
            logging.info('Model accuracy score is {}, which is less than the minimum required ({}) for auto labeling. Quitting...'.format(model_score, min_required_model_score))
            break
        
        
        total_new_records_labeled_using_current_models = 0
        previous_qc_score = .8 # For the very first QC of the session, we assume 20% failure. This then gets updated with every QC performed.

        input_file_basename_to_full_path_map = {}

        for input_file in input_files:
            input_file_basename_to_full_path_map[os.path.basename(input_file).lower()] = sharedlib.abspath(input_file)

        already_read_records = get_already_read_records(already_processed_record_numbers_file_path)
        if already_read_records is None or len(already_read_records) == 0:
            logging.info('Already read records data not found. Creating new...')
            already_read_records = {}

        for input_file in input_files:
            input_file_basename = os.path.basename(input_file).lower()
            if input_file_basename not in already_read_records:
                already_read_records[os.path.basename(input_file).lower()] = {}

        total_available_records = get_total_available_records(input_file_total_lines_count_file_path)
        if total_available_records is None or len(total_available_records) == 0:
            logging.info('Input file total available records data not found. Creating new...')
            total_available_records = {}

        for input_file in input_files:
            input_file_basename = os.path.basename(input_file).lower()
            if input_file_basename not in total_available_records:
                logging.info('Creating new...')
                total_available_records[os.path.basename(input_file).lower()] = get_total_lines_count(input_file_basename_to_full_path_map[input_file_basename])

        save_already_read_records(already_processed_record_numbers_file_path, already_read_records)
        save_total_available_records(input_file_total_lines_count_file_path, total_available_records)

        total_autolabeled_positive_records = get_total_lines_count(autolabeled_positive_records_file_path) if os.path.exists(autolabeled_positive_records_file_path) else 0
        total_autolabeled_negative_records = get_total_lines_count(autolabeled_negative_records_file_path) if os.path.exists(autolabeled_negative_records_file_path) else 0

        autolabeled_positive_records_pending_qc_file = open(autolabeled_positive_records_pending_qc_file_path, 'w', encoding='utf-8', errors='ignore')
        autolabeled_negative_records_pending_qc_file = open(autolabeled_negative_records_pending_qc_file_path, 'w', encoding='utf-8', errors='ignore')

        total_autolabeled_positive_records_pending_qc = 0
        total_autolabeled_negative_records_pending_qc = 0

        input_file_basenames = [key for key in total_available_records for input_file in input_files if key in input_file.lower()]
        while total_new_records_labeled_using_current_models <= config.models_auto_regen_records_threshold:
            logging.info('-------------------------------------------------------------------')
            file_to_read_basename = None if mode is None else next((file for file in input_file_basenames if file == mode.lower()), None)
            if file_to_read_basename is None:
                file_to_read_basename = random.choice(input_file_basenames)

            file_to_read = None
            aleady_read_record_numbers = already_read_records[file_to_read_basename]
            record_number_to_read = get_unique_random_record_number(total_available_records[file_to_read_basename],
                                                                    aleady_read_record_numbers)
            file_to_read = input_file_basename_to_full_path_map[file_to_read_basename]

            minibatch_labeled_records_count = 0
            minibatch_attempted_records_count = 0

            file_to_read_handle = open(file_to_read, 'r', encoding='utf-8', errors='ignore')
            line_number = 0

            # Advance to the record being read. A 'mini batch' will begin at that location until <minibatch_size> samples are found. 
            logging.info('Locating the record {} in this file...'.format(record_number_to_read))
            while line_number < record_number_to_read:
                next(file_to_read_handle)
                line_number += 1

            configured_minibatch_size = config.minibatch_size
        
            logging.info('Entering minibatch loop for file {}, starting Record# {}. Looking for {} labeled records in this file...'.format(file_to_read_basename, record_number_to_read, configured_minibatch_size))
            while minibatch_labeled_records_count < configured_minibatch_size:
                if minibatch_attempted_records_count != 0:
                    # This means this pass is not the first in the minibatch loop. Advance the record number.
                    record_number_to_read += 1

                if record_number_to_read >= total_available_records[file_to_read_basename]:
                    # End of the file reached. Exit the minibatch loop to determine the next file and/or entry point
                    break

                if total_new_records_labeled_using_current_models > config.models_auto_regen_records_threshold:
                    # Model re-generation is due
                    break
                
                logging.info('So far pending QC => POS: {}, NEG: {}. Model accuracy {:.2f}. File: {} Record#: {}. Auto-labeled since last model generation: {}. Still looking for {} labeled in this minibatch.'.format(total_autolabeled_positive_records_pending_qc, total_autolabeled_negative_records_pending_qc, new_model[3], file_to_read_basename, record_number_to_read, total_new_records_labeled_using_current_models, (config.minibatch_size - minibatch_labeled_records_count)))
                line = file_to_read_handle.readline()
                minibatch_attempted_records_count += 1

                record_hash = hashlib.sha1(re.sub(config.duplicate_record_check_ignore_pattern, '', line).upper().encode(errors='ignore')).hexdigest()

                line_id = line[:40]
                (model_name, result) = __classification_helper.classify(line, [new_model])[0] # returns tuple: (name, (predicted_classification, positive_proba))
        
                pos_prob = result[1]
                neg_prob = 1 - pos_prob

                if pos_prob >= config.min_probability_for_auto_labeling:
                    if (total_autolabeled_positive_records + total_autolabeled_positive_records_pending_qc) > (total_autolabeled_negative_records + total_autolabeled_negative_records_pending_qc):
                        logging.info('This is a positive record, but the search is for a negative record to maintain positive/negative parity. Skipping...')
                        # We maintain positive/negative count parity as we go
                        continue

                    # Do not allow more than n duplicates to prevent bias
                    if record_hash not in autolabled_pos_duplicates_table:
                        autolabled_pos_duplicates_table[record_hash] = 0 # Initialize the hash table entry

                    if autolabled_pos_duplicates_table[record_hash] >= config.max_semantic_duplicate_records_allowed:
                        logging.info('This is a technically unique but semantically duplicate record. There are already {} copies in the positive set. Skipping...'.format(autolabled_pos_duplicates_table[record_hash]))
                        continue

                    autolabled_pos_duplicates_table[record_hash] += 1


                    logging.info(line)
                    logging.info('Auto-Selected: Positive')
                    autolabeled_positive_records_pending_qc_file.write(line)
                    total_autolabeled_positive_records_pending_qc += 1
                    minibatch_labeled_records_count += 1
                    total_new_records_labeled_using_current_models += 1
                    total_new_records_labeled_this_session += 1
                    if record_number_to_read not in aleady_read_record_numbers:
                        aleady_read_record_numbers[record_number_to_read] = []
                    aleady_read_record_numbers[record_number_to_read].append({line_id: positive_class_str})

                elif neg_prob >= config.min_probability_for_auto_labeling:
                    if (total_autolabeled_negative_records + total_autolabeled_negative_records_pending_qc) > (total_autolabeled_positive_records + total_autolabeled_positive_records_pending_qc) :
                        logging.info('This is a negative record, but the search is for a positive record to maintain positive/negative parity. Skipping...')
                        # We maintain positive/negative count parity as we go
                        continue

                    # Do not allow more than n duplicates to prevent bias
                    if record_hash not in autolabled_neg_duplicates_table:
                        autolabled_neg_duplicates_table[record_hash] = 0 # Initialize the hash table entry

                    if autolabled_neg_duplicates_table[record_hash] >= config.max_semantic_duplicate_records_allowed:
                        logging.info('This is a technically unique but semantically duplicate record. There are already {} copies in the negative set. Skipping...'.format(autolabled_neg_duplicates_table[record_hash]))
                        continue

                    autolabled_neg_duplicates_table[record_hash] += 1


                    logging.info(line)
                    logging.info('Auto-selected: Negative')
                    autolabeled_negative_records_pending_qc_file.write(line)
                    minibatch_labeled_records_count += 1
                    total_autolabeled_negative_records_pending_qc += 1
                    total_new_records_labeled_using_current_models += 1
                    total_new_records_labeled_this_session += 1
                    if record_number_to_read not in aleady_read_record_numbers:
                        aleady_read_record_numbers[record_number_to_read] = []
                    aleady_read_record_numbers[record_number_to_read].append({line_id: negative_class_str})

                else:
                    logging.info('This record (POS: {:.2f}, NEG: {:.2f}) is not strong enough (min required: {:.2f}) to be in the labeled set. Skipping...'.format(pos_prob, neg_prob, config.min_probability_for_auto_labeling))
                    continue

        
                save_already_read_records(already_processed_record_numbers_file_path, already_read_records)

            file_to_read_handle.close()

        autolabeled_positive_records_pending_qc_file.close()
        autolabeled_negative_records_pending_qc_file.close()
        logging.info('{} records auto-labeled since the last model. These new records must be QCed...'.format(total_new_records_labeled_using_current_models))
        while True:
            total_autolabeled_positive_records_pending_qc = sharedlib.get_total_lines_count(autolabeled_positive_records_pending_qc_file_path)
            total_autolabeled_negative_records_pending_qc = sharedlib.get_total_lines_count(autolabeled_negative_records_pending_qc_file_path)
            total_pending_qc_records_count = total_autolabeled_positive_records_pending_qc + total_autolabeled_negative_records_pending_qc
            sample_size = math.ceil(total_pending_qc_records_count * ((1 - previous_qc_score) * config.inaccuracy_to_qc_sample_size_multiplier))
            if sample_size < 1 or sample_size > total_pending_qc_records_count:
                sample_size = total_pending_qc_records_count

            (qc_score, user_aborted) = perform_manual_qc(new_model, autolabeled_positive_records_pending_qc_file_path, autolabeled_negative_records_pending_qc_file_path, total_new_records_labeled_using_current_models, sample_size)

            previous_qc_score = qc_score
            if qc_score != 1 and not user_aborted:
                logging.info('QC found at least one correction needed (QC Score: {:.2f}). Additional QC will be needed...'.format(qc_score))
                continue

            logging.info('Model re-built and QC (Score: {:.2f}) passed or user skipped. Do you want to merge pending QC records to the master set? [Y]es to continue; [N]o to quit; [R] to re-run QC...'.format(qc_score))
            decision = None
            while (decision != 'y' and decision != 'n' and decision != 'r'):
                decision = sharedlib.get_char_input()
                if not isinstance(decision, str):
                    decision = bytes.decode(decision)
                decision = decision.lower()

            logging.info('Selected: {}'.format(decision))
            if decision == 'n':
                break
            elif decision == 'r':
                continue
            else:
                # User chose 'y'. Proceed with the merge.
                logging.info('Merging pending QC files to positive/negative master set...')
                tmp_merged_positive_file_path = autolabeled_positive_records_file_path + '.tmp'
                sharedlib.merge_files([autolabeled_positive_records_file_path, autolabeled_positive_records_pending_qc_file_path], tmp_merged_positive_file_path)
                shutil.move(tmp_merged_positive_file_path, autolabeled_positive_records_file_path)

                tmp_merged_negative_file_path = autolabeled_negative_records_file_path + '.tmp'
                sharedlib.merge_files([autolabeled_negative_records_file_path, autolabeled_negative_records_pending_qc_file_path], tmp_merged_negative_file_path)
                shutil.move(tmp_merged_negative_file_path, autolabeled_negative_records_file_path)

                total_autolabeled_positive_records = get_total_lines_count(autolabeled_positive_records_file_path) if os.path.exists(autolabeled_positive_records_file_path) else 0
                total_autolabeled_negative_records = get_total_lines_count(autolabeled_negative_records_file_path) if os.path.exists(autolabeled_negative_records_file_path) else 0

                logging.info('Files merged. Total {} positive and {} negative records in autolabeled master set.'.format(total_autolabeled_positive_records, total_autolabeled_negative_records))
                break
Example #17
def extract_records(input_files,
                    output_dir,
                    max_potential_records_to_extract=None,
                    max_questionable_records_to_extract=None):
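    # Split each input file into chunks, extract potential/questionable positive and
    # negative records from the chunks in parallel processes, merge the per-chunk
    # outputs, and optionally zip and upload the merged files.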
    logging.info(
        'Extracting potential positive and negative records from {} file(s)...'
        .format(len(input_files)))

    output_dir = sharedlib.abspath(output_dir)

    sharedlib.dump_list_to_file(
        config.known_positive_records_qualifying_terms,
        os.path.join(output_dir, 'positive_qualifying_criteria.txt'))
    sharedlib.dump_list_to_file(
        config.known_positive_records_disqualifying_terms,
        os.path.join(output_dir, 'positive_disqualifying_criteria.txt'))

    total_positive_count = 0
    total_negative_count = 0

    max_records_per_file = config.file_split_lines_per_file

    known_positive_records_qualifying_terms_regex_list = build_compiled_regex_list(
        config.known_positive_records_qualifying_terms)
    known_positive_records_disqualifying_terms_regex_list = build_compiled_regex_list(
        config.known_positive_records_disqualifying_terms)
    potential_positive_records_qualifying_terms_regex_list = build_compiled_regex_list(
        config.potential_positive_records_qualifying_terms)

    for file_name in input_files:
        positive_records_output_files = []
        negative_records_output_files = []
        maybe_positive_records_output_file_paths = []
        maybe_negative_records_output_file_paths = []
        process_log_file_paths = []

        # Split each file for parallelization
        chunks = split_file(file_name, config.file_split_dir,
                            max_records_per_file)

        chunk_max = None
        if max_potential_records_to_extract is not None:
            chunk_max = math.ceil(
                round(max_potential_records_to_extract / len(chunks), 0))
            if chunk_max <= 0:
                chunk_max = None

        chunk_questionable_records_max = None
        if max_questionable_records_to_extract is not None:
            chunk_questionable_records_max = math.ceil(
                round(max_questionable_records_to_extract / len(chunks), 0))
            if chunk_questionable_records_max <= 0:
                chunk_questionable_records_max = None

        processes = []
        process_return_values = []

        is_first_chunk = True
        for chunk in chunks:
            logging.info(
                'Extracting up to {} potential positive and negative records from {}...'
                .format(chunk_max, chunk))
            chunk_name_without_ext = os.path.splitext(
                os.path.basename(chunk))[0]
            positive_records_output_file_path = os.path.join(
                output_dir, chunk_name_without_ext + '.potential_pos.txt')
            negative_records_output_file_path = os.path.join(
                output_dir, chunk_name_without_ext + '.potential_neg.txt')
            maybe_positive_records_output_file_path = os.path.join(
                output_dir, chunk_name_without_ext + '.questionable_pos.txt')
            maybe_negative_records_output_file_path = os.path.join(
                output_dir, chunk_name_without_ext + '.questionable_neg.txt')
            process_log_file_path = os.path.join(
                output_dir, chunk_name_without_ext + '.process.txt')

            positive_records_output_files.append(
                positive_records_output_file_path)
            negative_records_output_files.append(
                negative_records_output_file_path)
            maybe_positive_records_output_file_paths.append(
                maybe_positive_records_output_file_path)
            maybe_negative_records_output_file_paths.append(
                maybe_negative_records_output_file_path)
            process_log_file_paths.append(process_log_file_path)

            args = (chunk, positive_records_output_file_path,
                    negative_records_output_file_path,
                    maybe_positive_records_output_file_path,
                    maybe_negative_records_output_file_path,
                    process_log_file_path, is_positive, is_negative,
                    known_positive_records_qualifying_terms_regex_list,
                    known_positive_records_disqualifying_terms_regex_list,
                    potential_positive_records_qualifying_terms_regex_list,
                    process_return_values, chunk_max,
                    chunk_questionable_records_max, is_first_chunk)

            process = multiprocessing.Process(
                name=chunk_name_without_ext,
                target=extract_matching_records_from_file,
                args=args)
            processes.append(process)

            is_first_chunk = False

        for process in processes:
            logging.info('Starting process: {}...'.format(process.name))
            process.start()

        for process in processes:
            process.join()

        for (positive_count, negative_count) in process_return_values:
            total_positive_count += positive_count
            total_negative_count += negative_count

        # Merge output files
        (positive_records_output_file_path, negative_records_output_file_path,
         maybe_positive_records_output_file_path,
         maybe_negative_records_output_file_path,
         process_log_file_path) = merge_file_sets(
             os.path.basename(file_name), output_dir,
             positive_records_output_files, negative_records_output_files,
             maybe_positive_records_output_file_paths,
             maybe_negative_records_output_file_paths, process_log_file_paths)

        logging.info('Deleting temporary chunked files...')
        for chunk in chunks:
            logging.info('Deleting {}...'.format(chunk))
            os.remove(chunk)

        # Upload merged files
        if config.upload_output_to_remote_server:
            list_of_files_to_upload = [
                positive_records_output_file_path,
                negative_records_output_file_path,
                maybe_positive_records_output_file_path,
                maybe_negative_records_output_file_path, process_log_file_path
            ]

            archive_path = os.path.join(
                os.path.dirname(positive_records_output_file_path),
                os.path.splitext(os.path.basename(file_name))[0] +
                '.labeling_candidates.zip')
            sharedlib.zip_files(list_of_files_to_upload, archive_path)
            sharedlib.upload_files_to_labeling_candidates_dir([archive_path])
def generate_models(positive_records_files, negative_records_files,
                    models_config, duplicate_record_check_ignore_pattern,
                    output_dir, upload_generated_models_to_remote_server):
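    # Merge and randomize the labeled files (with a no-duplicates variant when any
    # model requires it), train each configured nltk/sklearn model, pickle the
    # classifier and vectorizer, and optionally upload the zipped model archives.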
    output_dir = sharedlib.abspath(output_dir)
    start_time = datetime.datetime.now()
    log('modeler::generate_models() starting at {}'.format(start_time))
    process_log_first_line = 'MAUDE Modeling Process Log. Computer: {}. OS: {} {}  Date/Time: {}. Python Version: {}\n'.format(
        platform.node(), platform.system(), platform.release(), start_time,
        sys.version)

    log(process_log_first_line)

    log('Merging all positive/negative labeled files. Two sets (all and one without duplicate records) will be produced...'
        )

    all_pos_records_file_path = os.path.join(output_dir,
                                             'positive_records_all.txt')
    all_neg_records_file_path = os.path.join(output_dir,
                                             'negative_records_all.txt')
    sharedlib.merge_files(
        [sharedlib.abspath(p) for p in negative_records_files],
        all_neg_records_file_path, False, None)
    sharedlib.merge_files(
        [sharedlib.abspath(p) for p in positive_records_files],
        all_pos_records_file_path, False, None)
    sharedlib.randomize_records(all_pos_records_file_path)
    sharedlib.randomize_records(all_neg_records_file_path)
    log('Combined (merged and randomized) positive labeled (all) file: {}'.
        format(all_pos_records_file_path))
    log('Combined (merged and randomized) negative labeled (all) file: {}'.
        format(all_neg_records_file_path))

    at_least_one_no_dup_model = any(
        model_config['ignore_duplicate_training_records']
        for model_config in models_config)
    if at_least_one_no_dup_model:
        nodups_pos_records_file_path = os.path.join(
            output_dir, 'positive_records_nodups.txt')
        nodups_neg_records_file_path = os.path.join(
            output_dir, 'negative_records_nodups.txt')
        sharedlib.merge_files(
            [sharedlib.abspath(p)
             for p in negative_records_files], nodups_neg_records_file_path,
            True, duplicate_record_check_ignore_pattern)
        sharedlib.merge_files(
            [sharedlib.abspath(p)
             for p in positive_records_files], nodups_pos_records_file_path,
            True, duplicate_record_check_ignore_pattern)
        sharedlib.randomize_records(nodups_pos_records_file_path)
        sharedlib.randomize_records(nodups_neg_records_file_path)
        log('Combined (merged and randomized) positive labeled (no-duplicates) file: {}'
            .format(nodups_pos_records_file_path))
        log('Combined (merged and randomized) negative labeled (no-duplicates) file: {}'
            .format(nodups_neg_records_file_path))

    generated_models = []

    for model_config in models_config:
        model_start_time = datetime.datetime.now()
        model_name = model_config['name']
        log('Starting model generation for: {} at {}...'.format(
            model_name, model_start_time))

        pos_labeled_file_path = None
        neg_labeled_file_path = None

        if 'nltk.naive_bayes' in model_name or 'sklearn' in model_name:
            if model_config['ignore_duplicate_training_records']:
                pos_labeled_file_path = nodups_pos_records_file_path
                neg_labeled_file_path = nodups_neg_records_file_path
            else:
                pos_labeled_file_path = all_pos_records_file_path
                neg_labeled_file_path = all_neg_records_file_path
        else:
            raise ValueError('Unsupported model: {}'.format(model_name))

        classifier = None
        vectorizer = None
        score = None

        if 'nltk.naive_bayes' in model_name:
            classifier, score = _nltk_naive_bayes.generate_model(
                pos_labeled_file_path, neg_labeled_file_path, model_config,
                output_dir)
        else:
            classifier, vectorizer, score = _sklearn.generate_model(
                pos_labeled_file_path, neg_labeled_file_path, model_config,
                output_dir)

        classifier_pickle_file = sharedlib.abspath(
            os.path.join(output_dir, model_name + '.pickle'))
        logging.info('Pickling the model as: {}...'.format(
            os.path.basename(classifier_pickle_file)))
        sharedlib.pickle_object(classifier, classifier_pickle_file)

        vectorizer_pickle_file = None
        if vectorizer is not None:
            vectorizer_pickle_file = sharedlib.abspath(
                os.path.join(output_dir, model_name + '.vectorizer.pickle'))
            logging.info('Pickling the Vectorizer as: {}...'.format(
                os.path.basename(vectorizer_pickle_file)))
            sharedlib.pickle_object(vectorizer, vectorizer_pickle_file)

        logging.info('Model pickled.')

        generated_models.append((model_name, classifier_pickle_file,
                                 vectorizer_pickle_file, score))

        if upload_generated_models_to_remote_server:
            model_archive_name = model_config['archive_name']

            files_to_zip = [
                pos_labeled_file_path, neg_labeled_file_path,
                classifier_pickle_file
            ]
            if vectorizer is not None:
                files_to_zip.append(vectorizer_pickle_file)

            zipped_file = sharedlib.zip_files(
                files_to_zip,
                sharedlib.abspath(os.path.join(output_dir,
                                               model_archive_name)))
            log('Uploading the pickled model ({}) to the Remote Server...'.
                format(model_archive_name))
            sharedlib.upload_files_to_trained_models_dir([zipped_file])

        model_end_time = datetime.datetime.now()
        log('Completed creating model for: {} at {}. Duration: {}...'.format(
            model_name, model_end_time, model_end_time - model_start_time))

    end_time = datetime.datetime.now()
    return generated_models
def build_potential_file_sets(input_files,
                              potential_positive_records_file_merged,
                              potential_negative_records_file_merged,
                              questionable_positive_records_file_merged,
                              questionable_negative_records_file_merged):
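    # Download each labeling-candidate archive when needed and concatenate the
    # per-file-set potential and questionable records into the four merged files.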
    logging.info('Building potential positive and negative files...')

    input_dir = sharedlib.abspath(config.input_dir)

    with open(potential_positive_records_file_merged,
              'w',
              encoding='utf-8',
              errors='ignore') as consolidated_pos:
        with open(potential_negative_records_file_merged,
                  'w',
                  encoding='utf-8',
                  errors='ignore') as consolidated_neg:
            with open(questionable_positive_records_file_merged,
                      'w',
                      encoding='utf-8',
                      errors='ignore') as consolidated_questionable_pos:
                with open(questionable_negative_records_file_merged,
                          'w',
                          encoding='utf-8',
                          errors='ignore') as consolidated_questionable_neg:
                    for input_data_file_set in input_files:
                        potential_positive_records_file = os.path.join(
                            input_dir, input_data_file_set[
                                'potential_positive_records_file'])
                        potential_negative_records_file = os.path.join(
                            input_dir, input_data_file_set[
                                'potential_negative_records_file'])
                        questionable_positive_records_file = os.path.join(
                            input_dir, input_data_file_set[
                                'questionable_positive_records_file'])
                        questionable_negative_records_file = os.path.join(
                            input_dir, input_data_file_set[
                                'questionable_negative_records_file'])
                        if (input_data_file_set['always_download']
                                or not os.path.exists(
                                    potential_positive_records_file)
                                or not os.path.exists(
                                    potential_negative_records_file)):
                            logging.info(
                                'Labeling candidate archive for {} needs to be downloaded.'
                                .format(input_data_file_set['name']))

                            labeling_candidates_file_url = sharedlib.join_remote_server_paths(
                                config.remote_server['base_uri'], config.
                                remote_server['labeling_candidates_dir'],
                                input_data_file_set[
                                    'labeling_candidates_archive_name'])

                            download_zip_file_path = os.path.join(
                                input_dir,
                                input_data_file_set['name'] + '.zip')
                            sharedlib.download_file(
                                labeling_candidates_file_url,
                                download_zip_file_path)
                            logging.info('Extracting auto-labeled archive...')
                            sharedlib.unzip(download_zip_file_path, input_dir)
                            logging.info('Labeling candidate files extracted.')
                        logging.info('Merging {} into {}...'.format(
                            input_data_file_set[
                                'potential_positive_records_file'],
                            potential_positive_records_file_merged))
                        with open(potential_positive_records_file,
                                  encoding='utf-8',
                                  errors='ignore') as fin:
                            for record in fin:
                                if len(record.strip()) == 0:
                                    continue
                                consolidated_pos.write(record)

                        logging.info('Merging {} into {}...'.format(
                            input_data_file_set[
                                'potential_negative_records_file'],
                            potential_negative_records_file_merged))
                        with open(potential_negative_records_file,
                                  encoding='utf-8',
                                  errors='ignore') as fin:
                            for record in fin:
                                if len(record.strip()) == 0:
                                    continue
                                consolidated_neg.write(record)

                        logging.info('Merging {} into {}...'.format(
                            input_data_file_set[
                                'questionable_positive_records_file'],
                            questionable_positive_records_file_merged))
                        with open(questionable_positive_records_file,
                                  encoding='utf-8',
                                  errors='ignore') as fin:
                            for record in fin:
                                if len(record.strip()) == 0:
                                    continue
                                consolidated_questionable_pos.write(record)

                        logging.info('Merging {} into {}...'.format(
                            input_data_file_set[
                                'questionable_negative_records_file'],
                            questionable_negative_records_file_merged))
                        with open(questionable_negative_records_file,
                                  encoding='utf-8',
                                  errors='ignore') as fin:
                            for record in fin:
                                if len(record.strip()) == 0:
                                    continue
                                consolidated_questionable_neg.write(record)
def label_records(mode):
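    """Prepare the candidate record files (downloading the work-in-progress
    set from the remote server when a complete set exists, otherwise building
    it from the configured input data file sets), download the current models,
    run the interactive labeling loop, de-duplicate the verified outputs, and
    optionally upload the results back to the remote server."""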
    input_files = config.input_data_file_sets
    logging.info(
        'Labeling known positive and negative records from {} file(s)...'.
        format(len(input_files)))

    potential_positive_records_file = sharedlib.abspath(
        config.output_files['potential_positive_records_file'])
    questionable_positive_records_file = sharedlib.abspath(
        config.output_files['questionable_positive_records_file'])
    potential_negative_records_file = sharedlib.abspath(
        config.output_files['potential_negative_records_file'])
    questionable_negative_records_file = sharedlib.abspath(
        config.output_files['questionable_negative_records_file'])

    positive_records_output_file = sharedlib.abspath(
        config.output_files['verified_positive_records_file'])
    negative_records_output_file = sharedlib.abspath(
        config.output_files['verified_negative_records_file'])
    already_processed_record_numbers_file = sharedlib.abspath(
        config.output_files['already_processed_record_numbers_file'])

    existing_work_in_progress = __remote_server_helper.all_work_in_progress_files_present_on_remote_server(
        config.remote_server, config.remote_server_files)
    if existing_work_in_progress:
        __remote_server_helper.download_remote_server_files(
            config.remote_server, config.remote_server_files,
            config.output_files)
    else:
        # No complete set of work-in-progress files on the remote server;
        # build new candidate files from the input data files.
        build_potential_file_sets(input_files, potential_positive_records_file,
                                  potential_negative_records_file,
                                  questionable_positive_records_file,
                                  questionable_negative_records_file)

    models = __remote_server_helper.download_models_from_remote_server(
        config.remote_server, config.models, config.models_output_dir)

    label(mode, potential_positive_records_file,
          potential_negative_records_file, questionable_positive_records_file,
          questionable_negative_records_file, positive_records_output_file,
          negative_records_output_file, already_processed_record_numbers_file,
          models)

    sharedlib.remove_duplicate_records(
        [positive_records_output_file, negative_records_output_file])

    if config.upload_output_to_remote_server == True:
        logging.info('Upload output{}? [y/n] '.format(
            '' if existing_work_in_progress else
            ' (POTENTIALLY OVERWRITE CLOUD)'))
        upload_confirmation = sharedlib.get_char_input()
        if not isinstance(upload_confirmation, str):
            upload_confirmation = bytes.decode(upload_confirmation)
        if upload_confirmation == 'y':
            files_to_upload = [
                positive_records_output_file, negative_records_output_file,
                already_processed_record_numbers_file
            ]
            accuracy_file_pattern = re.compile('.*_accuracy.json')
            accuracy_files = [
                sharedlib.abspath(os.path.join(config.output_dir, file_name))
                for file_name in os.listdir(config.output_dir)
                if re.search(accuracy_file_pattern, file_name) is not None
            ]

            files_to_upload += accuracy_files

            if not existing_work_in_progress:
                files_to_upload += [
                    potential_positive_records_file,
                    potential_negative_records_file,
                    questionable_positive_records_file,
                    questionable_negative_records_file
                ]

            sharedlib.upload_files_to_remote_server(
                files_to_upload,
                config.remote_server['labeling_verified_samples_dir'])
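# Usage sketch: mode selects which candidate file records are sampled from
# ('pos', 'neg', 'pos?' or 'neg?'); passing None picks a file at random for
# each record (see label() below), e.g.:
#
#     label_records('pos')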
예제 #21
0
def classify_file(input_data_file, models, positive_signal_regexes_for_false_negative_check, skip_first_record=True, max_records=None):
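    """Classify every record in input_data_file with each loaded model, write
    per-model and overall predicted-positive / predicted-negative record files,
    flag possible false negatives (records classified negative despite matching
    a positive-signal regex), write a pipe-delimited prediction summary, and
    optionally zip and upload the outputs to the remote server."""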
    start_time = datetime.datetime.now()
    log('classifier::classify_file() starting at {}'.format(start_time))

    file_base_name = os.path.basename(input_data_file)
    out_dir = sharedlib.abspath(config.output_dir)
    predicted_pos_file_ext = '.predicted.pos.txt'
    predicted_neg_file_ext = '.predicted.neg.txt'
    possible_false_neg_file_ext = '.possible.false.neg.txt'
    prediction_summary_file_ext = '.prediction.summary.txt'

    overall_predicted_pos_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, predicted_pos_file_ext))
    overall_predicted_neg_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, predicted_neg_file_ext))
    overall_possible_false_neg_records_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, possible_false_neg_file_ext))
    prediction_summary_file_path = os.path.join(out_dir, '{}{}'.format(file_base_name, prediction_summary_file_ext))
    log('Predicted positive records file (overall): {}'.format(overall_predicted_pos_records_file_path))
    log('Predicted negative records file (overall): {}'.format(overall_predicted_neg_records_file_path))
    log('Prediction summary file: {}'.format(prediction_summary_file_path))

    prediction_summary_file = open(prediction_summary_file_path, 'w', encoding='utf-8', errors='ignore')
    prediction_summary_file.write('MDR_REPORT_KEY|MDR_TEXT_KEY|TEXT_TYPE_CODE|PATIENT_SEQUENCE_NUMBER|DATE_REPORT|FOI_TEXT|MODEL_NAME|POS_PROB|NEG_PROB|CLASSIFICATION|HAS_POS_SIGNALS|POS_SIGNAL\n')

    classifiers_info = []
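    # Each entry in classifiers_info is a tuple:
    # (name, classifier, vectorizer,
    #  predicted_pos_path, predicted_pos_file,
    #  predicted_neg_path, predicted_neg_file,
    #  possible_false_neg_path, possible_false_neg_file)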
    for (name, classifier, vectorizer) in models:
        log('Building classifier parameters for {}...'.format(name))
        predicted_positive_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, predicted_pos_file_ext))
        predicted_negative_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, predicted_neg_file_ext))
        possible_false_negative_records_file_path = os.path.join(out_dir, '{}_{}{}'.format(file_base_name, name, possible_false_neg_file_ext))
        log('Predicted positive records file for this classifier: {}'.format(predicted_positive_records_file_path))
        log('Predicted negative records file for this classifier: {}'.format(predicted_negative_records_file_path))
        classifiers_info.append((name, 
                                 classifier, 
                                 vectorizer,
                                 predicted_positive_records_file_path, 
                                 open(predicted_positive_records_file_path, 'w', encoding='utf-8', errors='ignore'), 
                                 predicted_negative_records_file_path, 
                                 open(predicted_negative_records_file_path, 'w', encoding='utf-8', errors='ignore'),
                                 possible_false_negative_records_file_path,
                                 open(possible_false_negative_records_file_path, 'w', encoding='utf-8', errors='ignore')
                                 ))

    file_to_classify = sharedlib.abspath(input_data_file)
    log('Total {} models loaded.'.format(len(classifiers_info)))
    log('File to classify: {}. Reading one record at a time...'.format(file_to_classify))

    total_records = 0
    total_data_records = 0
    total_positive = 0
    total_negative = 0
    positive_percent = 0
    negative_percent = 0

    total_possible_false_negative = 0
    possible_false_negative_percent_of_negatives = 0
    possible_false_negative_percent_of_all_records = 0
    
    overall_predicted_pos_records_file = open(overall_predicted_pos_records_file_path, 'w', encoding='utf-8', errors='ignore')
    overall_predicted_neg_records_file = open(overall_predicted_neg_records_file_path, 'w', encoding='utf-8', errors='ignore')
    overall_possible_false_neg_records_file = open(overall_possible_false_neg_records_file_path, 'w', encoding='utf-8', errors='ignore')
    fin = codecs.open(file_to_classify, mode='r+', encoding='utf-8', errors='ignore')
    for record in fin:
        total_records += 1
        sys.stdout.write('{} => POS: {}/{:.2f}% NEG: {}/{:.2f}% Possible FALSE NEG: {}/{:.2f}% of all NEG; {:.2f}% of all records. Next: {}...\r'.format(file_base_name, 
                                                                                                            total_positive, positive_percent,
                                                                                                            total_negative, negative_percent,
                                                                                                            total_possible_false_negative, 
                                                                                                            possible_false_negative_percent_of_negatives,
                                                                                                            possible_false_negative_percent_of_all_records,
                                                                                                            total_data_records))
        sys.stdout.flush()

        if total_records == 1 and skip_first_record == True:
            continue
            
        total_data_records += 1

        if max_records is not None and total_data_records > max_records:
            break


        # Check whether this record contains any of the positive signals. If a model
        # (or the overall ensemble) later classifies the record as negative despite a
        # positive signal, it is flagged as a possible false negative. The check is
        # model-independent, so it is performed only once per record, here.
        has_positive_signals = False
        first_positive_signal_found = None
        for pattern in positive_signal_regexes_for_false_negative_check:
            match = re.search(pattern, record)
            if match is not None:
                has_positive_signals = True
                first_positive_signal_found = match.group()
                break

        classifications = []
        for (name, classifier, vectorizer, pos_file_path, pos_file, neg_file_path, neg_file, possible_false_neg_file_path, possible_false_neg_file) in classifiers_info:

            predicted_classification, positive_probability = classify(record, name, classifier, vectorizer, config.min_required_record_length, config.positive_probability_threshold)

            is_positive = predicted_classification == 'pos' and round(positive_probability, 2) >= config.positive_probability_threshold

            if config.verbose == True:
                log('Classification by {} is {}'.format(name, predicted_classification))
                log('Probabilities: pos: {}, neg: {}'.format(positive_probability, 1 - positive_probability))
    
            if is_positive:
                classifications.append(('pos', positive_probability))
                pos_file.write(record.rstrip(os.linesep) + '\n')
            else:
                classifications.append(('neg', positive_probability)) # For consistency, we store all probability in terms of positive
                neg_file.write(record.rstrip(os.linesep) + '\n')

                if has_positive_signals:
                    # Potential false negative. 
                    possible_false_neg_file.write(record.rstrip(os.linesep) + '\n')

            prediction_summary_file.write('{}|{}|{:.2f}|{:.2f}|{}|{}|{}\n'.format(record[:40].strip(), name, positive_probability, 1-positive_probability, 'pos' if is_positive == True else 'neg', has_positive_signals, first_positive_signal_found))

            if total_data_records % 10000 == 0:
                pos_file.flush()
                neg_file.flush()
                possible_false_neg_file.flush()
                prediction_summary_file.flush()
                show_model_classification_stats(file_base_name, name, pos_file_path, neg_file_path, possible_false_neg_file_path)            

        overall_classification, overall_positive_probability = get_overall_classification(classifications)
        if overall_classification is None:
            continue

        if overall_classification == 'pos':
            total_positive +=1
        else:
            total_negative +=1
            if has_positive_signals:
                total_possible_false_negative += 1

        positive_percent = (total_positive / total_data_records) * 100
        negative_percent = (total_negative / total_data_records) * 100
        possible_false_negative_percent_of_all_records = (total_possible_false_negative / total_data_records) * 100
        possible_false_negative_percent_of_negatives = 0 if total_negative == 0 else (total_possible_false_negative / total_negative) * 100
        
        if len(classifications) > 1:
            if overall_classification == 'pos':
                overall_predicted_pos_records_file.write(record.rstrip(os.linesep) + '\n')
            else:
                overall_predicted_neg_records_file.write(record.rstrip(os.linesep) + '\n')
                if has_positive_signals:
                    # Potential false negative. 
                    overall_possible_false_neg_records_file.write(record.rstrip(os.linesep) + '\n')

            prediction_summary_file.write('{}|{}|{:.2f}|{:.2f}|{}|{}|{}\n'.format(record[:40].strip(), 'overall',  overall_positive_probability, 1-overall_positive_probability, overall_classification, has_positive_signals, first_positive_signal_found))

    log('{}=> Overall {} POS records in total {} ({:.2f}%) with a probability of {} or higher. *Possible* false negatives {}/{:.2f}% of all NEG; {:.2f}% of all records.'.format(file_base_name, total_positive, total_data_records -1, positive_percent, config.positive_probability_threshold, total_possible_false_negative, possible_false_negative_percent_of_negatives, possible_false_negative_percent_of_all_records)) 
    fin.close()

    log('Closing output files...')
    overall_predicted_pos_records_file.close()
    overall_predicted_neg_records_file.close()
    overall_possible_false_neg_records_file.close()
    prediction_summary_file.close()

    files_to_zip = []
    for (name, classifier, vectorizer, pos_file_path, pos_file_handle, neg_file_path, neg_file_handle, possible_false_neg_file_path, possible_false_neg_file_handle) in classifiers_info:
        files_to_zip.append(pos_file_path)
        if config.upload_positive_files_only == False:
            files_to_zip.append(neg_file_path)
        if config.verbose == True:
            log('Closing {}...'.format(pos_file_path))
        pos_file_handle.close()
    
        if config.verbose == True:
            log('Closing {}...'.format(neg_file_path))
        neg_file_handle.close()

        if config.verbose == True:
            log('Closing {}...'.format(possible_false_neg_file_path))
        possible_false_neg_file_handle.close()

        show_model_classification_stats(file_base_name, name, pos_file_path, neg_file_path, possible_false_neg_file_path)

    files_to_zip.append(prediction_summary_file_path)
    files_to_zip.append(overall_predicted_pos_records_file_path)
    if config.upload_positive_files_only == False:
        files_to_zip.append(overall_predicted_neg_records_file_path)

    if config.upload_output_to_remote_server == True:
        archive_name = os.path.splitext(file_base_name)[0]+'.zip'
        zip_file = sharedlib.zip_files(files_to_zip, os.path.join(out_dir, archive_name))
        log('Uploading the output files ({}) to the Remote Server...'.format(archive_name))
        sharedlib.upload_files_to_classified_dir([zip_file])

    end_time = datetime.datetime.now()
    log('classifier::classify_file() completed at {}. Total duration: {}.'.format(end_time, end_time - start_time))
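# A minimal usage sketch (the input path, regex, and model tuple below are
# hypothetical; models are (name, classifier, vectorizer) tuples loaded
# elsewhere in this project):
#
#     models = [('my_model', my_classifier, my_vectorizer)]  # hypothetical
#     classify_file('in/records_to_classify.txt', models,
#                   [re.compile(r'(?i)fracture')],
#                   skip_first_record=True, max_records=1000)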
def label(mode, potential_positive_records_file,
          potential_negative_records_file, questionable_positive_records_file,
          questionable_negative_records_file, positive_records_output_file,
          negative_records_output_file, already_processed_record_numbers_file,
          models):
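    """Interactively label randomly selected candidate records as positive,
    negative, or unknown, tracking per-model suggestion accuracy and
    periodically regenerating models as new verified records accumulate."""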
    potential_positive_records_file_basename = os.path.basename(
        potential_positive_records_file).lower()
    potential_negative_records_file_basename = os.path.basename(
        potential_negative_records_file).lower()
    questionable_positive_records_file_basename = os.path.basename(
        questionable_positive_records_file).lower()
    questionable_negative_records_file_basename = os.path.basename(
        questionable_negative_records_file).lower()

    input_file_basename_to_full_path_map = {}
    input_file_basename_to_full_path_map[
        potential_positive_records_file_basename] = potential_positive_records_file
    input_file_basename_to_full_path_map[
        potential_negative_records_file_basename] = potential_negative_records_file
    input_file_basename_to_full_path_map[
        questionable_positive_records_file_basename] = questionable_positive_records_file
    input_file_basename_to_full_path_map[
        questionable_negative_records_file_basename] = questionable_negative_records_file

    already_read_records = get_already_read_records(
        already_processed_record_numbers_file)
    if already_read_records is None or len(already_read_records) == 0:
        already_read_records = {}

    total_available_records = {}
    total_available_records[
        potential_positive_records_file_basename] = get_total_lines_count(
            potential_positive_records_file)
    total_available_records[
        potential_negative_records_file_basename] = get_total_lines_count(
            potential_negative_records_file)
    total_available_records[
        questionable_positive_records_file_basename] = get_total_lines_count(
            questionable_positive_records_file)
    total_available_records[
        questionable_negative_records_file_basename] = get_total_lines_count(
            questionable_negative_records_file)

    if not potential_positive_records_file_basename in already_read_records:
        already_read_records[potential_positive_records_file_basename] = {}

    if not potential_negative_records_file_basename in already_read_records:
        already_read_records[potential_negative_records_file_basename] = {}

    if not questionable_positive_records_file_basename in already_read_records:
        already_read_records[questionable_positive_records_file_basename] = {}

    if not questionable_negative_records_file_basename in already_read_records:
        already_read_records[questionable_negative_records_file_basename] = {}

    verified_positive_records_file_path = sharedlib.abspath(
        config.output_files['verified_positive_records_file'])
    verified_negative_records_file_path = sharedlib.abspath(
        config.output_files['verified_negative_records_file'])

    semantic_duplicates_table = remove_semantically_duplicate_lines(
        [positive_records_output_file, negative_records_output_file],
        config.duplicate_record_check_ignore_pattern,
        config.max_semantic_duplicate_records_allowed)

    total_verified_positive_records = get_total_lines_count(
        verified_positive_records_file_path) if os.path.exists(
            verified_positive_records_file_path) else 0
    total_verified_negative_records = get_total_lines_count(
        verified_negative_records_file_path) if os.path.exists(
            verified_negative_records_file_path) else 0

    total_new_records_labeled_this_session = 0
    total_new_records_labeled_using_current_models = 0
    model_accuracy_counts = {}

    verified_positive_records_file = open(verified_positive_records_file_path,
                                          'a+',
                                          encoding='utf-8',
                                          errors='ignore')
    verified_negative_records_file = open(verified_negative_records_file_path,
                                          'a+',
                                          encoding='utf-8',
                                          errors='ignore')

    new_models = None
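    # Main interactive loop: regenerate models when the auto-regen threshold is
    # reached, pick a random not-yet-labeled record from one of the candidate
    # files, show each model's suggestion, then record the human decision and
    # each model's accuracy.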
    while True:
        if config.auto_regen_models == True and (
                new_models is None
                or total_new_records_labeled_using_current_models >=
                config.models_auto_regen_records_threshold):
            logging.info(
                'Models need to be regenerated: {} record(s) have been labeled in this session since the models were last rebuilt.'
                .format(total_new_records_labeled_using_current_models))
            bulk_close_files([
                verified_positive_records_file, verified_negative_records_file
            ])
            new_models = __modeling_helper.rebuild_models(
                verified_positive_records_file_path,
                verified_negative_records_file_path,
                already_processed_record_numbers_file)
            for (name, classifier, vectorizer, score) in new_models:
                logging.info('{}-->{}'.format(name, score))

            output_files = bulk_open_files([
                verified_positive_records_file_path,
                verified_negative_records_file_path
            ], 'a+')
            verified_positive_records_file = output_files[0]
            verified_negative_records_file = output_files[1]
            if new_models is not None:
                models = new_models
                total_new_records_labeled_using_current_models = 0

        logging.info(
            '-------------------------------------------------------------------'
        )
        file_to_read_basename = mode if mode is not None else random.choice(
            [key for key in already_read_records])

        if file_to_read_basename == 'pos':
            file_to_read_basename = potential_positive_records_file_basename
        elif file_to_read_basename == 'neg':
            file_to_read_basename = potential_negative_records_file_basename
        elif file_to_read_basename == 'pos?':
            file_to_read_basename = questionable_positive_records_file_basename
        elif file_to_read_basename == 'neg?':
            file_to_read_basename = questionable_negative_records_file_basename

        logging.info(
            'So far => POS: {}, NEG: {}. Next file to look at: {}. Records remaining before models are auto-regenerated: {}'
            .format(
                total_verified_positive_records,
                total_verified_negative_records, file_to_read_basename,
                config.models_auto_regen_records_threshold -
                total_new_records_labeled_using_current_models))
        file_to_read = None
        already_read_record_numbers = already_read_records[
            file_to_read_basename]
        file_to_read = input_file_basename_to_full_path_map[
            file_to_read_basename]

        record_number_to_read = None
        line = None
        line_id = None
        while record_number_to_read is None:
            record_number_to_read = get_unique_random_record_number(
                total_available_records[file_to_read_basename],
                already_read_record_numbers)

            logging.info('Input File: {}'.format(
                os.path.basename(file_to_read)))
            logging.info('Record Number: {}'.format(record_number_to_read))
            line = get_line(file_to_read, record_number_to_read)
            line_id = line[:40]

            line_hash = hashlib.sha1(
                re.sub(config.duplicate_record_check_ignore_pattern, '',
                       line).upper().encode(errors='ignore')).hexdigest()

            if line_hash not in semantic_duplicates_table:
                semantic_duplicates_table[line_hash] = 0

            if semantic_duplicates_table[
                    line_hash] >= config.max_semantic_duplicate_records_allowed:
                logging.info(
                    'This is a semantically duplicate record. There are already {} copies in the set. Skipping...'
                    .format(semantic_duplicates_table[line_hash]))
                record_number_to_read = None

        logging.info(
            'Duplicates of this record in the verified set before this: {}'.
            format(semantic_duplicates_table[line_hash]))
        semantic_duplicates_table[line_hash] += 1

        logging.info('')
        logging.info(line)
        logging.info('')
        logging.info('SUGGESTIONS:')
        suggestions = []
        suggestions.append(get_label_from_filename(file_to_read_basename))
        logging.info('    Per candidate extractor: {}'.format(suggestions[0]))
        classification_results = []
        overall_suggestion_model_name = 'overall.decision_support'
        overall_suggestion = None
        if len(models) > 0:
            classification_results = __classification_helper.classify(
                line, models
            )  # returns tuple: (name, (predicted_classification, positive_proba))
            for (model_name, result) in classification_results:
                suggestions.append(result[0])
                accuracy = get_labeling_accuracy(
                    model_name, sharedlib.abspath(config.output_dir))
                logging.info(
                    '    Per {} (Past accuracy {:}%/{:}%/{:}%): {}'.format(
                        model_name, round(accuracy[0] * 100, 2),
                        round(accuracy[1] * 100, 2),
                        round(accuracy[2] * 100, 2), result[0].upper()))
        else:
            logging.info(
                '    No trained model available to provide a suggestion.')

        overall_suggestion_accuracy = get_labeling_accuracy(
            overall_suggestion_model_name,
            sharedlib.abspath(config.output_dir))
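        # get_likely_suggestion (defined elsewhere) condenses the individual
        # suggestions into a single overall call, presumably by majority vote.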
        overall_suggestion = get_likely_suggestion(suggestions)
        logging.info('OVERALL (Past accuracy {:}%/{:}%/{:}%): {}'.format(
            round(overall_suggestion_accuracy[0] * 100, 2),
            round(overall_suggestion_accuracy[1] * 100, 2),
            round(overall_suggestion_accuracy[2] * 100, 2),
            overall_suggestion))

        logging.info('')
        logging.info(
            '[P]ositive, [N]egative, [U]nknown, [R]ebuild Models or [Q]uit? ')
        logging.info('')

        decision = None
        while (decision != 'q' and decision != 'r' and decision != 'p'
               and decision != 'n' and decision != 'u'):
            decision = sharedlib.get_char_input()
            if not isinstance(decision, str):
                decision = bytes.decode(decision)
            decision = decision.lower()

        if decision == 'q':
            logging.info('Selected: Quit')
            break
        elif decision == 'r':
            logging.info('Selected: Rebuild models')
            bulk_close_files([
                verified_positive_records_file, verified_negative_records_file
            ])
            new_models = __modeling_helper.rebuild_models(
                verified_positive_records_file_path,
                verified_negative_records_file_path,
                already_processed_record_numbers_file)
            output_files = bulk_open_files([
                verified_positive_records_file_path,
                verified_negative_records_file_path
            ], 'a+')
            verified_positive_records_file = output_files[0]
            verified_negative_records_file = output_files[1]
            if new_models is not None:
                models = new_models
                total_new_records_labeled_using_current_models = 0
            continue
        elif decision == 'p':
            logging.info('Selected: Positive')
            verified_positive_records_file.write(line)
            total_verified_positive_records += 1
            total_new_records_labeled_using_current_models += 1
            total_new_records_labeled_this_session += 1
            if not record_number_to_read in already_read_record_numbers:
                already_read_record_numbers[record_number_to_read] = []
            already_read_record_numbers[record_number_to_read].append(
                {line_id: 'pos'})
        elif decision == 'n':
            logging.info('Selected: Negative')
            verified_negative_records_file.write(line)
            total_verified_negative_records += 1
            total_new_records_labeled_using_current_models += 1
            total_new_records_labeled_this_session += 1
            if not record_number_to_read in already_read_record_numbers:
                already_read_record_numbers[record_number_to_read] = []
            already_read_record_numbers[record_number_to_read].append(
                {line_id: 'neg'})
        else:
            if not record_number_to_read in already_read_record_numbers:
                already_read_record_numbers[record_number_to_read] = []
            already_read_record_numbers[record_number_to_read].append(
                {line_id: 'unk'})
            total_new_records_labeled_using_current_models += 1
            logging.info('Selected: Unknown')

        for (
                model_name, result
        ) in classification_results:  # result is a tuple: (predicted_classification, predicted_proba)
            is_correct = False
            if decision == 'p' and result[0].lower() == 'pos':
                is_correct = True
            elif decision == 'n' and result[0].lower() == 'neg':
                is_correct = True
            elif decision == 'u':  # If the human's decision is indeterminate, count the model's suggestion as correct regardless of what it suggested.
                is_correct = True

            save_labeling_accuracy(
                model_name,
                os.path.dirname(verified_positive_records_file_path), line_id,
                result[0], is_correct)

        if overall_suggestion is not None:
            if (decision == 'p' and overall_suggestion.lower() == 'pos') or (
                    decision == 'n' and overall_suggestion.lower() == 'neg'):
                save_labeling_accuracy(overall_suggestion_model_name,
                                       sharedlib.abspath(config.output_dir),
                                       line_id, overall_suggestion, True)
            else:
                save_labeling_accuracy(overall_suggestion_model_name,
                                       sharedlib.abspath(config.output_dir),
                                       line_id, overall_suggestion, False)

        save_already_read_records(already_processed_record_numbers_file,
                                  already_read_records)

    verified_positive_records_file.close()
    verified_negative_records_file.close()