Example #1
from collections import Counter
from typing import Optional


def calc_cross_occurences(merges_list,
                          path_to_save: Optional[str] = None) -> Counter:
    # Count how many of the given merge lists each merge occurs in.
    occ_accross_lists = count_occurences_across_lists(merges_list)
    if path_to_save:
        sorted_occ = sorted(occ_accross_lists.items(), key=lambda t: t[1], reverse=True)
        dump_list([f'{merge} {count}' for merge, count in sorted_occ], path_to_save)
    return occ_accross_lists
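A hedged usage sketch (the sample merge lists and the exact behaviour of count_occurences_across_lists are assumptions, not part of the original code): each inner list holds the merges learned on one data fraction, and the returned Counter maps a merge to the number of fractions it appears in.

# Hypothetical input: merges learned on two different fractions of the data.
merges_fraction_a = ['e r', 't h', 'i n']
merges_fraction_b = ['e r', 't h', 'o n']

# Assumed result shape: Counter({'e r': 2, 't h': 2, 'i n': 1, 'o n': 1}),
# also dumped to the given file as "merge count" lines sorted by count.
occ = calc_cross_occurences([merges_fraction_a, merges_fraction_b],
                            path_to_save='cross_occurences.txt')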
Example #2
from collections import Counter, defaultdict
from typing import Dict, Optional


def calc_cross_occurences_summary(
        occ_accross_lists: Counter,
        path_to_save: Optional[str] = None) -> Dict[int, float]:
    # Group merges by how many lists they occur in, weighting each merge by that count.
    summary = defaultdict(int)
    for k, v in occ_accross_lists.items():
        summary[v] += v

    total_merges_across_fractions = sum(summary.values())

    summary_for_1_list = {}
    for k, v in summary.items():
        summary_for_1_list[k] = float(v) / total_merges_across_fractions

    if path_to_save:
        sorted_summary = sorted(summary_for_1_list.items(), key=lambda t: t[0], reverse=True)
        dump_list([f'{count} {fraction}' for count, fraction in sorted_summary], path_to_save)
    return summary_for_1_list
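A minimal sketch of how the summary behaves, assuming the Counter from the previous example (the concrete numbers are illustrative only): each occurrence count v is weighted by itself, so the result gives the normalized share of merge "mass" that appeared in exactly v lists.

from collections import Counter

occ = Counter({'e r': 2, 't h': 2, 'i n': 1, 'o n': 1})
summary = calc_cross_occurences_summary(occ)
# summary[2] = 2 + 2 = 4 and summary[1] = 1 + 1 = 2, so after normalization
# the result is {2: 4 / 6, 1: 2 / 6}.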
Example #3
import os


def run(dataset, repr, threshold):
    # Find projects whose share of files containing logging is below the threshold
    # and write the resulting ignore list to a file.
    PrepConfig.assert_classification_config(repr)

    path_to_classification = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset,
                                          CLASSIFICATION_DIR)
    dest_dir = os.path.join(path_to_classification, CLASSIFICATION_TYPE, repr)

    logger.info(f"Getting stats for {dest_dir}")
    logger.info(
        f"Ignoring projects where the percentage of files that contain logging is less than {threshold} %"
    )
    projects_to_ignore, logged_stats = calc_stats(dest_dir, threshold)
    for i, p in enumerate(projects_to_ignore):
        logger.info(f"{i}: {p}")
    logger.info("")
    logger.info(logged_stats)
    output_file_path = os.path.join(
        path_to_classification, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(projects_to_ignore, output_file_path)
    logger.info(
        f"Ignored projects with threshold {threshold} % were written to {output_file_path}"
    )
    logger.info(f"Total ignored projects: {len(projects_to_ignore)}")
Example #4
import collections
import os


def run(dataset: str, repr: str, n_merges: int, reset: bool, percent: float, start_from: float) -> None:
    # Learn (or continue learning) BPE merges for the given fraction of the dataset.
    bpe_dir_prefix = fractions_manager.get_percent_prefix(percent, start_from)
    bpe_dir_prefix = '' if bpe_dir_prefix == '100_' else bpe_dir_prefix

    base_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset, METADATA_DIR, repr)
    if reset:
        starting_from_scratch = True
        archive_existing_common_bpe_folder(base_dir)
    else:
        logger.info("Using existing merges...")
        most_recent_bpe_dir = get_most_recent_bpe_dir(base_dir, bpe_dir_prefix)
        if not most_recent_bpe_dir:
            logger.warning("Existing merges not found")
            starting_from_scratch = True
        else:
            all_vocab = read_dict_from_2_columns(
                os.path.join(most_recent_bpe_dir, REASSEMBLED_VOCAB_FILE_NAME))
            vocab, non_splitable_vocab = separate_non_splittable_vocab(all_vocab, from_reassambled=True)
            merges = read_list(os.path.join(most_recent_bpe_dir, MERGES_FILE_NAME))
            starting_from_scratch = False

    if starting_from_scratch:
        logger.info("Starting the encoding from scratch...")
        all_vocab = read_dict_from_2_columns(os.path.join(base_dir, f'{bpe_dir_prefix}{VOCAB_FILE_NAME}'))
        vocab, non_splitable_vocab = separate_non_splittable_vocab(all_vocab, from_reassambled=False)
        merges = []

    # Repeatedly pop the most frequent pair and merge it throughout the vocabulary.
    pairs = get_stats(vocab)
    n_done_merges = len(merges)
    for i in range(n_merges):
        try:
            best, occurences = pairs.pop_pair()
            print(f'Processing pair number {n_done_merges + i+1} {best}')
            merges.append((best[0], best[1], str(occurences)))
        except KeyError:
            break
        vocab = merge_vocab(best, vocab, pairs)

    # Put the non-splittable entries back and rebuild the subword-level vocabulary.
    for k, v in non_splitable_vocab.items():
        vocab[k] = v
    resulting_vocab = collections.defaultdict(int)
    for entry, frequency in vocab.items():
        for subword in entry.split(" "):
            resulting_vocab[subword] += frequency
    resulting_vocab_sorted = sorted(resulting_vocab.items(), key=lambda x: x[1], reverse=True)

    # Cache the final split of every token so it can be looked up without re-running the merges.
    merges_cache = {}
    for entry, frequency in vocab.items():
        subword_list = entry.split(' ')
        key = ''.join(subword_list)
        merges_cache[key] = subword_list

    new_bpe_dir = os.path.join(base_dir, f'{bpe_dir_prefix}{BPE_DIR}', str(len(merges)))
    if os.path.exists(new_bpe_dir):
        raise AssertionError(f'Dir {new_bpe_dir} already exists? Something went wrong. '
                             f'Check the contents of {os.path.join(base_dir, BPE_DIR)} folder')
    os.makedirs(new_bpe_dir)

    dump_list(merges, os.path.join(new_bpe_dir, MERGES_FILE_NAME))
    dump_dict_into_2_columns(vocab, os.path.join(new_bpe_dir, REASSEMBLED_VOCAB_FILE_NAME))
    dump_dict_into_2_columns(merges_cache, os.path.join(new_bpe_dir, MERGES_CACHE_FILE_NAME), val_type=list)
    dump_dict_into_2_columns(resulting_vocab_sorted, os.path.join(new_bpe_dir, RESULTING_VOCAB_FILE_NAME))
    logger.info(f'BPE output files are saved into the {new_bpe_dir} folder')
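For context, a simplified reference sketch of what get_stats and merge_vocab conventionally do in byte-pair encoding; the project's own versions evidently differ (get_stats above returns a structure with pop_pair(), and merge_vocab also updates the pair statistics incrementally), so this is only an assumed baseline over a vocabulary of space-separated subword strings mapped to frequencies.

import collections
import re


def get_stats_reference(vocab):
    """Count the frequency of every adjacent subword pair in the vocabulary."""
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for left, right in zip(symbols, symbols[1:]):
            pairs[(left, right)] += freq
    return pairs


def merge_vocab_reference(pair, vocab):
    """Concatenate every standalone occurrence of the given pair in the vocabulary keys."""
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}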