Example #1
def create_entity_packs(processed_folder: ProcessedFolder,
                        pack_size: int) -> List[Tuple[int, List]]:
    # TODO: add a flag to force recalculation of the packs

    if os.path.exists(processed_folder.entity_packs(pack_size)):
        # TODO: skip loading when the force-recalculation flag is set
        print("Loading packs for each entity")
        return pickle.load(open(processed_folder.entity_packs(pack_size),
                                'rb'))

    print("Creating packs for each entity")
    _, _, author_to_changes, _ = compute_occurrences(processed_folder)
    packs = []
    for author, changes in author_to_changes.items():
        np.random.shuffle(changes)
        while len(changes) % pack_size != 0:
            changes.append(np.random.choice(changes))

        for s in range(0, len(changes), pack_size):
            if s + pack_size <= len(changes):
                packs.append((author, changes[s:s + pack_size]))

    pickle.dump(packs, open(processed_folder.entity_packs(pack_size), 'wb'))
    print("Packs saved on disk")

    return packs
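
# Illustrative sketch of the pad-and-chunk step in create_entity_packs above,
# on toy data (PACK_SIZE and toy_changes are hypothetical names, not from the
# project): a per-author change list is padded with random repeats until its
# length is a multiple of the pack size, then cut into fixed-size packs.
import numpy as np

PACK_SIZE = 4
toy_changes = [10, 11, 12, 13, 14, 15]  # six changes for one toy author
np.random.shuffle(toy_changes)
while len(toy_changes) % PACK_SIZE != 0:
    toy_changes.append(np.random.choice(toy_changes))
toy_packs = [toy_changes[s:s + PACK_SIZE]
             for s in range(0, len(toy_changes), PACK_SIZE)]
print(toy_packs)  # two packs of four change ids each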
Example #2
def time_split(processed_folder: ProcessedFolder, n_time_buckets: int,
               uniform_distribution: bool) -> Dict:
    if os.path.exists(processed_folder.time_buckets_split(n_time_buckets)):
        print("Loading split into time-separated buckets")
        return pickle.load(
            open(processed_folder.time_buckets_split(n_time_buckets), 'rb'))

    print("Splitting into time-separated buckets")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorTime"],
                                  squeeze=True)
    change_metadata.sort_values(inplace=True)

    author_occurrences, change_occurrences, author_to_changes, total_count = compute_occurrences(
        processed_folder)
    change_entities = resolve_entities(processed_folder)

    if uniform_distribution:
        change_to_time_bucket, bucket_to_timestamps = uni_distribution(
            author_occurrences, change_metadata, change_occurrences,
            n_time_buckets, change_entities)
    else:
        change_to_time_bucket, bucket_to_timestamps = continuous_distribution(
            change_metadata, change_occurrences, n_time_buckets, total_count)

    bucket_to_timestamps.to_csv(
        processed_folder.time_buckets_range(n_time_buckets), index=False)

    pickle.dump(
        change_to_time_bucket,
        open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket
Example #3
def time_split(processed_folder: ProcessedFolder, n_time_buckets: int) -> Dict:
    if os.path.exists(processed_folder.time_buckets_split(n_time_buckets)):
        print("Loading split into time-separated buckets")
        return pickle.load(
            open(processed_folder.time_buckets_split(n_time_buckets), 'rb'))

    print("Splitting into time-separated buckets")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorTime"],
                                  squeeze=True)
    change_metadata.sort_values(inplace=True)

    _, change_occurrences, author_to_changes, total_count = compute_occurrences(
        processed_folder)
    bucket_size = total_count // n_time_buckets + 1
    change_to_time_bucket = {}
    cur_changes = 0
    cur_bucket = 0

    bucket_indices = list(range(1, n_time_buckets + 1))
    bucket_start_times = [None] * n_time_buckets
    bucket_finish_times = [None] * n_time_buckets

    for change_id in change_metadata.index:
        cur_changes += change_occurrences[change_id]
        change_to_time_bucket[change_id] = cur_bucket

        if bucket_start_times[cur_bucket] is None:
            bucket_start_times[cur_bucket] = change_metadata.loc[change_id]
        bucket_finish_times[cur_bucket] = change_metadata.loc[change_id]

        while cur_changes >= bucket_size:
            cur_bucket += 1
            cur_changes -= bucket_size

    bucket_to_timestamps = pd.DataFrame(
        data={
            'start_time': bucket_start_times,
            'finish_time': bucket_finish_times
        },
        index=bucket_indices)
    bucket_to_timestamps['start_date'] = bucket_to_timestamps[
        'start_time'].map(datetime.fromtimestamp)
    bucket_to_timestamps['finish_date'] = bucket_to_timestamps[
        'finish_time'].map(datetime.fromtimestamp)
    bucket_to_timestamps.to_csv(
        processed_folder.time_buckets_range(n_time_buckets))

    pickle.dump(
        change_to_time_bucket,
        open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket
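
# Minimal sketch of the cumulative bucketing in time_split above, on toy data
# (toy_occurrences and toy_order are hypothetical): changes are walked in
# timestamp order and assigned to the current bucket until that bucket's
# occurrence quota is filled.
toy_occurrences = {"c1": 3, "c2": 1, "c3": 2, "c4": 2}  # change id -> count
toy_order = ["c1", "c2", "c3", "c4"]                    # sorted by authorTime
toy_n_buckets = 2
toy_bucket_size = sum(toy_occurrences.values()) // toy_n_buckets + 1  # 5
toy_bucket_of, cur_changes, cur_bucket = {}, 0, 0
for change_id in toy_order:
    cur_changes += toy_occurrences[change_id]
    toy_bucket_of[change_id] = cur_bucket
    while cur_changes >= toy_bucket_size:
        cur_bucket += 1
        cur_changes -= toy_bucket_size
print(toy_bucket_of)  # {'c1': 0, 'c2': 0, 'c3': 0, 'c4': 1}
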
def run_preprocessing(n_time_buckets: int, min_context_train: float,
                      max_context_train: float, min_count: int,
                      max_count: int, interactive: bool,
                      random_seed: int = 239, projects_file: str = None,
                      project_folder: str = None):
    fix_seed(random_seed)
    if project_folder is not None:
        process_folder(ProcessedFolder(project_folder), n_time_buckets,
                       min_context_train, max_context_train, min_count, max_count, interactive)
    elif projects_file is not None:
        projects = [l.strip() for l in open(projects_file, "r").readlines()]
        for p in projects:
            process_folder(ProcessedFolder("../gitminer/out/" + p + "/"), n_time_buckets,
                           min_context_train, max_context_train, min_count, max_count, interactive)
    else:
        raise ValueError("One of projects folder or projects file should be set")
Example #5
def get_trained_model(processed_folder: ProcessedFolder, pack_size: int,
                      embedding_size: int, min_samples: int, n_run: int,
                      total_runs: int,
                      mask_tokens: bool) -> Tuple[Model, List]:

    print("Gathering model configuration")
    author_occurrences, _, _, _ = compute_occurrences(processed_folder)
    filtered_authors = []
    for author, count in author_occurrences.most_common():
        if count >= min_samples:
            filtered_authors.append(author)
    print("{} authors have at least {} samples".format(len(filtered_authors),
                                                       min_samples))

    n_tokens = processed_folder.n_tokens()
    n_paths = processed_folder.n_paths()
    print("Found {} tokens and {} paths".format(n_tokens, n_paths))

    load_path = os.path.join(
        processed_folder.trained_model_folder(pack_size, min_samples,
                                              mask_tokens), "model")

    config = Config.get_representation_config(
        dataset_folder=processed_folder.folder,
        load_path=load_path,
        changes_path=processed_folder.file_changes,
        n_tokens=n_tokens,
        n_paths=n_paths,
        n_entities=max(filtered_authors),
        embedding_size=embedding_size,
        pack_size=pack_size,
        n_run=n_run,
        total_runs=total_runs)

    code2vec_model = Model(config)
    if config.LOAD_PATH == '':
        print("Did not find a pretrained model")
        packs = create_entity_packs(processed_folder, pack_size)
        packs = [pack for pack in packs if pack[0] in filtered_authors]
        code2vec_model.train(packs, mask_tokens)
        print("Completed training")

    return code2vec_model, filtered_authors
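
# Toy illustration of the min_samples filter above: author_occurrences behaves
# like a collections.Counter (it exposes most_common), so with hypothetical
# counts the filter keeps only sufficiently frequent authors.
from collections import Counter

toy_occurrences = Counter({7: 120, 3: 45, 9: 12})  # author id -> sample count
toy_min_samples = 40
toy_filtered = [a for a, c in toy_occurrences.most_common()
                if c >= toy_min_samples]
print(toy_filtered)  # [7, 3]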
Example #6
def get_representations(processed_folder: ProcessedFolder, pack_size: int,
                        embedding_size: int, min_samples: int,
                        n_time_buckets: int, n_run: int, total_runs: int,
                        mask_tokens: bool):
    if os.path.exists(
            processed_folder.vectorization_file(pack_size, min_samples)):
        print("Loading previously computed representations")
        return pd.read_csv(processed_folder.vectorization_file(
            pack_size, min_samples),
                           index_col=0)

    code2vec_model, filtered_authors = get_trained_model(
        processed_folder, pack_size, embedding_size, min_samples, n_run,
        total_runs, mask_tokens)
    change_authors = resolve_entities(processed_folder)
    change_to_time_bucket = time_split(processed_folder, n_time_buckets)
    print("Computing representations")
    code2vec_model.programmer_representation(
        processed_folder.vectorization_file(pack_size, min_samples),
        change_authors, change_to_time_bucket, filtered_authors, mask_tokens)
    print("Representations saved on disk")
Example #7
    def dump(self, processed_folder: ProcessedFolder):
        pickle.dump(self.entity_dict, open(processed_folder.entity_dict, 'wb'))
        pickle.dump(self.reverse_dict,
                    open(processed_folder.reversed_entity_dict, 'wb'))


def merge_aliases_naive(processed_folder: ProcessedFolder) -> dict:
    if os.path.exists(processed_folder.entity_dict):
        print("Loading merged entities")
        return pickle.load(open(processed_folder.entity_dict, 'rb'))

    print("Naively merging entities...")
    naive_merger = NaiveEntityMerger()
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    for index, row in change_metadata.iterrows():
        naive_merger.add_entity(row["authorName"], row["authorEmail"])
    naive_merger.dump(processed_folder)
    print("Merged entities saved on disk")

    return naive_merger.entity_dict


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    merge_aliases_naive(ProcessedFolder(args.data_folder))
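
# NaiveEntityMerger is defined elsewhere in the project. As a rough,
# hypothetical illustration of alias merging in general (not necessarily the
# exact rule NaiveEntityMerger implements), a union-find over shared author
# names and emails could look like this:
def _toy_merge(rows):
    parent = {}

    def find(x):
        while parent.setdefault(x, x) != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a, b):
        parent[find(a)] = find(b)

    for name, email in rows:
        union(("name", name), ("email", email))
    return {(name, email): find(("name", name)) for name, email in rows}


print(_toy_merge([("Alice", "a@x.com"), ("A. Liddell", "a@x.com"),
                  ("Alice", "alice@y.org")]))
# all three aliases collapse to a single representative entity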
Example #8

parser = ArgumentParser()

parser.add_argument("--pack_size", type=int, required=True)
parser.add_argument("--embedding_size", type=int, required=True)
parser.add_argument("--min_samples", type=int, default=0)
parser.add_argument("--n_time_buckets", type=int, required=True)
parser.add_argument("--n_runs", type=int, required=True)
parser.add_argument("--init_run_number", type=int, default=1)
parser.add_argument("--mask_tokens", action="store_true")

args = parser.parse_args()

projects = [
    l.strip() for l in open("../../pythonminer/projects.txt", "r").readlines()
]
for p in projects:
    project_folder = ProcessedFolder("../../pythonminer/out/" + p + "/",
                                     args.init_run_number)
    merge_aliases_bipartite(project_folder)
    for n_run in range(args.init_run_number,
                       args.init_run_number + args.n_runs):
        project_folder.set_run_number(n_run)
        tf.reset_default_graph()
        fix_seed(n_run)
        get_representations(project_folder, args.pack_size,
                            args.embedding_size, args.min_samples,
                            args.n_time_buckets, n_run, args.n_runs,
                            args.mask_tokens)
Example #9
            processed_folder.vectorization_file(pack_size, min_samples)):
        print("Loading previously computed representations")
        return pd.read_csv(processed_folder.vectorization_file(
            pack_size, min_samples),
                           index_col=0)

    code2vec_model, filtered_authors = get_trained_model(
        processed_folder, pack_size, embedding_size, min_samples, n_run,
        total_runs, mask_tokens)
    change_authors = resolve_entities(processed_folder)
    change_to_time_bucket = time_split(processed_folder, n_time_buckets)
    print("Computing representations")
    code2vec_model.programmer_representation(
        processed_folder.vectorization_file(pack_size, min_samples),
        change_authors, change_to_time_bucket, filtered_authors, mask_tokens)
    print("Representations saved on disk")


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    parser.add_argument("--embedding_size", type=int, required=True)
    parser.add_argument("--min_samples", type=int, default=0)
    parser.add_argument("--n_time_buckets", type=int, required=True)
    parser.add_argument("--mask_tokens", type=bool, default=False)
    args = parser.parse_args()
    get_representations(ProcessedFolder(args.data_folder), args.pack_size,
                        args.embedding_size, args.min_samples,
                        args.n_time_buckets, 0, 0, args.mask_tokens)
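
# Note on the --mask_tokens flag above: argparse's type=bool is a common
# pitfall, because bool("False") is True for any non-empty string, so the flag
# is declared with action="store_true", consistent with the other scripts.
from argparse import ArgumentParser

_toy_parser = ArgumentParser()
_toy_parser.add_argument("--mask_tokens", action="store_true")
print(_toy_parser.parse_args([]).mask_tokens)                 # False
print(_toy_parser.parse_args(["--mask_tokens"]).mask_tokens)  # True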
Example #10
            for ent, maps in self.reverse_dict.items():
                fout.write("{},{},{}\n".format(ent, "|".join(maps["names"]),
                                               "|".join(maps["emails"])))


def merge_aliases_bipartite(processed_folder: ProcessedFolder) -> dict:
    if os.path.exists(processed_folder.entity_dict):
        print("Loading merged entities")
        return pickle.load(open(processed_folder.entity_dict, 'rb'))

    print("Merging entities by bipartite strategy...")
    bipartite_merger = BipartiteEntityMerger()
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    for index, row in change_metadata.iterrows():
        bipartite_merger.add_entity(row["authorName"], row["authorEmail"])

    bipartite_merger.run_matching()
    bipartite_merger.dump(processed_folder)
    print("Merged entities saved on disk")

    return bipartite_merger.entity_dict


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    merge_aliases_bipartite(ProcessedFolder(args.data_folder))
Example #11
            # Keep only changes with no path contexts before and at least one after
            if row['pathsCountBefore'] > 0 or row['pathsCountAfter'] == 0:
                continue

            author = resolved_entities.loc[row['changeId']]
            author_occurrences[author] += 1
            change_occurrences[row['changeId']] += 1

            if author not in author_to_changes:
                author_to_changes[author] = []
            author_to_changes[author].append(total_count)
            total_count += 1

    for i, (author, count) in enumerate(author_occurrences.most_common()):
        print(f"#{i + 1} entity: {author} -> {count}")

    pickle.dump(author_occurrences,
                open(processed_folder.author_occurrences, 'wb'))
    pickle.dump(change_occurrences,
                open(processed_folder.change_occurrences, 'wb'))
    pickle.dump(author_to_changes,
                open(processed_folder.author_to_changes, 'wb'))
    print("Occurrences saved on disk")
    return author_occurrences, change_occurrences, author_to_changes, total_count


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    compute_occurrences(ProcessedFolder(args.data_folder))
Example #12
                    lambda path: change_entities[int(
                        pathlib.Path(path).name.split('_')[0])], files))),
        author_occurrences)
    print("Computing mutual info")
    print(feature_values.shape)

    with Parallel(n_jobs=-1) as pool:
        part_size = 1000
        m = dataset.feature_values.shape[1]
        mutual_info_parts = pool(
            delayed(mutual_info_classif)(
                dataset.feature_values[:, i:i + part_size],
                dataset.authors,
                random_state=0) for i in tqdm(range(0, m, part_size)))
    mutual_info = np.concatenate(mutual_info_parts)
    mutual_info /= np.max(mutual_info)

    pickle.dump(dataset, open(processed_folder.caliskan_dataset, 'wb'))
    pickle.dump(mutual_info, open(processed_folder.caliskan_mutual_info, 'wb'))
    print("Extracted data dumped on disk")

    return dataset, mutual_info


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    compute_caliskan_features(ProcessedFolder(args.data_folder))
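
# Self-contained sketch of the chunked mutual-information pattern above, on
# random toy data (array sizes and the part size are arbitrary): score feature
# columns in blocks in parallel, then concatenate and normalise the scores.
import numpy as np
from joblib import Parallel, delayed
from sklearn.feature_selection import mutual_info_classif

toy_rng = np.random.RandomState(0)
toy_X = toy_rng.rand(200, 50)              # 200 samples, 50 toy features
toy_y = toy_rng.randint(0, 3, size=200)    # 3 toy author labels

toy_part_size = 10
toy_parts = Parallel(n_jobs=-1)(
    delayed(mutual_info_classif)(toy_X[:, i:i + toy_part_size], toy_y,
                                 random_state=0)
    for i in range(0, toy_X.shape[1], toy_part_size))
toy_mi = np.concatenate(toy_parts)
toy_mi /= np.max(toy_mi)
print(toy_mi.shape)  # (50,)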
Example #13
def main(args):
    # if os.path.isfile(output_filename(args.config_file)):
    #     print("Already processed")
    #     exit(0)

    config = Config.fromyaml(args.config_file)

    if config.mode() == 'snapshot':
        project_folder = ProcessedSnapshotFolder(config.source_folder())
        change_entities = None
        author_occurrences = None
    else:
        project_folder = ProcessedFolder(config.source_folder())
        change_entities = resolve_entities(project_folder)
        author_occurrences, _, _, _ = compute_occurrences(project_folder)

    if config.mode() == 'time':
        change_to_time_bucket = time_split(project_folder,
                                           config.time_folds(),
                                           uniform_distribution=True)
    else:
        change_to_time_bucket = None

    if config.mode() == 'context':
        context_splits = context_split(project_folder, *config.min_max_count(),
                                       *config.min_max_train())
    else:
        context_splits = None

    if config.classifier_type() == 'nn':
        classifier = NNClassifier(config, project_folder,
                                  change_entities, change_to_time_bucket,
                                  config.min_max_count(), author_occurrences,
                                  context_splits)
    elif config.classifier_type() == 'rf':
        classifier = RFClassifier(config, project_folder,
                                  change_entities, change_to_time_bucket,
                                  config.min_max_count(), author_occurrences,
                                  context_splits)
    elif config.classifier_type() == 'caliskan':
        classifier = CaliskanClassifier(config, project_folder,
                                        change_entities, change_to_time_bucket,
                                        config.min_max_count(), context_splits)
    else:
        raise ValueError('Classifier type should be set in config')

    if config.mode() == 'time':
        fold_indices = [(i, j) for i in range(config.time_folds())
                        for j in range(i + 1, config.time_folds())]
    elif config.mode() == 'context':
        fold_indices = list(range(len(context_splits)))
    else:
        fold_indices = classifier.cross_validation_folds()

    mean, std, scores = classifier.run(fold_indices)
    print(f'{mean:.3f}+-{std:.3f}')
    for i, score in enumerate(scores):
        if isinstance(score, ClassificationResult):
            scores[i] = ClassificationResult(float(score.accuracy),
                                             float(score.macro_precision),
                                             float(score.macro_recall),
                                             score.fold_ind)

    yaml.dump(
        {
            'mean': mean,
            'std': std,
            'scores': scores
        },
        output_file(args.config_file),
        default_flow_style=False)
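
# Quick check of the 'time' mode fold enumeration above: every ordered pair
# (i, j) with i < j, presumably training on the earlier bucket i and
# evaluating on the later bucket j.
toy_time_folds = 4
toy_fold_indices = [(i, j) for i in range(toy_time_folds)
                    for j in range(i + 1, toy_time_folds)]
print(toy_fold_indices)
# [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]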
Example #14
    if not os.path.exists(processed_folder.entity_dict):
        raise ValueError(
            "You should provide a dictionary of entities for resolving aliases: {}"
            .format(processed_folder.entity_dict))

    print("Resolving entities for individual changes")
    entity_resolver = EntityResolver(processed_folder.entity_dict)
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    change_entities = change_metadata.apply(
        lambda row: entity_resolver.get_entity(row["authorName"],
                                               row["authorEmail"]),
        axis=1)
    change_entities.to_csv(processed_folder.resolved_entities, header=True)
    print("Resolved entities saved on disk")

    print("{} unknown aliases in EntityResolver".format(
        entity_resolver.unknown_count))
    dump_unknowns(entity_resolver.unknowns, processed_folder.unknown_entities)

    return change_entities


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    resolve_entities(ProcessedFolder(args.data_folder))
Example #15
        # TODO: skip loading when the force-recalculation flag is set
        print("Loading packs for each entity")
        return pickle.load(open(processed_folder.entity_packs(pack_size),
                                'rb'))

    print("Creating packs for each entity")
    _, _, author_to_changes, _ = compute_occurrences(processed_folder)
    packs = []
    for author, changes in author_to_changes.items():
        np.random.shuffle(changes)
        while len(changes) % pack_size != 0:
            changes.append(np.random.choice(changes))

        for s in range(0, len(changes), pack_size):
            if s + pack_size <= len(changes):
                packs.append((author, changes[s:s + pack_size]))

    pickle.dump(packs, open(processed_folder.entity_packs(pack_size), 'wb'))
    print("Packs saved on disk")

    return packs


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    args = parser.parse_args()
    print(
        create_entity_packs(ProcessedFolder(args.data_folder), args.pack_size))
Example #16
        while cur_changes >= bucket_size:
            cur_bucket += 1
            cur_changes -= bucket_size

    bucket_to_timestamps = pd.DataFrame(
        data={
            'start_time': bucket_start_times,
            'finish_time': bucket_finish_times
        },
        index=bucket_indices)
    bucket_to_timestamps['start_date'] = bucket_to_timestamps[
        'start_time'].map(datetime.fromtimestamp)
    bucket_to_timestamps['finish_date'] = bucket_to_timestamps[
        'finish_time'].map(datetime.fromtimestamp)
    bucket_to_timestamps.to_csv(
        processed_folder.time_buckets_range(n_time_buckets))

    pickle.dump(
        change_to_time_bucket,
        open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--n_time_buckets", type=int, required=True)
    args = parser.parse_args()
    print(time_split(ProcessedFolder(args.data_folder), args.n_time_buckets))
Example #17
def context_split(processed_folder: ProcessedFolder,
                  min_count: int = 100,
                  max_count: int = 10**9,
                  min_train: float = 0.7,
                  max_train: float = 0.8) -> List[ContextSplit]:
    author_occurrences, change_occurrences, author_to_changes, total_count = compute_occurrences(
        processed_folder)
    change_entities = resolve_entities(processed_folder)

    if os.path.exists(processed_folder.context_split(min_train, max_train)):
        print("Loading context-split data")
        resulting_split = pickle.load(
            open(processed_folder.context_split(min_train, max_train), 'rb'))
        _filter_authors(resulting_split, min_count, max_count,
                        author_occurrences, change_occurrences,
                        change_entities)
        return resulting_split

    print("Splitting changes by context")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "newPath"],
                                  squeeze=True)

    project_root = _build_tree(
        change_metadata, change_entities, change_occurrences,
        lambda change_id: change_occurrences[change_id] > 0)

    depth = _max_depth(project_root)

    nodes_at_depth = [[] for _ in range(depth + 1)]
    _get_all_nodes_at_depth(project_root, nodes_at_depth)
    print(f"Depth: {depth}")

    min_depth, max_depth = _detect_min_max_depth(project_root, nodes_at_depth,
                                                 max_train)

    print(f'Trying to find splits for depth from {min_depth} to {max_depth}')

    authors = set(author_occurrences.keys())

    resulting_split = [
        ContextSplit(d, {}) for d in range(min_depth, max_depth + 1)
    ]
    success_size = 0
    author_success = 0
    with Parallel(n_jobs=-1) as pool:
        split_result = pool(
            delayed(_find_split)(author,
                                 change_entities,
                                 min_depth,
                                 max_depth,
                                 min_train,
                                 max_train,
                                 nodes_at_depth,
                                 iters=10) for author in tqdm(authors))

    for author_split, success, size in split_result:
        if success:
            success_size += size
            author_success += 1
            _merge_splits(resulting_split, author_split)

    print(
        f"Kept {success_size / project_root.count * 100:.2f}% of changes by {author_success}/{len(authors)} authors"
    )
    pickle.dump(
        resulting_split,
        open(processed_folder.context_split(min_train, max_train), 'wb'))
    print("Buckets saved on disk")
    _filter_authors(resulting_split, min_count, max_count, author_occurrences,
                    change_occurrences, change_entities)
    return resulting_split
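
# context_split works over a tree built from the changed file paths.
# _build_tree and _get_all_nodes_at_depth are defined elsewhere; a rough,
# hypothetical toy of the "nodes grouped by depth" idea (not the project's
# implementation):
from collections import defaultdict
import pathlib

toy_paths = ["src/a/x.py", "src/a/y.py", "src/b/z.py", "docs/readme.md"]
toy_nodes_at_depth = defaultdict(set)
for toy_path in toy_paths:
    parts = pathlib.Path(toy_path).parts
    for depth in range(1, len(parts) + 1):
        toy_nodes_at_depth[depth].add(parts[:depth])
print({d: len(nodes) for d, nodes in sorted(toy_nodes_at_depth.items())})
# {1: 2, 2: 3, 3: 3}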
Example #18
                                 min_depth,
                                 max_depth,
                                 min_train,
                                 max_train,
                                 nodes_at_depth,
                                 iters=10) for author in tqdm(authors))

    for author_split, success, size in split_result:
        if success:
            success_size += size
            author_success += 1
            _merge_splits(resulting_split, author_split)

    print(
        f"Kept {success_size / project_root.count * 100:.2f}% of changes by {author_success}/{len(authors)} authors"
    )
    pickle.dump(
        resulting_split,
        open(processed_folder.context_split(min_train, max_train), 'wb'))
    print("Buckets saved on disk")
    _filter_authors(resulting_split, min_count, max_count, author_occurrences,
                    change_occurrences, change_entities)
    return resulting_split


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    print(context_split(ProcessedFolder(args.data_folder)))
Example #19
        n_paths=n_paths,
        n_entities=max(filtered_authors),
        embedding_size=embedding_size,
        pack_size=pack_size,
        n_run=n_run,
        total_runs=total_runs)

    code2vec_model = Model(config)
    if config.LOAD_PATH == '':
        print("Did not find a pretrained model")
        packs = create_entity_packs(processed_folder, pack_size)
        packs = [pack for pack in packs if pack[0] in filtered_authors]
        code2vec_model.train(packs, mask_tokens)
        print("Completed training")

    return code2vec_model, filtered_authors


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    parser.add_argument("--embedding_size", type=int, required=True)
    parser.add_argument("--min_samples", type=int, default=0)
    parser.add_argument("--mask_tokens", type=bool, default=False)
    args = parser.parse_args()

    get_trained_model(ProcessedFolder(args.data_folder), args.pack_size,
                      args.embedding_size, args.min_samples, 0, 0,
                      args.mask_tokens)