Example #1
def extract(config: Config) -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take several minutes...')

    compressed_filename = config.absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=config.absolute('.'))

    extracted_file = config.absolute(DATASET_FILENAME)
    extracted_file.rename(config.absolute(File.DATASET))
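
These examples all lean on a Config object whose absolute method resolves file names against a cache directory, plus a File namespace of well-known file names and a load_csv helper used by the later steps. None of them are shown here; below is a minimal sketch of what such a helper might look like (the cache_path attribute is taken from Example #6, everything else is an assumption, not the real implementation):

from pathlib import Path

import pandas as pd


class Config:
    def __init__(self, cache_path: str) -> None:
        self.cache_path = Path(cache_path)

    def absolute(self, filename: str) -> Path:
        # Resolve a dataset file name inside the cache directory.
        return (self.cache_path / filename).resolve()

    def load_csv(self, filename: str) -> pd.DataFrame:
        # Load one of the intermediate CSV files produced by the pipeline.
        return pd.read_csv(self.absolute(filename))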
Example #2
def alter(config: Config) -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take several minutes...')

    output_path = config.absolute(File.ALTERED_DATASET)

    df = config.load_csv(File.SHRUNK_DATASET)

    # Tag repositories that have no language as Markdown repositories,
    # because most GitHub repositories contain a README.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Handle language aliases
    for alias, languages in config.alias_mapping.items():
        lang = languages[0]
        mask = df['repository_language'] == alias
        df.loc[mask, 'repository_language'] = lang

    # There are too few repositories for some languages.
    # To mitigate this problem, a list of known repositories
    # is added to the dataset.
    other_df = pd.read_csv(OTHER_REPO_DATASET_PATH)
    df = pd.concat([other_df, df]).drop_duplicates('repository_name')
    df.to_csv(output_path, index=False)
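
The alias loop above rewrites every alias spelling to the first canonical language of its mapping entry. A toy illustration of that pandas pattern (the alias table here is invented purely for the example):

import pandas as pd

df = pd.DataFrame({
    'repository_name': ['a/x', 'b/y'],
    'repository_language': ['Objective-C++', 'Python'],
})
alias_mapping = {'Objective-C++': ['C++']}  # hypothetical alias table

for alias, languages in alias_mapping.items():
    mask = df['repository_language'] == alias
    df.loc[mask, 'repository_language'] = languages[0]

print(df['repository_language'].tolist())  # ['C++', 'Python']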
Example #3
def download(config: Config) -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    input_data = config.load_csv(File.PREPARED_REPOSITORIES)

    input_data.loc[:, 'repository_is_empty'] = True
    rows = (dict(row) for _, row in input_data.iterrows())
    result_rows = []
    total = len(input_data)
    for step, row in enumerate(pool_map(_clone_repository, rows, config), 1):
        result_rows.append(row)
        if step % LOG_STEP == 0:
            LOGGER.info(f'--> Processed {step} / {total} repositories...')
    LOGGER.info(f'--> Processed {total} / {total} repositories!')

    data = pd.DataFrame(result_rows)

    LOGGER.info('Removing empty repositories')
    data = data[~data['repository_is_empty']]
    LOGGER.info(f'Kept {len(data)} non-empty repositories')

    fieldnames = ['repository_language', 'repository_dirname']
    output_data = data[fieldnames]
    output_path = config.absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
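
Neither _clone_repository nor pool_map is part of the example. The call sites suggest that pool_map(worker, rows, config) runs worker(row, config) over a process pool and yields the resulting rows; a minimal sketch under that assumption:

import functools
import multiprocessing
from typing import Any, Callable, Iterable, Iterator


def pool_map(
        function: Callable[[dict, Any], dict],
        rows: Iterable[dict],
        config: Any) -> Iterator[dict]:
    # Apply `function(row, config)` to every row on a process pool,
    # yielding results in order as they become available.
    worker = functools.partial(function, config=config)
    with multiprocessing.Pool() as pool:
        yield from pool.imap(worker, rows)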
Example #4
def select(config: Config) -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)

    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in config.languages:
        filtered = shuffled[shuffled['repository_language'] == lang]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)

        LOGGER.info(
            f'{lang} repositories, found: {nb_found}, kept: {nb_selected}')

        if nb_selected < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')

        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = config.absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)
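
Here sample(frac=1) is the standard pandas idiom for shuffling every row, and reset_index(drop=True) then discards the permuted index. A tiny illustration of the same idiom (random_state is added only to make the toy run reproducible):

import pandas as pd

df = pd.DataFrame({'repository_language': ['Python', 'C', 'Go', 'Rust']})
shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(shuffled['repository_language'].tolist())  # e.g. ['Go', 'Rust', 'Python', 'C']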
Example #5
def prepare(config: Config) -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')

    input_data = config.load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_dirname'] = ''
    input_data.loc[:, 'repository_url'] = ''

    output_data = input_data.apply(_add_download_info, axis=1)
    output_path = config.absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
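
_add_download_info is applied row by row and is expected to fill the repository_dirname and repository_url columns created just before. A plausible sketch, assuming GitHub "owner/name" repository names (the directory naming and URL scheme are guesses, not the author's implementation):

import pandas as pd


def _add_download_info(row: pd.Series) -> pd.Series:
    # Derive a local directory name and a clone URL from "owner/name".
    name = row['repository_name']
    row['repository_dirname'] = name.replace('/', '__')
    row['repository_url'] = f'https://github.com/{name}.git'
    return row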
Example #6
def finalize(config: Config) -> None:
    items = config.extensions.items()
    lang_ext = OrderedDict(sorted(items, key=_lang_name))
    language_filename = config.absolute('languages.json')
    with language_filename.open('w') as output:
        json.dump(lang_ext, output, indent=2)

    LOGGER.info('Dataset successfully generated')
    LOGGER.info('To train Guesslang with this dataset:')
    LOGGER.info(f'* copy {language_filename} into guesslang/data/ directory')
    LOGGER.info(
        f'* run $ guesslang --train {config.cache_path} /path/to/new_model'
    )
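
_lang_name only serves as the sort key over the (language, extensions) pairs. A one-line sketch that sorts case-insensitively by language name (an assumption, since the helper is not shown):

def _lang_name(item: tuple) -> str:
    # Sort (language, extensions) pairs by lower-cased language name.
    return item[0].lower()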
Example #7
def shrink(config: Config) -> None:
    LOGGER.info('Shrink repositories list file')
    LOGGER.info('This operation might take several minutes...')

    input_path = config.absolute(File.DATASET)
    output_path = config.absolute(File.SHRUNK_DATASET)

    # The input dataset is too large to be fully loaded into memory
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)
        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)
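
_ignore decides which raw rows are dropped before the two kept columns are written out. Its exact rule is not shown; one hedged possibility is simply skipping rows that lack a usable repository name:

def _ignore(item: dict) -> bool:
    # Hypothetical filter: drop rows that have no repository name.
    return not item.get('Name with Owner')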
Example #8
def list_all(config: Config) -> None:
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take several minutes...')

    # Start or resume files listing
    repo = config.load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        files = config.load_csv(File.AVAILABLE_FILES)
    except IOError:
        files = pd.DataFrame([], columns=AVAILABLE_FILES_COLUMNS)

    # Find repositories that have not been processed yet
    mask = ~repo['repository_dirname'].isin(files['repository_dirname'])
    new_repo = repo[mask]
    LOGGER.info(f'{len(new_repo)} newly downloaded repositories')

    # Show the number of deleted repositories
    nb_repo_before = len(files['repository_dirname'].unique())
    mask = files['repository_dirname'].isin(repo['repository_dirname'])
    files = files[mask]
    nb_repo_after = len(files['repository_dirname'].unique())
    nb_removed = nb_repo_before - nb_repo_after
    LOGGER.info(f'{nb_removed} deleted repositories')

    # List unprocessed repositories files
    total = len(new_repo)
    rows = (dict(repo) for _, repo in new_repo.iterrows())

    output_path = config.absolute(File.AVAILABLE_FILES)
    write_headers = not output_path.exists()
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with output_path.open('a') as output:
        writer = csv.DictWriter(output, fieldnames=AVAILABLE_FILES_COLUMNS)
        if write_headers:
            writer.writeheader()

        for index, result in enumerate(pool_map(_list_files, rows, config), 1):
            for item in result:
                writer.writerow(item)

            if index % LOG_STEP == 0:
                LOGGER.info(f'--> Processed {index} / {total} repositories...')
        LOGGER.info(f'--> Processed {total} / {total} repositories!')

    LOGGER.info(f'Created file: {output_path}')
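
_list_files receives one repository row plus the config and returns a list of per-file records matching AVAILABLE_FILES_COLUMNS. Neither those columns nor the traversal are shown; a rough sketch, assuming each record is the repository fields plus a relative file path (all of this is an assumption):

from typing import Any, Dict, List


def _list_files(row: Dict[str, str], config: Any) -> List[Dict[str, str]]:
    # Hypothetical traversal: record one row per regular file in the clone.
    repo_path = config.absolute(row['repository_dirname'])
    records = []
    for path in repo_path.rglob('*'):
        if path.is_file():
            record = dict(row)
            record['filename'] = str(path.relative_to(repo_path))
            records.append(record)
    return records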
Example #9
def download(config: Config) -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')

    destination = config.absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)
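
download_file streams the large archive to disk. A minimal sketch using urllib.request (the real helper may add progress reporting or retries; only the streaming copy is shown here):

import shutil
import urllib.request
from pathlib import Path


def download_file(url: str, destination: Path) -> None:
    # Stream the response to disk to avoid loading 8GB into memory.
    with urllib.request.urlopen(url) as response, destination.open('wb') as output:
        shutil.copyfileobj(response, output)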