def merge_tt_with_ati(path: str, to_filter_ati_data: bool = True) -> str:
    """
    At this stage, data from the task-tracker plugin and the activity-tracker plugin are merged.
    Code snapshots for which no activity-tracker events were found are assigned empty values.

    For more details see
    https://github.com/JetBrains-Research/codetracker-data/wiki/Data-preprocessing:-merge-activity-tracker-and-code-tracker-files
    """
    output_directory = get_output_directory(
        path, consts.MERGING_TT_AND_ATI_OUTPUT_DIRECTORY)
    user_folders = get_all_file_system_items(path, user_subdirs_condition,
                                             consts.FILE_SYSTEM_ITEM.SUBDIR)
    for user_folder in user_folders:
        log.info(f'Start handling the folder {user_folder}')
        task_folders = get_all_file_system_items(
            user_folder, item_type=consts.FILE_SYSTEM_ITEM.SUBDIR)
        for task_folder in task_folders:
            log.info(f'Start handling the folder {task_folder}')
            files = get_all_file_system_items(
                task_folder, extension_file_condition(consts.EXTENSION.CSV))
            try:
                tt_files, ati_file = __separate_ati_and_tt_files(files)
            # Skip the current folder
            except ValueError:
                continue

            ati_df = handle_ati_file(ati_file, to_filter_ati_data)
            for tt_file in tt_files:
                tt_df, language = handle_tt_file(tt_file)
                tt_df = handle_tt_and_at(tt_file, tt_df, ati_df, language)
                write_result(output_directory, path, tt_file, tt_df)

        log.info(f'Finish handling the folder {user_folder}')
    return output_directory
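
# A minimal sketch of what __separate_ati_and_tt_files might look like; the helper
# is not shown in this snippet, so the naming predicate below is an assumption.
# It splits the CSV files of one task folder into task-tracker files and a single
# activity-tracker file, raising ValueError so the caller can skip the folder.
from typing import List, Tuple


def __separate_ati_and_tt_files_sketch(files: List[str]) -> Tuple[List[str], str]:
    # Assumption: activity-tracker files carry an 'ide-events' marker in the name.
    ati_files = [f for f in files if 'ide-events' in f]
    tt_files = [f for f in files if 'ide-events' not in f]
    if len(ati_files) != 1 or not tt_files:
        raise ValueError(f'Expected exactly one activity-tracker file, got {len(ati_files)}')
    return tt_files, ati_files[0]
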
def reorganize_files_structure(path: str,
                               output_directory_suffix: str = 'separated_tasks'
                               ) -> str:
    """
    Version 3.0: reorganize the file structure.
    Before calling the function:

    -root
     --user_N1
      ---task1
       ----user_N1_files
     --user_N2
      ---task1
       ----user_N2_files

    After calling the function:

    -root
     --task1
      ---user_N1_files
      ---user_N2_files

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-reorganize-files-structure
    """
    output_directory = get_output_directory(path, output_directory_suffix)
    files = get_all_file_system_items(path, tt_file_condition)
    for file in files:
        log.info(f'Start splitting file {file}')
        dst_path = __get_dst_path(file, output_directory)
        log.info(f'Destination for the file {file} is {dst_path}')
        copy_file(file, dst_path)
    return output_directory
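
# A minimal sketch of what __get_dst_path might compute, based on the trees in the
# docstring above; the helper itself is not shown here, so this is an assumption.
# It drops the user level: root/user_N1/task1/file.csv -> output/task1/file.csv.
import os


def __get_dst_path_sketch(file: str, output_directory: str) -> str:
    task_folder = os.path.basename(os.path.dirname(file))  # e.g. 'task1'
    return os.path.join(output_directory, task_folder, os.path.basename(file))
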
def run_tests(path: str) -> str:
    log.info(f'Start running tests on path {path}')
    output_directory = get_output_directory(
        path, consts.RUNNING_TESTS_OUTPUT_DIRECTORY)

    files = get_all_file_system_items(path, ct_file_condition)
    str_len_files = str(len(files))
    log.info(f'Found {str_len_files} files to run tests on')

    files = filter_already_tested_files(files, output_directory)
    str_len_files = str(len(files))
    log.info(
        f'Found {str_len_files} files to run tests on after filtering out already tested files'
    )

    tasks = TASK.tasks()
    in_and_out_files_dict = create_in_and_out_dict(tasks)

    for i, file in enumerate(files):
        file_log_info = f'file: {i + 1}/{str_len_files}'
        log.info(f'Start running tests on {file_log_info}, {file}')
        data = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language, data = __check_tasks_on_correct_fragments(
            data, tasks, in_and_out_files_dict, file_log_info)
        log.info(f'Finish running tests on {file_log_info}, {file}')
        write_based_on_language(output_directory, file, data, language)

    return output_directory
def unify_program_experience(
        path: str,
        output_directory_prefix: str = 'unify_program_experience') -> str:
    """
    This function allows to unify program experience:

    new data contains two columns: programExperienceYears and programExperienceMonths
    this function allows to categorize them (see enum class EXPERIENCE):

    LESS_THAN_HALF_YEAR = 'LESS_THAN_HALF_YEAR'
    FROM_HALF_TO_ONE_YEAR = 'FROM_HALF_TO_ONE_YEAR'
    FROM_ONE_TO_TWO_YEARS = 'FROM_ONE_TO_TWO_YEARS'
    FROM_TWO_TO_FOUR_YEARS = 'FROM_TWO_TO_FOUR_YEARS'
    FROM_FOUR_TO_SIX_YEARS = 'FROM_FOUR_TO_SIX_YEARS'
    MORE_THAN_SIX = 'MORE_THAN_SIX'

    After executing the function, the dataset will have only EXPERIENCE column
    (EXPERIENCE_YEARS and EXPERIENCE_MONTHS will be deleted)
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix,
                      __unify_program_experience)
    return output_directory
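
# A minimal sketch of what __unify_program_experience might do; the real helper is
# not shown, so the column names and bucket boundaries below are assumptions based
# on the docstring and the EXPERIENCE values listed there.
import pandas as pd


def __unify_program_experience_sketch(df: pd.DataFrame) -> pd.DataFrame:
    def categorize(row: pd.Series) -> str:
        months = row['programExperienceYears'] * 12 + row['programExperienceMonths']
        if months < 6:
            return 'LESS_THAN_HALF_YEAR'
        if months < 12:
            return 'FROM_HALF_TO_ONE_YEAR'
        if months < 24:
            return 'FROM_ONE_TO_TWO_YEARS'
        if months < 48:
            return 'FROM_TWO_TO_FOUR_YEARS'
        if months < 72:
            return 'FROM_FOUR_TO_SIX_YEARS'
        return 'MORE_THAN_SIX'

    df['experience'] = df.apply(categorize, axis=1)
    return df.drop(columns=['programExperienceYears', 'programExperienceMonths'])
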
def add_int_experience(path: str, output_directory_prefix: str = 'int_exp') -> str:
    output_directory = get_output_directory(path, output_directory_prefix)
    files = get_all_file_system_items(path)
    for file in files:
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        df[CODE_TRACKER_COLUMN.INT_EXPERIENCE.value] = \
            df[CODE_TRACKER_COLUMN.EXPERIENCE.value].apply(convert_to_int_experience)
        write_result(output_directory, path, file, df)
    return output_directory
def run_tests(path: str) -> str:
    """
    Run tests on all code snapshots in the data for the task.
    Note: the enum class TASK (see the consts.py file) must have the task key.
    It also must match the name of the folder with test files in resources/tasks_tests.

    For example, if your task has key [my_key], you should add a new value [my_key] to the TASK const
    and add a new folder [my_key] with input and output files for tests to the resources/tasks_tests folder.

    The test result is an array containing values for all tasks from the TASK enum class.
    If the code snapshot is incorrect, the value -1 is specified.
    To deserialize this array of ratings, use the function unpack_tests_results from task_scoring.py.
    To get the rate only for the current task, use the calculate_current_task_rate function from plots/scoring_solutions_plots.py.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-find-tests-results-for-the-tasks
    """
    log.info(f'Start running tests on path {path}')
    output_directory = get_output_directory(
        path, consts.RUNNING_TESTS_OUTPUT_DIRECTORY)

    files = get_all_file_system_items(path, tt_file_condition)
    str_len_files = str(len(files))
    log.info(f'Found {str_len_files} files to run tests on')

    files = filter_already_tested_files(files, output_directory)
    str_len_files = str(len(files))
    log.info(
        f'Found {str_len_files} files to run tests on after filtering out already tested files'
    )

    tasks = TASK.tasks()
    in_and_out_files_dict = create_in_and_out_dict(tasks)

    for i, file in enumerate(files):
        file_log_info = f'file: {i + 1}/{str_len_files}'
        log.info(f'Start running tests on {file_log_info}, {file}')
        current_task = __get_task_by_ct_file(file)
        if not current_task:
            # Skip files whose tasks are not in the TASK enum class
            continue
        data = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language, data = __check_tasks_on_correct_fragments(
            data,
            tasks,
            in_and_out_files_dict,
            file_log_info,
            current_task=current_task)
        log.info(f'Finish running tests on {file_log_info}, {file}')
        output_directory_with_user_folder = os.path.join(
            output_directory, __get_user_folder_name_from_path(file))
        write_based_on_language(output_directory_with_user_folder, file, data,
                                language)

    return output_directory
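
# A hedged usage sketch of consuming the per-task results array described above.
# unpack_tests_results and calculate_current_task_rate are the project functions
# named in the docstring; the literal values and call shapes below are
# illustrative assumptions, not their confirmed signatures.
#
# tests_results = '[-1, -1, 0.5, -1, -1, -1]'  # one rate per task in TASK, -1 if incorrect
# rates = unpack_tests_results(tests_results, TASK.tasks())
# rate = calculate_current_task_rate(df)       # rate for this file's own task
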
def filter_same_fragments(
        path: str,
        output_directory_prefix: str = 'filter_same_fragments') -> str:
    """
    This function allows to delete consecutive same fragments and add column id with row number
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix, __filter_same_fragments)
    return output_directory
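
# A minimal sketch of what __filter_same_fragments might do: drop rows whose
# fragment equals the previous row's fragment and add an id column with the row
# number. The column names are assumptions based on the docstring.
import pandas as pd


def __filter_same_fragments_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Keep a row only if its fragment differs from the one right before it.
    df = df[df['fragment'] != df['fragment'].shift()].reset_index(drop=True)
    df['id'] = df.index
    return df
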
def unpack_tests_results(path: str, output_directory_prefix: str = 'unpack_tests_results') -> str:
    """
    This function allows to unpack tests results from array like [-1, -1, -1, -1, -1, -1] into score:
    -1, or a number in [0, 1]

    Also this function allows to add a new column TASK with task key
    """
    languages = get_all_file_system_items(path, language_item_condition, FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix, __unpack_tests_results)
    return output_directory
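
# A minimal sketch of what __unpack_tests_results might do, assuming the results
# column stores a serialized list with one rate per task and -1 for tasks this
# file does not solve. The column names and parameters are illustrative.
import ast

import pandas as pd


def __unpack_tests_results_sketch(df: pd.DataFrame, task_key: str, task_index: int) -> pd.DataFrame:
    def to_score(serialized: str) -> float:
        rates = ast.literal_eval(serialized)  # e.g. '[-1, -1, 0.5, -1, -1, -1]'
        return rates[task_index]              # -1 or a number in [0, 1]

    df['tests_results'] = df['tests_results'].apply(to_score)
    df['task'] = task_key
    return df
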
def handle_folder(path: str, output_directory_prefix: str,
                  handle_df: Callable) -> str:
    log.info(f'Start handling the folder {path}')
    output_directory = get_output_directory(path, output_directory_prefix)
    files = get_all_file_system_items(path,
                                      extension_file_condition(EXTENSION.CSV))
    for file in files:
        log.info(f'Start handling the file {file}')
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        df = handle_df(df)
        write_result(output_directory, path, file, df)
        log.info(f'Finish handling the file {file}')
    log.info(f'Finish handling the folder {path}')
    return output_directory
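
# A hedged usage sketch for handle_folder: any DataFrame -> DataFrame callable can
# be plugged in as handle_df. The path, prefix, lambda, and column name below are
# illustrative assumptions.
#
# handle_folder('data/python', 'drop_empty_fragments',
#               lambda df: df.dropna(subset=['fragment']))
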
def filter_incorrect_fragments(
        path: str,
        output_directory_prefix: str = 'filter_incorrect_fragments') -> str:
    """
    This function allows to filter incorrect fragments.
    The fragment is incorrect if the TESTS_RESULT column value is -1
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix,
                      __filter_incorrect_fragments)
    return output_directory
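
# A minimal sketch of what __filter_incorrect_fragments might do; the rule comes
# straight from the docstring, while the concrete column name is an assumption.
import pandas as pd


def __filter_incorrect_fragments_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # -1 in the tests-result column marks an incorrect fragment.
    return df[df['tests_results'] != -1].reset_index(drop=True)
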
def get_profile_statistics(path: str) -> str:
    output_directory = get_output_directory(path,
                                            consts.STATISTICS_OUTPUT_DIRECTORY)
    folders = get_all_file_system_items(path, user_subdirs_condition,
                                        consts.FILE_SYSTEM_ITEM.SUBDIR)
    statistics = __get_empty_statistics_dict()
    for folder in folders:
        log.info(f'Start handling the folder {folder}')
        ct_files = get_all_file_system_items(folder, tt_file_condition)
        age, experience = __get_age_and_experience_of_one_user(
            list(map(__get_age_and_experience, ct_files)))
        log.info(f'Folder: {folder}, age is {age}, experience is {experience}')
        __add_values_in_statistics_dict(statistics, age, experience)

    __write_results(output_directory, statistics)
    return output_directory
def delete_unnecessary_columns(
        path: str,
        output_directory_prefix: str = 'delete_unnecessary_columns') -> str:
    """
    This function allows to delete columns that are not needed to build the solution
    space in the coding assistant project.

    After executing the function, only the next columns will remain:
    [FRAGMENT, AGE, EXPERIENCE, TEST_RESULT, TASK]
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix,
                      __delete_unnecessary_columns)
    return output_directory
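
# A minimal sketch of what __delete_unnecessary_columns might do; the kept-column
# list mirrors the docstring, and the concrete column names are assumptions.
import pandas as pd

COLUMNS_TO_KEEP = ['fragment', 'age', 'experience', 'testsResults', 'task']


def __delete_unnecessary_columns_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Intersect with df.columns so files missing a column do not raise KeyError.
    return df[[c for c in COLUMNS_TO_KEEP if c in df.columns]]
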
def split_tasks_into_separate_files(
        path: str, output_directory_suffix: str = 'separated_tasks') -> str:
    files = get_all_file_system_items(path, ct_file_condition)
    output_directory = get_output_directory(path, output_directory_suffix)
    for file in files:
        log.info(f'Start splitting file {file}')
        ct_df = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language = get_ct_language(ct_df)
        split_df = find_splits(ct_df)
        for task in consts.TASK:
            task_dfs = find_task_dfs(split_df, task)
            for i, task_df in enumerate(task_dfs):
                if not task_df.empty:
                    # Change name to get something like pies/ati_207_test_5894859_i.csv
                    filename = task.value + '/' + get_parent_folder_name(file) + '_' + get_name_from_path(file, False) \
                               + f'_{i}' + get_extension_from_file(file).value
                    write_based_on_language(output_directory, filename,
                                            task_df, language)
    return output_directory
def preprocess_data(path: str) -> str:
    """
    We use task-tracker plugin (see https://github.com/JetBrains-Research/task-tracker-plugin)
    and activity tracker plugin (see https://plugins.jetbrains.com/plugin/8126-activity-tracker)
    to gather the source data. The data gathering consists of us collecting code snapshots and actions during
    the solving of various programming tasks by students. The data also contains information about the age,
    programming experience and so on of the student (student profile), and the current task that the student is solving.

    - At this stage, the test files that were created during the testing phase are deleted. They have ON value in the
    test mode column in the task-tracker file.
    - Also, the student could send several files with the history of solving the task, each of which can include
    the previous ones. At this stage, unnecessary files are deleted. Ultimately, there is only one file with a unique
    history of solving the current problem.
    - In addition, for each task-tracker file, a unique file of the activity tracker is sent. In this step,
    all files of the activity tracker are combined into one.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-primary-data-processing
    """
    output_directory = get_output_directory(path,
                                            consts.PREPROCESSING_DIRECTORY)
    user_folders = get_all_file_system_items(path, user_subdirs_condition,
                                             consts.FILE_SYSTEM_ITEM.SUBDIR)
    for user_folder in user_folders:
        output_user_path = os.path.join(output_directory,
                                        get_name_from_path(user_folder, False))
        log.info(f'Start handling the path {user_folder}')
        task_folders = get_all_file_system_items(
            user_folder, all_items_condition, consts.FILE_SYSTEM_ITEM.SUBDIR)
        for task_folder in task_folders:
            output_task_path = os.path.join(
                output_user_path, get_name_from_path(task_folder, False))
            log.info(f'Start handling the folder {task_folder}')
            files = get_all_file_system_items(
                task_folder, extension_file_condition(EXTENSION.CSV))
            tt_files, ati_files = __partition_into_tt_and_ati_files(files)
            if __handle_tt_files(tt_files, output_task_path) and ati_files:
                new_ati_path = os.path.join(output_task_path,
                                            get_name_from_path(ati_files[0]))
                __merge_ati_files(ati_files).to_csv(new_ati_path)
    return output_directory
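
# A minimal sketch of what __merge_ati_files might do: concatenate all
# activity-tracker CSVs of one task folder into a single DataFrame, sorted by
# timestamp. The encoding, header handling, and timestamp column are assumptions.
from typing import List

import pandas as pd


def __merge_ati_files_sketch(ati_files: List[str]) -> pd.DataFrame:
    dfs = [pd.read_csv(f, encoding='ISO-8859-1', header=None) for f in ati_files]
    merged = pd.concat(dfs, ignore_index=True)
    return merged.sort_values(by=0).reset_index(drop=True)  # column 0: timestamp
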
def add_int_experience(path: str,
                       output_directory_prefix: str = 'int_exp') -> str:
    """
    This function allows to add the int experience column to the files. It can be useful if you need to sort the data
    by the users' experience. Int experience values can be found in the const file (the INT_EXPERIENCE Enum class).

    Note: It may be necessary for files with old data format

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-add-int-experience-column
    """
    output_directory = get_output_directory(path, output_directory_prefix)
    files = get_all_file_system_items(path)
    for file in files:
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        if TASK_TRACKER_COLUMN.EXPERIENCE.value in df.columns:
            # It is the old file structure
            df[TASK_TRACKER_COLUMN.INT_EXPERIENCE.value] = \
                df[TASK_TRACKER_COLUMN.EXPERIENCE.value].apply(convert_to_int_experience)
        write_result(output_directory, path, file, df)
    return output_directory
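
# A minimal sketch of what convert_to_int_experience might do: map the ordered
# EXPERIENCE categories to integers so rows can be sorted by experience. The
# mapping values are assumptions; see the INT_EXPERIENCE enum in the consts file.
EXPERIENCE_TO_INT = {
    'LESS_THAN_HALF_YEAR': 0,
    'FROM_HALF_TO_ONE_YEAR': 1,
    'FROM_ONE_TO_TWO_YEARS': 2,
    'FROM_TWO_TO_FOUR_YEARS': 3,
    'FROM_FOUR_TO_SIX_YEARS': 4,
    'MORE_THAN_SIX': 5,
}


def convert_to_int_experience_sketch(experience: str) -> int:
    # -1 marks a missing or unrecognized experience value.
    return EXPERIENCE_TO_INT.get(experience, -1)
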
def preprocess_data(path: str) -> str:
    output_directory = get_output_directory(path, consts.PREPROCESSING_OUTPUT_DIRECTORY)
    folders = get_all_file_system_items(path, data_subdirs_condition, consts.FILE_SYSTEM_ITEM.SUBDIR)
    for folder in folders:
        log.info(f'Start handling the folder {folder}')
        files = get_all_file_system_items(folder, extension_file_condition(consts.EXTENSION.CSV))
        try:
            ct_files, ati_file = __separate_ati_and_other_files(files)
        # Drop the current folder
        except ValueError:
            continue

        ati_df = handle_ati_file(ati_file)

        for ct_file in ct_files:
            ct_df, language = handle_ct_file(ct_file)
            ct_df = handle_ct_and_at(ct_file, ct_df, ati_file, ati_df, language)

            write_result(output_directory, path, ct_file, ct_df)

        log.info(f'Finish handling the folder {folder}')
    return output_directory
def remove_inefficient_statements(
        path: str,
        output_directory_prefix: str = 'remove_inefficient_statements') -> str:
    """
    This module allows to remove inefficient statements in the files. We use pylint library to detect such kinds of
    statements. For example, in the code fragment:

    a = input()
    print

    the statement print is an inefficient statement.

    Note: available only for Python language.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-remove-inefficient-statements
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix,
                      remove_inefficient_statements_from_df)
    return output_directory
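
# A hedged sketch of how pylint could flag inefficient statements such as a bare
# `print`: run it on a temporary file with only the pointless-statement check
# (W0104) enabled and collect the reported line numbers. This is one possible
# approach, not necessarily how remove_inefficient_statements_from_df works.
import re
import subprocess
import tempfile
from typing import List


def find_pointless_statement_lines(fragment: str) -> List[int]:
    with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as tmp:
        tmp.write(fragment)
        path = tmp.name
    result = subprocess.run(
        ['pylint', '--disable=all', '--enable=pointless-statement', path],
        capture_output=True, text=True)
    # pylint text output lines look like 'file.py:2:0: W0104: ...'
    return [int(m.group(1)) for m in re.finditer(r':(\d+):\d+: W0104', result.stdout)]
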
def remove_intermediate_diffs(path: str, output_directory_prefix: str = 'remove_intermediate_diffs') -> str:
    """
    This function allows to remove intermediate diffs in the files. This means deleting all intermediate code snapshots
    that are collected during the writing of a code fragment.

    For example, if we have three consecutive snapshots:

    ...
    prin
    print
    print(5)
    ...

    we would like to delete the first two fragments because they are not final states.
    The final state is a completed line entered by the user.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-remove-intermediate-diffs
    """
    languages = get_all_file_system_items(path, language_item_condition, FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for language in languages:
        handle_folder(language, output_directory_prefix, remove_intermediate_diffs_from_df)
    return output_directory
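
# A minimal sketch of the idea behind removing intermediate diffs: drop a snapshot
# when the next snapshot merely extends it (e.g. 'prin' -> 'print' -> 'print(5)').
# The real remove_intermediate_diffs_from_df may use a different rule; the column
# name is an assumption.
import pandas as pd


def remove_intermediate_diffs_sketch(df: pd.DataFrame) -> pd.DataFrame:
    fragments = df['fragment'].fillna('').tolist()
    # Keep a row if it is the last snapshot or the next one is not a continuation.
    keep = [i == len(fragments) - 1 or not fragments[i + 1].startswith(fragments[i])
            for i in range(len(fragments))]
    return df[keep].reset_index(drop=True)
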