def merge_tt_with_ati(path: str, to_filter_ati_data: bool = True) -> str:
    """
    At this stage, merging data from the task-tracker plugin and activity tracker plugin takes place.
    Code snapshots that did not find activity tracker events are assigned empty values.

    For more details see
    https://github.com/JetBrains-Research/codetracker-data/wiki/Data-preprocessing:-merge-activity-tracker-and-code-tracker-files
    """
    output_directory = get_output_directory(
        path, consts.MERGING_TT_AND_ATI_OUTPUT_DIRECTORY)
    user_folders = get_all_file_system_items(path, user_subdirs_condition,
                                             consts.FILE_SYSTEM_ITEM.SUBDIR)
    for user_folder in user_folders:
        log.info(f'Start handling the folder {user_folder}')
        task_folders = get_all_file_system_items(
            user_folder, item_type=consts.FILE_SYSTEM_ITEM.SUBDIR)
        for task_folder in task_folders:
            log.info(f'Start handling the folder {task_folder}')
            files = get_all_file_system_items(
                task_folder, extension_file_condition(consts.EXTENSION.CSV))
            try:
                tt_files, ati_file = __separate_ati_and_tt_files(files)
            except ValueError:
                # Skip the current folder
                continue

            ati_df = handle_ati_file(ati_file, to_filter_ati_data)
            for tt_file in tt_files:
                tt_df, language = handle_tt_file(tt_file)
                tt_df = handle_tt_and_at(tt_file, tt_df, ati_df, language)
                write_result(output_directory, path, tt_file, tt_df)

        log.info(f'Finish handling the folder {user_folder}')
    return output_directory
Example #2
def are_graph_folder_structures_equal(old_graph_folder: str,
                                      new_graph_folder: str) -> bool:
    old_items = get_all_file_system_items(old_graph_folder)
    old_items.sort()
    new_items = get_all_file_system_items(new_graph_folder)
    new_items.sort()
    return old_items == new_items
Example #3
def __get_sources_and_goals(task: TASK, test_type: TEST_TYPE = TEST_TYPE.DIFF) -> Tuple[List[str], List[str]]:
    root = os.path.join(BASE_DATA_PATH, test_type.value, task.value)
    sources_paths = get_all_file_system_items(root, match_condition(r'source_\d+.py'))
    sources_paths.sort()
    goals_paths = get_all_file_system_items(root, match_condition(r'goal_\d+.py'))
    goals_paths.sort()

    sources = [get_content_from_file(f) for f in sources_paths]
    goals = [get_content_from_file(f) for f in goals_paths]
    return sources, goals
def create_in_and_out_dict(tasks: List[TASK]) -> FilesDict:
    in_and_out_files_dict = {}
    for task in tasks:
        root = os.path.join(TASKS_TESTS_PATH, task.value)
        in_files = get_all_file_system_items(root,
                                             match_condition(r'in_\d+.txt'))
        out_files = get_all_file_system_items(root,
                                              match_condition(r'out_\d+.txt'))
        if len(out_files) != len(in_files):
            log_and_raise_error(
                'Length of out files list does not equal in files list', log)
        in_and_out_files_dict[task] = pair_in_and_out_files(
            in_files, out_files)
    return in_and_out_files_dict
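A minimal sketch of the pairing contract assumed above; pair_in_and_out_files_sketch is a hypothetical stand-in for the repo's pair_in_and_out_files, matching in_N.txt with out_N.txt by index:

import re
from typing import List, Tuple

def pair_in_and_out_files_sketch(in_files: List[str], out_files: List[str]) -> List[Tuple[str, str]]:
    def index_of(path: str) -> int:
        # Extract N from names like in_3.txt / out_3.txt
        return int(re.search(r'_(\d+)\.txt$', path).group(1))

    in_by_index = {index_of(f): f for f in in_files}
    out_by_index = {index_of(f): f for f in out_files}
    if in_by_index.keys() != out_by_index.keys():
        raise ValueError('The indices of the in and out files do not match')
    return [(in_by_index[i], out_by_index[i]) for i in sorted(in_by_index)]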
Example #5
def __get_chains(task: TASK) -> List[Tuple[List[str], str]]:
    root = os.path.join(BASE_DATA_PATH, task.value)
    chains = get_all_file_system_items(root, match_condition(r'chain_\d+'), item_type=FILE_SYSTEM_ITEM.SUBDIR)
    # We have to know the ids of the vertices
    chains.sort()
    res_chains = []
    for chain in chains:
        sources_paths = get_all_file_system_items(chain, match_condition(r'source_\d+.py'))
        goals = get_all_file_system_items(chain, match_condition(r'goal.py'))
        if len(goals) != 1:
            log_and_raise_error(f'The chain {chain} does not contain exactly one goal', log)
        sources_paths.sort()
        res_chains.append(([get_content_from_file(f) for f in sources_paths], get_content_from_file(goals[0])))
    return res_chains
def anonymize_cpp_code(root: str,
                       local_gorshochek_path: str,
                       output_folder_name: str = 'anonymizerResult') -> None:
    """
    We use the gorshochek library: https://github.com/JetBrains-Research/gorshochek
    You need to clone the repo and build a docker image (see the gorshochek README).

    Note: you need to change the config.yaml file before building the docker image:

    n transformations: 1
    transformations:
      - remove comments:
          p: 1.0
      - rename entities:
          p: 1
          rename functions: true
          rename variables: true
          strategy:
              name: hash
              hash prefix: d

    You can change the 'seed', 'max tokens', and 'max token len' params if you want.
    """
    cpp_path = f'{remove_slash(root)}/{LANGUAGE.CPP.value}'
    output_path = f'{get_parent_folder(root)}/{output_folder_name}/{LANGUAGE.CPP.value}'

    task_dirs = get_all_file_system_items(cpp_path,
                                          item_condition=task_item_condition,
                                          item_type=FILE_SYSTEM_ITEM.SUBDIR)
    gorshochek_anonymizer = GorshochekAnonymizer(local_gorshochek_path)
    for task_dir in task_dirs:
        task = get_name_from_path(task_dir, with_extension=False)
        print(f'Start handling the task {task}')
        files = get_all_file_system_items(
            task_dir, item_condition=extension_file_condition(EXTENSION.CSV))
        for file in files:
            print(f'Start handling the file {file}')
            df = pd.read_csv(file, encoding=ISO_ENCODING)
            # Delete incorrect fragments
            df = df[df.apply(
                lambda row: not is_incorrect_fragment(row[TESTS_RESULTS]),
                axis=1)]
            df[TASK_TRACKER_COLUMN.FRAGMENT.value] = \
                df[TASK_TRACKER_COLUMN.FRAGMENT.value].apply(gorshochek_anonymizer.anonymize_code_fragment)
            current_output_path = f'{output_path}/{task}/{get_name_from_path(file)}'
            create_file('', current_output_path)
            df.to_csv(current_output_path)

    gorshochek_anonymizer.remove_directories()
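A hedged usage example; both paths are placeholders for a local setup, and the output location follows the docstring above:

# Hypothetical paths; adjust to your local data and gorshochek clone.
anonymize_cpp_code('/data/taskTrackerData', '/tools/gorshochek')
# Anonymized csv files appear in <parent of root>/anonymizerResult/<LANGUAGE.CPP.value>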
Example #7
def run_tests(path: str) -> str:
    log.info(f'Start running tests on path {path}')
    output_directory = get_output_directory(
        path, consts.RUNNING_TESTS_OUTPUT_DIRECTORY)

    files = get_all_file_system_items(path, ct_file_condition)
    str_len_files = str(len(files))
    log.info(f'Found {str_len_files} files to run tests on')

    files = filter_already_tested_files(files, output_directory)
    str_len_files = str(len(files))
    log.info(
        f'Found {str_len_files} files to run tests on after filtering already tested files'
    )

    tasks = TASK.tasks()
    in_and_out_files_dict = create_in_and_out_dict(tasks)

    for i, file in enumerate(files):
        file_log_info = f'file: {str(i + 1)}/{str_len_files}'
        log.info(f'Start running tests on {file_log_info}, {file}')
        data = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language, data = __check_tasks_on_correct_fragments(
            data, tasks, in_and_out_files_dict, file_log_info)
        log.info(f'Finish running tests on {file_log_info}, {file}')
        write_based_on_language(output_directory, file, data, language)

    return output_directory
def unify_program_experience(
        path: str,
        output_directory_prefix: str = 'unify_program_experience') -> str:
    """
    This function allows to unify program experience:

    new data contains two columns: programExperienceYears and programExperienceMonths
    this function allows to categorize them (see enum class EXPERIENCE):

    LESS_THAN_HALF_YEAR = 'LESS_THAN_HALF_YEAR'
    FROM_HALF_TO_ONE_YEAR = 'FROM_HALF_TO_ONE_YEAR'
    FROM_ONE_TO_TWO_YEARS = 'FROM_ONE_TO_TWO_YEARS'
    FROM_TWO_TO_FOUR_YEARS = 'FROM_TWO_TO_FOUR_YEARS'
    FROM_FOUR_TO_SIX_YEARS = 'FROM_FOUR_TO_SIX_YEARS'
    MORE_THAN_SIX = 'MORE_THAN_SIX'

    After executing the function, the dataset will have only the EXPERIENCE column
    (EXPERIENCE_YEARS and EXPERIENCE_MONTHS will be deleted).
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for _ in languages:
        handle_folder(path, output_directory_prefix,
                      __unify_program_experience)
    return output_directory
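A minimal sketch of the categorization step; the real __unify_program_experience is not shown, so the month thresholds below are an assumption derived from the enum names, and the literal column names are hypothetical:

import pandas as pd

def unify_program_experience_sketch(df: pd.DataFrame) -> pd.DataFrame:
    def to_category(row) -> str:
        # Total experience in months; thresholds follow the EXPERIENCE enum names
        months = row['programExperienceYears'] * 12 + row['programExperienceMonths']
        if months < 6:
            return 'LESS_THAN_HALF_YEAR'
        if months < 12:
            return 'FROM_HALF_TO_ONE_YEAR'
        if months < 24:
            return 'FROM_ONE_TO_TWO_YEARS'
        if months < 48:
            return 'FROM_TWO_TO_FOUR_YEARS'
        if months < 72:
            return 'FROM_FOUR_TO_SIX_YEARS'
        return 'MORE_THAN_SIX'

    df['experience'] = df.apply(to_category, axis=1)
    return df.drop(columns=['programExperienceYears', 'programExperienceMonths'])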
Example #9
def reorganize_files_structure(path: str,
                               output_directory_suffix: str = 'separated_tasks'
                               ) -> str:
    """
    3.0 version: reorganize the file structure
    Before calling the function:

    -root
     --user_N1
      ---task1
       ----user_N1_files
     --user_N2
      ---task1
       ----user_N2_files

    After calling the function:

    -root
     --task1
      ---user_N1_files
      ---user_N2_files

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-reorganize-files-structure
    """
    output_directory = get_output_directory(path, output_directory_suffix)
    files = get_all_file_system_items(path, tt_file_condition)
    for file in files:
        log.info(f'Start splitting file {file}')
        dst_path = __get_dst_path(file, output_directory)
        log.info(f'Destination for the file {file} is {dst_path}')
        copy_file(file, dst_path)
    return output_directory
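A sketch of what __get_dst_path might compute; the helper itself is not shown, so both the name and the user-prefix disambiguation are assumptions:

import os

def get_dst_path_sketch(file: str, output_directory: str) -> str:
    # .../root/user_N1/task1/file.csv -> <output_directory>/task1/user_N1_file.csv
    task = os.path.basename(os.path.dirname(file))
    user = os.path.basename(os.path.dirname(os.path.dirname(file)))
    # Prefix with the user folder name to avoid collisions between users' files
    return os.path.join(output_directory, task, f'{user}_{os.path.basename(file)}')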
    def test_python_parsing(self) -> None:
        log.info('mypy and pylint testing:')
        # Files contain 12 incorrect files, which have 'error' in their names, and 1 correct file, which doesn't
        files = get_all_file_system_items(
            PARSING_TEST_DATA_PATH,
            extension_file_condition(consts.EXTENSION.TXT))
        mypy_rate = 0
        pylint_rate = 0
        mypy_with_execution_rate = 0
        for file in files:
            log.info(file)

            mypy = check_python_file_by_mypy(file)
            mypy_rate += ('error' in file) != mypy

            pylint = check_python_file_by_pylint(file)
            pylint_rate += ('error' in file) != pylint

            mypy_with_execution = check_file_by_mypy_and_execution(file)
            mypy_with_execution_rate += ('error' in file) != mypy_with_execution

            log.info(
                f'mypy: {mypy}, pylint: {pylint}, mypy with compile: {mypy_with_execution}'
            )

        log.info(
            f'mypy: {mypy_rate}, pylint: {pylint_rate}, mypy with compile: {mypy_with_execution_rate}, '
            f'all: {len(files)}')
def get_profile_statistics(path: str) -> str:
    output_directory = get_output_directory(path,
                                            consts.STATISTICS_OUTPUT_DIRECTORY)
    folders = get_all_file_system_items(path, user_subdirs_condition,
                                        consts.FILE_SYSTEM_ITEM.SUBDIR)
    statistics = __get_empty_statistics_dict()
    for folder in folders:
        log.info(f'Start handling the folder {folder}')
        ct_files = get_all_file_system_items(folder, tt_file_condition)
        age, experience = __get_age_and_experience_of_one_user(
            list(map(__get_age_and_experience, ct_files)))
        log.info(f'Folder: {folder}, age is {age}, experience is {experience}')
        __add_values_in_statistics_dict(statistics, age, experience)

    __write_results(output_directory, statistics)
    return output_directory
def get_expected_task_dfs(task: consts.TASK) -> List[pd.DataFrame]:
    df_files = sorted(
        get_all_file_system_items(TEST_DATA_FOLDER,
                                  (lambda n: task.value in n)))
    return [
        pd.read_csv(df_file, encoding=consts.ISO_ENCODING)
        for df_file in df_files
    ]
Example #13
def get_test_in_and_out_files(
        test_type: Union[CANONICALIZATION_TESTS_TYPES,
                         DIFF_HANDLER_TEST_TYPES],
        task: Optional[TASK] = None,
        additional_folder_name: str = '') -> List[Tuple[str, str]]:
    root = os.path.join(CANONICALIZATION_TESTS.DATA_PATH.value,
                        additional_folder_name, test_type.value)
    if task is not None:
        root = os.path.join(root, task.value)
    in_files = get_all_file_system_items(root, match_condition(r'in_\d+.py'))
    out_files = get_all_file_system_items(root, match_condition(r'out_\d+.py'))
    if len(out_files) != len(in_files):
        log_and_raise_error(
            'Length of out files list does not equal in files list', log)
    if len(in_files) == 0:
        log_and_raise_error(
            f'Number of test files is zero! Root for files is {root}', log)
    return pair_in_and_out_files(in_files, out_files)
def get_src_and_dst_files(test_type: DIFF_HANDLER_TEST_TYPES,
                          task: TASK) -> List[Tuple[str, str]]:
    root = os.path.join(CANONICALIZATION_TESTS.DATA_PATH.value,
                        ADDITIONAL_FOLDER, test_type.value, task.value)
    files = get_all_file_system_items(root, match_condition(r'\d+.py'))
    if len(files) == 0:
        log_and_raise_error(
            f'Number of test files is zero! Root for files is {root}', log)
    return list(itertools.product(files, repeat=2))
def add_int_experience(path: str, output_directory_prefix: str = 'int_exp') -> str:
    output_directory = get_output_directory(path, output_directory_prefix)
    files = get_all_file_system_items(path)
    for file in files:
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        df[CODE_TRACKER_COLUMN.INT_EXPERIENCE.value] = \
            df[CODE_TRACKER_COLUMN.EXPERIENCE.value].apply(convert_to_int_experience)
        write_result(output_directory, path, file, df)
    return output_directory
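A sketch of a convert_to_int_experience-style mapping; the ordering is grounded in the EXPERIENCE enum shown earlier, while the exact integers and the -1 fallback are assumptions:

EXPERIENCE_TO_INT = {
    'LESS_THAN_HALF_YEAR': 0,
    'FROM_HALF_TO_ONE_YEAR': 1,
    'FROM_ONE_TO_TWO_YEARS': 2,
    'FROM_TWO_TO_FOUR_YEARS': 3,
    'FROM_FOUR_TO_SIX_YEARS': 4,
    'MORE_THAN_SIX': 5,
}

def convert_to_int_experience_sketch(experience: str) -> int:
    # Unknown or missing values map to -1, mirroring the -1 convention used for tests results
    return EXPERIENCE_TO_INT.get(experience, -1)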
def filter_already_tested_files(files: List[str],
                                output_directory_path: str) -> List[str]:
    tested_files = get_all_file_system_items(output_directory_path,
                                             tt_file_condition)
    tested_folder_and_file_names = list(
        map(lambda f: get_file_and_parent_folder_names(f), tested_files))
    return list(
        filter(
            lambda f: get_file_and_parent_folder_names(f) not in
            tested_folder_and_file_names, files))
def run_tests(path: str) -> str:
    """
    Run tests on all code snapshots in the data for the task.
    Note: the enum class TASK (see the consts.py file) must have the task key.
    It also must match the name of the folder with test files in the resources/tasks_tests.

    For example, if your task has key [my_key], you should add a new value into TASK const with value [my_key]
    and add a new folder [my_key] with input and output files for tests in the resources/tasks_tests folder.

    The test result is an array containing values for all tasks from the TASK enum class.
    If the code snapshot is incorrect, then the value -1 is specified.
    To deserialize this array of ratings, use the function unpack_tests_results from task_scoring.py.
    To get the rate only for the current task, use the calculate_current_task_rate function from plots/scoring_solutions_plots.py.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-find-tests-results-for-the-tasks
    """
    log.info(f'Start running tests on path {path}')
    output_directory = get_output_directory(
        path, consts.RUNNING_TESTS_OUTPUT_DIRECTORY)

    files = get_all_file_system_items(path, tt_file_condition)
    str_len_files = str(len(files))
    log.info(f'Found {str_len_files} files to run tests on')

    files = filter_already_tested_files(files, output_directory)
    str_len_files = str(len(files))
    log.info(
        f'Found {str_len_files} files to run tests on after filtering already tested files'
    )

    tasks = TASK.tasks()
    in_and_out_files_dict = create_in_and_out_dict(tasks)

    for i, file in enumerate(files):
        file_log_info = f'file: {str(i + 1)}/{str_len_files}'
        log.info(f'Start running tests on {file_log_info}, {file}')
        current_task = __get_task_by_ct_file(file)
        if not current_task:
            # We don't need to handle files for tasks that are not in the TASK enum class
            continue
        data = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language, data = __check_tasks_on_correct_fragments(
            data,
            tasks,
            in_and_out_files_dict,
            file_log_info,
            current_task=current_task)
        log.info(f'Finish running tests on {file_log_info}, {file}')
        output_directory_with_user_folder = os.path.join(
            output_directory, __get_user_folder_name_from_path(file))
        write_based_on_language(output_directory_with_user_folder, file, data,
                                language)

    return output_directory
Example #18
def preprocess_data(path: str) -> str:
    """
    We use task-tracker plugin (see https://github.com/JetBrains-Research/task-tracker-plugin)
    and activity tracker plugin (see https://plugins.jetbrains.com/plugin/8126-activity-tracker)
    to gather the source data. During data gathering, we collect code snapshots and actions while students
    solve various programming tasks. The data also contains information about the student (the student profile),
    such as age and programming experience, and the current task that the student is solving.

    - At this stage, the test files that were created during the testing phase are deleted. They have the ON value in
    the test mode column of the task-tracker file.
    - Also, a student could send several files with the history of solving the task, each of which can include
    the previous ones. At this stage, redundant files are deleted. Ultimately, only one file with a unique
    history of solving the current task remains.
    - In addition, a separate activity-tracker file is sent for each task-tracker file. In this step,
    all activity-tracker files are combined into one.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-primary-data-processing
    """
    output_directory = get_output_directory(path,
                                            consts.PREPROCESSING_DIRECTORY)
    user_folders = get_all_file_system_items(path, user_subdirs_condition,
                                             consts.FILE_SYSTEM_ITEM.SUBDIR)
    for user_folder in user_folders:
        output_user_path = os.path.join(output_directory,
                                        get_name_from_path(user_folder, False))
        log.info(f'Start handling the path {user_folder}')
        task_folders = get_all_file_system_items(
            user_folder, all_items_condition, consts.FILE_SYSTEM_ITEM.SUBDIR)
        for task_folder in task_folders:
            output_task_path = os.path.join(
                output_user_path, get_name_from_path(task_folder, False))
            log.info(f'Start handling the folder {task_folder}')
            files = get_all_file_system_items(
                task_folder, extension_file_condition(EXTENSION.CSV))
            tt_files, ati_files = __partition_into_tt_and_ati_files(files)
            if __handle_tt_files(tt_files, output_task_path) and ati_files:
                new_ati_path = os.path.join(output_task_path,
                                            get_name_from_path(ati_files[0]))
                __merge_ati_files(ati_files).to_csv(new_ati_path)
    return output_directory
def filter_same_fragments(
        path: str,
        output_directory_prefix: str = 'filter_same_fragments') -> str:
    """
    This function allows to delete consecutive same fragments and add column id with row number
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for _ in languages:
        handle_folder(path, output_directory_prefix, __filter_same_fragments)
    return output_directory
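A minimal sketch of the deduplication step; __filter_same_fragments is not shown, and the literal 'fragment' column name is an assumption for TASK_TRACKER_COLUMN.FRAGMENT.value:

import pandas as pd

def filter_same_fragments_sketch(df: pd.DataFrame) -> pd.DataFrame:
    fragment_column = 'fragment'  # assumed value of TASK_TRACKER_COLUMN.FRAGMENT
    # Keep a row only if its fragment differs from the previous row's fragment
    filtered = df[df[fragment_column] != df[fragment_column].shift()].copy()
    # Store the original row number in a new id column, as the docstring describes
    filtered['id'] = filtered.index
    return filtered.reset_index(drop=True)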
Example #20
def unpack_tests_results(path: str, output_directory_prefix: str = 'unpack_tests_results') -> str:
    """
    This function allows to unpack tests results from array like [-1, -1, -1, -1, -1, -1] into score:
    -1, or a number in [0, 1]

    Also this function allows to add a new column TASK with task key
    """
    languages = get_all_file_system_items(path, language_item_condition, FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for _ in languages:
        handle_folder(path, output_directory_prefix, __unpack_tests_results)
    return output_directory
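A sketch of the unpacking itself, assuming the serialized column holds a Python-style list literal and the task's position comes from its index in the TASK enum; the function name is hypothetical:

import ast

def unpack_tests_results_sketch(serialized: str, task_index: int) -> float:
    # '[-1, -1, 0.5, -1, -1, -1]' with task_index=2 -> 0.5
    results = ast.literal_eval(serialized)
    return results[task_index]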
Example #21
class TestPreviousErrors:

    @pytest.mark.parametrize('fragment_file', get_all_file_system_items(PREVIOUS_ERRORS_TEST_DATA,
                                                                        (lambda name: 'fragment' in name)))
    def test_fragments(self, fragment_file: str) -> None:
        in_and_out_files_dict = create_in_and_out_dict(TASK.tasks())
        language = get_language_by_extension(get_extension_from_file(fragment_file))
        check_tasks(TASK.tasks(), get_content_from_file(fragment_file), in_and_out_files_dict, language, False)

    # We need to test ati_327/Main_67885; put it in PREVIOUS_ERRORS_TEST_DATA before running
    def test_codetracker_data(self) -> None:
        run_tests(PREVIOUS_ERRORS_TEST_DATA)
Example #22
def preprocess_data(path: str) -> str:
    output_directory = get_output_directory(path, consts.PREPROCESSING_OUTPUT_DIRECTORY)
    folders = get_all_file_system_items(path, data_subdirs_condition, consts.FILE_SYSTEM_ITEM.SUBDIR)
    for folder in folders:
        log.info(f'Start handling the folder {folder}')
        files = get_all_file_system_items(folder, extension_file_condition(consts.EXTENSION.CSV))
        try:
            ct_files, ati_file = __separate_ati_and_other_files(files)
        except ValueError:
            # Skip the current folder
            continue

        ati_df = handle_ati_file(ati_file)

        for ct_file in ct_files:
            ct_df, language = handle_ct_file(ct_file)
            ct_df = handle_ct_and_at(ct_file, ct_df, ati_file, ati_df, language)

            write_result(output_directory, path, ct_file, ct_df)

        log.info(f'Finish handling the folder {folder}')
    return output_directory
Example #23
def check_anonymization(old_files_root: str, new_files_root: str) -> List[str]:
    """
    Find incorrect anonymized files. The file is incorrect if:
     - does not exist in the new folder
     - has more or less rows than in the old folder
    """
    files_with_errors = []
    language_dirs = get_all_file_system_items(
        new_files_root,
        item_condition=language_item_condition,
        item_type=FILE_SYSTEM_ITEM.SUBDIR)
    for language_dir in language_dirs:
        task_dirs = get_all_file_system_items(
            language_dir,
            item_condition=task_item_condition,
            item_type=FILE_SYSTEM_ITEM.SUBDIR)
        language = get_name_from_path(language_dir, with_extension=False)
        for task_dir in task_dirs:
            task = get_name_from_path(task_dir, with_extension=False)
            old_path = f'{remove_slash(old_files_root)}/{language}/{task}'
            old_files = get_all_file_system_items(
                old_path,
                item_condition=extension_file_condition(EXTENSION.CSV))
            for old_file in old_files:
                name = get_name_from_path(old_file)
                new_file_path = f'{task_dir}/{name}'
                if not does_exist(new_file_path):
                    files_with_errors.append(new_file_path)
                else:
                    try:
                        new_df = pd.read_csv(new_file_path,
                                             encoding=ISO_ENCODING)
                        old_df = pd.read_csv(old_file, encoding=ISO_ENCODING)
                        if new_df.shape[0] != old_df.shape[0]:
                            files_with_errors.append(new_file_path)
                    except pd.errors.EmptyDataError:
                        files_with_errors.append(new_file_path)
    return files_with_errors
Example #24
def handle_folder(path: str, output_directory_prefix: str,
                  handle_df: Callable[[pd.DataFrame], pd.DataFrame]) -> str:
    log.info(f'Start handling the folder {path}')
    output_directory = get_output_directory(path, output_directory_prefix)
    files = get_all_file_system_items(path,
                                      extension_file_condition(EXTENSION.CSV))
    for file in files:
        log.info(f'Start handling the file {file}')
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        df = handle_df(df)
        write_result(output_directory, path, file, df)
        log.info(f'Finish handling the file {file}')
    log.info(f'Finish handling the folder {path}')
    return output_directory
def get_tasks_statistics(path: str) -> TaskStatistics:
    statistics = {}
    language_values = [language.value for language in consts.LANGUAGE]
    language_folders = get_all_file_system_items(
        path, contains_substrings_condition(language_values), SUBDIR)
    for l_f in language_folders:
        language = consts.LANGUAGE(get_name_from_path(l_f, False))
        if statistics.get(language):
            log_and_raise_error(
                f'Duplicate language folder for {language.value}', log)
        statistics[language] = {}
        task_values = consts.TASK.tasks_values()
        task_folders = get_all_file_system_items(
            l_f, contains_substrings_condition(task_values), SUBDIR)
        for t_f in task_folders:
            files = get_all_file_system_items(t_f)
            task = consts.TASK(get_name_from_path(t_f, False))
            if statistics[language].get(task):
                log_and_raise_error(
                    f'Duplicate task for {task.value} in folder {l_f}', log)
            statistics[language][task] = len(files)

    return statistics
Example #26
def filter_incorrect_fragments(
        path: str,
        output_directory_prefix: str = 'filter_incorrect_fragments') -> str:
    """
    This function allows to filter incorrect fragments.
    The fragment is incorrect if the TESTS_RESULT column value is -1
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for _ in languages:
        handle_folder(path, output_directory_prefix,
                      __filter_incorrect_fragments)
    return output_directory
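A one-line sketch of the underlying filter; __filter_incorrect_fragments is not shown, and 'testsResults' is an assumed column name:

import pandas as pd

def filter_incorrect_fragments_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Keep only rows whose tests result is not -1, the incorrect-code marker
    return df[df['testsResults'] != -1]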
def delete_unnecessary_columns(
        path: str,
        output_directory_prefix: str = 'delete_unnecessary_columns') -> str:
    """
    This function allows to delete columns that are not needed to build the solution
    space in the coding assistant project.

    After executing the function, only the next columns will remain:
    [FRAGMENT, AGE, EXPERIENCE, TEST_RESULT, TASK]
    """
    languages = get_all_file_system_items(path, language_item_condition,
                                          FILE_SYSTEM_ITEM.SUBDIR)
    output_directory = get_output_directory(path, output_directory_prefix)
    for _ in languages:
        handle_folder(path, output_directory_prefix,
                      __delete_unnecessary_columns)
    return output_directory
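A sketch of the column projection; __delete_unnecessary_columns is not shown, and the literal column names are assumed values of the enum members listed above:

import pandas as pd

# Assumed literal values of FRAGMENT, AGE, EXPERIENCE, TEST_RESULT, and TASK
COLUMNS_TO_KEEP = ['fragment', 'age', 'experience', 'testsResults', 'task']

def delete_unnecessary_columns_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Keep only the columns needed to build the solution space
    return df[COLUMNS_TO_KEEP]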
Example #28
def construct_solution_graph(
        path: str,
        task: TASK,
        language: LANGUAGE = LANGUAGE.PYTHON) -> SolutionGraph:
    files = get_all_file_system_items(path,
                                      extension_file_condition(EXTENSION.CSV))
    sg = SolutionGraph(task, language)
    log.info(f'Start creating solution space from path {path}')
    for file in files:
        log.info(f'Start handling file {file}')
        code_info_chain = __create_code_info_chain(file, task)
        sg.add_code_info_chain(code_info_chain)
    log.info(f'Finish creating solution space from path {path}')
    log.info('Start finding medians')
    sg.find_all_medians()
    log.info('Finish finding medians')
    return sg
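A hedged usage example; the path is a placeholder, and TASK.PIES is suggested only by the 'pies' folder name used later in this file:

# Hypothetical invocation: build a solution graph from csv files with pseudo solutions
graph = construct_solution_graph('/data/pies_pseudo_solutions', TASK.PIES, LANGUAGE.PYTHON)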
class TestBrackets:
    EXPRESSIONS_SOURCES_PATH = os.path.join(TEST_RESOURCES_PATH,
                                            'inverse_parser',
                                            'inverse_parser_3', 'brackets')
    EXPRESSIONS_SOURCES = get_all_file_system_items(EXPRESSIONS_SOURCES_PATH)

    @staticmethod
    def __calculate_expression_safely(popen_args: List[str],
                                      input: str = '') -> Optional[str]:
        try:
            out = check_output(popen_args,
                               input=input,
                               universal_newlines=True)
            return out.rstrip('\n')
        except CalledProcessError:
            return None

    @staticmethod
    def __get_python_popen_args(py_file_path: str) -> List[str]:
        return ['python3', py_file_path]

    def __get_actual_out(self, source: str) -> Optional[str]:
        parsed_code = InverseParserTestsUtil.source_to_xml_to_source(source)
        log.info(f'Parsed code is:\n{parsed_code}')
        source_file = self.__create_source_file(parsed_code)
        actual_out = self.__calculate_expression_safely(
            self.__get_python_popen_args(source_file))
        remove_file(source_file)
        return actual_out

    def __create_source_file(self, source_code: str) -> str:
        source_code_file = os.path.join(self.EXPRESSIONS_SOURCES_PATH,
                                        'brackets_test.py')
        create_file(source_code, source_code_file)
        return source_code_file

    @pytest.mark.parametrize('source', EXPRESSIONS_SOURCES)
    def test_expressions(self, source: str) -> None:
        log.info(f'Start checking source: {source}')
        log.info(f'Source code is:\n{get_content_from_file(source)}')
        expected_out = self.__calculate_expression_safely(
            self.__get_python_popen_args(source))
        actual_out = self.__get_actual_out(source)
        log.info(
            f'Expected out is: {expected_out}. Actual out is: {actual_out}')
        assert expected_out == actual_out
def find_all_pseudo_solutions(
        path: str,
        task: TASK,
        language: LANGUAGE,
        to_add_int_experience: bool = True,
        to_remove_incorrect_fragments: bool = True,
        to_remove_intermediate_diffs: bool = True,
        to_remove_inefficient_statements: bool = True,
        result_name_suffix: str = 'pseudo_solutions') -> str:
    files = get_all_file_system_items(path, ct_file_condition)
    result_name_suffix = f'{task.value}_{result_name_suffix}'
    result_folder = get_result_folder(path, result_name_suffix)
    task_index = get_task_index(task)

    for i, file in enumerate(files):
        log.info(
            f'Finding pseudo solutions in file {i}/{len(files)}, file: {file}')
        df = pd.read_csv(file, encoding=ISO_ENCODING)
        task_dfs = find_task_dfs(df, task)
        for j, task_df in enumerate(task_dfs):
            log.info(f'Handling df {j}/{len(task_dfs)}')
            if not task_df.empty:
                # Replace the tests results for all tasks with the test result for the given task
                task_df[CODE_TRACKER_COLUMN.TESTS_RESULTS.value] = \
                    task_df[CODE_TRACKER_COLUMN.TESTS_RESULTS.value].apply(lambda t_r: get_rate(t_r, task_index))
                if to_add_int_experience:
                    task_df[CODE_TRACKER_COLUMN.INT_EXPERIENCE.value] = \
                        task_df[CODE_TRACKER_COLUMN.EXPERIENCE.value].apply(convert_to_int_experience)
                if to_remove_incorrect_fragments:
                    task_df = task_df[
                        task_df[CODE_TRACKER_COLUMN.TESTS_RESULTS.value] !=
                        TEST_RESULT.INCORRECT_CODE.value]
                if to_remove_intermediate_diffs:
                    task_df = remove_intermediate_diffs_from_df(task_df)
                if to_remove_inefficient_statements:
                    task_df = remove_inefficient_statements_from_df(task_df)

                # Change name to get something like pies/ati_207_test_5894859_i.csv
                filename = f'{task.value}/{get_parent_folder_name(file)}_{get_name_from_path(file, False)}_{j}' \
                           f'{get_extension_from_file(file).value}'
                write_based_on_language(result_folder, filename, task_df,
                                        language)
    return result_folder