Example #1
def read_monitored_parsed_c99_slk_top_down_code(debug=False):
    def parse_df(df):
        # Identifiers and type names predeclared by the fake C headers.
        identifier_set, type_set = extract_fake_c_header_identifier()
        # Buffered C lexer with no-op callbacks: lexing errors and
        # brace/typedef events are deliberately ignored here.
        clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                            on_lbrace_func=lambda: None,
                            on_rbrace_func=lambda: None,
                            type_lookup_func=lambda typ: None)
        clex.build()
        BEGIN, END, UNK = ["<BEGIN>", "<END>", "<UNK>"]
        from embedding.wordembedding import load_vocabulary
        vocabulary = load_vocabulary(get_token_vocabulary, get_vocabulary_id_map_with_keyword, [BEGIN], [END], UNK)
        print("the size of the predefined identifier set: {}".format(len(identifier_set)))
        print("the size of the type set: {}".format(len(type_set)))
        parse_fn = monitored_slk_parse(clex=clex, predefined_identifer=identifier_set, predefined_typename=type_set,
                                       vocabulary=vocabulary)
        # Parse every code sample; a failed parse yields a 7-tuple of Nones.
        parsed_code = show_process_map(parse_fn, df['code'],
                                       error_default_value=tuple([None, ] * 7))
        # Transpose the per-row result tuples into per-column sequences.
        parsed_code = unzip(parsed_code)
        df['parse_tree'] = list(parsed_code[0])
        df['tokens'] = list(parsed_code[1])
        df['consistent_identifier'] = list(parsed_code[2])
        df['identifier_scope_index'] = list(parsed_code[3])
        df['is_identifier'] = list(parsed_code[4])
        df['max_scope_list'] = list(parsed_code[5])
        df['consistent_typename'] = list(parsed_code[6])
        return df

    if not debug:
        return [parse_df(df) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
    else:
        return [parse_df(df.head(100)) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
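Here `show_process_map` maps `parse_fn` over the code column, substituting the error default on failure, and `unzip` transposes the resulting list of per-row tuples into per-column sequences. A minimal sketch of such an `unzip` helper, assuming the conventional zip-star idiom rather than the repo's actual implementation:

def unzip(iterable):
    # Transpose [(a1, b1, ...), (a2, b2, ...)] into ((a1, a2, ...), (b1, b2, ...), ...).
    return tuple(zip(*iterable))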
Example #2
def read_parsed_tree_code(debug=False):
    def parse_df(df):
        monitor = MonitoredParser()
        # Parse each sample into (parse tree, AST, token list); a failed
        # parse yields (None, None, None).
        parsed_code = show_process_map(monitor.parse_get_production_list_and_token_list, df['code'],
                                       error_default_value=(None, None, None))
        parsed_code = unzip(parsed_code)
        df['parse_tree'] = list(parsed_code[0])
        df['ast'] = list(parsed_code[1])
        df['tokens'] = list(parsed_code[2])
        return df
    if not debug:
        return [parse_df(df) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
    else:
        return [parse_df(df.head(100)) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
def read_antlr_parse_records_train_set():
    # Index 0 of the dataset tuple is the train split.
    train_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[0]
    df = process_df_multiple(train_df)
    print('finish multiple process')
    # Drop rows whose code failed to tokenize.
    df = df[df['tokens'].map(lambda x: x is not None)]
    return df
Example #4
def read_parsed_c99_slk_top_down_code(debug=False):
    def parse_df(df):
        # Same no-op lexer setup as in Example #1; only the parse tree
        # and the token list are extracted here.
        clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                            on_lbrace_func=lambda: None,
                            on_rbrace_func=lambda: None,
                            type_lookup_func=lambda typ: None)
        clex.build()
        parse_fn = c99_slk_parse(clex=clex)
        parsed_code = show_process_map(parse_fn, df['code'],
                                       error_default_value=(None, None))
        parsed_code = unzip(parsed_code)
        df['parse_tree'] = list(parsed_code[0])
        df['tokens'] = list(parsed_code[1])
        return df
    if not debug:
        return [parse_df(df) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
    else:
        return [parse_df(df.head(100)) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
def read_parser_train_dfa():
    train_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[0]
    # Parse the whole train split so the ANTLR adaptive-prediction DFA
    # cache gets populated; `total` is forwarded by apply() to
    # collect_dfa_do_parse, presumably for progress reporting.
    train_df['tokens'] = train_df['code'].apply(collect_dfa_do_parse,
                                                total=len(train_df))
    tmp_code = '''int main(){
    return 0;
}'''
    # In the ANTLR Python runtime, decisionsToDFA is a class-level
    # attribute of the generated parser, so a parser built over this
    # trivial program exposes the cache warmed by the parses above.
    _, _, _, parser = create_monitor_parser(tmp_code)
    return parser.decisionsToDFA
def read_antlr_parse_records_dataset():
    datasets = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    # Row count before parsing/filtering, so the drop rate is visible.
    total = sum(len(df) for df in datasets)
    print('total before filtering: {}'.format(total))

    datasets = [process_df_multiple(df) for df in datasets]
    # Keep only rows whose code tokenized successfully.
    datasets = [
        df[df['tokens'].map(lambda x: x is not None)] for df in datasets
    ]
    print('train: {}, valid: {}, test: {}'.format(len(datasets[0]),
                                                  len(datasets[1]),
                                                  len(datasets[2])))
    return datasets
def read_antlr_parse_train_records_part(i):
    train_df, _, _ = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()
    # df_list = split_df_to_part(train_df, size=100)
    # Take the i-th 10000-row slice of the train split.
    size = 10000
    df = train_df.iloc[i * size:(i + 1) * size]
    print('len df: ', len(df))
    # df = read_antlr_parse_records_df(df_list[i])
    # Split the slice into 1000-row chunks and parse them with 10 workers.
    df_list = split_df_to_part(df, 1000)
    for part in df_list:
        print('len parallel df: ', len(part))
    res = list(parallel_map(10, read_antlr_parse_records_df, df_list))
    # Concatenate the per-chunk results; pd.concat replaces the
    # DataFrame.append API removed in pandas 2.0.
    import pandas as pd
    df = pd.concat(res, ignore_index=True)

    # df = read_antlr_parse_records_df(df)
    return df
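`split_df_to_part` evidently chunks a DataFrame into fixed-size slices for `parallel_map` to distribute across worker processes. A minimal sketch under that assumption, not necessarily the repo's actual implementation:

def split_df_to_part(df, size):
    # Consecutive row slices of at most `size` rows each.
    return [df.iloc[start:start + size] for start in range(0, len(df), size)]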
def read_antlr_parse_records_valid_set():
    # Index 1 of the dataset tuple is the validation split.
    valid_df = read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()[1]
    df = process_df_multiple(valid_df)
    # Drop rows whose code failed to tokenize.
    df = df[df['tokens'].map(lambda x: x is not None)]
    return df
Example #9
def read_filtered_without_include_code_tokens():
    return [parse_c99_code_to_token(df) for df in read_filtered_without_include_distinct_problem_user_ac_c99_code_dataset()]
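All of these readers follow the same convention: one DataFrame (or token list) per train/valid/test split, with `debug=True` restricting each split to its first 100 rows. A hypothetical invocation, assuming the module and its dataset dependencies are importable and that the dataset reader yields the usual train/valid/test triple:

train_df, valid_df, test_df = read_parsed_c99_slk_top_down_code(debug=True)
print(len(train_df))  # at most 100 rows in debug mode
print(train_df[['parse_tree', 'tokens']].head())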