Exemplo n.º 1
0
def transform_data_from_df_to_dataset(data, stack_size):
    """Wrap ``data[0]`` in a ``CCodeDataSet`` using the scope-mask pipeline.

    Loads the keyword/identifier split vocabulary, prints its size and its
    largest token id, builds the transform pipeline and constructs the
    dataset from the first element of ``data``.

    Note:
        ``data[0]`` is deleted from ``data`` once the dataset has been
        built, so the caller's container is mutated.

    Args:
        data: Indexable container supporting ``del data[0]``; element 0 is
            the raw data handed to ``CCodeDataSet``.
        stack_size: Forwarded to ``RangeMaskMap``, ``IndexMaskMap``,
            ``PadMap`` and ``CCodeDataSet``.

    Returns:
        Tuple ``(dataset, keyword_num, vocabulary)``.
    """
    vocab, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocab.vocabulary_size))
    largest_token_id = max(vocab.word_to_id_dict.values())
    print("The max token id:{}".format(largest_token_id))

    # NOTE(review): these three objects are built but not consumed by the
    # pipeline below; the calls are kept in case construction has side
    # effects -- confirm before removing.
    slk_constants = C99SLKConstants()
    label_vocab = C99LabelVocabulary(slk_constants)
    production_vocab = SLKProductionVocabulary(slk_constants)

    pipeline_steps = [
        CopyMap(),
        key_transform(RangeMaskMap(stack_size), "max_scope_list"),
        key_transform(IndexMaskMap(stack_size), "identifier_scope_index"),
        FlatMap(),
        PadMap(keyword_num, stack_size),
    ]
    pipeline = transforms.Compose(pipeline_steps)

    dataset = CCodeDataSet(data[0], vocab, stack_size, pipeline)
    # Drop the raw entry from the caller's container only after the dataset
    # has been built successfully.
    del data[0]
    return dataset, keyword_num, vocab
def get_transform(stack_size):
    """Build the grammar language-model transform pipeline.

    Loads the keyword/identifier split vocabulary, prints its size and its
    largest token id, then composes the transform steps applied to each
    sample.

    Args:
        stack_size: Forwarded to ``RangeMaskMap``, ``IndexMaskMap`` and
            ``PadMap``.

    Returns:
        Tuple ``(keyword_num, vocabulary, transforms_fn)``.
    """
    vocab, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocab.vocabulary_size))
    largest_token_id = max(vocab.word_to_id_dict.values())
    print("The max token id:{}".format(largest_token_id))

    slk_constants = C99SLKConstants()
    label_vocab = C99LabelVocabulary(slk_constants)
    production_vocab = SLKProductionVocabulary(slk_constants)

    # Step instances are created in pipeline order, one name per stage.
    copy_step = CopyMap()
    range_mask_step = key_transform(RangeMaskMap(stack_size),
                                    "max_scope_list")
    index_mask_step = key_transform(IndexMaskMap(stack_size),
                                    "identifier_scope_index")
    type_input_map = GrammarLanguageModelTypeInputMap(
        production_vocab, vocab, label_vocab, keyword_num)
    type_input_step = key_transform(type_input_map, "tree", "target")
    flat_step = FlatMap()
    pad_step = PadMap(keyword_num, stack_size)

    pipeline = transforms.Compose([
        copy_step,
        range_mask_step,
        index_mask_step,
        type_input_step,
        flat_step,
        pad_step,
    ])
    return keyword_num, vocab, pipeline
Exemplo n.º 3
0
def transform_data_from_df_to_dataset(data):
    """Wrap ``data[0]`` in a ``CCodeDataSet`` using the grammar pipeline.

    Prints the size of each raw split, loads the keyword/identifier split
    vocabulary, prints its size and its largest token id, builds the
    transform pipeline and constructs the dataset from the first element of
    ``data``.

    Note:
        ``data[0]`` is deleted from ``data`` once the dataset has been
        built, so the caller's container is mutated.

    Args:
        data: Indexable container supporting ``del data[0]``. Its elements
            are paired in order with the names "train", "val" and "test"
            for the size report.

    Returns:
        Tuple ``(dataset, keyword_num, vocabulary)``.
    """
    split_names = ["train", "val", "test"]
    for raw_split, split_name in zip(data, split_names):
        print("There are {} raw data in the {} dataset".format(
            len(raw_split), split_name))

    vocab, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocab.vocabulary_size))
    largest_token_id = max(vocab.word_to_id_dict.values())
    print("The max token id:{}".format(largest_token_id))

    slk_constants = C99SLKConstants()
    label_vocab = C99LabelVocabulary(slk_constants)
    production_vocab = SLKProductionVocabulary(slk_constants)

    copy_step = CopyMap()
    type_input_map = GrammarLanguageModelTypeInputMap(
        production_vocab, vocab, label_vocab, keyword_num)
    type_input_step = key_transform(type_input_map, "tree", "target")
    pipeline = transforms.Compose([
        copy_step,
        type_input_step,
        FlatMap(),
        PadMap(keyword_num),
    ])

    dataset = CCodeDataSet(data[0], vocab, pipeline)
    # Drop the raw entry from the caller's container only after the dataset
    # has been built successfully.
    del data[0]
    return dataset, keyword_num, vocab
Exemplo n.º 4
0
def train_and_evaluate(data,
                       batch_size,
                       embedding_dim,
                       hidden_state_size,
                       rnn_num_layer,
                       learning_rate,
                       epoches,
                       saved_name,
                       load_previous_model=False):
    """Train a ``GrammarLanguageModel`` and print per-epoch perplexities.

    The model is checkpointed to ``config.save_model_root/saved_name``
    whenever the validation perplexity improves. The learning-rate
    scheduler is stepped with the validation perplexity after every epoch.

    Args:
        data: Sequence of exactly three raw splits, in the order train,
            validation, test (it is unpacked into three datasets).
        batch_size: Passed to the model, ``train`` and ``evaluate``.
        embedding_dim: Passed to the model constructor.
        hidden_state_size: Passed to the model constructor.
        rnn_num_layer: Passed to the model constructor.
        learning_rate: Learning rate for the SGD optimizer.
        epoches: Number of training epochs.
        saved_name: File name of the checkpoint under
            ``config.save_model_root``.
        load_previous_model: When true, load the checkpoint first and use
            its validation/test perplexity as the initial best values.

    Returns:
        None. Results are reported through ``print``.
    """
    split_names = ["train", "val", "test"]
    save_path = os.path.join(config.save_model_root, saved_name)
    for d, n in zip(data, split_names):
        print("There are {} raw data in the {} dataset".format(len(d), n))
    vocabulary = load_vocabulary(get_token_vocabulary,
                                 get_vocabulary_id_map_with_keyword,
                                 [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocabulary.vocabulary_size))
    print("The max token id:{}".format(
        max(vocabulary.word_to_id_dict.values())))

    slk_constants = C99SLKConstants()
    # NOTE(review): 63 and 64 are excluded from the terminal range; their
    # meaning is not visible here -- confirm against the SLK constants.
    terminal_token_index = set(range(slk_constants.START_SYMBOL - 2)) - {63, 64}
    label_vocabulary = C99LabelVocabulary(slk_constants)
    production_vocabulary = SLKProductionVocabulary(slk_constants)
    transforms_fn = transforms.Compose([
        IsNone("original"),
        key_transform(GrammarLanguageModelTypeInputMap(production_vocabulary),
                      "tree"),
        IsNone("after type input"),
        FlatMap(),
        IsNone("Flat Map"),
        PadMap(production_vocabulary.token_num()),
        IsNone("Pad Map"),
    ])
    data = [CCodeDataSet(d, vocabulary, transforms_fn) for d in data]
    for d, n in zip(data, split_names):
        print("There are {} parsed data in the {} dataset".format(len(d), n))
    train_dataset, valid_dataset, test_dataset = data

    # Iterate the keyword set in sorted order: plain set iteration over
    # strings depends on hash randomization, so the list order could differ
    # between the run that saved a checkpoint and the run that reloads it.
    # Assumes the tokens are mutually comparable (plain strings).
    keyword_tokens = sorted(
        pre_defined_c_tokens | {"CONSTANT", "STRING_LITERAL"})
    keyword_index = [vocabulary.word_to_id(t) for t in keyword_tokens]
    identifier_index = label_vocabulary.get_label_id("ID") - 1  # zero

    # NOTE(review): ``size_average=False`` is deprecated in newer PyTorch
    # releases in favour of ``reduction='sum'``; kept for compatibility with
    # the PyTorch version this file targets.
    loss_function = nn.CrossEntropyLoss(size_average=False,
                                        ignore_index=PAD_TOKEN)
    model = GrammarLanguageModel(
        vocabulary.vocabulary_size,
        production_vocabulary.token_num(),
        embedding_dim,
        hidden_state_size,
        rnn_num_layer,
        identifier_index,
        keyword_index,
        terminal_token_index,
        batch_size
    )
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    if load_previous_model:
        torch_util.load_model(model, save_path)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        # NOTE(review): the ``[0]`` indexing presumes the losses are
        # one-element tensors -- confirm against ``evaluate``.
        best_valid_perplexity = torch.exp(valid_loss)[0]
        best_test_perplexity = torch.exp(test_loss)[0]
        print(
            "load the previous mode, validation perplexity is {}, test perplexity is :{}".format(
                best_valid_perplexity, best_test_perplexity))
        scheduler.step(best_valid_perplexity)
    else:
        best_valid_perplexity = None
        best_test_perplexity = None
    for epoch in range(epoches):
        train_loss = train(model, train_dataset, batch_size, loss_function,
                           optimizer)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)

        train_perplexity = torch.exp(train_loss)[0]
        valid_perplexity = torch.exp(valid_loss)[0]
        test_perplexity = torch.exp(test_loss)[0]

        scheduler.step(valid_perplexity)

        # Checkpoint only on improvement; the test perplexity recorded is
        # the one observed at the best validation epoch.
        if best_valid_perplexity is None or valid_perplexity < best_valid_perplexity:
            best_valid_perplexity = valid_perplexity
            best_test_perplexity = test_perplexity
            torch_util.save_model(model, save_path)

        print("epoch {}: train perplexity of {},  valid perplexity of {}, test perplexity of {}".
              format(epoch, train_perplexity, valid_perplexity, test_perplexity))
    print("The model {} best valid perplexity is {} and test perplexity is {}".
          format(saved_name, best_valid_perplexity, best_test_perplexity))
Exemplo n.º 5
0
def train_and_evaluate(data,
                       batch_size,
                       embedding_dim,
                       hidden_state_size,
                       rnn_num_layer,
                       learning_rate,
                       epoches,
                       saved_name,
                       load_previous_model=False):
    """Train a ``GrammarLanguageModel`` on the C99 production vocabulary.

    Prints dataset sizes and per-epoch train/validation/test perplexities,
    and saves the model to ``config.save_model_root/saved_name`` each time
    the validation perplexity improves.

    Args:
        data: Sequence of exactly three raw splits, in the order train,
            validation, test (it is unpacked into three datasets).
        batch_size: Passed to the model, ``train`` and ``evaluate``.
        embedding_dim: Passed to the model constructor.
        hidden_state_size: Passed to the model constructor.
        rnn_num_layer: Passed to the model constructor.
        learning_rate: Learning rate for the SGD optimizer.
        epoches: Number of training epochs.
        saved_name: File name of the checkpoint under
            ``config.save_model_root``.
        load_previous_model: When true, load the checkpoint first and use
            its validation/test perplexity as the initial best values.

    Returns:
        None. Results are reported through ``print``.
    """
    checkpoint_path = os.path.join(config.save_model_root, saved_name)
    split_names = ["train", "val", "test"]

    def perplexity_of(loss):
        # Same expression the loop applies to every loss value.
        return torch.exp(loss)[0]

    for raw_split, split_name in zip(data, split_names):
        print("There are {} raw data in the {} dataset".format(
            len(raw_split), split_name))

    vocab = load_vocabulary(get_token_vocabulary, get_vocabulary_id_map,
                            [BEGIN], [END], UNK)
    productions = get_all_c99_production_vocabulary()
    terminal_count = len(productions._terminal_id_set)
    print("terminal num:{}".format(terminal_count))

    type_input_step = key_transform(
        GrammarLanguageModelTypeInputMap(productions), "tree")
    pipeline = transforms.Compose([
        type_input_step,
        FlatMap(),
        PadMap(productions.token_num()),
    ])

    datasets = []
    for raw_split in data:
        datasets.append(CCodeDataSet(raw_split, vocab, pipeline))
    for parsed_split, split_name in zip(datasets, split_names):
        print("There are {} parsed data in the {} dataset".format(
            len(parsed_split), split_name))
    train_dataset, valid_dataset, test_dataset = datasets

    loss_function = nn.CrossEntropyLoss(size_average=False,
                                        ignore_index=PAD_TOKEN)
    model = GrammarLanguageModel(
        vocab.vocabulary_size,
        productions.token_num(),
        embedding_dim,
        hidden_state_size,
        rnn_num_layer,
        batch_size,
    )
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    # Best-so-far values start empty unless a checkpoint is restored.
    best_valid_perplexity = None
    best_test_perplexity = None
    if load_previous_model:
        torch_util.load_model(model, checkpoint_path)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        best_valid_perplexity = perplexity_of(valid_loss)
        best_test_perplexity = perplexity_of(test_loss)
        restored_message = (
            "load the previous mode, validation perplexity is {}, test perplexity is :{}"
        )
        print(restored_message.format(best_valid_perplexity,
                                      best_test_perplexity))
        scheduler.step(best_valid_perplexity)

    epoch_message = (
        "epoch {}: train perplexity of {},  valid perplexity of {}, test perplexity of {}"
    )
    for epoch in range(epoches):
        train_loss = train(model, train_dataset, batch_size, loss_function,
                           optimizer)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)

        train_perplexity = perplexity_of(train_loss)
        valid_perplexity = perplexity_of(valid_loss)
        test_perplexity = perplexity_of(test_loss)

        scheduler.step(valid_perplexity)

        # Checkpoint only when validation improves; remember the matching
        # test perplexity from that same epoch.
        improved = (best_valid_perplexity is None
                    or valid_perplexity < best_valid_perplexity)
        if improved:
            best_valid_perplexity = valid_perplexity
            best_test_perplexity = test_perplexity
            torch_util.save_model(model, checkpoint_path)

        print(epoch_message.format(epoch, train_perplexity, valid_perplexity,
                                   test_perplexity))

    summary_message = (
        "The model {} best valid perplexity is {} and test perplexity is {}")
    print(summary_message.format(saved_name, best_valid_perplexity,
                                 best_test_perplexity))