Пример #1
0
# Model / training hyper-parameters.
MAX_LENGTH = 50
EMB_SIZE = 256
HIDDEN_SIZE = 128
DROPOUT = 0.3
DROPOUT_REC = 0.3
BATCH_SIZE = 32
EPOCHS = 10

# Paths of the resources needed to build the mappings among the various
# sense representations.
path_xml = '../SemCor/semcor.data.xml'
path_inst2sk = '../SemCor/semcor.gold.key.txt'
path_bn2wn = '../resources/babelnet2wordnet.tsv'
path_bn2lex = '../resources/babelnet2lexnames.tsv'
path_bn2wnd = '../resources/babelnet2wndomains.tsv'

# Collection of the dictionaries that realise those mappings.
inst2sk_dict = get_dictionary(path_inst2sk, 0)
wn2bn_dict = get_dictionary(path_bn2wn, 1)
bn2lex_dict = get_dictionary(path_bn2lex, 0)
bn2wnDom_dict = get_bn2wnDomains(path_bn2wnd)

# The same .xml file is parsed once per label granularity; the input
# sentences are kept only from the first pass.

# Fine-grained (BN) data.
inputs, labels_BN, bnIds = parse(
    path_xml,
    inst2sk_dict,
    wn2bn_dict,
    bn2lex_dict,
    'BN',
)
# Coarse-grained (WND) data.
_, labels_WND, wndIds = parse(
    path_xml,
    inst2sk_dict,
    wn2bn_dict,
    bn2wnDom_dict,
    'WND',
)
# Coarse-grained (LEX) data.
_, labels_LEX, lexIds = parse(
    path_xml,
    inst2sk_dict,
    wn2bn_dict,
    bn2lex_dict,
    'LN',
)
Пример #2
0
def predict_lexicographer(input_path: str, output_path: str,
                          resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <lexicographerId>" format (e.g. "d000.s000.t000 noun.animal").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Implementation notes: the output file is opened only once the
    predictions are available and is managed by a ``with`` block, so the
    handle is always closed and a failure while parsing or loading the model
    does not leave a truncated output file behind.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """

    batch_size = 32
    max_len = 50

    inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

    # NOTE: pickle.load can execute arbitrary code; resources_path must only
    # contain trusted files.
    with open(resources_path + '/vocabulary', 'rb') as f:
        vocFG = pickle.load(f)
    with open(resources_path + '/vocabularyWND', 'rb') as f:
        vocWND = pickle.load(f)
    with open(resources_path + '/vocabularyLEX', 'rb') as f:
        vocLEX = pickle.load(f)

    # retrieving mappings
    wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
    bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)

    # The output sizes of the network are the sizes of the three vocabularies.
    model = MultitaskBiLSTM((max_len, ), len(vocFG), len(vocWND), len(vocLEX),
                            HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model.load_weights(resources_path + '/mod082_weights.h5')

    # Shaping inputs, pos, flags and ids before feeding the network.
    inputs = crop(inputs, max_len)
    pos = crop(pos, max_len)
    flags = crop(flags, max_len)
    ids = crop(ids, max_len)
    pos = ext(pos, max_len, batch_size, 'pos')
    flags = ext(flags, max_len, batch_size, 'flags')
    ids = ext(ids, max_len, batch_size, 'ids')
    inputs = ext(inputs, max_len, batch_size, 'inputs')
    inputs, _ = pre_ELMO(inputs, max_len)

    # Only the third output of the model is used here (presumably the
    # lexicographer head, given the vocabulary order passed above).
    predictions = model.predict(inputs, batch_size=batch_size, verbose=True)[2]

    with open(output_path, 'w', encoding='utf-8') as out_file:
        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag = False, nothing to disambiguate
                    continue
                # candidate synsets, mapped to lexicographer ids and
                # restricted with the LEX vocabulary
                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)
                pox_synsets = convert_bn2lex(pox_synsets, bn2lex_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, vocLEX)
                if len(pox_synsets) == 0:
                    # backoff strategy: no usable candidate is left
                    best = get_MFS(inputs[r][c], wn2bn_dict)
                    best = convert_bn2lex([best], bn2lex_dict)[0]
                else:
                    best = get_best(predictions[r][c], pox_synsets, vocLEX)
                out_file.write(ids[r][c] + ' ' + best + '\n')
Пример #3
0
def predict_babelnet_mod096_MODIFIED(input_path: str, output_path: str,
                                     resources_path: str) -> None:
    """
    Predict one sense per flagged word and write "<id> <best>" lines.

    The multitask model yields three outputs (unpacked as BN, WND and LEX
    predictions). For every word flagged for disambiguation the best LEX and
    WND categories are chosen first, then the candidate synsets are filtered
    with both of them before the final choice is made among the survivors.
    When no candidate is left, the result of ``get_MFS`` is used as backoff.

    The output file is opened only once the predictions are available and is
    managed by a ``with`` block, so the handle is always closed and a failure
    while parsing or loading the model does not leave a truncated file.

    :param input_path: path of the XML file to predict.
    :param output_path: path of the file the predictions are written to.
    :param resources_path: folder holding the vocabularies, the mapping
        files and the model weights.
    :return: None
    """

    batch_size = 32
    max_len = 50

    inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

    # NOTE: pickle.load can execute arbitrary code; resources_path must only
    # contain trusted files.
    with open(resources_path + '/OMSTI_voc_LEX', 'rb') as f:
        voc_LEX = pickle.load(f)
    with open(resources_path + '/OMSTI_voc_BN', 'rb') as f:
        voc_BN = pickle.load(f)
    with open(resources_path + '/OMSTI_voc_WND', 'rb') as f:
        voc_WND = pickle.load(f)

    # retrieving mappings
    wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
    bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
    bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

    model = MultitaskBiLSTM((max_len, ), len(voc_BN), len(voc_WND),
                            len(voc_LEX), HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model.load_weights(resources_path + '/mod_OMSTI_082_weights.h5')

    # Shaping inputs, pos, flags and ids before feeding the network.
    inputs = crop(inputs, max_len)
    pos = crop(pos, max_len)
    flags = crop(flags, max_len)
    ids = crop(ids, max_len)
    inputs = ext(inputs, max_len, batch_size, 'inputs')
    pos = ext(pos, max_len, batch_size, 'pos')
    flags = ext(flags, max_len, batch_size, 'flags')
    ids = ext(ids, max_len, batch_size, 'ids')
    inputs, _ = pre_ELMO(inputs, max_len)

    pred_BN, pred_WND, pred_LEX = model.predict(inputs,
                                                batch_size=batch_size,
                                                verbose=True)

    with open(output_path, 'w', encoding='utf-8') as out_file:
        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag = False, nothing to disambiguate
                    continue

                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)

                # --- best LEX category ---
                pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)

                if len(pox_LEX) == 0:
                    # backoff strategy
                    bestLEX = get_MFS(inputs[r][c], wn2bn_dict)
                    bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                else:
                    bestLEX = get_best(pred_LEX[r][c], pox_LEX, voc_LEX)

                # --- best WND category ---
                pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                pox_WND = restrict_index_synsets(pox_WND, voc_WND)

                if len(pox_WND) == 0:
                    # backoff strategy
                    bestWND = get_MFS(inputs[r][c], wn2bn_dict)
                    bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                else:
                    bestWND = get_best(pred_WND[r][c], pox_WND, voc_WND)

                # --- candidates restricted with LEX, WND and vocabulary ---
                pox_synsets = filter_with_LEX(pox_synsets, bestLEX,
                                              bn2lex_dict)
                pox_synsets = filter_with_WND(pox_synsets, bestWND,
                                              bn2wnd_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)

                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)  # backoff
                else:
                    best = get_best(pred_BN[r][c], pox_synsets, voc_BN)
                out_file.write(ids[r][c] + ' ' + best + '\n')
Пример #4
0
def predict_babelnet(input_path: str, output_path: str,
                     resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Implementation notes: the output file is opened only once the
    predictions are available and is managed by a ``with`` block, so the
    handle is always closed and a failure while parsing or loading the model
    does not leave a truncated output file behind.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    batch_size = 32
    max_len = 50

    # parsing xml (inputs: list of lists of lemmas, pos: list of lists of POS,
    # flags: list of lists of boolean (True: to disambiguate, False otherwise)
    # and ids: list of lists of ids)
    inputs, pos, flags, ids = blind_parsing(input_path)

    # retrieving vocabularies
    # NOTE: pickle.load can execute arbitrary code; resources_path must only
    # contain trusted files.
    with open(resources_path + '/OMSTI_voc_LEX', 'rb') as f:
        voc_LEX = pickle.load(f)
    with open(resources_path + '/OMSTI_voc_BN', 'rb') as f:
        voc_BN = pickle.load(f)
    with open(resources_path + '/OMSTI_voc_WND', 'rb') as f:
        voc_WND = pickle.load(f)

    # retrieving mappings
    wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
    bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
    bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

    # initialize the model and load its weights
    model = MultitaskBiLSTM((max_len, ), len(voc_BN), len(voc_WND),
                            len(voc_LEX), HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model.load_weights(resources_path + '/mod_OMSTI_082_weights.h5')

    # Shaping the inputs, pos, flags and ids to match ELMo and network requirements
    inputs = crop(inputs, max_len)
    pos = crop(pos, max_len)
    flags = crop(flags, max_len)
    ids = crop(ids, max_len)
    inputs = ext(inputs, max_len, batch_size, 'inputs')
    pos = ext(pos, max_len, batch_size, 'pos')
    flags = ext(flags, max_len, batch_size, 'flags')
    ids = ext(ids, max_len, batch_size, 'ids')
    inputs, _ = pre_ELMO(inputs, max_len)

    pred_BN, pred_WND, pred_LEX = model.predict(inputs,
                                                batch_size=batch_size,
                                                verbose=True)  # predict

    with open(output_path, 'w', encoding='utf-8') as out_file:
        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag = False, nothing to disambiguate
                    continue

                # retrieve candidate synsets
                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)

                # restrict candidate LEX categories to only those belonging
                # to the vocabulary
                pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)

                if len(pox_LEX) == 0:  # if none is present
                    # backoff strategy, then its LEX category
                    bestLEX = get_MFS(inputs[r][c], wn2bn_dict)
                    bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                else:
                    # best LEX category
                    bestLEX = get_best(pred_LEX[r][c], pox_LEX, voc_LEX)

                # restrict candidate WND categories to only those belonging
                # to the vocabulary
                pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                pox_WND = restrict_index_synsets(pox_WND, voc_WND)

                if len(pox_WND) == 0:
                    # backoff strategy, then its WND category
                    bestWND = get_MFS(inputs[r][c], wn2bn_dict)
                    bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                else:
                    # best WND category
                    bestWND = get_best(pred_WND[r][c], pox_WND, voc_WND)

                # candidate synsets restricted with LEX, then with WND, then
                # with the BN vocabulary
                pox_synsets = filter_with_LEX(pox_synsets, bestLEX,
                                              bn2lex_dict)
                pox_synsets = filter_with_WND(pox_synsets, bestWND,
                                              bn2wnd_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)

                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)  # backoff strategy
                else:
                    # best synset as a BabelNet id
                    best = get_best(pred_BN[r][c], pox_synsets, voc_BN)
                # write on the external file matching the format required
                out_file.write(ids[r][c] + ' ' + best + '\n')
Пример #5
0
def predict_babelnet_mod095(input_path: str, output_path: str,
                            resources_path: str) -> None:
    """
    Predict one sense per flagged word with a three-stage model cascade.

    Stage 1 (``AttentionBiLSTM``) predicts a LEX category per token; the
    chosen LEX indices are fed, together with the inputs, to stage 2
    (``Model095A``), which predicts WND categories; the chosen WND indices
    are fed to stage 3 (``Model095B``), whose output is used to pick the
    final sense. Lines are written as "<id> <best>".

    For tokens flagged for disambiguation the category is chosen among the
    candidates of the word (with ``get_MFS`` as backoff); for the other
    tokens the argmax of the prediction is used.

    The output file is opened only once the final predictions are available
    and is managed by a ``with`` block, so the handle is always closed and a
    failure in an earlier stage does not leave a truncated file.

    :param input_path: path of the XML file to predict.
    :param output_path: path of the file the predictions are written to.
    :param resources_path: folder holding the vocabularies, the mapping
        files and the weights of the three models.
    :return: None
    """

    batch_size = 32
    max_len = 50

    inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

    # NOTE: pickle.load can execute arbitrary code; resources_path must only
    # contain trusted files.
    with open(resources_path + '/vocabularyLEX', 'rb') as f:
        voc_LEX = pickle.load(f)
    with open(resources_path + '/vocabulary', 'rb') as f:
        voc_BN = pickle.load(f)
    with open(resources_path + '/vocabularyWND', 'rb') as f:
        voc_WND = pickle.load(f)

    # retrieving mappings
    wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
    bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
    bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

    # ---- stage 1: inputs -> LEX ----
    model_in2lex = AttentionBiLSTM((max_len, ), len(voc_LEX), EMB_SIZE,
                                   HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model_in2lex.load_weights(resources_path + '/mod06_LEX_weights.h5')

    inputs = crop(inputs, max_len)
    flags = crop(flags, max_len)
    ids = crop(ids, max_len)
    pos = crop(pos, max_len)
    flags = ext(flags, max_len, batch_size, 'flags')
    ids = ext(ids, max_len, batch_size, 'ids')
    pos = ext(pos, max_len, batch_size, 'pos')
    inputs = ext(inputs, max_len, batch_size, 'inputs')
    # Pad the flags with False so every row has max_len entries.
    flags = pad_sequences(flags,
                          maxlen=max_len,
                          dtype='bool',
                          padding='post',
                          value=False)
    inputs, _ = pre_ELMO(inputs, max_len)

    preds_LEX = model_in2lex.predict(inputs,
                                     batch_size=batch_size,
                                     verbose=True)

    inputs_LEX = []

    for r, sentence in enumerate(flags):  # for each sentence
        inputs_LEX_part = []
        for c, word in enumerate(sentence):  # for each word
            if word:  # flag = True, that word must be disambiguated

                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)

                pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)

                if len(pox_LEX) == 0:
                    # backoff strategy
                    bestLEX = get_MFS(inputs[r][c], wn2bn_dict)
                    bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                else:
                    bestLEX = get_best(preds_LEX[r][c], pox_LEX,
                                       voc_LEX)  # best LEX category

                # NOTE(review): .index raises ValueError if the backoff
                # category is not in voc_LEX - confirm this cannot happen.
                bestLEX = voc_LEX.index(bestLEX)
            else:
                bestLEX = np.argmax(preds_LEX[r][c])
            inputs_LEX_part.append(bestLEX)
        inputs_LEX.append(inputs_LEX_part)

    inputs_LEX = np.asarray(inputs_LEX)

    # ---- stage 2: inputs + LEX -> WND ----
    model_inLex2wn = Model095A([(max_len, ), inputs_LEX.shape[1:2]],
                               len(voc_LEX), len(voc_WND), EMB_SIZE,
                               HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model_inLex2wn.load_weights(resources_path + '/mod095A_weights.h5')

    preds_WND = model_inLex2wn.predict([inputs, inputs_LEX],
                                       batch_size=batch_size,
                                       verbose=True)

    inputs_WND = []

    for r, sentence in enumerate(flags):  # for each sentence
        inputs_WND_part = []
        for c, word in enumerate(sentence):  # for each word
            if word:  # flag = True, that word must be disambiguated

                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)

                pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                pox_WND = restrict_index_synsets(pox_WND, voc_WND)

                if len(pox_WND) == 0:
                    # backoff strategy; the WND converter is used with the
                    # WND mapping, as in the other predictors of this file
                    bestWND = get_MFS(inputs[r][c], wn2bn_dict)
                    bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                else:
                    bestWND = get_best(preds_WND[r][c], pox_WND,
                                       voc_WND)  # best WND category

                # NOTE(review): .index raises ValueError if the backoff
                # category is not in voc_WND - confirm this cannot happen.
                bestWND = voc_WND.index(bestWND)
            else:
                bestWND = np.argmax(preds_WND[r][c])
            inputs_WND_part.append(bestWND)
        inputs_WND.append(inputs_WND_part)

    inputs_WND = np.asarray(inputs_WND)

    # ---- stage 3: inputs + WND -> BN ----
    # The second input shape is taken from the array actually fed to the
    # model (inputs_WND is built row by row exactly like inputs_LEX).
    model_inWnd2bn = Model095B([(max_len, ), inputs_WND.shape[1:2]],
                               len(voc_WND), len(voc_BN), EMB_SIZE,
                               HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
    model_inWnd2bn.load_weights(resources_path + '/mod095B_weights.h5')

    predictions = model_inWnd2bn.predict([inputs, inputs_WND],
                                         batch_size=batch_size,
                                         verbose=True)

    with open(output_path, 'w', encoding='utf-8') as out_file:
        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag = False, nothing to disambiguate
                    continue
                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c],
                                             wn2bn_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)
                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)  # backoff
                else:
                    best = get_best(predictions[r][c], pox_synsets, voc_BN)
                out_file.write(ids[r][c] + ' ' + best + '\n')