def filter_with_LEX(set_syn, bestLex, bn2lex_dict):
    """Return the candidates whose converted label equals ``bestLex``.

    The candidates are de-duplicated through a ``set`` first, so the order of
    the returned items is not tied to the order of ``set_syn``.  The unique
    candidates are passed to ``convert_bn2lex`` and a candidate is kept when
    the entry at its position in that result compares equal to ``bestLex``.

    :param set_syn: iterable of hashable candidates.
    :param bestLex: the label the converted candidates are compared against.
    :param bn2lex_dict: mapping forwarded unchanged to ``convert_bn2lex``.
    :return: list with the matching candidates (possibly empty).
    """
    unique_syn = list(set(set_syn))
    lex_labels = convert_bn2lex(unique_syn, bn2lex_dict)
    # Index back into the de-duplicated list so that position k of the
    # converted labels is paired with candidate k.
    return [
        unique_syn[position]
        for position, label in enumerate(lex_labels)
        if label == bestLex
    ]
def predict_lexicographer(input_path: str, output_path: str, resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <lexicographerId>" format (e.g. "d000.s000.t000 noun.animal").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Files read from ``resources_path``: ``vocabulary``, ``vocabularyWND``,
    ``vocabularyLEX`` (pickles), ``babelnet2wordnet.tsv``,
    ``babelnet2lexnames.tsv`` and ``mod082_weights.h5``.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    batch_size = 32
    max_len = 50

    # The output file is still opened (and truncated) before anything else,
    # but through a context manager so the handle is released even when
    # parsing, model loading or prediction raises.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must only ever come from the trusted resources folder.
        with open(resources_path + '/vocabulary', 'rb') as f:
            vocFG = pickle.load(f)
        with open(resources_path + '/vocabularyWND', 'rb') as f:
            vocWND = pickle.load(f)
        with open(resources_path + '/vocabularyLEX', 'rb') as f:
            vocLEX = pickle.load(f)

        wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
        bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)

        model = MultitaskBiLSTM((max_len, ), len(vocFG), len(vocWND), len(vocLEX),
                                HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
        model.load_weights(resources_path + '/mod082_weights.h5')

        # Shape every parallel structure the same way (crop, then ext) so the
        # (r, c) indices used below address the same token in all of them.
        inputs = crop(inputs, max_len)
        pos = crop(pos, max_len)
        flags = crop(flags, max_len)
        ids = crop(ids, max_len)
        pos = ext(pos, max_len, batch_size, 'pos')
        flags = ext(flags, max_len, batch_size, 'flags')
        ids = ext(ids, max_len, batch_size, 'ids')
        inputs = ext(inputs, max_len, batch_size, 'inputs')
        inputs, _ = pre_ELMO(inputs, max_len)

        # Only the third output of the model is used; it is scored against vocLEX.
        predictions = model.predict(inputs, batch_size=batch_size, verbose=True)[2]

        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag is False: nothing to disambiguate
                    continue
                pox_lex = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)
                pox_lex = convert_bn2lex(pox_lex, bn2lex_dict)
                pox_lex = restrict_index_synsets(pox_lex, vocLEX)
                if len(pox_lex) == 0:
                    # No candidate survives the vocabulary restriction: back off.
                    best = get_MFS(inputs[r][c], wn2bn_dict)
                    best = convert_bn2lex([best], bn2lex_dict)[0]
                else:
                    best = get_best(predictions[r][c], pox_lex, vocLEX)
                out_file.write(ids[r][c] + ' ' + best + '\n')
def predict_babelnet(input_path: str, output_path: str, resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Files read from ``resources_path``: ``OMSTI_voc_LEX``, ``OMSTI_voc_BN``,
    ``OMSTI_voc_WND`` (pickles), ``babelnet2wordnet.tsv``,
    ``babelnet2lexnames.tsv``, ``babelnet2wndomains.tsv`` and
    ``mod_OMSTI_082_weights.h5``.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    batch_size = 32
    max_len = 50

    # The output file is still opened (and truncated) before anything else,
    # but through a context manager so the handle is released even when
    # parsing, model loading or prediction raises.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        # parsing xml (inputs: list of lists of lemmas, pos: list of lists of POS,
        # flags: list of lists of boolean (True: to disambiguate, False otherwise)
        # and ids: list of lists of ids)
        inputs, pos, flags, ids = blind_parsing(input_path)

        # retrieving vocabularies
        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must only ever come from the trusted resources folder.
        with open(resources_path + '/OMSTI_voc_LEX', 'rb') as f:
            voc_LEX = pickle.load(f)
        with open(resources_path + '/OMSTI_voc_BN', 'rb') as f:
            voc_BN = pickle.load(f)
        with open(resources_path + '/OMSTI_voc_WND', 'rb') as f:
            voc_WND = pickle.load(f)

        # retrieving mappings
        wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
        bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
        bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

        # initialize the model and load its weights
        model = MultitaskBiLSTM((max_len, ), len(voc_BN), len(voc_WND), len(voc_LEX),
                                HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
        model.load_weights(resources_path + '/mod_OMSTI_082_weights.h5')

        # Shaping the inputs, pos, flags and ids to match ELMo and network requirements
        inputs = crop(inputs, max_len)
        pos = crop(pos, max_len)
        flags = crop(flags, max_len)
        ids = crop(ids, max_len)
        inputs = ext(inputs, max_len, batch_size, 'inputs')
        pos = ext(pos, max_len, batch_size, 'pos')
        flags = ext(flags, max_len, batch_size, 'flags')
        ids = ext(ids, max_len, batch_size, 'ids')
        inputs, _ = pre_ELMO(inputs, max_len)

        # predict: one output per task, in this order
        pred_BN, pred_WND, pred_LEX = model.predict(inputs, batch_size=batch_size, verbose=True)

        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag is False: nothing to disambiguate
                    continue

                # retrieve candidate synsets
                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)

                # --- best LEX category ---
                pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                # restrict candidates to only those belonging to the vocabulary
                pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)
                if len(pox_LEX) == 0:  # if none is present
                    bestLEX = get_MFS(inputs[r][c], wn2bn_dict)  # back-off strategy
                    # LEX category of the back-off result
                    bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                else:
                    bestLEX = get_best(pred_LEX[r][c], pox_LEX, voc_LEX)

                # --- best WND category ---
                pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                # restrict candidates to only those belonging to the vocabulary
                pox_WND = restrict_index_synsets(pox_WND, voc_WND)
                if len(pox_WND) == 0:
                    bestWND = get_MFS(inputs[r][c], wn2bn_dict)  # back-off strategy
                    # WND category of the back-off result
                    bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                else:
                    bestWND = get_best(pred_WND[r][c], pox_WND, voc_WND)

                # --- best synset ---
                # candidates narrowed first by the chosen LEX, then by the chosen WND
                pox_synsets = filter_with_LEX(pox_synsets, bestLEX, bn2lex_dict)
                pox_synsets = filter_with_WND(pox_synsets, bestWND, bn2wnd_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)
                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)  # back-off strategy
                else:
                    best = get_best(pred_BN[r][c], pox_synsets, voc_BN)

                # write on the external file matching the format required
                out_file.write(ids[r][c] + ' ' + best + '\n')
def predict_babelnet_mod096_MODIFIED(input_path: str, output_path: str, resources_path: str) -> None:
    """Write one "<id> <prediction>" line per flagged token of ``input_path``.

    A ``MultitaskBiLSTM`` is built, its weights are loaded from
    ``mod_OMSTI_082_weights.h5`` and its three outputs (unpacked as BN, WND
    and LEX predictions) are used as follows for every flagged token: a best
    LEX and a best WND label are chosen among the token's candidates, the
    candidate synsets are filtered with both labels and restricted to the BN
    vocabulary, and the best remaining candidate is written.  Whenever a
    candidate list is empty the result of ``get_MFS`` is used instead.

    Files read from ``resources_path``: ``OMSTI_voc_LEX``, ``OMSTI_voc_BN``,
    ``OMSTI_voc_WND`` (pickles), ``babelnet2wordnet.tsv``,
    ``babelnet2lexnames.tsv``, ``babelnet2wndomains.tsv`` and the weights file.

    :param input_path: path of the XML file handed to ``blind_parsing``.
    :param output_path: path of the predictions file (created/truncated).
    :param resources_path: folder holding vocabularies, mappings and weights.
    :return: None
    """
    batch_size = 32
    max_len = 50

    # The output file is still opened (and truncated) before anything else,
    # but through a context manager so the handle is released even when
    # parsing, model loading or prediction raises.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must only ever come from the trusted resources folder.
        with open(resources_path + '/OMSTI_voc_LEX', 'rb') as f:
            voc_LEX = pickle.load(f)
        with open(resources_path + '/OMSTI_voc_BN', 'rb') as f:
            voc_BN = pickle.load(f)
        with open(resources_path + '/OMSTI_voc_WND', 'rb') as f:
            voc_WND = pickle.load(f)

        wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
        bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
        bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

        model = MultitaskBiLSTM((max_len, ), len(voc_BN), len(voc_WND), len(voc_LEX),
                                HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
        model.load_weights(resources_path + '/mod_OMSTI_082_weights.h5')

        # Shape every parallel structure the same way (crop, then ext) so the
        # (r, c) indices used below address the same token in all of them.
        inputs = crop(inputs, max_len)
        pos = crop(pos, max_len)
        flags = crop(flags, max_len)
        ids = crop(ids, max_len)
        inputs = ext(inputs, max_len, batch_size, 'inputs')
        pos = ext(pos, max_len, batch_size, 'pos')
        flags = ext(flags, max_len, batch_size, 'flags')
        ids = ext(ids, max_len, batch_size, 'ids')
        inputs, _ = pre_ELMO(inputs, max_len)

        pred_BN, pred_WND, pred_LEX = model.predict(inputs, batch_size=batch_size, verbose=True)

        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag is False: nothing to disambiguate
                    continue

                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)

                # best LEX category
                pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)
                if len(pox_LEX) == 0:
                    bestLEX = get_MFS(inputs[r][c], wn2bn_dict)
                    bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                else:
                    bestLEX = get_best(pred_LEX[r][c], pox_LEX, voc_LEX)

                # best WND category
                pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                pox_WND = restrict_index_synsets(pox_WND, voc_WND)
                if len(pox_WND) == 0:
                    bestWND = get_MFS(inputs[r][c], wn2bn_dict)
                    bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                else:
                    bestWND = get_best(pred_WND[r][c], pox_WND, voc_WND)

                # candidate synsets restricted with LEX, then WND, then the BN vocabulary
                pox_synsets = filter_with_LEX(pox_synsets, bestLEX, bn2lex_dict)
                pox_synsets = filter_with_WND(pox_synsets, bestWND, bn2wnd_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)
                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)
                else:
                    best = get_best(pred_BN[r][c], pox_synsets, voc_BN)

                # write best on the output file
                out_file.write(ids[r][c] + ' ' + best + '\n')
def predict_babelnet_mod095(input_path: str, output_path: str, resources_path: str) -> None:
    """Write one "<id> <prediction>" line per flagged token using three chained models.

    Stage 1: ``AttentionBiLSTM`` (weights ``mod06_LEX_weights.h5``) predicts
    over ``voc_LEX``; a LEX index is chosen for every token.
    Stage 2: ``Model095A`` (weights ``mod095A_weights.h5``) receives the
    inputs plus the LEX indices and predicts over ``voc_WND``; a WND index is
    chosen for every token.
    Stage 3: ``Model095B`` (weights ``mod095B_weights.h5``) receives the
    inputs plus the WND indices; for each flagged token the best candidate
    restricted to ``voc_BN`` is written to ``output_path``.

    In stages 1 and 2, flagged tokens take the best label among their own
    candidates (or the ``get_MFS`` back-off when no candidate is in the
    vocabulary); unflagged tokens take the plain argmax of the prediction.

    Files read from ``resources_path``: ``vocabularyLEX``, ``vocabulary``,
    ``vocabularyWND`` (pickles), ``babelnet2wordnet.tsv``,
    ``babelnet2lexnames.tsv``, ``babelnet2wndomains.tsv`` and the three
    weights files.

    :param input_path: path of the XML file handed to ``blind_parsing``.
    :param output_path: path of the predictions file (created/truncated).
    :param resources_path: folder holding vocabularies, mappings and weights.
    :return: None
    """
    batch_size = 32
    max_len = 50

    # The output file is still opened (and truncated) before anything else,
    # but through a context manager so the handle is released even when
    # parsing, model loading or prediction raises.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        inputs, pos, flags, ids = blind_parsing(input_path)  # parsing xml

        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must only ever come from the trusted resources folder.
        with open(resources_path + '/vocabularyLEX', 'rb') as f:
            voc_LEX = pickle.load(f)
        with open(resources_path + '/vocabulary', 'rb') as f:
            voc_BN = pickle.load(f)
        with open(resources_path + '/vocabularyWND', 'rb') as f:
            voc_WND = pickle.load(f)

        wn2bn_dict = get_dictionary(resources_path + '/babelnet2wordnet.tsv', 1)
        bn2lex_dict = get_dictionary(resources_path + '/babelnet2lexnames.tsv', 0)
        bn2wnd_dict = get_dictionary(resources_path + '/babelnet2wndomains.tsv', 0)

        # ---------- stage 1: inputs -> LEX ----------
        model_in2lex = AttentionBiLSTM((max_len, ), len(voc_LEX), EMB_SIZE, HIDDEN_SIZE,
                                       DROPOUT, DROPOUT_REC)
        model_in2lex.load_weights(resources_path + '/mod06_LEX_weights.h5')

        inputs = crop(inputs, max_len)
        flags = crop(flags, max_len)
        ids = crop(ids, max_len)
        pos = crop(pos, max_len)
        flags = ext(flags, max_len, batch_size, 'flags')
        ids = ext(ids, max_len, batch_size, 'ids')
        pos = ext(pos, max_len, batch_size, 'pos')
        inputs = ext(inputs, max_len, batch_size, 'inputs')
        # Every flags row is padded with False up to max_len, so the per-token
        # index lists built below all have max_len entries per sentence.
        flags = pad_sequences(flags, maxlen=max_len, dtype='bool', padding='post', value=False)
        inputs, _ = pre_ELMO(inputs, max_len)

        preds_LEX = model_in2lex.predict(inputs, batch_size=batch_size, verbose=True)

        inputs_LEX = []
        for r, sentence in enumerate(flags):  # for each sentence
            inputs_LEX_part = []
            for c, word in enumerate(sentence):  # for each word
                if word:  # flag = True, that word must be disambiguated
                    pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)
                    pox_LEX = convert_bn2lex(pox_synsets, bn2lex_dict)
                    pox_LEX = restrict_index_synsets(pox_LEX, voc_LEX)
                    if len(pox_LEX) == 0:
                        bestLEX = get_MFS(inputs[r][c], wn2bn_dict)
                        bestLEX = convert_bn2lex([bestLEX], bn2lex_dict)[0]
                    else:
                        bestLEX = get_best(preds_LEX[r][c], pox_LEX, voc_LEX)  # best LEX category
                    # The next model consumes indices, not labels.
                    # NOTE(review): presumably raises ValueError when the back-off
                    # label is not in voc_LEX; confirm voc_LEX covers it.
                    bestLEX = voc_LEX.index(bestLEX)
                else:
                    bestLEX = np.argmax(preds_LEX[r][c])
                inputs_LEX_part.append(bestLEX)
            inputs_LEX.append(inputs_LEX_part)
        inputs_LEX = np.asarray(inputs_LEX)

        # ---------- stage 2: inputs + LEX -> WND ----------
        model_inLex2wn = Model095A([(max_len, ), inputs_LEX.shape[1:2]], len(voc_LEX), len(voc_WND),
                                   EMB_SIZE, HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
        model_inLex2wn.load_weights(resources_path + '/mod095A_weights.h5')
        preds_WND = model_inLex2wn.predict([inputs, inputs_LEX], batch_size=batch_size, verbose=True)

        inputs_WND = []
        for r, sentence in enumerate(flags):  # for each sentence
            inputs_WND_part = []
            for c, word in enumerate(sentence):  # for each word
                if word:  # flag = True, that word must be disambiguated
                    pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)
                    pox_WND = convert_bn2wnd(pox_synsets, bn2wnd_dict)
                    pox_WND = restrict_index_synsets(pox_WND, voc_WND)
                    if len(pox_WND) == 0:
                        bestWND = get_MFS(inputs[r][c], wn2bn_dict)
                        # Fixed: the back-off synset is mapped with the WND
                        # converter (this used to call convert_bn2lex on the WND
                        # mapping), matching the candidate conversion above.
                        bestWND = convert_bn2wnd([bestWND], bn2wnd_dict)[0]
                    else:
                        bestWND = get_best(preds_WND[r][c], pox_WND, voc_WND)  # best WND category
                    # The next model consumes indices, not labels.
                    # NOTE(review): presumably raises ValueError when the back-off
                    # label is not in voc_WND; confirm voc_WND covers it.
                    bestWND = voc_WND.index(bestWND)
                else:
                    bestWND = np.argmax(preds_WND[r][c])
                inputs_WND_part.append(bestWND)
            inputs_WND.append(inputs_WND_part)
        inputs_WND = np.asarray(inputs_WND)

        # ---------- stage 3: inputs + WND -> BN ----------
        # inputs_LEX and inputs_WND are built by the same iteration over flags
        # (one entry per token), so the second input shape is taken from
        # inputs_LEX exactly as before.
        model_inWnd2bn = Model095B([(max_len, ), inputs_LEX.shape[1:2]], len(voc_WND), len(voc_BN),
                                   EMB_SIZE, HIDDEN_SIZE, DROPOUT, DROPOUT_REC)
        model_inWnd2bn.load_weights(resources_path + '/mod095B_weights.h5')
        predictions = model_inWnd2bn.predict([inputs, inputs_WND], batch_size=batch_size, verbose=True)

        for r, sentence in enumerate(flags):  # for each sentence
            for c, word in enumerate(sentence):  # for each word
                if not word:  # flag is False: nothing to disambiguate
                    continue
                pox_synsets = get_synsetsIds(inputs[r][c], pos[r][c], wn2bn_dict)
                pox_synsets = restrict_index_synsets(pox_synsets, voc_BN)
                if len(pox_synsets) == 0:
                    best = get_MFS(inputs[r][c], wn2bn_dict)
                else:
                    best = get_best(predictions[r][c], pox_synsets, voc_BN)
                # write best on the output file
                out_file.write(ids[r][c] + ' ' + best + '\n')