count = cos(v, self.model.wv[ps]) best_syn = ps try: if ps.capitalize() in self.model.vocab: cap = ps.capitalize() if count <= cos(v, self.model.wv[cap].capitalize()): count = cos(v, self.model.wv[cap].capitalize()) best_syn = ps except: #print(f"{ps} not in vocab") pass return best_syn if __name__ == "__main__": # At submission time, this program should run your best predictor (part 6). W2VMODEL_FILENAME = 'GoogleNews-vectors-negative300.bin.gz' predictor = Word2VecSubst(W2VMODEL_FILENAME) for context in read_lexsub_xml(sys.argv[1]): #print(context) # useful for debugging #prediction = wn_simple_lesk_predictor(context) #print(prediction) #prediction = wn_simple_lesk_predictor(context) prediction = predictor.improved_predictor(context) print("{}.{} {} :: {}".format(context.lemma, context.pos, context.cid, prediction))
def __init__(self, filename, maxlen=80):
    """Load the lexsub XML contexts and set up a BERT tokenizer.

    Args:
        filename: path to a lexsub XML file readable by read_lexsub_xml.
        maxlen: maximum token sequence length used downstream (default 80).
    """
    # NOTE(review): downloads/loads the pretrained tokenizer; requires the
    # `transformers` package and (on first use) network access.
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.maxlen = maxlen
    # Materialize the generator once so contexts can be re-iterated/indexed.
    # (list(...) replaces the original identity comprehension `[i for i in ...]`.)
    self.file = list(read_lexsub_xml(filename))