# NOTE: these functions come from larger modules; the imports below reflect what
# the code relies on and are an assumption about the surrounding files:
#
#   import argparse
#   from csv import QUOTE_NONE
#   from sys import stderr
#   from traceback import format_exc
#   import numpy as np
#   from pandas import read_csv
#   from gensim.models import word2vec
#   import pbar                      # project-local progress-bar helper
#   from sensegram import SenseGram  # project-local sense-vector model
#   from wsd import WSD, WSDdep      # project-local disambiguation models


def run(test_file, sense, context, sense_dep, context_dep, output,
        wsd_method='sim', filter_ctx=2, lowercase=False, ignore_case=False):
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    vs_dep = SenseGram.load_word2vec_format(sense_dep, binary=True)
    vc_dep = word2vec.Word2Vec.load_word2vec_format(context_dep, binary=False)
    wsd_model = WSDdep(vs, vc, vs_dep, vc_dep, method=wsd_method,
                       filter_ctx=filter_ctx, ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file, encoding="utf-8", delimiter="\t",
                      dtype={'predict_related': object,
                             'gold_sense_ids': object,
                             'predict_sense_ids': object,
                             'deps': object})
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")
    pb = pbar.Pbar(rows_count, 100)

    uncovered_words = []  # target words for which the sense model has zero senses

    print("Start prediction over " + test_file)
    pb.start()
    reader = reader.fillna('')
    for i, row in reader.iterrows():
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        # keep only the dependency features known to the context model
        if row.deps == "ParseError" or row.deps == "":
            deps = []
        else:
            deps = [dep for dep in row.deps.split() if dep in vc.vocab]
        prediction = wsd_model.dis(ctx, row.target, start, end, deps)
        if prediction:
            sense, sense_scores = prediction
            reader.at[i, 'predict_sense_ids'] = sense.split("#")[1]
            # optionally fill the predict_related column with the nearest neighbours:
            # neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            # neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            # reader.at[i, 'predict_related'] = ",".join(neighbours)
        else:
            uncovered_words.append(row.target)
        pb.update(i)  # update the progress bar for covered and uncovered words alike
    pb.finish()

    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8",
                  index=False, quoting=QUOTE_NONE)
    print("Saved predictions to " + output)
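# Illustrative test-set row for the dependency-aware runner above (tab-separated;
# all values are made up, and the exact form of the `deps` features depends on
# the vocabulary of the dependency-based context model):
#
#   context_id  target  target_pos  target_position  gold_sense_ids  predict_sense_ids  golden_related  predict_related  context                         deps
#   1           bank    NN          25,29            1                                                                   I deposited money at the bank.  money/dobj deposited/prep_at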
def run(test_file, sense, context, output, wsd_method='sim', filter_ctx=2,
        lowercase=False, ignore_case=False):
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    wsd_model = WSD(vs, vc, method=wsd_method, filter_ctx=filter_ctx,
                    ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file, encoding="utf-8", delimiter="\t",
                      dtype={'predict_related': object,
                             'gold_sense_ids': object,
                             'predict_sense_ids': object})
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")
    pb = pbar.Pbar(rows_count, 100)

    uncovered_words = []  # target words for which the sense model has zero senses

    print("Start prediction over " + test_file)
    pb.start()
    for i, row in reader.iterrows():
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        # a prediction has the form (sense, sense_scores)
        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            reader.at[i, 'predict_sense_ids'] = sense.split("#")[1]
            # optionally fill the predict_related column with the nearest neighbours:
            # neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            # neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            # reader.at[i, 'predict_related'] = ",".join(neighbours)
        else:
            uncovered_words.append(row.target)
        pb.update(i)  # update the progress bar for covered and uncovered words alike
    pb.finish()

    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8",
                  index=False, quoting=QUOTE_NONE)
    print("Saved predictions to " + output)
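# A minimal sketch of calling the single-model runner above; the file paths are
# hypothetical placeholders, not files shipped with the repository:
#
#   run(test_file="data/twsi-test.csv",
#       sense="model/senses.bin",      # sense vectors in word2vec binary format
#       context="model/contexts.bin",  # word (context) vectors in word2vec binary format
#       output="data/twsi-predict.csv",
#       wsd_method='sim', filter_ctx=2, lowercase=False)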
def main():
    parser = argparse.ArgumentParser(
        description='Fill in a test dataset using the Random Sense method.')
    parser.add_argument('test_file',
                        help='A path to a test dataset. Format: "context_id<TAB>target<TAB>'
                             'target_pos<TAB>target_position<TAB>gold_sense_ids<TAB>'
                             'predict_sense_ids<TAB>golden_related<TAB>predict_related<TAB>context"')
    parser.add_argument("senses", help="A path to sense vectors")
    parser.add_argument("output",
                        help="An output path to the filled dataset. Same format as test_file")
    args = parser.parse_args()

    print("Loading sense model...")
    vs = SenseGram.load_word2vec_format(args.senses, binary=False)
    # note: this calls the random-sense script's own run(), which presumably takes
    # the loaded sense model directly; it is not one of the WSD runners above
    run(args.test_file, vs, args.output)
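# Hypothetical command line for the script above (the script name and paths are
# placeholders, not the repository's actual file names):
#
#   python predict_random_sense.py data/twsi-test.csv model/senses.txt data/predict.csv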
def build(self,
          wv,  # an instance of dense word vectors
          sense_dim_num=10000,  # unused
          save_pkl=True,        # unused
          norm_type="sum",
          weight_type="score",
          max_cluster_words=20):
    """Build sense vectors out of word vectors and save them in binary format."""

    # initialize the sense vectors model
    vector_dim = wv.vectors.syn0.shape[1]
    senses_num = self.pcz.get_num_senses()
    sv = SenseGram(size=vector_dim, sorted_vocab=0)
    sv.create_zero_vectors(senses_num, vector_dim)
    sense_count = 0

    # fill the sense vectors model
    for word in self.pcz.data:
        for sense_id in self.pcz.data[word]:
            # try to build a sense vector for a word sense
            try:
                sense_count += 1
                if sense_count % 10000 == 0:
                    print(sense_count, "senses processed")
                sense_vector = np.zeros(wv.vectors.syn0[0].shape,
                                        dtype=np.float32)  # or the word vector?
                non_oov = 0
                for i, cluster_word in enumerate(
                        self.pcz.data[word][sense_id]["cluster"]):
                    if i >= max_cluster_words:
                        break

                    # define the weight of the cluster word
                    if weight_type == "ones":
                        weight = 1.0
                    elif weight_type == "score":
                        weight = float(self.pcz.data[word][sense_id]["cluster"][cluster_word])
                    elif weight_type == "rank":
                        weight = 1.0 / (i + 1)
                    else:  # fall back to the cluster score
                        weight = float(self.pcz.data[word][sense_id]["cluster"][cluster_word])
                    if weight == 0:
                        print("Warning: zero weight:", cluster_word, end=' ')

                    # map the cluster word to an in-vocabulary word
                    if cluster_word in wv.vectors.vocab:
                        cw = cluster_word
                    elif cluster_word.split("#")[0] in wv.vectors.vocab:
                        cw = cluster_word.split("#")[0]
                    else:
                        # back off to the parts of a compound, e.g. "hot_dog" -> "hot", "dog"
                        if self.VERBOSE:
                            print("Warning: word is OOV: '%s'" % cluster_word, file=stderr)
                        compounds = cluster_word.split("#")[0].split("_")
                        for cw in compounds:
                            if cw in wv.vectors.vocab and len(cw) > 3:
                                if self.VERBOSE:
                                    print("Warning: adding a compound '{}' of '{}'"
                                          .format(cw, cluster_word))
                                sense_vector += (weight / len(compounds)) * wv.vectors[cw]
                                non_oov += 1
                        continue

                    non_oov += 1
                    sense_vector += weight * wv.vectors[cw]

                if non_oov == 0:
                    if self.VERBOSE:
                        print("Warning: sense is OOV: %s#%s" % (word, sense_id), file=stderr)

                normalizer = self._normalizer(word, sense_id, norm_type,
                                              weight_type, max_cluster_words)
                sense_vector = sense_vector / normalizer
                sense_prob = self.pcz.get_sense_prob(word, sense_id)
                sv.add_sense(word, sense_id, sense_vector, sense_prob)
            except Exception:
                print("Cannot process sense:", word, sense_id)
                print(format_exc())

    # serialize the sense vector model
    sv.save_word2vec_format(self.sense_vectors_bin_fpath, fvocab=None, binary=False)
    print("Sense vectors:", self.sense_vectors_bin_fpath)
    print("Created %d sense vectors" % sense_count)
    return sv
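# A minimal sketch of what the _normalizer() helper called in build() might
# compute. This is an assumption inferred from the norm_type/weight_type
# parameters, not the repository's actual implementation:
def _normalizer_sketch(self, word, sense_id, norm_type, weight_type, max_cluster_words):
    cluster = self.pcz.data[word][sense_id]["cluster"]
    n = min(len(cluster), max_cluster_words)
    if norm_type == "sum":
        if weight_type == "ones":
            total = float(n)
        elif weight_type == "rank":
            total = sum(1.0 / (i + 1) for i in range(n))
        else:  # "score" and the default branch weight by cluster scores
            total = sum(float(s) for s in list(cluster.values())[:n])
    else:
        # fall back to a plain average over the used cluster words
        total = float(n)
    return total if total != 0 else 1.0  # guard against division by zero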
def _load_sense2vector_precomp(self, sense2vector_fpath):
    return SenseGram.load_word2vec_format(sense2vector_fpath)