def __init__(self, sense_clusters_fpath, strip_dst_senses=False, load_sim=True, verbose=False,
             normalized_bow=False, use_pickle=True, voc_fpath="", voc=[], normalize_sim=False):
    """ Loads and provides access to sense clusters in the format
    'word<TAB>cid<TAB>prob<TAB>cluster<TAB>isas'. """

    self._verbose = verbose
    self._normalized_bow = normalized_bow
    self._stoplist = get_stoplist()
    self._normalize_sim = normalize_sim

    # use an explicit vocabulary if provided, otherwise try to load it from file
    if len(voc) > 0:
        self._voc = voc
    elif exists(voc_fpath):
        self._voc = load_voc(voc_fpath)
    else:
        self._voc = {}

    sense_clusters_pkl_fpath = sense_clusters_fpath + ".pkl"
    if use_pickle and exists(sense_clusters_pkl_fpath):
        # fast path: load the previously pickled sense clusters
        pkl = pickle.load(open(sense_clusters_pkl_fpath, "rb"))

        if "sense_clusters" in pkl:
            self._sc = pkl["sense_clusters"]
        else:
            print("Error: cannot find sense_clusters in", sense_clusters_pkl_fpath)
            self._sc = {}

        if "normword2word" in pkl:
            self._normword2word = pkl["normword2word"]
        else:
            print("Error: cannot find normword2word in", sense_clusters_pkl_fpath)
            self._normword2word = {}

        print("Loaded %d words from: %s" % (len(self._sc), sense_clusters_pkl_fpath))
    else:
        # slow path: parse the CSV file and optionally cache the result as a pickle
        self._sc, self._normword2word = self._load(sense_clusters_fpath, strip_dst_senses, load_sim)

        if use_pickle:
            pkl = {"sense_clusters": self._sc, "normword2word": self._normword2word}
            pickle.dump(pkl, open(sense_clusters_pkl_fpath, "wb"))
            print("Pickled sense clusters:", sense_clusters_pkl_fpath)
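# Illustrative usage of the constructor above, written as a hedged sketch: the class
# name SenseClusters and its .data accessor are taken from the wrapper class further
# below; the module path and the input file path are assumptions.
#
#   from sense_clusters import SenseClusters   # assumed module path
#
#   sc = SenseClusters("model/senses.csv", strip_dst_senses=False, verbose=True)
#   senses = sc.data                            # assumed to expose dict: word -> {cid -> sense record}
#   for cid, sense in senses.get("python", {}).items():
#       print(cid, sense["prob"], sense["cluster"].most_common(5))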
def __init__(self, sense_clusters_fpath, skip_voc_fpath=""):
    self._skip_voc = load_voc(skip_voc_fpath, preprocess=True, sep='\t',
                              use_pickle=True) if exists(skip_voc_fpath) else set()
    print("Skip voc:", len(self._skip_voc))

    self._sense_clusters = SenseClusters(sense_clusters_fpath, strip_dst_senses=False)
    self._sc = self._sense_clusters.data
def _load(self, pcz_fpath, strip_dst_senses, load_sim):
    """ Loads a dict[word][sense] --> {"cluster": Counter(), "cluster_norm": Counter(),
    "isas": Counter()} """

    senses = defaultdict(dict)
    normword2word = defaultdict(set)
    if not exists(pcz_fpath):
        return senses, normword2word

    df = read_csv(pcz_fpath, encoding='utf-8', delimiter=SEP, error_bad_lines=True, quotechar='\0')
    df = df.fillna("")

    err_clusters = 0
    num_senses = 0

    # for each sense cluster
    for i, row in df.iterrows():
        try:
            if i % 25000 == 0:
                print("%d (%d) senses loaded of %d" % (i, num_senses, len(df)))
            if len(self._voc) > 0 and row.word not in self._voc:
                continue

            r = {}
            r["prob"] = row.prob if "prob" in row else 1.0
            r["cluster"] = (self._get_words(row.cluster, strip_dst_senses, load_sim)
                            if "cluster" in row else Counter())
            r["cluster_norm"] = (self._get_normalized_words(r["cluster"])
                                 if self._normalized_bow else r["cluster"])
            r["isas"] = (self._get_words(row.isas, strip_dst_senses, load_sim)
                         if "isas" in row else Counter())
            r["isas_norm"] = (self._get_normalized_words(r["isas"])
                              if self._normalized_bow else r["isas"])

            senses[row.word][row.cid] = r
            normword2word[self.norm(row.word)].add(row.word)
            num_senses += 1
        except Exception:
            print(".", end=' ')
            if self._verbose:
                print("Warning: bad cluster")
                print(row)
                print(format_exc())
            err_clusters += 1

    print(err_clusters, "cluster errors")
    print(num_senses, "senses loaded out of", i + 1)
    print(len(senses), "words loaded")

    return senses, normword2word
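# A made-up example of one row of the PCZ input file parsed above, following the
# 'word<TAB>cid<TAB>prob<TAB>cluster<TAB>isas' layout from the class docstring.
# The "word:score" syntax inside the cluster and isas cells is an assumption: the
# actual cell parsing is delegated to self._get_words, which is not shown here.
#
#   python	1	0.71	java:0.42,perl:0.40,ruby:0.38	language:0.85,programming language:0.61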
def main():
    parser = argparse.ArgumentParser(
        description='Performs training of a word sense embeddings model from a raw text '
                    'corpus using the SkipGram approach based on word2vec and graph '
                    'clustering of ego networks of semantically related terms.')
    parser.add_argument('train_corpus', help="Path to a training corpus in text form (can be .gz).")
    parser.add_argument('-phrases', help="Path to a file with extra vocabulary words, e.g. multiword expressions, "
                        "which should be included into the vocabulary of the model. Each "
                        "line of this text file should contain one word or phrase with no header.", default="")
    parser.add_argument('-cbow', help="Use the continuous bag of words model (default is 1, use 0 for the "
                        "skip-gram model).", default=1, type=int)
    parser.add_argument('-size', help="Set size of word vectors (default is 300).", default=300, type=int)
    parser.add_argument('-window', help="Set max skip length between words (default is 5).", default=5, type=int)
    parser.add_argument('-threads', help="Use <int> threads (default {}).".format(cpu_count()),
                        default=cpu_count(), type=int)
    parser.add_argument('-iter', help="Run <int> training iterations (default 5).", default=5, type=int)
    parser.add_argument('-min_count', help="This will discard words that appear less than <int> times"
                        " (default is 10).", default=10, type=int)
    parser.add_argument('-N', help="Number of nodes in each ego-network (default is 200).", default=200, type=int)
    parser.add_argument('-n', help="Maximum number of edges a node can have in the network"
                        " (default is 200).", default=200, type=int)
    parser.add_argument('-bigrams', help="Detect bigrams in the input corpus.", action="store_true")
    parser.add_argument('-min_size', help="Minimum size of the cluster (default is 5).", default=5, type=int)
    parser.add_argument('-make-pcz', help="Perform two extra steps to label the original sense inventory with"
                        " hypernymy labels and disambiguate the list of related words. "
                        "The obtained resource is called proto-conceptualization or PCZ.", action="store_true")
    args = parser.parse_args()

    corpus_name = basename(args.train_corpus)
    model_dir = "model/"
    ensure_dir(model_dir)

    vectors_fpath = join(model_dir, corpus_name + ".cbow{}-size{}-window{}-iter{}-mincount{}-bigrams{}.word_vectors".format(
        args.cbow, args.size, args.window, args.iter, args.min_count, args.bigrams))
    vectors_short_fpath = join(model_dir, corpus_name + ".word_vectors")
    neighbours_fpath = join(model_dir, corpus_name + ".N{}.graph".format(args.N))
    clusters_fpath = join(model_dir, corpus_name + ".n{}.clusters".format(args.n))
    clusters_minsize_fpath = clusters_fpath + ".minsize" + str(args.min_size)  # clusters that satisfy min_size
    clusters_removed_fpath = clusters_minsize_fpath + ".removed"  # clusters that are smaller than min_size

    if exists(vectors_fpath):
        print("Using existing vectors:", vectors_fpath)
    elif exists(vectors_short_fpath):
        print("Using existing vectors:", vectors_short_fpath)
        vectors_fpath = vectors_short_fpath
    else:
        learn_word_embeddings(args.train_corpus, vectors_fpath, args.cbow, args.window, args.iter,
                              args.size, args.threads, args.min_count,
                              detect_bigrams=args.bigrams, phrases_fpath=args.phrases)

    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath, neighbours_fpath, neighbors=args.N)
    else:
        print("Using existing neighbors:", neighbours_fpath)

    if not exists(clusters_fpath):
        word_sense_induction(neighbours_fpath, clusters_fpath, args.n, args.threads)
    else:
        print("Using existing clusters:", clusters_fpath)

    if not exists(clusters_minsize_fpath):
        filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    else:
        print("Using existing filtered clusters:", clusters_minsize_fpath)

    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)

    if args.make_pcz:
        # add isas
        isas_fpath = ""  # in: clusters_minsize_fpath
        clusters_with_isas_fpath = clusters_minsize_fpath + ".isas"

        # disambiguate the original sense clusters
        clusters_disambiguated_fpath = clusters_with_isas_fpath + ".disambiguated"
        pcz.disamgiguate_sense_clusters.run(clusters_with_isas_fpath, clusters_disambiguated_fpath)

        # make the closure
        clusters_closure_fpath = clusters_disambiguated_fpath + ".closure"
def main():
    parser = argparse.ArgumentParser(
        description='Performs training of a word sense embeddings model from a raw text '
                    'corpus using the SkipGram approach based on word2vec and graph '
                    'clustering of ego networks of semantically related terms.')
    parser.add_argument('train_corpus', help="Path to a training corpus in text form (can be .gz).")
    parser.add_argument('-cbow', help="Use the continuous bag of words model (default is 1, use 0 for the "
                        "skip-gram model).", default=1, type=int)
    parser.add_argument('-size', help="Set size of word vectors (default is 300).", default=300, type=int)
    parser.add_argument('-window', help="Set max skip length between words (default is 5).", default=5, type=int)
    parser.add_argument('-threads', help="Use <int> threads (default {}).".format(cpu_count()),
                        default=cpu_count(), type=int)
    parser.add_argument('-iter', help="Run <int> training iterations (default 5).", default=5, type=int)
    parser.add_argument('-min_count', help="This will discard words that appear less than <int> times"
                        " (default is 10).", default=10, type=int)
    # parser.add_argument('-only_letters', help="Use only words built from letters/dash/point for DT.",
    #                     action="store_true")
    # parser.add_argument('-vocab_limit', help="Use only <int> most frequent words from word vector model"
    #                     " for DT. By default use all words (default is none).", default=None, type=int)
    parser.add_argument('-N', help="Number of nodes in each ego-network (default is 200).", default=200, type=int)
    parser.add_argument('-n', help="Maximum number of edges a node can have in the network"
                        " (default is 200).", default=200, type=int)
    parser.add_argument('-min_size', help="Minimum size of the cluster (default is 5).", default=5, type=int)
    parser.add_argument('-make-pcz', help="Perform two extra steps to label the original sense inventory with"
                        " hypernymy labels and disambiguate the list of related words. "
                        "The obtained resource is called proto-conceptualization or PCZ.", action="store_true")
    args = parser.parse_args()

    vectors_fpath, neighbours_fpath, clusters_fpath, clusters_minsize_fpath, clusters_removed_fpath = get_paths(
        args.train_corpus, args.min_size)

    if not exists(vectors_fpath):
        print(vectors_fpath)
        learn_word_embeddings(args.train_corpus, vectors_fpath, args.cbow, args.window, args.iter,
                              args.size, args.threads, args.min_count, detect_phrases=True)
    else:
        print("Using existing vectors:", vectors_fpath)

    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath, neighbours_fpath)
    else:
        print("Using existing neighbors:", neighbours_fpath)

    if not exists(clusters_fpath):
        word_sense_induction(neighbours_fpath, clusters_fpath, args.n, args.threads)
    else:
        print("Using existing clusters:", clusters_fpath)

    if not exists(clusters_minsize_fpath):
        filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    else:
        print("Using existing filtered clusters:", clusters_minsize_fpath)

    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)

    if args.make_pcz:
        # add isas
        isas_fpath = ""  # in: clusters_minsize_fpath
        clusters_with_isas_fpath = clusters_minsize_fpath + ".isas"

        # disambiguate the original sense clusters
        clusters_disambiguated_fpath = clusters_with_isas_fpath + ".disambiguated"
        pcz.disamgiguate_sense_clusters.run(clusters_with_isas_fpath, clusters_disambiguated_fpath)

        # make the closure
        clusters_closure_fpath = clusters_disambiguated_fpath + ".closure"
def main():
    parser = argparse.ArgumentParser(
        description='Performs training of a word sense embeddings model from a raw text '
                    'corpus using the SkipGram approach based on word2vec and graph '
                    'clustering of ego networks of semantically related terms.')
    parser.add_argument('-vectors', help="Existing embeddings to make sense vectors from")
    parser.add_argument('-threads', help="Use <int> threads (default {}).".format(cpu_count()),
                        default=cpu_count(), type=int)
    parser.add_argument('-iter', help="Run <int> training iterations (default 5).", default=5, type=int)
    parser.add_argument('-min_count', help="This will discard words that appear less than <int> times"
                        " (default is 10).", default=10, type=int)
    parser.add_argument('-N', help="Number of nodes in each ego-network (default is 200).", default=200, type=int)
    parser.add_argument('-n', help="Maximum number of edges a node can have in the network"
                        " (default is 200).", default=200, type=int)
    parser.add_argument('-min_size', help="Minimum size of the cluster (default is 5).", default=5, type=int)
    args = parser.parse_args()

    model_dir = "model/"
    ensure_dir(model_dir)

    vectors_fpath = args.vectors
    neighbours_fpath = join(model_dir, args.vectors + ".N{}.graph".format(args.N))
    clusters_fpath = join(model_dir, args.vectors + ".n{}.clusters".format(args.n))
    clusters_minsize_fpath = clusters_fpath + ".minsize" + str(args.min_size)  # clusters that satisfy min_size
    clusters_removed_fpath = clusters_minsize_fpath + ".removed"  # clusters that are smaller than min_size

    if exists(vectors_fpath):
        print("Using existing vectors:", vectors_fpath)
    else:
        # the input embeddings are required: fail early if they are missing
        raise FileNotFoundError(vectors_fpath)

    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath, neighbours_fpath, neighbors=args.N)
    else:
        print("Using existing neighbors:", neighbours_fpath)

    word_sense_induction(neighbours_fpath, clusters_fpath, args.n, args.threads)
    filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)
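# Hypothetical invocation of this variant, which starts from pre-trained word
# embeddings instead of a raw corpus (the script name and the vectors path are
# assumptions; the flags are the ones defined in the argument parser):
#
#   python train_from_vectors.py -vectors model/wiki.word_vectors -N 200 -n 200 -min_size 5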
def __init__(self, isas_fpath, min_freq=0.0, preprocess=True, sep='\t', strip_pos=True,
             use_pickle=True, lowercase=True):
    """ Provides access to IS-A relations loaded from a CSV file
    "hyponym<TAB>hypernym<TAB>freq". """

    if not exists(isas_fpath):
        self._hypo2hyper = {}
        self._hyper2hypo = {}
        return

    isas_pkl_fpath = isas_fpath + ".pkl"
    if use_pickle and exists(isas_pkl_fpath):
        # fast path: load the previously pickled dictionaries
        pkl = pickle.load(open(isas_pkl_fpath, "rb"))

        if "hypo2hyper" in pkl:
            hypo2hyper = pkl["hypo2hyper"]
        else:
            print("Error: cannot find hypo2hyper in", isas_pkl_fpath)
            hypo2hyper = {}

        if "hyper2hypo" in pkl:
            hyper2hypo = pkl["hyper2hypo"]
        else:
            print("Error: cannot find hyper2hypo in", isas_pkl_fpath)
            hyper2hypo = {}
    else:
        if preprocess:
            isas_cln_fpath = isas_fpath + ".cleaned"
            preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
            isas_df = read_csv(isas_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
            try_remove(isas_cln_fpath)
        else:
            isas_df = read_csv(isas_fpath, sep, encoding='utf8', error_bad_lines=False)

        # drop relations that are less frequent than min_freq
        isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)

        hypo2hyper = defaultdict(dict)
        hyper2hypo = defaultdict(dict)
        for i, row in isas_df.iterrows():
            try:
                hypo = str(row["hyponym"]).split("#")[0].lower() if lowercase else str(row["hyponym"]).split("#")[0]
                hyper = str(row["hypernym"]).split("#")[0].lower() if lowercase else str(row["hypernym"]).split("#")[0]
                freq = float(row["freq"])
                hypo_lemma = lemmatize(hypo).lower()
                hyper_lemma = lemmatize(hyper).lower()

                # hypo2hyper: aggregate frequencies for surface forms and for lemmas
                if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                    hypo2hyper[hypo][hyper] = freq
                else:
                    hypo2hyper[hypo][hyper] += freq
                if (hypo_lemma, hyper_lemma) != (hypo, hyper):
                    if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]:
                        hypo2hyper[hypo_lemma][hyper_lemma] = freq
                    else:
                        hypo2hyper[hypo_lemma][hyper_lemma] += freq

                # hyper2hypo: the inverse index
                if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                    hyper2hypo[hyper][hypo] = freq
                else:
                    hyper2hypo[hyper][hypo] += freq
                if (hypo_lemma, hyper_lemma) != (hypo, hyper):
                    if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]:
                        hyper2hypo[hyper_lemma][hypo_lemma] = freq
                    else:
                        hyper2hypo[hyper_lemma][hypo_lemma] += freq
            except Exception:
                print("Bad row:", row)
                print(format_exc())

        print("dictionary is loaded:", len(hypo2hyper))

        if use_pickle:
            pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
            pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
            print("Pickled voc:", isas_pkl_fpath)

    print("Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath))

    self._hypo2hyper = hypo2hyper
    self._hyper2hypo = hyper2hypo
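# Illustrative usage of the constructor above, as a hedged sketch: the enclosing
# class name (assumed here to be ISAs), the module path and the input file are not
# shown in this listing and are therefore assumptions. Only the attributes assigned
# in the constructor (_hypo2hyper, _hyper2hypo) are used, since no public accessor
# is visible here.
#
#   from isas import ISAs                      # assumed module path and class name
#
#   isas = ISAs("model/isas.csv", min_freq=2.0)
#   hypernyms = isas._hypo2hyper.get("python", {})   # hypernym -> aggregated freq
#   print(sorted(hypernyms.items(), key=lambda x: -x[1])[:5])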