def main():
    args = parse_args()
    file_paths = [os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z]

    if "ruwordnet_path" in args:
        ruwordnet = RuWordnet(db_path=args.ruwordnet_path, ruwordnet_path="")
        sense2synset = create_sense2synset(ruwordnet.get_all_senses(), args.pos)
        synset_senses = create_senses_chain(ruwordnet, args.pos)
        for filename in file_paths:
            start_time = time.time()
            retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
            print(f"---- File {filename} took {time.time() - start_time} seconds ----")

    elif "data_path" in args:
        data = read_test_data(args.data_path)
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            print(f"---- File {filename} took {time.time() - start_time} seconds ----")
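# A minimal sketch (not the repository's actual parse_args) of a parser that would make the
# membership checks above behave as intended: with argparse subparsers, only the selected
# subcommand's options are set on the Namespace, so `"ruwordnet_path" in args` is true only
# when the `ruwordnet` subcommand was chosen. All option names below are assumptions derived
# from how `args` is used in main(); assumes `import argparse` at the top of the module.
def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus_path', type=str, help='directory with corpus files to scan')
    parser.add_argument('--output_path', type=str, help='where to write the retrieved positions')
    subparsers = parser.add_subparsers()
    ruwordnet_parser = subparsers.add_parser('ruwordnet', help='search for RuWordNet senses')
    ruwordnet_parser.add_argument('--ruwordnet_path', type=str, dest='ruwordnet_path')
    ruwordnet_parser.add_argument('--pos', type=str, dest='pos')
    data_parser = subparsers.add_parser('data', help='search for test-data words')
    data_parser.add_argument('--data_path', type=str, dest='data_path')
    return parser.parse_args()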
def generate_predictions(self, path):
    # path points to a labelled candidates file, e.g. "./labelled_hch.tsv";
    # each line is "<label>\t<score>\t<neologism>\t<candidate word>"
    data = defaultdict(list)
    ruwordnet = RuWordnet(self.params["db_path"], self.params["ruwordnet_path"])
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            label, _, neologism, candidate_word = line.strip().split("\t")
            label = float(label)
            candidate = ruwordnet.get_id_by_name(candidate_word)
            if label == 1.0:
                data[neologism].append(candidate)
    return data
def main():
    params = load_config()
    model = HCHModel(params)
    ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["ruwordnet_path"])

    with open(params['test_path'], 'r', encoding='utf-8') as f:
        test_data = f.read().split("\n")[:-1]

    with open("private_nouns_top100_candidates_second_order.tsv", "w", encoding="utf-8") as w:
        for neologism in test_data:
            candidates = model.generate_associates(neologism, topn=10)
            for candidate, similarity in candidates:
                w.write(f"{neologism}\t{candidate}\t{similarity}\n")
                # also add second-order candidates: the hypernyms of each retrieved candidate
                for second_order in ruwordnet.get_hypernyms_by_id(candidate):
                    w.write(f"{neologism}\t{second_order}\t{model.get_similarity(neologism, second_order)}\n")
def __init__(self, params):
    self.ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["ruwordnet_path"])
    self.w2v_ruwordnet = KeyedVectors.load_word2vec_format(params['ruwordnet_vectors_path'], binary=False)
    self.w2v_data = KeyedVectors.load_word2vec_format(params['data_vectors_path'], binary=False)
def generate_taxonomy_fns(params, model):
    # for English WordNet
    if params['language'] == 'en':
        wn = WordNetCorpusReader(params["wordnet_path"], None)
        return (lambda x: [hypernym.name() for hypernym in wn.synset(x).hypernyms()
                           if hypernym.name() in model.w2v_synsets.vocab],
                lambda x: [hyponym.name() for hyponym in wn.synset(x).hyponyms()
                           if hyponym.name() in model.w2v_synsets.vocab],
                lambda x: x.split(".")[0].replace("_", " "))
    # for RuWordNet
    elif params['language'] == 'ru':
        ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["wordnet_path"])
        return (lambda x: ruwordnet.get_hypernyms_by_id(x),
                lambda x: ruwordnet.get_hyponyms_by_id(x),
                lambda x: ruwordnet.get_name_by_id(x))
    else:
        raise Exception("task / language is not supported")
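# Usage sketch (the caller shown here is illustrative, not part of this module): the three
# returned callables can be unpacked and used uniformly, whatever wordnet backs them.
#
#   get_hypernyms, get_hyponyms, get_name = generate_taxonomy_fns(params, model)
#   for node in model.w2v_synsets.vocab:
#       parents = get_hypernyms(node)   # synset names (en) or synset ids (ru)
#       children = get_hyponyms(node)
#       label = get_name(node)          # human-readable label, e.g. for writing predictions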
    # save
    # -------------------------------------------------------------
    @staticmethod
    def save_as_w2v(words: list, vectors: np.array, output_path: str):
        assert len(words) == len(vectors)
        with open(output_path, 'w', encoding='utf-8') as w:
            w.write(f"{vectors.shape[0]} {vectors.shape[1]}\n")
            for word, vector in zip(words, vectors):
                vector_line = " ".join(map(str, vector))
                w.write(f"{word} {vector_line}\n")


if __name__ == '__main__':
    ft_vec = FasttextVectorizer("models/cc.ru.300.bin")
    ruwordnet = RuWordnet(db_path="../dataset/ruwordnet.db", ruwordnet_path=None)

    noun_synsets = defaultdict(list)
    verb_synsets = defaultdict(list)
    for sense_id, synset_id, text in ruwordnet.get_all_senses():
        if synset_id.endswith("N"):
            noun_synsets[synset_id].append(text)
        elif synset_id.endswith("V"):
            verb_synsets[synset_id].append(text)

    ft_vec.vectorize_ruwordnet(noun_synsets, "models/vectors/nouns_ruwordnet_fasttext.txt")
    ft_vec.vectorize_ruwordnet(verb_synsets, "models/vectors/verbs_ruwordnet_fasttext.txt")

    with open("../dataset/public/verbs_public_no_labels.tsv", 'r',
        hchs = [hypernym for associate in associates for hypernym in compute_hypernyms(associate)]
        return hchs, node2vec_vector

    def get_node2vec_score(self, neologism, node2vec_vector, candidate, count):
        nn_score = 0.5 if candidate in self.predicted[neologism] else 1  # computed but not used in the returned score
        return count * self.get_similarity(neologism, candidate) + \
            self.get_node2vec_similarity(node2vec_vector, candidate)

    def get_node2vec_similarity(self, v1, candidate):
        # cosine similarity between the accumulated node2vec vector and the candidate's vector
        v2 = self.node2vec[candidate]
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)


data = defaultdict(list)
ruwordnet = RuWordnet("../dataset/ruwordnet.db", None)
with open("./labelled_hch.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        label, similarity, neologism, candidate_word = line.strip().split("\t")
        label = float(label)
        similarity = float(similarity)
        candidate = ruwordnet.get_id_by_name(candidate_word)
        if label == 1.0:
            data[neologism].append((candidate, similarity))

with open("predictions_classification_private_nouns.tsv", 'w', encoding='utf-8') as w:
    for i in data:
        candidates = reversed(sorted(set(data[i]), key=lambda x: x[1]))
        for candidate in candidates:
            w.write(f"{i}\t{candidate[0]}\t{ruwordnet.get_name_by_id(candidate[0])}\n")
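# Side note (illustrative check, not part of the original file): cosine similarity is
# invariant to positive rescaling, so the explicit normalisation in get_node2vec_similarity
# does not change the returned value.
#
#   import numpy as np
#   a, b = np.array([1.0, 2.0, 3.0]), np.array([0.5, 0.1, 0.9])
#   raw = 1 - spatial.distance.cosine(a, b)
#   unit = 1 - spatial.distance.cosine(a / np.linalg.norm(a), b / np.linalg.norm(b))
#   assert abs(raw - unit) < 1e-12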
    data_parser = subparsers.add_parser('data', help='data help')
    data_parser.add_argument('--data_path', type=str, dest="data_path", help='path to test data')
    data_parser.add_argument('--upper', action="store_true")
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)
def __init__(self, params, part, phase):
    self.part = part  # the part of speech
    self.phase = phase
    self.ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["ruwordnet_path"])
    self.w2v_ruwordnet = KeyedVectors.load_word2vec_format(params[f"ruwordnet_vectors_{part}_path"], binary=False)
    self.w2v_data = KeyedVectors.load_word2vec_format(params[f"{phase}_data_vectors_{part}_path"], binary=False)
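# For reference (an assumed layout, not the repository's actual config): the params dict
# passed in has to expose keys matching the f-strings above, e.g. for part="nouns" and
# phase="public" something like:
#
#   params = {
#       "db_path": "../dataset/ruwordnet.db",
#       "ruwordnet_path": "...",
#       "ruwordnet_vectors_nouns_path": "models/vectors/ruwordnet_nouns.txt",
#       "public_data_vectors_nouns_path": "models/vectors/nouns_public.txt",
#   }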
        for word, vector in zip(words, vectors):
            vector_line = " ".join(map(str, vector))
            w.write(f"{word.upper()} {vector_line}\n")


def process_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        dataset = f.read().lower().split("\n")[:-1]
    w2v_vec.vectorize_data(dataset, output_file)


if __name__ == '__main__':
    from helpers.utils import load_config

    config = load_config()
    w2v_vec = wiki2vecVectorizer(config["vectorizer_path"])
    ruwordnet = RuWordnet(db_path=config["db_path"], ruwordnet_path=config["ruwordnet_path"], with_lemmas=False)

    noun_synsets = defaultdict(list)
    verb_synsets = defaultdict(list)
    for sense_id, synset_id, text in ruwordnet.get_all_senses():
        if synset_id.endswith("N"):
            noun_synsets[synset_id].append(text.lower())
        elif synset_id.endswith("V"):
            verb_synsets[synset_id].append(text.lower())

    w2v_vec.vectorize_ruwordnet(noun_synsets, "models/vectors/ruwordnet_nouns.txt")
    w2v_vec.vectorize_ruwordnet(verb_synsets, "models/vectors/ruwordnet_verbs.txt")

    process_data("../data/public_test/verbs_public.tsv", "models/vectors/verbs_public.txt")
    process_data("../data/public_test/nouns_public.tsv", "models/vectors/nouns_public.txt")
    process_data("../data/private_test/verbs_private.tsv", "models/vectors/verbs_private.txt")
    process_data("../data/private_test/nouns_private.tsv", "models/vectors/nouns_private.txt")
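# Quick sanity check (illustrative, not part of the original script): the files written above
# are plain word2vec text vectors, so they load back with gensim exactly as the models read
# them elsewhere in the project.
#
#   from gensim.models import KeyedVectors
#   nouns_public = KeyedVectors.load_word2vec_format("models/vectors/nouns_public.txt", binary=False)
#   print(len(nouns_public.vocab), nouns_public.vector_size)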
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
from itertools import combinations

import networkx as nx
from pymorphy2 import MorphAnalyzer

from ruwordnet.ruwordnet_reader import RuWordnet

USE_SYNSETS = False
USE_TOPONYMS = False

ruwordnet = RuWordnet("../dataset/ruwordnet.db", None)
morph = MorphAnalyzer()


def read_file(filename):
    # split each ';'-separated line, keeping the first field and the last two
    with open(filename, encoding='utf-8') as f:
        return [([i.split(";")[0]] + i.split(";")[-2:]) for i in f.read().split("\n")[:-1]]


adj_nouns = [
    "ДАУНХИЛ", "ИНФОРМВОЙНА", "МАСТИТ", "ШКОЛЕНЬЕ", "ЭКЗИТПОЛ", "АВАРКОМ", "АКТИВ-НЕТТО", "БАМУТСКИЙ",
    "БАСАЕВСКИЙ", "БАШХИМ", "БЛИННАЯ", "БУЛОЧНАЯ", "ГОЙСКОЕ", "ГОНЧАЯ", "ГРАНДЖ", "ДЕТСКАЯ",
    "ДМИТРИЕВ-ЛЬГОВСКИЙ", "ПОДУШЕВОЙ", "ДМИТРОВСК-ОРЛОВСКИЙ", "ЖЕЛЕЗНОГОРСК-ИЛИМСКИЙ", "ИРБИТСКОЕ",
    "КАМЕНКА-ДНЕПРОВСКАЯ", "КАМЕНЬ-КАШИРСКИЙ", "КОНДИТЕРСКАЯ", "ЛИКВИДКОМ", "МОГИЛЕВ-ПОДОЛЬСКИЙ",
    "МРАВИНСКИЙ", "МУНДА", "ОТПУСКНЫЕ", "ПАРИКМАХЕРСКАЯ", "ПЕЛЬМЕННАЯ", "ПИРОЖКОВАЯ", "ПЛИССЕ",
    "ПРАЛИНЕ", "ПРИЕМНАЯ", "РОКОКО", "РЮМОЧНАЯ", "СПАССК-РЯЗАНСКИЙ", "ТУ-154М", "УРАЛХИМ", "ЧАЙНАЯ",
def main():
    args = parse_args()
    description1 = "---- File {} took {} seconds ----\n"
    description2 = "All: {}, Found: {}, Left: {}"
    description = description1 + description2

    if "ruwordnet_path1" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)

        # ------------ Find contexts ------------
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, time.time() - start_time,
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')
        for synset in synsets:
            print(set([i.name() for i in wordnet_old.synset(synset).lemmas()] +
                      [i.name() for i in wordnet_new.synset(synset).lemmas()]))
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, time.time() - start_time,
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    elif "data_path" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])
        data = read_test_data(args.data_path)
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            # found_lemmas is expected to be a module-level set filled by the retrieve_* helpers
            file_paths.set_description(description.format(filename, time.time() - start_time,
                                                          len(data), len(found_lemmas),
                                                          len(data.difference(set(found_lemmas)))))
        print(description2.format(len(data), len(found_lemmas), len(data.difference(set(found_lemmas)))))
        print(found_lemmas)
        print(data.difference(set(found_lemmas)))
import os
from collections import defaultdict

from ruwordnet.ruwordnet_reader import RuWordnet
from vectorizers.fasttext_vectorizer import FasttextVectorizer


def process_data(vectorizer, input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        dataset = f.read().lower().split("\n")[:-1]
    vectorizer.vectorize_words(dataset, output_file)


if __name__ == '__main__':
    ft = FasttextVectorizer("models/cc.ru.300.bin")
    ruwordnet = RuWordnet(db_path="../dataset/ruwordnet.db", ruwordnet_path=None)
    vector_path = "models/vectors/fasttext/ru/"

    # ----------------------
    # vectorize synsets
    # ----------------------
    # noun_synsets = defaultdict(list)
    # verb_synsets = defaultdict(list)
    # all_synsets = defaultdict(list)
    #
    # for sense_id, synset_id, text in ruwordnet.get_all_senses():
    #     if synset_id.endswith("N"):
    #         noun_synsets[synset_id].append(text.lower())
    #     elif synset_id.endswith("V"):
    #         verb_synsets[synset_id].append(text.lower())
    #     all_synsets[synset_id].append(text.lower())