def create_word2wordsense_dic(vocFile="", wwsFile="", vocWsFile=""):
    """
    sample call: create_word2wordsense_dic()
    :param vocFile: input vocabulary file, one word per line
    :param wwsFile: output file mapping each word to its word-sense (synset) names
    :param vocWsFile: output file listing the word-sense vocabulary, one sense per line
    :return: None; results are written to wwsFile and vocWsFile
    """
    if vocFile == "":
        vocFile = FILE_Loc["de"]["kyubyong"]["voc"]
    if wwsFile == "":
        wwsFile = FILE_Loc["de"]["kyubyong"]["w2ws"]
    if vocWsFile == "":
        vocWsFile = FILE_Loc["de"]["kyubyong"]["wsVoc"]

    wwlst = []
    wslst = []
    gn = load_germanet()
    with open(vocFile, 'r') as ifh:
        for ln in ifh:
            word = ln[:-1]
            lst = [get_name_of_synset(ele) for ele in gn.synsets(word)]
            if lst:
                wwlst.append(word + " " + " ".join(lst))
                wslst += lst

    wslst = list(set(wslst))
    wslst.sort()
    print(wwlst[:10])
    print(wslst[:10])

    with open(wwsFile, 'w') as ofh:
        ofh.write("\n".join(wwlst))
    with open(vocWsFile, 'w') as ofh:
        ofh.write("\n".join(wslst))
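# get_name_of_synset() is called above (and in create_wordsense_struc_dic below)
# but not defined in this excerpt. A minimal sketch under one assumption:
# pygermanet synsets print as e.g. Synset(Haus.n.1), and gn.synset() accepts the
# inner 'Haus.n.1' string, so stripping the wrapper yields a name that round-trips.
def get_name_of_synset(synset):
    # hypothetical helper, not the original one: 'Synset(Haus.n.1)' -> 'Haus.n.1'
    return str(synset).split('(', 1)[1].rstrip(')')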
def _add_germanet_categories(self, word):
    gn = load_germanet()
    cats = []
    # use the first noun sense of the word and walk its hypernym paths
    hp = gn.synset(word + '.n.1').hypernym_paths
    for h in hp:
        # take the three path members just before the last element as coarse
        # category labels; the label is parsed out of the Synset repr,
        # e.g. Synset(Tier.n.1) -> 'Tier'
        for s in h[-4:-1]:
            cats.append(str(s).split('(')[1].split('.')[0])
    cats = list(set(cats))
    return cats
def main():
    gn = load_germanet()
    if len(sys.argv) < 2:
        raise Exception(
            "Provide at least one argument: path(s) to the model(s) to compare")
    models = sys.argv[1:]
    senses2rel = compare(models, gn)
    print("senses reliability")
    for senses, rel in senses2rel:
        print(str(senses), str(rel))
def germanet_processor(data):
    gn = load_germanet()
    results = []
    for record in data:
        word = record[0]
        value = record[1]
        synsets = gn.synsets(word)
        # gn.synsets() returns a list, so check for an empty result rather
        # than comparing against None
        if not synsets:
            continue
        for synset in synsets:
            results += find_hyponyms(synset, value)
    return results
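# find_hyponyms() is not part of this excerpt. A hedged sketch of one plausible
# implementation, built only from attributes used elsewhere in these snippets
# (hyponyms, lemmas, orthForm); the pairing of each hyponym with `value` is an
# assumption, not the original logic.
def find_hyponyms(synset, value):
    return [(lemma.orthForm, value)
            for hyponym in synset.hyponyms
            for lemma in hyponym.lemmas]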
def load_tree(self, outputfile):
    """Creates a tree and fills it with words and hypernyms.

    :param outputfile: the output file to be created containing all words
    :return: complete tree
    """
    germanet = load_germanet()

    # step 1: extract words from GermaNet
    print("extracting words..")
    words, embedded_words = self.__extract_words(germanet, outputfile)

    # step 2: fill tree with hypernym paths
    count = 0
    skipped = 0
    mult_paths = 0
    tree = Tree()
    for word in words:
        synset = germanet.synset(word)
        if synset is None:
            skipped += 1
            continue
        paths = synset.hypernym_paths
        if len(paths) == 0:
            # skip words without a hypernym path instead of aborting
            skipped += 1
            continue
        elif isinstance(paths[0], list):
            # synset has multiple hypernym paths
            mult_paths += 1
            for path in paths:
                count += 1
                tree.add_hypernym_path(path, embedded_words,
                                       self.__ignore_duplicates)
        else:
            count += 1
            tree.add_hypernym_path(paths, embedded_words,
                                   self.__ignore_duplicates)

    print("number of words added = " + str(len(tree.words)))
    print("number of paths = " + str(count))
    print("number of synsets with multiple paths = " + str(mult_paths))
    print("skipped = " + str(skipped))
    return tree
def create_wordsense_struc_dic(vocWsFile="", childrenFile="", parentFile="",
                               pathFile=""):
    if vocWsFile == "":
        vocWsFile = FILE_Loc["de"]["kyubyong"]["wsVoc"]
    if childrenFile == "":
        childrenFile = FILE_Loc["de"]["kyubyong"]["wsChildren"]
    if parentFile == "":
        parentFile = FILE_Loc["de"]["kyubyong"]["wsParent"]
    if pathFile == "":
        pathFile = FILE_Loc["de"]["kyubyong"]["wsPaths"]

    childrenLst = []
    parentLst = []
    pathLst = []
    gn = load_germanet()
    with open(vocWsFile, 'r') as ifh:
        for ln in ifh:
            ws = ln[:-1]
            ins = gn.synset(ws)
            if ins.__class__.__name__ != 'Synset':
                print(ins)
                continue
            children = [get_name_of_synset(ele) for ele in ins.hyponyms]
            childrenLst.append(ws + " " + " ".join(children))
            parent = [get_name_of_synset(ele) for ele in ins.hypernyms]
            parentLst.append(ws + " " + " ".join(parent))
            plst = []
            for apath in ins.hypernym_paths:
                pathStr = " ".join([get_name_of_synset(ele) for ele in apath])
                plst.append(pathStr)
            pathLst.append(":".join(plst))

    with open(childrenFile, 'w') as ofh:
        ofh.write("\n".join(childrenLst))
    with open(parentFile, 'w') as ofh:
        ofh.write("\n".join(parentLst))
    with open(pathFile, 'w') as ofh:
        ofh.write("\n".join(pathLst))
class Germanet:
    """Finds all synonym words of a given word.

    The words searched here are nouns. If you are interested in searching
    verbs, use str(lemmatisedWord) instead of str(lemmatisedWord).capitalize().

    To run this code you need to load the GermaNet data into MongoDB using
    pygermanet. See details of how to run GermaNet here:
    https://pypi.org/project/pygermanet/

        mkdir -p ./mongodb
        mongod --dbpath ./mongodb
    """

    gn = load_germanet()

    def lematise(self, word):
        """Returns the lemmatised form(s) of the word if one exists,
        otherwise returns the word as it is."""
        if word != "":
            return self.gn.lemmatise(word)

    def getSynonyms(self, word):
        """Returns the written representation (orthForm) of the lemmas of all
        possible synsets of the given word."""
        synonymSynset = []
        synonymWords = set()
        for lemmatisedWord in self.lematise(word):
            synsets = self.gn.synsets(str(lemmatisedWord).capitalize())
            for synset in synsets:
                synonymSynset.append(synset)
        for synset in synonymSynset:
            for lemma in synset.lemmas:
                synonymWords.add(lemma.orthForm.strip())
        return synonymWords
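# A brief usage sketch of the Germanet class above; 'Häuser'/'Haus' are
# illustrative inputs, not taken from the original code, and the outputs noted
# in the comments are indicative only.
if __name__ == "__main__":
    g = Germanet()
    print(g.lematise("Häuser"))    # e.g. ['Haus']
    print(g.getSynonyms("Haus"))   # orth forms collected from all synsets of 'Haus'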
from pygermanet import load_germanet
import numpy as np
from germanet.tree import Tree

germanet = load_germanet()
num_nodes = 0
words = {}
leaf_nodes = []
errors = 0


def __load_tree(file, log):
    """
    Creates and fills a tree from the input file.

    :param file: file containing word-sense parents and their children
    :param log: log file
    """
    global words, errors
    tree = Tree()
    with open(file, 'r') as f:
        for line in f:
            parts = line.split()
            parent = parts[0]
            children = parts[1:] if len(parts) > 1 else None
            # validation step 1: check for duplicate nodes
            if parent in words:
                if words[parent] >= 2:
                    log.write("validation error: synset '" + parent +
# imports required by the code below but missing from this excerpt
import configparser
import sys

from nltk.corpus import stopwords
from nltk.data import load
from nltk.tokenize import RegexpTokenizer
from pygermanet import load_germanet

from modules import file_reader as fr

########################
# GLOBAL FILE SETTINGS
########################
config = configparser.ConfigParser()
config._interpolation = configparser.ExtendedInterpolation()
config.read('../config.ini')

########################
# GermaNet & WordNet
########################
try:
    ger = load_germanet()
except Exception:
    print('Error! Please start mongodb on the GermaNet xml files '
          '(mongod --dbpath ./mongodb) or refer to README.md')
    sys.exit()

# Tokenizer
sent_tok = load('tokenizers/punkt/german.pickle')
word_tok = RegexpTokenizer(r'\w+')

# Filter stopwords
german_stopwords = stopwords.words('german')
german_stopwords.extend(('dass', 'bzw', 'p', 'http', '0', '1', '2', '3', '4'))
stop_words = set(german_stopwords)
def analyzeTextCohesion(text):
    """Analyzes the cohesion of a text.

    Args:
        text (String) - the string that is analyzed

    Returns:
        Array - an array of word pairs
    """

    # Check if text is string or unicode
    # if type(text) is not str:
    #     raise TypeError('you did not pass a string as argument')

    # Remove percent sign and some punctuation, expand common abbreviations
    text = re.sub(r'%', '', text)
    text = re.sub(r'“', '', text)
    text = re.sub(r'–', '', text)
    text = re.sub(r'„', '', text)
    text = re.sub(r'ca\.', '', text)
    text = re.sub(r'Dr\.', 'Doktor', text)
    text = re.sub(r'St\.', 'Sankt', text)
    text = re.sub(r'bzw\.', 'beziehungsweise', text)
    text = re.sub(r'[zZ]\. ?[bB]\.', 'zum Beispiel', text)
    text = re.sub(r'usw\.', 'und so weiter', text)

    # Split text by line breaks
    paragraph_split = text.split('[LINEBREAK]')

    # Remove brackets and parentheses from text
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Remove trailing white space
    text = text.strip()

    # If the text doesn't end with a sentence mark, add a dot
    if not text[-1:] in ['.', '!', '?']:
        text += '.'

    ############################################################################
    # Tag text
    ############################################################################

    # Save text to file
    f = open(constants.temp_text, 'w')
    f.write(text.encode('utf-8'))
    f.close()

    # Tokenize
    f = open(constants.temp_tokens, 'w')
    subprocess.call([constants.tokenizer, constants.temp_text],
                    stdout=f, shell=False)
    f.close()

    # Tag tokens from temp_tokens
    f = open(constants.temp_tags, 'w')
    subprocess.call([constants.rftagger, constants.german_par,
                     constants.temp_tokens], stdout=f, shell=False)
    f.close()

    # Read tags from file
    f = open(constants.temp_tags, 'r')
    tags = f.readlines()
    f.close()

    # Split tags into arrays
    tags = [str.split(tag, '\t') for tag in tags]

    # Remove the last entry: it is only a \n character and can be ignored.
    # This is a peculiarity of the RFTagger output.
    tags.pop()

    # Remove \n from the end of each tag
    tags = [[tag[0].decode('utf-8'), tag[1][:-1]] for tag in tags]

    ############################################################################
    # Further processing
    ############################################################################

    # Load GermaNet
    gn = load_germanet()

    # Lemmatise all words
    tags = [{
        'orth': tag[0],
        'lemma': gn.lemmatise(tag[0])[0],
        'pos': tag[1]
    } for tag in tags]

    # Filter only relevant tags: verbs, nouns, pronouns
    regex = re.compile(
        r'.*N.Name.*|.*N.Reg.*|.*SYM.Pun.Sent.*|.*VFIN.*|.*PRO.Pers.*|.*PRO.Dem'
    )

    # Filtered tags
    tags = [tag for tag in tags if regex.match(tag['pos']) is not None]

    # Get specific elements of words
    tags = getPOSElement('singular', r'.*Sg', tags)
    tags = getPOSElement('accusative', r'.*N.(Reg|Name).Acc', tags)
    tags = getPOSElement('dative', r'.*N.(Reg|Name).Dat', tags)
    tags = getPOSElement('nominative', r'.*N.(Reg|Name).Nom', tags)
    tags = getPOSElement('genitive', r'.*N.(Reg|Name).Gen', tags)
    tags = getPOSElement('feminin', r'.*Fem', tags)
    tags = getPOSElement('neutrum', r'.*Neut', tags)
    tags = getPOSElement('noun', r'.*N.Name.*|.*N.Reg', tags)
    tags = getPOSElement('pronoun', r'.*PRO.Dem.*|.*PRO.Pers', tags)
    tags = getPOSElement('verb', r'.*VFIN', tags)

    # Get sentences
    sentences = []
    sentenceArray = []
    for word in tags:
        if word['pos'] != 'SYM.Pun.Sent':
            sentenceArray.append(word)
        else:
            sentences.append(sentenceArray)
            sentenceArray = []

    ############################################################################
    # Build word pairs
    ############################################################################

    # Init word pairs array
    word_pairs = []

    # Build lexical overlap word pairs
    for val, sentence in enumerate(sentences):
        # Get all nouns
        nouns = [word['lemma'] for word in sentence if word['noun']]
        nouns_full = [word for word in sentence if word['noun']]
        nominatives = filter(lambda x: x['nominative'], sentence)

        # There is only one noun in the current sentence
        if len(nouns) == 1:
            # Append lonely noun
            word_pairs.append({
                'source': {
                    'word': nouns_full[0]['orth'],
                    'lemma': nouns_full[0]['lemma'],
                    'sentence': val
                },
                'target': {
                    'word': nouns_full[0]['orth'],
                    'lemma': nouns_full[0]['lemma'],
                    'sentence': val
                },
                'device': 'single word'
            })

        # There are at least two nouns in the sentence
        elif len(nouns) > 1:
            # There is a nominative among the nouns
            if len(nominatives) > 0:
                # Loop over every combination of nouns in the current sentence
                for subset in itertools.combinations_with_replacement(
                        nouns_full, 2):
                    if subset[0] != subset[1]:
                        # Check if the first word is nominative
                        if subset[0]['nominative']:
                            # Only combine nominatives with accusative, dative
                            # and genitive
                            if subset[1]['accusative'] or subset[1]['dative'] or \
                                    subset[1]['genitive'] or subset[1]['nominative']:
                                # Append word pair
                                word_pairs.append({
                                    'source': {
                                        'word': subset[0]['orth'],
                                        'lemma': subset[0]['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': subset[1]['orth'],
                                        'lemma': subset[1]['lemma'],
                                        'sentence': val
                                    },
                                    'device': 'within sentence'
                                })
                        # Check if the second word is nominative
                        if subset[1]['nominative']:
                            # Only combine nominatives with accusative, dative
                            # and genitive
                            if subset[0]['accusative'] or subset[0]['dative'] or \
                                    subset[0]['genitive'] or subset[0]['nominative']:
                                # Append word pair
                                word_pairs.append({
                                    'source': {
                                        'word': subset[1]['orth'],
                                        'lemma': subset[1]['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': subset[0]['orth'],
                                        'lemma': subset[0]['lemma'],
                                        'sentence': val
                                    },
                                    'device': 'within sentence'
                                })
            # There are no nominatives in the sentence
            else:
                # Loop over every combination of nouns in the current sentence
                for subset in itertools.combinations_with_replacement(
                        nouns_full, 2):
                    if subset[0] != subset[1]:
                        # Combine accusative with dative
                        if subset[0]['accusative'] and subset[1]['dative'] and \
                                subset[0]['genitive']:
                            # Append word pair
                            word_pairs.append({
                                'source': {
                                    'word': subset[0]['orth'],
                                    'lemma': subset[0]['lemma'],
                                    'sentence': val
                                },
                                'target': {
                                    'word': subset[1]['orth'],
                                    'lemma': subset[1]['lemma'],
                                    'sentence': val
                                },
                                'device': 'within sentence'
                            })
                        elif subset[1]['accusative'] and subset[0]['dative'] and \
                                subset[1]['genitive']:
                            # Append word pair
                            word_pairs.append({
                                'source': {
                                    'word': subset[0]['orth'],
                                    'lemma': subset[0]['lemma'],
                                    'sentence': val
                                },
                                'target': {
                                    'word': subset[1]['orth'],
                                    'lemma': subset[1]['lemma'],
                                    'sentence': val
                                },
                                'device': 'within sentence'
                            })

    # Get hypernym hyponym pairs
    hyponym_hyper_pairs = []

    # Get coreference resolutions
    coreferences = []

    # Get compounds
    compounds = []

    # Get stem relations
    stem_relations = []

    # Get hypernym hyponym pairs
    # hyponym_hyper_pairs = getHypoHyperPairs(sentences, gn)

    # Get coreference resolutions
    # coreferences = get_coreferences(sentences, gn)

    # Get compounds
    # compounds = get_compounds(sentences)

    # Get stem relations
    # stem_relations = get_stem_relations(sentences, gn)

    # Merge all word pairs
    # word_pairs = word_pairs + hyponym_hyper_pairs + coreferences + compounds + \
    #     stem_relations

    ######################################
    # Calculate number of relations
    ######################################

    word_tuples = map(lambda x: (x['source']['lemma'], x['target']['lemma']),
                      word_pairs)
    word_tuples = list(
        set([(pair['source']['lemma'], pair['target']['lemma'])
             for pair in word_pairs
             if pair['source']['lemma'] != pair['target']['lemma']]))

    # Calc number of sentences
    num_sentences = len(sentences)

    # Calculate local cohesion
    local_cohesion = calc_local_cohesion(word_pairs, sentences)

    # Calculate clusters
    cluster = get_clusters(word_pairs, sentences)

    # When clusters are calculated, assign them to the word_pairs as
    # an additional value
    word_cluster_index = {}
    for index, single_cluster in enumerate(cluster):
        # Get words for current cluster
        source_words = map(lambda x: x['source']['lemma'], single_cluster)
        target_words = map(lambda x: x['target']['lemma'], single_cluster)

        # Concatenate sources and targets into one array
        words = source_words + target_words

        # Assign index to word_cluster_index dict
        for word in words:
            word_cluster_index[word] = index

    # Now that we have the indexes for each cluster we can assign the index
    # to the word_pairs
    for word_pair in word_pairs:
        word_pair['cluster'] = word_cluster_index[word_pair['source']['lemma']]

    # Get dictionary of orthographic forms of all lemmas
    word_lemma_mapping = get_lemma_mapping(word_pairs)

    # Prepare data for frontend
    links = [{
        'source': pair['source']['lemma'],
        'target': pair['target']['lemma'],
        'device': pair['device'],
        'cluster': pair['cluster']
    } for pair in word_pairs]
    nodes = [{
        'id': word,
        'index': ind
    } for ind, word in enumerate(word_lemma_mapping['lemma_word'])]

    # Get number of concepts
    num_concepts = len(
        set([concept['lemma'] for concept in tags if concept['noun'] == True]))

    # Generate html string for editor
    html_string = generateHTML(paragraph_split, word_lemma_mapping,
                               word_cluster_index)

    return {
        'word_pairs': word_pairs,
        'links': links,
        'nodes': nodes,
        'numSentences': num_sentences,
        'numConcepts': num_concepts,
        'clusters': cluster,
        'numRelations': len(word_tuples),
        'numCluster': len(cluster),
        'local cohesion': local_cohesion['local_cohesion'],
        'cohSentences': local_cohesion['cohSentences'],
        'cohNotSentences': local_cohesion['cohNotSentences'],
        'lemmaWordRelations': word_lemma_mapping['lemma_word'],
        'wordLemmaRelations': word_lemma_mapping['word_lemma'],
        'wordClusterIndex': word_cluster_index,
        'numCompounds': len(compounds),
        'numCoreferences': len(coreferences),
        'numStemRelations': len(stem_relations),
        'numHypoHyper': len(hyponym_hyper_pairs),
        'html_string': html_string
    }
from rest_framework.response import Response
from rest_framework.decorators import api_view
from pygermanet import load_germanet
from django.conf import settings

gn_host = settings.MONGO_SETTINGS['host']
gn_port = settings.MONGO_SETTINGS['port']
gn = load_germanet(host=gn_host, port=gn_port)


@api_view()
def synset(request):
    """
    get:
    Expects a `token` parameter (e.g. ?token=flog) which will be checked
    against GermaNet.
    """
    token = request.GET.get('token')
    enriched = {}
    if token:
        lemma = gn.lemmatise("{}".format(token))
        synsets = []  # initialised once so both branches below can append to it
        if len(lemma) > 0:
            for x in lemma:
                for y in gn.synsets("{}".format(x)):
                    synsets.append(y)
        else:
            for y in gn.synsets("{}".format(lemma[0])):
                synsets.append(y)
        synonyms = []
from os.path import join, exists
from time import time

import numpy as np
from pygermanet import load_germanet, Synset
from tqdm import tqdm

from constants import LDA_PATH
from evaluate_topics import parse_args
from utils import load, init_logging, log_args

np.set_printoptions(precision=3)
gn = load_germanet()
tqdm.pandas()


def orth(synset):
    return synset.lemmas[0].orthForm


def compare_synset_lists(synset_list1, synset_list2, sim_func, agg_func):
    try:
        return agg_func(
            sim_func(ss1, ss2)
            for ss1 in synset_list1
            for ss2 in synset_list2)
    except ValueError:
        return np.nan


def similarities(topic, topn, ignore_unknown=True,
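# A hedged example of how compare_synset_lists might be called. The similarity
# below is a simple hypernym-path overlap used purely for illustration (not the
# sim_func of the original script), built only from the hypernym_paths attribute
# already used in these snippets; 'Hund' and 'Katze' are illustrative inputs.
def path_overlap(ss1, ss2):
    nodes1 = {str(node) for path in ss1.hypernym_paths for node in path}
    nodes2 = {str(node) for path in ss2.hypernym_paths for node in path}
    return len(nodes1 & nodes2) / float(len(nodes1 | nodes2))


print(compare_synset_lists(gn.synsets('Hund'), gn.synsets('Katze'),
                           path_overlap, max))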
def get_tags(self):
    """Generates tags from string.

    Takes a text as input and extracts nominatives using RFTagger.

    Args:
        None

    Returns:
        List with tags
    """
    # Create directory temp if not existent
    if not os.path.exists(constants.temp_dir):
        os.makedirs(constants.temp_dir)

    # Create random string
    rand_string = ''.join(
        random.choice(string.ascii_lowercase + string.digits)
        for _ in range(15))

    # Paths for text files
    tokens = constants.temp_tokens + "_" + rand_string + ".txt"
    curr_text = constants.temp_text + "_" + rand_string + ".txt"

    # Save text to file
    f = open(curr_text, 'w')
    f.write(self.text)
    f.close()

    # Tokenize
    f = open(tokens, 'w')
    subprocess.call([constants.tokenizer, curr_text], stdout=f, shell=False)
    f.close()

    # Tag tokens from temp_tokens
    f = open(constants.temp_tags + "_" + rand_string + ".txt", 'w')
    subprocess.call([constants.rftagger, constants.german_par, tokens],
                    stdout=f, shell=False)
    f.close()

    # Read tags from file
    f = open(constants.temp_tags + "_" + rand_string + ".txt", 'r')
    tags = f.readlines()
    f.close()

    # Regular Expression
    # regex = re.compile(r'.*N.Name.*|.*N.Reg.*|.*SYM.Pun.Sent')
    #
    # Filtered tags
    # filtered_tags = [regex.match(tag).string for tag in tags
    #                  if regex.match(tag) is not None]

    # Split tags into lists
    splited_tags = [str.split(tag, '\t') for tag in tags]

    # Load GermaNet
    g = load_germanet()

    # Build lemmas
    splited_tags_lemma = [[
        g.lemmatise(tag[0].decode('utf-8'))[0], tag[0], tag[1]
    ] for tag in splited_tags[:-1]]

    # Update self.tags
    self.tags = splited_tags_lemma

    # Remove files
    os.remove(curr_text)
    os.remove(tokens)
    os.remove(constants.temp_tags + "_" + rand_string + ".txt")

    return splited_tags_lemma