def query_frog_sentence(words):
    """Run Frog over *words* and return its token analysis.

    Returns the list of per-token dicts from ``frog.process``, or ``None``
    when Frog is not available (``have_frog`` is falsy). On any Frog error
    the process exits with status 1.

    Relies on module-level ``have_frog``, ``frog`` and ``verbose``.
    """
    if have_frog:
        try:
            frog_out = frog.process(words)
            if verbose:
                print("frog_out", frog_out)
            return frog_out
        # Was a bare `except:` — that also traps SystemExit/KeyboardInterrupt.
        except Exception:
            print("Unexpected Frog error:", sys.exc_info()[0])
            sys.exit(1)
    return None
def find_keywords_and_groups(text, keywords, frog):
    """Returns a list of the keywords and a list of associated groups that
    occur in tokens."""
    found = []
    found_groups = []
    # frog.process yields one analysis dict per token
    for token in frog.process(text):
        token_lemma = token["lemma"]
        token_groups = keywords.get(token_lemma.lower())
        if token_groups is None:
            continue
        found.append(token_lemma)
        found_groups.extend(token_groups)
    return list(set(found)), list(set(found_groups))
def lemmatise_frog(word, lemma, tag):
    """Lemmatise *word* with Frog.

    Returns ``(Lemma, "FROG")`` on success, or ``(None, "UNKNOWN")`` when
    Frog is unavailable or fails. Example Frog output:
    [{'index': '1', 'lemma': 'κρατήρ', 'pos': 'N--s---ma-', 'eos': True,
      'posprob': 1.0, 'text': 'κρητῆρα'}]

    NOTE(review): the *lemma* parameter is unused here (kept for interface
    compatibility with sibling lemmatisers — confirm against callers).
    """
    if have_frog:
        try:
            frog_out = frog.process(word)
            the_lemma = frog_out[0]["lemma"]
            the_tag = frog_out[0]["pos"]
            # BUG FIX: pass the tag Frog actually produced. The original
            # computed `the_tag` but then passed the caller-supplied `tag`,
            # leaving `the_tag` dead.
            new_lemma = Lemma(word, the_lemma, the_tag, 0)
            return (new_lemma, "FROG")
        # Was a bare `except:`; keep the best-effort fallthrough but don't
        # swallow SystemExit/KeyboardInterrupt.
        except Exception:
            pass
    return (None, "UNKNOWN")
def read_keywords(filename):
    """Returns a list of keywords from the datafile. Keywords are lemmatized.

    One keyword per line; a leading '#' is stripped. Multi-word entries
    (more than one Frog token) are skipped. The result is deduplicated.
    """
    keywords = []
    frog = get_frog()
    # BUG FIX: open in binary mode so the explicit utf-8 decode works on
    # Python 3 as well — the original opened in text mode, where lines are
    # already `str` and `.decode` raises AttributeError.
    with open(filename, "rb") as f:
        for line in f:
            word = line.decode("utf-8").strip().lower()
            if not word:
                continue  # guard: a blank line would crash word[0] below
            if word[0] == "#":
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) == 1:  # TODO: we skip over multi-word keywords
                keywords.append(tokens[0]["lemma"])
    return list(set(keywords))
def frog_process():
    """Flask endpoint: run Frog over a text supplied either in the JSON
    body ({"text": ...}) or as a GET parameter, and return the analysis."""
    body = request.json
    if body and "text" in body:
        text = body["text"]
    else:
        text = request.args.get('text', '')
    if not text:
        error = {"message": "You must include a text as GET parameter or in the body."}
        return jsonify(error), 400
    app.logger.debug("Analyzing text ..")
    app.logger.debug(text.replace('\n', ' '))
    return jsonify({"response": frog.process(text), "text": text})
def find_keywords_and_groups(text, keywords, frog):
    """Returns a list of the keywords and a list of associated groups that
    occur in tokens."""
    matched_lemmas = []
    matched_groups = []
    for token in frog.process(text):  # one analysis dict per token
        lemma = token["lemma"].lower()
        entry = keywords.get(lemma)
        if entry is None:
            continue
        # Apply the POS filter only when the tagger is confident; a
        # low-probability tag is not trusted enough to reject the match.
        if token["posprob"] > 0.6 and not token["pos"].startswith(entry.pos + "("):
            continue
        matched_lemmas.append(lemma)
        matched_groups += entry.groups
    return list(set(matched_lemmas)), list(set(matched_groups))
def read_keywords(filename):
    """Returns a list of Keyword objects from the datafile.

    Each line is "word,pos"; a leading '#' on the word is stripped.
    Multi-word entries are skipped and only the first occurrence of a
    lemma is kept.
    """
    keywords = []
    lemmas = []
    frog = get_frog()
    # BUG FIX: open in binary mode so the explicit utf-8 decode works on
    # Python 3 as well (text-mode lines are `str` and have no `.decode`).
    with open(filename, "rb") as f:
        for line in f:
            stripped = line.decode("utf-8").strip()
            if not stripped:
                continue  # guard: a blank line would crash the unpack below
            word, pos = stripped.split(",")
            word = word.lower()
            if word[0] == "#":
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) == 1:  # TODO: we skip over multi-word keywords
                lemma = tokens[0]["lemma"]
                if lemma not in lemmas:  # we only want unique lemma's
                    k = Keyword(lemma=lemma, pos=pos)
                    keywords.append(k)
                    lemmas.append(lemma)
    return keywords
def clean_wordlist(filename):
    """Re-lemmatise data/<filename> ("word,pos" per line) and write the
    deduplicated, sorted result to data/new_<filename>.

    SPEC-tagged entries are dropped; multi-word entries keep their surface
    form instead of a Frog lemma.
    """
    frog = get_frog()
    keywords = []
    with open("data/{}".format(filename)) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # guard: a blank line would crash the unpack below
            word, pos = stripped.split(",")
            if pos.startswith("SPEC"):
                continue  # skip names / special tokens
            word = word.lower()
            if word[0] == "#":
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) > 1:  # TODO: we skip over multi-word keywords
                lemma = word
            else:
                lemma = tokens[0]["lemma"]
            keywords.append((lemma, pos))
    keywords = sorted(set(keywords))
    with open("data/new_{}".format(filename), "w") as f:
        for (word, pos) in keywords:
            f.write("{},{}\n".format(word, pos))
def lemmatize_word(word, lang):
    """Lemmatize *word*: Frog for Dutch (lang == "D"), the WordNet
    lemmatizer for anything else."""
    if lang == "D":
        return frog.process(word)[0]['lemma']
    return wn_lemmatizer(word)
from __future__ import print_function, unicode_literals

import frog

# Bind the analyzer under its own name so the `frog` module stays reachable.
analyzer = frog.Frog(frog.FrogOptions(parser=False), "/etc/frog/frog.cfg")

raw = analyzer.process_raw("Dit is een test")
print("RAW OUTPUT=", raw)

parsed = analyzer.process("Dit is nog een test.")
print("PARSED OUTPUT=", parsed)
# Load the tokenized negative translations, one sentence per line
# (newlines are kept; Frog tokenization handles them).
with open("./data/neg.translated.tok", "r") as f_in:
    neg_trans_list = [l for l in f_in]

# NOTE: rebinds the module name `frog` to a Frog instance (no parsing,
# no NER, no tokenization).
frog = frog.Frog(frog.FrogOptions(parser=False, ner=False, tok=False))

# Matches CGN-style tags such as "N(soort,ev)": group 1 = coarse tag,
# group 2 = the fine-grained feature list.
p = re.compile('(ADJ|BW|LID|N|SPEC|TSW|TW|VG|VNW|VZ|WW|LET)\((.*)\)')

def parse_pos(pos):
    # Split a full POS tag into (coarse_tag, [fine_features]).
    # NOTE(review): p.match returns None for a tag outside the alternation,
    # which would make m.group raise AttributeError — confirm the tagset.
    m = p.match(pos)
    coarse = m.group(1)
    fine = m.group(2)
    return coarse, fine.split(",")

# Coarse POS sequences per sentence.
# NOTE(review): pos_trans_list is not defined in this chunk — presumably
# loaded earlier in the file; verify.
X_pos = [[parse_pos(t["pos"])[0] for t in frog.process(sent)] for sent in pos_trans_list]
X_neg = [[parse_pos(t["pos"])[0] for t in frog.process(sent)] for sent in neg_trans_list]

# Write one space-separated coarse-tag sequence per line.
with open("./data/parsed/positive_pos_coarse.txt", "w") as f:
    for s in X_pos:
        f.write(" ".join(s)+'\n')
with open("./data/parsed/negative_pos_coarse.txt", "w") as f:
    for s in X_neg:
        f.write(" ".join(s)+'\n')
# runnen met LaMachine (zie readme) from __future__ import print_function, unicode_literals #to make this work on Python 2 as well as Python 3 import frog, pickle frog = frog.Frog(frog.FrogOptions(parser=False)) f = open(r"50zinnen", "rb") tekst = pickle.load(f) output = frog.process("\n".join(tekst)) f.close() f = open("frog_resultaat.json", "w+") f.write(str(output)) f.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals  # Python 2/3 compatibility
import frog

filepath = '/Users/roelsmeets/desktop/af_corpora/af_corpus_1stpers_clean/AaVander_DeLichtekooiVanLoven_clean.txt'

# FIX: read via a context manager — the original leaked the file handle
# with open(...).read().
with open(filepath, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

frog = frog.Frog(frog.FrogOptions(parser=False))

raw_output = frog.process_raw(text)
parsed_output = frog.process(text)

# Person entities collected from the parsed tokens.
named_entities = []
# TODO(review): the original (commented-out) extraction indexed
# parsed_output by element, but frog.process returns a list of token
# dicts — the working form is:
# for token in parsed_output:
#     if token['ner'] == 'B-PER':
#         named_entities.append(token['text'])
# In[7]: if not os.path.exists('data/' + 'dev' + '.POS.txt') or not os.path.exists('data/' + 'train' + '.POS.txt'): import frog frog = frog.Frog(frog.FrogOptions(parser=False)) for t in ['dev', 'train']: with open('data/' + t + '.POS.txt', 'w') as out: with open('data/' + t + '.txt', 'r') as f: for line in f: sentence, tag = line.strip().split("\t") froggo = frog.process(sentence) postext = [] for w in froggo: postext.append(w['pos'].split("(")[0]) out.write(" ".join(postext) + "\t" + tag + "\n") # In[8]: _X_pos_training = [] _y_pos_training = [] with open('data/train.POS.txt', 'r') as f: for line in f: sentence, tag = line.strip().split("\t") _X_pos_training.append(sentence)