def frog_process_files(files, verbose=True):
    seen = []
    start_time = time.time()
    frogger = frog.Frog(
        frog.FrogOptions(parser=False, mwu=False, ner=False, morph=False,
                         chunking=False, numThreads=8),
        '/etc/frog/frog.cfg')
    for i, filename in enumerate(files):
        with open(filename, 'r') as in_file:
            output = frogger.process_raw(in_file.read())
        if verbose:
            print('> PROCESSING', filename,
                  str(len(seen)) + '/' + str(len(files)))
        seen.append(filename)
        # Timings (estimation of time remaining)
        runtime = time.time() - start_time
        per_document_time = runtime / len(seen)
        remaining_time = (len(files) - len(seen)) * per_document_time
        total_time = remaining_time + runtime
        print("RUNTIME", duration_to_string(runtime),
              "(" + duration_to_string(per_document_time) + ")",
              'REMAINING', duration_to_string(remaining_time),
              'TOTAL', duration_to_string(total_time))
        frogged_filename = util.filename_without_extension(filename, '.txt')
        with open(OUTPUT_FOLDER + frogged_filename + '.frog.out', 'w') as f:
            f.write(output)
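# The duration_to_string helper used above is not shown in this snippet;
# a minimal sketch of what it might look like (hypothetical implementation,
# assuming an H:MM:SS progress format):
def duration_to_string(seconds):
    # Format a number of seconds as H:MM:SS for the progress report.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:d}:{:02d}:{:02d}'.format(hours, minutes, secs)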
def process_data(self, X):
    import frog
    frogg = frog.Frog(frog.FrogOptions(lemma=False, morph=False))
    new_X = [' '.join([word['pos'] for word in frogg.process(x)])
             for x in X]
    return new_X
def __init__(self, lmdir, sleep=False):
    """Starts the frog server unless the sleep flag is set."""
    if not sleep:
        import frog
        opts = frog.FrogOptions(parser=False, ner=False)
        self.frogger = frog.Frog(
            opts, lmdir + "LaMachine/lamachine/etc/frog/frog-twitter.cfg")
def process_data(self, X):
    frogg = frog.Frog(
        frog.FrogOptions(morph=False, mwu=False, chunking=False, ner=False))
    new_X = [' '.join([word['lemma'] for word in frogg.process(x)])
             for x in X]
    return new_X
def preprocess(files):
    """Preprocess a list of XML files.

    The cleaned files will be saved in the output folder.
    Remove the XML tags and clean the remaining raw text to have one
    sentence per line with lemmatized words.
    """
    frog_options = frog.FrogOptions(tok=False, morph=False, mwu=True,
                                    chunking=False, ner=False, numThreads=8)
    frogger = frog.Frog(frog_options,
                        '/vol/customopt/lamachine/etc/frog/frog.cfg')
    start_time = time.time()
    for i, file_name in enumerate(files):
        outfile = ntpath.basename(file_name)[:-4] + '.txt'
        out_name = os.path.join(OUTPUT_FOLDER, outfile)
        if os.path.isfile(out_name):
            print('Already done:', out_name)
            continue
        with open(file_name, 'r', encoding='utf-8') as file:
            try:
                text = file.read()
                # Remove all XML tags
                text = re.sub('<[^>]*>', '', text)
                lines = text.splitlines()
                # Remove abundant whitespace
                lines = [line.strip() for line in lines]
                # One sentence per line
                lines = [re.sub(r'(\w)\. ([A-Z])', '\\1.\n\\2', line)
                         for line in lines]
                # Remove punctuation
                lines = [re.sub(r'[\.,:;/\(\)\[\]\'\"]', '', line)
                         for line in lines]
                # Remove empty lines and make lower case
                lines = [line.lower() for line in lines if line != '']
                # Convert each word to its lemma
                lemmas = [lemmatize(line, frogger) for line in lines]
                # Change extension to .txt
                with open(out_name, 'w', encoding='utf-8') as out:
                    out.write('\n'.join(lemmas))
                if i % 49 == 0 and i != 0:
                    print('Done {}/{}'.format(i, len(files)))
                    time_per_doc = (time.time() - start_time) / i
                    print('Average time/document:',
                          sec_to_string(time_per_doc))
                    time_remaining = time_per_doc * (len(files) - i)
                    print('Time remaining:', sec_to_string(time_remaining))
            except UnicodeError:
                print('Skipping {}, UnicodeError'.format(file_name))
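# The lemmatize helper called above is not shown here; a minimal sketch,
# assuming it joins the lemma of every token Frog finds in the line
# (hypothetical implementation, consistent with Frog's token dicts):
def lemmatize(line, frogger):
    return ' '.join(token['lemma'] for token in frogger.process(line))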
def activate_lemmatizers():
    global frog_installed, frog_lemmatizer, lemmas_nl, lemmas_nl_file, \
        wn_lemmatizer
    wn_lemmatizer = WordNetLemmatizer()
    frog_installed = True
    with open("./data/lemmas_nl.csv", 'r') as lemmas_nl_file:
        lemmas_nl_df = pandas.read_csv(lemmas_nl_file, sep=",")
        lemmas_nl = dict(zip(lemmas_nl_df["word"], lemmas_nl_df["lemma"]))
    try:
        import frog
        frog_lemmatizer = frog.Frog(frog.FrogOptions(parser=False))
        lemmas_nl_file = open("./data/lemmas_nl.csv", 'a')
    except ImportError:
        frog_installed = False
def get_frog():
    """Returns the interface object to frog NLP.

    (There should only be one instance, because it spawns a frog
    process that consumes a lot of RAM.)
    """
    global FROG
    if FROG is None:
        FROG = frog.Frog(
            frog.FrogOptions(tok=True, lemma=True, morph=False,
                             daringmorph=False, mwu=True, chunking=False,
                             ner=False, parser=False),
            "/home/rahiel/hortiradar/venv/share/frog/nld/frog.cfg")
    return FROG
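# A minimal usage sketch for the singleton above (the sample sentence is
# illustrative). Frog returns one dict per token, with keys such as
# 'text', 'lemma' and 'pos':
tokens = get_frog().process("Dit is een zin.")
for token in tokens:
    print(token["text"], token["lemma"], token["pos"])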
def function_sents(X):
    import frog
    frogg = frog.Frog(frog.FrogOptions(morph=False, mwu=False,
                                       chunking=False))
    aux = open('data/ww.txt', 'r').read().splitlines()
    new_X = []
    for x in X:
        new_x = []
        output = frogg.process(x)
        for word in output:
            if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                continue
            if word['pos'][:2] == 'WW':
                if word['lemma'] in aux:
                    new_x.append(word['lemma'])
                continue
            new_x.append(word['text'].lower())
        new_X.append(new_x)
    return new_X
def process_data(self, X):
    """Filter data.

    Leave only articles, pronouns, conjunctions and auxiliary verbs.
    """
    frogg = frog.Frog(
        frog.FrogOptions(morph=False, mwu=False, chunking=False))
    aux = open(config.VERB_FILE, 'r').read().splitlines()
    new_X = []
    for x in X:
        new_x = []
        output = frogg.process(x)
        for word in output:
            if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                continue
            if word['pos'][:2] == 'WW':
                if word['lemma'] in aux:
                    new_x.append(word['lemma'])
                continue
            new_x.append(word['text'].lower())
        new_X.append(new_x)
    return new_X
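# For reference, the CGN tag prefixes matched above are LID (article),
# VNW (pronoun), VG (conjunction) and WW (verb). A small illustration on
# hand-written token dicts (values are illustrative, not real Frog output):
example = [
    {'text': 'De', 'lemma': 'de', 'pos': 'LID(bep,stan,rest)'},
    {'text': 'kat', 'lemma': 'kat', 'pos': 'N(soort,ev,basis,zijd,stan)'},
    {'text': 'slaapt', 'lemma': 'slapen', 'pos': 'WW(pv,tgw,met-t)'},
]
kept = [t for t in example if t['pos'][:3] in ['LID', 'VNW', 'VG(', 'WW(']]
# 'De' and 'slaapt' pass the filter; the content noun 'kat' is dropped.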
# Sanity checks, aborts if specified lexicon files not found.
files_found = True
for f in [greekHDfile, filename, nofreqfile, extrafile, frog_cfg]:
    if f and not os.path.exists(f):
        print("ERROR: FILE NOT FOUND:", f, file=sys.stderr)
        files_found = False
if not files_found:
    sys.exit(1)

# Initialise Frog.
if have_frog:
    print("INITIALISE FROG", file=sys.stderr)
    frog = frog.Frog(
        frog.FrogOptions(parser=True, tok=False, morph=False,
                         mwu=False, chunking=False, ner=False),
        frog_cfg)

# Statistics on lexicon files.
line_count = 0
new_entries = 0
zero_freq = 0
if greekHDfile:
    print("READING", greekHDfile, file=sys.stderr)
    with open(greekHDfile, 'r') as f:
        '''
        WORD       LEMMA     TAG           COUNT
        ἀλλήλοις   ἀλλήλων   Pc-p---md--i  5
        ἀλλήλοις   ἀλλήλων   Pc-p---nd--i  2
import os
import frog
from nltk import pos_tag
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import csv

frog = frog.Frog(frog.FrogOptions(parser=False))
lemmatizer = WordNetLemmatizer()


def wn_lemmatizer(word):
    tag = pos_tag([word])[0][1]
    # Converting it to WordNet format.
    mapping = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}
    tag_wn = mapping.get(tag[0], wn.NOUN)
    lemma = lemmatizer.lemmatize(word, tag_wn)
    return lemma


def preprocess_word(w):
    # An auxiliary function to clean test files.
    if "f-" in w:
        w = w[2:]
    if w[:3] == "vk-":
        w = w[3:]
    if w[:2] == "vk":
        w = w[2:]
    if w == "geen":
        return ""
    if "(" in w:
        w = w[:w.index("(")]
import csv

import frog
from tqdm import tqdm

frogger = frog.Frog(frog.FrogOptions(parser=False, ner=False))
reader = csv.reader(open('dmad_a.csv'))
writer = csv.writer(open('dmad_a_tagged.csv', 'w'))
corp = [x for x in reader]
for i, r in enumerate(tqdm(corp)):
    try:
        # Row 0 is the header: append a column name instead of tagging it.
        # For every other row, append token<TAB>lemma<TAB>POS lines for the
        # text in column 5.
        r += ["\n".join(["\t".join([token["text"], token["lemma"],
                                    token["pos"]])
                         for token in frogger.process(r[4])])] \
            if i else ['frogs']
        writer.writerow(r)
    except IndexError:
        print(r)
"""Usage: sample_file_builder <input_file> <output_file>"""
import nltk.data
import frog
import codecs
from docopt import docopt

# Parse command-line arguments from the usage string above.
args = docopt(__doc__)
input_file = args['<input_file>']
output_file = args['<output_file>']

sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')
froggie = frog.Frog(frog.FrogOptions(parser=False), "/etc/frog/frog.cfg")

# counter = 1
with codecs.open(output_file, "w", "utf-8") as of:
    with codecs.open(input_file, "r", "utf-8") as infile:
        for line in infile:
            of.write(froggie.process_raw(line) + "\n")
            # counter += 1
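# Example invocation (file names illustrative):
#
#     python sample_file_builder.py raw_corpus.txt frogged_sample.txt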
def lemmatize_sents(X):
    import frog
    frogg = frog.Frog(
        frog.FrogOptions(morph=False, mwu=False, chunking=False, ner=False))
    new_X = [' '.join([word['lemma'] for word in frogg.process(x)])
             for x in X]
    return new_X
# -*- coding: utf-8 -*-
import frog
import re

with open("./data/pos.translated.tok", "r") as f_in:
    pos_trans_list = [l for l in f_in]
with open("./data/neg.translated.tok", "r") as f_in:
    neg_trans_list = [l for l in f_in]

frog = frog.Frog(frog.FrogOptions(parser=False, ner=False, tok=False))

p = re.compile(r'(ADJ|BW|LID|N|SPEC|TSW|TW|VG|VNW|VZ|WW|LET)\((.*)\)')


def parse_pos(pos):
    m = p.match(pos)
    coarse = m.group(1)
    fine = m.group(2)
    return coarse, fine.split(",")


X_pos = [[parse_pos(t["pos"])[0] for t in frog.process(sent)]
         for sent in pos_trans_list]
X_neg = [[parse_pos(t["pos"])[0] for t in frog.process(sent)]
         for sent in neg_trans_list]
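# Example: splitting one CGN tag into its coarse class and feature list
# (illustrative input):
coarse, features = parse_pos('N(soort,ev,basis,zijd,stan)')
# coarse == 'N'; features == ['soort', 'ev', 'basis', 'zijd', 'stan']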