def extract_pos_tags(self):
    """Run Frog POS tagging over every file in self.input_path.

    Concatenates the last tab-separated column of every line of every input
    file into one word stream, slides a fixed-size window over it, and writes
    one JSON-encoded Frog token per line to self.postag_file, printing a
    progress percentage to stdout.
    """
    # create frog client
    bLocal = False
    if bLocal:
        port = 8080
        frogclient = FrogClient("localhost", port, returnall=True)
    else:
        port = 443
        # BUG FIX: this was assigned to 'frogClient' (capital C) while the
        # loop below reads 'frogclient' — the remote path (always taken,
        # since bLocal is hard-coded False) raised NameError.
        frogclient = FrogClient("https://languagemachines.github.io/frog", port, returnall=True)

    # create wordstream: last tab-separated field of each line, space-joined
    wordstream = ''
    for file in os.listdir(self.input_path):
        path = os.path.join(self.input_path, file)
        with open(path, 'rb') as reader:
            data = reader.read().decode('utf-8-sig')
            data = [line.split("\t")[-1] for line in data.split("\n")]
            wordstream += ' '.join(data)

    # extract pos tags over a sliding window (windows overlap by
    # window_size - window_shift characters)
    window_size = 250
    window_shift = 50
    index = 0
    with open(self.postag_file, 'w') as file:
        while index + window_size < len(wordstream):
            substream = wordstream[index:index + window_size]
            for data in frogclient.process(substream):
                # '\r' rewinds the cursor so the percentage overwrites itself
                sys.stdout.write('\r')
                percentage = round(100 * index / float(len(wordstream)), 2)
                sys.stdout.write(str(percentage) + '%')
                sys.stdout.flush()
                file.write(json.dumps(data) + "\n")
            index += window_shift
def tokenize(text):
    """Tokenize *text* via a Frog server on localhost:FROGPORT.

    Aborts the whole program with an error message if the server cannot be
    reached. Returns a (tokens, nbrOfSents) pair as produced by
    processFrogData().
    """
    try:
        client = FrogClient('localhost', FROGPORT, returnall=True)
    except Exception as e:
        sys.exit(COMMAND + ": cannot run frog: " + str(e))
    result = processFrogData(client.process(text))
    tokens, nbrOfSents = result
    return (tokens, nbrOfSents)
def applyNer(lines):
    """Run named-entity recognition on each line with a local Frog server.

    Returns the concatenation of prettyPrint()'s output for every line.
    """
    frogclient = FrogClient('localhost', PORT, returnall=True)
    # Collect pieces and join once — repeated string += is quadratic.
    chunks = []
    for line in lines:
        data = frogclient.process(line)
        chunks.append(prettyPrint(data))
    return "".join(chunks)
def frog_process(texts):
    """Tokenize each text with Frog and store the tokens in the cache.

    For every text: look up (or create) its cache entry keyed by
    hash_text(text), sentence-split it, cap sentence length at 250, run the
    joined sentences through Frog, drop tuples containing None (presumably
    Frog's sentence-boundary markers — confirm), and save the rest.
    """
    frogclient = FrogClient(
        config.frog_hostname,
        config.frog_port,
        returnall=True,
        timeout=1800.0,
    )
    try:
        for text in texts:
            cache = Cache.get_or_new(hash_text(text))
            sentences = [s for s in sent_tokenize(text) if s]
            sentences = split_long_sentences(sentences, 250)
            tokens = frogclient.process(' '.join(sentences))
            tokens_no_none = [token for token in tokens if None not in token]
            cache.data = tokens_no_none
            cache.save()
    finally:
        # BUG FIX: close the socket even when processing raises; previously
        # an exception mid-loop leaked the connection.
        frogclient.socket.close()
def call_frog(text):
    """
    Call frog on the text and return (sent, offset, word, lemma, pos,
    morphofeat) tuples
    """
    host, port = os.environ.get('FROG_HOST', 'localhost:9887').split(":")
    client = FrogClient(host, port, returnall=True)
    sentence_no = 1
    char_offset = 0
    for fields in client.process(text):
        word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 = fields
        if word is None:
            # a None word marks a sentence boundary in Frog's output
            sentence_no += 1
            continue
        pos = _POSMAP[morphofeat.split("(")[0]]
        yield Token(sentence_no, char_offset, word, lemma, pos, morphofeat, ner, chunk)
        char_offset += len(word)
def get_frogclient(port=8020):
    """Connect to a Frog server on localhost at *port*.

    Returns the connected FrogClient; on failure logs instructions for
    starting the server and exits the program with status 1.
    """
    try:
        frogclient = FrogClient('localhost', port)
        return frogclient
    # BUG FIX: bare 'except:' also swallowed SystemExit/KeyboardInterrupt;
    # catch Exception instead.
    except Exception:
        logger.error('Cannot connect to the Frog server. '
                     'Is it running at port {}?'.format(port))
        logger.info('Start the Frog server with "docker run -p '
                    '127.0.0.1:{}:{} -t -i proycon/lamachine frog '
                    '-S {}"'.format(port, port, port))
        sys.exit(1)
def call_frog(self, text):
    """
    Call frog on the text and return (sent, offset, word, lemma, pos,
    morphofeat) tuples
    """
    logging.debug("Creating frog client")
    client = FrogClient(self.host, self.port, returnall=True, timeout=600)
    sent_no = 1
    offset = 0
    logging.debug("Calling frog")
    all_tokens = list(client.process(text))
    logging.debug("Got {} tokens".format(len(all_tokens)))
    for token, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in all_tokens:
        if token is None:
            # None token == sentence boundary
            sent_no += 1
            continue
        yield (sent_no, offset, token, lemma, morphofeat, ner, chunk)
        offset += len(token)
def retag(doc, i):
    # Re-tag every sentence of folia document `doc` through a Frog server.
    # Worker `i` is mapped onto one of `threads` servers listening on
    # consecutive ports starting at 9000.
    global threads
    print "\tRetagging:"
    # matches the bracketed morpheme segments in Frog's morph column,
    # e.g. "[huis][je]"
    r = re.compile('\[(.*)\]')
    frogclient = FrogClient('localhost', 9000 + (i % threads))
    for sentence in doc.sentences():
        # feed the sentence to Frog as one space-joined string
        words = " ".join([w.text() for w in sentence.words()])
        for j, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
            # NOTE(review): assumes Frog returns exactly one tuple per input
            # word, in order, so index j lines up — confirm (tokenisation
            # differences would desync this).
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)
            #parse mbma
            morphemes = r.findall(morph)
            if morphemes:
                # attach a morphology layer holding one element per morpheme
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)
def frog_naf(text):
    """Call frog on the text and return the serialized NAF document."""
    naf = KafNafParser(type="NAF")
    # BUG FIX: removed an unused FrogClient('localhost', 9887) here —
    # call_frog() opens its own connection, so this one only leaked a socket.
    for token in call_frog(text):
        wf = naf.create_wf(token.word, token.sent, token.offset)
        term = naf.create_term(token.lemma, token.pos, token.morphofeat, [wf])
    # BUG FIX: register the linguistic processors once per document instead
    # of once per token (previously inside the loop, appending duplicates).
    naf.create_linguistic_processor("text", "Frog tokenizer", get_frog_version())
    naf.create_linguistic_processor("term", "Frog MBT", get_frog_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
""" Extract POS tags by using Frog. """ import json import sys from pynlpl.clients.frogclient import FrogClient port = 8020 frogclient = FrogClient('localhost', port, returnall=True) import os wordstream = '' data_folder = '../data/output/1_preprocessed/' for file in os.listdir(data_folder): path = os.path.join(data_folder, file) with open(path, 'rb') as reader: data = reader.read().decode('utf-8-sig') data = [line.split("\t")[-1] for line in data.split("\n")] wordstream += ' '.join(data) window_size = 250 window_shift = 50 index = 0 with open('pos_tags.txt', 'w') as file: while index + window_size < len(wordstream): substream = wordstream[index:index + window_size] for data in frogclient.process(substream): sys.stdout.write('\r') percentage = round(100 * index / float(len(wordstream)), 2) sys.stdout.write(str(percentage) + '%')
# NOTE(review): this chunk begins mid-option-parsing — the `if`/`elif` chain
# that the `else` below belongs to (and its enclosing getopt loop) lies
# outside the visible source, so the leading indentation here is a
# reconstruction; confirm against the full file.
        tok = True
    else:
        print >>sys.stderr, "ERROR: Unknown option:",o
        sys.exit(1)

# Validate the option combination before connecting to the Frog server.
if not port:
    print >> sys.stderr,"ERROR: No port specified to connect to Frog server"
    sys.exit(2)
elif (not textfile and not xmlfile and not foliafile):
    print >> sys.stderr,"ERROR: Specify a file with either --txt, --xml or --folia"
    sys.exit(2)
elif xmlfile and not xpathselect:
    print >> sys.stderr,"ERROR: You need to specify --selectsen or --selectpar when using --xml"
    sys.exit(2)

frogclient = FrogClient('localhost',port)

# Read the input lines; idmap holds a parallel list of per-line ids (or None).
idmap = []
data = []
if textfile:
    f = codecs.open(textfile, 'r', encoding)
    for line in f.readlines():
        if idinfirstcolumn:
            # first tab-separated column carries an explicit id for the line
            id, line = line.split('\t',1)
            idmap.append(id.strip())
        else:
            idmap.append(None)
        data.append(line.strip())
    f.close()
#!/usr/bin/env python
# Lemmatise and POS-tag a plain-text file line by line with a Frog server
# on localhost:12345. Writes <input>.lem and <input>.pos, one space-joined
# line of lemmas/tags per input line.
from __future__ import print_function, unicode_literals, division, absolute_import
import sys
import io
from pynlpl.clients.frogclient import FrogClient

frogclient = FrogClient('localhost', 12345)
inputfile = sys.argv[1]
with io.open(inputfile + '.lem', 'w', encoding='utf-8') as f_lemma, \
     io.open(inputfile + '.pos', 'w', encoding='utf-8') as f_pos, \
     io.open(inputfile, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # progress indicator: current line number on stderr
        print(i, file=sys.stderr)
        analyses = list(frogclient.process(line.strip()))
        lemline = [lemma for _word, lemma, _morph, _pos in analyses]
        posline = [pos for _word, _lemma, _morph, pos in analyses]
        f_lemma.write(" ".join(lemline).strip() + "\n")
        f_pos.write(" ".join(posline).strip() + "\n")
def check_status(self):
    # Probe the Frog server by opening a client connection to
    # self.host:self.port; the client object is discarded immediately.
    # NOTE(review): any connection error propagates to the caller —
    # presumably that is how failure is signalled here; confirm.
    frogclient = FrogClient(self.host, self.port, returnall=True)
def __init__(self, *args):
    """Initialise a tagger backend selected by args[0].

    Supported modes:
      file:[filename]            - read pre-tagged output from a file
      frog:[host]:[port]         - connect to a Frog server
      freeling:[host]:[port]     - connect to a FreeLing server
      corenlp                    - in-process Stanford Core NLP
      treetagger:[binary]        - path to the TreeTagger binary
      durmlex:[filename]         - in-memory lookup from a Durm lexicon
      oldlex:[filename]          - in-memory lookup from an OLDLexique file

    Raises Exception on a wrong argument count or unknown mode.
    """
    global WSDDIR
    self.tagger = None
    self.mode = args[0]
    if args[0] == "file":
        if len(args) != 2:
            raise Exception("Syntax: file:[filename]")
        self.tagger = codecs.open(args[1],'r','utf-8')
    elif args[0] == "frog":
        if len(args) != 3:
            raise Exception("Syntax: frog:[host]:[port]")
        from pynlpl.clients.frogclient import FrogClient
        port = int(args[2])
        self.tagger = FrogClient(args[1],port)
    elif args[0] == "freeling":
        if len(args) != 3:
            raise Exception("Syntax: freeling:[host]:[port]")
        from pynlpl.clients.freeling import FreeLingClient
        host = args[1]
        port = int(args[2])
        self.tagger = FreeLingClient(host,port)
    elif args[0] == "corenlp":
        if len(args) != 1:
            raise Exception("Syntax: corenlp")
        import corenlp
        print("Initialising Stanford Core NLP",file=stderr)
        self.tagger = corenlp.StanfordCoreNLP()
    elif args[0] == 'treetagger':
        if not len(args) == 2:
            raise Exception("Syntax: treetagger:[treetagger-bin]")
        self.tagger = args[1]
    elif args[0] == "durmlex":
        if not len(args) == 2:
            raise Exception("Syntax: durmlex:[filename]")
        print("Reading durm lexicon: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            # lemma is the 5th column with its sense suffix stripped
            lemma = fields[4].split('.')[0]
            self.tagger[wordform] = (lemma, 'n')
        f.close()
        print("Loaded ", len(self.tagger), " wordforms",file=stderr)
    elif args[0] == "oldlex":
        if not len(args) == 2:
            raise Exception("Syntax: oldlex:[filename]")
        print("Reading OLDLexique: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[1]
            if lemma == '=':
                # BUG FIX: was 'lemma == fields[0]' — a no-op comparison.
                # '=' means "lemma equals the wordform", so assign it.
                lemma = fields[0]
            pos = fields[2][0].lower()
            self.tagger[wordform] = (lemma, pos)
        print("Loaded ", len(self.tagger), " wordforms",file=stderr)
        f.close()
    else:
        raise Exception("Invalid mode: " + args[0])
# Harvest error/correction data from an OpenTaal list and write it out as a
# series of FoLiA documents, tagging the text via a running Frog server.
try:
    inputfile = sys.argv[1]
    frogport = int(sys.argv[2])
    outputdir = sys.argv[3]
    if len(sys.argv) >= 5:
        stripcorrections = bool(int(sys.argv[4]))
    else:
        stripcorrections = False
except:
    print >>sys.stderr ,"Usage: opentaalerrorharvest2folia.py inputfile frogport outputdir [stripcorrections=0/1]\nStart a frog server with: $ frog --skip=mp -S portnum"
    sys.exit(2)

frogclient = FrogClient('localhost', frogport)

correctioncount = 0
with codecs.open(inputfile,'r','utf-8','ignore') as f:
    for i, line in enumerate(f):
        # trailing comma: keep progress markers on one stderr line (Python 2)
        print >>sys.stderr,"@" + str(i),
        # Every 1000 input lines: save the current document and begin a new,
        # sequentially numbered one with ucto/CGN/mblem declarations.
        # NOTE(review): 'doc', 'docnum' and 'outputfile' are read here before
        # any visible assignment (i=0 hits 'if doc:' immediately) — they must
        # be initialised outside this chunk, or this raises NameError; confirm.
        if i % 1000 == 0:
            if doc:
                doc.save(outputfile)
                print >>sys.stderr,"Saved " + outputfile
            docnum += 1
            outputfile = outputdir + '/opentaalerrorharvest' + str(docnum) + '.xml'
            doc = folia.Document(id='opentaalerrorharvest' + str(docnum))
            doc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
# ner.py: perform named entity recognition with frog
# usage: ner.py < text
# note adapted from: https://www.tutorialspoint.com/python/python_networking.htm
# 20180604 erikt(at)xs4all.nl

from pynlpl.clients.frogclient import FrogClient
import re
import socket
import sys

PORT = 8080
MAXDATA = 1024
# column indexes into the tuples returned by FrogClient(returnall=True)
NERID = 4
POSID = 3
TOKENID = 0

def prettyPrint(data):
    """Print 'token pos ner' for each real token row, then a blank line."""
    for row in data:
        # Short rows or rows whose token is None are Frog sentence markers;
        # skip them. FIX: 'is not None' instead of '!= None', and the named
        # TOKENID constant instead of a bare 0.
        if len(row) >= NERID + 1 and row[TOKENID] is not None:
            lastLine = row[TOKENID] + " " + row[POSID] + " " + row[NERID]
            print(lastLine)
    print("")
    return ()

frogclient = FrogClient('localhost', PORT, returnall=True)
for line in sys.stdin:
    data = frogclient.process(line)
    prettyPrint(data)
from pynlpl.formats.sonar import CorpusDocumentX, ns
from pynlpl.clients.frogclient import FrogClient
import sys
import datetime

# Command line: a D-Coi XML file and the port of a running Frog server.
if len(sys.argv) == 3 and sys.argv[2].isdigit():
    docname = sys.argv[1]
    port = int(sys.argv[2])
else:
    print >> sys.stderr, "Usage: ./sonar_poslem_tagger_singlefile.py [filename] [frog-port]"
    print >> sys.stderr, "Please first start a Frog server with: frog --skip=tmp -S 12345 (or some other port number)"
    print >> sys.stderr, "Reads and writes D-Coi XML"
    # BUG FIX: without exiting here, 'docname' and 'port' were undefined and
    # the FrogClient call below raised NameError after the usage message.
    sys.exit(2)

#Make sure Tadpole/Frog server runs with tokeniser and MWU *DISABLED* !
frogclient = FrogClient('localhost', port)

print >> sys.stderr, "[" + datetime.datetime.now().strftime(
    "%Y-%m-%d %H:%M:%S") + "] PROCESSING " + docname + " (port " + str(
    port) + ")"

doc = CorpusDocumentX(docname)
processed_doc = False
for sentence in doc.sentences():
    words = " ".join([x.text for x in sentence])
    # Only re-process sentences with at least one word that has no pos/lemma
    # annotation yet (in either the dcoi namespace or plain attributes).
    process_sentence = False
    for x in sentence:
        if not (ns('dcoi') + 'pos' in x.attrib or ns('dcoi') + 'lemma' in x.attrib
                or 'pos' in x.attrib or 'lemma' in x.attrib):
            process_sentence = True
def connectToFrog():
    """Open a client connection to the Frog server at HOST:PORT.

    On failure, reports the problem through error() with NOFROGCONTACTMSG.
    """
    try:
        client = FrogClient(HOST, PORT, returnall=True)
    except Exception as e:
        error(NOFROGCONTACTMSG + " " + str(e))
    else:
        return client
#!/usr/bin/env python #-*- coding:utf-8 -*- from pynlpl.formats.sonar import CorpusX, CorpusDocumentX, ns from pynlpl.clients.frogclient import FrogClient import sys import os.path sonardir = sys.argv[1] #Starting temporary Frog server os.system("frog --skip=tmp -S 7551 &") time.sleep(3) frogclient = FrogClient('localhost',7551) for doc in CorpusX(sonardir,'tok',"", lambda f: not os.path.exists(f + '.pos') ): #read the *.tok files, on condition there are no *.pos equivalents (will not overwrite) processed_doc = False print doc.filename + '\tPROCESSING' for sentence in doc.sentences(): words = " ".join([ x.text for x in sentence ]) process_sentence = False for x in sentence: if not ns('dcoi') + 'pos' in x.attrib or not ns('dcoi') + 'lemma' in x.attrib: process_sentence = True if process_sentence: processed_doc = True for i, (word, lemma, morph, pos) in enumerate(frogclient.process(words)): try:
import flask
from flask import Flask
from flask import request, render_template, jsonify

TEMPLATES_AUTO_RELOAD = True
app = Flask(__name__)
app.jinja_env.auto_reload = True

from pynlpl.clients.frogclient import FrogClient

# Shared client for the Frog server running on host 'nlp'.
frogclient = FrogClient('nlp', 12345)
frogclient.returnall = True


def translate(text):
    """Run *text* through the shared Frog client and return the raw result."""
    return frogclient.process(text)


@app.route("/", methods=['GET', 'POST'])
def hello():
    """POST: tag the JSON 'text' field and return the analysis as JSON.
    GET: plain health-check greeting."""
    if request.method != 'POST':
        return 'hello\n'
    payload = request.json['text']
    return jsonify(translate(payload))