Example #1
    def extract_pos_tags(self):
        # create frogclient
        bLocal = False
        if bLocal:
            port = 8080
            frogclient = FrogClient("localhost", port, returnall=True)
        else:
            port = 443
            frogclient = FrogClient("https://languagemachines.github.io/frog",
                                    port,
                                    returnall=True)

        # create wordstream
        wordstream = ''
        for file in os.listdir(self.input_path):
            path = os.path.join(self.input_path, file)
            with open(path, 'rb') as reader:
                data = reader.read().decode('utf-8-sig')
                data = [line.split("\t")[-1] for line in data.split("\n")]
                wordstream += ' '.join(data)

        # extract pos tags
        window_size = 250
        window_shift = 50
        index = 0
        with open(self.postag_file, 'w') as file:
            while index + window_size < len(wordstream):
                substream = wordstream[index:index + window_size]
                for data in frogclient.process(substream):
                    sys.stdout.write('\r')
                    percentage = round(100 * index / float(len(wordstream)), 2)
                    sys.stdout.write(str(percentage) + '%')
                    sys.stdout.flush()
                    file.write(json.dumps(data) + "\n")
                index += window_shift
Example #2
def tokenize(text):
    try:
        frogclient = FrogClient('localhost', FROGPORT, returnall=True)
    except Exception as e:
        sys.exit(COMMAND + ": cannot run frog: " + str(e))
    tokens, nbrOfSents = processFrogData(frogclient.process(text))
    return (tokens, nbrOfSents)
Example #3
def applyNer(lines):
    frogclient = FrogClient('localhost', PORT, returnall=True)
    nerOutput = ""
    for line in lines:
        data = frogclient.process(line)
        nerOutput += prettyPrint(data)
    return nerOutput
Example #4
File: tasks.py Project: newsgac/platform
def frog_process(texts):
    frogclient = FrogClient(
        config.frog_hostname,
        config.frog_port,
        returnall=True,
        timeout=1800.0,
    )
    for text in texts:
        cache = Cache.get_or_new(hash_text(text))
        sentences = [s for s in sent_tokenize(text) if s]
        sentences = split_long_sentences(sentences, 250)
        tokens = frogclient.process(' '.join(sentences))
        tokens_no_none = [token for token in tokens if None not in token]
        cache.data = tokens_no_none
        cache.save()
    frogclient.socket.close()
Example #5
def call_frog(text):
    """
    Call frog on the text and yield Token(sent, offset, word, lemma, pos, morphofeat, ner, chunk) tuples
    """
    
    host, port = os.environ.get('FROG_HOST', 'localhost:9887').split(":")
    frogclient = FrogClient(host, port, returnall=True)
    sent = 1
    offset = 0
    for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in frogclient.process(text):
        if word is None:
            sent += 1
        else:
            pos = _POSMAP[morphofeat.split("(")[0]]
            yield Token(sent, offset, word, lemma, pos, morphofeat, ner, chunk)
            offset += len(word)
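
Token and _POSMAP come from the surrounding project and are not shown in this snippet; a minimal stand-in for Token, inferred from the yield call above, could look like this (a sketch only, the real definition and the _POSMAP contents may differ):

from collections import namedtuple

# Hypothetical stand-in for Token; the field order mirrors the yield in call_frog above.
Token = namedtuple("Token", "sent offset word lemma pos morphofeat ner chunk")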
Example #6
def get_frogclient(port=8020):
    try:
        frogclient = FrogClient('localhost', port)
        return frogclient
    except Exception:
        logger.error('Cannot connect to the Frog server. '
                     'Is it running at port {}?'.format(port))
        logger.info('Start the Frog server with "docker run -p '
                    '127.0.0.1:{}:{} -t -i proycon/lamachine frog '
                    '-S {}"'.format(port, port, port))
        sys.exit(1)
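
A minimal sketch of using this helper, assuming a Frog server is already listening on the given port (the example sentence is illustrative):

frogclient = get_frogclient(port=8020)
# each item is a tuple of Frog's annotation columns for one token
for token in frogclient.process("Dit is een voorbeeldzin."):
    print(token)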
Example #7
File: frog.py Project: mcomsa/nlpipe
 def call_frog(self, text):
     """
     Call frog on the text and yield (sent, offset, word, lemma, morphofeat, ner, chunk) tuples
     """
     logging.debug("Creating frog client")
     frogclient = FrogClient(self.host,
                             self.port,
                             returnall=True,
                             timeout=600)
     sent = 1
     offset = 0
     logging.debug("Calling frog")
     tokens = list(frogclient.process(text))
     logging.debug("Got {} tokens".format(len(tokens)))
     for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in tokens:
         if word is None:
             sent += 1
         else:
             yield (sent, offset, word, lemma, morphofeat, ner, chunk)
             offset += len(word)
Example #8
def retag(doc, i):
    global threads
    print "\tRetagging:"
    r = re.compile('\[(.*)\]')
    frogclient = FrogClient('localhost', 9000 + (i % threads))

    for sentence in doc.sentences():
        words = " ".join([w.text() for w in sentence.words()])
        for j, (word, lemma, morph,
                pos) in enumerate(frogclient.process(words)):
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)

            #parse mbma
            morphemes = r.findall(morph)
            if morphemes:
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)
Example #9
def frog_naf(text):
    """
    Call frog on the text and return a NAF object
    """
    naf = KafNafParser(type="NAF")
    frogclient = FrogClient('localhost', 9887)
    for token in call_frog(text):
        wf = naf.create_wf(token.word, token.sent, token.offset)
        term = naf.create_term(token.lemma, token.pos, token.morphofeat, [wf])

    naf.create_linguistic_processor("text", "Frog tokenizer", get_frog_version())
    naf.create_linguistic_processor("term", "Frog MBT", get_frog_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
Example #10
"""
Extract POS tags by using Frog.
"""

import json
import os
import sys

from pynlpl.clients.frogclient import FrogClient

port = 8020
frogclient = FrogClient('localhost', port, returnall=True)

wordstream = ''
data_folder = '../data/output/1_preprocessed/'
for file in os.listdir(data_folder):
    path = os.path.join(data_folder, file)
    with open(path, 'rb') as reader:
        data = reader.read().decode('utf-8-sig')
        data = [line.split("\t")[-1] for line in data.split("\n")]
        wordstream += ' '.join(data)

window_size = 250
window_shift = 50
index = 0
with open('pos_tags.txt', 'w') as file:
    while index + window_size < len(wordstream):
        substream = wordstream[index:index + window_size]
        for data in frogclient.process(substream):
            sys.stdout.write('\r')
            percentage = round(100 * index / float(len(wordstream)), 2)
            sys.stdout.write(str(percentage) + '%')
            sys.stdout.flush()
            file.write(json.dumps(data) + "\n")
        index += window_shift
Example #11
        tok = True
    else:
        print >>sys.stderr, "ERROR: Unknown option:",o
        sys.exit(1)
        
if not port:
    print >> sys.stderr,"ERROR: No port specified to connect to Frog server"    
    sys.exit(2)
elif (not textfile and not xmlfile and not foliafile):
    print >> sys.stderr,"ERROR: Specify a file with either --txt, --xml or --folia"
    sys.exit(2)
elif xmlfile and not xpathselect:
    print >> sys.stderr,"ERROR: You need to specify --selectsen or --selectpar when using --xml"
    sys.exit(2)

frogclient = FrogClient('localhost',port)

idmap = []
data = []

if textfile:
    f = codecs.open(textfile, 'r', encoding)
    for line in f.readlines():
        if idinfirstcolumn:
            id, line = line.split('\t',1)
            idmap.append(id.strip())
        else:
            idmap.append(None)
        data.append(line.strip())        
    f.close()
        
Example #12
#!/usr/bin/env python

from __future__ import print_function, unicode_literals, division, absolute_import

import sys
import io
from pynlpl.clients.frogclient import FrogClient



frogclient = FrogClient('localhost',12345)

inputfile = sys.argv[1]


with io.open(inputfile + '.lem', 'w',encoding='utf-8') as f_lemma:
    with io.open(inputfile + '.pos', 'w',encoding='utf-8') as f_pos:
        with io.open(inputfile,'r',encoding='utf-8') as f:
            for i, line in enumerate(f):
                print(i,file=sys.stderr)
                posline = []
                lemline = []
                for word,lemma,morph,pos in frogclient.process(line.strip()):
                    posline.append(pos)
                    lemline.append(lemma)
                f_lemma.write(" ".join(lemline).strip() + "\n")
                f_pos.write(" ".join(posline).strip() + "\n")


Example #13
File: frog.py Project: mcomsa/nlpipe
 def check_status(self):
     frogclient = FrogClient(self.host, self.port, returnall=True)
Example #14
def __init__(self, *args):
    global WSDDIR
    self.tagger = None
    self.mode = args[0]
    if args[0] == "file":
        if len(args) != 2:
            raise Exception("Syntax: file:[filename]")            
        self.tagger = codecs.open(args[1],'r','utf-8') 
    elif args[0] == "frog":
        if len(args) != 3:
            raise Exception("Syntax: frog:[host]:[port]")
        from pynlpl.clients.frogclient import FrogClient
        port = int(args[2])
        self.tagger = FrogClient(args[1],port)                
    elif args[0] == "freeling":
        if len(args) != 3:
            raise Exception("Syntax: freeling:[host]:[port]")
        from pynlpl.clients.freeling import FreeLingClient
        host = args[1]
        port = int(args[2])
        self.tagger = FreeLingClient(host,port)            
    elif args[0] == "corenlp":
        if len(args) != 1:
            raise Exception("Syntax: corenlp")
        import corenlp
        print("Initialising Stanford Core NLP",file=stderr)
        self.tagger = corenlp.StanfordCoreNLP()
    elif args[0] == 'treetagger':                        
        if not len(args) == 2:
            raise Exception("Syntax: treetagger:[treetagger-bin]")
        self.tagger = args[1]            
    elif args[0] == "durmlex":
        if not len(args) == 2:
            raise Exception("Syntax: durmlex:[filename]")
        print("Reading durm lexicon: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[4].split('.')[0]
            self.tagger[wordform] = (lemma, 'n')
        f.close()
        print("Loaded ", len(self.tagger), " wordforms",file=stderr)
    elif args[0] == "oldlex":
        if not len(args) == 2:
            raise Exception("Syntax: oldlex:[filename]")
        print("Reading OLDLexique: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()                
            lemma = fields[1]
            if lemma == '=': 
                lemma = fields[0]
            pos = fields[2][0].lower()
            self.tagger[wordform] = (lemma, pos)
            print("Loaded ", len(self.tagger), " wordforms",file=stderr)
        f.close()        
    else:
        raise Exception("Invalid mode: " + args[0])


Example #15
try:
    inputfile = sys.argv[1]
    frogport = int(sys.argv[2])
    outputdir = sys.argv[3]
    if len(sys.argv) >= 5:
        stripcorrections = bool(int(sys.argv[4]))
    else:
        stripcorrections = False
except:
    print >>sys.stderr ,"Usage: opentaalerrorharvest2folia.py inputfile frogport outputdir [stripcorrections=0/1]\nStart a frog server with: $ frog --skip=mp -S portnum"
    sys.exit(2)

frogclient = FrogClient('localhost', frogport)
correctioncount = 0
# docnum and doc must exist before the loop below; assumed initial values
docnum = 0
doc = None

with codecs.open(inputfile,'r','utf-8','ignore') as f:
    for i, line in enumerate(f):
        print >>sys.stderr,"@" + str(i),
        if i % 1000 == 0:
            if doc:
                doc.save(outputfile)
                print >>sys.stderr,"Saved " + outputfile
            docnum += 1
            outputfile = outputdir + '/opentaalerrorharvest' + str(docnum) + '.xml'
            doc = folia.Document(id='opentaalerrorharvest' + str(docnum))
            doc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
Example #16
# ner.py: perform named entity recognition with frog
# usage: ner.py < text
# note adapted from: https://www.tutorialspoint.com/python/python_networking.htm
# 20180604 erikt(at)xs4all.nl

from pynlpl.clients.frogclient import FrogClient
import re
import socket
import sys

PORT = 8080
MAXDATA = 1024
NERID = 4
POSID = 3
TOKENID = 0


def prettyPrint(data):
    for row in data:
        if len(row) >= NERID + 1 and row[0] is not None:
            lastLine = row[TOKENID] + " " + row[POSID] + " " + row[NERID]
            print(lastLine)
    print("")
    return ()


frogclient = FrogClient('localhost', PORT, returnall=True)
for line in sys.stdin:
    data = frogclient.process(line)
    prettyPrint(data)
Example #17
from pynlpl.formats.sonar import CorpusDocumentX, ns
from pynlpl.clients.frogclient import FrogClient
import sys
import datetime

if len(sys.argv) == 3 and sys.argv[2].isdigit():
    docname = sys.argv[1]
    port = int(sys.argv[2])
else:
    print >> sys.stderr, "Usage: ./sonar_poslem_tagger_singlefile.py [filename] [frog-port]"
    print >> sys.stderr, "Please first start a Frog server with: frog --skip=tmp -S 12345 (or some other port number)"
    print >> sys.stderr, "Reads and writes D-Coi XML"

#Make sure Tadpole/Frog server runs with tokeniser and MWU *DISABLED* !
frogclient = FrogClient('localhost', port)

print >> sys.stderr, "[" + datetime.datetime.now().strftime(
    "%Y-%m-%d %H:%M:%S") + "] PROCESSING " + docname + " (port " + str(
        port) + ")"
doc = CorpusDocumentX(docname)

processed_doc = False
for sentence in doc.sentences():
    words = " ".join([x.text for x in sentence])

    process_sentence = False
    for x in sentence:
        if not (ns('dcoi') + 'pos' in x.attrib or ns('dcoi') + 'lemma'
                in x.attrib or 'pos' in x.attrib or 'lemma' in x.attrib):
            process_sentence = True
Example #18
def connectToFrog():
    try:
        frogClient = FrogClient(HOST, PORT, returnall=True)
        return (frogClient)
    except Exception as e:
        error(NOFROGCONTACTMSG + " " + str(e))
Example #19
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.formats.sonar import CorpusX, CorpusDocumentX, ns
from pynlpl.clients.frogclient import FrogClient
import sys
import os.path
import time

sonardir = sys.argv[1]

#Starting temporary Frog server
os.system("frog --skip=tmp -S 7551 &")


time.sleep(3)
frogclient = FrogClient('localhost',7551)

for doc in CorpusX(sonardir,'tok',"", lambda f: not os.path.exists(f + '.pos') ): #read the *.tok files, on condition there are no *.pos equivalents (will not overwrite)
    processed_doc = False
    print doc.filename + '\tPROCESSING'
    for sentence in doc.sentences():
            words = " ".join([ x.text for x in sentence ])

            process_sentence = False
            for x in sentence:
                if not ns('dcoi') + 'pos' in x.attrib or not ns('dcoi') + 'lemma' in x.attrib:
                    process_sentence = True
            if process_sentence:
                processed_doc = True
                for i, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
                    try:
Example #20
import flask
from flask import Flask
from flask import request, render_template, jsonify

TEMPLATES_AUTO_RELOAD = True
app = Flask(__name__)
app.jinja_env.auto_reload = True

from pynlpl.clients.frogclient import FrogClient

frogclient = FrogClient('nlp', 12345)
frogclient.returnall = True


def translate(text):
    return frogclient.process(text)


@app.route("/", methods=['GET', 'POST'])
def hello():
    if request.method == 'POST':
        text = request.json['text']
        return jsonify(translate(text))
    else:
        return 'hello\n'
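
A quick way to exercise the endpoint above, assuming the app is served on Flask's default port 5000 (hostname, port, and example text are illustrative):

import requests

# POST Dutch text to the service and print the Frog annotations returned as JSON.
response = requests.post("http://localhost:5000/", json={"text": "Dit is een test."})
print(response.json())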