def __init__(self, *args):
    """Initialise the tagger backend selected by args[0].

    Modes and their expected argument counts:
      file:[filename]            -- pre-tagged file, opened for reading
      frog:[host]:[port]         -- remote Frog server (pynlpl client)
      freeling:[host]:[port]     -- remote FreeLing server (pynlpl client)
      corenlp                    -- local Stanford CoreNLP instance
      treetagger:[treetagger-bin]-- path to the TreeTagger binary
      durmlex:[filename]         -- Durm lexicon, loaded into a dict (mode becomes "lookup")
      oldlex:[filename]          -- OLDLexique lexicon, loaded into a dict (mode becomes "lookup")

    Raises:
        Exception: on a wrong argument count for the mode, or an unknown mode.
    """
    global WSDDIR  # NOTE(review): declared but never assigned here — presumably used by other methods; confirm
    self.tagger = None
    self.mode = args[0]
    if args[0] == "file":
        if len(args) != 2:
            raise Exception("Syntax: file:[filename]")
        self.tagger = codecs.open(args[1], 'r', 'utf-8')
    elif args[0] == "frog":
        if len(args) != 3:
            raise Exception("Syntax: frog:[host]:[port]")
        from pynlpl.clients.frogclient import FrogClient
        port = int(args[2])
        self.tagger = FrogClient(args[1], port)
    elif args[0] == "freeling":
        if len(args) != 3:
            raise Exception("Syntax: freeling:[host]:[port]")
        from pynlpl.clients.freeling import FreeLingClient
        host = args[1]
        port = int(args[2])
        self.tagger = FreeLingClient(host, port)
    elif args[0] == "corenlp":
        if len(args) != 1:
            raise Exception("Syntax: corenlp")
        import corenlp
        print("Initialising Stanford Core NLP", file=stderr)
        self.tagger = corenlp.StanfordCoreNLP()
    elif args[0] == 'treetagger':
        if not len(args) == 2:
            raise Exception("Syntax: treetagger:[treetagger-bin]")
        # For treetagger, self.tagger holds the binary path, not a client object
        self.tagger = args[1]
    elif args[0] == "durmlex":
        if not len(args) == 2:
            raise Exception("Syntax: durmlex:[filename]")
        print("Reading durm lexicon: ", args[1], file=stderr)
        self.mode = "lookup"
        self.tagger = {}  # wordform -> (lemma, pos)
        f = codecs.open(args[1], 'r', 'utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[4].split('.')[0]
            self.tagger[wordform] = (lemma, 'n')  # durm lexicon contains nouns only
        f.close()
        print("Loaded ", len(self.tagger), " wordforms", file=stderr)
    elif args[0] == "oldlex":
        if not len(args) == 2:
            raise Exception("Syntax: oldlex:[filename]")
        print("Reading OLDLexique: ", args[1], file=stderr)
        self.mode = "lookup"
        self.tagger = {}  # wordform -> (lemma, pos)
        f = codecs.open(args[1], 'r', 'utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[1]
            if lemma == '=':
                # BUGFIX: original read `lemma == fields[0]` (a no-op
                # comparison); '=' means "lemma equals the wordform".
                lemma = fields[0]
            pos = fields[2][0].lower()
            self.tagger[wordform] = (lemma, pos)
        print("Loaded ", len(self.tagger), " wordforms", file=stderr)
        f.close()
    else:
        raise Exception("Invalid mode: " + args[0])
# Command-line option handling. Relies on names from the surrounding
# script scope: `opts` (getopt result), usage(), FreeLingClient,
# TadpoleClient, and the configuration globals assigned below.
outputdir = "."
shortpos = False
for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit(0)
    elif o == "-o":
        outputdir = a
        if not os.path.exists(outputdir):
            # BUGFIX: was the Python 2 statement `print >> sys.stderr, ...`,
            # which is a SyntaxError in a file that otherwise uses the
            # print() function with file=...
            print("Output directory %s does not exist" % outputdir, file=sys.stderr)
            sys.exit()
    elif o == "-a":
        alignfilename = a
    elif o == "--Sfreeling":
        # Source-side / target-side FreeLing clients
        Sfreeling = FreeLingClient(a)
    elif o == "--Tfreeling":
        Tfreeling = FreeLingClient(a)
    elif o == "--Stadpole":
        # Source-side / target-side Tadpole clients on localhost
        Stadpole = TadpoleClient('localhost', int(a))
    elif o == "--Ttadpole":
        Ttadpole = TadpoleClient('localhost', int(a))
    elif o == "--encoding":
        encoding = a
    elif o == "--shortpos":
        shortpos = True
    elif o == "-d":
        # Alignment direction: source-to-target or target-to-source
        if a == "s2t":
            multiwordalignment = True
        elif a == "t2s":
            multiwordalignment = False
class Tagger(object):
    """Unified front-end for several PoS-taggers/lemmatisers.

    Depending on the mode chosen at construction time, tagging is delegated
    to a pre-tagged file, a Frog/FreeLing/CoreNLP client, the TreeTagger
    binary, or an in-memory lexicon lookup. All tagging entry points return
    three parallel lists: (words, postags, lemmas).
    """

    def __init__(self, *args):
        """Initialise the backend selected by args[0].

        Modes and their expected argument counts:
          file:[filename]             -- pre-tagged file, opened for reading
          frog:[host]:[port]          -- remote Frog server (pynlpl client)
          freeling:[host]:[port]      -- remote FreeLing server (pynlpl client)
          corenlp                     -- local Stanford CoreNLP instance
          treetagger:[treetagger-bin] -- path to the TreeTagger binary
          durmlex:[filename]          -- Durm lexicon dict (mode becomes "lookup")
          oldlex:[filename]           -- OLDLexique dict (mode becomes "lookup")

        Raises:
            Exception: on a wrong argument count for the mode, or an unknown mode.
        """
        global WSDDIR  # NOTE(review): declared but never assigned here — presumably used elsewhere; confirm
        self.tagger = None
        self.mode = args[0]
        if args[0] == "file":
            if len(args) != 2:
                raise Exception("Syntax: file:[filename]")
            self.tagger = codecs.open(args[1], 'r', 'utf-8')
        elif args[0] == "frog":
            if len(args) != 3:
                raise Exception("Syntax: frog:[host]:[port]")
            from pynlpl.clients.frogclient import FrogClient
            port = int(args[2])
            self.tagger = FrogClient(args[1], port)
        elif args[0] == "freeling":
            if len(args) != 3:
                raise Exception("Syntax: freeling:[host]:[port]")
            from pynlpl.clients.freeling import FreeLingClient
            host = args[1]
            port = int(args[2])
            self.tagger = FreeLingClient(host, port)
        elif args[0] == "corenlp":
            if len(args) != 1:
                raise Exception("Syntax: corenlp")
            import corenlp
            print("Initialising Stanford Core NLP", file=stderr)
            self.tagger = corenlp.StanfordCoreNLP()
        elif args[0] == 'treetagger':
            if not len(args) == 2:
                raise Exception("Syntax: treetagger:[treetagger-bin]")
            # For treetagger, self.tagger holds the binary path, not a client
            self.tagger = args[1]
        elif args[0] == "durmlex":
            if not len(args) == 2:
                raise Exception("Syntax: durmlex:[filename]")
            print("Reading durm lexicon: ", args[1], file=stderr)
            self.mode = "lookup"
            self.tagger = {}  # wordform -> (lemma, pos)
            f = codecs.open(args[1], 'r', 'utf-8')
            for line in f:
                fields = line.split('\t')
                wordform = fields[0].lower()
                lemma = fields[4].split('.')[0]
                self.tagger[wordform] = (lemma, 'n')  # durm lexicon contains nouns only
            f.close()
            print("Loaded ", len(self.tagger), " wordforms", file=stderr)
        elif args[0] == "oldlex":
            if not len(args) == 2:
                raise Exception("Syntax: oldlex:[filename]")
            print("Reading OLDLexique: ", args[1], file=stderr)
            self.mode = "lookup"
            self.tagger = {}  # wordform -> (lemma, pos)
            f = codecs.open(args[1], 'r', 'utf-8')
            for line in f:
                fields = line.split('\t')
                wordform = fields[0].lower()
                lemma = fields[1]
                if lemma == '=':
                    # BUGFIX: original read `lemma == fields[0]` (a no-op
                    # comparison); '=' means "lemma equals the wordform".
                    lemma = fields[0]
                pos = fields[2][0].lower()
                self.tagger[wordform] = (lemma, pos)
            print("Loaded ", len(self.tagger), " wordforms", file=stderr)
            f.close()
        else:
            raise Exception("Invalid mode: " + args[0])

    def __iter__(self):
        """Yield (words, postags, lemmas) for the next line of a pre-tagged file.

        Only valid in 'file' mode. NOTE(review): as in the original, this
        reads a single line per invocation rather than looping over the file.
        """
        if self.mode != 'file':
            raise Exception("Iteration only possible in file mode")
        line = next(self.tagger)  # py2/py3-compatible (was .next())
        newwords = []
        postags = []
        lemmas = []
        # BUGFIX: original did `for item in line`, iterating over the
        # *characters* of the line, so item.split('|') could never produce
        # three fields; split on spaces like process() does.
        for item in line.split(' '):
            if item.strip():
                word, lemma, pos = item.split('|')
                newwords.append(word)
                postags.append(pos)
                lemmas.append(lemma)
        yield newwords, postags, lemmas

    def reset(self):
        """Rewind the input file (file mode only); no-op otherwise."""
        if self.mode == 'file':
            self.tagger.seek(0)

    def process(self, words, debug=False):
        """Tag and lemmatise a tokenised sentence.

        Args:
            words: list of token strings.
            debug: passed through to backends that support it (freeling).

        Returns:
            (words, postags, lemmas) -- three parallel lists. Some backends
            return the input `words` unchanged, others re-tokenise.

        Raises:
            Exception: on malformed file-mode input or an unknown mode.
            OSError: if the TreeTagger subprocess exits non-zero.
        """
        if self.mode == 'file':
            # Pre-tagged input: space-separated word|lemma|pos tokens
            line = next(self.tagger)  # py2/py3-compatible (was .next())
            newwords = []
            postags = []
            lemmas = []
            for item in line.split(' '):
                if item.strip():
                    try:
                        word, lemma, pos = item.split('|')
                    except:
                        raise Exception("Unable to parse word|lemma|pos in " + item)
                    newwords.append(word)
                    postags.append(pos)
                    lemmas.append(lemma)
            return newwords, postags, lemmas
        elif self.mode == "frog":
            newwords = []
            postags = []
            lemmas = []
            for fields in self.tagger.process(' '.join(words)):
                word, lemma, morph, pos = fields[:4]
                newwords.append(word)
                postags.append(pos)
                lemmas.append(lemma)
            return newwords, postags, lemmas
        elif self.mode == "freeling":
            # FreeLing keeps the caller's tokenisation; only tags/lemmas are new
            postags = []
            lemmas = []
            for fields in self.tagger.process(words, debug):
                word, lemma, pos = fields[:3]
                postags.append(pos)
                lemmas.append(lemma)
            return words, postags, lemmas
        elif self.mode == "corenlp":
            data = json.loads(self.tagger.parse(" ".join(words)))
            words = []
            postags = []
            lemmas = []
            for sentence in data['sentences']:
                for word, worddata in sentence['words']:
                    words.append(word)
                    lemmas.append(worddata['Lemma'])
                    postags.append(worddata['PartOfSpeech'])
            return words, postags, lemmas
        elif self.mode == 'lookup':
            # In-memory lexicon; unknown words fall back to themselves with '?' tag
            postags = []
            lemmas = []
            for word in words:
                try:
                    lemma, pos = self.tagger[word.lower()]
                    lemmas.append(lemma)
                    postags.append(pos)
                except KeyError:
                    lemmas.append(word)
                    postags.append('?')
            return words, postags, lemmas
        elif self.mode == 'treetagger':
            s = " ".join(words)
            s = u(s)
            p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (out, err) = p.communicate(s.encode('utf-8'))
            newwords = []
            postags = []
            lemmas = []
            # TreeTagger emits one word<TAB>pos<TAB>lemma line per token
            for line in out.split('\n'):
                line = line.strip()
                if line:
                    fields = line.split('\t')
                    newwords.append(unicode(fields[0], 'utf-8'))
                    postags.append(unicode(fields[1], 'utf-8'))
                    lemmas.append(unicode(fields[2], 'utf-8'))
            if p.returncode != 0:
                print(err, file=stderr)
                raise OSError('TreeTagger failed')
            return newwords, postags, lemmas
        else:
            raise Exception("Unknown mode")

    def treetagger_tag(self, f_in, f_out, oneperline=False, debug=False):
        """Tag a whole file with TreeTagger, buffering lines into sentence batches.

        Args:
            f_in: input file object, one (partial) sentence per line.
            f_out: output file object; receives word|lemma|pos lines, or
                word<TAB>lemma<TAB>pos (one token per line) when oneperline.
            oneperline: output format toggle.
            debug: accepted for interface parity; unused here.
        """
        def flush(sentences):
            # Send the buffered batch through TreeTagger and write the result.
            if sentences:
                print("Processing " + str(len(sentences)) + " lines", file=stderr)
                # NOTE(review): this loop feeds the *entire* batch to
                # TreeTagger once per buffered sentence, so the batch output
                # is written len(sentences) times; preserved from the
                # original, but it looks unintended -- confirm.
                for sentence in sentences:
                    out = ""
                    p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    (results, err) = p.communicate("\n".join(sentences).encode('utf-8'))
                    for line in results.split('\n'):
                        line = line.strip()
                        if line:
                            fields = line.split('\t')
                            word = fields[0]
                            pos = fields[1]
                            lemma = fields[2]
                            if oneperline:
                                if out:
                                    out += "\n"
                                out += word + "\t" + lemma + "\t" + pos
                            else:
                                if out:
                                    out += " "
                                # '|' is the field separator in the output; escape it
                                if '|' in word:
                                    word = word.replace('|', '_')
                                if '|' in lemma:
                                    lemma = lemma.replace('|', '_')
                                if '|' in pos:
                                    pos = pos.replace('|', '_')
                                out += word + "|" + lemma + "|" + pos
                            # '$'-initial tags mark sentence-final punctuation:
                            # flush the accumulated sentence
                            if pos[0] == '$':
                                out = u(out)
                                f_out.write(out + "\n")
                                if oneperline:
                                    f_out.write("\n")
                                out = ""
                    if out:
                        out = u(out)
                        f_out.write(out + "\n")
                        if oneperline:
                            f_out.write("\n")

        # buffered tagging: accumulate lines until a sentence boundary is seen
        sentences = []
        linenum = 0
        for line in f_in:
            linenum += 1
            print(" Buffering input @" + str(linenum), file=stderr)
            line = line.strip()
            # Flush on empty lines, on mid-line punctuation, or on lines that
            # do not end in sentence-final punctuation
            if not line or ('.' in line[:-1] or '?' in line[:-1] or '!' in line[:-1]) or (line[-1] != '.' and line[-1] != '?' and line[-1] != '!'):
                flush(sentences)
                sentences = []
                if not line.strip():
                    f_out.write("\n")
                    if oneperline:
                        f_out.write("\n")
            sentences.append(line)
        flush(sentences)

    def tag(self, f_in, f_out, oneperline=False, debug=False):
        """Tag every line of f_in and write the result to f_out.

        Empty input lines are passed through as empty output lines; the
        treetagger mode is delegated to treetagger_tag().
        """
        if self.mode == 'treetagger':
            # BUGFIX: original hardcoded oneperline=False, debug=False here,
            # silently discarding the caller's arguments.
            self.treetagger_tag(f_in, f_out, oneperline, debug)
        else:
            linenum = 0
            for line in f_in:
                linenum += 1
                print(" Tagger input @" + str(linenum), file=stderr)
                if line.strip():
                    words = line.strip().split(' ')
                    words, postags, lemmas = self.process(words, debug)
                    out = ""
                    for word, pos, lemma in zip(words, postags, lemmas):
                        # Backends may yield None for missing fields
                        if word is None:
                            word = ""
                        if lemma is None:
                            lemma = "?"
                        if pos is None:
                            pos = "?"
                        if oneperline:
                            if out:
                                out += "\n"
                            out += word + "\t" + lemma + "\t" + pos
                        else:
                            if out:
                                out += " "
                            # '|' is the field separator in the output; escape it
                            if '|' in word:
                                word = word.replace('|', '_')
                            if '|' in lemma:
                                lemma = lemma.replace('|', '_')
                            if '|' in pos:
                                pos = pos.replace('|', '_')
                            out += word + "|" + lemma + "|" + pos
                    if not isinstance(out, unicode):
                        out = unicode(out, 'utf-8')
                    f_out.write(out + "\n")
                    if oneperline:
                        f_out.write("\n")
                else:
                    f_out.write("\n")