def test_parse(self):
    # Assert parsed output with Penn Treebank II tags (slash-formatted).
    # 1) "der große Hund" is a noun phrase, "auf der Matte" is a prepositional noun phrase.
    v = de.parser.parse(u"Der große Hund sitzt auf der Matte.")
    self.assertEqual(v,
        u"Der/DT/B-NP/O große/JJ/I-NP/O Hund/NN/I-NP/O " + \
        u"sitzt/VB/B-VP/O " + \
        u"auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O")
    # 2) "große" and "sitzt" lemmata are "groß" and "sitzen".
    # Note how articles are problematic ("der" can be a male subject but also a plural possessive).
    v = de.parser.parse(u"Der große Hund sitzt auf der Matte.", lemmata=True)
    self.assertEqual(v,
        u"Der/DT/B-NP/O/der große/JJ/I-NP/O/groß Hund/NN/I-NP/O/hund " + \
        u"sitzt/VB/B-VP/O/sitzen " + \
        u"auf/IN/B-PP/B-PNP/auf der/DT/B-NP/I-PNP/der Matte/NN/I-NP/I-PNP/matte ././O/O/.")
    # 3) Assert the accuracy of the German tagger.
    i, n = 0, 0
    for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt")).readlines():
        sentence = sentence.decode("utf-8").strip()
        s1 = [w.split("/") for w in sentence.split(" ")]
        s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
        s2 = [[w for w, pos in s1]]
        s2 = de.parse(s2, tokenize=False)
        s2 = [w.split("/") for w in s2.split(" ")]
        for j in range(len(s1)):
            if s1[j][1] == s2[j][1]:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.844)
    print("pattern.de.parse()")
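# A minimal sketch (not part of the original test suite) showing how the
# slash-formatted output asserted above can be unpacked; with lemmata=True
# each token is word/POS/chunk/PNP/lemma.
from pattern.de import parse

s = parse(u"Der große Hund sitzt auf der Matte.", lemmata=True)
for token in s.split(" "):
    word, pos, chunk, pnp, lemma = token.split("/")
    print word, pos, lemma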
# Assumes `spell` (a spell checker supporting `in` and .correction(), e.g.
# pyspellchecker-style) and `get_synonyms()` are defined elsewhere.
from pattern.de import parse


def process_text(text):
    annotated = []
    parsed_text = parse(text, lemmata=True)
    # parse() separates sentences with newlines and tokens with spaces;
    # with lemmata=True each token is word/POS/chunk/PNP/lemma.
    doc = parsed_text.replace("\n", " ").split(" ")
    for token in doc:
        pos_tag = token.split("/")[1]
        lemma = token.split("/")[4]
        if pos_tag == ".":  # skip sentence-final punctuation
            continue
        current_token = token.split("/")[0]
        if current_token not in spell:
            current_token = spell.correction(current_token)
        if pos_tag[0] == "N":
            # German nouns are capitalized.
            current_token = current_token[0].upper() + current_token[1:]
        else:
            current_token = current_token[0].lower() + current_token[1:]
        annotated.append(current_token)
        for synonym in get_synonyms(current_token):
            annotated.append(synonym)
        if lemma.lower() != current_token.lower():
            annotated.append(lemma)
    return ' '.join(annotated)
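# Hypothetical harness for exercising process_text() without the real spell
# checker or synonym source; _NoopSpell and the empty get_synonyms below are
# stand-ins, not part of the original code.
class _NoopSpell(object):
    """Stand-in spell checker: accepts every token unchanged."""
    def __contains__(self, word):
        return True
    def correction(self, word):
        return word

spell = _NoopSpell()
get_synonyms = lambda token: []  # no synonym expansion in this sketch

print process_text(u"Der große Hund sitzt auf der Matte.")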
# Assumes numpy as np, pattern.de's parse() and the project's words2vec
# module are imported at module level.
def extract_lang_features(self, utterance, embeddings, word_id):
    # Average the 64-dimensional word embeddings of all tokens and
    # count how often each lemma appears.
    word_embeddings = np.zeros((64,))
    token_appearance = dict()
    if '@' in utterance:
        utterance = self.delete_username(utterance)
    if self.has_link(utterance):
        utterance = self.delete_link(utterance)
    utterance = self.delete_non_alphabetic_symbols(utterance)
    # With relations=True and lemmata=True each token is
    # [word, tag, chunk, pnp, relation, lemma], so token[5] is the lemma.
    sentences = parse(utterance, relations=True, lemmata=True).split()
    token_number = 1  # start at 1 so an empty utterance cannot divide by zero
    for sentence in sentences:
        token_number += len(sentence)
        for token in sentence:
            if token[5] in token_appearance:
                token_appearance[token[5]] += 1
            else:
                token_appearance[token[5]] = 1
            embedding = words2vec.find_word_embeddings(token[0], embeddings, word_id)
            if embedding is not None:
                word_embeddings = np.add(word_embeddings, embedding)
    if token_number > 1:
        token_number = token_number - 1  # undo the initial offset
        word_embeddings = np.divide(word_embeddings, token_number)
    return word_embeddings, token_appearance
def pos_clean(x):
    # Keep only nouns (NN), adjectives (ADJA/ADJD), foreign-language
    # material (FM) and preposition+article contractions (APPRART).
    # Only the first sentence is considered.
    from pattern.de import parse
    s = parse(x, chunks=True, tagset="STTS", relations=True, lemmata=True).split()[0]
    sen = []
    for token in s:
        if token[1] in ('NN', 'ADJA', 'FM', 'ADJD', 'APPRART'):
            sen.append(token[0])
    return ' '.join(sen)
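# Hypothetical call; the exact tokens kept depend on the tagger's STTS output.
print pos_clean(u"Der große Hund sitzt auf der Matte.")
# -> e.g. "große Hund Matte" (ART, VVFIN etc. are filtered out)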
def test():
    platforms = load_platforms()
    for party, sections in platforms.items():
        for section in sections:
            tagged = split(parse(section.text))
            for sentence in tagged:
                #if not sentence.is_question:
                #    continue
                try:
                    # for word in sentence.words:
                    #     print word.tags  # dir(word)
                    #print [sentence.subjects, sentence.verbs]
                    #print [sentence.words]
                    # pattern Sentence objects expose the raw text as
                    # .string (the original used .text, which does not exist)
                    print [sentence.string]
                except UnicodeEncodeError:
                    pass
def _getParse(word, language):
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport
    if language == "es":
        return pattern_es.parse(word)
    elif language == "en":
        return pattern_en.parse(word)
    elif language == "it":
        return pattern_it.parse(word)
    elif language == "fr":
        return pattern_fr.parse(word)
    elif language == "de":
        return pattern_de.parse(word)
    else:
        return pattern_en.parse(word)
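# Hypothetical calls; the return value is pattern's usual slash-formatted
# tagged string in each language.
print _getParse(u"perros", "es")  # Spanish tagger
print _getParse(u"Hund", "xx")    # unknown code falls back to pattern.en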
def is_first_verb(utterance):
    # Check whether the utterance starts with a verb and, if so,
    # whether that verb is an imperative form.
    is_verb = False
    is_imperativ = False
    if '@' in utterance:
        utterance = Feature.delete_username(utterance)
    utterance = Feature.delete_conjuction(utterance)
    sentences = parse(utterance, relations=True, lemmata=True, tagset='STTS').split()
    pos_list = ['VVFIN', 'VAFIN', 'VVINF', 'VAINF', 'VVIZU',
                'VVIMP', 'VAIMP', 'VVPP', 'VAPP']
    pos_imp = ['VVIMP', 'VAIMP']
    if len(sentences) != 0 and len(sentences[0]) != 0:
        pos = sentences[0][0][1]  # STTS tag of the first token
        if pos in pos_list:
            is_verb = True
        if pos in pos_imp:
            is_imperativ = True
    return is_verb, is_imperativ
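# Hypothetical usage, assuming the Feature helpers above are importable.
# The first flag marks any leading verb, the second an imperative form.
print is_first_verb(u"Geh nach Hause!")    # likely (True, True)
print is_first_verb(u"Der Hund schläft.")  # likely (False, False)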
# -*- coding: utf-8 -*-
from __future__ import print_function  # for print(..., file=f) on Python 2
import io
import string

from pattern.de import parse

# parse_args() and clean_umlauts() are assumed to be defined elsewhere
# in this project.


def main():
    args = parse_args()
    words = []
    print("Loading from {}".format(args.input))
    with io.open(args.input, encoding='utf8') as f:
        # string.letters is Python 2; use string.ascii_letters on Python 3.
        acceptable_characters = string.letters + string.digits + u" äüö"
        for line in f.readlines():
            if line.strip() == "suggestterm":
                continue
            word = filter(lambda c: c in acceptable_characters, line).strip()
            if len(word) > 0 and not any(c.isdigit() for c in word):
                words.append(word)
    print("Parsing {} words".format(len(words)))
    parsed_words = []
    for w in words:
        parsed = parse(w, tags=False, chunks=False, relations=False, lemmata=True)
        # With everything but lemmata disabled each token is word/lemma, so
        # the lemma is the last slash-separated field (the original indexed
        # field 2, which is out of range for this format).
        lemmatized = u" ".join(token.split("/")[-1] for token in parsed.split(" "))
        parsed_words.append(lemmatized)
    print("Saving {} words to {}".format(len(parsed_words), args.output))
    with open(args.output, "w") as f:
        for word in parsed_words:
            print(clean_umlauts(word), file=f)
    single_word_entries = list(
        {clean_umlauts(w.strip()) for w in parsed_words if " " not in w})
    single_word_entries.sort()
    print("Saving {} words to {}".format(len(single_word_entries),
                                         args.singleoutput))
    with open(args.singleoutput, "w") as f:
        for word in single_word_entries:
            print(word, file=f)
#!/usr/bin/env python2
# coding: utf-8
# Reads sentences from stdin (one per line) and writes the parsed,
# slash-formatted result to stdout; the line "000" terminates the loop.
# usage: echo "Die Katze liegt auf der Matte." | python2 this_script.py
import sys

sys.path.insert(0, '/zen/tez/pattern-2.6/build/lib')
from pattern.de import parse

parse_sent = lambda sent: parse(
    sent, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)

if __name__ == '__main__':
    # The original had the loop commented out, so only one line was ever
    # read; the '000' sentinel only makes sense inside a loop.
    while True:
        line = sys.stdin.readline()
        if not line:  # EOF
            break
        sent = line.strip().decode('utf-8')
        if sent == '000':
            sys.exit(0)
        ps = parse_sent(sent).encode('utf-8') + "\n"
        sys.stdout.write(ps)
# -*- coding: utf-8 -*-
from pattern.de import parse, split, pprint, tag
# from pprint import pprint

# s = parse('Die Katze liegt auf der Matte.')
# for sentence in split(s):
#     for word in sentence:
#         print(word)
#     pprint(sentence)

pprint(parse('Die Katze liegt auf der Matte mit weniger als 10%.',
             tags=True, chunks=True, relations=True, lemmata=True,
             encoding='utf-8', tagset="STTS"))

for word, pos in tag('Die Katze liegt auf der Matte mit weniger als 10%.',
                     tagset="STTS"):
    # STTS tags articles as "ART"; the original checked for "ARTDEF",
    # which is not an STTS tag and never matches.
    if pos == "ART" or pos == "NN":
        print word + '\t' + pos
import pdb
import sys

from pattern.de import parse, parsetree, split

"""
import pattern.de
pattern.de.verbs - 1962 verbs.
pattern.de.tenses
pattern.de.tenses('erblicken')
pattern.de.conjugate.__doc__
"""

"""
lst = parse(raw)
(Pdb) split(lst)[0]
Sentence('Stehen/VB/B-VP/O bleiben/VB/I-VP/O !/./O/O')
"""

raw = u"Stehen bleiben!"  # example input; `raw` was undefined in the original

lst = parse(raw)
for sent in split(lst):
    print "sent.string: ", sent.string
    pdb.set_trace()
    sys.exit(2)

s = parsetree(raw)
print "sentences now"
for sentence in s:
    print "sentence: ", sentence
    for chunk in sentence.chunks:
        print "sentence: ", " ".join([w.string for w in chunk.words])
        #print "\tchunk type: ", chunk.type, [(w.string, w.type) for w in chunk.words]
        pdb.set_trace()
    elif len(field[2]) != 3:
        print line.encode('utf8'),
        continue
    # F. Get the text, clean leading chevrons, and print the line
    try:
        text = re.sub('^[>,\ ]{0,6}', '', field[3])
    except IndexError:
        print line.encode('utf8')
        continue
    print line.encode('utf8'),
    snt = ""
    # G. Clean-ups
    text = re.sub('Mind\.', 'Mindestens', text)
    # H. Pattern 2.6 parts of speech -- split the text if needed
    try:
        pos = parse(text, lemmata=True, relations=True, encoding='utf-8')
        for pos in pos.splitlines():
            pos = re.sub('\ ', '|', pos)
            print u"".join([field[0], "|", field[1], "|POS_03|", pos]).encode('utf-8').strip()
    except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError):
        # Tag failed lines NA to enable repair
        print "".join([field[0], "|", field[1], "|POS_03", "|NA"])

# I. Close the file
fp.close()
# EOF
def evaluate():
    global graphs, sentences, pageNumbers, probs

    # remove old evaluation
    del sentences[:]
    del graphs[:]
    del tagged[:]
    del tagSentences[:]
    del probs[:]

    # read the document path from the GUI
    fname = tDocument.get("1.0", END).rstrip('\n')

    # read text from file
    with open(fname, 'r') as f:
        text = f.read()
    size = len(text)
    print(("Text with " + str(size) + " characters loaded!"))

    # mark page breaks: each form feed becomes an explicit <newpage> marker
    # (the original took a detour through repr()/eval() to do the same)
    text = text.replace('\x0c', '\n<newpage>\n\n')

    # mark page numbers
    text = re.sub(r'\n([0-9]+)\n+<newpage>', r'<pagebreak>\1<pagebreak>', text)

    # collapse all whitespace runs to single spaces
    text = " ".join(text.split())

    # replace abbreviations so their periods do not end sentences
    text = text.replace('eg.', 'eg')  # TODO: Problem here!
    text = text.replace('Dr.', 'Dr')
    text = text.replace('Prof.', 'Prof')
    text = text.replace('bzw.', 'bzw')
    text = text.replace('Vgl.', 'vgl')
    text = text.replace('vgl.', 'vgl')
    text = text.replace('etc.', 'etc')
    text = text.replace('Abb.', 'Abbildung')
    text = text.replace('z. B.', 'zum Beispiel')
    text = text.replace('ca.', 'cirka')
    text = text.replace('Nr.', 'Nr')
    text = text.replace('nr.', 'nr')
    text = text.replace('Bg.', 'Bg')
    text = text.replace('al.', 'al')
    text = text.replace('europ.', 'europ')
    text = re.sub(" [a-zA-Z]\.", "", text)

    # remove periods from abbreviations of the form "x.y."
    middle_abbr = re.compile('[A-Za-z0-9]\.[A-Za-z0-9]\.')
    a = middle_abbr.search(text)         # find the abbreviation
    if a:                                # the original crashed when none was found
        b = re.compile('\.')             # period pattern
        c = b.sub('', a.group(0))        # remove periods from abbreviation
        text = middle_abbr.sub(c, text)  # substitute new abbr for old

    # extract sentences
    pat = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
    sentences = pat.findall(text)
    # TODO: remove all very short sentences
    sentences = [elem for elem in sentences if not (calcWords(elem) < 4)]
    # remove all sentences that contain a number
    sentences = [elem for elem in sentences if not hasNumbers(elem)]
    # remove sentences with a URL
    sentences = [elem for elem in sentences if not hasURL(elem)]
    # remove sentences with noise
    sentences = [elem for elem in sentences if not hasNoise(elem)]

    # add page numbers to sentences
    subset = text
    del pageNumbers[:]
    for sentence in sentences:
        start = subset.find(sentence)
        subset = subset[start:]
        num = re.search("<pagebreak>[0-9]+<pagebreak>", subset).group()
        num = num.replace("<pagebreak>", "")
        pageNumbers.append(num)

    # print all sentences
    for i in range(0, len(sentences)):
        print(("Sentence Nr. " + str(i) + ", Page Nr. " + str(pageNumbers[i]) +
               ": " + sentences[i] + "\n"))

    # create part-of-speech tags
    for sentence in sentences:
        tagged.append(parse(sentence))

    # print all tagged sentences
    for i in range(0, len(sentences)):
        print(("Tagged Sentence Nr. " + str(i) + ", Page Nr. " + str(pageNumbers[i]) +
               ": " + tagged[i] + "\n"))

    # cut words, keep tags, simplify them
    for sentence in tagged:
        tokens = sentence.split(' ')
        tags = []
        for token in tokens:
            fragment = token.split('/')[1]
            # expand one-character punctuation tags so every tag is two characters
            fragment = fragment.replace(".", ".X")
            fragment = fragment.replace(",", ",X")
            fragment = fragment.replace(":", ":X")
            fragment = fragment.replace("(", "(X")  # the original mapped both "("
            fragment = fragment.replace(")", ")X")  # and ")" to ")X" -- likely a typo
            fragment = fragment[0:2]
            tags.append(fragment)
        tagSentences.append(tags)

    # print tags only
    for i in range(0, len(sentences)):
        print(("Sentence Nr. " + str(i) + ", Page Nr. " + str(pageNumbers[i]) +
               ": " + str(tagSentences[i]) + "\n"))
    print("Processing done")

    # create transitions for each sentence
    for sentence in tagSentences:
        print(sentence)
        print((sentence[0]))
        trans = []
        trans.append("ST" + str(sentence[0]))  # mark start
        for i in range(1, len(sentence) - 1):
            trans.append(str(sentence[i]) + str(sentence[i + 1]))
        # mark end (the original reused the loop variable here, which raised
        # a NameError for very short sentences)
        trans.append(str(sentence[-1]) + "EN")
        transitions.append(trans)
    print(transitions)

    # get total transition counts
    allTrans = []
    total = 0
    for trans in transitions:
        for x in trans:
            allTrans.append(x)
            total += 1
    counts = Counter(allTrans)
    print(counts)

    # get the first part of each transition (for conditional probabilities)
    condTrans = []
    for x in allTrans:
        condTrans.append(x[0:2])
    countsCond = Counter(condTrans)
    print(countsCond)

    # get the average probability of the transitions in a sentence
    for trans in transitions:
        size = len(trans)
        score = 1
        for x in range(0, 1):  # len(trans)): only the first transition for now
            pTrans = counts[trans[x]]           # count of the transition
            pFirst = countsCond[trans[x][0:2]]  # count of its first part
            score *= pTrans / float(pFirst)
        probs.append(score)  # / float(size))
    print(probs)
def parse_text(text):
    """Take German text (one or more sentences) and annotate it with
    part-of-speech information."""
    # STTS works better than the standard tagset here; the target tags
    # are NN (common noun) and NE (proper noun).
    return parse(text, tagset="STTS")
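# A small follow-up sketch pulling out the NN/NE tokens that parse_text()
# targets; split() on the tagged string yields sentences as lists of
# [word, tag, chunk, pnp] tokens.
from pattern.de import parse


def extract_nouns(text):
    nouns = []
    for sentence in parse_text(text).split():
        for token in sentence:
            if token[1] in ("NN", "NE"):
                nouns.append(token[0])
    return nouns

print extract_nouns(u"Angela Merkel besucht den Bundestag.")
# -> likely [u'Angela', u'Merkel', u'Bundestag']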
    raise

# tt_en = TreeTagger(encoding='utf-8', language='english')
# pprint(tt_en.tag('Does this thing work?'))

tagger = ttw.TreeTagger(TAGLANG='de', TAGDIR='/home/niklas/treetagger/')

# satz = u'Dies ist ein Testsatz.'
# print type(satz)
# satzu = satz.decode('utf-8')
# tags = tagger.tag_text(satz)
# pprint.pprint(tags)

datei = open('196.txt', 'r')
dat = datei.read()

# Assumes: from pattern.de import parse, split, predicative
s = parse(dat, tagset='STTS')
s = split(s)
print s.sentences[0]

print predicative('neugierige')  # -> 'neugierig'

with open('196.txt', 'r') as openfile:
    for line in openfile:
        # NOTE: BrillTagger normally takes an initial tagger and a list of
        # transformation rules, not a raw text line; this call is
        # experimental scaffolding from the original.
        nltk.tag.brill.BrillTagger(line)

# datu = dat.decode('utf-8')
# print tagger.tag_text(dat)
# print datu
# tags = tagger.TagText(datu)
# for tag in tags:
#     print tag

datei.close()