# NOTE(review): this line is a whitespace-mangled script — its original
# newlines were lost, collapsing the whole file onto one line. As written it
# is not valid Python (multiple statements with no separators), and the first
# inline '#' turns everything after it into a comment. The `tag_sent`
# function at the end is truncated mid-body (stops at `tokens = [chunk]`),
# so the missing tail cannot be reconstructed from here; recover the original
# file from version control rather than editing this line.
# NOTE(review): appears to be a hazm/baaz evaluation script — it loads the
# Dadegan CoNLL corpus, a POS tagger, lemmatizer, chunker and dependency
# parser, then selects trees [:100] + [200:300] for chunk- and
# dependency-based information extraction. TODO: confirm against the
# original source.
from nltk import accuracy from nltk.tree import Tree #output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8') dadegan = DadeganReader('resources/Dadegan/test.conll') tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model') lemmatizer = Lemmatizer() chunker = Chunker(model='Resources/chunker-dadeganFull.model') parser = DependencyParser(tagger, lemmatizer=lemmatizer ) normalizer = Normalizer() chunk_extractor = ChunkTreeInformationExtractor() dep_extractor = DependencyTreeInformationExtractor() trees = list(dadegan.chunked_trees()) chunk_trees = trees[:100] + trees[200:300] trees = list(dadegan.trees()) dep_trees = trees[:100] + trees[200:300] #dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8') #sentences = [] #for sent in dadegan.sents(): # sentences.append(' '.join([w for w, t in sent])) indices = [] def tag_sent(chunks, args): tagged_sent = [] global_index = 0 for chunk in chunks: if type(chunk) is Tree: tokens = chunk.leaves() else: tokens = [chunk]
# NOTE(review): whitespace-mangled near-duplicate of the script fragment on
# the previous line, with `import codecs, re` prepended. It is truncated even
# earlier — the `tag_sent` function ends at a bare `else:` with no body — so
# it cannot be reconstructed from what is visible here. This duplicate should
# be removed once the original file is recovered from version control.
import codecs, re from nltk import accuracy from nltk.tree import Tree #output = open('200DadeganSents-chunkExtractor.txt', 'w', encoding='utf8') dadegan = DadeganReader('resources/Dadegan/test.conll') tagger = SequencePOSTagger(model='Resources/postagger-remove-w3-all.model') lemmatizer = Lemmatizer() chunker = Chunker(model='Resources/chunker-dadeganFull.model') parser = DependencyParser(tagger, lemmatizer=lemmatizer) normalizer = Normalizer() chunk_extractor = ChunkTreeInformationExtractor() dep_extractor = DependencyTreeInformationExtractor() trees = list(dadegan.chunked_trees()) chunk_trees = trees[:100] + trees[200:300] trees = list(dadegan.trees()) dep_trees = trees[:100] + trees[200:300] #dep_output = codecs.open('dep_output.txt', 'w', encoding='utf8') #sentences = [] #for sent in dadegan.sents(): # sentences.append(' '.join([w for w, t in sent])) indices = [] def tag_sent(chunks, args): tagged_sent = [] global_index = 0 for chunk in chunks: if type(chunk) is Tree: tokens = chunk.leaves() else:
# NOTE(review): whitespace-mangled TAIL of another evaluation script. The
# opening `arg_list = []` is the interior of a `positions(information, sent)`
# function whose `def` line is missing from this view (the driver loop later
# on this same line calls `positions(information, sent)`), and it also calls
# a `tag_sent` defined elsewhere. The fragment maps extracted argument
# strings to token indices by scanning the sentence, then filters Dadegan
# sentences whose (subject, object, predicate?) position lists are all
# non-empty before tagging — TODO confirm the three-tuple's meaning against
# the original source. Do not edit in place; recover the original file from
# version control.
arg_list = [] index = sent.strip().find(arg) if index >= 0: tokens = sent.split() for i in range(len(tokens)): index -= len(tokens[i]) + 1 if index < 0: for c in range(i, i + len(arg.split())): arg_list.append(c) break info_list.append(arg_list) return info_list input = codecs.open('200DadeganSents.txt', 'r', encoding='utf8') dadegan = DadeganReader('Resources/Dadegan/train.conll') dadegan_trees = dadegan.trees() informations = [] sentences = [] for sentence in dadegan.sents(): sentences.append(' '.join([w for w, t in sentence])) for tree, chunks, sent in zip(dadegan_trees, dadegan.chunked_trees(), sentences): info_list = ([], [], []) for information in dependencyExtractor.extract(tree): temp_list = positions(information, sent) for i in range(3): if len(temp_list[i]) > 0 and temp_list[i] not in info_list[i]: info_list[i].append(temp_list[i]) if [] in info_list: continue else: tag_sent(chunks, info_list)
"""Extract information triples from the Dadegan treebank.

Runs both the chunk-tree and the dependency-tree information extractors
over every sentence of the Dadegan training corpus and writes the
extracted records to resources/informations.txt: chunk-based records use
" - " as the field separator, dependency-based records use " + ", and a
blank line follows each extractor's output for a sentence.
"""
import codecs

from hazm import DadeganReader
from baaz import DependencyTreeInformationExtractor, ChunkTreeInformationExtractor

dadegan = DadeganReader('corpora/train.conll')
chunk_extractor = ChunkTreeInformationExtractor()
dependency_extractor = DependencyTreeInformationExtractor()

# Context manager guarantees the output file is flushed and closed even if
# extraction raises part-way through (the original opened it and never
# closed it, risking lost buffered output).
with codecs.open('resources/informations.txt', 'w', encoding='utf8') as output:
    # chunked_trees() and trees() yield parallel views of the same sentences.
    for chunk_tree, dependency_tree in zip(dadegan.chunked_trees(), dadegan.trees()):
        # Chunk-based extraction: fields joined with " - ".
        for information in chunk_extractor.extract(chunk_tree):
            print(*information, sep=' - ', file=output)
        print(file=output)
        # Dependency-based extraction: fields joined with " + ".
        for information in dependency_extractor.extract(dependency_tree):
            print(*information, sep=' + ', file=output)
        print(file=output)