def main():
    """Display the TNWST main menu and dispatch to the selected tool.

    Side effects: clears the terminal, prints the menu, reads one choice
    from stdin, then calls the matching tool function (or exits).
    """
    system("clear")
    print("####TNWST####\n"
          "\n"
          "\n"
          "1. Contador de palavras e Gráfico:\n"
          "2. Concordancia e Dispersão:\n"
          "3. Comparação:\n"
          "4. Tagger:(Inglês)\n\n"
          "5. Sair\n\n")
    try:
        choice = int(input('Digite a opção desejada: '))
    except ValueError:
        # Non-numeric input used to crash with a traceback; bail out quietly.
        return
    # Bug fix: the original used `choice is 1` — identity comparison on ints
    # only works by CPython's small-int caching accident; use equality.
    if choice == 1:
        contagem()
    elif choice == 2:
        concord()
    elif choice == 3:
        compare()
    elif choice == 4:
        tag()
    elif choice == 5:
        exit()
def process():
    """HTTP handler: run the submitted text through the NLP pipeline.

    Reads the "text" form field, applies preprocess -> tag -> chunk ->
    normalize, and returns the result as a JSON success response.
    """
    # Raw text from the HTTP form data (defaults to an empty string).
    raw_text = request.form.get("text", "")

    # Pipeline stages, applied in order.
    processed = preprocess(raw_text)
    processed = tag(processed, "http://localhost:7000")
    processed = chunk(processed)
    processed = normalize(processed)

    # JSON response carrying the fully processed text.
    payload = {
        "status": "success",
        "message": "Request successful",
        "data": {
            "text": processed
        }
    }
    return jsonify(payload)
def tag_tropes(fp):
    """Tag every trope description found in the JSON file at *fp*.

    Returns a list of (trope, tagged, by_tag) triples, one per entry.
    """
    tagged_tropes = []
    for entry in read_json(fp):
        name, description = entry[0], entry[1]
        outcome = tagger.tag(description)
        tagged_tropes.append((name, outcome['tagged'], outcome['by_tag']))
    return tagged_tropes
def check(self, instance):
    """Emit gauges exercising both the legacy and the new tagger APIs.

    Each gauge reports value 1 with whatever tags the tagger returns for
    the queried entity; "404" probes the unknown-entity path.
    """
    # Legacy API: tagger.get_tags(entity, high_cardinality_flag).
    legacy_probes = (
        ("old_method.low_card", "test_entity", False),
        ("old_method.high_card", "test_entity", True),
        ("old_method.unknown", "404", True),
    )
    for metric_name, entity, high_card in legacy_probes:
        self.gauge(metric_name, 1, tags=tagger.get_tags(entity, high_card))

    # New API: tagger.tag(entity, cardinality_level).
    new_probes = (
        ("new_method.low_card", "test_entity", tagger.LOW),
        ("new_method.orch_card", "test_entity", tagger.ORCHESTRATOR),
        ("new_method.high_card", "test_entity", tagger.HIGH),
        ("new_method.unknown", "404", tagger.LOW),
    )
    for metric_name, entity, cardinality in new_probes:
        self.gauge(metric_name, 1, tags=tagger.tag(entity, cardinality))
def parse(self, tokens, tagger = None):
    """POS-tag *tokens* and parse them with the inherited Viterbi parser.

    Uses nltk.pos_tag when no *tagger* is supplied. Any token missing
    from the grammar's lexicon gets a tiny-probability lexical production
    added on the fly so the parse cannot fail on out-of-vocabulary words.
    Mutates self._grammar as a side effect.
    """
    # Fix: compare against None with `is`, not `==` (identity, not equality).
    if tagger is None:
        tagged = nltk.pos_tag(tokens)
    else:
        tagged = tagger.tag(tokens)
    missing = False
    for tok, pos in tagged:
        if not self._grammar._lexical_index.get(tok):
            missing = True
            # Near-zero probability so the extra production only fires when
            # nothing in the real grammar covers the token.
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        # Rebuild the grammar's internal indexes after adding productions.
        self._grammar._calculate_indexes()
    return super(PCFGViterbiParser, self).parse(tokens)
def capitalize_named_entities(text):
    """Rewrite named entities in *text* with their canonical capitalization.

    Runs the BIO tagger over *text*, reassembles multi-token entities, and
    replaces each detected entity (in the lowercased text) with the
    capitalization listed in entity_capitalizations.json.
    """
    current_dir_path = os.path.dirname(os.path.realpath(__file__))
    # Fix: close the JSON file instead of leaking the handle.
    with open(os.path.join(current_dir_path,
                           "entity_capitalizations.json")) as json_file:
        entity_capitalizations = json.loads(json_file.read())
    tagged_text = tag(text)
    result = text.lower()
    tagged_entities = []
    container = ""
    is_combining = False
    for i, token in enumerate(tagged_text):
        token_text = token[0].lower()
        token_code = token[1][0]  # BIO prefix of the tag ("B"/"I"/other)
        if (not is_combining and token_code == "B"):
            # Start of a new entity.
            is_combining = True
            container = token_text
        elif (not is_combining and token_code == "I"):
            # "I" without a preceding "B": malformed tagger output.
            raise Exception()
        elif (is_combining and token_code == "B"):
            # New entity starts: flush the one under construction.
            tagged_entities.append(container)
            container = token_text
        elif (is_combining and token_code == "I"):
            # Continuation token; punctuation attaches without a space.
            if (token_text in string.punctuation):
                container += token_text
            else:
                container += (" " + token_text)
        # Fix: flush at end of sequence regardless of the last token's code;
        # previously this check lived inside the "I" branch, so an entity
        # whose final token was "B" (single-token entity at the end) was lost.
        if (is_combining and i == (len(tagged_text) - 1)):
            tagged_entities.append(container)
    print("Hasil deteksi entitas bernama: ")
    print(tagged_text, end="\n\n")
    for entity in tagged_entities:
        lowercased_entity = entity.lower()
        if (lowercased_entity in entity_capitalizations):
            result = result.replace(
                lowercased_entity,
                entity_capitalizations[lowercased_entity]["text"])
    return result
def extract_noun_phrases(text):
    """Extracts all noun_phrases from the given text"""
    tokens = nltk.regexp_tokenize(text, sentence_re)
    tagged_tokens = tagger.tag(tokens)

    # Chunk the tagged tokens into a POS tree and pull out the term groups.
    pos_tree = chunker.parse(tagged_tokens)

    # Rebuild each term's words into a single phrase string.
    noun_phrases = []
    for term in get_terms(pos_tree):
        phrase = "".join(word + " " for word in term)
        if phrase != "":
            noun_phrases.append(phrase.strip())
    return noun_phrases
from preprocessor import preprocess
from tagger import tag
from chunker import chunk
from normalizer import normalize

# Run the full text pipeline over input.txt, writing each stage's output to
# its own file. All files are opened via `with` so handles are closed even on
# error (the original leaked all five handles).
with open("input.txt") as input_file:
    # Fix: the original did "\n".join(line for line in input_file), but
    # iterated lines keep their trailing "\n", so every newline was doubled.
    input_text = input_file.read()

# Preprocessing
temp = preprocess(input_text)
with open("preprocessed.txt", "w") as preprocessed_file:
    preprocessed_file.write(temp)

# Tagging
temp = tag(temp, "http://localhost:7000")
with open("tagged.txt", "w") as tagged_file:
    tagged_file.write(temp)

# Chunking
temp = chunk(temp)
with open("chunked.txt", "w") as chunked_file:
    chunked_file.write(temp)

# Normalization
temp = normalize(temp)
with open("normalized.txt", "w") as normalized_file:
    normalized_file.write(temp)
"""Generate one or more HTML tags""" if cls is not None: attrs['class'] = cls if attrs: attr_str = ''.join(' %s="%s"' % (attr, value) for attr, value in sorted(attrs.items())) else: attr_str = '' if content: return '\n'.join('<%s%s>%s</%s>' % (name, attr_str, c, name) for c in content) else: return '<%s%s />' % (name, attr_str) tag_br = tag('br') print('tag_br = {0}'.format(tag_br)) print() tag_p_hello = tag('p', 'hello') print('tag_p_hello = {0}'.format(tag_p_hello)) print() tag_p_hello_world = tag('p', 'hello', 'world') print('tag_p_hello_world = \n{0}'.format(tag_p_hello_world)) print() tag_p_hello_id = tag('p', 'hello', id=33) print('tag_p_hello_id = {0}'.format(tag_p_hello_id)) print()
# Per-tag evaluation report. NOTE(review): `tagger` and `matrix` here come
# from earlier in the script, outside this excerpt.
for tag in tagger.tags:
    p = tagger.precision(matrix, tag)
    r = tagger.recall(matrix, tag)
    LOG("tag %s: precision = %.4f, recall = %.4f\n" % (tag, p, r))
LOG("accuracy = %.4f\n" % tagger.accuracy(matrix))

# Load a trained model from a file and use it to tag new data (in CoNLL format).
# Usage: python3 lab2.py tag MODEL_FILE IN_DATA_FILE OUT_DATA_FILE
if sys.argv[1] == "tag":
    tagger = MyTagger(TAGS)
    LOG("Loading the model from %s ..." % sys.argv[2])
    tagger.load(sys.argv[2])
    LOG(" done\n")
    with open(sys.argv[4], 'w') as fp:
        for sentence in load_data(sys.argv[3]):
            # Column 1 of each CoNLL row holds the word form; tag the whole
            # sentence at once.
            tagged_tokens = tagger.tag([token[1] for token in sentence])
            for token, tagged_token in zip(sentence, tagged_tokens):
                # Pad short rows out to 6 columns with "_" placeholders.
                for i in range(len(token), 6):
                    token.append("_")
                # Write the predicted tag into both POS columns (3 and 4).
                token[3] = tagged_token[1]
                token[4] = tagged_token[1]
                fp.write("\t".join(token))
                fp.write("\n")
            # Blank line separates sentences in CoNLL output.
            fp.write("\n")

# Load a trained model from a file and evaluate it on test data.
# This will print the confusion matrix.
# Usage: python3 lab2.py matrix MODEL_FILE TEST_DATA_FILE
if sys.argv[1] == "matrix":
    tagger = MyTagger(TAGS)
    LOG("Loading the model from %s ..." % sys.argv[2])
    # NOTE(review): this branch continues past the end of this excerpt.