def check_sent(sent, day, num, BN):  # sentences with a high degree of similarity
    pos_tag_sent = pos_tag(sent)
    list_day = []
    phraise = []
    phraise_BN = []
    if len(num) > 0:
        for i in num:
            sub_sum = sent[i[0]:i[1] - 1].strip(' ')
            sub_sent = sent[i[1] - 1:]
            pos_tag_sub_sent = pos_tag(sub_sent)
            if pos_tag_sub_sent[0][1] != 'N':
                line = word_tokenize(sent, format="text")
                masked_line = word_tokenize(sent[:i[1] - 1], format="text") \
                    + ' <mask> ' \
                    + word_tokenize(sent[i[1] - 1:], format="text")
                result = mark(line, masked_line)
                phraise.append(sub_sum + ' ' + result)
            else:
                phraise.append(sub_sum + ' ' + pos_tag_sub_sent[0][0])
    if len(day) > 0:
        for i in day:
            sub_day = sent[i[0]:i[1]].strip(' ')
            sub_day = re.sub('-', '/', sub_day)
            list_day.append(sub_day)
    if len(BN) > 0:
        for i in BN:
            sub_BN = sent[i[0]:i[1] - 1].strip(' ')
            phraise_BN.append(sub_BN)
    return phraise, list_day, phraise_BN

def pos_tagging_sentence(self, sent):
    if self.use_vncorenlp:
        sent = sent.replace("_", " ")
        temp = self.annotator.annotate(sent)
        return [(element["form"], element["posTag"])
                for sent in temp["sentences"]
                for element in sent]
    else:
        return pos_tag(sent)

def extractNoun(content):
    arr_nouns = []
    if (content is not None) and len(content.strip()) > 0:
        for word, pos in pos_tag(content):
            if pos == 'N' and isNoun(word):
                arr_nouns.append(word)
    return arr_nouns

def is_clause(sentence):
    # Heuristic clause detector: inspect the POS pattern at the start of the sentence.
    pos = pos_tag(sentence.lower())
    typeofword = [item[1] for item in pos]
    if any(t in typeofword for t in ('N', 'Np', 'V', 'M', 'Nc')) and 'A' in typeofword:
        if typeofword[0] == 'C' or typeofword[0] == 'R':
            if typeofword[1] in ('N', 'Np', 'V', 'M', 'Nc'):
                return True
            if typeofword[1] == 'A':
                if typeofword[2] in ('N', 'Np', 'M', 'Nc'):
                    return True
                else:
                    return False
        elif typeofword[0] == 'X':
            if typeofword[1] == 'C' or typeofword[1] == 'R':
                if typeofword[2] in ('N', 'Np', 'V', 'M', 'Nc'):
                    return True
                if typeofword[2] == 'A':
                    if typeofword[3] in ('N', 'Np', 'M', 'V', 'Nc'):
                        return True
                    else:
                        return False
        elif typeofword[0] in ('N', 'Np', 'V', 'M'):
            return True
        elif typeofword[0] == 'A':
            if typeofword[1] in ('N', 'Np', 'M', 'V'):
                return True
            else:
                return False
        else:
            return False
    return False

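# Illustrative usage sketch (not from the original source; assumes `pos_tag` is imported
# from underthesea and is_clause above is in scope). The heuristic accepts a sentence
# when its leading POS tags match one of the noun/verb patterns and an adjective occurs.
if __name__ == "__main__":
    print(is_clause("nhân viên rất cẩu thả"))  # noun-initial with an adjective -> expected True
    print(is_clause("rất"))                    # no adjective/noun pattern -> expected False
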
def wseg_and_add_pos_tag_feature(sentence: str or list,
                                 pos_tags: list = None,
                                 ner_labels: list = None) -> (list, list, list):
    if type(sentence) == str:
        sentence = sentence.split()
    if ner_labels is None:
        ner_labels = ['O'] * len(sentence)
    pos_features = []
    words = []
    labels = []
    if pos_tags is None:
        annotated_text = pos_tag(" ".join(sentence))
        sentence = []
        pos_tags = []
        for word, pos in annotated_text:
            sentence.append(word.strip())
            pos_tags.append(pos.strip())
        ner_labels = ['O'] * len(sentence)
    for word, pos, label in list(zip(sentence, pos_tags, ner_labels)):
        tokens = word.split()
        (prefix, tag) = label.split('-') if not label == 'O' else ('', label)
        for idx, token in enumerate(tokens):
            if token.strip() == '':
                continue
            if idx == 0 and prefix.strip() == 'B':
                labels.append(label)
            else:
                labels.append(f'I-{tag.strip()}' if not label == 'O' else 'O')
            words.append(token.strip())
            pos_features.append('[POS]' + pos_tag_normalize(pos.strip()))
    return words, pos_features, labels

def auto_annotation(input_file, output_folder="."):
    file_id = basename(input_file).split(".")[0]
    # read the UTF-8 input file, one raw sentence per line
    texts = io.open(input_file, encoding="utf-8").read().strip().split("\n")
    content = u"\n".join([u" ".join(word_sent(text)) for text in texts])
    output_text_file = join(output_folder, "%s.txt" % file_id)
    io.open(output_text_file, "w", encoding="utf-8", newline="\n").write(content)
    start = 0
    end = 0
    output_annotation_file = join(output_folder, "%s.ann" % file_id)
    ann_file = io.open(output_annotation_file, "w", encoding="utf-8", newline="\n")
    token_id = 1
    for text in texts:
        tokens = pos_tag(text)
        for token in tokens:
            word, tag = token
            end = start + len(word)
            # one brat-style standoff line per token: T<id> <tag> <start> <end> <word>
            ann_file.write(u"T%d\t%s %d %d\t%s\n" % (token_id, tag, start, end, word))
            token_id += 1
            start = end + 1

def keyword_extraction(question):
    keywords = []
    question = question.replace('_', ' ')
    if 'nhất' in question.lower():
        keywords.append('nhất')
    words = pos_tag(question)
    for i in range(0, len(words)):
        words[i] = (words[i][0].replace(' ', '_'), words[i][1])
    for token in words:
        word = token[0]
        pos = token[1]
        if pos in ['A', 'Ab']:
            keywords += word.lower().split('_')
    keywords = list(set(keywords))
    keywords = [[w] for w in keywords]
    ners = get_ner(question)
    ners = [n.lower() for n in ners]
    for ne in ners:
        variants = extractEntVariants(ne)
        keywords.append(variants)
    return keywords

def set_uc1_and_uc2_for_conversations(self, rasa_chatlog_df: pd.DataFrame):
    # with open("models/ic_for_uc1_2.pkl", "rb") as file:
    #     clf = pickle.load(file)
    conversation_ids = list(rasa_chatlog_df["conversation_id"])
    conversation_ids = list(dict.fromkeys(conversation_ids))
    rasa_chatlog_df.insert(2, "use_case", "")
    for id in conversation_ids:
        chatlog_sub_df = rasa_chatlog_df[rasa_chatlog_df["conversation_id"] == id]
        conversation_attachments = list(chatlog_sub_df['attachments'])
        if any("scontent" in str(x) for x in conversation_attachments):
            chatlog_sub_df_first_turn = chatlog_sub_df[
                (chatlog_sub_df["turn"] == 0) | (chatlog_sub_df["turn"] == 1)]
            for index, item in chatlog_sub_df_first_turn.iterrows():
                user_message = item["user_message"]
                if str(item["entities"]) != "nan":
                    entities_list = item["entities"].split(",")
                    if any("price" in str(x) for x in entities_list):
                        rasa_chatlog_df.at[index, "use_case"] = "uc_2"
                        break
                if str(user_message) != "nan":
                    user_message_correction = do_correction(user_message)
                    message_pos_tag = pos_tag(user_message_correction)
                    # message_pos_tag = [user_message_correction]
                    ##################################################################
                    words = [x[0] for x in message_pos_tag]
                    pos = [x[1] for x in message_pos_tag]
                    con_x_khong_form = False
                    if "còn" in words and "không" in words:
                        con_index = words.index("còn")
                        khong_index = words.index("không")
                        if con_index < khong_index:
                            in_between_word_pos = pos[con_index:khong_index]
                            """
                            N - Common noun
                            Nc - Noun Classifier
                            Ny - Noun abbreviation
                            Np - Proper noun
                            Nu - Unit noun
                            """
                            if any(x in in_between_word_pos for x in ["N", "Nc", "Ny", "Np", "Nu"]):
                                con_x_khong_form = True
                    if con_x_khong_form or "còn không" in user_message_correction or (
                            "còn" in user_message_correction and "không" in user_message_correction):
                        rasa_chatlog_df.at[index, "use_case"] = "uc_1"
                        break
                    ##################################################################
                    # input_message = pd.DataFrame([{"feature": user_message_correction}])
                    # predicted = list(clf.predict(input_message["feature"]))
                    # if predicted[0] == "uc_1":
                    #     rasa_chatlog_df.at[index, "use_case"] = "uc_1"
                    #     break
    return rasa_chatlog_df

def smart_tokenize(self, sentence, language):
    # use underthesea/nltk tokenizer and pos_tag to keep noun phrases only
    print(sentence)
    if language == "vietnamese":
        tags = underthesea.pos_tag(sentence)
    else:
        sentence = nltk.word_tokenize(sentence)
        tags = nltk.pos_tag(sentence)
    tokens = []
    noun_phrase = ""
    for i in range(0, len(tags)):
        if tags[i][1] in ["N", "Np", "Nu", "Nc", "M",
                          "NN", "NNP", "NNPS", "NNS"] and tags[i][0].strip() not in ["", " "]:
            if noun_phrase != "":
                noun_phrase += " " + tags[i][0].strip()
            else:
                noun_phrase = tags[i][0].strip()
        else:
            if noun_phrase not in ["", " "] and len(noun_phrase.strip().split()) >= 2:
                tokens.append(noun_phrase.strip())
            noun_phrase = ""
    if noun_phrase.strip() not in ["", " "] and len(noun_phrase.strip().split()) >= 2:
        tokens.append(noun_phrase.strip())
    print(tokens)
    return tokens

def chunk(sentence, format=None): """ Vietnamese chunking Parameters ========== sentence: {unicode, str} raw sentence Returns ======= tokens: list of tuple with word, pos tag, chunking tag tagged sentence Examples -------- >>> # -*- coding: utf-8 -*- >>> from underthesea import chunk >>> sentence = "Nghi vấn 4 thi thể Triều Tiên trôi dạt bờ biển Nhật Bản" >>> chunk(sentence) [('Nghi vấn', 'N', 'B-NP'), ('4', 'M', 'B-NP'), ('thi thể', 'N', 'B-NP'), ('Triều Tiên', 'Np', 'B-NP'), ('trôi dạt', 'V', 'B-VP'), ('bờ biển', 'N', 'B-NP'), ('Nhật Bản', 'Np', 'B-NP')] """ sentence = pos_tag(sentence) crf_model = CRFChunkingPredictor.Instance() result = crf_model.predict(sentence, format) return result
def test():
    sentence = "Vì thấy quán đang chạy giảm giá giá cả khá hợp lý nên mình cũng quên ko vào đọc bình luận trước khi order"
    sentence1 = "Lẩu bò thì chỉ có 1 loại bò; hải sản đắt hơn và toàn mấy thứ ôi, mực bé tí, nhũn nhèo, ko có cá lăng như quảng cáo,món ăn thì không ngon."
    sentence2 = "Đồ ăn nguội và nhân viên thì hơi chậm chạm"
    # sentence = sentence + " " + sentence1 + " " + sentence2
    s = 'nhà hàng được trang trí rất đẹp thoáng mát rộng rãi,nhưng nhân viên rất cẩu thả, thái độ không tốt'
    print(detect_sentence(s))
    print(pos_tag('nhân viên rất cẩu thả'))

def keywords_extraction(sent):
    """A naive function to pick good tokens for searching:
    currently keeps everything except tokens tagged P or CH."""
    rs = ""
    for i in pos_tag(sent):
        if i[1] != 'P' and i[1] != 'CH':
            rs = rs + ' ' + i[0]
    return rs.strip()

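# Illustrative usage (not part of the original snippet; assumes
# `from underthesea import pos_tag` at module level). Tokens tagged P (pronoun)
# or CH (punctuation) are dropped; the rest are joined back into a search string.
if __name__ == "__main__":
    print(keywords_extraction("Tôi muốn tìm nhà hàng ngon ở Hà Nội."))
    # the pronoun "Tôi" (P) and the final "." (CH) should be filtered out,
    # leaving only the content words for search
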
def predict(sentence):
    sentences = word_tokenize(sentence)
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    tokens = [(word, tag, "X") for word, tag in pos_tag(sentence)]
    X = extract_features(tokens)
    y_pred = tagger.tag(X)
    # pair each tokenized word with its predicted label
    output = [(tag, token) for token, tag in zip(y_pred, sentences)]
    return output

def pos_tag(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.pos_tag(text)
        result["output"] = tags
    except:
        result = {"error": "Bad request!"}
    return JsonResponse(result)

def processing_NER(sentence):
    pos = pos_tag(sentence)
    sentence = word_tokenize(sentence)
    X = [sent2features(pos)]
    import time
    start_time = time.time()
    with open('model_CRF_NER.pkl', 'rb') as fp:
        crf = pickle.load(fp)
    pred = crf.predict(X)
    pred = np.array(pred)
    pred = pred.flatten()
    end_time = time.time()
    print('total run-time: %f ms' % ((end_time - start_time) * 1000))
    # Group consecutive B-/I- labelled words into full entity phrases
    sentence2string = ''
    words = []
    tag = []
    i = 0
    if len(pred) >= 2:
        for word, label in list(zip(sentence, pred)):
            if label[0] == 'B':
                sentence2string = ''
                sentence2string += word
                tag.append(label[2:])
            if label[0] == 'I' and word != ',':
                sentence2string += (' ' + word)
            if label[0] == 'I' and word == ',':
                sentence2string += word
            if label[0] == 'I' and (i + 1 == len(pred)):
                words.append(sentence2string)
            if (i + 1) > len(pred):
                break
            if (i + 1) < len(pred):
                if label[0] == 'I' and pred[i + 1][0] == 'O':
                    words.append(sentence2string)
                if label[0] == 'I' and pred[i + 1][0] == 'B':
                    words.append(sentence2string)
                if label[0] == 'B' and pred[i + 1][0] == 'O':
                    words.append(sentence2string)
                if label[0] == 'B' and pred[i + 1][0] == 'B':
                    words.append(sentence2string)
            if (i + 1) == len(pred):
                if label[0] == 'B':
                    words.append(sentence2string)
            i = i + 1
        return words, tag
    if len(pred) < 2:
        for word, label in list(zip(sentence, pred)):
            if label[0] == 'B':
                tag = []
                sentence2string = ''
                sentence2string += (word + ' ')
                tag.append(label[2:])
                words.append(sentence2string)
        return words, tag

def response_message(self, latest_message):
    list_pos_tag_text = pos_tag(latest_message)
    subjects = self.get_subject_from_text(list_pos_tag_text)
    record = db.get_info_gv_from_mon_hoc(subjects)
    if subjects == '0' or len(record) == 0:
        message = 'Chưa có thông tin giáo viên cho môn học này bạn nhé !'
        return message
    teacher = ', '.join([', '.join(i) for i in record])
    message = ("Môn học " + subjects + ' có thầy/cô ' + teacher
               + ' dạy nhé. Bạn có thể lên trang tín chỉ để đăng kí những thầy,cô này .')
    return message

def encode(self, text):
    text = text.lower()
    text_format = text.translate(self.translator)
    text_split = word_tokenize(text_format)
    text_values = [self.dictionary.get(i, self.dictionary["<unknown>"])
                   for i in text_split]
    text_tags = [self.tags_dictionary.get(i[1].lower(), self.tags_dictionary["<unknown>"])
                 for i in pos_tag(text_format)]
    text_values = self.resize(text_values, 50, self.dictionary["<unknown>"])
    text_tags = self.resize(text_tags, 50, self.tags_dictionary["<unknown>"])
    return text_values, text_tags

def chunk(sentence, format=None): """ chunk a sentence to phrases :param unicode sentence: raw sentence :return: list of tuple with word, pos tag, chunking tag :rtype: list """ sentence = pos_tag(sentence) crf_model = ChunkingCRFModel.Instance() result = crf_model.predict(sentence, format) return result
def process_match_info(text):
    res = {}
    tagged_sentence = pos_tag(text)
    names = []
    for i, comp in enumerate(tagged_sentence):
        if comp[1] == "Np":
            if i - 1 >= 0 and tagged_sentence[i - 1][1] == "Np":
                names[-1] = names[-1] + " " + comp[0]
            else:
                names.append(comp[0])
    res["names"] = names
    return res

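# Illustrative usage (not from the original source): consecutive Np (proper noun)
# tokens are merged, so both team names in a headline like the one below should
# come back as single entries.
if __name__ == "__main__":
    info = process_match_info("Việt Nam gặp Thái Lan trên sân Mỹ Đình")
    print(info["names"])  # expected to include 'Việt Nam' and 'Thái Lan' if tagged Np
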
def ner():
    a = request.form['content']
    corpus = sent_tokenize(a)
    b = []
    for sen in corpus:
        x_test = []
        for word in word_tokenize(sen, format="text").split(' '):
            x_test.extend(pos_tag(word))
        b.append(x_test)
    b1 = [ner_train.get_features(s) for s in b]
    c = crf.predict(b1)
    return json.dumps([b, c])

def scrape(url):
    print('Getting content for page: ' + url)
    doc = getContentFromURL(url)
    print('Extracting nouns from the content...')
    nouns = []
    if (doc is not None) and len(doc.strip()) > 0:
        nouns = [
            word.lower()
            for word, tu_loai in pos_tag(doc)
            if tu_loai == 'N' and hp.isValidWord(word)
        ]
    print('Done extracting nouns.')
    return nouns

def load_from_raw_content(raw_content):
    sentences = raw_content.split("\n")
    headers = sentences[:3]
    headers = dict([UDSentence._extract_header(_) for _ in headers])
    text = sentences[-1]
    headers["text"] = text
    headers["type"] = "bronze"
    headers["authors"] = BOT_VERSION
    pos_tags = pos_tag(text)
    dp_tags = dependency_parse(text)
    rows = [(item[0][0], item[0][1], str(item[1][1]), item[1][2])
            for item in zip(pos_tags, dp_tags)]
    s = UDSentence(rows=rows, headers=headers)
    return s

def test_accuracy(self):
    test_dir = join(dirname(__file__), "samples")
    files = listdir(test_dir)
    ids = [f.split(".")[0] for f in files]
    for id in ids:
        file = join(test_dir, "%s.txt" % id)
        sentence = load_input(file)
        actual = pos_tag(sentence)
        expected = load_output(file)
        if actual != expected:
            print("Fail {}".format(id))
            save_temp(id, actual)
        self.assertEqual(actual, expected)

def main():
    reverb = Reverb()
    for line in fileinput.input():
        tokens = word_tokenize(line.strip())
        tokens_tagged = pos_tag(tokens)
        print(tokens_tagged)
        pattern_tags = reverb.extract_reverb_patterns_tagged_ptb(tokens_tagged)
        print(pattern_tags)
        if reverb.detect_passive_voice(pattern_tags):
            print("Passive Voice: True")
        else:
            print("Passive Voice: False")
        print("\n")
    fileinput.close()

def ner(self, text, mode=None):
    if self.tag is None:
        self.tag = self.process_file.read_bin(self.path_to_model)
    for i in range(0, 1):
        if i == 0:
            pos_sent = pos_tag(text)
            test_features = self.w2.sent2features(
                SentenceGetter.combine_word_with_pos(pos_sent))
            prediction = self.tag.predict([test_features])
            # if i == 0 and 'PRODUCT' not in prediction:
            #     pos_sent = [(word[0].replace('_', ' ').strip(), word[1]) for word in
            #                 annotator.pos_tag(tokenize(text))[0]]
    return Tagger().combine_tag_with_word(prediction[0], text, mode)

def underthesea_annotate(self, text, mode):
    if mode == 'sent_tokenize':
        return sent_tokenize(text)
    elif mode == 'word_tokenize':
        return word_tokenize(text)
    elif mode == 'pos_tag':
        return pos_tag(text)
    elif mode == 'chunk':
        return chunk(text)
    elif mode == 'ner':
        return ner(text)
    elif mode == 'classify':
        return classify(text)
    elif mode == 'sentiment':
        return sentiment(text)
    else:
        raise Exception("Wrong request, please check your request")

def sentence_segment(doc, lower, candidate_pos):
    """Keep only the words whose POS tag is in candidate_pos."""
    sentences = []
    for sent in doc:
        postag = pos_tag(sent)
        words = [x[0] for x in postag]
        selected_words = []
        res = []
        for i in range(len(words)):
            # Store words only with a candidate POS tag
            if postag[i][1] in candidate_pos:
                if lower is True:
                    selected_words.append(words[i].lower())
                else:
                    selected_words.append(words[i])
        sentences.append(' '.join(selected_words))
    return '. '.join(sentences)

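# Illustrative call (assumptions: `doc` is a list of raw Vietnamese sentences and
# `candidate_pos` holds underthesea tags such as 'N' and 'A'); only words carrying
# one of those tags survive into the returned pseudo-sentences.
if __name__ == "__main__":
    doc = ["nhà hàng được trang trí rất đẹp", "nhân viên rất cẩu thả"]
    print(sentence_segment(doc, lower=True, candidate_pos=["N", "A"]))
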
def _preprocess(self):
    tokens = underthesea.pos_tag(self.sentence)
    self.n = len(tokens)
    words_service = WordsService()
    synonyms_service = SynonymsService()
    for word, pos in tokens:
        syn = [word]
        pos = uts_pos_convert(pos)
        word_object = words_service.find_one(word, pos)
        if word_object is not None:
            cursor = synonyms_service.find(id_word_1=word_object['_id'])
            for record in cursor:
                s_word_object = words_service.find_one(_id=ObjectId(record['id_word_2']))
                syn.append(s_word_object['word'])
            cursor = synonyms_service.find(id_word_2=word_object['_id'])
            for record in cursor:
                s_word_object = words_service.find_one(_id=ObjectId(record['id_word_1']))
                syn.append(s_word_object['word'])
        self._word_syn_list.append(syn)

def generateVariants(untokenize_text):
    words = pos_tag(untokenize_text)
    for i in range(0, len(words)):
        words[i] = (words[i][0].replace(' ', '_'), words[i][1])
    tokens = words
    combinations = generateCombinations(tokens, 0.001)
    num_variants = functools.reduce(lambda x, y: x * y, [len(c) for c in combinations])
    base_line = 0.001
    # raise the threshold until the number of variants drops to a manageable size
    while num_variants > 10000:
        base_line = base_line * 2
        combinations = generateCombinations(tokens, base_line)
        num_variants = functools.reduce(lambda x, y: x * y, [len(c) for c in combinations])
    combinations = list(itertools.product(*combinations))
    combinations = [' '.join(e) for e in combinations]
    return combinations

def read_raw_file(self) -> list:
    f = open("test_a1000.txt", "r+")
    noun_list = []
    adj_list = []
    for line in f:
        result = pos_tag(line)
        print(result)
        print('\n')
        record_n = []
        record_adj = []
        self.sentences.extend(sent_tokenize(line))
        for item in result:
            if self.is_noun(item[1]) and self.one_word_prune(item[0]):
                record_n.append(str(item[0]).lower())
            if item[1] == 'A' or item[1] == 'AP':
                record_adj.append(str(item[0]).lower())
        noun_list.append(record_n)
        adj_list.append(record_adj)
    self.transaction = noun_list
    return noun_list