Example #1
import re

from underthesea import pos_tag, word_tokenize

def check_sent(sent, day, num, BN):
    # sentences with a high degree of similarity
    pos_tag_sent = pos_tag(sent)
    list_day = []
    phraise = []
    phraise_BN = []
    if len(num) > 0:
        for i in num:
            sub_sum = sent[i[0]:i[1] - 1].strip(' ')
            sub_sent = sent[i[1] - 1:]
            pos_tag_sub_sent = pos_tag(sub_sent)
            if pos_tag_sub_sent[0][1] != 'N':
                line = word_tokenize(sent, format="text")
                masked_line = word_tokenize(
                    sent[:i[1] - 1],
                    format="text") + '  <mask> ' + word_tokenize(
                        sent[i[1] - 1:], format="text")
                # `mark` is a project-specific masked-LM helper, not part of underthesea
                result = mark(line, masked_line)
                phraise.append(sub_sum + ' ' + result)
            else:
                phraise.append(sub_sum + ' ' + pos_tag_sub_sent[0][0])

    if len(day) > 0:
        for i in day:
            sub_day = sent[i[0]:i[1]].strip(' ')
            sub_day = re.sub('-', '/', sub_day)
            list_day.append(sub_day)
    if len(BN) > 0:
        for i in BN:
            sub_BN = sent[i[0]:i[1] - 1].strip(' ')
            phraise_BN.append(sub_BN)
    return phraise, list_day, phraise_BN
Example #2
 def pos_tagging_sentence(self, sent):
     if self.use_vncorenlp:
         sent = sent.replace("_", " ")
         temp = self.annotator.annotate(sent)
         return [(element["form"], element["posTag"]) for s in temp["sentences"] for element in s]
     else:
         return pos_tag(sent)
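
A quick sketch of the (word, tag) tuple format both branches return; the exact tags depend on the installed underthesea model, so the output shown is illustrative:

from underthesea import pos_tag

# pos_tag returns a list of (word, tag) pairs over segmented words
print(pos_tag("Chợ thịt chó nổi tiếng ở Sài Gòn bị truy quét"))
# e.g. [('Chợ', 'N'), ('thịt', 'N'), ('chó', 'N'), ('nổi tiếng', 'A'), ...]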
Example #3
 def extractNoun(content):
     arr_nouns = []
     if content is not None and len(content.strip()) > 0:
         for word, pos in pos_tag(content):
             if pos == 'N' and isNoun(word):
                 arr_nouns.append(word)
     return arr_nouns
Example #4
def is_clause(sentence):
	pos = pos_tag(sentence.lower())
	typeofword = [tag for word, tag in pos]
	if any(t in typeofword for t in ('N', 'Np', 'V', 'M', 'Nc')) and 'A' in typeofword:
		if typeofword[0] == 'C' or typeofword[0] == 'R':
			if typeofword[1] in ('N', 'Np', 'V', 'M', 'Nc'):
				return True
			if typeofword[1] == 'A':
				if typeofword[2] in ('N', 'Np', 'M', 'Nc'):
					return True
			else:
				return False
		elif typeofword[0] == 'X':
			if typeofword[1] == 'C' or typeofword[1] == 'R':
				if typeofword[2] in ('N', 'Np', 'V', 'M', 'Nc'):
					return True
				if typeofword[2] == 'A':
					if typeofword[3] in ('N', 'Np', 'M', 'V', 'Nc'):
						return True
			else:
				return False
		elif typeofword[0] in ('N', 'Np', 'V', 'M'):
			return True
		elif typeofword[0] == 'A':
			# the original tested typeofword[0] == 'V' here, which looks like
			# a typo for typeofword[1] given the parallel branches above
			if typeofword[1] in ('N', 'Np', 'M', 'V'):
				return True
	return False
Example #5
    def wseg_and_add_pos_tag_feature(sentence: "str | list",
                                     pos_tags: list = None,
                                     ner_labels: list = None) -> (list, list, list):
        # note: the original hint `str or list` evaluates to plain `str`,
        # and the function returns three lists, not two
        if isinstance(sentence, str):
            sentence = sentence.split()
        if ner_labels is None:
            ner_labels = ['O'] * len(sentence)
        pos_features = []
        words = []
        labels = []

        if pos_tags is None:
            annotated_text = pos_tag(" ".join(sentence))
            sentence = []
            pos_tags = []
            for word, pos in annotated_text:
                sentence.append(word.strip())
                pos_tags.append(pos.strip())
            ner_labels = ['O'] * len(sentence)

        for word, pos, label in zip(sentence, pos_tags, ner_labels):
            tokens = word.split()
            prefix, tag = label.split('-') if label != 'O' else ('', label)
            for idx, token in enumerate(tokens):
                if token.strip() == '':
                    continue
                if idx == 0 and prefix.strip() == 'B':
                    labels.append(label)
                else:
                    labels.append(
                        f'I-{tag.strip()}' if not label == 'O' else 'O')
                words.append(token.strip())
                pos_features.append('[POS]' + pos_tag_normalize(pos.strip()))
        return words, pos_features, labels
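
A minimal sketch of the BIO expansion above, assuming the method is exposed as a static helper; the tokens and tags are hypothetical stand-ins for real pos_tag output:

# a word-segmented token like 'Hà Nội' is split back into syllables:
# the B- label stays on the first syllable, the rest become I-
words, pos_features, labels = wseg_and_add_pos_tag_feature(
    sentence=['Hà Nội', 'rất', 'đẹp'],
    pos_tags=['Np', 'R', 'A'],
    ner_labels=['B-LOC', 'O', 'O'])
# words  -> ['Hà', 'Nội', 'rất', 'đẹp']
# labels -> ['B-LOC', 'I-LOC', 'O', 'O']
# pos_features -> one '[POS]...' entry per output token (via pos_tag_normalize)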
Example #6
import io
from os.path import basename, join

from underthesea import pos_tag, word_sent  # word_sent is the legacy tokenizer name

def auto_annotation(input_file, output_folder="."):
    file_id = basename(input_file).split(".")[0]
    # read as UTF-8 text (the original called .decode() on a str, which fails on Python 3)
    texts = io.open(input_file, encoding="utf-8").read().strip().split("\n")
    content = u"\n".join([u" ".join(word_sent(text)) for text in texts])
    output_text_file = join(output_folder, "%s.txt" % file_id)
    io.open(output_text_file, "w", encoding="utf-8",
            newline="\n").write(content)

    start = 0
    end = 0
    output_annotation_file = join(output_folder, "%s.ann" % file_id)
    ann_file = io.open(output_annotation_file,
                       "w",
                       encoding="utf-8",
                       newline="\n")
    token_id = 1
    for text in texts:
        tokens = pos_tag(text)
        for token in tokens:
            word, tag = token
            end = start + len(word)
            ann_file.write(u"T%d\t%s %d %d\t%s\n" %
                           (token_id, tag, start, end, word))
            token_id += 1
            start = end + 1
    ann_file.close()
Example #7
def keyword_extraction(question):
    keywords = []
    question = question.replace('_', ' ')

    if 'nhất' in question.lower():
        keywords.append('nhất')

    words = pos_tag(question)
    for i in range(0, len(words)):
        words[i] = (words[i][0].replace(' ', '_'), words[i][1])

    for token in words:
        word = token[0]
        pos = token[1]
        if (pos in ['A', 'Ab']):
            keywords += word.lower().split('_')

    keywords = list(set(keywords))
    keywords = [[w] for w in keywords]

    ners = get_ner(question)
    ners = [n.lower() for n in ners]

    for ne in ners:
        variants = extractEntVariants(ne)
        keywords.append(variants)

    return keywords
Example #8
    def set_uc1_and_uc2_for_conversations(self, rasa_chatlog_df: pd.DataFrame):
        # with open("models/ic_for_uc1_2.pkl", "rb") as file:
        #     clf = pickle.load(file)
        conversation_ids = list(rasa_chatlog_df["conversation_id"])
        conversation_ids = list(dict.fromkeys(conversation_ids))
        rasa_chatlog_df.insert(2, "use_case", "")
        for id in conversation_ids:
            chatlog_sub_df = rasa_chatlog_df[rasa_chatlog_df["conversation_id"]
                                             == id]
            conversation_attachments = list(chatlog_sub_df['attachments'])
            if any("scontent" in str(x) for x in conversation_attachments):
                chatlog_sub_df_first_turn = chatlog_sub_df[
                    (chatlog_sub_df["turn"] == 0) |
                    (chatlog_sub_df["turn"] == 1)]
                for index, item in chatlog_sub_df_first_turn.iterrows():
                    user_message = item["user_message"]
                    if str(item["entities"]) != "nan":
                        entities_list = item["entities"].split(",")
                        if any("price" in str(x) for x in entities_list):
                            rasa_chatlog_df.at[index, "use_case"] = "uc_2"
                            break
                    if str(user_message) != "nan":
                        user_message_correction = do_correction(user_message)
                        message_pos_tag = pos_tag(user_message_correction)
                        # message_pos_tag = [user_message_correction]

                        ##################################################################
                        words = [x[0] for x in message_pos_tag]
                        pos = [x[1] for x in message_pos_tag]
                        con_x_khong_form = False
                        if "còn" in words and "không" in words:
                            con_index = words.index("còn")
                            khong_index = words.index("không")
                            if con_index < khong_index:
                                in_between_word_pos = pos[
                                    con_index:khong_index]
                                """
                                N - Common noun
                                Nc - Noun Classifier
                                Ny - Noun abbreviation
                                Np - Proper noun
                                Nu - Unit noun
                                """
                                if any(x in in_between_word_pos
                                       for x in ["N", "Nc", "Ny", "Np", "Nu"]):
                                    con_x_khong_form = True

                        if con_x_khong_form or "còn không" in user_message_correction or (
                                "còn" in user_message_correction
                                and "không" in user_message_correction):
                            rasa_chatlog_df.at[index, "use_case"] = "uc_1"
                            break
                        ##################################################################

                        # input_message = pd.DataFrame([{"feature": user_message_correction}])
                        # predicted = list(clf.predict(input_message["feature"]))
                        # if predicted[0] == "uc_1":
                        #     rasa_chatlog_df.at[index, "use_case"] = "uc_1"
                        # break
        return rasa_chatlog_df
Example #9
    def smart_tokenize(
        self, sentence, language
    ):  # use underthesea/nltk tokenizer and pos_tag to keep noun phrases only
        print(sentence)
        if language == "vietnamese":
            tags = underthesea.pos_tag(sentence)
        else:
            sentence = nltk.word_tokenize(sentence)
            tags = nltk.pos_tag(sentence)
        tokens = []
        noun_phrase = ""
        for i in range(0, len(tags)):
            if tags[i][1] in [
                    "N", "Np", "Nu", "Nc", "M", "NN", "NNP", "NNPS", "NNS"
            ] and tags[i][0].strip() not in ["", " "]:
                if noun_phrase != "":
                    noun_phrase += " " + tags[i][0].strip()
                else:
                    noun_phrase = tags[i][0].strip()
            else:
                if noun_phrase not in [
                        "", " "
                ] and len(noun_phrase.strip().split()) >= 2:
                    tokens.append(noun_phrase.strip())
                noun_phrase = ""
        if noun_phrase.strip() not in [
                "", " "
        ] and len(noun_phrase.strip().split()) >= 2:
            tokens.append(noun_phrase.strip())

        print(tokens)
        return tokens
Example #10
def chunk(sentence, format=None):
    """
    Vietnamese chunking

    Parameters
    ==========

    sentence: {unicode, str}
        raw sentence

    Returns
    =======
    tokens: list of tuple with word, pos tag, chunking tag
        tagged sentence

    Examples
    --------

    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import chunk
    >>> sentence = "Nghi vấn 4 thi thể Triều Tiên trôi dạt bờ biển Nhật Bản"
    >>> chunk(sentence)
    [('Nghi vấn', 'N', 'B-NP'),
    ('4', 'M', 'B-NP'),
    ('thi thể', 'N', 'B-NP'),
    ('Triều Tiên', 'Np', 'B-NP'),
    ('trôi dạt', 'V', 'B-VP'),
    ('bờ biển', 'N', 'B-NP'),
    ('Nhật Bản', 'Np', 'B-NP')]
    """
    sentence = pos_tag(sentence)
    crf_model = CRFChunkingPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #11
def test():
	sentence="Vì thấy quán đang chạy giảm giá giá cả khá hợp lý nên mình cũng quên ko vào đọc bình luận trước khi order"
	sentence1="Lẩu bò thì chỉ có 1 loại bò; hải sản đắt hơn và toàn mấy thứ ôi, mực bé tí, nhũn nhèo, ko có cá lăng như quảng cáo,món ăn thì không ngon."
	sentence2="Đồ ăn nguội và nhân viên thì hơi chậm chạm"
	# sentence = sentence+" "+sentence1+" "+sentence2
	s = 'nhà hàng được trang trí rất đẹp thoáng mát rộng rãi,nhưng nhân viên rất cẩu thả, thái độ không tốt'
	print(detect_sentence(s))
	print(pos_tag('nhân viên rất cẩu thả'))
Example #12
def keywords_extraction(sent):
  '''A naive function to pick good tokens for searching:
        currently keeps everything except the P and CH tags'''
  rs = ""
  for i in pos_tag(sent):
    if i[1] != 'P' and i[1] != 'CH':
      rs = rs + ' ' + i[0]
  return rs.strip()
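
A minimal usage sketch; the exact segmentation depends on underthesea's model, but P (pronoun) and CH (punctuation) tokens are always dropped:

from underthesea import pos_tag

# the pronoun 'Tôi' and the trailing '!' would be filtered out
print(keywords_extraction('Tôi muốn tìm nhà hàng ngon!'))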
Example #13
import pycrfsuite

from underthesea import pos_tag, word_tokenize

def predict(sentence):
    words = word_tokenize(sentence)
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    # extract_features is a project-local feature extractor
    tokens = [(word, tag, "X") for word, tag in pos_tag(sentence)]
    X = extract_features(tokens)
    y_pred = tagger.tag(X)
    # pair each word with its predicted label (the original variable names were swapped)
    output = [(word, label) for label, word in zip(y_pred, words)]
    return output
Example #14
def pos_tag(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.pos_tag(text)
        result["output"] = tags
    except Exception:
        result = {"error": "Bad request!"}
    return JsonResponse(result)
Example #15
def processing_NER(sentence):

    pos = pos_tag(sentence)
    sentence = word_tokenize(sentence)
    X = [sent2features(pos)]
    import time
    start_time = time.time()
    with open('model_CRF_NER.pkl', 'rb') as fp:
        crf = pickle.load(fp)
    pred = crf.predict(X)
    pred = np.array(pred)
    pred = pred.flatten()
    end_time = time.time()
    print('total run-time: %f ms' % ((end_time - start_time) * 1000))
    # reconstruct multi-word entity spans from the BIO labels
    sentence2string = ''
    words = []
    tag = []
    i = 0
    if (len(pred) >= 2):
        for word, label in list(zip(sentence, pred)):
            if label[0] == 'B':
                sentence2string = ''
                sentence2string += (word)
                tag.append(label[2:])
            if label[0] == 'I' and word != ',':
                sentence2string += (' ' + word)
            if label[0] == 'I' and word == ',':
                sentence2string += (word)
            if label[0] == 'I' and (i + 1 == len(pred)):
                words.append(sentence2string)
            if ((i + 1) > len(pred)):
                break
            if ((i + 1) < len(pred)):
                if label[0] == 'I' and pred[i + 1][0] == 'O':
                    words.append(sentence2string)
                if label[0] == 'I' and pred[i + 1][0] == 'B':
                    words.append(sentence2string)
                if label[0] == 'B' and pred[i + 1][0] == 'O':
                    words.append(sentence2string)
                if label[0] == 'B' and pred[i + 1][0] == 'B':
                    words.append(sentence2string)
            if ((i + 1) == len(pred)):
                if label[0] == 'B':
                    words.append(sentence2string)
            i = i + 1
        return words, tag
    if (len(pred) < 2):
        for word, label in list(zip(sentence, pred)):
            if label[0] == 'B':
                tag = []
                sentence2string = ''
                sentence2string += (word + ' ')
                tag.append(label[2:])
                words.append(sentence2string)
        return words, tag
Example #16
 def response_message(self, latest_message):
     list_pos_tag_text = pos_tag(latest_message)
     subjects = self.get_subject_from_text(list_pos_tag_text)
     record = db.get_info_gv_from_mon_hoc(subjects)
     if subjects == '0' or len(record) == 0:
         message = 'Chưa có thông tin giáo viên cho môn học này bạn nhé !'
         return message
     teacher = ', '.join([', '.join(i) for i in record])
     message = "Môn học " + subjects + ' có thầy/cô ' + teacher + 'dạy nhé. Bạn có thể lên trang tín chỉ để đăng kí những thầy,cô này .'
     return message
Example #17
    def encode(self, text):

        text = text.lower()
        text_format = text.translate(self.translator)
        text_split = word_tokenize(text_format)
        text_values = [self.dictionary.get(i, self.dictionary["<unknown>"]) for i in text_split]
        text_tags = [self.tags_dictionary.get(i[1].lower(), self.tags_dictionary["<unknown>"]) for i in
                     pos_tag(text_format)]
        text_values = self.resize(text_values, 50, self.dictionary["<unknown>"])
        text_tags = self.resize(text_tags, 50, self.tags_dictionary["<unknown>"])
        return text_values, text_tags
Example #18
def chunk(sentence, format=None):
    """
    chunk a sentence to phrases 
    
    :param unicode sentence: raw sentence
    :return: list of tuple with word, pos tag, chunking tag 
    :rtype: list 
    """
    sentence = pos_tag(sentence)
    crf_model = ChunkingCRFModel.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #19
def process_match_info(text):
    res = {}
    tagged_sentence = pos_tag(text)
    names = []
    for i, comp in enumerate(tagged_sentence):
        if comp[1] == "Np":
            if i - 1 >= 0 and tagged_sentence[i - 1][1] == "Np":
                names[-1] = names[-1] + " " + comp[0]
            else:
                names.append(comp[0])
    res["names"] = names
    return res
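
A minimal usage sketch; whether adjacent proper nouns get merged depends on how the tagger segments the input, so the names below are illustrative:

# consecutive Np tokens are joined into a single name
info = process_match_info('Nguyễn Văn A gặp Trần Văn B tại Hà Nội')
# info['names'] collects the proper-noun spans found by pos_tag
print(info['names'])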
Example #20
def ner():
    a = request.form['content']
    corpus = sent_tokenize(a)
    b = []
    for sen in corpus:
        x_test = []
        for word in word_tokenize(sen, format="text").split(' '):
            x_test.extend(pos_tag(word))
        b.append(x_test)
    b1 = [ner_train.get_features(s) for s in b]
    c = crf.predict(b1)
    return json.dumps([b, c])
Example #21
def scrape(url):
    print('Getting content for page: ' + url)
    doc = getContentFromURL(url)
    print('Extracting nouns from the content...')
    nouns = []
    if doc is not None and len(doc.strip()) > 0:
        nouns = [
            word.lower() for word, tu_loai in pos_tag(doc)
            if tu_loai == 'N' and hp.isValidWord(word)
        ]
    print('Done extracting nouns.')
    return nouns
Example #22
 def load_from_raw_content(raw_content):
     sentences = raw_content.split("\n")
     headers = sentences[:3]
     headers = dict([UDSentence._extract_header(_) for _ in headers])
     text = sentences[-1]
     headers["text"] = text
     headers["type"] = "bronze"
     headers["authors"] = BOT_VERSION
     pos_tags = pos_tag(text)
     dp_tags = dependency_parse(text)
     rows = [(item[0][0], item[0][1], str(item[1][1]), item[1][2]) for item in zip(pos_tags, dp_tags)]
     s = UDSentence(rows=rows, headers=headers)
     return s
Example #23
 def test_accuracy(self):
     test_dir = join(dirname(__file__), "samples")
     files = listdir(test_dir)
     ids = [f.split(".")[0] for f in files]
     for id in ids:
         file = join(test_dir, "%s.txt" % id)
         sentence = load_input(file)
         actual = pos_tag(sentence)
         expected = load_output(file)
         if actual != expected:
             print("Fail {}".format(id))
             save_temp(id, actual)
         self.assertEqual(actual, expected)
Example #24
def main():
    reverb = Reverb()
    for line in fileinput.input():
        tokens = word_tokenize(line.strip())
        tokens_tagged = pos_tag(tokens)
        print(tokens_tagged)
        pattern_tags = reverb.extract_reverb_patterns_tagged_ptb(tokens_tagged)
        print(pattern_tags)
        if reverb.detect_passive_voice(pattern_tags):
            print("Passive Voice: True")
        else:
            print("Passive Voice: False")
        print("\n")
    fileinput.close()
Example #25
    def ner(self, text, mode=None):
        if self.tag is None:
            self.tag = self.process_file.read_bin(self.path_to_model)
        for i in range(0, 1):
            if i == 0:
                pos_sent = pos_tag(text)
            test_features = self.w2.sent2features(
                SentenceGetter.combine_word_with_pos(pos_sent))
            prediction = self.tag.predict([test_features])
            # if i == 0 and 'PRODUCT' not in prediction:
            #     pos_sent = [(word[0].replace('_', ' ').strip(), word[1]) for word in
            # annotator.pos_tag(tokenize(text))[0]]

        return Tagger().combine_tag_with_word(prediction[0], text, mode)
Example #26
 def underthesea_annotate(self, text, mode):
     if mode == 'sent_tokenize':
         return sent_tokenize(text)
     elif mode == 'word_tokenize':
         return word_tokenize(text)
     elif mode == 'pos_tag':
         return pos_tag(text)
     elif mode == 'chunk':
         return chunk(text)
     elif mode == 'ner':
         return ner(text)
     elif mode == 'classify':
         return classify(text)
     elif mode == 'sentiment':
         return sentiment(text)
     else:
         raise Exception("Wrong request, please check your request")
Example #27
def sentence_segment(doc, lower, candidate_pos):
    """Keep only the words whose POS tag is in candidate_pos"""
    sentences = []
    for sent in doc:
        postag = pos_tag(sent)
        words = [x[0] for x in postag]
        selected_words = []
        for i in range(len(words)):
            # keep only words with a candidate POS tag
            if postag[i][1] in candidate_pos:
                if lower is True:
                    selected_words.append(words[i].lower())
                else:
                    selected_words.append(words[i])
        sentences.append(' '.join(selected_words))

    return '. '.join(sentences)
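
A minimal usage sketch, assuming candidate_pos uses the same tag names as the examples above; the selection is only as good as the tagger:

# keep only nouns and verbs, lower-cased, one pseudo-sentence per input line
doc = ['Nhà hàng được trang trí rất đẹp', 'Nhân viên rất cẩu thả']
print(sentence_segment(doc, lower=True, candidate_pos=['N', 'V']))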
Example #28
 def _preprocess(self):
     tokens = underthesea.pos_tag(self.sentence)
     self.n = len(tokens)
     words_service = WordsService()
     synonyms_service = SynonymsService()
     for word, pos in tokens:
         syn = [word]
         pos = uts_pos_convert(pos)
         word_object = words_service.find_one(word, pos)
         if word_object is not None:
             cursor = synonyms_service.find(id_word_1=word_object['_id'])
             for record in cursor:
                 s_word_object = words_service.find_one(_id=ObjectId(record['id_word_2']))
                 syn.append(s_word_object['word'])
             cursor = synonyms_service.find(id_word_2=word_object['_id'])
             for record in cursor:
                 s_word_object = words_service.find_one(_id=ObjectId(record['id_word_1']))
                 syn.append(s_word_object['word'])
         self._word_syn_list.append(syn)
Example #29
def generateVariants(untokenize_text):
    words = pos_tag(untokenize_text)
    for i in range(0,len(words)):
        words[i] = (words[i][0].replace(' ','_'),words[i][1])
    
    tokens = words
    
    combinations = generateCombinations(tokens, 0.001)
    num_variants = functools.reduce(lambda x, y: x * y, [len(c) for c in combinations])

    # double the threshold until the number of variants is manageable
    base_line = 0.001
    while num_variants > 10000:
        base_line = base_line * 2
        combinations = generateCombinations(tokens, base_line)
        num_variants = functools.reduce(lambda x, y: x * y, [len(c) for c in combinations])
     
    combinations = list(itertools.product(*combinations))
    combinations = [' '.join(e) for e in combinations]
    return combinations
Example #30
 def read_raw_file(self) -> list:
     f = open("test_a1000.txt", "r+")
     noun_list = []
     adj_list = []
     for line in f:
         result = pos_tag(line)
         print(result)
         print('\n')
         record_n = []
         record_adj = []
         self.sentences.extend(sent_tokenize(line))
         for item in result:
             if self.is_noun(item[1]) and self.one_word_prune(item[0]):
                 record_n.append(str(item[0]).lower())
             if item[1] == 'A' or item[1] == 'AP':
                 record_adj.append(str(item[0]).lower())
         noun_list.append(record_n)
         adj_list.append(record_adj)
     self.transaction = noun_list
     return noun_list