Example #1
def ner(sentence, format=None):
    """
    Locate and classify named entities in text

    Parameters
    ----------
    sentence : {unicode, str}
        raw sentence

    Returns
    -------
    tokens : list of tuples (word, POS tag, chunk tag, NER tag)
        tagged sentence

    Examples
    --------

    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import ner
    >>> sentence = "Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô"
    >>> ner(sentence)
    [('Ông', 'Nc', 'B-NP', 'O'),
     ('Putin', 'Np', 'B-NP', 'B-PER'),
     ('ca ngợi', 'V', 'B-VP', 'O'),
     ('những', 'L', 'B-NP', 'O'),
     ('thành tựu', 'N', 'B-NP', 'O'),
     ('vĩ đại', 'A', 'B-AP', 'O'),
     ('của', 'E', 'B-PP', 'O'),
     ('Liên Xô', 'Np', 'B-NP', 'B-LOC')]
    """
    # segment, POS-tag and chunk the raw sentence, then run the CRF NER model
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #2
import nltk
import underthesea
# pad_sequences may also live under tensorflow.keras.preprocessing.sequence
from keras.preprocessing.sequence import pad_sequences


def input_lstm_char(text):
    # words2idx, postag2idx, chars2idx, max_len, max_len_char and the
    # dict_* vocabularies are assumed to be module-level definitions
    sentences = nltk.sent_tokenize(text)
    idx_words = [words2idx(s) for s in sentences]
    idx_postags = [
        postag2idx([word[1] for word in underthesea.chunk(s)])
        for s in sentences
    ]
    idx_chars = [chars2idx(s) for s in sentences]

    x_words = pad_sequences(maxlen=max_len,
                            sequences=idx_words,
                            value=dict_words['PAD'],
                            padding='post',
                            truncating='post')

    x_postags = pad_sequences(maxlen=max_len,
                              sequences=idx_postags,
                              value=dict_pos_tags['PAD'],
                              padding='post',
                              truncating='post')

    x_chars = pad_sequences(maxlen=max_len * max_len_char,
                            sequences=idx_chars,
                            value=dict_chars['PAD'],
                            padding='post',
                            truncating='post')

    # x_postags is built but never returned; the downstream model
    # apparently consumes only the word and character inputs
    return x_words, x_chars
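A minimal usage sketch, assuming the module-level names above exist and that a compiled two-input Keras model (word ids plus character ids) is available; `model` and the sample text are assumptions here:

# hedged sketch: `model`, `max_len`, `max_len_char` are assumed to be defined
x_words, x_chars = input_lstm_char("Hà Nội là thủ đô của Việt Nam.")
# characters were padded flat to max_len * max_len_char, so reshape them
# to (batch, max_len, max_len_char) for a per-word character encoder
x_chars = x_chars.reshape((len(x_chars), max_len, max_len_char))
predictions = model.predict([x_words, x_chars])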
Example #3
def chunking(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.chunk(text)
        result["output"] = tags
    except (KeyError, ValueError, UnicodeDecodeError):
        # malformed JSON, missing "text" key, or bad encoding
        result = {"error": "Bad request!"}
    return JsonResponse(result)
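A quick way to exercise this view, assuming it is routed at /chunking/ in urls.py (the path is hypothetical):

import json
from django.test import Client

client = Client()
response = client.post("/chunking/",  # hypothetical route for the view above
                       data=json.dumps({"text": "Hà Nội là thủ đô của Việt Nam"}),
                       content_type="application/json")
print(response.json())  # {"output": [...]} on success, {"error": "Bad request!"} otherwise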
Example #4
def ner(sentence, format=None):
    """
    named entity recognition

    :param unicode|str sentence: raw sentence
    :return: NER-tagged sentence
    :rtype: list
    """
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #5
 def test_accuracy(self):
     test_dir = join(dirname(__file__), "samples")
     files = listdir(test_dir)
     ids = [f.split(".")[0] for f in files]
     for id in ids:
         file = join(test_dir, "%s.txt" % id)
         sentence = load_input(file)
         actual = chunk(sentence)
         expected = load_output(file)
         if actual != expected:
             print("Fail {}".format(id))
             save_temp(id, actual)
         self.assertEqual(actual, expected)
Example #6
def get_dict_pos_tag(paragraph):
    # collect the POS tags seen in this paragraph and merge them into
    # the persisted tag dictionary at data/dict/postag.txt
    words = underthesea.chunk(paragraph)
    pos_tags = {word[1] for word in words}
    try:
        with open('data/dict/postag.txt', 'r') as f:
            tags = f.read().split('\n')
            pos_tags.update(tags)
    except FileNotFoundError as e:
        print(e)
    # rewrite the dictionary file with the merged, sorted tag set
    with open('data/dict/postag.txt', 'w') as f:
        f.write('\n'.join(sorted(pos_tags)))
Example #7
def tokenize(para):
    # split the paragraph into sentences
    try:
        sentences = para.split('. ')
    except AttributeError as e:
        print(e)
        return []

    # restore the '.' that the split removed (all but the last sentence)
    for idx, sent in enumerate(sentences[:-1]):
        sentences[idx] = sent + "."

    # chunk each sentence and append a default 'O' NER tag
    # to every (word, POS tag, chunk tag) triple
    for idx, sent in enumerate(sentences):
        sentences[idx] = underthesea.chunk(sent)
        for i in range(len(sentences[idx])):
            sentences[idx][i] += ('O',)
    return sentences
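For reference, each sentence returned by tokenize is a list of 4-tuples in (word, POS tag, chunk tag, 'O') form, i.e. chunk output padded with a placeholder NER label; a small check (the tags shown are illustrative):

sentences = tokenize("Hà Nội là thủ đô của Việt Nam. Thành phố rất cổ kính.")
print(len(sentences))   # 2 sentences
print(sentences[0][0])  # e.g. ('Hà Nội', 'Np', 'B-NP', 'O')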
Example #8
 def underthesea_annotate(self, text, mode):
     if mode == 'sent_tokenize':
         return sent_tokenize(text)
     elif mode == 'word_tokenize':
         return word_tokenize(text)
     elif mode == 'pos_tag':
         return pos_tag(text)
     elif mode == 'chunk':
         return chunk(text)
     elif mode == 'ner':
         return ner(text)
     elif mode == 'classify':
         return classify(text)
     elif mode == 'sentiment':
         return sentiment(text)
     else:
         raise ValueError("Wrong request, please check your request: %s" % mode)
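As a design note, the same dispatch reads more compactly as a lookup table, which keeps the supported modes in one place; a sketch assuming the same underthesea imports used above:

from underthesea import (sent_tokenize, word_tokenize, pos_tag, chunk,
                         ner, classify, sentiment)

ANNOTATORS = {
    'sent_tokenize': sent_tokenize,
    'word_tokenize': word_tokenize,
    'pos_tag': pos_tag,
    'chunk': chunk,
    'ner': ner,
    'classify': classify,
    'sentiment': sentiment,
}

# drop-in replacement for the method above
def underthesea_annotate(self, text, mode):
    try:
        return ANNOTATORS[mode](text)
    except KeyError:
        raise ValueError("Wrong request, please check your request: %s" % mode)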
Example #9
from underthesea import chunk

text = "176 Nguyễn Chí Thanh"
rs = chunk(text)
print(rs)
mon = []    # candidate dish names ("món")
duong = []  # candidate street names ("đường")
checkduong = ['Đường', 'đường']
for i in range(len(rs)):
    if rs[i][1] != 'Np':
        continue
    prev = rs[i - 1] if i > 0 else None
    # proper noun right after "Đường"/"đường": a street name
    if prev is not None and prev[0] in checkduong:
        duong.append(rs[i][0])
    # proper noun right after a house number: "<number> <street>"
    elif prev is not None and str(prev[0]).isnumeric():
        duong.append(prev[0] + " " + rs[i][0])
    # any other proper noun: a candidate dish name
    else:
        mon.append(rs[i][0])
print(mon)
print(duong)
Example #10
 def test_simple_cases(self):
     sentence = u""
     actual = chunk(sentence)
     expected = []
     self.assertEqual(actual, expected)
Example #11
 def test_accuracy(self):
     output = chunk(
         u"Tổng Bí thư: Ai trót để tay nhúng chàm thì hãy sớm tự gột rửa")
     self.assertEqual(len(output), 13)
Example #12
def search(request):
    search = str(request.GET.get("search"))
    rs = chunk(search)
    print(rs)
    mon = []    # candidate dish names ("món")
    duong = []  # candidate street names ("đường")
    quan = []   # candidate district names ("quận")
    checkduong = ['Đường', 'đường']
    checkquan = ['Quận', 'quận']
    # classify each Np/N/M token by its neighbours:
    # district, street, "<number> <street>", or dish candidate
    for i in range(len(rs)):
        if rs[i][1] not in ['Np', 'N', 'M']:
            continue
        if i == 0:
            if len(rs) > 1 and rs[i + 1][1] == 'E':
                mon.append(rs[i][0])
        else:
            if rs[i - 1][0] in checkquan:
                quan.append(rs[i][0])
            elif rs[i - 1][0] in checkduong:
                duong.append(rs[i][0])
            elif str(rs[i - 1][0]).isnumeric():
                soduong = rs[i - 1][0] + " " + rs[i][0]
                duong.append(soduong)
            elif rs[i - 1][1] == 'V' or (i + 1 < len(rs) and rs[i + 1][1] == 'E'):
                mon.append(rs[i][0])

    print("Quận: ", quan)
    print("Đường: ", duong)
    print("Món: ", mon)
    dt = []
    g = SPARQLWrapper("http://localhost:7200/repositories/HNAG")

    if len(quan) > 0:
        print(mon)
        print(quan)
        # note: user terms go straight into the query string;
        # see the escaping sketch after this view
        regex = ""
        for m in mon:
            regex += "FILTER regex(?name,'%s','i')." % m
        for q in quan:
            regex += "FILTER regex(?districtName,'%s','i')." % q
        print(regex)
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT *
            WHERE {
                ?place res:name ?name.                
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                ?place res:address ?address.
                ?address res:district ?district.
                ?district res:name ?districtName                
                """ + regex + """            
            } 
            ORDER BY DESC(?rating)
            """
        print(queryString)
        g.setQuery(queryString)
    elif len(duong) > 0:
        print(mon)
        print(duong)
        regex = """"""
        for i in range(len(mon)):
            regex += """FILTER regex(?name,""" + """'""" + mon[
                i] + """','i')."""
        for i in range(len(duong)):
            regex += """FILTER regex(?street,""" + """'""" + duong[
                i] + """','i')."""
        print(regex)
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?address res:street ?street.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                """ + regex + """            
            } 
            ORDER BY DESC(?rating)
            """
        print(queryString)
        g.setQuery(queryString)
    # nothing recognised at all: match the raw search string against names
    elif len(mon) == 0:
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                FILTER regex(?name,""" + """'""" + search + """','i').            
            } 
            ORDER BY DESC(?rating)
            """
        print(queryString)
        g.setQuery(queryString)

    # only dish candidates: match the first one against restaurant names
    else:
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                FILTER regex(?name,""" + """'""" + mon[
            0] + """','i').            
            } 
            ORDER BY DESC(?rating)
            """
        print(queryString)
        g.setQuery(queryString)

    g.setReturnFormat(JSON)
    results = g.query().convert()

    for row in results["results"]["bindings"]:
        print(row["id"]["value"])
        lJson = {
            "url": row["url"]["value"],
            "name": row["name"]["value"],
            "rate": row["rating"]["value"],
            "image": row["image"]["value"]
        }
        dt.append(lJson)
    return JsonResponse({"posts": dt})
Example #13
from src.eval import whole_word_position
# word_sent is the legacy name of underthesea's word segmenter;
# later releases expose it as word_tokenize
from underthesea import word_sent, pos_tag, chunk

test = "Đối với các chuyên khoa khác như : Phẩu thuật tổng quát ( nội trú 5 năm ) , sản ( nội trú 5 năm ) , chấn thương chỉnh hình ( nội trú 5 năm ) . Và chuyên sâu mỗi chuyên khoa tầm ( 1 - 3 năm tùy chuyên khoa ) . Nói chung cũng tầm 15 - 16 năm ( cho lầm sàn , chưa kể Ph.D )"
print(word_sent(test))
print(chunk(test))

from underthesea.word_sent.model_crf import CRFModel
Example #14
def ner(sentence, format=None):
    # the format parameter is needed here: without it, the bare name
    # `format` below would resolve to Python's builtin function
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result