Exemplo n.º 1
0
def trans_list(text, root):
    """Collect dictionary translations for every line of *text*.

    Each line is split on single spaces.  For every word, progressively
    shorter prefixes of that word (down to two characters) are looked up
    with ``ngram.search``.  A candidate phrase returned by the search is
    accepted when its first token equals the word or the current prefix
    and each following token equals the following word of the line.  The
    value ``ngram.findValue`` returns for an accepted phrase is
    lower-cased, split on ``" & "`` and added to the line's result; the
    words covered by the phrase are then excluded from further matching.

    Args:
        text: Iterable of line strings.
        root: Opaque handle passed through to ``ngram.search`` and
            ``ngram.findValue``.

    Returns:
        A list with one entry per input line; each entry is the list of
        distinct translations found on that line, in first-seen order.
    """
    trans_set = []

    for line in text:
        word = line.replace("\n", "").split(" ")
        tmp = []
        seen = set()  # keeps `tmp` duplicate-free without losing its order

        for x in range(len(word)):
            # " " can never come out of split(" "); it marks a word that an
            # earlier phrase match has already consumed.
            if word[x] == " ":
                continue

            word_length = len(word[x])
            matched = False
            while word_length > 1 and not matched:
                prefix = word[x][:word_length]
                for result in ngram.search(root, prefix):
                    value = result.split(" ")
                    span = len(value)
                    # The phrase has to fit inside the rest of the line.
                    if x + span > len(word):
                        continue
                    # First token may match the whole word or only its prefix.
                    if value[0] != word[x] and value[0] != prefix:
                        continue
                    # Remaining tokens must match the following words exactly.
                    if word[x + 1:x + span] != value[1:]:
                        continue

                    val = ngram.findValue(root, result)
                    if val == 'none':
                        continue

                    for synonym in val.lower().split(" & "):
                        if synonym not in seen:
                            seen.add(synonym)
                            tmp.append(synonym)
                    word[x:x + span] = [" "] * span
                    matched = True
                    # word[x] is now the consumed marker, so no further
                    # candidate can match; stop scanning.
                    break
                word_length -= 1

        trans_set.append(tmp)

    return trans_set
Exemplo n.º 2
0
def trans_list(l, root):
    """Translate dictionary phrases found in each line of *l*.

    Every line is split on whitespace and scanned left to right.  Words
    longer than one character are passed to ``ngram.search``; the first
    returned phrase that the text starting at the current word begins
    with is accepted.  Its ``ngram.findValue`` result is split on
    ``" & "`` and the scan resumes after the words the phrase covers.

    Args:
        l: Iterable of line strings.
        root: Opaque handle passed through to ``ngram.search`` and
            ``ngram.findValue``.

    Returns:
        A tuple ``(trans_set, kor_set)``.  Both are lists with one entry
        per input line: ``trans_set[n]`` holds the translated values and
        ``kor_set[n]`` the matched source phrases for line ``n``.
    """
    trans_set = []
    kor_set = []

    for raw_line in l:
        line = raw_line.replace("\n", "")
        split_one_line = line.split()
        print(line)
        if not split_one_line:
            trans_set.append([])
            kor_set.append([])
            continue

        line_set = []
        kortmp_set = []
        article_word_index = 0
        while article_word_index < len(split_one_line):
            current = split_one_line[article_word_index]
            if len(current) != 1:
                search_result_list = ngram.search(root, current)
                if search_result_list != "none":
                    for x in search_result_list:
                        length = len(x.split())
                        # An entry without any token would never advance
                        # the scan position; ignore it.
                        if length == 0:
                            continue
                        window = split_one_line[
                            article_word_index:article_word_index + length]
                        # A phrase may end on the last word of the line;
                        # it is only rejected when it would run past it.
                        if len(window) < length:
                            continue
                        if not " ".join(window).startswith(x):
                            continue

                        print("find : ", x)
                        c = ngram.findValue(root, x)
                        kortmp_set.append(x)
                        print("ngram result : ", c)
                        # Skip the words consumed by this phrase (the
                        # shared increment below adds the final step).
                        article_word_index += length - 1
                        line_set.extend(c.split(" & "))
                        break
            article_word_index += 1

        trans_set.append(line_set)
        kor_set.append(kortmp_set)

    return trans_set, kor_set
Exemplo n.º 3
0
def extractNNP_KOR(root, i):
    """Collect dictionary values for every line of one Korean source file.

    The file path is built from the module-level ``header_path`` template
    with ``lang='kor'`` and ``idx=i``.  Each line is split on single
    spaces.  For every word, progressively shorter prefixes (down to two
    characters) are looked up with ``ngram.search``.  A candidate phrase
    is accepted when its first token equals the word or the current
    prefix and each following token equals the following word of the
    line.  The value ``ngram.findValue`` returns for an accepted phrase
    is split on ``" & "`` and appended to the line's result; the words
    covered by the phrase are then excluded from further matching.

    Args:
        root: Opaque handle passed through to ``ngram.search`` and
            ``ngram.findValue``.
        i: Index substituted into ``header_path`` to select the file.

    Returns:
        A list with one list of values per line of the file, or ``-1``
        when the file cannot be opened.
    """
    path = header_path.format(lang='kor', idx=i)
    try:
        # Plain "r" already gives universal-newline handling in text mode;
        # the legacy "rU" flag is rejected by Python 3.11 and later.
        read_page = open(path, "r", encoding='UTF8')
    except OSError:
        return -1

    trans_set = []
    with read_page:
        for line in read_page:
            tmp = []
            word = line.replace("\n", "").split(" ")

            for x in range(len(word)):
                # " " can never come out of split(" "); it marks a word
                # that an earlier phrase match has already consumed.
                if word[x] == " ":
                    continue

                word_length = len(word[x])
                matched = False
                while word_length > 1 and not matched:
                    prefix = word[x][:word_length]
                    for result in ngram.search(root, prefix):
                        value = result.split(" ")
                        span = len(value)
                        # The phrase has to fit inside the rest of the line.
                        if x + span > len(word):
                            continue
                        # First token may match the word or only its prefix.
                        if value[0] != word[x] and value[0] != prefix:
                            continue
                        # Remaining tokens must match the following words.
                        if word[x + 1:x + span] != value[1:]:
                            continue

                        tmp += ngram.findValue(root, result).split(" & ")
                        word[x:x + span] = [" "] * span
                        matched = True
                        # word[x] is now the consumed marker, so no
                        # further candidate can match; stop scanning.
                        break
                    word_length -= 1

            trans_set.append(tmp)

    return trans_set
Exemplo n.º 4
0
def trans_list(l, root):
    """Translate dictionary phrases and word prefixes in each line of *l*.

    Every line is split on whitespace and scanned left to right.  For a
    word longer than one character two lookups are made:

    1. ``ngram.search`` is called with the word; the first returned
       phrase that the text starting at the current word begins with is
       accepted, its ``ngram.findValue`` result is split on ``" & "``,
       and the scan position moves to the last word of the phrase.
    2. Prefixes of the word at the (possibly moved) scan position are
       passed to ``ngram.findValue``, starting one character short of
       the full word and trying at most three lengths, never shorter
       than two characters.  The first prefix whose value is not
       ``"none"`` is recorded.

    Args:
        l: Iterable of line strings.
        root: Opaque handle passed through to ``ngram.search`` and
            ``ngram.findValue``.

    Returns:
        A tuple ``(trans_set, korset)``.  Both are lists with one entry
        per input line: ``trans_set[n]`` holds the translated values and
        ``korset[n]`` the matched source phrases/prefixes for line ``n``.
    """
    trans_set = []
    korset = []

    for raw_line in l:
        split_one_line = raw_line.replace("\n", "").split()
        if not split_one_line:
            korset.append([])
            trans_set.append([])
            continue

        line_set = []
        korTmp = []
        article_word_index = 0
        while article_word_index < len(split_one_line):
            if len(split_one_line[article_word_index]) != 1:
                search_result_list = ngram.search(
                    root, split_one_line[article_word_index])
                for x in search_result_list:
                    length = len(x.split())
                    # An entry without any token would never advance the
                    # scan position; ignore it.
                    if length == 0:
                        continue
                    window = split_one_line[
                        article_word_index:article_word_index + length]
                    # A phrase may end on the last word of the line; it is
                    # only rejected when it would run past it.
                    if len(window) < length:
                        continue
                    if not " ".join(window).startswith(x):
                        continue

                    c = ngram.findValue(root, x)
                    # Move to the last word covered by the phrase.
                    article_word_index += length - 1
                    korTmp.append(x)
                    line_set.extend(c.split(" & "))
                    break

                # Prefix fallback.
                # NOTE(review): this also runs right after a phrase match,
                # on the last word of that phrase — confirm that is intended.
                current = split_one_line[article_word_index]
                length = len(current) - 1
                wordLimit = length - 3 if length > 3 else 1
                while length > wordLimit:
                    candi = ngram.findValue(root, current[:length])
                    if candi != "none":
                        line_set.extend(candi.split(" & "))
                        korTmp.append(current[:length])
                        break
                    length -= 1

            article_word_index += 1

        korset.append(korTmp)
        trans_set.append(line_set)

    return trans_set, korset