예제 #1
0
def getMorphemes(word):
    """
    Return the morphemes of ``word``.

    A word known to ``WordMorphemeDicts`` yields its dictionary entry; any
    other word is handed to ``segment_to_morphemes``. If anything raises,
    the failure is reported on stdout and the function falls through,
    implicitly returning ``None``.
    """
    try:
        known = WordMorphemeDicts.contains(word)
        if not known:
            return segment_to_morphemes(word)
        return WordMorphemeDicts.get(word)
    except Exception:
        message = "Error to get morphemes from word '" + word + " '"
        print(message)
예제 #2
0
    def form_roots_vects(self):
        """
        Build one averaged vector per root.

        For every dictionary word that has an entry in
        ``self.modelWord_from_word``, the vector ``self.model[model_word]``
        is collected under each root of that word. The result maps every
        collected root to the sum of its vectors divided by their count.
        """
        vects_by_root = defaultdict(list)
        for word in WordMorphemeDicts.words():
            if word not in self.modelWord_from_word:
                continue
            model_word = self.modelWord_from_word[word]
            for root in WordMorphemeDicts.get(word).roots:
                vects_by_root[root].append(self.model[model_word])

        averaged = dict()
        for root, vects in vects_by_root.items():
            averaged[root] = sum(vects) / len(vects)
        return averaged
예제 #3
0
def get_morph(x, w, fs, d, filter_key):
    """
    Count how often substring ``x`` occurs in word ``w`` as a whole morpheme.

    The word is scanned left to right (after removing ``-`` characters)
    while stepping through the ``(tag, morph)`` pairs of its dictionary
    entry, so ``tag``/``morph`` track the pair whose characters are
    currently being consumed. For every occurrence of ``x`` whose feature
    key equals ``filter_key``:

    * ``d[key]['ALL']`` is incremented;
    * ``d[key][tag]`` is also incremented when ``x`` equals the current
      morph.

    :param x: substring to look for.
    :param w: dictionary word; looked up in ``WordMorphemeDicts`` before the
        hyphens are stripped.
    :param fs: feature callables, each invoked as ``f(len(x), w, index)``.
    :param d: nested counter updated in place as ``d[key][name] += 1``
        (presumably a defaultdict of defaultdicts -- confirm with caller).
    :param filter_key: only occurrences producing this key are counted.

    Any exception raised during the scan (including running out of
    morphemes) is swallowed and the word is printed instead.
    """
    it = iter(WordMorphemeDicts.get(w).all_in_order)
    l = len(x)
    # next(it) rather than it.next(): the builtin works on Python 2.6+ and 3.
    tag, morph = next(it)
    morph_len = len(morph)

    w = w.replace(u'-', '')

    try:
        for inx in range(len(w) - l + 1):
            sub = w[inx: inx + l]

            # All characters of the current morph are consumed: advance.
            if morph_len == 0:
                tag, morph = next(it)
                morph_len = len(morph)

            if sub == x:
                # Match, then build key from features
                key = tuple(f(l, w, inx) for f in fs)

                if key == filter_key:
                    if sub == morph:
                        d[key][tag] += 1

                    d[key]['ALL'] += 1

            # One character of the current morph has been passed.
            morph_len -= 1
    except Exception:
        # Best effort: report the offending word and keep going.
        print(w)
예제 #4
0
def getSplits(word):
    """
    Return the morpheme substrings of ``word`` as a list.

    For a word known to ``WordMorphemeDicts`` the substrings come from its
    ``all_in_order`` pairs, in order. Otherwise the word is passed to
    ``parse`` and the third element of its result is returned reversed.

    Both branches return a list (the fallback used to return a one-shot
    ``reversed`` iterator). If anything raises, the failure is reported on
    stdout and ``None`` is returned implicitly.
    """
    try:
        if WordMorphemeDicts.contains(word):
            return [sub for _tag, sub in WordMorphemeDicts.get(word).all_in_order]

        _, _, s = parse(word)
        return list(reversed(s))
    except Exception:
        # The original built this message but never printed it.
        print("Error to get morphemes from word '" + word + " '")
예제 #5
0
def get_morph_by_pos(x, w, fs, d, filter_key, pos):
    """
    Count whether ``x`` sits at a fixed offset from the end of word ``w``.

    ``pos`` is an offset relative to the end of ``w`` (callers in this file
    pass negative values); it is converted to an absolute index with
    ``len(w) + pos``. If the slice of ``w`` at that index equals ``x`` and
    its feature key equals ``filter_key``:

    * ``d[key]["ALL"]`` is incremented;
    * ``d[key][tag]`` is also incremented when all letters of the slice
      carry the same tag and ``(tag, x)`` is one of the word's
      ``all_in_order`` pairs.

    :param x: substring to look for.
    :param w: dictionary word, looked up in ``WordMorphemeDicts``.
    :param fs: feature callables, each invoked as ``f(len(x), w, index)``.
    :param d: nested counter updated in place as ``d[key][name] += 1``
        (presumably a defaultdict of defaultdicts -- confirm with caller).
    :param filter_key: only a slice producing this key is counted.
    :param pos: offset of the slice start relative to ``len(w)``.
    """
    entry = WordMorphemeDicts.get(w)
    tag_for_letter = entry.tag_for_letter
    all_in_order = entry.all_in_order

    l = len(x)
    start = len(w) + pos
    if start < 0:
        # The word is shorter than the requested offset. A negative start
        # would make the slices below wrap around from the end of the word
        # and match a position that does not exist.
        return

    sub = w[start: start + l]
    if x != sub:
        return

    key = tuple(f(l, w, start) for f in fs)
    if key != filter_key:
        return

    tags = set(tag_for_letter[start: start + l])
    # Only pull a tag out once we know exactly one exists; an empty tag
    # slice previously raised StopIteration here.
    if len(tags) == 1:
        tag = next(iter(tags))
        if (tag, sub) in all_in_order:
            d[key][tag] += 1

    d[key]["ALL"] += 1
예제 #6
0
def parse_word(word, start_from):
    """
    Guess a morpheme tag for every suffix-aligned substring of ``word``.

    Substrings all end ``start_from`` characters before the end of ``word``
    and grow leftwards one character at a time. For each substring a
    feature key is built; if the module-level statistics ``D`` have no
    entry for that key yet, they are gathered by scanning every dictionary
    word (``get_morph`` when ``start_from > 4``, ``get_morph_by_pos``
    otherwise). Otherwise the module-level ``matches`` counter is
    incremented.

    :param word: word to analyse.
    :param start_from: number of trailing characters to leave out.
    :returns: list of ``[probability, tag, substring]`` triples, one per
        substring, shortest first. The tag is the one with the highest
        ``count / D[key]['ALL']`` ratio; when no tag exceeds ``0.000001``
        the pair ``0.000001, 'R'`` is reported.
    """
    global D
    global matches

    # Each feature is called as f(substring_length, word, start_index).
    # NOTE(review): the right-context feature reads index pos + l + 1, i.e.
    # it skips the character directly after the substring; an earlier,
    # disabled variant used pos + l -- confirm which one is intended.
    features_f = [
        lambda _, w, pos: get(w, pos - 1),
        lambda l, w, pos: w[pos: pos + l],
        lambda l, w, pos: get(w, pos + l + 1),
    ]

    # Far from the word end the left-context feature is dropped.
    small_feature = start_from > 4
    if small_feature:
        features_f = features_f[1:]

    result = []
    for inx in range(1, len(word) - start_from + 1):
        offset = inx + start_from

        # word[-offset:-0] would be empty, hence the special case.
        if start_from != 0:
            sub = word[-offset:-start_from]
        else:
            sub = word[-offset:]

        sub_pos = len(word) - offset
        sub_key = tuple(f(len(sub), word, sub_pos) for f in features_f)

        if sub_key not in D:
            # Gather statistics for this key from the whole dictionary.
            for w in WordMorphemeDicts.words():
                if small_feature:
                    get_morph(sub, w, features_f, D, sub_key)
                else:
                    get_morph_by_pos(sub, w, features_f, D, sub_key, -offset)
        else:
            matches += 1

        # Apply the statistics to pick the most probable tag.
        max_prob, max_key = 0.000001, 'R'

        # .items() instead of the Python 2-only .iteritems().
        for key, value in D[sub_key].items():
            if key == 'ALL':
                continue

            p = (value + 0.0) / D[sub_key]['ALL']

            if p > max_prob:
                max_prob, max_key = p, key

        result.append([max_prob, max_key, sub])

    return result
예제 #7
0
        print '/'.join(slovorod_d[word]), ' | ', '/'.join(orig_list)

        #if mismatch_num > 100:
        #    break
        # if
    # if
# for

print mismatch_num
"""


# Dump every dictionary word together with its morpheme substrings to the
# file 'init_split', one word per line, formatted as "word : m1,m2,...".
# The with-block guarantees the file is closed even if a lookup or write fails.
with codecs.open('init_split', 'w', encoding='utf-8') as out:
    for w in WordMorphemeDicts.words():
        res = [s for t, s in WordMorphemeDicts.get(w).all_in_order]
        out.write(w + ' : ' + ','.join(res) + '\n')


#for w in after_root:
#    out.write(w + '\n')
## for
예제 #8
0
        return segment_to_morphemes(word)
    except Exception:
        print ("Error to get morphemes from word:" + word)
# def

if __name__ == "__main__":

    all_count = 0
    test_count = 0
    false_negative = 0
    root_count = 0
    pref_count = 0
    suf_count = 0
    equal_count = 0

    for word in WordMorphemeDicts.words():
        m_orig = WordMorphemeDicts.get(word)
        m_test = getMorphemes(word)
        all_count += 3
        is_equal = True

        count = 0
        n_count = 0
        for r in m_test.roots:
            if r in m_orig.roots:
                count += 1
            else:
                n_count += 1
            # if
        # for
        if len(m_orig.roots):