Пример #1
0
def parse_string(string):
    """
    Segment and POS-tag a string of Tunisian Arabic text.

    Each token produced by ``preprocess`` is handled by the first rule that
    applies: a previously saved parse is reused, a token that is not purely
    alphabetic is labelled 'PUNCT', a token whose detected language is not
    'AR' is labelled 'FW', and anything else goes through the stemmer.

    :param string: Tunisian Arabic text
    :return: list of parse tuples in the format ('word', 'POS')
    """
    cached = load_saved_parses()
    result = []
    for token in preprocess(string):
        if token in cached:
            # The stored parse is appended as a single entry, unchanged.
            result.append(cached[token])
        elif not token.isalpha():
            # Covers every non-alphabetic token (digits and mixed tokens
            # included), not only punctuation marks.
            result.append((token, 'PUNCT'))
        elif test_lang(token) != 'AR':
            result.append((token, 'FW'))
        else:
            candidates = stemmer(token)
            segments, tags = choose_best_parse(candidates, debug=False)
            tags = re.sub('UNINVBD', 'VBD', tags)
            # Default to noun for uninflected unknown words.
            tags = re.sub('UNIN', 'N', tags)
            # One tag per segment; tags are joined with '_' in the label.
            result.extend(zip(segments, tags.split('_')))
    return result
Пример #2
0
 def test_ave_vbz_freq(self):
     # The average frequency computed over the alternate forms of the
     # 'VBZ' parse must not be zero.
     candidates = stemmer('يكتبوا')
     vbz_parse = candidates.get('VBZ')
     # Only the generated forms are needed; the returned stem is ignored.
     _, alt_forms = make_alt_verb_forms(vbz_parse)
     self.assertNotEqual(compute_ave_freq(alt_forms), 0)
Пример #3
0
 def test_ave_unin_vbd_freq(self):
     # The average frequency computed over the alternate verb forms of the
     # 'UNIN' parse must not be zero.
     candidates = stemmer('كتب')
     unin_parse = candidates.get('UNIN')
     # Only the generated forms are needed; the returned stem is ignored.
     _, alt_forms = make_alt_unin_verb_forms(unin_parse)
     self.assertNotEqual(compute_ave_freq(alt_forms), 0)
Пример #4
0
 def test_ave_noun_freq(self):
     # The average frequency computed over the alternate noun forms of the
     # 'DET_N' parse must not be zero.
     candidates = stemmer('الكتاب')
     det_noun_parse = candidates.get('DET_N')
     # Only the generated forms are needed; the returned stem is ignored.
     _, alt_forms = make_alt_noun_forms(det_noun_parse)
     self.assertNotEqual(compute_ave_freq(alt_forms), 0)
Пример #5
0
 def make_alt_verb_forms(self):
     # The average frequency computed over the alternate verb forms of the
     # 'VBD' parse of 'كتبت' must not be zero.
     # The bare make_alt_verb_forms() call below is looked up in the global
     # scope, so it does not refer to this method of the same name.
     vbd = 'كتبت'
     parse = stemmer(vbd).get('VBD')
     # Only the generated forms are needed; the returned stem is ignored.
     _, word_forms = make_alt_verb_forms(parse)
     self.assertNotEqual(compute_ave_freq(word_forms), 0)

 # The method above is written as a test but its name lacks the `test`
 # prefix, so test discovery never runs it. Expose it under a discoverable
 # name while keeping the original name available.
 # NOTE(review): confirm the enclosing class has no other
 # `test_ave_vbd_freq` before merging.
 test_ave_vbd_freq = make_alt_verb_forms
Пример #6
0
 def test_make_unin_verb_forms(self):
     # The alternate forms generated from the 'UNIN' parse must match the
     # expected list exactly, regardless of order.
     expected_forms = [
         'يكتب', 'يكتبوا', 'يكتبو', 'تكتب', 'تكتبوا', 'تكتبو', 'نكتب',
         'نكتبوا', 'نكتبو', 'كتبت', 'كتبنا', 'كتبو', 'كتبوا'
     ]
     unin_parse = stemmer('كتب').get('UNIN')
     # Only the generated forms are needed; the returned stem is ignored.
     _, generated_forms = make_alt_unin_verb_forms(unin_parse)
     # Sort both sides so the comparison does not depend on ordering.
     self.assertEqual(sorted(generated_forms), sorted(expected_forms))
Пример #7
0
 def test_make_alt_noun_forms(self):
     # The alternate forms generated from the 'DET_N' parse must match the
     # expected list exactly, regardless of order.
     det_noun_parse = stemmer('الكتاب').get('DET_N')
     # Only the generated forms are needed; the returned stem is ignored.
     _, generated_forms = make_alt_noun_forms(det_noun_parse)
     expected_forms = [
         'كتابك', 'كتابكم', 'كتابنا', 'كتابه', 'كتابها', 'كتابهم',
         'كتابو', 'كتابي', 'كتابيا', 'لكتاب', 'كتاب'
     ]
     # Sort both sides so the comparison does not depend on ordering.
     self.assertEqual(sorted(generated_forms), sorted(expected_forms))
Пример #8
0
 def test_make_alt_vbd_forms(self):
     # The alternate forms generated from the 'VBD' parse must match the
     # expected list exactly, regardless of order.
     vbd_parse = stemmer('كتبت').get('VBD')
     # Only the generated forms are needed; the returned stem is ignored.
     _, generated_forms = make_alt_verb_forms(vbd_parse)
     expected_forms = [
         'كتبنا', 'كتبو', 'كتبوا', 'يكتب', 'يكتبوا', 'يكتبو', 'تكتب',
         'تكتبوا', 'تكتبو', 'نكتب', 'نكتبوا', 'نكتبو', 'كتب'
     ]
     # Sort both sides so the comparison does not depend on ordering.
     self.assertEqual(sorted(generated_forms), sorted(expected_forms))
Пример #9
0
def choose_best_stem_test(word, debug=False):
    """
    Return the stem of the parse selected as best for ``word``.

    :param word: word handed to ``stemmer``
    :param debug: when true, print the candidate parse dict; the flag is
        also forwarded to ``choose_best_parse``
    :return: result of ``extract_stem`` applied to the selected parse
    """
    candidates = stemmer(word)
    if debug:
        print("Parse dict is", candidates)
    # choose_best_parse returns a pair; only the parse itself is used here.
    chosen_parse, _ = choose_best_parse(candidates, debug=debug)
    return extract_stem(chosen_parse)
Пример #10
0
 def test_verb_suffix_defined(self):
     # The suffix extracted from the 'VBD' parse of self.vbd must be 'ت'.
     vbd_parse = stemmer(self.vbd).get('VBD')
     suffix = extract_suffix(vbd_parse)
     self.assertEqual('ت', suffix)
Пример #11
0
 def test_returns_dict_with_correct_key(self):
     # stemmer(self.vbd) must provide a non-None 'VBD' entry, and every key
     # of the returned mapping must be a str.
     candidates = stemmer(self.vbd)
     self.assertIsNotNone(candidates.get('VBD'))
     for key in candidates:
         self.assertIsInstance(key, str)
Пример #12
0
 def test_simple_stemmer(self):
     # Each entry of self.test_words pairs a word with the number of items
     # stemmer is expected to return for it.
     for word, expected_count in self.test_words:
         parses = stemmer(word)
         self.assertEqual(expected_count, len(parses))