Example #1
 def test_empty_readings(self):
     with warnings.catch_warnings(record=True) as caught_warnings:
         warnings.simplefilter('always')
         next(parse('^foo/$'))
         self.assertEqual(len(caught_warnings), 1)
         self.assertTrue(issubclass(caught_warnings[0].category, RuntimeWarning))
         self.assertIn('Empty readings', str(caught_warnings[0].message))
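For reference, the Apertium stream format these tests consume writes a lexical unit as ^wordform/reading1/reading2$, so ^foo/$ is a unit with no readings at all, which is what triggers the warning checked above. A minimal standalone sketch (the input strings here are made up):

import warnings
from streamparser import parse

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # '^cats/cat<n><pl>$' parses normally; '^foo/$' has an empty reading list.
    units = list(parse('^cats/cat<n><pl>$ ^foo/$'))

print(units[1].wordform)                 # foo
print([str(w.message) for w in caught])  # contains the 'Empty readings' warning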
Example #2
 def test_empty_readings(self):
     with warnings.catch_warnings(record=True) as caught_warnings:
         warnings.simplefilter('always')
         next(parse('^foo/$'))
         self.assertEqual(len(caught_warnings), 1)
         self.assertTrue(issubclass(caught_warnings[0].category, RuntimeWarning))
         self.assertIn('Empty readings', str(caught_warnings[0].message))
Example #3
 def test_reading_to_string(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(
         reading_to_string(lexical_units[0].readings[0]),
         'decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>'
     )
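The fixture self.s4 is not shown in any of these snippets; judging from the expected output here and in Example #16, a plausible value is a Spanish clitic cluster with '+'-joined subreadings. A hypothetical reconstruction (the wordform 'dícemelo' is a guess):

from streamparser import parse, reading_to_string

s4 = ('^dícemelo/decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>'
      '/decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><m><sg>$')
unit = next(parse(s4))
print(reading_to_string(unit.readings[0]))
# decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>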
Example #4
    def get(self):
        in_text = self.get_argument('q') + '*'
        in_mode = to_alpha3_code(self.get_argument('lang'))
        if '-' in in_mode:
            l1, l2 = map(to_alpha3_code, in_mode.split('-', 1))
            in_mode = '%s-%s' % (l1, l2)
        in_mode = self.find_fallback_mode(in_mode, self.spellers)
        logging.info(in_text)
        logging.info(self.get_argument('lang'))
        logging.info(in_mode)
        logging.info(self.spellers)
        if in_mode in self.spellers:
            logging.info(self.spellers[in_mode])
            [path, mode] = self.spellers[in_mode]
            logging.info(path)
            logging.info(mode)
            formatting = 'none'
            commands = [[
                'apertium', '-d', path, '-f', formatting,
                self.get_argument('lang') + '-tokenise'
            ]]
            result = yield translate_simple(in_text, commands)

            tokens = streamparser.parse(result)
            units = []
            for token in tokens:
                if token.knownness == streamparser.known:
                    units.append({
                        'token': token.wordform,
                        'known': True,
                        'sugg': []
                    })
                else:
                    suggestion = []
                    commands = [[
                        'apertium', '-d', path, '-f', formatting, mode
                    ]]

                    result = yield translate_simple(token.wordform, commands)
                    found_sugg = False
                    for line in result.splitlines():
                        if 'Corrections for' in line:
                            found_sugg = True
                            continue
                        if found_sugg and '\t' in line:
                            s, w = line.split('\t')
                            suggestion.append((s, w))

                    units.append({
                        'token': token.wordform,
                        'known': False,
                        'sugg': suggestion
                    })

            self.send_response(units)
        else:
            error_explanation = '{} on spellchecker mode: {}'.format(
                'Error 404',
                'Spelling mode for ' + in_mode + ' is not installed')
            self.send_error(404, explanation=error_explanation)
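Stripped of the Tornado plumbing, the knownness test at the core of this handler can be sketched directly against streamparser (the analyser output below is invented; in Apertium streams an unknown word comes back as ^word/*word$):

import streamparser

result = '^the/the<det><def><sp>$ ^teh/*teh$'  # made-up speller output
unknown_tokens = [unit.wordform
                  for unit in streamparser.parse(result)
                  if unit.knownness != streamparser.known]
print(unknown_tokens)  # ['teh']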
Example #5
def collect(sentence, sentence_tagged):
    """
    Collect a dictionary that maps each word's position in the sentence to
    its token and tag set.
    """

    sentence_words = {}
    tokens = tokenizer(sentence.lower())

    sentence_tagged = sentence_tagged.split(' ')

    # Rewrite damaged analyses (more than one '/*' in a unit) into the
    # standard unknown-word shape ^word/*word$.
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'/\*', sentence_tagged[i])) > 1:
            word = re.search(r'/\*(.*?)/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word

    sentence_tagged = ' '.join(sentence_tagged)

    units = parse(sentence_tagged)
    counter = 0

    for unit in units:
        sentence_words[counter] = [
            tokens[counter], set(unit.readings[0][0].tags)
        ]
        counter += 1

    return sentence_words
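The regex block above rewrites damaged analyses into the standard unknown-word shape ^word/*word$; that convention on its own looks like this (a small sketch):

from streamparser import parse, unknown

unit = next(parse('^foo/*foo$'))
print(unit.wordform)              # foo
print(unit.knownness == unknown)  # True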
Example #6
def collect(sentence, sentence_tagged):
    sentence_words = {}
    tokens = tokenizer(sentence.lower())

    sentence_tagged = sentence_tagged.split(' ')

    # Rewrite damaged analyses into the standard unknown-word shape
    # ^word/*word$ (see Example #5).
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'/\*', sentence_tagged[i])) > 1:
            word = re.search(r'/\*(.*?)/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word

    sentence_tagged = ' '.join(sentence_tagged)

    units = parse(sentence_tagged)
    counter = 0

    try:
        for unit in units:
            sentence_words[counter] = [
                tokens[counter], set(unit.readings[0][0].tags)
            ]
            counter += 1
    except IndexError:
        # A bare `except` here would hide real errors; the failure this
        # guards against is tokens and units getting out of step.
        pass

    return sentence_words
Example #7
def parse_sf(apertium_string):
    surface_forms = []
    parsed_string = parse(apertium_string)

    for word in parsed_string:
        surface_forms.append(word.wordform)

    return surface_forms
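A quick usage example for parse_sf (the input is invented but follows the ^surface/analysis$ format used throughout these examples):

print(parse_sf('^My/my<det><pos>$ ^name/name<n><sg>$'))
# ['My', 'name']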
Example #8
 def test_parse(self):
     lexical_units = list(parse(self.s1))
     self.assertEqual(len(lexical_units), 1)
     lexical_unit = lexical_units[0]
     self.assertEqual(str(lexical_unit), r'x\/y\^\$\<z\>å/A\$\^B<tag><tag2>/A\/S<tag><#1-\>2>')
     readings = lexical_unit.readings
     self.assertListEqual(readings, [[SReading(baseform='A\\$\\^B', tags=['tag', 'tag2'])], [SReading(baseform='A\\/S', tags=['tag', '#1-\\>2'])]])
     self.assertEqual(lexical_unit.wordform, r'x\/y\^\$\<z\>å')
     self.assertEqual(lexical_unit.knownness, known)
Example #9
 def test_blanks_with_wordbound_blanks(self):
     lexical_units_with_blanks = list(parse(self.s5, with_text=True))
     self.assertEqual(len(lexical_units_with_blanks), 3)
     blank, _lexical_unit = lexical_units_with_blanks[0]
     self.assertEqual(blank, r'[] ')
     blank, _lexical_unit = lexical_units_with_blanks[1]
     self.assertEqual(blank, r' [bl] ')
     blank, _lexical_unit = lexical_units_with_blanks[2]
     self.assertEqual(blank, r' [\[] [\]blank] ')
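With with_text=True, parse yields (blank, lexical_unit) pairs rather than bare units, which is what this test unpacks; a minimal sketch on invented input:

from streamparser import parse

for blank, unit in parse('^My/my<det><pos>$ ^name/name<n><sg>$', with_text=True):
    print(repr(blank), unit.wordform)
# ''  My   (the text preceding each unit)
# ' ' name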
Example #10
def stream_parser_split_X_y(file_lines):
    lexical_units = parse('\n'.join(file_lines))
    X = []
    y = []
    for lexical_unit in lexical_units:
        y.append(reading_to_string(lexical_unit.readings[0]))
        X.append(lexical_unit.wordform)
    assert len(y) == len(X), 'Token and Target vectors size mismatch ({}!={})'.format(len(y), len(X))
    return X, y
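A usage sketch for stream_parser_split_X_y, assuming one analysed unit per line (the sample lines are invented):

lines = ['^cats/cat<n><pl>$', '^sat/sit<vblex><past>$']
X, y = stream_parser_split_X_y(lines)
print(X)  # ['cats', 'sat']
print(y)  # ['cat<n><pl>', 'sit<vblex><past>']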
Example #11
 def test_parse(self):
     lexical_units = list(parse(self.s1))
     self.assertEqual(len(lexical_units), 1)
     lexical_unit = lexical_units[0]
     self.assertEqual(str(lexical_unit), r'x\/y\^\$\<z\>å/A\$\^B<tag><tag2>/A\/S<tag><#1-\>2>')
     readings = lexical_unit.readings
     self.assertListEqual(readings, [[SReading(baseform='A\\$\\^B', tags=['tag', 'tag2'])], [SReading(baseform='A\\/S', tags=['tag', '#1-\\>2'])]])
     self.assertEqual(lexical_unit.wordform, r'x\/y\^\$\<z\>å')
     self.assertEqual(lexical_unit.knownness, known)
Example #12
def stream_parser_extract_analyses(line):
    units = list(parse(line))
    # TODO: Handle cases such as "Empty readings for ///<sent>" in a better way
    if not units:
        return ['']
    unit = units[0]

    # Is the "///<sent>" case really handled?
    analyses = [reading_to_string(reading) for reading in unit.readings]
    return analyses if analyses else ['']
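Usage sketch (invented input); an ambiguous unit yields one string per reading, and anything that fails to parse falls back to ['']:

print(stream_parser_extract_analyses('^name/name<n><sg>/name<vblex><inf>$'))
# ['name<n><sg>', 'name<vblex><inf>']
print(stream_parser_extract_analyses(''))
# ['']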
Example #13
    # (Shown without its decorator; presumably a @staticmethod in the
    # original class, since it takes no self — compare Example #14.)
    def _postproc_text(result: str) -> List[LexicalUnit]:
        """
        Postprocesses the input

        Args:
            result (str)

        Returns:
            List[LexicalUnit]
        """
        lexical_units = list(parse(result))  # type: List[LexicalUnit]
        return lexical_units
Example #14
    def _postproc_text(self, result):  # type: (Analyzer, str) -> List[LexicalUnit]
        """
        Postprocesses the input

        Args:
            result (str)

        Returns:
            List[LexicalUnit]
        """
        lexical_units = list(parse(result))
        return lexical_units
Example #15
 def test_wordbound_blanks(self):
     lexical_units = list(parse(self.s5))
     self.assertEqual(len(lexical_units), 3)
     self.assertListEqual(
         lexical_units[2].readings,
         [
             [SReading(baseform='name', tags=['n', 'sg'])],
             [SReading(baseform='name', tags=['vblex', 'inf'])],
             [SReading(baseform='name', tags=['vblex', 'pres'])],
         ],
     )
     self.assertEqual(lexical_units[0].wordform, 'My')
     self.assertEqual(lexical_units[0].wordbound_blank, '[[t:b:123456]]')
     self.assertEqual(lexical_units[1].wordform, 'test')
     self.assertEqual(lexical_units[1].wordbound_blank, '')
     self.assertEqual(lexical_units[2].wordform, 'name')
     self.assertEqual(lexical_units[2].wordbound_blank, '[[t:i:12asda; t:p:1abc76]]')
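The fixture self.s5 is not shown; from these assertions, a wordbound blank is a [[...]] block attached immediately before the ^...$ unit it belongs to. A minimal sketch of that shape (invented input):

from streamparser import parse

unit = next(parse('[[t:b:123456]]^My/my<det><pos>$'))
print(unit.wordform)         # My
print(unit.wordbound_blank)  # [[t:b:123456]]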
Example #16
 def test_parse_subreadings(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertListEqual(
         lexical_units[0].readings,
         [
             [
                 SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                 SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                 SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'nt']),
             ],
             [
                 SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                 SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                 SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'm', 'sg']),
             ],
         ],
     )
Example #17
 def test_parse_subreadings(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertListEqual(
         lexical_units[0].readings,
         [
             [
                 SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                 SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                 SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'nt']),
             ],
             [
                 SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                 SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                 SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'm', 'sg']),
             ],
         ],
     )
Example #18
 def test_subreading_to_string(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(subreading_to_string(lexical_units[0].readings[0][0]), 'decir<vblex><imp><p2><sg>')
Example #19
 def test_reading_to_string(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(reading_to_string(lexical_units[0].readings[0]), 'decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>')
Example #20
 def test_mainpos_ltr(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     pos = mainpos(lexical_units[0].readings[0], ltr=True)
     self.assertEqual(pos, 'vblex')
Example #21
 def test_mainpos(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     pos = mainpos(lexical_units[0].readings[0])
     self.assertEqual(pos, 'prn')
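Together with Example #20, this pins down the ltr flag: mainpos takes the part of speech from the last subreading by default and from the first when ltr=True. A compact sketch (same hypothetical fixture shape as in Example #3):

from streamparser import parse, mainpos

reading = next(parse('^dímelo/decir<vblex><imp><p2><sg>+lo<prn><enc><p3><nt>$')).readings[0]
print(mainpos(reading, ltr=True))  # vblex (first subreading)
print(mainpos(reading))            # prn   (last subreading)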
Example #22
 def test_parse_with_text(self):
     lexical_units_with_blanks = list(parse(self.s1, with_text=True))
     self.assertEqual(len(lexical_units_with_blanks), 1)
     blank, _lexical_unit = lexical_units_with_blanks[0]
     self.assertEqual(blank, r'[\^keep<escapes>\$] \^ \$ \/ \[ \] ')
Example #23
 def test_parse_unknown(self):
     lexical_units = list(parse(self.s2))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(lexical_units[0].knownness, unknown)
Example #24
 def test_parse_with_text(self):
     lexical_units_with_blanks = list(parse(self.s1, with_text=True))
     self.assertEqual(len(lexical_units_with_blanks), 1)
     blank, _lexical_unit = lexical_units_with_blanks[0]
     self.assertEqual(blank, r'[\^keep<escapes>\$] \^ \$ \/ \[ \] ')
Example #25
def clean_trash(orig_source, orig_mt, orig_target, freqs, s_lang, t_lang, input_file):
    with open('tagged_source_entries.txt', 'r', encoding='utf-8') as file:
        source = file.read().strip('\n').split('\n')

    with open('tagged_mt_entries.txt', 'r', encoding='utf-8') as file:
        mt = file.read().strip('\n').split('\n')

    with open('tagged_target_entries.txt', 'r', encoding='utf-8') as file:
        target = file.read().strip('\n').split('\n')

    for i in range(len(source)):
        source[i] = bla(source[i])

    for i in range(len(mt)):
        mt[i] = bla(mt[i])

    for i in range(len(target)):
        target[i] = bla(target[i])

    counter = 0

    # Readings tagged as punctuation are not compared.
    punctuation_tags = {'lquot', 'rquot', 'sent', 'cm', 'guio'}

    with open(s_lang + '-' + t_lang + '-cleaned_' + input_file, 'w', encoding='utf-8') as file:
        for s, m, t in zip(source, mt, target):
            s_units = parse(s)
            m_units = parse(m)
            t_units = parse(t)

            for su, mu, tu in zip(s_units, m_units, t_units):
                for sr, mr, tr in zip(su.readings, mu.readings, tu.readings):
                    s_lemma, s_tags = sr[0].baseform, sr[0].tags
                    m_lemma, m_tags = mr[0].baseform, mr[0].tags
                    t_lemma, t_tags = tr[0].baseform, tr[0].tags

                    if punctuation_tags.intersection(t_tags) or punctuation_tags.intersection(s_tags):
                        continue

                    # Edit-distance similarity in percent; assumes non-empty
                    # lemmas (see the helper sketch below).
                    st_dis = distance(s_lemma, t_lemma)
                    st_percent = (len(t_lemma) - st_dis) / len(t_lemma) * 100

                    sm_dis = distance(s_lemma, m_lemma)
                    sm_percent = (len(m_lemma) - sm_dis) / len(m_lemma) * 100

                    mt_dis = distance(m_lemma, t_lemma)
                    mt_percent = (len(t_lemma) - mt_dis) / len(t_lemma) * 100

                    # Entries from an 'other' file are always kept; otherwise
                    # all three lemma pairs must be at least 30% similar.
                    if 'other' in input_file or (st_percent >= 30 and sm_percent >= 30 and mt_percent >= 30):
                        file.write('%s\t%s\t%s\t%s\n' % (
                            orig_source[counter], orig_mt[counter],
                            orig_target[counter], freqs[counter]))

            counter += 1
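The score used three times above is (len - distance) / len * 100, presumably with distance being Levenshtein edit distance (e.g. from the python-Levenshtein package); factoring it into a helper also makes the division-by-zero guard explicit, which clean_trash itself skips:

from Levenshtein import distance  # assumed source of `distance` above

def similarity_percent(a, b):
    """Percentage of b's characters preserved under edit distance to a."""
    if not b:  # clean_trash divides by len(b) unchecked
        return 0.0
    return (len(b) - distance(a, b)) / len(b) * 100

print(similarity_percent('casa', 'cosa'))  # 75.0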
Example #26
# (The branch that reads input_idiom from a source other than stdin is cut
# off in this snippet; only the fallback survives.)
else:
    input_idiom = sys.stdin.read()

#n = 1 #Number of words it will replace

from streamparser import parse, mainpos, reading_to_string

replacement_candidates = []
candidate_tags = ['n', 'adj', 'adv', 'vblex', 'v']

#Parse Input Idiom to get a candidate for replacement
lu_count = 0
input_idiom_surface = []

for lu in parse(input_idiom):
    analyses = lu.readings
    firstreading = analyses[0]
    surfaceform = lu.wordform

    input_idiom_surface.append(surfaceform)

    #print(firstreading[0].tags)
    #print("^{}/{}$".format(surfaceform,
    #                       reading_to_string(firstreading)))

    for tag in candidate_tags:
        if tag in firstreading[0].tags:
            replacement_candidates.append(
                [firstreading[0], surfaceform, lu_count])
Example #27
 def test_parse_unknown(self):
     lexical_units = list(parse(self.s2))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(lexical_units[0].knownness, unknown)
Example #28
 def test_subreading_to_string(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     self.assertEqual(subreading_to_string(lexical_units[0].readings[0][0]), 'decir<vblex><imp><p2><sg>')
Example #29
 def test_mainpos_ltr(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     pos = mainpos(lexical_units[0].readings[0], ltr=True)
     self.assertEqual(pos, 'vblex')
Example #30
def malaphor(input_idiom, lang):
    with open(f'{lang}-idioms-analysed.txt', 'r') as f:
        idioms_list = f.read().split('\n')
    replacement_candidates = []
    candidate_tags = ['n', 'adj', 'adv', 'vblex', 'v']

    #Parse Input Idiom to get a candidate for replacement
    lu_count = 0
    input_idiom_surface = []

    for lu in parse(input_idiom):
        if debug:
            print(lu)
        analyses = lu.readings
        if debug:
            print(analyses)
        firstreading = analyses[0]
        surfaceform = lu.wordform

        input_idiom_surface.append(surfaceform)

        if debug:
            print(firstreading[0].tags)
            print("^{}/{}$".format(surfaceform,
                                   reading_to_string(firstreading)))

        for tag in candidate_tags:
            if tag in firstreading[0].tags:
                replacement_candidates.append(
                    [firstreading[0], surfaceform, lu_count])

        lu_count += 1

    if debug:
        print(replacement_candidates)

    if len(replacement_candidates) < 1:
        raise NoCandidatesForReplacement()

    elif len(replacement_candidates) == 1:
        final_replacement_candidate = replacement_candidates[0]

    else:
        final_replacement_candidate = random.choice(replacement_candidates)

    if debug:
        print("Final Replacement Candidate:")
        print(final_replacement_candidate)

    #Parse Idioms list (analysed through the morph) to find a suitable replacement

    possible_replacements = []

    for idiom in idioms_list:
        for lu in parse(idiom):
            if lu.readings[0][0].tags == final_replacement_candidate[0].tags:
                possible_replacements.append([lu.readings[0][0], lu.wordform])

    replacement_flag = 1

    if len(possible_replacements) < 1:
        raise NoReplacement()

    elif len(possible_replacements) == 1:
        final_replacement_word = possible_replacements[0]

    else:
        final_replacement_word = random.choice(possible_replacements)

    if debug:
        print("Replacement Word:")
        print(final_replacement_word)

    # Make the replacement in the original idiom;
    # final_replacement_candidate[2] is the word's position in the input idiom.
    pos = final_replacement_candidate[2]

    if debug:
        print(input_idiom_surface)

    input_idiom_surface[pos] = final_replacement_word[1]

    if debug:
        print(input_idiom_surface)

    out = ''

    for word in input_idiom_surface:
        if out != '' and word[0] in string.ascii_letters + string.digits:
            out += ' ' + word
        else:
            out += word
    return out.capitalize()
Example #31
######################################################################################################################

print("\nProcessing main, words_case, words, lemmas, tags, tags_uniq section:")

######################################################################################################################
#   1st cycle
######################################################################################################################

#
# Collecting words, lemmas, tags...
#
#for blank, lexicalUnit in parse_file(fr, with_text=True):
for line in fr:
	line = line.strip(" \r\n")
	#print("LINE:" + str(line))
	for blank, lexicalUnit in parse(line, with_text=True):
		# MySQL (MariaDB) does not differ Russian "ё" and "е" letters
		#line = re.sub( r'ё', r"е", line ) # ё => е
		#line = re.sub( r'Ё', r"Е", line ) # Ё => Е
		#line = re.sub( r'\\', r"", line ) # remove backslashes, otherwise mysql will not import those lines

		#print("BLANK:" + str(blank) + " lexicalUnit:" + str(lexicalUnit))
		#sys.stdout.flush()
		
		blank = blank.strip(' \t')
		if blank != "":
			status = collect_tokens(blank, blank, blank, blanktag, blanktag)
		
		# NOTE: word_case, word, lemma, pos and tags are presumably initialised
		# earlier in the full script; this snippet is truncated.
		status, word_case, word, lemma, pos, tags = parse_apertium(word_case, word, lemma, pos, tags)
		status = collect_tokens(word_case, word, lemma, pos, tags)
Example #32
 def test_mainpos(self):
     lexical_units = list(parse(self.s4))
     self.assertEqual(len(lexical_units), 1)
     pos = mainpos(lexical_units[0].readings[0])
     self.assertEqual(pos, 'prn')