Пример #1
0
def prepare_and_process_translation(source_text):
    """Translate *source_text*, analyze the result and list its subsegments.

    Reads the free names ``pair`` and ``args`` (``args.directory``,
    ``args.ignoreCase``, ``args.maxTranslationLength``), which are not
    defined in this block.

    Args:
        source_text: Text handed to ``translate_text``.

    Returns:
        A 4-tuple ``(translation, analyzed_translation,
        analyzed_translation_units, analyzed_tu_subsegments)``:

        * ``translation`` - the translated text, lower-cased when
          ``args.ignoreCase`` is set, with every whitespace run collapsed
          to a single space.
        * ``analyzed_translation`` - the raw output of ``analyze_text``.
        * ``analyzed_translation_units`` - ``parse(...)`` output as a list.
        * ``analyzed_tu_subsegments`` - ``(units, start_ind, last_ind)``
          tuples for every contiguous run of 1 to
          ``args.maxTranslationLength`` units; both indices are inclusive.
    """
    translation = translate_text(source_text, pair, directory=args.directory)

    if args.ignoreCase:
        translation = translation.lower()

    # The analysis deliberately runs on the translation *before* its
    # whitespace is normalized below (same order as the original code).
    analyzed_translation = analyze_text(translation,
                                        pair,
                                        pair[::-1],
                                        directory=args.directory)
    analyzed_translation_units = list(
        parse(analyzed_translation, withText=True))

    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning / SyntaxWarning on recent Pythons.
    translation = re.sub(r"\s+", " ", translation)

    analyzed_tu_subsegments = []
    unit_count = len(analyzed_translation_units)

    # Shorter subsegments first, then left to right within each length.
    for length in range(1, args.maxTranslationLength + 1):
        for start_ind in range(unit_count - length + 1):
            last_ind = start_ind + length - 1
            analyzed_tu_subsegments.append(
                (analyzed_translation_units[start_ind:last_ind + 1],
                 start_ind, last_ind))

    return (translation, analyzed_translation, analyzed_translation_units,
            analyzed_tu_subsegments)
def function5(st_subsegment):
    """Translate one source subsegment, then analyze and parse the result.

    Uses the free names ``pair`` and ``args`` (``args.directory``,
    ``args.ignoreCase``), which are defined outside this block.

    Args:
        st_subsegment: Source-text subsegment passed to ``translate_text``.

    Returns:
        A 3-tuple ``(tt_subsegment, analyzed_tt_subsegment,
        analyzed_tts_units)``: the translated text (lower-cased when
        ``args.ignoreCase`` is set), the output of ``analyze_text`` for it,
        and the ``parse(...)`` output materialized as a list.
    """
    translated = translate_text(st_subsegment, pair, directory=args.directory)
    if args.ignoreCase:
        translated = translated.lower()

    # The analysis is requested with the reversed pair, as in the original.
    analysis = analyze_text(translated,
                            pair,
                            pair[::-1],
                            directory=args.directory)
    units = list(parse(analysis, withText=True))
    return translated, analysis, units
Пример #3
0
def process_source(source_text):
    """Analyze *source_text* and enumerate its contiguous unit subsegments.

    Uses the free names ``pair`` and ``args`` (``args.directory``,
    ``args.maxSourceLength``), which are defined outside this block.

    Args:
        source_text: Text passed to ``analyze_text``.

    Returns:
        A 5-tuple ``(analyzed_source, analyzed_source_units, correspondence,
        correspondences, analyzed_su_subsegments)``:

        * ``analyzed_source`` - raw output of ``analyze_text``.
        * ``analyzed_source_units`` - ``parse(...)`` output as a list.
        * ``correspondence`` - a freshly created ``Correspondence``
          namedtuple class with fields ``s, t, i, j, k, l``.
        * ``correspondences`` - a new empty list for the caller to fill.
        * ``analyzed_su_subsegments`` - ``(units, start, last)`` tuples for
          every run of 1 to ``args.maxSourceLength`` units; both indices
          are inclusive.
    """
    analyzed_source = analyze_text(
        source_text, pair, pair, directory=args.directory)
    analyzed_source_units = list(parse(analyzed_source, withText=True))

    correspondence = namedtuple('Correspondence',
                                ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []

    # Shorter spans first, then left to right within each span length.
    analyzed_su_subsegments = [
        (analyzed_source_units[first:first + span], first, first + span - 1)
        for span in range(1, args.maxSourceLength + 1)
        for first in range(len(analyzed_source_units) - span + 1)
    ]

    return (analyzed_source, analyzed_source_units, correspondence,
            correspondences, analyzed_su_subsegments)
Пример #4
0
def process_subsegments(st_subsegment):
    """Translate and analyze subsegments in bulk via intermediate files.

    ``translate_file`` is called with ``st_info.txt`` and ``tt_info.txt``
    and ``analyze_file`` with ``tt_info.txt`` and ``an_tt_info.txt``; the
    two latter files are then read back line by line.

    Args:
        st_subsegment: Accepted for interface compatibility; this body does
            not read it (the input comes from ``st_info.txt``).

    Returns:
        A 3-tuple ``(tt_subsegments, analyzed_tt_subsegments,
        analyzed_tts_units)``: the lines of ``tt_info.txt``, the lines of
        ``an_tt_info.txt``, and one ``list(parse(...))`` per analyzed line.
    """

    def read_lines(path):
        # Drop leading/trailing newlines only, then split into lines.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip('\n').split('\n')

    translate_file('st_info.txt', 'tt_info.txt')
    analyze_file('tt_info.txt', 'an_tt_info.txt')

    tt_subsegments = read_lines('tt_info.txt')
    analyzed_tt_subsegments = read_lines('an_tt_info.txt')

    analyzed_tts_units = []
    for analyzed_line in analyzed_tt_subsegments:
        analyzed_tts_units.append(list(parse(analyzed_line, withText=True)))

    return tt_subsegments, analyzed_tt_subsegments, analyzed_tts_units
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase, maxSourceLength, directory, maxTranslationLength, s):
    """Find source/translation subsegment correspondences (file-batched).

    The source text is analyzed and translated as a whole. Every contiguous
    run of 1..``maxSourceLength`` analyzed source units is then written to
    ``source_text_subsegments.txt`` wrapped as ``(( ... ))``, and
    ``translate(...)`` / ``analyze(...)`` are invoked on the files. Their
    outputs (``translated_text_subsegments.txt`` and
    ``analyzed_translations.txt``) are read back, and each analyzed
    subsegment translation is matched against the runs of
    1..``maxTranslationLength`` units of the full translation's analysis.

    Args:
        sourceLanguage: First element of the language pair.
        targetLanguage: Second element of the language pair.
        ignoreCase: When true, source text, translations and subsegments
            are lower-cased before use.
        maxSourceLength: Maximum number of units in a source subsegment.
        directory: Forwarded to the translate/analyze helpers.
        maxTranslationLength: Maximum number of units in a translation
            subsegment considered for matching.
        s: The source text.

    Returns:
        A list of ``Correspondence(s, t, i, j, k, l)`` namedtuples: source
        subsegment text, its translation, inclusive start/end offsets of
        the subsegment in the source (``i``, ``j``) and of the first
        matching run in the translation (``k``, ``l``). Offsets are sums of
        ``len(preceding text) + len(wordform)`` over the analyzed units.

    NOTE(review): the intermediate files use fixed names in the current
    working directory, so concurrent calls would overwrite each other.
    """
    pair = (sourceLanguage, targetLanguage)
    sourceText = s.lower() if ignoreCase else s  # S

    analyzedSourceText = analyzeText(sourceText, pair, pair, directory=directory)
    analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))

    Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []

    translatedText = translateText(sourceText, pair, directory=directory)
    if ignoreCase:
        translatedText = translatedText.lower()

    analyzedTranslation = analyzeText(translatedText, pair, pair[::-1], directory=directory)
    analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))

    # Prefix sums of unit lengths: offsets[n] is the text length covered by
    # the first n units. Computed once instead of once per subsegment.
    sourceOffsets = [0]
    for unit in analyzedSourceUnits:
        sourceOffsets.append(sourceOffsets[-1] + len(unit[0]) + len(unit[1].wordform))

    translationOffsets = [0]
    for unit in analyzedTranslationUnits:
        translationOffsets.append(translationOffsets[-1] + len(unit[0]) + len(unit[1].wordform))

    # Map each run of translation unit forms to the FIRST (start, last) span
    # producing it, in "shorter first, then left to right" order - the same
    # candidate the original picked with filter(...)[0].
    translationForms = [str(unit[1]) for unit in analyzedTranslationUnits]
    firstSpanByForms = {}
    for length in range(1, maxTranslationLength + 1):
        for startIndex in range(len(analyzedTranslationUnits) - length + 1):
            lastIndex = startIndex + length - 1
            firstSpanByForms.setdefault(
                tuple(translationForms[startIndex:lastIndex + 1]),
                (startIndex, lastIndex))

    # (subsegment text, i, j) for every source subsegment, in file order.
    sourceSubsegments = []
    for length in range(1, maxSourceLength + 1):
        for startIndexInUnits in range(len(analyzedSourceUnits) - length + 1):
            lastIndexInUnits = startIndexInUnits + length - 1
            subsegmentUnits = analyzedSourceUnits[startIndexInUnits:lastIndexInUnits + 1]

            # The text preceding the first unit is not part of the subsegment.
            sourceTextSubsegment = ''.join(
                (precedingText if position != 0 else '') + lexicalUnit.wordform
                for position, (precedingText, lexicalUnit) in enumerate(subsegmentUnits))

            startIndexInSourceText = sourceOffsets[startIndexInUnits] + len(subsegmentUnits[0][0])  # i
            lastIndexInSourceText = sourceOffsets[lastIndexInUnits + 1] - 1  # j

            if ignoreCase:
                sourceTextSubsegment = sourceTextSubsegment.lower()

            sourceSubsegments.append(
                (sourceTextSubsegment, startIndexInSourceText, lastIndexInSourceText))

    with open('source_text_subsegments.txt', 'w', encoding='utf-8') as file:
        for subsegmentText, _, _ in sourceSubsegments:
            file.write('(( %s ))\n\n' % subsegmentText)

    translate('source_text_subsegments.txt', 'translated_text_subsegments.txt', directory, pair)
    analyze('translated_text_subsegments.txt', 'analyzed_translations.txt', directory, pair)

    with open('translated_text_subsegments.txt', 'r', encoding='utf-8') as file:
        translatedTextSubsegments = file.read().strip('\n').split('\n\n')

    translatedTextSubsegments = [
        _stripSubsegmentMarkers(text).lower() if ignoreCase else _stripSubsegmentMarkers(text)
        for text in translatedTextSubsegments
    ]

    with open('analyzed_translations.txt', 'r', encoding='utf-8') as file:
        analyzedTranslatedTextSubsegments = file.read().strip('\n').split('\n\n')

    # Analyses of the "((" / "))" wrappers themselves, removed before parsing.
    openingMarkerAnalysis = re.compile(r'\^\(\/\(\<lpar\>\$\^\(\/\(\<lpar\>\$ ')
    closingMarkerAnalysis = re.compile(r' \^\)\/\)\<rpar\>\$\^\)\/\)\<rpar\>\$')

    for index, analyzedTranslatedTextSubsegment in enumerate(analyzedTranslatedTextSubsegments):
        # Plain indexing on purpose: a length mismatch between the files and
        # the source subsegments raises IndexError rather than being hidden.
        translatedTextSubsegment = translatedTextSubsegments[index]
        sourceTextSubsegment, startIndexInSourceText, lastIndexInSourceText = sourceSubsegments[index]

        analyzedTranslatedTextSubsegment = openingMarkerAnalysis.sub('', analyzedTranslatedTextSubsegment)
        analyzedTranslatedTextSubsegment = closingMarkerAnalysis.sub('', analyzedTranslatedTextSubsegment)

        analyzedTranslatedTextSubsegmentUnits = list(parse(analyzedTranslatedTextSubsegment, withText=True))

        matchedSpan = firstSpanByForms.get(
            tuple(str(unit[1]) for unit in analyzedTranslatedTextSubsegmentUnits))
        if matchedSpan is None:
            continue

        matchStart, matchLast = matchedSpan
        startIndexInTranslatedText = translationOffsets[matchStart] + len(analyzedTranslationUnits[matchStart][0])  # k
        lastIndexInTranslatedText = translationOffsets[matchLast + 1] - 1  # l

        correspondences.append(Correspondence(
            s=sourceTextSubsegment,
            t=translatedTextSubsegment,
            i=startIndexInSourceText,
            j=lastIndexInSourceText,
            k=startIndexInTranslatedText,
            l=lastIndexInTranslatedText
        ))

    return correspondences


def _stripSubsegmentMarkers(text):
    """Remove one leading ``((`` and one trailing ``))`` wrapper from *text*.

    ``str.strip('(( ')`` treats its argument as a set of characters, so it
    would also remove parentheses that belong to the translation itself
    (e.g. ``(( (a) b ))`` would become ``a) b``). Only a single wrapper
    pair and the spaces around it are removed here.
    """
    text = text.strip(' ')
    if text.startswith('(('):
        text = text[2:]
    if text.endswith('))'):
        text = text[:-2]
    return text.strip(' ')
Пример #6
0
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase,
                       maxSourceLength, directory, maxTranslationLength, s):
    """Find source/translation subsegment correspondences.

    The source text is analyzed and translated as a whole. Every contiguous
    run of 1..``maxSourceLength`` analyzed source units is then translated
    and analyzed on its own, and the analysis is matched against the runs
    of 1..``maxTranslationLength`` units of the full translation's analysis.

    Args:
        sourceLanguage: First element of the language pair.
        targetLanguage: Second element of the language pair.
        ignoreCase: When true, source text, translations and subsegments
            are lower-cased before use.
        maxSourceLength: Maximum number of units in a source subsegment.
        directory: Forwarded to ``analyzeText`` / ``translateText``.
        maxTranslationLength: Maximum number of units in a translation
            subsegment considered for matching.
        s: The source text.

    Returns:
        A list of ``Correspondence(s, t, i, j, k, l)`` namedtuples: source
        subsegment text, its translation, inclusive start/end offsets of
        the subsegment in the source (``i``, ``j``) and of the first
        matching run in the translation (``k``, ``l``). Offsets are sums of
        ``len(preceding text) + len(wordform)`` over the analyzed units.
    """
    pair = (sourceLanguage, targetLanguage)

    sourceText = s.lower() if ignoreCase else s  # S
    analyzedSourceText = analyzeText(sourceText,
                                     pair,
                                     pair,
                                     directory=directory)
    analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))

    Correspondence = collections.namedtuple('Correspondence',
                                            ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []

    translatedText = translateText(sourceText, pair, directory=directory)
    if ignoreCase:
        translatedText = translatedText.lower()

    analyzedTranslation = analyzeText(translatedText,
                                      pair,
                                      pair[::-1],
                                      directory=directory)
    analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))

    # Prefix sums of unit lengths: offsets[n] is the text length covered by
    # the first n units. Computed once instead of once per subsegment.
    sourceOffsets = [0]
    for unit in analyzedSourceUnits:
        sourceOffsets.append(sourceOffsets[-1] + len(unit[0]) +
                             len(unit[1].wordform))

    translationOffsets = [0]
    for unit in analyzedTranslationUnits:
        translationOffsets.append(translationOffsets[-1] + len(unit[0]) +
                                  len(unit[1].wordform))

    # Map each run of translation unit forms to the FIRST (start, last) span
    # producing it, in "shorter first, then left to right" order - the same
    # candidate the original picked with filter(...)[0].
    translationForms = [str(unit[1]) for unit in analyzedTranslationUnits]
    firstSpanByForms = {}
    for length in range(1, maxTranslationLength + 1):
        for startIndex in range(len(analyzedTranslationUnits) - length + 1):
            lastIndex = startIndex + length - 1
            firstSpanByForms.setdefault(
                tuple(translationForms[startIndex:lastIndex + 1]),
                (startIndex, lastIndex))

    for length in range(1, maxSourceLength + 1):
        for startIndexInUnits in range(len(analyzedSourceUnits) - length + 1):
            lastIndexInUnits = startIndexInUnits + length - 1
            subsegmentUnits = analyzedSourceUnits[
                startIndexInUnits:lastIndexInUnits + 1]

            # The text preceding the first unit is not part of the subsegment.
            sourceTextSubsegment = ''.join(
                (precedingText if position != 0 else '') +
                lexicalUnit.wordform
                for position, (precedingText,
                               lexicalUnit) in enumerate(subsegmentUnits))

            startIndexInSourceText = (sourceOffsets[startIndexInUnits] +
                                      len(subsegmentUnits[0][0]))  # i
            lastIndexInSourceText = sourceOffsets[lastIndexInUnits + 1] - 1  # j

            if ignoreCase:
                sourceTextSubsegment = sourceTextSubsegment.lower()

            translatedTextSubsegment = translateText(sourceTextSubsegment,
                                                     pair,
                                                     directory=directory)  # t
            if ignoreCase:
                translatedTextSubsegment = translatedTextSubsegment.lower()

            analyzedTranslatedTextSubsegment = analyzeText(
                translatedTextSubsegment,
                pair,
                pair[::-1],
                directory=directory)
            analyzedTranslatedTextSubsegmentUnits = list(
                parse(analyzedTranslatedTextSubsegment, withText=True))

            matchedSpan = firstSpanByForms.get(
                tuple(
                    str(unit[1])
                    for unit in analyzedTranslatedTextSubsegmentUnits))
            if matchedSpan is None:
                continue

            matchStart, matchLast = matchedSpan
            startIndexInTranslatedText = (
                translationOffsets[matchStart] +
                len(analyzedTranslationUnits[matchStart][0]))  # k
            lastIndexInTranslatedText = translationOffsets[matchLast + 1] - 1  # l

            correspondences.append(
                Correspondence(s=sourceTextSubsegment,
                               t=translatedTextSubsegment,
                               i=startIndexInSourceText,
                               j=lastIndexInSourceText,
                               k=startIndexInTranslatedText,
                               l=lastIndexInTranslatedText))

    return correspondences
Пример #7
0
    #    sourceText = args.text.lower() if args.ignoreCase else args.text #S
    if args.source_texts == None:
        f = sys.stdin
    else:
        f = open(args.source_texts, 'r')
    sen_num = 0

    for sourceText in f:
        sourceText = sourceText.strip()
        sen_num = sen_num + 1
        analyzedSourceText = analyzeText(sourceText,
                                         pair,
                                         pair,
                                         directory=args.directory)
        analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))

        Correspondence = collections.namedtuple('Correspondence',
                                                ['s', 't', 'i', 'j', 'k', 'l'])
        correspondences = []

        analyzedSourceUnitsSubsegments = []

        for length in range(1, args.maxSourceLength + 1):
            for startIndex in range(0, len(analyzedSourceUnits) - length + 1):
                lastIndex = startIndex + length - 1
                analyzedSourceUnitsSubsegments.append(
                    (analyzedSourceUnits[startIndex:lastIndex + 1], startIndex,
                     lastIndex))  #s, i, j (analyzed units forms of them)

        translatedText = translateText(sourceText,
Пример #8
0
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase, maxSourceLength, directory, maxTranslationLength, s):
    """Find source/translation subsegment correspondences, with a disk cache.

    The source text is analyzed and translated as a whole. Every contiguous
    run of 1..``maxSourceLength`` analyzed source units is then translated
    and analyzed on its own, and the analysis is matched against the runs
    of 1..``maxTranslationLength`` units of the full translation's analysis.
    Subsegment translations and analyses are looked up in, and stored to,
    the TinyDB file ``cache_db2.json`` under the entry types
    ``stsb_translation`` and ``trsb_analysis``.

    Args:
        sourceLanguage: First element of the language pair.
        targetLanguage: Second element of the language pair.
        ignoreCase: When true, source text, translations and subsegments
            are lower-cased before use.
        maxSourceLength: Maximum number of units in a source subsegment.
        directory: Forwarded to ``analyzeText`` / ``translateText``.
        maxTranslationLength: Maximum number of units in a translation
            subsegment considered for matching.
        s: The source text.

    Returns:
        A list of ``Correspondence(s, t, i, j, k, l)`` namedtuples: source
        subsegment text, its translation, inclusive start/end offsets of
        the subsegment in the source (``i``, ``j``) and of the first
        matching run in the translation (``k``, ``l``). Offsets are sums of
        ``len(preceding text) + len(wordform)`` over the analyzed units.

    NOTE(review): cache keys do not include the language pair or
    ``directory``; confirm the cache file is never shared across pairs.
    """
    pair = (sourceLanguage, targetLanguage)
    sourceText = s.lower() if ignoreCase else s  # S

    cache_db = TinyDB('cache_db2.json')
    Data = Query()

    def cached(entryType, key, compute):
        # One search per lookup (the original ran the same query twice on a
        # hit); on a miss the computed value is stored before returning.
        hits = cache_db.search((Data.type == entryType) & (Data.key == key))
        if hits:
            return hits[0]['value']
        value = compute(key)
        cache_db.insert({'type': entryType, 'key': key, 'value': value})
        return value

    def translateSubsegment(text):
        return translateText(text, pair, directory=directory)

    def analyzeTranslatedSubsegment(text):
        return analyzeText(text, pair, pair[::-1], directory=directory)

    try:
        analyzedSourceText = analyzeText(sourceText, pair, pair, directory=directory)
        analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))

        Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
        correspondences = []

        translatedText = translateText(sourceText, pair, directory=directory)
        if ignoreCase:
            translatedText = translatedText.lower()

        analyzedTranslation = analyzeText(translatedText, pair, pair[::-1], directory=directory)
        analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))

        # Prefix sums of unit lengths: offsets[n] is the text length covered
        # by the first n units. Computed once instead of once per subsegment.
        sourceOffsets = [0]
        for unit in analyzedSourceUnits:
            sourceOffsets.append(sourceOffsets[-1] + len(unit[0]) + len(unit[1].wordform))

        translationOffsets = [0]
        for unit in analyzedTranslationUnits:
            translationOffsets.append(translationOffsets[-1] + len(unit[0]) + len(unit[1].wordform))

        # Map each run of translation unit forms to the FIRST (start, last)
        # span producing it, in "shorter first, then left to right" order -
        # the same candidate the original picked with filter(...)[0].
        translationForms = [str(unit[1]) for unit in analyzedTranslationUnits]
        firstSpanByForms = {}
        for length in range(1, maxTranslationLength + 1):
            for startIndex in range(len(analyzedTranslationUnits) - length + 1):
                lastIndex = startIndex + length - 1
                firstSpanByForms.setdefault(
                    tuple(translationForms[startIndex:lastIndex + 1]),
                    (startIndex, lastIndex))

        for length in range(1, maxSourceLength + 1):
            for startIndexInUnits in range(len(analyzedSourceUnits) - length + 1):
                lastIndexInUnits = startIndexInUnits + length - 1
                subsegmentUnits = analyzedSourceUnits[startIndexInUnits:lastIndexInUnits + 1]

                # The text preceding the first unit is not part of the subsegment.
                sourceTextSubsegment = ''.join(
                    (precedingText if position != 0 else '') + lexicalUnit.wordform
                    for position, (precedingText, lexicalUnit) in enumerate(subsegmentUnits))

                startIndexInSourceText = sourceOffsets[startIndexInUnits] + len(subsegmentUnits[0][0])  # i
                lastIndexInSourceText = sourceOffsets[lastIndexInUnits + 1] - 1  # j

                if ignoreCase:
                    sourceTextSubsegment = sourceTextSubsegment.lower()

                # The translation is cached as returned; lower-casing is
                # applied after the lookup, as in the original.
                translatedTextSubsegment = cached(
                    'stsb_translation', sourceTextSubsegment, translateSubsegment)  # t
                if ignoreCase:
                    translatedTextSubsegment = translatedTextSubsegment.lower()

                analyzedTranslatedTextSubsegment = cached(
                    'trsb_analysis', translatedTextSubsegment, analyzeTranslatedSubsegment)
                analyzedTranslatedTextSubsegmentUnits = list(
                    parse(analyzedTranslatedTextSubsegment, withText=True))

                matchedSpan = firstSpanByForms.get(
                    tuple(str(unit[1]) for unit in analyzedTranslatedTextSubsegmentUnits))
                if matchedSpan is None:
                    continue

                matchStart, matchLast = matchedSpan
                startIndexInTranslatedText = (
                    translationOffsets[matchStart] + len(analyzedTranslationUnits[matchStart][0]))  # k
                lastIndexInTranslatedText = translationOffsets[matchLast + 1] - 1  # l

                correspondences.append(Correspondence(
                    s=sourceTextSubsegment,
                    t=translatedTextSubsegment,
                    i=startIndexInSourceText,
                    j=lastIndexInSourceText,
                    k=startIndexInTranslatedText,
                    l=lastIndexInTranslatedText
                ))

        return correspondences
    finally:
        # Release the cache file handle even when translation/analysis fails.
        cache_db.close()