Exemplo n.º 1
0
def searchPath(rootpath, include_pairs=True, verbosity=1):
    """Walk ``rootpath`` and collect the ``.mode`` files found beneath it.

    Each ``.mode`` filename is matched against one regex per mode type
    (pair, analyzer, generator, tagger, spell, tokenise); a filename is
    recorded under every type whose regex matches it.

    Args:
        rootpath: Directory to walk; symlinked directories are followed.
        include_pairs: When false, ``'pair'`` modes are not collected.
        verbosity: Values above 1 pass the result to ``_log_modes``.

    Returns:
        A dict mapping each mode type to a list of 3-tuples. ``'pair'``
        entries are ``(path of the .mode file, source language, target
        language)``. All other entries are ``(parent of the directory that
        holds the .mode file, mode name, language or language pair)``.
        Language codes are converted with ``toAlpha3Code``.
    """
    # Two or three lowercase letters, optionally followed by "_Variant".
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z]+)?'

    def _single_lang_mode(suffix):
        # group(1) is the whole mode name, group(2) the language (or pair).
        return re.compile(
            r'(({0}(-{0})?)-{1})\.mode'.format(lang_code, suffix))

    type_re = {
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': _single_lang_mode(r'(an)?mor(ph)?'),
        # [A-Za-z], not [A-z]: the latter also spans the punctuation that
        # sits between 'Z' and 'a' in ASCII ("[", "\", "]", "^", "_", "`").
        'generator': _single_lang_mode(r'gener[A-Za-z]*'),
        'tagger': _single_lang_mode(r'tagger'),
        'spell': _single_lang_mode(r'spell'),
        'tokenise': _single_lang_mode(r'tokenise'),
    }
    modes = {
        mtype: [] for mtype in type_re
    }  # type: Dict[str, List[Tuple[str, str, str]]]

    real_root = os.path.abspath(os.path.realpath(rootpath))

    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        if is_loop(dirpath, rootpath, real_root):
            # Emptying dirnames in place stops os.walk from descending here.
            dirnames[:] = []
            continue
        for filename in files:
            if not filename.endswith('.mode'):
                continue
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if not m:
                    continue
                if mtype == 'pair':
                    if include_pairs:
                        modes[mtype].append((
                            os.path.join(dirpath, filename),
                            toAlpha3Code(m.group(1)),
                            toAlpha3Code(m.group(2)),
                        ))
                    continue
                modename = m.group(1)  # e.g. en-es-anmorph
                langlist = [toAlpha3Code(l) for l in m.group(2).split('-')]
                lang_pair = '-'.join(langlist)  # e.g. en-es
                # The tuple stores the parent of the directory containing
                # the .mode file, not that directory itself.
                dir_of_modes = os.path.dirname(dirpath)
                modes[mtype].append((dir_of_modes, modename, lang_pair))

    if verbosity > 1:
        _log_modes(modes)

    return modes
Exemplo n.º 2
0
def searchPath(rootpath, include_pairs=True, verbosity=1):
    """Walk ``rootpath`` and collect the ``.mode`` files found beneath it.

    Each ``.mode`` filename is matched against one regex per mode type
    (pair, analyzer, generator, tagger); a filename is recorded under every
    type whose regex matches it.

    Args:
        rootpath: Directory to walk; symlinked directories are followed.
        include_pairs: When false, ``'pair'`` modes are not collected.
        verbosity: Values above 1 log the modes found for each type.

    Returns:
        A dict mapping each mode type to a list of 3-tuples. ``'pair'``
        entries are ``(path of the .mode file, source language, target
        language)``. All other entries are ``(parent of the directory that
        holds the .mode file, mode name, language or language pair)``.
        Language codes are converted with ``toAlpha3Code``.
    """
    # Two or three lowercase letters, optionally followed by "_Variant".
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z]+)?'

    def _single_lang_mode(suffix):
        # group(1) is the whole mode name, group(2) the language (or pair).
        return re.compile(
            r'(({0}(-{0})?)-{1})\.mode'.format(lang_code, suffix))

    type_re = {
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': _single_lang_mode(r'(an)?mor(ph)?'),
        # [A-Za-z], not [A-z]: the latter also spans the punctuation that
        # sits between 'Z' and 'a' in ASCII ("[", "\", "]", "^", "_", "`").
        'generator': _single_lang_mode(r'gener[A-Za-z]*'),
        'tagger': _single_lang_mode(r'tagger'),
    }
    modes = {mtype: [] for mtype in type_re}

    real_root = os.path.abspath(os.path.realpath(rootpath))

    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        if is_loop(dirpath, rootpath, real_root):
            # Emptying dirnames in place stops os.walk from descending here.
            dirnames[:] = []
            continue
        for filename in files:
            if not filename.endswith('.mode'):
                continue
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if not m:
                    continue
                if mtype == 'pair':
                    if include_pairs:
                        modes[mtype].append((
                            os.path.join(dirpath, filename),
                            toAlpha3Code(m.group(1)),
                            toAlpha3Code(m.group(2)),
                        ))
                    continue
                modename = m.group(1)  # e.g. en-es-anmorph
                langlist = [toAlpha3Code(l) for l in m.group(2).split('-')]
                lang_pair = '-'.join(langlist)  # e.g. en-es
                # The tuple stores the parent of the directory containing
                # the .mode file, not that directory itself.
                dir_of_modes = os.path.dirname(dirpath)
                modes[mtype].append((dir_of_modes, modename, lang_pair))

    if verbosity > 1:
        for mtype, found in modes.items():
            if found:
                # One mode per line, tuple fields separated by tabs.
                logging.info('"%s" modes found:\n%s', mtype,
                             '\n'.join('\t'.join(m) for m in found))

    return modes
Exemplo n.º 3
0
    def get(self):
        """Respond with the ``getCoverage`` result for ``q`` in language ``lang``.

        Sends 400 when ``q`` is empty or no analyzer is registered for the
        language, 408 when the pooled call does not finish within
        ``self.timeout``, and otherwise a one-element list holding the result.
        """
        mode = toAlpha3Code(self.get_argument('lang'))
        text = self.get_argument('q')
        if not text:
            self.send_error(400, explanation='Missing q argument')
            return

        if mode not in self.analyzers:
            self.send_error(400, explanation='That mode is not installed')
            return

        analyzer = self.analyzers[mode]
        # A dedicated single-process pool lets the job be killed on timeout.
        pool = Pool(processes=1)
        result = pool.apply_async(getCoverage, [text, analyzer[0], analyzer[1]])
        pool.close()

        @run_async_thread
        def worker(callback):
            # None is the timeout marker understood by the code below.
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        coverage = yield tornado.gen.Task(worker)
        if coverage is None:
            self.send_error(408, explanation='Request timed out')
            return
        self.sendResponse([coverage])
Exemplo n.º 4
0
    def get(self):
        """Generate surface forms for the lexical units in ``q``.

        Sends 400 when no generator is registered for ``lang`` and 408 when
        the pooled ``apertium`` call exceeds ``self.timeout``. Otherwise the
        response is a list of ``(generated text, lexical unit)`` pairs.
        """
        mode = toAlpha3Code(self.get_argument('lang'))
        toGenerate = self.get_argument('q')

        if mode not in self.generators:
            self.send_error(400, explanation='That mode is not installed')
            return

        # Pull out each "^...$" unit together with the text trailing it.
        units = re.findall(r'(\^[^\$]*\$[^\^]*)', toGenerate)
        if not units:
            # No delimiters in the query: treat all of it as a single unit.
            units = ['^%s$' % toGenerate]

        generator = self.generators[mode]
        pool = Pool(processes=1)
        result = pool.apply_async(
            apertium,
            ('[SEP]'.join(units), generator[0], generator[1]),
            {'formatting': 'none'})
        pool.close()

        @run_async
        def worker(callback):
            # None is the timeout marker understood by the code below.
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        generated = yield tornado.gen.Task(worker)
        if generated is None:
            self.send_error(408, explanation='Request timed out')
            return

        generated = removeLast(toGenerate, generated)
        response = []
        for position, generation in enumerate(generated.split('[SEP]')):
            response.append((generation, units[position]))
        self.sendResponse(response)
Exemplo n.º 5
0
    def get(self):
        """Analyze ``q`` with the analyzer registered for ``lang``.

        Sends 400 when no analyzer is registered for the language and 408
        when the pooled ``apertium`` call exceeds ``self.timeout``. Otherwise
        the response is a list of ``(analysis, surface form + trailing text)``
        pairs, one per lexical unit in the output.
        """
        mode = toAlpha3Code(self.get_argument('lang'))
        toAnalyze = self.get_argument('q')

        if mode not in self.analyzers:
            self.send_error(400, explanation='That mode is not installed')
            return

        analyzer = self.analyzers[mode]
        pool = Pool(processes=1)
        result = pool.apply_async(apertium, [toAnalyze, analyzer[0], analyzer[1]])
        pool.close()

        @run_async
        def worker(callback):
            # None is the timeout marker understood by the code below.
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        analysis = yield tornado.gen.Task(worker)
        if analysis is None:
            self.send_error(408, explanation='Request timed out')
            return

        # Each match is (content between ^ and $, text up to the next ^).
        matches = re.findall(r'\^([^\$]*)\$([^\^]*)', analysis)
        response = []
        for unit, trailing in removeLast(toAnalyze, matches):
            surface = unit.split('/')[0]
            response.append((unit, surface + trailing))
        self.sendResponse(response)
Exemplo n.º 6
0
    def get(self):
        """Respond with language guesses for the text in ``q``.

        Sends 400 when ``q`` is empty. When ``cld2`` is available its
        detection result is used: entries whose code is ``'un'`` are dropped
        and the rest are returned as ``{alpha-3 code: value}``. Without
        ``cld2`` the response is the ``getCoverages`` result over
        ``self.analyzers``, or 408 if that call exceeds ``self.timeout``.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            cldResults = cld2.detect(text)
            if cldResults[0]:
                self.sendResponse({
                    toAlpha3Code(candidate[1]): candidate[2]
                    for candidate in cldResults[2]
                    if candidate[1] != 'un'
                })
            else:
                self.sendResponse({'nob': 100})  # TODO: Some more reasonable response
            return

        pool = Pool(processes=1)
        # No callback= here: apply_async callbacks run on the pool's
        # result-handler thread, and the response must be written exactly
        # once, from this thread, after the wait below has succeeded.
        result = pool.apply_async(getCoverages, [text, self.analyzers], {'penalize': True})
        pool.close()
        try:
            coverages = result.get(timeout=self.timeout)
        except TimeoutError:
            # Kill the worker first so it cannot outlive the request.
            pool.terminate()
            self.send_error(408, explanation='Request timed out')
        else:
            self.sendResponse(coverages)
Exemplo n.º 7
0
    def get(self):
        """Respond with language guesses for the text in ``q``.

        Sends 400 when ``q`` is empty. When ``cld2`` is available its
        detection result is used: entries whose code is ``'un'`` are dropped
        and the rest are returned as ``{alpha-3 code: value}``. Without
        ``cld2`` the response is the ``getCoverages`` result over
        ``self.analyzers``, or 408 if that call exceeds ``self.timeout``.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            cldResults = cld2.detect(text)
            if cldResults[0]:
                self.sendResponse({
                    toAlpha3Code(candidate[1]): candidate[2]
                    for candidate in cldResults[2]
                    if candidate[1] != 'un'
                })
            else:
                self.sendResponse({'nob':
                                   100})  # TODO: Some more reasonable response
            return

        pool = Pool(processes=1)
        # No callback= here: apply_async callbacks run on the pool's
        # result-handler thread, and the response must be written exactly
        # once, from this thread, after the wait below has succeeded.
        result = pool.apply_async(getCoverages, [text, self.analyzers],
                                  {'penalize': True})
        pool.close()
        try:
            coverages = result.get(timeout=self.timeout)
        except TimeoutError:
            # Kill the worker first so it cannot outlive the request.
            pool.terminate()
            self.send_error(408, explanation='Request timed out')
        else:
            self.sendResponse(coverages)
Exemplo n.º 8
0
    def get(self):
        """Respond with the ``getCoverage`` result for ``q`` in language ``lang``.

        Sends 400 when ``q`` is empty or no analyzer is registered for the
        language, 408 when the pooled call does not finish within
        ``self.timeout``, and otherwise a one-element list holding the result.
        """
        mode = toAlpha3Code(self.get_argument('lang'))
        text = self.get_argument('q')
        if not text:
            self.send_error(400, explanation='Missing q argument')
            return

        if mode not in self.analyzers:
            self.send_error(400, explanation='That mode is not installed')
            return

        entry = self.analyzers[mode]
        job_args = [text, entry[0], entry[1]]
        # A dedicated single-process pool lets the job be killed on timeout.
        pool = Pool(processes=1)
        result = pool.apply_async(getCoverage, job_args)
        pool.close()

        @run_async_thread
        def worker(callback):
            # None is the timeout marker understood by the code below.
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        coverage = yield tornado.gen.Task(worker)
        if coverage is not None:
            self.sendResponse([coverage])
        else:
            self.send_error(408, explanation='Request timed out')
Exemplo n.º 9
0
 def get(self):
     """Analyze ``q`` with the analyzer registered for ``lang``.

     Sends 400 when no analyzer is registered for the language. Otherwise
     runs the ``apertium`` command through ``translation.translateSimple``
     and responds with ``self.postproc_text`` applied to the output.
     """
     in_text = self.get_argument('q')
     in_mode = toAlpha3Code(self.get_argument('lang'))
     if in_mode not in self.analyzers:
         self.send_error(400, explanation='That mode is not installed')
         return

     path, mode = self.analyzers[in_mode]
     # Single-command pipeline using the plain-text ('txt') format.
     pipeline = [['apertium', '-d', path, '-f', 'txt', mode]]
     result = yield translation.translateSimple(in_text, pipeline)
     self.sendResponse(self.postproc_text(in_text, result))
Exemplo n.º 10
0
 def get(self):
     """Analyze ``q`` with the analyzer registered for ``lang``.

     Sends 400 when no analyzer is registered for the language. Otherwise
     runs the ``apertium`` command through ``translation.translateSimple``
     and responds with ``self.postproc_text`` applied to the output.
     """
     in_text = self.get_argument('q')
     in_mode = toAlpha3Code(self.get_argument('lang'))
     if in_mode not in self.analyzers:
         self.send_error(400, explanation='That mode is not installed')
         return

     path, mode = self.analyzers[in_mode]
     # Single-command pipeline using the plain-text ('txt') format.
     apertium_cmd = ['apertium', '-d', path, '-f', 'txt', mode]
     result = yield translation.translateSimple(in_text, [apertium_cmd])
     self.sendResponse(self.postproc_text(in_text, result))
Exemplo n.º 11
0
 def get(self):
     """Generate surface forms for ``q`` with the generator for ``lang``.

     Sends 400 when no generator is registered for the language. Otherwise
     the query is prepared with ``self.preproc_text``, run through the
     ``apertium`` command via ``translation.translateSimple``, and the
     response is ``self.postproc_text`` applied to the output.
     """
     in_text = self.get_argument('q')
     in_mode = toAlpha3Code(self.get_argument('lang'))
     if in_mode not in self.generators:
         self.send_error(400, explanation='That mode is not installed')
         return

     path, mode = self.generators[in_mode]
     # Single-command pipeline with format handling set to 'none'.
     pipeline = [['apertium', '-d', path, '-f', 'none', mode]]
     lexical_units, to_generate = self.preproc_text(in_text)
     result = yield translation.translateSimple(to_generate, pipeline)
     self.sendResponse(self.postproc_text(lexical_units, result))
Exemplo n.º 12
0
 def get(self):
     """Generate surface forms for ``q`` with the generator for ``lang``.

     Sends 400 when no generator is registered for the language. Otherwise
     the query is prepared with ``self.preproc_text``, run through the
     ``apertium`` command via ``translation.translateSimple``, and the
     response is ``self.postproc_text`` applied to the output.
     """
     in_text = self.get_argument('q')
     in_mode = toAlpha3Code(self.get_argument('lang'))
     if in_mode not in self.generators:
         self.send_error(400, explanation='That mode is not installed')
         return

     path, mode = self.generators[in_mode]
     # Single-command pipeline with format handling set to 'none'.
     apertium_cmd = ['apertium', '-d', path, '-f', 'none', mode]
     lexical_units, to_generate = self.preproc_text(in_text)
     result = yield translation.translateSimple(to_generate, [apertium_cmd])
     self.sendResponse(self.postproc_text(lexical_units, result))
Exemplo n.º 13
0
    def get(self):
        """Run the requested per-word modes over ``q`` and respond per unit.

        Query arguments:
            lang: Language code, converted with ``toAlpha3Code``.
            modes: Space-separated subset of ``morph``, ``biltrans``,
                ``tagger``, ``disambig`` and ``translate``.
            q: Text to process.
            pos: Optional 1-based word position; when given, only the unit
                covering that word is sent.

        Sends 400 for an unknown mode or when ``processPerWord`` returns
        ``None``, and 408 when it does not finish within ``self.timeout``.
        Otherwise the response is a list of dicts (or a single dict when
        ``pos`` is given), each holding the unit's ``input`` plus one entry
        per requested mode.
        """
        lang = toAlpha3Code(self.get_argument('lang'))
        modes = set(self.get_argument('modes').split(' '))
        query = self.get_argument('q')

        if not modes <= {'morph', 'biltrans', 'tagger', 'disambig', 'translate'}:
            self.send_error(400, explanation='Invalid mode argument')
            return

        def handleOutput(output):
            # None: processPerWord produced nothing. Any other falsy value
            # (the worker below passes False) marks a timeout.
            if output is None:
                self.send_error(400, explanation='No output')
                return
            if not output:
                self.send_error(408, explanation='Request timed out')
                return

            outputs, tagger_lexicalUnits, morph_lexicalUnits = output

            toReturn = []
            # Tagger units take precedence; fall back to the morph units.
            for index, lexicalUnit in enumerate(tagger_lexicalUnits or morph_lexicalUnits):
                unitToReturn = {'input': stripTags(lexicalUnit.split('/')[0])}
                for mode in modes:
                    unitToReturn[mode] = outputs[mode][index]
                toReturn.append(unitToReturn)

            if not self.get_argument('pos', default=None):
                self.sendResponse(toReturn)
                return

            requestedPos = int(self.get_argument('pos')) - 1
            currentPos = 0
            for unit in toReturn:
                # A unit may span several space-separated words.
                currentPos += len(unit['input'].split(' '))
                if requestedPos < currentPos:
                    self.sendResponse(unit)
                    return
            # NOTE(review): when pos lies beyond the last unit nothing is
            # sent from here - confirm whether an error response is wanted.

        pool = Pool(processes=1)
        result = pool.apply_async(processPerWord, (self.analyzers, self.taggers, lang, modes, query))
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                # False, not None: handleOutput answers None with
                # 400 'No output' and other falsy values with 408.
                callback(False)

        output = yield tornado.gen.Task(worker)
        handleOutput(output)
Exemplo n.º 14
0
    def get(self):
        """Run the requested per-word modes over ``q`` and respond per unit.

        Query arguments:
            lang: Language code, converted with ``toAlpha3Code``.
            modes: Space-separated subset of ``morph``, ``biltrans``,
                ``tagger``, ``disambig`` and ``translate``.
            q: Text to process.
            pos: Optional 1-based word position; when given, only the unit
                covering that word is sent.

        Sends 400 for an unknown mode or when ``processPerWord`` returns
        ``None``, and 408 when it does not finish within ``self.timeout``.
        Otherwise the response is a list of dicts (or a single dict when
        ``pos`` is given), each holding the unit's ``input`` plus one entry
        per requested mode.
        """
        lang = toAlpha3Code(self.get_argument('lang'))
        modes = set(self.get_argument('modes').split(' '))
        query = self.get_argument('q')

        if not modes <= {
                'morph', 'biltrans', 'tagger', 'disambig', 'translate'
        }:
            self.send_error(400, explanation='Invalid mode argument')
            return

        def handleOutput(output):
            # None: processPerWord produced nothing. Any other falsy value
            # (the worker below passes False) marks a timeout.
            if output is None:
                self.send_error(400, explanation='No output')
                return
            if not output:
                self.send_error(408, explanation='Request timed out')
                return

            outputs, tagger_lexicalUnits, morph_lexicalUnits = output

            # Tagger units take precedence; fall back to the morph units.
            lexicalUnits = tagger_lexicalUnits or morph_lexicalUnits
            toReturn = []
            for index, lexicalUnit in enumerate(lexicalUnits):
                unitToReturn = {
                    'input': stripTags(lexicalUnit.split('/')[0]),
                }
                for mode in modes:
                    unitToReturn[mode] = outputs[mode][index]
                toReturn.append(unitToReturn)

            if not self.get_argument('pos', default=None):
                self.sendResponse(toReturn)
                return

            requestedPos = int(self.get_argument('pos')) - 1
            currentPos = 0
            for unit in toReturn:
                # A unit may span several space-separated words.
                currentPos += len(unit['input'].split(' '))
                if requestedPos < currentPos:
                    self.sendResponse(unit)
                    return
            # NOTE(review): when pos lies beyond the last unit nothing is
            # sent from here - confirm whether an error response is wanted.

        pool = Pool(processes=1)
        result = pool.apply_async(
            processPerWord, (self.analyzers, self.taggers, lang, modes, query))
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                # False, not None: handleOutput answers None with
                # 400 'No output' and other falsy values with 408.
                callback(False)

        output = yield tornado.gen.Task(worker)
        handleOutput(output)