Пример #1
0
    def wordByWord(self, sentence, hiragana=True):
        """Pair every parsed element of `sentence` with its reading.

        Returns a list of (surface, reading) tuples. The reading is an empty
        string when the element has no pronunciation or when its surface is
        already kana; otherwise it is the pronunciation, passed through
        kata2hira when `hiragana` is true. An empty list is returned when
        parsing yields nothing.
        """
        self.includeSurface().includeReading()
        tokens = self.parse(sentence)
        if not tokens:
            return []

        pairs = []
        for token in tokens:
            surface = token.get('surface')
            spoken = token.get('pronounciation')
            # Nothing to annotate: no reading at all, or the surface is
            # already the (katakana or hiragana) reading itself.
            if not spoken or spoken == surface or kata2hira(spoken) == surface:
                kana = u''
            else:
                kana = kata2hira(spoken) if hiragana else spoken
            pairs.append((surface, kana))
        return pairs
Пример #2
0
    def word_by_word(self, sentence, hiragana=True):
        """Return a (surface, reading) tuple for each element of `sentence`.

        The reading is left empty when the parser gives no pronunciation or
        when the surface already equals the pronunciation (directly or after
        kata2hira). Otherwise the pronunciation is used, converted with
        kata2hira if `hiragana` is true. Returns an empty list when parsing
        yields nothing.
        """
        result = []
        parsed = self.include('pronounciation', 'surface').parse(sentence)
        if not parsed:
            return result

        for entry in parsed:
            surface = entry.get('surface')
            pron = entry.get('pronounciation')
            if not pron:
                # No reading
                reading = u''
            elif pron == surface or kata2hira(pron) == surface:
                # Word is already in kana
                reading = u''
            elif hiragana:
                reading = kata2hira(pron)
            else:
                reading = pron
            result.append((surface, reading))
        return result
Пример #3
0
    def parseReadings(self):
        items_grouped = {}
        for kanji in self.items:
            readings = {}
            try:
                lookup = self.kjd[kanji.character]
                for kun in lookup.kun_readings:
                    kun = kun.replace('.', '').replace('-', '')
                    for word in kanji.word:
                        if kun in kata2hira(MecabTool.parseToReadingsKana(word.word)[0]):
                            if readings.has_key(kun):
                                readings[kun].append(word.word)
                            else:
                                readings[kun] = [word.word]
                for on in lookup.on_readings:
                    on = kata2hira(on.replace('.', '').replace('-', ''))
                    for word in kanji.word:
                        if on in kata2hira(MecabTool.parseToReadingsKana(word.word)[0]):
                            if readings.has_key(on):
                                readings[on].append(word.word)
                            else:
                                readings[on] = [word.word]
            except Exception, e:
                log.error(e)
                
            # simple solution - difficult implementation
#            items_grouped[kanji.character] = readings
            # slightly more complicated solution - easier implementation
            for reading in readings:
                i = 0
                if items_grouped.has_key(kanji.character): items_grouped[kanji.character + '_' + str(i)] = (reading, readings[reading]); i += 1
                else: items_grouped[kanji.character] = (reading, readings[reading])
Пример #4
0
def kana_minus_dakuten(char):
    """Map `char` through the `__by_dakuten` table, keeping its script.

    Characters without an entry are returned unchanged. Katakana is converted
    to hiragana for the lookup and converted back afterwards.
    (Presumably the table maps voiced kana to their unvoiced form -- the
    table itself is defined elsewhere.)
    """
    if not is_katakana(char):
        return __by_dakuten.get(char, char)

    as_hiragana = kata2hira(char)
    return hira2kata(__by_dakuten.get(as_hiragana, as_hiragana))
Пример #5
0
def kana_minus_dakuten(char):
    """Look `char` up in `__by_dakuten`, falling back to `char` itself.

    The table is consulted with a hiragana key: a katakana character is
    converted with kata2hira before the lookup and the result is converted
    back with hira2kata, so the caller gets the same script it passed in.
    """
    katakana = is_katakana(char)
    key = kata2hira(char) if katakana else char
    plain = __by_dakuten.get(key, key)
    return hira2kata(plain) if katakana else plain
Пример #6
0
 def getWordPronunciationFromExample(self, item):
     """Return the hiragana pronunciation of the word containing `item`.

     The current example sentence is parsed with MecabTool.parseToWordsFull
     and searched via self.find for an entry whose 'word' contains `item`.
     If the match cannot be converted (for any Exception), a single space is
     returned instead.
     """
     parsed = MecabTool.parseToWordsFull(self.currentExample.sentence)
     match = self.find(lambda entry: item in entry['word'], parsed)
     try:
         pronunciation = kata2hira(match['pronunciation'])
     except Exception:
         pronunciation = u' '
     return pronunciation
Пример #7
0
 def getCorrectAnswer(self):
     """Return the hiragana pronunciation of the word holding the current item.

     Parses the current example sentence with MecabTool.parseToWordsFull and
     uses self.find to pick the entry whose 'word' contains
     self.currentItem.character. Falls back to a single space when the
     pronunciation cannot be produced (any Exception).
     """
     def holds_current_item(entry):
         return self.currentItem.character in entry['word']

     parsed = MecabTool.parseToWordsFull(self.currentExample.sentence)
     match = self.find(holds_current_item, parsed)
     try:
         return kata2hira(match['pronunciation'])
     except Exception:
         return u' '
Пример #8
0
 def getWordPronunciationFromText(query, text):
     """Return the hiragana pronunciation of the word in `text` containing `query`.

     `text` is parsed with MecabTool.parseToWordsFull and searched with
     MecabTool.findUsingF. Returns None when the pronunciation cannot be
     produced (any Exception while reading or converting the match).
     """
     parsed = MecabTool.parseToWordsFull(text)
     match = MecabTool.findUsingF(lambda entry: query in entry['word'], parsed)
     try:
         pronunciation = kata2hira(match['pronunciation'])
     except Exception:
         pronunciation = None
     return pronunciation
Пример #9
0
 def getWordPronunciationFromText(query, text):
     """Find the word of `text` that contains `query` and return its reading.

     The reading is the match's 'pronunciation' converted with kata2hira.
     None is returned if reading or converting the match raises any
     Exception.
     """
     def contains_query(entry):
         return query in entry['word']

     match = MecabTool.findUsingF(contains_query,
                                  MecabTool.parseToWordsFull(text))
     try:
         return kata2hira(match['pronunciation'])
     except Exception:
         return None
Пример #10
0
def _reading(node):
    surface = node.surface.decode('utf8')
    reading = node.feature.decode('utf8').split(',')[-2]
    if reading == '*':
        return None
    reading = jcconv.kata2hira(reading)

    return reading
Пример #11
0
def _reading(node):
    surface = node.surface.decode('utf8')
    reading = node.feature.decode('utf8').split(',')[-2]
    if reading == '*':
        return None
    reading = jcconv.kata2hira(reading)

    return reading
Пример #12
0
def kana_plus_mini(char):
    """Yield `char`, then every variant listed for it in `__to_mini`.

    The table is keyed by hiragana, so a katakana `char` is converted with
    kata2hira for the lookup and each variant is converted back with
    hira2kata. Characters without an entry yield only themselves.
    """
    yield char

    from_katakana = is_katakana(char)
    key = kata2hira(char) if from_katakana else char

    for variant in __to_mini.get(key, ''):
        if from_katakana:
            yield hira2kata(variant)
        else:
            yield variant
Пример #13
0
def kana_plus_mini(char):
    """Generate `char` followed by its `__to_mini` variants.

    `char` itself always comes first. The variants are looked up under the
    hiragana form of `char`; when `char` is katakana, each variant is
    converted to katakana before being yielded.
    """
    yield char

    if is_katakana(char):
        for mini in __to_mini.get(kata2hira(char), ''):
            yield hira2kata(mini)
    else:
        for mini in __to_mini.get(char, ''):
            yield mini
Пример #14
0
def create_idx_file(datfile, idxfile):
    """Build a key-sorted index of `datfile` and write it to `idxfile`.

    Every line of the dat file contributes up to two keys: its headword (the
    text before the first space or before the first ';', whichever is
    shorter) and, when present, the text inside the first [...] found before
    the first '/'. Keys are passed through jcconv.kata2hira. Each key maps
    to a comma-separated list of the positions of the lines it occurs on.
    The index is written as "key,pos[,pos...]" lines, sorted by key.

    NOTE(review): positions advance by len(line) of the decoded text, i.e.
    in characters rather than bytes -- confirm the reader expects that.
    """
    print("Reading in dat file...")
    index = OrderedDict()

    def remember(key, offset):
        # Append the offset to an existing entry, or start a new one.
        previous = index.get(key)
        index[key] = previous + "," + offset if previous else offset

    position = 0
    with codecs.open(datfile, 'r', "utf-8") as dat:
        for line in dat:
            # TODO create key for each ; separated value
            # On equal length the ';'-delimited candidate wins.
            headword = min(line.split(';', 1)[0], line.split(' ', 1)[0],
                           key=len)

            # vu wouldn't be converted if not reserved, but for clarity purposes ...
            remember(jcconv.kata2hira(headword, 'ヴ'), str(position))

            bracketed = re.search(r'\[(.*?)\]', line.split('/', 1)[0])
            if bracketed:
                # vu wouldn't be converted if not reserved, but for clarity purposes ...
                remember(jcconv.kata2hira(bracketed.group(1), 'ヴ'),
                         str(position))

            position += len(line)
    print("Finished reading in dat file, now sorting index...")

    ordered = OrderedDict(sorted(index.items(), key=lambda pair: pair[0]))
    print("Finished sorting index, now writing idx file...")

    with codecs.open(idxfile, 'w+', "utf-8") as idx:
        for key, value in ordered.iteritems():
            idx.write(key + "," + value + "\n")
    print("Finished writing idx file")
Пример #15
0
def create_idx_file(datfile, idxfile):
    """Index the lines of `datfile` by headword and write the result to `idxfile`.

    For each line two keys may be recorded: the leading text up to the first
    space or the first ';' (the shorter of the two), and the contents of the
    first [...] that appears before the first '/'. Both go through
    jcconv.kata2hira. A key's value is the comma-joined list of positions of
    all lines that produced it. Output lines have the form
    "key,pos[,pos...]" and are sorted by key.

    NOTE(review): `position` grows by the decoded line length (characters,
    not bytes) -- verify against whatever consumes the idx file.
    """
    print("Reading in dat file...")
    entries = OrderedDict()
    position = 0
    with codecs.open(datfile, 'r', "utf-8") as source:
        for line in source:
            here = str(position)

            # TODO create key for each ; separated value
            up_to_space = line.split(' ', 1)[0]
            up_to_semicolon = line.split(';', 1)[0]
            if len(up_to_space) < len(up_to_semicolon):
                head = up_to_space
            else:
                head = up_to_semicolon

            # vu wouldn't be converted if not reserved, but for clarity purposes ...
            head = jcconv.kata2hira(head, 'ヴ')
            seen = entries.get(head)
            entries[head] = seen + "," + here if seen else here

            match = re.search(r'\[(.*?)\]', line.split('/', 1)[0])
            if match:
                # vu wouldn't be converted if not reserved, but for clarity purposes ...
                inner = jcconv.kata2hira(match.group(1), 'ヴ')
                seen = entries.get(inner)
                entries[inner] = seen + "," + here if seen else here

            position += len(line)
    print("Finished reading in dat file, now sorting index...")

    entries = OrderedDict(sorted(entries.items(), key=lambda item: item[0]))
    print("Finished sorting index, now writing idx file...")

    with codecs.open(idxfile, 'w+', "utf-8") as target:
        for key, value in entries.iteritems():
            target.write(key + "," + value + "\n")
    print("Finished writing idx file")
Пример #16
0
 def __getattribute__(self, key):
     """Serve dict entries as attributes, filling them from `feature` on demand.

     If `key` is already a key of this dict, its stored value is returned.
     Otherwise, if `key` is named in `feature_keys`, the comma-separated
     `feature` string is split and an entry is stored for every name in
     `feature_keys` (taken from the field at its index), plus a 'readings'
     entry holding the set of kata2hira-converted readings with '*' removed.
     The lookup then falls through to dict.__getattribute__.
     """
     # Own attributes are fetched through dict.__getattribute__ so that
     # reading them does not re-enter this method.
     feature_keys = dict.__getattribute__(self, 'feature_keys')
     if key in self:
         return self[key]
     if key in feature_keys:
         features = dict.__getattribute__(self, 'feature').split(',')
         # Populate all feature entries at once, not only the requested one.
         for name, index in feature_keys.iteritems():
             self[name] = features[index]
         readings = dict.__getattribute__(self, 'readings')
         readings = set([jcconv.kata2hira(e) for e in readings if e not in ('*',)])
         self['readings'] = readings
     # NOTE(review): right after populating the entries this still returns the
     # plain attribute rather than self[key]; only later accesses hit the
     # dict entry -- confirm that is intended.
     return dict.__getattribute__(self, key)
Пример #17
0
def all_to_hiragana(string):
    """Convert `string` to hiragana, expanding long-vowel marks.

    Each u'ー' or u'|' is replaced by char_to_base_vowel() of the character
    emitted just before it, and every character is then passed through
    kata2hira. A mark at the very start of the string has no preceding
    character to extend; it is passed through kata2hira unchanged instead of
    raising IndexError.
    """
    out = u''

    for char in string:
        # Only expand the mark when there is a previous character to look at.
        if char in (u'ー', u'|') and out:
            char = char_to_base_vowel(out[-1])

        out += kata2hira(char)

    return out
Пример #18
0
 def reading(self, sentence, hiragana=True):
     """Get reading for provided sentence|word.

     Concatenates the non-empty 'pronounciation' values of the parsed
     elements. The result is converted with kata2hira when `hiragana` is
     true. Returns None when parsing yields nothing.
     """
     self.includeReading()
     tokens = self.parse(sentence)
     if not tokens:
         return None

     pieces = []
     for token in tokens:
         spoken = token.get('pronounciation')
         if spoken:
             pieces.append(spoken)
     kana = u''.join(pieces)
     return kata2hira(kana) if hiragana else kana
Пример #19
0
def all_to_hiragana(string):
    """Return `string` in hiragana with long-vowel marks spelled out.

    A u'ー' or u'|' is swapped for char_to_base_vowel() of the last
    character already written to the output; all characters then go through
    kata2hira. When such a mark is the first character there is nothing to
    extend, so it is converted as-is rather than failing with IndexError on
    the empty output.
    """
    out = u''

    for char in string:
        is_long_mark = char == u'ー' or char == u'|'
        if is_long_mark and out:
            char = char_to_base_vowel(out[-1])

        out += kata2hira(char)

    return out
Пример #20
0
 def reading(self, sentence, hiragana=True):
     """Get reading for provided sentence|word.

     Joins every non-empty 'pronounciation' of the parsed elements into one
     string, converted with kata2hira unless `hiragana` is false. If parsing
     gives nothing, None is returned.
     """
     self.includeReading()
     parsed = self.parse(sentence)
     if parsed:
         spoken = (entry.get('pronounciation', '') for entry in parsed)
         kana = u''.join(part for part in spoken if part)
         if not hiragana:
             return kana
         return kata2hira(kana)
Пример #21
0
def generate_reading(expression):
    """Run `expression` through the `mecab` program and add readings to it.

    The expression is encoded with settings.MECAB_ENCODING, piped to a
    `mecab` subprocess, and the output is processed line by line:

    * a line starting with ',' contributes ','
    * a line starting with 'EOS' contributes a newline
    * a line starting with whitespace contributes that first character
    * any other line contributes its word (first whitespace-separated token
      of the first comma field). When the line has exactly 9 comma fields,
      the reading in field 7 differs from the word (both raw and after
      kata2hira), and the word contains a character that is neither
      hiragana nor katakana, the word is rendered through _furiganaize
      with its hiragana reading instead.
    """
    encoded = expression.encode(settings.MECAB_ENCODING)
    proc = subprocess.Popen('mecab', shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    mecab_output = proc.communicate(encoded)[0].decode(settings.MECAB_ENCODING)
    lines = mecab_output.split(u'\n')[:-2] #skip the \nEOS\n

    ret = u''
    for line in lines:
        first = line[0]
        if first == u',':
            ret += u','
        elif line[:3] == u'EOS':
            ret += u'\n'
        elif first.strip() == '':
            ret += first
        else:
            fields = line.split(u',')
            word = fields[0].split()[0]
            piece = word

            if len(fields) == 9:
                raw_reading = fields[7]
                # The reading comes in as katakana, we want hiragana.
                hira_reading = jcconv.kata2hira(raw_reading)

                # Has kanji and a reading?
                if (hira_reading != word
                        and raw_reading != word
                        and any(_code_page(char) not in ('hiragana', 'katakana')
                                for char in word)):
                    # The third argument tells whether nothing has been
                    # emitted so far.
                    piece = _furiganaize(word, hira_reading, not ret)

            ret += piece
    return ret
Пример #22
0
 def get_reading(self, sentence, hiragana=True):
     """
     Get reading for provided sentence|word
     NB: for some rare words there may be no readings available!

     Elements without a pronunciation are skipped. Returns the joined
     pronunciation, converted with kata2hira when `hiragana` is true, or
     None when parsing yields nothing.
     """
     parsed = self.include('pronounciation').parse(sentence)
     if not parsed:
         return None

     spoken = [entry.get('pronounciation') for entry in parsed]
     katakana = u''.join([part for part in spoken if part])
     if hiragana:
         return kata2hira(katakana)
     return katakana
Пример #23
0
    def getKanaReading(query):
        """Fetch the reading of `query` from the mecapi web service as hiragana.

        Requests the 'pronounciation' response for `query`, collects the text
        of every 'word/pronounciation' element of the returned XML and
        returns the concatenation converted with kata2hira. Elements without
        text are skipped. Errors from urllib2.urlopen or from XML parsing
        propagate to the caller.
        """
        MECAPI_URL = u'http://mimitako.net/api/mecapi.cgi?sentence='
        OPTIONS = u'&response=pronounciation'
        XML_TAG = u'word/pronounciation'

        # NOTE(review): `query` is concatenated into the URL as-is; presumably
        # callers pass it already percent-encoded -- confirm before quoting here.
        url = MECAPI_URL + query + OPTIONS
        result = urllib2.urlopen(url)
        try:
            tree = ElementTree.fromstring(result.read())
        finally:
            # Release the connection even if the payload is not valid XML.
            result.close()

        # An empty element has text None, which ''.join() would reject.
        reading = [node.text for node in tree.findall(XML_TAG) if node.text]

        return kata2hira(''.join(reading))
Пример #24
0
 def reading(self, sentence, hiragana=True):
     """
     Get reading for provided sentence|word
     NB: for some rare words there may be no readings available!

     Returns the concatenated pronunciations of the parsed elements (those
     without one are skipped), converted with kata2hira when `hiragana` is
     true. Returns None when parsing yields nothing.
     """
     self.include('pronounciation')
     parsed = self.parse(sentence)
     if not parsed:
         return None

     kana = u''
     for entry in parsed:
         spoken = entry.get('pronounciation')
         if spoken:
             kana += spoken
     if hiragana:
         kana = kata2hira(kana)
     return kana
Пример #25
0
    def as_mecab(self):
        """Return this example as a dict with a per-word reading breakdown.

        self.front is parsed with BakaMeCab; for every (word, info) pair of
        get_info() an entry {'front': word, 'reading': ...} is produced. The
        reading is the kata2hira form of info[6] (or info[4] when info has
        no more than 7 fields... i.e. fewer than 7 items) and is left empty
        when info has 4 or fewer items, when the word already equals the
        kana or its hiragana form, or when the word is 'は'.

        The result also carries the original front, reading and gloss.
        """
        tokens = BakaMeCab(self.front).get_info()

        breakdown = []
        for surface, fields in tokens.iteritems():
            furigana = u''
            if len(fields) > 4:
                katakana = fields[6] if len(fields) > 6 else fields[4]
                converted = kata2hira(katakana)
                already_readable = (katakana == surface
                                    or converted == surface
                                    or surface == 'は')
                if not already_readable:
                    furigana = converted
            breakdown.append({'front': surface, 'reading': furigana})

        return {
            'parsed': breakdown,
            'original': self.front,
            'reading': self.reading,
            'gloss': self.gloss
        }
Пример #26
0
 def getCurrentSentenceReading(self):
     """Return the current example sentence's reading in hiragana.

     The kana readings returned by MecabTool.parseToReadingsKana are joined
     without a separator and converted with kata2hira.
     """
     kana_parts = MecabTool.parseToReadingsKana(self.currentExample.sentence)
     joined = ''.join(kana_parts)
     return kata2hira(joined)
Пример #27
0
 def getWordPronounciation(self, item):
     """Return the hiragana pronunciation of `item`, or `item` itself on failure.

     The pronunciation is the 'pronunciation' of the first entry returned by
     MecabTool.parseToWordsFull, converted with kata2hira. If parsing or
     conversion raises any Exception, `item` is returned unchanged.
     """
     try:
         return kata2hira(MecabTool.parseToWordsFull(item)[0]['pronunciation'])
     except Exception:
         # Best-effort fallback; KeyboardInterrupt/SystemExit are no longer
         # swallowed as they were by the bare except.
         return item
Пример #28
0
 def _can_furigana(self, ma, expression):
     return (ma.reading != expression[ma.position:ma.position+ma.word_length] and 
         kata2hira(expression[ma.position:ma.position+ma.word_length]) != ma.reading and
         expression[ma.position:ma.position+ma.word_length] not in u"一二三四五六七八九十0123456789")
Пример #29
0
# Walk every file in file_list and pull a (kanji, freq, kana) triple out of
# each whitespace-separated line.
total_freq = 0

for file_name in file_list:
    print file_name

    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line in f:
            arr = line.split()

            # `base` ends up as the index of the last token that starts with
            # a digit (it stays 0 when no token does).
            base = 0
            for i, e in enumerate(arr):
                if e[0] <= '9' and e[0] >= '0':
                    base = i
            # The fields are located relative to that token; the slices drop
            # one leading character, two trailing characters and one trailing
            # character respectively (presumably surrounding punctuation).
            kanji = arr[base - 1][1:]
            freq = arr[base][:-2]
            kana = kata2hira(arr[base + 2][:-1])

            print kanji, freq, kana

            if len(kana) == 0:
                continue
            elif len(kana) > 1 and kana[0] == '{':
                # "{a/b}rest" expands to one candidate per non-empty
                # alternative, each followed by the text after the '}'.
                ks = kana[1:kana.index('}')].split('/')
                ks = [k + kana[kana.index('}') + 1:] for k in ks if len(k) > 0]
            else:
                ks = [
                    kana,
                ]

            freq = int(freq)
            # NOTE(review): the body of this loop is not visible in this excerpt.
            for k in ks:
Пример #30
0
    def doGo(self):
        #pressing the go button we first see which dict is selected

        try:
            theWord = str(self.ui.textEdit.toPlainText())
        except:
            # for JDICT
            theWord = self.ui.textEdit.toPlainText()

        if self.ui.comboBox.currentText() == 'KATEGLO':
            start = time.clock()

            kx = kateglo()

            data = kx.getData(theWord)
            if data == -1:
                s = '\nHello : ' + theWord + '  maybe mispelled or not Indonesian or not the correct root'
                self.ui.textEdit_2.append(s)
                self.ui.textEdit_2.append('Correct and try again')
            else:
                tr = kx.translator(data)
                self.aline()
                s = '\nPhrase : ' + theWord
                self.ui.textEdit_2.append(s)
                rxc = 0
                for rx in tr:
                    if rxc == 0:
                        s = 'Source : ' + rx
                        self.ui.textEdit_2.append(s)
                    else:
                        self.ui.textEdit_2.append(rx)
                    rxc += 1
                    self.aline()

                df = kx.definitor(data)

                self.ui.textEdit_2.append("\nDefinitions\n")
                for rx in df:
                    self.ui.textEdit_2.append(rx)

                pr = kx.proverbor(data)

                self.ui.textEdit_2.append("\nProverbs\n")
                for rx in pr:
                    self.ui.textEdit_2.append(rx)

                rl = kx.relator(data)

                self.ui.textEdit_2.append("\nRelations\n")
                for rx in rl:
                    self.ui.textEdit_2.append(rx)

            self.ui.label_3.setText('Finished KATEGLO request ...')
            end = time.clock()
            s = 'Request duration : ' + str(end - start) + ' secs'
            self.ui.label_4.setText(s)

        elif self.ui.comboBox.currentText() == 'KBBI':
            start = time.clock()

            kb = kbbi()
            soup = kb.processData(theWord)

            sxt = soup.get_text()
            atitle = soup.title.string.split('- definisi kata')
            s = '\n' + atitle[0]
            self.ui.textEdit_2.append(s)
            self.aline()
            s = '\nKata : ' + atitle[1]
            self.ui.textEdit_2.append(s)

            sxts = sxt.split('Pranala (link): http://kbbi.web.id/%s' % theWord)

            try:
                sxts2 = sxts[1]
                sxts2 = sxts2.split('Tweet')
                res1 = sxts2[0]
                res1 = res1.split(';')
                s = ''
                s = res1[0].strip('-1').strip('-2').strip('-3')
                self.ui.textEdit_2.append(s)
                s = ''
                for rx in range(1, len(res1)):
                    rc = res1[rx]
                    s = rc + '\n'
                    self.ui.textEdit_2.append(s)
            except:
                self.ui.textEdit_2.append('')
                s = theWord + ' ==>  Tidak ditemukan - KBBI\n\nMaybe incorrect root word'
                self.ui.textEdit_2.append(s)
                # we want to get data from the Memuat section , if any
                sxtch = sxt.split('Memuat')
                try:
                    sxtchz = sxtch[1].split('Pranala')
                    self.ui.textEdit_2.append(
                        'Try with these suggestions provided by kbbi (if any)  :'
                    )
                    s = sxtchz[0].replace('1', '\n').replace(
                        '2', '\n').replace(
                            '3', '\n')  # occasionaly there is are subscripts
                    self.ui.textEdit_2.append(s)

                except:
                    pass

            self.ui.label_3.setText('Finished KBBI request ...')
            end = time.clock()
            s = 'Request duration : ' + str(end - start) + ' secs'
            self.ui.label_4.setText(s)

        elif self.ui.comboBox.currentText() == 'GLOSBE':

            start = time.clock()
            lp = str(self.ui.comboBox_2.currentText())
            al = lp.split('/')
            orglang = al[0]
            destlang = al[1]

            gb = glosbe()
            data = gb.getData(theWord, orglang, destlang)
            if data == -1:
                s = '\nHello : ' + theWord + '  maybe mispelled or not the correct root word for dicitionary lookup or wrong language code'
                self.ui.textEdit_2.append(s)
                s = 'Correct and try again'
                self.ui.textEdit_2.append(s)
            else:

                s = 'From   :  ' + data['from']
                self.ui.textEdit_2.append(s)
                s = 'Dest   :  ' + data['dest']
                self.ui.textEdit_2.append(s)
                s = 'Result :  ' + data['result']
                self.ui.textEdit_2.append(s)
                s = 'Phrases:  ' + data['phrase']
                self.ui.textEdit_2.append(s)
                self.ui.textEdit_2.append('\n')
                self.ui.textEdit_2.append('Translations : ')
                # translation results
                phr = data['tuc']
                for item in range(0, len(phr)):
                    try:
                        s = data['tuc'][item]['phrase']['text'] + ' , '
                        self.ui.textEdit_2.append(s)
                    except:
                        pass

            self.ui.textEdit_2.append(
                '\n\n\n Translation + Sample Sentence\n\n')
            r2 = requests.get(
                'http://glosbe.com/gapi/translate?from=%s&dest=%s&format=json&tm=true&phrase=%s&pretty=true'
                % (orglang, destlang, theWord))
            try:
                data = json.loads(r2.text)
                # translation results
                phr = data['tuc']
                #pprint(phr)
                if len(phr) == 0:
                    self.ui.textEdit_2.append(
                        '      Nothing returned from Glosbe')
                else:
                    # for formatting precalc maxl
                    maxl = 0
                    ll = 0
                    for item in range(0, len(phr)):
                        try:
                            if destlang == 'jpn' or destlang == 'zh' or destlang == 'rus':
                                ll = len(
                                    str(phr[item]['phrase']['text']).rstrip(
                                        ' '))
                            else:
                                ll = len(
                                    str(phr[item]['phrase']['text']).encode(
                                        'UTF-8').rstrip(' '))
                            if ll > maxl:
                                maxl = ll
                        except:
                            ll = 10
                            pass

                    try:
                        self.ui.textEdit_2.append('\nPhrase/Meanings :\n')
                        if len(phr) == 0:
                            self.ui.textEdit_2.append(
                                '      Nothing returned from Glosbe')
                        else:
                            for item in range(0, len(phr)):
                                try:

                                    if destlang == 'jpn' or destlang == 'zh' or destlang == 'rus':
                                        if phr[item]['phrase']['text'] <> '':
                                            s = 'Phrase   : ' + phr[item][
                                                'phrase']['text'].replace(
                                                    '&#39;', "'").replace(
                                                        '&rsquo;',
                                                        "'").replace(
                                                            '&eacute;', '`')
                                            self.ui.textEdit_2.append(s)

                                            for itx in range(
                                                    0, len(phr[item])):
                                                try:
                                                    if phr[item]['meanings'][
                                                            itx]['text'] <> '':
                                                        s = 'Meaning  : ' + phr[
                                                            item]['meanings'][itx][
                                                                'text'].replace(
                                                                    '&#39;',
                                                                    "'"
                                                                ).replace(
                                                                    '&rsquo;',
                                                                    "'"
                                                                ).replace(
                                                                    '&eacute;',
                                                                    '`')
                                                        self.ui.textEdit_2.append(
                                                            s)
                                                except:
                                                    pass

                                    else:
                                        if phr[item]['phrase']['text'].encode(
                                                'UTF-8') <> '':
                                            s = 'Phrase   : ' + phr[item][
                                                'phrase']['text'].encode(
                                                    'UTF-8').replace(
                                                        '&#39;', "'").replace(
                                                            '&rsquo;',
                                                            "'").replace(
                                                                '&eacute;',
                                                                '`')
                                            self.ui.textEdit_2.append(s)

                                            for itx in range(
                                                    0, len(phr[item])):
                                                try:
                                                    if phr[item]['meanings'][
                                                            itx]['text'].encode(
                                                                'UTF-8') <> '':
                                                        s = 'Meaning  : ' + phr[
                                                            item]['meanings'][
                                                                itx]['text'].encode(
                                                                    'UTF-8'
                                                                ).replace(
                                                                    '&#39;',
                                                                    "'"
                                                                ).replace(
                                                                    '&rsquo;',
                                                                    "'"
                                                                ).replace(
                                                                    '&eacute;',
                                                                    '`')
                                                        self.ui.textEdit_2.append(
                                                            s)
                                                except:
                                                    pass
                                    self.aline()
                                except:
                                    pass

                    except:
                        #print 'Error in Phrase/Meanings'
                        #raise
                        pass

                    try:
                        self.ui.textEdit_2.append('\n')
                        if data['tuc'][item]['phrase']['text'] <> ' ':

                            for ite in range(0, len(data['examples'])):

                                if destlang == 'jpn' or destlang == 'zh' or destlang == 'rus':
                                    ss = data['examples'][ite][
                                        'second'].replace(
                                            '<strong class="keyword">', '')
                                    ss = ss.replace('</strong>', '').replace(
                                        '#', '').replace('|', '')

                                    sf = data['examples'][ite][
                                        'first'].replace(
                                            '<strong class="keyword">', '')
                                    sf = sf.replace('</strong>', '').replace(
                                        '#', '').replace('|', '')
                                    if sf <> '':
                                        s = '\nExamples for : ' + data['tuc'][
                                            item]['phrase']['text']
                                        self.ui.textEdit_2.append(s)
                                        s = sf
                                        self.ui.textEdit_2.append(s)
                                        self.ui.textEdit_2.append(ss)

                                else:
                                    ss = data['examples'][ite][
                                        'second'].encode('UTF-8').replace(
                                            '<strong class="keyword">', '')
                                    ss = ss.replace('</strong>', '').replace(
                                        '#', '').replace('|', '')

                                    sf = data['examples'][ite][
                                        'first'].replace(
                                            '<strong class="keyword">', '')
                                    sf = sf.replace('</strong>', '').replace(
                                        '#', '').replace('|', '')
                                    if sf.encode('utf-8') <> '':
                                        s = '\nExamples for : ' + data['tuc'][
                                            item]['phrase']['text']
                                        self.ui.textEdit_2.append(s)
                                        s = sf.encode('utf-8')
                                        self.ui.textEdit_2.append(s)
                                        self.ui.textEdit_2.append(ss)

                    except:
                        #raise
                        pass

            except:

                #raise
                self.ui.textEdit_2.append(
                    'JSon Error, maybe no data retrieved')
                self.ui.textEdit_2.append('Re-try')

            end = time.clock()
            s = 'Request duration : ' + str(end - start) + ' secs'
            self.ui.label_4.setText(s)
            self.ui.label_3.setText('Finished GLOSBE request ...')

        elif self.ui.comboBox.currentText() == 'WEBLIO':

            start = time.clock()

            try:
                self.oldcurrLine = self.lineNumber2()
                s = Weblio()
                lt = -1
                theText = theWord
                # we need to limit the length: testing with 80
                lt = len(theText)

                #firstLine     =  self.lineNumber2()
                #print 'FirstLine :',firstLine
                #firstPosition =  self.ui.textEdit_3.textCursor().position()
                #print 'Firstpos  :',firstPosition

                if (lt > 0) and (lt < 80):
                    self.ui.textEdit_2.append('\nWeblio Results')
                    self.ui.textEdit_2.append('for:')
                    self.ui.textEdit_2.append(theText)
                    self.ui.textEdit_2.append('---------------------\n')
                    #x = self.ui.spinBox_2.value()  # how many items to fetch
                    # here we hardset to 10 examples max
                    x = 10
                    res = s.examples(theText, x)  # ok
                    key = 0
                    for dx in res:
                        rx = self.remove_comments2(dx[1], '<!--')
                        rx2 = self.remove_comments2(dx[2], '<!--')
                        rx3 = self.doMecab(rx2)
                        key += 1
                        # now we check if we are ascii and print accordingly
                        try:
                            theText.decode('ascii')
                        except:  # we selected a japanese text
                            rx4 = self.doMecab(rx)
                            # oks=str(key)+' : '+rx+'   '+rx2.encode('utf8')+'  '+kata2hira(rx4).decode('utf8')
                            oks = str(key) + ' : ' + rx
                            self.ui.textEdit_2.append(oks)
                            oks = str(key) + ' : ' + rx2.encode('utf8').strip(
                                '\n')
                            self.ui.textEdit_2.append(oks)
                            oks = str(key) + ' : ' + kata2hira(rx4).decode(
                                'utf8')
                            self.ui.textEdit_2.append(oks)

                        else:  # we selected a english text
                            oks = str(key) + ' : ' + rx.encode('utf8')
                            self.ui.textEdit_2.append(oks)
                            oks = str(key) + ' : ' + rx2
                            self.ui.textEdit_2.append(oks)
                            oks = str(key) + ' : ' + kata2hira(rx3).decode(
                                'utf8')
                            self.ui.textEdit_2.append(oks)

                    self.ui.textEdit_2.append('---------------------\n')

                else:
                    if lt > 1:
                        oks = 'Weblio line is too long. Length : %i' % lt
                        self.ui.textEdit_2.append(oks)
                        self.ui.textEdit_2.append('---------------------\n')
            except:

                pass

            # we try to jump to the beginning of the latest weblio data

            # for time being just jump to bottom
            self.ui.textEdit_2.moveCursor(QTextCursor.End)
            currLine = self.lineNumber2()
            mv = currLine - self.oldcurrLine
            # now move
            #print '\nOldCurrLine :',self.oldcurrLine
            #print 'CurrLine    :',currLine
            #print 'mv          :',mv
            #print '\n'
            for j in range(0, mv):
                self.ui.textEdit_2.moveCursor(
                    QTextCursor.Up)  #,QTextCursor.MoveAnchor)
                nowLine = self.lineNumber2()
                #print 'NowLine : ',nowLine

            self.oldcurrLine = currLine
            self.ui.label_3.setText('Finished Weblio request ....')
            end = time.clock()
            s = 'Request duration : ' + str(end - start) + ' secs'
            self.ui.label_4.setText(s)

        elif self.ui.comboBox.currentText() == 'JDICT':
            start = time.clock()

            # try translate from JDic
            # while ok it cud be faster
            self.ui.textEdit_2.clear()
            self.JDictToggleFlag = True
            if self.JDictToggleFlag == True:

                try:
                    jdictranslations = JDic().lookup(unicode(theWord))
                    sl = 0
                    jdi = 0
                    # nrset used as divider for linefeed inserts below
                    nrset = [
                        '(1)', '(2)', '(3)', '(4)', '(5)', '(6)', '(7)', '(8)',
                        '(9)', '(10)', '(11)', '(12)', '(13)', '(14)', '(15)',
                        '(16)', '(17)', '(18)', '(19)', '(20)'
                    ]

                    for key in jdictranslations.keys():
                        sl += 1  # if no key than we never come here
                        if jdi == 0:  # only append once per loop
                            self.ui.textEdit_2.append("\nJDic Info : \n")
                            jdi = 1

                        # this gives a wider view but still messy
                        #self.ui.textEdit_2.append(jdictranslations[key]+"\n")

                        # this gives a more readable view
                        tt = ''
                        for xs in jdictranslations[key]:
                            if xs <> ";":  # only one space or semicolon or we get empty stuff
                                tt = tt + xs
                            else:
                                # below code by trial and error to have a readable representation
                                tt = tt.replace(
                                    ' 	',
                                    ' ')  # note this is a space and a tab
                                tt = tt.replace('\n',
                                                '')  # get rid of linefeeds
                                tt = tt.replace(
                                    '  ', '\n'
                                )  # insert a linefeed if there are 2 spaces
                                tt = tt + ';'  # add the semicolon back
                                for nx in nrset:
                                    # iterate over our nrset and insert linefeeds for better readability
                                    tt = tt.replace(nx, '\n' + nx + '\n')

                                self.ui.textEdit_2.append(tt)
                                tt = ''

                    if sl == 0:
                        self.ui.textEdit_2.append('No info from JDict for ' +
                                                  theWord)

                except:
                    # occasional NoneType objects will occur, so we just skip them
                    pass

                finally:
                    # give a notice if run finished
                    self.ui.textEdit_2.append('JDict-Finished')

                    #TODO:: try similar wordnet , wordnet has quota so may not work as wanted

            end = time.clock()
            s = 'Request duration : ' + str(end - start) + ' secs'
            self.ui.label_4.setText(s)
Пример #31
0
# Parse every file in file_list line by line, pulling a (kanji, freq, kana)
# triple out of each line and expanding "{a/b}rest" style kana alternatives.
# NOTE(review): file_list, codecs, kata2hira and temp_obj are defined outside
# this snippet; the exact input line format is not visible here.
total_freq = 0

for file_name in file_list:
    print file_name

    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line in f:
            # Whitespace-separated tokens of the current line.
            arr = line.split()

            # base ends up as the index of the LAST token whose first
            # character is a digit; it stays 0 when no token qualifies.
            base = 0
            for i, e in enumerate(arr):
                if e[0] <= '9' and e[0] >= '0':
                    base = i
            # Fields are located relative to the numeric token:
            #   token before it, minus its first character   -> kanji
            #   the token itself, minus its last two chars    -> freq
            #   two tokens after it, minus its last character -> kana
            # (presumably stripping surrounding punctuation -- TODO confirm
            # against the real file format).
            # NOTE(review): with base == 0, arr[base-1] is arr[-1] (the last
            # token); an empty line or a short line raises IndexError here.
            kanji = arr[base-1][1:]
            freq = arr[base][:-2]
            kana = kata2hira(arr[base+2][:-1])

            print kanji, freq, kana

            if len(kana) == 0:
                # Nothing to record for this line.
                continue
            elif len(kana) > 1 and kana[0] == '{':
                # "{a/b}tail": split the text between '{' and the first '}'
                # on '/', then append the text after '}' to each non-empty
                # alternative. kana.index('}') raises ValueError if the
                # closing brace is missing.
                ks = kana[1:kana.index('}')].split('/')
                ks = [k + kana[kana.index('}')+1:] for k in ks if len(k) > 0]
            else:
                # Single reading.
                ks = [kana, ]

            # freq was a string slice until now; convert once before use.
            freq = int(freq)
            for k in ks:

                if k not in temp_obj:
Пример #32
0
    def getExamplesKana(query):
        """Return JishoClient.getExamples() for the kana reading of *query*.

        The readings produced by MecabTool.parseToReadingsKana(query) are
        joined into a single string, passed through kata2hira, and the
        result is used as the lookup key.
        """
        # NOTE: it works but is slightly incorrect (mecab shenanigans).
        readings = MecabTool.parseToReadingsKana(query)
        joined_reading = ''.join(readings)
        lookup_key = kata2hira(joined_reading)
        return JishoClient.getExamples(lookup_key)
  
#test = JishoClient.getExamplesKana(u'軈て')
#print '\n'.join(test)