def pos_tag(sentence, kuro_server):
    tagged_words = []
    kuromoji = kuro_server.kuromoji
    tokenizer = kuromoji.Tokenizer.builder().build()
    tokens = tokenizer.tokenize(sentence)
    for x in tokens:
        # [base form (falling back to surface), POS tags, surface, reading]
        elem = [
            x.getBaseForm()
            if x.getBaseForm() is not None else x.getSurfaceForm(),
            pos_interpreter(x.getPartOfSpeech()),
            x.getSurfaceForm(),
            x.getReading(),
        ]
        ru_or_u = ru_or_u_verb(elem)
        if ru_or_u:
            tagged_words.append([
                elem[0], [*elem[1], ru_or_u], elem[2],
                romkan.to_roma(elem[0]), elem[3],
                romkan.to_roma(elem[3])
            ])
        else:
            try:
                tagged_words.append([
                    elem[0], elem[1], elem[2],
                    romkan.to_roma(elem[0]) if elem[0] else "", elem[3],
                    romkan.to_roma(elem[3])
                ])
            except TypeError:
                # Tokens without a reading (elem[3] is None) cannot be
                # romanized; skip them.
                pass
    return tagged_words
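Example #2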
    def maybe_potential_form(self) -> Optional[str]:
        pos = self.pos_str()
        surface = self.surface()

        maybe_dform = None

        if (pos[0] == "v" and len(self.morphemes) == 1
                and self.morphemes[0].dictionary_form() == surface
                and romkan.to_roma(surface).endswith("eru")
                and not jmdict_lookup(surface).entries):
            suf = romkan.to_hiragana(romkan.to_roma(surface[-2:])[:-3] + "u")
            maybe_dform = surface[:-2] + suf

        elif (pos[0] == "v"
              and romkan.to_roma(self.morphemes[0].surface()).endswith("e")
              and not jmdict_lookup(surface).entries):
            suf = romkan.to_hiragana(
                romkan.to_roma(self.morphemes[0].surface()[-1])[:-1] + "u")
            maybe_dform = self.morphemes[0].surface()[:-1] + suf

        if not maybe_dform:
            return

        maybe_pos: SudachiPos = parse(maybe_dform)[0].part_of_speech()

        if (surface not in merge_multi_dicts([
                flip_multi_dict(m)
                for m in all_conjugations(maybe_dform, maybe_pos).values()
        ]).keys()):
            return

        if not jmdict_lookup(maybe_dform).entries:
            return

        return maybe_dform
Example #3
def slugify(data: str) -> str:
    slug = "".join(s for s in data.strip() if s in ALLOWED)
    # if it's just a date with underscores or spaces
    if set(slug.strip()).issubset(SLUG_DIGITS):
        # try to convert Japanese text to romaji to prevent image clashes
        slug = "".join(s for s in romkan.to_roma(data).strip() if s in ALLOWED)
    return slug.replace(" ", "_").casefold().strip()
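A hedged usage sketch: ALLOWED and SLUG_DIGITS are module constants not shown above, so the definitions below are illustrative assumptions.

import romkan

# Illustrative assumptions for the module constants:
ALLOWED = set("abcdefghijklmnopqrstuvwxyz"
              "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _")
SLUG_DIGITS = set("0123456789 _")

# Kana-only input survives only as digits/underscores, so the romaji
# fallback kicks in and prevents name clashes:
print(slugify("2021_05 ねこ"))  # -> "2021_05_neko"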
Example #4
 def __init__(self, latin="", romaji="", kana="", display_name="", type="",
              hanabira_setsumei="", hana_tsukikata="", ha_tsukikata="",
              ha_katachi="", kyoshi="", iro="", hanabira_kazu=None,
              shokudoku=None, kaki=None, seiikubasho=None, bunpu=None,
              kishibe_type=None):
     # None defaults for the list-valued arguments: mutable default values
     # would be shared across every instance.
     kana = try_unicode(kana)
     if not romaji and kana:
         romaji = romkan.to_roma(kana).replace("n'", "nn")
     elif not kana and romaji:
         kana = romkan.to_kana(romaji)
     self.romaji = romaji
     self.kana = kana
     self.latin = latin
     self.display_name = display_name if display_name else self.kana
     self.kaki = kaki if kaki else []
     self.bunpu = bunpu if bunpu else []
     self.kishibe_type = kishibe_type
     self.seiikubasho = seiikubasho if seiikubasho else []
     self.type = type
     self.hanabira_kazu = hanabira_kazu if hanabira_kazu else [0]
     self.hanabira_setsumei = hanabira_setsumei
     self.shokudoku = shokudoku
     self.hana_tsukikata = hana_tsukikata
     self.ha_tsukikata = ha_tsukikata
     self.ha_katachi = ha_katachi
     self.iro = iro
     self.kyoshi = kyoshi
Example #5
def findMatch(line):
	alphabets = []
	nonalphabets = []
	romanized = []
	words = filter(lambda word : word != '' and not re.search(r'\d', word),line.split(' '))
	for word in words:
		if re.search('[a-zA-Z]' ,word) and len(word) > 1:
			alphabets.append(word)
		elif is_katakana(word):
			nonalphabets.append(word)
			romanized.append(romkan.to_roma(word))
	dim = (len(alphabets), len(romanized))
	similarity = numpy.zeros(dim)		
	for i in range(len(alphabets)):
		for j in range(len(romanized)):
			alphabet_len = len(alphabets[i])
			romanized_len = len(romanized[j])
			max_len = max(alphabet_len, romanized_len)
			similarity[i][j] = numpy.linalg.norm(vectorize(alphabets[i], max_len)-vectorize(romanized[j], max_len))
			# similarity[i][j] = distance.euclidean(vectorize(alphabets[i], max_len),vectorize(romanized[j], max_len))
	ans = []
	for i in range(min(dim[0], dim[1])):
		row_index = similarity.argmin() // similarity.shape[1]  # integer division: used as a list index
		col_index = similarity.argmin() % similarity.shape[1]
		ans.append((alphabets[row_index],nonalphabets[col_index], similarity[row_index,col_index],line))
		del alphabets[row_index]
		del nonalphabets[col_index]
		similarity = numpy.delete(similarity, row_index, 0)
		similarity = numpy.delete(similarity, col_index, 1)
	return ans
Example #6
    def kanji_to_romaji(self, text):
        convert = self.conv

        hiragana_text = convert.do(text)
        romaji_text = romkan.to_roma(hiragana_text)
        
        return (hiragana_text, romaji_text)
Example #7
def findMatch(line):
    alphabets = []
    nonalphabets = []
    romanized = []
    words = filter(lambda word: word != '' and not re.search(r'\d', word),
                   line.split(' '))
    for word in words:
        if re.search('[a-zA-Z]', word) and len(word) > 1:
            alphabets.append(word)
        elif has_katakana(word):
            nonalphabets.append(word)
            romanized.append(romkan.to_roma(word))
    dim = (len(alphabets), len(romanized))
    similarity = numpy.zeros(dim)
    for i in range(len(alphabets)):
        for j in range(len(romanized)):
            similarity[i][j] = distance.euclidean(vectorize(alphabets[i]),
                                                  vectorize(romanized[j]))
    ans = []
    if dim[1] > 0:
        for i in range(dim[0]):
            if min(similarity[i, :]) < 0.5:
                j = numpy.argmin(similarity[i, :])
                ans.append((alphabets[i], nonalphabets[j]))
    return ans
Example #8
def map_dict_form_to_different_ending(verb, romaji_ending, *special_endings):
    '''Generate the Godan verb stem and attach the correct ending based on the
    verb's last kana.

    Args:
        verb (str): Japanese verb in kana; may contain kanji
        romaji_ending (str): target sound of the ending to append to the verb
        *special_endings: variable-length list of special-case endings for
            verbs ending in う / つ / す (in that order), chosen per target
            Godan ending class (-a, -e, -i, -o)

    Returns:
        str: verb stem with the correct ending attached, selected by the
        Godan verb's final kana
    '''
    last_kana = splice_verb(verb, VerbClass.GODAN, False)
    verb_stem = splice_verb(verb, VerbClass.GODAN)

    if last_kana == U_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[0])
    elif last_kana == TSU_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[1])
    elif last_kana == SU_PARTICLE:
        return "{}{}".format(verb_stem, special_endings[2])
    else:
        transformed_last_kana_as_romaji = "{}{}".format(
            romkan.to_roma(last_kana)[:-1], romaji_ending)
        return "{}{}".format(
            verb_stem, romkan.to_hiragana(transformed_last_kana_as_romaji))
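A minimal sketch of the kana transformation in the default branch (the verb-splicing helpers and particle constants are defined elsewhere in the module); only the romkan round trip is shown:

import romkan

# Mapping a Godan verb's last kana く (ku) to the -a ending class yields か (ka):
last_kana = "く"
romaji_ending = "a"
transformed = romkan.to_roma(last_kana)[:-1] + romaji_ending  # "k" + "a"
print(romkan.to_hiragana(transformed))  # -> か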
Example #9
def output_text_per_hom(OutJsonFP, Max=3000):
    def is_of_interest(Orths):
        #Bool=True
        KanjiOnly = False
        # at least 2 variations in orth
        if len(Orths) < 2:
            return False, None
        # at least two of the orths must contain kanji
        else:
            KanjiOrths = [
                Orth for Orth in Orths
                if myModule.at_least_one_of_chartypes_p(Orth, ['han'])
            ]
            if len(KanjiOrths) < 2:
                return False, None
            elif len(KanjiOrths) == len(Orths):
                KanjiOnly = True
        return True, KanjiOnly

    LCnt = 7900  #myModule.get_linecount(OutJsonFP)
    TxtDir = os.path.join(os.path.dirname(OutJsonFP),
                          os.path.basename(OutJsonFP) + '_txt')
    if not os.path.isdir(TxtDir):
        os.mkdir(TxtDir)
    CntSoFar = 0
    Cntr = 0
    CntThresh = LCnt / 1000
    with open(OutJsonFP) as FSr:
        #print('retrieving homvecs for '+Hom+'...')
        while FSr and Cntr < Max:  # stop once Max homs have been written
            Ret = get_hom_in_file(FSr, OutJsonFP, FstPosition=CntSoFar)
            if Ret:
                FSr, OrthsVecs, Hom, Cnt, MultiToks = Ret
            else:
                break
            #except:
            #    get_hom_in_file(FSr,OutJsonFP,FstPosition=CntSoFar)
            Orths = list(OrthsVecs.keys())
            print('For ' + Hom + ', we found the following orths, ' +
                  str(Cnt) + ' items')
            print(Orths)
            print(CntThresh)
            IsOfInt, KanjiOnly = is_of_interest(Orths)
            if not (Cnt > CntThresh and len(Hom.split(':')[0]) >= 2
                    and IsOfInt):
                print('not selected for printing\n')
            else:
                print('writing out...')
                RomHom = romkan.to_roma(Hom)
                OutHomFP = os.path.join(TxtDir, 'homvecs_' + RomHom)
                with open(OutHomFP, 'wt') as FSw:
                    FSw.write(stringify_hom_vecs(OrthsVecs))
                    print('... done, fp: ' + OutHomFP)
                if KanjiOnly:
                    RefClusterFP = OutHomFP + '.refclusters'
                    with open(RefClusterFP, 'wt') as FSw:
                        FSw.write('\t'.join(get_cluster_ref(OrthsVecs)))
                CntSoFar += Cnt
                Cntr += 1
Example #10
 def verify_reading(self, guess, readings=None):
     guess = romkan.to_roma(romkan.to_kana(guess.replace(' ', '')))
     if not readings:
         readings = set(
             Association.objects.filter(
                 expression=self.expression).values_list('reading',
                                                         flat=True))
     readings = map(romkan.to_roma, readings)
     return guess in readings
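The double conversion on the first line normalizes romanization variants before comparison; a small illustration, independent of the Django model:

import romkan

# Kunrei-style "si" and Hepburn-style "shi" normalize to the same string:
print(romkan.to_roma(romkan.to_kana("sinbun")))   # -> shinbun
print(romkan.to_roma(romkan.to_kana("shinbun")))  # -> shinbun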
Example #11
 def make_roma_subname(name_ja):
     subname = re.sub(r'[=]', '', name_ja)
     subname = re.sub(r'[「」]', '・', subname)
     adjusted_subname = ''
     for part in subname.split('・'):
         roma_part = romkan.to_roma(part)
         if part != roma_part and not contains_ja(roma_part):
             adjusted_subname += ' ' + roma_part.strip('-')
     return adjusted_subname.strip()
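A hedged usage sketch; contains_ja is a helper from the surrounding module, so the regex-based stand-in below is an assumption:

import re
import romkan

def contains_ja(s):
    # Hypothetical stand-in: True if kana or kanji remain after romanization.
    return re.search(r'[ぁ-んァ-ヶ一-龯]', s) is not None

print(make_roma_subname("ナルト「うずまき」"))  # -> "naruto uzumaki"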
Example #12
    def calculatePartsKana(self, aString):
        s = set()

        for e in aString.split(" "):
            # Convert each space-separated element to katakana, then union
            # its parts into the result set.
            kana = romkan.to_katakana(romkan.to_roma(e))
            s |= self.calculatePartsElement(kana)

        return self.serializeSet(s)
Example #13
def get_romaji(sent):
    t = Tokenizer()
    readings = ""
    for token in t.tokenize(sent):
        fields = regex.split("[\t,]", str(token).decode('utf8'))
        surface, reading = fields[0], fields[-2]
        reading = surface if reading == "*" else reading
        readings += reading
    romaji = romkan.to_roma(readings)
    return romaji
Example #14
    def clean(self):
        if not self.hiragana and not self.romaji:
            raise ValidationError(
                "You have to enter either the Hiragana or Romaji of a Word")
        if not self.hiragana:
            self.hiragana = romkan.to_hiragana(self.romaji)
        elif not self.romaji:
            self.romaji = romkan.to_roma(self.hiragana).capitalize()

        self.slug = slugify(self.romaji)
Example #15
def lemmatize_with_mecab(expression, kanji):
    '''Find the first word containing kanji; return (lemma, reading).'''
    nodes = mecab_tagger.parseToNode(expression)
    while nodes:
        features = nodes.feature.split(',')
        if kanji in features[10]:
            lemma = features[10]
            reading = romkan.to_hiragana(romkan.to_roma(features[6]))
            return ((lemma, reading))
        nodes = nodes.next
    raise (ValueError("Mecab failed: %s, %s" % (expression, kanji)))
Example #17
def split_alpha(line):
    out = []
    for char in line:
        if char in ('ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ャ', 'ュ', 'ョ'):
            if out:
                out[-1] += char
            else:
                out.append(char)
        else:
            out.append(char)
    # Re-insert ン (from "N") and ッ (from "q") markers, pairing each kana
    # chunk with its romanization.
    return [("ッ".join("ン".join(kana.split("N")).split("q")),
             romkan.to_roma(kana)) for kana in out]
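A small, hedged example (the small kana ャ is merged into the preceding character before romanization):

print(split_alpha("キャベツ"))
# -> [('キャ', 'kya'), ('ベ', 'be'), ('ツ', 'tsu')]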
Example #18
def index():
    text = request.query.getunicode('text', '')
    words = mc.parse(text).strip().split(' ')
    pronunciation = [jmdict.get(w, ([], []))[0] for w in words]
    english = [jmdict.get(w, ([], []))[1] for w in words]
    romaji = [[romkan.to_roma(w) for w in p] for p in pronunciation]

    return {"text": text,
            "words": words,
            "pronunciation": pronunciation,
            "romaji": romaji,
            "english": english}
Example #20
 def listen_japanese(self):
     pygame.mixer.init()
     for i, item in enumerate(self.get_japanese_items()):
         file_name = '%s.mp3' % romkan.to_roma(item.text().replace(' ', '_'))
         path = os.path.join(self.window().japanese_path(), file_name)
         self.soundfiles.append(path)
         if not os.path.exists(path):
             tts = gTTS(text=item.text(), lang='ja')
             tts.save(path)
         pygame.mixer.music.load(path)
         pygame.mixer.music.play()
         while pygame.mixer.music.get_busy():
             pygame.time.Clock().tick(20000)
Example #21
def slugify(text):
    """Version of slugify that supports Japanese characters"""
    if not text:
        return ""
    slug = django_slugify(text)
    if not slug:
        # Title may be in Japanese
        slug = django_slugify(romkan.to_roma(text))
    if not slug:
        # Title may be in Chinese
        pinyin = Pinyin()
        slug = django_slugify(pinyin.get_pinyin(text))
    return slug[:50]
Example #22
def train(features):
    print "START TRAINING"
    worddic = dict()
    for linenum, line in enumerate(features):
        items = line.strip().split("\t")
        string = items[0].decode("utf8")
        string_roma = unicodedata.normalize("NFKC", romkan.to_roma(string))
        freq = float(items[1])
        worddic[string_roma] = freq
        if linenum % 10000 == 0:
            print "{:>2}%".format(linenum / 10000)
    print "FINISH TRAINING\n"
    return worddic
Example #23
def read_kanjidic(filepath):
    """Given path to kanjidic file, returns a dictionary of character readings by
       language, eg: char_dict["犬"] >> (['quan3', 'quan2'], ['gyeon'], ['ken'])
    """
    char_dict = {}  # Should have 6355 characters
    with open(filepath, encoding="u-jis") as f:
        for line in f:
            han = re.findall(re_han, line)
            if len(han) == 1:  # Skip non dictionary entry lines
                char = han[0]  # Character itself
                mandarin = re.findall(re_mandarin, line)
                hanja = re.findall(re_hanja, line)
                # Note: In Japanese, some characters have on-yomi but not kun-yomi, and vice-versa
                jp_onyomi = re.findall(re_katakana,
                                       line)  # Sino-japanese reading(s)
                jp_kunyomi = re.findall(re_hiragana,
                                        line)  # Native japanese reading(s)
                # Convert to Latin alphabet
                jp_onyomi = [romkan.to_roma(x) for x in jp_onyomi]
                jp_kunyomi = [romkan.to_roma(x) for x in jp_kunyomi]
                # Fix things like 瓩:キログラム being interpreted as onyomi b/c katakana usage
                for x in list(jp_onyomi):  # iterate over a copy; the list is mutated below
                    if len(x) > 6:
                        jp_kunyomi.append(x)
                        jp_onyomi.remove(x)
                # Remove leading identifier character, eg: Ywo3 -> wo3
                hanja = [x[1:] for x in hanja]
                mandarin = [x[1:] for x in mandarin]
                # Provide dummy values if one training language is missing a reading
                # eg: Learn mandarin pronunciation from just the hanjul
                # (Assumes Mandarin is training objective)
                if len(hanja) < 1:
                    hanja = ["*"]
                if len(jp_onyomi) < 1:
                    jp_onyomi = ["*"]
                char_dict[char] = (mandarin, hanja, jp_onyomi
                                   )  # Don't care about kunyomi
    return char_dict
Example #24
def get_inserts(max_chunk=10000):
    inserts = []
    parser = Parser(PATH_TO_EDICT2)
    i = 0
    for e in parser.parse():
        i += 1
        e['english'] = [g['english'] for g in e['glosses']]
        e['romaji'] = romkan.to_roma(e['furigana'])
        e['common_boost'] = 2.0 if e['common'] is True else 1.0
        inserts.append(e)
        if i % max_chunk == 0:
            yield inserts
            inserts = []
    yield inserts
Example #25
 def default_to_hiragana(self, row, col):
     items = self.tableWidget.selectedItems()
     if items:
         for item in items:
             if item.column():
                 if not self.lang_mode:
                     item.setText(romkan.to_hiragana(item.text()))
     item = self.tableWidget.item(row, col)
     if item:
         self.page_data['%s,%s' % (row, col)] = item.text()
         if col:
             self.page_data['%s,%s' % (row, col)] = romkan.to_roma(item.text())
     self.data['page_%s' % self.window().pageLab.text()] = self.page_data
     self.update_config(self.data)
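Example #27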
def ru_or_u_verb(pos_tags):
    if 'verb' in pos_tags[1]:
        if pos_tags[0] in ["する", "くる"]:
            return "exception"
        if pos_tags[0][-1] != "る":
            return 'u-verb'
        roma = romkan.to_roma(pos_tags[0].replace(pos_tags[2], pos_tags[3]))
        if roma[-3] not in ['i', 'e']:
            return 'u-verb'
        else:
            if pos_tags[0] not in ru_verb_exceptions:
                return 'ru-verb'
            else:
                return 'u-verb'
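A hedged sketch of the classifier on pos_tag-style tuples of [base form, POS tags, surface, reading]; ru_verb_exceptions is assumed to be a set of lexical exceptions such as 帰る:

import romkan

ru_verb_exceptions = {"帰る", "走る", "入る"}  # illustrative subset

print(ru_or_u_verb(["食べる", ["verb"], "食べる", "タベル"]))  # -> ru-verb
print(ru_or_u_verb(["帰る", ["verb"], "帰る", "カエル"]))      # -> u-verb (lexical exception)
print(ru_or_u_verb(["する", ["verb"], "する", "スル"]))        # -> exception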
Example #28
def to_romaji(token):
	replace_dict = {'': ['a', 'e', 'i', 'o', 'u'],
					'b': ['v'],
					'p': ['f', 'h'],
					'c': ['k'],
					'l': ['r'],
					's': ['z'],
					'g': ['j']
					}
	token = re.sub('[^a-z]', '', romkan.to_roma(token).lower())
	for newtokens, tokens in replace_dict.items():
		for oldtoken in tokens:
			token = token.replace(oldtoken, newtokens)
	return token
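The replacements collapse transcription variants into one phonetic key, so different katakana spellings of the same loanword match; a hedged illustration:

print(to_romaji("ヴァイオリン"))  # -> bln
print(to_romaji("バイオリン"))    # -> bln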
Example #29
def slugify(text):
    """Version of slugify that supports Japanese and Chinese characters"""
    if not text:
        return ""
    slug = django_slugify(text)
    if not slug:
        # Title may be in Japanese
        slug = django_slugify(romkan.to_roma(text))
    if not slug:
        # Title may be in Chinese
        pinyin = Pinyin()
        slug = django_slugify(pinyin.get_pinyin(text))
    if not slug:
        # Try transliterate which supports Cyryllic, Greek and other alphabets
        slug = django_slugify(translit(text, reversed=True))
    return slug[:50]
Example #31
def decompose_hiraganas(hiraganas):
    '''Decompose the hiragana str into consonants & vowels'''

    # Convert hiragana into romaji, then split the string
    # (e.g. "neko") into single letters (e.g. ["n", "e", "k", "o"]).
    alphabets = list(romkan.to_roma(hiraganas))

    # Partition without mutating the list while iterating over it.
    vowels = [a for a in alphabets if a in "aeiou"]
    consonants = [a for a in alphabets if a not in "aeiou"]

    return {"consonants": consonants, "vowels": vowels}
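A quick check of the decomposition:

print(decompose_hiraganas("ねこ"))
# -> {'consonants': ['n', 'k'], 'vowels': ['e', 'o']}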
Example #32
def phonemize(x):
    if x not in cachedPhonemization:
        phonemized = romkan.to_roma(x)
        if max([ord(y) for y in phonemized]) > 200:  # contains Kanji
            cachedPhonemization[x] = x
        else:
            if x.endswith("っ"):
                assert phonemized.endswith("xtsu")
                phonemized = phonemized.replace("xtsu",
                                                "G")  # G for `geminate'
            phonemized = phonemized.replace("ch", "C")
            phonemized = phonemized.replace("sh", "S")
            phonemized = phonemized.replace("ts", "T")
            cachedPhonemization[x] = phonemized
    phonemized = cachedPhonemization[x]
    return phonemized
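A hedged sketch of the special-casing, assuming cachedPhonemization is the module-level cache dict used above:

cachedPhonemization = {}

print(phonemize("ちょっと"))  # -> Cotto  (ch -> C; the geminate is already spelled "tt")
print(phonemize("あっ"))      # -> aG     (trailing っ romanized as "xtsu" -> G)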
Example #33
def scrape_japanese_definitions(html, max_results=3):
  '''
  Extract japanese kanji, kana, english definitions and parts of speech
  from html off jisho.org.
  Return the values as a list of strings.
  If nothing found, return None.
  '''
  results = []
  lookups = []
  try:
    soup = BeautifulSoup(html)
    kanji = soup.findAll('td', {'class': 'kanji_column'})
    kana = soup.findAll('td', {'class': 'kana_column'})
    engrish = soup.findAll('td', {'class': 'meanings_column'})
    if not kanji or not kana or not engrish:
      return None
    kanji = [' '.join(x.stripped_strings) for x in kanji]
    kana = [' '.join(x.stripped_strings) for x in kana]
    romaji = [romkan.to_roma(x) for x in kana]
    engrish = [elipsize(' '.join(x.stripped_strings)) for x in engrish]
    results = zip(kanji, kana, romaji, engrish)

    '''
    #before forming final definitions string list from these sublists
    #we'll remove definitions which have identical english meanings???
    results = []
    for i,definition in enumerate(definitions):
      if len(results>0) and definition[3] in results[:i-1][3]:
        pass
      else:
        results.append(definition)
    '''

    #form final results from zipped list and return 
    '''
    results = [u'{white}{kanji}{white}{kana}{white}{romaji}{white}{engrish}'.format( \
      white= foreground(u'black') + background(u'white'), \
      black=style(u'normal'), \
      kanji=(u' '+x[0]+u' '+style(u'normal')+u' ' if x[0] else u''), \
      kana=(u' '+x[1]+u' '+style(u'normal')+u' ' if x[1] else u''), \
      romaji=(u' '+x[2]+u' '+style(u'normal')+u' ' if x[2] else u''), \
      engrish=(u' '+x[3]+u' '+style(u'normal')+u' ' if x[3]else u'')) for x in results[:max_results]]
    '''
    lookups = [Lookup(x[0],x[1],x[2],x[3]) for x in results[:max_results]]
      
  except:
    log.err()
  return lookups
Example #34
 def __init__(
     self, latin="", ka="", zoku="", romaji="", kana="", rarity=3, display_name="", masuda=None, takatsu=None
 ):
     kana = try_unicode(kana)
     if not romaji and kana:
         romaji = romkan.to_roma(kana).replace("n'", "nn")
     elif not kana and romaji:
         kana = romkan.to_kana(romaji)
     self.romaji = romaji
     self.kana = kana
     self.masuda = masuda if masuda else []
     self.takatsu = takatsu if takatsu else []
     self.latin = latin if latin else ""
     self.ka = ka if ka else ""
     self.zoku = zoku if zoku else ""
     self.rarity = rarity if rarity else 3
     self.display_name = display_name if display_name else self.kana
Example #35
 def command_ja(self, event):
     '''Usage: ~ja <k/h/r> <arg> displays katakana/hiragana/romaji for a given argument, converting between romaji and kana'''
     try:
         dest, phrase = event.params.split(' ', 1)
         dest = dest.lower()
         if dest == 'k':
             resp = romkan.to_katakana(phrase)
         elif dest == 'h':
             resp = romkan.to_hiragana(phrase)
         elif dest == 'r':
             resp = romkan.to_roma(phrase.decode('utf-8'))
         else:
             raise ValueError('expected one of: k, h, r')
         self.send_message(event.respond, resp)
     except:
         self.send_message(event.respond, 'Invalid input, please check syntax.')
         raise
Example #37
def main(SMecabCorpusDir,
         HomStats,
         Model,
         ModelType='cbow',
         Window=5,
         UpToPercent=None,
         OutDir=None,
         DoRTextP=True):

    OutFNStem = os.path.basename(SMecabCorpusDir) + '_contexts_mvecs'
    OutFN = OutFNStem + '_' + ModelType + '.json'
    PickedTokenStatsFN = OutFNStem + '_pickedtokenstats.pickle'
    OutJsonFP, PickedTokenStatsFP = [
        (SMecabCorpusDir if OutDir is None else OutDir) + '/' + FN
        for FN in (OutFN, PickedTokenStatsFN)
    ]
    #    print('finding mean vectors for contexts...')
    myModule.ask_filenoexist_execute(
        [OutJsonFP, PickedTokenStatsFP], get_homs_contexts_mvecs,
        ([SMecabCorpusDir, HomStats, Model, Window], {
            'OutJsonFP': OutJsonFP
        }))

    if DoRTextP:
        TxtDir = os.path.join(os.path.dirname(OutJsonFP),
                              OutFNStem + '_txtfiles')
        if not os.path.isdir(TxtDir):
            os.mkdir(TxtDir)
        SortedP = json_sorted_p(OutJsonFP)
        HomsOrthsCnts = myModule.load_pickle(PickedTokenStatsFP)
        HomsCnts = sorted([(Hom, sum(OrthsCnts.values()))
                           for (Hom, OrthsCnts) in HomsOrthsCnts.items()])
        CntSoFar = 0
        for Cntr, (Hom, Cnt) in enumerate(HomsCnts):
            if Cntr > 1000:
                break
            OrthsVecs = get_hom_in_file(Hom,
                                        OutJsonFP,
                                        FstPosition=CntSoFar,
                                        AssumeSortedP=SortedP)
            RomHom = romkan.to_roma(Hom)
            OutHomFP = os.path.join(TxtDir, 'homvecs_' + RomHom)
            with open(OutHomFP, 'wt') as FSw:
                FSw.write(stringify_hom_vecs(OrthsVecs))
            CntSoFar += Cnt
Example #38
    def create_parts(self, sentence, romas):
        func = "_noname_"
        analyzer = CaboChaAnalyzer()
        tree = analyzer.parse(sentence)
        l = []
        mainPart = 0
        for chunk in tree:
            for token in chunk:
                kan = token.feature.split(',')[-2]
                if kan == '*':
                    kan = token.surface
                romas.append(romkan.to_roma(kan))
            if chunk.link == -1:
                mainPart = chunk.id
                func = self.get_first_token(chunk)
        for chunk in tree:
            curword = chunk.tokens[0].surface
            curfeature = chunk.tokens[0].feature
            feat = self.analyse_feature(curfeature)
            if feat == '@num' or feat == '@n':
                curword = self.join_tokens(chunk)
            elif feat == '@nc':
                curword = self.join_nc_tokens(chunk)
            elif feat == '@v':
                parts = curfeature.split(',')
                raw = parts[-3]
                if raw != '*':
                    curword = raw

            ## main part
            if chunk.link == -1:
                prefix = ""
                if feat == '@v':
                    prefix = "act:"
                elif feat == '@adj':
                    prefix = "desc:"
                elif feat == '@n':
                    prefix = "prop:"
                l.append(prefix + "*" + curword + feat)
            elif chunk.link == mainPart:
                l.append(self.get_prefix(chunk) + "+" + curword + feat)
            else:
                l.append("." + curword + feat)
        result = func + '(' + ", ".join(l) + ')'
        return result
Example #39
def hiragana_candidates(word, num):
    if not isinstance(word, unicode):  #unicode check
        word = word.decode("utf8")

    romaji = unicodedata.normalize("NFKC", romkan.to_roma(word))
    print "romaji:{}".format(romaji)

    candidates = prob(romaji) + edit1_prob(romaji) + edit2_prob(romaji)
    if candidates:
        for i, word_prob_tuple in enumerate(
                sorted(candidates, key=lambda x: x[1], reverse=True)[:num]):
            romaji = word_prob_tuple[0]
            p = word_prob_tuple[1]
            kana = romkan.to_hiragana(romaji).encode("utf8")
            print " {} : {:<10}{:<20} {:<}".format(i + 1, kana,
                                                   "(" + romaji + ")", p)
    else:
        print "NO RESULT"
Example #40
def getSongTitle(url):
	print(str(datetime.now()), ": Downloading song page", url)
	r = requests.get(url).text
	# Parse these formats, regardless of whitespace:
	# > 01 SONGNAME
	# > 1 SONGNAME
	# > 1. SONGNAME
	# > 01. SONGNAME
	# TODO: super fragile, replace with something more robust
	try:
		name = regex.findall(r'(?<=>[\s0]*1[.\s]+).+?(?=<)', r)[0]
	except Exception as e:
		print(url, e)
		name = "Unparsed"

	name = html.unescape(name.strip())
	name = romkan.to_roma(name)
	return name
Example #42
def findMatch(line):
    words = phrases.splitWord(line)
    alphabets = phrases.get_english_phrase(words)
    nonalphabets = phrases.generate_katakana_phrase(words)
    romanized = []
    for nonalphabet in nonalphabets:
        romanized.append(romkan.to_roma(nonalphabet))
    dim = (len(alphabets), len(romanized))
    similarity = numpy.zeros(dim)
    for i in range(len(alphabets)):
        for j in range(len(romanized)):
            similarity[i][j] = distance.euclidean(vectorize(alphabets[i]),
                                                  vectorize(romanized[j]))
    ans = []
    if dim[1] > 0:
        for i in range(dim[0]):
            if min(similarity[i, :]) < 0.5:
                j = numpy.argmin(similarity[i, :])
                ans.append((alphabets[i], nonalphabets[j], line))
    return ans
Example #44
def get_path(item_type, file_name=None, data=None):
    """Generates full path for the generated file using configuration
    and explicitly specified name or RSS item data. At least one argument
    should be specified. @file_name has higher priority during output
    path generation.

    Arguments:
        item_type -- 'post' or 'page'.
        file_name -- explicitly defined correct file name.
        data -- preprocessed RSS item data dictionary."""

    if not file_name and type(data) is not dict:
        raise Exception('File name or RSS item data dict should be defined')

    root = conf['dump_path']
    root = root.format(date=time.strftime(conf['file_date_fmt']),
                       year=time.strftime("%Y"),
                       month=time.strftime("%m"),
                       day=time.strftime("%d"),
                       source=os.path.basename(conf['source_file']))

    if file_name:
        relpath = file_name
    else:
        transf = CConvert()
        name = romkan.to_roma(transf.convert(data.get('title', '').strip()))
        name = name or data.get('post_id', UNTITLED)
        relpath = get_path_fmt(item_type, data)
        field = FIELD_MAP.get('post_date', 'post_date')
        post_date = data[field]
        relpath = relpath.format(year=time.strftime("%Y", post_date),
                                 month=time.strftime("%m", post_date),
                                 day=time.strftime("%d", post_date),
                                 name=name,
                                 title=name)

    return uniquify(os.path.join(os.path.abspath(root), relpath))
Example #45
    def to_hiragana(self):
        """Return the reading as hiragana, even if it's On.

        >>> k = Kanji('柔')
        >>> r = Reading(k, 'ニュウ')
        >>> r.to_hiragana()
        'にゅう'


        If it's not On, it's idempotent.
        >>> k = Kanji('最')
        >>> r = Reading(k, 'もっとも')
        >>> r.add_examples('最も')
        >>> r.reading
        'もっと.も'
        >>> r.to_hiragana()
        'もっと.も'

        """

        if self.kind == 'On':
            return(romkan.to_hiragana(romkan.to_roma(self.reading)))
        else:
            return(self.reading)
Example #46
            correct = roma[:i]
        elif i - 1 == current[1]:
            current[1] = i
        else:
            result += "[" + answer[current[0]:(current[1] + 1)] + "]" + answer[(current[1] + 1):i]
            correct += " " + roma[current[0]:(current[1] + 1)] + " " + roma[(current[1] + 1):i]
            current = [i, i]
    if current is not None:
        result += "[" + answer[current[0]:(current[1] + 1)] + "]" + answer[(current[1] + 1):]
        correct += " " + roma[current[0]:(current[1] + 1)] + " " + roma[(current[1] + 1):]
    return result, correct

while True:
    i = random.randint(0, len(words) - 1)  # randint is inclusive on both ends
    to_write = words[i]
    roma = romkan.to_roma(to_write).strip()
    if mode == 2:
        to_write = romkan.to_katakana(roma)+"\n"
    tries = 0
    while tries < max_tries:
        answer = input(to_write+"> ").strip()
        if answer == roma:
            print("\tcorrect!")
            break
        else:
            print("\tWRONG!")
            tries += 1
            if tries == max_tries:
                errors, correct = find_error(answer, roma)
                print("\tAnswer was "+correct+"\n\tYou wrote: "+errors)
Example #47
 def to_romanji(self):
     self.lang_mode = 2
     for item in self.get_japanese_items():
         item.setText(romkan.to_roma(item.text()))
Example #48
    def multiscrape(self, name, shy=False):
        if shy and self.config.has_key(name) and self.config[name]['ja']:
            return
        if name != u'名前' and name != u'ふりがな':
            nodes = self.root.xpath("//_:li/_:strong[contains(text(), '%s')]/following-sibling::_:ul/_:li|//_:h4[contains(text(), '%s')]/following-sibling::_:p" % (name, name), namespaces=NS)
        else:
            nodes = self.root.xpath("//_:h3", namespaces=NS)
        if not nodes:
            return

        iterator = nodes[0].itertext()
        val = ''
        l = []
        while 1:
            try:
                val = iterator.next()
                val = re.sub(u'^[  \r\n]+', '', val)
                val = re.sub(u'[  \r\n]+$', '', val)
                if val:
                    l.append(val)
            except:
                break

        val = re.sub('^[  \n]*(.*?)[  \n]*$', '\\1', '\n'.join(l))
        
        val = val.strip()
        val = makeHankaku(val)
        
        if name == u'名前':
            lst = val.split('\n')
            if not self.config.has_key(name):
                self.config[name] = {}
            self.config[name]['ja'] = lst[0]
        elif name == u'ふりがな' and not shy:
            if not self.config.has_key(u'名前'):
                self.config[u'名前'] = {}
            lst = val.split('\n')
            if len(lst) > 1:
                suzure = lst[1].replace(u' ', '').replace(' ', '')
                self.config[u'名前']['kana'] = lst[1]
                self.config[u'名前']['en'] = titleCase(lst[1])
                self.config[u'並べ替え']['ja'] = romkan.to_katakana(romkan.to_kunrei(suzure))
                self.config[u'並べ替え']['en'] = romkan.to_roma(suzure)
            else:
                self.config[u'名前']['kana'] = ''
        elif name == u'所属':
            if not self.config.has_key(u'所属'):
                self.config[u'所属'] = {}
            if self.staffType == 'LS' or self.staffType == 'PRO':
                self.config[u'所属']['ja'] = u'法科大学院'
                self.config[u'所属']['en'] = 'Law School (professional course)'
        elif name == u'役職':
            if not self.config.has_key(u'役職'):
                self.config[u'役職'] = {}
            self.config[u'役職']['ja'] = getPostJapanese(val)
            self.config[u'役職']['en'] = getPostEnglish(self.config[u'役職']['ja'])
            
        elif name == u'所属学会':
            if not self.config.has_key(u'学会'):
                self.config[u'学会'] = {}
            if len(val.split('\n')) > 1:
                self.config[u'学会']['ja'] = val.split('\n')
            else:
                self.config[u'学会']['ja'] = val
        elif name == u'教員からのメッセージ':
            if not self.config.has_key(u'法科大学院メッセージ'):
                self.config[u'法科大学院メッセージ'] = {}
            self.config[u'法科大学院メッセージ']['ja'] = val.split('\n')
        elif name == u'リンク':
            for node in nodes:
                subnode = node.xpath('.//_:a[@href]', namespaces=NS)
                if subnode and len(subnode):
                    self.config[u'ホームページ']['ja'] = subnode[0].text
                    self.config[u'ホームページ'][u'リンク'] = subnode[0].attrib['href']
                    break
        else:
            if not self.config.has_key(name):
                self.config[name] = {}
            if len(val.split('\n')) > 1:
                self.config[name]['ja'] = val.split('\n')
                if name == u'専門分野' and self.config[name]['ja'][0]:
                    self.config[name]['en'] = fieldsMap[self.config[name]['ja'][0]]
            else:
                self.config[name]['ja'] = val
                if name == u'専門分野' and self.config[name]['ja']:
                    self.config[name]['en'] = fieldsMap[self.config[name]['ja']]
Example #49
import re
import romkan

entries = set()
for i, entry in enumerate(open("edict2", encoding="euc-jp")):
    if i == 0:
        continue
    m = re.search("^[^/]*\\[([ぁ-んァ-ン]*)\\]", entry)
    if not m:
        continue
    entries.add(romkan.to_hiragana(romkan.to_roma(m.groups(1)[0])))

w = open("./hira.list", "w")
for e in entries:
    w.write(e+"\n")
w.close()
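The to_hiragana(to_roma(...)) round trip folds any katakana readings in the bracketed field down to hiragana before deduplication; for instance:

# Katakana and hiragana readings collapse to one normalized entry:
print(romkan.to_hiragana(romkan.to_roma("アメリカ")))  # -> あめりか
print(romkan.to_hiragana(romkan.to_roma("あめりか")))  # -> あめりか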
Example #50
def ship_info():
	# open the api_start2.json file
	with open("api_start2.json", "r") as f:
		json_data = json.load(f)

	# open the extra Ship.json file for evasion, LOS, and antisub, and others
	# sort by "index" (API ID, not card ID)
	with open("Ship.json", "r") as f:
		extra_ship_data = sorted(json.load(f), key=lambda k: k['index'])

	# loop through and rewrite ship info
	ships = json_data['api_data']['api_mst_ship']
	new_ships = []
	for ship in list(ships):  # iterate over a copy; ships.remove() below mutates the list
		# なし (nashi) means Null, an unused ID
		if romkan.to_roma(ship['api_name']) == "nashi":
			ships.remove(ship)
			continue
		
		# renaming keys: based on: https://kancolletool.github.io/docs/api/
		mvk(ship, 'api_sortno', 'id') # use card number as apparent ID 
		mvk(ship, 'api_id', 'api_id')           # use API number as primary ID
		mvk(ship, 'api_name', 'name')
		mvk(ship, 'api_yomi', 'kana')
		
		# don't create romanizations of ships without kana
		if ship['kana'] != "":
			ship['name_roma'] = romkan.to_roma(ship['kana'])
		else:
			ship['name_roma'] = ""

		mvk(ship, 'api_stype', 'ship_class')
		mvk(ship, 'api_afterlv', 'remodel_min_lv')
		mvk(ship, 'api_aftershipid', 'remodel_ship_id')
		ship['remodel_ship_id'] = int(ship['remodel_ship_id'])

		# split up (base, max) stats
		sbm(ship, 'api_taik', 'hp')
		sbm(ship, 'api_souk', 'armor')
		sbm(ship, 'api_houg', 'firepower')
		sbm(ship, 'api_raig', 'torpedo')
		sbm(ship, 'api_tyku', 'antiair')
		sbm(ship, 'api_luck', 'luck')
		
		# derived variables from Ship.json
		# look through extra_ship_data for matching index, then grab data from there
		found = False
		for extra_ship in extra_ship_data:
			if (extra_ship['index'] == ship['api_id']):
				# ASW: Anti-sub
				ship['antisub'] = extra_ship['antisub']
				
				# LOS: line-of-sight
				ship['line_of_sight'] = extra_ship['lineOfSight']
				
				# evasion
				ship['evasion'] = extra_ship['evasion']
				
				# illustrator
				if 'illustrator' in extra_ship.keys():
					if extra_ship['illustrator'] != 0:
						ship['illustrator'] = extra_ship['illustrator']
					else:
						ship['illustrator'] = ""
				else:
					ship['illustrator'] = ""
					
				# seiyuu: voice actor
				if 'cv' in extra_ship.keys():
					if extra_ship['cv'] != 0:
						ship['seiyuu'] = extra_ship['cv']
					else:
						ship['seiyuu'] = ""
				else:
					ship['seiyuu'] = ""
				
				# ship found, stop searching
				found = True
				break
			
		if not found: # give default values if info not found
			ship['antisub'] = 0
			ship['line_of_sight'] = 0
			ship['evasion'] = 0
			ship['illustrator'] = ""
			ship['seiyuu'] = ""
				
		#print(ship['api_id'], ship['name_roma'], extra_ship_data[ship['api_id'] - 1])
		
		"""
		# optional variables, set to [] or 0 if nonexistent
		if 'api_tais' in ship:
			sbm(ship, 'api_tais', 'antisub')
		else:
			ship['antisub'] = [0, 0]
		
		if 'api_saku' in ship:
			sbm(ship, 'api_saku', 'line_of_sight')
		else:
			ship['line_of_sight'] = [0, 0]

		if 'api_kaih' in ship:
			sbm(ship, 'api_kaih', 'evasion')
		else:
			ship['evasion'] = [0, 0]
		"""

		mvk(ship, 'api_leng', 'range')
		mvk(ship, 'api_slot_num', 'equip_slots')
		mvk(ship, 'api_buildtime', 'build_time')

		mvk(ship, 'api_broken', 'scrap_value')
		mvk(ship, 'api_powup', 'feed_value') # stat power ups when fed for modernization
		mvk(ship, 'api_backs', 'rarity')

		mvk(ship, 'api_getmes', 'get_message')

		mvk(ship, 'api_afterfuel', 'remodel_fuel_cost') # apparently this is steel not fuel. The kancolle devs themselves may have misspelled it and neglected to fix it.
		mvk(ship, 'api_afterbull', 'remodel_ammo_cost')
		mvk(ship, 'api_fuel_max', 'max_fuel')
		mvk(ship, 'api_bull_max', 'max_ammo')
		mvk(ship, 'api_voicef', 'extra_voice_clips')

		# carrier data
		mvk(ship, 'api_maxeq', 'plane_capacity')
		
		# add to new JSON array
		new_ships.append(ship)

	return json.dumps(new_ships, indent=2, ensure_ascii=False)
Example #51
def hira_toroma(word):
    return unicodedata.normalize("NFKC",romkan.to_roma(word.decode("utf8")))
Example #52
def conv_line(line):
    try:
        return sub3(',', sub2('.', sub('', romkan.to_kunrei(romkan.to_roma(unicode(line, ('utf8'))).encode('utf8')))))
    except:
        return ''
Example #53
import psycopg2

execfile('jsonify.py')
f = open('pwd.txt')
user = f.readline().strip()
pwd = f.readline().strip()
f.close()
#ships = master ship list
connect_name = "dbname='kancolle' user='******' host='localhost' password='******'"
conn = psycopg2.connect(connect_name)
cur = conn.cursor()
stypes = final['api_mst_stype']
for ind in range(0,383):
    idnum = ships[ind]['api_id']
    sortno = ships[ind]['api_sortno']
    name = ships[ind]['api_name']
    yomi = ships[ind]['api_yomi']
    eego = romkan.to_roma(yomi)
    classind = ships[ind]['api_stype']-1
    print 'INSERTING: ', idnum,' ', name
    if classind == 7:
        shipclass = u'\u9ad8\u901f\u6226\u8266'
    else:
        shipclass = stypes[classind]['api_name']
    cur.execute('insert into kanmusu values(%s,%s,%s,%s,%s,%s);',(idnum,name,yomi,eego,sortno,shipclass))

conn.commit()
cur.close()
conn.close()

Example #54
 def _to_romaji(self):
     return romkan.to_roma(self.word.decode("utf-8"))
Example #55
def to_romaji(word):
    """Convert a Japanese word in hiragana or katakana to romaji. """
    return romkan.to_roma(hiragana_to_katakana(word))
Example #56
                # print >>f, cleanedWord

                kanji = cleanedWord

                # meaning = getMeaning(cleanedWord,f)
                # print >>f, meaning
            if i == 1:
                hiragana = cleanedWord
                # Skip entries containing parentheses or the separators 、/・
                if any(ch in hiragana for ch in ("(", ")", "、", "・")):
                    hiragana = ""
                    kanji = ""
            i = i + 1
        if not(hiragana == "" and kanji == ""):
            print >>f, "<tr>"

            romaji = romkan.to_roma(hiragana.decode("utf-8"))

            if kanji =="":
                meaning = getMeaning(hiragana)
            else:
                meaning = getMeaning(kanji)

            print >>f, "<td>" + kanji + "</td>"
            print >>f, "<td>" + hiragana + "</td>"
            print >>f, "<td>" + romaji + "</td>"
            print >>f, "<td>" + meaning + "</td>"
            print >>f, "</tr>"

    print >>f , "</table>"

    f.close()
Example #57
    def multiscrape(self, name, shy=False):
        if shy and self.config.has_key(name) and self.config[name]['ja']:
            return
        if name != u'名前' and name != u'ふりがな':
            nodes = self.root.xpath("//_:h4[contains(text(), '%s')]/following-sibling::_:p" % name, namespaces=NS)
        else:
            nodes = self.root.xpath("//_:h3", namespaces=NS)
        if not nodes:
            return

        iterator = nodes[0].itertext()
        val = ''
        l = []
        while 1:
            try:
                val = iterator.next()
                val = re.sub(u'^[  \r\n]+', '', val)
                val = re.sub(u'[  \r\n]+$', '', val)
                if val:
                    l.append(val)
            except:
                break

        val = re.sub('^[  \n]*(.*?)[  \n]*$', '\\1', '\n'.join(l))

        val = val.strip()
        val = makeHankaku(val)

        if name == u'名前':
            lst = val.split('\n')
            if not self.config.has_key(name):
                self.config[name] = {}
            self.config[name]['ja'] = lst[0]
        elif name == u'ふりがな' and not shy:
            if not self.config.has_key(u'名前'):
                self.config[u'名前'] = {}
            lst = val.split('\n')
            if len(lst) > 1:
                suzure = lst[1].replace(u' ', '').replace(' ', '')
                self.config[u'名前']['kana'] = lst[1]
                self.config[u'名前']['en'] = titleCase(romkan.to_hepburn(lst[1].replace(u' ', ' ')))
                self.config[u'並べ替え']['ja'] = romkan.to_katakana(romkan.to_kunrei(suzure))
                self.config[u'並べ替え']['en'] = romkan.to_roma(suzure)
            else:
                self.config[u'名前']['kana'] = ''
        elif name == u'教員からのメッセージ':
            if not self.config.has_key(u'学部メッセージ'):
                self.config[u'学部メッセージ'] = {}
            self.config[u'学部メッセージ']['ja'] = val.split('\n')
        elif name == u'役職':
            if not self.config.has_key(u'役職'):
                self.config[u'役職'] = {}
            self.config[u'役職']['ja'] = getPostJapanese(val)
            self.config[u'役職']['en'] = getPostEnglish(self.config[u'役職']['ja'])
        elif name == u'主要':
            if len(val.split('\n')) > 1:
                self.config[u'主要業績']['ja'] = val.split('\n')
            else:
                self.config[u'主要業績']['ja'] = val
            
            
        else:
            if not self.config.has_key(name):
                self.config[name] = {}
            if len(val.split('\n')) > 1:
                self.config[name]['ja'] = val.split('\n')
                if name == u'専門分野' and self.config[name]['ja'][0]:
                    self.config[name]['en'] = fieldsMap[self.config[name]['ja'][0]]
            else:
                self.config[name]['ja'] = val
                if name == u'専門分野' and self.config[name]['ja']:
                    self.config[name]['en'] = fieldsMap[self.config[name]['ja']]
Example #58
	playlist_tracks = []

	for raw_info in playlist_track_infos:
		info = raw_info.split('!MAESTRO!')
		track_artist = info[0]
		track_name = info[1]
		album_name = info[2]
		track_id = info[3]

		playlist_tracks.append({"artist": track_artist, "name": track_name, "album": album_name, "track_id": track_id})

	for track in playlist_tracks:
		match_in_name = handler.query.lower() in track["name"].lower()
		match_in_artist = handler.query.lower() in track["artist"].lower()
		match_in_album = handler.query.lower() in track["album"].lower()
		match_in_name_kana = handler.query.lower() in romkan.to_roma(unicode(track["name"], "utf-8"))
		match_in_artist_kana = handler.query.lower() in romkan.to_roma(unicode(track["artist"], "utf-8"))
		match_in_album_kana = handler.query.lower() in romkan.to_roma(unicode(track["album"], "utf-8"))

		if match_in_name or match_in_artist or match_in_album or match_in_name_kana or match_in_artist_kana or match_in_album_kana:
			subtitle = None
			if len(track["album"]) > 0:
				subtitle = "%s [%s]" % (track["artist"], track["album"])
			else:
				subtitle = track["artist"]
			handler.add_new_item(title=track["name"], subtitle=subtitle, arg=track["track_id"], icon=get_artwork(track["name"]))
			anything_matched = True

except:
	pass