Пример #1
0
def parse(inputFile, outputDir):
    """Convert a tab-separated vocabulary file into batched output files.

    Every input row is read as ``word<TAB>meaning[<TAB>pronunciation]``.
    Words that are not already in the list returned by the database handler
    are collected and written to numbered files of at most 100 entries each.
    Rows that have a word but no meaning column are written to a separate
    "potential errors" file instead of aborting the run.

    Args:
        inputFile: Path of the tab-separated file to read.
        outputDir: Directory prefix passed to ``_generateFileName`` for the
            generated files; a trailing ``/`` is appended before use.

    Returns:
        None. Returns early, after logging, when no configuration is found.
    """
    # The DB handler is created before the configuration lookup so the
    # original order of side effects is preserved.
    db_handler = Japanese_DB_handler()
    config_data = configuration.get_configuration()
    if not config_data:
        log.error("couldn't find get configuration data")
        return

    # Copy the raw input to the path derived from config_data.input_files_bk
    # (presumably a backup folder -- confirm against the configuration).
    copyfile(inputFile, _generateFileName(config_data.input_files_bk, 'input'))

    f = db_handler.base_format
    existing_kanjis = db_handler.list(f.vocab, f.vocab.word)

    potentialErrors = []
    newEntriesList = []

    # newline='' is what the csv module requires for correct line handling;
    # the encoding is explicit because the content is Japanese text and must
    # not depend on the platform's default codec.
    with open(inputFile, 'r', encoding='utf-8', newline='') as fin:
        for row in csv.reader(fin, delimiter='\t'):
            if not row:
                # Blank line: nothing to import.
                continue
            if len(row) < 2:
                # A word without a meaning cannot be turned into an entry;
                # keep it for manual review rather than raising IndexError.
                potentialErrors.append(row)
                continue

            word = row[0]
            meaning = row[1]
            # The pronunciation column is optional.
            prononciation = row[2] if len(row) > 2 else ''
            exemple = ''

            if word in existing_kanjis:
                log.error('already exists : ' + word)
                continue
            newEntriesList.append(
                ['', '', word, prononciation, meaning, exemple])

    outputDir += '/'
    header = ['categorie', 'tag', 'word', 'prononciation', 'meaning', 'exemple']

    # One output file per slice of 100 entries, numbered from 1.
    batch_starts = range(0, len(newEntriesList), 100)
    for nb, start in enumerate(batch_starts, start=1):
        fileName = _generateFileName(outputDir, "int", str(nb))
        with open(fileName, 'w', encoding='utf-8', newline='') as fout:
            writer = csv.writer(fout, delimiter='\t')
            writer.writerow(header)
            writer.writerows(newEntriesList[start:start + 100])

    # The potential-errors file is always written, even when it is empty.
    fileName = _generateFileName(outputDir, "int", '_pottentialErrors')
    with open(fileName, 'w', encoding='utf-8', newline='') as fout:
        writer = csv.writer(fout, delimiter='\t')
        for error in potentialErrors:
            writer.writerow(error)
            log.error(error)
Пример #2
0
def parse(inputFile, outputDir):
    """Convert a tab-separated Japanese/French file into batched output files.

    Column 0 of each row holds the Japanese side and column 1 the French
    meaning. Rows whose first character is rejected by ``_is_cjk`` are
    skipped. The Japanese cell is split on spaces into up to three parts:

    1. no space: the whole cell is the word (no pronunciation, no example);
    2. one space: word followed by its pronunciation;
    3. more: word, pronunciation, then an example sentence.

    A leading ``する`` or ``な `` after the word is folded into the word as
    `` (する)`` / `` (な)``. Words longer than 7 characters are treated as
    potential errors and written to a separate file. Remaining words that are
    not already in the list returned by the database handler are written to
    numbered files of at most 100 entries each.

    Args:
        inputFile: Path of the tab-separated file to read.
        outputDir: Directory prefix passed to ``_generateFileName`` for the
            generated files; a trailing ``/`` is appended before use.

    Returns:
        None. Returns early, after logging, when no configuration is found.
    """
    # The DB handler is created before the configuration lookup so the
    # original order of side effects is preserved.
    db_handler = Japanese_DB_handler()
    config_data = configuration.get_configuration()
    if not config_data:
        log.error("couldn't find get configuration data")
        return

    # Copy the raw input to the path derived from config_data.input_files_bk
    # (presumably a backup folder -- confirm against the configuration).
    copyfile(inputFile, _generateFileName(config_data.input_files_bk, 'input'))

    f = db_handler.base_format
    existing_kanjis = db_handler.list(f.vocab, f.vocab.word)

    potentialErrors = []
    newEntriesList = []

    # newline='' is what the csv module requires for correct line handling;
    # the encoding is explicit because the content is Japanese text and must
    # not depend on the platform's default codec.
    with open(inputFile, 'r', encoding='utf-8', newline='') as fin:
        for row in csv.reader(fin, delimiter='\t'):
            # Blank lines and empty first cells have no first character to
            # hand to _is_cjk; skip them instead of raising IndexError.
            if not row or not row[0] or not _is_cjk(row[0][0]):
                continue
            if len(row) < 2:
                # Japanese text without a meaning column: keep the row for
                # manual review rather than raising IndexError.
                potentialErrors.append(row)
                continue

            japanese = row[0]
            french = row[1]

            if ' ' not in japanese:
                # Case 1: a single token, used as the word as-is.
                word = japanese
                prononciation = ''
                exemple = ''
            else:
                potentialKanjis, afterKanjis = japanese.split(' ', 1)
                afterKanjis = _delTrailingSpaces(afterKanjis)

                if afterKanjis[:2] == 'する':
                    potentialKanjis += ' (する)'
                    afterKanjis = _delTrailingSpaces(afterKanjis[2:])

                if afterKanjis[:2] == 'な ':
                    potentialKanjis += ' (な)'
                    afterKanjis = _delTrailingSpaces(afterKanjis[1:])

                # A long first token is probably a full phrase, not a word.
                # NOTE(review): the length is measured after the ' (する)' /
                # ' (な)' suffix has been appended -- confirm this is intended.
                if len(potentialKanjis) > 7:
                    log.error('potential error :' + potentialKanjis)
                    potentialErrors.append(row)
                    # Stop here: no word was extracted for this row, so the
                    # duplicate check below must not run (it previously
                    # logged "already exists" with an unbound or stale word).
                    continue

                word = potentialKanjis
                if ' ' not in afterKanjis:
                    # Case 2: word and pronunciation only.
                    prononciation = _delTrailingSpaces(afterKanjis)
                    exemple = ''
                else:
                    # Case 3: word, pronunciation and an example sentence.
                    prononciation, exemple = afterKanjis.split(' ', 1)
                    prononciation = _delTrailingSpaces(prononciation)
                    exemple = _delTrailingSpaces(exemple)

            if word in existing_kanjis:
                log.error('already exists : ' + word)
                continue
            newEntriesList.append(
                ['', '', word, prononciation, french, exemple])

    outputDir += '/'
    header = ['categorie', 'tag', 'word', 'prononciation', 'meaning', 'exemple']

    # One output file per slice of 100 entries, numbered from 1.
    batch_starts = range(0, len(newEntriesList), 100)
    for nb, start in enumerate(batch_starts, start=1):
        fileName = _generateFileName(outputDir, "int", str(nb))
        with open(fileName, 'w', encoding='utf-8', newline='') as fout:
            writer = csv.writer(fout, delimiter='\t')
            writer.writerow(header)
            writer.writerows(newEntriesList[start:start + 100])

    # The potential-errors file is always written, even when it is empty.
    fileName = _generateFileName(outputDir, "int", '_pottentialErrors')
    with open(fileName, 'w', encoding='utf-8', newline='') as fout:
        writer = csv.writer(fout, delimiter='\t')
        for error in potentialErrors:
            writer.writerow(error)
            log.error(error)