def parse(inputFile, outputDir):
    """Parse a space-delimited vocab file and emit intermediate CSV files.

    Each input row is expected as: word, meaning[, pronunciation].
    Words already present in the DB (or duplicated inside the input)
    are skipped and logged.  New entries are written to outputDir in
    chunks of 100 rows per file; malformed rows go to a dedicated
    '_pottentialErrors' report file.

    :param inputFile: path of the space-delimited CSV to parse.
    :param outputDir: directory where intermediate files are written.
    """
    # Getting configuration and DB handler.
    db_handler = Japanese_DB_handler()
    config_data = configuration.get_configuration()
    if not config_data:
        log.error("couldn't get configuration data")
        return

    # Keep a backup copy of the raw input file.
    copyfile(inputFile, _generateFileName(config_data.input_files_bk, 'input'))

    f = db_handler.base_format
    # Set for O(1) duplicate lookups against the DB contents.
    existing_kanjis = set(db_handler.list(f.vocab, f.vocab.word))

    potentialErrors = []
    newEntriesList = []
    seen_words = set()  # guards against duplicates within the input file itself

    # Parsing input file.
    with open(inputFile, 'r') as fin:
        for row in csv.reader(fin, delimiter=' '):
            if len(row) < 2:
                # Malformed row (missing meaning): keep it for the error report
                # instead of crashing on row[1].
                potentialErrors.append(row)
                continue
            word = row[0]
            meaning = row[1]
            # Third column (pronunciation) is optional; guard the index
            # before reading it.
            prononciation = row[2] if len(row) > 2 and row[2] else ''
            exemple = ''
            if word in existing_kanjis or word in seen_words:
                log.error('already exists : ' + word)
            else:
                seen_words.add(word)
                newEntriesList.append(['', '', word, prononciation, meaning, exemple])

    # Split new entries into chunks of 100 rows per output file.
    nb_of_files = len(newEntriesList) // 100
    if len(newEntriesList) % 100 != 0:
        nb_of_files += 1
    outputDir += '/'
    for nb in range(1, nb_of_files + 1):
        fileName = _generateFileName(outputDir, "int", str(nb))
        with open(fileName, 'w') as fout:
            writer = csv.writer(fout, delimiter=' ')
            writer.writerow(['categorie', 'tag', 'word', 'prononciation',
                             'meaning', 'exemple'])
            for entry in newEntriesList[100 * (nb - 1):100 * nb]:
                writer.writerow(entry)

    # Rows that could not be parsed go to a dedicated report file.
    fileName = _generateFileName(outputDir, "int", '_pottentialErrors')
    with open(fileName, 'w') as fout:
        writer = csv.writer(fout, delimiter=' ')
        for error in potentialErrors:
            writer.writerow(error)
            log.error(error)
    return
def parse(inputFile, outputDir):
    """Parse a space-delimited Japanese vocab file and emit intermediate CSVs.

    Each row is expected as: japanese, french.  The japanese field itself may
    contain spaces separating kanji, pronunciation and an example sentence.
    New entries are written to outputDir in chunks of 100 rows per file;
    suspicious rows are collected into a '_pottentialErrors' report file.

    :param inputFile: path of the space-delimited CSV to parse.
    :param outputDir: directory where intermediate files are written.
    """
    # Getting configuration and DB handler.
    db_handler = Japanese_DB_handler()
    #config_data = _parseConf()
    config_data = configuration.get_configuration()
    if not config_data:
        log.error("couldn't find get configuration data")
        return
    # Keep a backup copy of the raw input file.
    copyfile(inputFile, _generateFileName(config_data.input_files_bk, 'input'))
    f = db_handler.base_format
    # Words already stored in the DB, used for duplicate detection below.
    existing_kanjis = db_handler.list(f.vocab, f.vocab.word)
    potentialErrors = []
    newEntriesList = []
    # Parsing input file.
    with open(inputFile, 'r') as fin:
        for row in csv.reader(fin, delimiter=' '):
            # Useful to keep just half of the list, but questions are not
            # necessarily before answers, so Japanese is forced to be row[0]:
            # rows whose first character is not CJK are skipped entirely.
            if not _is_cjk(row[0][0]):
                continue
            japanese = row[0]
            french = row[1]
            # print(japanese)
            # The japanese field falls into 3 cases:
            # 1) just kana (no inner space)
            # 2) kanji followed by kana pronunciation
            # 3) case 2 plus an example sentence.
            # 1) no kanjis
            status = True
            if ' ' not in japanese:
                word = japanese
                prononciation = ''
                exemple = ''
            else:
                # Split off the leading kanji group; the remainder holds
                # pronunciation and (optionally) an example.
                potentialKanjis, afterKanjis = japanese.split(' ', 1)
                # remove trailing spaces.
                afterKanjis = _delTrailingSpaces(afterKanjis)
                # suru-verb marker: move the する suffix onto the headword.
                if afterKanjis[:2] == 'する':
                    potentialKanjis += ' (する)'
                    afterKanjis = _delTrailingSpaces(afterKanjis[2:])
                # na-adjective marker ('な' followed by a space): same idea;
                # only one char is dropped, the trailing space is trimmed after.
                if afterKanjis[:2] == 'な ':
                    potentialKanjis += ' (な)'
                    afterKanjis = _delTrailingSpaces(afterKanjis[1:])
                # x) Potential errors: a long "kanji" group is probably a
                # full phrase, not a headword — flag it for manual review.
                if len(potentialKanjis) > 7:
                    log.error('potential error :' + potentialKanjis)
                    status = False
                    potentialErrors.append(row)
                # 2) just kanjis and prononciation
                elif ' ' not in afterKanjis:
                    word = potentialKanjis
                    prononciation = _delTrailingSpaces(afterKanjis)
                    exemple = ''
                # 3) kanjis prononciation and exemple
                else:
                    word = potentialKanjis
                    prononciation, exemple = afterKanjis.split(' ', 1)
                    prononciation = _delTrailingSpaces(prononciation)
                    exemple = _delTrailingSpaces(exemple)
            # NOTE(review): when status is False, 'word' was not set on this
            # iteration, so the else branch may log a stale value from a
            # previous row (or raise NameError on the first flagged row).
            if status and word not in existing_kanjis:
                newEntriesList.append(
                    ['', '', word, prononciation, french, exemple])
            else:
                log.error('already exists : ' + word)
    # Split new entries into chunks of 100 rows per output file.
    nb_of_files = len(newEntriesList) // 100
    if len(newEntriesList) % 100 != 0:
        nb_of_files += 1
    outputDir += '/'
    for nb in range(1, nb_of_files + 1, 1):
        fileName = _generateFileName(outputDir, "int", str(nb))
        with open(fileName, 'w') as fout:
            writer = csv.writer(fout, delimiter=' ')
            writer.writerow([
                'categorie', 'tag', 'word', 'prononciation', 'meaning',
                'exemple'
            ])
            for entry in newEntriesList[100 * (nb - 1):100 * nb]:
                writer.writerow(entry)
    # Rows flagged as potential errors go to a dedicated report file.
    fileName = _generateFileName(outputDir, "int", '_pottentialErrors')
    with open(fileName, 'w') as fout:
        writer = csv.writer(fout, delimiter=' ')
        for error in potentialErrors:
            writer.writerow(error)
            log.error(error)
    return