def mergeDictionaryWithClass(name):
    """Merge the entries loaded from *name* into ``UNIGRAM`` and ``NGRAM``.

    ``readFile`` is reset and *name* is loaded, then every item of
    ``readFile.unigram`` / ``readFile.ngram`` is looked up by its ``key``
    with ``readFile.BinarySearchNgramWithClass``.  A result greater than -1
    is used as an index and the ``count`` attributes are added together;
    otherwise the item is appended and the list is re-sorted by ``key``.

    NOTE(review): entries are accessed as objects (``.key`` / ``.count``)
    but the file is loaded with ``readFile.readFile``; ``tool()`` loads the
    class-based lists with ``readFile.readFileWithClass`` -- confirm which
    loader is intended here.
    """
    readFile.clear()
    readFile.readFile(name)

    # Unigram
    print("Add uni")
    for entry in readFile.unigram:
        found_at = readFile.BinarySearchNgramWithClass(UNIGRAM, entry.key)
        if found_at <= -1:
            # Unknown key: insert and restore key order (presumably the
            # ordering the binary search relies on).
            UNIGRAM.append(entry)
            UNIGRAM.sort(key=lambda item: item.key)
            continue
        existing = UNIGRAM[found_at]
        existing.count = existing.count + entry.count

    # Ngram
    print("Add nrg ")
    slow_search_seconds = 500 * 10**(-3)
    for entry in readFile.ngram:
        started = time.time()
        found_at = readFile.BinarySearchNgramWithClass(NGRAM, entry.key)
        elapsed = time.time() - started
        if elapsed > slow_search_seconds:
            # Debug marker printed when a single lookup is slow.
            print("teo")
        if found_at <= -1:
            NGRAM.append(entry)
            NGRAM.sort(key=lambda item: item.key)
            continue
        existing = NGRAM[found_at]
        existing.count = existing.count + entry.count
def _merger_dictionary(name):
    """Merge the dictionary file *name* into the module-level dict tables.

    Resets ``readFile``, loads *name* with ``readFile._read_file`` and then
    folds ``readFile.unigramDict`` into ``UNIGRAM_DICT`` and
    ``readFile.ngramDict`` into ``NGRAM_DICT`` using the same merge rule
    (see ``_merge_counted_entries``).
    """
    readFile.clear()
    readFile._read_file(name)

    # Unigram
    print("Add uni")
    _merge_counted_entries(readFile.unigramDict, UNIGRAM_DICT)

    # Ngram
    print("Add nrg ")
    _merge_counted_entries(readFile.ngramDict, NGRAM_DICT)


def _merge_counted_entries(incoming, target):
    """Fold the entries of *incoming* into *target* in place.

    * key missing from *target*: a deep copy of the entry is stored;
    * key present and both entries carry ``"count"``: the counts are added;
    * key present, incoming has ``"count"`` but the stored entry does not:
      the stored entry is replaced by a deep copy of the incoming one;
    * key present and incoming has no ``"count"``: nothing changes.

    NOTE(review): the branch pairing was reconstructed from a source whose
    indentation was lost -- confirm the third and fourth cases.
    """
    for key, entry in incoming.items():
        if key not in target:
            target[key] = copy.deepcopy(entry)
            continue
        if "count" not in entry:
            continue
        stored = target[key]
        if "count" in stored:
            stored["count"] = stored["count"] + entry["count"]
        else:
            target[key] = copy.deepcopy(entry)
def _write_dic_to_json(name):
    """Load *name* and dump the two ``readFile`` dict tables as JSON.

    ``readFile.unigramDict`` is written to ``unigram_json.txt`` and
    ``readFile.ngramDict`` to ``ngram_json.txt`` (relative to the current
    working directory), in that order.

    NOTE(review): other functions in this file call ``readFile._read_file``
    before reading ``unigramDict`` / ``ngramDict``; this one calls
    ``readFile.readFile`` -- confirm that it populates the dict tables.
    """
    readFile.clear()
    readFile.readFile(name)

    outputs = (
        ('unigram_json.txt', 'unigramDict'),
        ('ngram_json.txt', 'ngramDict'),
    )
    for out_path, table_attr in outputs:
        with open(out_path, 'w') as handle:
            # The table is looked up only after its file has been opened.
            json.dump(getattr(readFile, table_attr), handle)
def _write_dic_to_text():
    """Reload ``DicTest.txt`` and hand the dict tables to ``writeFile``.

    Rebinds the module-level ``UNIGRAM_DICT`` and ``NGRAM_DICT`` to deep
    copies of ``readFile.unigramDict`` / ``readFile.ngramDict`` and passes
    both to ``writeFile._write_dic_to_text``.

    NOTE(review): the file is loaded with ``readFile.readFile`` although
    ``_tool`` uses ``readFile._read_file`` before reading the same dict
    tables -- confirm the intended loader.
    """
    global UNIGRAM_DICT, NGRAM_DICT

    readFile.clear()
    readFile.readFile("DicTest.txt")

    snapshot = copy.deepcopy
    UNIGRAM_DICT = snapshot(readFile.unigramDict)
    NGRAM_DICT = snapshot(readFile.ngramDict)

    writeFile._write_dic_to_text(UNIGRAM_DICT, NGRAM_DICT)
def _main_v3(s):
    """Read every entry of directory *s* and merge its tables.

    For each name returned by ``os.listdir(s)`` the file is loaded with
    ``readFile._read_file`` and ``readFile.unigramDict`` /
    ``readFile.ngramDict`` are passed to ``_merge_dic_to_csv_v2`` with the
    flag ``True`` / ``False`` respectively.  A ``"<name> <index>/<total>"``
    progress line is printed before each file.

    Args:
        s: Directory path.  A trailing separator is accepted but not
            required, because paths are built with ``os.path.join``.
    """
    # List the directory once: the listing is loop-invariant and was
    # previously re-read from disk on every iteration just for the count.
    entries = os.listdir(s)
    total = len(entries)
    for index, file_name in enumerate(entries):
        print("{} {}/{}".format(file_name, index, total))
        readFile.clear()
        # os.path.join yields the same path as ``s + file_name`` when ``s``
        # ends with a separator, and a correct one when it does not.
        readFile._read_file(os.path.join(s, file_name))
        _merge_dic_to_csv_v2(readFile.unigramDict, True)
        _merge_dic_to_csv_v2(readFile.ngramDict, False)
def mergeDictionary(name):
    """Merge the dict-style entries loaded from *name* into the global lists.

    Resets ``readFile``, loads *name* and then, for every entry of
    ``readFile.unigram`` / ``readFile.ngram``, searches ``UNIGRAM`` /
    ``NGRAM`` with ``readFile.BinarySearchUni`` / ``BinarySearchNgram``.
    A result greater than -1 is used as the index of the existing entry,
    which is merged via ``_merge_counted_record``; otherwise the entry is
    appended and the list is re-sorted (by ``"word"`` for unigrams, by
    ``readFile.mergeNgram`` for n-grams).

    For each n-gram a progress line and the search time are printed, plus
    the marker ``"teo"`` when a single search takes longer than 0.5 s.
    """
    readFile.clear()
    readFile.readFile(name)

    # Unigram
    print("Add uni")
    for entry in readFile.unigram:
        position = readFile.BinarySearchUni(UNIGRAM, entry["word"])
        if position > -1:
            _merge_counted_record(UNIGRAM, position, entry)
        else:
            UNIGRAM.append(entry)
            # Restore ordering (presumably required by the binary search).
            UNIGRAM.sort(key=lambda item: item["word"])

    # Ngram
    total = len(readFile.ngram)
    slow_search_seconds = 0.5
    print("Add nrg " + format(total))
    for idx, entry in enumerate(readFile.ngram):
        started = time.time()
        position = readFile.BinarySearchNgram(NGRAM, readFile.mergeNgram(entry))
        elapsed = time.time() - started
        # Report progress only after the search: the previous code formatted
        # the search result before assigning it, printing a stale value (or
        # raising UnboundLocalError when there were no unigrams).
        print(format(idx) + "/" + format(total) + " : " + format(position))
        print(elapsed)
        if elapsed > slow_search_seconds:
            print("teo")
        if position > -1:
            _merge_counted_record(NGRAM, position, entry)
        else:
            NGRAM.append(entry)
            NGRAM.sort(key=readFile.mergeNgram)


def _merge_counted_record(records, position, incoming):
    """Merge *incoming* into ``records[position]`` in place.

    Nothing happens when *incoming* has no ``"count"``.  If the stored
    record has no ``"count"`` it is replaced by *incoming* (same object, no
    copy); otherwise a non-zero incoming count is added to the stored one.

    NOTE(review): the branch pairing was reconstructed from a source whose
    indentation was lost -- confirm the replace-when-uncounted case.
    """
    if "count" not in incoming:
        return
    stored = records[position]
    if "count" not in stored:
        records[position] = incoming
    elif incoming["count"] != 0:
        stored["count"] = stored["count"] + incoming["count"]
def _tool():
    """Run the dict-based pipeline against ``DicTest.txt``.

    Steps, in order:
      1. reset ``readFile`` and load ``DicTest.txt`` with ``_read_file``;
      2. rebind the globals ``UNIGRAM_DICT`` / ``NGRAM_DICT`` to deep copies
         of ``readFile.unigramDict`` / ``readFile.ngramDict``;
      3. call ``_main_v2`` on ``path + folder + "/"``;
      4. call ``readFile._get_probability_uni`` and
         ``readFile._get_probability_ngram`` on the globals;
      5. write both tables back to ``DicTest.txt`` via
         ``writeFile._write_dictionary``.
    """
    global UNIGRAM_DICT, NGRAM_DICT

    dictionary_file = "DicTest.txt"
    readFile.clear()
    readFile._read_file(dictionary_file)
    UNIGRAM_DICT = copy.deepcopy(readFile.unigramDict)
    NGRAM_DICT = copy.deepcopy(readFile.ngramDict)

    print("start")
    # Single pass: the range covers exactly one value and the value itself
    # is not used when building the directory name.
    for _ in range(20, 21):
        corpus_dir = path + folder + "/"
        print(corpus_dir)
        _main_v2(corpus_dir)

    print("get pro uni")
    readFile._get_probability_uni(UNIGRAM_DICT)
    print("get pro ngr")
    readFile._get_probability_ngram(UNIGRAM_DICT, NGRAM_DICT)

    writeFile._write_dictionary(UNIGRAM_DICT, NGRAM_DICT, dictionary_file)
def _first_run_v2():
    """Load ``text.txt`` and pass both dict tables to ``_merge_dic_to_csv_v2``.

    ``readFile.unigramDict`` is passed with the flag ``True`` and
    ``readFile.ngramDict`` with ``False``, in that order.
    """
    readFile.clear()
    readFile._read_file('text.txt')

    # (readFile attribute, flag given to _merge_dic_to_csv_v2)
    passes = (("unigramDict", True), ("ngramDict", False))
    for table_attr, flag in passes:
        _merge_dic_to_csv_v2(getattr(readFile, table_attr), flag)
def _first_run():
    """Load the base dictionary file and pass both tables to ``_merge_dic_to_csv``.

    Reads ``main_vi.dict_chinh_quy_bo_ky_tu.txt`` with
    ``readFile._read_file``; ``readFile.unigramDict`` is passed with the
    flag ``True`` and ``readFile.ngramDict`` with ``False``, in that order.
    """
    readFile.clear()
    readFile._read_file('main_vi.dict_chinh_quy_bo_ky_tu.txt')

    # (readFile attribute, flag given to _merge_dic_to_csv)
    passes = (("unigramDict", True), ("ngramDict", False))
    for table_attr, flag in passes:
        _merge_dic_to_csv(getattr(readFile, table_attr), flag)
def tool():
    """Run the class-based pipeline against ``DicTestV2.txt``.

    Steps, in order:
      1. reset ``readFile`` and load ``DicTestV2.txt`` with
         ``readFileWithClass``;
      2. rebind the globals ``UNIGRAM`` / ``NGRAM`` to deep copies of
         ``readFile.unigram`` / ``readFile.ngram``;
      3. call ``main`` on ``path + folder + "/"``;
      4. call ``readFile.getProbabilityUniWithClass(UNIGRAM)``;
      5. call ``readFile.getProbabilityNgramWithClass`` over ``NGRAM`` --
         split across nine threads when there are more than nine n-grams,
         otherwise in a single call;
      6. write both lists back to ``DicTestV2.txt`` via
         ``writeFile.writeDictionaryWithClass``.

    Errors raised while creating, starting or joining the threads are
    reported on stdout and the dictionary is still written (best effort).
    """
    global UNIGRAM, NGRAM

    readFile.clear()
    readFile.readFileWithClass("DicTestV2.txt")
    UNIGRAM = copy.deepcopy(readFile.unigram)
    NGRAM = copy.deepcopy(readFile.ngram)

    print("start")
    # Single pass: the range covers exactly one value and the value itself
    # is not used when building the directory name.
    for _ in range(19, 20):
        s = path + folder + "/"
        print(s)
        main(s)

    print("get pro uni")
    readFile.getProbabilityUniWithClass(UNIGRAM)

    print("get pro ngr")
    worker_count = 9
    lengthNgram = len(NGRAM)
    temp = lengthNgram // worker_count  # chunk size per worker
    print(temp)
    if lengthNgram > worker_count:
        print("Thread")
        # Same (start, end) pairs as before: eight equal chunks, and the last
        # worker takes everything up to index lengthNgram - 1.
        # NOTE(review): whether ``end`` is inclusive cannot be seen from
        # here; if it is, neighbouring chunks share their boundary index,
        # and if it is not, the final n-gram is never processed -- confirm
        # against getProbabilityNgramWithClass.
        bounds = [(i * temp, (i + 1) * temp) for i in range(worker_count - 1)]
        bounds.append(((worker_count - 1) * temp, lengthNgram - 1))
        try:
            t = time.time()
            workers = [
                threading.Thread(
                    target=readFile.getProbabilityNgramWithClass,
                    args=(UNIGRAM, NGRAM, start, end),
                )
                for start, end in bounds
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
            print("done in ", time.time() - t)
        except Exception as err:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit propagate, and the cause is no longer hidden.
            # Exceptions raised inside a worker thread are not re-raised by
            # join() and therefore never reach this handler.
            print("error", err)
    else:
        readFile.getProbabilityNgramWithClass(UNIGRAM, NGRAM, 0, len(NGRAM) - 1)

    writeFile.writeDictionaryWithClass(UNIGRAM, NGRAM, "DicTestV2.txt")