def make_cal_db_word_list(text_name):
    """Write the words and per-line word counts of a CAL db text file as JSON.

    Reads ``caldb_<text_name>.txt`` and parses every row with
    ``cal_tools.parseCalLine(row, True, False)``.  A row's ``"word"`` is
    replaced by its ``abbrev_map`` entry when one exists, then split on single
    spaces; the resulting pieces are collected in order.  Rows are grouped by
    their ``"line_num"`` value, and the number of pieces in each consecutive
    group is recorded.

    The result, ``{"words": [...], "line_lens": [...]}``, is written to
    ``caldb_words_<text_name>.json``.

    NOTE(review): another function with this same name appears later in the
    visible source; if both live in one module, the later definition replaces
    this one at import time -- confirm which is meant to be used.

    :param text_name: suffix used to build the input and output file names.
    """
    word_list_out = []
    line_len_list = []
    curr_line = -1  # sentinel meaning "no row has been read yet"
    num_words_on_line = 0

    with open("caldb_{}.txt".format(text_name), "r") as caldb:
        for line in caldb:
            line_obj = cal_tools.parseCalLine(line, True, False)
            raw_word = line_obj["word"]
            word = abbrev_map[raw_word] if raw_word in abbrev_map else raw_word
            word_list = word.split(" ")
            line_num = line_obj["line_num"]

            if curr_line == -1:
                curr_line = line_num
            if curr_line != line_num:
                # A new line started: close out the previous line's count.
                line_len_list.append(num_words_on_line)
                num_words_on_line = len(word_list)
                curr_line = line_num
            else:
                num_words_on_line += len(word_list)

            word_list_out.extend(word_list)

    # The final line is never followed by a line change, so flush it here.
    # (For an empty input file this records a single count of 0.)
    line_len_list.append(num_words_on_line)

    doc = {"words": word_list_out, "line_lens": line_len_list}
    # The context manager guarantees the JSON is flushed and the handle closed.
    # No ``encoding`` keyword for json.dump: UTF-8 is already the Python 2
    # default, and Python 3 rejects the keyword.
    with codecs.open("caldb_words_{}.json".format(text_name), "w", encoding='utf-8') as fp:
        json.dump(doc, fp, indent=4, ensure_ascii=False)
def make_pos_hashtable_cal():
    """Build a word -> {"POS": [...], "head_word": [...]} table from caldb.txt.

    Every row of ``caldb.txt`` is parsed with
    ``cal_tools.parseCalLine(row, False, False)``.  A POS tag that starts with
    a lowercase ``p`` has that first letter upper-cased.  The first time a
    word is seen it gets a fresh entry; afterwards a (POS, head word) pair is
    appended only when the POS *and* the head word are both new for that word.

    Words that end up with more than one POS are printed, and the whole table
    is then passed to ``cal_tools.saveUTFStr`` with the name
    ``cal_pos_hashtable.json``.
    """
    out = {}
    with open("caldb.txt", "r") as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line, False, False)
            word = line_obj["word"]
            head_word = line_obj["head_word"]
            pos = line_obj["POS"]
            # startswith() tolerates an empty POS, where pos[0] would raise.
            if pos.startswith('p'):
                pos = "P" + pos[1:]

            entry = out.get(word)
            if entry is None:
                out[word] = {"POS": [pos], "head_word": [head_word]}
                continue

            # NOTE(review): a pair is skipped when EITHER half was already
            # recorded for this word -- confirm that `and` (not `or`) is the
            # intended condition.
            if pos not in entry["POS"] and head_word not in entry["head_word"]:
                entry["POS"].append(pos)
                entry["head_word"].append(head_word)

    for key in out:
        entry = out[key]
        if len(entry["POS"]) > 1:
            # One pre-formatted string gives the same space-separated output
            # as the old multi-item print statement and parses on Python 3.
            print(u"^{}^ {} {}".format(
                key,
                u"*-*".join(entry["head_word"]),
                entry["POS"],
            ))

    cal_tools.saveUTFStr(out, "cal_pos_hashtable.json")
def make_cal_db_word_list(text_name):
    """Write the words and prefixed head words of a CAL db text file as JSON.

    Reads ``caldb_<text_name>.txt`` and parses every row with
    ``cal_tools.parseCalLine(row, True, False)``.  For each row it records the
    ``"word"`` value and a head word built by concatenating the row's
    ``"prefix"`` items (underscores removed; no prefix when the key is absent)
    in front of ``"head_word"``.

    The result, ``{"words": [...], "head_words": [...]}``, is written to
    ``caldb_words_<text_name>.json``.

    NOTE(review): an earlier function with this same name appears in the
    visible source; if both live in one module, this definition replaces the
    earlier one at import time -- confirm that is intended.

    :param text_name: suffix used to build the input and output file names.
    """
    word_list_out = []
    hword_list_out = []

    with open("caldb_{}.txt".format(text_name), "r") as caldb:
        for line in caldb:
            line_obj = cal_tools.parseCalLine(line, True, False)
            word_list_out.append(line_obj["word"])

            if "prefix" in line_obj:
                prefix_list = [pre.replace("_", "") for pre in line_obj["prefix"]]
            else:
                prefix_list = []
            hword_list_out.append("".join(prefix_list) + line_obj["head_word"])

    doc = {"words": word_list_out, "head_words": hword_list_out}
    # The context manager guarantees the JSON is flushed and the handle closed.
    # No ``encoding`` keyword for json.dump: UTF-8 is already the Python 2
    # default, and Python 3 rejects the keyword.
    with codecs.open("caldb_words_{}.json".format(text_name), "w", encoding='utf-8') as fp:
        json.dump(doc, fp, indent=4, ensure_ascii=False)