def make_cal_pos_hashtable(cutoff=0): obj = {} with open(full_cal_db_location, 'rb') as cal: for line in cal: try: lineObj = cal_tools.parseCalLine(line, False, False) except IndexError: print line continue word = lineObj["word"] pos = lineObj["POS"] if not word in obj: obj[word] = [] #pos_set = set(obj[word]) #pos_set.add(pos) obj[word].append(pos) num_one_pos_words = 0 total_num_pos = 0 for word, pos in reversed(obj.items()): pos_counts = {} for p in pos: if not p in pos_counts: pos_counts[p] = 0 pos_counts[p] += 1 obj[word] = pos_counts if len(pos_counts) < cutoff: del obj[word] continue total_num_pos += len(pos_counts) if len(pos_counts) == 1: num_one_pos_words += 1 print "Percent Words With 1 POS", round( 100.0 * num_one_pos_words / len(obj), 3) print "Avg Num POS per word", round(1.0 * total_num_pos / len(obj), 3) cal_tools.saveUTFStr(obj, "cal_pos_hashtable.json") f = codecs.open("double_pos_before_eng.txt", "wb", encoding='utf8') for word, pos in obj.items(): f.write(u'{} ~-~ {}\n'.format(word, str(pos))) f.close()
def make_cal_pos_hashtable(cutoff=0): obj = {} with open(full_cal_db_location,'rb') as cal: for line in cal: try: lineObj = cal_tools.parseCalLine(line,False,False) except IndexError: print line continue word = lineObj["word"] pos = lineObj["POS"] if not word in obj: obj[word] = [] #pos_set = set(obj[word]) #pos_set.add(pos) obj[word].append(pos) num_one_pos_words = 0 total_num_pos = 0 for word,pos in reversed(obj.items()): pos_counts = {} for p in pos: if not p in pos_counts: pos_counts[p] = 0 pos_counts[p] += 1 obj[word] = pos_counts if len(pos_counts) < cutoff: del obj[word] continue total_num_pos += len(pos_counts) if len(pos_counts) == 1: num_one_pos_words += 1 print "Percent Words With 1 POS",round(100.0*num_one_pos_words/len(obj),3) print "Avg Num POS per word",round(1.0*total_num_pos/len(obj),3) cal_tools.saveUTFStr(obj,"cal_pos_hashtable.json") f = codecs.open("double_pos_before_eng.txt","wb",encoding='utf8') for word,pos in obj.items(): f.write(u'{} ~-~ {}\n'.format(word,str(pos))) f.close()
def make_cal_segments(mesechta):
    """Parse the CAL db file for ``mesechta`` into short gemara segments,
    grouped by daf, and save them to cal_lines_<mesechta>.json.

    Each source line is split recursively in half until every segment has
    at most 5 words. The output JSON holds two parallel arrays: "lines"
    (one list of segments per daf) and "dafs" (daf labels such as "2a").

    :param mesechta: tractate name, used to locate the input file and to
        name the output JSON.
    """
    def get_daf_str(daf_num, daf_side_num):
        # CAL encodes the amud as 1 (side a) / 2 (side b)
        return '{}{}'.format(daf_num, 'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []

    def split_and_store(gem_line):
        # recursively halve the line until every piece has <= 5 words,
        # then append the pieces to cal_gem_lines in order
        small_gem_lines = [gem_line]
        has_big_lines = True
        while has_big_lines:
            has_big_lines = False
            new_small_gem_lines = []
            for gl in small_gem_lines:
                if len(gl) > 5:
                    has_big_lines = True
                    cut_index = len(gl) // 2  # explicit floor division
                    new_small_gem_lines.append(gl[:cut_index])
                    new_small_gem_lines.append(gl[cut_index:])
                else:
                    new_small_gem_lines.append(gl)
            small_gem_lines = new_small_gem_lines
        for gl in small_gem_lines:
            cal_gem_lines.append(gl)

    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta), "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        for line in f:
            line_obj = cal_tools.parseCalLine(line, True, False)
            # add a daf str prop
            line_obj["daf"] = get_daf_str(line_obj['pg_num'], line_obj['side'])
            line_obj["word"] = line_obj["word"].replace("'", '"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                # remove abbreviations (trailing gershayim)
                line_obj["word"] = line_obj["word"][0:-1]
            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    split_and_store(temp_gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)
        # fix: flush the final accumulated line -- it was previously dropped
        if len(temp_gem_line) > 0:
            split_and_store(temp_gem_line)

    # break up by daf, concat lines to strs
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for line in cal_gem_lines:
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)
    # flush the last daf after the loop (clearer than checking the index
    # on every iteration inside it)
    if len(curr_daf_lines) > 0:
        all_daf_lines.append(curr_daf_lines)
        all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({
        "lines": all_daf_lines,
        "dafs": all_dafs
    }, "cal_lines_{}.json".format(mesechta))
def make_cal_segments(mesechta):
    """Parse the CAL db file for ``mesechta`` into short gemara segments,
    grouped by daf, and save them to cal_lines_<mesechta>.json.

    Source lines are halved recursively until each segment has at most 5
    words. The saved JSON holds parallel arrays "lines" (segments per daf)
    and "dafs" (labels like "2a"/"2b").

    :param mesechta: tractate name used for the input path and output name.
    """
    def get_daf_str(daf_num, daf_side_num):
        # CAL encodes the amud as 1 (side a) / 2 (side b)
        return '{}{}'.format(daf_num, 'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []

    def flush_line(gem_line):
        # halve repeatedly until no piece exceeds 5 words, then store
        pieces = [gem_line]
        big_remaining = True
        while big_remaining:
            big_remaining = False
            next_pieces = []
            for piece in pieces:
                if len(piece) > 5:
                    big_remaining = True
                    mid = len(piece) // 2  # explicit floor division
                    next_pieces.append(piece[:mid])
                    next_pieces.append(piece[mid:])
                else:
                    next_pieces.append(piece)
            pieces = next_pieces
        cal_gem_lines.extend(pieces)

    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta), "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        for line in f:
            line_obj = cal_tools.parseCalLine(line, True, False)
            # add a daf str prop
            line_obj["daf"] = get_daf_str(line_obj['pg_num'], line_obj['side'])
            line_obj["word"] = line_obj["word"].replace("'", '"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                # remove abbreviations (trailing gershayim)
                line_obj["word"] = line_obj["word"][0:-1]
            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    flush_line(temp_gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)
        # fix: flush the last accumulated line -- it was previously dropped
        if len(temp_gem_line) > 0:
            flush_line(temp_gem_line)

    # break up by daf, concat lines to strs
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for line in cal_gem_lines:
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)
    # fix: this copy omitted the final flush, losing the whole last daf
    if len(curr_daf_lines) > 0:
        all_daf_lines.append(curr_daf_lines)
        all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({
        "lines": all_daf_lines,
        "dafs": all_dafs
    }, "cal_lines_{}.json".format(mesechta))