Example No. 1
def make_aramaic_training_context():
    training = []
    with open('../noahcaldbfull.txt','rb') as cal:
        temp_phrase = []
        curr_line_num = None
        curr_word_num = None
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line,True,False)
            except IndexError:
                continue
            if curr_line_num is None:
                curr_line_num = lineObj['line_num']
            if curr_word_num is None:
                curr_word_num = lineObj['word_num'] - 1
            if curr_line_num == lineObj['line_num'] and (curr_word_num + 1) == lineObj['word_num']:
                temp_phrase.extend(lineObj['word'].split(' '))
                curr_word_num = lineObj['word_num']
            else:
                training.append({'language': 'aramaic','phrase': temp_phrase[:]})
                curr_line_num = lineObj['line_num']
                curr_word_num = lineObj['word_num']
                temp_phrase = lineObj['word'].split(' ')
        # flush the final phrase; without this the last phrase in the file is dropped
        if temp_phrase:
            training.append({'language': 'aramaic','phrase': temp_phrase[:]})

    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])

    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,float(total_words)/total_phrases)

    return training
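
A minimal usage sketch (not part of the original source; it assumes cal_tools is importable and the CAL dump exists at the path above):

training = make_aramaic_training_context()
for entry in training[:3]:
    # each entry has the shape {'language': 'aramaic', 'phrase': [...]};
    # assumes parseCalLine returned unicode words
    print u' '.join(entry['phrase'])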
Example No. 2
def make_aramaic_training_context():
    training = []
    with open('data/1_cal_input/caldbfull.txt','rb') as cal:
        temp_phrase = []
        curr_line_num = None
        curr_word_num = None
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line,True,False)
            except IndexError:
                continue
            if curr_line_num is None:
                curr_line_num = lineObj['line_num']
            if curr_word_num is None:
                curr_word_num = lineObj['word_num'] - 1
            if curr_line_num == lineObj['line_num'] and (curr_word_num + 1) == lineObj['word_num']:
                temp_phrase.extend(lineObj['word'].split(' '))
                curr_word_num = lineObj['word_num']
            else:
                training.append({'language': 'aramaic','phrase': temp_phrase[:]})
                curr_line_num = lineObj['line_num']
                curr_word_num = lineObj['word_num']
                temp_phrase = lineObj['word'].split(' ')
        # flush the final phrase; without this the last phrase in the file is dropped
        if temp_phrase:
            training.append({'language': 'aramaic','phrase': temp_phrase[:]})

    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])

    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,float(total_words)/total_phrases)

    return training
Example No. 3
def make_aramaic_training():
    abbrev_dict = json.load(codecs.open('data/1_cal_input/abbreviations.json',encoding='utf8'))
    # collapse each abbreviation to a single expansion: keep the definition
    # whose count sorts first among the candidates
    for abbrev, defs in abbrev_dict.items():
        sorted_defs = sorted(defs.items(),key=lambda x: x[1])
        abbrev_dict[abbrev] = sorted_defs[0][0]

    training = []
    num_found = 0
    num_missed = 0
    with open('data/1_cal_input/caldbfull.txt','rb') as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line,True,withshinsin=False)
            temp_word = line_obj['word']

            words = []
            if u"'" in temp_word:
                # an apostrophe marks an abbreviation; strip it and expand via the table
                temp_word = temp_word.replace(u"'",u'')
                if temp_word in abbrev_dict:
                    words = re.split(ur'\s+',abbrev_dict[temp_word])
                    num_found += 1
                else:
                    num_missed += 1
                    #print u'missed {}'.format(temp_word)
            else:
                words = [temp_word]

            for w in words:
                training.append({'word':w,'tag':'aramaic'})

    print u'Num abbrevs replaced {}. Num Missed {}'.format(num_found,num_missed)
    return training
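
The abbreviation handling can be exercised on its own; the data below is made up but shaped like abbreviations.json (maps from abbreviation to expansion counts), since the real file is not shown here:

abbrev_dict = {u'example_abbrev': {u'expansion_a': 2, u'expansion_b': 7}}
for abbrev, defs in abbrev_dict.items():
    # keep the expansion whose count sorts first, as in make_aramaic_training
    abbrev_dict[abbrev] = sorted(defs.items(), key=lambda x: x[1])[0][0]
assert abbrev_dict[u'example_abbrev'] == u'expansion_a'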
Example No. 4
def make_headword_hashtable():
    headword_hashtable = {}
    with open('../language_classifier/noahcaldb.txt', 'rb') as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line, True, withshinsin=False)
            headword_hashtable[line_obj['word']] = line_obj['head_word']
    return headword_hashtable
Example No. 5
def make_headword_hashtable():
    headword_hashtable = {}
    with open('../language_classifier/noahcaldb.txt','rb') as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line,True,withshinsin=False)
            headword_hashtable[line_obj['word']] = line_obj['head_word']
    return headword_hashtable
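
A lookup sketch for the two variants above; the surface form is a placeholder, not a word from the real database:

headwords = make_headword_hashtable()
surface_form = u'placeholder_form'  # hypothetical key
if surface_form in headwords:
    print headwords[surface_form]
else:
    print u'form not in the CAL database'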
Example No. 6
def make_aramaic_training():
    training = []
    with open('../noahcaldbfull.txt','rb') as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line,True,withshinsin=False)
            training.append({'word':line_obj['word'],'tag':'aramaic'})

    return training
Example No. 7
def make_aramaic_training():
    training = []
    with open('../noahcaldbfull.txt', 'rb') as cal:
        for line in cal:
            line_obj = cal_tools.parseCalLine(line, True, withshinsin=False)
            training.append({'word': line_obj['word'], 'tag': 'aramaic'})

    return training
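
Both of these simpler variants return one flat row per CAL line; a sketch of inspecting the result, illustrative only:

training = make_aramaic_training()
# every row has the shape {'word': <word from the CAL dump>, 'tag': 'aramaic'}
print 'rows: {}'.format(len(training))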
Example No. 8
def make_cal_pos_hashtable(cutoff=0):
    obj = {}
    with open(full_cal_db_location, 'rb') as cal:
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line, False, False)
            except IndexError:
                print line
                continue
            word = lineObj["word"]
            pos = lineObj["POS"]
            if word not in obj:
                obj[word] = []

            #pos_set = set(obj[word])
            #pos_set.add(pos)
            obj[word].append(pos)

    num_one_pos_words = 0
    total_num_pos = 0
    # in Python 2, obj.items() returns a list copy, so it is safe to delete
    # entries from obj while looping
    for word, pos in reversed(obj.items()):
        pos_counts = {}
        for p in pos:
            if p not in pos_counts:
                pos_counts[p] = 0
            pos_counts[p] += 1
        obj[word] = pos_counts
        if len(pos_counts) < cutoff:
            del obj[word]
            continue
        total_num_pos += len(pos_counts)
        if len(pos_counts) == 1:
            num_one_pos_words += 1

    print "Percent Words With 1 POS", round(
        100.0 * num_one_pos_words / len(obj), 3)
    print "Avg Num POS per word", round(1.0 * total_num_pos / len(obj), 3)

    cal_tools.saveUTFStr(obj, "cal_pos_hashtable.json")
    f = codecs.open("double_pos_before_eng.txt", "wb", encoding='utf8')
    for word, pos in obj.items():
        f.write(u'{} ~-~ {}\n'.format(word, str(pos)))
    f.close()
Example No. 9
def make_cal_pos_hashtable(cutoff=0):
    obj = {}
    with open(full_cal_db_location,'rb') as cal:
        for line in cal:
            try:
                lineObj = cal_tools.parseCalLine(line,False,False)
            except IndexError:
                print line
                continue
            word = lineObj["word"]
            pos = lineObj["POS"]
            if word not in obj:
                obj[word] = []

            #pos_set = set(obj[word])
            #pos_set.add(pos)
            obj[word].append(pos)

    num_one_pos_words = 0
    total_num_pos = 0
    # in Python 2, obj.items() returns a list copy, so it is safe to delete
    # entries from obj while looping
    for word,pos in reversed(obj.items()):
        pos_counts = {}
        for p in pos:
            if p not in pos_counts:
                pos_counts[p] = 0
            pos_counts[p] += 1
        obj[word] = pos_counts
        if len(pos_counts) < cutoff:
            del obj[word]
            continue
        total_num_pos += len(pos_counts)
        if len(pos_counts) == 1:
            num_one_pos_words += 1

    print "Percent Words With 1 POS",round(100.0*num_one_pos_words/len(obj),3)
    print "Avg Num POS per word",round(1.0*total_num_pos/len(obj),3)

    cal_tools.saveUTFStr(obj,"cal_pos_hashtable.json")
    f = codecs.open("double_pos_before_eng.txt","wb",encoding='utf8')
    for word,pos in obj.items():
        f.write(u'{} ~-~ {}\n'.format(word,str(pos)))
    f.close()
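
A sketch of reading the saved table back, assuming cal_tools.saveUTFStr (not shown here) writes the dict as UTF-8 JSON:

import json
import codecs

with codecs.open('cal_pos_hashtable.json', encoding='utf8') as f:
    pos_table = json.load(f)
# each value is a POS -> count histogram, e.g. {u'N': 12, u'V': 3}
unambiguous = [w for w, counts in pos_table.items() if len(counts) == 1]
print 'words with a single POS: {}'.format(len(unambiguous))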
Example No. 10
def make_cal_segments(mesechta):
    def get_daf_str(daf_num, daf_side_num):
        return '{}{}'.format(daf_num, 'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []
    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta),
              "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        curr_daf = ''
        for line in f:
            line_obj = cal_tools.parseCalLine(line, True, False)
            line_obj["daf"] = get_daf_str(
                line_obj['pg_num'], line_obj['side'])  #add a daf str prop
            line_obj["word"] = line_obj["word"].replace("'", '"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                line_obj["word"] = line_obj["word"][:-1]  #remove abbreviations

            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    small_gem_lines = [temp_gem_line]
                    has_big_lines = True

                    #iteratively halve big lines until none has more than 5 words
                    while has_big_lines:
                        has_big_lines = False
                        new_small_gem_lines = []
                        for gem_line in small_gem_lines:
                            if len(gem_line) > 5:
                                has_big_lines = True
                                cut_index = len(gem_line) / 2
                                new_small_gem_lines.append(
                                    gem_line[:cut_index])
                                new_small_gem_lines.append(
                                    gem_line[cut_index:])
                            else:
                                new_small_gem_lines.append(gem_line)
                        small_gem_lines = new_small_gem_lines
                    for gem_line in small_gem_lines:
                        cal_gem_lines.append(gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)
    '''
    #clean up lines with only 1 or 2 words
    new_cal_gem_lines = []
    new_cal_gem_dafs = []

    for i,clt in enumerate(zip(cal_gem_lines,cal_gem_line_nums,cal_gem_dafs)):
        cal_line, line_num, daf = clt
        if i > 0 and cal_gem_dafs[i-1] == daf and line_num-cal_gem_line_nums[i-1] <= 1:
            p_cal_line = cal_gem_lines[i-1]
        else:
            p_cal_line = None

        if i < len(cal_gem_lines)-1 and cal_gem_dafs[i+1] == daf and cal_gem_line_nums[i+1]-line_num <= 1:
            n_cal_line = cal_gem_lines[i+1]
        else:
            n_cal_line = None

        if len(cal_line) <= 2
    '''

    #break up by daf, concat lines to strs
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for iline, line in enumerate(cal_gem_lines):
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)

        # don't forget to add the last daf in
        if iline == len(cal_gem_lines) - 1:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({
        "lines": all_daf_lines,
        "dafs": all_dafs
    }, "cal_lines_{}.json".format(mesechta))
Example No. 11
def make_cal_segments(mesechta):

    def get_daf_str(daf_num,daf_side_num):
        return '{}{}'.format(daf_num,'a' if daf_side_num == 1 else 'b')

    cal_gem_lines = []
    with open("{}{}.txt".format(mesechta_cal_db_location, mesechta), "rb") as f:
        temp_gem_line = []
        curr_gem_line_num = -1
        curr_daf = ''
        for line in f:
            line_obj = cal_tools.parseCalLine(line,True,False)
            line_obj["daf"] = get_daf_str(line_obj['pg_num'],line_obj['side']) #add a daf str prop
            line_obj["word"] = line_obj["word"].replace("'",'"')
            if len(line_obj["word"]) > 1 and line_obj["word"][-1] == '"':
                line_obj["word"] = line_obj["word"][0:-1] #remove abbreviations

            if line_obj["line_num"] != curr_gem_line_num:
                if len(temp_gem_line) > 0:
                    small_gem_lines = [temp_gem_line]
                    has_big_lines = True

                    #iteratively halve big lines until none has more than 5 words
                    while has_big_lines:
                        has_big_lines = False
                        new_small_gem_lines = []
                        for gem_line in small_gem_lines:
                            if len(gem_line) > 5:
                                has_big_lines = True
                                cut_index = len(gem_line)/2
                                new_small_gem_lines.append(gem_line[:cut_index])
                                new_small_gem_lines.append(gem_line[cut_index:])
                            else:
                                new_small_gem_lines.append(gem_line)
                        small_gem_lines = new_small_gem_lines
                    for gem_line in small_gem_lines:
                        cal_gem_lines.append(gem_line)
                temp_gem_line = [line_obj]
                curr_gem_line_num = line_obj["line_num"]
            else:
                temp_gem_line.append(line_obj)

    '''
    #clean up lines with only 1 or 2 words
    new_cal_gem_lines = []
    new_cal_gem_dafs = []

    for i,clt in enumerate(zip(cal_gem_lines,cal_gem_line_nums,cal_gem_dafs)):
        cal_line, line_num, daf = clt
        if i > 0 and cal_gem_dafs[i-1] == daf and line_num-cal_gem_line_nums[i-1] <= 1:
            p_cal_line = cal_gem_lines[i-1]
        else:
            p_cal_line = None

        if i < len(cal_gem_lines)-1 and cal_gem_dafs[i+1] == daf and cal_gem_line_nums[i+1]-line_num <= 1:
            n_cal_line = cal_gem_lines[i+1]
        else:
            n_cal_line = None

        if len(cal_line) <= 2
    '''

    #break up by daf, concat lines to strs
    all_daf_lines = []
    all_dafs = []
    curr_daf = ''
    curr_daf_lines = []
    for line in cal_gem_lines:
        if line[0]["daf"] != curr_daf:
            if len(curr_daf_lines) > 0:
                all_daf_lines.append(curr_daf_lines)
                all_dafs.append(curr_daf)
            curr_daf = line[0]["daf"]
            curr_daf_lines = [line]
        else:
            curr_daf_lines.append(line)

    # flush the final daf; without this the last daf's lines are dropped
    if len(curr_daf_lines) > 0:
        all_daf_lines.append(curr_daf_lines)
        all_dafs.append(curr_daf)

    cal_tools.saveUTFStr({"lines":all_daf_lines,"dafs":all_dafs},"cal_lines_{}.json".format(mesechta))
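
A sketch of consuming the output, again assuming cal_tools.saveUTFStr writes plain UTF-8 JSON and that the module-level path globals used above are set; the tractate name is a placeholder:

import json
import codecs

make_cal_segments('Berakhot')  # hypothetical mesechta name
with codecs.open('cal_lines_Berakhot.json', encoding='utf8') as f:
    cal_lines = json.load(f)
for daf, lines in zip(cal_lines['dafs'], cal_lines['lines']):
    # each entry in lines is a segment: a list of word objects, at most 5 long
    print u'{}: {} segments'.format(daf, len(lines))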