예제 #1
0
def output_chapters_verses(book, chapter_index_filename, chapter_output_location):
    """Write every verse listed in a chapter index to its own text file.

    The index is read with ``get_split_file_list`` and is expected to hold a
    header row followed by one row per verse, for example::

        ['Book', 'Chapter', 'Verse', 'Title Start', 'Title End', 'Verse Length']
        ['1 Nephi ', '1', '1', '5720', '5731', '377']

    For each row the verse body is sliced out of the book text using the
    'Title End' and 'Verse Length' offsets, a leading "whitespace, digits,
    whitespace" prefix is stripped when present, the final character of the
    slice is dropped, and the result is written to
    ``<chapter_output_location><Book>CHP-<Chapter>_VRS-<Verse>.txt``.

    Args:
        book: Path of the book text file (read with ``get_file_text``).
        chapter_index_filename: Path of the verse index file.
        chapter_output_location: Prefix for the output file names. It is
            concatenated as-is, so it must already end with a path separator.

    Returns:
        The list of index rows with the header row removed.
    """
    chapter_list = get_split_file_list(chapter_index_filename)
    if not chapter_list:
        # Nothing to do for an empty index (there is not even a header row).
        return chapter_list

    # The first row is the column header; only verse rows remain afterwards.
    chapter_list.pop(0)
    c = {'Book': 0, 'Chapter': 1, 'Verse': 2, 'Title Start': 3, 'Title End': 4, 'Verse Length': 5}

    book_text = get_file_text(book)

    # One whitespace character, a run of digits, one whitespace character at
    # the very start of the slice (presumably the printed verse number).
    verse_regex = re.compile(r'^\s\d+\s')

    for chp in chapter_list:
        output_filename = (chapter_output_location + chp[c['Book']] + 'CHP-' + chp[c['Chapter']]
                           + '_VRS-' + chp[c['Verse']] + '.txt')
        title_end = int(chp[c['Title End']])
        verse_length = int(chp[c['Verse Length']])
        verse_text = book_text[title_end + 1:title_end + verse_length]

        # Strip the numeric prefix only when it is really there; without this
        # check a verse lacking the prefix raised AttributeError on None.
        m = verse_regex.match(verse_text)
        body_start = m.end(0) if m else 0
        verse_text = verse_text[body_start:-1]

        # The context manager closes the file even if the write fails.
        with open(output_filename, 'w') as o_file:
            o_file.write(verse_text)
    return chapter_list
예제 #2
0
def find_chapters(book):
    """Build a tab-separated index of "<book> <chapter>:<verse>" references.

    The book text is scanned for references shaped like ``1 Nephi 1:1`` (any
    text on the line, then digits, a colon, digits). For every reference one
    row is written to ``<directory>/<file_name>-chp.txt`` with the columns
    Book, Chapter, Verse, Title Start, Title End and Verse Length, where the
    start/end values are character offsets of the reference in the text and
    the length is the distance from the end of this reference to the start of
    the next one. The last row is measured up to the
    'End of the Project Gutenberg EBook' marker, or up to the end of the text
    when that marker is absent.

    Args:
        book: Path of the book text file (read with ``get_file_text``).
    """
    text = get_file_text(book)

    directory, file_name, _extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chp.txt'

    chapters_regex = re.compile(r'(.+?)(\d+):(\d+)')
    end_regex = 'End of the Project Gutenberg EBook'
    fields = ['Book', 'Chapter', 'Verse', 'Title Start', 'Title End', 'Verse Length']

    # Resolve the end position before opening the output so that a missing
    # marker can no longer abort the function with a half-written file.
    end_match = re.search(end_regex, text)
    end_chapter = end_match.start() if end_match else len(text)

    with open(output_filename, 'w') as o_file:
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in chapters_regex.finditer(text):
            if previous_title != -1:
                # Close the previous row: its length is only known once the
                # next reference has been found.
                o_file.write('%d\n' % (m.start() - previous_title))
            o_file.write('%s\t%s\t%s\t%d\t%d\t'
                         % (m.group(1), m.group(2), m.group(3), m.start(), m.end()))
            previous_title = m.end()

        # Finish the last open row. When nothing matched there is no open row,
        # so no stray one-column line is emitted.
        if previous_title != -1:
            o_file.write('%d\n' % (end_chapter - previous_title))
예제 #3
0
def count_words_unigram_pos(input_filename, output_path=''):
    """Count ASCII-letter words in a text file and tag each one in isolation.

    Words are maximal runs of ``[a-zA-Z]`` (case-sensitive). Each distinct
    word is passed on its own to ``pos_tag`` and the tag of the single
    returned pair is kept. The result is written through ``make_output_file``
    (suffix ``-words_unigram_pos``) as tab-separated ``word / count / pos``
    rows under a header line, ordered by ascending count.

    Args:
        input_filename: Path of the text file to analyse.
        output_path: Forwarded to ``make_output_file``.
    """
    txt = get_file_text(input_filename)

    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    for word in re.findall(word_regex, txt, re.M | re.S | re.U):
        word_frequency[word] = word_frequency.get(word, 0) + 1

    # items() works on Python 2 and 3 alike (iteritems() is Python 2 only).
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))

    # Tag before opening the output so a tagging failure leaves no empty file.
    word_analysis = [(word, count, pos_tag([word])[0][1]) for word, count in sorted_words]

    o_file = make_output_file(input_filename, output_path=output_path, prefix='',
                              suffix='-words_unigram_pos')
    try:
        o_file.write('word\tcount\tpos\n')
        for word, count, pos in word_analysis:
            o_file.write('%s\t%d\t%s\n' % (word, count, pos))
    finally:
        o_file.close()
예제 #4
0
def find_koran_chapters(book):
    """Build a tab-separated index of chapter headings found in the text.

    A heading is recognised when it looks like::

        CHAPTER III.

        ENTITLED, THE FAMILY OF IMRAN; REVEALED AT MEDINA

    i.e. text starting with ``CHAPTER`` up to a full stop, a blank line, then
    ``ENTITLED, `` followed by the title up to the first semicolon. One row is
    written per heading to ``<directory>/<file_name>-chapters_debug.txt`` with
    the columns Chapter, Title, Title Start, Title End and Verse Length. Start
    and end are character offsets of the match; the length is the distance
    from the end of this heading to the start of the next one (or to the end
    of the text for the last heading). Commas are removed from the title.

    Args:
        book: Path of the book text file (read with ``get_file_text``).
    """
    text = get_file_text(book)

    directory, file_name, _extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chapters_debug.txt'

    # NOTE(review): with re.S the '.' also matches newlines, so a stray
    # "CHAPTER" in running text can start a match that spans several lines.
    regex = re.compile(r'(CHAPTER.+?)\.\n\nENTITLED, (.+?);', re.M | re.S | re.U)
    fields = ['Chapter', 'Title', 'Title Start', 'Title End', 'Verse Length']
    end_chapter = len(text)

    with open(output_filename, 'w') as o_file:
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in regex.finditer(text):
            if previous_title != -1:
                # Close the previous row now that the next heading is known.
                o_file.write('%d\n' % (m.start() - previous_title))
            title = m.group(2).replace(',', '')
            o_file.write('%s\t%s\t%d\t%d\t' % (m.group(1), title, m.start(), m.end()))
            previous_title = m.end()

        # Finish the last open row; skip it when no heading was found so the
        # file does not end with a stray one-column line.
        if previous_title != -1:
            o_file.write('%d\n' % (end_chapter - previous_title))
예제 #5
0
def count_words_v0(input_filename, output_path=''):
    """Count space-separated tokens in a text file and write them out.

    The text is split on single space characters only, so tokens may be empty
    strings or contain newlines and punctuation. The counts are written
    through ``make_output_file`` (suffix ``-words``) as tab-separated
    ``token / count`` rows ordered by ascending count.

    Args:
        input_filename: Path of the text file to analyse.
        output_path: Forwarded to ``make_output_file``.
    """
    txt = get_file_text(input_filename)

    word_frequency = {}
    for token in txt.split(' '):
        word_frequency[token] = word_frequency.get(token, 0) + 1

    # items() works on Python 2 and 3 alike (iteritems() is Python 2 only).
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))

    o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words')
    try:
        for token, count in sorted_words:
            o_file.write('%s\t%d\n' % (token, count))
    finally:
        # Release the handle even if a write fails.
        o_file.close()
예제 #6
0
def chords_add_header(chapter_index_filename, verse_location, n_gram_path):
    """Prepend a comma-separated word header to every chord matrix file.

    For each entry in ``verse_location`` the matching
    ``<name>_dic_index.<ext>`` file under ``n_gram_path`` is read, the first
    column of every line (as produced by ``split_line``) is collected, and the
    joined list is written as a new first line of ``<name>_chord.<ext>``,
    followed by the file's previous content.

    The header is added unconditionally, so running this twice on the same
    files yields two header lines.

    Args:
        chapter_index_filename: Unused; kept so existing callers keep working.
        verse_location: Directory whose entries name the verses to process.
        n_gram_path: Prefix of the chord / index files. It is concatenated
            as-is, so it must already end with a path separator.
    """
    for f in os.listdir(verse_location):
        directory, file_name, extension = file_name_ext(f)
        chord_filename = n_gram_path + file_name + '_chord.' + extension
        dic_filename = n_gram_path + file_name + '_dic_index.' + extension

        with open(dic_filename, 'r') as d_file:
            word_header = [split_line(line)[0] for line in d_file]

        # Read the whole matrix first: the file is truncated when reopened
        # for writing below.
        c_txt = get_file_text(chord_filename)

        with open(chord_filename, 'w') as o_file:
            o_file.write(','.join(word_header) + '\n')
            o_file.write(c_txt)
예제 #7
0
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    """Write bigram adjacency data for every verse file of every indexed book.

    The keys of the chapter index (column 0) are used as file-name prefixes:
    every entry of ``verse_location`` matching ``<key>*`` is processed. For
    each such verse file, '.', ',' and newlines are replaced by spaces, the
    text is split on whitespace, and three files are written under
    ``n_gram_path``:

    * ``<name>_n-gram.<ext>``    - JSON mapping word -> {following word: 1}
    * ``<name>_dic_index.<ext>`` - tab-separated ``word / index`` rows
    * ``<name>_chord.<ext>``     - comma-separated square integer matrix;
      the row is the first word's index, the column the following word's.

    An empty verse file produces an empty JSON object, an empty index and an
    empty matrix file.

    Args:
        chapter_index_filename: Path of the tab-separated chapter index.
        verse_location: Directory prefix of the verse files. It is
            concatenated as-is, so it must already end with a path separator.
        n_gram_path: Output prefix, concatenated as-is in the same way.
    """
    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t', key_col=0)

    files = os.listdir(verse_location)

    for book_key, _row in chapters.items():
        for f in fnmatch.filter(files, book_key + '*'):
            verse_text = get_file_text(verse_location + f)
            # TODO: Make this a function, punctuation replacement
            for mark in ('.', ',', '\n'):
                verse_text = verse_text.replace(mark, ' ')
            verse_words = verse_text.split()

            n_gram = {}
            dic = {}
            for position, word in enumerate(verse_words):
                dic[word] = dic.get(word, 0) + 1
                if position:
                    followers = n_gram.setdefault(verse_words[position - 1], {})
                    # NOTE(review): the stored value is always 1 (presence
                    # flag), never a count - confirm that this is intended.
                    followers.setdefault(word, 1)

            directory, file_name, extension = file_name_ext(f)

            output_filename = n_gram_path + file_name + '_n-gram.' + extension
            with open(output_filename, 'w') as o_file:
                json.dump(n_gram, o_file, sort_keys=True, indent=4)

            # Word <-> matrix index mapping, in the dictionary's iteration order.
            verse_index = list(dic)
            index_lookup = dict((word, i) for i, word in enumerate(verse_index))

            output_filename = n_gram_path + file_name + '_dic_index.' + extension
            with open(output_filename, 'w') as o_file:
                for i, word in enumerate(verse_index):
                    o_file.write('%s\t%d\n' % (word, i))

            dic_size = len(verse_index)
            # np.int was only an alias of the builtin int and has been removed
            # from NumPy; the builtin gives the same dtype.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for first_word, followers in n_gram.items():
                row_index = index_lookup[first_word]
                for next_word, flag in followers.items():
                    verse_matrix[row_index, index_lookup[next_word]] = flag

            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')