def output_json_book_chapter_verse(input_file):
    """Build a nested {book: {chapter: {verse: length}}} dictionary from a
    tab-separated chapter/verse index file and dump it as JSON.

    The first line of the input file is assumed to be a header and is
    skipped.  Output goes to '<dir>/<name>-chapter_verse.json' next to the
    input file.
    """
    # Column positions in the tab-separated index (as written by find_chapters).
    BOOK, CHAPTER, VERSE, LENGTH = 0, 1, 2, 5
    d_book = {}
    with open(input_file, 'r') as i_file:
        i_file.readline()  # skip header row
        for txt in i_file:
            col = split_line(txt)
            chapters = d_book.setdefault(col[BOOK], {})
            verses = chapters.setdefault(col[CHAPTER], {})
            # Keep the first length seen for a verse (same semantics as the
            # original .get(..., default) assignment).
            verses.setdefault(col[VERSE], col[LENGTH])
    # NOTE: the original also built a sorted item list via the Python-2-only
    # dict.iteritems() (a crash under Python 3), but never used it; that dead
    # code is removed here.
    directory, file_name, extension = file_name_ext(input_file)
    output_filename = directory + '/' + file_name + '-chapter_verse.json'
    with open(output_filename, 'w') as o_file:
        json.dump(d_book, o_file)
def find_chapters(book):
    """Scan a Gutenberg bible text for 'Book Chapter:Verse' markers and write
    a tab-separated index file ('<dir>/<name>-chp.txt') next to the input.

    Each output row holds the marker's captured fields, its character span,
    and the length of the verse text that follows it (distance to the next
    marker, or to the Gutenberg end-of-book line for the last verse).
    """
    text = get_file_text(book)
    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chp.txt'
    # Captures: (book name)(chapter):(verse), e.g. 'Genesis 1:1'.
    chapters_regex = r'(.+?)(\d+):(\d+)'
    with open(output_filename, 'w') as o_file:
        fields = ['Book', 'Chapter', 'Verse', 'Title Start', 'Title End',
                  'Verse Length']
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in re.finditer(chapters_regex, text):
            if previous_title != -1:
                # Verse length = gap between the end of the previous marker
                # and the start of this one.
                verse_length = m.start() - previous_title
                o_file.write('%d\n' % (verse_length))
            o_file.write('%s\t%s\t%s\t%d\t%d\t'
                         % (m.group(1), m.group(2), m.group(3),
                            m.start(), m.end()))
            previous_title = m.end()
        # The last verse runs to the Gutenberg end-of-book marker; fall back
        # to end-of-text if the marker is absent (the original raised
        # AttributeError on a missing marker).
        end_match = re.search('End of the Project Gutenberg EBook', text)
        end_chapter = end_match.start() if end_match else len(text)
        verse_length = end_chapter - previous_title
        o_file.write('%d\n' % (verse_length))
def find_koran_chapters(book):
    """Scan the Gutenberg Koran text for 'CHAPTER ... ENTITLED, <title>;'
    headings and write a tab-separated chapter index to
    '<dir>/<name>-chapters_debug.txt' next to the input.

    Each output row holds the chapter label, its (comma-stripped) title, the
    heading's character span, and the length of the chapter text that follows
    (distance to the next heading, or to end-of-text for the last chapter).
    """
    text = get_file_text(book)
    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chapters_debug.txt'
    # Matches headings like:
    #   CHAPTER III.
    #
    #   ENTITLED, THE FAMILY OF IMRAN; REVEALED AT MEDINA
    regex = r'(CHAPTER.+?)\.\n\nENTITLED, (.+?);'
    with open(output_filename, 'w') as o_file:
        fields = ['Chapter', 'Title', 'Title Start', 'Title End',
                  'Verse Length']
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in re.finditer(regex, text, re.M + re.S + re.U):
            if previous_title != -1:
                # Chapter length = gap between the end of the previous
                # heading and the start of this one.
                verse_length = m.start() - previous_title
                o_file.write('%d\n' % (verse_length))
            # Strip commas so titles stay safe in comma-separated output.
            title = m.group(2).replace(',', '')
            o_file.write('%s\t%s\t%d\t%d\t'
                         % (m.group(1), title, m.start(), m.end()))
            previous_title = m.end()
        # The last chapter runs to the end of the text (this text has no
        # usable Gutenberg end-of-book marker).
        end_chapter = len(text)
        verse_length = end_chapter - previous_title
        o_file.write('%d\n' % (verse_length))
def chords_add_header(chapter_index_filename, verse_location, n_gram_path):
    """Prepend a comma-separated word header to every '*_chord' matrix file
    under n_gram_path, using the matching '*_dic_index' file for the words.

    chapter_index_filename is kept for interface compatibility; the original
    only fed it to file_name_ext and immediately discarded the result (dead
    code, removed here).
    """
    for f in os.listdir(verse_location):
        directory, file_name, extension = file_name_ext(f)
        chord_filename = n_gram_path + file_name + '_chord.' + extension
        dic_filename = n_gram_path + file_name + '_dic_index.' + extension
        # First tab-separated column of each dic_index line is the word for
        # that matrix row/column index.
        with open(dic_filename, 'r') as d_file:
            word_header = [split_line(txt)[0] for txt in d_file]
        # Read the whole matrix, then rewrite the file with the header first.
        c_txt = get_file_text(chord_filename)
        with open(chord_filename, 'w') as o_file:
            o_file.write(','.join(word_header) + '\n')
            o_file.write(c_txt)
def json_out(input_file):
    """Count occurrences of each chapter name (first tab-separated column)
    and dump the name-sorted (name, count) pairs as JSON.

    The first line of the input file is assumed to be a header and is
    skipped.  Output goes to '<dir>/<name>-chp_names.json'.
    """
    d_chapter_names = {}
    with open(input_file, 'r') as i_file:
        i_file.readline()  # skip header row
        for txt in i_file:
            fields = split_line(txt)
            d_chapter_names[fields[0]] = d_chapter_names.get(fields[0], 0) + 1
    # BUG FIX: .iteritems() is Python-2-only and raises AttributeError on
    # Python 3; .items() behaves identically on both.
    sorted_chp = sorted(d_chapter_names.items(), key=operator.itemgetter(0))
    directory, file_name, extension = file_name_ext(input_file)
    output_filename = directory + '/' + file_name + '-chp_names.json'
    with open(output_filename, 'w') as o_file:
        json.dump(sorted_chp, o_file)
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    """For each verse file whose name starts with a chapter key, build a
    word-bigram dictionary, a word index, and a dense bigram-count matrix.

    Writes, per verse file, under n_gram_path:
      <name>_n-gram.<ext>    JSON of {word: {next_word: count}}
      <name>_dic_index.<ext> tab-separated 'word<TAB>index' lines
      <name>_chord.<ext>     CSV bigram-count matrix (rows/cols per index)
    """
    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t',
                                         key_col=0)
    files = os.listdir(verse_location)
    for k in chapters:
        for f in fnmatch.filter(files, k + '*'):
            verse_text = get_file_text(verse_location + f)
            # TODO: make this punctuation replacement a function
            for ch in ('.', ',', '\n'):
                verse_text = verse_text.replace(ch, ' ')
            verse_words = verse_text.split()
            if not verse_words:
                # Empty verse file: the original crashed with IndexError here.
                continue
            n_gram = {}
            dic = {}
            prev_word = verse_words[0]
            dic[prev_word] = dic.get(prev_word, 0) + 1
            for word in verse_words[1:]:
                children = n_gram.setdefault(prev_word, {})
                # BUG FIX: the original used children.get(word, 1), which
                # froze every bigram count at 1; accumulate real frequencies.
                children[word] = children.get(word, 0) + 1
                dic[word] = dic.get(word, 0) + 1
                prev_word = word
            directory, file_name, extension = file_name_ext(f)
            output_filename = n_gram_path + file_name + '_n-gram.' + extension
            with open(output_filename, 'w') as o_file:
                json.dump(n_gram, o_file, sort_keys=True, indent=4)
            # Word -> matrix index, in dictionary insertion order.
            verse_index = list(dic)
            index_lookup = {w: i for i, w in enumerate(verse_index)}
            output_filename = (n_gram_path + file_name + '_dic_index.'
                               + extension)
            with open(output_filename, 'w') as o_file:
                for i, word in enumerate(verse_index):
                    o_file.write('%s\t%d\n' % (word, i))
            dic_size = len(verse_index)
            # np.int was removed in NumPy 1.24; the builtin int is equivalent.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for row_word, children in n_gram.items():
                row_index = index_lookup[row_word]
                for col_word, count in children.items():
                    verse_matrix[row_index, index_lookup[col_word]] = count
            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')