def output_chapters_verses(book, chapter_index_filename, chapter_output_location):
    """Write one text file per verse listed in a chapter index.

    Parameters:
        book: path of the full book text file.
        chapter_index_filename: tab-separated index produced by find_chapters().
        chapter_output_location: directory prefix for the per-verse output files.

    Returns:
        The parsed index rows (header row removed).
    """
    chapter_list = get_split_file_list(chapter_index_filename)
    chapter_list.pop(0)  # discard the header row
    # Column layout of an index row, e.g.:
    # ['1 Nephi ', '1', '1', '5720', '5731', '377']
    c = {'Book': 0, 'Chapter': 1, 'Verse': 2,
         'Title Start': 3, 'Title End': 4, 'Verse Length': 5}
    book_text = get_file_text(book)
    verse_regex = '^\s\d+\s'  # leading verse number, e.g. " 1 "
    for chp in chapter_list:
        output_filename = (chapter_output_location + chp[c['Book']] +
                           'CHP-' + chp[c['Chapter']] +
                           '_VRS-' + chp[c['Verse']] + '.txt')
        title_end = int(chp[c['Title End']])
        verse_length = int(chp[c['Verse Length']])
        verse_text = book_text[title_end + 1:title_end + verse_length]
        # Strip the leading verse number when present. The original code
        # called m.end(0) unconditionally and raised AttributeError on a
        # verse without a number prefix.
        m = re.match(verse_regex, verse_text)
        if m:
            verse_text = verse_text[m.end(0):-1]
        with open(output_filename, 'w') as o_file:
            o_file.write(verse_text)
    return chapter_list
def find_chapters(book):
    """Scan a Project Gutenberg book for "<book name><chapter>:<verse>" titles
    and write a tab-separated index of them next to the input file.

    Each output row is: Book, Chapter, Verse, Title Start, Title End,
    Verse Length — where Verse Length is the distance from the end of one
    title to the start of the next (or to the Gutenberg end marker for the
    final verse).
    """
    text = get_file_text(book)
    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chp.txt'
    chapters_regex = '(.+?)(\d+):(\d+)'
    fields = ['Book', 'Chapter', 'Verse', 'Title Start', 'Title End', 'Verse Length']
    with open(output_filename, 'w') as o_file:
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in re.finditer(chapters_regex, text):
            if previous_title != -1:
                # Finish the previous row with its verse length.
                o_file.write('%d\n' % (m.start() - previous_title))
            o_file.write('%s\t%s\t%s\t%d\t%d\t'
                         % (m.group(1), m.group(2), m.group(3), m.start(), m.end()))
            previous_title = m.end()
        # The last verse runs to the Gutenberg end-of-book marker. Fall back
        # to the end of the text if the marker is missing (the original code
        # raised AttributeError on a failed search).
        end_regex = 'End of the Project Gutenberg EBook'
        end_match = re.search(end_regex, text)
        end_chapter = end_match.start() if end_match else len(text)
        o_file.write('%d\n' % (end_chapter - previous_title))
def count_words_unigram_pos(input_filename, output_path=''):
    """Count word frequencies in a text file, tag each word's part of speech,
    and write a "word\\tcount\\tpos" table sorted by ascending count.

    Parameters:
        input_filename: path of the text file to analyse.
        output_path: optional directory for the output file (passed through
            to make_output_file()).
    """
    txt = get_file_text(input_filename)
    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    for m in re.findall(word_regex, txt, re.M + re.S + re.U):
        # Integer counts (the original used float accumulators but printed
        # them with %d, and kept a total_words counter it never used).
        word_frequency[m] = word_frequency.get(m, 0) + 1
    # dict.items() instead of the Python-2-only iteritems(); ascending count.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))
    word_analysis = []
    for word, count in sorted_words:
        pos = pos_tag([word])  # tag the word in isolation
        word_analysis.append([word, count, pos[0][1]])
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))
    o_file.close()
def find_koran_chapters(book):
    """Scan the Koran text for chapter headings and write a tab-separated
    index of them next to the input file.

    Headings look like:
        CHAPTER III.

        ENTITLED, THE FAMILY OF IMRAN; REVEALED AT MEDINA

    Each output row is: Chapter, Title, Title Start, Title End, Verse Length —
    where Verse Length is the distance from the end of one heading to the
    start of the next (or to the end of the text for the final chapter).
    """
    text = get_file_text(book)
    directory, file_name, extension = file_name_ext(book)
    output_filename = directory + '/' + file_name + '-chapters_debug.txt'
    regex = '(CHAPTER.+?)\.\n\nENTITLED, (.+?);'
    fields = ['Chapter', 'Title', 'Title Start', 'Title End', 'Verse Length']
    with open(output_filename, 'w') as o_file:
        o_file.write('\t'.join(fields) + '\n')
        previous_title = -1
        for m in re.finditer(regex, text, re.M + re.S + re.U):
            if previous_title != -1:
                # Finish the previous row with its verse length.
                o_file.write('%d\n' % (m.start() - previous_title))
            # Strip commas from the title so it stays a clean TSV field.
            title = re.sub(',', '', m.group(2))
            o_file.write('%s\t%s\t%d\t%d\t'
                         % (m.group(1), title, m.start(), m.end()))
            previous_title = m.end()
        # No Gutenberg end marker in this edition: the final chapter runs to
        # the end of the text.
        end_chapter = len(text)
        o_file.write('%d\n' % (end_chapter - previous_title))
def count_words_v0(input_filename, output_path=''):
    """Naive word count: split the text on single spaces, tally each token,
    and write a "word\\tcount" table sorted by ascending count.

    NOTE: splitting on ' ' keeps newlines and punctuation attached to the
    tokens; count_words_unigram_pos() uses a regex tokenizer instead.

    Parameters:
        input_filename: path of the text file to analyse.
        output_path: optional directory for the output file.
    """
    txt = get_file_text(input_filename)
    words = txt.split(' ')
    word_frequency = {}
    for w in words:
        word_frequency[w] = word_frequency.get(w, 0) + 1
    # dict.items() instead of the Python-2-only iteritems(), so this also
    # runs on Python 3; sort ascending by count.
    sorted_words = sorted(word_frequency.items(), key=operator.itemgetter(1))
    o_file = make_output_file(input_filename, output_path=output_path,
                              prefix='', suffix='-words')
    for w in sorted_words:
        o_file.write('%s\t%d\n' % (w[0], w[1]))
    o_file.close()
def chords_add_header(chapter_index_filename, verse_location, n_gram_path):
    """Prepend a comma-separated word header row to every *_chord.* file.

    For each verse file in verse_location, reads the matching *_dic_index.*
    file (rows of "word\\tindex") to recover the word ordering, then rewrites
    the matching *_chord.* matrix file with that word list as its first row.

    NOTE(review): chapter_index_filename is currently unused (the original
    computed file_name_ext() on it and discarded the result); kept for
    interface compatibility with the other chapter utilities.
    """
    for f in os.listdir(verse_location):
        directory, file_name, extension = file_name_ext(f)
        chord_filename = n_gram_path + file_name + '_chord.' + extension
        dic_filename = n_gram_path + file_name + '_dic_index.' + extension
        # First column of each dictionary row is the word itself.
        with open(dic_filename, 'r') as d_file:
            word_header = [split_line(txt)[0] for txt in d_file.readlines()]
        c_txt = get_file_text(chord_filename)
        with open(chord_filename, 'w') as o_file:
            o_file.write(','.join(word_header) + '\n')
            o_file.write(c_txt)
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    """Build per-verse bigram statistics for every verse file of each chapter.

    For each verse file whose name starts with a chapter key, writes three
    outputs into n_gram_path:
        *_n-gram.<ext>    nested dict {word: {next_word: count}} as JSON
        *_dic_index.<ext> "word\\tindex" rows assigning each word a matrix index
        *_chord.<ext>     the bigram counts as a dense CSV matrix
    """
    directory, file_name, extension = file_name_ext(chapter_index_filename)
    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t', key_col=0)
    files = os.listdir(verse_location)
    for k, v in chapters.items():
        for f in fnmatch.filter(files, k + '*'):
            verse_text = get_file_text(verse_location + f)
            # TODO: factor punctuation stripping into a shared helper.
            verse_text = verse_text.replace('.', ' ')
            verse_text = verse_text.replace(',', ' ')
            verse_text = verse_text.replace('\n', ' ')
            verse_words = verse_text.split()
            if not verse_words:
                # Empty verse file: the original crashed on verse_words[0].
                continue
            n_gram = {}
            dic = {}
            prev_word = verse_words[0]
            dic[prev_word] = dic.get(prev_word, 0) + 1
            for word in verse_words[1:]:
                child = n_gram.setdefault(prev_word, {})
                # BUG FIX: the original did child[word] = child.get(word, 1),
                # which never increments — every bigram count was frozen at 1.
                # Count it the same way the unigram dic is counted.
                child[word] = child.get(word, 0) + 1
                dic[word] = dic.get(word, 0) + 1
                prev_word = word
            directory, file_name, extension = file_name_ext(f)
            output_filename = n_gram_path + file_name + '_n-gram.' + extension
            with open(output_filename, 'w') as o_file:
                json.dump(n_gram, o_file, sort_keys=True, indent=4)
            # Assign each distinct word a row/column index and persist the
            # mapping (the original also built a verse_dic copy of this
            # mapping that was never used).
            index_lookup = {}
            output_filename = n_gram_path + file_name + '_dic_index.' + extension
            with open(output_filename, 'w') as o_file:
                for i, word in enumerate(dic):
                    index_lookup[word] = i
                    o_file.write('%s\t%d\n' % (word, i))
            dic_size = len(index_lookup)
            # np.int was removed from modern numpy; the builtin int is the
            # documented replacement.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for word, followers in n_gram.items():
                row_index = index_lookup[word]
                for follower, count in followers.items():
                    verse_matrix[row_index, index_lookup[follower]] = count
            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')