def import_data_from_text(data_filename, field_type_filename, table):
    """Load delimited rows from `data_filename` into SQLite table `table`.

    Column types come from `field_type_filename` (a name -> type mapping,
    where type is one of INTEGER, FLOAT, TEXT).  Rows with a column-count
    mismatch or an unknown field type abort the whole import before commit.

    NOTE(review): `table` and the field names are still interpolated into
    the SQL text (identifiers cannot be bound as parameters) — they are
    assumed to come from trusted config, not user input.
    """
    field_types = make_dictionary_from_file(field_type_filename, key_col=0, val_col=1, has_header=False)
    data, field_names = file_to_list(data_filename)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Identifiers are fixed per call; build the statement once and bind
        # the cell values with `?` placeholders instead of concatenating SQL
        # (fixes quoting/injection problems of the string-built version).
        placeholders = ','.join('?' * len(field_names))
        sql = 'insert into %s (%s) values (%s)' % (table, ','.join(field_names), placeholders)
        for i, row in enumerate(data):
            if len(row) != len(field_names):
                print('missing columns row:%d' % (i))
                return
            values = []
            for j, name in enumerate(field_names):
                ftype = field_types[name]
                cell = row[j]
                if ftype == 'INTEGER' or ftype == 'FLOAT':
                    # Literal string 'None' means missing numeric -> store 0.
                    values.append(0 if cell == 'None' else cell)
                elif ftype == 'TEXT':
                    # Parameter binding makes quote-escaping unnecessary, so
                    # text is stored verbatim (the old code replaced '"' with
                    # a space only to survive string-built SQL).
                    values.append(cell)
                else:
                    print('no valid type row:%d' % (i))
                    return
            cursor.execute(sql, values)
        conn.commit()
        cursor.close()
    finally:
        # Always release the connection, even on an early return above.
        conn.close()
    return
def n_gram_chapters(chapter_index_filename, verse_location, n_gram_path):
    """Build per-chapter bigram statistics from verse text files.

    For each chapter key in `chapter_index_filename`, every matching file in
    `verse_location` is tokenized and three artifacts are written under
    `n_gram_path`:
      *_n-gram.<ext>    JSON: {word: {next_word: count}} bigram counts
      *_dic_index.<ext> tab-separated word -> index listing
      *_chord.<ext>     CSV adjacency matrix of the bigram counts
    """
    chapters = make_dictionary_from_file(chapter_index_filename, sep='\t', key_col=0)
    files = os.listdir(verse_location)
    for chapter_key in chapters:
        for f in fnmatch.filter(files, chapter_key + '*'):
            verse_text = get_file_text(verse_location + f)
            # TODO: Make this a function, punctuation replacement
            for punct in ('.', ',', '\n'):
                verse_text = verse_text.replace(punct, ' ')
            verse_words = verse_text.split()
            if not verse_words:
                # Guard: an empty/whitespace-only file would crash on [0].
                continue
            n_gram = {}
            dic = {}
            prev_word = verse_words[0]
            dic[prev_word] = dic.get(prev_word, 0) + 1
            for word in verse_words[1:]:
                children = n_gram.setdefault(prev_word, {})
                # Bug fix: count bigram occurrences.  The original stored
                # children.get(word, 1) with no increment, pinning every
                # count at 1 regardless of how often the pair occurred.
                children[word] = children.get(word, 0) + 1
                dic[word] = dic.get(word, 0) + 1
                prev_word = word
            directory, file_name, extension = file_name_ext(f)
            with open(n_gram_path + file_name + '_n-gram.' + extension, 'w') as o_file:
                json.dump(n_gram, o_file, sort_keys=True, indent=4)
            # Assign each distinct word a stable integer index.
            verse_index = []
            index_lookup = {}
            for word in dic:
                index_lookup[word] = len(verse_index)
                verse_index.append(word)
            dic_size = len(verse_index)
            with open(n_gram_path + file_name + '_dic_index.' + extension, 'w') as o_file:
                for idx, word in enumerate(verse_index):
                    o_file.write('%s\t%d\n' % (word, idx))
            # np.int was removed in NumPy >= 1.24; plain int is equivalent.
            verse_matrix = np.zeros((dic_size, dic_size), dtype=int)
            for word, children in n_gram.items():
                row_index = index_lookup[word]
                for child, count in children.items():
                    verse_matrix[row_index, index_lookup[child]] = count
            # Fixed a broken physical-line split in the original source that
            # separated the '+ extension' continuation from this assignment.
            output_filename = n_gram_path + file_name + '_chord.' + extension
            np.savetxt(output_filename, verse_matrix, delimiter=',', fmt='%d')