def build_adex_db(cur_file):
    file_in = open(cur_file, 'rb')
    reader = csv.DictReader(file_in)
    rows = list(reader)
    file_in.close()

    #AWC, start time, end time
    col_datatypes = (float, str, str)
    db = CSVDatabase(['AWC', 'Start Time', 'End Time'], col_datatypes)
    db.cursor.execute('create index time_index on data (col1, col2);')

    #start/end times are running totals of Segment_Duration, seeded from the
    #first row's Elapsed_Time
    accum_time = float(rows[0]['Elapsed_Time'])
    for cur_row in rows:
        seg_dur = float(cur_row['Segment_Duration'])
        start_time = '%0.2f' % (accum_time)
        end_time = '%0.2f' % (accum_time + seg_dur)
        awc = float(cur_row['AWC'])
        db.csv_insert([awc, start_time, end_time])
        accum_time += seg_dur

    return db

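#A minimal standalone sketch (hypothetical values, not from any real ADEX file)
#of the accumulation scheme above: each segment's start time is the previous
#segment's end time, seeded from the first row's Elapsed_Time.
def _demo_accum_times():
    rows = [
        {'Elapsed_Time': '10.00', 'Segment_Duration': '5.00', 'AWC': '12'},
        {'Elapsed_Time': '15.00', 'Segment_Duration': '2.50', 'AWC': '3'},
    ]
    accum_time = float(rows[0]['Elapsed_Time'])
    for cur_row in rows:
        seg_dur = float(cur_row['Segment_Duration'])
        print '%0.2f -> %0.2f (AWC=%s)' % (accum_time, accum_time + seg_dur, cur_row['AWC'])
        accum_time += seg_dur
    #prints:
    #10.00 -> 15.00 (AWC=12)
    #15.00 -> 17.50 (AWC=3)
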
def create_db():
    col_datatypes = (
        str,
        int,
        str, str, str, str,
        float, float, float, float,
        float, float, float, float,
        str,
        float, float, float, float,
        str,
    )

    #remove old db if it exists
    if os.path.exists(db_path):
        os.remove(db_path)

    db = CSVDatabase.create_with_files([data_path], col_datatypes)
    db.execute_script(sql_path)
    db.dump_to_file(db_path)
    db.close()

    return Database(db_path)

def build_stats_db(cur_file):
    file_in = open(cur_file, 'rb')
    reader = csv.reader(file_in)
    rows = list(reader)
    file_in.close()

    #the header row sits one line lower in the task 3 export format;
    #the last 4 rows are trailer rows and are dropped
    header_index = 18 + int(task == 3)
    header_row = rows[header_index]
    rows = rows[header_index + 1:-4]

    col_datatypes = (str, str, str, int,)
    db = CSVDatabase(header_row, col_datatypes)

    for cur_row in rows:
        start_time, end_time, phrase, count = cur_row
        start_time = '%0.2f' % (BackendUtils.time_str_to_float(start_time))
        end_time = '%0.2f' % (BackendUtils.time_str_to_float(end_time))
        db.csv_insert([start_time, end_time, phrase, count])

    db.cursor.execute('create index time_index on data (col0, col1);')

    #rows sharing the same (start, end) interval indicate multi-speaker situations
    db.cursor.execute(
        'select * from (select min(id), max(id), count(col0) as dup_count, sum(col3) from data group by col0, col1) where dup_count > 1;'
    )
    dup_rows = db.cursor.fetchall()

    if dup_rows:
        print 'Found %d multi-speaker situations.' % (len(dup_rows))
        #print 'min(id), max(id), dup_count, total'
        #for cur_row in dup_rows:
        #    print cur_row

    for cur_dup in dup_rows:
        if task == 2:
            combine_multi_speaker_occur(db, cur_dup)
        elif task == 3:
            delete_multi_speaker_occur(db, cur_dup)

    #db.dump_to_file('C:/Users/Wayne/Documents/baby-lab/bll/reliability/confusion/results/Task%d/test.db' % (task))

    return db

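#The duplicate-detection subquery above can equivalently be written with HAVING.
#A self-contained sqlite3 sketch of the same pattern, using a made-up schema
#rather than the real CSVDatabase column names:
def _demo_find_duplicate_intervals():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    conn.execute('create table data (id integer primary key, start text, end_ text, count_ integer);')
    conn.executemany(
        'insert into data (start, end_, count_) values (?, ?, ?);',
        [('0.00', '5.00', 2), ('0.00', '5.00', 1), ('5.00', '10.00', 3)]
    )
    #rows sharing the same (start, end_) interval are "multi-speaker situations"
    cur = conn.execute(
        'select min(id), max(id), count(*), sum(count_) from data '
        'group by start, end_ having count(*) > 1;'
    )
    return cur.fetchall() #[(1, 2, 2, 3)]
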
def _build_db(self, col_datatypes, col_headers):
    csv_file = open(self.csv_filename, 'rb')
    delim = self._get_csv_delim(csv_file)
    lines = csv_file.readlines()
    reader = csv.reader(lines, delimiter=delim)
    header_row = reader.next() #skip headers row

    db = CSVDatabase(col_headers, col_datatypes)

    progress_dialog = ProgressDialog(title='Loading file', phases=['Loading file...'])
    progress_dialog.show()

    num_rows = len(lines) - 1 #subtract one for header
    done = False
    i = 0
    while i < num_rows and not done:
        row = reader.next()

        #if this file already contains counts at the end (has already been exported), skip those rows
        done = row and (row[0].startswith('File Stats') or row[0].startswith('Count of'))
        if not done:
            #if we have a marked column, use it - otherwise, append one
            if re.match(MainWindow.DATA_TYPE_REGEXS[bool], row[-1]):
                row = row[:-1] + [int(row[-1] == 'True')]
            else:
                row = row + [0]
            db.csv_insert([i + 1] + row)

        #update the progress bar every 10th row
        if i % 10 == 0:
            progress_dialog.set_fraction(float(i + 1) / float(num_rows))
        i += 1

    progress_dialog.ensure_finish()
    csv_file.close()

    return db

def run():
    for cur_dir in dirs:
        print 'Entering directory "%s"' % (cur_dir)
        file_list = glob.glob('%s%s*.csv' % (in_path, cur_dir))
        file_list.sort()

        out_file = open('%s%s.csv' % (out_path, cur_dir[:-1]), 'wb')
        writer = csv.writer(out_file)
        write_headers(writer)

        for filename in file_list:
            print 'Processing File "%s"' % (os.path.basename(filename))
            db = CSVDatabase.create_with_files((filename,), col_datatypes)
            out_row = []

            # --- Inter-Participant Variables ---
            rows = db.csv_select_by_name(
                col_names=(
                    'Child_ChildID',
                    'File_Name',
                    'Child_Age',
                    'AVA_DA',
                    'Child_Gender',
                    'EMLU',
                ),
            )
            out_row.extend(rows[0])

            #Adult Word Count (AWC)
            out_row.extend(calc_section(db, 'AWC'))

            #Child Vocalizations (CVC)
            out_row.extend(calc_section(db, 'Child_Voc_Count'))

            #Conversational Turns (CTC)
            out_row.extend(calc_section(db, 'Turn_Count'))

            #write to file
            writer.writerow(out_row)

        out_file.close()

    print ''
    print 'Done.'

def create_db():
    col_datatypes = (
        int,
        str, #filename
        str, #csv cols from here down
        str,
        str,
        str,
        float,
        float,
        float,
        float,
        str,
        bool,
    )

    db = CSVDatabase(col_datatypes)

    return db, col_datatypes

def run():
    file_list = glob.glob(in_path + '*.csv')

    out_file = open(out_filename, 'wb')
    writer = csv.writer(out_file)

    wrote_headers = False
    for cur_file in file_list:
        file_in = open(cur_file, 'rb')
        reader = csv.reader(file_in)
        rows = list(reader)
        file_in.close()

        if not wrote_headers:
            for cur_row in rows[:3]:
                writer.writerow(cur_row[0:1] + cur_row[2:]) #cut out the 'Recording' column
            wrote_headers = True

        header = rows[2]
        fix_header_names(header)

        db = CSVDatabase(header, col_datatypes)
        for cur_row in rows[3:]:
            db.csv_insert(cur_row)
        db.set_blanks_to_null()
        #db.dump_to_file('%s.db' % (os.path.basename(cur_file)[:-4]))

        #average every float-typed column, grouping by the first column
        avg_col_indices = filter(lambda i: col_datatypes[i] == float, range(len(col_datatypes)))
        avg_rows = db.csv_select_by_index(
            fcn_indices=avg_col_indices,
            fcns=['avg'] * len(avg_col_indices),
            group_by_index=0
        )

        for cur_row in avg_rows:
            writer.writerow(cur_row[0:1] + cur_row[2:]) #cut out the 'Recording' column

    out_file.close()

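#csv_select_by_index is presumably issuing a grouped-average query along these
#lines - a self-contained sqlite3 sketch with a made-up two-column schema (one
#averaged output row per value of the grouping column):
def _demo_grouped_avg():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    conn.execute('create table data (recording text, val real);')
    conn.executemany(
        'insert into data values (?, ?);',
        [('rec1', 1.0), ('rec1', 3.0), ('rec2', 5.0)]
    )
    cur = conn.execute('select recording, avg(val) from data group by recording;')
    return cur.fetchall() #[(u'rec1', 2.0), (u'rec2', 5.0)]
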
def run():
    out_file = open(path + 'timetable.csv', 'wb')
    writer = csv.writer(out_file)
    writer.writerow([
        'Recording',
        'Start Date',
        'End Date',
        'Start Time',
        'End Time',
        'Total Duration',
    ])

    for cur_dir in dirs:
        filenames = glob.glob('%s%s*_done.csv' % (path, cur_dir))
        for cur_file in filenames:
            #print os.path.basename(cur_file)
            file_in = open(cur_file, 'rb')
            reader = csv.reader(file_in)
            rows = list(reader)
            file_in.close()

            db = CSVDatabase(rows[0], col_datatypes)
            for cur_row in rows[1:]:
                #print cur_row
                db.csv_insert(cur_row)

            #earliest and latest timestamps in the recording
            min_row = db.csv_select_by_name(
                col_names=[date_col],
                fcn_col_names=[date_col, date_col],
                fcns=['datetime', 'min']
            )
            max_row = db.csv_select_by_name(
                col_names=[date_col],
                fcn_col_names=[date_col, date_col],
                fcns=['datetime', 'max']
            )

            start = datetime.datetime.strptime(min_row[0][0], timestamp_fmt)
            end = datetime.datetime.strptime(max_row[0][0], timestamp_fmt)
            dur = end - start

            writer.writerow([
                os.path.basename(cur_file)[:-4],
                str(start.strftime(date_fmt)),
                str(end.strftime(date_fmt)),
                str(start.strftime(time_fmt)),
                str(end.strftime(time_fmt)),
                str(dur),
            ])

    out_file.close()

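#A quick illustration of how the Total Duration column falls out of datetime
#subtraction - the format string here is hypothetical (the real timestamp_fmt
#is defined elsewhere in this script):
def _demo_duration():
    import datetime
    fmt = '%Y-%m-%d %H:%M:%S'
    start = datetime.datetime.strptime('2013-05-01 06:30:00', fmt)
    end = datetime.datetime.strptime('2013-05-01 18:45:30', fmt)
    print str(end - start) #prints '12:15:30'
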
def run():
    purge_dir(out_path)
    file_list = glob.glob('%s*.csv' % (in_path))

    trimmed_files = {}
    omitted_files = {}

    for i in range(len(file_list)):
        #overwrite the previous progress message
        sys.stderr.write('\r' * 14)
        sys.stderr.write('File %d of %d' % (i + 1, len(file_list)))

        in_file = open(file_list[i], 'rb')
        reader = csv.reader(in_file)
        csv_rows = list(reader)
        in_file.close()

        clock_index = csv_rows[0].index('Clock_Time_TZAdj')
        el_time_index = csv_rows[0].index('Elapsed_Time')
        seg_dur_index = csv_rows[0].index('Segment_Duration')

        csv_rows[0].append('db_datetime')
        csv_rows[0].append('date_start_time')
        csv_rows[0].append('date_end_time')
        csv_rows[0].append('abs_start_time')
        csv_rows[0].append('abs_end_time')

        #accumulate per-date and absolute (whole-file) start/end times
        date_accum_time = None
        abs_accum_time = float(csv_rows[1][el_time_index])
        prev_date = None
        for j in range(1, len(csv_rows)):
            csv_rows[j].append(fix_time(csv_rows[j][clock_index]))

            date = csv_rows[j][clock_index].split()[0]
            if date != prev_date:
                #new date - restart the per-date accumulator
                date_accum_time = float(csv_rows[j][el_time_index])

            seg_dur = float(csv_rows[j][seg_dur_index])
            csv_rows[j].append(date_accum_time) #date_start
            csv_rows[j].append(date_accum_time + seg_dur) #date_end
            csv_rows[j].append(abs_accum_time) #abs_start
            csv_rows[j].append(abs_accum_time + seg_dur) #abs_end

            abs_accum_time += seg_dur
            date_accum_time += seg_dur
            prev_date = date

        #create database
        db = CSVDatabase.create_with_rows(csv_rows, input_datatypes)

        #append separate date and time cols for sanity
        db.add_column('tz_date', str)
        db.add_column('tz_time', str)

        #set their values using the Clock_Time_TZAdj column
        #drop down to the database layer for this
        db_col_names = db.get_db_col_names()
        clock_col = db_col_names['db_datetime']
        date_col = db_col_names['tz_date']
        time_col = db_col_names['tz_time']
        stmt = 'UPDATE %s SET %s = date(%s), %s = time(%s)' % (CSVDatabase.TABLE_NAME, date_col, clock_col, time_col, clock_col)
        db.execute_stmt(stmt)
        #db.dump_to_file('%d.db' % (i + 1))

        #get a list of all of the dates we have in the file and their end times (durations)
        date_durs = select_date_durs(db)

        for row in date_durs:
            filename, date, dur = row

            #delete dates with < 5 hours of recorded time
            if float(dur) < MIN_SECS:
                if filename not in omitted_files:
                    omitted_files[filename] = {}
                omitted_files[filename][date] = dur

                db.csv_delete_by_name(
                    where_body='%s=?',
                    where_cols=['tz_date'],
                    params=[date]
                )

            else: #we have >= 5 hours of recorded time
                dur_at_6 = select_dur_at_6(db, db_col_names, date)

                if dur_at_6 >= MIN_SECS: #recording ends at or after 6pm
                    #delete the rows after 6pm
                    db.csv_delete_by_name(
                        where_body='%s = ? AND %s > ?',
                        where_cols=['tz_date', 'tz_time'],
                        params=[date, '18:00']
                    )
                elif dur_at_6 is not None: #dur_at_6 < MIN_SECS
                    #delete rows after 5 hours
                    db.csv_delete_by_name(
                        where_body='%s = ? AND %s > ?',
                        where_cols=['tz_date', 'date_start_time'],
                        params=[date, MIN_SECS]
                    )
                #else: dur_at_6 is None - take the whole recording
                #(the 'is not None' guard matters: in Python 2, None < MIN_SECS
                #evaluates True, which would wrongly trim these recordings)

                #date, orig dur, trimmed dur
                if filename not in trimmed_files:
                    trimmed_files[filename] = {}
                trimmed_files[filename][date] = [dur]

        #get a list of all of the dates we have in the file and their end times (durations) after trimming
        date_durs = select_date_durs(db)
        for (filename, date, dur) in date_durs:
            trimmed_files[filename][date].append(dur)
            min_time, max_time = select_info(db, db_col_names, date)
            trimmed_files[filename][date].append(min_time)
            trimmed_files[filename][date].append(max_time)

        db.write_to_file(
            out_path + os.path.basename(file_list[i]),
            omit_col_indices=[22, 23, 24, 27, 28]
        )

    print ''
    print_trimmed_files(trimmed_files)
    print ''
    print_omitted_files(omitted_files)

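#The UPDATE statement in run() leans on SQLite's built-in date() and time()
#functions - a standalone sketch of the same split, assuming fix_time()
#normalizes Clock_Time_TZAdj into ISO-8601 'YYYY-MM-DD HH:MM:SS' strings:
def _demo_date_time_split():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    cur = conn.execute("select date('2013-05-01 18:30:00'), time('2013-05-01 18:30:00');")
    return cur.fetchone() #(u'2013-05-01', u'18:30:00')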