Example #1
def build_adex_db(cur_file):
    file_in = open(cur_file, 'rb')
    reader = csv.DictReader(file_in)
    rows = list(reader)
    file_in.close()

    #AWC, start time, end time
    col_datatypes = (float, str, str)
    db = CSVDatabase(['AWC', 'Start Time', 'End Time'], col_datatypes)
    db.cursor.execute('create index time_index on data (col1, col2);')
    
    accum_time = float(rows[0]['Elapsed_Time'])
    for cur_row in rows:
        seg_dur = float(cur_row['Segment_Duration'])
        start_time = '%0.2f' % (accum_time)
        end_time = '%0.2f' % (accum_time + seg_dur)
        awc = float(cur_row['AWC'])

        db.csv_insert([awc, start_time, end_time])

        accum_time += seg_dur

    return db
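
A quick usage sketch for the function above. The data table, its id column, and the generated col0/col1/col2 names are taken from the index statement in the example and the queries in Example #3; the query itself is hypothetical:

db = build_adex_db('adex_export.csv')

#col0 = AWC, col1 = start time, col2 = end time
db.cursor.execute('select col0, col1, col2 from data order by id limit 5;')
for awc, start_time, end_time in db.cursor.fetchall():
    print awc, start_time, end_time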
Example #2
def create_db():
    col_datatypes = (
        str,
        int,
        str,
        str,
        str,
        str,
        float,
        float,
        float,
        float,
        float,
        float,
        float,
        float,
        str,
        float,
        float,
        float,
        float,
        str,
    )

    #remove old db if it exists
    if os.path.exists(db_path):
        os.remove(db_path)

    db = CSVDatabase.create_with_files([data_path], col_datatypes)
    db.execute_script(sql_path)
    db.dump_to_file(db_path)
    db.close()

    return Database(db_path)
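
dump_to_file is not shown in this listing. With the standard sqlite3 module, serializing an in-memory database to disk looks like the sketch below, which is presumably the pattern it wraps (an assumption, not the project's actual implementation):

import sqlite3

def dump_db_to_file(mem_con, db_path):
    #replay the SQL dump of the in-memory connection into a new
    #on-disk database file
    disk_con = sqlite3.connect(db_path)
    disk_con.executescript('\n'.join(mem_con.iterdump()))
    disk_con.commit()
    disk_con.close()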
Example #3
def build_stats_db(cur_file):
    file_in = open(cur_file, 'rb')
    reader = csv.reader(file_in)
    rows = list(reader)
    file_in.close()
    #the header row sits one line lower in Task 3 files; the last 4
    #rows are trailing non-data lines, so skip them
    header_index = 18 + int(task == 3)
    header_row = rows[header_index]
    rows = rows[header_index + 1:-4]

    col_datatypes = (str, str, str, int,)
    db = CSVDatabase(header_row, col_datatypes)
    
    for cur_row in rows:
        start_time, end_time, phrase, count = cur_row
        start_time = '%0.2f' % (BackendUtils.time_str_to_float(start_time))
        end_time = '%0.2f' % (BackendUtils.time_str_to_float(end_time))
        db.csv_insert([start_time, end_time, phrase, count])
        
    db.cursor.execute('create index time_index on data (col0, col1);')

    #groups of rows sharing the same start/end times (col0, col1) are
    #multi-speaker situations
    db.cursor.execute(
        'select * from '
        '(select min(id), max(id), count(col0) as dup_count, sum(col3) '
        'from data group by col0, col1) '
        'where dup_count > 1;'
    )
    
    dup_rows = db.cursor.fetchall()

    if dup_rows:
        print 'Found %d multi-speaker situations.' % (len(dup_rows))

    # print 'min(id), max(id), dup_count, total'
    # for cur_row in dup_rows:
    #     print cur_row

    for cur_dup in dup_rows:
        if task == 2:
            combine_multi_speaker_occur(db, cur_dup)
        elif task == 3:
            delete_multi_speaker_occur(db, cur_dup)

    #db.dump_to_file('C:/Users/Wayne/Documents/baby-lab/bll/reliability/confusion/results/Task%d/test.db' % (task))
    return db
Example #4
    def _build_db(self, col_datatypes, col_headers):
        csv_file = open(self.csv_filename, 'rb')
        delim = self._get_csv_delim(csv_file)
        lines = csv_file.readlines()
        reader = csv.reader(lines, delimiter=delim)
        header_row = reader.next() #skip the header row

        db = CSVDatabase(col_headers, col_datatypes)

        progress_dialog = ProgressDialog(title='Loading file', phases=['Loading file...'])
        progress_dialog.show()
        
        num_rows = len(lines) - 1 #subtract one for header

        done = False
        i = 0
        while i < num_rows and not done:
            row = reader.next()

            #if this file already contains counts at the end (has already been exported), skip those rows
            done = row and (row[0].startswith('File Stats') or row[0].startswith('Count of'))
            if not done:
                #if we have a marked column, use it - otherwise, append one
                if re.match(MainWindow.DATA_TYPE_REGEXS[bool], row[-1]):
                    row = row[:-1] + [int(row[-1] == 'True')]
                else:
                    row = row + [0]

                db.csv_insert([i + 1] + row)

            #update the progress bar every 10 rows
            if i % 10 == 0:
                progress_dialog.set_fraction(float(i + 1) / float(num_rows))

            i += 1

        progress_dialog.ensure_finish()
        csv_file.close()
        
        return db
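
_get_csv_delim is referenced above but not included in the listing; a hypothetical implementation using the standard csv.Sniffer might look like this:

import csv

def _get_csv_delim(self, csv_file):
    #sniff the delimiter from the first chunk of the file, then rewind
    #so the caller can re-read from the beginning
    sample = csv_file.read(1024)
    csv_file.seek(0)
    return csv.Sniffer().sniff(sample).delimiter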
Example #5
def run():
    for cur_dir in dirs:
        print 'Entering directory "%s"' % (cur_dir)
        
        file_list = glob.glob('%s%s*.csv' % (in_path, cur_dir))
        file_list.sort()
        
        out_file = open('%s%s.csv' % (out_path, cur_dir[:-1]), 'wb')
        writer = csv.writer(out_file)
        write_headers(writer)

        for filename in file_list:
            print 'Processing File "%s"' % (os.path.basename(filename))
            db = CSVDatabase.create_with_files((filename,), col_datatypes)
            out_row = []

            # --- Inter-Participant Variables ---
            rows = db.csv_select_by_name(
                col_names=(
                    'Child_ChildID',
                    'File_Name',
                    'Child_Age',
                    'AVA_DA',
                    'Child_Gender',
                    'EMLU'
                ),
            )

            out_row.extend(rows[0])

            #Adult Word Count (AWC)
            out_row.extend(calc_section(db, 'AWC'))

            #Child Vocalizations (CVC)
            out_row.extend(calc_section(db, 'Child_Voc_Count'))

            #Conversational Turns (CTC)
            out_row.extend(calc_section(db, 'Turn_Count'))

            #write to file
            writer.writerow(out_row)
            
        out_file.close()
        print ''

    print 'Done.'
Example #6
def create_db():
    col_datatypes = (
        int,
        str,  #filename
        str,  #csv cols from here down
        str,
        str,
        str,
        float,
        float,
        float,
        float,
        str,
        bool,
    )

    db = CSVDatabase(col_datatypes)

    return db, col_datatypes
Example #7
def run():
    file_list = glob.glob(in_path + '*.csv')
    out_file = open(out_filename, 'wb')
    writer = csv.writer(out_file)
    wrote_headers = False

    for cur_file in file_list:
        file_in = open(cur_file, 'rb')
        reader = csv.reader(file_in)
        rows = list(reader)
        file_in.close()

        if not wrote_headers:
            for cur_row in rows[:3]:
                writer.writerow(cur_row[0:1] +
                                cur_row[2:])  #cut out the 'Recording' column
            wrote_headers = True

        header = rows[2]
        fix_header_names(header)

        db = CSVDatabase(header, col_datatypes)

        for cur_row in rows[3:]:
            db.csv_insert(cur_row)

        db.set_blanks_to_null()

        #db.dump_to_file('%s.db' % (os.path.basename(cur_file)[:-4]))

        avg_col_indices = filter(lambda i: col_datatypes[i] == float,
                                 range(len(col_datatypes)))

        avg_rows = db.csv_select_by_index(fcn_indices=avg_col_indices,
                                          fcns=['avg'] * len(avg_col_indices),
                                          group_by_index=0)

        for cur_row in avg_rows:
            writer.writerow(cur_row[0:1] +
                            cur_row[2:])  #cut out the 'Recording' column

    out_file.close()
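
csv_select_by_index with fcn_indices/fcns presumably expands to one aggregate per selected column; for the avg call above, the generated SQL would reduce to something along these lines (hypothetical - col5 and col6 stand in for the float columns):

db.cursor.execute(
    'select col0, avg(col5), avg(col6) from data group by col0;'
)
avg_rows = db.cursor.fetchall()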
Example #8
def run():
    out_file = open(path + 'timetable.csv', 'wb')
    writer = csv.writer(out_file)
    writer.writerow([
        'Recording', 'Start Date', 'End Date', 'Start Time', 'End Time',
        'Total Duration'
    ])

    for cur_dir in dirs:
        filenames = glob.glob('%s%s*_done.csv' % (path, cur_dir))

        for cur_file in filenames:
            #print os.path.basename(cur_file)

            file_in = open(cur_file, 'rb')
            reader = csv.reader(file_in)
            rows = list(reader)
            file_in.close()

            db = CSVDatabase(rows[0], col_datatypes)
            for cur_row in rows[1:]:
                #print cur_row
                db.csv_insert(cur_row)

            min_row = db.csv_select_by_name(col_names=[date_col],
                                            fcn_col_names=[date_col, date_col],
                                            fcns=['datetime', 'min'])

            max_row = db.csv_select_by_name(col_names=[date_col],
                                            fcn_col_names=[date_col, date_col],
                                            fcns=['datetime', 'max'])

            start = datetime.datetime.strptime(min_row[0][0], timestamp_fmt)
            end = datetime.datetime.strptime(max_row[0][0], timestamp_fmt)
            dur = end - start

            writer.writerow([
                os.path.basename(cur_file)[:-4],
                start.strftime(date_fmt),
                end.strftime(date_fmt),
                start.strftime(time_fmt),
                end.strftime(time_fmt),
                str(dur)
            ])
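
The paired fcn_col_names/fcns arguments appear to compose SQLite functions over the selected column; the min_row call above presumably reduces to SQL like the following (an assumption about CSVDatabase's query builder, with col0 standing in for the column that holds date_col):

db.cursor.execute('select min(datetime(col0)) from data;')
min_row = db.cursor.fetchall()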
Example #9
def run():
    purge_dir(out_path)
    
    file_list = glob.glob('%s*.csv' % (in_path))
    trimmed_files = {}
    omitted_files = {}
    
    for i in range(len(file_list)):
        sys.stderr.write('\r' * 14)
        sys.stderr.write('File %d of %d' % (i + 1, len(file_list)))

        in_file = open(file_list[i], 'rb')
        reader = csv.reader(in_file)
        csv_rows = list(reader)
        in_file.close()

        clock_index = csv_rows[0].index('Clock_Time_TZAdj')
        el_time_index = csv_rows[0].index('Elapsed_Time')
        seg_dur_index = csv_rows[0].index('Segment_Duration')

        csv_rows[0].append('db_datetime')
        csv_rows[0].append('date_start_time')
        csv_rows[0].append('date_end_time')
        csv_rows[0].append('abs_start_time')
        csv_rows[0].append('abs_end_time')
        date_accum_time = None
        abs_accum_time = float(csv_rows[1][el_time_index])
        prev_date = None
        
        for j in range(1, len(csv_rows)):
            csv_rows[j].append(fix_time(csv_rows[j][clock_index]))

            date = csv_rows[j][clock_index].split()[0]
            if date != prev_date:
                date_accum_time = float(csv_rows[j][el_time_index])

            seg_dur = float(csv_rows[j][seg_dur_index])
            csv_rows[j].append(date_accum_time) #date_start
            csv_rows[j].append(date_accum_time + seg_dur) #date_end
            csv_rows[j].append(abs_accum_time) #abs_start
            csv_rows[j].append(abs_accum_time + seg_dur) #abs_end

            abs_accum_time += seg_dur
            date_accum_time += seg_dur
            prev_date = date

        #create database
        db = CSVDatabase.create_with_rows(csv_rows, input_datatypes)

        #append separate date and time cols for sanity
        db.add_column('tz_date', str)
        db.add_column('tz_time', str)

        #set their values using the Clock_Time_TZAdj column
        #drop down to database layer for this
        db_col_names = db.get_db_col_names()
        clock_col = db_col_names['db_datetime']
        date_col = db_col_names['tz_date']
        time_col = db_col_names['tz_time']
        stmt = 'UPDATE %s SET %s = date(%s), %s = time(%s)' % (CSVDatabase.TABLE_NAME, date_col, clock_col, time_col, clock_col)
        db.execute_stmt(stmt)

        #db.dump_to_file('%d.db' % (i + 1))
        
        #get a list of all of the dates we have in the file and their end time (durations)
        date_durs = select_date_durs(db)

        for row in date_durs:
            filename, date, dur = row
            
            #delete dates with < 5 hours of recorded time
            if float(dur) < MIN_SECS:
                if filename not in omitted_files:
                    omitted_files[filename] = {}
                omitted_files[filename][date] = dur
                
                db.csv_delete_by_name(
                    where_body='%s=?',
                    where_cols=['tz_date'],
                    params=[date]
                )
                
            else: #we have >= 5 hours of recorded time
                dur_at_6 = select_dur_at_6(db, db_col_names, date)
                
                if dur_at_6 >= MIN_SECS: #recording ends at or after 6pm
                    #delete the rows after 6pm
                    db.csv_delete_by_name(
                        where_body='%s = ? AND %s > ?',
                        where_cols=['tz_date', 'tz_time'],
                        params=[date, '18:00']
                    )
                elif dur_at_6 is not None: #recording ends before 6pm
                    #delete rows after 5 hours
                    db.csv_delete_by_name(
                        where_body='%s = ? AND %s > ?',
                        where_cols=['tz_date', 'date_start_time'],
                        params=[date, MIN_SECS]
                    )

                #else dur_at_6 is None: take the whole recording
                    
                #date, orig dur, trimmed_dur
                if filename not in trimmed_files:
                    trimmed_files[filename] = {}
                trimmed_files[filename][date] = [dur]

        #get a list of all of the dates we have in the file and their end time (durations) after trimming
        date_durs = select_date_durs(db)
        for (filename, date, dur) in date_durs:
            trimmed_files[filename][date].append(dur)
            min_time, max_time = select_info(db, db_col_names, date)
            trimmed_files[filename][date].append(min_time)
            trimmed_files[filename][date].append(max_time)

        db.write_to_file(
            out_path + os.path.basename(file_list[i]),
            #omit the intermediate helper columns appended above
            omit_col_indices=[22, 23, 24, 27, 28]
        )

    print ''
    print_trimmed_files(trimmed_files)
    print ''
    print_omitted_files(omitted_files)
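
select_date_durs is not part of the listing; a hypothetical version consistent with how the loops above unpack its rows (filename, date, per-date duration) could be:

def select_date_durs(db):
    #hypothetical: per-date recorded duration, taken as the largest
    #date_end_time reached on each tz_date. 'File_Name' is assumed to
    #be one of the input CSV columns (as in Example #5).
    cols = db.get_db_col_names()
    stmt = 'SELECT %s, %s, max(%s) FROM %s GROUP BY %s' % (
        cols['File_Name'], cols['tz_date'], cols['date_end_time'],
        CSVDatabase.TABLE_NAME, cols['tz_date']
    )
    db.execute_stmt(stmt)
    return db.cursor.fetchall()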