import numpy as np

# audio_utils, get_text_from_file, extract_times_from_row, and distribute_time
# are helpers defined elsewhere in this repo.


def no_laughter_present(t_files, start, end):
    """Return True if no laughter annotation overlaps [start, end] in any transcript file."""
    for t_file in t_files:
        all_rows = get_text_from_file(t_file)
        for row in all_rows:
            region_start, region_end = extract_times_from_row(row)
            if audio_utils.times_overlap(float(region_start), float(region_end),
                                         float(start), float(end)):
                if 'laughter' in row.split()[-1]:
                    return False
    return True
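# The transcript rows parsed above are assumed to be Switchboard-style lines
# whose last token names the annotation; a hypothetical example line:
#
#   sw2005A-ms98-a-0002 11.360625 11.700625 [laughter]
#
# extract_times_from_row would pull out the two time fields, and the final
# token is then checked for the substring 'laughter'.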
def combine_overlapping_regions(regions_A, regions_B):
    """Merge two lists of (start, end) regions, combining any that overlap."""
    all_regions = regions_A + regions_B
    overlap_found = True
    while overlap_found:
        overlap_found = False
        i = 0
        while i < len(all_regions) and not overlap_found:
            j = i + 1  # reset for every i so all pairs get compared
            while j < len(all_regions) and not overlap_found:
                start1, end1 = all_regions[i]
                start2, end2 = all_regions[j]
                if audio_utils.times_overlap(start1, end1, start2, end2):
                    # Replace the pair with one merged region, then rescan
                    # from the top, since the list indices have shifted.
                    overlap_found = True
                    all_regions.pop(j)
                    all_regions.pop(i)
                    all_regions.append((min(start1, start2), max(end1, end2)))
                j += 1
            i += 1
    return sorted(all_regions, key=lambda r: r[0])
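# Usage sketch (hypothetical values): regions that overlap across the two
# input lists collapse into a single span, assuming audio_utils.times_overlap
# treats intersecting intervals as overlapping.
#
#   combine_overlapping_regions([(1.0, 2.5)], [(2.0, 4.0), (6.0, 7.0)])
#   # -> [(1.0, 4.0), (6.0, 7.0)]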
def get_10_second_clips(regions_list, audio_file_path, full_audio_file_length,
                        index, audioset_laughter_fraction, adjustment_amount=0):
    """Group laughter regions, given as (start, duration) pairs, into ~10s windows.

    Returns (rows, all_clips, total_laughter_time, total_window_time).
    """
    if len(regions_list) == 0:
        return [], [], 0, 0

    # 1st pass: greedily group consecutive regions into candidate clips,
    # starting a new clip whenever the next region would push past 10 seconds.
    all_clips = []
    current_start = None
    current_end = None
    for i in range(len(regions_list)):
        if current_start is None:
            current_start = regions_list[i][0]
            beginning_space = current_start
        if (current_end is not None
                and regions_list[i][0] + regions_list[i][1] > current_start + 10):
            all_clips.append({
                'window': [current_start, current_end],
                'beginning_buffer': 0.,
                'end_buffer': 0.,
                'beginning_space': beginning_space
            })
            current_start = regions_list[i][0]
            beginning_space = current_start - current_end  # new start point to old end point
        current_end = regions_list[i][0] + regions_list[i][1]
    if current_start is not None and current_end is not None:
        end_space = full_audio_file_length - current_end
        all_clips.append({
            'window': [current_start, current_end],
            'beginning_buffer': 0.,
            'end_buffer': 0.,
            'beginning_space': beginning_space,
            'end_space': end_space
        })

    # Each clip's end_space (the gap before the next clip) mirrors the next
    # clip's beginning_space; only the last clip has it set already.
    for i, clip in enumerate(all_clips):
        if 'end_space' not in clip:
            clip['end_space'] = all_clips[i + 1]['beginning_space']

    # 2nd pass: pad each window by 0.5s on both sides when the gaps allow it.
    for i, clip in enumerate(all_clips):
        start, end = clip['window']
        time_to_add_per_side = 0.5
        if (time_to_add_per_side < clip['beginning_space']
                and time_to_add_per_side < clip['end_space']):
            clip['window'] = [start - time_to_add_per_side,
                              end + time_to_add_per_side]
            clip['beginning_space'] -= time_to_add_per_side
            clip['end_space'] -= time_to_add_per_side
            clip['beginning_buffer'] += time_to_add_per_side
            clip['end_buffer'] += time_to_add_per_side
            # Padding this clip also shrinks the gaps of its neighbors.
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add_per_side
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add_per_side

    # 3rd pass: center each window and extend it out to 10s. Try extending
    # equally on both sides; failing that, try one side; otherwise give up.
    for i, clip in enumerate(all_clips):
        start, end = clip['window']
        length = end - start
        # If the window is already longer than 10s, leave it as-is.
        time_to_add = np.maximum(10 - length, 0)
        time_to_add_per_side = time_to_add / 2
        if (time_to_add_per_side < clip['beginning_space']
                and time_to_add_per_side < clip['end_space']):
            clip['window'] = [start - time_to_add_per_side,
                              end + time_to_add_per_side]
            clip['beginning_space'] -= time_to_add_per_side
            clip['end_space'] -= time_to_add_per_side
            clip['beginning_buffer'] += time_to_add_per_side
            clip['end_buffer'] += time_to_add_per_side
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add_per_side
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add_per_side
        elif time_to_add < clip['beginning_space']:
            clip['window'] = [start - time_to_add, end]
            clip['beginning_space'] -= time_to_add
            clip['beginning_buffer'] += time_to_add
            if i > 0:
                all_clips[i - 1]['end_space'] -= time_to_add
        elif time_to_add < clip['end_space']:
            clip['window'] = [start, end + time_to_add]
            clip['end_space'] -= time_to_add
            clip['end_buffer'] += time_to_add
            if i < len(all_clips) - 1:
                all_clips[i + 1]['beginning_space'] -= time_to_add
        else:
            pass  # not enough room on either side; leave the window short
        # Clamp small negative buffers caused by floating-point error.
        if -0.1 < clip['beginning_buffer'] < 0:
            clip['beginning_buffer'] = 0.
        if -0.1 < clip['end_buffer'] < 0:
            clip['end_buffer'] = 0.
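    # Worked example of the bookkeeping above (hypothetical numbers): a clip
    # with window [12.0, 15.0] needs time_to_add = 7s, i.e. 3.5s per side.
    # With beginning_space = 5.0 and end_space = 4.0, both sides fit, so the
    # window becomes [8.5, 18.5], each buffer grows by 3.5s, and the spaces
    # shrink to 1.5 and 0.5 (the neighbors' gaps shrink by 3.5s as well).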
    # 4th pass: compute the class balance (laughter fraction) for this
    # conversation.
    total_window_time = sum(
        [clip['window'][1] - clip['window'][0] for clip in all_clips])
    total_laughter_time = sum([region[1] for region in regions_list])
    swb_laughter_fraction = total_laughter_time / total_window_time
    # Tweak adjustment_amount to find a value for which, after everything,
    # the class balances match.
    intended_window_time = (total_laughter_time / audioset_laughter_fraction
                            + adjustment_amount)

    # 5th pass: trim the clips back so the class balance matches the AudioSet
    # annotations. We need to cut total_window_time down to
    # intended_window_time, distributing the reduction so all windows stay
    # close to the same size.
    time_to_reduce = total_window_time - intended_window_time
    beginning_buffers = [clip['beginning_buffer'] for clip in all_clips]
    end_buffers = [clip['end_buffer'] for clip in all_clips]
    all_buffers = beginning_buffers + end_buffers
    time_to_reduce_per_buffer = distribute_time(time_to_reduce, all_buffers)
    beginning_buffer_updates, end_buffer_updates = np.split(
        time_to_reduce_per_buffer, 2)

    # Sanity checks left from development; failures are tolerated rather than
    # fatal because tiny float errors can push buffers slightly negative.
    try:
        for clip in all_clips:
            assert clip['beginning_buffer'] >= 0
            assert clip['end_buffer'] >= 0
        assert len(beginning_buffer_updates) == len(all_clips)
        assert len(end_buffer_updates) == len(all_clips)
    except AssertionError:
        pass

    for i, clip in enumerate(all_clips):
        clip['window'][0] += beginning_buffer_updates[i]
        clip['beginning_space'] += beginning_buffer_updates[i]
        clip['beginning_buffer'] -= beginning_buffer_updates[i]
        clip['window'][1] -= end_buffer_updates[i]
        clip['end_space'] += end_buffer_updates[i]
        clip['end_buffer'] -= end_buffer_updates[i]
        # Clamp small negative buffers caused by floating-point error.
        if -0.1 < clip['beginning_buffer'] < 0:
            clip['beginning_buffer'] = 0.
        if -0.1 < clip['end_buffer'] < 0:
            clip['end_buffer'] = 0.

    # 6th pass: recompute the class balance after trimming (kept for
    # inspection/debugging).
    total_window_time = sum(
        [clip['window'][1] - clip['window'][0] for clip in all_clips])
    total_laughter_time = sum([region[1] for region in regions_list])
    swb_laughter_fraction = total_laughter_time / total_window_time
    intended_window_time = total_laughter_time / audioset_laughter_fraction

    # Build the dataframe rows: for each window, record up to 5 laughter
    # regions that overlap it, using the AudioSet-style column names
    # (Start/End, Start.1/End.1, ...).
    rows = []
    for i, clip in enumerate(all_clips):
        inside_regions = [
            r for r in regions_list if audio_utils.times_overlap(
                clip['window'][0], clip['window'][1], r[0], r[0] + r[1])
        ]
        h = {
            'FileID': audio_file_path.split('/')[-1].split('.')[0],
            'audio_path': audio_file_path,
            'audio_length': full_audio_file_length,
            'window_start': clip['window'][0],
            'window_length': clip['window'][1] - clip['window'][0]
        }
        for j in range(5):
            start_key = 'Start' if j == 0 else f'Start.{j}'
            end_key = 'End' if j == 0 else f'End.{j}'
            if len(inside_regions) > j:
                r = inside_regions[j]
                h[start_key] = r[0]
                h[end_key] = r[0] + r[1]
            else:
                h[start_key] = np.nan
                h[end_key] = np.nan
        # Skip degenerate windows under 1 second.
        if h['window_length'] > 1.:
            rows.append(h)

    return rows, all_clips, total_laughter_time, total_window_time
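# distribute_time is defined elsewhere in this repo; below is a minimal
# sketch of the behavior assumed by the 5th pass above (not the actual
# implementation): spread `total` reduction across `buffers` as evenly as
# possible without exceeding any single buffer, returning an array the same
# length as `buffers`.
def distribute_time_sketch(total, buffers):
    buffers = np.asarray(buffers, dtype=float)
    out = np.zeros_like(buffers)
    remaining = max(float(total), 0.0)
    active = buffers > 0
    while remaining > 1e-9 and active.any():
        share = remaining / active.sum()           # even share per open buffer
        take = np.minimum(buffers - out, share) * active
        out += take
        remaining -= take.sum()
        active = (buffers - out) > 1e-9            # drop buffers that are full
    return out

# Example (hypothetical numbers): the small buffer is exhausted first, and the
# leftover reduction is split evenly between the remaining two.
#
#   distribute_time_sketch(3.0, [0.5, 2.0, 2.0])
#   # -> array([0.5, 1.25, 1.25])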