-
Notifications
You must be signed in to change notification settings - Fork 0
/
silence_cutter.py
executable file
·214 lines (180 loc) · 6.97 KB
/
silence_cutter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# Justin 2019
# Code is from the python notebooks.
import os
import numpy as np
import argparse
from tqdm import tqdm
import librosa
import soundfile as sf
import re
def load_file(input_filename, mono=True, sr=22050):
    """Load an audio file through librosa.

    Args:
        input_filename: path of the audio file to read.
        mono: forwarded to ``librosa.load``. Per librosa's documentation,
            True down-mixes to a single channel of shape ``(n,)``; False
            keeps the channels, giving shape ``(channels, n)``.
        sr: target sampling rate in samples per second, forwarded to
            ``librosa.load``. This wrapper defaults to 22050; librosa
            documents ``None`` as "keep the file's native rate".

    Returns:
        A ``(samples, sampling_rate)`` tuple exactly as produced by
        ``librosa.load``.
    """
    return librosa.load(input_filename, mono=mono, sr=sr)
def get_S(x, sr):
    """Compute the mel spectrogram of an audio signal.

    Args:
        x: audio samples (any array-like; copied into a NumPy array first).
        sr: sampling rate of ``x`` in samples per second.

    Returns:
        The result of ``librosa.feature.melspectrogram`` with 128 mel bands
        and an upper frequency limit of 8000 Hz; every other setting is
        left at librosa's default.
    """
    samples = np.array(x)
    mel = librosa.feature.melspectrogram(
        y=samples,
        sr=sr,
        n_mels=128,
        fmax=8000,
    )
    return mel
def dc_removal(x, sr):
    """Subtract the mean of ``x`` from each of its elements.

    Args:
        x: iterable of samples.
        sr: sampling rate; accepted for a uniform signature but not used.

    Returns:
        A new Python list holding ``element - mean(x)`` for every element
        of ``x``, in order.
    """
    offset = np.mean(x)
    centred = []
    for sample in x:
        centred.append(sample - offset)
    return centred
def high_pass_filter(x, sr, cutOff = 500):
    """Apply a 5th-order Butterworth high-pass filter to ``x``.

    Args:
        x: samples to filter.
        sr: sampling rate of ``x`` in samples per second.
        cutOff: cutoff frequency in Hz (default 500).

    Returns:
        The filtered signal as returned by ``scipy.signal.filtfilt``, which
        runs the filter forward and then backward over the data.
    """
    from scipy import signal

    order = 5
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    normalized_cutoff = cutOff / (0.5 * sr)
    numerator, denominator = signal.butter(order, normalized_cutoff, btype='high')
    return signal.filtfilt(numerator, denominator, x)
def preprocess(t, sr):
    """Locate the non-silent segments ("sounds") of an audio signal.

    The signal's mel spectrogram is reduced to a per-frame RMS value, each
    frame is classified as sound (RMS >= 0.025) or silence, and runs of
    sound frames are turned into ``[start, end]`` intervals in seconds.
    Silences shorter than ``min_len`` are absorbed into the surrounding
    sounds, sounds shorter than ``min_len`` are dropped, and the remaining
    sounds are widened by ``epsilon`` on both sides.

    Args:
        t: audio samples.
        sr: sampling rate of ``t`` in samples per second.

    Returns:
        A tuple ``(t, rms, threshold, sounds)``: the input samples
        unchanged, the RMS array from librosa, the 0/1 per-frame
        classification, and the list of ``[start, end]`` intervals.
    """
    # Frame index -> seconds uses 512 samples per frame, i.e. librosa's
    # default hop length, which get_S does not override.
    hop_length = 512
    min_len = 0.1
    # The shortest silence kept is min_len long, so padding each side of a
    # sound by less than half of that can never make two sounds overlap.
    # The factor therefore has to stay in [0, 0.5).
    epsilon = min_len * 2 / 5

    s = get_S(t, sr)
    # NOTE(review): rms is fed the mel spectrogram rather than an STFT
    # magnitude; some librosa versions validate S against frame_length --
    # confirm this call against the installed version.
    rms = librosa.feature.rms(S = s)
    # 1.0 where the frame's RMS reaches the 0.025 threshold, else 0.0.
    threshold = np.heaviside(rms[0] - 0.025,1)

    # 1. Deduce sounds from rising / falling edges of the threshold signal.
    #    A start is only recorded on a rising edge at frame >= 1, and an
    #    interval is only stored once its falling edge is seen.
    sounds = []
    start = None
    for i in range(1, len(threshold)):
        previous, current = threshold[i - 1], threshold[i]
        if current == 1.0 and previous < 1.0:
            if start is None:
                start = i * hop_length / sr
        elif current == 0.0 and previous > 0.0:
            if start is not None:
                sounds.append([start, i * hop_length / sr])
                start = None

    # 2. A silence shorter than min_len is not a real silence: merge the
    #    sound after it into the sound before it.
    print("before: len = ", len(sounds))
    merged = sounds[:1]
    for segment in sounds[1:]:
        if segment[0] - merged[-1][1] < min_len:
            merged[-1][1] = segment[1]
        else:
            merged.append(segment)

    # Likewise drop sounds shorter than min_len.
    # NOTE(review): as in the original loops (which started at index 1),
    # the first sound is exempt from this filter and from the padding
    # below -- confirm whether that is intended.
    sounds = merged[:1] + [
        segment for segment in merged[1:]
        if not segment[1] - segment[0] < min_len
    ]

    # 3. Pad sounds by epsilon so the detector's inaccuracy does not clip
    #    them. (The original while-loop here never advanced its index and
    #    therefore never terminated once more than one sound remained.)
    for segment in sounds[1:]:
        segment[0] -= epsilon
        segment[1] += epsilon

    print("now, sounds: ", len(sounds))
    return t, rms, threshold, sounds
# silence cutting operations
def delete(sounds, index, sound=True):
    """Remove one sound, or one silence, from a list of sound intervals.

    Args:
        sounds: list of ``[start, end]`` pairs; it is modified in place.
        index: which item to remove. For a sound this is its position in
            ``sounds``. For a silence, silence ``index`` is the gap between
            sound ``index`` and sound ``index + 1``.
        sound: True (default) to delete a sound, False to delete a silence.

    Returns:
        The same ``sounds`` list, after modification.

    Raises:
        IndexError: if ``sound`` is True and ``index`` is out of range.
    """
    if sound:
        sounds.pop(index)
        return sounds

    # A silence only exists between two consecutive sounds, so the last
    # sound (and anything beyond it) has no silence to delete: leave the
    # list untouched instead of raising.
    if not 0 <= index < len(sounds) - 1:
        return sounds

    # Removing the gap turns the two neighbouring sounds into one interval
    # that runs from the start of the first to the end of the second.
    sounds[index][1] = sounds[index + 1][1]
    sounds.pop(index + 1)
    return sounds
if __name__ == "__main__":
    # Batch driver: for every .wav file in --input_dir, detect the
    # non-silent segments and write them, one "start,end" row (in seconds)
    # per segment, to <output_dir>/<file name without extension>.txt.
    import csv

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', default='input/')
    parser.add_argument('--output_dir', default='output/')
    # --start_range / --end_range are accepted but not used at present.
    parser.add_argument('--start_range', default=0, type=int)
    parser.add_argument('--end_range', default=9, type=int)
    # --wait N skips the first N .wav files encountered.
    parser.add_argument('--wait', default=0, type=int)
    args = parser.parse_args()
    print(args.input_dir)

    # Frame index -> seconds uses 512 samples per frame, i.e. librosa's
    # default hop length, which get_S does not override.
    hop_length = 512
    min_len = 0.09
    # Padding added on each side of a sound; the factor must stay in
    # [0, 0.5) so that two padded sounds can never overlap.
    epsilon = min_len * 0.45

    # Make sure the destination exists before any result is written.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    count = 0
    for filename in os.listdir(args.input_dir):
        if not filename.endswith(".wav"):
            continue
        cur_file = os.path.join(args.input_dir, filename)
        print("Processing ", cur_file)
        count += 1
        if count <= args.wait:
            print("pass")
            continue

        x, sr = load_file(cur_file)
        x = dc_removal(x, sr)
        t = high_pass_filter(x, sr, cutOff=300)
        s = get_S(t, sr)
        # NOTE(review): rms is fed the mel spectrogram rather than an STFT
        # magnitude; some librosa versions validate S against frame_length
        # -- confirm this call against the installed version.
        rms = librosa.feature.rms(S = s)
        # 1.0 where the frame's RMS reaches the 0.025 threshold, else 0.0.
        threshold = np.heaviside(rms[0] - 0.025,1)

        # 1. Using threshold, determine where the silences and sounds are.
        #    A start is only recorded on a rising edge at frame >= 1, and
        #    an interval is only stored once its falling edge is seen.
        sounds = []
        start = None
        for i in range(1, len(threshold)):
            previous, current = threshold[i - 1], threshold[i]
            if current == 1.0 and previous < 1.0:
                if start is None:
                    start = i * hop_length / sr
            elif current == 0.0 and previous > 0.0:
                if start is not None:
                    sounds.append([start, i * hop_length / sr])
                    start = None

        # 2.1. Silences must have a minimum length of min_len: a shorter
        #      gap is absorbed by merging the sounds on either side of it.
        print("before: len = ", len(sounds))
        merged = sounds[:1]
        for segment in sounds[1:]:
            if segment[0] - merged[-1][1] < min_len:
                merged[-1][1] = segment[1]
            else:
                merged.append(segment)

        # 2.2. Sounds must have a minimum length of min_len.
        # NOTE(review): as in the original loops (which started at index
        # 1), the first sound is exempt from this filter and from the
        # padding below -- confirm whether that is intended.
        sounds = merged[:1] + [
            segment for segment in merged[1:]
            if not segment[1] - segment[0] < min_len
        ]

        # 3. Pad sounds by epsilon, a fraction of min_len, to keep as much
        #    of each sound as possible and avoid cutting anything off.
        for segment in sounds[1:]:
            segment[0] -= epsilon
            segment[1] += epsilon

        print("now, sounds: ", len(sounds))
        print(sounds[:len(sounds) // 10], "...")

        # Save the intervals as text for later use. The name is derived
        # from the file name itself and joined with os.path so it no longer
        # depends on whether the directory arguments end with a slash.
        stem = os.path.splitext(filename)[0]
        print(stem + "< THIS IS IT")
        out_path = os.path.join(args.output_dir, stem + ".txt")
        with open(out_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(sounds)