def test_batch(self):
    """Batch-processing the same input twice must produce identical CSVs
    which also match the reference segmentation file."""
    segmenter = Segmenter(vad_engine='sm')
    with tempfile.TemporaryDirectory() as workdir:
        outputs = [os.path.join(workdir, name) for name in ('1.csv', '2.csv')]
        segmenter.batch_process(['./media/musanmix.mp3', './media/musanmix.mp3'], outputs)
        first, second = outputs
        self.assertTrue(filecmp.cmp(first, second))
        self.assertTrue(filecmp.cmp(first, './media/musanmix-sm-gender.csv'))
def __init__(self, model_path_1, model_path_2): self.spleeter = Separator('spleeter:2stems', model_path_1) # 基于频域进行音轨分离,分离人声的话一般只需要2轨,accompaniment.wav 提取的背景/伴奏; vocals.wav是提取的人声 self.spleeter._get_predictor() self.ina_speech_segmenter = Segmenter(detect_gender=False, model_dir=model_path_2) ###### logging.info("init done")
def main():
    """CLI entry point: segment one media file and write the result.

    Bug fix: the original used ``type=bool`` for ``--detect_gender``;
    ``bool("False")`` is True, so ANY non-empty value enabled gender
    detection. A proper string-to-bool converter keeps the CLI contract
    (``-g true`` / ``-g false``) while parsing correctly.
    """
    def _str2bool(value):
        # argparse-friendly boolean parser; raises so argparse reports
        # a clean usage error on junk input.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ('true', '1', 'yes', 'y', 'on'):
            return True
        if lowered in ('false', '0', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError('boolean value expected, got %r' % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--detect_gender", type=_str2bool, default=False,
                        help="Enable gender detection")
    parser.add_argument("-d", "--vad_engine", choices=['sm', 'smn'], default='smn',
                        help="Voice activity detection: smn (default) or sm")
    parser.add_argument("-b", "--ffmpeg_binary", default='ffmpeg',
                        help="FFMPEG binary")
    parser.add_argument("input", help="Input file")
    parser.add_argument("output", help="Output file")
    args = parser.parse_args()
    seg = Segmenter(vad_engine=args.vad_engine,
                    detect_gender=args.detect_gender,
                    ffmpeg=args.ffmpeg_binary)
    seg.batch_process([args.input], [args.output], verbose=True)
def run_inaseg(input_wav, csv_out_dir):
    """Segment one wav file with inaSpeechSegmenter and write a CSV.

    Returns the path of the produced CSV via get_csv_path.
    """
    # Loading the neural network into memory may take a few seconds.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=MatplotlibDeprecationWarning)
        from inaSpeechSegmenter import Segmenter
        segmenter = Segmenter(vad_engine="smn", detect_gender=True)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        target_csv = csv_out_dir / f"{input_wav.stem}.csv"
        segmenter.batch_process([str(input_wav)], [str(target_csv)], verbose=True)
    return get_csv_path(input_wav, csv_out_dir)
def split(file_name, out_dir):
    """Drop music/no-activity segments and cut the remaining speech into clips.

    Each kept segment is padded by 1/4 s of context on both sides; clips
    shorter than 0.5 s or longer than 20 s are discarded.

    Bug fix: when a speech segment started within the first quarter second,
    ``int(start*sr) - sr//4`` went negative and the Python slice wrapped
    around to the END of the array; the start index is now clamped to 0.
    Also removes the unused ``speech`` list and makes directory creation
    race-free with ``os.makedirs(..., exist_ok=True)``.

    Returns the list of written wav file paths.
    """
    print('\nREMOVE MUSIC AND CUT')
    seg = Segmenter()
    segmentation = seg(file_name)
    sample_rate, raw_audio = scipy.io.wavfile.read(file_name)
    print(segmentation)
    os.makedirs(out_dir, exist_ok=True)
    count = 1
    list_file = []
    base_name = os.path.basename(file_name).replace('.wav', '')
    pad = int(sample_rate / 4)  # 1/4 second of context around each segment
    for label, start, stop in segmentation:
        if label == 'Music' or label == 'NOACTIVITY':
            continue
        print(str(count), 'dur of sen:', stop - start)
        # Clamp to 0 so a near-zero start cannot produce a negative index.
        begin = max(0, int(start * sample_rate) - pad)
        end = int(stop * sample_rate) + pad
        speech_data = np.array(raw_audio[begin:end])
        duration = len(speech_data) / sample_rate
        print(len(speech_data), duration)
        if duration < 0.5 or duration > 20:
            continue
        out_filename = os.path.join(out_dir, '%s_%d.wav' % (base_name, count))
        list_file.append(out_filename)
        scipy.io.wavfile.write(out_filename, sample_rate, speech_data)
        count += 1
    return list_file
def __init__(self, filename):
    """Remember the source file, then locate music and trim it away."""
    self.filename = filename
    self.segmenter = Segmenter(
        vad_engine='smn', detect_gender=False, ffmpeg='ffmpeg')
    self._find_music()
    self._trim()
def removeMusicAndCut(file_name, out_dir):
    """Keep only the speech segments of a wav file, writing each as a clip.

    Segments labelled 'Music'/'NOACTIVITY' are skipped, as are clips
    shorter than 1 s or longer than 10 s.

    Fixes: removes the dead ``speech`` accumulator that was never used,
    and replaces the check-then-``os.mkdir`` pattern (racy, and crashes if
    the directory appears between check and create) with ``os.makedirs``.
    """
    print('\nREMOVE MUSIC AND CUT')
    seg = Segmenter()
    segmentation = seg(file_name)
    sample_rate, raw_audio = scipy.io.wavfile.read(file_name)
    print(segmentation)
    os.makedirs(out_dir, exist_ok=True)
    base_name = os.path.basename(file_name).replace('.wav', '')
    count = 1
    for label, start, stop in segmentation:
        if label == 'Music' or label == 'NOACTIVITY':
            continue
        print(str(count), 'dur of sen:', stop - start)
        speech_data = np.array(
            raw_audio[int(start * sample_rate):int(stop * sample_rate)])
        duration = len(speech_data) / sample_rate
        print(len(speech_data), duration)
        if duration < 1.0 or duration > 10:
            continue
        out_path = os.path.join(out_dir, '%s_%d.wav' % (base_name, count))
        scipy.io.wavfile.write(out_path, sample_rate, speech_data)
        count += 1
def getclips(media):
    """Find a 35-55 s music span in *media*, cut it with ffmpeg and send
    the clip via telegram-send.

    Bug fix: when no 'music' segment was detected, ``timestamp`` stayed
    empty and ``timestamp[0]`` raised IndexError when building the ffmpeg
    command; we now bail out early with a message instead.
    """
    seg = Segmenter()
    segmentation = seg(media)
    print("all seg", segmentation)
    timestamp = []
    for label, start, stop in segmentation:
        if label != 'music':
            continue
        print("find out! music", (label, start, stop))
        if len(timestamp):
            # Extend the candidate span only while it stays within 35-55 s.
            if stop - timestamp[0] < 35:
                print("Too short, wait more", stop, timestamp[0], stop - timestamp[0])
                continue
            elif stop - timestamp[0] > 55:
                print("Too long")
                break
            timestamp[1] = stop
        else:
            timestamp.append(start)
            timestamp.append(stop)
    if len(timestamp) < 2:
        # No music detected at all: nothing to cut or send.
        print("no music segment found in", media)
        return
    print("music", timestamp)
    for t in timestamp:
        print(int(t))
    newname = "%s.mp3" % (datetime.datetime.now().strftime("%Y%m%d"))
    duration = timestamp[1] - timestamp[0]
    print("ffmpeg -ss %f -i %s -to %f %s -y " % (timestamp[0], media, duration, newname))
    # NOTE(review): os.system with interpolated file names is shell-injection
    # prone; prefer subprocess.run([...], shell=False) if media names can be
    # untrusted.
    os.system("ffmpeg -ss %f -i %s -to %f %s -y" % (timestamp[0], media, duration, newname))
    os.system("/usr/local/bin/telegram-send --caption %s --file %s"
              % (datetime.datetime.now().strftime("%Y%m%d"), newname))
def test_stopsec(self):
    """Every segment returned with stop_sec set must start and end
    no later than that limit."""
    segmenter = Segmenter()
    limit = 5.
    for _label, seg_start, seg_stop in segmenter('./media/musanmix.mp3', stop_sec=limit):
        self.assertLessEqual(seg_stop, limit)
        self.assertLessEqual(seg_start, limit)
def test_processingresult(self):
    """Segmentation output must match the reference CSV, labels exactly
    and boundaries to floating-point precision."""
    segmenter = Segmenter(vad_engine='sm')
    result = segmenter('./media/musanmix.mp3')
    reference_df = pd.read_csv('./media/musanmix-sm-gender.csv', sep='\t')
    reference = [(row.labels, float(row.start), float(row.stop))
                 for _, row in reference_df.iterrows()]
    self.assertEqual([r[0] for r in reference], [r[0] for r in result])
    np.testing.assert_almost_equal([r[1] for r in reference], [r[1] for r in result])
    np.testing.assert_almost_equal([r[2] for r in reference], [r[2] for r in result])
def recognize(audioFile):
    """Return the dominant gender label of a file's voiced segments.

    Segments whose label contains 'noEnergy' are ignored.

    Robustness fix: the original raised ``ValueError: max() arg is an
    empty sequence`` when every segment was noEnergy; we now return None
    in that case.
    """
    seg = Segmenter()
    segmentation = seg(audioFile)
    gender = [label for label, _start, _stop in segmentation
              if 'noEnergy' not in label]
    if not gender:
        return None
    return max(set(gender), key=gender.count)
def _iina_segmentation(input_file):
    """Return (start, stop) pairs for the voiced segments of *input_file*,
    i.e. everything not labelled energy/noEnergy/noise/music."""
    skipped = ('energy', 'noEnergy', 'noise', 'music')
    segmenter = Segmenter()
    return [(start, stop)
            for label, start, stop in segmenter(input_file)
            if label not in skipped]
def wav2seg(args, input_files):
    """Run the segmenter over *input_files* and return the concatenated
    segment list.

    Modernization fix: ``distutils.util.strtobool`` is gone — distutils
    was removed from the standard library in Python 3.12 (PEP 632) — so
    the truthy/falsy parsing is done locally with the same contract
    (ValueError on unrecognized input).
    """
    value = str(args.detect_gender).lower()
    if value in ('y', 'yes', 't', 'true', 'on', '1'):
        detect_gender = True
    elif value in ('n', 'no', 'f', 'false', 'off', '0'):
        detect_gender = False
    else:
        # Mirror strtobool's behavior on unexpected input.
        raise ValueError('invalid truth value %r' % (args.detect_gender,))
    seg = Segmenter(vad_engine=args.vad_engine,
                    detect_gender=detect_gender,
                    ffmpeg=args.ffmpeg_binary)
    segmentations = []
    for input_file in input_files:
        segmentations += seg(input_file)
    print(segmentations)
    return segmentations
def main():
    """Build a gender/age report over every folder in ./CorpusM and save it.

    sys.argv[1] is the audio extension (without the dot); sys.argv[2] is
    the user name passed to WriteResult.

    Fixes: removes the dead local ``list_result`` and renames the local
    ``Path`` (which shadowed the conventional pathlib name) to a
    descriptive snake_case identifier.
    """
    seg = Segmenter(detect_gender=True)
    ext = "." + str(sys.argv[1])
    user = str(sys.argv[2])
    corpus_glob = os.getcwd() + "/CorpusM/*"
    folders = sorted(getFolders(corpus_glob))
    edades = Range_old(folders, ext)
    generos = Gender(folders, ext, seg)
    data = Reporte(folders, generos, edades, 'Mexico')
    WriteResult(data, user)
def test_boundaries(self):
    """Consecutive segments must be contiguous: each segment's stop time
    equals the next segment's start time."""
    def describe(index, segment):
        label, start, stop = segment
        return 'seg %d <%s, %f, %f>' % (index, label, start, stop)

    segmenter = Segmenter()
    result = segmenter('./media/musanmix.mp3')
    for i, (current, following) in enumerate(zip(result, result[1:])):
        self.assertEqual(
            current[2], following[1],
            '%s VS %s' % (describe(i, current), describe(i + 1, following)))
def test_processingresult(self):
    """Segmentation of the sample file must match the hard-coded reference."""
    expected = [
        ('music', 0.0, 22.48),
        ('noEnergy', 22.48, 29.080000000000002),
        ('male', 29.080000000000002, 32.480000000000004),
        ('music', 32.480000000000004, 52.800000000000004),
        ('noEnergy', 52.800000000000004, 54.78),
        ('music', 54.78, 55.74),
        ('noEnergy', 55.74, 63.34),
        ('male', 63.34, 68.26),
        ('noEnergy', 68.26, 68.92),
        ('male', 68.92, 71.60000000000001),
        ('noEnergy', 71.60000000000001, 72.0),
        ('male', 72.0, 73.82000000000001),
        ('noEnergy', 73.82000000000001, 74.5),
    ]
    segmenter = Segmenter(vad_engine='sm')
    self.assertEqual(expected, segmenter('./media/musanmix.mp3'))
def main():
    """Segment an input media file and serialize the result as JSON.

    The segmenter yields (label, start, end) tuples with labels among
    'Male', 'Female', 'Music' and 'NOACTIVITY'; they are converted to a
    serializable schema and written to the destination file.
    """
    input_file, json_file = sys.argv[1:3]
    segmenter = Segmenter()
    segmentation = segmenter(input_file)
    # Turn the list of tuples into an object suitable for serialization.
    seg_schema = convert_to_segmentation_schema(input_file, segmentation)
    write_output_json(seg_schema, json_file)
    exit(0)
def classify(self):
    """Label every file in self.media as Mixed / Music / Speech, based on
    the labels present in its segmentation, and record per-file wall-clock
    processing time in self.times."""
    self.seg = Segmenter()
    total = len(self.media)
    for index, audio_path in enumerate(self.media):
        started = int(round(time.time()))
        name = audio_path.split("/")[-1]
        print("### {}/{} Processing {} ###".format(index, total, name))
        segmentation = self.seg(audio_path)
        as_text = str(segmentation)
        self.segmentation.append(segmentation)
        has_voice = "Male" in as_text or "Female" in as_text
        has_music = "Music" in as_text
        if has_voice and has_music:
            self.results.append("Mixed")
        elif has_music:
            self.results.append("Music")
        elif has_voice:
            self.results.append("Speech")
        finished = int(round(time.time()))
        self.times.append(finished - started)
def classify(self):
    """Classify every file in self.media as Mixed / Music / Speech.

    Two back-ends are selected by self.algo:
      * "ina": inaSpeechSegmenter; the decision is made from the labels
        found in the stringified segmentation.
      * "paa": pyAudioAnalysis svm mid-term classification; the decision
        is made from the mean predicted class index.
    Per-file wall-clock times are appended to self.times.
    """
    if self.algo == "ina":
        # The neural network is loaded once, outside the file loop.
        self.seg = Segmenter()
    counter = 0
    for audioPath in self.media:
        startTime = int(round(time.time()))
        vid = audioPath.split("/")[-1]
        print("### {}/{} Processing {} ###".format(counter, len(self.media), vid))
        if self.algo == "ina":
            tmp = self.seg(audioPath)
            tmp2 = str(tmp)
            self.segmentation.append(tmp)
            # Crude label detection on the stringified segmentation output.
            if ("Male" in tmp2 or "Female" in tmp2) and "Music" in tmp2:
                self.results.append("Mixed")
            elif "Music" in tmp2:
                self.results.append("Music")
            elif "Male" in tmp2 or "Female" in tmp2:
                self.results.append("Speech")
        elif self.algo == "paa":
            [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
                audioPath, "svmSM/svmSM", "svm", False, '')
            # Mean class index near 0 -> speech, near 1 -> music;
            # anything in between is treated as mixed content.
            res = np.array(flagsInd).mean()
            if res <= 0.1:
                self.results.append("Speech")
            elif res >= 0.9:
                self.results.append("Music")
            else:
                self.results.append("Mixed")
        endTime = int(round(time.time()))
        self.times.append(endTime - startTime)
        counter += 1
class AudioDetect:
    """Voice-activity pipeline: spleeter separates the vocal track from the
    accompaniment, then inaSpeechSegmenter finds speech segments in the
    vocals. Results are returned as JSON strings."""

    def __init__(self, model_path_1, model_path_2):
        # Frequency-domain source separation; the 2-stem model splits the
        # input into accompaniment.wav (background) and vocals.wav (voice).
        self.spleeter = Separator('spleeter:2stems', model_path_1)
        # Force model loading now so the first request does not pay the cost.
        self.spleeter._get_predictor()
        self.ina_speech_segmenter = Segmenter(detect_gender=False,
                                              model_dir=model_path_2)
        logging.info("init done")

    def file_base_name(self, file_path):
        # File name without directory or extension.
        return Path(file_path).resolve().stem

    def spleeter_volcals_file_name(self, input_file, output_dir):
        # Path of the vocals track that do_spleeter writes for input_file.
        input_base_name = self.file_base_name(input_file)
        return output_dir + "/" + input_base_name + "/vocals.wav"

    def do_spleeter_from_buffer(self, input_buffer):
        # In-memory separation; returns only the vocal waveform.
        # NOTE(review): 44100 is presumably the expected input sample rate
        # of the buffer — confirm against buffer_utils.
        waveform = buffer_utils.buffer_to_wave_for_spleeter(
            input_buffer, 44100)
        sources = self.spleeter.separate(waveform)
        return sources['vocals']

    def do_spleeter(self, input_file, out_dir):
        # File-based separation; stems are written under out_dir.
        self.spleeter.separate_to_file(
            input_file, out_dir,
            filename_format='(unknown)/{instrument}.{codec}')
        return True

    def do_segment_from_buffer(self, input_buffer):
        # Segment pre-computed vocal audio held in memory.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            mspec, loge, difflen = buffer_utils.feat_from_spleeter_vocals_for_segment_two_transcode(
                input_buffer)
            segmention = self.ina_speech_segmenter.segment_feats(
                mspec, loge, difflen, 0)
            return (True, segmention)

    def do_segment(self, input, output_dir):
        # Segment the vocals file previously produced by do_spleeter.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            segmention = self.ina_speech_segmenter(
                self.spleeter_volcals_file_name(input, output_dir))
            return (True, segmention)

    def process_segmention(self, result_dic, segmention):
        # Merge consecutive segments sharing the same mapped label and
        # collect the speech runs as {type, startSec, endSec} dicts under
        # result_dic["segments"].
        last_lable = ""
        last_start = -1
        last_end = -1
        segments = []
        for segment in segmention:
            label = segment[0]
            label = self.map_label(label)
            start = round(float(segment[1]), 2)
            end = round(float(segment[2]), 2)
            if last_lable == "":
                # First segment: start a new run.
                last_lable = label
                last_start = start
                last_end = end
                continue
            if last_lable == label:
                # Same label: extend the current run.
                last_end = end
                continue
            else:
                # Label changed: flush the finished run if it was speech.
                if last_lable == "speech":
                    segments.append({
                        "type": "speech",
                        "startSec": last_start,
                        "endSec": last_end
                    })
                last_lable = label
                last_start = start
                last_end = end
        # Flush the trailing run.
        if last_lable == "speech":
            segments.append({
                "type": "speech",
                "startSec": last_start,
                "endSec": last_end
            })
        result_dic["segments"] = segments

    def map_label(self, label):
        # On the separated vocal track, both 'music' and 'speech' labels
        # are treated as speech; everything else counts as silence.
        speech_labels = ["music", "speech"]
        if label in speech_labels:
            return "speech"
        return "noEnergy"

    def process_from_buffer(self, input_buffer, input_file):
        # Full in-memory pipeline; returns a JSON string. On failure the
        # JSON contains only the file name.
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input_file)
        result_dic["fileName"] = input_base_name
        vocals_data = self.do_spleeter_from_buffer(input_buffer)
        if vocals_data is None:
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)
        result, segmention = self.do_segment_from_buffer(
            vocals_data)  # make sure vocals_data is 16kHz
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)
        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)

    def process(self, input, output):
        # Full file-based pipeline; returns a JSON string. On failure the
        # JSON contains only the file name.
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input)
        result_dic["fileName"] = input_base_name
        if not self.do_spleeter(input, output):  # step 1: source separation
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)
        result, segmention = self.do_segment(input, output)  # step 2: VAD
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)
        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)
def __init__(self):
    """Create the underlying inaSpeechSegmenter instance."""
    print("\ncnn_segs init...")
    self.seg = Segmenter()
help= "(default: 'true'). If set to 'true', segments detected as speech will be splitted into 'male' and 'female' segments. If set to 'false', segments corresponding to speech will be labelled as 'speech' (faster)" ) args = parser.parse_args() # Preprocess arguments and check their consistency input_files = [] for e in args.input: input_files += glob.glob(e) assert len( input_files ) > 0, 'No existing media selected for analysis! Bad values provided to -i (%s)' % args.input odir = args.output_directory assert os.access(odir, os.W_OK), 'Directory %s is not writable!' % odir # Do processings from inaSpeechSegmenter import Segmenter, seg2csv # load neural network into memory, may last few seconds detect_gender = bool(distutils.util.strtobool(args.detect_gender)) seg = Segmenter(vad_engine=args.vad_engine, detect_gender=detect_gender) with warnings.catch_warnings(): warnings.simplefilter("ignore") for i, e in enumerate(input_files): print('processing file %d/%d: %s' % (i + 1, len(input_files), e)) base, _ = os.path.splitext(os.path.basename(e)) seg2csv(seg(e), '%s/%s.csv' % (odir, base))
###--- utilsディレクトリはdatasetsディレクトリより上の階層にあるので ---### #sys.path.append('../utils') import vad_utils if __name__ == '__main__': """---Get all wavdata path---""" args = vad_utils.parse_args() path_input = vad_utils.get_path(args.input_dir) path_output = vad_utils.get_path(args.output_dir) clean_test_wav = vad_utils.get_wav_data(path_input) """---Generate de-silence data---""" path_index = 0 for i in clean_test_wav: #sr, input_data = wav.read(i) seg = Segmenter(vad_engine='smn', detect_gender=False) segmentation = seg(i) speech_segment_index = 0 for segment in segmentation: segment_label = segment[0] if (segment_label == 'speech'): #Convert start time in section from s to ms start_time = segment[1] * 1000 end_time = segment[2] * 1000 # 分割結果をwavに出力 newAudio = AudioSegment.from_wav(i) newAudio = newAudio[start_time:end_time] newAudio.export(path_output + "/segment" + str(path_index) +
def test_praat_export(self):
    """batch_process with output_format='textgrid' must reproduce the
    reference Praat TextGrid file byte-for-byte."""
    segmenter = Segmenter()
    with tempfile.TemporaryDirectory() as workdir:
        produced = os.path.join(workdir, '1.TextGrid')
        segmenter.batch_process(['./media/musanmix.mp3'], [produced],
                                output_format='textgrid')
        self.assertTrue(
            filecmp.cmp(produced, './media/musanmix-smn-gender.TextGrid'))
import Pyro4
import sys
import os
import socket
from inaSpeechSegmenter import Segmenter

if __name__ == '__main__':
    # Distributed worker: repeatedly pulls (source, destination) job lists
    # from a Pyro4 job server and processes them with inaSpeechSegmenter.
    dname = os.path.dirname(os.path.realpath(__file__))
    hostname = socket.gethostname()
    # First CLI argument is the Pyro URI of the job server.
    uri = sys.argv[1]
    jobserver = Pyro4.Proxy(uri)
    # Sentinel status reported on the first request, before any batch ran.
    ret = -1
    outname = 'init'
    # batch size set at 1024. Use lower values with small gpus
    g = Segmenter(batch_size=1024)
    while True:
        # Report the previous batch's status and fetch the next job list.
        lsrc, ldst = jobserver.get_njobs('%s %s' % (hostname, ret))
        print(lsrc, ldst)
        if len(lsrc) == 0:
            print('job list finished')
            break
        ret = g.batch_process(lsrc, ldst, skipifexist=True, nbtry=3)
def test_short(self):
    """A very short clip should yield exactly one male segment."""
    segmenter = Segmenter(vad_engine='sm')
    result = segmenter('./media/0021.mp3')
    self.assertEqual([('male', 0, 0.66)], result)
def test_execution(self):
    """Smoke test: a failure here usually indicates a broken tensorflow
    installation rather than a bug in the segmenter itself."""
    segmenter = Segmenter()
    segmenter('./media/musanmix.mp3')
def recognize(audioFile):
    """Return the most frequent gender label among a file's voiced segments.

    Bug fix: the original never populated ``gender`` before calling
    ``max(set(gender), key=gender.count)``, so it ALWAYS raised
    ``ValueError: max() arg is an empty sequence``. The list is now filled
    from the segmentation (skipping 'noEnergy' segments, matching the
    sibling implementation of this function), and None is returned when
    no voiced segment exists.
    """
    seg = Segmenter()
    segmentation = seg(audioFile)
    gender = [label for label, _start, _stop in segmentation
              if 'noEnergy' not in label]
    if not gender:
        return None
    return max(set(gender), key=gender.count)
# Colab notebook cell: extract the first 20 s of audio from an uploaded
# video, then measure the share of female vs male speech.
import moviepy.editor as mp
from ddsp.colab.colab_utils import upload

# Interactive upload widget; returns file names and decoded audio data.
filenames, audios = upload()
tuple_file = filenames, audios[0][0]
video = tuple_file[0][0]
# Only the first 20 seconds of the clip are analyzed.
clip = mp.VideoFileClip("/content/" + video).subclip(0, 20)
clip.audio.write_audiofile("audio.mp3")

"""##**Step 3: compute percentage**"""

#@title Click here to calculate the percentage of female/male voice speech (this may take a while).
from inaSpeechSegmenter import Segmenter, seg2csv

media = 'audio.mp3'
seg = Segmenter()
segmentation = seg(media)
# Accumulate total voiced duration per gender label.
female = 0
male = 0
for i in segmentation:
    duration = i[2] - i[1]
    if i[0] == "female":
        female = female + duration
    elif i[0] == "male":
        male = male + duration
total_speech = female + male


def percentage(part, whole):
    """Return *part* as a percentage of *whole*."""
    return 100 * float(part) / float(whole)
def test_init(self):
    """Constructing a Segmenter with default arguments must not raise."""
    Segmenter()