def gen_fea():
    sql_handler = SqlHandler()
    audio_text_value = list()
    with ThreadPoolExecutor(max_workers=30) as executor:
        # launch the per-participant extraction tasks in parallel
        task = [executor.submit(gen_sigle_fea, fold) for fold in PREFIX]
        for future in as_completed(task):
            try:
                # features aggregated over all data in one folder, e.g. 300_P
                fea_item = future.result()
                audio_text_value.append(fea_item)
            except Exception:
                logger.exception('feature extraction task failed')
                continue
    COVAREP_COLUMNS.remove('VUV')
    audio_fea = ['ID']
    COVAREP_COLUMNS.extend(FORMANT_COLUMNS)
    # Cartesian product: equivalent to a nested for loop over features and statistics
    for a_fea, s_fea in itertools.product(COVAREP_COLUMNS, stats_fea.columns):
        audio_fea.append(a_fea + '_' + s_fea)
    audio_text_fea = audio_fea + TEXT_COLUMNS
    assert len(audio_text_value[0]) == len(audio_text_fea)
    audio_text_df = pd.DataFrame(audio_text_value, columns=audio_text_fea)
    # the selected features differ between runs, so drop the old table before writing
    sql_handler.execute(f'drop table if exists {config.tbl_exp1_fea};')
    sql_handler.df_to_db(audio_text_df, config.tbl_exp1_fea)
    logger.info('audio feature exp1 has been stored!')
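# Hypothetical sketch of the per-participant worker that gen_fea() fans out to. The
# real gen_sigle_fea lives elsewhere in the repo; the helper name and statistic list
# below are assumptions, but the output layout mirrors the <feature>_<stat> column
# names built above with itertools.product (feature outer, statistic inner).
def aggregate_participant_sketch(covarep_df, participant_id,
                                 stats=('mean', 'std', 'min', 'max')):
    """Return [ID, f1_mean, f1_std, ..., fN_max] for one participant's frames."""
    summary = covarep_df.agg(list(stats))    # rows = statistics, columns = features
    row = [participant_id]
    for feature in covarep_df.columns:       # same ordering as itertools.product
        for stat in stats:
            row.append(summary.loc[stat, feature])
    return row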
def data_set():
    df_train = pd.read_csv(config.data_dir + global_values.TRAIN_SET_NAME, header=0)
    df_dev = pd.read_csv(config.data_dir + global_values.DEL_SET_NAME, header=0)
    logger.debug(df_dev.head())
    sql_handler = SqlHandler()
    # drop any previous copies before re-importing the split files
    sql_handler.execute(f'drop table if exists {config.tbl_develop_set}')
    sql_handler.execute(f'drop table if exists {config.tbl_training_set}')
    sql_handler.df_to_db(df_train, config.tbl_training_set)
    sql_handler.df_to_db(df_dev, config.tbl_develop_set)
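# Hypothetical downstream usage (not from the repo): join a stored split table back
# to a feature table on the participant ID to build a training frame. The read path
# assumes a SQLAlchemy engine and a Participant_ID column in the split CSVs; the
# repo's SqlHandler may expose its own query API instead.
def load_split_sketch(engine_url, split_table, fea_table):
    import pandas as pd
    from sqlalchemy import create_engine
    engine = create_engine(engine_url)
    split = pd.read_sql_table(split_table, engine)
    feas = pd.read_sql_table(fea_table, engine)
    return split.merge(feas, left_on='Participant_ID', right_on='ID', how='inner')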
def hog_pca():
    sql_handler = SqlHandler()
    # keep enough components to explain 99.9% of the variance
    pca = PCA(n_components=0.999)
    hog = pd.read_csv(config.data_dir + FACE_HOG)
    hog_pca_values = pca.fit_transform(hog)
    # the retained component count depends on the data, so derive it from the result
    hog_pca_names = ['hog_pca_' + str(i) for i in range(hog_pca_values.shape[1])]
    hog_pca = pd.DataFrame(hog_pca_values, columns=hog_pca_names)
    ids = [float(prefix[:-1]) for prefix in PREFIX]
    col_name = hog_pca.columns.tolist()
    col_name.insert(0, 'ID')
    hog_pca = hog_pca.reindex(columns=col_name, fill_value=1)
    hog_pca['ID'] = ids
    # the selected features differ between runs, so drop the old table before writing
    sql_handler.execute(f'drop table if exists {config.tbl_exp3_hog_fea};')
    sql_handler.df_to_db(hog_pca, config.tbl_exp3_hog_fea)
    logger.info('hog feature exp3 has been stored!')
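# Sanity-check sketch (illustrative only, not part of the pipeline): when
# n_components is given as a variance ratio, the number of retained components is
# data-dependent, which is why hog_pca() derives its column names from the
# transformed array rather than hard-coding a count.
def _pca_component_count_demo():
    import numpy as np
    from sklearn.decomposition import PCA
    X = np.random.rand(200, 50)
    demo_pca = PCA(n_components=0.999)
    Z = demo_pca.fit_transform(X)
    # component count varies with X; the explained-variance sum is at least 0.999
    return Z.shape[1], demo_pca.explained_variance_ratio_.sum()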
def extract_audio(sample, prefix, opensmile_options, outputoption, feature_type):
    """Extract openSMILE features for one sample and append them to the database.

    sample: PHQ id, e.g. 310
    prefix: PHQ file prefix, e.g. 310_
    feature_type: 'mfcc' or 'egemaps'
    """
    infilename = f"{config.sample_dir}/{prefix}P/{prefix}{SUFFIX['wav']}"
    outfilename = f'{sample}_{feature_type}.csv'
    opensmile_call = (f'{config.opensmile_exe} {opensmile_options} -inputfile {infilename} '
                      f'{outputoption} {outfilename} -instname {sample} -output ?')
    os.system(opensmile_call)
    if os.path.exists(outfilename):
        df = pd.read_csv(outfilename, sep=';')
    else:
        # openSMILE produced no output; return so the caller can log or retry the sample
        return sample, feature_type
    db_handler = SqlHandler()
    if feature_type == 'mfcc':
        db_handler.df_to_db(df, config.tbl_mfcc, if_exists='append')
    elif feature_type == 'egemaps':
        db_handler.df_to_db(df, config.tbl_egemaps, if_exists='append')
    os.remove(outfilename)
    return sample, feature_type
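# A possible driver for extract_audio (a sketch, not the repo's dispatcher): fan the
# per-sample openSMILE calls out over a thread pool, mirroring how the feature
# generation code above parallelises its work. The option strings below are
# placeholders; the real config files and flags come from the project's config.
def extract_all_sketch(samples, feature_type='mfcc',
                       opensmile_options='-configfile MFCC12_0_D_A.conf -nologfile',
                       outputoption='-csvoutput'):
    from concurrent.futures import ThreadPoolExecutor, as_completed
    with ThreadPoolExecutor(max_workers=8) as executor:
        tasks = [executor.submit(extract_audio, sample, f'{sample}_',
                                 opensmile_options, outputoption, feature_type)
                 for sample in samples]
        for future in as_completed(tasks):
            done_sample, done_type = future.result()
            print(f'{done_type} extraction finished for sample {done_sample}')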
def gen_fea():
    sql_handler = SqlHandler()
    # seed the arrays with the first participant, then stack the rest onto them;
    # the data are split across three tables (audio / video / text)
    audio_feas, text_feas, vedio_feas = gen_sigle_fea(PREFIX[0])
    with ThreadPoolExecutor(max_workers=30) as executor:
        # launch the remaining per-participant extraction tasks in parallel
        task = [executor.submit(gen_sigle_fea, fold) for fold in PREFIX[1:]]
        for future in as_completed(task):
            try:
                # features aggregated over all data in one folder, e.g. 300_P
                audio_value, text_value, vedio_value = future.result()
                audio_feas = np.concatenate((audio_feas, audio_value))
                vedio_feas = np.concatenate((vedio_feas, vedio_value))
                text_feas = np.concatenate((text_feas, text_value))
            except Exception:
                logger.exception('feature extraction task failed')
                continue
    COVAREP_COLUMNS.remove('VUV')
    audio_fea_name = ['ID']
    text_fea_name = ['ID']
    vedio_fea_name = ['ID']
    audio_fea_name.extend(COVAREP_COLUMNS + FORMANT_COLUMNS)
    text_fea_name.extend(TEXT_COLUMNS)
    vedio_fea_name.extend(STABLE_POINTS)
    assert len(audio_feas[0]) == len(audio_fea_name) and len(text_feas[0]) == len(text_fea_name) \
        and len(vedio_feas[0]) == len(vedio_fea_name)
    audio_df = pd.DataFrame(audio_feas, columns=audio_fea_name)
    vedio_df = pd.DataFrame(vedio_feas, columns=vedio_fea_name)
    text_df = pd.DataFrame(text_feas, columns=text_fea_name)
    # the HOG features are reduced and stored separately (arguably this belongs in model training)
    hog_pca()
    # the selected features differ between runs, so drop the old tables before writing
    sql_handler.execute(f'drop table if exists {config.tbl_exp3_audio_fea};')
    sql_handler.df_to_db(audio_df, config.tbl_exp3_audio_fea)
    logger.info('audio feature exp3 has been stored!')
    sql_handler.execute(f'drop table if exists {config.tbl_exp3_vedio_fea};')
    sql_handler.df_to_db(vedio_df, config.tbl_exp3_vedio_fea)
    logger.info('vedio feature exp3 has been stored!')
    sql_handler.execute(f'drop table if exists {config.tbl_exp3_text_fea};')
    sql_handler.df_to_db(text_df, config.tbl_exp3_text_fea)
    logger.info('text feature exp3 has been stored!')
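# Shape assumption behind the np.concatenate calls above (illustrative only): each
# result of gen_sigle_fea is taken to be a 2-D array whose first column is the
# participant ID, so stacking along the default axis 0 keeps the feature columns
# aligned across participants.
def _stacking_shape_demo():
    import numpy as np
    a = np.array([[300.0, 0.1, 0.2]])   # one participant's (1, 1 + n_features) block
    b = np.array([[301.0, 0.3, 0.4]])
    stacked = np.concatenate((a, b))    # axis=0 by default: one row per participant
    assert stacked.shape == (2, 3)
    return stacked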
def to_db(self, data_frame, table):
    sql_handler = SqlHandler()
    sql_handler.df_to_db(data_frame, table)
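# A minimal sketch of the SqlHandler interface this module leans on (execute and
# df_to_db), assuming it wraps a SQLAlchemy engine. The real class lives elsewhere
# in the repo; the connection URL and default if_exists policy here are assumptions.
class SqlHandlerSketch:
    def __init__(self, url='sqlite:///features.db'):   # placeholder connection URL
        from sqlalchemy import create_engine
        self.engine = create_engine(url)

    def execute(self, sql):
        from sqlalchemy import text
        with self.engine.begin() as conn:               # commits on success
            conn.execute(text(sql))

    def df_to_db(self, data_frame, table, if_exists='replace'):
        # delegate to pandas; the index is dropped so ID stays an ordinary column
        data_frame.to_sql(table, self.engine, if_exists=if_exists, index=False)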