def __init__(self):
    self.preprocessor = Preprocessor()
    self.feature_extractor = FeatureExtractor()
    self.crf_analyzer = CRFAnalyzer()
    self.sentiment_analyzer = SentimentAnalyzer()
    print("\nAll modules instantiated and ready to go...\n")
def mfcc_stuff(cfg):
    """ for dct, filter bands, etc """

    # plot path
    plot_path = '../docu/thesis/3_signal/figs/'

    # init feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # plot dct
    plot_dct(custom_dct_matrix(cfg['feature_params']['n_filter_bands']), plot_path=plot_path, name='signal_mfcc_dct', show_plot=False)
    plot_dct(custom_dct_matrix(cfg['feature_params']['n_filter_bands']), plot_path=plot_path, context='dct-div', name='signal_mfcc_dct-div', show_plot=False)

    # mel scale
    plot_mel_scale(plot_path=plot_path, name='signal_mfcc_mel_scale', show_plot=False)

    # plot mel bands
    plot_mel_band_weights(feature_extractor.w_f, feature_extractor.w_mel, feature_extractor.f, feature_extractor.m, plot_path=plot_path, name='signal_mfcc_weights', show_plot=True)
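# A minimal usage sketch (not part of the original file): `mfcc_stuff` only needs
# the loaded yaml config. The config path is copied from the test script further
# below and is an assumption about the project layout.
if __name__ == '__main__':
    import yaml
    mfcc_stuff(yaml.safe_load(open('../config.yaml')))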
def __init__(self, dataset_cfg, feature_params, collect_wavs=False, verbose=False):

    # parent init
    super().__init__(dataset_cfg, feature_params, collect_wavs=collect_wavs, verbose=verbose)

    # feature extractor
    self.feature_extractor = FeatureExtractor(feature_params=self.feature_params)

    # short vars
    self.N = self.feature_extractor.N
    self.hop = self.feature_extractor.hop

    # create plot paths if they do not already exist
    create_folder(list(self.plot_paths.values()))

    # recreate wavs if requested or if the wav folders are missing/empty
    if self.dataset_cfg['recreate'] or not check_folders_existance(self.wav_folders, empty_check=True):

        # delete old data
        delete_files_in_path(self.wav_folders, file_ext=self.dataset_cfg['file_ext'])

        # create wav folders
        create_folder(self.wav_folders)

        # create sets (specific to dataset)
        self.create_sets()

    # get audio files from sets
    self.get_audiofiles()
    self.get_annotation_files()
def __init__(self, classifier, mic_params, is_audio_record=False, root_path='./'):

    # arguments
    self.classifier = classifier
    self.mic_params = mic_params
    self.is_audio_record = is_audio_record
    self.root_path = root_path

    # plot path
    self.plot_path = self.root_path + self.mic_params['plot_path']

    # create folder for plot path
    create_folder([self.plot_path])

    # shortcuts
    self.feature_params = classifier.feature_params

    # feature extractor
    self.feature_extractor = FeatureExtractor(self.feature_params)

    # windowing params
    self.N, self.hop = self.feature_extractor.N, self.feature_extractor.hop

    # queue
    self.q = queue.Queue()

    # collector
    self.collector = Collector(N=self.N, hop=self.hop, frame_size=self.feature_params['frame_size'], update_size=self.mic_params['update_size'], frames_post=self.mic_params['frames_post'], is_audio_record=self.is_audio_record)

    # device
    self.device = sd.default.device[0] if not self.mic_params['select_device'] else self.mic_params['device']

    # determine downsample
    self.downsample = self.mic_params['fs_device'] // self.feature_params['fs']

    # get input devices
    self.input_dev_dict = self.extract_devices()

    # show devices
    print("\ndevice list: \n", sd.query_devices())
    print("\ninput devs: ", self.input_dev_dict.keys())

    # stream
    self.stream = None

    # change device flag
    self.change_device_flag = False
def audio_set_wavs(cfg):
    """ audio set wavs """

    # plot path
    plot_path = '../docu/thesis/5_exp/figs/'

    # audio sets
    a1 = AudioDataset(cfg['datasets']['speech_commands'], cfg['feature_params'], root_path='../')
    a2 = AudioDataset(cfg['datasets']['my_recordings'], cfg['feature_params'], root_path='../')

    # feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # get audio files
    a1.get_audiofiles()

    # random seed
    np.random.seed(1234)
    r = np.random.randint(low=0, high=150, size=len(a1.set_audio_files[1]))

    wav_grid = []

    # process wavs
    for wav in sorted([label_wavs[r[i]] for i, label_wavs in enumerate(a1.set_audio_files[1])]):

        # info
        print("wav: ", wav)

        # get raw
        x, _ = a1.wav_pre_processing(wav)

        # extract feature vectors [m x l]
        _, bon_pos = feature_extractor.extract_mfcc(x, reduce_to_best_onset=False)

        # append to wav grid
        wav_grid.append((librosa.util.normalize(x), re.sub(r'[0-9]+-', '', wav.split('/')[-1].split('.')[0]), bon_pos))

    # plot wav grid
    plot_wav_grid(wav_grid, feature_params=a1.feature_params, grid_size=(6, 5), plot_path=plot_path, name='wav_grid_c30', show_plot=True)
def __init__(self, classifier, feature_params, mic_params, is_audio_record=False):

    # arguments
    self.classifier = classifier
    self.feature_params = feature_params
    self.mic_params = mic_params
    self.is_audio_record = is_audio_record

    # windowing params
    self.N, self.hop = int(feature_params['N_s'] * feature_params['fs']), int(feature_params['hop_s'] * feature_params['fs'])

    # queue
    self.q = queue.Queue()

    # collector
    self.collector = Collector(N=self.N, hop=self.hop, frame_size=self.feature_params['frame_size'], update_size=self.mic_params['update_size'], frames_post=self.mic_params['frames_post'], is_audio_record=self.is_audio_record)

    # feature extractor
    self.feature_extractor = FeatureExtractor(self.feature_params['fs'], N=self.N, hop=self.hop, n_filter_bands=self.feature_params['n_filter_bands'], n_ceps_coeff=self.feature_params['n_ceps_coeff'], frame_size=self.feature_params['frame_size'])

    # select microphone yourself (usually not necessary)
    if mic_params['select_device']:
        sd.default.device = self.mic_params['device']

    # determine downsample
    self.downsample = self.mic_params['fs_device'] // self.feature_params['fs']

    # show devices
    print("\ndevice list: \n", sd.query_devices())

    # setup sounddevice stream
    self.stream = sd.InputStream(samplerate=self.mic_params['fs_device'], blocksize=int(self.hop * self.downsample), channels=self.mic_params['channels'], callback=self.callback_mic)
def time_measurements(x, u, feature_params):
    """ time measurements """

    # create feature extractor
    feature_extractor = FeatureExtractor(feature_params)

    # n measurements
    delta_time_list = []

    for i in range(100):

        # measure extraction time - start
        start_time = time.time()

        # time: 0.030081419944763182
        #y = calc_mfcc39(x, fs, N=400, hop=160, n_filter_bands=32, n_ceps_coeff=12, use_librosa=False)

        # time: 0.009309711456298829
        #y = calc_mfcc39(x, fs, N=400, hop=160, n_filter_bands=32, n_ceps_coeff=12, use_librosa=True)

        # time: 0.00014737367630004883
        #y = (custom_dct(np.log(u), n_filter_bands).T)

        # time: 6.929159164428711e-05
        #y = scipy.fftpack.dct(np.log(u), type=2, n=n_filter_bands, axis=1, norm=None, overwrite_x=False).T

        # time: 0.00418839693069458 *** winner
        y, _ = feature_extractor.extract_mfcc(x)

        # time: 0.015525884628295898
        #y, _ = feature_extractor.extract_mfcc39_slow(x)

        # time: 0.011266257762908936s
        #y = custom_stft(x, N=N, hop=hop, norm=True)

        # time: 0.0005800390243530274s
        #y = 2 / N * librosa.stft(x, n_fft=N, hop_length=hop, win_length=N, window='hann', center=True, dtype=None, pad_mode='reflect')

        # time: 0.00044193744659423826s
        #_, _, y = scipy.signal.stft(x, fs=1.0, window='hann', nperseg=N, noverlap=N-hop, nfft=N, detrend=False, return_onesided=True, boundary='zeros', padded=False, axis=-1)

        # result of measured time diff
        delta_time_list.append(time.time() - start_time)

    # data shape
    print("y: ", y.shape)

    # times
    print("delta_time: ", np.mean(delta_time_list))
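# Hedged usage sketch for the timing helper above, not part of the original
# script. The config path and `some_test_signal` helper are borrowed from the
# test script further below; the filter-band matrix `u` is a random placeholder
# (only the active `extract_mfcc` branch actually touches `x` here).
if __name__ == '__main__':
    import yaml
    cfg = yaml.safe_load(open('../config.yaml'))
    x = some_test_signal(cfg['feature_params']['fs'], t=1, save_to_file=False)
    u = np.abs(np.random.randn(98, cfg['feature_params']['n_filter_bands']))  # placeholder band energies
    time_measurements(x, u, cfg['feature_params'])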
def write_predictions(self):
    """Output the predictions to a text file."""
    res = FeatureExtractor("test").run()
    model = torch.load("model.pt")
    test = namedtuple("res", ["lsr", "feats", "scores"])(lsr=res.lsr.reshape(-1, 2048), feats=res.feats, scores=res.scores)
    dev_ = data_utils.TensorDataset(*[torch.tensor(getattr(test, i)).float() for i in ["lsr", "feats", "scores"]])
    with torch.no_grad():
        preds = model.forward(*dev_.tensors[:2]).cpu().numpy()
    np.set_printoptions(suppress=True)
    np.savetxt("predictions.txt", preds.astype(float), delimiter="\n", fmt="%f")
    print("Predictions saved to predictions.txt")
def showcase_wavs(cfg, raw_plot=True, spec_plot=True, mfcc_plot=True, show_plot=False):
    """ showcase wavs """

    # plot path
    plot_path = '../docu/thesis/3_signal/figs/'

    # change params
    feature_params = cfg['feature_params'].copy()
    feature_params['n_ceps_coeff'] = 32
    feature_params['norm_features'] = True

    # init feature extractor
    feature_extractor = FeatureExtractor(feature_params)

    # wav, anno dir
    wav_dir, anno_dir = '../ignore/my_recordings/showcase_wavs/', '../ignore/my_recordings/showcase_wavs/annotation/'

    # analyze some wavs
    for wav, anno in zip(glob(wav_dir + '*.wav'), glob(anno_dir + '*.TextGrid')):

        # info
        print("\nwav: ", wav)
        print("anno: ", anno)

        # load file
        x, _ = librosa.load(wav, sr=feature_params['fs'])

        # raw waveform
        if raw_plot:
            plot_waveform(x, feature_params['fs'], anno_file=anno, hop=feature_extractor.hop, plot_path=plot_path, name='signal_raw_' + wav.split('/')[-1].split('.')[0] + '_my', show_plot=show_plot)

        # spectrogram
        if spec_plot:
            plot_spec_profile(x, feature_extractor.calc_spectogram(x).T, feature_params['fs'], feature_extractor.N, feature_extractor.hop, anno_file=anno, plot_path=plot_path, title=wav.split('/')[-1].split('.')[0] + '_my', name='signal_spec-lin_' + wav.split('/')[-1].split('.')[0] + '_my', show_plot=show_plot)
            plot_spec_profile(x, feature_extractor.calc_spectogram(x).T, feature_params['fs'], feature_extractor.N, feature_extractor.hop, log_scale=True, anno_file=anno, plot_path=plot_path, title=wav.split('/')[-1].split('.')[0] + '_my', name='signal_spec-log_' + wav.split('/')[-1].split('.')[0] + '_my', show_plot=show_plot)

        # mfcc
        if mfcc_plot:
            mfcc, bon_pos = feature_extractor.extract_mfcc(x, reduce_to_best_onset=False)
            plot_mfcc_profile(x, cfg['feature_params']['fs'], feature_extractor.N, feature_extractor.hop, mfcc, anno_file=anno, sep_features=True, bon_pos=bon_pos, frame_size=cfg['feature_params']['frame_size'], plot_path=plot_path, name='signal_mfcc_' + wav.split('/')[-1].split('.')[0] + '_my', close_plot=False, show_plot=show_plot)
def estimate_parameters(multinomial_nb=False, bernoulli_nb=False, k_nearest=False, support_vm=False, support_vmsgd=False, bow=False, tfidf=False):
    """
    Performs a grid search on the given algorithm using a fixed set of parameter ranges.
    The values with the highest score are printed to stdout after evaluation.

    :param multinomial_nb: MultinomialNB
    :param bernoulli_nb: BernoulliNB
    :param k_nearest: KNeighborsClassifier
    :param support_vm: Linear SVM aka SVC
    :param support_vmsgd: SGDClassifier
    :param bow: CountVectorizer aka bag-of-words
    :param tfidf: TfidfVectorizer
    """
    fe = FeatureExtractor()
    counts, targets = fe.fetch_data()

    MAX_DF = [0.25, 0.5, 0.75, 1.0]
    N_GRAMS = [(1, 1), (1, 2), (1, 3), (1, 4)]

    if multinomial_nb:
        CLF = MultinomialNB()
        parameters = {'clf__alpha': 10.0**-np.arange(5, 11)}
    elif bernoulli_nb:
        CLF = BernoulliNB()
        parameters = {'clf__alpha': 10.0**-np.arange(5, 11)}
    elif k_nearest:
        CLF = KNeighborsClassifier()
        parameters = {
            'clf__n_neighbors': range(2, 10),
            'clf__weights': ('uniform', 'distance'),
            'clf__algorithm': ('auto', 'brute'),
            'clf__leaf_size': (20, 30, 40)
        }
    elif support_vm:
        CLF = SVC()
        parameters = {
            'clf__kernel': ('linear', 'sigmoid', 'rbf', 'poly'),
            'clf__decision_function_shape': ('ovo', 'ovr'),
            'clf__C': (100, 1000, 10000, 100000, 1000000),
            'clf__gamma': (0.001, 0.01, 0.1, 1)
        }
    elif support_vmsgd:
        CLF = SGDClassifier(max_iter=50)
        parameters = {
            'clf__loss': ('hinge', 'modified_huber', 'squared_hinge'),
            'clf__penalty': ('l1', 'l2', 'elasticnet'),
            'clf__alpha': 10.0**-np.arange(1, 8),
            'clf__tol': (0.3, 0.2, 1e-2, 1e-3, 1e-4),
            'clf__n_iter': [np.ceil(10**6 / 1062)],  # grid values must be iterable; n_iter only exists in older scikit-learn
            'clf__eta0': (0.0, 0.2, 0.5, 0.7),
            'clf__learning_rate': ('constant', 'optimal', 'invscaling'),
            'clf__average': (True, False)
        }
    else:
        print('Please specify which classifier to use')
        return

    # add feature extraction params and classifier to pipeline
    if bow:
        parameters.update({'vect__max_df': MAX_DF, 'vect__ngram_range': N_GRAMS})
        pipeline = Pipeline([('vect', CountVectorizer()), ('clf', CLF)])
    elif tfidf:
        parameters.update({
            'tfidf__max_df': MAX_DF,
            'tfidf__ngram_range': N_GRAMS,
            'tfidf__analyzer': ('word', 'char'),
            'tfidf__sublinear_tf': (True, False),
            'tfidf__smooth_idf': (True, False),
            'tfidf__use_idf': (True, False),
            'tfidf__norm': ('l1', 'l2', None)
        })
        pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', CLF)])
    else:
        print('Please specify which vectorizer to use')
        return

    # perform grid search on pipeline
    grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=15, scoring='accuracy')

    print("parameters:")
    pprint(parameters)
    print("Starting grid search. This may take some time...")

    # learn vocabulary
    grid_search.fit(counts, targets)

    print("Best parameters: " + str(grid_search.best_params_))
    print("Best score: %0.3f" % grid_search.best_score_)

    filename = '/var/booking_categorizer/'
    with open(filename, 'a') as file:
        file.write("Best parameters: " + str(grid_search.best_params_) + "\n" + "Best score: %0.3f" % grid_search.best_score_)
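# Minimal usage sketches (not part of the original module): grid-search a
# MultinomialNB over bag-of-words features, or an SGD classifier over tf-idf
# features. Runtime depends on the corpus returned by FeatureExtractor.fetch_data().
#   estimate_parameters(multinomial_nb=True, bow=True)
#   estimate_parameters(support_vmsgd=True, tfidf=True)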
sys.path.append("../") from common import create_folder from feature_extraction import FeatureExtractor # plot path #plot_path = './ignore/plots/fe/' # create folder #create_folder([plot_path]) # yaml config file cfg = yaml.safe_load(open("../config.yaml")) # init feature extractor feature_extractor = FeatureExtractor(cfg['feature_params']) # -- # params fs = 16000 N = 400 hop = 160 n_filter_bands = 16 n_ceps_coeff = 12 # -- # test signal # generate test signal x = some_test_signal(fs, t=1, save_to_file=False)
def run(properties, vector_models, results_file, csv_line):
    """
    Script that runs the k-fold cross validation
    :param properties: dictionary containing the parameters specified in the config file for the current experiment
    :param vector_models: list of embedding models to be used in this experiment
    :param results_file: csv file where the accuracies are going to be written
    :param csv_line: line of csv config file corresponding to the current experiment
    :return: nothing
    """
    kfold_folder_path = '../data/kfold/'  # folder containing the k partitions (the development set has already been split during preprocessing)
    print 'Writing to output file:', results_file.name
    k = properties['K']
    results_file.write(csv_line.rstrip())  # copies the line from the config file to keep track of used parameters
    r = range(1, k + 1)

    """ If the bag-of-words feature is chosen, the script initializes a dictionary for the list of all words in the
    data-set. It also initializes the index value available for the next unseen word with zero, because no word has
    been added yet. This index value is updated every time an unseen word occurs in the data-set. """
    if properties['BAG_OF_WORDS']:
        feature_extractor = FeatureExtractor(properties, words_dict={}, next_word_position=0, vector_models=vector_models, vectors_size=properties['VECTORS_SIZE'])
    else:
        feature_extractor = FeatureExtractor(properties, vector_models=vector_models, vectors_size=properties['VECTORS_SIZE'])

    results_dict = {'subj': 0.0, 'opos': 0.0, 'oneg': 0.0, 'iro': 0.0, 'polarity': 0.0}
    kfold_folder_path += 'conll/'

    """ managing cross validation; the k partitions have already been created during pre-processing """
    if k > 1:
        for i in r:
            print
            print 'RUNNING ITERATION N.', str(i)
            kth_value_folder = kfold_folder_path + str(k) + '/'

            """ creates list of partitions sorted by the k value inside the file names """
            partitions = sorted(os.listdir(kth_value_folder), key=lambda x: (int(re.sub('\D', '', x)), x))
            test_file = kth_value_folder + 'fold_' + str(i)
            for index in range(len(partitions)):
                partitions[index] = kth_value_folder + partitions[index]
            partitions.pop(partitions.index(test_file))
            try:
                assert len(partitions) == k - 1
            except AssertionError:
                print 'Error: invalid number of partitions'

            """ samples with word, emoji and embedding features """
            training_samples = []
            """ dictionaries of word occurrences in tweets for bag-of-words """
            training_words_dicts = []
            """ dictionary with training labels """
            training_labels = {'subj_s': [], 'opos_s': [], 'oneg_s': [], 'iro_s': [], 'lpos_s': [], 'lneg_s': []}
            extraction_function = feature_extractor.extract_from_conll

            """ using partitions as training set, with the exception of the kth one """
            for training_file in partitions:
                samples, dicts, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(training_file)
                training_labels['subj_s'] += subj_s
                training_labels['opos_s'] += opos_s
                training_labels['oneg_s'] += oneg_s
                training_labels['iro_s'] += iro_s
                training_labels['lpos_s'] += lpos_s
                training_labels['lneg_s'] += lneg_s
                training_samples += samples
                training_words_dicts += dicts

            """ sets to zero the empty positions in bags-of-words of training tweets """
            fill_dicts(training_samples, training_words_dicts, feature_extractor.next_word_position)

            test_labels = {}
            """ using the kth partition as test set """
            samples, test_words_dicts, id_s, top_s, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(test_file, test=True)
            """ delete embedding models """
            test_samples = samples
            test_labels['subj_s'] = subj_s
            test_labels['opos_s'] = opos_s
            test_labels['oneg_s'] = oneg_s
            test_labels['iro_s'] = iro_s
            test_labels['lpos_s'] = lpos_s
            test_labels['lneg_s'] = lneg_s
            test_labels['id_s'] = id_s

            """ sets to zero the empty positions in bags-of-words of test tweets """
            fill_dicts(test_samples, test_words_dicts, feature_extractor.next_word_position)

            training_labels_vectors = [training_labels['subj_s'], training_labels['opos_s'], training_labels['oneg_s'], training_labels['iro_s'], training_labels['lpos_s'], training_labels['lneg_s']]
            test_labels_vectors = [test_labels['subj_s'], test_labels['opos_s'], test_labels['oneg_s'], test_labels['iro_s'], test_labels['lpos_s'], test_labels['lneg_s']]
            test_id_s = test_labels['id_s']

            predict_matrix = get_prediction_matrix(training_samples, training_labels_vectors, test_samples, test_id_s, top_s, properties['KERNEL'])
            gold_matrix = get_gold_matrix(test_labels_vectors, test_id_s, top_s)
            prediction_lines = matrix2string(predict_matrix)
            test_lines = matrix2string(gold_matrix)

            """ write prediction and gold matrix to file for the evaluation script """
            tmp_folder = '../tmp/'
            tmp_result_file = open(tmp_folder + 'tmp_res.txt', 'w')
            tmp_gold_file = open(tmp_folder + 'tmp_gold.txt', 'w')
            tmp_result_file.write(prediction_lines)
            tmp_gold_file.write(test_lines)
            tmp_result_file.close()
            tmp_gold_file.close()

            """ evaluate and write accuracies to temporary file """
            tmp_out_file_name = 'tmp_out' + str(i) + '.txt'
            tmp_out_file = open(tmp_out_file_name, 'w')
            evaluate('tmp_res.txt', 'tmp_gold.txt', outfile=tmp_out_file, verbose=False)
            tmp_out_file.close()

            """ parse temporary results file and update the dictionary with experiment results """
            with open(tmp_out_file_name, 'r') as infile:
                task = ''
                for line in infile:
                    if 'task' in line:
                        task = line.rstrip().split()[-1]
                    if line[0].isdigit():
                        """ add the accuracy values to the dictionary of accuracies """
                        results_dict[task] += float(line.rstrip().split()[-1])

        for key, value in results_dict.iteritems():
            """ averages the results """
            results_dict[key] = value / k

    elif k == 1:
        """ if k == 1 it uses the official test-set as test """
        training_file_name = '/home/ruggero/MEGA/tesi_magistrale/classification/data/training_all.parsed'
        test_file_name = '/home/ruggero/MEGA/tesi_magistrale/classification/data/testset_annotated.parsed'
        training_labels = {'subj_s': [], 'opos_s': [], 'oneg_s': [], 'iro_s': [], 'lpos_s': [], 'lneg_s': []}
        extraction_function = feature_extractor.extract_from_conll
        training_samples, training_words_dicts, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(training_file_name)
        training_labels['subj_s'] += subj_s
        training_labels['opos_s'] += opos_s
        training_labels['oneg_s'] += oneg_s
        training_labels['iro_s'] += iro_s
        training_labels['lpos_s'] += lpos_s
        training_labels['lneg_s'] += lneg_s
        fill_dicts(training_samples, training_words_dicts, feature_extractor.next_word_position)

        test_labels = {}
        test_samples, test_words_dicts, id_s, top_s, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(test_file_name, test=True)
        test_labels['subj_s'] = subj_s
        test_labels['opos_s'] = opos_s
        test_labels['oneg_s'] = oneg_s
        test_labels['iro_s'] = iro_s
        test_labels['lpos_s'] = lpos_s
        test_labels['lneg_s'] = lneg_s
        test_labels['id_s'] = id_s
        fill_dicts(test_samples, test_words_dicts, feature_extractor.next_word_position)

        training_labels_vectors = [training_labels['subj_s'], training_labels['opos_s'], training_labels['oneg_s'], training_labels['iro_s'], training_labels['lpos_s'], training_labels['lneg_s']]
        test_labels_vectors = [test_labels['subj_s'], test_labels['opos_s'], test_labels['oneg_s'], test_labels['iro_s'], test_labels['lpos_s'], test_labels['lneg_s']]
        test_id_s = test_labels['id_s']

        predict_matrix = get_prediction_matrix(training_samples, training_labels_vectors, test_samples, test_id_s, top_s, properties['KERNEL'])
        gold_matrix = get_gold_matrix(test_labels_vectors, test_id_s, top_s)
        prediction_lines = matrix2string(predict_matrix)
        test_lines = matrix2string(gold_matrix)

        tmp_result_file = open('tmp_res.txt', 'w')
        tmp_gold_file = open('tmp_gold.txt', 'w')
        tmp_result_file.write(prediction_lines)
        tmp_gold_file.write(test_lines)
        tmp_result_file.close()
        tmp_gold_file.close()

        tmp_out_file_name = 'tmp_out' + '.txt'
        tmp_out_file = open(tmp_out_file_name, 'w')
        evaluate('tmp_res.txt', 'tmp_gold.txt', outfile=tmp_out_file, verbose=False)
        tmp_out_file.close()

        with open(tmp_out_file_name, 'r') as infile:
            task = ''
            for line in infile:
                if 'task' in line:
                    task = line.rstrip().split()[-1]
                if line[0].isdigit():
                    results_dict[task] += float(line.rstrip().split()[-1])

    write_results(results_file, results_dict)
def main():
    parser = get_parser()
    args = parser.parse_args()
    feature_extractor = FeatureExtractor()

    if args.pipeline_type == "analysis":
        text_preprocessor = TextPreProcessor(stop_words_file_path=args.stopwords_file_path)
        analyser = DataAnalyser(input_file=args.input_file_path, text_preprocessor=text_preprocessor)
        analyser.get_data_distribution(plot_bar=args.plot_bar)
        analyser.get_word_weights(word_thresh=args.word_thresh)
        if args.word_cloud:
            analyser.generate_word_cloud()

    elif args.pipeline_type == "model_selection":
        text_preprocessor = TextPreProcessor(stop_words_file_path=args.stopwords_file_path)
        training_data_df = load_training_data(args.train_file_path)
        training_data_df["sentence"] = training_data_df["sentence"].map(text_preprocessor.process)
        features = feature_extractor.get_features_for_training(training_data_df["sentence"], args.vectorizer)
        labels = training_data_df["class"]
        apply_cross_validation(
            features=features,
            labels=labels,
            k_folds=args.kfolds,
            use_svm=args.use_svm,
            use_naive_bayes=args.use_naive_bayes,
            use_random_forest=args.use_random_forest,
            use_logistic_regression=args.use_logistic_regression,
            use_xgboost=args.use_xgboost,
            use_gradient_boosting=args.use_gradient_boosting,
            plot_cv_graph=True,
        )

    elif args.pipeline_type == "training":
        trainer = Trainer(
            train_file_path=args.train_file_path,
            val_file_path=args.val_file_path,
            stop_words_file_path=args.stopwords_file_path,
            model_name=args.best_model,
            feature_extractor=feature_extractor,
        )
        training_data_df = load_training_data(args.train_file_path)
        trainer.train(
            training_data_df,
            split_test_size=args.split_size,
            vectorizer_name=args.vectorizer,
            get_classification_report=args.get_classification_report,
            get_confusion_matrix=args.get_confusion_matrix,
        )
        validation_data_df = load_validation_data(args.val_file_path)
        trainer.validate(validation_data_df, vectorizer_name=args.vectorizer)
        if args.model_check_point_path:
            trainer.save_trained_model(args.model_check_point_path)

    elif args.pipeline_type == "prediction":
        if not args.stopwords_file_path:
            predictor = Predictor()
        else:
            predictor = Predictor(stop_words_file=args.stopwords_file_path)
        if args.input_file_path:
            predictor.predict_csv(args.input_file_path, args.output_file_path, args.model_path)
        if args.test_input:
            model, vectorizer = predictor.unpickle_the_model(args.model_path)
            predictor.predict(args.test_input, model, vectorizer)
import pandas as pd
from sklearn.model_selection import GridSearchCV

from feature_extraction import FeatureExtractor
from regressor import RandomForestClassifierAuc, XGBRegressor
from tools import fitStats, featureImportance

test_df = pd.read_csv("./data/test.csv", delimiter=";", header=0, index_col=0)
train_df = pd.read_csv("./data/train_preprocessed.csv", delimiter=";", header=0, index_col=0)
train_label = pd.read_csv("./data/label.csv", delimiter=";", header=0, index_col=0)

extractor = FeatureExtractor()
train_df = extractor.fit_transform(train_df, train_label)
test_df = extractor.transform(test_df)

param_grid = dict(max_depth=[10], n_estimators=[15])

studyAuc = True
if studyAuc:
    reg = GridSearchCV(RandomForestClassifierAuc(max_depth=10, n_estimators=15), param_grid=param_grid)
    reg = GridSearchCV(XGBRegressor(), param_grid=dict())  # note: this overwrites the random-forest grid search above
else:
    reg = RandomForestClassifierAuc(max_depth=10, n_estimators=15)

X_train = train_df.values
y_train = train_label.values.ravel()
X_test = test_df.values

fit = reg.fit(X_train, y_train)
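# Hedged sketch of a possible continuation (not in the original script): score
# the held-out test set with the fitted estimator and write the predictions
# next to the input data. The output path and column name are assumptions.
y_pred = reg.predict(X_test)
pd.DataFrame({"prediction": y_pred}, index=test_df.index).to_csv("./data/predictions.csv", sep=";")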
def __init__(self, cfg_tb, test_model_path, root_path='./'):

    # arguments
    self.cfg_tb = cfg_tb
    self.test_model_path = test_model_path
    self.root_path = root_path

    # shortcuts
    self.feature_params, self.data_size = None, None

    # paths
    self.paths = dict((k, self.root_path + v) for k, v in self.cfg_tb['paths'].items())

    # test model path
    self.test_model_name = self.test_model_path.split('/')[-2]

    # determine available model files
    model_files_av = [f.split('/')[-1] for f in glob(self.test_model_path + '*model.pth')]

    # model file
    self.model_files = [self.test_model_path + f for f in model_files_av if f in self.cfg_tb['model_file_names']]

    # pick just the first one (errors should not occur)
    self.model_file = self.model_files[0]

    # param file
    self.params_file = self.test_model_path + self.cfg_tb['params_file_name']

    # wavs
    self.test_wavs = [self.root_path + wav for wav in self.cfg_tb['test_wavs']]

    # create folder
    create_folder(list(self.paths.values()))

    # parameter loading
    net_params = np.load(self.params_file, allow_pickle=True)

    # extract params
    self.nn_arch, self.train_params, self.class_dict = net_params['nn_arch'][()], net_params['train_params'][()], net_params['class_dict'][()]

    # legacy stuff
    #self.data_size, self.feature_params = self.legacy_adjustments_tb(net_params)

    # legacy stuff
    self.data_size, self.feature_params = legacy_adjustments_net_params(net_params)

    # init feature extractor
    self.feature_extractor = FeatureExtractor(self.feature_params)

    # init net handler
    self.net_handler = NetHandler(nn_arch=self.nn_arch, class_dict=self.class_dict, data_size=self.data_size, use_cpu=True)

    # load model
    self.net_handler.load_models(model_files=[self.model_file])

    # set evaluation mode
    self.net_handler.set_eval_mode()
    concat = Concatenate()([pooled_conv_dropped_a, pooled_conv_dropped_b])
    output = TimeDistributed(Dense(units=1, activation='sigmoid'))(concat)

    model = Model(my_input, output)
    model.compile(loss='categorical_hinge', optimizer='adagrad', metrics=['accuracy'])
    return model
    # print(model.summary())


fe = FeatureExtractor(6)
fe.set_w2v(w2v_pathname, 500, keep_alive=True)

for epoch in range(2, 6):
    model = load_model('model/cnw100_{}.h5'.format(epoch - 1))
    for i in range(1, 1001):
        true_filename = 'data/batch/bengio/6/{}.txt'.format(i)
        false_filename = 'data/batch/cnw/6/{}.txt'.format(i)
        with open(true_filename) as file:
            true_data = file.readlines()
        with open(false_filename) as file:
            false_data = file.readlines()

        true_word_seq = []
        false_word_seq = []
def run(self):
    """Run whole data loading, feature extraction, model training and regressing pipeline."""
    if self.mode == "extract":
        print("Extracting features")
        train = FeatureExtractor("train").run()
        dev = FeatureExtractor("dev").run()
        print("Saving features")
        np.save("saved_features/train_lsr", train.lsr)
        np.save("saved_features/train_nlp", train.feats)
        np.save("saved_features/train_scores", train.scores)
        np.save("saved_features/dev_lsr", dev.lsr)
        np.save("saved_features/dev_nlp", dev.feats)
        np.save("saved_features/dev_scores", dev.scores)
    else:
        # Load saved extracted features
        print("Loading saved features")
        split = False if self.full_data else True
        train, dev = load_features(split=split, nt=True)

        if self.params["upsample"]:
            train = self.upsample(train)

        train_loader = create_loader(train, self.params["batch_size_train"])
        dev_loader = create_loader(dev, validate=True)

        # We set a random seed to ensure that results are reproducible.
        # Also set a cuda GPU if available
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            GPU = True
        else:
            GPU = False

        device_idx = 0
        if GPU:
            device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
        else:
            device = torch.device("cpu")
        print(f"Running on {device}")

        if self.bUseConv:
            model = RecursiveNN(
                ModelBlock,
                self.params["conv_dict"],
                self.params["conv_ffnn_dict"],
                BASELINE_dim=self.params["NBaseline"],
            )
        else:
            model = RecursiveNN_Linear(
                in_features=2048,
                N1=self.params["N1"],
                N2=self.params["N2"],
                out_features=self.params["out_features"],
                dropout=self.params["dropout"],
                leaky_relu=self.params["leaky_relu"],
            )
        model = model.to(device)

        weights_initialiser = True
        if weights_initialiser:
            model.apply(weights_init)

        params_net = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("Total number of parameters in Model is: {}".format(params_net))
        print(model)

        optimizer = optim.Adam(model.parameters(), lr=self.params["lr"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=self.params["step_size"], gamma=self.params["gamma"])

        date_string = str(datetime.datetime.now())[:16].replace(":", "-").replace(" ", "-")
        writer = SummaryWriter(logdir + date_string)

        print("Running model")
        for epoch in range(self.params["epochs"]):
            train_model(
                model,
                train_loader,
                optimizer,
                epoch,
                log_interval=1000,
                scheduler=scheduler,
                writer=writer,
            )
            test_loss = test_model(model, dev_loader, epoch, writer=writer)

        torch.save(model, "model.pt")
        self.model = model
def __init__(self):
    self.face_detector = FaceDetector()
    self.preprocessor = Preprocessor()
    self.extractor = FeatureExtractor()
                       batch_size=32,
                       epochs=max_epoch,
                       validation_data=[x_val, y_val],
                       callbacks=[mc])
        self.model = load_model(save_filename, custom_objects={'f1': f1})

    def evaluate(self, x_test, y_test):
        print(self.model.evaluate(x_test, y_test))


aspect_list = list_from_file('resource/aspect.txt')
n_aspect = len(aspect_list)
w2v_pathname = 'resource/w2v_path.txt'
max_length = 50

fe = FeatureExtractor(max_length)
fe.set_w2v(w2v_pathname, 500, keep_alive=True)


def prepare_feature(filename):
    with open(filename) as file:
        data = json.load(file)
    sentences = [datum['sentence'] for datum in data]
    aspects = [datum['aspect'] for datum in data]
    sequences = [text_to_word_sequence(s) for s in sentences]
    label = []
    for i in range(len(aspects)):
        label.append(np.zeros(n_aspect))