def load_dataset(data_pickle, labels_pickle, to_one_hot=True, batch_size=6, normalization=True):
    """Load pickled MFCC sentences and labels, preprocess them, and return
    (train, eval) DataSet objects.

    The first ~2% of the sentences (len(data) // 50) are held out as the
    evaluation split; the remainder form the training split.

    Args:
        data_pickle: path to a pickle file containing a list of MFCC sentences.
        labels_pickle: path to a pickle file containing the matching labels.
        to_one_hot: if True, convert each scalar label to a one-hot vector of
            size NUM_CLASSES via pp.to_one_hot.
        batch_size: batch size forwarded to the DataSet constructor.
        normalization: if True, normalize each sentence with pp.normalize_mfcc.

    Returns:
        A (train_dataset, eval_dataset) tuple of DataSet instances.
    """
    # NOTE(review): pickle.load on untrusted files is unsafe — only use with
    # trusted, locally produced dumps.
    with open(data_pickle, 'rb') as data_dump:
        data_sentences = pickle.load(data_dump)
    with open(labels_pickle, 'rb') as labels_dump:
        labels_sentences = pickle.load(labels_dump)

    # Normalize the mfccs
    if normalization:
        print("Normalizing")
        data = [pp.normalize_mfcc(s) for s in data_sentences]
    else:
        data = data_sentences

    if to_one_hot:
        labels = [
            pp.to_one_hot(labels_scalar, NUM_CLASSES)
            for labels_scalar in labels_sentences
        ]
    else:
        # Bug fix: previously `labels` was left undefined on this path,
        # raising NameError at the return statement below.
        labels = labels_sentences

    print("Preprocessing done")
    # Renamed from `eval`, which shadowed the builtin of the same name.
    eval_size = len(data) // 50
    return (
        DataSet(np.array(data[eval_size:]), np.array(labels[eval_size:]), batch_size),
        DataSet(np.array(data[:eval_size]), np.array(labels[:eval_size]), batch_size),
    )
def file_to_oha(filepath=NOT_SPAM_DATA_PATH, label=1):
    """Read up to MAX_LINES lines from *filepath* and one-hot encode each.

    Every line is cleaned with clean_line() before encoding, and every
    encoded line is paired with the same *label*.

    Returns:
        A (encodings, labels) tuple of parallel lists.
    """
    encodings = []
    line_labels = []
    with open(filepath, 'r') as handle:
        for raw_line in handle.readlines()[:MAX_LINES]:
            cleaned = clean_line(raw_line)
            encodings.append(to_one_hot(cleaned))
            line_labels.append(label)
    return encodings, line_labels
def file_to_oha(filepath='data/simple/pos.txt', label=1):
    """One-hot encode every line of *filepath*, pairing each with *label*.

    If the path ends with "neg.txt" the label is forced to 0, regardless of
    the *label* argument. Newline characters are stripped from each line
    before encoding.

    Returns:
        A (encodings, labels) tuple of parallel lists.
    """
    if filepath.endswith("neg.txt"):
        label = 0
    encodings = []
    line_labels = []
    with open(filepath, 'r') as source:
        for raw_line in source.readlines():
            stripped = raw_line.replace('\n', '')
            encodings.append(to_one_hot(stripped))
            line_labels.append(label)
    return encodings, line_labels
def predict(txt):
    """One-hot encode *txt* (without growing the bag) and classify it with clf.

    Returns the classifier's prediction array (values 1 or 0).
    """
    # add_to_bag=False: prediction must not mutate the vocabulary.
    features = np.array(to_one_hot(txt, add_to_bag=False))
    return clf.predict([features])  # 1 or 0
def predict(txt):
    """Clean and one-hot encode *txt* against the existing bag-of-words,
    then classify it with clf.

    Returns the classifier's prediction array (values 1 or 0).
    """
    cleaned = clean_line(txt)
    # add_to_bag=False: prediction must not mutate the vocabulary; encode
    # against the already-built `bow`.
    features = np.array(to_one_hot(cleaned, add_to_bag=False, bow=bow))
    return clf.predict([features])  # 1 or 0