def generate_LR_important_features(self, clf_LR, feature_names, results_file, N=1000):
    '''
    Write the top-N coefficients of clf_LR, ranked by absolute value, to a
    tab-separated file with a 'feature' / 'score' header.

    The ranking is delegated to the project helper argsort(..., rev=True),
    presumably returning indices in descending order -- confirm against its
    definition.

    :param clf_LR: object exposing ``coef_`` with a ``tolist()`` method; only
        the first row (``coef_.tolist()[0]``) is used
    :param feature_names: indexable collection mapping a coefficient index to
        its feature name (entries must be strings, they are joined as-is)
    :param results_file: base output path; '/classifications/' is replaced by
        '/feature_selection/classifications/' and the suffix '_LR' is appended
    :param N: maximum number of features written
    :return: None
    '''
    results_file = results_file.replace(
        '/classifications/', '/feature_selection/classifications/')
    FileUtility.ensure_dir(results_file)
    file_name = results_file + '_LR'

    # Convert the coefficient row once instead of once per written feature.
    coefficients = clf_LR.coef_.tolist()[0]
    idxs = argsort(np.abs(coefficients).tolist(), rev=True)[0:N]

    # The context manager closes the file even when a write or a
    # feature_names lookup raises part-way through.
    with codecs.open(file_name, 'w') as f:
        f.write('\t'.join(['feature', 'score']) + '\n')
        for idx in idxs:
            # The signed coefficient is written, not its absolute value.
            f.write('\t'.join(
                [feature_names[idx], str(coefficients[idx])]) + '\n')
def numpy2trainfiles(file, name, out='../data/s8_features/'):
    '''
    Convert a flattened numpy dataset into length-sorted training files.

    Example inputs:
    test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
    train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'

    Each sample is reshaped into rows of 57 values. Columns 0-20 are decoded
    into a sequence string, columns 22-29 into a label string (presumably
    one-hot residue / 8-state structure encodings -- confirm against the
    dataset description). Samples are then reordered by decoded sequence
    length using the project helper argsort.

    Files written (``out`` and ``name`` are concatenated directly):
      * out + name                 -- one "<symbol> <label>" line per position,
                                      each sample terminated by an empty entry
      * out + name + '_mat_Y'      -- per-position label rows, prefixed with a
                                      flag that is 0 when the row's max is 1
                                      and 1 otherwise
      * out + name + '_mat_X'      -- columns 0-20 and 35-55 of every row
      * out + name + '_length.txt' -- sorted sequence lengths, one per line

    :param file: path of the .npy file handed to np.load
    :param name: base name of the generated files
    :param out: output path prefix (keep the trailing separator)
    :return: None
    '''
    db = np.load(file)
    # Columns kept in the X matrix: 0-20 and 35-55 of each 57-wide row.
    feature_columns = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    # Integer floor division avoids the float round-trip of int(a / 57).
    db = np.reshape(db, (db.shape[0], db.shape[1] // 57, 57))

    seq_alphabet = [
        'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P',
        'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq'
    ]
    label_alphabet = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']

    sequences = []
    labels = []
    for i in range(0, db.shape[0]):
        # A position contributes a character only when its slice contains a
        # 1 as maximum; any other row contributes nothing to the string.
        sequences.append(''.join([
            seq_alphabet[np.argmax(x)] if np.max(x) == 1 else ''
            for x in db[i, :, 0:21]
        ]).lower())
        labels.append(''.join([
            label_alphabet[np.argmax(y)] if np.max(y) == 1 else ''
            for y in db[i, :, 22:30]
        ]).lower())

    # Reorder everything by decoded sequence length.
    lengths = [len(x) for x in sequences]
    sorted_idxs = argsort(lengths)
    lengths.sort()
    sequences = [sequences[i] for i in sorted_idxs]
    labels = [labels[i] for i in sorted_idxs]

    # Labels are looked up by position, so a label string shorter than its
    # sequence raises IndexError here rather than being silently truncated.
    FileUtility.save_list(out + name, [
        '\n'.join([
            ' '.join([symbol, labels[idx][pos]])
            for pos, symbol in enumerate(sequence)
        ] + ['']) for idx, sequence in enumerate(sequences)
    ])

    db_new = db[sorted_idxs, :, :]
    # Prepend one extra flag to every label row: 0 if the row's max is 1,
    # otherwise 1.
    label_encoding = [[([0] if np.max(row) == 1 else [1]) + row
                       for row in db_new[i, :, 22:30].tolist()]
                      for i in range(0, db.shape[0])]
    np.save(out + name + '_mat_Y', label_encoding)

    db_new = db_new[:, :, feature_columns]
    np.save(out + name + '_mat_X', db_new)

    FileUtility.save_list(out + name + '_length.txt',
                          [str(l) for l in lengths])
def labeling_file_reader(file):
    '''
    Read a labeling file with one whitespace-separated "<token> <label>" pair
    per line and an empty line closing each sentence.

    The sentences are reordered with the indices that the project helper
    argsort yields for LabelingData.sequence_lengths(file); that list of
    lengths is sorted in place and returned alongside.

    NOTE(review): a sentence is stored only when an empty line is reached, so
    pairs after the last empty line are discarded -- confirm input files
    always end with a blank line.

    :param file: path of the labeling file
    :return: (X, y, lengths) -- token lists, label lists, sorted lengths
    '''
    with open(file, 'r') as handle:
        rows = handle.read().splitlines()

    sentences, label_seqs = [], []
    tokens, tags = [], []
    for row in rows:
        if row != '':
            # Exactly two fields are expected; anything else raises ValueError.
            token, tag = row.split()
            tokens.append(token)
            tags.append(tag)
            continue
        # Empty line: close the current sentence and start a new one.
        sentences.append(tokens)
        label_seqs.append(tags)
        tokens, tags = [], []

    lengths = LabelingData.sequence_lengths(file)
    order = argsort(lengths)
    lengths.sort()

    ordered_sentences = [sentences[k] for k in order]
    ordered_labels = [label_seqs[k] for k in order]
    return ordered_sentences, ordered_labels, lengths