def generate_LR_important_features(self,
                                       clf_LR,
                                       feature_names,
                                       results_file,
                                       N=1000):
        '''
        Write the largest-magnitude logistic-regression coefficients to a
        tab-separated file.

        The output path is ``results_file`` with '/classifications/' replaced
        by '/feature_selection/classifications/' and '_LR' appended. The file
        starts with a 'feature<TAB>score' header, followed by one line per
        selected feature holding its name and its signed coefficient.

        :param clf_LR: fitted model exposing ``coef_``; only the first row of
            ``coef_`` is read.
        :param feature_names: names indexable by coefficient position; each
            entry must be a string because it is joined with a tab.
        :param results_file: base path of the classification results.
        :param N: maximum number of features to write.
        :return: None
        '''

        results_file = results_file.replace(
            '/classifications/', '/feature_selection/classifications/')
        FileUtility.ensure_dir(results_file)
        file_name = results_file + '_LR'

        # Convert the coefficient row once; the original re-ran tolist() on
        # the whole matrix for every written line.
        coefficients = clf_LR.coef_.tolist()[0]

        # NOTE(review): argsort is a project helper; with rev=True it is
        # assumed to return indices ordered by decreasing value - confirm.
        idxs = argsort(np.abs(coefficients).tolist(), rev=True)[0:N]

        # The context manager closes the file even when a write raises.
        with codecs.open(file_name, 'w') as f:
            f.write('\t'.join(['feature', 'score']) + '\n')
            for idx in idxs:
                f.write('\t'.join(
                    [feature_names[idx],
                     str(coefficients[idx])]) + '\n')
예제 #2
0
 def numpy2trainfiles(file, name, out='../data/s8_features/'):
     '''
     Convert a flattened per-position feature array into training files.

     The array loaded from ``file`` is reshaped so that every position has 57
     feature columns. Columns 0-20 are decoded to residue letters and columns
     22-29 to label letters; a position contributes a letter only when the
     maximum of its slice equals 1. Samples are then reordered by decoded
     sequence length and four outputs are written:

     * ``out + name``: text file with one 'residue label' pair per line (both
       lower-cased) and an empty line after each sequence.
     * ``out + name + '_mat_Y'``: the 8 label columns of every position,
       prefixed with 0 when the position carries a label (max == 1) and with
       1 otherwise.
     * ``out + name + '_mat_X'``: feature columns 0-20 and 35-55.
     * ``out + name + '_length.txt'``: the sorted sequence lengths.

     Example inputs:
     test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
     train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'

     NOTE(review): the column layout presumably follows the 57-feature
     CullPDB/CB513 profile format - confirm against the dataset description.

     :param file: path of the .npy file to load; expected to hold a 2-D array
         whose second dimension is a multiple of 57.
     :param name: base name of the generated files.
     :param out: prefix (directory) prepended to every output name.
     :return: None
     '''
     db = np.load(file)
     # Feature columns kept in the X matrix.
     kept_columns = np.hstack((np.arange(0, 21), np.arange(35, 56)))
     # Unflatten to (samples, positions, 57 features per position).
     db = np.reshape(db, (db.shape[0], db.shape[1] // 57, 57))
     residue_symbols = [
         'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q',
         'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq'
     ]
     label_symbols = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']
     sequences = []
     labels = []
     for i in range(0, db.shape[0]):
         # Positions whose one-hot slice has no 1 contribute nothing, so the
         # decoded strings only cover the real (unpadded) positions.
         sequences.append(''.join([
             residue_symbols[np.argmax(x)] if np.max(x) == 1 else ''
             for x in db[i, :, 0:21]
         ]).lower())
         labels.append(''.join([
             label_symbols[np.argmax(y)] if np.max(y) == 1 else ''
             for y in db[i, :, 22:30]
         ]).lower())
     lengths = [len(x) for x in sequences]
     # NOTE(review): argsort is a project helper, assumed to return indices in
     # ascending order so the reordered data lines up with lengths.sort().
     sorted_idxs = argsort(lengths)
     lengths.sort()
     sequences = [sequences[i] for i in sorted_idxs]
     labels = [labels[i] for i in sorted_idxs]
     # The trailing '' adds the blank line that terminates each sequence.
     FileUtility.save_list(out + name, [
         '\n'.join([
             ' '.join([residue, labels[idx][pos]])
             for pos, residue in enumerate(sequence)
         ] + ['']) for idx, sequence in enumerate(sequences)
     ])
     db_new = db[sorted_idxs, :, :]
     # Prepend a flag column: 0 for labelled positions, 1 for unlabelled ones.
     label_encoding = [[([0] if np.max(row) == 1 else [1]) + row
                        for row in db_new[i, :, 22:30].tolist()]
                       for i in range(0, db.shape[0])]
     np.save(out + name + '_mat_Y', label_encoding)
     db_new = db_new[:, :, kept_columns]
     np.save(out + name + '_mat_X', db_new)
     FileUtility.save_list(out + name + '_length.txt',
                           [str(l) for l in lengths])
예제 #3
0
    def labeling_file_reader(file):
        '''
        Read a sequence-labeling file and return its sequences ordered by
        length.

        Every non-empty line must split on whitespace into exactly two fields
        (token and label); an empty line closes the current sequence. Tokens
        that are not followed by an empty line before the end of the file are
        not returned.

        :param file: path of the labeling file.
        :return: tuple ``(X, y, lengths)`` where ``X`` holds the token lists,
            ``y`` the matching label lists, both reordered by
            ``argsort(lengths)``, and ``lengths`` is the sorted result of
            ``LabelingData.sequence_lengths(file)``.
        '''
        with open(file, 'r') as handle:
            rows = handle.read().splitlines()

        X, y = [], []
        tokens, tags = [], []
        for row in rows:
            if row != '':
                token, tag = row.split()
                tokens.append(token)
                tags.append(tag)
                continue
            # Blank line: the current sequence is complete.
            X.append(tokens)
            y.append(tags)
            tokens, tags = [], []

        # NOTE(review): lengths come from a separate pass over the file by
        # LabelingData.sequence_lengths; presumably one entry per sequence in
        # file order - confirm against that helper.
        lengths = LabelingData.sequence_lengths(file)
        order = argsort(lengths)
        lengths.sort()
        X = [X[position] for position in order]
        y = [y[position] for position in order]
        return X, y, lengths