Пример #1
0
def get_data():
    """Load stored feature vectors and split them into train and test sets.

    Reads the ``FingerVector`` table of ``<wkdir>/vec_store.sqlite``. The
    first column is skipped, the last column is used as the target and every
    column in between is treated as a feature. Only the first ``n_features``
    feature columns are kept in the returned feature arrays.

    Returns
    -------
    tuple
        ``(train_features, train_targets, test_features, test_targets)``.
    """
    # Open the descriptor store.
    store = DescriptorDatabase(db_name='{}/vec_store.sqlite'.format(wkdir),
                               table='FingerVector')

    # Work out which columns hold features and which one holds the target.
    columns = store.get_column_names()
    feature_cols = columns[1:-1]
    target_cols = columns[-1:]

    # Fetch the data; targets are flattened to one value per feature row.
    feature_data = store.query_db(names=feature_cols)
    raw_targets = store.query_db(names=target_cols)
    target_data = np.reshape(raw_targets, (np.shape(feature_data)[0], ))

    # NOTE(review): the test slices start at row ``test_size`` rather than
    # ``train_size``, so they may overlap the training rows -- confirm that
    # this is intended.
    return (
        feature_data[:train_size, :n_features],
        target_data[:train_size],
        feature_data[test_size:, :n_features],
        target_data[test_size:],
    )
Пример #2
0
    def todb(self, features, targets):
        """Write feature and target arrays into a descriptor database.

        Each row gets a freshly generated UUID string as its first entry,
        followed by the feature values and finally the target value. Feature
        columns are named ``f0``, ``f1``, ... and the target column is named
        ``target``. The table is created in ``self.db_name`` under
        ``self.table`` and then filled with the assembled rows.

        Parameters
        ----------
        features : array
            Two-dimensional array with one row per data point.
        targets : array-like
            One target value per row of ``features``.
        """
        n_rows = len(targets)

        # Append the targets as the last column of the feature matrix.
        target_column = np.reshape(targets, (n_rows, 1))
        table_rows = np.concatenate((features, target_column), axis=1)

        # Prepend one random unique id per row as the first column.
        row_ids = [str(uuid.uuid4()) for _ in range(n_rows)]
        id_column = np.reshape(row_ids, (len(row_ids), 1))
        table_rows = np.concatenate((id_column, table_rows), axis=1)

        # Column names: one per feature, plus the single target column.
        n_columns = np.shape(features)[1]
        column_names = ['f' + str(idx) for idx in range(n_columns)]
        column_names = column_names + ['target']

        # Create the table, then store all rows in it.
        database = DescriptorDatabase(db_name=self.db_name, table=self.table)
        database.create_db(names=column_names)
        database.fill_db(descriptor_names=column_names, data=table_rows)
    def test_expand(self):
        """Check the shapes produced by the feature-expansion helpers.

        Loads the stored feature vectors, splits them into training and test
        sets, and verifies that each expansion routine in ``fe`` returns an
        array with the expected number of columns and a matching number of
        labels. The resulting splits are stored on the test class so that
        other tests can reuse them.
        """
        # Attach the database.
        dd = DescriptorDatabase(db_name='{}/vec_store.sqlite'.format(wkdir),
                                table='FingerVector')

        # Pull the features and targets from the database.
        names = dd.get_column_names()
        features, targets = names[1:-1], names[-1:]
        feature_data = dd.query_db(names=features)
        target_data = np.reshape(dd.query_db(names=targets),
                                 (np.shape(feature_data)[0], ))

        # Split the data into test and training sets.
        train_features = feature_data[:train_size, :]
        train_targets = target_data[:train_size]
        test_features = feature_data[test_size:, :]
        d, f = np.shape(train_features)

        # Make some toy names.
        names = ['f{}'.format(i) for i in range(f)]

        # Number of unordered feature pairs, including self-pairs. The
        # product f * (f + 1) is always even, so floor division is exact.
        n_pairs = f * (f + 1) // 2

        # Perform feature engineering. assertEqual is used so that a failure
        # reports both the actual and the expected value.
        extend = fe.single_transform(train_features)
        self.assertEqual(np.shape(extend), (d, f * 3))

        extend = fe.get_order_2(train_features)
        ext_n = fe.get_labels_order_2(names, div=False)
        self.assertEqual(np.shape(extend), (d, n_pairs))
        self.assertEqual(len(ext_n), np.shape(extend)[1])

        extend = fe.get_div_order_2(train_features)
        ext_n = fe.get_labels_order_2(names, div=True)
        self.assertEqual(np.shape(extend), (d, f**2))
        self.assertEqual(len(ext_n), np.shape(extend)[1])

        extend = fe.get_order_2ab(train_features, a=2, b=4)
        ext_n = fe.get_labels_order_2ab(names, a=2, b=4)
        self.assertEqual(np.shape(extend), (d, n_pairs))
        self.assertEqual(len(ext_n), np.shape(extend)[1])

        extend = fe.get_ablog(train_features, a=2, b=4)
        ext_n = fe.get_labels_ablog(names, a=2, b=4)
        self.assertEqual(np.shape(extend), (d, n_pairs))
        self.assertEqual(len(ext_n), np.shape(extend)[1])

        # Smoke-test the general feature generator on a small slice; only
        # the absence of exceptions is checked here.
        p = train_features[:3, :10]
        fe.generate_features(p,
                             max_num=2,
                             max_den=0,
                             log=False,
                             sqrt=False,
                             exclude=False,
                             s=True)
        fe.generate_features(p,
                             max_num=2,
                             max_den=1,
                             log=True,
                             sqrt=True,
                             exclude=True,
                             s=True)

        # Share the splits with the other tests through the class.
        self.__class__.train_features = train_features
        self.__class__.train_targets = train_targets
        self.__class__.test_features = test_features
Пример #4
0
    def test_storage(self):
        """Exercise the descriptor database end to end.

        Creates the ``FingerVector`` table, fills it with one row per
        candidate (unique id, descriptor values, raw score), and then calls
        the query, column-creation, update and column-name functions,
        printing what each one returns.
        """
        # Column layout: one name per descriptor plus the target column.
        db_name = '/vec_store.sqlite'
        n_descriptors = np.shape(self.data)[1]
        descriptors = ['f' + str(k) for k in range(n_descriptors)]
        targets = ['Energy']
        names = descriptors + targets

        # Create the table that will hold the system descriptors.
        dd = DescriptorDatabase(db_name=wkdir + db_name, table='FingerVector')
        dd.create_db(names=names)

        # Assemble one row per candidate: id first, descriptors, then score.
        print('Generate the database')
        new_data = [
            [cand.info['unique_id']]
            + list(row)
            + [cand.info['key_value_pairs']['raw_score']]
            for row, cand in zip(self.data, self.all_cand)
        ]

        # Store the rows.
        dd.fill_db(descriptor_names=names, data=new_data)

        # Read the descriptor and target columns back.
        train_fingerprint = dd.query_db(names=descriptors)
        train_target = dd.query_db(names=targets)
        print('\nfeature data for candidates:\n', train_fingerprint,
              '\ntarget data for candidates:\n', train_target)

        # Look up a single entry by its unique id.
        cand_data = dd.query_db(unique_id='7a216711c2eae02decc04da588c9e592')
        print('\ndata for random candidate:\n', cand_data)

        # Add a new column and give every stored entry a random value in it.
        all_id = dd.query_db(names=['uuid'])
        dd.create_column(new_column=['random'])
        for id_row in all_id:
            dd.update_descriptor(descriptor='random',
                                 new_data=random(),
                                 unique_id=id_row[0])
        print('\nretrieve random vars:\n', dd.query_db(names=['random']))

        print('\nretrieved column names:\n', dd.get_column_names())