Example #1
def make_hist(smiles):
    mol = Chem.MolFromSmiles(smiles)
    atoms = mol.GetAtoms()
    hist = np.zeros(utils.dataset_info(dataset)['hist_dim'])
    for atom in atoms:
        if dataset == 'qm9':
            atom_str = atom.GetSymbol()
        else:
            # zinc dataset: encode each atom using "<atom_symbol><valence>(<charge>)" notation
            symbol = atom.GetSymbol()
            valence = atom.GetTotalValence()
            charge = atom.GetFormalCharge()
            atom_str = "%s%i(%i)" % (symbol, valence, charge)

            if atom_str not in utils.dataset_info(dataset)['atom_types']:
                print('Unrecognized atom type %s' % atom_str)
                return None

        ind = utils.dataset_info(dataset)['atom_types'].index(atom_str)
        val = utils.dataset_info(dataset)['maximum_valence'][ind]
        hist[val - 1] += 1  # valences start at 1, while array indices start at 0
    return hist
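A minimal usage sketch (assumed context, not part of the original example): in the source file, dataset, utils, Chem, and np are module-level names; the SMILES string below is only an illustration.

import numpy as np
from rdkit import Chem
import utils  # project module assumed to provide dataset_info()

dataset = 'zinc'                 # module-level dataset name used by make_hist
hist = make_hist('CCO')          # valence histogram for ethanol
if hist is not None:
    print(hist)                  # counts indexed by (maximum valence - 1)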
Example #2
    def __init__(self, args):
        self.args = args

        # Collect arguments:
        data_dir = ''
        if '--data_dir' in args and args['--data_dir'] is not None:
            data_dir = args['--data_dir']
        self.data_dir = data_dir

        # Collect parameters:
        params = self.default_params()
        config_file = args.get('--config-file')
        if config_file is not None:
            with open(config_file, 'r') as f:
                params.update(json.load(f))
        config = args.get('--config')
        if config is not None:
            params.update(json.loads(config))
        self.params = params

        # Determine which dataset is in use
        self.params['dataset'] = dataset = args.get('--dataset')
        # Number of atom types in this dataset
        self.params['num_symbols'] = len(dataset_info(dataset)["atom_types"])

        self.run_id = "_".join(
            [time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
        log_dir = args.get('--log_dir') or '.'
        self.log_file = os.path.join(log_dir,
                                     "%s_log_%s.json" % (self.run_id, dataset))
        self.best_model_file = os.path.join(log_dir,
                                            "%s_model.pickle" % self.run_id)

        with open(
                os.path.join(log_dir,
                             "%s_params_%s.json" % (self.run_id, dataset)),
                "w") as f:
            json.dump(params, f)
        print("Run %s starting with following parameters:\n%s" %
              (self.run_id, json.dumps(self.params)))
        random.seed(params['random_seed'])
        np.random.seed(params['random_seed'])

        # Load data:
        self.max_num_vertices = 0
        self.num_edge_types = 0
        self.annotation_size = 0
        self.train_data = self.load_data(params['train_file'],
                                         is_training_data=True)
        self.valid_data = self.load_data(params['valid_file'],
                                         is_training_data=False)

        # Build the actual model
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph, config=config)
        with self.graph.as_default():
            tf.set_random_seed(params['random_seed'])
            self.placeholders = {}
            self.weights = {}
            self.ops = {}
            self.make_model()
            self.make_train_step()

            # Restore/initialize variables:
            restore_file = args.get('--restore')
            if restore_file is not None:
                self.restore_model(restore_file)
            else:
                self.initialize_model()
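The constructor expects a docopt-style dictionary of command-line options. A minimal sketch of how it could be driven, assuming default_params() supplies keys such as random_seed, train_file and valid_file (ChemModel is a hypothetical name for the class that owns this __init__):

args = {
    '--data_dir': 'data/',
    '--config-file': None,
    '--config': '{"random_seed": 0}',
    '--dataset': 'qm9',
    '--log_dir': 'logs/',
    '--restore': None,
}
model = ChemModel(args)  # hypothetical class name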
Example #3
    total, tree_count = utils.check_cyclic(dataset)
    sascorer, sa_score_per_molecule = utils.check_sascorer(dataset)
    total, validity = utils.check_validity(dataset)

    print("------------------------------------------")
    print("Metrics")
    print("------------------------------------------")
    print("total molecules:")
    print(total)
    print("------------------------------------------")
    print("percentage of nonplanar:")
    print(nonplanar / total)
    print("------------------------------------------")
    print("avg atom:")
    for atom_type, c in atom_counter.items():
        print(dataset_info(dataset)['atom_types'][atom_type])
        print(c / total)
    print("standard deviation")
    print(np.std(atom_per_molecule, axis=0))
    print("------------------------------------------")
    print("avg edge_type:")
    for edge_type, c in edge_type_counter.items():
        print(edge_type + 1)
        print(c / total)
    print("standard deviation")
    print(np.std(edge_type_per_molecule, axis=0))
    print("------------------------------------------")
    print("avg shape:")
    for shape, c in zip(utils.geometry_numbers, shape_count):
        print(shape)
        print(c / total)
Example #4
            img_k = img_k[shuffle_idx]
            k = model_k(img_k)  # N x C
            k = k[reverse_idx].detach()  # undo the shuffle; no gradient flows to the key encoder

        # update dictionary
        queue = enqueue(queue, k) if queue is not None else k
        queue = dequeue(queue)
    return {
               'loss': losses.avg,
               'pred': pred_meter.avg
           }, queue


if __name__ == '__main__':
    args = parse_option()
    image_size, mean, std = dataset_info(name='cifar')
    # image_size = 28
    # mean = [0.1307, ]
    # std = [0.3081, ]
    # normalize = transforms.Normalize(mean=mean, std=std)

    train_transform = get_transform(image_size, mean=mean, std=std, mode='train')
    # datasets.mnist.MNIST
    train_dataset = custom_dataset(datasets.cifar.CIFAR10)(root='./', train=True, transform=train_transform,
                                                           download=True)
    print(len(train_dataset))
    train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0,
                                  pin_memory=False, drop_last=True)  # drop the last batch due to irregular size

    model_q, model_k = get_model(config.MODEL)
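The enqueue and dequeue helpers are not shown in this excerpt. A sketch of the MoCo-style FIFO key queue they presumably implement, with max_len as a placeholder queue size:

import torch

def enqueue(queue, k):
    # Append the newest keys (N x C) to the end of the queue.
    return torch.cat([queue, k], dim=0)

def dequeue(queue, max_len=4096):
    # Drop the oldest keys so the queue never exceeds max_len entries.
    return queue[-max_len:] if queue.shape[0] > max_len else queue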
Example #5
    def __init__(self, args):
        self.args = args

        # Collect arguments:
        data_dir = ''
        if '--data_dir' in args and args['--data_dir'] is not None:
            data_dir = args['--data_dir']
        self.data_dir = data_dir

        # Collect parameters:
        params = self.default_params()
        config_file = args.get('--config-file')
        if config_file is not None:
            with open(config_file, 'r') as f:
                params.update(json.load(f))
        config = args.get('--config')
        if config is not None:
            params.update(json.loads(config))
        self.params = params
        # Adjust parameter values
        if self.params['generation'] == 2:  # for reconstruction
            self.params['try_different_starting'] = False
            self.params['use_argmax_nodes'] = True
            self.params['use_argmax_bonds'] = True

        if self.params['generation'] == 3:  # for testing
            self.params['try_different_starting'] = False
            self.params['use_argmax_nodes'] = True
            self.params['use_argmax_bonds'] = True

        # Use CPU only
        if not self.params['use_gpu']:
            os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

        # Determine which dataset is in use
        self.params['dataset'] = dataset = args.get('--dataset')
        # Number of atom types in this dataset
        self.params['num_symbols'] = len(dataset_info(dataset)["atom_types"])

        suff = "_" + self.params['suffix'] if self.params['suffix'] is not None else ""
        self.run_id = "_".join(
            [time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
        log_dir = self.params['log_dir']
        self.log_file = os.path.join(
            log_dir, "%s_log_%s%s.json" % (self.run_id, dataset, suff))
        self.best_model_file = os.path.join(
            log_dir, "%s_model%s.pickle" % (self.run_id, suff))

        with open(
                os.path.join(
                    log_dir,
                    "%s_params_%s%s.json" % (self.run_id, dataset, suff)),
                "w") as f:
            json.dump(params, f)

        print("Run %s starting with following parameters:\n%s" %
              (self.run_id, json.dumps(self.params)))

        # Set random seeds
        random.seed(params['random_seed'])
        np.random.seed(params['random_seed'])

        # Load data:
        self.max_num_vertices = 0
        self.num_edge_types = 0
        self.annotation_size = 0
        if self.params['generation'] == 0:
            train_data, self.train_data = self.load_data(params['train_file'],
                                                         is_training_data=True)
        else:
            train_data, self.train_data = self.load_data(
                params['train_file'], is_training_data=False)
        valid_data, self.valid_data = self.load_data(params['valid_file'],
                                                     is_training_data=False)
        test_data, self.test_data = self.load_data(params['test_file'],
                                                   is_training_data=False)
        self.histograms = dict()
        self.histograms['hist_dim'] = utils.dataset_info(
            self.params['dataset'])['hist_dim']
        self.histograms['max_valence'] = utils.dataset_info(
            self.params['dataset'])['max_valence_value']
        self.max_num_vertices = dataset_info(dataset)["max_n_atoms"]
        self.histograms['train'] = self.prepareHist(train_data)
        # A = number of atoms in a molecule, N = number of histograms.
        # v_filter builds a list of max(A) lists; each inner list holds the
        # weights of every histogram with that number of atoms.
        # Index 0 of the prepared histograms is the frequency, index 1 the probability.
        self.histograms['filter'] = HM.v_filter(self.histograms['train'][0],
                                                self.histograms['train'][1],
                                                self.max_num_vertices)
        self.histograms['valid'] = self.prepareHist(valid_data)
        self.histograms['test'] = self.prepareHist(test_data)
        # print(self.histograms['filter'])

        # Build the actual model
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph, config=config)
        with self.graph.as_default():
            tf.set_random_seed(params['random_seed'])
            self.placeholders = {}
            self.weights = {}
            self.ops = {}

            self.make_model()
            self.make_train_step()

            # Restore/initialize variables:
            restore_file = args.get('--restore')
            if restore_file is not None:
                self.restore_model(restore_file)
            else:
                self.initialize_model()
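The parameter keys referenced above suggest roughly the following configuration; the key names are taken from this snippet, while the values and file names are placeholders:

import json

args = {
    '--dataset': 'zinc',
    '--data_dir': '',
    '--config-file': None,
    '--config': json.dumps({
        'generation': 2,       # 2 = reconstruction, 3 = testing (per the branches above)
        'suffix': None,
        'log_dir': 'logs/',
        'use_gpu': True,
        'random_seed': 0,
        'train_file': 'molecules_train.json',
        'valid_file': 'molecules_valid.json',
        'test_file': 'molecules_test.json',
    }),
    '--restore': None,
}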