Example #1
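# These listings are excerpts from a larger module; a plausible shared import
# preamble (module paths guessed from chemprop v1's layout, so treat them as
# assumptions to verify against your version) would be:
import csv
import pickle

import numpy as np
from tqdm import tqdm

from chemprop.args import SklearnPredictArgs, SklearnTrainArgs
from chemprop.data import get_data
from chemprop.features import get_features_generator
from chemprop.sklearn_train import predict
from chemprop.utils import makedirs
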
def predict_sklearn(args: SklearnPredictArgs):
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_column=args.smiles_column,
                    target_columns=[])

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(
            morgan_fingerprint(mol=datapoint.smiles,
                               radius=train_args.radius,
                               num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
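
# For context, the 'morgan' features generator used above computes a binary
# Morgan fingerprint via RDKit. A minimal standalone sketch (the function name
# is illustrative, not chemprop's API; the RDKit calls mirror what chemprop does):
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def morgan_binary_features(smiles: str, radius: int = 2, num_bits: int = 2048) -> np.ndarray:
    """Return a length-num_bits 0/1 array for the given SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, features)  # resizes the array to num_bits
    return features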
Example #2
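# This older variant takes a plain argparse Namespace. Besides the imports shown
# for Example #1, it would need something like (module paths again assumed):
from argparse import Namespace

from chemprop.data import get_task_names
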
def predict_sklearn(args: Namespace):
    print('Loading data')
    data = get_data(path=args.test_path)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles, radius=args.radius, num_bits=args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(
            model=model,
            model_type=args.model_type,
            dataset_type=args.dataset_type,
            features=data.features()
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print('Saving predictions')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['smiles'] + get_task_names(args.test_path))

        for smiles, pred in zip(data.smiles(), avg_preds):
            writer.writerow([smiles] + pred)
Example #3
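# Imports as in Example #1. This variant supports multiple SMILES columns per
# datapoint (smiles_columns / extend_features), so one fingerprint per molecule
# is appended rather than a single fingerprint being set.
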
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(
                morgan_fingerprint(mol=s,
                                   radius=train_args.radius,
                                   num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)  # TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
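
# A minimal entry-point sketch, assuming chemprop's Tap-based argument classes
# (this mirrors chemprop's CLI wiring, but verify against your version):
if __name__ == '__main__':
    predict_sklearn(args=SklearnPredictArgs().parse_args())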
Example #4
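# In addition to the imports from Example #1, this parcelled variant needs:
import math
import os

from chemprop.data import get_task_names
# Note: the data_offset and parcel_size arguments are not part of stock
# chemprop's get_data; they appear to come from a modified fork.
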
def predict_sklearn(args: SklearnPredictArgs):

    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
    offset = 0

    for iteration in range(num_iterations):

        if iteration > 0:
            offset = offset + args.parcel_size
            max_data_size = max_data_size + args.parcel_size

        print('Loading data')
        data = get_data(path=args.test_path,
                        smiles_column=args.smiles_column,
                        target_columns=[],
                        max_data_size=max_data_size,
                        data_offset=offset)

        print('Computing morgan fingerprints')
        morgan_fingerprint = get_features_generator('morgan')
        for datapoint in tqdm(data, total=len(data)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        sum_preds = np.zeros((len(data), args.num_tasks))

        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            with open(checkpoint_path, 'rb') as f:
                model = pickle.load(f)

            model_preds = predict(model=model,
                                  model_type=args.model_type,
                                  dataset_type=args.dataset_type,
                                  features=data.features())
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        # Write each parcel after the first to a numbered file,
        # e.g. preds.csv, preds.1.csv, preds.2.csv, ...
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = '{name}.{it}{ext}'.format(name=name,
                                                   it=iteration,
                                                   ext=ext)
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Copy predictions over to data
        task_names = get_task_names(path=args.test_path)
        for datapoint, preds in zip(data, avg_preds):
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        # Save (this previously reopened args.preds_path, clobbering the
        # per-iteration file name computed above)
        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
            writer.writeheader()

            for datapoint in data:
                writer.writerow(datapoint.row)
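
# Parcel arithmetic sketch: with max_data_size=10 and parcel_size=4, the loop
# performs ceil(10 / 4) = 3 reads with (data_offset, max_data_size) pairs of
# (0, 4), (4, 8) and (8, 12). Advancing max_data_size alongside the offset only
# yields fixed-size parcels if the modified get_data treats max_data_size as an
# absolute row cap rather than a per-read row count; that semantics is assumed
# here.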