Example #1
def populate_list_of_jobs(basedir, jobs, db_communicate=True):
    if db_communicate:
        db = connect2db(user="******", pwd="readonly", host="localhost",
                        port=27017, database="tmc", auth=True)
    else:
        db = None
    for job in jobs:
        populate_single_job(basedir, job, db)
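A minimal usage sketch (the base directory and job names are hypothetical; connect2db and populate_single_job come from the surrounding codebase, and populate_single_job is assumed to accept db=None):

# Populate job records from a run directory without touching the database.
populate_list_of_jobs("/path/to/runs", ["job_a", "job_b"], db_communicate=False)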
Example #2
        "c1ccncc1": "pyr",
    }
    for idx, lig in enumerate(liglist):
        if lig in lig_dict:
            liglist[idx] = lig_dict[lig]
    return liglist
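Since the snippet above is truncated mid-dictionary, here is a self-contained sketch of the same helper (the function name rename_ligands is an assumption; only the pyridine entry survives in the source):

def rename_ligands(liglist):
    # Map canonical SMILES strings to short ligand abbreviations.
    lig_dict = {"c1ccncc1": "pyr"}  # only entry visible in the source
    for idx, lig in enumerate(liglist):
        if lig in lig_dict:
            liglist[idx] = lig_dict[lig]
    return liglist

print(rename_ligands(["c1ccncc1", "water"]))  # -> ['pyr', 'water']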


update_fields = ['alphaHOMO', 'betaHOMO', 'alphaLUMO', 'betaLUMO', 'status']
print(("update_fields: ", update_fields))
df = pd.read_csv("spectro_all_RACs_format.csv")
home = expanduser("~")
dbconfig = json.load(open(home + "/.db_config"))
db = connect2db(user=dbconfig['user'],
                pwd=dbconfig['pwd'],
                host='localhost',
                port=27017,
                database='tmc',
                auth=True)
print(("# of complexes: ", db.oct.count()))
basepath = "/home/crduan/Binary_Classifier/molecule/"
count, merged = 0, 0
for idx, row in df.iterrows():
    if row["mad"] == "oldgen":
        print("=================")
        print(row["job_name"])
        filepath = basepath + row["metal"] + '/' + "_".join(
            row["job_name"].split("_")[:-2]) + '/geometry/'
        outpath = filepath + "_".join(
            row["job_name"].split("_")[:3]) + "_" + "_".join(
                row["job_name"].split("_")[-2:]) + "_geometry__runlast.out"
        scrpath = filepath + "scr_" + "_".join(
Example #3
def retrain(predictor,
            user,
            pwd,
            database,
            collection,
            collection_model,
            host="localhost",
            port=27017,
            auth=True,
            constraints=False,
            frac=0.8,
            epochs=1000,
            batch_size=32,
            force_push=False,
            hyperopt=False,
            tag=False,
            feature_extra=False,
            target=False,
            initialize_weight=True,
            hyperopt_step=100,
            load_latest_model=False,
            fix_architecture=False,
            direct_retrain=False,
            use_gpr=False):
    '''
    Main routine of db-based model retraining.

    Parameters
    ---
        predictor: str, name of the predictor (you can create your own name for your model)
        user: str, username of your group db account
        pwd: str, password of your group db account
        database: str, name of the group database ("tmc")
        collection: str, name of the complex collection of the database
        collection_model: str, name of the model collection of the database
        host: str, host of db
        port: int, port of db
        auth: boolean, always true
        constraints: dict, constraints for querying the database
        frac: float, fraction of data being used as training data
        epochs: int, maximum epochs during the model training
        batch_size: int, batch size
        force_push: boolean, whether to force push the optimized model
        hyperopt: boolean, whether to do a hyperparameter optimization
        tag: str, tag of your model
        feature_extra: boolean or list, features other than RACs-155
        target: boolean or list, names of the target property
        initialize_weight: boolean, whether to initialize weight for each model from scratch
        hyperopt_step: int, number of steps for hyperopt
        load_latest_model: boolean, whether to load the latest model as the seed model
        fix_architecture: boolean, whether to fix the architecture (as the latest model) during hyperopt
        direct_retrain: boolean, whether to directly use the hyperparams of the latest model
        use_gpr: boolean, whether to use GPR instead of an ANN.

    Returns
    ---
        model_dict: dict, a dictionary for optimized model info
    '''
    db = connect2db(user, pwd, host, port, database, auth)
    dbquery_time = datetime.now()
    df, fnames, lname = extract_data_from_db(predictor,
                                             db,
                                             collection,
                                             constraints=constraints,
                                             feature_extra=feature_extra,
                                             target=target)
    X_train, X_test, y_train, y_test, n_train, n_test, x_scaler, y_scaler = normalize_data(
        df, fnames, lname, predictor, frac=frac, name=True)
    model_dict = {}
    if not use_gpr:
        model, history, res_dict_train, res_dict_test, best_params = train_model(
            predictor,
            db,
            collection_model,
            lname,
            X_train,
            X_test,
            y_train,
            y_test,
            x_scaler,
            y_scaler,
            epochs=epochs,
            batch_size=batch_size,
            hyperopt=hyperopt,
            initialize_weight=initialize_weight,
            hyperopt_step=hyperopt_step,
            load_latest_model=load_latest_model,
            fix_architecture=fix_architecture,
            direct_retrain=direct_retrain)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        model_dict.update({
            "history": {
                k: [float(ele) for ele in history.history[k]]
                for k in history.history
            },
        })
    else:
        model, history, res_dict_train, res_dict_test, best_params = train_gpr(
            predictor, lname, X_train, X_test, y_train, y_test, x_scaler,
            y_scaler)
        pred_train, var_train = model.predict(X_train)
        pred_test, var_test = model.predict(X_test)
        model_dict.update({"history": history})
        if "clf" not in predictor:
            model_dict.update({
                "var_train": [var_train[ii].tolist() for ii in range(len(n_train))],
                "var_test": [var_test[ii].tolist() for ii in range(len(n_test))],
            })
    model_dict.update({
        "predictor": predictor,
        "constraints": str(constraints),
        "hyperopt": hyperopt,
        "hyperparams": best_params,
        "dbquery_time": dbquery_time,
        "name_train": n_train.tolist(),
        "name_test": n_test.tolist(),
        "target_train": y_train.tolist(),
        "target_test": y_test.tolist(),
        "len_train": y_train.shape[0],
        "len_test": y_test.shape[0],
        "len_tot": y_train.shape[0] + y_test.shape[0],
        "score_train": {k: float(res_dict_train[k]) for k in res_dict_train},
        "score_test": {k: float(res_dict_test[k]) for k in res_dict_test},
        "pred_train": [pred_train[ii].tolist() for ii in range(len(n_train))],
        "pred_test": [pred_test[ii].tolist() for ii in range(len(n_test))],
        "features": fnames,
        "target": lname,
        "force_push": force_push,
        "direct_retrain": direct_retrain,
        "tag": tag,
        "initialize_weight": initialize_weight,
        "hyperopt_step": hyperopt_step,
        "load_latest_model": load_latest_model,
        "fix_architecture": fix_architecture,
        "x_scaler": pickle.dumps(x_scaler),
        "y_scaler": pickle.dumps(y_scaler),
    })
    push_models(model,
                model_dict,
                database,
                collection_model,
                user=user,
                pwd=pwd,
                host=host,
                port=port,
                auth=auth)
    return model_dict
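A hedged usage sketch (the predictor name, credentials, tag, and the "models" collection name below are placeholders, not values from the source):

# Hypothetical invocation: retrain an ANN predictor against the group db
# and push the optimized model to the model collection.
model_dict = retrain("homo_predictor",
                     user="myuser", pwd="mypassword",
                     database="tmc", collection="oct",
                     collection_model="models",
                     constraints={"status": 0},
                     epochs=500, batch_size=32,
                     hyperopt=True, tag="weekly_retrain")
print(model_dict["score_test"])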
Example #4

def get_wavefunction(scrpath):
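    # Gather raw wavefunction coefficient binaries (c0/ca0/cb0) from a
    # scratch directory; this snippet only reads the unrestricted pair
    # (ca0/cb0), so "c0" is left as False.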
    wfn = {"c0": False, "ca0": False, "cb0": False}
    if os.path.isfile(scrpath + "ca0"):
        with open(scrpath + "ca0", "rb") as fo:
            wfn.update({"ca0": fo.read()})
    if os.path.isfile(scrpath + "cb0"):
        with open(scrpath + "cb0", "rb") as fo:
            wfn.update({"cb0": fo.read()})
    return wfn
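Usage sketch (the scratch path is a placeholder; note that scrpath is joined by plain string concatenation, so it must end with a path separator):

wfn = get_wavefunction("/home/user/job/scr/")
if wfn["ca0"] and wfn["cb0"]:
    print("unrestricted wavefunction files found")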


db = connect2db(user=user,
                pwd=pwd,
                host='localhost',
                port=27017,
                database='tmc',
                auth=True)
print(("# of complexes: ", db.oct.count()))
counter = 0

count = 0
merged = 0
for i, outfile in enumerate(outfiles):
    if os.path.split(outfile.rsplit('_', 1)[0])[-1] in active_jobs:
        continue
    if 'nohup' in outfile:
        continue
    output = textfile(outfile)
    print('----' + outfile + '----')
    try:
Example #5
def main():
    args = arg_parser()
    if args.user is None or args.pwd is None:
        raise KeyError(
            "Please use the format: python update_db_documents.py -user <username> -pwd <password>."
        )
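    # Query complexes whose sixth ligand is recorded (lig6 != "x"), whose
    # status is 0, and which do not yet have a "gap" field.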
    constraints = {
        "lig6": {
            "$ne": "x"
        },
        "status": 0,
        "gap": {
            "$exists": False
        }
    }
    update_fields = ['gap']
    database = "tmc"
    collection = "oct"
    user = args.user
    pwd = args.pwd
    db = connect2db(user,
                    pwd,
                    host="localhost",
                    port=27017,
                    database=database,
                    auth=True)
    print(("Number of complexes in the collection:", db.oct.count()))
    cursor = db[collection].find(constraints)
    tot = count_find(cursor)
    print(("Number of complexes to be updated: ", tot))
    cursor = db[collection].find(constraints,
                                 no_cursor_timeout=True).batch_size(10)
    print(("Are you sure to update %s with constraints %s in %s[%s]? (y/n)" %
           (str(update_fields), str(constraints), database, collection)))
    _in = eval(input())
    if not _in == "y":
        print("Quit. Have a nice day.")
        quit()
    count = 0
    confirmed = False
    for _tmcdoc in cursor:
        print(("complex: ", _tmcdoc["unique_name"]))
        recovered = True
        try:
            _this_tmc = tmcMongo(document=_tmcdoc,
                                 tag=_tmcdoc["tag"],
                                 subtag=_tmcdoc["subtag"],
                                 publication=_tmcdoc["publication"],
                                 update_fields=update_fields)
            print(("complex id_doc:", _this_tmc.id_doc))
        except ValueError:
            print("The input document cannot recover a DFTrun object.")
            recovered = False

        if recovered:
            ####
            ## Case 1.
            ## Simple modification: you already know what to update and you **don't** want to update the DFTrun object.

            new_tmc = copy.deepcopy(_this_tmc)
            try:
                new_tmc.document["gap"] = (new_tmc.document["alphaLUMO"]
                                           - new_tmc.document["alphaHOMO"])
            except (KeyError, TypeError):
                # Skip complexes that are missing either orbital energy.
                pass
            # Change here. e.g. new_tmc.document["publication"] = xxx
            ####

            ####
            ## Case 2.
            ## Modify both the document and the DFTrun object.

            # _this_run = copy.deepcopy(_this_tmc.this_run)
            # current_folder = _this_run.scrpath.strip("optim.xyz")
            # multiwfnpath = glob.glob(current_folder + "*.molden")
            # if len(multiwfnpath) > 0:
            #     multiwfnpath = multiwfnpath[0]
            #     mulliken_spin_list = get_mulliken(multiwfnpath, _this_run.spin, _this_run.liglist[-1])
            #     print(mulliken_spin_list)
            #     _this_run.net_metal_spin = mulliken_spin_list[0]
            #     if len(mulliken_spin_list) > 1:
            #         _this_run.net_oxygen_spin = mulliken_spin_list[1]
            # else:
            #     print("No molden path found.")
            # _this_run.get_check_flags()
            # new_tmc = tmcMongo(this_run=_this_run, tag=_tmcdoc["tag"], subtag=_tmcdoc["subtag"],
            #                    publication=_tmcdoc["publication"], update_fields=update_fields)
            ###

            if not confirmed:
                for key in update_fields:
                    print(("=======Key======: ", key))
                    if key in _tmcdoc:
                        print(("Current: ", _tmcdoc[key]))
                    else:
                        print("Currently does not exist.")
                    print(("Change to: ", new_tmc.document[key]))
                print("Is this expected? (y/n)")
                _in = eval(input())
                if _in == "y":
                    confirmed = True
                else:
                    print("Quit. Have a nice day.")
                    quit()
            __ = insert(db, collection, new_tmc)
        count += 1
        print((" In progress: %d / %d" % (count, tot)))
    print(("You have changed %d documents in %s[%s]" %
           (tot, database, collection)))
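The listing ends here; presumably the script closes with the standard entry-point guard:

if __name__ == "__main__":
    main()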