示例#1
0
def data_gathering(dt, dm, fname, job):
    """Read every frame of ``fname`` and split it into training and test images.

    Args:
        dt: Splitting mode, one of:

            * ``'set'`` -- the frames are handed to ``Images`` together with
              a set count (``dm`` itself if it is an int, otherwise
              ``dm[0]``); ``Images`` supplies both parts.
            * ``'interval'`` -- ``dm[0]:dm[1]`` is the training slice.  When
              ``dm`` has three or more entries, its last two entries bound
              the test slice; otherwise the test part is empty.
            * ``'sort'`` -- frames whose index is a multiple of ``dm[0]`` go
              to training, all other frames go to test.
        dm: int or sequence of ints, interpreted according to ``dt``.
        fname: File handed to ``ase.io.read`` (extxyz, OUTCAR, ...).
        job: Not used here; kept so existing callers keep working.

    Returns:
        Tuple ``(number_of_frames, training_images, test_images)``.

    Raises:
        ValueError: If ``dt`` is not one of the modes listed above.
    """
    # index=':' requests all frames of the file
    total_images = ase.io.read(fname, index=':')

    ### divide total data into nset: nset-1 for training and last 1 for test
    if dt == 'set':
        # dm is either the set count itself or a sequence starting with it
        nset = dm if isinstance(dm, int) else dm[0]
        images_sets = Images(total_images, dt, nset)
        training_images = images_sets.get_training_images()
        test_images = images_sets.get_test_images()
    ### indices for training d[0:2] and test d[len(d)-2:]: not call class Images
    elif dt == 'interval':
        training_images = total_images[dm[0]:dm[1]]
        if len(dm) >= 3:
            # [a, b, c] -> test b:c ; [a, b, c, d] -> test c:d
            start, stop = dm[-2:]
            test_images = total_images[start:stop]
        else:
            test_images = []
            print("There is no test set region ")
    ### select some for training and some for test turn by turn in the file
    elif dt == 'sort':
        divider = dm[0]
        training_images = []
        test_images = []
        for i, image in enumerate(total_images):
            if i % divider == 0:
                training_images.append(image)
            else:
                test_images.append(image)
    else:
        # fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            f"unknown data type {dt!r}: expected 'set', 'interval' or 'sort'")
    return len(total_images), training_images, test_images
示例#2
0
def data_selection(total_images, dt, dl, job):
    """Split ``total_images`` into a training part and a test part.

    Args:
        total_images: Sequence of images to distribute.
        dt: Selection mode.

            * ``'npart'`` -- delegate to ``Images`` with a set count taken
              from ``dl`` (the int itself, or the first list entry).
            * ``'int'`` -- ``dl[0]:dl[1]`` is the training slice.  With three
              or four entries the last two bound the test slice; with five
              or more the first two are reused; with fewer than three the
              test part is empty.
            * ``'div'`` -- image ``i`` is a training image when
              ``i % dl[0] == dl[1]`` and, if ``dl`` has exactly three
              entries, a test image when ``i % dl[0] == dl[2]``.
            * ``'pick'`` -- with a two-entry ``dl``, alternately take runs of
              images for training and for test, driven by a counter that is
              reset whenever it reaches ``dl[0] + dl[1]``.
        dl: int or list of ints, interpreted according to ``dt``.
        job: ``'tr'`` or ``'te'``.  For ``'int'`` and ``'div'``, ``'te'``
            returns the training selection as the test part and ``None``
            as the training part.

    Returns:
        ``(training_images, test_images)``.  ``(None, None)`` for ``'pick'``
        when ``dl`` does not have two entries.  ``None`` when ``dt`` is not
        recognised, or when ``dt == 'int'`` and ``job`` is neither ``'tr'``
        nor ``'te'``.
    """
    if dt == 'npart':
        if isinstance(dl, list):
            nset = dl[0]
        elif isinstance(dl, int):
            nset = dl
        splitter = Images(total_images, dt, nset)
        return splitter.get_training_images(), splitter.get_test_images()

    ### explicit index intervals: class Images is not involved, job is used
    if dt == 'int':
        train_span = dl[:2]
        train_part = total_images[train_span[0]:train_span[1]]
        n_entries = len(dl)
        if n_entries < 3:
            test_part = []
            print("There is no test set region ")
        else:
            # 3 or 4 entries: the last two; anything longer: the first two
            test_span = dl[-2:] if n_entries in (3, 4) else train_span
            test_part = total_images[test_span[0]:test_span[1]]
        if job == 'tr':
            return train_part, test_part
        ### the single interval is used as the test region
        if job == 'te':
            return None, train_part
        return None

    ### division by index: pick by remainder of the running image number
    if dt == 'div':
        train_part = []
        test_part = []
        divider = dl[0]
        tr_remainder = dl[1]
        with_test = len(dl) == 3
        if with_test:
            te_remainder = dl[2]
        for idx, image in enumerate(total_images):
            remainder = idx % divider
            if remainder == tr_remainder:
                train_part.append(image)
                if Ldebug: print(f"{idx}-th image in training_images")
            if with_test and remainder == te_remainder:
                test_part.append(image)
                if Ldebug: print(f"{idx}-th image in test_images")
        if job == 'te':
            return None, train_part
        return train_part, test_part

    if dt == 'pick':
        if len(dl) != 2:
            return None, None
        train_part = []
        test_part = []
        # cursor is the position inside the current cycle,
        # serial is the position in the whole sequence
        cursor = 0
        for serial, image in enumerate(total_images):
            if cursor < dl[0]:
                train_part.append(image)
                if Ldebug: print(f"{serial}-th image in training_images")
            elif cursor < dl[0] + dl[1]:
                test_part.append(image)
                if Ldebug: print(f"{serial}-th image in test_images")
            else:
                # this image opens the next cycle as a training image
                train_part.append(image)
                if Ldebug: print(f"{serial}-th image in training_images")
                cursor = 0
            cursor += 1
        return train_part, test_part

    return None
示例#3
0
def amp_jobs(fdata, job, nsets, HL, E_conv, Lgraph, ival_set):
    """Run the task selected by ``job`` on the frames stored in ``fdata``.

    ``job`` is searched, in this order, for the substrings ``"pr"``,
    ``"tr"``, ``"te"``, ``"va"`` and ``"md"``; the first hit selects the task:

    * ``pr`` -- plot the potential energy of every frame.
    * ``tr`` -- train on the training images, then test on the test images.
    * ``te`` -- test only.
    * ``va`` -- train with set ``ival_set`` held out, test on that set and
      append ``"<ival_set>: <rmserr>"`` to a ``.val`` file whose name is
      built from ``fdata``, ``HL`` and ``E_conv``.
    * ``md`` -- run MD from frame ``nsets`` (frame ``'0'`` if ``nsets`` is
      falsy).

    Args:
        fdata: File read with ``ase.io.read``.
        job: Task selector string.
        nsets: Second argument of ``Images``; also the start frame for ``md``.
        HL: Forwarded to the training and title helpers; its items are
            concatenated into the ``.val`` file name.
        E_conv: Forwarded to the training and title helpers.
        Lgraph: Forwarded to ``exe_test_images``.
        ival_set: Index of the held-out set for ``va``; must be given and be
            lower than ``nsets - 1``, otherwise the process exits.

    Returns:
        None.
    """
    total_images = ase.io.read(fdata, index=':')
    images_sets = Images(total_images, nsets)

    def run_test():
        # test step shared by the "tr" and "te" tasks
        amp_pes = "amp.amp"
        held_out = images_sets.get_test_images()
        title, suptitle = get_title(job, fdata, HL, E_conv, len(total_images),
                                    len(held_out))
        print("data test:total sets %d/%d" %
              (len(held_out), len(total_images)))
        exe_test_images(job, held_out, amp_pes, title, suptitle, Lgraph)

    ### job == profile of energies
    if re.search("pr", job):
        energies = [mol.get_potential_energy() for mol in total_images]
        mplot_nvector([], energies, fdata.split(".")[0], 'sample', 'E(eV)')
        return

    ### job == training, followed directly by the test
    if re.search("tr", job):
        train_part = images_sets.get_training_images()
        print("data training:total sets %d/%d" %
              (len(train_part), len(total_images)))
        exe_train_images(train_part, HL, E_conv)
        run_test()
        return

    ### job == test only
    if re.search("te", job):
        run_test()
        return

    ### job == validation
    if re.search("va", job):
        print("validation test")
        print("data images are diveded into %d sets" % nsets)
        # the last set is kept for test, so ival_set must stay below nsets-1
        if ival_set is None:
            print(
                "index for validation set is reguired with '-i num' between 0 ~ {}"
                .format(nsets - 2))
            sys.exit(0)
        elif ival_set >= nsets - 1:
            print("validation set index should be lower than {}".format(
                nsets - 1))
            print("refer to py_ai_ini.py -j amp")
            sys.exit(3)

        stem = fdata.split(".")[0]
        layers = ''.join(str(x) for x in HL)
        fname = stem + layers + str(E_conv) + ".val"

        ### training without the held-out set
        train_part, valid_part = images_sets.get_val_train_images(ival_set)
        print("num images: training {} validation {}".format(
            len(train_part), len(valid_part)))
        exe_train_images(train_part, HL, E_conv)

        ### validating on the held-out set
        amp_pes = "amp.amp"
        title, suptitle = get_title(job, fdata, HL, E_conv,
                                    len(total_images), len(train_part))
        rmserr = exe_test_images(job,
                                 valid_part,
                                 amp_pes,
                                 title,
                                 suptitle,
                                 Lgraph,
                                 val_id=ival_set)
        with open(fname, "a") as f:
            f.write("{}: {:5.3f}\n".format(ival_set, rmserr))
        return

    ### job == md: nsets doubles as the index of the start geometry
    if re.search('md', job):
        start = nsets if nsets else '0'
        atoms = ase.io.read(fdata, index=start)
        run_md(atoms)
    return
示例#4
0
def amp_jobs(fdata, job, data_int, amp_pes, HL, E_conv, f_conv, Lgraph, ncore,
             n_mol, Ltwinx):
    """Run the task selected by ``job`` on the frames stored in ``fdata``.

    ``job`` is searched, in this order, for the substrings ``"pr"``,
    ``"tr"``, ``"te"`` and ``"md"``; the first hit selects the task:

    * ``pr`` -- plot the potential energy of every frame (only when
      ``fdata`` ends with ``extxyz`` or equals ``"OUTCAR"``).
    * ``tr`` -- train on the training images.  On host ``chi`` the trained
      result is additionally tested and the result is written with
      ``f_write``.
    * ``te`` -- test and write the result with ``f_write``.
    * ``md`` -- run MD starting from the first frame.

    Args:
        fdata: File read with ``ase.io.read`` (extxyz, OUTCAR, ...).
        job: Task selector string.
        data_int: Either an int (passed to ``Images`` as ``nsets``; ``0``
            in a test job means "use all frames") or a sequence of indices.
            For training, its first two entries are passed as ``d_list``;
            for the test after training, entries ``[1:]`` (three entries)
            or ``[2:]`` (more than three) are passed as ``d_list``.
        amp_pes: Forwarded to ``calc_test_images``.
        HL, E_conv, f_conv: Forwarded to the training, title and write
            helpers.
        Lgraph, ncore, Ltwinx: Forwarded to the train/test helpers.
        n_mol: Forwarded as ``nmol`` in the test-only job.

    Returns:
        None.
    """
    total_images = ase.io.read(fdata, index=':')  # can read extxyz, OUTCAR,
    images_sets = Images(total_images, nsets=data_int)

    if re.search("pr", job):
        y = [mol.get_potential_energy() for mol in total_images]
        if fdata.endswith('extxyz'):
            mplot_nvector([], y, fdata.split(".")[0], 'sample', 'E(eV)')
        elif fdata == "OUTCAR":
            mplot_nvector([], y, Xtitle='sample', Ytitle='E(eV)')
    ### JOB == TRAINING
    elif re.search("tr", job):
        if isinstance(data_int, int):
            images = images_sets.get_training_images()
        else:
            d_list = data_int[:2]
            images = images_sets.get_training_images(d_list=d_list)
        print("data training:total sets %d/%d" %
              (len(images), len(total_images)))
        calc_train_images(images, HL, E_conv, f_conv, ncore)
        ### test after training:: Do not turn on in qsub
        server = socket.gethostname()
        if server == 'chi':
            if isinstance(data_int, int):
                images = images_sets.get_test_images()
            else:
                if len(data_int) >= 3:
                    if len(data_int) == 3:
                        d_list = data_int[1:]
                    else:
                        d_list = data_int[2:]
                    images = images_sets.get_test_images(d_list=d_list)
                else:
                    print("There is no test set region in -di ")
                    # Nothing to test.  Falling through here would evaluate
                    # the training images and record that as a test result.
                    return
            title, suptitle = get_title(fdata, HL, E_conv, f_conv,
                                        len(total_images), len(images))
            print("data test:total sets %d/%d" %
                  (len(images), len(total_images)))
            rmserr, max_res = calc_test_images(job,
                                               images,
                                               amp_pes,
                                               title,
                                               suptitle,
                                               Lgraph,
                                               ncore,
                                               Ltwinx=Ltwinx)
            f_write(fdata, HL, E_conv, f_conv, rmserr, max_res, job)
    ### JOB == TEST
    elif re.search("te", job):
        if isinstance(data_int, int):
            # 0 means: test on every frame of the file
            if data_int == 0:
                images = total_images
            else:
                images = images_sets.get_test_images()
        ### for data interval
        else:
            images = images_sets.get_test_images(d_list=data_int)

        title, suptitle = get_title(fdata, HL, E_conv, f_conv,
                                    len(total_images), len(images))
        print("data test:total sets %d/%d" % (len(images), len(total_images)))
        rmserr, max_res = calc_test_images(job,
                                           images,
                                           amp_pes,
                                           title,
                                           suptitle,
                                           Lgraph,
                                           ncore,
                                           nmol=n_mol,
                                           Ltwinx=Ltwinx)
        f_write(fdata, HL, E_conv, f_conv, rmserr, max_res, job)

    elif re.search('md', job):
        # use first geometry
        atoms = ase.io.read(fdata, index='0')
        run_md(atoms)
    return
示例#5
0
def amp_jobs(fdata, job, ndata, HL, E_conv):
    """Run the task selected by ``job`` on the frames stored in ``fdata``.

    ``job`` is searched, in this order, for the substrings ``"tr"``,
    ``"te"``, ``"va"`` and ``"md"``; the first hit selects the task:

    * ``tr`` -- train on all frames when ``ndata`` is falsy, otherwise on
      ``get_training_images(ndata)``.
    * ``te`` -- test on ``get_test_images(ndata)``.
    * ``va`` -- for every set index below ``ndata - 1`` (``ndata`` defaults
      to 5), train without that set, load ``"amp.amp"`` and evaluate the
      potential energy of each held-out image before and after attaching
      the loaded calculator.  The collected energies are not used further.
    * ``md`` -- run MD from frame ``ndata`` (frame ``'0'`` if ``ndata`` is
      falsy).

    Args:
        fdata: File read with ``ase.io.read``.
        job: Task selector string.
        ndata: Number of sets, or the start frame for ``md``.
        HL: Forwarded to the training and title helpers.
        E_conv: Forwarded to the training and title helpers.

    Returns:
        None.
    """
    total_images = ase.io.read(fdata, index=':')
    images_c = Images(total_images)

    ### job == training
    if re.search("tr", job):
        if ndata:
            train_part = images_c.get_training_images(ndata)
            print("data training:total sets %d/%d" %
                  (len(train_part), len(total_images)))
        else:
            train_part = images_c.total_images
            print("Start training using all the data %d" % len(train_part))
        exe_train_images(train_part, HL, E_conv)
        return

    ### job == test
    if re.search("te", job):
        amp_pes = "amp.amp"
        held_out = images_c.get_test_images(ndata)
        title, suptitle = get_title(job, fdata, HL, E_conv, len(total_images),
                                    len(held_out))
        print("data test:total sets %d/%d" %
              (len(held_out), len(total_images)))
        exe_test_images(job, held_out, amp_pes, title, suptitle)
        return

    ### job == validation
    if re.search("va", job):
        print("validation test")
        if not ndata:
            ndata = 5
            print("data images are diveded into %d sets" % ndata)
        # the last set is kept for test, so it is never a validation set
        for set_id in range(ndata - 1):
            ### training
            train_part, valid_part = images_c.get_val_train_images(
                ndata, set_id)
            print("num images: training {} validation {}".format(
                len(train_part), len(valid_part)))
            exe_train_images(train_part, HL, E_conv)
            ### validating
            amp_pes = "amp.amp"
            title, suptitle = get_title(job, fdata, HL, E_conv,
                                        len(total_images), len(train_part))
            calc = Amp.load(amp_pes)
            reference = []
            predicted = []
            for mol in valid_part:
                # energy before the calculator is attached, then after
                reference.append(mol.get_potential_energy())
                mol.set_calculator(calc)
                predicted.append(mol.get_potential_energy())
        return

    ### job == md
    if re.search('md', job):
        print("ndata is used for start geometry")
        start = ndata if ndata else '0'
        atoms = ase.io.read(fdata, index=start)
        run_md(atoms)
    return