Example #1
def update_archive(thermoml_path=None):
    """Use RSS feeds to find and download any missing ThermoML XML files
    from the ThermoML archive.  
    
    Parameters
    ----------
    thermoml_path : str, optional, default=None
        If specified, use this path to store ThermoML XML files.  If None,
        use the THERMOML_PATH environment variable.
    """
    if thermoml_path is None:
        if "THERMOML_PATH" in os.environ:
            thermoml_path = os.environ["THERMOML_PATH"]
        else:
            raise KeyError("You must either specify thermoml_path or set the THERMOML_PATH environment variable.")

    for key, url in THERMOML_FEEDS.items():
        feed = feedparser.parse(url)
        for entry in feed["entries"]:
            link = entry["link"]
            base_filename = urllib_parse.urlsplit(link).path
            base_filename = base_filename[1:]  # Strip the leading slash so os.path.join will work
            filename = os.path.join(thermoml_path, base_filename)
            make_path(filename)
            if os.path.exists(filename):
                print("Already downloaded %s from %s" % (filename, link))
            else:
                print("Fetching %s from %s" % (filename, link))
                urllib.request.urlretrieve(link, filename)
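
Most of the examples on this page lean on a make_path helper (often imported as utils.make_path) that the excerpts themselves never define, and whose exact behaviour differs from project to project. For the pattern used here and in the OpenMM examples below (make_path(filename) called just before filename is written), a minimal sketch might look like the following; treat it as an assumption rather than the actual implementation in any of these projects.

import os

def make_path(f):
    # Hypothetical sketch: ensure the parent directory of f exists, then hand back f.
    d = os.path.dirname(f)
    if d and not os.path.exists(d):
        os.makedirs(d)
    return f

Returning the path also matches calls such as chainer.serializers.save_npz(make_path(path), model) in Examples #10 and #24.
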
Example #2
    def build(self, ff_name, water_name):
        out_filename = self.get_initial_pdb_filename(ff_name, water_name)
        utils.make_path(out_filename)

        if os.path.exists(out_filename):
            return
        
        if self.pdb_filename is not None:
            fixer = pdbfixer.PDBFixer(filename=self.pdb_filename)
        else:
            fixer = pdbfixer.PDBFixer(pdbid=self.pdb_id)

        fixer.findMissingResidues()
        fixer.findNonstandardResidues()
        fixer.replaceNonstandardResidues()
        fixer.findMissingAtoms()
        fixer.addMissingAtoms()
        fixer.removeHeterogens(True)
        fixer.addMissingHydrogens(pH=self.pH)

        n_chains = len(list(fixer.topology.chains()))
        chains_to_remove = np.setdiff1d(np.arange(n_chains), self.keep_chains)
        fixer.removeChains(chains_to_remove)
                
        with open(out_filename, 'w') as outfile:
            app.PDBFile.writeFile(fixer.topology, fixer.positions, outfile)
Example #3
    def production(self, ff_name, water_name):

        equil_pdb_filename = self.get_equil_pdb_filename(ff_name, water_name)
        production_dcd_filename = self.get_production_dcd_filename(ff_name, water_name)
        production_protein_dcd_filename = self.get_production_protein_dcd_filename(ff_name, water_name)
        
        utils.make_path(production_dcd_filename)

        if os.path.exists(production_protein_dcd_filename):
            return

        ff = app.ForceField('%s.xml' % ff_name, '%s.xml' % water_name)
        
        traj = md.load(equil_pdb_filename)
        top, bonds = traj.top.to_dataframe()
        atom_indices = top.index[top.chainID == 0].values

        pdb = app.PDBFile(equil_pdb_filename)

        system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds)
        integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep)
        system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency))

        simulation = app.Simulation(pdb.topology, system, integrator)
        simulation.context.setPositions(pdb.positions)

        simulation.context.setVelocitiesToTemperature(self.temperature)
        print('Production.')
        simulation.reporters.append(md.reporters.DCDReporter(production_protein_dcd_filename, self.protein_output_frequency, atomSubset=atom_indices))
        simulation.reporters.append(app.DCDReporter(production_dcd_filename, self.output_frequency))
        simulation.step(self.n_steps)
Example #4
    def build(self, ff_name, water_name):
        out_filename = self.get_initial_pdb_filename(ff_name, water_name)
        utils.make_path(out_filename)

        if os.path.exists(out_filename):
            return

        pdbbuilder.build_pdb(self.sequence, out_filename, self.N_cap, self.C_cap, pH=self.pH)
Example #5
    def __init__(self):
        settings = dict(
            debug=True,
            static_path=make_path("static"),
            template_path=make_path("template")
        )
        tornado.web.Application.__init__(self, controllers.routes, **settings)
        self.db = Session
        self.init_gsm()
Example #6
    def production(self):
        utils.make_path('production/')
        self.production_dcd_filename = "production/" + self.identifier + "_production.dcd"
        self.production_pdb_filename = "production/" + self.identifier + "_production.pdb"
        self.production_data_filename = "production/" + self.identifier + "_production.csv"

        utils.make_path(self.production_dcd_filename)

        if os.path.exists(self.production_pdb_filename):
            return        

        if self.ran_equilibrate:
            pdb = app.PDBFile(self.equil_pdb_filename)
            topology = pdb.topology
            positions = pdb.positions
        else:
            positions = self.packed_trj.openmm_positions(0)
            topology = self.packed_trj.top.to_openmm()
            topology.setUnitCellDimensions(mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer)
        
        ff = self.ffxml

        system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds)
        integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep)
        system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency))

        simulation = app.Simulation(topology, system, integrator)
        simulation.context.setPositions(positions)

        if not self.ran_equilibrate:
            print('Minimizing.')
            simulation.minimizeEnergy()

        simulation.context.setVelocitiesToTemperature(self.temperature)
        print('Production.')
        simulation.reporters.append(app.DCDReporter(self.production_dcd_filename, self.output_frequency))
        simulation.reporters.append(app.StateDataReporter(self.production_data_filename, self.output_data_frequency, step=True, potentialEnergy=True, temperature=True, density=True))

        converged = False
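        # Run production in chunks of n_steps; after each chunk, drop the
        # un-equilibrated prefix of the density time series (pymbar's
        # detectEquilibration) and stop once the standard error of the mean
        # density falls below stderr_tolerance.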
        while not converged:
            simulation.step(self.n_steps)
            d = pd.read_csv(self.production_data_filename, names=["step", "U", "Temperature", "Density"], skiprows=1)
            density_ts = np.array(d.Density)
            [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000)
            density_ts = density_ts[t0:]
            density_mean_stderr = density_ts.std() / np.sqrt(Neff)
            if density_mean_stderr < self.stderr_tolerance:
                converged = True

        del simulation
        if self.ran_equilibrate:
            traj = md.load(self.production_dcd_filename, top=self.equil_pdb_filename)[-1]
        else:
            traj = md.load(self.production_dcd_filename, top=self.box_pdb_filename)[-1]
        traj.save(self.production_pdb_filename)
Example #7
    def equilibrate(self, ff_name, water_name):
        
        input_pdb_filename = self.get_initial_pdb_filename(ff_name, water_name)
        equil_pdb_filename = self.get_equil_pdb_filename(ff_name, water_name)
        equil_dcd_filename = self.get_equil_dcd_filename(ff_name, water_name)
        equil_protein_pdb_filename = self.get_equil_protein_pdb_filename(ff_name, water_name)
        
        utils.make_path(equil_pdb_filename)
        
        if os.path.exists(equil_pdb_filename):
            return
        
        ff = app.ForceField('%s.xml' % ff_name, '%s.xml' % water_name)
        pdb = app.PDBFile(input_pdb_filename)
        modeller = app.Modeller(pdb.topology, pdb.positions)
        modeller.addSolvent(ff, model=water_mapping[water_name], padding=self.padding, ionicStrength=self.ionic_strength)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds)
        integrator = mm.LangevinIntegrator(self.temperature, self.equil_friction, self.equil_timestep)
        system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency))

        platform = mm.Platform.getPlatformByName("CUDA")
        platform.setPropertyDefaultValue("CudaDeviceIndex", os.environ["CUDA_VISIBLE_DEVICES"])

        simulation = app.Simulation(topology, system, integrator, platform=platform)
        simulation.context.setPositions(positions)
        
        print('Minimizing.')
        simulation.minimizeEnergy()

        simulation.context.setVelocitiesToTemperature(self.temperature)
        print('Equilibrating.')
        
        simulation.reporters.append(app.PDBReporter(equil_pdb_filename, self.n_equil_steps - 1))
        simulation.reporters.append(app.DCDReporter(equil_dcd_filename, self.equil_output_frequency))
        simulation.step(self.n_equil_steps)
        del simulation
        del system
        traj = md.load(equil_dcd_filename, top=equil_pdb_filename)[-1]
        traj.save(equil_pdb_filename)
        
        top, bonds = traj.top.to_dataframe()
        atom_indices = top.index[top.chainID == 0].values
        traj.restrict_atoms(atom_indices)
        traj.save(equil_protein_pdb_filename)
Example #8
    def build(self):
        utils.make_path('monomers/')
        utils.make_path('boxes/')
        utils.make_path('ffxml/')
        self.monomer_pdb_filenames = ["monomers/"+string+".pdb" for string in self.cas_strings]
        self.box_pdb_filename = "boxes/" + self.identifier + ".pdb"
        self.ffxml_filename = "ffxml/" + '_'.join(self.cas_strings) + ".xml"

        utils.make_path(self.box_pdb_filename)

        rungaff = False
        if not os.path.exists(self.ffxml_filename):     
            rungaff = True
        if not os.path.exists(self.box_pdb_filename):
            for filename in self.monomer_pdb_filenames:
                if not os.path.exists(filename):
                    rungaff = True

        if rungaff:
            self.smiles_strings = []
            for mlc in self.cas_strings:
                self.smiles_strings.append(resolve(mlc, 'smiles'))
            oemlcs = []
            with gaff2xml.utils.enter_temp_directory():  # Avoid dumping 50 antechamber files in local directory.
                for smiles_string in self.smiles_strings:
                    m = gaff2xml.openeye.smiles_to_oemol(smiles_string)
                    m = gaff2xml.openeye.get_charges(m, strictStereo=False, keep_confs=1)
                    oemlcs.append(m)
                ligand_trajectories, ffxml = gaff2xml.openeye.oemols_to_ffxml(oemlcs)    
            if not os.path.exists(self.ffxml_filename):
                outfile = open(self.ffxml_filename, 'w')
                outfile.write(ffxml.read())
                outfile.close()
                ffxml.seek(0)
            for k, ligand_traj in enumerate(ligand_trajectories):
                pdb_filename = self.monomer_pdb_filenames[k]
                if not os.path.exists(pdb_filename):
                    ligand_traj.save(pdb_filename)

        self.ffxml = app.ForceField(self.ffxml_filename)

        if "7732-18-5" in self.cas_strings:
            self.ffxml.loadFile("tip3p.xml")

        if not os.path.exists(self.box_pdb_filename):
            self.packed_trj = gaff2xml.packmol.pack_box(self.monomer_pdb_filenames, self.n_monomers)
            self.packed_trj.save(self.box_pdb_filename)
        else:
            self.packed_trj = md.load(self.box_pdb_filename)
Example #9
    def equilibrate(self):
        self.ran_equilibrate = True
        utils.make_path('equil/')
        self.equil_dcd_filename = "equil/"+self.identifier +"_equil.dcd"
        self.equil_pdb_filename = "equil/"+self.identifier +"_equil.pdb"
        utils.make_path(self.equil_pdb_filename)
        
        if os.path.exists(self.equil_pdb_filename):
            return

        positions = self.packed_trj.openmm_positions(0)
        topology = self.packed_trj.top.to_openmm()
        topology.setUnitCellDimensions(mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer)
        
        ff = self.ffxml

        system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds)
        integrator = mm.LangevinIntegrator(self.temperature, self.equil_friction, self.equil_timestep)
        system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency))

        simulation = app.Simulation(topology, system, integrator)
        simulation.context.setPositions(positions)
        
        print('Minimizing.')
        simulation.minimizeEnergy()

        simulation.context.setVelocitiesToTemperature(self.temperature)
        print('Equilibrating.')

        simulation.reporters.append(app.DCDReporter(self.equil_dcd_filename, self.equil_output_frequency))
        simulation.step(self.n_equil_steps)


        # Re-write a better PDB with correct box sizes.
        traj = md.load(self.equil_dcd_filename, top=self.box_pdb_filename)[-1]
        traj.save(self.equil_pdb_filename)
Example #10
def log():
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(
        trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost,
               va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' %
          (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            chainer.serializers.save_npz(make_path(path), model)
Example #11
    def save(save_path, postfix):
        ps = sess.run(params)
        make_path(save_path)
        joblib.dump(ps, save_path + "/model" + str(postfix) + ".pkl")
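
In Example #11 (and in the later Examples #15, #16 and #17), the argument handed to make_path is a directory rather than a file about to be written, so the helper there presumably creates the directory itself. A sketch under that assumption:

import os

def make_path(path):
    # Hypothetical variant: create the directory itself if it does not exist yet.
    if not os.path.exists(path):
        os.makedirs(path)
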
Example #12
    avg_params = ema.apply(tf.trainable_variables())
    train = tf.group(train, avg_params)

    if not hps.profile:
        _, ema_loss, ema_states = model(X, S, Y, hps, train=False, ema=ema)

    # Logging
    timestamp = time.strftime('r%Y_%m_%d_%H_%M_%S')
    log_file = os.path.join(hps.logdir, 'lm', timestamp, "log.txt")
    json_file = os.path.join(hps.logdir, 'lm', timestamp, "json.txt")
    if os.path.exists(log_file):
        # avoid 2 jobs sharing log (quick and dirty fix)
        print(log_file, "already exists, exiting.")
        exit()

    make_path(log_file)
    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
                        filename=log_file,
                        level=logging.DEBUG)
    logging.getLogger().addHandler(
        logging.StreamHandler())  # Print logs to stderr as well

    hps.num_params = str(num_trainable_params("model0"))
    print_trainable_params("model0")

    json_header = {}
    for key in sorted(hps.__dict__.keys()):
        if type(hps.__dict__[key]) in (str, int, float, type, tf.DType):
            logging.info(str(key) + ': ' + str(hps.__dict__[key]))
            json_header[str(key)] = str(hps.__dict__[key])
Example #13
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and the model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #14
def evaluate_testDataSet():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and the model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        for i in range(100):
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #15
            'synt1bow': synt1_bow, 'synt2bow': synt2_bow}

print("==== loading data ====")
num = 1000000
para_data = h5py.File(os.path.join(args.data_dir, 'data.h5'), 'r')

train_idxs, valid_idxs = random_split(range(num), [num-5000, 5000], generator=torch.Generator().manual_seed(args.seed))

print(f"number of train examples: {len(train_idxs)}")
print(f"number of valid examples: {len(valid_idxs)}")

train_loader = DataLoader(train_idxs, batch_size=args.train_batch_size, shuffle=True)
valid_loader = DataLoader(valid_idxs, batch_size=args.valid_batch_size, shuffle=False)

print("==== preparing data ====")
make_path(args.cache_dir)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)

with open('synt_vocab.pkl', 'rb') as f:
    synt_vocab = pickle.load(f)

dataset = prepare_dataset(para_data, tokenizer, num)

print("==== loading model ====")
config = BartConfig.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)
config.word_dropout = args.word_dropout
config.max_sent_len = args.max_sent_len
config.max_synt_len = args.max_synt_len

bart = BartModel.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)
model = ParaBart(config)
Example #16
    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)
Example #17
                          shuffle=False)

# load model
model = SynPG(len(dictionary), 300, word_dropout=args.word_dropout)
model.load_state_dict(torch.load(args.model_path))

optimizer = torch.optim.Adam(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
criterion = nn.CrossEntropyLoss(ignore_index=dictionary.word2idx["<pad>"])

model = model.cuda()
criterion = criterion.cuda()

# create folders
make_path(args.model_dir)
make_path(args.output_dir)

print("==== start training ====")
for epoch in range(1, args.n_epoch + 1):
    # training
    train(epoch, model, train_data, valid_data, train_loader, valid_loader,
          optimizer, criterion, dictionary, bpe, args)
    # save model
    torch.save(
        model.state_dict(),
        os.path.join(args.model_dir, "synpg_epoch{:02d}.pt".format(epoch)))
    # shuffle training data
    train_loader = DataLoader(train_idxs,
                              batch_size=args.batch_size,
                              shuffle=True)
Example #18
def train():
    train_sentences = load_sentences(FLAGS.train_file)
    dev_sentences = load_sentences(FLAGS.dev_file)
    test_sentences = load_sentences(FLAGS.test_file)

    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]

            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    train_manager = BatchManager(train_data, FLAGS.batch_size, FLAGS.num_steps)

    dev_manager = BatchManager(dev_data, 100, FLAGS.num_steps)
    test_manager = BatchManager(test_data, 100, FLAGS.num_steps)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(75):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #19
def find_storage_space(pth, identifier=ORIGINAL_IDENTIFIER):
    '''Find a new path with the identifier'''
    name, ext = pth.splitext()
    return make_path(name + identifier + ext, sep='').abspath()
Example #20
def find_storage_space(pth, identifier=ORIGINAL_IDENTIFIER):
    '''Find a new path with the identifier'''
    name, ext = pth.splitext()
    return make_path(name + identifier + ext, sep='').abspath()
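
Examples #19 and #20 (and the test in Example #21 below) use yet another flavour: make_path(name + identifier + ext, sep='') returns a path-like object exposing .abspath() and .exists(), in the spirit of path.py's Path, rather than creating anything on disk. A minimal sketch of such a wrapper, again an assumption:

import os

class _Path(str):
    # Hypothetical thin wrapper that gives a plain string path-object-style methods.
    def abspath(self):
        return _Path(os.path.abspath(self))

    def exists(self):
        return os.path.exists(self)

def make_path(*parts, sep=os.sep):
    # Join the pieces with `sep` (sep='' simply concatenates) and wrap the result.
    return _Path(sep.join(str(p) for p in parts))
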
Example #21
    def test_make_path(self):
        self.assertFalse(make_path(__file__).exists())
Example #22
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    np.random.seed(1)
    np.random.shuffle(datasets)
    train_sentences = datasets[:15000]
    test_sentences = datasets[15000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)
    # make paths for storing the log and the model if they do not exist
    make_path(FLAGS)
    # if os.path.isfile(FLAGS.config_file):
    #     config = load_config(FLAGS.config_file)
    # else:
    config = config_model(tag_to_id)
        # save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config, logger)
        # ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_path)
        # if not ckpt_file:
        #     model.lr/=100  # for the first epoch, start with a much smaller lr
        logger.info("start training")
        loss = []
        f1score_lis=[0]
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag, logger) #这一步会生成ner_predict.utf8文件
            f1score = BIO_F1score(
                predict='result/ner_predict.utf8')
            logger.info('BIOf1score:{}'.format(f1score))
            f1score_lis.append(f1score)
            # if i==0:
            #     model.lr=FLAGS.lr  # after the first epoch, restore lr to its initial value
            if best and f1score_lis[-1] > f1score_lis[-2]:
                save_model(sess, model, FLAGS.ckpt_path, logger, step)
            else:
                model.lr *= 0.95
                logger.info('lr:{}'.format(model.lr))
                ckpt = tf.train.get_checkpoint_state(FLAGS.ckpt_path)
                if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                    logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
                    model.saver.restore(sess, ckpt.model_checkpoint_path)
Example #23
def train():
    # load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tagging scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            # 'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Convert the sentences into numeric index data
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Pad short sequences with zeros
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    # compute the mean loss every 100 steps
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #24
    load_openai_pretrained_model(model, n_ctx=n_ctx, n_special=n_special)

    if device_id >= 0:
        cuda.cupy.random.seed(seed)

        model.to_gpu()
        lm_head.to_gpu()
        clf_head.to_gpu()

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        chainer.serializers.save_npz(make_path(path), model)
    best_score = 0
    for i in range(n_iter):
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        log()
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        chainer.serializers.load_npz(make_path(path), model)
        predict()
        if analysis:
            if dataset == 'rocstories':
                rocstories_analysis(
                    data_dir, os.path.join(
                        submission_dir, filenames[dataset]), os.path.join(
Example #25
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and the model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                #print batch
                step, batch_loss = model.run_step(sess, True, batch)
                #print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #26
    def production(self):
        utils.make_path('production/')
        self.production_dcd_filename = "production/" + self.identifier + "_production.dcd"
        self.production_pdb_filename = "production/" + self.identifier + "_production.pdb"
        self.production_data_filename = "production/" + self.identifier + "_production.csv"

        utils.make_path(self.production_dcd_filename)

        if os.path.exists(self.production_pdb_filename):
            return

        if self.ran_equilibrate:
            pdb = app.PDBFile(self.equil_pdb_filename)
            topology = pdb.topology
            positions = pdb.positions
        else:
            positions = self.packed_trj.openmm_positions(0)
            topology = self.packed_trj.top.to_openmm()
            topology.setUnitCellDimensions(
                mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer)

        ff = self.ffxml

        system = ff.createSystem(topology,
                                 nonbondedMethod=app.PME,
                                 nonbondedCutoff=self.cutoff,
                                 constraints=app.HBonds)
        integrator = mm.LangevinIntegrator(self.temperature, self.friction,
                                           self.timestep)
        system.addForce(
            mm.MonteCarloBarostat(self.pressure, self.temperature,
                                  self.barostat_frequency))

        simulation = app.Simulation(topology, system, integrator)
        simulation.context.setPositions(positions)

        if not self.ran_equilibrate:
            print('Minimizing.')
            simulation.minimizeEnergy()

        simulation.context.setVelocitiesToTemperature(self.temperature)
        print('Production.')
        simulation.reporters.append(
            app.DCDReporter(self.production_dcd_filename,
                            self.output_frequency))
        simulation.reporters.append(
            app.StateDataReporter(self.production_data_filename,
                                  self.output_data_frequency,
                                  step=True,
                                  potentialEnergy=True,
                                  temperature=True,
                                  density=True))

        converged = False
        while not converged:
            simulation.step(self.n_steps)
            d = pd.read_csv(self.production_data_filename,
                            names=["step", "U", "Temperature", "Density"],
                            skiprows=1)
            density_ts = np.array(d.Density)
            [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000)
            density_ts = density_ts[t0:]
            density_mean_stderr = density_ts.std() / np.sqrt(Neff)
            if density_mean_stderr < self.stderr_tolerance:
                converged = True

        del simulation
        if self.ran_equilibrate:
            traj = md.load(self.production_dcd_filename,
                           top=self.equil_pdb_filename)[-1]
        else:
            traj = md.load(self.production_dcd_filename,
                           top=self.box_pdb_filename)[-1]
        traj.save(self.production_pdb_filename)
Example #27
def train():
    # load data sets: the corpus is returned as [['char', 'tag'], ...] pairs
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # loader.py is responsible for processing the data

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):

        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use previously trained word embeddings
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # dico_chars_train only receives dico (note the trailing [0]), i.e. the set of unique characters in the training data

            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences]))
                # chain.from_iterable(iterables): an alternative chain constructor that yields the items of each iterable in turn,
                # so this list is simply the characters appearing in test_sentences
            )
            # dico_chars is the train-set dictionary augmented with the test-set characters that also appear in wiki_100
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
            # pickle serialization lets us persist these in-memory objects to a file
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    # xxx_data stores, per sentence: [characters, char ids, tag ids (or, depending on train=True/False, a len(chars)-long list of the tag id for "0"), tags]
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # default batch_size is 20
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # Three BatchManager instances are defined; each holds batch_data and len_data
    # batch_data: the data sorted by sentence length and split into batches, with every batch padded to a uniform length
    # len_data:   the number of batches

    # make paths for storing the log and the model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()  # tf.ConfigProto is typically used when creating a session, to configure its parameters
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #28
def train():
    # load data sets
    # sentence collection = [[sentence1], [sentence2], [sentence3]], where sentence1 = [我 O, 在 O, ...]
    #<class 'list'>: [['海', 'O'], ['钓', 'O'], ['比', 'O'], ['赛', 'O'], ['地', 'O'], ['点', 'O'], ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC'], ['之', 'O'], ['间', 'O'], ['的', 'O'], ['海', 'O'], ['域', 'O'], ['。', 'O']]
    # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    from xlnet_base.xlnet_data_utils import XLNetDataUtils
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('./chinese_xlnet_base_L-12_H-768_A-12/spiece.model')

    train_data = XLNetDataUtils(sp_model,
                                batch_size=FLAGS.batch_size,
                                entry="train")
    dev_data = XLNetDataUtils(sp_model,
                              batch_size=FLAGS.batch_size,
                              entry="dev")
    test_data = XLNetDataUtils(sp_model,
                               batch_size=FLAGS.batch_size,
                               entry="test")
    dev_batch = dev_data.iteration()

    def datapadding(data):
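        # Pad (with zeros) or truncate each example's label list, segment ids,
        # input ids and input mask to a fixed max_length of 64 so that every
        # batch has a uniform shape; the raw tokens are passed through unchanged.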
        alldatalist = []

        datalist = data.data
        max_length = 64
        for i in range(len(datalist)):
            tmpdatalist = []
            token = datalist[i][0]
            segmentid = datalist[i][1]
            inputid = datalist[i][2]
            inputmask = datalist[i][3]
            labellist = datalist[i][4]
            #token label
            if len(labellist) < max_length:
                for i in range(max_length - len(token)):
                    labellist.append(0)
            elif len(labellist) > max_length:
                tmplabellist = []
                for i in range(max_length):
                    tmplabellist.append(labellist[i])
                labellist = tmplabellist
            #segmentid inputid inputmask
            if len(segmentid) < max_length:
                for i in range(max_length - len(segmentid)):
                    segmentid.append(0)
                    inputid.append(0)
                    inputmask.append(0)
            elif len(segmentid) > max_length:
                tmpsegmentid = []
                tmpinputid = []
                tmpinputmask = []
                for i in range(max_length):
                    tmpsegmentid.append(segmentid[i])
                    tmpinputid.append(inputid[i])
                    tmpinputmask.append(inputmask[i])
                segmentid = tmpsegmentid
                inputid = tmpinputid
                inputmask = tmpinputmask
            tmpdatalist.append(token)
            tmpdatalist.append(segmentid)
            tmpdatalist.append(inputid)
            tmpdatalist.append(inputmask)
            tmpdatalist.append(labellist)
            alldatalist.append(tmpdatalist)
        return alldatalist

    ftraindata = datapadding(train_data)

    fdevdata = datapadding(dev_data)
    ftestdata = datapadding(test_data)
    print(len(ftraindata))
    print(len(fdevdata))
    print(len(ftestdata))
    # traindata = {
    #     "batch_size": train_data.batch_size,
    #     "input_size": train_data.input_size,
    #     "vocab": train_data.vocab,
    #     "tag_map": train_data.tag_map,
    # }
    # devdata = {
    #     "batch_size": dev_data.batch_size,
    #     "input_size": dev_data.input_size,
    #     "vocab": dev_data.vocab,
    #     "tag_map": dev_data.tag_map,
    # }
    # testdata = {
    #     "batch_size": test_data.batch_size,
    #     "input_size": test_data.input_size,
    #     "vocab": test_data.vocab,
    #     "tag_map": test_data.tag_map,
    # }
    # if not os.path.exists("./model/train_data_map.pkl"):
    #     f = open("./model/train_data_map.pkl", "wb")
    #     pickle.dump(traindata, f)
    #     f.close()
    # if not os.path.exists("./model/dev_data_map.pkl"):
    #     f = open("./model/dev_data_map.pkl", "wb")
    #     pickle.dump(devdata, f)
    #     f.close()
    # if not os.path.exists("./model/test_data_map.pkl"):
    #     f = open("./model/test_data_map.pkl", "wb")
    #     pickle.dump(testdata, f)
    #     f.close()

    # Use selected tagging scheme (IOB / IOBES)
    #update_tag_scheme(train_sentences, FLAGS.tag_schema)
    #update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and a mapping for tags
        '''
         _t:{'O': 869087, 'B-LOC': 16571, 'I-LOC': 22531, 'B-PER': 8144, 'I-PER': 15881, 'B-ORG': 9277, 'I-ORG': 37689, '[SEP]': 8, '[CLS]': 10}
         id_to_tag:{0: 'O', 1: 'I-ORG', 2: 'I-LOC', 3: 'B-LOC', 4: 'I-PER', 5: 'B-ORG', 6: 'B-PER', 7: '[CLS]', 8: '[SEP]'}
         tag_to_id:{'O': 0, 'I-ORG': 1, 'I-LOC': 2, 'B-LOC': 3, 'I-PER': 4, 'B-ORG': 5, 'B-PER': 6, '[CLS]': 7, '[SEP]': 8}
        '''

        tag_to_id = train_data.tag_map
        id_to_tag = {v: k for k, v in tag_to_id.items()}
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build a collection of lists containing indices
    '''
    [['在', '这', '里', '恕', '弟', '不', '恭', '之', '罪', ',', '敢', '在', '尊', '前', '一', '诤', ':', '前', '人', '论',
    '书', ',', '每', '曰', '“', '字', '字', '有', '来', '历', ',', '笔', '笔', '有', '出', '处', '”', ',', '细', '读', '公', 
    '字', ',', '何', '尝', '跳', '出', '前', '人', '藩', '篱', ',', '自', '隶', '变', '而', '后', ',', '直', '至', '明', '季',
    ',', '兄', '有', '何', '新', '出', '?'], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1762, 6821, 7027, 2609, 2475, 679, 2621, 722,
    5389, 8024, 3140, 1762, 2203, 1184, 671, 6420, 8038, 1184, 782, 6389, 741, 8024, 3680, 3288, 100, 2099, 2099, 3300,
    3341, 1325, 8024, 5011, 5011, 3300, 1139, 1905, 100, 8024, 5301, 6438, 1062, 2099, 8024, 862, 2214, 6663, 1139, 
    1184, 782, 5974, 5075, 8024, 5632, 7405, 1359, 5445, 1400, 8024, 4684, 5635, 3209, 2108, 8024, 1040, 3300, 862, 
    3173, 1139, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    '''

    # train_data = prepare_dataset(
    #     train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )
    # dev_data = prepare_dataset(
    #     dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )
    # test_data = prepare_dataset(
    #     test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data.data), len(dev_data.data), len(test_data.data)))

    train_manager = BatchManager(ftraindata, FLAGS.batch_size)
    dev_manager = BatchManager(fdevdata, FLAGS.batch_size)
    test_manager = BatchManager(ftestdata, FLAGS.batch_size)
    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)

                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess,
                           model,
                           FLAGS.ckpt_path,
                           logger,
                           global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Пример #29
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # check and normalize the tag annotations of the data sets
    # (a sketch of the IOB -> IOBES conversion follows after this example)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    # build the char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries from the data set and store them in a pkl file
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # augment (extend) the character dictionary with the pretrained embeddings, then return the char/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        # get the tag/index mappings
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        #with open('maps.txt','w',encoding='utf8') as f1:
        #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # extract sentence features
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # build the per-batch data fed to the model during training
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # number of batches needed for one full pass over the training set
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:

        # model creation here is the core of the project
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        with tf.device("/gpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []

            # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
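
Several of these examples call update_tag_scheme to convert IOB tags into IOBES before training. The project's own conversion is not shown in the excerpts; the following is only a minimal sketch of the usual rule (a lone B-X becomes S-X, the last I-X of a span becomes E-X), not the project's iob_iobes.

def iob_to_iobes(tags):
    # Sketch of the standard IOB -> IOBES rewrite for a well-formed tag sequence.
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == "O":
            new_tags.append(tag)
            continue
        prefix, entity = tag.split("-", 1)
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        span_continues = nxt == "I-" + entity
        if prefix == "B":
            new_tags.append(tag if span_continues else "S-" + entity)
        else:  # prefix == "I"
            new_tags.append(tag if span_continues else "E-" + entity)
    return new_tags

# iob_to_iobes(["B-LOC", "I-LOC", "O", "B-PER"]) -> ["B-LOC", "E-LOC", "O", "S-PER"]
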
Пример #30
0
    compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion,
                                                 args.lm_coef, model_opt)
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)

    dh_model.to(device)
    dh_model = nn.DataParallel(dh_model)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0
    for i in range(args.n_iter):
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        log(save_dir, desc)
    if submit:
        path = os.path.join(save_dir, desc, 'best_params')
        dh_model.load_state_dict(torch.load(path))
        predict(dataset, args.submission_dir)
        if args.analysis:
            rocstories_analysis(
                data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
                os.path.join(log_dir, 'rocstories.jsonl'))
Пример #31
0
    def train(self):
        make_path(self.FLAGS)

        logger = get_logger(self.FLAGS.logfile_path)

        # load data sets
        # use generator to avoid memory oversize
        train_sentences = SentenceGenerator(self.FLAGS.train_file,
                                            self.FLAGS.zeros)
        logger.info("Train sentence generator is initialized")
        dev_sentences = SentenceGenerator(self.FLAGS.dev_file,
                                          self.FLAGS.zeros)
        logger.info("Dev sentence generator is initialized")

        # create maps if not exist
        if not tf.gfile.Exists(self.FLAGS.mapfile_path):
            # create dictionary for word
            _, char_to_id, id_to_char = char_mapping(train_sentences(),
                                                     self.FLAGS.lower)
            logger.info("Created dictionary of word from train data")
            with tf.gfile.GFile(self.FLAGS.mapfile_path, "wb") as f:
                pickle.dump([char_to_id, id_to_char], f)
        else:
            with tf.gfile.GFile(self.FLAGS.mapfile_path, "rb") as f:
                char_to_id, id_to_char = pickle.load(f)
                logger.info("Load dictionary from existed map file")

        if not tf.gfile.Exists(self.FLAGS.vocabfile_path):
            with tf.gfile.GFile(self.FLAGS.vocabfile_path, "w") as file:
                for word in char_to_id:
                    file.write(word + "\n")
            logger.info("Created vocabulary file")

        # load config and print it
        if tf.gfile.Exists(self.FLAGS.configfile_path):
            config = load_config(self.FLAGS.configfile_path)
        else:
            config = self.config(char_to_id)
            save_config(config, self.FLAGS.configfile_path)
        print_config(config, logger)

        # prepare data
        # get char_based, char_index_based, segs_based, tag_index_based sentences
        # use generator to avoid memory oversize
        train_manager = BatchManager(train_sentences, config['batch_size'],
                                     config['lower'], char_to_id, True)
        logger.info("Train manager is initialized")
        dev_manager = BatchManager(dev_sentences, 100, config['lower'],
                                   char_to_id, False)
        logger.info("Dev manager is initialized")
        logger.info("{} / {} sentences in train /dev.".format(
            len(train_sentences), len(dev_sentences)))

        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        # tf_config.log_device_placement = True
        steps_per_epoch = train_manager.len_data  # how many batches in an epoch
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, self.FLAGS.ckpt_path, logger)
            logger.info("start training")
            loss = []
            lr = config["lr"]
            sample_prob_initial = config["sample_prob"]
            for i in range(self.FLAGS.max_epoch):
                tf.assign(model.global_epoch, i).eval()
                for iter_turn, batch in enumerate(train_manager.iter_batch()):
                    sample_prob = max(
                        0.3, sample_prob_initial -
                        (i * 500 + iter_turn) * 0.1 / 100.0)
                    step, batch_loss = model.run_step(True, batch, lr,
                                                      sample_prob)
                    loss.append(batch_loss)
                    if step % self.FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info(
                            "iteration:{} step:{}/{}, NER loss:{:>9.6f}, Training Sample prob is now {:>4.2f}"
                            .format(iteration,
                                    step % steps_per_epoch, steps_per_epoch,
                                    np.mean(loss), sample_prob))
                        loss = []
                    if step % self.FLAGS.steps_eval == 0:
                        self.evaluate(model, "dev", dev_manager, id_to_char,
                                      logger)
                        dev_manager.reset(dev_sentences())
                        logger.info(
                            "Epoch {} is finished, reset dev_manager".format(
                                i))
                if (i + 1) % 2 == 0:
                    save_model(sess, model,
                               self.FLAGS.ckpt_path + u"/" + str(i), logger)

                # reset BatchManager
                train_manager.reset(train_sentences())
                logger.info(
                    "Epoch {} is finished, reset train_manager".format(i))

                lr = max(0.001, lr / 1.5)
                logger.info(
                    "Epoch {} is finished, rescale learing rate to {}".format(
                        i, lr))
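
Two schedules are applied in the loop above: the scheduled-sampling probability decays linearly with the global step count down to a floor of 0.3, and the learning rate is divided by 1.5 after every epoch down to a floor of 0.001. A quick sketch of the resulting values, assuming sample_prob starts at 1.0 and lr at 0.01 (both really come from the config file):

sample_prob_initial, lr = 1.0, 0.01  # assumed starting values
for epoch in range(4):
    # probability at the start of the epoch (iter_turn = 0), per the formula above
    prob = max(0.3, sample_prob_initial - (epoch * 500 + 0) * 0.1 / 100.0)
    print("epoch %d: sample_prob=%.2f lr=%.5f" % (epoch, prob, lr))
    lr = max(0.001, lr / 1.5)
# epoch 0: sample_prob=1.00 lr=0.01000
# epoch 1: sample_prob=0.50 lr=0.00667
# epoch 2: sample_prob=0.30 lr=0.00444
# epoch 3: sample_prob=0.30 lr=0.00296
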
Пример #32
0
    config["lr"] = FLAGS.lr
    config["tag_schema"] = FLAGS.tag_schema
    config["pre_emb"] = FLAGS.pre_emb
    config["zeros"] = FLAGS.zeros
    config["lower"] = FLAGS.lower
    return config


with open(FLAGS.map_file, "rb") as f:
    # pickle.load() detects the pickle protocol automatically and takes no
    # protocol argument, so the same call works on Python 2 and Python 3
    char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

# make paths for storing the log and model if they do not exist
make_path(FLAGS)
if os.path.isfile(FLAGS.config_file):
    config = load_config(FLAGS.config_file)
else:
    config = config_model(char_to_id, tag_to_id)
    save_config(config, FLAGS.config_file)
make_path(FLAGS)
app = Flask(__name__)
log_path = os.path.join("log", FLAGS.log_file)
logger = get_logger(log_path)
tf_config = tf.ConfigProto()
sess = tf.Session(config=tf_config)
sess.run(tf.global_variables_initializer())
model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config,
                     id_to_char, logger)
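
This example wires the trained model into a Flask app, but the actual route lies outside the excerpt. A hypothetical endpoint might look like the following; evaluate_line and input_from_line are assumptions about this project's API, not something shown in the snippet.

from flask import jsonify, request

@app.route("/ner", methods=["POST"])
def ner():
    # Hypothetical route: assumes the model exposes evaluate_line(sess, inputs, id_to_tag)
    # and that an input_from_line(line, char_to_id) preprocessing helper exists.
    line = request.json.get("text", "")
    result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
    return jsonify(result)
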
Пример #33
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)  # training set: 101218 sentences
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower,
                                   FLAGS.zeros)  # dev set: 7827 sentences
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                    FLAGS.zeros)  # test set: 16804 sentences

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(test_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # check whether maps.pkl already exists
        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use pretrained character embeddings; the test set contains characters absent from the training set
            dico_chars_train = char_mapping(train_sentences,
                                            FLAGS.lower)[0]  # character-frequency counts: dico_chars
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(  # flatten into one list
                        [[w[0] for w in s] for s in test_sentences])  # w[0] is a single character
                ))  # build a dictionary for every character and every word
        else:
            # ids for every character and every tag
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags: an id for every tag
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)  # count frequencies, sort, write to file
        #with open('maps.txt','w',encoding='utf8') as f1:
        #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:  #持久化下来
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(  # convert characters/words into numeric features
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # iterate over the training set in batches of 60 sentences
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)  # create the log, result and ckpt directories
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)  # char-to-id and tag-to-id mappings
        save_config(config, FLAGS.config_file)  # a new config_file is generated whenever the data changes
    make_path(FLAGS)  # create the log, result and ckpt directories used by the model

    log_path = os.path.join("log", FLAGS.log_file)  # log path
    logger = get_logger(log_path)  # define the log format
    print_config(config, logger)  # write the config to the log

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # let GPU memory grow on demand
    # tf_config.gpu_options.per_process_gpu_memory_fraction would instead cap the fraction of GPU memory used
    steps_per_epoch = train_manager.len_data  # how many batches make up one epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # model initialization finished
        logger.info("start training")
        loss = []
        # with tf.device("/gpu:0"): commented out when no GPU is available; the CNN requires sentences of equal length
        for i in range(100):  # number of epochs; the data is fed anew each time
            for batch in train_manager.iter_batch(shuffle=True):  # shuffled batches
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

        # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger): save if better than the previous model
            if i % 7 == 0:
                save_model(sess, model, FLAGS.ckpt_path, logger)
Пример #34
0
def save(save_path):
    ps = sess.run(params)
    make_path(save_path)
    joblib.dump(ps, save_path)

def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    #update_tag_scheme(train_sentences, FLAGS.tag_schema)
    #update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))
    train_len = len(train_data)
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, FLAGS.batch_size)
    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, logger)

        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            from tqdm import tqdm
            for batch in tqdm(train_manager.iter_batch(shuffle=True)):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            print("save result epoch:",i," ***************************************************")
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger,i)
            if i>=8:
                save_model(sess, model, FLAGS.ckpt_path, logger, global_steps=step)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger,i)
Пример #36
0
def train():
    # load data sets
    # sentences have the following format: ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC']
    # train_sentences = loader.load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = loader.load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = loader.load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    train_sentences = loader.load_folder_sentences(FLAGS.train_file,
                                                   FLAGS.lower, FLAGS.zeros)
    dev_sentences = loader.load_folder_sentences(FLAGS.dev_file, FLAGS.lower,
                                                 FLAGS.zeros)
    test_sentences = loader.load_folder_sentences(FLAGS.test_file, FLAGS.lower,
                                                  FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # the sentences change very little after update_tag_scheme
    loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # create maps if they do not exist
    # if maps.pkl is missing, read the training data to build
    # char_to_id and tag_to_id
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = loader.char_mapping(train_sentences,
                                                   FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = loader.augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = loader.char_mapping(
                train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)

        print('tag_to_id: ', tag_to_id)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # print('tag_to_id: ', tag_to_id)

    print('tag_to_id: ', tag_to_id)
    # prepare data, get a collection of list containing index
    train_data = loader.prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                        FLAGS.lower)
    dev_data = loader.prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                                      FLAGS.lower)
    test_data = loader.prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                       FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, 100)
    test_manager = data_utils.BatchManager(test_data, 100)

    # make paths for storing the log and model if they do not exist
    utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = utils.load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        utils.save_config(config, FLAGS.config_file)
    utils.make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)  # ./log/train.log
    logger = utils.get_logger(log_path)
    utils.print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config,
                                   id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(FLAGS.iterations):
            # for i in range(10):
            logger.info('epoch: {}'.format(i))
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Пример #37
0
def save(path):
    ps = sess.run(params)
    joblib.dump(ps, make_path(path))
Пример #38
0
def train():
    """
    train函数:传入数据、处理数据、模型训练、输出测试集f1值
    :return:
    """
    # load data sets传入数据集,做基本处理包括转小写、换0、去除空格提取word等,将训练集word和tag放在list中。 .dev_file用作cross validation
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)  # FLAGS.zeros = False
    # train_sentences format: ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC']
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES): convert IOB tags to IOBES. I: inside, O: other, B: begin | E: end, S: single
    # update_tag_scheme in loader.py performs the conversion, calling iob_iobes in data_utils.py internally
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if they do not exist: build the word-mapping dictionaries
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # data augmentation: add pretrained character embeddings to the training dictionary
            dico_chars_train = char_mapping(
                train_sentences, FLAGS.lower
            )[0]  # char_mapping in loader.py returns a lowercased dictionary of the data set, sorted by frequency in descending order
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # augment_with_pretrained in loader.py
                # add pretrained characters missing from the original dictionary; they must appear in the test set
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences
                                                   ])  # use the test set as the reference for the pretrained embeddings
                ))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)
            # _c is an unordered dict of per-key counts; char_to_id is ordered, its values are indices rather than frequencies, and its keys are sorted by frequency in descending order

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(
            train_sentences)  # tag_mapping in loader.py builds the tag dictionaries; _t is the dict of distinct tags
        # tag_to_id: {'O': 0, 'S-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'E-ORG': 4, 'E-PER': 5, 'S-LOC': 6, 'S-ORG': 7, 'I-PER': 8, 'S-PER': 9}
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag],
                        f)  # save the dictionaries above to the map file
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(  # prepare_dataset in loader.py produces, per sentence: the characters, their ids, the segmentation-based word features and the tag ids
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # build batch_size-sized batches; exposes the batch_data and len_data attributes
    # BatchManager pads the input training arrays to a uniform length
    # (a minimal padding sketch follows after this example)
    train_manager = BatchManager(
        train_data, FLAGS.batch_size)  # BatchManager class from data_utils.py
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)  # build the configuration written to config_file
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # print the generated config to the log and store it in the log directory

    # how training iterates and how the loss values are produced
    # limit GPU memory
    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # English data: steps_per_epoch = 703, the number of training batches to process; steps_per_epoch * 20 = total number of sentences
    # Chinese data: steps_per_epoch = 1044
    steps_per_epoch = train_manager.len_data
    # start training the model
    with tf.compat.v1.Session(
            config=tf_config
    ) as sess:  # the config is activated via tf.Session; session.run is executed inside create_model from utils.py
        # build the model graph, including the layer parameters and calls defined in __init__; a num_chars * 100 word-embedding weight matrix is created first
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # create the model with the Model class, passing in the training dictionaries; load_word2vec comes from data_utils
        logger.info("start training")
        loss = []
        # this loop trains the model for 100 epochs, repeatedly feeding the train and dev sets to tune the parameters towards the best F1; range(100) is tunable
        for i in range(100):
            # start training on the data set
            # the model first builds the feed_dict per batch (groups of 20), containing each sentence's word ids, word features and tag ids
            # the model layers are then executed in order, starting from the embedding layer
            # embeddings are produced per batch from each sentence's char ids, features and the predefined sentence-dimension flag, yielding 120-dimensional vectors covering all training data
            # dropout randomly removes part of the embeddings to prevent overfitting before they are fed to the CNN for convolution
            for batch in train_manager.iter_batch(
                    shuffle=True):  # iter_batch: the iter_batch method in data_utils.py
                # batch yields the sentences in random order as the arrays described above
                # a batch consists of 4 large lists:
                # 1. all sentences in random order, e.g. ['Fairview', ',', 'Texas', ',', '$', '1.82', 'million', 'deal', 'Baa1', '-'],
                # 2. the position of each word in the dictionary,
                # 3. per sentence, a list of word-length (segmentation) features,
                # 4. per sentence, the position of each tag in the tag dictionary
                step, batch_loss = model.run_step(sess, True, batch)
                # loss:60.648315 76.53908 54.006336 108.96472
                # step increases from 1; the current loss is logged every 100 steps
                loss.append(batch_loss)
                # the loss is logged every steps_check batches (steps_check = 100)
                if step % FLAGS.steps_check == 0:  # log the loss every 100 iterations
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
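
The comments in this example note that BatchManager's main job is to give every sentence in a batch the same length. A minimal sketch of that padding step (simplified; the project's class presumably also sorts sentences by length and keeps the char, segment and tag lists aligned):

def pad_batch(batch, pad_id=0):
    # Pad every sentence (a list of ids) to the longest length in the batch.
    max_len = max(len(sentence) for sentence in batch)
    return [sentence + [pad_id] * (max_len - len(sentence)) for sentence in batch]

# pad_batch([[5, 2, 9], [7]]) -> [[5, 2, 9], [7, 0, 0]]
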
Пример #39
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    # "sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'I-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'I-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'I-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'I-TYPE']]"
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # print("train_sentences[0]:{}".format(train_sentences[0]))
    # "train_sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'E-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'E-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'E-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'E-TYPE']]"
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    # print("map_file:{}".format(FLAGS.map_file))
    # print("pre_emb:{}".format(FLAGS.pre_emb))
    # map_file: maps.pkl
    # pre_emb: False
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(
                train_sentences, FLAGS.lower)[0]  # character -> count dict
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))
    # '3027 / 0 / 361 sentences in train / dev / test.'

    # print("batch_size:{}".format(FLAGS.batch_size))
    # batch_size: 20
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing the log and model if they do not exist
    make_path(FLAGS)
    # print("config_file:{}".format(FLAGS.config_file))
    # config_file: config_file
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    # log_path:log/train.log
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    # print("steps_per_epoch:{}".format(steps_per_epoch))
    # steps_per_epoch: 152
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                # print("steps_check:{}".format(FLAGS.steps_check))
                # steps_check: 100
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            export(model, sess, "ner", "export_model")
Пример #40
0
    def log_test_pr_curve(self,
                          epoch,
                          entity_ids_test,
                          labels_test,
                          probs_test,
                          negative_label_idx,
                          label_encoder=None):
        bag_ids = [e1 + '_' + e2 for e1, e2 in entity_ids_test]

        bag_to_mention_mapping = defaultdict(set)
        for idx, bag_id in enumerate(bag_ids):
            bag_to_mention_mapping[bag_id].add(idx)

        num_relation_facts = 0
        Prediction = namedtuple('Prediction', [
            'score', 'is_correct', 'bag_id', 'predicted_label_idx',
            'bag_label_idxs', 'predicted_label', 'bag_labels', 'bag_size'
        ])
        predictions = []
        for bag_id, mention_idxs in bag_to_mention_mapping.items():
            # Aggregate and count the labels per bag without the negative label
            bag_labels = set(labels_test[list(mention_idxs)])
            bag_labels.discard(negative_label_idx)
            num_relation_facts += len(bag_labels)
            bag_size = len(mention_idxs)

            # Use max to aggregate the mention probabilities in the bag
            mention_probs = probs_test[list(mention_idxs)]
            bag_probs = np.max(mention_probs, axis=0)

            # For each bag and positive relation create a prediction
            for relation_idx, relation_prob in enumerate(bag_probs):
                if relation_idx == negative_label_idx:
                    continue

                if len(bag_labels) == 0:
                    bag_labels_str = 'NA'
                    bag_label_idxs_str = negative_label_idx
                else:
                    if label_encoder:
                        decoded_bag_labels = [
                            label_encoder.get_item_for_index(idx)
                            for idx in bag_labels
                        ]
                        bag_labels_str = ', '.join(decoded_bag_labels)
                    else:
                        bag_labels_str = ''

                    bag_label_idxs_str = ', '.join(
                        [str(lbl) for lbl in bag_labels])

                if label_encoder:
                    predicted_label_str = label_encoder.get_item_for_index(
                        relation_idx)
                else:
                    predicted_label_str = ""
                predicted_label_idx_str = str(relation_idx)

                is_correct = relation_idx in bag_labels
                predictions.append(
                    Prediction(score=relation_prob,
                               is_correct=is_correct,
                               bag_id=bag_id,
                               predicted_label_idx=predicted_label_idx_str,
                               bag_label_idxs=bag_label_idxs_str,
                               predicted_label=predicted_label_str,
                               bag_labels=bag_labels_str,
                               bag_size=bag_size))

        predictions = sorted(predictions,
                             key=attrgetter('score'),
                             reverse=True)

        correct = 0
        precision_values = []
        recall_values = []
        for idx, prediction in enumerate(predictions):
            if prediction.is_correct:
                correct += 1
            precision_values.append(correct / (idx + 1))
            recall_values.append(correct / num_relation_facts)

        def precision_at(n):
            return (
                sum([prediction.is_correct
                     for prediction in predictions[:n]]) / n) * 100

        pr_metrics = {
            'P/R AUC': auc(x=recall_values, y=precision_values),
            'Precision@100': precision_at(100),
            'Precision@200': precision_at(200),
            'Precision@500': precision_at(500)
        }

        predictions_dir = join(self._base_path, 'predictions', 'test')
        pr_metrics_file_path = join(predictions_dir,
                                    'pr_metrics_epoch_{}.jsonl'.format(epoch))
        with open(make_path(pr_metrics_file_path), 'w',
                  encoding='utf-8') as pr_metrics_file:
            pr_metrics_file.write(json.dumps(pr_metrics) + '\n')

        pr_predictions_file = join(
            predictions_dir, 'predictions_pr_curve_epoch_{}.tsv'.format(epoch))
        with open(make_path(pr_predictions_file), 'w') as pr_pred_file:
            tuple_attrs = [
                'score', 'is_correct', 'bag_id', 'predicted_label_idx',
                'bag_label_idxs', 'predicted_label', 'bag_labels', 'bag_size'
            ]
            pr_pred_file.write("\t".join(tuple_attrs) + "\n")
            for prediction in predictions:
                pred_values = attrgetter(*tuple_attrs)(prediction)
                pred_values = [str(val) for val in pred_values]
                pr_pred_file.write("\t".join(pred_values) + "\n")

        np.save(join(predictions_dir, 'pr_curve_y_epoch_{}.npy'.format(epoch)),
                precision_values)
        np.save(join(predictions_dir, 'pr_curve_x_epoch_{}.npy'.format(epoch)),
                recall_values)
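
The precision/recall bookkeeping above can be made concrete with a toy run: five predictions already sorted by score, three of which are correct, against num_relation_facts = 3 (made-up numbers, and assuming auc here is sklearn.metrics.auc, as the keyword arguments suggest):

from sklearn.metrics import auc

is_correct = [True, True, False, True, False]  # sorted by score, descending
num_relation_facts = 3
correct, precision_values, recall_values = 0, [], []
for idx, hit in enumerate(is_correct):
    correct += hit
    precision_values.append(correct / (idx + 1))
    recall_values.append(correct / num_relation_facts)
print(precision_values)  # [1.0, 1.0, 0.67, 0.75, 0.6] (rounded)
print(recall_values)     # [0.33, 0.67, 0.67, 1.0, 1.0] (rounded)
print(auc(x=recall_values, y=precision_values))  # area under this P/R curve
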
Пример #41
0
    # declare loss function and the optimizer
    criterion = nn.CrossEntropyLoss(reduce=False) # TODO check loss functions
    model_opt = OpenAIAdam(model.parameters(), lr=lr, schedule=lr_schedule,
                            warmup=lr_warmup, t_total=n_updates_total, b1=b1,
                            b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                            max_grad_norm=max_grad_norm)
    compute_loss_fct = LossCompute(criterion, lm_coef, model_opt)

    # this part will be changed for multigpu support
    model.to(device)
    lm_head.to(device)

    n_updates = 0
    n_epochs = 0

    make_path(os.path.join(save_dir, desc, 'temp.txt'))
    # repeat for n_iter epochs
    while n_epochs < n_iter:
        iters = 0
        # split to train and valid
        _trX, _trV = get_train_valid(tr_data)
        start_ind = 0
        end_ind = start_ind + n_batch_size
        while True:
            cur_batch = _trX[start_ind:end_ind]
            if not cur_batch:  # stop once the training split is exhausted
                break
            print("epoch ", n_epochs, "iter ", iters)
            trX, trM = transform_code(cur_batch)
            # forward pass and backprop
            run_epoch(trX, trM)
            iters += 1
            # advance to the next batch window
            start_ind = end_ind
            end_ind = start_ind + n_batch_size
        n_epochs += 1
Пример #42
0
    def __init__(self, path, **kwargs):
        if 'time' not in kwargs:
            kwargs['time'] = time.time()
        self.f_log = open(make_path(path), 'w')
        self.f_log.write(json.dumps(kwargs) + '\n')
Пример #43
0
def save(path):
    save_path = saver.save(sess, make_path(path))
    print('save the best model!')
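
All of the snippets above rely on some make_path helper, but its behaviour clearly differs between projects: several pass a FLAGS object and create log/result/ckpt directories, others pass a file path and expect the path back. The following is therefore only a hedged sketch of the common file-path variant: ensure the parent directory exists so a subsequent open() or save() does not fail, then return the path unchanged.

import os

def make_path(path):
    # Sketch of the file-path variant: create the parent directory if needed
    # and return the original path so it can be used inline, e.g.
    # joblib.dump(obj, make_path("checkpoints/model.pkl")).
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    return path
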