Example #1
  def parse(self, input_files, output_dir=None, output_file=None):
    """"""
    
    if not isinstance(input_files, (tuple, list)):
      input_files = [input_files]
    if len(input_files) > 1 and output_file is not None:
      raise ValueError('Cannot provide a value for --output_file when parsing multiple files')
    self.add_file_vocabs(input_files)
    
    start_time = time.time()
    for input_file in input_files:
      with tf.Graph().as_default():
        config_proto = tf.ConfigProto()
        if self.per_process_gpu_memory_fraction == -1:
          config_proto.gpu_options.allow_growth = True
        else:
          config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
        with tf.Session(config=config_proto) as sess:
          # load the model and prep the parse set
          self.setup_vocabs()
          trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
          with tf.variable_scope(self.name.title()):
            train_tensors = trainset()
          train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]

          saver = tf.train.Saver(self.save_vars, max_to_keep=1)
          for var in self.non_save_vars:
            sess.run(var.initializer)
          saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
          
          # Iterate through files and batches
          parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
          with tf.variable_scope(self.name.title(), reuse=True):
            parse_tensors = parseset(moving_params=self.optimizer)
          parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]
          
          input_dir, input_file = os.path.split(input_file)
          if output_dir is None and output_file is None:
            output_dir = self.save_dir
          if output_dir == input_dir and output_file is None:
            output_path = os.path.join(input_dir, 'parsed-'+input_file)
          elif output_file is None:
            output_path = os.path.join(output_dir, input_file)
          else:
            output_path = output_file
          
          probs = []
          sents = []
          for feed_dict, tokens in parseset.iterbatches(shuffle=False):
            probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
            sents.append(tokens)
          parseset.write_probs(sents, output_path, probs)
      del trainset
      del parseset
      print('Finished one')
    if self.verbose:
      print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time()-start_time), 'bright_green'))
    return
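
The output-path branching in this method can be isolated into a small standalone helper. The sketch below only mirrors the same precedence (an explicit output_file wins; otherwise the file keeps its name under output_dir, which defaults to save_dir, and gets a 'parsed-' prefix if that directory is the input's own directory); the function name is illustrative and not part of the library.

import os

def resolve_output_path(input_path, save_dir, output_dir=None, output_file=None):
  # Same precedence as parse(): output_file > output_dir > save_dir default.
  input_dir, input_name = os.path.split(input_path)
  if output_dir is None and output_file is None:
    output_dir = save_dir
  if output_dir == input_dir and output_file is None:
    return os.path.join(input_dir, 'parsed-' + input_name)
  elif output_file is None:
    return os.path.join(output_dir, input_name)
  return output_file

print(resolve_output_path('data/dev.conllu', save_dir='saves/model'))
# saves/model/dev.conllu
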
Example #2
    def print_accuracy(self, accumulators, time, prefix='Train'):
        """"""

        acc_dict = self.process_accumulators(accumulators, time=time)
        strings = []
        strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
        strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan'))
        strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
        strings.append(
            color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec',
                          'bright_magenta'))
        string = ctext('{0}  ', 'bold') + ' | '.join(strings)
        print(string.format(prefix, **acc_dict))
        return
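
A rough, uncolored sketch of what the assembled pattern prints, with plain strings standing in for the color_pattern/ctext helpers (their exact behavior is an assumption here); the accumulator values are made up:

# Plain-text stand-in for print_accuracy(): compose the pattern, then fill it
# from the accumulator dict.
acc_dict = {'Loss': 1.234, 'TS': 91.5, 'SS': 78.2, 'Seq_rate': 142.7}
strings = ['Loss: {Loss:7.3f}', 'TS: {TS:5.2f}%', 'SS: {SS:5.2f}%',
           'Speed: {Seq_rate:6.1f} seqs/sec']
string = '{0}  ' + ' | '.join(strings)
print(string.format('Train', **acc_dict))
# Train  Loss:   1.234 | TS: 91.50% | SS: 78.20% | Speed:  142.7 seqs/sec
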
Example #3
 def from_dataset(cls, dataset, *args, **kwargs):
   """"""
   
   multibucket = cls.from_configurable(dataset, *args, **kwargs)
   indices = []
   for multibucket_ in dataset:
     indices.append(multibucket_.indices)
   for i in xrange(1, len(indices)):
     pass
     # assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
   multibucket._indices = np.array(multibucket_.indices)
   buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))]
   multibucket._buckets = buckets
   if dataset.verbose:
     for bucket in multibucket:
       print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
   return multibucket
Example #4
 def from_dataset(cls, dataset, *args, **kwargs):
   """"""
   
   multibucket = cls.from_configurable(dataset, *args, **kwargs)
   indices = []
   for multibucket_ in dataset:
     indices.append(multibucket_.indices)
    # Here the batches are built by assigning each sentence its batch id and its relative position within that batch
    # (sentence 1 in 4:5 === sentence 1 of the corpus placed in batch 4 at position 5)
   
   #for i in xrange(1, len(indices)):
   #  assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
   multibucket._indices = np.array(multibucket_.indices)
   buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))]
   multibucket._buckets = buckets
   if dataset.verbose:
     for bucket in multibucket:
       print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
   return multibucket
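
The translated comment above describes the indices as (bucket id, position within bucket) pairs, one per sentence. Below is a self-contained toy sketch of that mapping (the bucket edges and sentence lengths are made up; this is not the library's Bucket implementation):

import numpy as np

# Group sentences into length buckets and record, for each sentence,
# which bucket it went to and at which slot.
sentence_lengths = [3, 7, 4, 12, 6]
bucket_edges = [5, 10, 20]          # bucket 0: len <= 5, bucket 1: <= 10, bucket 2: <= 20
fill = [0, 0, 0]                    # next free slot in each bucket
indices = []
for length in sentence_lengths:
  bucket_id = next(i for i, edge in enumerate(bucket_edges) if length <= edge)
  indices.append((bucket_id, fill[bucket_id]))
  fill[bucket_id] += 1
print(np.array(indices))
# [[0 0]    sentence 0 -> bucket 0, slot 0
#  [1 0]    sentence 1 -> bucket 1, slot 0
#  [0 1]    sentence 2 -> bucket 0, slot 1
#  [2 0]    sentence 3 -> bucket 2, slot 0
#  [1 1]]   sentence 4 -> bucket 1, slot 1
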
Example #5
  def train(self, load=False):
    """"""
    
    # prep the configurables
    self.add_file_vocabs(self.parse_files)
    self.setup_vocabs()
    trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
    with tf.variable_scope(self.name.title()):
      train_tensors = trainset()
    train = self.optimizer(tf.losses.get_total_loss())
    train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]
    saver = tf.train.Saver(self.save_vars, max_to_keep=1)
    validset = Parseset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
    with tf.variable_scope(self.name.title(), reuse=True):
      valid_tensors = validset(moving_params=self.optimizer)
    valid_outputs = [valid_tensors[train_key] for train_key in validset.train_keys]
    valid_outputs2 = [valid_tensors[valid_key] for valid_key in validset.valid_keys]
    current_acc = 0
    best_acc = 0
    n_iters_since_improvement = 0
    n_iters_in_epoch = 0
    
    # calling these properties is inefficient so we save them in separate variables
    min_train_iters = self.min_train_iters
    max_train_iters = self.max_train_iters
    validate_every = self.validate_every
    save_every = self.save_every
    verbose = self.verbose
    quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement
    
    # load or prep the history
    if load:
      self.history = pkl.load(open(os.path.join(self.save_dir, 'history.pkl')))
    else:
      self.history = {'train': defaultdict(list), 'valid': defaultdict(list)}
    
    # start up the session
    config_proto = tf.ConfigProto()
    if self.per_process_gpu_memory_fraction == -1:
      config_proto.gpu_options.allow_growth = True
    else:
      config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
    with tf.Session(config=config_proto) as sess:
      sess.run(tf.global_variables_initializer())
      if load:
        saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
      total_train_iters = sess.run(self.global_step)
      train_accumulators = np.zeros(len(train_outputs))
      train_time = 0
      # training loop
      while total_train_iters < max_train_iters:
        for feed_dict in trainset.iterbatches():
          start_time = time.time()
          batch_values = sess.run(train_outputs + [train], feed_dict=feed_dict)[:-1]
          batch_time = time.time() - start_time
          # update accumulators
          total_train_iters += 1
          n_iters_since_improvement += 1
          train_accumulators += batch_values
          train_time += batch_time
          # possibly validate
          if total_train_iters == 1 or (total_train_iters % validate_every == 0):
            valid_accumulators = np.zeros(len(train_outputs))
            valid_time = 0
            with codecs.open(os.path.join(self.save_dir, 'sanity_check'), 'w', encoding='utf-8', errors='ignore') as f:
              for feed_dict, sents in validset.iterbatches(return_check=True):
                start_time = time.time()
                batch_values = sess.run(valid_outputs+valid_outputs2, feed_dict=feed_dict)
                batch_time = time.time() - start_time
                # update accumulators
                valid_accumulators += batch_values[:len(valid_outputs)]
                valid_preds = batch_values[len(valid_outputs):]
                valid_time += batch_time
                validset.check(valid_preds, sents, f)
            # update history
            trainset.update_history(self.history['train'], train_accumulators)
            current_acc = validset.update_history(self.history['valid'], valid_accumulators)
            # print
            if verbose:
              print(ctext('{0:6d}'.format(int(total_train_iters)), 'bold')+')') 
              trainset.print_accuracy(train_accumulators, train_time)
              validset.print_accuracy(valid_accumulators, valid_time)
            train_accumulators = np.zeros(len(train_outputs))
            train_time = 0
            if current_acc > best_acc:
              if verbose:
                print(ctext('Saving model...', 'bright_yellow'))
              best_acc = current_acc
              n_iters_since_improvement = 0
              saver.save(sess, os.path.join(self.save_dir, self.name.lower()),
                         #global_step=self.global_epoch,
                         write_meta_graph=False)
              with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
                pkl.dump(dict(self.history), f)
            elif n_iters_since_improvement >= quit_after_n_iters_without_improvement and total_train_iters > min_train_iters:
              break
        else:
          # We've completed one epoch
          if total_train_iters <= min_train_iters:
            saver.save(sess, os.path.join(self.save_dir, self.name.lower()),
                       #global_step=self.global_epoch,
                       write_meta_graph=False)
            with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
              pkl.dump(dict(self.history), f)
          sess.run(self.global_epoch.assign_add(1.))
          continue
        break
      # Now parse the training and testing files
      input_files = self.train_files + self.parse_files
      saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
      for input_file in input_files:
        parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title(), reuse=True):
          parse_tensors = parseset(moving_params=self.optimizer)
        parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

        input_dir, input_file = os.path.split(input_file)
        output_dir = self.save_dir
        output_file = input_file
        
        start_time = time.time()
        probs = []
        sents = []
        for feed_dict, tokens in parseset.iterbatches(shuffle=False):
          probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
          sents.append(tokens)
        parseset.write_probs(sents, os.path.join(output_dir, output_file), probs)
    if self.verbose:
      print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time()-start_time), 'bright_green'))
    return
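
The early-stopping logic in this training loop (save on a new best validation accuracy, stop once quit_after_n_iters_without_improvement iterations pass without improvement, but never before min_train_iters) can be sketched on its own. The accuracy values below are made up, and validation is checked on every iteration to keep the sketch short:

# Illustrative patience-based stopping with dummy validation accuracies.
min_train_iters = 3
quit_after_n_iters_without_improvement = 2
dummy_valid_accs = [0.70, 0.75, 0.74, 0.73, 0.72, 0.80]

best_acc = 0.0
n_iters_since_improvement = 0
for total_train_iters, current_acc in enumerate(dummy_valid_accs, start=1):
  n_iters_since_improvement += 1
  if current_acc > best_acc:
    best_acc = current_acc
    n_iters_since_improvement = 0          # the real loop saves the checkpoint here
  elif (n_iters_since_improvement >= quit_after_n_iters_without_improvement
        and total_train_iters > min_train_iters):
    print('stopping at iteration', total_train_iters)
    break
print('best accuracy:', best_acc)
# stopping at iteration 4
# best accuracy: 0.75
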
Example #6
    def train(self, load=False):
        """"""
        print('TRAIN')

        # prep the configurables
        self.add_file_vocabs(self.parse_files)

        #train
        trainset = Trainset.from_configurable(self,
                                              self.vocabs,
                                              nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title()):
            train_tensors = trainset()
        train = self.optimizer(tf.losses.get_total_loss())
        train_outputs = [
            train_tensors[train_key] for train_key in trainset.train_keys
        ]
        saver = tf.train.Saver(self.save_vars, max_to_keep=1)

        #valid
        validset = Parseset.from_configurable(self,
                                              self.vocabs,
                                              nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title(), reuse=True):
            valid_tensors = validset(moving_params=self.optimizer)
        valid_outputs = [
            valid_tensors[train_key] for train_key in validset.train_keys
        ]
        valid_outputs2 = [
            valid_tensors[valid_key] for valid_key in validset.valid_keys
        ]

        #init
        current_acc = 0
        best_acc = 0
        n_iters_since_improvement = 0
        n_iters_in_epoch = 0
        # calling these properties is inefficient so we save them in separate variables
        min_train_iters = self.min_train_iters
        max_train_time = self.max_train_time
        min_percent_per_hour = self.min_percent_per_hour
        validate_every = self.validate_every
        save_every = self.save_every
        verbose = self.verbose
        quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement

        #if no load file, don't load
        if not op.isfile(os.path.join(self.save_dir, 'history.pkl')):
            load = False
        # load or prep the history
        if load:
            self.history = pkl.load(
                open(os.path.join(self.save_dir, 'history.pkl')))
        else:
            self.history = {
                'train': defaultdict(list),
                'valid': defaultdict(list)
            }

        # stopping criterion: rolling windows over recent validation scores
        range_scores = 20
        last_scores = np.repeat(0.0, range_scores)
        last_train_time = np.repeat(1.0, range_scores)

        # start up the session
        last_training_time = time.time()

        # session / processor configuration
        config_proto = tf.ConfigProto()
        # (allow_soft_placement=True / log_device_placement=True could be passed here)

        #config_proto.gpu_options.per_process_gpu_memory_fraction = 0.5
        #config_proto.gpu_options.allow_growth = True

        with tf.Session(
                config=config_proto) as sess:  # initialize the session
            #train_writer = tf.summary.FileWriter( './logs/1/train ', sess.graph)

            sess.run(tf.global_variables_initializer())
            if load:  # restore a previous session
                saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            total_train_iters = sess.run(self.global_step)
            train_accumulators = np.zeros(len(train_outputs))
            train_time = 0
            # training loop
            while sess.run(self.global_time) < max_train_time:
                for feed_dict in trainset.iterbatches():
                    sys.stdout.write("-")
                    sys.stdout.flush()

                    start_time = time.time()
                    #print(sess.run(train_outputs + [train], feed_dict=feed_dict)[:-1])

                    batch_values = sess.run(train_outputs + [train],
                                            feed_dict=feed_dict)[:-1]
                    batch_time = time.time() - start_time

                    #merge = tf.summary.merge_all()
                    #summary = sess.run(merge, feed_dict=feed_dict)
                    #train_writer.add_summary(summary, total_train_iters)

                    # update accumulators
                    total_train_iters += 1
                    n_iters_since_improvement += 1
                    train_accumulators += batch_values
                    train_time += batch_time

                    sess.run(
                        self.global_time.assign_add(
                            (time.time() - last_training_time) / 60.0))
                    last_training_time = time.time()

                    # possibly validate
                    if total_train_iters == 1 or (total_train_iters %
                                                  validate_every == 0):
                        valid_accumulators = np.zeros(len(train_outputs))
                        valid_time = 0
                        print(
                            ctext('\nStarting sanity check...',
                                  'bright_yellow'))
                        with codecs.open(os.path.join(self.save_dir,
                                                      'sanity_check'),
                                         'w',
                                         encoding='utf-8',
                                         errors='ignore') as f:
                            for feed_dict, sents in validset.iterbatches(
                                    return_check=True):
                                start_time = time.time()
                                batch_values = sess.run(valid_outputs +
                                                        valid_outputs2,
                                                        feed_dict=feed_dict)
                                batch_time = time.time() - start_time
                                # update accumulators
                                valid_accumulators += batch_values[:len(
                                    valid_outputs)]
                                valid_preds = batch_values[len(valid_outputs):]
                                valid_time += batch_time

                                #validset.check(valid_preds, sents, f)
                        print(ctext('End of sanity check', 'bright_yellow'))
                        # update history
                        trainset.update_history(self.history['train'],
                                                train_accumulators)
                        current_acc = validset.update_history(
                            self.history['valid'], valid_accumulators)
                        # print
                        if verbose:
                            print(
                                ctext('{0:6d}'.format(int(total_train_iters)),
                                      'bold') + ')')
                            trainset.print_accuracy(train_accumulators,
                                                    train_time)
                            validset.print_accuracy(valid_accumulators,
                                                    valid_time)
                            print("Train since: ", sess.run(self.global_time),
                                  "min")
                        train_accumulators = np.zeros(len(train_outputs))

                        # evaluate the stopping criterion
                        last_scores[range_scores - 1] = current_acc
                        last_train_time[range_scores - 1] = train_time

                        avg_percent_per_sec = np.diff(
                            last_scores) / last_train_time[1:]
                        avg_percent_per_sec = np.mean(avg_percent_per_sec)
                        avg_percent_per_hour = avg_percent_per_sec * 3600
                        print(int(avg_percent_per_hour), "% per hour")

                        stop_criteria = avg_percent_per_hour < min_percent_per_hour and total_train_iters > min_train_iters
                        stop_criteria = stop_criteria or sess.run(
                            self.global_time) > max_train_time
                        #logs
                        print("avg_percent_per_hour ", avg_percent_per_hour)
                        print("min_percent_per_hour ", min_percent_per_hour)
                        print("total_train_iters ", total_train_iters)
                        print("min_train_iters ", min_train_iters)
                        print("global_time ", sess.run(self.global_time))
                        print("max_train_time ", max_train_time)
                        print("avg_percent_per_hour < min_percent_per_hour ",
                              avg_percent_per_hour < min_percent_per_hour)
                        print("total_train_iters > min_train_iters ",
                              total_train_iters > min_train_iters)
                        print("sess.run(self.global_time) > max_train_time ",
                              sess.run(self.global_time) > max_train_time)
                        print("stop_criteria ", stop_criteria)
                        print("best acc ", best_acc)

                        #Best model
                        if current_acc > best_acc:
                            if verbose:
                                print(ctext('Saving model...',
                                            'bright_yellow'))
                            best_acc = current_acc
                            n_iters_since_improvement = 0
                            saver.save(
                                sess,
                                os.path.join(self.save_dir, self.name.lower()),
                                #global_step=self.global_epoch,
                                write_meta_graph=False)
                            with open(
                                    os.path.join(self.save_dir, 'history.pkl'),
                                    'w') as f:
                                pkl.dump(dict(self.history), f)
                            if verbose:
                                print(ctext('Saved!', 'bright_yellow'))
                        #Stopping criteria
                        if stop_criteria:
                            break

                        # shift the stopping-criterion windows
                        for i in range(1, range_scores):
                            last_scores[i - 1] = last_scores[i]
                            last_train_time[i - 1] = last_train_time[i]

                        train_time = 0
                else:
                    # We've completed one epoch
                    if total_train_iters <= min_train_iters:
                        saver.save(
                            sess,
                            os.path.join(self.save_dir, self.name.lower()),
                            #global_step=self.global_epoch,
                            write_meta_graph=False)
                        with open(os.path.join(self.save_dir, 'history.pkl'),
                                  'w') as f:
                            pkl.dump(dict(self.history), f)
                    sess.run(self.global_epoch.assign_add(1.))
                    continue
                break
            # Now parse the training and testing files
            input_files = self.train_files + self.parse_files
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            for input_file in input_files:
                parseset = Parseset.from_configurable(self,
                                                      self.vocabs,
                                                      parse_files=input_file,
                                                      nlp_model=self.nlp_model)
                with tf.variable_scope(self.name.title(), reuse=True):
                    parse_tensors = parseset(moving_params=self.optimizer)
                parse_outputs = [
                    parse_tensors[parse_key]
                    for parse_key in parseset.parse_keys
                ]

                input_dir, input_file = os.path.split(input_file)
                output_dir = self.save_dir
                output_file = input_file

                start_time = time.time()
                probs = []
                sents = []
                for feed_dict, tokens in parseset.iterbatches(shuffle=False):
                    probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
                    sents.append(tokens)
                parseset.write_probs(sents,
                                     os.path.join(output_dir, output_file),
                                     probs)
        if self.verbose:
            print(
                ctext(
                    'Parsing {0} file(s) took {1} seconds'.format(
                        len(input_files),
                        time.time() - start_time), 'bright_green'))
        return
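
Example #6 replaces the pure patience criterion with wall-clock limits: training stops when the rolling rate of validation improvement drops below min_percent_per_hour or when global_time exceeds max_train_time. Below is a standalone sketch of the percent-per-hour computation over the last_scores / last_train_time windows (toy numbers, not real training output):

import numpy as np

# Rolling windows: validation scores and the training time (seconds) spent
# between consecutive validations.
last_scores = np.array([80.0, 81.0, 81.5, 81.7])
last_train_time = np.array([1.0, 600.0, 600.0, 600.0])

# Improvement per second between consecutive validations, scaled to an hour.
avg_percent_per_sec = np.mean(np.diff(last_scores) / last_train_time[1:])
avg_percent_per_hour = avg_percent_per_sec * 3600
print(round(avg_percent_per_hour, 2), '% per hour')    # 3.4 % per hour

min_percent_per_hour = 0.5
stop_criteria = avg_percent_per_hour < min_percent_per_hour
print('stop:', stop_criteria)                           # stop: False
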