예제 #1
0
def broadcast_test(comm, value, kind, root):
    """Broadcast ``value`` from ``root`` and check that this rank received it.

    Args:
        comm: Communicator object. Only its ``rank`` attribute is read here;
            the object itself is handed to ``mpi.broadcast``.
        value: Object to broadcast. The received object is compared against
            this local copy, so every rank has to be called with an equal
            ``value`` for the check to pass.
        kind: Description of ``value`` interpolated into the progress message.
        root: Rank the broadcast originates from.

    Raises:
        AssertionError: If the object returned by ``mpi.broadcast`` does not
            compare equal to ``value``.
    """
    import sys

    # Rank 0 is the only process that reports progress.
    reporter = comm.rank == 0
    if reporter:
        # No trailing newline so that "OK." ends up on the same line. Flush
        # so the message is visible even if the broadcast below blocks.
        sys.stdout.write("Broadcasting %s from root %d... " % (kind, root))
        sys.stdout.flush()

    got_value = mpi.broadcast(comm, value, root)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not (got_value == value):
        raise AssertionError(
            "broadcast of %s from root %d returned %r, expected %r"
            % (kind, root, got_value, value))

    if reporter:
        sys.stdout.write("OK.\n")
    return
예제 #2
0
def broadcast_test(comm, value, kind, root):
    """Broadcast ``value`` from ``root`` and check that this rank received it.

    Args:
        comm: Communicator object. Only its ``rank`` attribute is read here;
            the object itself is handed to ``mpi.broadcast``.
        value: Object to broadcast. The received object is compared against
            this local copy, so every rank has to be called with an equal
            ``value`` for the check to pass.
        kind: Description of ``value`` interpolated into the progress message.
        root: Rank the broadcast originates from; also the only rank that
            prints progress.

    Raises:
        AssertionError: If the object returned by ``mpi.broadcast`` does not
            compare equal to ``value``.
    """
    import sys

    is_root = comm.rank == root
    if is_root:
        # sys.stdout.write instead of the Python-2-only print statement:
        # same "<message> OK." single-line output, valid on Python 2 and 3.
        # Flush so the message is visible even if the broadcast blocks.
        sys.stdout.write("Broadcasting %s from root %d... " % (kind, root))
        sys.stdout.flush()

    got_value = mpi.broadcast(comm, value, root)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not (got_value == value):
        raise AssertionError(
            "broadcast of %s from root %d returned %r, expected %r"
            % (kind, root, got_value, value))

    if is_root:
        sys.stdout.write("OK.\n")
    return
예제 #3
0
def do_training(indices, training_blob, heldout_blob, weights, weights_out, debiasing_weights):
    """
    Driver for parallel perceptron training.

    Runs ``FLAGS.maxepochs`` epochs. In each epoch: rank 0 optionally shuffles
    the example order, the order is broadcast to every process, one epoch of
    training is run through ``perceptron_parallel``, rank 0 pickles the
    returned weights to ``weights_out``, and, when ``FLAGS.decodeheldout`` is
    set, ``decode_parallel`` is run on the held-out data with those weights.

    Relies on the module-level names ``FLAGS``, ``myRank`` and ``indices_dev``.

    Args:
        indices: List of training instance ids. Shuffled in place on rank 0
            when ``FLAGS.shuffle`` is set.
        training_blob: Training data, passed through to ``perceptron_parallel``.
        heldout_blob: Held-out data, passed through to ``decode_parallel``.
        weights: Initial weights, passed to ``perceptron_parallel`` each epoch.
        weights_out: Open, writable file-like object. Rank 0 appends one pickle
            per epoch and closes it before this function returns or raises.
        debiasing_weights: Weight vector whose feature names are the only
            features allowed when ``FLAGS.debiasing`` is set.
    """
    # try/finally so that rank 0 closes weights_out even if an epoch fails.
    try:
        # Under de-biasing mode, we only allow features present in a given
        # initial weight vector. These are features that have been "selected"
        # under a previously run regularized training scheme.
        valid_feature_names = None
        if FLAGS.debiasing:
            valid_feature_names = getFeatureNames(debiasing_weights)

        for epoch in range(FLAGS.maxepochs):
            # Randomize order of examples; broadcast this randomized order to
            # all processes. The particular subset any perceptron process gets
            # for this epoch is dependent upon this randomized ordering.
            if myRank == 0 and FLAGS.shuffle:
                random.shuffle(indices)
            indices = mpi.broadcast(value=indices, root=0)

            ##################################################
            # SEARCH: Find 1-best under current model
            ##################################################
            # Run one epoch over training data
            io_helper.write_master("===EPOCH %d TRAINING===\n" %(epoch))
            newWeights_avg = perceptron_parallel(epoch, indices, training_blob, weights,
                                                 valid_feature_names)

            ####################################
            # Dump weights for this iteration
            ####################################
            if myRank == 0:
                cPickle.dump(newWeights_avg, weights_out, protocol=cPickle.HIGHEST_PROTOCOL)
                # Flush so this epoch's weights reach the file before the
                # next (possibly long) epoch starts.
                weights_out.flush()

            ##################################################
            # Decode held-out data with the newly learned weights
            ##################################################
            if FLAGS.decodeheldout:
                io_helper.write_master("===EPOCH %d DECODE HELDOUT===\n" %(epoch))
                decode_parallel(newWeights_avg, indices_dev, heldout_blob, "dev")
    finally:
        if myRank == 0:
            weights_out.close()
예제 #4
0
파일: choa.py 프로젝트: 460130107/choa
def do_training(indices, training_blob, heldout_blob, weights, weights_out, debiasing_weights):
  """
  Driver for parallel perceptron training.

  Runs ``FLAGS.maxepochs`` epochs. In each epoch: rank 0 optionally shuffles
  the example order, the order is broadcast to every process, one epoch of
  training is run through ``perceptron_parallel``, rank 0 pickles the
  returned weights to ``weights_out``, and, when ``FLAGS.decodeheldout`` is
  set, ``decode_parallel`` is run on the held-out data with those weights.

  Relies on the module-level names ``FLAGS``, ``myRank`` and ``indices_dev``.

  Args:
    indices: List of training instance ids. Shuffled in place on rank 0
      when ``FLAGS.shuffle`` is set.
    training_blob: Training data, passed through to ``perceptron_parallel``.
    heldout_blob: Held-out data, passed through to ``decode_parallel``.
    weights: Initial weights, passed to ``perceptron_parallel`` each epoch.
    weights_out: Open, writable file-like object. Rank 0 appends one pickle
      per epoch and closes it before this function returns or raises.
    debiasing_weights: Weight vector whose feature names are the only
      features allowed when ``FLAGS.debiasing`` is set.
  """
  # try/finally so that rank 0 closes weights_out even if an epoch fails.
  try:
    # Under de-biasing mode, we only allow features present in a given
    # initial weight vector. These are features that have been "selected"
    # under a previously run regularized training scheme.
    valid_feature_names = None
    if FLAGS.debiasing:
      valid_feature_names = getFeatureNames(debiasing_weights)

    for epoch in range(FLAGS.maxepochs):
      # Randomize order of examples; broadcast this randomized order to all
      # processes. The particular subset any perceptron process gets for
      # this epoch is dependent upon this randomized ordering.
      if myRank == 0 and FLAGS.shuffle:
        random.shuffle(indices)
      indices = mpi.broadcast(value=indices, root=0)

      ##################################################
      # SEARCH: Find 1-best under current model
      ##################################################
      # Run one epoch over training data
      io_helper.write_master("===EPOCH %d TRAINING===\n" %(epoch))
      newWeights_avg = perceptron_parallel(epoch, indices, training_blob, weights,
                                           valid_feature_names)

      ####################################
      # Dump weights for this iteration
      ####################################
      if myRank == 0:
        cPickle.dump(newWeights_avg, weights_out, protocol=cPickle.HIGHEST_PROTOCOL)
        # Flush so this epoch's weights reach the file before the next
        # (possibly long) epoch starts.
        weights_out.flush()

      ##################################################
      # Decode held-out data with the newly learned weights
      ##################################################
      if FLAGS.decodeheldout:
        io_helper.write_master("===EPOCH %d DECODE HELDOUT===\n" %(epoch))
        decode_parallel(newWeights_avg, indices_dev, heldout_blob, "dev")
  finally:
    if myRank == 0:
      weights_out.close()
예제 #5
0
def perceptron_parallel(epoch, indices, blob, weights = None, valid_feature_names = None):
    """
    Run one epoch of parallelized perceptron training for structured outputs
    (Collins, 2002; McDonald, 2010) and return the averaged weights.

    Instances are assigned round-robin: position ``i`` of
    ``indices[:FLAGS.subset]`` is handled by the process whose rank equals
    ``i % mpi.size``. Each process pickles its running weight sum to
    ``<tmpdir>/training.<rank>`` and its latest weights to
    ``<tmpdir>/training-restart.<rank>``. Rank 0 then sums the per-rank
    files, averages them, writes ``<tmpdir>/weights``, and every process
    reloads that file. ``tmpdir`` and ``FLAGS`` are module-level names.

    Args:
        epoch: Epoch number; the averaging denominator uses ``epoch + 1``.
        indices: Sequence of instance ids used to index the lists in ``blob``.
        blob: Dict of training data and resources. Keys read here:
            'f_instances', 'e_instances', 'etree_instances',
            'gold_instances', 'localFeatures', 'nonlocalFeatures', 'pef',
            'pfe', 'lm', and, depending on FLAGS, 'inverse_instances',
            'a1_instances', 'a2_instances', 'ftree_instances'.
        weights: Starting weights. Ignored when this rank's restart file
            exists; ``None`` or an empty vector means start from an empty
            ``svector.Vector``.
        valid_feature_names: Passed to ``validate_features`` when
            ``FLAGS.debiasing`` is set.

    Returns:
        The averaged weight vector loaded from ``<tmpdir>/weights``.
    """
    # Which processor am I?
    myRank = mpi.rank
    # Let processor 0 be the master.
    masterRank = 0
    # How many processors are there?
    nProcs = mpi.size

    # Per-rank scratch file for decoding paths. Only created when decoding
    # path output was requested, so no stray "None<rank>" file is written.
    writeDecodingPath = FLAGS.decoding_path_out is not None
    decodingPathFile = None
    if writeDecodingPath:
        decodingPathFile = robustWrite("%s/%s%s" % (tmpdir, FLAGS.decoding_path_out, str(myRank)))

    ##########################################
    # Keep track of time to train this epoch
    ##########################################
    startTime = time.time()

    # Restart with weights from last epoch or 0.
    # Will ignore any weights passed during function call.
    # NOTE: pickles are written with a binary protocol, so every pickle file
    # below is opened in binary mode.
    weights_restart_filename = '%s/training-restart.%s' % (tmpdir, str(mpi.rank))
    if os.path.isfile(weights_restart_filename):
        with open(weights_restart_filename, 'rb') as weights_restart_file:
            weights = cPickle.load(weights_restart_file)
    elif weights is None or len(weights) == 0:
        # If weights passed during function call is None start with empty.
        weights = svector.Vector()

    # Restart with previous running weight sum, also.
    weights_sum_filename = '%s/training.%s' % (tmpdir, str(mpi.rank))
    if os.path.isfile(weights_sum_filename):
        with open(weights_sum_filename, 'rb') as weights_sum_file:
            weights_sum = cPickle.load(weights_sum_file)
    else:
        weights_sum = svector.Vector()

    # Number of instances whose hypothesis differed from the oracle.
    numChanged = 0
    LEARNING_RATE = FLAGS.learningrate
    for i, instanceID in enumerate(indices[:FLAGS.subset]):
        # Round-robin sharding: skip instances that belong to other ranks.
        if myRank != i % nProcs:
            continue

        # Assign the current instances we will look at
        f = blob['f_instances'][instanceID]
        e = blob['e_instances'][instanceID]
        etree = blob['etree_instances'][instanceID]
        gold_str = blob['gold_instances'][instanceID]

        inverse = None
        if FLAGS.inverse_a is not None:
            inverse = blob['inverse_instances'][instanceID]

        a1 = None
        if FLAGS.a1 is not None:
            a1 = blob['a1_instances'][instanceID]

        a2 = None
        if FLAGS.a2 is not None:
            a2 = blob['a2_instances'][instanceID]

        ftree = None
        if FLAGS.ftrees is not None:
            ftree = blob['ftree_instances'][instanceID]

        # Preprocess input data
        # f, e are sequences of words
        f = f.split()
        e = e.split()

        # gold is a sequence of f-e link pairs
        gold = Alignment.Alignment(gold_str, FLAGS.inverse)

        # Initialize model for this instance
        model = GridAlign.Model(f, e, etree, ftree, instanceID, weights, a1, a2,
                                inverse, LOCAL_FEATURES=blob['localFeatures'],
                                NONLOCAL_FEATURES=blob['nonlocalFeatures'],
                                FLAGS=FLAGS)
        model.gold = gold

        # Initialize model with data tables
        model.pef = blob['pef']
        model.pfe = blob['pfe']

        # Load language model
        model.lm = blob['lm']

        # Align the current training instance
        model.align()

        if writeDecodingPath:
            cPickle.dump(model.decodingPath, decodingPathFile, protocol=cPickle.HIGHEST_PROTOCOL)

        ######################################################################
        # Weight updating
        ######################################################################
        # Set the oracle item
        # NOTE(review): on an unknown class only an error is logged; oracle
        # (or hyp below) stays None and the attribute access further down
        # then raises AttributeError.
        oracle = None
        if FLAGS.oracle in ['gold','hope']:
            oracle = model.oracle
        else:
            sys.stderr.write("ERROR: Unknown oracle class: %s\n" %(FLAGS.oracle))

        # Set the hypothesis item
        hyp = None
        if FLAGS.hyp in ['1best', 'fear']:
            hyp = model.hyp
        else:
            sys.stderr.write("ERROR: Unknown hyp class: %s\n" %(FLAGS.hyp))

        # Debiasing
        if FLAGS.debiasing:
            validate_features(oracle.scoreVector, valid_feature_names)
            validate_features(hyp.scoreVector, valid_feature_names)

        if set(hyp.links) != set(oracle.links):
            numChanged += 1
            ###############################################################
            # WEIGHT UPDATES
            ###############################################################
            deltas = oracle.scoreVector - hyp.scoreVector
            weights = weights + LEARNING_RATE*deltas
        # Even if we didn't update, the current weight vector should count
        # towards the sum!
        weights_sum += weights

        # L1 Projection step
        # if w in [-tau, tau], w -> 0
        # else, move w closer to 0 by tau.
        if FLAGS.tau is not None:
            # Iterate over a snapshot: entries are deleted and reassigned
            # inside the loop, which is unsafe while iterating the vector.
            for index, w in list(weights_sum.iteritems()):
                if w == 0:
                    del weights_sum[index]
                    continue
                # Features whose name ends in '_nb' are left untouched.
                if index[-3:] == '_nb':
                    continue
                if w > 0 and w <= FLAGS.tau and not FLAGS.negreg:
                    del weights_sum[index]
                elif w < 0 and w >= (FLAGS.tau * -1):
                    del weights_sum[index]
                elif w > 0 and w > FLAGS.tau and not FLAGS.negreg:
                    weights_sum[index] -= FLAGS.tau
                elif w < 0 and w < (FLAGS.tau * -1):
                    weights_sum[index] += FLAGS.tau

    # Set unique pickled output file for this process.
    # Holds sum of weights over each iteration for this process; to be
    # averaged by master along with the others.
    output_filename = "%s/training.%s" %(tmpdir, str(mpi.rank))
    with open(output_filename, 'wb') as output_file:
        cPickle.dump(weights_sum, output_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Remember just the last weights used for this process; start here next epoch.
    output_filename_last_weights = "%s/training-restart.%s" %(tmpdir, str(mpi.rank))
    with open(output_filename_last_weights, 'wb') as output_file_last_weights:
        cPickle.dump(weights, output_file_last_weights, protocol=cPickle.HIGHEST_PROTOCOL)

    if decodingPathFile is not None:
        decodingPathFile.close()

    #############################################
    # Gather "done" messages from workers
    #############################################
    # Synchronize: the gathered values themselves are not used.
    mpi.gather(value=True,root=0)

    #####################################################################################
    # Master: sum the per-process weight sums and average them
    #####################################################################################
    if myRank == masterRank:
        masterWeights = svector.Vector()

        # Read pickled output
        for rank in range(nProcs):
            input_filename = tmpdir+'/training.'+str(rank)
            with open(input_filename, 'rb') as input_file:
                masterWeights += cPickle.load(input_file)

        sys.stderr.write("Done reading data.\n")
        sys.stderr.write("len(masterWeights)= %d\n"%(len(masterWeights)))
        sys.stderr.flush()

        ######################################################
        # AVERAGED WEIGHTS
        ######################################################
        sys.stderr.write("[%d] Averaging weights.\n" %(mpi.rank))
        sys.stderr.flush()
        # NOTE(review): the denominator uses len(indices), not the length of
        # the indices[:FLAGS.subset] slice actually trained on -- confirm
        # this is intended when FLAGS.subset truncates the data.
        masterWeights = masterWeights / (len(indices) * (epoch+1))
        # Dump master weights to file
        # There is only one weight vector in this file at a time.
        mw = robustWrite(tmpdir+'/weights')
        cPickle.dump(masterWeights,mw,protocol=cPickle.HIGHEST_PROTOCOL)
        mw.close()

        # Write decoding path
        if writeDecodingPath:
            decodePathFiles = {}
            for rank in range(nProcs):
                decodePathFiles[rank] = robustRead("%s/%s%s" % (tmpdir, FLAGS.decoding_path_out, str(rank)) )

            path_out = robustWrite(FLAGS.decoding_path_out, encoding="utf-8")
            decodingPathList = []
            for i, instanceID in enumerate(indices[:FLAGS.subset]):
                # Each rank pickled its paths in processing order, so reading
                # the files in the same round-robin order pairs every path
                # with its instance id.
                node = i % nProcs
                chosenTree = cPickle.load(decodePathFiles[node])
                decodingPathList.append((instanceID, chosenTree))

            # Emit the paths ordered by instance id.
            decodingPathList.sort()
            orderedList = [chosenTree for _, chosenTree in decodingPathList]
            path_out.write(u"\n".join(orderedList))
            path_out.close()
            # CLEAN UP
            for rank in range(nProcs):
                decodePathFiles[rank].close()

    ######################################################################
    # All processes read and load new averaged weights
    ######################################################################
    # But make sure worker nodes don't attempt to read from the weights
    # file before the root node has written it.
    # Sync-up with a blocking broadcast call
    mpi.broadcast(value=True, root=0)
    mw = robustRead(tmpdir+'/weights')
    masterWeights = cPickle.load(mw)
    mw.close()

    ######################################################################
    # Print report for this iteration
    ######################################################################
    elapsedTime = time.time() - startTime
    if myRank == masterRank:
        # masterRank is printing elapsed time.
        # May differ at each node.
        sys.stderr.write("Time: %0.2f\n" %(elapsedTime))
        sys.stderr.write("[%d] Finished training.\n" %(mpi.rank))

    return masterWeights
예제 #6
0
        a2_dev_instances = []
        gold_dev_instances = []
        inverse_dev_instances = []

    tmpdir = None
    if mpi.rank == 0:
        base_tempdir = None
        if FLAGS.tempdir is not None:
            base_tempdir = FLAGS.tempdir
        else:
            base_tempdir = tempfile.gettempdir()
        if base_tempdir is None:
            base_tempdir = "."
        tmpdir = tempfile.mkdtemp(prefix='align-'+str(os.getpid())+'-',
                                  dir=base_tempdir)
    tmpdir = mpi.broadcast(value=tmpdir, root=0)


    ################################################
    # Load training examples
    ################################################
    count = 1
    etree_file_handle = readDependencyForestFile(FLAGS.etrees)
    for f, e, etree in izip(file_handles['f'], file_handles['e'], etree_file_handle):
        f_instances.append(f.strip())
        e_instances.append(e.strip())
        etree_instances.append(etree.strip())
        if FLAGS.train and FLAGS.partial!=-1 and count>= FLAGS.partial:
            break
        count += 1
    indices = range(len(e_instances))
예제 #7
0
파일: mira.py 프로젝트: jungikim/sbmt
            all_nweights = mpi.gather(mpi.world, nweights, parallel.master)
            if parallel.rank == parallel.master:
                sumweights = sum(all_outweights, svector.Vector())
                outweights = sumweights / float(sum(all_nweights))
                log.write("summed feature weights: %s n=%d\n" %
                          (sumweights * watch_features, sum(all_nweights)))
                log.write("averaged feature weights: %s\n" %
                          (outweights * watch_features))

        if opts.outweightfilename:
            if not opts.parallel or parallel.rank == parallel.master:
                outweightfile.write("%s\n" % outweights)
                outweightfile.flush()

        if opts.parallel:
            outweights = mpi.broadcast(mpi.world, outweights, parallel.master)

        # Process heldout data

        if not opts.parallel or parallel.rank != parallel.master:
            saveweights = thedecoder.weights
            thedecoder.weights = outweights

        if opts.parallel:
            outsents = parallel.pmap(process_heldout,
                                     heldoutsents,
                                     tag=0,
                                     verbose=1)
        else:
            outsents = (process_heldout(sent) for sent in heldoutsents)
예제 #8
0
파일: mira.py 프로젝트: isi-nlp/sbmt
        if opts.parallel:
            all_outweights = mpi.gather(mpi.world, outweights, parallel.master)
            all_nweights = mpi.gather(mpi.world, nweights, parallel.master)
            if parallel.rank == parallel.master:
                sumweights = sum(all_outweights, svector.Vector())
                outweights = sumweights / float(sum(all_nweights))
                log.write("summed feature weights: %s n=%d\n" % (sumweights * watch_features, sum(all_nweights)))
                log.write("averaged feature weights: %s\n" % (outweights * watch_features))

        if opts.outweightfilename:
            if not opts.parallel or parallel.rank == parallel.master:
                outweightfile.write("%s\n" % outweights)
                outweightfile.flush()

        if opts.parallel:
            outweights = mpi.broadcast(mpi.world, outweights, parallel.master)

        # Process heldout data

        if not opts.parallel or parallel.rank != parallel.master:
            saveweights = thedecoder.weights
            thedecoder.weights = outweights

        if opts.parallel:
            outsents = parallel.pmap(process_heldout, heldoutsents, tag=0, verbose=1)
        else:
            outsents = (process_heldout(sent) for sent in heldoutsents)

        if not opts.parallel or parallel.rank == parallel.master:
            heldout_score_comps = svector.Vector()
            for outsent in outsents:
예제 #9
0
파일: choa.py 프로젝트: 460130107/choa
def perceptron_parallel(epoch, indices, blob, weights = None, valid_feature_names = None):
  """
  Run one epoch of parallelized perceptron training for structured outputs
  (Collins, 2002; McDonald, 2010) and return the averaged weights.

  Instances are assigned round-robin: position ``i`` of
  ``indices[:FLAGS.subset]`` is handled by the process whose rank equals
  ``i % mpi.size``. Each process pickles its running weight sum to
  ``<tmpdir>/training.<rank>`` and its latest weights to
  ``<tmpdir>/training-restart.<rank>``. Rank 0 then sums the per-rank
  files, averages them, writes ``<tmpdir>/weights``, and every process
  reloads that file. ``tmpdir`` and ``FLAGS`` are module-level names.

  Args:
    epoch: Epoch number; the averaging denominator uses ``epoch + 1``.
    indices: Sequence of instance ids used to index the lists in ``blob``.
    blob: Dict of training data and resources. Keys read here:
      'f_instances', 'e_instances', 'etree_instances', 'gold_instances',
      'localFeatures', 'nonlocalFeatures', 'pef', 'pfe', and, depending
      on FLAGS, 'inverse_instances', 'a1_instances', 'a2_instances',
      'ftree_instances'.
    weights: Starting weights. Ignored when this rank's restart file
      exists; ``None`` or an empty vector means start from an empty
      ``svector.Vector``.
    valid_feature_names: Passed to ``validate_features`` when
      ``FLAGS.debiasing`` is set.

  Returns:
    The averaged weight vector loaded from ``<tmpdir>/weights``.
  """
  # Which processor am I?
  myRank = mpi.rank
  # Let processor 0 be the master.
  masterRank = 0
  # How many processors are there?
  nProcs = mpi.size
  ##########################################
  # Keep track of time to train this epoch
  ##########################################
  startTime = time.time()

  # Restart with weights from last epoch or 0.
  # Will ignore any weights passed during function call.
  # NOTE: pickles are written with a binary protocol, so every pickle file
  # below is opened in binary mode.
  weights_restart_filename = '%s/training-restart.%s' % (tmpdir, str(mpi.rank))
  if os.path.isfile(weights_restart_filename):
    with open(weights_restart_filename, 'rb') as weights_restart_file:
      weights = cPickle.load(weights_restart_file)
  elif weights is None or len(weights) == 0:
    # If weights passed during function call is None start with empty.
    weights = svector.Vector()

  # Restart with previous running weight sum, also.
  weights_sum_filename = '%s/training.%s' % (tmpdir, str(mpi.rank))
  if os.path.isfile(weights_sum_filename):
    with open(weights_sum_filename, 'rb') as weights_sum_file:
      weights_sum = cPickle.load(weights_sum_file)
  else:
    weights_sum = svector.Vector()

  # Number of instances whose hypothesis differed from the oracle.
  numChanged = 0
  LEARNING_RATE = FLAGS.learningrate
  for i, instanceID in enumerate(indices[:FLAGS.subset]):
    # Round-robin sharding: skip instances that belong to other ranks.
    if myRank != i % nProcs:
      continue

    # Assign the current instances we will look at
    f = blob['f_instances'][instanceID]
    e = blob['e_instances'][instanceID]
    etree = blob['etree_instances'][instanceID]
    gold_str = blob['gold_instances'][instanceID]

    inverse = None
    if FLAGS.inverse is not None:
      inverse = blob['inverse_instances'][instanceID]

    a1 = None
    if FLAGS.a1 is not None:
      a1 = blob['a1_instances'][instanceID]

    a2 = None
    if FLAGS.a2 is not None:
      a2 = blob['a2_instances'][instanceID]

    ftree = None
    if FLAGS.ftrees is not None:
      ftree = blob['ftree_instances'][instanceID]

    # Preprocess input data
    # f, e are sequences of words
    f = f.split()
    e = e.split()

    # gold is a sequence of f-e link pairs
    gold = Alignment.Alignment(gold_str)

    # Initialize model for this instance
    model = GridAlign.Model(f, e, etree, ftree, instanceID, weights, a1, a2,
                            inverse, LOCAL_FEATURES=blob['localFeatures'],
                            NONLOCAL_FEATURES=blob['nonlocalFeatures'],
                            FLAGS=FLAGS)
    model.gold = gold

    # Initialize model with data tables
    model.pef = blob['pef']
    model.pfe = blob['pfe']
    # Align the current training instance
    model.align()

    ######################################################################
    # Weight updating
    ######################################################################
    # Set the oracle item
    # NOTE(review): on an unknown class only an error is logged; oracle (or
    # hyp below) stays None and the attribute access further down then
    # raises AttributeError.
    oracle = None
    if FLAGS.oracle == 'gold':
      oracle = model.oracle
    elif FLAGS.oracle == 'hope':
      oracle = model.hope
    else:
      sys.stderr.write("ERROR: Unknown oracle class: %s\n" %(FLAGS.oracle))

    # Set the hypothesis item
    hyp = None
    if FLAGS.hyp == '1best':
      hyp = model.modelBest
    elif FLAGS.hyp == 'fear':
      hyp = model.fear
    else:
      sys.stderr.write("ERROR: Unknown hyp class: %s\n" %(FLAGS.hyp))

    # Debiasing
    if FLAGS.debiasing:
      validate_features(oracle.scoreVector, valid_feature_names)
      validate_features(hyp.scoreVector, valid_feature_names)

    if set(hyp.links) != set(oracle.links):
      numChanged += 1
      ###############################################################
      # WEIGHT UPDATES
      ###############################################################
      deltas = oracle.scoreVector - hyp.scoreVector
      weights = weights + LEARNING_RATE*deltas
    # Even if we didn't update, the current weight vector should count
    # towards the sum!
    weights_sum += weights

    # L1 Projection step
    # if w in [-tau, tau], w -> 0
    # else, move w closer to 0 by tau.
    if FLAGS.tau is not None:
      # Iterate over a snapshot: entries are deleted and reassigned inside
      # the loop, which is unsafe while iterating the vector.
      for index, w in list(weights_sum.iteritems()):
        if w == 0:
          del weights_sum[index]
          continue
        # Features whose name ends in '_nb' are left untouched.
        if index[-3:] == '_nb':
          continue
        if w > 0 and w <= FLAGS.tau and not FLAGS.negreg:
          del weights_sum[index]
        elif w < 0 and w >= (FLAGS.tau * -1):
          del weights_sum[index]
        elif w > 0 and w > FLAGS.tau and not FLAGS.negreg:
          weights_sum[index] -= FLAGS.tau
        elif w < 0 and w < (FLAGS.tau * -1):
          weights_sum[index] += FLAGS.tau

  # Set unique pickled output file for this process.
  # Holds sum of weights over each iteration for this process; to be
  # averaged by master along with the others.
  output_filename = "%s/training.%s" %(tmpdir, str(mpi.rank))
  with open(output_filename, 'wb') as output_file:
    cPickle.dump(weights_sum, output_file, protocol=cPickle.HIGHEST_PROTOCOL)

  # Remember just the last weights used for this process; start here next epoch.
  output_filename_last_weights = "%s/training-restart.%s" %(tmpdir, str(mpi.rank))
  with open(output_filename_last_weights, 'wb') as output_file_last_weights:
    cPickle.dump(weights, output_file_last_weights, protocol=cPickle.HIGHEST_PROTOCOL)

  #############################################
  # Gather "done" messages from workers
  #############################################
  # Synchronize: the gathered values themselves are not used.
  mpi.gather(value=True,root=0)

  #####################################################################################
  # Master: sum the per-process weight sums and average them
  #####################################################################################
  if myRank == masterRank:
    masterWeights = svector.Vector()

    # Read pickled output
    for rank in range(nProcs):
      input_filename = tmpdir+'/training.'+str(rank)
      with open(input_filename, 'rb') as input_file:
        masterWeights += cPickle.load(input_file)
    sys.stderr.write("Done reading data.\n")
    sys.stderr.write("len(masterWeights)= %d\n"%(len(masterWeights)))
    sys.stderr.flush()

    ######################################################
    # AVERAGED WEIGHTS
    ######################################################
    sys.stderr.write("[%d] Averaging weights.\n" %(mpi.rank))
    sys.stderr.flush()
    # NOTE(review): the denominator uses len(indices), not the length of
    # the indices[:FLAGS.subset] slice actually trained on -- confirm this
    # is intended when FLAGS.subset truncates the data.
    masterWeights = masterWeights / (len(indices) * (epoch+1))
    # Dump master weights to file
    # There is only one weight vector in this file at a time.
    mw = robustWrite(tmpdir+'/weights')
    cPickle.dump(masterWeights,mw,protocol=cPickle.HIGHEST_PROTOCOL)
    mw.close()

  ######################################################################
  # All processes read and load new averaged weights
  ######################################################################
  # But make sure worker nodes don't attempt to read from the weights
  # file before the root node has written it.
  # Sync-up with a blocking broadcast call
  mpi.broadcast(value=True, root=0)
  mw = robustRead(tmpdir+'/weights')
  masterWeights = cPickle.load(mw)
  mw.close()

  ######################################################################
  # Print report for this iteration
  ######################################################################
  elapsedTime = time.time() - startTime
  if myRank == masterRank:
    # masterRank is printing elapsed time.
    # May differ at each node.
    sys.stderr.write("Time: %0.2f\n" %(elapsedTime))
    sys.stderr.write("[%d] Finished training.\n" %(mpi.rank))

  return masterWeights
예제 #10
0
파일: choa.py 프로젝트: 460130107/choa
      a2_dev_instances = []
      gold_dev_instances = []
      inverse_dev_instances = []

    tmpdir = None
    if mpi.rank == 0:
      base_tempdir = None
      if FLAGS.tempdir is not None:
        base_tempdir = FLAGS.tempdir
      else:
        base_tempdir = tempfile.gettempdir()
      if base_tempdir is None:
        base_tempdir = "."
      tmpdir = tempfile.mkdtemp(prefix='align-'+str(os.getpid())+'-',
                                dir=base_tempdir)
    tmpdir = mpi.broadcast(value=tmpdir, root=0)


    ################################################
    # Load training examples
    ################################################
    count = 1
    etree_file_handle = readDependencyFile(FLAGS.etrees)
    for f, e, etree in izip(file_handles['f'], file_handles['e'], etree_file_handle):
        f_instances.append(f.strip())
        e_instances.append(e.strip())
        etree_instances.append(etree.strip())
        if FLAGS.train and FLAGS.partial!=-1 and count>= FLAGS.partial:
            break
        count += 1
    indices = range(len(e_instances))