Example #1
File: embedding.py Project: uhh-lt/lttc
 def load(self, skipheader = True, nlines = sys.maxsize, normalize = False):
   self.index = Index()
   print('Loading embedding from %s' % self.file)
   data_ = []
   with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
     if skipheader:
       f.readline()
     for i, line in enumerate(f):
       if i >= nlines:
         break
       try:
         line = line.strip()
         splits = line.split(self.separator)
         word = splits[0]
         if self.index.hasWord(word):
           continue
         coefs = np.array(splits[1:self.vdim+1], dtype=np.float32)
         if normalize:
           length = np.linalg.norm(coefs)
           if length == 0:
             length += 1e-6
           coefs = coefs / length
         if coefs.shape != (self.vdim,):
           continue
         idx = self.index.add(word)
         data_.append(coefs)
         assert idx == len(data_)
       except Exception as err:
         print('Error in line %d' % i, sys.exc_info()[0], file = sys.stderr)
         print(' ', err, file = sys.stderr)
         continue
   self.data = np.array(data_, dtype = np.float32)
   del data_
   return self
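For reference, a minimal usage sketch of this loader (the same method appears on TextEmbedding in Example #14 below); the file name and dimensionality are placeholder assumptions, not taken from the repository:

# Hypothetical usage; 'vectors.txt' and vectordim=300 are assumptions.
emb = TextEmbedding('vectors.txt', sep=' ', vectordim=300)
emb.load(skipheader=True, normalize=True)
print('dim:', emb.dim())
print('vocab size:', len(emb.vocabulary()))  # assumes vocabulary() returns a sized collection
v = emb.getVector('house')                   # stored vector, or a one-hot fallback for unknown words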
Example #2
def load_dataset(path):
    charset = Charset()

    vocab = Vocabulary()
    vocab.load(f"{path}/vocab.txt")

    measure_type = get_measure_type(path)

    tag_set = Index()
    if measure_type == "relations":
        tag_set.load(f"{path}/tag2id.txt")
    elif measure_type == "entities":
        tag_set.load(f"{path}/entity_labels.txt")

    helper = Helper(vocab, tag_set, charset, measure_type=measure_type)

    # relation_labels = Index()
    # relation_labels.load(f"{path}/relation_labels.txt")

    train_data = load(f"{path}/train.pk")[:1000]
    test_data = load(f"{path}/test.pk")

    word_embeddings = np.load(f"{path}/word2vec.vectors.npy")

    return helper, word_embeddings, train_data, test_data, tag_set
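A hypothetical call to load_dataset; 'data/mydataset' is an assumed path containing the files referenced above (vocab.txt, tag2id.txt or entity_labels.txt, train.pk, test.pk, word2vec.vectors.npy):

helper, word_embeddings, train_data, test_data, tag_set = load_dataset('data/mydataset')
print(len(train_data), len(test_data), word_embeddings.shape)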
Example #3
class RandomEmbedding(Embedding):
  
  def __init__(self, vectordim = 300):
    self.index = Index()
    self.vdim = vectordim
    self.data = np.zeros((0, self.vdim), dtype = np.float32)
    self.invindex = None
  
  def getVector(self, word):
    if not self.index.hasWord(word):
      # create random vector
      v = np.random.rand(self.vdim).astype(np.float32)
      # normalize
      length = np.linalg.norm(v)
      if length == 0:
        length += 1e-6
      v = v / length
      # add
      idx = self.index.add(word)
      self.data = np.vstack((self.data, v))
      assert idx == len(self.data)
      if self.invindex is not None:
        del self.invindex
        self.invindex = None
      return v
    idx = self.index.getId(word)
    return self.data[idx]
    
  def search(self, q, topk = 4):
    if not self.invindex:
      print('Building faiss index...')
      self.invindex = faiss.IndexFlatL2(self.vdim)
      self.invindex.add(self.data)
      print('Faiss index built:', self.invindex.is_trained)
    if len(q.shape) == 1:
      q = np.matrix(q)
    if q.shape[1] != self.vdim:
      print('Wrong shape, expected %d dimensions but got %d.' % (self.vdim, q.shape[1]), file = sys.stderr)
      return
    D, I = self.invindex.search(q, topk) # D = distances, I = indices
    return ( I, D )
    
  def wordForVec(self, v):
    idx, dist = self.search(v, topk=1)
    idx = idx[0,0]
    dist = dist[0,0]
    sim = 1. - dist
    word = self.index.getWord(idx)
    return word, sim
  
  def containsWord(self, word):
    return True
  
  def vocabulary(self):
    return self.index.vocabulary()
  
  def dim(self):
    return self.vdim
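A short usage sketch for RandomEmbedding; the query word is arbitrary and search() requires faiss to be installed:

remb = RandomEmbedding(vectordim=50)
v = remb.getVector('tree')        # first call creates, normalizes and stores a random vector
v_again = remb.getVector('tree')  # later calls return the same stored vector
I, D = remb.search(v, topk=1)     # lazily builds the faiss index on first use
word, sim = remb.wordForVec(v)    # should recover 'tree'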
Example #4
def loadData(args):
  '''
  
  '''
  __SequenceDataset = data.CharSequence if args.chars else data.TokenSequence
  print(__SequenceDataset.__name__)
  index = Index(initwords = ['<unk>'], unkindex = 0)
  train_ = __SequenceDataset(args.data, subset='train.txt', index = index, seqlen = args.bptt, skip = args.bptt).to(args.device)
  index.freeze(silent = True).tofile(os.path.join(args.data, 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt'))
  test_ = __SequenceDataset(args.data, subset='test.txt', index = index, seqlen = args.bptt, skip = args.bptt).to(args.device)
  valid_ = __SequenceDataset(args.data, subset='valid.txt', index = index, seqlen = args.bptt, skip = args.bptt).to(args.device)
  
  # load pre embedding
  if args.init_weights:
    # determine the type of embedding by checking its suffix
    if args.init_weights.endswith('bin'):
      preemb = FastTextEmbedding(args.init_weights, normalize = True).load()
      if args.emsize != preemb.dim():
        raise ValueError('emsize must match embedding size. Expected %d but got %d.' % (args.emsize, preemb.dim()))
    elif args.init_weights.endswith('txt'):
      preemb = TextEmbedding(args.init_weights, vectordim = args.emsize).load(normalize = True)
    elif args.init_weights.endswith('rand'):
      preemb = RandomEmbedding(vectordim = args.emsize)
    else:
      raise ValueError('Type of embedding cannot be inferred.')
    preemb = Embedding.filteredEmbedding(index.vocabulary(), preemb, fillmissing = True)
    preemb_weights = torch.Tensor(preemb.weights)
  else:
    preemb_weights = None
  
  eval_batch_size = 10
  __ItemSampler = RandomSampler if args.shuffle_samples else SequentialSampler
  __BatchSampler = BatchSampler if args.sequential_sampling else EvenlyDistributingSampler  
  train_loader = torch.utils.data.DataLoader(train_, batch_sampler = ShufflingBatchSampler(__BatchSampler(__ItemSampler(train_), batch_size=args.batch_size, drop_last = True), shuffle = args.shuffle_batches, seed = args.seed), num_workers = 0)
  test_loader = torch.utils.data.DataLoader(test_, batch_sampler = __BatchSampler(__ItemSampler(test_), batch_size=eval_batch_size, drop_last = True), num_workers = 0)
  valid_loader = torch.utils.data.DataLoader(valid_, batch_sampler = __BatchSampler(__ItemSampler(valid_), batch_size=eval_batch_size, drop_last = True), num_workers = 0)
  print(__ItemSampler.__name__)
  print(__BatchSampler.__name__)
  print('Shuffle training batches: ', args.shuffle_batches)

  setattr(args, 'index', index)
  setattr(args, 'ntokens', len(index))
  setattr(args, 'trainloader', train_loader)
  setattr(args, 'testloader', test_loader)
  setattr(args, 'validloader', valid_loader)
  setattr(args, 'preembweights', preemb_weights)
  setattr(args, 'eval_batch_size', eval_batch_size)

  return args
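loadData expects an argparse-style namespace; below is a hedged sketch of the fields it reads, with placeholder values:

import argparse
args = argparse.Namespace(
    data='data/corpus', chars=False, bptt=35, device='cpu',
    init_weights='', emsize=300, batch_size=20, seed=1111,
    shuffle_samples=False, sequential_sampling=False, shuffle_batches=False)
args = loadData(args)
print(args.ntokens, args.eval_batch_size)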
Example #5
 def __init__(self, path, subset = 'train.txt', index = None, seqlen = 35, skip = 35):
   super(TokenSequence, self).__init__(seqlen, skip)
   self.path = path
   self.subset = subset
   self.file = os.path.join(self.path, self.subset)
   self.index = index if index is not None else Index()
   self.data = self.load()
Example #6
 def __init__(self,
              path=None,
              lang='en',
              nlines=None,
              maxseqlen=None,
              index=None,
              nbos=0,
              neos=1,
              posiindex=None,
              classindex=None,
              bert_model='bert-base-uncased',
              maxseqlen_bert=None,
              cache_device_tensors=True):
     super(LttcDataset, self).__init__()
     self.path = path
     self.maxseqlen = maxseqlen
     self.nbos = max(0, nbos)
     self.neos = max(1, neos)
     self.index = index if index is not None else Index()
     self.padidx = self.index.add('<pad>')
     self.bosidx = self.index.add('<s>')
     self.eosidx = self.index.add('</s>')
     self.index.unkindex = self.index.add('<unk>')
     self.classindex = classindex if classindex is not None else Index()
     self.classindex.unkindex = 0
     self.posiindex = posiindex if posiindex is not None else Index()
     self.nlines = nlines
     self.device = torch.device('cpu')
     self.lang = lang
     self.spacy_model = importSpacy(self.lang)
     self.bert_tokenizer = BertTokenizer.from_pretrained(
         bert_model, do_lower_case='uncased' in bert_model) if isinstance(
             bert_model, str) else bert_model
     self.maxseqlen_bert = maxseqlen_bert if maxseqlen_bert else self.bert_tokenizer.max_len
     self.samples = pandas.DataFrame(columns=[
         'id', 'filename', 'rawdata', 'spacydata', 'spacy_to_bert_position',
         'seq', 'seq_bert', 'seqlen', 'seqlen_bert', 'seq_recon', 'pseq',
         'pseq_rev', 'label', 'labelid'
     ])
     self.tensor_cache = [] if cache_device_tensors else None
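A minimal construction sketch; the path is a placeholder, and it assumes a spaCy model for 'en' and the BERT tokenizer are available for download:

ds = LttcDataset(path='data/corpus', lang='en', maxseqlen=100)
print(ds.padidx, ds.bosidx, ds.eosidx)  # special-token ids registered in the shared Index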
Example #7
 def load(self, skipheader = True, nlines = sys.maxsize, normalize = False):
   self.index = Index()
   print('Loading embedding from %s' % self.file)
   data_ = []
   with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
     if skipheader:
       f.readline()
     for i, line in enumerate(f):
       if i >= nlines:
         break
       try:
         line = line.strip()
         splits = line.split(self.separator)
         word = splits[0]
         if self.index.hasWord(word):
           continue
         coefs = np.array(splits[1:self.vdim+1], dtype=np.float32)
         if normalize:
           length = np.linalg.norm(coefs)
           if length == 0:
             length += 1e-6
           coefs = coefs / length
         if coefs.shape != (self.vdim,):
           continue
         idx = self.index.add(word)
         data_.append(coefs)
         assert idx == len(data_)
       except Exception as err:
         print('Error in line %d' % i, sys.exc_info()[0], file = sys.stderr)
         print(' ', err, file = sys.stderr)
         continue
   self.data = np.array(data_, dtype = np.float32)
   del data_
   print('Building faiss index...')
   if not normalize:
     print('Attention: normalization of vectors is required to guarantee functional search behaviour. Make sure your vectors are normalized, otherwise set the normalize flag!')
   self.invindex = faiss.IndexFlatL2(self.vdim)
   self.invindex.add(self.data)
   print('Faiss index built:', self.invindex.is_trained)
   return self
Example #8
def load(mfile, ifile):
    # load model
    print('Loading model', file=sys.stderr)
    with open(mfile, 'rb') as f:
        model = torch.load(f).to(device)
    # After loading, the RNN parameters are not a contiguous chunk of memory;
    # flatten_parameters() makes them contiguous again, which speeds up the forward pass.
    model.rnn.flatten_parameters()
    model.eval()  # deactivate training
    # load index
    print('Loading index', file=sys.stderr)
    index = Index.fromfile(ifile).freeze()
    print('Loading embedding', file=sys.stderr)
    emb = Embedding(model.encoder.weight.detach(), index, normalize=False)
    return model, index, emb
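A hypothetical call to this loader; both file names are assumptions (vocab_tokens.txt mirrors the vocabulary file written by loadData in Example #4):

model, index, emb = load('model.pt', 'vocab_tokens.txt')
print(len(index))  # size of the loaded vocabulary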
Example #9
File: embedding.py Project: uhh-lt/lttc
class RandomEmbedding(Embedding):

  def __init__(self, vectordim = 300):
    self.index = Index()
    self.vdim = vectordim
    self.data = np.zeros((0, self.vdim), dtype = np.float32)
    self.invindex = None

  def getVector(self, word):
    if not self.index.hasWord(word):
      # create random vector
      v = np.random.rand(self.vdim).astype(np.float32)
      # normalize
      length = np.linalg.norm(v)
      if length == 0:
        length += 1e-6
      v = v / length
      # add
      idx = self.index.add(word)
      self.data = np.vstack((self.data, v))
      assert idx == len(self.data)
      if self.invindex is not None:
        del self.invindex
        self.invindex = None
      return v
    idx = self.index.getId(word)
    return self.data[idx]

  def containsWord(self, word):
    return True

  def vocabulary(self):
    return self.index.vocabulary()

  def dim(self):
    return self.vdim
Example #10
File: embedding.py Project: uhh-lt/lttc
 def filteredEmbedding(vocabulary, embedding, fillmissing = True):
   index = Index()
   weights = []
   if fillmissing:
     rv = RandomEmbedding(embedding.dim())
   for w in vocabulary:
     if index.hasWord(w):
       continue
     if embedding.containsWord(w):
       index.add(w)
       weights.append(embedding.getVector(w))
     elif fillmissing:
       index.add(w)
       weights.append(rv.getVector(w))
   weights = np.array(weights, dtype = np.float32)
   return Embedding(weights, index)
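A sketch of how filteredEmbedding can restrict a pretrained embedding to a corpus vocabulary, filling missing words with random vectors; the file path and index contents are assumptions, and the pattern mirrors loadData in Example #4:

pretrained = TextEmbedding('vectors.txt', vectordim=300).load(normalize=True)
index = Index(initwords=['<unk>'], unkindex=0)
# ... populate index from the corpus ...
preemb = Embedding.filteredEmbedding(index.vocabulary(), pretrained, fillmissing=True)
preemb_weights = torch.Tensor(preemb.weights)  # as in Example #4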
Example #11
 def __init__(self, path, subset = 'train.txt', nlines=None, maxseqlen=None, maxentlen=None, maxdist=60, nbos = 0, neos = 1, index = None, posiindex = None, classindex = None, rclassindex = None, dclassindex = None, eclassindex = None, compact=True):
   self.path = path
   self.subset = subset
   self.maxseqlen = maxseqlen
   self.maxdist = maxdist
   self.nbos = max(0, nbos)
   self.neos = max(1, neos)
   self.index = index if index is not None else Index()
   self.bosidx = self.index.add('<s>')
   self.eosidx = self.index.add('</s>')
   self.padidx = self.index.add('<pad>')
   self.epadidx = self.index.add('<epad>')
   self.classindex = classindex if classindex is not None else Index()
   self.rclassindex = rclassindex if rclassindex is not None else Index()
   self.dclassindex = dclassindex if dclassindex is not None else Index()
   self.eclassindex = eclassindex if eclassindex is not None else Index()
   self.posiindex = posiindex if posiindex is not None else Index(initwords = [ maxdist, -maxdist ], unkindex = 0)
   self.maxentlen = maxentlen
   self.load(nlines, compact)
   self.device = torch.device('cpu')
   self.deviceTensor = torch.LongTensor().to(self.device) # create tensor on device, which can be used for copying
Example #12
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load data
###############################################################################
__SequenceDataset = CharSequence if args.chars else TokenSequence
print(__SequenceDataset.__name__)
index = Index(initwords=['<unk>'], unkindex=0)
train_ = __SequenceDataset(args.data,
                           subset='train.txt',
                           index=index,
                           seqlen=args.bptt,
                           skip=args.bptt).to(device)
index.freeze(silent=True).tofile(
    os.path.join(args.data,
                 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt'))
test_ = __SequenceDataset(args.data,
                          subset='test.txt',
                          index=index,
                          seqlen=args.bptt,
                          skip=args.bptt).to(device)
valid_ = __SequenceDataset(args.data,
                           subset='valid.txt',
                           index=index,
                           seqlen=args.bptt,
                           skip=args.bptt).to(device)
Example #13
File: embedding.py Project: uhh-lt/lttc
 def __init__(self, vectordim = 300):
   self.index = Index()
   self.vdim = vectordim
   self.data = np.zeros((0, self.vdim), dtype = np.float32)
   self.invindex = None
Example #14
File: embedding.py Project: uhh-lt/lttc
class TextEmbedding(Embedding):

  def __init__(self, txtfile, sep = ' ', vectordim = 300):
    self.file = txtfile
    self.vdim = vectordim
    self.separator = sep

  def load(self, skipheader = True, nlines = sys.maxsize, normalize = False):
    self.index = Index()
    print('Loading embedding from %s' % self.file)
    data_ = []
    with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
      if skipheader:
        f.readline()
      for i, line in enumerate(f):
        if i >= nlines:
          break
        try:
          line = line.strip()
          splits = line.split(self.separator)
          word = splits[0]
          if self.index.hasWord(word):
            continue
          coefs = np.array(splits[1:self.vdim+1], dtype=np.float32)
          if normalize:
            length = np.linalg.norm(coefs)
            if length == 0:
              length += 1e-6
            coefs = coefs / length
          if coefs.shape != (self.vdim,):
            continue
          idx = self.index.add(word)
          data_.append(coefs)
          assert idx == len(data_)
        except Exception as err:
          print('Error in line %d' % i, sys.exc_info()[0], file = sys.stderr)
          print(' ', err, file = sys.stderr)
          continue
    self.data = np.array(data_, dtype = np.float32)
    del data_
    return self

  def getVector(self, word):
    if not self.containsWord(word):
      print("'%s' is unknown." % word, file = sys.stderr)
      v = np.zeros(self.vdim)
      v[0] = 1
      return v
    idx = self.index.getId(word)
    return self.data[idx]

  def search(self, q, topk = 4):
    if len(q.shape) == 1:
      q = np.matrix(q)
    if q.shape[1] != self.vdim:
      print('Wrong shape, expected %d dimensions but got %d.' % (self.vdim, q.shape[1]), file = sys.stderr )
      return
    D, I = self.invindex.search(q, topk) # D = distances, I = indices
    return ( I, D )

  def wordForVec(self, v):
    idx, dist = self.search(v, topk=1)
    idx = idx[0,0]
    dist = dist[0,0]
    sim = 1. - dist
    word = self.index.getWord(idx)
    return word, sim

  def containsWord(self, word):
    return self.index.hasWord(word)

  def vocabulary(self):
    return self.index.vocabulary()

  def dim(self):
    return self.vdim