Example #1
def main():
    model, encoder_model, decoder_model = create_models(300, 512, 300)
    model.load_weights(sys.argv[1])

    ft_en = FastText('embeddings/wiki.en.bin')
    ft_tl = FastText('embeddings/wiki.tl.bin')

    start_seq = ft_en.get_numpy_vector(SOS, normalized=True).reshape(1, 1, -1)

    chars = '.,?!()'

    while True:
        input_sentence = input('Input Tagalog: ').lower()  #'kamusta ka ?'

        for c in chars:
            input_sentence = input_sentence.replace(c, ' ' + c + ' ')

        print('Embedding...')
        input_seq = input_sentence.split()
        # embed each word into a fixed-size (1, 15, 300) input tensor
        # (15 is the maximum sentence length assumed here)
        input_matrix = np.zeros((1, 15, 300), dtype='float32')
        for i, w in enumerate(input_seq):
            input_matrix[0, i] = ft_tl.get_numpy_vector(w, normalized=True)
        input_seq = input_matrix
        print(input_seq)

        print('Translating...')

        decoded_sentence = decode_sequence(input_seq, encoder_model,
                                           decoder_model, ft_en, start_seq)
        print('-')
        print('Input sentence:', input_sentence)
        print('Decoded sentence:', decoded_sentence)
Example #2
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean', 'football', 'algorithm', 'china', 'usa',
        'internet', 'harvard', 'earth', 'horse', 'angel', 'rock'
    ]
    for t_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(t_word)
        print('Target word:', t_word)
        #print('Embedding shape:', target_word_embedding.shape)
        #print('Embedding:', target_word_embedding[0:10], '...')

        # find closest words
        closest_words = model.nearest_neighbors(t_word, k=15)
        # collect the neighbour embeddings (the code assumes 128-dimensional vectors)
        nn_word_embedding = np.zeros(shape=(15, 128))
        for i, (word, similarity) in enumerate(closest_words):
            nn_word_embedding[i] = model.get_numpy_vector(word)

        # cluster the neighbours with k-means
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)
        for j, (word, similarity) in enumerate(closest_words):
            print('Word:', word, '- Cluster #%d' % (prediction[j] + 1))
Example #3
class FeatureGenerator:
    def __init__(self, fasttext_path):
        self.fasttext = FastText(fasttext_path)

    def generate_record(self, record):
        # record is a 3-tuple of tokens; embed each token and combine the vectors
        tr = self.fasttext.get_numpy_vector(record[0])
        si = self.fasttext.get_numpy_vector(record[1])
        lm = self.fasttext.get_numpy_vector(record[2])
        #return numpy.concatenate((tr, lm))
        #return numpy.concatenate((tr, si, lm))
        return numpy.concatenate((tr, si, lm, lm - tr))
        #return numpy.concatenate((si, lm - tr, tr - lm))

    def generate(self, values):
        return numpy.array([self.generate_record(value) for value in values])
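A minimal usage sketch for the class above; the embedding path and the word triples are placeholders, and what the three fields of each tuple mean depends on whatever pipeline feeds them in:

import numpy
from pyfasttext import FastText

# hypothetical embedding file and word triples
generator = FeatureGenerator('embeddings/wiki.en.bin')
records = [('house', 'home', 'building'), ('cat', 'kitten', 'animal')]
features = generator.generate(records)
print(features.shape)  # (2, 4 * dim), e.g. (2, 1200) for 300-dimensional vectors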
Example #4
def use_pyfasttext_model():
    # OK
    # The model can be trained with the fasttext command-line tool (../doc/fastText_train.png) or with the pyfasttext package used in this file.
    """
    # OK: 1. Load a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])     # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))
    # NOTE: A quick test shows that two different models produce the same vector for the same OOV word (same behaviour as the fasttext package; see NO_2_use_fasttext_model), while their vectors for in-vocabulary words differ.
    """

    # OK: 2. Load a model trained with the fasttext command-line tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])   # OK. OOV
    # print(model["陈贺"])   # OK. OOV

    # Sentence and text vectors.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
    """
def make_embedding_matrix(word_index, fname):
    model = FastText(os.path.join('embeddings', fname))
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM),
                                dtype='float32')
    for word, i in word_index.items():
        embedding_matrix[i] = model.get_numpy_vector(word, normalized=True)
    return embedding_matrix
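A minimal sketch of how the returned matrix might initialize a Keras Embedding layer; the word_index mapping, the embedding file name, and the EMBEDDING_DIM constant (assumed to match the fastText dimension) are placeholders:

import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

EMBEDDING_DIM = 300  # assumed dimension of the fastText vectors

# hypothetical word index, e.g. produced by a tokenizer
word_index = {'kamusta': 1, 'ka': 2, '?': 3}

embedding_matrix = make_embedding_matrix(word_index, 'wiki.tl.bin')
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)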
Example #6
class FastTextEmbedding(Embedding):

  def __init__(self, binfile, normalize = False):
    self.file = binfile
    self.vdim = -1
    self.normalize = normalize
    
  def load(self):
    print('Loading fasttext model.')
    self.ftmodel = FastText()
    self.ftmodel.load_model(self.file)
    self.vdim = len(self.ftmodel['is'])
    print('Finished loading fasttext model.')
    return self
  
  def getVector(self, word):
    return self.ftmodel.get_numpy_vector(word, normalized = self.normalize)
    
  def search(self, q, topk = 4):
    raise NotImplementedError()
    
  def wordForVec(self, v):
    word, sim = self.ftmodel.words_for_vector(v)[0]
    return word, sim
  
  def containsWord(self, word):
    return True
  
  def vocabulary(self):
    return self.ftmodel.words
  
  def dim(self):
    return self.vdim
Example #7
class FastTextEmbeddings:
    def __init__(self, path):
        self.fasttext = FastText(path)

    def generate(self, sentence):
        return [self.fasttext.get_numpy_vector(word) for word in sentence]

    def size(self):
        return 300
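A minimal usage sketch; the model path is a placeholder and generate expects an already-tokenized sentence:

from pyfasttext import FastText

# hypothetical path to a pyfasttext .bin model
embeddings = FastTextEmbeddings('embeddings/wiki.en.bin')
vectors = embeddings.generate(['hello', 'world'])
print(len(vectors), vectors[0].shape)  # 2 vectors, each (300,) if the model is 300-dimensional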
Example #8
def build_w2v(relevant_tokens, model_file='wiki.cy.bin'):
    # using this library because it's more memory friendly for python :)
    from pyfasttext import FastText
    model = FastText(model_file)

    w2v = {}
    for token in relevant_tokens:
        vec = model.get_numpy_vector(token)
        w2v[token] = vec
    return w2v
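A minimal usage sketch; the token set is a placeholder and wiki.cy.bin is the Welsh fastText model referenced above:

# hypothetical set of tokens that actually occur in the corpus
relevant_tokens = {'bore', 'da', 'diolch'}
w2v = build_w2v(relevant_tokens, model_file='wiki.cy.bin')
print(w2v['bore'].shape)  # (300,) for the published wiki.cy vectors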
Example #9
def main():
    model = FastText('model_text8.bin')

    target_word = 'dog'

    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', target_word_embedding.shape)
    print('Embedding:', target_word_embedding[0:10], '...')

    # find closest words
    closest_words = model.nearest_neighbors(target_word, k=15)
    for word, similarity in closest_words:
        print('Word:', word, 'similarity:', similarity)
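As a rough sanity check, the similarity values printed above can be reproduced as the dot product of normalized vectors; a sketch assuming the same model_text8.bin file and an arbitrary word pair:

import numpy as np
from pyfasttext import FastText

model = FastText('model_text8.bin')
va = model.get_numpy_vector('dog', normalized=True)
vb = model.get_numpy_vector('dogs', normalized=True)
print('cosine similarity:', float(np.dot(va, vb)))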
Example #10
    def __init__(self, doc_catgy, n_vocab, emb_dim, out_channels, filter_size, word2index, pre_trained_embedding, multi_label):
        self.in_channels = 1
        self.out_channels = out_channels
        self.row_dim = emb_dim
        self.hidden_dim = 512 ## fixed
        self.doc_catgy = doc_catgy
        self.n_classes = len(doc_catgy)
        self.n_vocab = n_vocab
        self.filter_size = filter_size
        self.word2index = word2index
        self.multi_label = multi_label
        self.le = None
        if self.multi_label == 1:
            self.le = MultiLabelBinarizer(classes=[i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])], sparse_output=False)
        elif self.multi_label == 0:
            self.le = LabelEncoder()
            self.le.fit([i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])])
        self.look_up_table = None
        self.pre_trained_embedding = pre_trained_embedding
        super(XMLCnn, self).__init__()
        self.to_gpu()
        if self.pre_trained_embedding is not None:
            model = FastText(self.pre_trained_embedding)
            dim = len(model['a'])
            n_vocab = len(self.word2index.keys())
            self.look_up_table = self.xp.zeros((n_vocab, dim), dtype=np.float32)
            for word, index in tqdm(self.word2index.items()):
                try:
                    self.look_up_table[index] = chainer.cuda.to_gpu(model.get_numpy_vector(word))
                except Exception:
                    # fall back to a small random vector if the embedding lookup fails
                    self.xp.random.seed(index)
                    self.look_up_table[index][:] = self.xp.random.uniform(-0.25, 0.25, dim)

        self.set_seed_random(123)
        with self.init_scope():
            if self.look_up_table is None:
                self.embedding=L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,initialW=linear_init)
            else:
                self.embedding=L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,initialW=self.look_up_table)
            self.conv1 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[0],self.row_dim), stride=2,initialW=linear_init)
            self.conv2 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[1],self.row_dim), stride=2,initialW=linear_init)
            self.conv3 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[2],self.row_dim), stride=2,initialW=linear_init)
            self.l1=L.Linear(in_size = None, out_size = self.hidden_dim, initialW=linear_init)
            self.l2=L.Linear(in_size = self.hidden_dim, out_size = self.n_classes,initialW=linear_init)
        self.to_gpu()    
Example #11
class SVM:
    def __init__(
            self,
            TFIDF):  # TODO: pass the model in or use a global singleton
        self.model = FastText('/home/yee0/Atos/wiki.en.bin')
        self.TFIDF = TFIDF
        self.tfidf_dict, self.words = TFIDF.get_tfidf()

    def set_tfidf(self, TFIDF):
        self.TFIDF = TFIDF
        self.tfidf_dict, self.words = TFIDF.get_tfidf()

    def to_feature(self, X):
        return [self.vec(self.TFIDF.predict_field(post), post) for post in X]

    def vec(self, field_index, post):
        v = np.zeros(300)
        post = set(post)  # make unique
        for pl in post:
            if pl != '' and pl in self.words:
                v += self.model.get_numpy_vector(
                    pl) * self.tfidf_dict[field_index][pl]
        return v

    def train(self, X, y):
        # build the SVC model
        self.svc = svm.SVC()
        self.svc.fit(self.to_feature(X), y)

    def predict(self, post):
        return self.svc.predict(self.to_feature([post]))

    def save_model(self):
        with open('steevebot/save/svm.pickle', 'wb') as f:
            pickle.dump(self.svc, f)

    def restore_model(self):
        with open('steevebot/save/svm.pickle', 'rb') as f:
            self.svc = pickle.load(f)


Example #12
class FastTextWrapper(EmbeddingWrapper):
    """
    Contains the pyfasttext FastText object, the name of the file from which the embeddings were loaded, as well as its md5.
    """
    def __init__(self, fasttext_path):
        super(FastTextWrapper, self).__init__(fasttext_path)
        self._fasttext = FastText(fasttext_path)
        self._word_output_len = self.get_numpy_vector("checklen").shape[0]

    def __len__(self):
        return self._word_output_len

    def get_numpy_vector(self, word):
        return self._fasttext.get_numpy_vector(word)

    def emb_obj(self):
        """
        Returns FastText object
        """
        return self._fasttext
Example #13
class FastTextEmbedding(Embedding):
    def __init__(self, binfile, normalize=False):
        self.file = binfile
        self.vdim = -1
        self.normalize = normalize

    def load(self):
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def wordForVec(self, v):
        word, sim = self.ftmodel.words_for_vector(v)[0]
        return word, sim

    def nearest_neighbors(self, word, n=200):
        return self.ftmodel.nearest_neighbors(word, n)

    def nearest_neighbors_by_vector(self, v, n=200):
        return self.ftmodel.words_for_vector(v, n)

    def containsWord(self, word, explicit=False):
        if explicit:
            return word in self.vocabulary()
        return True

    def vocabulary(self):
        return self.ftmodel.words

    def dim(self):
        return self.vdim
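A minimal usage sketch, assuming a pyfasttext .bin model on disk (the path is a placeholder):

from pyfasttext import FastText

emb = FastTextEmbedding('embeddings/wiki.en.bin', normalize=True).load()
vec = emb.getVector('house')
print(emb.dim(), vec.shape)
print(emb.wordForVec(vec))                 # nearest word and its similarity
print(emb.nearest_neighbors('house', n=5))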
Example #14
def main():
    texts_tl, texts_en = data.parse_corpora('corpus')
    word_index_tl, word_index_en, encoder_input_data, decoder_input_data, decoder_target_data = data.preprocess(
        texts_en, texts_tl)

    embedding_dim = 300
    latent_dim = 512
    model, encoder_model, decoder_model = create_models(
        embedding_dim, latent_dim, embedding_dim)
    model.load_weights(sys.argv[1])

    indexes = np.random.randint(0, len(texts_tl), 100)

    ft_model = FastText(os.path.join('embeddings', 'wiki.en.bin'))
    start_seq = ft_model.get_numpy_vector(data.SOS,
                                          normalized=True).reshape(1, 1, -1)

    embedding_weights = np.load('embedding-weights.npz')
    e_tl = embedding_weights['tl'].astype('float32')

    for seq_index in indexes:
        # Take one sequence (part of the training set)
        # for trying out decoding.
        sentence = texts_tl[seq_index]
        input_seq = encoder_input_data[seq_index]
        input_seq = np.take(e_tl, input_seq, axis=0).reshape(1, -1, 300)
        print(input_seq)
        #input_seq = sentence.split()[1:-1]
        #print(input_seq)
        #input_seq = np.stack(list(ft_model.get_numpy_vector(i, normalized=True) for i in input_seq)).reshape(1, -1, 300)
        #input_seq = np.stack(list(map(ft_model.get_numpy_vector, input_seq))).reshape(1, -1, 300)
        #print(input_seq)
        #print(input_seq.shape)
        decoded_sentence = decode_sequence(input_seq, encoder_model,
                                           decoder_model, ft_model, start_seq)
        print('-')
        print('Input sentence:', texts_tl[seq_index])
        print('Decoded sentence:', decoded_sentence)
Example #15
from pyfasttext import FastText

ft = FastText('model.bin')
print(ft.get_numpy_vector(u'you'))
Example #16
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'deep', 'president', 'self', 'insult', 'general', 'inclined',
        'property', 'international', 'many', 'imprisoned', 'branches',
        'communist', 'france', 'strict', 'earthly', 'zero', 'feminism',
        'ideas', 'theory', 'writings'
    ]

    for target_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(target_word)
        print('Target word:', target_word)
        print('Embedding shape:', target_word_embedding.shape)
        print('Embedding:', target_word_embedding[0:15], '...')

        # find closest words
        closest_words = model.nearest_neighbors(target_word, k=15)
        closest_word_embeddings = []
        for word, similarity in closest_words:
            print('Word:', word, 'similarity:', similarity)
            closest_word_embeddings.append(model.get_numpy_vector(word))

        # cluster the 15 nearest neighbours into 3 groups
        kmeans = cluster.KMeans(n_clusters=3)
        kmeans.fit(closest_word_embeddings)
        labels = kmeans.labels_
        print('Cluster id labels for inputted data')
        print(labels)

        clusters = [[], [], []]
        for i in range(15):
            clusters[labels[i]].append(closest_words[i][0])

        print("cluster #1 : ", clusters[0])
        print("cluster #2 : ", clusters[1])
        print("cluster #3 : ", clusters[2])
Example #17
class NB_Implement:
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText(
            "../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        print(
            f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
        )  # self.vector_size:200, self.vocab_size: 925242

        # Sentence representation: {"avg": average of word vectors, "fasttext": get_numpy_sentence_vector, "matrix": matrix}
        self.sentence_vec_type = "avg"

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in [
            "avg", "matrix", "fasttext"
        ], "sentence_vec_type must be in ['avg', 'fasttext', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def gen_sentence_vec(self, sentence):
        """
        :param sentence: whitespace-tokenized sentence
        :return: sentence vector (shape depends on self.sentence_vec_type)
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = [word for word in sentence.split(" ")]
        word_len = len(word_list)
        if self.sentence_vec_type == "matrix":
            sentence_matrix = np.empty(word_len, dtype=list)
            for idx, word in enumerate(word_list):
                sentence_matrix[idx] = self.model.get_numpy_vector(word)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarry>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        """
        Build the training and validation data.
        """
        X_train = list()
        y_train = list()
        for line in open("../data/input/training_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_train.append(sent_vector)
            y_train.append(int(line[0]))

        X_val = list()
        y_val = list()
        for line in open("../data/input/validation_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_val.append(sent_vector)
            y_val.append(int(line[0]))

        return np.array(X_train), np.array(y_train), np.array(X_val), np.array(
            y_val),

    def train_bayes(self, X_train, y_train):
        """
        Train a Naive Bayes classifier.
        """
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()
        model.fit(X_train, y_train)
        joblib.dump(model, "../data/output/models/bayes_model")

    def evaluate_bayes(self, model_path, X_val, y_val):
        """
        Evaluate the Naive Bayes classifier on the validation set.
        """
        model = joblib.load(model_path)
        y_val = list(y_val)
        correct = 0
        """
        y_predict = list()
        for sent_vec in X_val:  # sent_vec.shape: (self.vector_size,)
            predicted = model.predict(sent_vec.reshape(1, -1))  # sent_vec.reshape(1, -1).shape: (1, self.vector_size)
            y_predict.append(predicted[0])
        """
        y_predict = model.predict(X_val)
        print(f"len(y_predict): {len(y_predict)}, len(y_val): {len(y_val)}")
        assert len(y_predict) == len(
            y_val
        ), "Unexpected Error: len(y_predict) != len(y_val), but it should be"
        for idx in range(len(y_predict)):
            if int(y_predict[idx]) == int(y_val[idx]):
                correct += 1
        score = correct / len(y_predict)
        print(f"Bayes Classification Accuray:{score}")
        return score

    def predict_bayes(self, model_path):
        """
        Quick test on real-world examples.
        """
        model = joblib.load(model_path)
        sentence = "这件 衣服 真的 太 好看 了 ! 好想 买 啊 "
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: 负向

        sentence = "这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了"
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: 负向
Example #18
        # Update states
        states_value = [h, c]

    return decoded_sentence


#indexes = np.random.randint(0, len(input_texts), 100)
indexes = np.random.randint(0, len(valid_input_texts), 100)

for seq_index in indexes:
    # Take one sequence (part of the training set)
    # for trying out decoding.
    #text = input_texts[seq_index]
    text = valid_input_texts[seq_index]
    words = text.split()
    encoder_input_data = np.zeros((1, max_encoder_seq_length, embedding_dims),
                                  dtype='float32')
    for t, word in enumerate(words):
        encoder_input_data[0,
                           t, :] = filModel.get_numpy_vector(word,
                                                             normalized=True)
        #print("decodeding",word)

    input_seq = encoder_input_data

    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', valid_input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
Example #19
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, embedding_dims))
        target_seq[0, 0, :] = engModel.get_numpy_vector(sampled_word, normalized=True)

        # Update states
        states_value = [h, c]

    return decoded_sentence

#indexes = np.random.randint(0, len(input_texts), 100)
indexes = np.random.randint(0, len(valid_input_texts), 100)

for seq_index in indexes:
    # Take one sequence (part of the training set)
    # for trying out decoding.
    #text = input_texts[seq_index]
    text = valid_input_texts[seq_index]
    words = text.split()
    encoder_input_data = np.zeros((1, max_encoder_seq_length, embedding_dims),dtype='float32')
    for t, word in enumerate(words):
        encoder_input_data[0,t,:] = filModel.get_numpy_vector(word, normalized=True)
        #print("decodeding",word)

    input_seq = encoder_input_data

    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', valid_input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
Example #20
class Preprocessing:
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText(
            "../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        # self.vector_size:200, self.vocab_size: 925242
        print(
            f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
        )

        # Sentence representation:
        # {"avg": average of word vectors, "fasttext": get_numpy_sentence_vector, "concatenate": concatenated and padded vectors, "matrix": matrix}
        self.sentence_vec_type = "matrix"

        self.MAX_SENT_LEN = 70  # DEBUG: hyperparameter; see self.get_sent_max_length()
        # For "concatenate": self.MAX_SENT_LEN = 30; accuracy with other values: 100: 50.22%, 80: 50.23%, 70: 50.33%, 60: 55.92%, 50: 69.11%, 40: 68.91%, 36: 69.34%, 30: 69.22%, 20: 69.17%, 10: 67.07%
        # For "matrix": self.MAX_SENT_LEN = 70; results with other values: TODO:

    @classmethod
    def data_analysis(cls):
        train_df = pd.read_csv("../data/input/training_set.txt",
                               sep="\t",
                               header=None,
                               names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt",
                             sep="\t",
                             header=None,
                             names=["label", "sentence"])
        y_train = train_df["label"]
        y_val = val_df["label"]
        sns.set(style="white", context="notebook", palette="deep")
        # Inspect the sample distribution (whether each label is evenly represented)
        sns.countplot(y_train)
        plt.show()
        sns.countplot(y_val)
        plt.show()
        print(y_train.value_counts())
        print(y_val.value_counts())

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in ["avg", "concatenate", "fasttext", "matrix"], \
            "sentence_vec_type must be in ['avg', 'fasttext', 'concatenate', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def get_sent_max_length(self):  # NOT_USED
        sent_len_counter = Counter()
        max_length = 0
        with open("../data/input/training_set.txt") as f:
            for line in f:
                content = line.strip().split("\t")[1]
                content_list = content.split()
                length = len(content_list)
                sent_len_counter[length] += 1
                if max_length <= length:
                    max_length = length
        sent_len_counter = sorted(list(sent_len_counter.items()),
                                  key=lambda x: x[0])
        print(sent_len_counter)
        # [(31, 1145), (32, 1105), (33, 1017), (34, 938), (35, 839), (36, 830), (37, 775), (38, 737), (39, 720), (40, 643), (41, 575), (42, 584), (43, 517), (44, 547), (45, 514), (46, 514), (47, 480), (48, 460), (49, 470), (50, 444), (51, 484), (52, 432), (53, 462), (54, 495), (55, 487), (56, 500), (57, 496), (58, 489), (59, 419), (60, 387), (61, 348), (62, 265), (63, 222), (64, 153), (65, 127), (66, 103), (67, 67), (68, 34), (69, 21), (70, 22), (71, 8), (72, 6), (73, 4), (74, 10), (75, 2), (76, 4), (77, 2), (78, 1), (79, 2), (80, 4), (81, 2), (82, 3), (83, 1), (84, 5), (86, 4), (87, 3), (88, 3), (89, 2), (90, 2), (91, 3), (92, 5), (93, 2), (94, 4), (96, 1), (97, 5), (98, 1), (99, 2), (100, 2), (101, 2), (102, 1), (103, 2), (104, 2), (105, 2), (106, 5), (107, 3), (108, 2), (109, 3), (110, 4), (111, 1), (112, 2), (113, 3), (114, 1), (116, 1), (119, 3), (679, 1)]
        return max_length

    def gen_sentence_vec(self, sentence):
        """
        :param sentence: whitespace-tokenized sentence
        :return: sentence vector (shape depends on self.sentence_vec_type)
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = sentence.split(" ")
        if self.sentence_vec_type == "concatenate":
            sentence_vector = self.model.get_numpy_vector(word_list[0])
            for word in word_list[1:]:
                sentence_vector = np.hstack(
                    (sentence_vector, self.model.get_numpy_vector(word)))
            return sentence_vector  # NOTE: for "concatenate", sentence_vector has a different length for each sentence
        if self.sentence_vec_type == "matrix":  # for Deep Learning.
            sentence_matrix = []
            # NOTE: keeping the tail of the sentence seems to work better
            # (cf. https://github.com/lxw0109/SentimentClassification_UMICH_SI650/blob/master/src/LSTM_wo_pretrained_vector.py#L86)
            for word in word_list[-self.MAX_SENT_LEN:]:
                sentence_matrix.append(self.model.get_numpy_vector(word))
            length = len(sentence_matrix)
            # always holds because of the slicing above
            assert length <= self.MAX_SENT_LEN, "CRITICAL ERROR: len(sentence_matrix) > self.MAX_SENT_LEN."
            # the input is a list of ndarrays; np.pad returns an ndarray of ndarrays
            sentence_matrix = np.pad(sentence_matrix,
                                     pad_width=((0,
                                                 self.MAX_SENT_LEN - length),
                                                (0, 0)),
                                     mode="constant",
                                     constant_values=-1)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarray>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        # Build the training & validation data
        train_df = pd.read_csv("../data/input/training_set.txt",
                               sep="\t",
                               header=None,
                               names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt",
                             sep="\t",
                             header=None,
                             names=["label", "sentence"])
        # Shuffle the training set. TODO: without shuffling the trained model seems off
        # (the positive test sentence always gets predicted as 1?)
        train_df = train_df.sample(frac=1, random_state=1)
        # val_df = val_df.sample(frac=1, random_state=1)  # no need to shuffle the validation set

        X_train = train_df["sentence"]
        X_train_vec = list()
        for sentence in X_train:
            sent_vector = self.gen_sentence_vec(sentence)
            X_train_vec.append(sent_vector)
        y_train = train_df["label"]  # <Series>

        X_val = val_df["sentence"]
        X_val_vec = list()
        for sentence in X_val:
            sent_vector = self.gen_sentence_vec(sentence)
            X_val_vec.append(sent_vector)
        y_val = val_df["label"]  # <Series>

        if self.sentence_vec_type == "concatenate":
            # NOTE: dtype is required here; otherwise it defaults to "int32" and all word-vector values are truncated to 0
            X_train_vec = sequence.pad_sequences(X_train_vec,
                                                 maxlen=self.MAX_SENT_LEN *
                                                 self.vector_size,
                                                 value=0,
                                                 dtype=float)
            X_val_vec = sequence.pad_sequences(X_val_vec,
                                               maxlen=self.MAX_SENT_LEN *
                                               self.vector_size,
                                               value=0,
                                               dtype=float)

        return np.array(X_train_vec), np.array(X_val_vec), np.array(
            y_train), np.array(y_val)
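A minimal sketch of using the preprocessing class above; the data and model paths are the ones hard-coded in the class and are assumed to exist:

prep = Preprocessing()
prep.set_sent_vec_type("avg")
X_train_vec, X_val_vec, y_train, y_val = prep.gen_train_val_data()
print(X_train_vec.shape, X_val_vec.shape, y_train.shape, y_val.shape)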
Example #21
def main():
    #enc_word2vec = FastText('wiki.tl/wiki.tl.bin')
    #dec_word2vec = FastText('wiki.en/wiki.en.bin')

    dec_word2vec = FastText('wiki.en/wiki.en.bin')
    enc_word2vec = FastText('wiki.tl/wiki.tl.bin')
    #data_path = 'tgl-eng/tgl.txt'
    #test_path = 'valid_split'
    test_path = 'train_split'
    #data_path = 'health_shortened.tsv'
    eos = "eos"
    sos = "sos"
    #savemodel_filename = 's2s_fasttextloader_batch64_twodata.h5'
    #training parameters
    #batch_size = 64  # Batch size for training.
    batch_size = 64
    epochs = 500  # Number of epochs to train for.
    latent_dim = 512  # Latent dimensionality of the encoding space.
    word_vec_size = 300

    #chkpt_path="checkpoints/weights-improvement-twodata-{epoch:05d}.hdf5"
    #checkpoint = ModelCheckpoint(chkpt_path, verbose=1)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    #so far checkpoints/weights_cosine_proximity_combined2-00063.hdf5 seems to work
    weights_path = 'checkpoints/weights_cosine_proximity_combined2-00063.hdf5'
    model, encoder_model, decoder_model = build_model(word_vec_size,
                                                      latent_dim, weights_path)
    model.summary()

    # Compile & run training
    #model.compile(optimizer='rmsprop', loss='mean_squared_error')
    #model.compile(optimizer='rmsprop', loss='cosine_proximity')
    #model.compile(optimizer='rmsprop', loss='mean_squared_error')
    # Note that `decoder_target_data` needs to be one-hot encoded,
    # rather than sequences of integers like `decoder_input_data`!

    #num_sentence = 77990
    #steps_per_epoch = int(num_sentence//batch_size)

    decoder_model.summary()

    def decode_sequence(input_seq, sos, eos):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)

        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1, word_vec_size))
        # Populate the first character of target sequence with the start character.
        #target_seq[0, 0, target_dict[sos]] = 1.
        ''' create vector for sos '''
        target_seq[0, 0, :] = dec_word2vec.get_numpy_vector(sos,
                                                            normalized=True)

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] +
                                                        states_value)

            # Sample a token
            #sampled_token_index = np.argmax(output_tokens[0, -1, :])
            #sampled_word = target_rev_dict[sampled_token_index]
            sampled_word = dec_word2vec.words_for_vector(
                output_tokens[0, -1, :])[0][0]
            decoded_sentence += sampled_word + " "

            # Exit condition: either hit max length
            # or find stop character.
            # if sampled_word in [".", "?", "!"] or
            if (sampled_word == eos
                    or len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True
                if (decoded_sentence.endswith(eos + ' ')):
                    decoded_sentence = decoded_sentence[:-len(eos + ' ')]
            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1, word_vec_size))
            target_seq[0,
                       0, :] = dec_word2vec.get_numpy_vector(sampled_word,
                                                             normalized=True)

            # Update states
            states_value = [h, c]

        return decoded_sentence

    input_texts, target_texts = input2target(test_path, sos, eos)

    indexes = np.random.randint(0, len(input_texts), 50)
    #max_encoder_seq_length = max([len(words.split()) for words in input_texts])
    #max_decoder_seq_length = max([len(words.split()) for words in target_texts])

    max_encoder_seq_length = 130
    max_decoder_seq_length = 100
    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, word_vec_size),
        dtype='float32')
    '''
    for i, text, in enumerate(input_texts):
        words = text.split()
        #words.reverse()
        for t, word in enumerate(words):
            encoder_input_data[i, t, :] = enc_word2vec.get_numpy_vector(word, normalized=True)
    '''
    while True:
        input_sentence = input('Enter Filipino sentence: ')
        print('Input:', input_sentence)

        input_sentence = input_sentence.replace(",", " ,")
        input_sentence = input_sentence.replace(".", " .")
        input_sentence = input_sentence.replace("!", " !")
        input_sentence = input_sentence.replace("?", " ?")
        input_sentence = input_sentence.lower()
        input_words = input_sentence.split()
        # clear any embeddings left over from a previous, longer input sentence
        encoder_input_data[0, :, :] = 0.
        for t, word in enumerate(input_words):
            encoder_input_data[0, t, :] = enc_word2vec.get_numpy_vector(
                word, normalized=True)

        # decode only the first row, which holds the freshly embedded sentence
        input_seq = encoder_input_data[:1]
        decoded_sentence = decode_sequence(input_seq, sos, eos)
        print('-')
        print('Decoded sentence:', decoded_sentence)
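
The chained str.replace calls in the loop above just put whitespace around sentence punctuation so that str.split() yields the punctuation marks as separate tokens. An equivalent helper, sketched here with a single regular expression (not part of the original script):

import re

def tokenize_for_embedding(sentence):
    """Lower-case the input and space out . , ! ? so that split() returns them as tokens."""
    return re.sub(r"([.,!?])", r" \1", sentence.lower()).split()

# tokenize_for_embedding('Kamusta ka?') -> ['kamusta', 'ka', '?']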
Exemplo n.º 22
0
class Embeddings(object):
    def __init__(self,
                 name,
                 path='./embedding-registry.json',
                 lang='en',
                 extension='vec',
                 use_ELMo=False):
        self.name = name
        self.embed_size = 0
        self.static_embed_size = 0
        self.vocab_size = 0
        self.model = {}
        self.registry = self._load_embedding_registry(path)
        self.lang = lang
        self.extension = extension
        self.embedding_lmdb_path = None
        if self.registry is not None:
            self.embedding_lmdb_path = self.registry["embedding-lmdb-path"]
        self.env = None
        self.make_embeddings_simple(name)
        self.static_embed_size = self.embed_size
        self.bilm = None

        # below init for using ELMo embeddings
        self.use_ELMo = use_ELMo
        if use_ELMo:
            self.make_ELMo()
            self.embed_size = ELMo_embed_size + self.embed_size
            description = self._get_description('elmo-en')
            self.env_ELMo = None
            if description:
                self.embedding_ELMo_cache = os.path.join(
                    description["path-dump"], "cache")
                # clean possible remaining cache
                self.clean_ELMo_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                          map_size=map_size)

    def __getattr__(self, name):
        return getattr(self.model, name)

    def _load_embedding_registry(self, path='./embedding-registry.json'):
        """
        Load the description of available embeddings. Each description provides a name, 
        a file path (used only if necessary) and a embeddings type (to take into account
        small variation of format)
        """
        registry_json = open(path).read()
        return json.loads(registry_json)
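
    # An illustrative embedding-registry.json layout, inferred from the keys this
    # class reads; all paths below are placeholders and real registries may carry
    # additional fields:
    #
    # {
    #   "embedding-lmdb-path": "data/db",
    #   "embeddings": [
    #     {"name": "fasttext-crawl", "path": "/path/to/crawl-300d-2M.vec",
    #      "type": "fasttext", "format": "vec", "lang": "en"}
    #   ],
    #   "embeddings-contextualized": [
    #     {"name": "elmo-en", "lang": "en",
    #      "path-vocab": "/path/to/vocab.txt",
    #      "path-config": "/path/to/options.json",
    #      "path_weights": "/path/to/weights.hdf5",
    #      "path-dump": "/path/to/elmo"}
    #   ]
    # }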

    def make_embeddings_simple_in_memory(self,
                                         name="fasttext-crawl",
                                         hasHeader=True):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self._get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            embeddings_type = description["type"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = FastText(embeddings_path)
                nbWords = self.model.nwords
                self.embed_size = 300
            else:
                if embeddings_type == "glove":
                    hasHeader = False
                with open(embeddings_path) as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            if hasHeader:
                                # first line gives the nb of words and the embedding size
                                nbWords = int(line[0])
                                self.embed_size = int(line[1].replace(
                                    "\n", ""))
                                begin = False
                                continue
                            else:
                                begin = False
                        word = line[0]
                        #if embeddings_type == 'glove':
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
            print('embeddings loaded for', nbWords, "words and",
                  self.embed_size, "dimensions")

    '''
    def make_embeddings_fasttext_bin(self, name="wiki.en.bin"):
        nbWords = 0
        print('loading embeddings...')
        description = self._get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            print("path:", embeddings_path)

        self.model = load_fasttext_format(embeddings_path)
    '''

    def make_embeddings_lmdb(self, name="fasttext-crawl", hasHeader=True):
        nbWords = 0
        print(
            '\nCompiling embeddings... (this is only done once per embedding, at first launch)'
        )
        begin = True
        description = self._get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            embeddings_type = description["type"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if embeddings_type == "glove":
                hasHeader = False
            txn = self.env.begin(write=True)
            batch_size = 1024
            i = 0
            nb_lines = 0
            with open(embeddings_path) as f:
                for line in f:
                    nb_lines += 1

            with open(embeddings_path) as f:
                #for line in f:
                for line in tqdm(f, total=nb_lines):
                    line = line.split(' ')
                    if begin:
                        if hasHeader:
                            # first line gives the nb of words and the embedding size
                            nbWords = int(line[0])
                            self.embed_size = int(line[1].replace("\n", ""))
                            begin = False
                            continue
                        else:
                            begin = False
                    word = line[0]
                    #if embeddings_type == 'glove':
                    vector = np.array(
                        [float(val) for val in line[1:len(line)]],
                        dtype='float32')
                    #else:
                    #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                    if self.embed_size == 0:
                        self.embed_size = len(vector)

                    if len(word.encode(
                            encoding='UTF-8')) < self.env.max_key_size():
                        txn.put(word.encode(encoding='UTF-8'),
                                _serialize_pickle(vector))
                        #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector))
                        i += 1

                    # commit batch
                    if i % batch_size == 0:
                        txn.commit()
                        txn = self.env.begin(write=True)

            #if i % batch_size != 0:
            txn.commit()
            if nbWords == 0:
                nbWords = i
            self.vocab_size = nbWords
            print('embeddings loaded for', nbWords, "words and",
                  self.embed_size, "dimensions")

    def make_embeddings_simple(self, name="fasttext-crawl", hasHeader=True):
        description = self._get_description(name)
        if description is not None:
            self.extension = description["format"]

        if self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None":
            print(
                "embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory..."
            )
            self.make_embeddings_simple_in_memory(name, hasHeader)
        elif self.extension == "bin":
            print(
                "embedding is of format .bin, so it will be loaded in memory..."
            )
            self.make_embeddings_simple_in_memory(name, hasHeader)
        else:
            # check if the lmdb database exists
            envFilePath = os.path.join(self.embedding_lmdb_path, name)
            if os.path.isdir(envFilePath):
                description = self._get_description(name)
                if description is not None:
                    self.lang = description["lang"]

                # open the database in read mode
                self.env = lmdb.open(envFilePath,
                                     readonly=True,
                                     max_readers=2048,
                                     max_spare_txns=4)
                # we need to set self.embed_size and self.vocab_size
                with self.env.begin() as txn:
                    stats = txn.stat()
                    size = stats['entries']
                    self.vocab_size = size

                with self.env.begin() as txn:
                    cursor = txn.cursor()
                    for key, value in cursor:
                        vector = _deserialize_pickle(value)
                        self.embed_size = vector.shape[0]
                        break
                    cursor.close()

                # no idea why, but we need to close and reopen the environment to avoid
                # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
                # when opening new transaction !
                self.env.close()
                self.env = lmdb.open(envFilePath,
                                     readonly=True,
                                     max_readers=2048,
                                     max_spare_txns=2)
            else:
                # create and load the database in write mode
                self.env = lmdb.open(envFilePath, map_size=map_size)
                self.make_embeddings_lmdb(name, hasHeader)

    def make_ELMo(self):
        # Location of pretrained BiLM for the specified language
        # TBD check if ELMo language resources are present
        description = self._get_description('elmo-en')
        if description is not None:
            self.lang = description["lang"]
            vocab_file = description["path-vocab"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]

            print('init ELMo')

            # Create a Batcher to map text to character ids
            self.batcher = Batcher(vocab_file, 50)

            # Build the biLM graph.
            self.bilm = BidirectionalLanguageModel(options_file, weight_file)

            # Input placeholders to the biLM.
            self.character_ids = tf.placeholder('int32',
                                                shape=(None, None, 50))
            self.embeddings_op = self.bilm(self.character_ids)

            with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context
                self.elmo_input = weight_layers('input',
                                                self.embeddings_op,
                                                l2_coef=0.0)

    def dump_ELMo_token_embeddings(self, x_train):
        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings dump requested but embeddings object wrongly initialised"
            )
            return

        description = self._get_description('elmo-en')
        if description is not None:
            print("Building ELMo token dump")

            self.lang = description["lang"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]
            working_path = description["path-dump"]

            all_tokens = set(['<S>', '</S>'])
            for i in range(0, len(x_train)):
                # as it is training, it is already tokenized
                tokens = x_train[i]
                for token in tokens:
                    if token not in all_tokens:
                        all_tokens.add(token)

            vocab_file = os.path.join(working_path, 'vocab_small.txt')
            with open(vocab_file, 'w') as fout:
                fout.write('\n'.join(all_tokens))

            tf.reset_default_graph()
            token_embedding_file = os.path.join(working_path,
                                                'elmo_token_embeddings.hdf5')
            dump_token_embeddings(vocab_file, options_file, weight_file,
                                  token_embedding_file)
            tf.reset_default_graph()

            self.batcher_token_dump = TokenBatcher(vocab_file)

            self.bilm_token_dump = BidirectionalLanguageModel(
                options_file,
                weight_file,
                use_character_inputs=False,
                embedding_weight_file=token_embedding_file)

            self.token_ids = tf.placeholder('int32', shape=(None, None))
            self.embeddings_op_token_dump = self.bilm_token_dump(
                self.token_ids)
            """
            with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context 
                self.elmo_input_token_dump = weight_layers('input', self.embeddings_op_token_dump, l2_coef=0.0)
            """
            print("ELMo token dump completed")

    def get_sentence_vector_only_ELMo(self, token_list):
        """
            Return the ELMo embeddings only for a full sentence
        """

        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        # Create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]
        # check lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is not None:
            return elmo_result

        with tf.Session() as sess:
            # oddly, the CPU is faster than the GPU (1080Ti) for this step
            with tf.device("/cpu:0"):
                # It is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # Compute ELMo representations (2 times as a heavy warm-up)
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                #cache computation
                self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        return elmo_result

    def get_sentence_vector_with_ELMo(self, token_list):
        """
            Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings 
            for a full sentence
        """
        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return
        """
        # trick to extend the context for short sentences
        token_list_extended = token_list.copy()
        #print("token_list_extended before: ", token_list_extended)
        for i in range(0, len(token_list_extended)):
            local_list = token_list_extended[i]
            j = i
            while len(local_list) <= 5:
                #print(j, local_list)
                if j < len(token_list_extended)-1:
                    local_list = local_list + token_list_extended[j+1]
                else:
                    break
                j = j + 1
            token_list_extended[i] = local_list
        #print("token_list_extended after: ", token_list_extended)
        
        max_size_sentence = 0
        for i in range(0, len(token_list)):
            local_length = len(token_list[i])
            if local_length > max_size_sentence:
                max_size_sentence = local_length
        """

        # Create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]
        # check lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is None:
            with tf.Session() as sess:
                # oddly, the CPU is faster than the GPU (1080Ti) for this step
                with tf.device("/cpu:0"):
                    # It is necessary to initialize variables once before running inference
                    sess.run(tf.global_variables_initializer())

                    # Compute ELMo representations (2 times as a heavy warm-up)
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    #cache computation
                    self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        concatenated_result = np.zeros(
            (elmo_result.shape[0], max_size_sentence - 2, self.embed_size),
            dtype=np.float32)
        for i in range(0, elmo_result.shape[0]):
            for j in range(0, len(token_list[i])):
                #if is_int(token_list[i][j]) or is_float(token_list[i][j]):
                #    dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32)
                #    concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), )
                #else:
                concatenated_result[i][j] = np.concatenate(
                    (elmo_result[i][j], self.get_word_vector(
                        token_list[i][j])), )
        return concatenated_result

    def get_sentence_vector_ELMo_with_token_dump(self, token_list):
        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            # the reuse=True scope reuses weights from the whole context
            self.elmo_input_token_dump = weight_layers(
                'input', self.embeddings_op_token_dump, l2_coef=0.0)

        # Create batches of data
        local_token_ids = self.batcher_token_dump.batch_sentences(token_list)

        with tf.Session() as sess:
            # oddly, the CPU is faster than the GPU (1080Ti) for this step
            with tf.device("/cpu:0"):
                # It is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # Compute ELMo representations
                elmo_result = sess.run(
                    self.elmo_input_token_dump['weighted_op'],
                    feed_dict={self.token_ids: local_token_ids})
        return elmo_result

    def _get_description(self, name):
        for emb in self.registry["embeddings"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["embeddings-contextualized"]:
            if emb["name"] == name:
                return emb
        return None

    def get_word_vector(self, word):
        """
            Get static embeddings (e.g. glove) for a given token
        """
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.env is None:
            # db not available, the embeddings should be available in memory (normally!)
            return self.get_word_vector_in_memory(word)
        try:
            with self.env.begin() as txn:
                vector = txn.get(word.encode(encoding='UTF-8'))
                if vector:
                    word_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    word_vector = np.zeros((self.static_embed_size, ),
                                           dtype=np.float32)
                    # alternatively, initialize with random negative values
                    #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
                    # alternatively use fasttext OOV ngram possibilities (if ngram available)
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env.close()
            envFilePath = os.path.join(self.embedding_lmdb_path, self.name)
            self.env = lmdb.open(envFilePath,
                                 readonly=True,
                                 max_readers=2048,
                                 max_spare_txns=2,
                                 lock=False)
            return self.get_word_vector(word)
        return word_vector

    def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
            Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros(
                (len(token_list), max_size_sentence - 2, ELMo_embed_size),
                dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence - 2:
                            # squeeze the extra padding space
                            ELMo_vector[i] = local_embeddings[:max_size_sentence - 2, ]
                        elif local_embeddings.shape[0] == max_size_sentence - 2:
                            # exact fit, nothing to adjust
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros(
                                (max_size_sentence - (local_embeddings.shape[0] + 2),
                                 ELMo_embed_size),
                                dtype='float32')
                            ELMo_vector[i] = np.concatenate(
                                (local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                      readonly=True,
                                      max_readers=2048,
                                      max_spare_txns=2,
                                      lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
            Cache in LMDB the ELMo embeddings for a given sequence 
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'),
                    _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def clean_ELMo_cache(self):
        """
            Delete ELMo embeddings cache, this takes place normally after the completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        else:
            for file in os.listdir(self.embedding_ELMo_cache):
                file_path = os.path.join(self.embedding_ELMo_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_ELMo_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_numpy_vector(word)
        if word in self.model:
            return self.model[word]
        else:
            # for unknown word, we use a vector filled with 0.0
            return np.zeros((self.static_embed_size, ), dtype=np.float32)
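
A short usage sketch for the Embeddings class above; the registry name 'fasttext-crawl', the registry path and the probe token are placeholders, and a matching embedding-registry.json must already exist:

import os

if os.path.exists('./embedding-registry.json'):
    embeddings = Embeddings('fasttext-crawl', path='./embedding-registry.json')
    vec = embeddings.get_word_vector('university')
    # out-of-vocabulary tokens come back as a zero vector of length static_embed_size
    print(vec.shape)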
Exemplo n.º 23
0
    def __init__(self,
                 n_layers,
                 n_source_vocab,
                 n_units,
                 catgy,
                 doc_catgy,
                 senseid2netout,
                 word2index,
                 pre_trained_embedding,
                 model_type,
                 multi_label,
                 wsd_epoch=0,
                 h=8,
                 dropout=0.1,
                 max_length=500,
                 use_label_smoothing=False,
                 embed_position=False,
                 wsd_model=None):
        super(Transformer, self).__init__()
        self.to_gpu()
        self.set_random_seed(123)
        self.word2index = word2index
        self.pre_trained_embedding = pre_trained_embedding
        self.model_type = model_type
        self.wsd_model = wsd_model
        self.multi_label = multi_label

        with self.init_scope():
            if self.pre_trained_embedding is not None:
                model = FastText(self.pre_trained_embedding)
                dim = len(model['a'])
                n_vocab = len(self.word2index.keys())
                self.look_up_table = self.xp.zeros((n_vocab, dim),
                                                   dtype=np.float32)
                for word, index in self.word2index.items():
                    try:
                        self.look_up_table[index] = chainer.cuda.to_gpu(
                            model.get_numpy_vector(word))
                    except Exception:
                        # fall back to a small random vector for words the
                        # fastText model cannot embed
                        self.xp.random.seed(index)
                        self.look_up_table[index][:] = self.xp.random.uniform(
                            -0.25, 0.25, dim)
                self.embed_x = L.EmbedID(n_source_vocab,
                                         n_units,
                                         ignore_label=-1,
                                         initialW=self.look_up_table)
            else:
                self.embed_x = L.EmbedID(n_source_vocab,
                                         n_units,
                                         ignore_label=-1,
                                         initialW=linear_init)

            self.encoder = Encoder(n_layers, n_units, h, dropout)

            self.fc2 = L.Linear(in_size=n_units,
                                out_size=len(doc_catgy),
                                initialW=linear_init)
            self.fc2_wsd = L.Linear(in_size=n_units,
                                    out_size=len(catgy),
                                    initialW=linear_init)
            self.lookup_table_sense = L.EmbedID(in_size=len(catgy),
                                                out_size=n_units,
                                                ignore_label=-1,
                                                initialW=linear_init)
            self.lookup_table_sense_fixed = self.lookup_table_sense.W.data
            self.senseid2netout = senseid2netout
            self.senseid2netout['<PAD>'] = [-1]

            self.wsd_epoch = wsd_epoch
            if embed_position:
                self.embed_pos = L.EmbedID(max_length,
                                           n_units,
                                           ignore_label=-1)

        self.n_layers = n_layers
        self.n_units = n_units
        self.dropout = dropout
        self.use_label_smoothing = use_label_smoothing
        self.initialize_position_encoding(max_length, n_units)
        self.scale_emb = self.n_units**0.5  ## origin 0.5
        self.doc_catgy = doc_catgy
        self.catgy = catgy
        self.inverse_catgy = {v: k for k, v in self.catgy.items()}

        self.wsd_netout2wordindex = {
            k: self.word2index[v]
            for k, v in self.inverse_catgy.items()
        }
        self.wsd_netout2wordindex[-1] = -1
        self.max_len = max_length
        self.le = None
        if self.multi_label == 1:
            self.le = MultiLabelBinarizer(classes=[
                i[0]
                for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])
            ],
                                          sparse_output=False)
        elif self.multi_label == 0:
            self.le = LabelEncoder()
            self.le.fit([
                i[0]
                for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])
            ])
        self.to_gpu()
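
The embedding initialisation inside init_scope above copies pre-trained fastText vectors into a lookup table and falls back to small random vectors for words the model cannot embed. A CPU-only sketch of that step in isolation (the .bin path and the vocabulary are placeholders, not values taken from the class above):

import numpy as np
from pyfasttext import FastText

def build_lookup_table(bin_path, word2index, seed_base=0):
    """Build an initialW matrix: pre-trained vectors where available,
    small uniform random vectors otherwise."""
    model = FastText(bin_path)
    dim = len(model['a'])  # infer the embedding dimension from any vector
    table = np.zeros((len(word2index), dim), dtype=np.float32)
    for word, index in word2index.items():
        try:
            table[index] = model.get_numpy_vector(word)
        except Exception:
            rng = np.random.RandomState(seed_base + index)
            table[index] = rng.uniform(-0.25, 0.25, dim)
    return table

# e.g. build_lookup_table('wiki.en.bin', {'<PAD>': 0, 'bank': 1, 'loan': 2})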
Exemplo n.º 24
0
# Load an existing model
# model = FastText()
# model.load_model('./path/to/model.bin')

# Train with the skip-gram model
skip_gram_model = FastText()
skip_gram_model.skipgram(input='./train.txt',
                         output='skip_gram_model',
                         epoch=100,
                         lr=0.7)
print(skip_gram_model['贷款'])
# print(skip_gram_model.get_numpy_vector('贷款'))
# print(skip_gram_model.get_numpy_vector('贷款', normalized=True))

var1 = skip_gram_model.get_numpy_vector('人民币')
var2 = skip_gram_model.get_numpy_vector('贷款')
var3 = skip_gram_model.get_numpy_vector('外币')
print(skip_gram_model.words_for_vector(var1 + var2 - var3, k=1))

# for word in skip_gram_model.words:
#    print(word, skip_gram_model[word])

print(skip_gram_model.nearest_neighbors('贷款', k=2))

# for a supervised (classifier) model whose test data is stored in a file, use:
# skip_gram_model.predict_proba_file('./test.txt', k=2)

print("\n")

##################
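
The skip-gram run above can be mirrored with the CBOW objective, which pyfasttext exposes through the same style of call; this is a sketch reusing the same placeholder corpus './train.txt':

from pyfasttext import FastText

cbow_model = FastText()
cbow_model.cbow(input='./train.txt', output='cbow_model', epoch=100, lr=0.7)
print(cbow_model.nearest_neighbors('贷款', k=2))
# Skip-gram predicts the surrounding context from the centre word, while CBOW
# predicts the centre word from its context; the training API is otherwise the same.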