Example #1
def encode_labels(data, labenc=None, max_len=50):
    if labenc is None:
        labenc = labelencoder
    if labenc is None:
        print("Error: labelencoder must be trained before it can be used!")
        return None
    data2 = []

    num_labels = len(labenc.classes_)
    zero_vec = data_util.zero_vec(num_labels)
    print "data: " + str(len(data))
    for item in data:
        #print "item len: " + str(len(item))
        new_item = []
        if len(item) > 0:
            item2 = labenc.transform(item)
            for lab in item2:
                onehot = [0] * num_labels
                onehot[lab] = 1
                new_item.append(onehot)
        # Pad vectors
        if len(new_item) > max_len:
            new_item = new_item[0:max_len]
        while len(new_item) < max_len:
            new_item.append(zero_vec)
        data2.append(new_item)
    return data2
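
# Usage sketch for encode_labels (hedged: `demo_encoder` below is a stand-in
# for the module-level `labelencoder`, and data_util.zero_vec is assumed to
# return a plain list of zeros).
from sklearn.preprocessing import LabelEncoder

demo_encoder = LabelEncoder().fit(['B-SYM', 'I-SYM', 'O'])
demo = encode_labels([['O', 'B-SYM']], labenc=demo_encoder, max_len=4)
print(demo[0][0])  # classes_ are sorted, so 'O' is index 2 -> [0, 0, 1]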
Example #2
def decode_sequence(encoder_model,
                    decoder_model,
                    input_seq,
                    output_seq_len,
                    output_dim,
                    vec_labels=False):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, batch_size=1)

    # Generate empty target sequence of length 1.
    target_seq = numpy.zeros((1, 1, int(output_dim)))
    # Populate the first character of target sequence with the start character.
    zero_lab = data_util.zero_vec(output_dim)
    if vec_labels:
        target_seq[0, 0] = zero_lab
    else:
        zero_lab = encode_labels([['O']])[0][0]
        index = zero_lab.index(1)
        target_seq[0, 0, index] = 1

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] +
                                                    states_value)
        # Sample a token
        token = output_tokens[0, -1]
        #print "token: " + str(token)
        encoded_label = numpy.zeros((output_dim,), dtype=int).tolist()
        if vec_labels:
            # Append the predicted vector itself (not the all-zero placeholder)
            decoded_sentence.append(token.tolist())
        else:
            ind = numpy.argmax(token)
            encoded_label[ind] = 1
            #print "encoded_label: " + str(encoded_label)
            sampled_lab = decode_labels([encoded_label])[0]
            print "sampled_lab: " + str(sampled_lab)
            decoded_sentence.append(sampled_lab)

        # Exit condition: either hit max length or find stop character.
        if len(decoded_sentence) > output_seq_len:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = numpy.zeros((1, 1, output_dim))
        target_seq[0, 0, :] = token

        # Update states
        states_value = [h, c]

    return decoded_sentence
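
# Usage sketch (hedged): decode_sequence expects the inference models built by
# train_seq2seq later in this file, e.g.
#   model, enc_model, dec_model, out_dim = train_seq2seq(trainx, trainy)
#   decoded = decode_sequence(enc_model, dec_model, trainx[0:1],
#                             output_seq_len=trainy.shape[1], output_dim=out_dim)
# A runnable end-to-end version appears after train_seq2seq below.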
Example #3
def get_feats(seqs, train=False):
    print "get_feats"
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set([])
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)
    if train:
        num_labels = len(list(label_set))
        create_labelencoder(list(label_set), num_labels)
        global max_seq_len
        #max_seq_len = max([len(txt) for txt in feats])
    print "max_seq_len: " + str(max_seq_len)

    # Pad sequences
    #feats = pad_sequences(numpy.array(feats), maxlen=max_seq_len, dtype='float32', padding="pre")
    #labels = pad_sequences(numpy.array(labels), maxlen=max_seq_len, dtype='str', padding="pre", value='O')

    padded_feats = []
    padded_labels = []
    for feat in feats:
        #print "seq len: " + str(len(feat))
        while len(feat) > max_seq_len:
            feat_part = feat[0:max_seq_len]
            padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
            feat = feat[max_seq_len:]
        new_feat = pad_feat(feat, max_seq_len, zero_vec)
        padded_feats.append(new_feat)
    for labs in labels:
        while len(labs) > max_seq_len:
            labs_part = labs[0:max_seq_len]
            padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
            labs = labs[max_seq_len:]
        padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
    feats = padded_feats
    labels = padded_labels

    # Encode labels
    encoded_labels = encode_labels(labels, max_len=max_seq_len)
    print "labels[0]: " + str(encoded_labels[0])
    print "feats: " + str(len(feats)) + " labels: " + str(len(encoded_labels))
    return feats, encoded_labels
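
# Usage sketch (hedged: `vecfile` and `max_seq_len` are module globals set
# elsewhere; each sequence is a list of (word, label) pairs).
#   train_seqs = [[('fever', 'B-SYM'), ('and', 'O'), ('cough', 'B-SYM')]]
#   feats, enc_labels = get_feats(train_seqs, train=True)
#   # feats: word-vector sequences padded to max_seq_len;
#   # enc_labels: matching one-hot label sequences from encode_labels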
Example #4
def vectorize(phrase, vec_model, dim):
    stopwords = ['and', 'or', 'with', 'in', 'of', 'at', 'had', 'ho']
    words = phrase.split(' ')
    vecs = []
    zero_vec = numpy.asarray(data_util.zero_vec(dim))
    for word in words:
        if word not in stopwords:
            vecs.append(word2vec.get(word, vec_model))
    # Average vectors
    if len(vecs) > 0:
        avg_vec = numpy.average(numpy.asarray(vecs), axis=0)
        return avg_vec
    else:
        return zero_vec
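
# Usage sketch (hedged: word2vec.load is the same project loader used in
# get_feats above).
#   vec_model, dim = word2vec.load(vecfile)
#   avg = vectorize('fever and cough', vec_model, dim)  # mean of non-stopword vectors
#   print(avg.shape)  # (dim,)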
Example #5
def read_cluster_file(clusterfile, word2vec, dim, cluster_names=None):
    train = False
    if cluster_names is None:
        cluster_names = set()
        cluster_names.add(0)
        train = True
    keywords = []
    kw_vecs = []
    kw_clusters = []

    zero_vec = numpy.array(data_util.zero_vec(dim))

    with open(clusterfile, 'r') as f:
        for line in f:
            cols = line.split(',')
            kw = cols[0]
            clust = int(cols[1].strip())
            # Look up keyword in word2vec
            vec = zero_vec
            for word in kw.split(' '):
                vec2 = zero_vec
                # Stopword filtering is disabled here; sum vectors for all words
                if word in word2vec:
                    vec2 = numpy.array(word2vec[word])
                vec = vec + vec2
            keywords.append(kw)
            kw_vecs.append(vec)
            kw_clusters.append(clust)
            if train:
                cluster_names.add(clust)

    # Convert cluster names to numbers 0 to num_clusters
    if train:
        cluster_names = list(cluster_names)
    for x in range(len(kw_clusters)):
        val = kw_clusters[x]
        if val in cluster_names:
            kw_clusters[x] = cluster_names.index(val)
        else:
            kw_clusters[x] = 0

    return keywords, kw_clusters, kw_vecs, cluster_names
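
# Usage sketch (hedged: `word2vec` here is a dict-like word -> vector mapping,
# e.g. from data_util.load_word2vec, and the file holds `keyword,cluster` lines).
#   w2v, dim = data_util.load_word2vec(vecfile)
#   kws, clusts, vecs, names = read_cluster_file('clusters.csv', w2v, dim)
#   # Pass `names` back in on later calls to reuse the train-time cluster ids.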
Example #6
def get(word, model):
    dim = model.vector_size
    if word in model:  # for a full gensim model, check model.wv instead
        return list(model[word])
    else:
        return data_util.zero_vec(dim)
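
# Usage sketch (hedged: assumes a gensim-style model with .vector_size and
# dict-like lookup, e.g. gensim KeyedVectors).
#   from gensim.models import KeyedVectors
#   kv = KeyedVectors.load_word2vec_format(vecfile, binary=True)
#   vec = get('fever', kv)  # list of floats, or a zero vector for OOV words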
Example #7
def train_seq2seq(trainx,
                  trainy,
                  num_nodes=100,
                  vec_labels=False,
                  loss_function="cosine_proximity",
                  num_epochs=10):
    trainx = numpy.array(trainx)
    print "trainx shape: " + str(trainx.shape)
    trainy = numpy.array(trainy)
    print "trainy shape: " + str(trainy.shape)
    input_dim = trainx.shape[-1]
    output_dim = trainy.shape[-1]
    input_seq_len = trainx.shape[1]
    output_seq_len = trainy.shape[1]

    # Create decoder target data
    trainy_target = []
    zero_lab = data_util.zero_vec(output_dim)
    if not vec_labels:
        zero_lab = encode_labels([['O']])[0][0]
    print "zero_lab shape: " + str(numpy.asarray(zero_lab))
    for i in range(trainy.shape[0]):
        row = trainy[i].tolist()
        new_row = row[1:]
        new_row.append(zero_lab)
        trainy_target.append(new_row)
    trainy_target = numpy.asarray(trainy_target)

    print "trainy_target shape: " + str(trainy_target.shape)

    # Set up the encoder
    latent_dim = num_nodes
    dropout = 0.1
    encoder_inputs = Input(shape=(None, input_dim))  # None = variable seq_len
    encoder = LSTM(latent_dim, return_state=True)

    # Encoder-Decoder model
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, output_dim))
    decoder_rnn = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, d_state_h, d_state_c = decoder_rnn(
        decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(output_dim, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss=loss_function)
    model.fit([trainx, trainy], trainy_target, epochs=num_epochs)

    # Normal RNN
    #rnn_out = GRU(latent_dim, return_sequences=False)(encoder_inputs)
    #dropout_out = Dropout(dropout)(rnn_out)
    #prediction = Dense(output_dim, activation='softmax')(dropout_out)
    #model = Model(inputs=encoder_inputs, outputs=prediction)
    #model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    #model.fit(trainx, trainy, nb_epoch=20)

    model.summary()
    model.save('seq2seq.model')

    # Create models for inference
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim, ))
    decoder_state_input_c = Input(shape=(latent_dim, ))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_rnn(
        decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                          [decoder_outputs] + decoder_states)

    return model, encoder_model, decoder_model, output_dim
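
# End-to-end sketch on tiny random data (hedged: exercises the training and
# inference paths only; real inputs come from get_feats). vec_labels=True
# sidesteps the module-level labelencoder, and categorical_crossentropy
# avoids the legacy cosine_proximity loss name on newer Keras versions.
if __name__ == '__main__':
    demo_x = numpy.random.rand(4, 6, 10)  # 4 sequences, length 6, feature dim 10
    demo_y = numpy.eye(3)[numpy.random.randint(3, size=(4, 6))]  # one-hot labels
    _, enc_model, dec_model, out_dim = train_seq2seq(
        demo_x, demo_y, num_nodes=8, vec_labels=True,
        loss_function='categorical_crossentropy', num_epochs=1)
    decoded = decode_sequence(enc_model, dec_model, demo_x[0:1],
                              output_seq_len=6, output_dim=out_dim,
                              vec_labels=True)
    print(str(len(decoded)) + " vectors decoded")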
Example #8
def cluster_embeddings(labels,
                       clusterfile,
                       vecfile,
                       return_phrases=False,
                       max_length=10):
    print "getting cluster embeddings"
    word2vec, dim = data_util.load_word2vec(vecfile)
    cluster_map = {}
    cluster_names = {}
    cluster_embeddings = []
    with open(clusterfile, 'r') as f:
        for line in f.readlines():
            phrases = line.strip().strip(',').split(',')
            key = int(phrases[0])
            phrases = phrases[1:]
            cluster_map[key] = phrases

    # Get cluster centers
    cluster_centers = {}
    cluster_vecs = {}
    center_file = clusterfile + ".centers"
    calculate_centers = True
    if os.path.exists(center_file):
        cluster_centers = get_cluster_centers(clusterfile)
        calculate_centers = False
    # Get the vectors for each phrase in the clusters
    for num in cluster_map.keys():
        #print "cluster " + str(num)
        vecs = []
        phrases = cluster_map[num]
        for phrase in phrases:
            words = phrase.split(' ')
            word_vecs = []
            for word in words:
                vec = data_util.zero_vec(dim)
                if word in word2vec:
                    vec = word2vec[word]
                word_vecs.append(vec)
            if len(word_vecs) == 0:
                #print "ZERO VEC: " + phrase
                phrase_vec = data_util.zero_vec(dim)
            else:
                phrase_vec = numpy.average(numpy.asarray(word_vecs), axis=0)
            vecs.append(phrase_vec)
        cluster_vecs[num] = vecs
        if calculate_centers:
            cluster_vec = numpy.average(numpy.asarray(vecs), axis=0)
            #print "cluster " + str(num) + " vec shape: " + str(cluster_vec.shape)
            cluster_centers[num] = cluster_vec

    # Get closest phrase
    if return_phrases:
        for num in cluster_map.keys():
            cluster_vec = cluster_centers[num]
            phrases = cluster_map[num]
            vecs = cluster_vecs[num]
            #print 'phrases: ' + str(len(phrases)) + ', vecs: ' + str(len(vecs))
            best_vec = data_util.zero_vec(dim)
            best_phrase = ""
            best_dist = float('inf')
            for x in range(len(phrases)):
                phrase = phrases[x]
                phrase_len = len(phrase.split(' '))
                phrase_vec = vecs[x]
                dist_temp = numpy.linalg.norm(phrase_vec - cluster_vec)
                # Length penalty
                dist = dist_temp * phrase_len
                #print "phrase: " + phrase + ", dist: " + str(dist)
                if dist < best_dist:
                    best_dist = dist
                    best_vec = phrase_vec
                    best_phrase = phrase
            #print "best phrase: " + best_phrase
            cluster_names[num] = best_phrase

    zero_vec = data_util.zero_vec(dim)
    kw_names = []
    for kw_list in labels:
        #print "kw_list: " + str(type(kw_list)) + " : " + str(kw_list)
        if isinstance(kw_list, str):
            kw_list = kw_list.split(',')
        kw_embeddings = []
        kw_text = []
        for cluster_num in kw_list:
            if cluster_num != '':
                #print "converting cluster " + cluster_num
                num = int(cluster_num)
                vec = cluster_centers[num]
                #print "vec: " + str(len(vec))
                kw_embeddings.append(vec)
                if return_phrases:
                    name = cluster_names[num]
                    kw_text.append(name)
        kw_names.append(kw_text)
        # Pad vectors
        while len(kw_embeddings) < max_length:
            kw_embeddings.insert(0, zero_vec)
        if len(kw_embeddings) > max_length:
            kw_embeddings = kw_embeddings[:max_length]
        #print "kw_embeddings: " + str(len(kw_embeddings))
        cluster_embeddings.append(kw_embeddings)

    if return_phrases:
        # Write cluster name mapping to file
        outname = clusterfile + ".names"
        with open(outname, 'w') as outfile:
            for key in cluster_names.keys():
                name = cluster_names[key]
                phrases = cluster_map[key]
                outfile.write(
                    str(key) + " : " + name + " : " + str(phrases) + "\n")
        return cluster_embeddings, kw_names
    else:
        return cluster_embeddings
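
# Usage sketch (hedged: the cluster file holds `num,phrase1,phrase2,...` lines;
# labels are per-record lists, or comma-joined strings, of cluster numbers).
#   embs, names = cluster_embeddings([['3', '7'], '2,5'], 'clusters.txt',
#                                    vecfile, return_phrases=True)
#   # embs: per-record lists of cluster-center vectors, front-padded to max_length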