Example #1
def build_tsne_embeddings(tags, c2v_model=None, tsne_model=None):
    """
    Use sklearn TSNE to build embedding layer.
    :return: an Numpy array

    1. Use `char2vec` to transform tags into a 150D vectors -> (num_tags, 150)
    2. Feed word embeddings into sklearn TSNE model

    """
    if not c2v_model:
        c2v_model = c2v.load_model("train_fr_150/")

    word_embeddings = c2v_model.vectorize(tags)

    print(f"Word embedding shape: {word_embeddings.shape}")

    if not tsne_model:
        # perplexity: experiment with values between 5 and 50 to see different results
        # n_components: the input word embedding is a (num_words, 150) 2-D matrix
        # n_iter: number of optimization iterations, >= 250
        # random_state: seed for the random number generator
        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init="pca",
                          n_iter=2500,
                          random_state=23)

    tsne_embeddings = tsne_model.fit_transform(word_embeddings)

    print(f"T-SNE embedding shape: {tsne_embeddings.shape}")

    return tsne_embeddings
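A minimal usage sketch (hypothetical tag list; assumes chars2vec is imported as c2v with the train_fr_150/ model available, and that TSNE comes from sklearn.manifold). sklearn requires perplexity to be smaller than the number of samples, so a tiny tag list needs a custom tsne_model:

from sklearn.manifold import TSNE

tags = ['python', 'pyhton', 'java', 'jawa', 'scala', 'skala']
small_tsne = TSNE(perplexity=3, n_components=2, init='pca', random_state=23)
embeddings_2d = build_tsne_embeddings(tags, tsne_model=small_tsne)
print(embeddings_2d.shape)  # (6, 2)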
Example #2
def pca(l_cluster):
    # Assumes module-level globals: song_Strings, song_Names, embedding,
    # n_clusterString, label_color_Final, trasformLabelColor, modificaLabelColor.

    with open('datasetParsingDEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                print('WORD', row[1][:-4])
                song_Strings.append(row[2])
                song_Names.append(row[1][:-4])
                line_count += 1

        print(f'Processed {line_count} lines.')

    # Load the Intuition Engineering pretrained model
    # Model names: 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model(embedding)

    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(song_Strings)

    # Project embeddings onto a plane using PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)

    # Draw words on plane
    f = plt.figure(figsize=(8, 6))
    plt.title("KMean - Divisione : " + n_clusterString +
              ' Cluster - Embedding : ' + embedding)
    #label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]

    print(song_Names)
    trasformLabelColor(l_cluster)
    print(label_color_Final)

    print(len(l_cluster))

    modificaLabelColor()

    print(label_color_Final)

    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker='$o$',
                    s=30,
                    label=j,
                    c=label_color_Final[j])

    plt.savefig('./Scatter/Kmean/' + embedding + '/' + n_clusterString +
                '.png')
Example #3
def spectralClustering():
    words = []

    with open('./datasetFit.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                #print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                words.append(row[2])
                line_count += 1

    with open('./datasetCouple.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        arrayDiStringhe = []
        for row in csv_reader:
            if line_count == 0:
                #print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                words.append(row)
                line_count += 1

    # Strip the list formatting from each entry once, after reading both files
    for i in range(len(words)):
        if words[i]:
            stringa = str(words[i])
            stringa = stringa.replace("[", "")
            stringa = stringa.replace("]", "")
            stringa = stringa.replace("'", "")
            arrayDiStringhe.append(stringa)

    c2v_model = chars2vec.load_model('eng_50')
    word_embeddings = c2v_model.vectorize_words(arrayDiStringhe)
    #print(word_embeddings)
    #print(len(word_embeddings))

    clustering = SpectralClustering(n_clusters=9,
                                    assign_labels="discretize",
                                    random_state=0).fit(word_embeddings)
    labels = clustering.labels_
    #print(labels)

    # True if the last two entries fall in the same cluster
    return labels[-1] == labels[-2]
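For reference, a minimal sketch of the same clustering step without the CSV plumbing (the word list and cluster count are illustrative):

import chars2vec
from sklearn.cluster import SpectralClustering

words = ['bread', 'brad', 'breod', 'carrot', 'carote', 'carrrot']
embeddings = chars2vec.load_model('eng_50').vectorize_words(words)
labels = SpectralClustering(n_clusters=2,
                            assign_labels='discretize',
                            random_state=0).fit(embeddings).labels_
print(list(zip(words, labels)))  # similar spellings tend to share a cluster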
Example #4
def pca(l_cluster):
    words = []
    etichette = []

    with open('datasetParsingDEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                print(row[1], row[2])
                words.append(row[2])
                etichette.append(row[1])
                line_count += 1

        print(f'Processed {line_count} lines.')

    # Load the Intuition Engineering pretrained model
    # Model names: 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model('eng_100')

    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(words)

    # Project embeddings onto a plane using PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)

    # Draw words on plane
    f = plt.figure(figsize=(8, 6))

    # LABEL_COLOR_MAP is an assumed module-level mapping: cluster id -> color
    label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]

    print(label_color)

    print(len(l_cluster))

    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker='$o$',
                    s=30,
                    label=j,
                    c=label_color[j])
    plt.show()
Example #5
def getCommentEmbeddings(model_name, comments):
    if model_name not in [
            'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    ]:
        print(
            "Error: argument 'model_name' should be one of eng_50, eng_100, eng_150, eng_200, or eng_300"
        )
        exit()

    # Load the model once, up front
    c2v_model = chars2vec.load_model(model_name)

    # Flat list of comment strings -> a single embedding matrix
    if isinstance(comments[0], str):
        comments = [c.strip() for c in comments]
        return c2v_model.vectorize_words(comments)

    # List of comment lists -> one embedding matrix per sub-list
    elif isinstance(comments[0][0], str):
        comments_embeddings = []
        for comments_ in comments:
            comments_ = [c.strip() for c in comments_]
            comments_embeddings.append(c2v_model.vectorize_words(comments_))
        return comments_embeddings
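A hypothetical call for each accepted input shape, assuming getCommentEmbeddings and chars2vec are in scope:

# Flat list of comment strings -> a single (n_comments, 50) array
flat = getCommentEmbeddings('eng_50', ['great product ', ' would buy again'])
print(flat.shape)

# List of comment lists -> one embedding array per sub-list
nested = getCommentEmbeddings('eng_50', [['nice', 'ok'], ['terrible']])
print(len(nested), nested[0].shape)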
Example #6
def __init__(self, dataset_dir, vocab_size, word_size, char_size, mode):
    self.vocab_size = vocab_size  # 128 -> size of the ASCII character set
    self.word_size = word_size
    self.char_size = char_size
    self.mode = mode
    self.c2v_model = chars2vec.load_model('eng_{}'.format(char_size))
    # dir = <dir>/{train|val}/<filename>.json
    self.filenames = sorted([
        os.path.join(dataset_dir, mode, f)
        for f in os.listdir(os.path.join(dataset_dir, mode))
        if re.match(r'.*\.json', f)
    ])
    self.data = [self.read_file(file) for file in self.filenames]
    self.document_lists = []
    self.labels = []
Example #7
def main():
    max_review_length = 50
    c2v_model = chars2vec.load_model('eng_50')
    num_class = 9
    char_dic = create_dic()
    f_train_in = open("Datasets/categ.txt", "r")
    f_train_out = open("Datasets/outs.txt", "r")
    # Train the model
    train_model(f_train_in, f_train_out, max_review_length, c2v_model,
                num_class)
    # Predict
    embedding_model = load_model(
        "Categorical_classifier_models/Categorical_classifier_embedd.h5")
    test_model(embedding_model, char_dic, max_review_length)
    return
Example #8
LR_INIT = 1e-4
VOCAB_SIZE = 128
WORD_SIZE = 250
CHAR_SIZE = 50
WARMUP_EPOCHS = 100
TRAIN_EPOCHS = 1500
NUM_CLASS = 5
GRID_SIZE = [64, 64]
CLASS_NAME = [
    "Don't care", "Merchant Name", "Merchant Address", "Transaction Date",
    "Total"
]

# model config
c2v_model = chars2vec.load_model('eng_150')
model = GridClassifier(num_class=NUM_CLASS, gird_size=GRID_SIZE)
optimizer = tf.keras.optimizers.Adam(learning_rate=LR_INIT, clipnorm=10.0)
cross_entropy = SparseCategoricalCrossentropy(from_logits=True)
model_ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                 optimizer=optimizer,
                                 net=model)
model_manager = tf.train.CheckpointManager(
    model_ckpt,
    './checkpoints/grid_receipt_classifier_train.tf',
    max_to_keep=5)
if model_manager.latest_checkpoint:
    # restore checkpoint
    model_ckpt.restore(model_manager.latest_checkpoint)
    print("Restored from {}".format(model_manager.latest_checkpoint))
else:
    # No checkpoint found: initialize from scratch (the usual tf.train pattern)
    print("Initializing from scratch.")
Example #9
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt

# Load the Intuition Engineering pretrained model
# Model names: 'eng_50', 'eng_100', 'eng_150'
c2v_model = chars2vec.load_model('eng_50')

words = [
    'Natural', 'Language', 'Understanding', 'Naturael', 'Longuge',
    'Updderctundjing', 'Motural', 'Lamnguoge', 'Understaating', 'Naturrow',
    'Laguage', 'Unddertandink', 'Nattural', 'Languagge', 'Umderstoneding'
]

# Create word embeddings
word_embeddings = c2v_model.vectorize_words(words)

# Project embeddings onto a plane using PCA
projection_2d = sklearn.decomposition.PCA(
    n_components=2).fit_transform(word_embeddings)

# Draw words on plane
f = plt.figure(figsize=(8, 6))

for j in range(len(projection_2d)):
    plt.scatter(projection_2d[j, 0],
                projection_2d[j, 1],
                marker=('$' + words[j] + '$'),
                s=500 * len(words[j]),
                label=j,
                facecolors='green' if words[j]
                in ['Natural', 'Language', 'Understanding'] else 'black')

plt.show()
Example #10
def __init__(
    self,
    language_model='eng_50',
):  # phoc_vectors have shape (n_test_samples, n_ensembles)
    self.c2v_model = chars2vec.load_model(language_model)
Example #11
import chars2vec as c2v
import sklearn.decomposition
import matplotlib.pyplot as plt
import tensorflow as tf
""" Mute tensorflow warning """
tf.logging.set_verbosity(tf.logging.ERROR)
""" 2D Visualization script by using PCA on the vectorization of a list of words"""

c2v_model = c2v.load_model("train_fr_150")
""" Words to visualize """
words = [
    'est', 'ezt', 'zest', 'carotte', 'carote', 'carottte', 'langage',
    'language', 'langqge', 'francais', 'franssais', 'francqis', 'bread',
    'brad', 'breod', 'broad'
]

word_embeddings = c2v_model.vectorize(words)
""" Optional print of euclidean distances between vectors """


def print_distance(words):
    import numpy as np

    print("\t", end='')
    for word in words:
        print("%-10.6s" % word, end='\t')
    print("")
    for i, vec1 in enumerate(word_embeddings):
        print(words[i], end=' ')
        for vec2 in word_embeddings:
            print("%10.4f" % np.linalg.norm(vec1 - vec2), end='\t')
def execute_spell_suggester(input_word):
    char_emb_model = chars2vec.load_model(model_path)
    return spell_corrector.compute_correct_word(input_word,
                                                startChar_to_words_dict,
                                                char_emb_model)
Example #13
import chars2vec

X_train = [
    ('mecbanizing', 'mechanizing'),  # similar words, target is 0
    ('dicovery', 'dis7overy'),  # similar words, target is 0
    ('prot$oplasmatic', 'prtoplasmatic'),  # similar words, target is 0
    ('copulateng', 'lzateful'),  # dissimilar words, target is 1
    ('estry', 'evadin6'),  # dissimilar words, target is 1
    ('cirrfosis', 'afear')  # dissimilar words, target is 1
]

y_train = [0, 0, 0, 1, 1, 1]

model_chars = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>',
    '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]

# Create and train a chars2vec model using the given training data
# (dim and path_to_model are illustrative values, as in the chars2vec README)
dim = 50
path_to_model = 'path/to/model/directory'
my_c2v_model = chars2vec.train_model(dim, X_train, y_train, model_chars)

# Save pretrained model
chars2vec.save_model(my_c2v_model, path_to_model)

words = ['list', 'of', 'words']

# Load pretrained model, create word embeddings
c2v_model = chars2vec.load_model(path_to_model)
word_embeddings = c2v_model.vectorize_words(words)
print(word_embeddings[words.index('of')])  # embedding row for 'of'
Example #14
def pca(l_cluster):
    # Assumes module-level globals: song_Strings, song_Names, embedding, algo,
    # n_clusterString, label_color_Final, trasformLabelColor, modificaLabelColor,
    # plus plotly.graph_objects imported as go.

    with open('datasetParsing2DEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                print('WORD', row[1][:-4])
                song_Strings.append(row[2])
                song_Names.append(row[1][:-4])
                line_count += 1

        print(f'Processed {line_count} lines.')

    # Load the Intuition Engineering pretrained model
    # Model names: 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model(embedding)

    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(song_Strings)

    # Project embeddings onto a plane using PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)

    # Draw words on plane
    f = plt.figure(figsize=(8, 6))
    plt.title("KMean - Divisione : " + n_clusterString +
              ' Cluster - Embedding : ' + embedding)
    #label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]

    print(song_Names)
    trasformLabelColor(l_cluster)
    print(label_color_Final)

    print(len(l_cluster))

    modificaLabelColor()

    print(label_color_Final)

    print(len(projection_2d))

    assex = []
    assey = []

    for j in range(0, len(projection_2d)):
        assex.append(projection_2d[j, 0])
        assey.append(projection_2d[j, 1])

    fig = go.Figure(data=go.Scatter(
        x=assex,
        y=assey,
        mode='markers',
        text=song_Names,
        marker=dict(
            size=16,
            color=label_color_Final,  # set color equal to a variable
            showscale=True,
        )))

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    fig.update_layout(title_text=algo + '  ' + embedding + '  ' +
                      n_clusterString,
                      plot_bgcolor='rgb(236,241,243)')

    fig.show()

    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker='$o$',
                    s=30,
                    label=j,
                    c=label_color_Final[j])
Example #15
def __init__(self, language_model='eng_50', max_word_len=20):
    # phoc_vectors have shape (n_test_samples, n_ensembles)
    self.c2v_model = chars2vec.load_model(language_model)
    self.len_vec = len(self.c2v_model.vectorize_words(['dump']).squeeze())
    self.len_output = max_word_len * int(language_model.replace('eng_', ''))
Example #16
def generateCSV(labels, name):

    with open('cluster.csv', 'w') as csvfile:
        filewriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['N', 'Song', 'Cluster'])
        for i, l in enumerate(labels):
            single_Name = name[i][:-4]
            filewriter.writerow([i, single_Name, l])


# `embedding` is an assumed module-level model name, e.g. 'eng_50'
c2v_model = chars2vec.load_model(embedding)

words = []
etichette = []

with open('datasetParsingDEF.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            print(row[1], row[2])
            words.append(row[2])
            etichette.append(row[1])
            line_count += 1
Example #17
def wembed_features(df, model, tokenizer, char_embeddings=False, window=3):
    # Features Matrix
    features = []
    if char_embeddings:
        c2v_features_size = 100
        char2vec_model = chars2vec.load_model('eng_' + str(c2v_features_size))
    for ind, row in df.iterrows():
        # Get text
        string = row['text']

        # Tokenize
        tokens = tokenizer.tokenize(string)
        tokens = list(tokens)
        tokens = [x.lower() for x in tokens]
        # tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

        token_vectors = []
        accepted_tokens = []
        for i in range(int(window / 2)):
            vector_size = model.vector_size
            if char_embeddings:
                vector_size += c2v_features_size
            token_vectors.append(np.zeros(vector_size))
            accepted_tokens.append('null')

        for token in tokens:
            try:
                wembed_features = model.word_vec(token)
                wembed_features = np.reshape(
                    wembed_features, (1, wembed_features.shape[0]))
                if char_embeddings:
                    c2v_feature = char2vec_model.vectorize_words([token])
                    wembed_features = np.hstack((wembed_features, c2v_feature))
                # Append the vector whether or not char embeddings are enabled,
                # so token_vectors and accepted_tokens stay in step
                token_vectors.append(wembed_features)
                accepted_tokens.append(token)
            except Exception:
                # Out-of-vocabulary token: fall back to a random vector
                wembed_features_size = model.vector_size
                if char_embeddings:
                    wembed_features_size += c2v_features_size
                token_vectors.append(np.random.rand(wembed_features_size))
                accepted_tokens.append(token + '#rand')

        for i in range(int(window / 2)):
            vector_size = model.vector_size
            if char_embeddings:
                vector_size += c2v_features_size
            token_vectors.append(np.zeros(vector_size))
            accepted_tokens.append('null')

        # Window buffer
        last = 0
        vector_size = model.vector_size
        if char_embeddings:
            vector_size += c2v_features_size
        window_buffer = np.zeros((window, vector_size))
        # Final Vector List
        final_vectors = []
        final_tokens = []
        for vector in token_vectors:
            if last < window:
                # Update Buffer with new vector
                window_buffer[last, :] = vector
                # If window is full
                if last == window - 1:
                    new_vec = window_buffer.mean(axis=0)
                    final_vectors.append(new_vec)
                    final_tokens.append('-'.join(accepted_tokens[0:window]))

                last += 1

            else:
                if window == 1:
                    next_pos = 0
                else:
                    next_pos = (last % window)

                window_buffer[next_pos, :] = vector

                new_vec = window_buffer.mean(axis=0)
                final_vectors.append(new_vec)
                final_tokens.append(
                    '-'.join(accepted_tokens[last+1-window:last+1]))

                last += 1

                # End of Buffer
                if last == len(token_vectors):
                    break

        # Free Up Memory
        del window_buffer
        del accepted_tokens
        del token_vectors

        # If final_vectors is empty, fill with zeros
        if len(final_tokens) < 2:
            features.append([0] * 8)
        else:
            features.append(wembed_util(final_vectors))
        # print('features: ',features)
        print('ws: ', window, ind)

    features_df = pd.DataFrame(features, columns=[
                               'max_sim', 'min_sim', 'max_dsim', 'min_dsim', 'max_wsim', 'min_wsim', 'max_wdsim', 'min_wdsim'])
    if not features_df.isnull().any(axis=1).any():
        print("No NaNs")
    else:
        features_df = features_df.fillna(0)

    return features_df
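A hypothetical driver for wembed_features, assuming a gensim KeyedVectors model, an NLTK tokenizer, and that the wembed_util helper (not shown in this snippet) is defined in the same module; the vectors path is a placeholder:

import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # placeholder path
df = pd.DataFrame({'text': ['the quick brown fox', 'jumps over the lazy dog']})
feats = wembed_features(df, w2v, TreebankWordTokenizer(),
                        char_embeddings=True, window=3)
print(feats.head())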