def save_vectors_file():
    data = load_data(FLAGS.data_path)

    vectorizer = Vectorizer()

    logging.info('getting vectors')
    img_vectors = []
    genders = []
    for img_path, gender_id in tqdm(data.items()):
        try:
            img_array = get_img(img_path)

            vector = vectorizer.get_vector(img_array)

            img_vectors.append(vector)
            genders.append(gender_id)
        except Exception as e:
            logging.warning('exception: {}'.format(e))

    vectorizer.close()

    dim_reduction_technique = get_dim_reduction_technique(
        FLAGS.dim_reduction_technique)

    reduced, model = dim_reduction_technique(img_vectors, FLAGS.n_dimensions)

    save_pkl_file(model, FLAGS.reducter_path)
    save_pkl_file((reduced, genders), FLAGS.vectors_path)
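
The snippet assumes get_dim_reduction_technique returns a callable with the signature (vectors, n_dimensions) -> (reduced, model). A minimal sketch of one such implementation, assuming a scikit-learn PCA backend (the real techniques are defined elsewhere in the project):

from sklearn.decomposition import PCA

def pca_reduction(vectors, n_dimensions):
    # Fit PCA on the image vectors and return both the reduced data and the
    # fitted model, so the model can be pickled and reused at inference time.
    model = PCA(n_components=n_dimensions)
    reduced = model.fit_transform(vectors)
    return reduced, model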
Example No. 2
    def classify(self):
        # Classifies unknown forum posts
        if not self.fit:
            print("Fitting must be performed before classifying")
            return

        vectorizer = Vectorizer(self.dictionary.dictionary)
        input_file = input(
            "Enter the name of the .txt file containing the unknown posts (including the file ending): "
        )
        try:
            with open(input_file, "r") as file:
                vectors = vectorizer.vectorize(
                    self.preprocessor.preprocess(file))
        except FileNotFoundError:
            if input("File not found. Press enter to try again or type 'm' and press enter to return to menu.").lower()\
                    == "m":
                return
            self.classify()
            return

        with open("result.txt", "w") as result_file:
            for line in self.classifier.classify(vectors):
                result_file.write((label_list[line] + "\n"))
        print(
            "Result saved in result.txt. " +
            "The predicted label of each post is printed on the corresponding line of the document."
        )
Example No. 3
def create_tf_idf(file_path):
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
    def setUp(self):
        self.vec = Vectorizer(layer=-1, backend='gpu', cores=32)

        # Generate a list of images
        base_image = os.path.expanduser(
            '~') + '/SaturnServer/test_resources/map_image'
        self.imagenames = []
        for i in range(1, self.vec.cores + 1):
            self.imagenames.append("{}{}.jpg".format(base_image, i))
Example No. 5
    def __init__(self):

        # vectorizer class
        # based on composition rather than inheritance
        self.vectorizer = Vectorizer()

        # weights learned and used by model
        self.weights = np.array([])
        self.tag_enums = []

        self.tag_dict = {}
    def __init__(self, folder='model', modeltype='kpca', topics=10):
        # the Bag-of-Words transformer (hashing + tf-idf) used for feature extraction
        self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf', gamma=1., n_components=topics)
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)
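
A standalone sketch of the same dispatch logic with an explicit error for unsupported model types (the helper name is illustrative, not from the project):

from sklearn.decomposition import KernelPCA, NMF

def build_topic_model(modeltype, topics):
    # Map a model-type string to a configured scikit-learn decomposition model.
    if modeltype == 'kpca':
        return KernelPCA(kernel='rbf', gamma=1., n_components=topics)
    if modeltype == 'nmf':
        return NMF(n_components=topics)
    raise ValueError('unsupported modeltype: {}'.format(modeltype))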
Example No. 7
    def start(self):
        bag_of_words, words = TermFrequency(self.trained).create_vocabulary()

        v = Vectorizer(self.trained, self.classify, words, bag_of_words)

        tfidf_trained = v.tfidf_for_tweets_trained
        evaluations = v.evaluations
        tfidf_to_classify = v.tfidf_for_tweets_to_classify

        models = Models(tfidf_trained, evaluations, tfidf_to_classify)
        prediction = models.svm_linear()

        return prediction
Example No. 8
def main(filename, category_filename, answer_col, predictor_col, hidden_nodes):
    df = pd.read_csv(filename, usecols=[answer_col, predictor_col])
    categories = pd.read_csv(category_filename,
                             usecols=[predictor_col])[predictor_col].values
    vectorizer = Vectorizer(df, categories, predictor_col, answer_col)
    vectorizer.format(0.6, 0.2)

    batch_size = 1000
    epochs = 50
    learning_rate = 1e-3
    model = build_and_train(vectorizer, batch_size, epochs, learning_rate,
                            hidden_nodes)
    validate(model, vectorizer)
    joblib.dump(model, filename + '.joblib')
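
The trained model is persisted next to the input CSV via joblib; a minimal sketch of loading it back later (the path is a hypothetical example):

import joblib

# Reload the persisted model for later predictions.
model = joblib.load('answers.csv.joblib')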
Example No. 9
def main():
    img = get_img(FLAGS.img_path)

    vectorizer = Vectorizer()
    vector = vectorizer.get_vector(img)
    vectorizer.close()

    reducter = load_pkl_file(FLAGS.reducter_path)
    reduced = reducter.transform([vector])

    model = load_pkl_file(FLAGS.model_path)

    output = model.predict(reduced)[0]

    print('result: {}'.format(output))
Example No. 10
    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None
Example No. 11
def startAnalysis(folder, S1_path, S2_path):

    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)

    # We use a document representation based on the TF-IDF model
    TF_IDF = Vectorizer()
    S1_HTML = TF_IDF.fit_transform(S1)
    S2_HTML = TF_IDF.fit_transform(S2)
    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)

    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
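
Calling fit_transform separately on S1 and S2 refits the vectorizer, so the two matrices may not share one feature space. A hedged variant using scikit-learn directly, assuming the project's Vectorizer wraps a similar TF-IDF implementation and that each source is a list of documents:

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_pair(docs_a, docs_b):
    # Fit a single TF-IDF model on both sources so the matrices share one vocabulary.
    tfidf = TfidfVectorizer()
    tfidf.fit(docs_a + docs_b)
    return tfidf.transform(docs_a), tfidf.transform(docs_b)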
Example No. 12
    def test_regression__vectorizer_layer_minus_one_behaves_same(self):
        # GIVEN a layer to test
        layer_under_test = -1

        # AND a vectorizer that uses that layer
        vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')

        # AND an expected output
        expected_output = [0.0016, 0.9883, 0.0099, 0.00]

        #
        # WHEN extracting the attributes from an image
        print 'This test has not stalled, it takes 20-40 seconds on a fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime())
        actual_output = roundArray(vec.get_attribute_vector(image_loc))

        #
        # THEN the output is as expected
        self.assertEqual(expected_output, actual_output, 'The output %s, does not match the expected output of %s' % (str(actual_output), str(expected_output)))
Example No. 13
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # initializing the TF-IDF vectorizer
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()

    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    tfidf_jobs = vectorizer.fit_transform(
        df_jobs['text'])  # fitting and transforming on the job text column

    if debug:
        print('[Job Vectorization 4/5] saving vectorizer to {path} \n'.format(
            path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)

    if debug:
        print('[Job Vectorization 5/5] saving tfidf to {path} \n'.format(
            path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
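
A hedged equivalent using scikit-learn and joblib directly, on the assumption that the project's Vectorizer wraps a similar TF-IDF implementation (the function name is illustrative):

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_jobs_sklearn(df_jobs, vectorizer_path, tfidfs_path):
    vectorizer = TfidfVectorizer()
    tfidf_jobs = vectorizer.fit_transform(df_jobs['text'])  # sparse TF-IDF matrix
    joblib.dump(vectorizer, vectorizer_path)                # persist the fitted vectorizer
    joblib.dump(tfidf_jobs, tfidfs_path)                    # persist the document vectors
    return tfidf_jobs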
Example No. 14
    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object.
        If no model is found, or train is set to True, a new classifier is learned.

        INPUT
        folder  the root folder with the Bag-of-Words data, where the model is stored
        train   set to True if you want to train

        '''
        self.folder = folder
        # load the Bag-of-Words extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(self.folder + '/classifier.pickle')) or train:
            print 'Training classifier'
            self.train()
        print 'Loading classifier'
        clfdict = cPickle.load(open(self.folder + '/classifier.pickle'))
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']
Example No. 15
def main():
    with timer("model loading"):
        # load the model and the vectorizer pipeline
        model = ModelMLP()
        model.load_model()
        vectorizer = Vectorizer()
        vectorizer.load_vectorizer()

    with timer("data loading"):
        # load the data to be predicted
        df = load_data_from_gcs()

    with timer("preprocess"):
        df = preprocess(df)

    with timer("predict"):
        X = df.drop(columns="price")
        X = vectorizer.transform(X)
        pred = model.predict(X)

        print(pred[:10])
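
The timed pipeline above relies on a timer() context manager that is not shown; a plausible minimal sketch, assuming it simply logs elapsed wall-clock time:

import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Log how long the wrapped block took.
    start = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - start))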
Example No. 16
    def _load_data(self, data_dir, word_tokens, pristine_input,
                   pristine_output, batch_size, seq_length, seq_step):
        try:
            with open(os.path.join(data_dir, 'input.txt'),
                      encoding='utf-8') as input_file:
                text = input_file.read()
        except FileNotFoundError:
            print_red("No input.txt in data_dir")
            sys.exit(1)

        skip_validate = True
        try:
            with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
                text_val = validate_file.read()
                skip_validate = False
        except FileNotFoundError:
            pass  # Validation text optional

        # Find some good default seed string in our source text.
        self.seeds = find_random_seeds(text)
        # Include our validation texts with our vectorizer
        all_text = text if skip_validate else '\n'.join([text, text_val])
        self.vectorizer = Vectorizer(all_text, word_tokens, pristine_input,
                                     pristine_output)

        data = self.vectorizer.vectorize(text)
        x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
        print('x.shape:', x.shape)
        print('y.shape:', y.shape)

        if skip_validate:
            return x, y, None, None

        data_val = self.vectorizer.vectorize(text_val)
        x_val, y_val = shape_for_stateful_rnn(data_val, batch_size, seq_length,
                                              seq_step)
        print('x_val.shape:', x_val.shape)
        print('y_val.shape:', y_val.shape)
        return x, y, x_val, y_val
Example No. 17
def main():
    # load the training data
    with timer("train data load"):
        df = load_data_from_gcs()

    # preprocessing
    with timer("preprocess"):
        df = preprocess(df)
        vectorizer = Vectorizer()

    X_train = df.drop(columns="price")
    y_train = df["price"]

    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)

        # training
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {
                'type': 'adam',
                'lr': 5e-5
            },
            'batch_size': 64,
        }

        model = ModelMLP(base_params)
        model.fit(X_train, y_train)

    with timer("save model"):
        # save the model and the vectorizer pipeline
        vectorizer.save_vectorizer()
        model.save_model()
Example No. 18
    def preprocess_and_fit(self):
        # Method that preprocesses data, indexes all words, vectorizes posts and finally trains and tests the classifier
        processed = []
        processed_test = []
        for category in self.categories:
            processed.append(
                self.preprocessor.preprocess('training' + str(category) +
                                             ".txt"))
            processed_test.append(
                self.preprocessor.preprocess('testing' + str(category) +
                                             ".txt"))

        # Word indexing
        for category in processed:  # indexes all words into dictionary
            self.dictionary.index_words(category)
        print("Words indexed. Dictionary size: ",
              len(self.dictionary.dictionary), " words")

        # Vectorization
        vectorizer = Vectorizer(
            self.dictionary.dictionary
        )  # initializes vectorizer-object with dictionary
        vector_start = time.time()
        print("Vectorizing...")
        training_vectors = []
        testing_vectors = []
        for category in processed:
            training_vectors.append(vectorizer.vectorize(category))
        for category in processed_test:
            testing_vectors.append(vectorizer.vectorize(category))
        vector_time = time.time() - vector_start
        print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

        # Training and evaluation
        self.classifier.train(training_vectors)
        self.fit = True
        self.classifier.evaluate(testing_vectors)
Example No. 19
    def test_regression__vectorizer_layer_minus_four_behaves_same(self):
        # GIVEN a layer to test
        layer_under_test = -4

        # AND a vectorizer that uses that layer
        vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')

        # AND an expected output stored in a file
        expected_output_file_path = os.path.expanduser('~')+'/SaturnServer/test_resources/layer4results.txt'

        #
        # WHEN extracting the attributes from an image
        print 'This test has not stalled, it takes 20-40 seconds on a fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime())
        actual_output = roundArray(vec.get_attribute_vector(image_loc))

        #
        # THEN each element of the actual output array must match each element of the expected results
        with open(expected_output_file_path, 'r') as expected_output_file:
            element_no = 0
            for expected_element in expected_output_file:
                self.assertEqual(float(expected_element), actual_output[element_no],
                                 'The output (element %d) %s, does not match the expected output of %s'
                                 % (element_no, str(actual_output[element_no]), str(expected_element)))
                element_no += 1

    def tokenize(self):
        for desc in self.texts():
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(desc)
            ]

    def describe(self):
        started = time.time()
        counts = FreqDist()
        tokens = FreqDist()

        for word in self.words():
            counts['words'] += 1
            tokens[word] += 1

        return {
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'secs': time.time() - started,
        }


if __name__ == '__main__':
    from vectorizer import Vectorizer
    vec = Vectorizer()
    for vector in vec.tf_idf():
        print(vector)
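
The corpus-reader fragment above appears to use NLTK; since its header is not shown, the imports it would need are roughly (an assumption):

import time

from nltk import FreqDist, pos_tag, sent_tokenize, wordpunct_tokenize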
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d' %
      train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d' %
      train_max_b_length)
max_len = max([train_max_a_length, train_max_b_length])

# padding
train_a_vectors = pad_tensor(train_a_vectors, max_len)
train_b_vectors = pad_tensor(train_b_vectors, max_len)
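
pad_tensor is defined elsewhere; a hedged sketch of what it presumably does, right-padding (or truncating) each sequence of token vectors to max_len with zero vectors:

import numpy as np

def pad_tensor(sequences, max_len):
    # Right-pad or truncate each sequence of token vectors to a fixed length.
    dim = len(sequences[0][0])
    out = np.zeros((len(sequences), max_len, dim), dtype='float32')
    for i, seq in enumerate(sequences):
        n = min(len(seq), max_len)
        out[i, :n, :] = np.asarray(seq[:n], dtype='float32')
    return out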
Example No. 22
    ix_to_rel = {i: r for i, r in enumerate(rel_set)}
    num_words = len(word_set)
    num_tags = len(tag_set)
    num_rels = len(rel_set)

    ROOT_TAG = "root"

    WORD_SIZE = 100
    TAG_SIZE = 30
    HIDDEN_SIZE = 100
    NUM_EPOCHS = 3

    word_vectorizer = Vectorizer(WordExtractor(sents),
                                 None,
                                 "parser_word",
                                 WORD_SIZE,
                                 filler=ZeroFiller(WORD_SIZE),
                                 ce_enabled=False,
                                 tf_enabled=False)
    tag_vectorizer = Vectorizer(TagExtractor(sents),
                                None,
                                "parser_pos",
                                TAG_SIZE,
                                filler=ZeroFiller(TAG_SIZE),
                                ce_enabled=False,
                                tf_enabled=False)

    parser = SyntaxParser(num_words, WORD_SIZE, num_tags, TAG_SIZE,
                          WORD_SIZE + TAG_SIZE, HIDDEN_SIZE, num_rels)
    optimizer = optim.SGD(parser.parameters(), lr=0.1)
    loss_function = nn.NLLLoss()
Example No. 23
PREPROCESSOR = Preprocessor(thesaurus_path)  # no substitution is performed unless a thesaurus path is given
print('Running preprocessing')
PREPROCESSOR.load_text([text_path])
whitelist = PREPROCESSOR.investigate_whitelist(thesaurus_path)
print('Saving')
PREPROCESSOR.save(auto_text_path)
PARSER = Parser()
print('Running dependency parsing..')
PARSER.t2f([auto_text_path + '/' + root + '.text'],
           kytea_model=kytea_path,
           eda_model=eda_path)
print('Saving the results')
PARSER.save(tree_path)  # save the dependency-parse output to a file
print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index
print('Loading the tree')
vectors = VECTORIZER.get_vector([tree_path + '/' + root + '.eda'],
                                filter=3)  # generate the vectors
print(vectors)
print('Saving the vectors')
VECTORIZER.save(vectors, [vector_path])  # save the vectors

# -----
# Compare the TF-IDF corpus vectors we already have with the query vector tfidf_vectors
# -----

print('Loading the TF-IDF corpus vectors')
tfidf_corpus_vectors = VECTORIZER.load(
    sorted(glob.glob(tfidf_DB_path + '/*.vector')))
print(tfidf_corpus_vectors)
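
The comparison step itself is not shown; a hedged sketch of ranking the corpus by cosine similarity against a query vector, assuming the vectors load as dense 1-D arrays (the actual on-disk format is project-specific):

import numpy as np

def rank_by_cosine(query_vec, corpus_vecs):
    # Return (similarity, index) pairs sorted from most to least similar.
    q = np.asarray(query_vec, dtype=float)
    scores = []
    for i, v in enumerate(corpus_vecs):
        v = np.asarray(v, dtype=float)
        sim = float(np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-12))
        scores.append((sim, i))
    return sorted(scores, reverse=True)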
Example No. 24
    def __init__(self, name, sents, vectorizer_words, vectorizer_forms, embedding_size,
                 tag_sents, tag_embedding_size, context_size,
                 lrs=(0.1, 0.1, 0.1), lr_decrease_factor=0.5, epochs_per_decrease=10):
        ######################################################################
        # Model's parameters.
        # 'sents' is a list of sentences of tuples ((form, word, tag), rel, head)
        self.name = name
        self.sents = sents
        self.embedding_size = embedding_size
        self.context_size = context_size

        ######################################################################
        # Load or create indices.
        # Common
        self.path_base = "internal"
        self.num_words = 0
        self.root_tag = "root"

        # CUDA flag
        self.is_cuda_available = torch.cuda.is_available()

        # For POS tags:
        self.tags = set()
        self.num_tags = 0
        self.tag2index = {}
        self.index2tag = {}

        # For chunk tags:
        self.chunks = set()
        self.num_chunks = 0
        self.chunk2index = {}
        self.index2chunk = {}

        # For relation tags:
        self.rels = set()
        self.num_rels = 0
        self.rel2index = {}
        self.index2rel = {}

        # Update database
        self.create_or_load_indices()
        if self.num_words == 0:
            self.num_words = self.get_num_words(self.sents)

        ######################################################################
        # Logic.
        # Learning rate controls
        self.lrs = lrs
        self.lr_decrease_factor = lr_decrease_factor
        self.epochs_per_decrease = epochs_per_decrease

        # Define machines
        self.vectorizer = Vectorizer(vectorizer_words, vectorizer_forms, name,
                                     embedding_size, filler=ZeroFiller(embedding_size),
                                     ce_enabled=True)

        # self.vectorizer = FastTextVectorizer(name, embedding_size * 2, "ft_sg_syntagrus.bin")

        self.tag_vectorizer = Vectorizer(tag_sents, None, name + "_pos",
                                         tag_embedding_size, filler=ZeroFiller(tag_embedding_size),
                                         ce_enabled=False, tf_enabled=False)

        # Tags embeddings (H).
        # Chunker will get linear combination as an input:
        #    I = H^T * p
        #    p - probabilities vector
        self.tag_embeddings = []
        for i in range(self.num_tags):
            tag = self.index2tag[i].lower()
            self.tag_embeddings.append(self.tag_vectorizer(tag, tag))
        self.tag_embeddings = torch.stack(self.tag_embeddings)
        if self.is_cuda_available:
            self.tag_embeddings = self.tag_embeddings.cuda()

        # Vector size is 1 (TF) + 100 (Word embedding) + 100 (Char grams embedding)
        self.vector_size = self.vectorizer.get_vector_size()

        self.tag_size = self.tag_vectorizer.get_vector_size()

        # Chunk size.
        # Benchmark is 200 (POS hidden) + 201 (embedding) + NUM_TAGS (probabilities)
        self.chunk_size = 2 * embedding_size + self.vector_size + self.tag_size

        # Parse size -- input size for parser.
        # When chunking is not available, parse size is equal to chunk size
        self.parse_size = self.chunk_size

        self.log("tagger input size: {}".format(self.vector_size))
        self.log("chunker input size: {}".format(self.chunk_size))
        self.log("parser input size: {}".format(self.parse_size))

        self.tagger = Tagger(self.vector_size, self.num_tags, "GRU", embedding_size)
        # self.chunker = Tagger(self.chunk_size, self.num_chunks, "LSTM", embedding_size)
        self.parser = SyntaxParser(0, 0, 0, 0, self.parse_size, embedding_size, self.num_rels)

        self.is_tagger_trained = False
        # self.is_chunker_trained = False
        self.is_parser_trained = False

        self.tagger_name = "pos tagging"
        # self.chunker_name = "chunking"
        self.parser_name = "parsing"

        # Try to load from file
        self.tagger_path = "{}/model_pos_{}.pt".format(self.path_base, self.name)
        # self.chunker_path = "{}/model_chunk_{}.pt".format(self.path_base, self.name)
        self.parser_path = "{}/model_parse_{}.pt".format(self.path_base, self.name)

        if os.path.exists(self.tagger_path):
            self.log("Loading POS tagger")
            self.tagger = torch.load(self.tagger_path)
            self.tagger.unit.flatten_parameters()
            self.is_tagger_trained = True
            self.log("Done")

        # if os.path.exists(self.chunker_path):
        #     self.log("Loading chunker")
        #     self.chunker = torch.load(self.chunker_path)
        #     self.chunker.unit.flatten_parameters()
        #     self.is_chunker_trained = True
        #     self.log("Done")

        if os.path.exists(self.parser_path):
            self.log("Loading parser")
            self.parser = torch.load(self.parser_path)
            self.parser.unit.flatten_parameters()
            self.is_parser_trained = True
            self.log("Done")
Example No. 25
from data_analysis import DataManager
from vectorizer import Vectorizer
import numpy as np
import pickle
from tempfile import TemporaryFile

dm = DataManager('./data/spam.csv')
dm.most_frequent_character_in_spam()
dm.most_frequent_character_in_legit()
dm.most_frequent_characters()
dm.average_text_length()

sentences, labels = dm.get_text(), dm.get_labels()
labels = list(map(lambda v: 0 if v == 'ham' else 1, labels))
vectorizer = Vectorizer(sentences)

sentences_features = []

for sentence in sentences:
    sentence_vector = vectorizer.text_to_vec(sentence, alpha=0.3)
    sentences_features.append(sentence_vector)

train_x, train_y = sentences_features[0:5000], labels[0:5000]
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)

test_x, test_y = sentences_features[5000:], labels[5000:]
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)

np.savetxt('train_x.txt', train_x)
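
Only train_x is written out above; a small sketch of persisting the remaining arrays the same way and reloading them later (file names are illustrative):

np.savetxt('train_y.txt', train_y)
np.savetxt('test_x.txt', test_x)
np.savetxt('test_y.txt', test_y)

# Later, reload the saved arrays.
train_x = np.loadtxt('train_x.txt')
train_y = np.loadtxt('train_y.txt')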
from data_loader import load_data
from data_preprocessor import message_cleaning
from vectorizer import Vectorizer
import pandas as pd

tweets_df = load_data("../data/twitter.csv")

# Let's test the newly added function
tweets_df_clean = tweets_df['tweet'].apply(message_cleaning)

print(tweets_df_clean[5])  # show the cleaned up version

print(tweets_df['tweet'][5])  # show the original version

# Vectorize the tweets using tokenizer
tweets_countvectorizer = Vectorizer(tweets_df)

print(tweets_countvectorizer)
print(tweets_countvectorizer.shape)

# dataframe to train
tweets = pd.DataFrame(tweets_countvectorizer)

X = tweets
Y = tweets_df['label']
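
A hedged equivalent with scikit-learn's CountVectorizer, assuming message_cleaning returns a list of tokens (CountVectorizer accepts such a callable as its analyzer):

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(analyzer=message_cleaning)
tweets_counts = count_vectorizer.fit_transform(tweets_df['tweet'])
print(tweets_counts.shape)  # (n_tweets, vocabulary_size)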
Example No. 27
import numpy as np
import pickle
from keras.models import Model

from spacy_data import get_train_dev_for_embedding, get_embeddings

from model_similarity2 import create_model

from spacy_features import anygram_kernel, anygram_similarity_fast as kernel, words_to_indices as get_features
from vectorizer import VectorizerGlove as Vectorizer

print "Loading spacy..."
nlp = spacy.load('en_core_web_lg')

print "Loading vectorizer..."
vectorizer = Vectorizer(dims=50)


def train(train_texts, train_labels, dev_texts, dev_labels, sentence_length=100, batch_size=100, nb_epoch=5):

    #embeddings, vocab = get_embeddings(nlp.vocab)
    # from spacy_features import w2v_matrix
    embeddings = vectorizer.get_word_matrix()
    vocab = None

    print embeddings.shape

    print("Parsing texts...")
    train_docs = list(nlp.pipe(train_texts))
    dev_docs = list(nlp.pipe(dev_texts))
Example No. 28
def evaluation(filenames,
               dictionary_root='../../lexica',
               cruncher_type='lemmatizer',
               vectorizer_type='word2vec',
               metrics=['f1-score', 'accuracy-score']):

    if not isinstance(filenames, list):
        raise ValueError("'{}' is not an instance of 'list'".format(filenames))

    beg = time.time()

    vectorizer = Vectorizer(vectorizer_type)

    try:

        labels, vectors = vectorizer.vectorize(filenames, dictionary_root)

    except Exception:  # fall back to preprocessing the raw files

        preprocessor = Preprocessor(filenames, Cruncher(cruncher_type))

        dictionary = Dictioanry(dictionary_root) if dictionary_root else None

        labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    test_ids, test_labels, test_vectors = [], [], []
    train_ids, train_labels, train_vectors = [], [], []

    for id, label in labels.items():

        if label == 'unknown':
            test_ids.append(id)
            test_labels.append(label)
            test_vectors.append(vectors[id])

        else:
            train_ids.append(id)
            train_labels.append(label)
            train_vectors.append(vectors[id])

    evaluator = Evaluator()

    for classifing in ['knn', 'rrb', 'svm']:

        if classifing != 'rrb':
            classifier = Classifier(train_vectors, train_labels, classifing)

            predictions = classifier.predict(test_vectors)

        else:
            classifier = RoundRobin(train_labels, train_vectors, test_vectors)

            predictions = classifier.classify()

        for metric in metrics:

            value = evaluator.evaluate(dict(zip(test_ids, predictions)),
                                       metric)

            print('<LOG>: The performance of', "'" + classifing + "'",
                  'according to the',
                  ("'" + metric + "'").ljust(max(map(len, metrics)) + 2),
                  "metric is", '{0:.6f}'.format(value))

    end = time.time()

    print('\n\nElapsed time:',
          '{0:.6f}'.format(end - beg),
          'seconds',
          file=sys.stderr)
        x, y, c = [], [], []

        for token in tokens:
            if token in dictioanry.valences:
                x.append(np.random.rand())
                y.append(np.random.rand())
                c.append(rgb(np.mean(dictioanry.valences[token])))

        plt.scatter(x, y, c=c, alpha=0.8)

        plt.show()


if __name__ == "__main__":

    preprocessor = Preprocessor(['train.tsv', 'test.tsv'], Cruncher())

    dictionary = Dictioanry('..\\..\\lexica')

    vectorizer = Vectorizer()

    labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    visualizer = Visualizer(preprocessor)

    for method in Visualizer.supported_methods:

        visualizer.visualize(method=method,
                             dictionary=dictionary,
                             model=vectorizer.underlying)
Example No. 30
import plac
import pathlib
import spacy
import numpy as np

from spacy_data import get_train_dev_test, get_embeddings

from spacy_features import get_hv_similar_sentences as get_features, anygram_similarity_fast as kernel
from vectorizer import VectorizerGlove as Vectorizer

print "Loading spacy..."
nlp = spacy.load('en_core_web_lg')

print "Loading vectorizer..."
vectorizer = Vectorizer(dims=200)


def train(train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels, sentence_length=100, batch_size=100, nb_epoch=5):

    print("Creating model...")
    from sklearn import svm
    from spacy_features import anygram_kernel

    for c in [0.8]:

        print("Parsing texts...")
        train_docs = list(nlp.pipe(train_texts))
        dev_docs = list(nlp.pipe(dev_texts))

        test_docs = list(nlp.pipe(test_texts))