Example #1
def compute_similarity_using_word2vec_model(query_word,
                                            steam_tokens=None,
                                            model=None,
                                            enforce_training=False):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        try:
            print('Loading Word2Vec model.')
            model = Word2Vec.load(get_word_model_file_name())

            if enforce_training:
                model = train_word_model_on_steam_tokens(
                    model=model, steam_tokens=steam_tokens)

        except FileNotFoundError:
            print('Training Word2Vec model from scratch.')
            model = train_word_model_on_steam_tokens(model=None,
                                                     steam_tokens=steam_tokens)

    if query_word in get_word_model_vocabulary(model):
        similar_words = test_word(model, query_word)
    else:
        print('The word {} is not part of the word model vocabulary.'.format(
            query_word))
        similar_words = None

    return similar_words
def compute_similarity_with_candidate_sentences_using_wmd(query_app_id, steam_tokens=None, model=None,
                                                          candidates=None):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        model = Word2Vec.load(get_word_model_file_name())

    constrain_search = (candidates is not None)

    query = steam_tokens[query_app_id]

    if constrain_search:
        documents = list(steam_tokens[i] for i in candidates)
    else:
        # Caveat: the Word Mover's Distance algorithm is painfully slow! Consider constraining the search to a few candidates, as in the usage sketch after this example.
        documents = list(steam_tokens.values())

    instance = WmdSimilarity(documents, model.wv, num_best=10)

    similarity_scores_as_tuples = instance[query]

    similarity_scores = reformat_similarity_scores_for_wmd(similarity_scores_as_tuples, candidates)
    print_most_similar_sentences(similarity_scores)

    return similarity_scores
def load_input():
    game_names, _ = load_game_names(include_genres=False,
                                    include_categories=False)

    steam_tokens = load_tokens()

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    return game_names, steam_tokens, app_ids
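
A hedged usage sketch for the helpers above: the Word Mover's Distance search is constrained to a small candidate pool, as the caveat recommends. The query appID and the size of the pool are illustrative values only, not part of the original module.

game_names, steam_tokens, app_ids = load_input()

query_app_id = '583950'  # illustrative appID, borrowed from the other examples
candidates = [str(app_id) for app_id in app_ids[:100]]  # small pool to keep WMD tractable

similarity_scores = compute_similarity_with_candidate_sentences_using_wmd(query_app_id,
                                                                          steam_tokens=steam_tokens,
                                                                          candidates=candidates)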
Example #4
def compute_similarity_with_all_other_steam_sentences(
        query_app_id,
        steam_tokens=None,
        model=None,
        game_names=None,
        filter_out_words_out_of_vocabulary=True):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        model = Word2Vec.load(get_word_model_file_name())

    if game_names is None:
        game_names, _ = load_game_names()

    index2word_set = get_word_model_vocabulary(model)

    query_sentence = steam_tokens[query_app_id]
    if filter_out_words_out_of_vocabulary:
        query_sentence = filter_out_words_not_in_vocabulary(
            query_sentence, index2word_set)

    similarity_scores = {}

    counter = 0
    num_games = len(steam_tokens)

    for app_id in steam_tokens:
        counter += 1

        if (counter % 1000) == 0:
            print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id,
                                                   game_names[app_id]))

        reference_sentence = steam_tokens[app_id]
        if filter_out_words_out_of_vocabulary:
            reference_sentence = filter_out_words_not_in_vocabulary(
                reference_sentence, index2word_set)

        try:
            similarity_scores[app_id] = model.wv.n_similarity(
                query_sentence, reference_sentence)
        except ZeroDivisionError:
            similarity_scores[app_id] = 0

    return similarity_scores
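
A hedged usage sketch: score one store description against every other one with the function above, then display the closest matches with the print helper used elsewhere in these examples. The query appID is an illustrative value.

steam_tokens = load_tokens()
model = Word2Vec.load(get_word_model_file_name())
game_names, _ = load_game_names()

similarity_scores = compute_similarity_with_all_other_steam_sentences('583950',
                                                                      steam_tokens=steam_tokens,
                                                                      model=model,
                                                                      game_names=game_names)
print_most_similar_sentences(similarity_scores)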
Example #5
def train_doc_model_on_steam_tokens(model=None, steam_tokens=None, num_epochs=10):
    # You do not want to perform training this way, because training already happened when initializing the model
    # with Doc2Vec(documents). Moreover, calling train() several times messes with the decay of the learning rate alpha!

    if steam_tokens is None:
        steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens))

    if model is None:
        model = doc2vec.Doc2Vec(documents)  # training happens with 5 epochs (default) here

    start = time()
    model.train(documents, total_examples=len(documents), epochs=num_epochs)
    print('Elapsed time: {:.2f}'.format(time() - start))

    model.save(get_doc_model_file_name())

    return model
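
Given the warning in the comment above, a minimal sketch of the alternative pattern, assuming gensim's Doc2Vec API: let the constructor perform training exactly once, with the desired number of epochs, instead of calling train() again afterwards. Parameter values mirror the other examples and are illustrative.

steam_tokens = load_tokens()
documents = list(read_corpus(steam_tokens))

# Training happens once here; no further call to train() is needed.
model = doc2vec.Doc2Vec(documents, vector_size=100, window=5, min_count=5, epochs=10)
model.save(get_doc_model_file_name())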
Example #6
def train_word_model_on_steam_tokens(model=None,
                                     steam_tokens=None,
                                     num_epochs=10):
    # Warning: training will happen several times, which might be detrimental to your model!

    if steam_tokens is None:
        steam_tokens = load_tokens()

    documents = list(steam_tokens.values())

    if model is None:
        model = Word2Vec(
            documents
        )  # training already happens here, due to the 'documents' argument!

    model.train(documents, total_examples=len(documents), epochs=num_epochs)

    model.save(get_word_model_file_name())

    return model
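
Likewise for Word2Vec, a minimal sketch that avoids the repeated training flagged in the warning above: pass the documents to the constructor once and save the resulting model. The worker count mirrors the other examples; everything else is the gensim default.

import multiprocessing

steam_tokens = load_tokens()
documents = list(steam_tokens.values())

# Training happens once here, triggered by the 'documents' argument.
model = Word2Vec(documents, workers=multiprocessing.cpu_count())
model.save(get_word_model_file_name())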
Example #7
def compute_similarity_using_doc2vec_model(query_app_id, steam_tokens=None, model=None,
                                           verbose=False,
                                           enforce_training=False, avoid_inference=False, num_items_displayed=10):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        try:
            print('Loading Doc2Vec model.')
            model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

            if enforce_training:
                model = train_doc_model_on_steam_tokens(model=model, steam_tokens=steam_tokens)

        except FileNotFoundError:
            print('Training Doc2Vec model from scratch.')
            model = train_doc_model_on_steam_tokens(model=None, steam_tokens=steam_tokens)

    if avoid_inference:
        if verbose:
            print('Finding most similar documents based on the query appID.')
        # For games which are part of the training corpus, we do not need to call model.infer_vector()

        similarity_scores_as_tuples = model.docvecs.most_similar(positive=get_tag_prefix() + str(query_app_id),
                                                                 topn=num_items_displayed)
    else:
        if verbose:
            print('Finding most similar documents based on an inferred vector, which represents the query document.')
        query = steam_tokens[query_app_id]
        # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
        # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
        inferred_vector = model.infer_vector(query)
        similarity_scores_as_tuples = model.docvecs.most_similar([inferred_vector])

    similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
    print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    return similarity_scores
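
A hedged usage sketch contrasting the two branches of the function above: a plain look-up of the pre-trained document vector versus on-the-fly inference. The query appID is an illustrative value taken from the other examples.

steam_tokens = load_tokens()
model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

# Look up the vector learned during training (no inference).
compute_similarity_using_doc2vec_model('583950', steam_tokens, model, avoid_inference=True)

# Infer a fresh vector for the query document (results may vary between calls).
compute_similarity_using_doc2vec_model('583950', steam_tokens, model, avoid_inference=False)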
Example #8
def compute_similarity_using_word2vec_model(query_word,
                                            steam_tokens=None,
                                            model=None,
                                            enforce_training=False):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        try:
            print('Loading Word2Vec model.')
            model = Word2Vec.load(get_word_model_file_name())

            if enforce_training:
                model = train_word_model_on_steam_tokens(
                    model=model, steam_tokens=steam_tokens)

        except FileNotFoundError:
            print('Training Word2Vec model from scratch.')
            model = train_word_model_on_steam_tokens(model=None,
                                                     steam_tokens=steam_tokens)

    if query_word in get_word_model_vocabulary(model):
        similar_words = test_word(model, query_word)
    else:
        print('The word {} is not part of the word model vocabulary.'.format(
            query_word))
        similar_words = None

    return similar_words


if __name__ == '__main__':
    steam_tokens = load_tokens()

    model = Word2Vec.load(get_word_model_file_name())

    for query_word in ['anime', 'fun', 'violent']:
        compute_similarity_using_word2vec_model(query_word, steam_tokens,
                                                model)
Example #9
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import math as m
import progressbar as pb
import tensorflow as tf
import utils as u
from datetime import datetime as dt

# --- script settings start ---
document = '../data/marktwain.txt'
yml = './simple_rnn.yml'
# --- script settings end ---

FLAGS = u.load_flags(yml)
tokens = u.load_tokens(document)
unique_tokens = set(tokens)
unique_tokens = dict((v, i) for i, v in enumerate(unique_tokens))
samples = u.make_samples(tokens, FLAGS['sample_length'])

# build a simple model
# words -> one-hot -> rnn -> dense -> output
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=FLAGS['units'],
                              input_shape=(FLAGS['sample_length'] - 1,
                                           len(unique_tokens))),
    tf.keras.layers.Dense(len(unique_tokens)),
    tf.keras.layers.Activation('softmax')
])
optimizer = tf.keras.optimizers.Nadam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
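
A hedged training sketch for the model above, assuming u.make_samples returns lists of FLAGS['sample_length'] token strings: each sample is one-hot encoded, the last token serves as the prediction target. The one_hot helper, epoch count, and batch size are illustrative additions, not part of the original script.

import numpy as np

vocab_size = len(unique_tokens)

def one_hot(token_ids, depth):
    # token_ids: list of integer indices -> array of shape (len(token_ids), depth)
    out = np.zeros((len(token_ids), depth), dtype=np.float32)
    out[np.arange(len(token_ids)), token_ids] = 1.0
    return out

encoded_samples = [[unique_tokens[token] for token in sample] for sample in samples]
X = np.stack([one_hot(ids[:-1], vocab_size) for ids in encoded_samples])  # (N, sample_length - 1, vocab)
y = one_hot([ids[-1] for ids in encoded_samples], vocab_size)             # (N, vocab)

model.fit(X, y, epochs=10, batch_size=128)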
Example #10
import asyncore

from server.tcptunnel import TCPServer
from server.udptunnel import UDPServer
from utils import load_tokens

if __name__ == '__main__':
    tokens = load_tokens()
    tcp_server = TCPServer('0.0.0.0', 6666, tokens)
    #udp_server = UDPServer('0.0.0.0', 7778)
    asyncore.loop()
Example #11
# -*- coding: utf-8 -*-
from keras.models import load_model
import utils

tokenizer = utils.load_tokens()

review = "Good but the charger is not the same size as an Apple charger and that makes it a little difficult for charging with cases on."  # 2

input_data = utils.convert_review(review, tokenizer)

model_name = 'model/weights-improvement-20-0.9417.hdf5'
model = load_model(model_name)

predictions = model.predict(input_data)
utils.display_rating(predictions)
Example #12
def retrieve_similar_store_descriptions(
        compute_from_scratch=True,
        use_unit_vectors=False,
        alpha=1e-3,  # in SIF weighting scheme, parameter in the range [3e-5, 3e-3]
        num_removed_components_for_sentence_vectors=0,  # in SIF weighting scheme
        pre_process_word_vectors=False,
        num_removed_components_for_word_vectors=0,
        count_words_out_of_vocabulary=True,
        use_idf_weights=True,
        shuffle_corpus=True,
        use_glove_with_spacy=True,
        use_cosine_similarity=True,
        num_neighbors=10,
        no_below=5,  # only relevant with Word2Vec
        no_above=0.5,  # only relevant with Word2Vec
        only_print_banners=True):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    game_names, _ = load_game_names(include_genres=False,
                                    include_categories=False)

    steam_tokens = load_tokens()

    documents = list(steam_tokens.values())

    if shuffle_corpus:
        # Useful for Doc2Vec in 'doc2vec_model.py'. It might be useful for other methods.
        random.shuffle(documents)

    if compute_from_scratch:

        if not use_glove_with_spacy:
            # Use self-trained Word2Vec vectors

            dct = Dictionary(documents)
            print('Dictionary size (before trimming): {}'.format(len(dct)))

            dct.filter_extremes(no_below=no_below, no_above=no_above)
            print('Dictionary size (after trimming): {}'.format(len(dct)))

            model = Word2Vec(documents, workers=multiprocessing.cpu_count())

            wv = model.wv

        else:
            # Use pre-trained GloVe vectors loaded from spaCy
            # Reference: https://spacy.io/models/en#en_vectors_web_lg

            spacy_model_name = 'en_vectors_web_lg'  # either 'en_core_web_lg' or 'en_vectors_web_lg'
            nlp = spacy.load(spacy_model_name)

            wv = nlp.vocab

        if pre_process_word_vectors:
            # Jiaqi Mu, Pramod Viswanath, All-but-the-Top: Simple and Effective Postprocessing for Word Representations,
            # in: ICLR 2018 conference.
            # Reference: https://openreview.net/forum?id=HkuGJ3kCb

            if use_glove_with_spacy:
                wv.vectors.data -= np.array(wv.vectors.data).mean(axis=0)

                if num_removed_components_for_word_vectors > 0:
                    wv.vectors.data = remove_pc(
                        wv.vectors.data,
                        npc=num_removed_components_for_word_vectors)

            else:
                wv.vectors -= np.array(wv.vectors).mean(axis=0)

                if num_removed_components_for_word_vectors > 0:
                    wv.vectors = remove_pc(
                        wv.vectors,
                        npc=num_removed_components_for_word_vectors)

                wv.init_sims()

        if use_unit_vectors and not use_glove_with_spacy:
            # Pre-computations of unit word vectors, which replace the unnormalized word vectors. A priori not required
            # here, because another part of the code takes care of it. A fortiori not required when using spaCy.
            wv.init_sims(
                replace=True
            )  # TODO IMPORTANT choose whether to normalize vectors

        if not use_glove_with_spacy:
            index2word_set = set(wv.index2word)
        else:
            index2word_set = None

        num_games = len(steam_tokens)

        word_counter = {}
        document_per_word_counter = {}

        counter = 0
        for app_id in steam_tokens:
            counter += 1

            if (counter % 1000) == 0:
                print('[{}/{}] appID = {} ({})'.format(counter, num_games,
                                                       app_id,
                                                       game_names[app_id]))

            reference_sentence = steam_tokens[app_id]
            if not count_words_out_of_vocabulary:
                # This has an impact on the value of 'total_counter'.
                reference_sentence = filter_out_words_not_in_vocabulary(
                    reference_sentence, index2word_set, wv)

            for word in reference_sentence:
                try:
                    word_counter[word] += 1
                except KeyError:
                    word_counter[word] = 1

            for word in set(reference_sentence):
                try:
                    document_per_word_counter[word] += 1
                except KeyError:
                    document_per_word_counter[word] = 1

        total_counter = sum(word_counter.values())

        # Inverse Document Frequency (IDF)
        idf = {}
        for word in document_per_word_counter:
            idf[word] = math.log(
                (1 + num_games) / (1 + document_per_word_counter[word]))

        # Word frequency. Caveat: over the whole corpus!
        word_frequency = dict()
        for word in word_counter:
            word_frequency[word] = word_counter[word] / total_counter

        sentence_vector = {}
        if not use_glove_with_spacy:
            word_vector_length = wv.vector_size
        else:
            word_vector_length = wv.vectors_length
        X = np.zeros([num_games, word_vector_length])

        counter = 0
        for (i, app_id) in enumerate(steam_tokens.keys()):
            counter += 1

            if (counter % 1000) == 0:
                print('[{}/{}] appID = {} ({})'.format(counter, num_games,
                                                       app_id,
                                                       game_names[app_id]))

            reference_sentence = steam_tokens[app_id]
            num_words_in_reference_sentence = len(reference_sentence)

            reference_sentence = filter_out_words_not_in_vocabulary(
                reference_sentence, index2word_set, wv)
            if not count_words_out_of_vocabulary:
                # NB: Out-of-vocabulary words are not counted in https://stackoverflow.com/a/35092200
                num_words_in_reference_sentence = len(reference_sentence)

            weighted_vector = np.zeros(word_vector_length)

            for word in reference_sentence:
                if use_idf_weights:
                    weight = idf[word]
                else:
                    weight = (alpha / (alpha + word_frequency[word]))

                # TODO IMPORTANT Why use the normalized word vectors instead of the raw word vectors?
                if not use_glove_with_spacy:
                    if use_unit_vectors:
                        # Reference: https://github.com/RaRe-Technologies/movie-plots-by-genre
                        word_vector = wv.vectors_norm[wv.vocab[word].index]
                    else:
                        word_vector = wv.vectors[wv.vocab[word].index]
                else:
                    word_vector = wv.get_vector(word)
                    if use_unit_vectors:
                        word_vector_norm = wv[word].vector_norm
                        if word_vector_norm > 0:
                            word_vector = word_vector / word_vector_norm

                weighted_vector += weight * word_vector

            if len(reference_sentence) > 0:
                sentence_vector[
                    app_id] = weighted_vector / num_words_in_reference_sentence
            else:
                sentence_vector[app_id] = weighted_vector

            X[i, :] = sentence_vector[app_id]

        # Reference: https://stackoverflow.com/a/11620982
        X = np.where(np.isfinite(X), X, 0)

        print('Saving the sentence embedding.')
        np.save('data/X.npy', X)

    else:
        print('Loading the sentence embedding.')
        X = np.load('data/X.npy', mmap_mode='r')

    if num_removed_components_for_sentence_vectors > 0:
        X = remove_pc(X, npc=num_removed_components_for_sentence_vectors)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    matches_as_app_ids = perform_knn_search_with_app_ids_as_input(
        query_app_ids,
        label_database=X,
        app_ids=app_ids,
        use_cosine_similarity=use_cosine_similarity,
        num_neighbors=num_neighbors)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    retrieval_score = compute_retrieval_score(
        query_app_ids,
        matches_as_app_ids,
        num_elements_displayed=num_neighbors,
        verbose=False)

    retrieval_score_by_genre = compute_retrieval_score_based_on_sharing_genres(
        query_app_ids,
        matches_as_app_ids,
        num_elements_displayed=num_neighbors,
        verbose=False)

    retrieval_score_by_tag = compute_retrieval_score_based_on_sharing_tags(
        query_app_ids,
        matches_as_app_ids,
        num_elements_displayed=num_neighbors,
        verbose=False)

    return retrieval_score, retrieval_score_by_genre, retrieval_score_by_tag
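
A hedged usage sketch for the function above: one run with IDF weights and one with the SIF weighting scheme (weight = alpha / (alpha + word frequency), alpha typically in [3e-5, 3e-3] as noted in the signature). The returned tuples of retrieval scores can then be compared; all other parameters keep their defaults.

scores_with_idf = retrieve_similar_store_descriptions(compute_from_scratch=True,
                                                      use_idf_weights=True)

scores_with_sif = retrieve_similar_store_descriptions(compute_from_scratch=True,
                                                      use_idf_weights=False,
                                                      alpha=1e-3)

print(scores_with_idf)
print(scores_with_sif)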
Example #13
"====================================================================="

# load train set descriptions
train_descriptions = UT.load_clean_desc('description.txt', train)
print("Descriptions Train: ", len(train_descriptions))

"======================================================================"

train_features = UT.load_photo_features('features.pkl', train)
print("train features:", len(train_features))

"======================================================================="

# Get tokens
tokens = UT.load_tokens(train_descriptions)
vocab = len(tokens.word_index) + 1
print('Vocab Size:', vocab)

max_length = UT.max_length(train_descriptions)
print('Description Length:', max_length)
# prepare sequences
X1train, X2train, ytrain = UT.create_sequence(tokens, max_length, train_features,
                                              train_descriptions, vocab)
print('Size of sequence', len(X2train))

# Time to load the validation dataset
print("[INFO] Load Val data.......")
test = UT.load_identifiers(args['devPath'])
print('Dataset: %d' % len(test))
# descriptions
Example #14
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  only_print_banners=True)

    return
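
A hedged usage sketch: call main() for a few of the indices listed in possible_model_names (0 = 'tf_idf', 2 = 'lsi_tf_idf', 9 = 'word2vec'); the selection is illustrative and all other parameters keep their defaults.

if __name__ == '__main__':
    for chosen_model_no in [0, 2, 9]:
        main(chosen_model_no=chosen_model_no, num_items_displayed=10)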
Example #15
def apply_pipeline(train_from_scratch=True, avoid_inference=False, shuffle_corpus=True,
                   include_genres=False, include_categories=True, include_app_ids=True,
                   verbose=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    game_names, game_tags = load_game_names(include_genres, include_categories)

    steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens, game_tags, include_app_ids))

    if shuffle_corpus:
        # « Only if the training data has some existing clumping – like all the examples with certain words/topics are
        # stuck together at the top or bottom of the ordering – is native ordering likely to cause training problems.
        # And in that case, a single shuffle, before any training, should be enough to remove the clumping. »
        # Reference: https://stackoverflow.com/a/48080869
        random.shuffle(documents)

    if train_from_scratch:
        print('Creating a new Doc2Vec model from scratch.')
        model = doc2vec.Doc2Vec(documents,
                                vector_size=100,
                                window=5,
                                min_count=5,
                                epochs=20,
                                workers=multiprocessing.cpu_count())

        # NB: Do not follow the piece of advice given in https://rare-technologies.com/doc2vec-tutorial/
        # « I have obtained better results by iterating over the data several times and either:
        #     1. randomizing the order of input sentences, or
        #     2. manually controlling the learning rate over the course of several iterations. »
        # Indeed, in my experience, this leads to buggy results. Moreover, this approach is not recommended according to
        # https://stackoverflow.com/a/48080869

        model.save(get_doc_model_file_name())
    else:
        print('Loading previous Doc2Vec model.')
        model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

    # Test doc2vec

    if verbose:

        try:
            # Spelunky + (Slay the Spire) - (Dream Quest)
            check_analogy(model, pos=['239350', '646570'], neg=['557410'])
        except TypeError:
            pass

        try:
            # Half-Life + (Witcher 2) - (Witcher)
            check_analogy(model, pos=['70', '20920'], neg=['20900'])
        except TypeError:
            pass

        query_app_ids = ['620', '364470', '504230', '583950', '646570', '863550', '794600']

        for query_app_id in query_app_ids:
            print('Query appID: {} ({})'.format(query_app_id, game_names[query_app_id]))
            compute_similarity_using_doc2vec_model(query_app_id, steam_tokens, model,
                                                   avoid_inference=avoid_inference,
                                                   num_items_displayed=10)

        # Check the relevance of the corresponding word2vec
        for query_word in ['anime', 'fun', 'violent']:
            compute_similarity_using_word2vec_model(query_word, steam_tokens, model)

        entity = get_doc_model_entity(model)
        tag_entity = set(tag for tag in entity if 'appID_' not in tag)

        print(tag_entity)

        query_tags = ['In-App Purchases', 'Free to Play', 'Violent', 'Early Access']

        for query_tag in tag_entity.intersection(query_tags):
            for query_app_id in query_app_ids:
                try:
                    sim = model.docvecs.similarity(get_tag_prefix() + query_app_id, query_tag)
                    print('Similarity = {:.0%} for tag {} vs. appID {} ({})'.format(sim, query_tag, query_app_id,
                                                                                    game_names[query_app_id]))
                except KeyError:
                    pass

        num_items_displayed = 3
        for query_tag in tag_entity:
            print('\nTag: {}'.format(query_tag))
            similarity_scores_as_tuples = model.docvecs.most_similar(positive=query_tag, topn=num_items_displayed)
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
            print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    # Top 100

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    num_neighbors = 10
    only_print_banners = True
    use_cosine_similarity = True

    label_database = np.array(model.docvecs.vectors_docs)
    doc_tags = list(model.docvecs.doctags.keys())

    init_indices = np.array(range(len(doc_tags)))
    bool_indices_to_remove = list(map(lambda x: not x.startswith(get_tag_prefix()), doc_tags))
    indices_to_remove = init_indices[bool_indices_to_remove]
    label_database = np.delete(label_database, indices_to_remove, axis=0)

    app_ids = [int(doc_tag[len(get_tag_prefix()):]) for doc_tag in doc_tags
               if doc_tag.startswith(get_tag_prefix())]

    knn = prepare_knn_search(label_database, use_cosine_similarity=use_cosine_similarity)

    query_des = None
    for query_app_id in query_app_ids:
        if avoid_inference:
            inferred_vector = label_database[app_ids.index(query_app_id)]
        else:
            # From query appID to query feature vector
            query = steam_tokens[str(query_app_id)]
            # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
            # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
            inferred_vector = model.infer_vector(query)

        if query_des is None:
            query_des = inferred_vector
        else:
            query_des = np.vstack((query_des, inferred_vector))

    # Matching of feature vectors
    matches = perform_knn_search_with_vectors_as_input(query_des, knn, num_neighbors)

    # From feature matches to appID matches
    matches_as_app_ids = transform_matches_to_app_ids(matches, app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    return
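
A hedged usage sketch of the pipeline above; verbose=True is chosen here only to exercise the diagnostic branch, the other flag values are the function's own defaults.

if __name__ == '__main__':
    apply_pipeline(train_from_scratch=True, avoid_inference=False, verbose=True)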
Example #16
def train(args, states=None):

    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_tokens(
        input_id_path=config['input_id'],
        token_type_id_path=config['token_type_id'],
        attention_mask_path=config['attention_mask'],
        label_path=config['labels'],
    )

    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )

    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    # optimizer = AdamW(model.parameters(), lr=config['lr'])
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])

    total_train_steps = config['num_epochs'] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_train_steps,
    )

    best_metric = 0

    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(
            f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_ids, token_type_ids, attention_mask, labels = data

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            _, logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # probs = F.softmax(logits, dim=1)

            # backprop
            loss = loss_function(logits, labels)
            loss.backward()

            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # update/optimize
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir,
                                     step=i,
                                     states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
Example #17
    'py36': 'python:3.6-slim',
    'py35': 'python:3.5-slim',
    'py27': 'python:2.7-slim'
}
BASE64_REGEX = re.compile(r'^[a-zA-Z0-9+/]+={0,2}$')
FILENAME_REGEX = re.compile(r'^[\w,\s-]+\.[A-Za-z]{1,4}$')
MAX_REQUEST_SIZE = 1 * 1024 * 1024   # 1MB
MAX_EXECUTION_LIMIT = 60  # in seconds
MAX_OUTPUT_FILESIZE = 4 * 1024 * 1024  # 4MB
CONTAINER_WORKING_DIR = '/usr/src/app'
FORMATS = ['text', 'base64_encoded_binary', 'json']
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
CODE_PATH = os.path.dirname(os.path.realpath(__file__))

TOKENS_FILE = os.path.join(DIR_PATH, 'tokens.txt')
TOKENS = utils.load_tokens(TOKENS_FILE)

def limit_content_length(max_length: int) -> Callable:
    """Limits a request to max_length bytes at max."""
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            cl = request.content_length
            if cl is not None and cl > max_length:
                abort(413)
            return f(*args, **kwargs)
        return wrapper
    return decorator

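A hedged usage sketch of the decorator above with Flask; the route name, handler, and response shape are illustrative and not part of the original service.

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/run', methods=['POST'])
@limit_content_length(MAX_REQUEST_SIZE)
def run_snippet():
    # Requests larger than MAX_REQUEST_SIZE are rejected with HTTP 413 by the decorator.
    payload = request.get_json(force=True) or {}
    return jsonify({'format': payload.get('format', 'text')})
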
def die(reason: str = "Fatal Error. Please contact the service's operators and consult for support.", returncode: int = 400) -> Tuple[str, int]:
    """Returns a json of an error."""