from collections import defaultdict

from gensim.corpora import TextCorpus
from nltk import FreqDist


def __init__(self, lex_prob_file, corpus_file):
    # Map each target word to every lexical translation probability seen for it.
    # Each line is whitespace-separated; field 2 is the target word, field 3 its probability.
    self.lex_prob = defaultdict(list)
    with open(lex_prob_file) as f:
        for line in f:
            chunks = line.rstrip("\n").split()
            self.lex_prob[chunks[1]].append(float(chunks[2]))
    # Token frequency distribution over the tokenized corpus.
    corpus = TextCorpus(input=corpus_file)
    self.corpus_freq = FreqDist(word for line in corpus.get_texts() for word in line)
    self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]
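# A minimal, self-contained sketch of the lex_prob parsing above, run on an
# inline example instead of a file. The "source target probability" row format
# is an assumption inferred from the indices used in __init__.
from collections import defaultdict

sample_rows = ["la the 0.33", "maison house 0.51", "casa house 0.42"]
lex_prob = defaultdict(list)
for row in sample_rows:
    chunks = row.split()
    lex_prob[chunks[1]].append(float(chunks[2]))
print(dict(lex_prob))  # {'the': [0.33], 'house': [0.51, 0.42]}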
def train_gensim():
    from time import time

    from gensim.corpora import TextCorpus
    from gensim.corpora.textcorpus import lower_to_unicode
    from gensim.models import Word2Vec as GensimWord2Vec

    start = time()

    # Collect stop words as a set for O(1) membership tests during filtering.
    stopwords = set()
    if args.stop_word_lang:
        # starting spark only for this...
        from pyspark.ml.feature import StopWordsRemover
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("load stop words").getOrCreate()
        stopwords.update(StopWordsRemover.loadDefaultStopWords(args.stop_word_lang))
        spark.sparkContext.stop()
    if args.stop_word_file:
        with open(args.stop_word_file) as stop_word_file:
            stopwords.update(word.strip("\n") for word in stop_word_file)

    def remove_stopwords(tokens):
        return [token for token in tokens if token not in stopwords]

    # dictionary={None: None} skips building a gensim Dictionary we never use.
    corpus = TextCorpus(
        args.txtPath,
        dictionary={None: None},
        character_filters=[lower_to_unicode],
        token_filters=[remove_stopwords],
    )
    model = GensimWord2Vec(
        seed=1,
        alpha=args.step_size,
        size=args.vector_size,
        window=args.window_size,
        sample=1e-6,
        sg=1,  # skip-gram
    )
    model.build_vocab(corpus.get_texts())
    model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
    model.save(args.modelPath)

    end = time()
    print("Gensim training took {} seconds".format(end - start))
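# A sketch of the module-level argument parsing train_gensim() appears to rely
# on. The attribute names are taken from the function body; the flag spellings
# and defaults are illustrative assumptions, not from the original script.
import argparse

parser = argparse.ArgumentParser(description="Train word2vec with gensim")
parser.add_argument("txtPath", help="file or directory of training text")
parser.add_argument("modelPath", help="where to save the trained model")
parser.add_argument("--step-size", dest="step_size", type=float, default=0.025)
parser.add_argument("--vector-size", dest="vector_size", type=int, default=100)
parser.add_argument("--window-size", dest="window_size", type=int, default=5)
parser.add_argument("--stop-word-lang", dest="stop_word_lang", default=None)
parser.add_argument("--stop-word-file", dest="stop_word_file", default=None)
args = parser.parse_args()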
import logging

from flask import Flask
from flask_cors import CORS
from gensim.corpora import TextCorpus
from gensim.models import TfidfModel, Word2Vec

# logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # without this the root logger stays at WARNING
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
logger.info("Setting up app!")

app = Flask(__name__)
CORS(app)

# setup model stuff
# (corpus_vec, sentence_to_vec, and get_closest_doc are assumed to be defined
# elsewhere in this project)
corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv')
corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000)
sentences = [list(g) for g in corpus.get_texts()]
tfidf = TfidfModel(corpus)
model = Word2Vec.load("profiles.model")
corp_vecs = corpus_vec(sentences, model, corpus)

# create simple helper functions
get_vec = lambda t: sentence_to_vec(t, model, corpus, tfidf)
get_job = lambda v: get_closest_doc(v, corp_vecs, sentences)

# our in-memory database of bayesopt models
user_models = {}


@app.route('/init/<i>')
def init(i):
    # make bayesianopt class with id and store in memory
    user_models[i] = "foo"  # placeholder value kept from the original
    return i  # Flask views must return a response; echo the id back
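# A hedged client-side sketch for exercising the /init/<i> route once the app
# is served (localhost:5000 is Flask's default address, assumed here; requests
# is a third-party HTTP client).
import requests

resp = requests.get("http://localhost:5000/init/42")
print(resp.status_code, resp.text)  # expects 200 and the echoed id "42"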