def __init__(self, lex_prob_file, corpus_file):
    # lexical probabilities: one list of probabilities per word, read from a whitespace-separated file
    self.lex_prob = defaultdict(list)
    for line in open(lex_prob_file):
        chunks = line[:-1].split()
        self.lex_prob[chunks[1]].append(float(chunks[2]))
    # word frequency distribution over the corpus, built via gensim's TextCorpus
    corpus = TextCorpus(input=corpus_file)
    self.corpus_freq = FreqDist([word for line in corpus.get_texts() for word in line])
    self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]
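For reference, a minimal standalone sketch of the same file-parsing step, assuming a whitespace-separated lexical-probability file whose second column is a word and whose third column is a probability; the file name and sample line are hypothetical:

from collections import defaultdict

lex_prob = defaultdict(list)
with open("lex_probs.txt") as f:  # hypothetical file name
    for line in f:
        chunks = line.rstrip("\n").split()
        # as in the snippet above: chunks[1] is the word, chunks[2] its probability
        lex_prob[chunks[1]].append(float(chunks[2]))

# e.g. a line "haus house 0.8" would give lex_prob["house"] == [0.8]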
Example #3
def train_gensim():
	from gensim.corpora import TextCorpus
	from gensim.corpora.textcorpus import lower_to_unicode
	from gensim.models import Word2Vec as GensimWord2Vec

	start = time()

	stopwords = []
	if args.stop_word_lang:
		# starting spark only for this...
		spark = SparkSession.builder.appName("load stop words").getOrCreate()
		stopwords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang)
		spark.sparkContext.stop()
	if args.stop_word_file:
		with open(args.stop_word_file) as stop_word_file:
			stopwords += [word.strip("\n") for word in stop_word_file.readlines()]

	def remove_stopwords(tokens):
		return [token for token in tokens if token not in stopwords]

	corpus = TextCorpus(
		args.txtPath,
		# pass a dummy dictionary so TextCorpus does not build one; only get_texts() is needed here
		dictionary={None: None},
		character_filters=[lower_to_unicode],
		token_filters=[remove_stopwords]
	)

	model = GensimWord2Vec(
		seed=1,
		alpha=args.step_size,
		size=args.vector_size,  # 'size' is the pre-gensim-4.0 name; gensim 4+ calls it vector_size
		window=args.window_size,
		sample=1e-6,
		sg=1
	)
	model.build_vocab(corpus.get_texts())
	model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
	model.save(args.modelPath)

	end = time()
	print("Gensim training took {} seconds".format(end - start))
Example #4
# flask / gensim imports (corpus_vec, sentence_to_vec and get_closest_doc are assumed to be project-local helpers imported elsewhere)
from flask import Flask
from flask_cors import CORS
from gensim.corpora import TextCorpus
from gensim.models import TfidfModel, Word2Vec

# logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # the root logger defaults to WARNING, so INFO messages would otherwise be dropped
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)

logger.info("Setting up app!")

app = Flask(__name__)
CORS(app)

# set up the corpus, tf-idf weights and the pretrained word2vec model
corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv')
corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000)
sentences = [list(g) for g in corpus.get_texts()]
tfidf = TfidfModel(corpus)
model = Word2Vec.load("profiles.model")
corp_vecs = corpus_vec(sentences, model, corpus)

# create simple helper functions
get_vec = lambda t: sentence_to_vec(t, model, corpus, tfidf)
get_job = lambda v: get_closest_doc(v, corp_vecs, sentences)

# our database of bayesopt models
user_models = {}

@app.route('/init/<i>')
def init(i):
    # create a bayesopt model for this id and store it in memory: user_models[i] = ...
    user_models[i] = "foo"