def wordtovec(wordrdd):
    """Fit a Word2Vec model on an RDD of word sequences, print the learned
    vector table, then print the five nearest neighbours of the token '1'
    with their cosine distances."""
    model = Word2Vec().fit(wordrdd)
    print(model.getVectors())
    for word, cosine_distance in model.findSynonyms('1', 5):
        print("{}: {}".format(word, cosine_distance))
def word2vec(file_dir):
    """Train a Word2Vec model on a whitespace-tokenised text file and write,
    for every vocabulary word, its five nearest synonyms as one JSON object
    per line.

    :param file_dir: directory prefix joined with the module-level
        WORD2VEC_TRAINING_FILE / SYNONYM_DATA_FILE / WORD2VEC_TRACE names.
    Side effects: writes the synonym file, saves the model to
    ``word2vec_trace_data`` and stops the SparkContext it creates.
    """
    word2vec_training_file = file_dir + WORD2VEC_TRAINING_FILE
    synonym_data_file = file_dir + SYNONYM_DATA_FILE
    word2vec_trace_data = file_dir + WORD2VEC_TRACE
    sc = SparkContext(appName="word2vec")
    inp = sc.textFile(word2vec_training_file).map(lambda line: line.split(" "))
    word2vec = Word2Vec()
    model = word2vec.setLearningRate(0.02).setMinCount(5) \
        .setVectorSize(10).setSeed(2017).fit(inp)
    vec = model.getVectors()
    logger = logging.getLogger()
    logger.debug("len of vec:{0}".format(len(vec)))
    # Bug fix: the original opened the file and closed it manually, leaking
    # the handle if findSynonyms raised; `with` guarantees closure.
    with open(synonym_data_file, "w") as synonyms_data:
        for word in vec.keys():
            synonyms = model.findSynonyms(word, 5)
            entry = {"word": word,
                     "synonyms": [synonym for synonym, _ in synonyms]}
            synonyms_data.write(json.dumps(entry))
            synonyms_data.write('\n')
    model.save(sc, word2vec_trace_data)
    sc.stop()
    logger.info("Word2Vec training finished")
def train_w2v():
    # Train 99 Word2Vec models on independent 1% samples of the January
    # tweet corpus, saving each to HDFS. Python 2 code (print statements,
    # integer division). Relies on module-level `sqlContext` and `sc`.
    threshold = 20  # minimum corpus frequency for a word to enter the vocabulary
    data = sqlContext.read.parquet('hdfs:///user/rmusters/data_jan').select( "filtered_text")
    counts = data.flatMap(lambda line: line) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .filter(lambda pair: pair[1] >= threshold)
    # NOTE(review): `counts` is computed but never used once the count is
    # hard-coded below (presumably to skip the expensive Spark job).
    vocab_size = 67585 #counts.count()
    print "Vocabulary size is: ", vocab_size
    data = data.map(lambda line: line.filtered_text.split())
    # Presumably chosen to keep vocab_size * vector_size below an internal
    # Spark array-size limit — TODO confirm. Under Python 2 the `/` below
    # is integer (floor) division, so vector_size is an int.
    max_int_size = 268435455
    vector_size = max_int_size / vocab_size
    print "Vector size is: ", vector_size
    word2vec = Word2Vec()
    word2vec.setMinCount(threshold)
    word2vec.setVectorSize(vector_size)
    for idx in range(1, 100, 1):
        print idx
        # Each iteration fits on a fresh 1% sample (without replacement)
        # and saves a separately numbered model.
        model = word2vec.fit(data.sample(False, 0.01))
        model.save(sc, '/user/rmusters/jan_threshold20_2015model' + str(idx))
def word2vec(rdd,**kw):
    """Train a Word2Vec model on *rdd* and return the word vectors.

    Keyword options (translated from the original Chinese docstring):
        vec_len: length of the generated vectors (default 300)
        min_count: minimum occurrence count for a word (default 3)
        window_size: context window length (default 5)
        partitions: number of Spark partitions (default 5)
        learning_rate: initial learning rate (default 0.025)

    Returns a dict mapping word -> list of floats.
    Python 2 code (print statement).
    """
    seed = int(time.time())  # non-deterministic: seeded from the wall clock
    vec_len = kw.get('vec_len',300)
    min_count = kw.get('min_count',3)
    window_size = kw.get('window_size',5)
    partitions = kw.get('partitions',5)
    lr = kw.get('learning_rate',0.025)
    step_1 = time.time()
    model = Word2Vec().setVectorSize(vec_len).setLearningRate(lr).setMinCount(min_count).\
        setNumPartitions(partitions).setSeed(seed).setWindowSize(window_size).fit(rdd)
    vectors = model.getVectors()
    step_2 = time.time()
    print 'Build Word2vec Model Using:%s s!' % (step_2-step_1)
    # Convert the Java-backed vector map into a plain dict of Python lists
    # so the result is serialisable without Spark.
    result = dict(vectors)
    keys = result.keys()
    for key in keys:
        result[key] = list(result[key])
    return result
def word2vec(df, inputcol, outputcol, vecsize):
    """Tokenise `inputcol`, build term-frequency vectors, train a Word2Vec
    model and print the first three embedding rows.

    :param df: input DataFrame.
    :param inputcol: name of the raw-text column.
    :param outputcol: name of the column that receives the Word2Vec embedding.
    :param vecsize: embedding dimensionality.
    :return: the tokenised DataFrame (`t_words`) — note: NOT the embedded one.
    """
    # Bug fix: the original imported pyspark.mllib.feature.Word2Vec and then
    # immediately shadowed it with the pyspark.ml version; only the ml
    # import is ever used, so the mllib import is removed.
    from pyspark.ml.feature import Word2Vec
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
    # Apply the custom segmentation UDF (translated comment).
    # NOTE(review): drop() is not in-place and its result is discarded; this
    # is harmless only because withColumn overwrites 'seg' anyway.
    df.drop('seg')
    df_seg = df.withColumn("seg", segUDF(inputcol))
    df_w = df_seg.drop('words')
    tokenizer = Tokenizer(inputCol=inputcol, outputCol='words')
    t_words = tokenizer.transform(df_w)
    t_words.select('words').head()
    # 4. Convert the text into sparse numeric term-frequency vectors.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=5, minDF=2.0)
    df_f = t_words.drop("features")
    cv_model = cv.fit(df_f)
    cv_result = cv_model.transform(df_f)
    # 5. Turn the tokenizer output into dense Word2Vec embeddings.
    word2Vec = Word2Vec(vectorSize=vecsize, minCount=0, inputCol="words", outputCol=outputcol)
    w2v_model = word2Vec.fit(cv_result)
    result = w2v_model.transform(cv_result)
    for feature in result.select(outputcol).take(3):
        print(feature)
    return t_words
def write_data(path):
    # Build a (text, filtered_text, vectors) DataFrame for January 2015
    # tweets and write it to parquet. Python 2 code: the tuple-unpacking
    # lambda below is a syntax error under Python 3. Relies on module-level
    # `sc` and `logger`.
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel
    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line).split(" "))
    # load model
    word2vec = Word2Vec()  # NOTE(review): unused — the model is loaded below
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')
    # get a tweet vector pair.
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    # Broadcast the word -> vector map so executors can do local lookups.
    lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
    vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
    logger.info(vectors.count())
    # NOTE(review): only the first word's vector ([0]) is kept per tweet —
    # confirm this shouldn't aggregate over all words instead.
    data = text_file.map(lambda line: (line, filter.filter(line).split(" ")))\
        .map(lambda (text, filtered): (text, filtered, [lookup_bd.value.get(w) for w in filtered][0]))
    from pyspark.sql.functions import monotonicallyIncreasingId
    df = data.toDF(["text", "filtered_text", "vectors"])
    # This will return a new DF with all the columns + id
    res = df.withColumn("id", monotonicallyIncreasingId())
    res.write.parquet(path, mode="overwrite")
def main(argv):
    # Train a Word2Vec model on one HDFS text file, save it, reload it and
    # print the vectors as a save/load round-trip sanity check. Python 2
    # code (print statement). `argv` is currently unused because the getopt
    # parsing below is commented out.
    import getopt
    dir = '/user/rmusters/'  # NOTE: shadows the builtin dir()
    word2vec = Word2Vec()
    sc = SparkContext(appName='Word2Vec')
    # # try:
    #     opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    # except getopt.GetoptError:
    #     print 'test.py -i <inputfile> -o <outputfile>'
    #     sys.exit(2)
    # for opt, arg in opts:
    #     if opt == '-h':
    #         print 'test.py -i <inputfile> -o <outputfile>'
    #         sys.exit()
    #     elif opt in ("-l"):
    #         inputfile = arg
    #     elif opt in ("-s"):
    #         outputfile = arg
    # print 'Input file is "', inputfile
    # print 'Output file is "', outputfile
    filename = "12.txt"
    inp = sc.textFile(dir + filename).map(lambda row: row.split(" "))
    model = word2vec.fit(inp)
    model.save(sc, dir + "pymodelF.bin")
    # Reload what was just saved to verify the artifact is readable.
    model = Word2VecModel.load(sc, dir + "pymodelF.bin")
    print model.getVectors()
def main(sc):
    """Concatenate requirement+description text from 300 JSON part files,
    train a character/word Word2Vec model and dump the vectors to a
    tab-separated CSV.

    :param sc: SparkContext parameter kept for interface compatibility
        (a SparkSession is built internally).
    """
    spark = SparkSession.builder.appName("Charembedding").config(
        "spark.some.config.option", "Charembedding").getOrCreate()
    # read json and preprocess
    file_path = '/user/ichongxiang/data/positions/20180518/dedup_json/part-'
    df = spark.read.json(
        '/user/ichongxiang/data/positions/20180518/dedup_json/part-00000'
    ).select("requirement", "description")
    text_00000 = df.rdd.map(list)
    text_00000 = text_00000.map(lambda r: [r[0] + r[1]])
    inp_all = text_00000.map(split)
    for i in range(1, 300):
        print('Processing input files:%s/%s' % (i, 300))
        part = "%05d" % i  # zero-padded part suffix, e.g. part-00042
        df = spark.read.json(file_path + part).select(
            "requirement", "description")
        text = df.rdd.map(list)
        text = text.map(lambda r: [r[0] + r[1]])
        inp_all = inp_all.union(text.map(split))
    print('Start Traing Word2vec')
    word2vec = Word2Vec()
    model = word2vec.setVectorSize(100).setMinCount(0).setSeed(
        100000000000000).fit(inp_all)
    w2v_dict = model.getVectors()
    print('Saving Word2vec model vectors')
    # Bug fix: the original used explicit open()/close() with no exception
    # safety; `with` guarantees the CSV is closed even if writing raises.
    with open("char_embedding_w2v.csv", 'w') as w2v_save:
        for word, vector in w2v_dict.items():
            w2v_save.write(str(word))
            w2v_save.write('\t')
            w2v_save.write(str(vector))
            w2v_save.write('\n')
    print("succeed")
def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis,
                  redisKeyPrefix):
    """Train an item2vec model (Word2Vec over movie-id sequences) and
    persist the embeddings to a file and optionally to Redis.

    :param samples: RDD of movie-id token sequences.
    :param embLength: embedding dimensionality.
    :param embOutputPath: output file, one "movieId:e1 e2 ..." line per movie.
    :param saveToRedis: also write each vector to Redis when True.
    :param redisKeyPrefix: prefix for Redis keys.
    :return: the fitted Word2Vec model.
    """
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    print("similarMovieId, cosineSimilarity")
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
    if not os.path.exists(embOutputDir):
        os.makedirs(embOutputDir)
    # Hoisted: call getVectors() once. model.getVectors() -> {movie_id: List[movie_embedding]}
    vectors_map = model.getVectors()
    with open(embOutputPath, 'w') as f:
        for movie_id in vectors_map:
            vectors = " ".join(str(emb) for emb in vectors_map[movie_id])
            f.write(movie_id + ":" + vectors + "\n")
    # save to Redis
    if saveToRedis:
        r = redis.Redis(host=redisHost, port=redisPort, db=0,
                        decode_responses=True)
        for movieId in vectors_map:
            # Bug fix: the original wrote vectors_map[movie_id] — the LAST
            # id left over from the file-writing loop — so every Redis key
            # received the same embedding. Use the loop variable.
            r.set(redisKeyPrefix + ":" + movieId,
                  " ".join(str(emb) for emb in vectors_map[movieId]),
                  ex=TTL)
    embeddingLSH(spark, vectors_map)
    return model
def test_word2vec_setters(self):
    """Every fluent setter must be reflected in the matching attribute."""
    model = (
        Word2Vec()
        .setVectorSize(2)
        .setLearningRate(0.01)
        .setNumPartitions(2)
        .setNumIterations(10)
        .setSeed(1024)
        .setMinCount(3)
        .setWindowSize(6)
    )
    self.assertEqual(model.vectorSize, 2)
    self.assertTrue(model.learningRate < 0.02)
    self.assertEqual(model.numPartitions, 2)
    self.assertEqual(model.numIterations, 10)
    self.assertEqual(model.seed, 1024)
    self.assertEqual(model.minCount, 3)
    self.assertEqual(model.windowSize, 6)
def __get_word2vec(self, word2vec_setting):
    """Build a configured (but not yet fitted) Word2Vec estimator.

    :param word2vec_setting: 4-tuple of
        (min_count, seed, learning_rate, vector_size).
    :return: the configured Word2Vec instance.
    """
    min_count, seed, learning_rate, vector_size = word2vec_setting
    word2vec = Word2Vec()
    # NOTE(review): the original comment claimed Word2Vec's default min
    # count is 100 — 100 is the default *vector size*; minCount defaults to
    # 5 (confirm against the pyspark version in use). Our default is 20.
    word2vec.setMinCount(min_count)
    word2vec.setSeed(seed)
    # Word2Vec's default learning rate is 0.025; our default is also 0.025.
    word2vec.setLearningRate(learning_rate)
    # Word2Vec's default vector size is 100; our default vector size is 50.
    word2vec.setVectorSize(vector_size)
    return word2vec
def test_word2vec_get_vectors(self):
    """Fitting on a tiny triangular corpus must keep only the words that
    meet the default minCount of 5: 'a' (7), 'b' (6) and 'c' (5)."""
    data = [
        ["a", "b", "c", "d", "e", "f", "g"],
        ["a", "b", "c", "d", "e", "f"],
        ["a", "b", "c", "d", "e"],
        ["a", "b", "c", "d"],
        ["a", "b", "c"],
        ["a", "b"],
        ["a"]
    ]
    model = Word2Vec().fit(self.sc.parallelize(data))
    # Fix: assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(len(model.getVectors()), 3)
def create_model_text(self, data, params):
    """Fit a Word2Vec model on an RDD of space-separated text rows.

    :param data: RDD of strings; each row is split on single spaces.
    :param params: mapping that may override learningRate (default 0.025),
        numIterations (default 10) and minCount (default 5).
    :return: the fitted Word2VecModel.
    """
    word2vec = Word2Vec()
    word2vec.setLearningRate(float(params.get('learningRate', 0.025)))
    word2vec.setNumIterations(int(params.get('numIterations', 10)))
    word2vec.setMinCount(int(params.get('minCount', 5)))
    tokenized = data.map(lambda row: row.split(" "))
    return word2vec.fit(tokenized)
def _skip_gram(self, walks_rdd):
    """Train skip-gram embeddings on the random-walk RDD and return the
    resulting word -> vector map.

    Hyperparameters are read from this estimator's params:
    vector_size, min_count, num_partitions, learning_rate, num_iter.
    """
    estimator = (
        Word2Vec()
        .setVectorSize(self.getOrDefault("vector_size"))
        .setMinCount(self.getOrDefault("min_count"))
        .setNumPartitions(self.getOrDefault("num_partitions"))
        .setLearningRate(self.getOrDefault("learning_rate"))
        .setNumIterations(self.getOrDefault("num_iter"))
    )
    return estimator.fit(walks_rdd).getVectors()
def word2vecModel(text):
    """Computes a distributed vector representation of words using a
    skip-gram model.

    The skip-gram training objective is to learn word vector
    representations that are good at predicting their context within the
    same sentence.

    :parameter text: (REQUIRED) - the input data of text words/strings
        you'd like to use
    :return: word2vec model

    Use it as:

    .. code-block::python

        model = word2vecModel(text)
        synonyms = model.findSynonyms('random_word', 40)
    """
    return Word2Vec().fit(text)
def test_word2vec_setters(self):
    """Each fluent setter should be reflected in the matching attribute.

    NOTE: `data` is defined but the model is never fitted — the test only
    checks the configuration surface.
    """
    data = [["I", "have", "a", "pen"],
            ["I", "like", "soccer", "very", "much"],
            ["I", "live", "in", "Tokyo"]]
    model = Word2Vec() \
        .setVectorSize(2) \
        .setLearningRate(0.01) \
        .setNumPartitions(2) \
        .setNumIterations(10) \
        .setSeed(1024) \
        .setMinCount(3)
    # Fix: assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(model.vectorSize, 2)
    self.assertTrue(model.learningRate < 0.02)
    self.assertEqual(model.numPartitions, 2)
    self.assertEqual(model.numIterations, 10)
    self.assertEqual(model.seed, 1024)
    self.assertEqual(model.minCount, 3)
def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis,
                  redisKeyPrefix):
    """Train an item2vec model over movie-id sequences and write the
    embeddings to a local file.

    :param samples: RDD of movie-id token sequences.
    :param embLength: embedding dimensionality.
    :param embOutputPath: output file, one "movieId:e1 e2 ..." line per movie.
    :param saveToRedis, redisKeyPrefix: accepted for interface
        compatibility; unused in this variant.
    :return: the fitted Word2Vec model.
    """
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
    if not os.path.exists(embOutputDir):
        os.makedirs(embOutputDir)
    # Perf fix: the original called model.getVectors() twice per movie
    # inside the loop; fetch the map once.
    vectors_map = model.getVectors()
    with open(embOutputPath, 'w') as f:
        for movie_id in vectors_map:
            vectors = " ".join(str(emb) for emb in vectors_map[movie_id])
            f.write(movie_id + ":" + vectors + "\n")
    embeddingLSH(spark, vectors_map)
    return model
def main():
    # Train 99 Word2Vec models on independent 1% samples of the filtered
    # tweet corpus and save each to HDFS. Python 2 code (print statements,
    # integer division). Relies on module-level `sc`, `loc` and `filter`.
    # Threshold to limit words which occur less than the threshold
    threshold = 10 #10
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line))
    counts = data.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .filter(lambda pair: pair[1] >= threshold) #.sortBy(lambda x:x[1], ascending=True) #only use for inspection
    counts.cache()
    vocab_size = counts.count()
    print "Vocabulary size is: ", vocab_size
    inp = data.map(lambda line: line.split(" "))
    inp.cache()
    # Presumably keeps vocab_size * vector_size below an internal Spark
    # limit — TODO confirm. Python 2 `/` here is integer (floor) division.
    max_int_size = 268435455
    vector_size = max_int_size / vocab_size
    print "Vector size is: ", vector_size
    word2vec = Word2Vec()
    word2vec.setMinCount(threshold) #40
    word2vec.setVectorSize(vector_size) #/100
    for idx in range(1, 100, 1):
        print idx
        # Each iteration fits on a fresh 1% sample (without replacement).
        model = word2vec.fit(inp.sample(False, 0.01))
        # if idx == 1 or idx == 2:
        #     print "Vector size of current model: ", word2vec.getVectorSize()
        #     inputcol = word2vec.getInputCol()
        #     outputcol = word2vec.getOutputCol()
        #     print "input column: ", inputcol
        #     try:
        #         print len(inputcol)
        #         print len(outputcol)
        #     except:
        #         pass
        #     print "output column", outputcol
        model.save(sc, '/user/rmusters/threshold20_2015model' + str(idx))
def trainItem2vecAndSave(spark, samples, embLength, embOutputPath):
    """Train an item2vec model over movie-id sequences and upload the
    embeddings to S3.

    :param samples: RDD of movie-id token sequences.
    :param embLength: embedding dimensionality.
    :param embOutputPath: S3-style URL, e.g. "s3://bucket/key/..."; bucket
        and key are parsed from it.
    :return: the fitted Word2Vec model.
    """
    bucket = embOutputPath.split('//')[1].split('/', 1)[0]
    key = embOutputPath.split('//')[1].split('/', 1)[1]
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    # Perf fix: the original called model.getVectors() twice per movie
    # inside the loop; fetch the map once.
    vectors_map = model.getVectors()
    buffer = StringIO()
    for movie_id in vectors_map:
        vectors = " ".join(str(emb) for emb in vectors_map[movie_id])
        buffer.write(movie_id + ":" + vectors + "\n")
    buffer.seek(0)
    s3.put_object(Bucket=bucket, Key=key, Body=buffer.read())
    embeddingLSH(spark, vectors_map)
    return model
def train_item_to_vec(self, spark_session: SparkSession, samples,
                      emb_length: int, emb_output_file_name: str,
                      save_to_redis: bool, redis_key_prefix: str):
    """
    train a word2vec model based on movie samples
    :param spark_session:
    :param samples: RDD of movie-id token sequences
    :param emb_length: embedding dimensionality
    :param emb_output_file_name: file name under resources/webroot/modeldata/
    :param save_to_redis: also write vectors to Redis when True
    :param redis_key_prefix: prefix for Redis keys
    :return: the fitted Word2Vec model
    """
    word2vec = Word2Vec().setVectorSize(emb_length).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosine_sim in synonyms:
        print(synonym, cosine_sim)
    root_dir = dirname(dirname(dirname(abspath(__file__))))
    rating_resource_path = join(root_dir, "resources", "webroot/modeldata/")
    # Hoisted: fetch the vector map once instead of on every use.
    vectors_map = model.getVectors()
    # Bug fix: the original opened the file and never closed it; `with`
    # guarantees the handle is released even on error.
    with open(join(rating_resource_path, emb_output_file_name), "w") as file:
        for movieId, vector in vectors_map.items():
            file.write(movieId + ":" +
                       " ".join([str(num) for num in vector]) + "\n")
    if save_to_redis:
        redis_client = redis.Redis(host=self.redisEndpoint,
                                   port=self.redisPort)
        for movieId, vector in vectors_map.items():
            redis_client.set(redis_key_prefix + ":" + movieId,
                             " ".join([str(num) for num in vector]),
                             ex=60 * 60 * 24)
        redis_client.close()
    self.embedding_lsh(spark_session, vectors_map)
    return model
def main():
    """Train Word2Vec on the HDFS result file, then store the five nearest
    synonyms of every vocabulary noun through db_manager, padding with
    empty strings when fewer than five are returned."""
    tokens = sc.textFile("hdfs://hadoop2/input/result.txt").map(
        lambda row: row.split(" "))
    model = Word2Vec().fit(tokens)
    for noun in model.getVectors().keys():
        # Pre-fill with "" so db_manager always receives five arguments.
        neighbours = ["" for _ in range(5)]
        for idx, (word, _cosine) in enumerate(model.findSynonyms(noun, 5)):
            neighbours[idx] = word
        try:
            print(noun.encode('utf-8'))
            db_manager(noun, neighbours[0], neighbours[1], neighbours[2],
                       neighbours[3], neighbours[4])
        except Exception as err:
            print(err)
            pass
def trainItem2vec(spark, samples, embLength, embOutputPath, redisKeyPrefix, saveToRedis=False):
    # Build the Word2vec model structure (comments translated from Chinese):
    # setVectorSize - dimensionality of the embedding, i.e. the number of
    #   neurons in Word2vec's hidden layer
    # setWindowSize - size of the window slid over the sequence (windowSize = 2c+1)
    # setNumIterations - number of training iterations, similar to epochs
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
    model = word2vec.fit(samples)
    # Use the built-in helper to find the N items most similar to one item.
    synonyms = model.findSynonyms("592", 20)  # id "592" is Batman
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    # Extract the embeddings from the trained Word2vec model and store them
    # either in the target folder or in Redis.
    if not saveToRedis:
        embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
        # if not os.path.exists(embOutputDir): os.mkdir(embOutputDir)
        # getVectors() returns a map<movie_id : String, Embedding : Vector>
        # of each word and its vector representation (rows of W_vxn).
        with open(embOutputPath, 'w') as file:
            for movie_id in model.getVectors():
                vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
                file.write(movie_id + ":" + vectors + "\n")
    else:
        # Write the item embeddings into Redis.
        # NOTE(review): host/port/password are hard-coded credentials —
        # move to configuration.
        redis_client = redis.StrictRedis(host='66.42.66.135', port='6379', db=0, password='******')
        expire_time = 60*60*24  # cache TTL of 24h
        # Use a pipeline; otherwise every request costs one Redis RTT,
        # which is far too slow.
        pipe = redis_client.pipeline(transaction=True)
        for movie_id in model.getVectors():
            vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
            pipe.set(redisKeyPrefix + ":" + movie_id, vectors)
            # pipe.expire(redisKeyPrefix + ":" + movie_id, expire_time)  # not deployed yet; no TTL for now
        # Execute the requests queued in the pipeline.
        pipe.execute()
        redis_client.close()
    return model
def event_gen_word2vec(sc, log_lines, window_size=60):
    """Group log templates into events: bucket template ids into
    `window_size`-second windows, embed each window's template sequence with
    Word2Vec, then cluster the template vectors with HDBSCAN."""
    import hdbscan
    windows = (
        log_lines
        .map(lambda logline: (int(logline.ts / window_size),
                              (logline.ts, logline.templateId)))
        .groupByKey()
        .map(lambda kv: [str(templateId)
                         for (ts, templateId) in sorted(kv[1])])
    )
    # Run Word2Vec on the per-window template-id sequences.
    model_vectors = Word2Vec().setVectorSize(16).setSeed(42).fit(
        windows).getVectors()
    labels = []
    vectors = []
    for label, vector in model_vectors.items():
        labels.append(label)
        vectors.append(list(vector))
    # Cluster the template embeddings.
    output_events = defaultdict(list)
    cluster_ids = hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)
    for i, cluster_id in enumerate(cluster_ids):
        output_events[cluster_id].append(labels[i])
    # Create event objects, keeping only non-empty clusters.
    events = []
    for cluster_id in output_events:
        event = Event(id=cluster_id,
                      template_ids=map(int, output_events[cluster_id]))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
import urllib
import os
import json

import nltk
from nltk.tree import Tree
from numpy import array
from time import time

# Bug fix: SparkConf and SparkContext are used below but were never
# imported in the original script.
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.regression import LabeledPoint

from classes.PosNeg import PosNegCount
from classes.WordVector import WordVectorAnalyzer
from Levenstein import Lev

# English stop words plus sentence punctuation.
# (Duplicate `import json` from the original was removed.)
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['?', '.', '!', ',']

sparkConf = SparkConf().setMaster("local").setAppName(
    "PredictKafkaTweetStreaming").set("spark.app.id", "Predict")
sc = SparkContext(appName="WordVectorTrainer")
sc.setLogLevel("WARN")

# Train Word2Vec on the whitespace-tokenised training corpus.
inp = sc.textFile("WordTraining.txt").map(lambda row: row.split(" "))
word2vec = Word2Vec()
model = word2vec.fit(inp)

# Dump the seven nearest synonyms of every vocabulary word to JSON.
WordVectors = {}
for i in model.getVectors().keys():
    WordVectors[i] = model.findSynonyms(i, 7)
with open('WordVectors.json', 'w') as fp:
    json.dump(WordVectors, fp)
def trainOne(sc, url):
    """Fetch a single corpus via url2rdd and fit a Word2Vec model on it."""
    corpus = url2rdd(sc, url)
    return Word2Vec().fit(corpus)
def train(sc, urls):
    """Train one Word2Vec model on the union of all corpora listed in
    *urls*.

    :param sc: SparkContext passed through to url2rdd.
    :param urls: newline-separated list of corpus URLs.
    :return: the fitted Word2VecModel.
    """
    w2v = Word2Vec()
    rdds = reduce(lambda a, b: a.union(b),
                  [url2rdd(sc, url) for url in urls.split("\n")])
    # Bug fix: the original returned w2v.fit(rdd) — `rdd` was never defined
    # (NameError at runtime); the unioned RDD is named `rdds`.
    return w2v.fit(rdds)
def generate_word2vec_model(doc):
    """Fit a 10-dimensional Word2Vec model on *doc* with a fixed seed (42)
    for reproducibility."""
    estimator = Word2Vec()
    estimator.setVectorSize(10)
    estimator.setSeed(42)
    return estimator.fit(doc)
def run_word_embedding_word2vec(sc, sentences, wepath):
    # Fit a Word2Vec model on `sentences` and persist it to `wepath`.
    # Python 2 code (print statement).
    word2vec = Word2Vec()
    model = word2vec.fit(sentences)
    # model.transform(sentences).saveAsTextFile(wepath)
    print 'saving model to output: {}'.format(wepath)
    model.save(sc, wepath)
for feature in features_l: if all(ord(char) < 128 for char in feature): if feature.rfind("#") == -1 and feature.rfind("@") == -1 and feature.rfind("https"): feature.replace(",", "").replace(".", "").replace(":", "").replace(";", "").replace("\"", "").lower() new_feature_l.append(feature) return new_feature_l first_n_rows = int(sys.argv[1]) vectorSize = int(sys.argv[2]) sc = SparkContext(appName = "Prova") sqlContext = SQLContext(sc) test = sc.textFile("txt/dataset_" + str(first_n_rows) + ".txt") #test = test.map(lambda tweet: tweet[1].replace("\"", "")) test = test.filter(lambda tweet: "," in tweet).map(lambda tweet: tweet.split(",")) test = test.map(lambda tweet: filtering(tweet[0].split(" "))) word2vec = Word2Vec().setVectorSize(vectorSize) model = word2vec.fit(test) print(model.getVectors()) model.save(sc, "word2vec_models/" + str(first_n_rows) + "_" + str(vectorSize))
embedding_size = 2 start = time.time() load_edgelist(file_name, graph) end = time.time() print "Loading edgelist\t", (end - start) data = generate_walks(graph) data_matrix = [] for row in data: data_matrix.append(row) print "Data Matrix Created" s = sc.parallelize(data_matrix) print "Building Word Vectors" start = time.time() model = Word2Vec().setVectorSize(embedding_size).setSeed(22).setMinCount( 1).fit(s) end = time.time() print "Word2vec\t", (end - start) embeddings = model.getVectors() d = defaultdict(list) for key in embeddings: for x in embeddings[key]: d[key].append(x) l = sc.textFile(file_name) X = l.map(lambda row: row.strip().split(',')) t = sc.textFile("combined_lab.csv") y = t.map(lambda row: map(int, row.strip())) temp = y.zip(X) data = temp.map(lambda row: getEmbeddings(row, d)) for x in [.01, .02, .03, .04, .05, .06, .07, .08, .09]: