def transform(self, train_data, is_test=False):
    model = Word2VecModel.load(self.model_path)
    item2vec = model.getVectors()
    train_data_seq = transform_trainable(train_data, test=is_test)
    train_data_seq = model.transform(train_data_seq)

    # step5: get the click sequence
    train_data_click = train_data.filter("action_type='clickout item'").select(
        'user_id', "session_id", 'timestamp', 'step', 'reference', 'impressions')
    train_data_click = train_data_click.withColumn(
        'impressions', F.split(train_data.impressions, '\|')).withColumn(
        "impressions", F.explode("impressions"))
    cond = train_data_click.impressions == item2vec.word
    df_out = train_data_click.join(item2vec, cond, how='left').select(
        'user_id', "session_id", 'timestamp', 'step', 'reference',
        'impressions', 'vector')
    df_out = df_out.join(train_data_seq,
                         df_out.session_id == train_data_seq.session_id,
                         how='left').drop(train_data_seq.session_id)

    # step6: compute the similarity between the session vector and each impression vector, then rank
    df_out = df_out.withColumn('sim', getCosinDis('vector', 'item2vec'))
    df_out = df_out.withColumn("sim", df_out.sim.cast('float')).withColumn(
        "rank",
        F.rank().over(
            Window.partitionBy("session_id", 'timestamp', "step").orderBy("sim")))
    return df_out
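The snippet above calls a getCosinDis UDF on the 'vector' and 'item2vec' columns without defining it. A minimal sketch of such a UDF is shown below; the implementation is an assumption (the original may compute a distance rather than a similarity), only the name and the two-column call signature come from the call site.

# Hypothetical sketch of the getCosinDis UDF assumed by transform() above.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


@udf(returnType=DoubleType())
def getCosinDis(v1, v2):
    # Cosine score between two pyspark.ml vectors; None if either side is missing.
    if v1 is None or v2 is None:
        return None
    a, b = v1.toArray(), v2.toArray()
    norm = (a.dot(a) ** 0.5) * (b.dot(b) ** 0.5)
    return float(a.dot(b) / norm) if norm else None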
def startup():
    # Download the model
    import findspark
    findspark.init()
    import pyspark  # only run after findspark.init()
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    print("Spark session started\n\n")

    aws_model_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{s3_dir} {local_dir}/{s3_dir}"
    # os.system(aws_model_cmd)

    # Download the vectors
    aws_vector_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{title_vector} {local_dir}/{title_vector}"
    # os.system(aws_vector_cmd)

    figures_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{figures} dashboard/templates/static"
    os.system(figures_cmd)

    # Load the model and data
    saveword2vec_path = f"{local_dir}/{s3_dir}"
    model_word2vec = Word2VecModel.load(saveword2vec_path)
    title_vectors_df = spark.read.parquet(f"{local_dir}/{title_vector}")
    return spark, model_word2vec, title_vectors_df
def main():
    db_con = init_sync()

    if not os.path.exists('model'):
        if not os.path.exists('data_text'):
            os.mkdir('data_text')
            print("Folder created")
        save_txt.save_text_db_to_txt(db_con)
        word2vec.create_w2v_model()

    persons = get_persons(db_con)
    places = get_places(db_con)

    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    model = Word2VecModel.load(PATH)

    pprint("Searching for contextual synonyms of persons:")
    persons_synonyms = get_synonyms(persons, 5, model, spark)
    insert_to_persons_synonyms(db_con, persons, persons_synonyms)
    print_elem(persons, persons_synonyms)

    pprint("Searching for contextual synonyms of landmarks:")
    places_synonyms = get_synonyms(places, 5, model, spark)
    insert_to_places_synonyms(db_con, places, places_synonyms)
    print_elem(places, places_synonyms)

    spark.stop()
def loadModel(path):
    '''
    Load a Word2Vec model.

    Input:
        - path
    Output:
        - model [Word2Vec model]
    '''
    model = Word2VecModel.load(path)
    return model
def load_model(
    self,
    cloud_path: str,
    model_name: str,
) -> Word2VecModel:
    """
    Load a previously saved Word2Vec model object into memory.
    """
    if not model_name.endswith(".sparkml"):
        model_name += ".sparkml"
    self.model = Word2VecModel.load(cloud_path + "/" + model_name)
    return self.model
def get_vectors_for_df(self, df):
    """
    Note: get the vectors for the text in a dataframe.
    Author: Aliabbas Bhojani
    """
    logging.info("Getting the vectors for the given dataframe")
    if os.path.exists(self.model_dir + self.model_name):
        model = Word2VecModel.load(self.model_dir + self.model_name + "/")
        output_df = model.transform(df)
        return output_df
    else:
        logging.info("No models found, retrain the model")
def configure_processing(self):
    """
    Operate on the input raw data and generate the report in the desired format.
    """
    # Load the report data
    print "============ NLP methods start on report data ==================="
    df = helper_method_process_data(self.total_data, self.spark_context,
                                    self.sql_context)
    print "============ NLP methods finished on report data ==================="

    # Load the pre-generated model
    print "============= Loading pre-built model ======================="
    model_path = self.model_save_path + self.model_date + "/"
    model = Word2VecModel.load(model_path)
    print "====================== Model loading completed =============="

    report_data = df.rdd.map(lambda a: a.asDict()).collect()
    df.unpersist()
    del df

    # Report the result
    count_id = 1
    self.es = Elasticsearch([{
        'host': self.es_output_host,
        'port': self.es_output_port
    }])
    print "============ Report seeding to Elasticsearch started ==========="
    output_index_name = self.es_output_index + '-' + datetime.now().strftime(
        "%Y-%m-%d-%H-%M-%S")
    print "============ Results will be pushed to index:", output_index_name, "=============="
    for one_line_report in report_data:
        if len(one_line_report['features']) < 3:
            continue
        for word2vec_keyword in list(set(one_line_report['features'])):
            try:
                synonyms = model.findSynonyms(word2vec_keyword, 10)
                self.send_result_to_elasticsearch(word2vec_keyword, synonyms,
                                                  count_id, output_index_name)
                count_id = count_id + 1
            except Exception as e:
                # The model raises here if it has not seen the current word
                print e.message
                pass
    print "================== Report seeding ended ============="
def get_synonyms(word: str, count: int):
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    model = Word2Vec.load('/home/pok/sem/project/models/model0mincount/model')
    model_fitted = Word2VecModel.load(
        '/home/pok/sem/project/models/model0mincount/fitted')

    synonyms = model_fitted.findSynonyms(word, count)
    synonyms.select("word", fmt("similarity", 5).alias("similarity")).show()

    spark.stop()
    return synonyms
def main():
    if not os.path.exists('model'):
        save_txt.save_text_db_to_txt()
        word2vec.create_w2v_model()

    with SparkSession.builder.appName(
            "SimpleApplication").getOrCreate() as spark_session:
        model = Word2VecModel.load(PATH)

        persons = mongo.selectAll('persons')
        places = mongo.selectAll('places')

        pprint("Searching for contextual synonyms of persons:")
        persons_synonyms = word2vec.find_synonyms(persons, model, spark_session)
        pprint(persons_synonyms)

        pprint("Searching for contextual synonyms of landmarks:")
        places_synonyms = word2vec.find_synonyms(places, model, spark_session)
        pprint(places_synonyms)
def init():
    '''
    Initialize libraries.
    '''
    print('Initializing...')
    sconf = pyspark.SparkConf().setAll([
        ('spark.executor.memory', config.spark_executor_memory),
        ('spark.executor.instances', config.spark_executor_instances),
        ('spark.executor.cores', config.spark_executor_cores),
        # ('spark.cores.max', config.spark_cores_max),
        ('spark.driver.memory', config.spark_driver_memory),
        ('master', config.spark_master),
    ])

    global spark, df_all, w2v_model
    spark = SparkSession.builder.appName('similarity2').config(
        conf=sconf).getOrCreate()
    spark.sparkContext.setLogLevel(config.spark_log_level)
    df_all = spark.read.parquet(config.input_dir).sample(
        withReplacement=False,
        fraction=config.spark_fraction,
        seed=config.spark_seed)
    w2v_model = Word2VecModel.load(config.model_file)
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl")

    tokenizer = Tokenizer(inputCol="skillText", outputCol="words")
    tokenized = tokenizer.transform(df_categories)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    stripped = removed.select('filtered').rdd.map(lambda x: strip_punctuation(x[0])) \
        .map(lambda x: Row(filtered=x)).toDF(['filtered'])

    # word2vec = Word2Vec(vectorSize=100, inputCol="filtered", outputCol="result")
    # model = word2vec.fit(stripped)
    # model.save("word2vec-model")

    model = Word2VecModel.load("word2vec-model")
    synonyms = model.findSynonyms(sys.argv[1], 10)
    synonyms.show(truncate=False)
import math

from pyspark.ml.feature import Word2VecModel
from pyspark_ml.app_root import get_root_path
from pyspark.ml.feature import PCA
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans
from mpl_toolkits.mplot3d import Axes3D

project_root_path = get_root_path()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('evel_word2vec').getOrCreate()

# Load the model
modelPath = project_root_path + "/models/word2vec-model"
loadedModel = Word2VecModel.load(modelPath)
# loadedModel.findSynonyms('beijing', 15).show(truncate=False)

wordVectorsDF = loadedModel.getVectors()
vocabSize = wordVectorsDF.count()
print("Vocabulary Size: ", vocabSize)

loadedModel.findSynonyms('coffee', 3).show(truncate=False)
# loadedModel.getVectors().show(truncate=False)

dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')

numComponents = 3
pca = PCA(k=numComponents, inputCol='features', outputCol='pcaFeatures')
model = pca.fit(dfW2V)
dfComp = model.transform(dfW2V).select("pcaFeatures")
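The matplotlib and Axes3D imports above suggest the three PCA components are meant to be plotted. A minimal sketch of that step, assuming the vocabulary is small enough to collect locally, could look like this (the plotting code is not part of the original snippet):

# Hypothetical continuation: collect the 3-component PCA features and scatter-plot them.
pca_points = [row.pcaFeatures.toArray() for row in dfComp.collect()]
xs = [p[0] for p in pca_points]
ys = [p[1] for p in pca_points]
zs = [p[2] for p in pca_points]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # the Axes3D import enables the '3d' projection
ax.scatter(xs, ys, zs, s=3)
ax.set_title('Word vectors projected onto 3 principal components')
plt.show()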
num_iterations = sys.argv[5]
vector_size = sys.argv[6]
debug_flag = sys.argv[7]

relations_result_file = config.get('DataSection', 'relations_result_file')

spark = SparkSession \
    .builder \
    .appName("WikiFindSynonyms") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.default.parallelism", "4") \
    .getOrCreate()

model = Word2VecModel.load(sys.argv[1])
modelDF = model.getVectors()
modelDF.show()
print("Total number of records in modelDF = %d" % modelDF.count())
modelDF = modelDF.repartition(1000, 'word')

testDataDF = spark.read.csv(sys.argv[2], header=True)
testDataDF.show()
print("Total number of records in testDataDF = %d" % testDataDF.count())

testDataDF = testDataDF.join(modelDF, testDataDF.word1 == modelDF.word, 'inner') \
    .select(testDataDF.word1, testDataDF.word2,
            testDataDF.word3, modelDF.vector)
testDataDF.show()
testDataDF = testDataDF.withColumnRenamed('vector', 'vec1')
PATH = 'model/kurs_model/'

from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2VecModel
from pprint import pprint

test_words = [
    "дом", "бочаров", "документ", "деньги", "гараж", "волга", "дума", "закон",
    "полиция", "врач"
]

spark = SparkSession \
    .builder \
    .appName("SimpleApplication") \
    .getOrCreate()

model = Word2VecModel.load(PATH)

pprint("Contextual synonyms of the words, obtained from a model trained on the articles:")
for test_word in test_words:
    pprint("-" * 20)
    pprint(test_word)
    result = model.findSynonyms(test_word, 5).collect()
    for el in result:
        pprint(el[0])

spark.stop()
def load_model(self, path_to_model):
    """Load a Word2Vec model from path_to_model."""
    from pyspark.ml.feature import Word2VecModel

    w2vec_model = Word2VecModel.load(path_to_model)
    return w2vec_model
end = time.time()
print("Data split took: {}".format(end - start))

"""
Convert the main-indications field to word vectors
"""
start = time.time()
# Create a Word2Vec object and specify its configuration (vector size, input column, output column)
# word2Vec = Word2Vec(vectorSize=300, minCount=0, inputCol="d_func", outputCol="d_func_result")
# Use the Word2Vec object to transform the specified column
# hypertension_model = word2Vec.fit(manbing)
# Persist the model (time-consuming, run only once)
# hypertension_model.save("./WV_model/")
model = Word2VecModel.load("./WV_model/")
# Convert the training-set words to vectors
manbing = model.transform(train_set)
end = time.time()
print("Word-to-vector conversion took: {}".format(end - start))

"""
Build the model from the training data
"""
start = time.time()
# Select the feature columns from the source data
assembler = VectorAssembler(inputCols=["d_func_result"], outputCol="features")
# The assembler object is a transformer that merges multiple columns into a single
# vector column (a type the decision tree can recognize)
# train_set2 = assembler.transform(manbing)
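The comments above only hint at the tree model that follows the assembler step. A minimal sketch of that continuation, assuming a DecisionTreeClassifier and a label column named "label" (both assumptions, not confirmed by the snippet), could look like this:

# Hypothetical continuation of the commented-out steps above.
from pyspark.ml.classification import DecisionTreeClassifier

train_set2 = assembler.transform(manbing)            # assemble the feature vector column
dt = DecisionTreeClassifier(featuresCol="features",  # column produced by the assembler
                            labelCol="label")        # assumed label column name
dt_model = dt.fit(train_set2)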
dataset = dataset.withColumn("month", (F.col("month") - 1) / (12 - 1))
dataset = dataset.withColumn("day", (F.col("day") - 1) / (31 - 1))
dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0))
dataset = dataset.withColumn("minute", (F.col("minute") - 0) / (59 - 0))
dataset = dataset.withColumn("second", (F.col("second") - 0) / (59 - 0))

# Word2Vec
dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'),
             F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler
sizeHint = VectorSizeHint(inputCol="vcategorical", handleInvalid="skip", size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Classification
def compute_article_similar(self, articleProfile):
    """
    Compute the similarity between incremental articles and historical articles (word2vec).
    :return:
    """
    # Get the channel categories of the new articles to update (not used)
    # all_channel = set(articleProfile.rdd.map(lambda x: x.channel_id).collect())
    def avg(row):
        x = 0
        for v in row.vectors:
            x += v
        # Use the average vector as the article vector
        return row.article_id, row.channel_id, x / len(row.vectors)

    for channel_id, channel_name in CHANNEL_INFO.items():
        profile = articleProfile.filter('channel_id = {}'.format(channel_id))
        wv_model = Word2VecModel.load(
            "hdfs://hadoop-master:9000/headlines/models/channel_%d_%s.word2vec"
            % (channel_id, channel_name))
        vectors = wv_model.getVectors()

        # Compute the vectors
        profile.registerTempTable("incremental")
        articleKeywordsWeights = self.spark.sql(
            "select article_id, channel_id, keyword, weight from incremental "
            "LATERAL VIEW explode(keywords) AS keyword, weight where channel_id=%d"
            % channel_id)
        articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
            vectors, vectors.word == articleKeywordsWeights.keyword, "inner")
        articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
            lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.vector)
        ).toDF(["article_id", "channel_id", "keyword", "weightingVector"])

        articleKeywordVectors.registerTempTable("tempTable")
        articleVector = self.spark.sql(
            "select article_id, min(channel_id) channel_id, "
            "collect_set(weightingVector) vectors from tempTable group by article_id"
        ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

        # Write to the database
        def toArray(row):
            return row.article_id, row.channel_id, [
                float(i) for i in row.articleVector.toArray()
            ]

        articleVector = articleVector.rdd.map(toArray).toDF(
            ['article_id', 'channel_id', 'articleVector'])
        articleVector.write.insertInto("article_vector")

        import gc
        del wv_model
        del vectors
        del articleKeywordsWeights
        del articleKeywordsWeightsAndVectors
        del articleKeywordVectors
        gc.collect()

        # Load the historical data, convert it to a fixed format, and use LSH to find similar articles
        train = self.spark.sql(
            "select * from article_vector where channel_id=%d" % channel_id)

        def _array_to_vector(row):
            return row.article_id, Vectors.dense(row.articleVector)

        train = train.rdd.map(_array_to_vector).toDF(
            ['article_id', 'articleVector'])
        test = articleVector.rdd.map(_array_to_vector).toDF(
            ['article_id', 'articleVector'])

        brp = BucketedRandomProjectionLSH(inputCol='articleVector',
                                          outputCol='hashes',
                                          seed=12345,
                                          bucketLength=1.0)
        model = brp.fit(train)
        similar = model.approxSimilarityJoin(test, train, 2.0,
                                             distCol='EuclideanDistance')

        def save_hbase(partition):
            import happybase
            # article_similar table: row key = article_id, column = similar:article_id, value = sim
            pool = happybase.ConnectionPool(size=3, host='hadoop-master')
            with pool.connection() as conn:
                table = conn.table("article_similar")
                for row in partition:
                    if row.datasetA.article_id == row.datasetB.article_id:
                        pass
                    else:
                        table.put(
                            str(row.datasetA.article_id).encode(), {
                                b"similar:%d" % row.datasetB.article_id:
                                b"%0.4f" % row.EuclideanDistance
                            })

        similar.foreachPartition(save_hbase)
"cleaned_tweets", regexp_replace(col("tweets"), "http.+|@.|\n|RT|\d+", ' ')) # All words are lowercase and tokenized tweets_df = RegexTokenizer(inputCol="cleaned_tweets", outputCol="lowercase_tweets", pattern="\\W").transform(tweets_df) # We remove the StopWords tweets_df = StopWordsRemover( inputCol="lowercase_tweets", outputCol="processed_tweets").transform(tweets_df) # We drop the unused columns tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "lang", "date") # We load the language model model_path = "s3://" + bucket_name + "/models/w2v_model" loaded_model = Word2VecModel.load(model_path) # We add the output columns : it is the average of the words' vectors for each tweet tweets_df = loaded_model.transform(tweets_df) # We load the classifier clf_path = "s3://" + bucket_name + "/models/mpc_model" loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path) predictions = loaded_clf.transform(tweets_df) # We keep the probability only for the predicted sentiment to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType())) predictions = predictions.withColumn("probability", to_array("probability")) predictions = predictions.withColumn("probability", array_max("probability"))
# step1: convert item_id into a per-session sequence
train_data_seq = transform_trainable(train_data, test=False)

# step2: train the model and save it
word2Vec = Word2Vec(vectorSize=100,
                    seed=42,
                    minCount=2,
                    inputCol="reference_list",
                    outputCol="doc2vec_spark")
model = word2Vec.fit(train_data_seq)
model_path = '/team/cmp/hive_db/cmp_tmp/dl_model_template/recdata/rec_models'
model.write().overwrite().save(os.path.join(model_path, "item2vec.model"))

# step3: load the model
model = Word2VecModel.load(os.path.join(model_path, "item2vec.model"))

# step4: get the vector for each item and session
item2vec = model.getVectors()
train_data_seq = model.transform(train_data_seq)

# step5: get the click sequence
train_data_click = train_data.filter("action_type='clickout item'").select(
    'user_id', "session_id", 'timestamp', 'step', 'reference', 'impressions')
train_data_click = train_data_click.withColumn(
    'impressions', F.split(train_data.impressions, '\|')).withColumn(
    "impressions", F.explode("impressions"))
cond = train_data_click.impressions == item2vec.word
            filtered_words_list.append(seg.word)
        elif seg.flag in ["x", "eng"]:
            # a user-defined word or an English word
            filtered_words_list.append(seg.word)
    return filtered_words_list

for row in partition:
    sentence = re.sub("<.*?>", "", row.sentence)  # strip the HTML tags
    words = cut_sentence(sentence)
    yield row.article_id, row.channel_id, words

'''
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])
words_df.show()
# Train word2vec directly
w2v_model = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
model = w2v_model.fit(words_df)
model.save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
'''

# 1. Load the model of a channel and get the vector for each word
from pyspark.ml.feature import Word2VecModel

channel_id = 18
channel = "python"
wv_model = Word2VecModel.load(
    "hdfs://hadoop-master:9000/headlines/models/word2vec_model/channel_%d_%s.word2vec"
    % (channel_id, channel))
vectors = wv_model.getVectors()
vectors.show()
def Validate(ngrams, sampleSizes, ctxSize, sqc, seqs, outFile,
             minval, maxval, avg, nlines):
    accuracy = []
    gramSize = GramSize(ctxSize, lookahead)
    c1 = (((maxval - minval) * 1.0) / nlines) / avg
    c2 = ((minval * 1.0) / nlines) / avg
    print seqs.count()

    ngrams = ngrams.repartition(1 << nPartLog)
    ngrams.cache()

    # we will validate separately for each vector size
    for vecSize in vecSizes:
        print '======TESTING FOR VECTOR SIZE', vecSize
        # start fresh
        old_ngrams = ngrams
        ngrams = ngrams.withColumn('correct', lit(0))

        # use models from each sample
        modelId = 0
        for sampleSize in sampleSizes:
            w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
            lrmodels = []
            for dim in range(0, vecSize):
                lrmodels.append(
                    LinearRegressionModel.load(
                        lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

            success = 0
            fail = 0
            unopt = 0

            # add columns to store model success and failure
            modelSucc = 'succ_' + str(modelId)
            modelFail = 'fail_' + str(modelId)
            modelUnopt = 'unopt_' + str(modelId)
            seqs = seqs.withColumn(modelSucc, lit(0)) \
                       .withColumn(modelFail, lit(0)) \
                       .withColumn(modelUnopt, lit(0))
            modelId = modelId + 1

            ngrams = ngrams.withColumn('predSeq', lit(''))

            # create the initial feature vector:
            # transform each word into a cluster center
            words, d, centers = ClusterWords(w2v, seqs)

            # record correctness for this model only
            old_ngrams = ngrams
            ngrams = ngrams.withColumn('sample_correct', lit(0)) \
                           .withColumn('sample_confi', lit(1.0))

            for nextPos in range(0, lookahead):
                # build the feature vector
                ngrams = BuildSubstringFeature(ngrams, w2v, nextPos,
                                               nextPos + ctxSize, ctxSize,
                                               lookahead)
                # build the prediction vector
                ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)

                # now assign a cluster id to each prediction vector
                old_ngrams = ngrams
                ngrams = centers.transform(ngrams) \
                                .withColumnRenamed('cluster', 'predWord') \
                                .withColumnRenamed('vector', 'predictionVector')

                # get the predicted word
                ngrams = ngrams.join(broadcast(words),
                                     words.cluster == ngrams.predWord,
                                     'inner') \
                               .drop('cluster')

                # calculate the cosine similarity between the prediction vector
                # and the center vector
                epsilon = 0.0001

                def CosineSimi(v1, v2):
                    d1 = DenseVector(v1)
                    d2 = DenseVector(v2)
                    n1 = d1.norm(2)
                    n2 = d2.norm(2)
                    return float(d1.dot(d2) / (n1 * n2))

                cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
                ngrams = ngrams.withColumn(
                    'simi', cossim('centerVector', 'predictionVector'))
                ngrams = ngrams.drop('centerVector').drop('predictionVector')

                # update the predicted sequence
                ngrams = ngrams.withColumn('predSeq',
                                           concat_ws(' ', 'predSeq', 'word'))
                ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))

                # get the actual sequence
                ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq',
                                         gramSize, ' ', ctxSize,
                                         ctxSize + nextPos + 1)

                # now get the cluster id for the predicted word in the sentence
                ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead,
                                          nextPos) \
                    .withColumnRenamed('labelVec', 'vector') \
                    .drop('ngrams')
                ngrams = centers.transform(ngrams).drop('vector')

                # and host latency for the actual word
                ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
                               .drop('word') \
                               .drop('centerVector')

                # record correctness
                ngrams = ngrams.withColumn(
                    'round_correct',
                    when((ngrams.predWord != ngrams.cluster) |
                         (ngrams.simi < confidence),
                         0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
                ngrams = ngrams.withColumn(
                    'sample_correct',
                    when(ngrams.sample_correct + 1 == ngrams.round_correct,
                         ngrams.round_correct).otherwise(ngrams.sample_correct))

                # get overall correctness
                ngrams = ngrams.withColumn(
                    'correct', greatest('sample_correct', 'correct'))
                # get binary correctness
                ngrams = ngrams.withColumn(
                    'binary_correct',
                    when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
                ngrams = ngrams.withColumn(
                    'sample_confi',
                    when(ngrams.binary_correct == 1,
                         1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
                ngrams = ngrams.withColumn(
                    'simi',
                    when(ngrams.binary_correct == 1,
                         ngrams.simi).otherwise(ngrams.sample_confi))
                ngrams = ngrams.withColumn(
                    'predSeq',
                    when((ngrams.binary_correct == 1) |
                         (ngrams.simi < confidence),
                         ngrams.actualSeq).otherwise(ngrams.predSeq))
                ngrams = ngrams.withColumn(
                    'succ_wt',
                    when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
                ngrams = ngrams.withColumn(
                    'fail_wt',
                    when((ngrams.binary_correct == 1) |
                         (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
                ngrams = ngrams.withColumn(
                    'unopt_wt',
                    when((ngrams.binary_correct == 0) &
                         (ngrams.simi < confidence), ngrams.wt).otherwise(0))
                ngrams = ngrams.drop('simi')

                # now summarize success and failure rates by predicted sequence
                seqWts = ngrams.groupBy('predSeq').agg(
                    sum('succ_wt').alias('succ_wt'),
                    sum('fail_wt').alias('fail_wt'),
                    sum('unopt_wt').alias('unopt_wt'))

                # update the sequences table
                seqs = seqWts.join(broadcast(seqs), seqWts.predSeq == seqs.word,
                                   'right_outer') \
                             .drop('predSeq') \
                             .fillna(-c2 / c1, ['succ_wt', 'fail_wt', 'unopt_wt'])
                scaleback = udf(lambda s: float(s * c1 + c2), DoubleType())
                seqs = seqs.withColumn(
                    modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
                seqs = seqs.withColumn(
                    modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
                seqs = seqs.withColumn(
                    modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
                seqs.cache()

                aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
                aggregated.cache()
                new_success = aggregated.head()['sum(' + modelSucc + ')']
                new_fail = aggregated.head()['sum(' + modelFail + ')']
                new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
                print nextPos, new_success - success, new_fail - fail, new_unopt - unopt
                success = new_success
                fail = new_fail
                unopt = new_unopt
        # end for: testing each model for a particular vector size
    # end for: each vector size

    seqs.orderBy('succ_0', ascending=False).write.mode('overwrite') \
        .csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))
    return accuracy
for row in partition:
    sentence = re.sub("<.*?>", "", row.sentence)  # strip the HTML tags
    words = cut_sentence(sentence)
    yield row.article_id, row.channel_id, words

words_df = article_data.rdd.mapPartitions(segmentation).toDF(
    ["article_id", "channel_id", "words"])
words_df.show()

# Train word2vec for channel #17
# w2v_model = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
# model = w2v_model.fit(words_df)
# model.save("hdfs://hadoop1:9000/headlines/models/word2vec_model_17")

# 1. Load the model and get the vector of each word
from pyspark.ml.feature import Word2VecModel

wv = Word2VecModel.load("hdfs://hadoop1:9000/headlines/models/word2vec_model_17")
vectors = wv.getVectors()
vectors.show()

# 2. Get the article profiles of this channel, extract their keywords, and fetch
#    the word vectors of those 20 keywords
article_profile = w2v.spark.sql(
    "select * from article_profile where channel_id=17 limit 10")

# 3. Compute the vector of each article keyword: use explode on the keywords map
#    to get keyword and weight
article_profile.registerTempTable('profile')
keyword_weight = w2v.spark.sql(
    "select article_id,channel_id,keyword,weight from profile "
    "LATERAL VIEW explode(keywords) as keyword,weight")
keyword_weight.show()

# 4. Join keyword_weight with the w2v vectors to get the word vectors of the 20
#    keywords of each article (a weighted-average sketch follows below)
_keywords_vector = keyword_weight.join(
    vectors, vectors.word == keyword_weight.keyword, how='inner')
_keywords_vector.show()
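The join above leaves one row per (article, keyword) with both weight and vector. A minimal sketch of the follow-up step, weighting each keyword vector and averaging per article (mirroring the approach in the compute_article_similar snippets elsewhere in this collection; the helper names here are assumptions):

# Hypothetical continuation: weight each keyword vector by its TF-IDF weight,
# then average the weighted vectors per article to get one article vector.
keyword_vector = _keywords_vector.rdd.map(
    lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.vector)
).toDF(["article_id", "channel_id", "keyword", "weightingVector"])


def avg_vectors(row):
    total = None
    for v in row.vectors:
        total = v if total is None else total + v
    return row.article_id, row.channel_id, total / len(row.vectors)


keyword_vector.registerTempTable("temptable")
article_vector = w2v.spark.sql(
    "select article_id, min(channel_id) channel_id, "
    "collect_set(weightingVector) vectors from temptable group by article_id"
).rdd.map(avg_vectors).toDF(["article_id", "channel_id", "articleVector"])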
    ['article_id', 'channel_id', 'words'])
print("Tokenized data:", words_df.take(10))

# 2. Train word2vec on the tokenized data
from pyspark.ml.feature import Word2Vec

w2v_model = Word2Vec(vectorSize=100,
                     inputCol='words',
                     outputCol='vector',
                     minCount=3)
model = w2v_model.fit(words_df)
model.write().overwrite().save("models/word2vec_model/python.word2vec")

from pyspark.ml.feature import Word2VecModel

w2v_model = Word2VecModel.load("models/word2vec_model/python.word2vec")
vectors = w2v_model.getVectors()
vectors.show()

# 3. Keyword extraction (TF-IDF)
# Term frequency (TF)
from pyspark.ml.feature import CountVectorizer

# vocabSize is the total vocabulary size; minDF is the minimum number of
# documents a term must appear in
cv = CountVectorizer(inputCol="words",
                     outputCol="countFeatures",
                     vocabSize=200 * 10000,
                     minDF=1.0)
# Fit the term-frequency model
cv_model = cv.fit(words_df)
        .config("spark.rpc.message.maxSize", "2047") \
        .config("spark.sql.catalogImplementation", "in-memory") \
        .config("spark.dynamicAllocation.enabled", "false") \
        .getOrCreate()
    sc = spark.sparkContext
else:
    spark = None
    sc = None

# load model
if args.modelType == "glint":
    from ml_glintword2vec import ServerSideGlintWord2VecModel
    model = ServerSideGlintWord2VecModel.load(args.modelPath)
elif args.modelType == "ml":
    model = Word2VecModel.load(args.modelPath)
else:
    model = GensimWord2Vec.load(args.modelPath)

# get the required vectors with the model
words1, words2, wordvecs1, wordvecs2 = words_and_vecs_from_csv(
    spark, model, args.csvPath)
simlex_wordvecs = wordvecs_from_simlex(spark, model, args.language)
ws353_wordvecs = wordvecs_from_wordsim353(spark, model, args.language)
predicted_synonyms = word_synonyms(words1, model)
predicted_words2 = word_analogies(wordvecs1, wordvecs2, words1, words2)

# stop model
if args.modelType == "glint":
    model.stop()
def compute_article_similar(self, articleProfile):
    """
    Compute the similarity between incremental articles and historical articles.
    :param articleProfile:
    :return:
    """
    from pyspark.ml.feature import Word2VecModel

    def avg(row):
        x = 0
        for v in row.vectors:
            x += v
        return row.article_id, row.channel_id, x / len(row.vectors)

    for channel_id, channel_name in CHANNEL_INFO.items():
        profile = articleProfile.filter('channel_id = {}'.format(channel_id))
        wv_model = Word2VecModel.load(
            "hdfs://hadoop1:9000/headlines/models/channel_%d_%s.word2vec"
            % (channel_id, channel_name))
        vectors = wv_model.getVectors()

        # Compute the vectors
        profile.registerTempTable("incremental")
        articleKeywordsWeights = self.spark.sql(
            "select article_id,channel_id,keyword,weight from incremental "
            "LATERAL VIEW explode(keywords) as keyword,weight")
        articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
            vectors, vectors.word == articleKeywordsWeights.keyword, "inner")
        articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
            lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.vector)
        ).toDF(["article_id", "channel_id", "keyword", "weightVector"])

        articleKeywordVectors.registerTempTable("temptable")
        articleVector = self.spark.sql(
            "select article_id, min(channel_id) channel_id, "
            "collect_set(weightVector) vectors from temptable group by article_id"
        ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

        # Write to the Hive database
        def toArray(row):
            return row.article_id, row.channel_id, [
                float(i) for i in row.articleVector.toArray()
            ]

        articleVector = articleVector.rdd.map(toArray).toDF(
            ["article_id", "channel_id", "articleVector"])
        articleVector.write.insertInto("article_vector")

        import gc
        del wv_model
        del vectors
        del articleKeywordsWeights
        del articleKeywordsWeightsAndVectors
        del articleKeywordVectors
        gc.collect()

        # Load the historical article vectors, convert them to Vector format,
        # and use LSH to find similar articles
        from pyspark.ml.linalg import Vectors
        from pyspark.ml.feature import BucketedRandomProjectionLSH

        train = self.spark.sql(
            "select * from article_vector where channel_id=%d" % channel_id)

        def _array_to_vector(row):
            return row.article_id, Vectors.dense(row.articleVector)

        train = train.rdd.map(_array_to_vector).toDF(
            ["article_id", "articleVector"])
        test = articleVector.rdd.map(_array_to_vector).toDF(
            ["article_id", "articleVector"])

        brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                          outputCol="hashes",
                                          bucketLength=1.0,
                                          seed=12345)
        model = brp.fit(train)
        similar = model.approxSimilarityJoin(test, train, 2.0,
                                             distCol="EuclideanDistance")

        def save_hbase(partitions):
            import happybase
            pool = happybase.ConnectionPool(size=3, host='hadoop1')
            with pool.connection() as conn:
                article_similar = conn.table('article_similar')
                for row in partitions:
                    if row.datasetA.article_id == row.datasetB.article_id:
                        pass
                    else:
                        article_similar.put(
                            str(row.datasetA.article_id).encode(), {
                                'similar:{}'.format(row.datasetB.article_id).encode():
                                b'%0.4f' % (row.EuclideanDistance)
                            })

        similar.foreachPartition(save_hbase)
    udf(preprocess_text, ArrayType(StringType()))(question_dataframe.question))

# In[9]:
# print(question_tokenized_df.take(1))

# In[10]:
# Now we have to generate the vectors for this given question
from pyspark.ml.feature import Word2Vec, Word2VecModel

saveword2vec_path = os.getcwd() + '/dataset/word2vecmodel'

# In[11]:
model_word2vec = Word2VecModel.load(saveword2vec_path)

# In[12]:
question_with_vector_df = model_word2vec.transform(question_tokenized_df)

# In[13]:
# Take only the dense vector
question_dense_vec = question_with_vector_df.first()["features"]

# In[14]:
# Now that we have everything in place, we just need to calculate the similarity score
import numpy as np
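The cell above ends just before the similarity computation it announces. A minimal numpy sketch of a cosine score between the question vector and one stored title vector might look like this (the title_vec variable is an assumed example, not part of the original notebook):

# Hypothetical continuation: cosine similarity between the question vector and a
# pre-computed title vector.
def cosine_similarity(v1, v2):
    v1, v2 = np.asarray(v1), np.asarray(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0

# score = cosine_similarity(question_dense_vec.toArray(), title_vec.toArray())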
from pyspark.ml.feature import Word2VecModel, Tokenizer, StopWordsRemover
from pyspark.sql.functions import regexp_replace
import os

spark = SparkSession.builder \
    .appName("Sentiment") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.hadoop.yarn.resourcemanager.principal", os.getenv("HADOOP_USER_NAME")) \
    .getOrCreate()

storage = os.getenv("STORAGE")

tokenizer = Tokenizer(inputCol="spoken_words", outputCol="word_list")
remover = StopWordsRemover(inputCol="word_list", outputCol="wo_stop_words")
w2v_model_fitted = Word2VecModel.load(
    storage + "/datalake/data/sentiment/w2v_model_fitted")
lr_model = PipelineModel.load(storage + "/datalake/data/sentiment/lr_model")


# args = {"sentence": "I'm no dunce, I was born an oaf and I'll die an oaf"}
def predict_sentiment(args):
    input_sentence = args["sentence"]  # .split(",")
    sentence_df = spark.createDataFrame([(input_sentence, )], ['spoken_words'])
    sentence_df = sentence_df.select(
        regexp_replace('spoken_words', r'[_\"\'():;,.!?\\-]',
                       ' ').alias('spoken_words'))
    sentence_df = tokenizer.transform(sentence_df)
    sentence_df = remover.transform(sentence_df)
    sentence_df = w2v_model_fitted.transform(sentence_df)
    result = lr_model.transform(sentence_df).collect()[0]
wordsDF = articleDF.rdd.mapPartitions(segmentation).toDF(
    ["article_id", "channel_id", "words"])

# Train the word2vec model
word2vec = Word2Vec(vectorSize=50,
                    inputCol="words",
                    outputCol="model",
                    minCount=2)
model = word2vec.fit(wordsDF)
model.save(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/Word2Vec.model"
)

# Load the model
wv_model = Word2VecModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/Word2Vec.model"
)
vectors = wv_model.getVectors()

profile = sqlContext.sql("select * from article_profile")
profile.registerTempTable("incremental")
articleKeywordsWeights = sqlContext.sql(
    "select article_id, channel_id, keyword, weight "
    "from incremental LATERAL VIEW explode(keywords) AS keyword, weight")

_article_profile = articleKeywordsWeights.join(
    vectors, vectors.word == articleKeywordsWeights.keyword, "inner")
articleKeywordVectors = _article_profile.rdd.map(
    lambda row: (row.article_id, row.channel_id, row.keyword,
                 row.weight * row.vector)).toDF([
                     "article_id", "channel_id", "keyword", "weightingVector"
                 ])