def _transform(self, dataset):
    t = StringType()
    out_col = self.getOutputCol()
    in_col = dataset[self.getInputCol()]
    return dataset.withColumn(
        out_col,
        udf(lambda x: LabeledPoint(1, Vectors.fromML(x)), t)(in_col))
def __init__(self, training_data, max_iterations=200, spark_session=None,
             ll_sample_size=5, ll_sample_fraction=0.99, fit_model_retries=10):
    if hasattr(training_data, 'rdd'):
        self.ml_training_data = training_data  # type: DataFrame
        self.mllib_training_data = training_data.rdd \
            .map(lambda r: Vectors.fromML(r.features)).persist()  # type: RDD
    else:
        if spark_session is None:
            raise Exception(
                "Spark session must be provided if training data is not a dataframe.")
        self.mllib_training_data = training_data  # type: RDD
        self.ml_training_data = spark_session.createDataFrame(
            training_data.map(lambda v: (MlVectors.dense(v), )),
            ['features'])  # type: DataFrame
    self.max_iterations = max_iterations
    self.ll_sample_size = ll_sample_size
    self.ll_sample_fraction = ll_sample_fraction
    self.ll_samples = {}
    self.fit_model_retries = fit_model_retries
def getKeywordsInDataRange(sDF, oldestTime, newestTime, topics=1, wordsPerTopic=20):
    # Filter by date range (dates given as yyyy-MM-dd)
    oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d')
    newestTime = datetime.strptime(newestTime, '%Y-%m-%d')
    filteredText = sDF \
        .select(
            "id",
            date_format('day', 'yyyy-MM-dd').alias('time'),
            col("title").alias("text")) \
        .where((col("time") >= oldestTime) & (col("time") <= newestTime))

    # Pipeline for preparing the data
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  # remove punctuation and split on non-letters
    stopRemover = StopWordsRemover(
        inputCol="splitted", outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])

    # Get the corpus for LDA
    try:
        model = pipeline.fit(filteredText)
    except IllegalArgumentException:
        return []
    result = model.transform(filteredText)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [mhash(r.id) % 10**8, Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=topics, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  # CountVectorizer vocabulary
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic))

    def topic_render(topic):  # map term indices back to the actual words
        terms = topic[0]
        result = []
        for i in range(wordsPerTopic):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    # topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
    # for topic in range(len(topics_final)):
    #     print("Topic" + str(topic) + ":")
    #     for term in topics_final[topic]:
    #         print(term)
    #     print('\n')
    return topicIndices.map(lambda topic: topic_render(topic)).collect()
def train_SVM(idf_df, iterations=50, regress_param=0.3):
    """
    Train an SVM on the feature vectors split above.
    Note: this must be a static method, otherwise a SparkContext broadcast error
    occurs (the SparkContext may only be used by the global driver).
    :param idf_df: DataFrame with an 'idf_output' vector column and a 'label' column
    :param iterations: number of SGD iterations
    :param regress_param: regularization parameter
    :return: the trained SVM model
    """
    splits = idf_df.select(['idf_output', 'label']).randomSplit([0.8, 0.2], seed=100)
    train = splits[0].cache()
    test = splits[1].cache()
    train_lb = train.rdd.map(
        lambda row: LabeledPoint(row[1], MLLibVectors.fromML(row[0])))

    # SVM model
    svm = SVMWithSGD.train(train_lb, iterations, regParam=regress_param)

    test_lb = test.rdd.map(
        lambda row: LabeledPoint(row[1], MLLibVectors.fromML(row[0])))
    scoreAndLabels_test = test_lb.map(
        lambda x: (float(svm.predict(x.features)), x.label))

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    score_label_test = spark.createDataFrame(
        scoreAndLabels_test, ["prediction", "label"])

    # F1 score
    f1_eval = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    svm_f1 = f1_eval.evaluate(score_label_test)
    print("F1 score: %.4f" % svm_f1)

    return svm
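# Hypothetical usage sketch (not from the original source): assumes `idf_df` is a
# DataFrame with an ml Vector column 'idf_output' (e.g. TF-IDF features) and a
# numeric 'label' column, which is what train_SVM above expects.
#
#   svm_model = train_SVM(idf_df, iterations=100, regress_param=0.1)
#   svm_model.predict(MLLibVectors.fromML(some_feature_vector))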
def test_ml_mllib_vector_conversion(self):
    # to ml
    # dense
    mllibDV = Vectors.dense([1, 2, 3])
    mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
    mlDV2 = mllibDV.asML()
    self.assertEqual(mlDV2, mlDV1)
    # sparse
    mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV2 = mllibSV.asML()
    self.assertEqual(mlSV2, mlSV1)
    # from ml
    # dense
    mllibDV1 = Vectors.dense([1, 2, 3])
    mlDV = newlinalg.Vectors.dense([1, 2, 3])
    mllibDV2 = Vectors.fromML(mlDV)
    self.assertEqual(mllibDV1, mllibDV2)
    # sparse
    mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mllibSV2 = Vectors.fromML(mlSV)
    self.assertEqual(mllibSV1, mllibSV2)
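# A minimal standalone sketch of the same round trip outside the test suite,
# assuming only that pyspark is installed; the variable names are illustrative.
from pyspark.ml.linalg import Vectors as NewVectors
from pyspark.mllib.linalg import Vectors as OldVectors

ml_vec = NewVectors.dense([1.0, 2.0, 3.0])   # pyspark.ml vector
mllib_vec = OldVectors.fromML(ml_vec)        # convert to the legacy mllib type
assert mllib_vec.asML() == ml_vec            # asML() converts back again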
def df_to_simple_rdd(df, categorical=False, nb_classes=None,
                     features_col='features', label_col='label'):
    """Convert DataFrame into RDD of pairs
    """
    sql_context = df.sql_ctx
    sql_context.registerDataFrameAsTable(df, "temp_table")
    selected_df = sql_context.sql(
        "SELECT {0} AS features, {1} as label from temp_table".format(
            features_col, label_col))
    if isinstance(selected_df.first().features, MLLibVector):
        lp_rdd = selected_df.rdd.map(
            lambda row: LabeledPoint(row.label, row.features))
    else:
        lp_rdd = selected_df.rdd.map(
            lambda row: LabeledPoint(row.label, MLLibVectors.fromML(row.features)))
    rdd = lp_to_simple_rdd(lp_rdd, categorical, nb_classes)
    return rdd
def tfidf(row_df):
    hashingTF = HashingTF(inputCol='bigrams', outputCol='TF', numFeatures=20000)
    tf_df = hashingTF.transform(row_df)
    idf = IDF(inputCol='TF', outputCol='TF-IDF')
    idfModel = idf.fit(tf_df)
    idf_df = idfModel.transform(tf_df)

    # Convert labels to sparse vectors, which are needed by the classifier
    coordinates = tf_df.select("coordinates").rdd.flatMap(lambda x: x).collect()
    tweets = tf_df.select('sentence').rdd.flatMap(lambda x: x).collect()
    return tweets, coordinates, tf_df.rdd.map(
        lambda row: LabeledPoint(0.0, Vectors.fromML(row.TF)))
def A1():
    # 1) Apply LDA and find topics in the user's posts (including reposts)
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  # remove punctuation and split on non-letters
    stopRemover = StopWordsRemover(
        inputCol="splitted", outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("russian") +
        StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")

    # Filter: keep only non-empty posts
    data = uWallP \
        .filter(uWallP.text != "") \
        .select("id", "text") \
        .limit(10)

    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    model = pipeline.fit(data)
    result = model.transform(data)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [r.id, Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  # CountVectorizer vocabulary

    wordNumbers = 20  # number of words per topic
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # map term indices back to the actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    topics_final = topicIndices.map(
        lambda topic: topic_render(topic)).collect()
    for topic in range(len(topics_final)):
        print("Topic" + str(topic) + ":")
        for term in topics_final[topic]:
            print(term)
        print('\n')
def word_topics(num_topics=NUM_TOPICS, num_words_per_topics=NUM_WORDS_PER_TOPICS):
    """Generates topics from word clusters.

    Arguments:
        num_topics {integer} -- Number of topics to infer
        num_words_per_topics {integer} -- Number of terms to collect for each topic

    Returns:
        None
    """
    spark = init_spark(AITA_CLEANED_COLLECTION)
    data_rdd = spark.read.format('mongo').load().rdd
    preprocessed_rdd = data_rdd \
        .flatMap(lambda row: [row['header'].lower().split(' ') +
                              row['content'].lower().split(' ')]) \
        .zipWithIndex() \
        .map(lambda x: Row(index=x[1], words=x[0]))
    preprocessed_df = spark.createDataFrame(preprocessed_rdd)

    cv = CountVectorizer(inputCol='words', outputCol='vectors')
    model = cv.fit(preprocessed_df)
    vector_df = model.transform(preprocessed_df)

    corpus = vector_df.select('index', 'vectors').rdd.map(
        lambda x: [x[0], Vectors.fromML(x[1])]).cache()
    lda_model = LDA.train(corpus, k=num_topics, maxIterations=100, optimizer='online')

    vocab_array = model.vocabulary
    topic_indices = spark.sparkContext.parallelize(
        lda_model.describeTopics(maxTermsPerTopic=num_words_per_topics))

    def vector_id_to_word(topic):
        terms = topic[0]
        weights = topic[1]
        result = []
        for i in range(num_words_per_topics):
            result.append((vocab_array[terms[i]], weights[i]))
        return result

    topics = topic_indices.map(lambda topic: vector_id_to_word(topic)).collect()
    for i in range(len(topics)):
        print('Topic {}:'.format(i))
        for item in topics[i]:
            print(item)
        print('\n')
def LDA_Treatment(str):
    finalTopics = []
    txt = wordTokenize(str)
    data = sc.parallelize([txt]).zipWithIndex().map(
        lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    docDF = spark.createDataFrame(data, ["_words"])

    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus = result.select("idd", "vectors").rdd.map(
        lambda val: [val[0], Vectors.fromML(val[1])]).cache()

    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, vocabArray)).collect()

    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            term = unidecode.unidecode(term)
            finalTopics.append(term)
    return finalTopics
def mllib_linear_regression(s_file, r_file, iter_):

    def data_process(s_file, r_file):
        table_s = spark.read.csv(s_file, inferSchema=True, header=True, sep=",")
        table_r = spark.read.csv(r_file, inferSchema=True, header=True, sep=",")
        table_r = table_r.withColumn("default", lit(1))
        table_s = table_s.select(*(col(c).cast("float").alias(c) for c in table_s.columns))
        table_r = table_r.select(*(col(c).cast("float").alias(c) for c in table_r.columns))
        table_s.registerTempTable("table_s")
        table_r.registerTempTable("table_r")
        table_joint = spark.sql(
            "SELECT * FROM table_s LEFT JOIN table_r ON table_s.fk = table_r.rid")
        table_joint.registerTempTable("table_joint")
        table_joint = table_joint.select(
            *(col(c).cast("float").alias(c) for c in table_joint.columns))

        # Make the joint data: assemble feature columns into a single vector column X
        col_size_s = len(table_s.columns)
        col_size_r = len(table_r.columns)
        feature_cols = (table_joint.columns[3:col_size_s] +
                        table_joint.columns[col_size_s + 1:])
        vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol='X')
        table_assemble = vectorAssembler.transform(table_joint)
        exprs = [col(column).alias(column.replace(' ', '_'))
                 for column in table_assemble.columns]
        # R(RID, X_R)
        Tdata = table_assemble.select(*exprs).selectExpr("y as y", "X as X")
        return Tdata.rdd

    # Process the data
    # Sdata_rdd, Rdata_rdd, feat_size_s, feat_size_r, Sdata_size = data_pre_process(s_file, r_file)
    Tdata_rdd = data_process(s_file, r_file)
    trainingData = Tdata_rdd.map(
        lambda row: LabeledPoint(row.y, MLLibVectors.fromML(row.X)))
    lr_model = LinearRegressionWithSGD.train(
        trainingData, iterations=iter_, step=0.01, miniBatchFraction=1.0)
    W = list(lr_model.weights)
    return np.array(W)
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = pyspark.SQLContext(sc)
training_set = (sqlcontext.read.format("parquet")
                .option("header", True).load(data_dir))

# TF
cv = sf.CountVectorizer(inputCol="text", outputCol="tf_features",
                        vocabSize=input_dim)
# IDF
idf = sf.IDF(inputCol="tf_features", outputCol="features")
label_string = sf.StringIndexer(inputCol="first_label", outputCol="label")
pipeline_dl = Pipeline(stages=[cv, idf, label_string])
df = pipeline_dl.fit(training_set).transform(training_set)
df = df.rdd.map(lambda x: LabeledPoint(x['label'],
                                       MLLibVectors.fromML(x['features'])))
logger.info("Pipeline created ...")
logger.info("Transforms the text into tf idf RDD ...")

model = create_keras_model(input_dim, output_dim)
logger.info("Starts Training ...")
spark_model = SparkMLlibModel(model=model, frequency='epoch',
                              mode='asynchronous',
                              parameter_server_mode='socket')
spark_model.fit(df, epochs=epochs, batch_size=132, verbose=1,
                validation_split=0.2, categorical=True,
def do_query(issues, input_file, _log):
    """
    Get the Latent Dirichlet Allocation topics for this group of articles.
    """
    # Extract parameters from input_file
    with open(input_file, 'r') as infile:
        keys = load(infile)
    keyword = keys['keyword']
    optimizer = keys['optimizer']
    if optimizer != 'online' and optimizer != 'em':
        raise ValueError(
            "Optimizer must be 'online' or 'em' but is '{}'".format(optimizer))
    max_iterations = keys['max_iterations']
    if max_iterations < 1:
        raise ValueError('max_iterations must be at least 1')
    ntopics = keys['ntopics']
    if ntopics < 1:
        raise ValueError('ntopics must be at least 1')
    topic_words = keys['topic_words']
    if topic_words < 1:
        raise ValueError('topic_words must be at least 1')
    keyword_pattern = comp(r'\b{}\b'.format(keyword), U | I)

    # Map each article in each issue to a year of publication
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(find_min_and_max)

    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(contains_keyword(keyword_pattern)) \
        .zipWithIndex() \
        .map(to_row_with_words)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()
    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)
    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)
    vocab_array = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into n topics using LDA
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    # topics = lda_model.topicsMatrix()
    # _log.error(topics)
    topics_final = [
        topic_render(topic, topic_words, vocab_array)
        for topic in lda_model.describeTopics(maxTermsPerTopic=topic_words)
    ]

    topics = [('Years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        t_words = []
        for term in topic:
            t_words.append(term)
        topics.append((str(i), t_words))
    return topics
    for i in range(1, k):
        if 'f:' + str(i) in line:
            indexList.append(i)
            valList.append(line['f:' + str(i)])
    label = int(line['l:' + str(col)])
    if label == -1:
        label = 0
    features.append((Vectors.sparse(k, indexList, valList), label))

features = sc.parallelize(features)
# sclines = sc.parallelize(lines)
# features = sclines.map(featuresToSparseVecFromLine)
featureDataFrame = spark.createDataFrame(features, ["features", "label"])

pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(featureDataFrame)
# pcaresult = model.transform(featureDataFrame).select("pcaFeatures").collect()
# lp = []
# c = 0
# for com in pcaresult:
#     lp.append(LabeledPoint(lines[c]['l:' + str(col)], mllibVectors.fromML(com.pcaFeatures)))
#     c += 1
# lp = sc.parallelize(lp)
pcaresult = model.transform(featureDataFrame).rdd
lp = pcaresult.map(lambda r: LabeledPoint(r.label, mllibVectors.fromML(r.pcaFeatures)))

model = SVMWithSGD.train(lp)
model.save(sc, "svm/SVM" + str(col))

labelsAndPreds = lp.map(lambda p: (p.label, model.predict(p.features)))
err = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("err at node " + str(col) + " = " + str(err))

sc.stop()
conf = (SparkConf().setMaster("local").setAppName("My")
        .set("spark.executor.memory", "1g"))
sc = SparkContext(conf=conf)
sc.setLogLevel("OFF")
sqlContext = SQLContext(sc)

path = 'clean_test.txt'  # path of the txt file
data = sc.textFile(path).zipWithIndex().map(
    lambda line: Row(idd=line[1], words=line[0].split(" ")))
os.system('rm -f metastore_db/dbex.lck')
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus_size = result.count()  # total number of documents
# convert the ml vectors to mllib vectors, which LDA.train expects
corpus = result.select("idd", "vectors").rdd.map(
    lambda line: [line[0], Vectors.fromML(line[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
def main(sc):
    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")
    meta(train_id)
    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]
    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)), VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns for the training set
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score", pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns for the test set
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score", pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns to features
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ], outputCol="features")
    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)

    train_d = (train_f.select(col("y"), col("features"))
               .rdd
               .map(lambda row: LabeledPoint(float(row.y), MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (
        float(m.predict(MLLibVectors.fromML(row.features))), float(row.y)))

    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                         estimatorParamMaps=params_list[m],
    #                         evaluator=evaluator,
    #                         numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m
    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))

    # Instantiate metrics objects
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Individual label stats
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" % (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
        wordsFiltered.append(w)

txt = " ".join(wordsFiltered).lower()
data = sc.parallelize([txt]).zipWithIndex().map(
    lambda val: Row(idd=val[1], words=val[0].split(" ")))
docDF = spark.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus = result.select("idd", "vectors").rdd.map(
    lambda val: [val[0], Vectors.fromML(val[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=700, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # map term indices back to the actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
    'yyyy-MM-dd').alias('no_timestamp')).groupby('no_timestamp').count().sort(
        F.col('no_timestamp'))
print(dates.show(dates.count()))
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')
dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)
cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)
# stopWords = [word for word in cv_model.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopWords)
# prep_df = remover.transform(prep_df)

trainable = ready_df.select('tweet_id', 'prep').rdd.map(
    lambda x: [x[0], Vectors.fromML(x[1])]).cache()
print("Trainable")
print(trainable.take(10))
print("take")
model = LDA.train(trainable, k=5, seed=1, optimizer="online")
exit(0)

# Print the topics in the model
topics = model.describeTopics(maxTermsPerTopic=15)
for x, topic in enumerate(topics):
    print('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cv_model.vocabulary[words[n]] + ' ' + str(weights[n]))
#     tweets.append(new_tweet)
# f.close()
# fd = codecs.open('cleaned_example.txt', 'w', encoding='utf-8')
# for tweet in tweets:
#     fd.write(tweet + '\n')

rdd = sc.textFile('opinion.txt').zipWithIndex().map(
    lambda line: Row(idd=line[1], words=line[0].split(" ")))
docDF = spark.createDataFrame(rdd)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus = result.select("idd", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()

# Cluster the documents into five topics using LDA
ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # map term indices back to the actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
# import os
# import sys
# import io
# os.environ["PYSPARK_PYTHON"] = "/usr/bin/python2"
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

path = "Adelaide Airport-adelaide-0-2016.txt"
sc = SparkContext()
spark = SparkSession.builder.appName("Python Spark SQL basic example").config(
    "spark.some.config.option", "some-value").getOrCreate()

data = sc.textFile(path).zipWithIndex().map(
    lambda line: Row(idd=line[1], words=line[0].split(" ")))
docDF = spark.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus = result.select("idd", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()

# Cluster the documents into five topics using LDA
ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # map term indices back to the actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
adjectifs = ["ABSOLU","ADMIRABLE","AGREABLE","AIMABLE","AMUSANT","APOCALYPTIQUE","APPROXIMATIF","ATTACHANT","BANAL","BAS","BAVAROIS","BIEN","BOF","BON","BOULEVERSANT","BOUTE EN TRAIN","CAPTIVANT","CARACTERIEL","CATACLYSMIQUE","CATASTROPHIQUE","CELESTE","CHARMANT","CHEF D'OEUVRE","CHOUETTE","COMMUN","CONVENABLE","CONVIVIAL","COQUET","CORRECT","CREDIBLE","CROQUANTE","CYNIQUE","DEGUEULASSE","DELECTABLE","DELICIEUSE","DISJONCTE","DIVIN","DOUCE","DOUE","DROLE","EBLOUISSANT","EBOURIFFE","EFFICACE","EMBALLANT","EMOUVANT","ENDIABLE","ENNUYANT","ENRAGE","ENTHOUSIASMANT","EPATANT","EPOUSTOUFLANT","EPOUVANTABLE","EQUITABLE","EXALTANT","EXCEPTIONNEL","EXCUSABLE","EXEMPLAIRE","EXTRA","FERU","FESTIF","FLAMBOYANTE","FORMIDABLE","GRANDIOSE","HARDI","HONNETE","HORRIBLE","IMPORTANT","IMPRESSIONNANT","INCONNU","INCREDULE","INDEPENDANT","INFERNAL","INNOMMABLE","INSIGNIFIANT","INSUFFISANT","INSUPPORTABLE","INTENABLE","INTERESSANT","IRRESISTIBLE","LIBIDINEUX","LOUABLE","MAJESTUEUX","MAGISTRAL","MAGNIFIQUE","MEDIOCRE","MERDIQUE","MERVEILLEUX","MIGNON","MINABLE","MIROBOLANTE","MORTEL","MOYEN","NEGLIGEABLE","NUL","ORDINAIRE","ORIGINAL","PARFAIT","PIRE","PASSABLE","PASSIONNANT","PERCUTANT","PERSEVERANT","PHENOMENAL","PLACIDE","PLAISANT","PRESTANT","PRODIGIEUX","PROVERBIAL","QUELCONQUE","RAVISSANT","RECYCLE","RELATIF","REMARQUABLE","RENVERSANT","REVENDICATRICE","REVOLUTIONNAIRE","ROCAMBOLESQUE","RUTILANT","SAINT","SATISFAISANT","SEDUISANT","SEXY","SOMPTUEUX","SPIRITUEUX","SPLENDIDE","SUAVE","SUBLIME","SULFUREUSE","SUPERBE","SUPREME","SUPPORTABLE","TALENTUEUX","TOLERABLE","TRAGIQUE","TREPIDANT","TRES","TROUBLANT","VALABLE","VALEUREUX","VENERABLE","VITAMINES","VIVABLE","VULGAIRE"] articles = ['LE', 'LA', 'LES', 'UN', 'DES','COMME', 'A', 'QUE', 'PLUS', 'OUI', 'NON', 'PEUT', 'CES', 'CETTE', 'CET', 'MAIS', 'OU', 'ET', 'DONC', 'TOUS', 'TOUTE', 'LEUR', 'TOUTES', 'LEURS', 'AINSI', 'BIEN', 'MAL', 'ETRE', 'AVOIR', 'FAIRE', 'AVEC', 'SANS', 'PLUS', 'MOINS'] for w in words: if w not in stopWords and w.upper() not in articles and w.upper() not in adjectifs and len(w)>2: wordsFiltered.append(w) txt = " ".join(wordsFiltered).lower() data = sc.parallelize([txt]).zipWithIndex().map(lambda val: Row(idd= val[1], _words = val[0].split(" "))) docDF = spark.createDataFrame(data, ["_words"]) Vector = CountVectorizer(inputCol="_words", outputCol="vectors") model = Vector.fit(docDF) result = model.transform(docDF) corpus = result.select("idd", "vectors").rdd.map(lambda val: [val[0],Vectors.fromML(val[1])]).cache() # Cluster the documents into three topics using LDA ldaModel = LDA.train(corpus, k=1,maxIterations=700,optimizer='online') topics = ldaModel.topicsMatrix() vocabArray = model.vocabulary wordNumbers = 6 # number of words per topic topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers)) def topic_render(topic): # specify vector id of words to actual words terms = topic[0] result = [] for i in range(wordNumbers): term = vocabArray[terms[i]]
def buildTfIdfRddAllTopics(business, sports, politics, entertainment):
    business_df = buildTextRDD(business, BUSINESS_LABEL)
    politics_df = buildTextRDD(politics, POLITICS_LABEL)
    sports_df = buildTextRDD(sports, SPORTS_LABEL)
    entertainment_df = buildTextRDD(entertainment, ENTERTAINMENT_LABEL)

    # Union together all dataframes
    main_df = business_df.union(politics_df)
    main_df = main_df.union(sports_df)
    main_df = main_df.union(entertainment_df)
    main_df = main_df.withColumnRenamed('_1', 'label')
    main_df = main_df.withColumnRenamed('_2', 'content')

    tokenizer = Tokenizer(inputCol="content", outputCol="words")
    wordsData = tokenizer.transform(main_df)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    return rescaledData.select(
        [c for c in rescaledData.columns if c in ['label', 'features']]
    ).rdd.map(lambda x: LabeledPoint(x.label, MLLibVectors.fromML(x.features)))
valPredsRDD = valPreds.rdd
valuesAndPredsVal = valPredsRDD.map(lambda x: (x.label, x.prediction))
print('Validation RMSE: {}'.format(rootMeanSquaredError(valuesAndPredsVal)))

####################################################################################
## Part 3
print('*' * 100)
print('Part 3 - Visualize the log of the training error\n')

# convert data sets
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

train_dataRDD = train_data.rdd
train_dataRDD = train_dataRDD.map(
    lambda x: LabeledPoint(x[0], MLLibVectors.fromML(x[1])))
train_dataRDD.persist()

numIters = 50
errors = []
for i in range(1, numIters + 1):
    model = LinearRegressionWithSGD.train(train_dataRDD, iterations=i, step=0.01)
    valuesAndPredsTrain = train_dataRDD.map(
        lambda x: (x.label, model.predict(x.features)))
    errors.append(rootMeanSquaredError(valuesAndPredsTrain))
print(errors)

# visualize actual vs. prediction
x = np.arange(1, numIters + 1)
# Restructure the dataframe in preparation for one-hot encoding
grouped = df.groupBy("application_id").agg(collect_list("package_id"))
grouped = grouped.withColumnRenamed("collect_list(package_id)", "package_ids")
grouped = grouped.withColumn("package_ids", col("package_ids").cast("array<string>"))

# One-hot encode the data (rows are applications, columns are packages)
vectorizer = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
vectorizer_model = vectorizer.fit(grouped)
transformedDf = vectorizer_model.transform(grouped)
transformedDf = transformedDf.drop(col("package_ids"))

# Extract vectors from the DataFrame in preparation for computing the similarity matrix
array = [
    Vectors.fromML(row.packages_encoded) for row in transformedDf.collect()
]

# Create a RowMatrix
matrix = RowMatrix(sc.parallelize(array))

# Compute the column similarity matrix
similarity = matrix.columnSimilarities()

# Convert the matrix to a DataFrame
entries = similarity.entries.collect()
similarityDf = spark.createDataFrame(entries).toDF("package_a", "package_b", "similarity")

# Write to the database
url_connect = f"jdbc:postgresql://{host}/"
data = indexer.fit(data).transform(data)
print('indexed')
data.write.parquet("gs://elinor/NBTrainData")

# ------------------------------ FIT RANDOM FOREST ----------------------------- #
print("fittingRF")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",
                            numTrees=10)
rfmodel = rf.fit(data)
print("fitted")

# ------------------------------ FIT NB ----------------------------------------- #
print("fitting NB")
data2 = data.rdd \
    .map(lambda x: tuple(x)) \
    .map(lambda x: LabeledPoint(x[3], MLLibVectors.fromML(x[1]))) \
    .toDF()
data2 = data2.withColumn("features", as_ml("features"))
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
nbmodel = nb.fit(data2)
print("fitted")

# ------------------------------ DEAL WITH TEST DATA --------------------------- #
print("loading testing data")
if DEBUG:
    wtf = sc.textFile(TEST_DATA) \
        .map(lambda x: "data/bytes/" + x + ".bytes") \
        .reduce(lambda accum, x: accum + "," + x)
else:
tf_df = hashingtf.transform(gram_df)

# TF-IDF
idf = IDF(inputCol="tf", outputCol="idftf")
idfModel = idf.fit(tf_df)
idf_df = idfModel.transform(tf_df)

# Convert the DataFrame to an RDD to build LabeledPoint (label, feature vector) tuples
tf_rdd = tf_df.rdd

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors as MLLibVectors

# We also need to convert ml.SparseVector to mllib.SparseVector,
# because NaiveBayes only accepts the mllib vector type
train_dataset = tf_rdd.map(
    lambda x: LabeledPoint(float(x.sentiment), MLLibVectors.fromML(x.tf)))

# Split the dataset into train and test
train, test = train_dataset.randomSplit([0.9, 0.1], seed=11)
print(train.first())
print(test.first())

# Create the model: train and save it
from pyspark.mllib.classification import NaiveBayes
import shutil

# Training
print("************************TRAINING*******************************")
def calculate_distance(self, sdf1, sdf2):
    """
    Calculate the distance between the vector-type columns of two Spark DataFrames.

    :param sdf1: must have columns id1 (dtype int) and v1 (dtype Vector)
    :param sdf2: must have columns id2 (dtype int) and v2 (dtype Vector)
    :return: a DataFrame of row pairs with their distance in column 'diff'
    """
    cov = RowMatrix(
        sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
            sdf2.select(["v2"]).withColumnRenamed("v2", "v")).rdd.map(
                lambda row: Vectors.fromML(row.asDict()["v"]))
    ).computeCovariance().toArray()
    x, v = np.linalg.eigh(cov)
    indices = 1e-10 <= x

    # We are trying to enforce the data types to be only Python types
    n = int(v.shape[0])
    m = int(indices.sum())
    v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]
    v_spark = DenseMatrix(n, m, v_vals)
    x_vals = [
        float(val) for val in np.diag(x[indices] ** -0.5).reshape(-1, ).tolist()
    ]
    x_spark = DenseMatrix(m, m, x_vals)

    # We keep the index to maintain the order
    _sdf1 = sdf1.rdd.zipWithIndex() \
        .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1,
                                 index=val_key[1])).toDF()
    _sdf1.persist()
    _sdf2 = sdf2.rdd.zipWithIndex() \
        .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2,
                                 index=val_key[1])).toDF()
    _sdf2.persist()

    # We get our indexed row matrices
    _sdf1_mat = IndexedRowMatrix(
        _sdf1.rdd.map(lambda row: IndexedRow(
            index=row.asDict()["index"],
            vector=Vectors.fromML(row.asDict()["v1"]))))
    _sdf2_mat = IndexedRowMatrix(
        _sdf2.rdd.map(lambda row: IndexedRow(
            index=row.asDict()["index"],
            vector=Vectors.fromML(row.asDict()["v2"]))))

    # We apply our transformation and then set it as our new variable
    _sdf1 = _sdf1.drop("v1").join(
        _sdf1_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index,
                                     v1=indexed_row.vector)).toDF(), "index")
    _sdf2 = _sdf2.drop("v2").join(
        _sdf2_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index,
                                     v2=indexed_row.vector)).toDF(), "index")

    @F.udf(DoubleType())
    def tmp(vec):
        return float(vec[0].squared_distance(vec[1])) ** 0.5

    all_sdf = _sdf1.crossJoin(_sdf2)
    dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))
    dist_sdf.persist()
    return dist_sdf
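# Hypothetical usage sketch (names are illustrative, not from the original source):
# given sdf1 with columns (id1, v1) and sdf2 with columns (id2, v2), where v1 and v2
# hold pyspark.ml vectors of equal length,
#
#   dist_sdf = obj.calculate_distance(sdf1, sdf2)
#   dist_sdf.select("id1", "id2", "diff").show()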
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets the Latent Dirichlet Allocation (LDA) topics for words within articles.

    config_file must be the path to an LDA configuration file in YAML format.
    For example:

        keyword: <KEYWORD>
        optimizer: online|em
        max_iterations: <N>
        ntopics: <N>
        topic_words: <N>

    <N> must be >= 1 for each parameter.

    The keyword and words in documents are normalized, by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
            <0>: [<WORD_0>, ..., <WORD_topicwords>],
            <1>: [<WORD_0>, ..., <WORD_topicwords>],
            <2>: [<WORD_0>, ..., <WORD_topicwords>],
            ...
            <ntopics>: [<WORD_0>, ..., <WORD_topicwords>],
            years: [<MIN_YEAR>, <MAX_YEAR>]
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: LDA topics
    :rtype: dict
    """
    with open(config_file, 'r') as f:
        config = load(f)
    keyword = config['keyword']
    optimizer = config['optimizer']
    if optimizer != 'online' and optimizer != 'em':
        raise ValueError("optimizer must be 'online' or 'em' but is '{}'"
                         .format(optimizer))
    max_iterations = config['max_iterations']
    if max_iterations < 1:
        raise ValueError('max_iterations must be at least 1')
    ntopics = config['ntopics']
    if ntopics < 1:
        raise ValueError('ntopics must be at least 1')
    topic_words = config['topic_words']
    if topic_words < 1:
        raise ValueError('topic_words must be at least 1')

    keyword = query_utils.normalize(keyword)

    # [date, ...]
    # =>
    # [(year, year), ...]
    # =>
    # (year, year)
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(min_max_tuples)

    # [issue, issue, ...]
    # =>
    # [article, article, ...]
    # =>
    # [(article, 0), (article, 1), ...]
    # =>
    # [Row, Row, ...]
    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(lambda article: article_contains_word(
            article, keyword, PreprocessWordType.NORMALIZE)) \
        .zipWithIndex() \
        .map(article_idx_to_words_row)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()
    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)
    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)
    vocabulary = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into N topics using LDA.
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    topics_final = [topic_render(topic, topic_words, vocabulary)
                    for topic in lda_model.describeTopics(
                        maxTermsPerTopic=topic_words)]

    topics = [('years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        term_words = []
        for term in topic:
            term_words.append(term)
        topics.append((str(i), term_words))
    return topics
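# For illustration only (the keyword and numbers below are hypothetical, not taken
# from the original source), a config_file in the format described above might look like:
#
#   keyword: police
#   optimizer: online
#   max_iterations: 100
#   ntopics: 5
#   topic_words: 10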
    'comments'))
# comm_lemm.show(truncate=False)

# Index the subreddit name so that we can train LDA
indexer = StringIndexer(inputCol="id", outputCol="index")
indexed = indexer.fit(comm_lemm).transform(comm_lemm)

# Fit a CountVectorizerModel from the corpus
cv = CountVectorizer(inputCol="comments", outputCol="vectors")
count_vectorizer_model = cv.fit(indexed)
result = count_vectorizer_model.transform(indexed)
# result.show(truncate=False)

corpus = result.select(result['index'].cast('long'), result['vectors']) \
    .rdd.map(lambda x: [x[0], Vectors.fromML(x[1])]).cache()
# # for x in corpus.collect():
# #     print(x)
# #
ldaModel = LDA.train(corpus, k=10)
topics = ldaModel.topicsMatrix()
# vocabArray = count_vectorizer_model.vocabulary
print(topics)
# for topic in range(10):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, ldaModel.vocabSize()):
#         print(" " + str(topics[word][topic]))
def main():
    SC = SparkContext("local[1]", "pkgpkr")

    # Connect to the database
    USER = os.environ.get("DB_USER")
    PASSWORD = os.environ.get("DB_PASSWORD")
    HOST = os.environ.get("DB_HOST")
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Load the raw data into Spark
    CUR.execute("SELECT * FROM dependencies")
    DEPENDENCIES = CUR.fetchall()
    SPARK = SparkSession.builder.master("local[1]").appName("pkgpkr").getOrCreate()
    DF = SPARK.createDataFrame(DEPENDENCIES).toDF("application_id", "package_id")

    # Close the database connection
    CUR.close()
    DB.close()

    # Restructure the dataframe in preparation for one-hot encoding
    GROUPED = DF.groupBy("application_id").agg(collect_list("package_id"))
    GROUPED = GROUPED.withColumnRenamed("collect_list(package_id)", "package_ids")
    GROUPED = GROUPED.withColumn("package_ids", col("package_ids").cast("array<string>"))

    # One-hot encode the data (rows are applications, columns are packages)
    VECTORIZER = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
    VECTORIZER_MODEL = VECTORIZER.fit(GROUPED)
    TRANSFORMED_DF = VECTORIZER_MODEL.transform(GROUPED)
    TRANSFORMED_DF = TRANSFORMED_DF.drop(col("package_ids"))

    # Extract vectors from the DataFrame in preparation for computing the similarity matrix
    ARRAY = [Vectors.fromML(row.packages_encoded) for row in TRANSFORMED_DF.collect()]

    # Create a RowMatrix
    MATRIX = RowMatrix(SC.parallelize(ARRAY, numSlices=100))

    # Compute the column similarity matrix
    SIMILARITY = MATRIX.columnSimilarities()

    # Convert the matrix to a DataFrame
    ENTRIES = SIMILARITY.entries.collect()
    SIMILARITY_DF = SPARK.createDataFrame(ENTRIES).toDF("a", "b", "similarity")

    # Map the package identifiers back to their pre-vectorized values
    MAPPING = create_map([lit(x) for x in chain(*enumerate(VECTORIZER_MODEL.vocabulary))])
    SIMILARITY_DF = SIMILARITY_DF.withColumn("package_a", MAPPING.getItem(col("a")).cast("integer")) \
                                 .withColumn("package_b", MAPPING.getItem(col("b")).cast("integer"))
    SIMILARITY_DF = SIMILARITY_DF.drop(col("a")).drop(col("b"))

    # Mirror the columns and append to the existing dataframe so we need only query the first column
    SIMILARITY_DF = SIMILARITY_DF.select('package_a', 'package_b', 'similarity') \
        .union(SIMILARITY_DF.select('package_b', 'package_a', 'similarity'))

    # Write similarity scores to the database
    URL_CONNECT = f"jdbc:postgresql://{HOST}/"
    TABLE = "similarity"
    MODE = "overwrite"
    PROPERTIES = {"user": USER, "password": PASSWORD, "driver": "org.postgresql.Driver"}
    SIMILARITY_DF.write.jdbc(URL_CONNECT, TABLE, MODE, PROPERTIES)

    #
    # Update popularity scores
    #

    POPULARITY_UPDATE = """
    UPDATE packages
    SET popularity = s.popularity
    FROM (
      SELECT package_b, COUNT(package_b) AS popularity
      FROM similarity
      GROUP BY package_b
    ) s
    WHERE packages.id = s.package_b;
    """

    POPULARITY_NULL_TO_ZERO = """
    UPDATE packages
    SET popularity = 0
    WHERE popularity IS NULL;
    """

    BOUNDED_POPULARITY_UPDATE = """
    UPDATE packages
    SET bounded_popularity = s.popularity
    FROM (
      SELECT id,
             WIDTH_BUCKET(LOG(popularity + 1), 0,
                          (SELECT MAX(LOG(popularity + 1)) FROM packages), 9) AS popularity
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Connect to the database
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Execute popularity updates
    CUR.execute(POPULARITY_UPDATE)
    CUR.execute(POPULARITY_NULL_TO_ZERO)
    CUR.execute(BOUNDED_POPULARITY_UPDATE)

    #
    # Update trending scores
    #

    MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_last_month = 0
    WHERE monthly_downloads_last_month IS NULL;
    """

    MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_a_year_ago = 0
    WHERE monthly_downloads_a_year_ago IS NULL;
    """

    ABSOLUTE_TREND_UPDATE = """
    UPDATE packages
    SET absolute_trend = s.absolute_trend
    FROM (
      SELECT id,
             WIDTH_BUCKET(
               LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1),
               (SELECT MIN(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
               (SELECT MAX(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
               9
             ) AS absolute_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    RELATIVE_TREND_UPDATE = """
    UPDATE packages
    SET relative_trend = s.relative_trend
    FROM (
      SELECT id,
             WIDTH_BUCKET(
               LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1),
               (SELECT MIN(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
               (SELECT MAX(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
               9
             ) AS relative_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Execute trending updates
    CUR.execute(MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO)
    CUR.execute(MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO)
    CUR.execute(ABSOLUTE_TREND_UPDATE)
    CUR.execute(RELATIVE_TREND_UPDATE)

    # Commit changes and close the database connection
    DB.commit()
    CUR.close()
    DB.close()