def test_slice(self):
    from pyspark.sql.functions import lit, slice

    df = self.spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])

    # slice accepts start/length either as Python ints or as Columns
    self.assertEqual(
        df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
        df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
    )
from pyspark.sql.functions import arrays_zip, slice, sort_array, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType


def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column), extracts the vocabulary
    indices of the n_keywords keywords with the highest TF-IDF (for each post)."""

    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector), ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector), ArrayType(DoubleType()))

    posts = posts.withColumn("extracted_keys", extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values", extract_values_from_vector_udf("features"))

    # Pair each TF-IDF value with its vocabulary index, sort the pairs in
    # descending order of value, and keep only the top n_keywords.
    posts = posts.withColumn(
        "zipped_truncated",
        slice(
            sort_array(arrays_zip("extracted_values", "extracted_keys"), asc=False),
            1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows], ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))
    return posts
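# A minimal usage sketch for benchmark_extract_top_keywords, assuming an active
# SparkSession named `spark`; the pipeline stages, sample rows, and the "Body"
# column name are illustrative assumptions, not taken from the original code.
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [("spark makes big data simple",), ("big data big ideas",)], ["Body"])
tokenized = Tokenizer(inputCol="Body", outputCol="tokens").transform(raw)
counted = CountVectorizer(inputCol="tokens", outputCol="counts") \
    .fit(tokenized).transform(tokenized)
tfidf = IDF(inputCol="counts", outputCol="features").fit(counted).transform(counted)
benchmark_extract_top_keywords(tfidf, n_keywords=5) \
    .select("top_indices").show(truncate=False)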
from pyspark.sql.functions import col, input_file_name, slice, split
from pyspark.sql.types import IntegerType


def add_year(self, df):
    # The file name is the last element of the '/'-split input path
    # (slice with a negative start counts from the end of the array);
    # its first four characters are the flight year.
    df2 = (
        df.withColumn('file_name', slice(split(input_file_name(), '/'), -1, 1)[0])
          .withColumn('flight_year', col('file_name').substr(1, 4).cast(IntegerType()))
    )
    return df2
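# Quick illustration of the extraction used in add_year, assuming an active
# SparkSession `spark`; the literal path column stands in for
# input_file_name(), which is only populated when reading from files.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, slice, split
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("s3://bucket/flights/1999.csv",)], ["path"])
demo = (
    demo.withColumn("file_name", slice(split(col("path"), "/"), -1, 1)[0])
        .withColumn("flight_year", col("file_name").substr(1, 4).cast(IntegerType()))
)
demo.show()  # file_name = '1999.csv', flight_year = 1999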
def test_slice(self):
    from pyspark.sql import Row
    from pyspark.sql.functions import lit, size, slice

    df = self.spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])

    self.assertEqual(
        df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
        df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
    )
    # start and length may be Column expressions computed per row
    self.assertEqual(
        df.select(slice(df.x, size(df.x) - 1, lit(1)).alias("sliced")).collect(),
        [Row(sliced=[2]), Row(sliced=[4])])
    self.assertEqual(
        df.select(slice(df.x, lit(1), size(df.x) - 1).alias("sliced")).collect(),
        [Row(sliced=[1, 2]), Row(sliced=[4])])
def test_slice(self):
    from pyspark.sql import Row
    from pyspark.sql.functions import lit, size, slice

    df = self.spark.createDataFrame(
        [([1, 2, 3], 2, 2), ([4, 5], 2, 2)],
        ["x", "index", "len"],
    )

    # the same slice can be expressed with int literals, Columns,
    # or column-name strings
    expected = [Row(sliced=[2, 3]), Row(sliced=[5])]
    self.assertTrue(all([
        df.select(slice(df.x, 2, 2).alias("sliced")).collect() == expected,
        df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect() == expected,
        df.select(slice("x", "index", "len").alias("sliced")).collect() == expected,
    ]))

    self.assertEqual(
        df.select(slice(df.x, size(df.x) - 1, lit(1)).alias("sliced")).collect(),
        [Row(sliced=[2]), Row(sliced=[4])],
    )
    self.assertEqual(
        df.select(slice(df.x, lit(1), size(df.x) - 1).alias("sliced")).collect(),
        [Row(sliced=[1, 2]), Row(sliced=[4])],
    )
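# Hedged aside on the semantics the tests above rely on:
# pyspark.sql.functions.slice is 1-based, and a negative start counts from the
# end of the array (Spark 2.4+). A quick self-contained illustration:
from pyspark.sql import SparkSession
from pyspark.sql.functions import slice

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([10, 20, 30, 40],)], ["x"])
df.select(
    slice("x", 2, 2).alias("from_second"),  # [20, 30]
    slice("x", -2, 2).alias("last_two"),    # [30, 40]
).show()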
from pyspark.sql.functions import arrays_zip, col, slice, sort_array, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StringType


def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word-count (CountVectorizer) output (as "features" column),
    extracts the n_keywords keywords with the highest TF-IDF (for each post)."""

    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector), ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector), ArrayType(DoubleType()))

    # array_transform and idf_wiki are not defined in this snippet; a
    # hypothetical sketch of array_transform follows this function.
    idf_udf = array_transform(idf_wiki)

    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix", extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count", extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))

    # Pair each IDF weight with its word, sort descending, keep the top n_keywords.
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows], ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))
    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
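# array_transform is not shown above; this is a hypothetical sketch of what
# such a helper could look like, assuming it lifts a plain element-wise Python
# function into a UDF over array columns. The real helper (and the element
# type it returns: e.g. idf_udf would presumably need DoubleType) may differ.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def array_transform(fn, element_type=StringType()):
    # Map fn over every element of the array column; pass nulls through.
    return udf(
        lambda arr: None if arr is None else [fn(x) for x in arr],
        ArrayType(element_type))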
from collections import defaultdict

import pandas as pd
import pyspark.sql.functions as F


def generateCooccurrenceMatrices(data, df_all):
    # assumes a module-level SparkSession `spark` and output prefix `dir_path`

    # to DataFrame, and keep the first 100 (word, count) tuples per topic
    df_temp = data.toDF(['Topic', 'Tuples per Topic'])
    df_temp = df_temp.withColumn(
        'Top 100', F.slice('Tuples per Topic', start=1, length=100))
    count_tuples = df_temp.select(['Topic', 'Top 100']).rdd.map(
        lambda r: (r[0], [w[0] for w in r[1]])).collect()

    # lists of top-100 words per topic (relies on the row order of `data`)
    economy_top100_words = count_tuples[0][1]
    microsoft_top100_words = count_tuples[1][1]
    palestine_top100_words = count_tuples[2][1]
    obama_top100_words = count_tuples[3][1]

    def mapOcc(sentence, top100_words):
        # emit ((word, neighbour), count) pairs for every top-100 word
        # occurring in the sentence
        data_combs = []
        for word in top100_words:
            h = {}
            if word in sentence:
                for neigh in neighbors(sentence, top100_words, word):
                    comb = (word, neigh)
                    h[comb] = h.get(comb, 0) + 1
                h[(word, word)] = 0
            data_combs.extend(list(h.items()))
        return data_combs

    def neighbors(sentence, top100_words, word):
        # top-100 words co-occurring with `word` in the same sentence
        neighs = list(set(sentence) & set(top100_words))
        neighs.remove(word)
        return neighs

    def generateCoocurrenceMatrix(col, topic, top100_words):
        # list of all titles (or headlines) for the topic
        col_topic = df_all.select(col + '_sentence').where(
            F.col('Topic').isin({topic})).rdd.flatMap(lambda r: r).collect()
        df_col_topic = spark.createDataFrame(col_topic, "string").toDF(col)
        df_col_topic = df_col_topic.withColumn(col, F.split(F.col(col), ' '))

        # calculate co-occurrence stripes per title
        cooccurrence_stripes = df_col_topic.rdd \
            .flatMap(lambda r: mapOcc(r[0], top100_words)) \
            .reduceByKey(lambda x, y: x + y) \
            .map(lambda x: (x[0][0], [(x[0][1], int(x[1]))])) \
            .reduceByKey(lambda x, y: x + y) \
            .sortByKey().collect()

        # construct matrix from co-occurrence stripes
        as_matrix = defaultdict(dict)
        for entry in cooccurrence_stripes:
            topword = entry[0]
            for cooc in sorted(entry[1]):
                as_matrix[topword][cooc[0]] = cooc[1]

        # use pandas to create a DataFrame from the matrix
        pd_df_matrix = pd.DataFrame(as_matrix)
        pd_df_matrix.insert(0, 'words', sorted(top100_words), True)
        df_matrix = spark.createDataFrame(pd_df_matrix)
        df_matrix = df_matrix.na.fill(0)

        # write matrix to file
        df_matrix \
            .repartition(1) \
            .write \
            .mode("overwrite") \
            .csv(dir_path + "results/" + "cooc_matrix_" + col + '_' + topic,
                 header=True)

    generateCoocurrenceMatrix('Title', 'economy', economy_top100_words)
    generateCoocurrenceMatrix('Title', 'microsoft', microsoft_top100_words)
    generateCoocurrenceMatrix('Title', 'palestine', palestine_top100_words)
    generateCoocurrenceMatrix('Title', 'obama', obama_top100_words)
    generateCoocurrenceMatrix('Headline', 'economy', economy_top100_words)
    generateCoocurrenceMatrix('Headline', 'microsoft', microsoft_top100_words)
    generateCoocurrenceMatrix('Headline', 'palestine', palestine_top100_words)
    generateCoocurrenceMatrix('Headline', 'obama', obama_top100_words)
import pyspark
from pyspark.sql.functions import slice, split

# SPARK_MASTER and APP_NAME are assumed to be defined elsewhere in the script
conf = pyspark.SparkConf().set("spark.cores.max", "4")
sc = pyspark.SparkContext(master=SPARK_MASTER, conf=conf)
spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
print("PySpark initiated...")

lines = spark \
    .readStream \
    .format("text") \
    .load(path="streaming_src/")
lines.printSchema()

# keep log lines containing '- -' and extract the second-to-last
# space-separated token (negative start in slice counts from the end)
words = lines \
    .filter(lines['value'].contains('- -')) \
    .withColumn("split", slice(split(lines['value'], " "), -2, 1).getItem(0))

wordCounts = words.groupBy('split').count()

# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode('complete') \
    .format('console') \
    .start()

query.awaitTermination()
query.stop()
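# Batch-mode sanity check of the slice/split expression above, using the
# `spark` session created in the script; the sample log line is an assumption.
# The expression picks the second-to-last space-separated token of each line
# (the status-code position in common web-log formats).
demo = spark.createDataFrame(
    [('127.0.0.1 - - "GET / HTTP/1.0" 200 2326',)], ["value"])
demo.select(
    slice(split(demo["value"], " "), -2, 1).getItem(0).alias("token")
).show()  # token = '200'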