def sampleVCF(hdfs, spark):
    # read the gVCF text file and take the header line as column names
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    # keep only variant records (lines whose first two characters are not '#') and cast POS to integer
    vcf_data = vcf.filter(lambda x: re.match("[^#][^#]", x))\
                  .map(lambda x: x.split("\t"))\
                  .toDF(col_name)\
                  .withColumn("POS", F.col("POS").cast(IntegerType()))
    sample_name = vcf_data.columns[-1]
    # drop GT from FORMAT, align the sample values, and zip the two arrays into a map column
    vcf_data = vcf_data.select(F.col("#CHROM"), F.col("POS"), F.col("FORMAT"), F.col(sample_name))\
                       .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                       .withColumn(sample_name, firstremove(F.split(F.col(sample_name), ":")))\
                       .withColumn(sample_name, F.map_from_arrays(F.col("FORMAT"), F.col(sample_name)))
    return vcf_data.select("#CHROM", "POS", sample_name)
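# `firstremove` is not defined in this snippet; it appears to be a UDF from the same project
# (e.g. function.py) that drops the leading GT value from the split sample field so it lines up
# with the GT-stripped FORMAT keys. A minimal sketch under that assumption:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def firstremove(values):
    # drop the first entry (the GT value) and keep the remaining FORMAT values
    return values[1:] if values else values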
def count_paragraphs(df):
    """Count paragraphs in the 'text' column after stripping wiki markup."""
    # each pattern removes one kind of markup: templates {{...}}, links [[...]],
    # headings, and the References / External links sections
    pattern_filterings = [
        r'\n\n+(\{\{[^\}]*\}\}|\[\[[^\]]*\]\]|={1,7}.*={1,7})\n\n+',
        r'==References==[\s\S]*',
        r'==External [lL]inks==[\s\S]*',
        r'\{\{([^\{\}]*(\{\{.*\}\})*)*\}\}\n+',
        r'\{\{([^\{\}]*(\{\{.*\}\})*)*\}\}$',
        r'\[\[[^\]]*\]\]\n+',
        r'\[\[[^\]]*\]\]$',
        r'\n\n+'
    ]
    c = col('text')
    for pattern in pattern_filterings:
        c = regexp_replace(c, pattern, '\n\n')
    # split on blank lines and drop the empty fragments
    splitted = array_remove(split(c, '\n\n'), '')
    # return df.withColumn('paragraphs', splitted).withColumn('n_paragraphs', size(splitted))
    return df.withColumn('n_paragraphs', size(splitted) - 1)
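# A minimal usage sketch for count_paragraphs. The sample text and column name are illustrative
# only; the function assumes a DataFrame with a 'text' column and that col, regexp_replace,
# split, array_remove and size are imported from pyspark.sql.functions in the enclosing module:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
articles = spark.createDataFrame(
    [("First paragraph.\n\n{{Infobox}}\n\nSecond paragraph.\n\n==References==\n* some ref",)],
    ["text"])
count_paragraphs(articles).select("n_paragraphs").show()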
def preVCF(hdfs, spark):
    # read the gVCF text file and take the header line as column names
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    vcf_data = vcf.filter(lambda x: re.match("[^#][^#]", x))\
                  .map(lambda x: x.split("\t"))\
                  .toDF(col_name)\
                  .withColumn("POS", F.col("POS").cast(IntegerType()))\
                  .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                  .withColumn("INFO", when(F.col("INFO").startswith("END="), None).otherwise(F.col("INFO")))
    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.drop("QUAL", "FILTER", sample_name)
    # wrap every non-key column in an array and suffix it with the sample name
    for index in range(len(vcf_data.columns)):
        compared_arr = ["#CHROM", "POS", "REF"]
        if vcf_data.columns[index] not in compared_arr:
            vcf_data = vcf_data.withColumn(vcf_data.columns[index], F.array(vcf_data.columns[index]))
            vcf_data = vcf_data.withColumnRenamed(
                vcf_data.columns[index], vcf_data.columns[index] + "_" + sample_name)
    vcf_data = vcf_data.withColumnRenamed("REF", "REF_" + sample_name)
    vcf_data = vcf_data.withColumn("count", F.length(vcf_data.columns[3]))
    # window & split col parameter
    sample_ref = vcf_data.columns[3]
    indel_window = Window.partitionBy("#CHROM").orderBy("POS")
    split_col = F.split("REF_temp", '_')
    # for multi-base REF records, count how many positions to pad until the next variant
    vcf_data = vcf_data.withColumn("count",
                                   when(F.col("count") >= 2,
                                        F.lead("POS", 1).over(indel_window) - F.col("POS") - F.lit(1))
                                   .otherwise(F.lit(0)))
    not_indel = vcf_data.drop("count").withColumn(sample_ref, F.array(F.col(sample_ref)))
    # expand indel records into one padded row per covered position
    indel = vcf_data.filter(F.col("count") >= 1)\
        .withColumn(sample_ref, ref_melt(F.col(sample_ref), F.col("count"))).drop("count")\
        .withColumn(sample_ref, explode(F.col(sample_ref))).withColumnRenamed(sample_ref, "REF_temp")\
        .withColumn(sample_ref, F.array(split_col.getItem(0))).withColumn('POS_var', split_col.getItem(1))\
        .drop(F.col("REF_temp")).withColumn("POS", (F.col("POS") + F.col("POS_var")).cast(IntegerType()))\
        .drop(F.col("POS_var"))\
        .withColumn(vcf_data.columns[2], F.array(F.lit(".")))\
        .withColumn(vcf_data.columns[4], F.array(F.lit("*, <NON_REF>")))
    return not_indel.unionByName(indel)
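# `ref_melt` is not defined in this snippet; it is presumably a UDF from the same project
# (e.g. function.py) that expands a multi-base REF into "base_offset" strings, which the code
# above splits on "_" to recover a single base and a POS offset. A purely hypothetical sketch
# of such a UDF, under that assumption:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def ref_melt(ref, count):
    # emit one "base_offset" entry for each padded position after the anchor base
    return ["{}_{}".format(ref[min(i, len(ref) - 1)], i) for i in range(1, count + 1)]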
def _setChanges(self):
    cols = self.joinKeys
    columns = self.leftDF.columns
    # get conditions for all columns except id
    conditions_ = [
        F.when(self.leftDF[c] != self.rightDF[c], F.lit(c)).otherwise("")
        for c in self.leftDF.columns if c not in cols
    ]
    select_expr = [
        *cols,
        *[self.rightDF[c] for c in self.rightDF.columns if c not in cols],
        F.array_remove(F.array(*conditions_), "").alias("column_names")
    ]
    changesDF = (self.leftDF.join(
        self.rightDF, cols, "left").select(*select_expr)).where("size(column_names) > 0")
    return changesDF.count()
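# A self-contained sketch of the same column-diff idea outside the class; the DataFrames,
# column names, and key list below are illustrative only:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "a", 10), (2, "b", 20)], ["id", "name", "qty"])
right = spark.createDataFrame([(1, "a", 10), (2, "x", 20)], ["id", "name", "qty"])
keys = ["id"]
conditions = [F.when(left[c] != right[c], F.lit(c)).otherwise("")
              for c in left.columns if c not in keys]
changed = (left.join(right, keys, "left")
               .select(*keys, F.array_remove(F.array(*conditions), "").alias("column_names"))
               .where("size(column_names) > 0"))
changed.show()  # expect one row (id=2) with column_names == ["name"]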
def array_intersection(a: Column, b: Column) -> Column:
    """Calculate the intersection of two array columns"""
    return F.array_remove(F.array_intersect(a, b), "")
def array_union(a: Column, b: Column) -> Column:
    """Calculate the union of two array columns"""
    return F.array_remove(F.array_union(a, b), "")
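# A minimal usage sketch for the two helpers above; the DataFrame and column names are
# illustrative, and empty strings are included to show why array_remove is applied:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["a", "b", ""], ["b", "c", ""])], ["left", "right"])
df.select(
    array_intersection(F.col("left"), F.col("right")).alias("intersection"),  # ["b"]
    array_union(F.col("left"), F.col("right")).alias("union"),                # ["a", "b", "c"]
).show()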
    .getOrCreate()
spark.udf.registerJavaFunction("index2dict", "scalaUDF.Index2dict", ArrayType(StringType()))
spark.sparkContext.addPyFile("function.py")
from function import *

# sample parquet write
hdfs = "hdfs://master:9000"
hdfs_list = hadoop_list(gvcf_count, "/raw_data/gvcf")
vcf_list = list()
for index in range(len(hdfs_list)):
    vcf_list.append(
        sampleVCF(hdfs + hdfs_list[index].decode("UTF-8"), spark))
indel_com = spark.read.parquet("/raw_data/output/gvcf_output/" + folder_name + "//info.g.vcf")\
    .select(["#CHROM", "POS", "FORMAT"])\
    .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
    .orderBy(F.col("#CHROM"), F.col("POS")).persist(StorageLevel.MEMORY_ONLY)
indel_com.count()  # materialize the persisted DataFrame before it is reused below
print("info gvcf count : ", indel_com.count())
parquet_list = list(
    map(lambda arg: parquet_revalue(arg, indel_com), vcf_list))
for parquet in join_split_inner(parquet_list, num):
    parquet.write.mode('overwrite')\
        .parquet("/raw_data/output/gvcf_output/" + folder_name + "//" + "sample_" + str(cnt) + ".g.vcf")
    cnt += num
spark.catalog.clearCache()
spark.stop()
def insertDataToGraph(self):
    spark = self.sparkSession
    neo4j = self.neo4jDriver.session()
    sc = spark.sparkContext
    feats = self.user_features_df
    list_cat = self.list_categories
    cat_count = len(list_cat)

    # import edges
    e = self.edges_df
    self.nodes_df = e.select("Source").union(
        e.select("Target")).distinct().withColumnRenamed('Source', 'id')
    n = self.nodes_df
    print(feats.count(), list_cat, e.count(), n.count())
    feats.printSchema()

    # cache dataframes
    feats.cache()
    e.cache()
    n.cache()

    # add category property to u
    u_focus_rate = feats.select(
        col('id'),
        col("user_features{}.dict_focus_rate".format(
            ("_" + self.method_name) if len(self.method_name) > 0 else "")).alias("dict_focus_rate"))
    u_with_category = u_focus_rate.withColumn(
        "userCategory", array([lit(c) for c in list_cat]))
    for cat in list_cat:
        u_with_category = u_with_category.withColumn(
            "temp",
            when(
                col("dict_focus_rate.{}".format(cat)) < 1 / cat_count,
                array_remove(u_with_category["userCategory"], cat)).otherwise(
                    u_with_category["userCategory"])).drop(
                        "userCategory").withColumnRenamed("temp", "userCategory")
    u_with_category = u_with_category.select("id", "userCategory")

    # join n and u_with_category
    n_with_category = n.join(u_with_category, "id", how="left")

    # add category columns to e
    e_with_category = e.join(
        n_with_category, e.Source == n_with_category.id,
        how="left").withColumnRenamed("userCategory", "sourceCategory").select(
            "Source", "Target", "sourceCategory")
    e_with_category = e_with_category.join(
        n_with_category, e_with_category.Target == n_with_category.id,
        how="left").withColumnRenamed("userCategory", "targetCategory").select(
            "Source", "Target", "sourceCategory", "targetCategory")

    # determine intersection between sourceCategory and targetCategory
    e_with_category = e_with_category.withColumn(
        "Categories",
        array_intersect(e_with_category["sourceCategory"],
                        e_with_category["targetCategory"]))

    # flatten out categories of edges
    e_with_category = e_with_category.withColumn(
        "Category", explode(col("Categories"))).select("Source", "Target", "Category")
    print("e_with_category", e_with_category.count())
    e_with_category.printSchema()

    ## Insert data
    insert_query = '''
    UNWIND {triples} as triple
    MERGE (p1:User {id:triple[0]})
    MERGE (p2:User {id:triple[1]})
    WITH p1,p2,triple
    CALL apoc.create.relationship(p1, triple[2], {}, p2) YIELD rel
    RETURN *
    '''
    e_listoftriples = e_with_category.toPandas()[[
        'Source', 'Target', 'Category'
    ]].values.tolist()
    print("e_listoftriples:", len(e_listoftriples))
    batches = list(self.generate_batches(e_listoftriples, 7000))
    for batch in batches:
        neo4j.run(insert_query, parameters={"triples": batch})
    e_with_category.show()
    print("batches size:", len(batches), " last batch:", len(batches[-1]))
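# `generate_batches` is not shown in this snippet; it is assumed to be a small chunking helper
# on the same class that yields slices of at most `batch_size` triples. A minimal sketch under
# that assumption:
def generate_batches(self, rows, batch_size):
    # yield successive slices of the triple list, each at most batch_size long
    for start in range(0, len(rows), batch_size):
        yield rows[start:start + batch_size]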
def main(society):
    df_user_join = spark.read.format("csv").option(
        "header", "true").load('joined_user.csv')
    df_user_join.show()
    df_posts_society = load_posts(society)
    # keep only posts whose owner appears in the joined-user table
    df_posts_society_filterd = df_posts_society.join(
        df_user_join,
        df_posts_society[str(society + "_ownerId")] ==
        df_user_join[str(society + "_userId")],
        'inner').drop(str(society + "_userId"))
    df_posts_society_filterd.show()
    # typeId 1 = question, typeId 2 = answer
    df_posts_society_question = df_posts_society_filterd.where(
        df_posts_society_filterd[str(society + "_typeId")] == 1)
    df_posts_society_answer = df_posts_society_filterd.where(
        df_posts_society_filterd[str(society + "_typeId")] == 2).drop(
            str(society + "_postId")).drop(str(society + "_typeId")).drop(
                str(society + "_tags"))
    df_posts_society_answer = df_posts_society_answer.withColumnRenamed(
        str(society + "_parentID"), str(society + "_answerParentId"))
    df_posts_society_answer = df_posts_society_answer.withColumnRenamed(
        str(society + "_ownerId"), str(society + "_answerOwnerId"))
    df_posts_society_question.show()
    df_posts_society_answer.show()
    # attach each answer to the question it belongs to
    df_posts_society_answer_post = df_posts_society.join(
        df_posts_society_answer,
        df_posts_society[str(society + "_postId")] ==
        df_posts_society_answer[str(society + "_answerParentId")],
        'inner')
    df_posts_society_answer_post.show()
    df_posts_society_question = df_posts_society_question.select(
        str(society + "_tags"), str(society + "_ownerId"), 'accountId')
    df_posts_society_question.show()
    print(df_posts_society_question.count())
    df_posts_society_answer_post = df_posts_society_answer_post.select(
        str(society + "_tags"), str(society + "_answerOwnerId"), 'accountId')
    df_posts_society_answer_post = df_posts_society_answer_post.withColumnRenamed(
        str(society + "_answerOwnerId"), str(society + "_ownerId"))
    df_posts_society_answer_post.show()
    print(df_posts_society_answer_post.count())
    df_posts_society_merge = df_posts_society_question.unionAll(
        df_posts_society_answer_post)
    df_posts_society_merge.show()
    print(df_posts_society_merge.count())
    df_posts_society_merge.write.csv(society + '_all_tags_combined.csv', header='true')
    # df_posts_society_merge = spark.read.format("csv").option("header", "true").load(society + '_all_tags_combined.csv')
    df_posts_society_merge.show()
    # split the "<tag1><tag2>" string into individual tag rows, dropping empty fragments
    df_posts_society_merge = df_posts_society_merge.withColumn(
        'tag',
        functions.explode(
            functions.array_remove(
                functions.split(str(society + "_tags"), r"(\<)|(\>)"), "")))
    df_posts_society_merge = df_posts_society_merge.drop(str(society + "_tags"))
    df_posts_society_merge = df_posts_society_merge.withColumnRenamed(
        'tag', str(society + "_tags"))
    df_posts_society_merge.show()
    # uncomment these lines if tag synonyms are used
    # df_synonym = spark.read.format("csv").option("header", "true").load(society + '_synonym.csv')
    # df_synonym.show()
    # df_posts_society_merge = df_posts_society_merge.withColumn(str(society + "_tags"), functions.when(df_posts_society_merge[str(society + "_tags")] == df_synonym['synonym'], df_synonym['target']).otherwise(df_posts_society_merge[str(society + "_tags")]))
    # df_posts_society_merge.show()
    df_posts_society_merge.write.csv(society + '_tags.csv', header='true')
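# The tag explosion above relies on splitting "<tag1><tag2>" strings and removing the empty
# fragments the split produces; a small standalone illustration (the DataFrame and column
# names here are illustrative only):
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.getOrCreate()
tags = spark.createDataFrame([("<python><apache-spark>",)], ["so_tags"])
tags.select(
    functions.explode(
        functions.array_remove(functions.split("so_tags", r"(\<)|(\>)"), "")
    ).alias("tag")
).show()  # rows: python, apache-spark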