Example #1
import re

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType


def sampleVCF(hdfs, spark):
    """Read a VCF/gVCF from HDFS and return (#CHROM, POS, <sample>) rows,
    with the sample's FORMAT values packed into a map column."""
    vcf = spark.sparkContext.textFile(hdfs)
    # The "#CHROM ..." header line carries the column names.
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    # Skip the "##" meta lines, split data lines into columns, cast POS to int.
    vcf_data = vcf.filter(lambda x: re.match("[^#][^#]", x))\
                  .map(lambda x: x.split("\t"))\
                  .toDF(col_name)\
                  .withColumn("POS", F.col("POS").cast(IntegerType()))

    sample_name = vcf_data.columns[-1]
    # Drop "GT" from FORMAT and the matching leading entry of the sample field
    # (via the user-defined `firstremove` UDF), then zip the two arrays into a map.
    vcf_data = vcf_data.select(F.col("#CHROM"), F.col("POS"), F.col("FORMAT"), F.col(sample_name))\
                       .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                       .withColumn(sample_name, firstremove(F.split(F.col(sample_name), ":")))\
                       .withColumn(sample_name, F.map_from_arrays(F.col("FORMAT"), F.col(sample_name)))
    return vcf_data.select("#CHROM", "POS", sample_name)
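The snippet relies on a `firstremove` UDF that is not shown. A minimal sketch of what such a helper might look like, assuming it simply drops the leading GT value so the sample array stays aligned with the trimmed FORMAT list:

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Hypothetical helper: drop the first element (the GT value) of the split
# sample field so it lines up with FORMAT after "GT" is removed.
firstremove = F.udf(lambda arr: arr[1:] if arr else arr, ArrayType(StringType()))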
Example #2
from pyspark.sql.functions import array_remove, col, regexp_replace, size, split


def count_paragraphs(df):
    """Add an `n_paragraphs` column counting the paragraphs of the wiki-markup
    `text` column, after stripping templates, links, headings and references."""
    pattern_filterings = [
        r'\n\n+(\{\{[^\}]*\}\}|\[\[[^\]]*\]\]|={1,7}.*={1,7})\n\n+',
        r'==References==[\s\S]*', r'==External [lL]inks==[\s\S]*',
        r'\{\{([^\{\}]*(\{\{.*\}\})*)*\}\}\n+',
        r'\{\{([^\{\}]*(\{\{.*\}\})*)*\}\}$', r'\[\[[^\]]*\]\]\n+',
        r'\[\[[^\]]*\]\]$', r'\n\n+'
    ]
    # Collapse every markup block into a plain paragraph break ...
    c = col('text')
    for pattern in pattern_filterings:
        c = regexp_replace(c, pattern, '\n\n')
    # ... then split on blank lines and drop the empty fragments.
    splitted = array_remove(split(c, '\n\n'), '')
    # return df.withColumn('paragraphs', splitted).withColumn('n_paragraphs', size(splitted))
    return df.withColumn('n_paragraphs', size(splitted) - 1)
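A minimal usage sketch, assuming an active SparkSession named `spark` and a toy wiki-markup string:

df = spark.createDataFrame(
    [("Intro paragraph.\n\n{{Infobox}}\n\nSecond paragraph.\n\n==References==\n* some link",)],
    ["text"])
count_paragraphs(df).select("n_paragraphs").show()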
Example #3
import re

from pyspark.sql import functions as F
from pyspark.sql.functions import explode, when
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window


def preVCF(hdfs, spark):
    # `ref_melt` is a user-defined UDF (defined elsewhere) that expands a
    # multi-base REF into an array of "base_offset" strings.
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    # Skip the "##" meta lines, split data lines into columns, and normalise
    # POS, FORMAT and INFO.
    vcf_data = vcf.filter(lambda x: re.match("[^#][^#]", x))\
                  .map(lambda x: x.split("\t"))\
                  .toDF(col_name)\
                  .withColumn("POS", F.col("POS").cast(IntegerType()))\
                  .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                  .withColumn("INFO", when(F.col("INFO").startswith("END="), None).otherwise(F.col("INFO")))

    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.drop("QUAL", "FILTER", sample_name)

    # Wrap every column except #CHROM, POS and REF in an array and suffix it
    # with the sample name.
    for index in range(len(vcf_data.columns)):
        compared_arr = ["#CHROM", "POS", "REF"]
        if vcf_data.columns[index] not in compared_arr:
            vcf_data = vcf_data.withColumn(vcf_data.columns[index],
                                           F.array(vcf_data.columns[index]))
            vcf_data = vcf_data.withColumnRenamed(
                vcf_data.columns[index],
                vcf_data.columns[index] + "_" + sample_name)
    vcf_data = vcf_data.withColumnRenamed("REF", "REF_" + sample_name)
    vcf_data = vcf_data.withColumn("count", F.length(vcf_data.columns[3]))

    # window & split-column parameters used below
    sample_ref = vcf_data.columns[3]
    indel_window = Window.partitionBy("#CHROM").orderBy("POS")
    split_col = F.split("REF_temp", '_')

    # For multi-base REFs, "count" becomes the gap to the next record on the
    # same chromosome; single-base records get 0.
    vcf_data = vcf_data.withColumn("count", when(F.col("count") >= 2, F.lead("POS", 1).over(indel_window) - F.col("POS") - F.lit(1))\
                                           .otherwise(F.lit(0)))

    not_indel = vcf_data.drop("count").withColumn(sample_ref,
                                                  F.array(F.col(sample_ref)))
    # Explode indel records into one row per reference base and shift POS by
    # the offset encoded in each "base_offset" token.
    indel = vcf_data.filter(F.col("count") >= 1)\
                .withColumn(sample_ref, ref_melt(F.col(sample_ref), F.col("count"))).drop("count")\
                .withColumn(sample_ref, explode(F.col(sample_ref))).withColumnRenamed(sample_ref, "REF_temp")\
                .withColumn(sample_ref, F.array(split_col.getItem(0))).withColumn('POS_var', split_col.getItem(1))\
                .drop(F.col("REF_temp")).withColumn("POS", (F.col("POS") + F.col("POS_var")).cast(IntegerType()))\
                .drop(F.col("POS_var"))\
                .withColumn(vcf_data.columns[2], F.array(F.lit(".")))\
                .withColumn(vcf_data.columns[4], F.array(F.lit("*, <NON_REF>")))

    return not_indel.unionByName(indel)
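The function also assumes a `ref_melt` UDF that is not shown. Judging only from how its output is consumed (exploded, then split on "_" into a base and a positional offset), one plausible, purely illustrative sketch might be:

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Hypothetical helper: turn a multi-base REF such as "GAT" into
# ["G_0", "A_1", "T_2"]; the exact handling of `count` is a guess.
ref_melt = F.udf(
    lambda ref, count: ["{}_{}".format(base, i) for i, base in enumerate(ref)],
    ArrayType(StringType()))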
Example #4
    def _setChanges(self):
        # Assumes `from pyspark.sql import functions as F` at module level.
        cols = self.joinKeys
        # For every non-key column, emit the column name when the left and
        # right values differ, otherwise an empty string.
        conditions_ = [
            F.when(self.leftDF[c] != self.rightDF[c], F.lit(c)).otherwise("")
            for c in self.leftDF.columns if c not in cols
        ]

        # Keep the join keys, the right-hand columns, and an array of the
        # names of the columns whose values changed (empty entries removed).
        select_expr = [
            *cols,
            *[self.rightDF[c] for c in self.rightDF.columns if c not in cols],
            F.array_remove(F.array(*conditions_), "").alias("column_names")
        ]

        changesDF = (self.leftDF.join(
            self.rightDF, cols,
            "left").select(*select_expr)).where("size(column_names) > 0")
        return changesDF.count()
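A standalone sketch of the same change-detection pattern outside the class, assuming an active SparkSession `spark` and toy column names:

from pyspark.sql import functions as F

left = spark.createDataFrame([(1, "x", "y")], ["id", "a", "b"])
right = spark.createDataFrame([(1, "x", "z")], ["id", "a", "b"])
conds = [F.when(left[c] != right[c], F.lit(c)).otherwise("") for c in ("a", "b")]
changed = (left.join(right, ["id"], "left")
               .select("id", F.array_remove(F.array(*conds), "").alias("column_names"))
               .where("size(column_names) > 0"))
changed.show()  # column_names contains ["b"] for id 1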
Example #5
from pyspark.sql import Column, functions as F

def array_intersection(a: Column, b: Column) -> Column:
    """Calculate the intersection of two array columns, dropping empty strings."""
    return F.array_remove(F.array_intersect(a, b), "")
Example #6
from pyspark.sql import Column, functions as F

def array_union(a: Column, b: Column) -> Column:
    """Calculate the union of two array columns, dropping empty strings."""
    return F.array_remove(F.array_union(a, b), "")
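Both helpers drop empty strings from the result; a quick usage sketch, assuming an active SparkSession `spark`:

from pyspark.sql import functions as F

df = spark.createDataFrame([(["a", "b", ""], ["b", "c", ""])], ["x", "y"])
df.select(array_intersection(F.col("x"), F.col("y")).alias("inter"),
          array_union(F.col("x"), F.col("y")).alias("union")).show()
# inter -> ["b"], union -> ["a", "b", "c"]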
Example #7
    # Assumed imports for this excerpt: from pyspark.sql import SparkSession;
    # from pyspark.sql import functions as F; from pyspark import StorageLevel;
    # from pyspark.sql.types import ArrayType, StringType.
    # `gvcf_count`, `folder_name`, `num` and `cnt` are defined earlier in the
    # original script, which is truncated here.
    spark = SparkSession.builder\
                            .getOrCreate()  # builder options omitted in this excerpt

    # Register the Scala UDF and ship the Python helper module to the executors.
    spark.udf.registerJavaFunction("index2dict", "scalaUDF.Index2dict",
                                   ArrayType(StringType()))
    spark.sparkContext.addPyFile("function.py")
    from function import *

    # sample parquet write
    hdfs = "hdfs://master:9000"
    hdfs_list = hadoop_list(gvcf_count, "/raw_data/gvcf")
    vcf_list = list()
    for index in range(len(hdfs_list)):
        vcf_list.append(
            sampleVCF(hdfs + hdfs_list[index].decode("UTF-8"), spark))

    # Load the combined-info gVCF, strip GT from FORMAT, and keep it cached.
    indel_com = spark.read.parquet("/raw_data/output/gvcf_output/" + folder_name + "//info.g.vcf")\
                    .select(["#CHROM", "POS", "FORMAT"])\
                    .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                    .orderBy(F.col("#CHROM"), F.col("POS")).persist(StorageLevel.MEMORY_ONLY)
    indel_com.count()  # materialise the cache before reusing it
    print("info gvcf count : ", indel_com.count())

    # Re-process each sample against the combined info and write the joined
    # splits back to HDFS, `num` samples at a time.
    parquet_list = list(
        map(lambda arg: parquet_revalue(arg, indel_com), vcf_list))
    for parquet in join_split_inner(parquet_list, num):
        parquet.write.mode('overwrite')\
                .parquet("/raw_data/output/gvcf_output/" + folder_name + "//" + "sample_" + str(cnt) + ".g.vcf")
        cnt += num

    spark.catalog.clearCache()
    spark.stop()
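The excerpt also calls a `hadoop_list` helper that is not shown. Since its return values are decoded from bytes, one plausible (hypothetical) sketch is a thin wrapper around the HDFS CLI:

import subprocess

# Hypothetical helper: return up to `count` HDFS paths under `path` as byte
# strings, e.g. via `hdfs dfs -ls -C`, which prints one path per line.
def hadoop_list(count, path):
    out = subprocess.check_output(["hdfs", "dfs", "-ls", "-C", path])
    return [line for line in out.splitlines() if line][:count]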
Example #8
    def insertDataToGraph(self):
        # Assumed imports: from pyspark.sql.functions import (col, lit, array,
        # when, array_remove, array_intersect, explode)
        spark = self.sparkSession
        neo4j = self.neo4jDriver.session()
        sc = spark.sparkContext
        feats = self.user_features_df
        list_cat = self.list_categories
        cat_count = len(list_cat)

        # import edges and build the node list from the distinct edge endpoints
        e = self.edges_df
        self.nodes_df = e.select("Source").union(
            e.select("Target")).distinct().withColumnRenamed('Source', 'id')
        n = self.nodes_df
        print(feats.count(), list_cat, e.count(), n.count())
        feats.printSchema()

        #cache dataframes
        feats.cache()
        e.cache()
        n.cache()

        # add a userCategory property per user: start with every category and
        # drop each one whose focus rate falls below the uniform 1/cat_count threshold
        u_focus_rate = feats.select(
            col('id'),
            col("user_features{}.dict_focus_rate".format(
                ("_" + self.method_name) if len(self.method_name) > 0 else ""
            )).alias("dict_focus_rate"))
        u_with_category = u_focus_rate.withColumn(
            "userCategory", array([lit(c) for c in list_cat]))
        for cat in list_cat:
            u_with_category = (u_with_category
                .withColumn(
                    "temp",
                    when(col("dict_focus_rate.{}".format(cat)) < 1 / cat_count,
                         array_remove(u_with_category["userCategory"], cat))
                    .otherwise(u_with_category["userCategory"]))
                .drop("userCategory")
                .withColumnRenamed("temp", "userCategory"))
        u_with_category = u_with_category.select("id", "userCategory")

        #join n and u_with_category
        n_with_category = n.join(u_with_category, "id", how="left")

        # add category columns to e: attach the source user's categories,
        # then the target user's categories
        e_with_category = (e.join(n_with_category,
                                  e.Source == n_with_category.id,
                                  how="left")
                           .withColumnRenamed("userCategory", "sourceCategory")
                           .select("Source", "Target", "sourceCategory"))
        e_with_category = (e_with_category.join(
                                n_with_category,
                                e_with_category.Target == n_with_category.id,
                                how="left")
                           .withColumnRenamed("userCategory", "targetCategory")
                           .select("Source", "Target", "sourceCategory", "targetCategory"))

        #determine intersection between sourceCategory and targetCategory
        e_with_category = e_with_category.withColumn(
            "Categories",
            array_intersect(e_with_category["sourceCategory"],
                            e_with_category["targetCategory"]))

        #flatten out categories of edges
        e_with_category = e_with_category.withColumn(
            "Category",
            explode(col("Categories"))).select("Source", "Target", "Category")
        print("e_with_category", e_with_category.count())
        e_with_category.printSchema()

        ## Insert data (the {param} placeholder syntax below targets pre-4.0
        ## Neo4j drivers; Neo4j 4+ expects $triples instead)
        insert_query = '''
        UNWIND {triples} as triple
        MERGE (p1:User {id:triple[0]})
        MERGE (p2:User {id:triple[1]}) WITH p1,p2,triple
        CALL apoc.create.relationship(p1, triple[2], {}, p2) YIELD rel
        RETURN *
        '''

        # collect the edges to the driver as (Source, Target, Category) triples
        e_listoftriples = e_with_category.toPandas()[[
            'Source', 'Target', 'Category'
        ]].values.tolist()
        print("e_listoftriples:", len(e_listoftriples))
        batches = list(self.generate_batches(e_listoftriples, 7000))
        for batch in batches:
            neo4j.run(insert_query, parameters={"triples": batch})

        e_with_category.show()
        print("batches size:", len(batches), " last batch:", len(batches[-1]))
Example #9
def main(society):
    # Assumes an active SparkSession `spark`, `from pyspark.sql import functions`,
    # and a `load_posts` helper defined elsewhere in the original script.
    df_user_join = spark.read.format("csv").option(
        "header", "true").load('joined_user.csv')
    df_user_join.show()

    # keep only posts whose owner appears in the joined user table
    df_posts_society = load_posts(society)
    df_posts_society_filterd = df_posts_society.join(
        df_user_join,
        df_posts_society[str(society + "_ownerId")] ==
        df_user_join[str(society + "_userId")],
        'inner').drop(str(society + "_userId"))
    df_posts_society_filterd.show()

    # split the filtered posts into questions (typeId == 1) and answers (typeId == 2)
    df_posts_society_question = df_posts_society_filterd.where(
        df_posts_society_filterd[str(society + "_typeId")] == 1)
    df_posts_society_answer = df_posts_society_filterd.where(
        df_posts_society_filterd[str(society + "_typeId")] == 2).drop(
            str(society + "_postId")).drop(str(society + "_typeId")).drop(
                str(society + "_tags"))
    df_posts_society_answer = df_posts_society_answer.withColumnRenamed(
        str(society + "_parentID"), str(society + "_answerParentId"))
    df_posts_society_answer = df_posts_society_answer.withColumnRenamed(
        str(society + "_ownerId"), str(society + "_answerOwnerId"))

    df_posts_society_question.show()
    df_posts_society_answer.show()

    # attach each answer to its parent question post
    df_posts_society_answer_post = df_posts_society.join(
        df_posts_society_answer, df_posts_society[str(society + "_postId")] ==
        df_posts_society_answer[str(society + "_answerParentId")], 'inner')
    df_posts_society_answer_post.show()

    df_posts_society_question = df_posts_society_question.select(
        str(society + "_tags"), str(society + "_ownerId"), 'accountId')
    df_posts_society_question.show()
    print(df_posts_society_question.count())
    df_posts_society_answer_post = df_posts_society_answer_post.select(
        str(society + "_tags"), str(society + "_answerOwnerId"), 'accountId')
    df_posts_society_answer_post = df_posts_society_answer_post.withColumnRenamed(
        str(society + "_answerOwnerId"), str(society + "_ownerId"))
    df_posts_society_answer_post.show()
    print(df_posts_society_answer_post.count())
    df_posts_society_merge = df_posts_society_question.unionAll(
        df_posts_society_answer_post)
    df_posts_society_merge.show()
    print(df_posts_society_merge.count())
    df_posts_society_merge.write.csv(society + '_all_tags_combined.csv',
                                     header='true')

    # df_posts_society_merge = spark.read.format("csv").option("header", "true").load(society + '_all_tags_combined.csv')
    df_posts_society_merge.show()

    # explode the "<tag1><tag2>..." string into one row per tag, dropping the
    # empty fragments produced by splitting on the angle brackets
    df_posts_society_merge = df_posts_society_merge.withColumn(
        'tag',
        functions.explode(
            functions.array_remove(
                functions.split(str(society + "_tags"), r"(\<)|(\>)"), "")))
    df_posts_society_merge = df_posts_society_merge.drop(str(society +
                                                             "_tags"))
    df_posts_society_merge = df_posts_society_merge.withColumnRenamed(
        'tag', str(society + "_tags"))
    df_posts_society_merge.show()

    # uncomment these lines if tag synonyms are used
    # df_synonym = spark.read.format("csv").option("header", "true").load(society + '_synonym.csv')
    # df_synonym.show()

    # df_posts_society_merge = df_posts_society_merge.withColumn(str(society + "_tags"), functions.when(df_posts_society_merge[str(society + "_tags")] == df_synonym['synonym'], df_synonym['target']).otherwise(df_posts_society_merge[str(society + "_tags")]))

    # df_posts_society_merge.show()

    df_posts_society_merge.write.csv(society + '_tags.csv', header='true')
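The tag-splitting step in isolation, assuming an active SparkSession `spark` and a hypothetical `so_tags` column holding StackExchange-style "<tag1><tag2>" strings:

from pyspark.sql import functions

df = spark.createDataFrame([("<python><apache-spark>",)], ["so_tags"])
df.withColumn(
    "tag",
    functions.explode(
        functions.array_remove(functions.split("so_tags", r"(\<)|(\>)"), ""))
).show()
# -> one row per tag: "python" and "apache-spark"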