Example #1
def spark_intersect(mutation_table_name, regions_table_name, DB_CONF, output_format, regions=None, jdbc_jar='postgresql-42.2.12.jar', groupby=None, useSQL=False, minCount=-1, tumorType=None, filter=None):

    # Benchmark notes, SQL vs. MINE8: 388 [1h], 1507 (25 min), 1018 [bin=20], (1h, no bins),
    # 1101 (5 bins), 994 [100], 952 [200], 916 (CTCF), 941 [41], 590 (ETS1)
    # 3h13 geco, 4h37 genomic

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")

    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"
    print("USING "+str(numPartitions)+" PARTITIONS (-1:AUTO).")
    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"

    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: "+cores)
    print("Using memory: "+memory)
    print("Using partitions: "+str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    if tumorType:
        print("Tumor Type id: "+str(tumorType))
    if filter:
        print("Filter count: "+str(len(filter)))
    print("#############################")

    spark = SparkSession.builder \
        .master("local["+cores+"]") \
        .appName("Mutations-Regions Intersect") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]


    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)]
        )

        mutations = spark.read.format("csv").option("header", "true").schema(customSchema).load(fs_db_dir + "/"+mutation_table_name)

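    # Optional row-level filters: keep only mutations of the requested tumor type and,
    # if a donor-id filter is given, only mutations from those donors.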
    def filter1(x):
        return int(x["tumor_type_id"]) == int(tumorType)

    def filter2(x):
        return int(x["donor_id"]) in filter

    if tumorType:
        mutations = mutations.rdd.filter(filter1)
        if filter is not None:
            mutations = mutations.filter(filter2)

        if mutations.isEmpty():
            return []
        else:
            mutations = mutations.toDF()



    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    if sparkDebug:
        print("############ mutations ==> ", mutations.count())
        print("############ regions   ==>", regions_df.count())

    # SQL-based intersection: join mutations and regions directly in Spark SQL.
    if useSQL:
        mutations.registerTempTable("mutations")
        regions_df.registerTempTable("regions")

        sql_res = spark.sql(
            "SELECT m.tumor_type_id, m.trinucleotide_id_r, count(*) AS mutation_count "
            "FROM mutations AS m, regions AS r "
            "WHERE m.chrom = r.chrom AND m.position >= r.pos_start AND m.position <= r.pos_stop "
            "GROUP BY m.tumor_type_id, m.trinucleotide_id_r")

        # Keep the result as an RDD; it is collected once at the end of the function.
        res = sql_res.rdd.map(lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["mutation_count"]])

    else:

        # (An earlier variant grouped the collected regions by chromosome and sorted each
        # group separately; the current code sorts the full region list once and broadcasts it.)

        regions = regions_df.collect()
        regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))

        print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))
        regions_broadcast = sc.broadcast(regions)
        print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))


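        # Merge-style scan over one partition: walk the partition's mutations and the
        # broadcast regions, both sorted by position, and keep each mutation that falls
        # inside a region's [pos_start, pos_stop] interval on the same chromosome.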
        def partitionWork(p):

            localMutations = list(p)
            matched = []

            if sparkDebug:
                print("====> PROCESSING PARTITION AFTER (S)  %s" % (time.time() - start_time))

            if localMutations:

                import copy
                localRegions = copy.deepcopy(regions_broadcast.value)

                if localRegions:
                    sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                    sorted_regions = localRegions

                    cur_reg_idx = 0
                    cur_mut_idx = 0

                    while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):

                        cur_reg = sorted_regions[cur_reg_idx]
                        cur_mut = sorted_mutations[cur_mut_idx]

                        if cur_mut["position"] < cur_reg["pos_start"]:
                            cur_mut_idx += 1
                        elif cur_mut["position"] <= cur_reg["pos_stop"]:
                            if cur_reg["chrom"] == cur_mut["chrom"]:
                                matched.append(cur_mut)
                            else:
                                # look ahead
                                next_region_index = cur_reg_idx + 1
                                while next_region_index < len(sorted_regions) and sorted_regions[next_region_index][
                                    "pos_start"] <= cur_mut["position"]:
                                    if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                            sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                        matched.append(cur_mut)
                                    next_region_index = next_region_index + 1

                            cur_mut_idx += 1
                        else:
                            cur_reg_idx += 1

            return matched


        #if numPartitions > 0:
        #    res = mutations.rdd.groupBy(lambda e: e["chrom"],numPartitions=numPartitions).flatMap(partitionWork)
        #else:
        #    res = mutations.rdd.groupBy(lambda e: e["chrom"]).flatMap(partitionWork)

        if sparkDebug:
            print("#### NUM PARTITIONS: ", mutations.rdd.getNumPartitions)

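        # Run the merge scan independently on each mutation partition.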
        res = mutations.rdd.mapPartitions(partitionWork)


        if sparkDebug:
            print("############ results ==> ", res.count())

        # Grouping
        #todo: if empty
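        # Aggregate the matched mutations by the requested key(s); output_format maps each
        # grouped row to the caller's output structure.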
        if groupby:
            if minCount == -1:
                res = res.toDF().groupBy(groupby).count().rdd.map(output_format)
            else:
                res_df = res.toDF().groupBy(groupby).count()
                res = res_df.filter(res_df["count"] > minCount).rdd.map(output_format)

            if sparkDebug:
                print("############ results after grouping ==> ", res.count())

    res = res.collect()
    sc.stop()

    print("Spark execution took %s seconds ---" % (time.time() - start_time))

    return res
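
A minimal invocation sketch for spark_intersect, assuming a reachable PostgreSQL instance; the connection values and table names below are placeholders, not values from the original code.

# Hypothetical connection settings; replace with real values.
DB_CONF = {
    "postgres_user": "mutviz",
    "postgres_pw": "secret",
    "postgres_host": "localhost",
    "postgres_port": "5432",
    "postgres_db": "mutviz",
}

# Count matched mutations per (tumor_type_id, trinucleotide_id_r) pair.
counts = spark_intersect(
    "mutations",      # hypothetical mutation table name
    "regions_ctcf",   # hypothetical regions table name
    DB_CONF,
    output_format=lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["count"]],
    groupby=["tumor_type_id", "trinucleotide_id_r"])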
Example #2
def spark_intersect2(mutation_table_name, regions_table_name, DB_CONF, jdbc_jar='postgresql-42.2.12.jar'):

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")

    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"
    print("USING "+str(numPartitions)+" PARTITIONS (-1:AUTO).")
    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"

    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: "+cores)
    print("Using memory: "+memory)
    print("Using partitions: "+str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    print("#############################")

    spark = SparkSession.builder \
        .master("local["+cores+"]") \
        .appName("Mutations-Regions Intersect") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]


    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)]
        )

        mutations = spark.read.format("csv").option("header", "true").schema(customSchema).load(fs_db_dir + "/"+mutation_table_name)

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    regions = regions_df.collect()
    regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))

    print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))
    regions_broadcast = sc.broadcast(regions)
    print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))


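    # Same merge-style scan as in spark_intersect: match each partition's mutations,
    # sorted by position, against the broadcast, position-sorted regions.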
    def partitionWork(p):

        localMutations = list(p)
        matched = []

        if sparkDebug:
            print("====> PROCESSING PARTITION AFTER (S)  %s" % (time.time() - start_time))

        if localMutations:

            import copy
            localRegions = copy.deepcopy(regions_broadcast.value)

            if localRegions:
                sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                sorted_regions = localRegions

                cur_reg_idx = 0
                cur_mut_idx = 0

                while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):

                    cur_reg = sorted_regions[cur_reg_idx]
                    cur_mut = sorted_mutations[cur_mut_idx]

                    if cur_mut["position"] < cur_reg["pos_start"]:
                        cur_mut_idx += 1
                    elif cur_mut["position"] <= cur_reg["pos_stop"]:
                        if cur_reg["chrom"] == cur_mut["chrom"]:
                            matched.append(cur_mut)
                        else:
                            # look ahead
                            next_region_index = cur_reg_idx + 1
                            while next_region_index < len(sorted_regions) and sorted_regions[next_region_index][
                                "pos_start"] <= cur_mut["position"]:
                                if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                        sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                    matched.append(cur_mut)
                                next_region_index = next_region_index + 1

                        cur_mut_idx += 1
                    else:
                        cur_reg_idx += 1

        return matched

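    # Run the scan on every mutation partition and collect the matched mutations to the
    # driver as a pandas DataFrame.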
    res = mutations.rdd.mapPartitions(partitionWork).toDF().toPandas()


    print("Spark execution took %s seconds ---" % (time.time() - start_time))

    return res
Example #3
# Note: this example is truncated at the top; it assumes the usual imports (json, shutil,
# pyspark's SparkContext/SQLContext/DataFrameReader, kafka-python's KafkaProducer) and a
# download_file(url, path) helper defined earlier. The JSON field mapping in as_json was
# cut off, so Row.asDict() stands in for it here.
def as_json(row):
    try:
        return json.dumps(
            row.asDict(),
            sort_keys=False)
    except BaseException:
        return None


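# Download the OpenFlights airports dataset, load it with Spark, and publish each row
# to a Kafka topic as a JSON message.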
try:
    data_file = "airports.dat"
    data_file_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
    download_file(data_file_url, data_file)
    kafka_url = "172.18.0.100:9092"
    kafka_topic = "airports"

    sc = SparkContext("local", "example-of-processing-data")
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    airports = DataFrameReader(sqlContext).csv(data_file)

    producer = KafkaProducer(bootstrap_servers=kafka_url)
    #for index in range(1, airports.count()):
    #    print(to_json(airports.take(index)[0]))
    for each in airports.collect():
        # logging.debug(as_json(each))
        payload = as_json(each)
        if payload is not None:
            # kafka-python expects bytes, so encode the JSON string before sending.
            producer.send(kafka_topic, payload.encode("utf-8"))

finally:
    producer.close()
    sc.stop()
    shutil.os.remove(data_file)