def getSC(appName='fea'):
    sconf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false") \
        .set("spark.akka.frameSize", "1000") \
        .set("spark.kryoserializer.buffer.max", "1000")
    sc = SparkContext(appName=appName, conf=sconf)
    sc.addPyFile("fea.py")
    return sc
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    # is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    if yb_dist_tests.global_conf.build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', default_spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]
    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))
    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path where docking list file will be saved
    path_to_save = str(sys.argv[1])

    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_crud.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))

    # **************** Loading Ligand Database
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    rdd_database = load_database(sc, ligand_database)
    # Creating Dataframe
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish

    # Creating input files for performing virtual screening
    creating_docking_list(path_to_save, config, sqlCtx)
def getSC(appName='aux'):
    sconf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false") \
        .set("spark.akka.frameSize", "2000") \
        .set("spark.kryoserializer.buffer.max", "2000")
    sc = SparkContext(appName=appName, conf=sconf)
    sc.addPyFile("src/data_loader.py")
    sc.addPyFile("src/common.py")
    return sc
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Ligand Database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    start_time = datetime.now()

    # **************** Loading file that contains all scores
    score_file_name = os.path.join(path_analysis, get_file_name_sorted_energy())
    text_file = sc.textFile(score_file_name)

    # Splitting score file by \t
    header = text_file.first()  # extract header
    rdd_vs_score_sorted_split = text_file.filter(lambda x: x != header)  # filter out header
    rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
    rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(
        lambda p: Row(energy=float(p[0]),
                      pose=str(p[1]),
                      ligand=get_ligand_from_receptor_ligand_model(p[1])))
    # Creating Vina Dataframe based on score file
    vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)
    vina_table.registerTempTable("vina")
    # **************** Finish

    # **************** Loading Ligand Database
    rdd_database = load_database(sc, ligand_database)
    # Creating Dataframe
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish

    # Computing ligand efficiency
    ligand_efficiencyRDD = sqlCtx.sql(
        "SELECT vina.pose, vina.energy as affinity, "
        "(vina.energy / database.heavyAtom) as lig_efficiency "
        "FROM database JOIN vina ON vina.ligand = database.ligand "
        "ORDER BY vina.energy")
    ligand_efficiencyRDD = ligand_efficiencyRDD.map(
        lambda p: (p.pose, p.affinity, p.lig_efficiency)).collect()

    # Saving ligand efficiency file
    save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)

    finish_time = datetime.now()

    save_ligand_efficiency_log(finish_time, start_time)
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc = SparkContext(conf=conf)
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    # As argument Spark Context and batch retention
    ssc = StreamingContext(sc, 10)
    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)
    # return streaming spark context
    return ssc
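# A minimal usage sketch (assumption, not part of the original snippet): a factory like
# functionToCreateContext is normally handed to StreamingContext.getOrCreate, so the job
# rebuilds its state from CHECKPOINT_DIR after a driver restart instead of always creating
# a fresh context.
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext)
ssc.start()
ssc.awaitTermination()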
def sc(self):  # noqa
    if not self._spark_context:
        spark_context = SparkContext(conf=self.spark_config)
        assert self.spex_conf.spex_file is not None, \
            "The spex builder must be broken; I do not know my spex conf!"
        spark_context.addFile(self.spex_conf.spex_file)

        for py_file in self.spex_conf.spark_config.py_files:
            spark_context.addPyFile(py_file)

        for file in self.spex_conf.spark_config.files:  # noqa
            spark_context.addFile(file)

        for jar in self.spex_conf.spark_config.jars:  # noqa
            spark_context.addFile(jar)

        self._spark_context = spark_context
        print_banner(self)
    return self._spark_context
def main():
    # master = 'local[2]'
    master = 'spark://192.168.9.164:7077'
    app_name = 'test-broadcast'
    # spark_home = '/data01/app/bigdata/spark'  # local
    spark_home = '/home/hadoop/app/spark'  # test
    pyFiles = ['mysql_utils.py']

    spark_conf = SparkConf()
    spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
    sc = SparkContext(conf=spark_conf)
    for path in (pyFiles or []):
        sc.addPyFile(path)

    external_cache = get_api_deviceinfo()
    deviceinfo_b = sc.broadcast(external_cache)

    sc.stop()
def main():
    sc = SparkContext()

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Broadcast
    path_analysis = config.get('DEFAULT', 'path_analysis')
    path_save_log = config.get('DEFAULT', 'path_save_log')
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    # Checking path_analysis
    if not os.path.exists(path_analysis):
        os.makedirs(path_analysis)
    else:
        if len(os.listdir(path_analysis)) > 0:
            raise EnvironmentError("Analysis directory contains files ")

    # preparing log list
    list_obj_log = []
    log_files = get_files_log(path_save_log)
    for flog in log_files:
        list_obj_log.append(flog)

    # applying map and collect
    logRDD = sc.parallelize(list_obj_log)
    all_lines_dic = logRDD.map(build_log_lines).collect()

    # creating a dictionary from the returned rdd
    dict_from_rdd = create_dictionary_from_rdd(all_lines_dic)

    # sorting dictionary
    sorted_dict_list = sorted(dict_from_rdd.items(), key=operator.itemgetter(1))

    # saving energy file
    create_file_by_sorted_energy(path_analysis, sorted_dict_list)
def getSC(master, name):
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(name)
            # .set("spark.executor.memory", "1g")
            .set("spark.akka.frameSize", "512")
            )
    sc = SparkContext(conf=conf)
    sc.addPyFile('default.py')
    sc.addPyFile('segment.py')
    sc.addPyFile('radix.py')
    sc.addPyFile('partition.py')
    sc.addPyFile('bwt.py')
    return sc
def main():
    try:
        partitions_num = int(sys.argv[1])
        csv_filename = sys.argv[2]
        base_dir = sys.argv[3]
        attrs_to_save = sys.argv[4:]
    except:
        logger.error("Usage: ./mmsongsdb_to_csv.py <partitions_num> <csv_filename> <directory> [<attrs_to_save>]")
        sys.exit(1)

    sc = SparkContext(appName="mmSongtoCSV")
    sc.addPyFile("/root/mm-songs-db-tools-master2/hdf5_getters.py")
    sc.addPyFile("/root/mm-songs-db-tools-master2/mmsongsdbtocsvconverter.py")

    converter = MMSongsDbToCsvConverter(csv_filename, attrs_to_save)
    file_list = filter(lambda s: s.endswith(".h5"),
                       ["%s%s%s" % (root, os.sep, file)
                        for root, dirs, files in os.walk(base_dir)
                        for file in files])
    file_partitions = sc.parallelize(file_list, partitions_num)
    rdd = file_partitions.map(converter._handle_h5_file)
    # print rdd.count()
    rdd.saveAsTextFile(csv_filename)
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Number of poses to select by buried area
    number_poses_to_select_mult_obj = int(config.get('DRUGDESIGN', 'number_poses_to_select_mult_obj'))
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # File for saving the filtered buried area
    result_file_to_select_buried_area = config.get('DRUGDESIGN', 'result_file_to_select_buried_area')
    # File for saving the filtered buried area only poses
    result_file_to_select_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_buried_area_only_pose')
    result_file_to_select_normalized_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_normalized_buried_area_only_pose')
    # Ligand Database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    # Path where all pdb receptors are
    path_receptor = config.get('DEFAULT', 'pdb_path')
    # Path for saving pdb files of models generated by VS
    path_ligand = get_directory_pdb_analysis(path_analysis)

    # Path where the selected complexes are saved
    path_to_save = os.path.join(path_analysis, "mult_objective")
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)

    # Adding Python Source file
    # Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "pdb_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))

    start_time = datetime.now()

    finish_time = datetime.now()

    save_log(finish_time, start_time)
def main(): sc = SparkContext() sqlCtx = SQLContext(sc) config = configparser.ConfigParser() config.read('config.ini') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Ligand Database file ligand_database = config.get('DEFAULT', 'ligand_database_path_file') #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') #Adding Python Source file sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"hydrogen_bond_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"hydrogen_bond_crud.py")) #Sufix of completly data file full_data_file_name = config.get('DRUGDESIGN', 'full_data_file_name') start_time = datetime.now() #**************** Loading file that contains all scores and ligand efficiency score_file_name = os.path.join(path_analysis, "summary_energies.dat") text_file = sc.textFile(score_file_name) header = text_file.first() #extract header #Spliting score file by \t rdd_vs_score_sorted_split = text_file.filter(lambda x:x !=header).map(lambda line: line.split("\t")) #rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), energy=float(p[3]) )) rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(affinity=float(p[0]), ligand_efficiency=float(p[1]), pose=str(p[2]) )) #Creating Vina Datafrase based on score file vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted) vina_table.registerTempTable("vina_lig_efficiency") #**************** Finish #**************** Loading Ligand Database rdd_database = load_database(sc, ligand_database) #Creating Dataframe database_table = sqlCtx.createDataFrame(rdd_database) database_table.registerTempTable("database") #**************** Finish #**************** Loading Buried Area total buried_area_file_name = os.path.join(path_analysis,"summary_buried_areas_total.dat") buried_area_file = sc.textFile(buried_area_file_name) #Spliting file by \t header = buried_area_file.first() #extract header rdd_buried_area_split = buried_area_file.filter(lambda x:x !=header).map(lambda line: line.split("\t")) #rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), buried_lig_rec=float(p[3]), buried_lig_rec_perc=float(p[4]), buried_lig_lig_perc=float(p[5]) )) rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( buried_area_total=float(p[0]), pose=str(p[1]) )) #Creating buried Dataframe buried_table = sqlCtx.createDataFrame(rdd_buried_area) buried_table.registerTempTable("buriedArea_total") #**************** Finish #**************** Loading Buried Area receptor buried_area_file_name = os.path.join(path_analysis,"summary_buried_areas_receptor.dat") buried_area_file_receptor = sc.textFile(buried_area_file_name) header = buried_area_file_receptor.first() #extract header #Spliting file by \t buried_area_file_receptor_split = buried_area_file_receptor.filter(lambda x:x !=header).map(lambda line: line.split("\t")) buried_area_file_receptor = buried_area_file_receptor_split.map(lambda p: Row( buried_area_receptor=float(p[0]), pose=str(p[1]) )) #Creating buried Dataframe buried_area_file_receptor_table = sqlCtx.createDataFrame(buried_area_file_receptor) buried_area_file_receptor_table.registerTempTable("buried_area_receptor") #**************** Finish #**************** Loading Buried Area ligand buried_area_file_name = 
os.path.join(path_analysis,"summary_buried_area_ligand.dat") buried_area_file_ligand = sc.textFile(buried_area_file_name) header = buried_area_file_ligand.first() #extract header #Spliting file by \t buried_area_file_ligand_split = buried_area_file_ligand.filter(lambda x:x !=header).map(lambda line: line.split("\t")) buried_area_file_ligand = buried_area_file_ligand_split.map(lambda p: Row( buried_area_lig=float(p[0]), buried_area_lig_perc=float(p[1]), buried_area_lig_lig_perc=float(p[2]), pose=str(p[3]) )) #Creating buried Dataframe buried_area_file_ligand_table = sqlCtx.createDataFrame(buried_area_file_ligand) buried_area_file_ligand_table.registerTempTable("buried_area_ligand") #**************** Finish #**************** Loading Hydrogen Bond hydrogen_bond_num_pose_file_name = os.path.join(path_analysis,"summary_hbonds_4.0A_30.0deg.dat") rdd_hydrogen_bond = load_file_summary_hbonds(sc, hydrogen_bond_num_pose_file_name) #Creating buried Dataframe hydrogen_bond_table = create_df_hydrogen_bond(sqlCtx, rdd_hydrogen_bond) #**************** Finish #Creating SQL command sql = "" sql = "SELECT vina_lig_efficiency.pose, vina_lig_efficiency.affinity, vina_lig_efficiency.ligand_efficiency" sql +=" ,buriedArea_total.buried_area_total" sql +=" ,buried_area_receptor.buried_area_receptor" sql +=" ,buried_area_ligand.buried_area_lig, buried_area_ligand.buried_area_lig_perc, buried_area_ligand.buried_area_lig_lig_perc " sql +=" ,hydrogenbond.numHydroBond " sql +=" FROM vina_lig_efficiency" sql +=" JOIN buriedArea_total ON buriedArea_total.pose = vina_lig_efficiency.pose" sql +=" JOIN buried_area_receptor ON buried_area_receptor.pose = vina_lig_efficiency.pose" sql +=" JOIN buried_area_ligand ON buried_area_ligand.pose = vina_lig_efficiency.pose" sql +=" LEFT OUTER " sql +=" JOIN hydrogenbond ON hydrogenbond.pose = vina_lig_efficiency.pose" sql +=" ORDER BY vina_lig_efficiency.pose" #Getting all data full_dataRDD = sqlCtx.sql(sql) full_dataRDD = full_dataRDD.map(lambda p: (p.affinity, p.ligand_efficiency, p.numHydroBond, p.buried_area_lig, p.buried_area_lig_perc, p.buried_area_lig_lig_perc, p.buried_area_total, p.buried_area_receptor, p.pose) ).collect() #Saving file save_vs_full_data(path_analysis, full_dataRDD, full_data_file_name) finish_time = datetime.now() save_vs_full_data_analysis_log(finish_time, start_time)
cf = ConfRead(conf_path)


def getPara(paraName, section="PARA"):
    try:
        return cf.get(section, paraName)
    except Exception, e:
        logging.error("Fail to get para[%s]: %s" % (paraName, e))
        return None
        # sys.exit(1)


code_path = getPara("code_path")
logging.info(code_path)

sc.addPyFile(code_path + '/preprocess/FeatureManager.py')
sc.addPyFile(code_path + '/preprocess/warm_start.py')
sc.addPyFile(code_path + '/preprocess/__init__.py')
sc.addPyFile(code_path + '/optimize/olbfgs.py')
sc.addPyFile(code_path + '/optimize/__init__.py')
sc.addPyFile(code_path + '/trainer.py')
sc.addPyFile(code_path + '/__init__.py')

from FeatureManager import *
from olbfgs import *
from warm_start import set_first_intercept, set_first_intercept_spark

#####################################
data_path = cf.get("PARA", "data_path")
max_iter = int(getPara('max_iter'))
def main(): # 解析配置 app_id = int(sys.argv[1]) master = sys.argv[2] app_name = sys.argv[3] # 应用配置 assert APP_CONFIG.get(app_id) is not None, \ '[myapp streaming_app_main.main()] configuration error invalid APP_CONFIG with app.id = ' + str(app_id) app_conf = map_conf_properties(APP_CONFIG.get(app_id), 'app.id')[app_id] spark_home = app_conf['sparkHome'] pyFiles = app_conf['pyFiles.list'] di_id = app_conf.get('app.interfaceId') # 数据接口配置 di_in_conf_with_ds_conf = get_di_conf_with_ds_conf( di_id, DATAINTERFACE_CONFIG, DATASOURCE_CONFIG, di_key='interface.id', di_ds_key='interface.sourceId', ds_key='source.id', merge_key_name='interface.id' )[di_id] print('= = ' * 20, type(di_in_conf_with_ds_conf), 'di_in_conf_with_ds_conf = ') pprint(di_in_conf_with_ds_conf) schema_conf_string = di_in_conf_with_ds_conf['schema'] struct_type = generate_df_schmea(schema_conf_string) # schema_field_list = [x.name for x in struct_type.fields] di_in_conf_with_ds_conf['struct.type'] = struct_type # di_in_conf_with_ds_conf['struct.field.list'] = schema_field_list di_out_confs = [kv for kv in DATAINTERFACE_CONFIG.iteritems() if kv[1].get('interface.type', '') == 'output'] print('= = ' * 20, type(di_out_confs), 'di_out_confs = ') pprint(di_out_confs) di_out_confs_with_ds_conf = list_dict_merge( [get_di_conf_with_ds_conf( kv[0], DATAINTERFACE_CONFIG, DATASOURCE_CONFIG, di_key='interface.id', di_ds_key='interface.sourceId', ds_key='source.id', merge_key_name='interface.id') for kv in DATAINTERFACE_CONFIG.iteritems() if kv[1].get('interface.type', '') == 'output'] ) print('= = ' * 20, type(di_out_confs_with_ds_conf), 'di_out_confs_with_ds_conf = ') pprint(di_out_confs_with_ds_conf) # 外部缓存配置 cache_confs_with_ds_conf = list_dict_merge( [get_di_conf_with_ds_conf( kv[0], CACHE_CONFIG, DATASOURCE_CONFIG, di_key='cache.id', di_ds_key='cache.sourceId', ds_key='source.id', merge_key_name='cache.id') for kv in CACHE_CONFIG.iteritems()] ) print('= = ' * 20, type(cache_confs_with_ds_conf), 'cache_confs_with_ds_conf = ') pprint(cache_confs_with_ds_conf) # 指定输入接口准备阶段的配置 # 准备阶段配置中有效步骤的配置 # Note: 对 dict 进行 filter,传给function的参数是 dict 的 key prepares_config_active = PREPARES_CONFIG[di_id] \ if PREPARES_CONFIG.get(di_id, {}).get('prepares.enabled', False) else {} # print('= = ' * 20, type(prepares_config_active), 'prepares_config_active = ') # pprint(prepares_config_active) # TODO: 2中方法的结果==测试False, 删除注释 # prepares_config_active_steps = filter( # lambda step_conf: step_conf[1].get('step.enabled', False), # map(lambda step_conf: (step_conf[0], map_conf_properties(step_conf[1])), # prepares_config_active.get('steps', {}).iteritems() # ) # ) prepares_config_active_steps = \ [(k, map_conf_properties(v)) for k, v in prepares_config_active.get('steps', {}).iteritems() if v.get('step.enabled', False)] print('= = ' * 20, type(prepares_config_active_steps), 'prepares_config_active_steps = ') pprint(prepares_config_active_steps) # 指定输入接口计算阶段的配置 # filter 之后变成 list,list 的每个元素是 tuple(computeStatistics.id, computeStatistics.conf_dict) computes_config_active = COMPUTES_CONFIG[di_id] \ if COMPUTES_CONFIG.get(di_id, {}).get('computeStatistics.enabled', False) else {} # list[{computeStatistic.id: {conf}}, ...] 
# # TODO: 2中方法的结果==测试False, 删除注释 # compute_computeStatistics_config_active = filter( # lambda computeStatistic_conf: computeStatistic_conf[1].get('computeStatistic.enabled', False), # computes_config_active.get('computeStatistics', {}).iteritems()) compute_computeStatistics_config_active = [ kv for kv in computes_config_active.get('computeStatistics', {}).iteritems() if kv[1].get('computeStatistic.enabled', False)] print('= = ' * 20, type(compute_computeStatistics_config_active), 'compute_computeStatistics_config_active = ') pprint(compute_computeStatistics_config_active) # {computeStatistic.id -> list[step_conf_tuple]}, 其中 step_conf_tuple = (step_id, step_conf_dict) compute_prepares_config_active = dict(map( lambda computeStatistic_conf: (computeStatistic_conf[0], sorted(list_dict_merge( map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'), filter( lambda step_conf: step_conf[1].get('step.enabled', False), computeStatistic_conf[1].get('prepares.steps', {}).iteritems()) )).iteritems()) ), compute_computeStatistics_config_active)) # print('= = ' * 30, compute_prepares_config_active2 == compute_prepares_config_active) print('= = ' * 20, type(compute_prepares_config_active), 'compute_prepares_config_active = ') pprint(compute_prepares_config_active) compute_computes_config_active = dict(map( lambda computeStatistic_conf: (computeStatistic_conf[0], sorted(list_dict_merge( map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'), filter(lambda step_conf: step_conf[1].get('step.enabled', False), computeStatistic_conf[1].get('computes.steps', {}).iteritems()) )).iteritems()) ), compute_computeStatistics_config_active)) print('= = ' * 20, type(compute_computes_config_active), 'compute_computes_config_active = ') pprint(compute_computes_config_active) test_flag = False if not test_flag: # 初始化 # 测试 serializer # serializer 默认取值 PickleSerializer() #UnpicklingError: invalid load key, '{'. 
# serializer=MarshalSerializer() # ValueError: bad marshal data # serializer=AutoSerializer() # ValueError: invalid sevialization type: { # serializer=CompressedSerializer(PickleSerializer()) # error: Error -3 while decompressing data: incorrect header check # sc = SparkContext(master, app_name, sparkHome = spark_home, pyFiles=pyFiles) # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=MarshalSerializer()) # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=AutoSerializer()) # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=CompressedSerializer(PickleSerializer())) spark_conf = SparkConf() spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home) # spark streaming 调优配置 spark_streaming_blockInterval = str(app_conf.get('spark.streaming.blockInterval', '')).strip() if spark_streaming_blockInterval: spark_conf.set('spark.streaming.blockInterval', spark_streaming_blockInterval) spark_streaming_kafka_maxRatePerPartition = str( app_conf.get('spark.streaming.kafka.maxRatePerPartition', '')).strip() if spark_streaming_kafka_maxRatePerPartition: spark_conf.set('spark.streaming.kafka.maxRatePerPartition', spark_streaming_kafka_maxRatePerPartition) spark_streaming_receiver_maxRate = str(app_conf.get('spark.streaming.receiver.maxRate', '')).strip() if spark_streaming_receiver_maxRate: spark_conf.set('spark.streaming.receiver.maxRate', spark_streaming_receiver_maxRate) spark_streaming_concurrentJobs = str(app_conf.get('spark.streaming.concurrentJobs', '')).strip() if spark_streaming_concurrentJobs: spark_conf.set('spark.streaming.concurrentJobs', spark_streaming_concurrentJobs) # spark sql 调优配置 spark_sql_shuffle_partitions = str(app_conf.get('spark.sql.shuffle.partitions', '')).strip() if spark_sql_shuffle_partitions: spark_conf.set('spark.sql.shuffle.partitions', spark_sql_shuffle_partitions) sc = SparkContext(conf=spark_conf) for path in (pyFiles or []): sc.addPyFile(path) # 外部缓存优化,broadcast 分发 cache_manager = CacheManager() cache_broadcast_list = \ [(cache_id, cache_manager.cache_dataset(sc, cache_conf)) for cache_id, cache_conf in cache_confs_with_ds_conf.iteritems() if cache_conf.get('broadcast.enabled', False)] for cache_id, cache_broadcast in cache_broadcast_list: cache_confs_with_ds_conf[cache_id]['broadcast'] = cache_broadcast batchDruationSeconds = app_conf['batchDuration.seconds'] ssc = StreamingContext(sc, batchDruationSeconds) sqlc = SQLContext(sc) # 读取数据源 stream = StreamingReader.readSource(ssc, di_in_conf_with_ds_conf, app_conf) # 流处理: 1 根据配置初始化处理指定数据接口的类的实例, 2 调用指定处理类实例的流数据处理方法 # 测试 kafka_wordcount # counts = stream.flatMap(lambda line: line.split(" ")) \ # .map(lambda word: (word, 1)) \ # .reduceByKey(lambda a, b: a+b) # counts.pprint() StreamingApp.process( stream, sc, sqlc, di_in_conf_with_ds_conf, di_out_confs_with_ds_conf, cache_confs_with_ds_conf, prepares_config_active_steps, compute_prepares_config_active, compute_computes_config_active) ssc.start() ssc.awaitTermination()
# Average daily taxi speed
from pyspark import SparkContext, StorageLevel, SparkConf
import os

conf = (SparkConf()
        .setMaster("local")
        .setAppName("My nyc taxi app")
        .set("spark.executor.memory", "1g"))
sc = SparkContext(conf=conf)

root_dir = ''
sc.addPyFile(root_dir + 'utils.py')
from utils import parse_taxi_record_avg_speed

# Load Data
raw_data_url = "data/trips-subset.csv"
raw_data = sc.textFile(raw_data_url)

trips = raw_data.map(parse_taxi_record_avg_speed).reduceByKey(
    lambda a, b: a + b
)
trips.persist(StorageLevel.MEMORY_AND_DISK)

# Number of trips
print trips.count()

trips_avg_speed_grouped = trips.map(lambda ((r, c, t), s): (t[6:8], s)).groupByKey()
trips_avg_speed_daily = trips_avg_speed_grouped.map(lambda x: (x[0], round(sum(x[1]) / len(x[1]))))

# average daily taxi speed
print trips_avg_speed_daily.sortByKey().collect()
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors

sc = SparkContext()
sc.addPyFile("./dbSparkFinalClass.py")
from dbSparkFinalClass import *


def main():
    data = sc.textFile("./inputData.csv")
    parsedData = data.map(lambda s: Vectors.dense([float(i) for i in s.split(',')])).cache()
    dbScan = ParallelDBScan()
    trainResult = dbScan.train(parsedData, 3, 3, 4)
    for clus in trainResult:
        print(str(clus[0][0]) + "," + str(clus[0][1]) + "," + clus[1])
    sc.stop()


main()
def main(): sc = SparkContext() sqlCtx = SQLContext(sc) config = configparser.ConfigParser() config.read('config.ini') # Path for gromacs spark project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') # Adding Python Source file sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "os_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "basic_analysis.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "md_description.py")) # Path for gromacs program gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path')) time_dt = int(config.get('GROMACS_ANALYSIS', 'time_dt')) time_dt_pdb = int(config.get('GROMACS_ANALYSIS', 'time_dt_pdb')) water_layer_thickness = int(config.get('GROMACS_ANALYSIS', 'water_layer_thickness')) # File that contains all md to create the trajectory file_of_md_analysis = sys.argv[1] check_file_exists(file_of_md_analysis) start_time = datetime.now() # Broadcast gromacs_path = sc.broadcast(gromacs_path) time_dt = sc.broadcast(time_dt) time_dt_pdb = sc.broadcast(time_dt_pdb) water_layer_thickness = sc.broadcast(water_layer_thickness) # ********************* STARTING FUNCTION *************************** def run_trajetory(md_obj): ana_dir = os.path.join(md_obj.get_path(), "analysis") make_directory(ana_dir) # Original file names from the simulation reference_xtc = os.path.join(md_obj.get_path(), md_obj.get_simulation_prefix() + ".xtc") reference_tpr = os.path.join(md_obj.get_path(), md_obj.get_simulation_prefix() + ".tpr") # File names after trajectory treatment. allatom_xtc = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_fit.", str(md_obj.get_repetion_number()), ".xtc"])) allatom_tpr = reference_tpr nonwater_xtc = os.path.join(ana_dir,"".join([md_obj.get_prefix_ref(), "_non-water.", str(md_obj.get_repetion_number()), ".xtc"])) nonwater_tpr = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_non-water.", str(md_obj.get_repetion_number()), ".tpr"])) nonwater_pdb = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_non-water.", str(md_obj.get_repetion_number()), ".pdb"])) waterlayer_pdb = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_water-", str(water_layer_thickness.value), "A-layer.", str(md_obj.get_repetion_number()), ".pdb"])) # Trajectory treatment to remove PBC artifacts xtc_whole = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_whole.", str(md_obj.get_repetion_number()), ".xtc"])) command = "".join(["echo System | ", gromacs_path.value, "./gmx trjconv ", "-f ", reference_xtc, " -s ", reference_tpr, " -pbc whole", " -o ", xtc_whole, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Extracting first frame gro_first_frame = os.path.join(ana_dir, "".join(["0.", str(md_obj.get_repetion_number()), ".gro"])) command = "".join(["echo System | ", gromacs_path.value, "./gmx trjconv ", "-f ", xtc_whole, " -s ", reference_tpr, " -e 0.1 ", " -o ", gro_first_frame, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Removing jumps xtc_nojump = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_nojump.", str(md_obj.get_repetion_number()), ".xtc"])) command = "".join(["echo System | ", gromacs_path.value, "./gmx trjconv ", "-f ", xtc_whole, " -s ", gro_first_frame, " -pbc nojump ", " -o ", xtc_nojump, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Centering the protein 
xtc_center_protein = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_center.", str(md_obj.get_repetion_number()), ".xtc"])) command = "".join(["echo C-alpha System | ", gromacs_path.value, "./gmx trjconv ", "-f ", xtc_whole, " -s ", gro_first_frame, " -center ", " -o ", xtc_center_protein, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Putting all atoms in a compact box xtc_atoms_box = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_atom.", str(md_obj.get_repetion_number()), ".xtc"])) command = "".join(["echo System | ", gromacs_path.value, "./gmx trjconv ", "-f ", xtc_center_protein, " -s ", gro_first_frame, " -ur compact ", " -pbc atom ", " -o ", xtc_atoms_box, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Fitting the protein command = "".join(["echo C-alpha System | ", gromacs_path.value, "./gmx trjconv ", "-f ", xtc_atoms_box, " -s ", gro_first_frame, " -fit rot+trans ", " -o ", allatom_xtc, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Creating water-free trajectory command = "".join(["echo non-water | ", gromacs_path.value, "./gmx convert-tpr ", " -s ", reference_tpr, " -o ", nonwater_tpr, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() command = "".join(["echo non-water | ", gromacs_path.value, "./gmx trjconv ", "-f ", allatom_xtc, " -s ", gro_first_frame, " -o ", nonwater_xtc, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() command = "".join(["echo system | ", gromacs_path.value, "./gmx trjconv ", " -f ", nonwater_xtc, " -s ", nonwater_tpr, " -o ", nonwater_pdb, " -dt ", str(time_dt_pdb.value), " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Creating water_layer_thickness - A water-layer pdb trajectory t = 0 frame = 0 ndx_water_layer = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_water-layer.", str(md_obj.get_repetion_number()), ".ndx"])) ndx_temporary = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(), "_temporary_", str(md_obj.get_repetion_number()), ".ndx"])) if os.path.isfile(waterlayer_pdb): os.remove(waterlayer_pdb) if os.path.isfile(ndx_water_layer): os.remove(ndx_water_layer) select_string = ('\'"water_layer" (same residue as ((resname SOL and within 0.'"$water_layer_thickness"' of group "Protein"))) or\ (group "Ion" and within 0.'"$water_layer_thickness"' of group "Protein") \ or (group "Protein") \'') select_string = select_string.replace("$water_layer_thickness", str(water_layer_thickness.value)) # Running make_ndx command = "".join(["echo -e ", "'chain z'\"\\n\"'q'\"\\n\" | ", gromacs_path.value, "gmx make_ndx ", "-f ", reference_tpr, " -o ", ndx_temporary, " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Are there ligands? 
if search_for_ligand_ndx_file(ndx_temporary) is True: select_string = (select_string + '\'or (same residue as ((resname SOL and within 0.'"$water_layer_thickness"' of group "Other"))) \ or (group "Ion" and within 0.'"$water_layer_thickness"' of group "Other") \ or (group "Other")\'') select_string = select_string.replace("$water_layer_thickness", str(water_layer_thickness.value)) command = "".join([gromacs_path.value, "gmx select -f ", allatom_xtc, " -s ", allatom_tpr, " -on ", ndx_water_layer, " -select ", select_string, " -dt ", str(time_dt_pdb.value), " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Creating pdb files command = "".join(["echo ", str(frame), " | ", gromacs_path.value, "./gmx trjconv ", "-f ", allatom_xtc, " -s ", allatom_tpr, " -n ", ndx_water_layer, " -o ", "frame_", str(frame), ".pdb ", "-b ", str(t), " -e ", str(t), " >/dev/null 2>/dev/null"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() command = "".join(["echo MODEL ", str(frame), " >> ", waterlayer_pdb]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() command = "".join(["grep ATOM ", "frame_", str(frame), ".pdb ", ">> ", waterlayer_pdb]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() command = "".join(["echo ENDML", ">> ", waterlayer_pdb]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Removing temporary files command = "".join(["rm frame_", str(frame), ".pdb"]) proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() frame = frame + 1 t = t + int(time_dt_pdb.value) if os.path.isfile(xtc_whole): os.remove(xtc_whole) if os.path.isfile(xtc_nojump): os.remove(xtc_nojump) if os.path.isfile(xtc_center_protein): os.remove(xtc_center_protein) if os.path.isfile(xtc_atoms_box): os.remove(xtc_atoms_box) if os.path.isfile(ndx_water_layer): os.remove(ndx_water_layer) if os.path.isfile(gro_first_frame): os.remove(gro_first_frame) command = "rm \#* 2>/dev/null" proc = Popen(command, shell=True, stdout=PIPE) proc.communicate() # Basic Analysis basic_an_data = (gromacs_path.value, nonwater_xtc, nonwater_tpr, md_obj.get_simulation_prefix(), ana_dir, time_dt.value) run_basic_analysis(basic_an_data) # ************************** END FUNCTION ********************************** list_obj_md = load_md_traj(file_of_md_analysis) md_trajRDD = sc.parallelize(list_obj_md) md_trajRDD.foreach(run_trajetory) finish_time = datetime.now() time_execution_log(finish_time, start_time, "gromacs_trajectory.log")
    config_vina = config.get('VINA', 'config_file')
    vina_path = config.get('VINA', 'vina_program')
    pdbqt_ligand_path = config.get('DEFAULT', 'pdbqt_ligand_path')
    pdbqt_receptor_path = config.get('DEFAULT', 'pdbqt_receptor_path')
    path_save_output = config.get('DEFAULT', 'path_save_structure')
    path_save_log = config.get('DEFAULT', 'path_save_log')
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    path_save_log = preparing_path(path_save_log)
    make_directory(path_save_log)

    path_save_output = preparing_path(path_save_output)
    make_directory(path_save_output)

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "docking_description.py"))

    # Broadcast
    vina_path = sc.broadcast(vina_path)
    pdbqt_ligand_path = sc.broadcast(pdbqt_ligand_path)
    pdbqt_receptor_path = sc.broadcast(pdbqt_receptor_path)
    path_save_output = sc.broadcast(path_save_output)
    path_save_log = sc.broadcast(path_save_log)

    sc.addFile(config_vina)

    file_of_vina_docking = sys.argv[1]
    check_file_exists(file_of_vina_docking)

    start_time = datetime.now()

    def run_vina_docking(vd_obj):
    activations = sparkRun(sess, image_filenames)
    return activations, y


start_time = time.time()

cat_image_names_train = os.listdir(str(train_dir) + 'cat/')
dog_image_names_train = os.listdir(str(train_dir) + 'dog/')
random.shuffle(cat_image_names_train)
random.shuffle(dog_image_names_train)

image_filenames = getTrainBatchImages(cat_image_names_train[0:20], dog_image_names_train[0:20])
imageNames = sc.parallelize(image_filenames)

sc.addPyFile("vgg16_cat_and_dog_svm.py")
sc.addPyFile("utils_svm.py")
sc.addFile("vgg16.npy")

train_activations = imageNames.map(activationRun)
train_activations_collect = train_activations.collect()
print('##### train_activations_len: ', len(train_activations_collect))
print('##### train_activations_collect 1: ', train_activations_collect[0][0].shape)
# print('##### train_activations_collect 2: ', train_activations_collect[0][1])
# print('##### train_activations_collect 2: ', train_activations_collect[1][1])
# print('##### train_activations_collect 2: ', train_activations_collect[2][1])
print('########duration: ' + str((time.time() - start_time)))

train_activations_rdd = sc.parallelize(train_activations_collect)
parsedData = train_activations_rdd.map(parsePoint)
    if (longitude > 140 and longitude <= 180):
        result.append("node_7")
        result.append("node_9")
    return result


if __name__ == '__main__':
    import happybase
    # configure the spark environment
    sparkConf = SparkConf().setAppName("Simulating Streamline")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=sparkConf)
    sc.addPyFile("module.zip")
    # from pywebhdfs.webhdfs import PyWebHdfsClient
    distributed_dataset = sc.textFile(
        "hdfs:/user/uacharya/110_Stations_Data_Combined.txt", use_unicode=False, minPartitions=24)
    print("this is the driver container")
    # getting the header of the whole dataset
    header = distributed_dataset.first()
    # filtering the header out of the data
    distributed_dataset = distributed_dataset.filter(lambda d: d != header)
    # mapping the data to prepare for processing
    data_in_required_format = distributed_dataset.map(
        create_required_datewise_data)
    data_in_required_format.cache()
    # collecting keys to do batch processing based on keys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from movie import Movie, clean_movie

es_write_conf = {"es.nodes": 'localhost',
                 "es.port": '9200',
                 "es.resource": 'movies-index/movie',
                 "es.nodes.wan.only": "true"}

conf = SparkConf().setAppName("PythonStreamingDirectKafkaWordCount") \
    .set("es.nodes", "localhost:9200") \
    .set("es.index.auto.create", "true")

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount", conf=conf)
sc.addPyFile("./Spark/movie.py")
ssc = StreamingContext(sc, 5)

# brokers, topic = sys.argv[1:]
kvs = KafkaUtils.createDirectStream(ssc, ["movies"], {"metadata.broker.list": "localhost:9092"})
x = kvs.map(lambda row: row[1]) \
    .map(lambda row: row.split("||")) \
    .map(lambda row: Movie(row[0], float(row[1]), row[2], float(row[3]), float(row[4]),
                           row[5], row[6], row[7], row[8], row[9], row[10], row[11])) \
    .map(lambda obj: clean_movie(obj)) \
    .map(lambda obj: obj.__dict__) \
    .map(lambda obj: (None, obj)) \
    .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
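# Hedged sketch of the usual completion (the remaining arguments are truncated above and are
# assumed here, not taken from the original snippet): the Elasticsearch-Hadoop output format
# also expects a value class plus the es_write_conf dictionary defined earlier, after which
# the streaming context is started.
#
#         valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
#         conf=es_write_conf))
#
# ssc.start()
# ssc.awaitTermination()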
    return rdd


# Add the streaming package and initialize
findspark.add_packages(
    ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.2"])
findspark.init()

TOPICS = ['taiwan']
BROKERS = "localhost:9092"
PERIOD = 10
APP_NAME = 'sentiment'
COMPANY = 'taiwan'

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
sc.addPyFile(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), 'stanfordNLP.py'))
# except:
#     conf = SparkConf().set("spark.default.paralleism", 1)
#     spark = pyspark.sql.SparkSession.builder \
#         .master("local[4]") \
#         .appName(APP_NAME) \
#         .config(conf=conf) \
#         .getOrCreate()
#     sc = spark.sparkContext

# create a streaming context with batch interval 10 sec
ssc = StreamingContext(sc, PERIOD)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, TOPICS, {"metadata.broker.list": BROKERS})
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext("spark://vm1:7077", "StreamProcessing")
sc.addPyFile('/home/tom/Spark-Recommendation-System/db_connector.py')
sc.setLogLevel("ERROR")
from db_connector import DBConnector

TRIGGER_INTERVAL = 30  # in seconds
TOPIC_NAME = 'spark_streaming'
KAFKA_PORT = 'vm1:2181'

db = DBConnector('streaming_db')


def fit_model(df):
    als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId",
              ratingCol="rating", coldStartStrategy="drop")
    model = als.fit(df)
    return model
if __name__ == "__main__": start_time = time.time() if len(sys.argv) != 3: print("Number of arguments not valid!") sys.exit(1) with open('./config.json') as config: parameters = json.load(config)["configuration"][0] INPUT_PATH = str(sys.argv[1]) OUTPUT_PATH = str(sys.argv[2]) sc = SparkContext("yarn", "Kmeans") sc.setLogLevel("ERROR") sc.addPyFile( "./point.py" ) ## It's necessary, otherwise the spark framework doesn't see point.py print("\n***START****\n") points = sc.textFile(INPUT_PATH).map(Point).cache() initial_centroids = init_centroids(points, k=parameters["k"]) distance_broadcast = sc.broadcast(parameters["distance"]) centroids_broadcast = sc.broadcast(initial_centroids) stop, n = False, 0 while True: print("--Iteration n. {itr:d}".format(itr=n + 1), end="\r", flush=True) cluster_assignment_rdd = points.map(assign_centroids) sum_rdd = cluster_assignment_rdd.reduceByKey(lambda x, y: x.sum(y)) centroids_rdd = sum_rdd.mapValues(
def main(): config = configparser.ConfigParser() config.read('config.ini') #Number of poses to select by buried area number_poses_to_select_hydrogen_bond = int( config.get('DRUGDESIGN', 'number_poses_to_select_hydrogen_bond')) # list of residues to select buried area file_select_hydrogen_bond = config.get( 'DRUGDESIGN', 'file_residue_to_select_hydrogen_bond') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Path where all pdb receptor are path_receptor = config.get('DEFAULT', 'pdb_path') #Ligand Database file ligand_database = config.get('DEFAULT', 'ligand_database_path_file') #Path for saving pdb files of models generated by VS path_ligand = get_directory_pdb_analysis(path_analysis) #File for saving the filtered buried area result_file_to_select_hydrogen_bond = config.get( 'DRUGDESIGN', 'result_file_to_select_hydrogen_bond') #File for saving the filtered buried area only poses result_file_to_select_hydrogen_bond_only_pose = config.get( 'DRUGDESIGN', 'result_file_to_select_hydrogen_bond_only_pose') result_file_to_select_normalized_hydrogen_bond_only_pose = config.get( 'DRUGDESIGN', 'result_file_to_select_normalized_hydrogen_bond_only_pose') result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose = config.get( 'DRUGDESIGN', 'result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose') #Path where saved the selected compelex path_to_save = os.path.join("selected_complexo", "hydrogen_bond") path_to_save = os.path.join(path_analysis, path_to_save) if not os.path.exists(path_to_save): os.makedirs(path_to_save) #Path where saved the normalized selected compelex path_to_save_normalized_da = os.path.join( "selected_complexo", "normalized_hydrogen_bond_donors_acceptors") path_to_save_normalized_da = os.path.join(path_analysis, path_to_save_normalized_da) if not os.path.exists(path_to_save_normalized_da): os.makedirs(path_to_save_normalized_da) path_to_save_normalized_heavyAtom = os.path.join( "selected_complexo", "normalized_hydrogen_bond_heavyAtom") path_to_save_normalized_heavyAtom = os.path.join( path_analysis, path_to_save_normalized_heavyAtom) if not os.path.exists(path_to_save_normalized_heavyAtom): os.makedirs(path_to_save_normalized_heavyAtom) #Path where saved the normalized by residue list selected compelex path_to_save_normalized_residue = os.path.join( "selected_complexo", "normalized_hydrogen_bond_residue_donors_acceptors") path_to_save_normalized_residue = os.path.join( path_analysis, path_to_save_normalized_residue) if not os.path.exists(path_to_save_normalized_residue): os.makedirs(path_to_save_normalized_residue) path_to_save_normalized_residue_heavyAtoms = os.path.join( "selected_complexo", "normalized_hydrogen_bond_residue_heavyAtoms") path_to_save_normalized_residue_heavyAtoms = os.path.join( path_analysis, path_to_save_normalized_residue_heavyAtoms) if not os.path.exists(path_to_save_normalized_residue_heavyAtoms): os.makedirs(path_to_save_normalized_residue_heavyAtoms) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) sqlCtx = SQLContext(sc) start_time = datetime.now() #Broadcast path_to_save_b = sc.broadcast(path_to_save) path_receptor_b = sc.broadcast(path_receptor) path_ligand_b = sc.broadcast(path_ligand) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') 
sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_crud.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py")) #load all-residue_hbonds_4.0A_30.0deg.dat file path_file_hydrogen_bond = os.path.join( path_analysis, "all-residue_hbonds_4.0A_30.0deg.dat") all_residue_split = load_file_all_residue_hbonds(sc, path_file_hydrogen_bond) #Creating all_residue Dataframe df_all_residue = create_df_all_residue(sqlCtx, all_residue_split) if os.path.isfile(file_select_hydrogen_bond): #Creating resudue list as Dataframe residue_listRDD = load_file_select_hydrogen_bond( sc, file_select_hydrogen_bond) df_residue_list = create_df_residue_list(sqlCtx, residue_listRDD) df_result = create_df_all_residue_filtered_by_res_list(sqlCtx) #Saving result path_file_result_file = os.path.join( path_analysis, result_file_to_select_hydrogen_bond) save_result(path_file_result_file, df_result) #Grouping by poses df_result = get_group_by_poses_all_residue_filtered_by_res_list(sqlCtx) #Saving result only pose path_file_result_file_only_pose = os.path.join( path_analysis, result_file_to_select_hydrogen_bond_only_pose) save_result_only_pose(path_file_result_file_only_pose, df_result) #Loading all poses group by poses only_poseRDD = load_only_poses_file_hydrogen_bond( sc, path_file_result_file_only_pose) only_pose_takeRDD = only_poseRDD.take( number_poses_to_select_hydrogen_bond) #Calculating normalized hydrogen bond #Loading database rdd_database = load_database(sc, ligand_database) #Creating Dataframe database_table = sqlCtx.createDataFrame(rdd_database) database_table.registerTempTable("database") #Creating Dataframe normalized_by_donors_acceptors df_result = create_df_normalized_by_donors_acceptors(sqlCtx, df_result) #Saving result only pose by normalized hydrogen bond path_file_result_file_only_pose = os.path.join( path_analysis, result_file_to_select_normalized_hydrogen_bond_only_pose) save_result_only_pose_normalized_by_residue_list( path_file_result_file_only_pose, df_result) #Loading poses - normalized_residues_filtered_by_list only_pose_normalizedRDD = load_only_poses_file_hydrogen_bond_normalized_by_residues( sc, path_file_result_file_only_pose) only_pose_normalizedRDD = only_pose_normalizedRDD.take( number_poses_to_select_hydrogen_bond) # Normalized Hydrogen Bond by heavy atoms df_result = create_df_normalized_by_heavy_atoms(sqlCtx) #Saving result only pose by normalized buried area path_file_result_file_only_pose = os.path.join( path_analysis, result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose ) save_result_only_pose_normalized_by_residue_list_heavy_atoms( path_file_result_file_only_pose, df_result) #Loading poses - normalized_residues_filtered_by_list only_pose_normalized_heavyAtomsRDD = load_only_poses_file_hydrogen_bond_normalized_by_residues( sc, path_file_result_file_only_pose) only_pose_normalized_heavyAtomsRDD = only_pose_normalized_heavyAtomsRDD.take( number_poses_to_select_hydrogen_bond) #************** END OF RESIDUE LIST #Loading normalized poses by donors and acceptors path_file_normalized_pose = os.path.join( path_analysis, "summary_normalized_hbonds_donors_acceptors_4.0A_30.0deg.dat") normalized_poseRDD = load_file_summary_normalized_hbonds( sc, path_file_normalized_pose) normalized_poseRDD = normalized_poseRDD.take( 
number_poses_to_select_hydrogen_bond) #Loading normalized poses by heavy atoms path_file_normalized_pose = os.path.join( path_analysis, "summary_normalized_hbonds_heavyAtom_4.0A_30.0deg.dat") normalized_pose_heavyAtomsRDD = load_file_summary_normalized_hbonds( sc, path_file_normalized_pose) normalized_pose_heavyAtomsRDD = normalized_pose_heavyAtomsRDD.take( number_poses_to_select_hydrogen_bond) # ******************** STARTED FUNCTION ******************************** def build_complex_from_pose_file_name(p_name): from vina_utils import get_receptor_from_receptor_ligand_model, get_ligand_from_receptor_ligand_model, get_model_from_receptor_ligand_model, get_separator_filename_mode #Broadcast path_to_save = path_to_save_b.value path_receptor = path_receptor_b.value path_ligand = path_ligand_b.value #Based on row value from dataframe pose_file_name = p_name.pose #Receptor receptor_file_name = get_receptor_from_receptor_ligand_model( pose_file_name) receptor_file = os.path.join(path_receptor, receptor_file_name + ".pdb") f_receptor_file = open(receptor_file, "r") #ligand file name ligand_file_name = os.path.join(path_ligand, pose_file_name + ".pdb") f_ligand_file_name = open(ligand_file_name, "r") #Open file for writting the complex full_path_for_save_complex = os.path.join(path_to_save, p_name.f_name + ".pdb") f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in f_receptor_file: if str(item).find("END") == -1: f_compl.write(item) #Insert lines of model for item in f_ligand_file_name: if str(item).find("REMARK") == -1: f_compl.write(item) #Closing files f_compl.close() f_ligand_file_name.close() f_receptor_file.close() # ******************** FINISHED FUNCTION ******************************** if os.path.isfile(file_select_hydrogen_bond): #Selecting poses by residues filtered sc.parallelize(only_pose_takeRDD).foreach( build_complex_from_pose_file_name) #Updated path to save complex path_to_save_b = sc.broadcast(path_to_save_normalized_residue) sc.parallelize(only_pose_normalizedRDD).foreach( build_complex_from_pose_file_name) #Updated path to save complex path_to_save_b = sc.broadcast( path_to_save_normalized_residue_heavyAtoms ) #Updated path to save complex sc.parallelize(only_pose_normalized_heavyAtomsRDD).foreach( build_complex_from_pose_file_name) #Selecting poses by normalized donors and acceptors #Broadcast path_to_save_b = sc.broadcast( path_to_save_normalized_da) #Updated path to save complex sc.parallelize(normalized_poseRDD).foreach( build_complex_from_pose_file_name) #Selecting poses by normalized heavy atoms #Broadcast path_to_save_b = sc.broadcast( path_to_save_normalized_heavyAtom) #Updated path to save complex sc.parallelize(normalized_pose_heavyAtomsRDD).foreach( build_complex_from_pose_file_name) finish_time = datetime.now() save_log(finish_time, start_time)
# todo: visualization
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setMaster('local').setAppName('InternationalStudentsByCountry')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

import pyspark_csv as pycsv
sc.addPyFile('pyspark_csv.py')


def extract_row(row):
    country = row[2]
    year = row[13]
    internationalStudents = 0
    if (row[9] != None and row[11] != None):
        numStudents = float(str(row[9]).replace(',', ''))
        internationalPercentage = float(str(row[11])[:-1])
        internationalStudents = int(numStudents * internationalPercentage / 100.0)
    return ((year, country), internationalStudents)


plaintext_rdd = sc.textFile('file:///Users/Wik/Documents/Kuliah/BigData/Tugas-2/WorldRankUniversity-Mining/data/timesData.csv')
rdd = pycsv.csvToDataFrame(sqlContext, plaintext_rdd).rdd
mapped = rdd.map(extract_row)
reduced = mapped.reduceByKey(lambda a, b: a + b)
sorted = reduced.sortByKey()
result = sorted.collect()

for item in result:
    print str(item[0][0]) + ' - ' + str(item[0][1]) + ': ' + str(item[1])
# spark-submit <name of job py file>.py /data/movie-ratings/ratings.dat /data/movie-ratings/movies.dat 1 0.97 20 1000 COSINE
import findspark
findspark.init()

import pyspark
import sys
import re
import random
# import numpy
from pyspark import SparkConf, SparkContext

sc = SparkContext(appName="MovieLens")

from math import sqrt

# sc.addPyFile("similarity.py")
sc.addPyFile("movielensfcn.py")
from movielensfcn import parseMovies, removeDuplicates, itemItem
# from similarity import cosine_similarity, jaccard_similarity

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print >> sys.stderr, "Usage: MovieLens ratings movies"
        exit(-1)

    ratings_file = sys.argv[1]
    movies_file = sys.argv[2]

    if len(sys.argv) > 6:
        movie_id = int(sys.argv[3])
        threshold = float(sys.argv[4])
from sklearn import cross_validation
from sklearn.metrics import precision_recall_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

# add spark context
sc = SparkContext(appName="sidgan")
sc.addPyFile('/home/spark-1.4.1-bin-hadoop2.4/pyspark-csv/pyspark_csv.py')


def transform_csv():
    global data
    global target
    # make target column for classification
    # target = data.map(convert)  # this gives RDD
    # target should be float and not RDD
    target = filter(convert)  # should give target as a float


def merge_csv():
    global data
    global week
        returns.append([arr[x] for x in stations[val]])
    return returns


if __name__ == "__main__":
    import os
    # if not os.path.exists("step" + str(step_size) + "/window_" + str(window_size) + "_dataset"):
    #     Path("step" + str(step_size) + "/window_" + str(window_size) + "_dataset").mkdir(parents=True, exist_ok=True)

    conf = SparkConf().setAppName("features")

    # Run the above function and store its results in a variable.
    full_file_paths = get_filepaths("inputs/")

    # random.seed(100)
    paths = [x for x in full_file_paths if x.split("/")[-1][:2] == "sp"]  # [: int(0.3 * len(full_file_paths))]
    random.shuffle(paths)

    sc = SparkContext(master='spark://137.30.125.208:7077', appName='spark_features')

    # local files to import
    sc.addPyFile('bfi.py')
    sc.addPyFile('features.py')

    for f in paths:
        fil = open(f).readlines()
        val = gen_data(fil)
        depth = calculate_depth(val[1])
        val = sc.parallelize(val)
        features = val.map(lambda x: separate_into_energy_features(x))  # .map(lambda a: window(a))
        llist = features.collect()
        val = [a for a in window(llist, n=window_size) if a != [] and a[0] != []]
        my_val = sc.parallelize(val).map(lambda a: multiply(a))
        vals = my_val.collect()
        bfis = features.map(lambda a: calculate_bfi(a)).collect()

        with open("feature_files/" + f.split("/")[-1], "w") as myfile:
            # since we loop over outputs rather than inputs, it is implicitly - window_size too
            inputs = [x for x in vals][1:][:len(vals) - step_size]  # - window_size ]  # first element is [0] by some thing and window already starts like that
        iter += 1

    return KMeansModel(centers)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("--maxiter", type=float, default=20, required=False)
    parser.add_argument("--tol", type=float, default=0.001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "dff-highpass", "sub"),
                        default="raw", required=False)
    args = parser.parse_args()

    sc = SparkContext(args.master, "kmeans")

    if args.master != "local":
        egg = glob.glob(os.path.join(os.environ['THUNDER_EGG'], "*.egg"))
        sc.addPyFile(egg[0])

    data = load(sc, args.datafile, args.preprocess).cache()

    model = KMeans(k=args.k, maxiter=args.maxiter, tol=args.tol).train(data)

    labels = model.predict(data)

    outputdir = args.outputdir + "-kmeans"
    save(model.centers, outputdir, "centers", "matlab")
    save(labels, outputdir, "labels", "matlab")
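# Hedged usage sketch (assumed invocation and script name, not from the original source): the
# argparse definitions above imply a command line along these lines, with --maxiter, --tol,
# and --preprocess optional.
#
#   python kmeans.py spark://host:7077 hdfs:///data/series.txt hdfs:///out 10 --maxiter 30 --tol 0.0001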
from pyspark.sql import SparkSession import time from pyspark import SparkContext sc = SparkContext('yarn') sc.addPyFile("s3a://rogerzhuo/graphframes-0.6.0-spark2.3-s_2.11.jar") from pyspark.sql.functions import * from graphframes import * spark = SparkSession.builder.appName("Prime_algorithm").getOrCreate() # Prepare data. v1 = spark.createDataFrame([ (0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)], ["id"]) # Edges DataFrame e1 = spark.createDataFrame([ (1, 2, 1), (2, 3, 7), (1, 9, 5), (1, 8, 10), (9, 0, 2), (9, 5, 6),
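# Hedged sketch: the edge DataFrame above is truncated, so e1_demo below re-creates
# only the rows that are visible and assumes the column names ["src", "dst", "weight"],
# which is what GraphFrame expects for edges (vertices need an "id" column, as v1 has).
e1_demo = spark.createDataFrame([
    (1, 2, 1), (2, 3, 7), (1, 9, 5), (1, 8, 10), (9, 0, 2), (9, 5, 6)],
    ["src", "dst", "weight"])

g = GraphFrame(v1, e1_demo)
g.edges.orderBy("weight").show()   # inspect edge weights
g.degrees.show()                   # degree of every vertex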
printOnConsole('Nothing to process') except Exception as ex: printOnConsole('There was an error...') print(ex) if __name__ == "__main__": #_conf = new SparkConf(true) conf = (SparkConf() .setAppName(SPARK_APPNAME) .set("spark.serializer", SPARK_SERIALIZER)) sc = SparkContext(conf=conf) ssc = StreamingContext(sc, SPARK_STREAM_BATCH) sc.addPyFile(CODE_PATH + '/pyspark_csv.py') sc.addPyFile(CODE_PATH + '/constants.py') sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID) sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY) sqlContext = SQLContext(sc) registerUDF(sqlContext) printOnConsole('Streaming started') kinesisStream = [KinesisUtils.createStream(ssc, APPLICATION_NAME, STREAM_NAME, ENDPOINT, REGION_NAME, INITIAL_POS, CHECKPOINT_INTERVAL, awsAccessKeyId=AWSACCESSID, awsSecretKey=AWSSECRETKEY, storageLevel=STORAGE_LEVEL) for _ in range(NUM_STREAMS)] unifiedStream = ssc.union(*kinesisStream)
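# A minimal, hedged sketch of consuming the unified stream: assuming each Kinesis
# record is one CSV line (pyspark_csv.py is shipped above), count records and fields
# per micro-batch. This is illustrative only, not the original job's processing.
parsed = unifiedStream.map(lambda record: record.split(','))
parsed.count().pprint()            # records per micro-batch
parsed.map(len).pprint()           # field count of each record

ssc.start()
ssc.awaitTermination()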
from pyspark.sql.types import * from pyspark.sql.functions import * from pyspark.sql import SQLContext from pyspark import SparkConf, SparkContext from pyspark.sql import SparkSession conf = (SparkConf() .setMaster("spark://sparkmaster:7077") .setAppName("HUDI_EXERCISE")) sc = SparkContext(conf=conf) spark = SparkSession(sc) os.system("echo 'PROCESSING DATA...'") sc.addPyFile("/var/hoodie/ws/spark-job/utils.py") data = sc.wholeTextFiles("hdfs://namenode:8020/wiki/extra/delete") pages = data.flatMap(lambda x: (x[1].split('</doc>'))).map(lambda x: (Utils.get_title(x), Utils.get_date_timestamp( x), Utils.get_content(x))).filter(lambda x: ((len(x[0]) != 0) or (len(x[1]) != 0))).filter(lambda x: Utils.check_if_person(x[1])) df = pages.toDF(["title", "date", "content"]) df = df.select('title', to_date( df.date, 'MM/dd/yyyy').alias('date'), "content") tableName = "hudi_celebrities" basePath = "hdfs://namenode:8020/wiki/hudi_celebrities" hudi_delete_options = { 'hoodie.table.name': tableName, 'hoodie.datasource.write.table.type': 'COPY_ON_WRITE', 'hoodie.datasource.write.operation': 'delete',
sc.stop() if __name__ == '__main__': main() import sys sys.path.append('/usr/local/lib/python2.7/site-packages') sys.path.append('/home/hadoop/app/spark/python') sys.path.append('/home/hadoop/app/spark/python/lib/py4j-0.8.2.1-src.zip') from pyspark import SparkContext, SparkConf from mysql_utils import MySQLUtils master = 'local[2]' app_name = 'test-broadcast' # spark_home = '/data01/app/bigdata/spark' # local spark_home = '/home/hadoop/app/spark' # test pyFiles = ['mysql_utils.py'] spark_conf = SparkConf() spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home) sc = SparkContext(conf=spark_conf) for path in (pyFiles or []): sc.addPyFile(path) external_cache = get_api_deviceinfo() deviceinfo_b = sc.broadcast(external_cache)
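# Sketch of how a broadcast lookup table like deviceinfo_b is typically used inside
# a transformation. The record shape ((device_id, payload) pairs) and the dict-like
# structure of external_cache are assumptions; neither is shown above.
records = sc.parallelize([("dev-1", "payload-a"), ("dev-2", "payload-b")])

def enrich(record):
    device_id, payload = record
    info = deviceinfo_b.value.get(device_id)   # local lookup on the executor, no shuffle
    return (device_id, payload, info)

print(records.map(enrich).collect())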
from pyspark.sql.types import StringType, DateType, IntegerType from pyspark import SparkConf, SparkContext from pyspark.sql.functions import UserDefinedFunction from pyspark.storagelevel import StorageLevel from flask import Flask, jsonify from utils import crossdomain app = Flask(__name__) conf = SparkConf() \ .setMaster("spark://172.21.0.14:7077") \ .setAppName("tv-scenes") \ .set("spark.executor.memory", "1g") \ .set("spark.ui.port", 4040) sc = SparkContext(conf=conf) sc.addPyFile("foo.py") sqlContext = SQLContext(sc) CSV_PATH = 'file:///home/ubuntu/DWDB/' def read_csv_into_temptable(table_name): filename = "{}/{}.csv".format(CSV_PATH, table_name) df = sqlContext.read.csv(filename, header=True, inferSchema=True) df.registerTempTable(table_name) # load data read_csv_into_temptable('EventClientChannelTune') read_csv_into_temptable('Channels')
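# Hedged sketch of exposing the loaded temp tables over HTTP. The route name, the
# query, and the crossdomain(origin=...) signature are assumptions; only app,
# sqlContext, jsonify and crossdomain themselves come from the snippet above.
@app.route('/channels')
@crossdomain(origin='*')
def channels():
    rows = sqlContext.sql("SELECT * FROM Channels LIMIT 100").collect()
    return jsonify([row.asDict() for row in rows])

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)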
for x in range(100) ] rdd = run(image_collection).collect() rdd.sort(key=lambda x: x[0]) rdd = [str(x[0]) + ": " + str(x[1]) + "\n" for x in rdd] with open("test/test_output.txt", 'w') as f: f.writelines(rdd) else: #connecting mysql # db = mysql.connector.connect(user='******', password='******', # host=os.environ['mySQLHost'], # database='my_db') # cursor=db.cursor() # db.commit() sc = SparkContext() sc.addPyFile("./helper-functions.py") sc.addPyFile("./constants.py") sc.addPyFile("./spark_image_compressor.py") while True: # sql1='select * from people' # cursor.execute(sql1) # data=cursor.fetchall() # if len(data) % 3 == 0: # file_like=cStringIO.StringIO(data[0][0]) # img=PIL.Image.open(file_like) # this is the line that gets the images image = cv2.imread(args.input, cv2.IMREAD_UNCHANGED) image_collection = [(x, image) for x in range(10)] rdd = run(image_collection, sc).collect() cv2.imwrite(args.output, rdd[0][1])
def main(): sc = SparkContext(conf=SparkConf().setAppName("wil_hot_sku_calc_online")) hc = HiveContext(sc) sc.addPyFile(sys.argv[1]) from core.common import common param = common.init_params(sys.argv, p) # date, pid, ts, dc_id = param["date"], param["pid"], param["ts"], param["dc_id"] date = '2018-10-19' pid = '201810190031002' ts = '1539915102918' today = dt.datetime.strptime(date, "%Y-%m-%d").date() someday = dt.datetime.strptime('2018-10-24', "%Y-%m-%d").date() yesterday = today - dt.timedelta(1) three_days_ago = today - dt.timedelta(3) # thirty_days = today - dt.timedelta(30) # 2.1.关联订单数据 # 未来在表a, b中要加上dc_id字段, 并在join时使用dc_id作关联 sql_hot_sku_data_all = """ select a.sku_id as sku_id ,a.future_source_store_id as future_source_store_id ,to_date(b.out_wh_tm) as sub_dt ,b.ord_id as ord_id ,b.sale_qtty as sale_qtty from (select sku_id ,current_source_store_id ,future_source_store_id from app.app_wil_hot_sku_all where dt='""" + str(today) + """' and pid='""" + pid + """' and ts='""" + ts + """' ) a left join (select sku_id ,store_id ,parent_sale_ord_id as ord_id ,sale_qtty ,out_wh_tm from app.app_wil_hot_sale_store_sku_ord_sale where dt = '""" + str(someday) + """' and sale_qtty >= 0 ) b on a.current_source_store_id = b.store_id and a.sku_id = b.sku_id where to_date(out_wh_tm) is not null """ hc.sql(sql_hot_sku_data_all).createOrReplaceTempView("tb_sku_data_all") # 2.2.得到sku的打标信息 # 当前来源仓有多个的时候, 只有有可能被选入, 则认为这个sku可以被选品选出: white_flag = 1 # 当前来源仓有多个的时候, 只要在set1出现过, 则认为这个sku在爆品仓中不动: unset1_flag = 0 sql_hot_sku_all_flag = """ select sku_id ,future_source_store_id ,max(white_flag) as white_flag ,min(unset1_flag) as unset1_flag from (select sku_id ,white_flag ,future_source_store_id ,case when future_source_store_id <> hot_sku_target_store_id then 1 else 0 end as unset1_flag from app.app_wil_hot_sku_all ) a group by sku_id,future_source_store_id """ hc.sql(sql_hot_sku_all_flag).createOrReplaceTempView("tb_sku_all_flag") # 订单维度设置一个ord-sku 的权重值ord_weight, 一订单中如果含有3个sku, 这三个sku的这个订单对其总订单量的贡献为1/3) sql_hot_ord_cnt = """ select aa.sub_dt ,aa.sku_id ,round(sum(case when aa.ord_weight is not null then aa.ord_weight else 1.0 end),2) as ord_cnt from (select a.* ,b.ord_weight from tb_sku_data_all a left join ( select ord_id ,cast(1/count(distinct sku_id) as float) as ord_weight from tb_sku_data_all group by ord_id ) b on a.ord_id = b.ord_id ) aa group by sub_dt,sku_id """ hc.sql(sql_hot_ord_cnt).createOrReplaceTempView("tb_hot_ord_cnt") # -- 设目标是a, 则每天选出min(0.15 + a, 1)比例订单 # --不同的sku根据其 # future_source_store_id # 不同而有不同的 # hot_sku_out_store_rate # 然后分别计算, 需要分量的比例 # ---------- 如果分开计算, 相当于对订单进行了截断, 计算值相同, 分开计算的结果也不会和和在一起的结果一致 # ---------- 所以需要实际设置是否是两个不同值判断记否要分开计算, 如果只有一个值, 就和在一起走另一个选品算法 sql_sku_default_ratio = """ select future_source_store_id ,round(avg(hot_sku_out_store_rate),2) as default_ratio from app.app_wil_hot_sku_all where future_source_store_id != hot_sku_target_store_id and future_source_store_id is not null group by future_source_store_id """ value = hc.sql(sql_sku_default_ratio).rdd.map(lambda x: map( lambda a: a, x)).filter(lambda x: x[1] != None).collect() if len(value) == 2 and value[0][1] != value[1][1]: v_future_source_store_id_1 = value[0][0] v_future_source_store_id_2 = value[1][0] v_param_1 = min(0.15 + value[0][1], 1) v_param_2 = min(0.15 + value[1][1], 1) # 判断 不同的 future_source_store_id 是否对应不同的 default_ratio , 如果是 则 分别得到 两种 default_ratio: 并通过<<算法1>>分别选品 # 示例sql对应参数为: # 38 对应 0.50 (背景一共18960个sku) # 39 对应 0.50 (背景一共2710个sku) # 2.4.2.1 算法1(两仓分别选品) sql_hot_sku_list_1 = """ select c.sub_dt, 
c.sku_id ( select e.sub_dt ,e.sku_id from (select a.sub_dt ,a.sku_id ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal ,cast(SUM(a.ord_cnt * b.background_flag) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str( v_param_1) + """' as float) as float) AS TotalOrd from tb_hot_ord_cnt a left join (select sku_id ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag ,case when future_source_store_id = '""" + str( v_future_source_store_id_1 ) + """' then 1 else 0 end as background_flag from tb_sku_all_flag ) b on a.sku_id = b.sku_id ) e where e.CumulativeTotal <= e.TotalOrd ) c join ( select distinct sku_id from tb_sku_all_flag where canchoose_flag = 1 )d on c.sku_id = d.sku_id """ hc.sql(sql_hot_sku_list_1).createOrReplaceTempView("tb_hot_sku_list_1") sql_select_result_1 = """ select a.sku_id ,sum(a.cnt) as re_times from (select sku_id ,1 as cnt from tb_hot_sku_list_1 union all select sku_id ,4 as cnt from tb_hot_sku_list_1 where sub_dt between '""" + str( three_days_ago) + """' and '""" + str(yesterday) + """' ) a group by a.sku_id having re_times > 18 """ hc.sql(sql_select_result_1).createOrReplaceTempView( "tb_select_result_1") sql_hot_sku_list_2 = """ select c.sub_dt, c.sku_id FROM ( select a.sub_dt ,a.sku_id from (select a.sub_dt ,a.sku_id ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal ,cast(SUM(a.ord_cnt * b.background_flag) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str( v_param_2) + """' as float) as float) AS TotalOrd from tb_hot_ord_cnt a left join (select sku_id ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag ,case when future_source_store_id = '""" + str( v_future_source_store_id_2 ) + """' then 1 else 0 end as background_flag from tb_sku_all_flag ) b on a.sku_id = b.sku_id ) a where a.CumulativeTotal <= a.TotalOrd ) c join ( select distinct sku_id from tb_sku_all_flag where canchoose_flag = 1 )d on c.sku_id = d.sku_id """ hc.sql(sql_hot_sku_list_2).createOrReplaceTempView("tb_hot_sku_list_2") sql_select_result_2 = """ select a.sku_id ,sum(a.cnt) as re_times from (select sku_id ,1 as cnt from tb_hot_sku_list_2 union all select sku_id ,4 as cnt from tb_hot_sku_list_2 where sub_dt between '""" + str( three_days_ago) + """' and '""" + str(yesterday) + """' ) a group by a.sku_id having re_times > 16 """ hc.sql(sql_select_result_2).createOrReplaceTempView( "tb_select_result_2") # # sql_result = """ # insert overwrite table dev.dev_ipc_ioa_hot_select_result # select * from tb_select_result_1 # union # select * from tb_select_result_2 # """ # hc.sql(sql_result) # 最终选品结果 partition = """dt='""" + today + """',pid='""" + pid + """',ts='""" + ts + """'""" sql_select_result = """ insert overwrite table app.app_wil_hot_sku_selected partition(""" + partition + """) select a.sku_id from ( select * from tb_select_result_1 union select * from tb_select_result_2) a """ hc.sql(sql_select_result) else: v_param = min(0.15 + value[0][1], 1) sql_hot_sku_list = """ select c.sub_dt, c.sku_id FROM ( select e.sub_dt ,e.sku_id from (select a.sub_dt ,a.sku_id ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal ,cast(SUM(a.ord_cnt) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str( v_param) + """' 
as float) as float) AS TotalOrd from tb_hot_ord_cnt a left join (select sku_id ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag from tb_sku_all_flag ) b on a.sku_id = b.sku_id ) e where e.CumulativeTotal <= e.TotalOrd ) c join ( select distinct sku_id from tb_sku_all_flag where canchoose_flag = 1 )d on c.sku_id = d.sku_id """ hc.sql(sql_hot_sku_list).createOrReplaceTempView("tb_hot_sku_list") partition = """dt='""" + str( today) + """',pid='""" + pid + """',ts='""" + ts + """'""" sql_select_result = """ insert overwrite table app.app_wil_hot_sku_selected partition(""" + partition + """) select b.sku_id from ( select a.sku_id ,sum(a.cnt) as re_times from ( select sku_id ,1 as cnt from tb_hot_sku_list union all select sku_id ,4 as cnt from tb_hot_sku_list where sub_dt between '""" + str( three_days_ago) + """' and '""" + str(yesterday) + """') a group by a.sku_id having re_times > 0) b """ hc.sql(sql_select_result)
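# Self-contained illustration of the selection rule used in the queries above: per
# day, rank SKUs by weighted order count and keep them while the running total stays
# within ratio * (day total). The four demo rows and the 0.65 ratio are made up.
demo = hc.createDataFrame(
    [("2018-10-18", "sku1", 50.0), ("2018-10-18", "sku2", 30.0),
     ("2018-10-18", "sku3", 15.0), ("2018-10-18", "sku4", 5.0)],
    ["sub_dt", "sku_id", "ord_cnt"])
demo.createOrReplaceTempView("demo_ord_cnt")

hc.sql("""
    select sub_dt, sku_id
    from (select sub_dt, sku_id,
                 sum(ord_cnt) over (partition by sub_dt order by ord_cnt desc
                                    rows between unbounded preceding and current row) as cum_cnt,
                 sum(ord_cnt) over (partition by sub_dt) * 0.65 as cutoff
          from demo_ord_cnt) t
    where cum_cnt <= cutoff
""").show()   # keeps only sku1: 50 <= 65, while sku2 would push the running total to 80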
from pyspark import SparkConf, SparkContext conf = SparkConf().setAppName("LogisticRegressionWithSGD").setExecutorEnv("PYTHON_EGG_CACHE","/tmp/geap") sc = SparkContext(conf=conf) sc.addPyFile("hdfs://nameservice1/user/geap/warehouse/lib/numpy.egg") from pyspark.mllib.linalg import SparseVector from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import LogisticRegressionWithSGD import numpy import logging # retain items etype=="pv", chkout==50 and total_price<=100000 # cks, ckp, acc, aid, chkout, ua, res, ip, genre, igenre, itemid, ni, price, ts def filterPoint(line): try: value = line.split("\t") except Exception as e: logging.exception(e) #etype = value[1] chkout = "" try: chkout = value[4] except Exception as e: logging.exception(e) if chkout == "50": try: prices = [int(i) for i in eval(value[12])] num = [int(i) for i in eval(value[11])] if len(prices) == len(num):
totalLocations = totalLocations.union(busyLocations) totalRoutes = totalRoutes.union(busyRoutes) totalTimes.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Times") totalLocations.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Locations") totalRoutes.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Routes") if __name__ == "__main__": conf = SparkConf() conf.setAppName(APP_NAME) conf.setMaster('yarn-client') conf.set('spark.executor.memory', '1g') conf.set('spark.executor.cores','1') conf.set('spark.executor.instances','5') sc = SparkContext(conf=conf) sc.addPyFile("shapefile.py") COUNTIES = ['Albany', 'Allegany', 'Bronx', 'Broome', 'Cattaraugus', 'Cayuga', 'Chautauqua', 'Chemung', 'Chenango', 'Clinton', 'Columbia', 'Cortland', 'Delaware', 'Dutchess', 'Erie', 'Essex', 'Franklin', 'Fulton', 'Genesee', 'Greene', 'Hamilton', 'Herkimer', 'Jefferson', 'Kings', 'Lewis', 'Livingston', 'Madison', 'Monroe', 'Montgomery', 'Nassau', 'New York', 'Niagara', 'Oneida', 'Onondaga', 'Ontario', 'Orange', 'Orleans', 'Oswego', 'Otsego', 'Putnam', 'Queens', 'Rensselaer', 'Richmond', 'Rockland', 'Saratoga', 'Schenectady', 'Schoharie', 'Schuyler', 'Seneca', 'St. Lawrence', 'Steuben', 'Suffolk', 'Sullivan', 'Tioga', 'Tompkins', 'Ulster', 'Warren', 'Washington', 'Wayne', 'Westchester', 'Wyoming', 'Yates'] spatialIdx = readSpatialIndex("spatialIdx.csv") spatialIdx = sc.broadcast(spatialIdx) couties = sc.broadcast(COUNTIES) main(sc)
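# Hedged sketch of the broadcast-lookup pattern the job relies on: the real index comes
# from readSpatialIndex/shapefile.py (not shown), so demo_idx below uses a made-up
# bounding box for county index 30 ('New York') just to keep the example self-contained.
from operator import add

demo_idx = sc.broadcast([(30, -74.26, 40.49, -73.70, 40.92)])   # (county index, lon/lat box)

def tag_with_county(point):
    lon, lat = point
    for county_idx, min_lon, min_lat, max_lon, max_lat in demo_idx.value:
        if min_lon <= lon <= max_lon and min_lat <= lat <= max_lat:
            return (couties.value[county_idx], 1)
    return ("outside", 1)

demo_points = sc.parallelize([(-73.99, 40.75), (-78.88, 42.90)])
print(demo_points.map(tag_with_county).reduceByKey(add).collect())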
if(longitude<-140 and longitude>=-180): result.append("node_3") elif (longitude >140 and longitude<=180): result.append("node_1"); return result; if __name__ == '__main__': import happybase; # configure the spark environment sparkConf = SparkConf().setAppName("Simulating Streamline"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") sc = SparkContext(conf=sparkConf); sc.addPyFile("module.zip"); # from pywebhdfs.webhdfs import PyWebHdfsClient; distributed_dataset = sc.textFile("hdfs:/user/uacharya/subset_dataset_1934.txt",use_unicode=False,minPartitions=24); print("this is the driver container"); # getting the header of the whole dataset header = distributed_dataset.first(); # filtering the header out of the data distributed_dataset = distributed_dataset.filter(lambda d: d != header); # mapping the data to prepare for processing data_in_required_format = distributed_dataset.map(create_required_datewise_data); data_in_required_format.cache(); #collecting keys to do batch processing based on keys temp = set(data_in_required_format.keys().collect()); print("total keys "+str(len(temp))); #sorting keys to create data in chronological order based on date sorted_keys = sorted(temp,key=int);
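# Minimal sketch of the per-key batching hinted at above: for each date key, in
# chronological order, filter the cached pair RDD down to that day's records and
# hand them to the real processing step (the print is just a stand-in).
for key in sorted_keys:
    daily_batch = data_in_required_format.filter(lambda kv, k=key: kv[0] == k)
    print("date " + str(key) + " -> " + str(daily_batch.count()) + " records")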
def main(): config = configparser.ConfigParser() config.read('config.ini') #Path for Gromacs project gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path')) #Path where PDB ligand are - They are NOT participated in docking pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Path where all pdb receptor are path_receptor_pdb = config.get('DEFAULT', 'pdb_path') #Path for saving pdb files of models generated by VS path_analysis_pdb = get_directory_pdb_analysis(path_analysis) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py")) #Adding bash scripts sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_ligand.sh")) #Parameters form command line #Indicates probe. Example: 0.14 probe = float(sys.argv[1]) #Indicates ndots. Example: 24 ndots = int(sys.argv[2]) #Broadcast path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb) gromacs_path = sc.broadcast(gromacs_path) pdb_ligand_path = sc.broadcast(pdb_ligand_path) probe = sc.broadcast(probe) ndots = sc.broadcast(ndots) start_time = datetime.now() os.environ["GMX_MAXBACKUP"]="-1" #Loading all PDB receptor files into memory list_all_pdb_receptor_files_path = [] all_receptor_for_complex = get_files_pdb(path_receptor_pdb) for receptor in all_receptor_for_complex: list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor)) for pdb_receptor_files in list_all_pdb_receptor_files_path: #Getting receptor name by fully path base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0])) #PDB file loaded into memory is sent by broadcast pdb_file_receptor = pdb_receptor_files[1] pdb_file_receptor = sc.broadcast(pdb_file_receptor) #Loading PDB model files based on receptor into memory base_file_name_receptor_for_filter = base_file_name_receptor+"_-_" all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter) all_model_for_complexRDD = sc.parallelize(all_model_for_complex) all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect() # ********** Starting function ********************************************************** def save_model_receptor(list_receptor_model_file): receptor_file = pdb_file_receptor.value #Obtained from broadcast model_file = list_receptor_model_file[0] full_path_for_save_complex = list_receptor_model_file[1] #Open file for writting the complex f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in receptor_file: f_compl.write(item) #Insert lines of model and insert Z chain for item in model_file: item = replace_chain_atom_line(item,"d","z") f_compl.write(item) f_compl.close() # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def compute_buried_area_ligand(pdb_complex): chZ = 
"chZ" buried_lig_rec_perc = -1.0 buried_lig_rec = -1.0 buried_lig_lig = -1.0 buried_lig_lig_perc = -1.0 base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) receptor_name = get_receptor_from_receptor_ligand_model(base_name) pose = get_model_from_receptor_ligand_model(base_name) pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb") #ndx files f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") #xvg files xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg") xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg") xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg") # Creates a selection with the residues that are closer than 6A to the ligand script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+ xvg_temp_sasa_lig_pose + " "+ str(probe.value) + " "+ str(ndots.value) + " "+ xvg_temp_sasa_lig_complex + " "+ pdb_before_vs + " "+ xvg_temp_sasa_lig_min process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() try: # SASA of the isolated ligand in the pose conformation sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose) # SASA of the complexed ligand in the pose conformation sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex) # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates! sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min) # Area of the ligand which is buried in the receptor buried_lig_rec = sasa_lig_pose - sasa_lig_complex buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose # Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation buried_lig_lig = sasa_lig_min - sasa_lig_pose buried_lig_lig_perc = buried_lig_lig / sasa_lig_min returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc) #Deleting files os.remove(f_ndx) os.remove(xvg_temp_sasa_lig_pose) os.remove(xvg_temp_sasa_lig_complex) os.remove(xvg_temp_sasa_lig_min) return returned_list except: return (base_name, float(0.0), float(0.0), float(0.0), float(0.0)) # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def build_list_model_for_complex(model): full_path_model = model[0] model_file = model[1] path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast #Building complex file based on model file name base_name_model = get_name_model_pdb(full_path_model) complex_name = "compl_"+base_name_model+".pdb" full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name) list_receptor_model_file = (model_file, full_path_for_save_complex) save_model_receptor(list_receptor_model_file) list_ret = compute_buried_area_ligand(full_path_for_save_complex) os.remove(full_path_for_save_complex) return list_ret # ********** Finish function ********************************************************** all_model_filesRDD = sc.parallelize(all_model_filesRDD) all_model_filesRDD = 
all_model_filesRDD.map(build_list_model_for_complex).collect() #Saving buried area of residue receptor full_area_file = os.path.join(path_analysis,base_file_name_receptor+".ligandArea") save_buried_area_ligand(full_area_file, all_model_filesRDD) #Loading all area file all_area_file = os.path.join(path_analysis,"*.ligandArea") buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_ligandArea_files).collect() #Sorting by buried_lig_lig column buried_area_sorted_by_buried_lig_rec = sorting_buried_area_ligand(sc, buried_areaRDD) buried_area_sorted_by_buried_lig_rec = buried_area_sorted_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec, p.buried_lig_rec_perc, p.buried_lig_lig, p.buried_lig_lig_perc) ).collect() #p.receptor, p.ligand, p.model #Saving buried area ligand file path_file_buried_area = os.path.join(path_analysis, "summary_buried_area_ligand.dat") save_buried_area_ligand_sort(path_file_buried_area, buried_area_sorted_by_buried_lig_rec) #Removing all area files all_area_files = get_files_ligandArea(path_analysis) for area_file in all_area_files: os.remove(area_file) finish_time = datetime.now() save_log(finish_time, start_time)
from pyspark import SparkConf, SparkContext conf = SparkConf().setAppName("LogisticRegressionWithSGD").setExecutorEnv(pairs=[("PYTHON_EGG_CACHE", "/tmp/geap"), ("SPARK_LIBRARY_PATH", "$SPARK_LIBRARY_PATH:$HADOOP_HOME/lib/native")]) sc = SparkContext(conf=conf) sc.addPyFile("hdfs://nameservice1/tmp/geap/numpy.egg") from pyspark.mllib.linalg import SparseVector from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import LogisticRegressionWithSGD import numpy import json # retain items etype=="pv", chkout==50 and total_price<=100000 # cks, ckp, acc, aid, chkout, ua, res, ip, genre, igenre, itemid, ni, price, ts def filterPoint(line): value = json.loads(line[1]) etype = value.get("etype") chkout = value.get("chkout") if chkout == "50": prices = [int(i) for i in value.get("price")] num = [int(i) for i in value.get("ni")] if len(prices) == len(num): total_price = sum([a*b for a, b in zip(prices, num)]) else: return False if total_price <= 100000: return True return False def parsePoint(line):
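# parsePoint is cut off above; a hedged sketch of how such a pipeline usually continues:
# turn each retained event into a LabeledPoint and train the SGD model. The label rule,
# the single made-up feature and the input path are assumptions, not the original code.
def parse_point_sketch(line):
    value = json.loads(line[1])
    label = 1.0 if value.get("etype") == "pv" else 0.0      # hypothetical label rule
    num_items = sum(int(i) for i in value.get("ni", []))    # hypothetical feature
    return LabeledPoint(label, [float(num_items)])

# raw = sc.sequenceFile("hdfs://nameservice1/tmp/geap/events")   # assumed input
# points = raw.filter(filterPoint).map(parse_point_sketch).cache()
# model = LogisticRegressionWithSGD.train(points, iterations=100)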
topics_reduced = {} for sc, v in shortest_paths.items(): for id, topic, count in v: if topic not in topics_reduced: topics_reduced[topic] = 1 else: topics_reduced[topic] += 1 return (p_id, topics_reduced) conf = SparkConf().setAppName("entity_topics") sc = SparkContext(conf=conf) pp = pprint.PrettyPrinter(width=100) sc.addPyFile('/home/username/src/pyspark/dist/libs.zip') sys.path.insert(0, SparkFiles.get('/home/username/src/pyspark/dist/libs.zip')) import networkx _topics = [(693763, 'Academic disciplines'), (4892515, 'Arts'), (771152, 'Business'), (24980271, 'Concepts'), (694861, 'Culture'), (696763, 'Education'), (693016, 'Entertainment'), (2766046, 'Events'), (693800, 'Geography'), (751381, 'Health'), (693555, 'History'), (1004110, 'Humanities'), (8017451, 'Language'), (691928, 'Law'), (2389032, 'Life'), (690747, 'Mathematics'), (696603, 'Nature'), (691008, 'People'), (691810, 'Philosophy'), (695027, 'Politics'), (722196, 'Reference'), (692694, 'Religion'), (691182, 'Science'), (1633936, 'Society'), (693708, 'Sports'), (696648, 'Technology'), (48005914, 'Universe'), (3260154, 'World')] topics = sc.broadcast(_topics)
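# Small sketch showing why the zip is shipped with addPyFile: worker tasks can then
# import networkx too (the sys.path.insert above only affects the driver). The toy
# partitioned edge lists are made up.
def graph_size(edge_list):
    import networkx as nx            # resolved from the shipped libs.zip on executors
    g = nx.Graph()
    g.add_edges_from(edge_list)
    return g.number_of_nodes(), g.number_of_edges()

demo_edges = sc.parallelize([[(1, 2), (2, 3)], [(3, 4)]], 2)
print(demo_edges.map(graph_size).collect())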
def main(): sc = SparkContext() config = configparser.ConfigParser() config.read('config.ini') #Broadcast - global path_pdbqt = config.get('DEFAULT', 'pdbqt_ligand_path') pythonsh = config.get('VINA', 'pythonsh') script_ligand4 = config.get('VINA', 'script_ligand4') database_comp = config.get('DEFAULT', 'ligand_database_path_file') pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path') script_pdbqt_to_pdb = config.get('VINA', 'script_pdbqt_to_pdb') path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') #Adding Python Source file sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdbqt_io.py")) #creating pdbqt path if not os.path.isdir(path_pdbqt): os.mkdir(path_pdbqt) #creating PDB path if not os.path.isdir(pdb_ligand_path): os.mkdir(pdb_ligand_path) start_time = datetime.now() #preparing compound list list_obj_lig_vina = [] mol2_files = vina_utils.get_files_mol2(config.get('DEFAULT', 'mol2_path')) for fmol2 in mol2_files: obj_lig_vina = (path_pdbqt, pythonsh,script_ligand4, fmol2) list_obj_lig_vina.append(obj_lig_vina) molRDD = sc.parallelize(list_obj_lig_vina) molRDD.foreach(prepare_ligand) # *** Preparation of compound list finished. Now, it is able to create the database #preparing enviroment for creating database prepare_for_creating_database(database_comp, path_pdbqt) #preparing pdbqt list list_obj_pdbqt = [] pdbqt_files = vina_utils.get_files_pdbqt(path_pdbqt) for fpdbqt in pdbqt_files: list_obj_pdbqt.append(fpdbqt) #appling map and collect pdbqtRDD = sc.parallelize(list_obj_pdbqt) all_lines = pdbqtRDD.map(build_compound_database).collect() #creating database file save_database(database_comp, all_lines) #converting ligand pdbqt to pdb list_pdbqt_files_lig = [] all_pdbqt_files_lig = vina_utils.get_files_pdbqt(path_pdbqt) for pdbqt_files_lig in all_pdbqt_files_lig: list_pdbqt_files_lig.append( (pdbqt_files_lig, pdb_ligand_path, pythonsh, script_pdbqt_to_pdb) ) pdbqt_files_ligRDD = sc.parallelize(list_pdbqt_files_lig) pdbqt_files_ligRDD.foreach(pdbqt2pdb) finish_time = datetime.now() save_log(finish_time, start_time)
def main(): sc = SparkContext() sqlCtx = SQLContext(sc) config = configparser.ConfigParser() config.read('config.ini') #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') #Detect interactions program detect_hbonds_program = config.get('DRUGDESIGN', 'detect_hbonds_program') #Path where all pdb receptor are path_receptor_pdbqt = config.get('DEFAULT', 'pdbqt_receptor_path') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Ligand Database file ligand_database = config.get('DEFAULT', 'ligand_database_path_file') #Path of pdbqt model path_analysis_pdbqt_model = get_directory_pdbqt_analysis(path_analysis) #Path analysis temp path_analysis_temp = get_directory_temp_analysis(path_analysis) #Getting parameters # cutoff for hydrogen bind distance_cutoff = float(sys.argv[1]) angle_cutoff = float(sys.argv[2]) #Adding Python Source file sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdbqt_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py")) start_time = datetime.now() #broadcast path_analysis_temp_b = sc.broadcast(path_analysis_temp) detect_hbonds_program_b = sc.broadcast(detect_hbonds_program) distance_cutoff_b = sc.broadcast(distance_cutoff) angle_cutoff_b = sc.broadcast(angle_cutoff) #******************* start function ************************************************ def get_hydrogen_bind(ligand_pdbqt): #getting base name base_name = get_name_model_pdb(ligand_pdbqt) #temporary_lig_no temporary_lig_no = base_name+"_temporary_lig_no" list_param = ["C", "O", "N", "HD", "HS"] list_atom_pdbqt = get_atom_section_from_atom_list(ligand_pdbqt, list_param) list_ref = get_lig_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt) path_file_lig_no = os.path.join(path_analysis_temp_b.value, temporary_lig_no) save_text_file_from_list(path_file_lig_no, list_ref) total_lig_no = int(get_line_number(path_file_lig_no)) #temporary_rec_no temporary_rec_no = base_name+"_temporary_rec_no" list_param = ["C", "OA", "N", "HD", "HS", "SA", "A"] list_atom_pdbqt = get_atom_section_from_atom_list(receptor_b.value, list_param) list_ref = get_receptor_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt) path_file_rec_no = os.path.join(path_analysis_temp_b.value, temporary_rec_no) save_text_file_from_list(path_file_rec_no, list_ref) total_rec_no = int(get_line_number(path_file_rec_no)) #temporary_rec_h temporary_rec_h = base_name+"_temporary_rec_h" list_param = ["HD", "HS"] list_atom_pdbqt = get_atom_section_from_atom_list(receptor_b.value, list_param) list_ref = get_receptor_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt) path_file_rec_h = os.path.join(path_analysis_temp_b.value, temporary_rec_h) save_text_file_from_list(path_file_rec_h, list_ref) total_rec_h = int(get_line_number(path_file_rec_h)) #preparing file for saving file_for_saving = base_name+".saving" path_file_for_saving = os.path.join(path_analysis_temp_b.value, file_for_saving) if total_lig_no > 0: #print detect_hbonds_program_b.value+" "+ receptor_b.value+" "+ str(total_rec_no)+" "+ ligand_pdbqt+" "+ str(total_lig_no)+" "+ str(distance_cutoff_b.value)+" "+ str(angle_cutoff_b.value)+" "+ path_file_for_saving+" "+ path_file_rec_no+" "+ path_file_lig_no+" "+ path_file_rec_h+" "+ path_file_rec_no process = Popen( [detect_hbonds_program_b.value, receptor_b.value, str(total_rec_no), ligand_pdbqt, 
str(total_lig_no), str(distance_cutoff_b.value), str(angle_cutoff_b.value), path_file_for_saving, path_file_rec_no, path_file_lig_no, path_file_rec_h, path_file_rec_no ], stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() os.remove(path_file_rec_no) os.remove(path_file_lig_no) os.remove(path_file_rec_h) #******************* finish function ************************************************ #Getting all receptores all_receptores = get_files_pdbqt(path_receptor_pdbqt) #Getting all pdbqt models all_pdbqt_models = get_files_pdbqt(path_analysis_pdbqt_model) all_pdbqt_modelsRDD = sc.parallelize(all_pdbqt_models) for receptor in all_receptores: check_temp_directory(path_analysis_temp) receptor_b = sc.broadcast(receptor) base_name_receptor = get_name_receptor_pdbqt(receptor) base_name_receptor = base_name_receptor+"_-_" models_by_receptorRDD = all_pdbqt_modelsRDD.filter(lambda m : base_name_receptor in m).collect() models_by_receptorRDD = sc.parallelize(models_by_receptorRDD) models_by_receptorRDD.foreach(get_hydrogen_bind) #Getting all saving files that have lines > 0 all_saving_files_by_receptor = get_saving_files_with_lines(path_analysis_temp, base_name_receptor) #Creating file based on all saving files create_file_receptor_all_saving_files(all_saving_files_by_receptor,base_name_receptor,path_analysis) #Getting all saving files that have lines equal 0 all_saving_files_no_lines = get_saving_files_no_lines(path_analysis_temp, base_name_receptor) #Creating file based on all saving files create_file_receptor_no_hydrogen_bonds(all_saving_files_no_lines,base_name_receptor,path_analysis) #Removing temp directory shutil.rmtree(path_analysis_temp) #Starting the final analysis all_hydrogen_bind = get_hydrogen_bind_files(path_analysis) if len(all_hydrogen_bind) > 0: #No Hydrogen bind all_NOT_hydrogen_bind = get_NOT_hydrogen_bind_files(path_analysis) all_NOT_hydrogen_bindRDD = sc.parallelize(all_NOT_hydrogen_bind) #loading from files all_NOT_hydrogen_bindRDD = all_NOT_hydrogen_bindRDD.flatMap(loading_from_files_NOT_hydrogen_bind).collect() #loading all values from list all_NOT_hydrogen_bindRDD = loading_from_all_lists_NOT_hydrogen_bind(sc, all_NOT_hydrogen_bindRDD, sqlCtx) all_NOT_hydrogen_bindRDD.cache() #Working with Hydrogen bind all_hydrogen_bindRDD = sc.parallelize(all_hydrogen_bind) #loading from files all_hydrogen_bindRDD = all_hydrogen_bindRDD.flatMap(loading_from_files).collect() #loading all values from list all_hydrogen_bindRDD = loading_from_all_lists(sc, all_hydrogen_bindRDD, sqlCtx) all_hydrogen_bindRDD.cache() #saving all_bonds_file save_all_bonds_file(path_analysis, distance_cutoff, angle_cutoff, all_hydrogen_bindRDD) #number hydrogen binds of poses number_poseRDD = get_hbonds_number_pose(sqlCtx) number_poseRDD.cache() save_number_pose(path_analysis, distance_cutoff, angle_cutoff, number_poseRDD, all_NOT_hydrogen_bindRDD) #Calculating Normalized Hydrogen Bond #Loading database rdd_database = load_database(sc, ligand_database) #Creating Dataframe database_table = sqlCtx.createDataFrame(rdd_database) database_table.registerTempTable("database") number_pose_ligandRDD = number_poseRDD.map(lambda p: Row(numPose=int(p.numPose), ligand=get_ligand_from_receptor_ligand_model(p.pose), pose=str(p.pose) ) ).collect() number_pose_ligand_table = sqlCtx.createDataFrame(number_pose_ligandRDD) number_pose_ligand_table.registerTempTable("pose_ligand_hb") #Calculating normalized Hydrogen Bond by donors_acceptors sql = """ SELECT pose, (b.numPose / a.hb_donors_acceptors) as normalized_hb FROM 
database a JOIN pose_ligand_hb b ON b.ligand = a.ligand ORDER BY normalized_hb DESC """ #Getting all data full_dataRDD = sqlCtx.sql(sql) #Saving file save_number_pose_normalized_donors_acceptors(path_analysis, distance_cutoff, angle_cutoff, full_dataRDD) #Calculating normalized Hydrogen Bond by heavy atoms sql = """ SELECT pose, (b.numPose / a.heavyAtom) as normalized_hb FROM database a JOIN pose_ligand_hb b ON b.ligand = a.ligand ORDER BY normalized_hb DESC """ #Getting all data full_dataRDD = sqlCtx.sql(sql) #Saving file save_number_pose_normalized_heavyAtom(path_analysis, distance_cutoff, angle_cutoff, full_dataRDD) #number hydrogen binds of ligands # number_ligandRDD = get_hbonds_number_ligand(sc, number_poseRDD, sqlCtx) # save_number_ligand(path_analysis, distance_cutoff, angle_cutoff, number_ligandRDD) #Removing all hydrogen bind files remove_all_hydrogen_files(all_hydrogen_bind) else: save_all_bonds_file_with_mensage(path_analysis, cutoff) finish_time = datetime.now() save_vs_hydrogen_bind_log(finish_time, start_time)
# Dummy Spark App demo from pyspark import SparkContext, SparkConf from pyspark import SparkFiles import numpy as np from barista.customer import Customer conf = SparkConf().setAppName("Dummy Demo") sc = SparkContext(conf=conf) # Add prototxt files to Spark Context sc.addFile("models/solver.prototxt") sc.addFile("models/train_val.prototxt") # Add barista module sc.addPyFile("barista.zip") sc.addPyFile("barista/start.py") # Subclass generic barista Customer class MyCustomer(Customer): def __init__(self, filename): compute_semaphore, model_semaphore, handles = \ Customer.parse_ipc_interface_file(filename) Customer.__init__(self, compute_semaphore, model_semaphore, handles) def update_data(self): self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape) self.arrays['label'][:] = np.random.choice( xrange(10), size=self.arrays['label'].shape)
if __name__ == "__main__": parser = argparse.ArgumentParser( description="compute summary statistics on time series data") parser.add_argument("master", type=str) parser.add_argument("datafile", type=str) parser.add_argument("outputdir", type=str) parser.add_argument("mode", choices=("mean", "median", "std", "norm"), help="which summary statistic") parser.add_argument("--preprocess", choices=("raw", "dff", "dff-highpass", "sub"), default="raw", required=False) args = parser.parse_args() sc = SparkContext(args.master, "stats") if args.master != "local": egg = glob.glob(os.path.join(os.environ['THUNDER_EGG'], "*.egg")) sc.addPyFile(egg[0]) data = load(sc, args.datafile, args.preprocess).cache() vals = stats(data, args.mode) outputdir = args.outputdir + "-stats" save(vals, outputdir, "stats_" + args.mode, "matlab")
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot import plotly import plotly.graph_objs as go import pandas as pd # init sc = SparkContext("local", "Test_PSAML") # Get parent directory of the tests directory parent_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) sys.path.append(os.path.join(parent_dir, "psaml")) import psaml sc.addPyFile(os.path.join(parent_dir, "psaml/psaml.py")) sql_context = SQLContext(sc) # header=false so the columns aren't named after the first row values # inferSchema=true so that data is read in as correct data type, not just strings data = sql_context.read.load( "tests/resources/iris.csv", format="com.databricks.spark.csv", header="false", inferSchema="true" ) # now we create a vector of the input columns so they can be one column ignore = ["C4"] # ignore the output column assembler = VectorAssembler(inputCols=[x for x in data.columns if x not in ignore], outputCol="features") # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous.
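# Hedged continuation of the truncated comment above: apply the assembler, then a
# VectorIndexer with maxCategories=4 as described. "indexedFeatures" is an assumed
# output column name; everything else follows the snippet.
assembled = assembler.transform(data)
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                maxCategories=4)
indexed = feature_indexer.fit(assembled).transform(assembled)
indexed.select("features", "indexedFeatures").show(5)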
# Top stories and map data over time for tag in tags: analysis.top_stories(df_full, df_sub, context, tag) analysis.top_stories(df_full, df_sub, context, tag, 10) map_wrap_to_pandas(df_full, context, tag) # Scatter, sentiment, map data for t in [["demP", "demN"], ["gopP", "gopN"], ["djtP", "djtN"]]: map_wrap_to_pandas(df_full, context, t[0], t[1]) analysis.sentiment_over_time(df_full, context, t[0], t[1]) analysis.scatter(df_full, df_sub, context, t[0], t[1], 1) analysis.scatter(df_full, df_sub, context, t[0], t[1], 100) # Total Republican Scatter analysis.total_scatter(df_full, df_sub, context) # ---------------- END ---------------- if __name__ == "__main__": conf = SparkConf().setAppName("CS143 Project 2B") conf = conf.setMaster("local[*]") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) sc.addPyFile("cleantext.py") sc.setLogLevel("ERROR") main(sqlContext)
def main(): config = configparser.ConfigParser() config.read('config.ini') #Number of poses to select by buried area number_poses_to_select_buried_area = int(config.get('DRUGDESIGN', 'number_poses_to_select_buried_area') ) # list of residues to select buried area file_select_buried_area = config.get('DRUGDESIGN', 'file_residue_to_select_buried_area') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #File for saving the filtered buried area result_file_to_select_buried_area = config.get('DRUGDESIGN', 'result_file_to_select_buried_area') #File for saving the filtered buried area only poses result_file_to_select_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_buried_area_only_pose') #Path where all pdb receptor are path_receptor = config.get('DEFAULT', 'pdb_path') #Path for saving pdb files of models generated by VS path_ligand = get_directory_pdb_analysis(path_analysis) #Path where saved the selected compelex path_to_save = os.path.join("selected_complexo", "buried_area_residue") path_to_save = os.path.join(path_analysis, path_to_save) if not os.path.exists(path_to_save): os.makedirs(path_to_save) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) sqlCtx = SQLContext(sc) start_time = datetime.now() #Broadcast path_to_save_b = sc.broadcast(path_to_save) path_receptor_b = sc.broadcast(path_receptor) path_ligand_b = sc.broadcast(path_ligand) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py")) #load all-residue_buried_areas.dat file path_file_buried_area = os.path.join(path_analysis, "all-residue_buried_areas.dat") all_residue = sc.textFile(path_file_buried_area) header = all_residue.first() #extract header #Spliting file by \t all_residue_split = all_residue.filter(lambda x:x !=header).map(lambda line: line.split("\t")) all_residue_split = all_residue_split.map(lambda p: Row( residue=str(p[0]), buried_area_residue=float(p[1]), residue_sasa_buried_perc=float(p[2]), pose=str(p[3]) )) #Creating all_residue Dataframe df_all_residue = sqlCtx.createDataFrame(all_residue_split) df_all_residue.registerTempTable("all_residue") #Creating resudue list as Dataframe residue_list = sc.textFile(file_select_buried_area) header = residue_list.first() #extract header #Spliting file by \t residue_listRDD = residue_list.filter(lambda x:x !=header).map(lambda line: line) residue_listRDD = residue_listRDD.map(lambda p: Row( residue=str(p).strip() )) df_residue_list = sqlCtx.createDataFrame(residue_listRDD) df_residue_list.registerTempTable("residue_list") #Getting all information based on list of residues sql = """ SELECT all_residue.* FROM all_residue JOIN residue_list ON residue_list.residue = all_residue.residue """ df_result = sqlCtx.sql(sql) df_result.registerTempTable("residues_filtered_by_list") #Saving result path_file_result_file = os.path.join(path_analysis, result_file_to_select_buried_area) save_result(path_file_result_file, df_result) #Grouping sql = """ SELECT pose, count(*) as num_res FROM residues_filtered_by_list GROUP BY pose ORDER BY num_res DESC """ df_result = sqlCtx.sql(sql) #Saving result only pose path_file_result_file_only_pose = os.path.join(path_analysis, 
result_file_to_select_buried_area_only_pose) save_result_only_pose(path_file_result_file_only_pose, df_result) #Loading poses only_poseRDD = sc.textFile(path_file_result_file_only_pose) header = only_poseRDD.first() #extract header #Spliting file by \t only_poseRDD = only_poseRDD.filter(lambda x:x !=header).map(lambda line: line.split("\t")) only_poseRDD = only_poseRDD.map(lambda p: Row( pose=str(p[0]).strip(), num_res=int(str(p[1]).strip() ) )) only_pose_takeRDD = only_poseRDD.take(number_poses_to_select_buried_area) # ******************** STARTED FUNCTION ******************************** def build_complex_from_pose_file_name(p_name): from vina_utils import get_receptor_from_receptor_ligand_model, get_ligand_from_receptor_ligand_model, get_model_from_receptor_ligand_model, get_separator_filename_mode #Broadcast path_to_save = path_to_save_b.value path_receptor = path_receptor_b.value path_ligand = path_ligand_b.value #Based on row value from dataframe pose_file_name = p_name.pose #Receptor receptor_file_name = get_receptor_from_receptor_ligand_model(pose_file_name) receptor_file = os.path.join(path_receptor, receptor_file_name+".pdb") f_receptor_file = open(receptor_file,"r") #ligand file name ligand_file_name = os.path.join(path_ligand, pose_file_name+".pdb") f_ligand_file_name = open(ligand_file_name,"r") #Open file for writting the complex full_path_for_save_complex = os.path.join(path_to_save, pose_file_name+".pdb") f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in f_receptor_file: f_compl.write(item) #Insert lines of model for item in f_ligand_file_name: f_compl.write(item) #Closing files f_compl.close() f_ligand_file_name.close() f_receptor_file.close() # ******************** FINISHED FUNCTION ******************************** sc.parallelize(only_pose_takeRDD).foreach(build_complex_from_pose_file_name) finish_time = datetime.now() save_log(finish_time, start_time)
from __future__ import print_function from pyspark import SparkContext import csv from label import * def save_label_count(rdd, col_index, count_file_pattern, basic_type, semantic_type, label_func): """ Group by label column and count and save """ rdd.map(lambda row: (label_func(row[col_index - 7].strip()), 1)) \ .reduceByKey(lambda x, y: x + y) \ .map(lambda row: '%s,%s,%s,%d' % (basic_type, semantic_type, row[0], row[1])) \ .coalesce(1) \ .saveAsTextFile(count_file_pattern.format(col_index)) sc = SparkContext() sc.addPyFile('label.py') data = sc.textFile('./NYPD_Complaint_Data_Historic.csv') # Header header = data.first() # Extract column 7 - 13 rdd = data.filter(lambda row: row != header) \ .mapPartitions(lambda row: csv.reader(row)) \ .map(lambda row: (row[6], row[7], row[8], row[9], row[10], row[11], row[12])).cache() count_file_pattern = 'result/label_count/col{}.out' # Save for each row a tuple (basic type, semantic type, label, count) for each column ## Aggregate and count the label for each column. #Each row is assigned the count (last column) for the label (second last column): (basic_type, semantic_type, label, count)
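# Hedged sketch of driving save_label_count for the first extracted column (col_index 7).
# The real label helpers live in label.py (shipped above but not shown), so valid_or_null
# here is only a placeholder, as are the basic/semantic type strings.
def valid_or_null(value):
    return 'NULL' if value == '' else 'VALID'

save_label_count(rdd, 7, count_file_pattern,
                 basic_type='TEXT', semantic_type='complaint date',
                 label_func=valid_or_null)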
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot import plotly import plotly.graph_objs as go import pandas as pd # init sc = SparkContext('local', 'PSAML_Titanic') # Get parent directory of the tests directory parent_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) sys.path.append(os.path.join(parent_dir, 'psaml')) import psaml sc.addPyFile(os.path.join(parent_dir, 'psaml/psaml.py')) sql_context = SQLContext(sc) # header=false so the columns aren't named after the first row values # inferSchema=true so that data is read in as correct data type, not just strings data = sql_context.read.load('tests/resources/titanic/train.csv', format='com.databricks.spark.csv', header='true', inferSchema='true') # now we create a vector of the input columns so they can be one column ignore = ['Survived', 'Name', 'Ticket', 'Cabin'] # ignore the output column and nonquantifiable data assembler = VectorAssembler(inputCols=[x for x in data.columns if x not in ignore], outputCol='features') # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. # (maxCategories is not set at the moment, however) # feature_indexer = VectorIndexer(inputCol="features", outputCol="indexed")
def main(): sc = SparkContext() sqlCtx = SQLContext(sc) config = configparser.ConfigParser() config.read('config.ini') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Ligand Database file ligand_database = config.get('DEFAULT', 'ligand_database_path_file') #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') #Adding Python Source file sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_crud.py")) #Sufix of completly data file full_data_file_name = config.get('DRUGDESIGN', 'full_data_file_name') start_time = datetime.now() #**************** Loading file that contains all scores and ligand efficiency score_file_name = os.path.join(path_analysis, "summary_energies.dat") text_file = sc.textFile(score_file_name) header = text_file.first() #extract header #Spliting score file by \t rdd_vs_score_sorted_split = text_file.filter(lambda x: x != header).map( lambda line: line.split("\t")) #rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), energy=float(p[3]) )) rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row( affinity=float(p[0]), ligand_efficiency=float(p[1]), pose=str(p[2]))) #Creating Vina Datafrase based on score file vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted) vina_table.registerTempTable("vina_lig_efficiency") #**************** Finish #**************** Loading Ligand Database rdd_database = load_database(sc, ligand_database) #Creating Dataframe database_table = sqlCtx.createDataFrame(rdd_database) database_table.registerTempTable("database") #**************** Finish #**************** Loading Buried Area total buried_area_file_name = os.path.join(path_analysis, "summary_buried_areas_total.dat") buried_area_file = sc.textFile(buried_area_file_name) #Spliting file by \t header = buried_area_file.first() #extract header rdd_buried_area_split = buried_area_file.filter(lambda x: x != header).map( lambda line: line.split("\t")) #rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), buried_lig_rec=float(p[3]), buried_lig_rec_perc=float(p[4]), buried_lig_lig_perc=float(p[5]) )) rdd_buried_area = rdd_buried_area_split.map( lambda p: Row(buried_area_total=float(p[0]), pose=str(p[1]))) #Creating buried Dataframe buried_table = sqlCtx.createDataFrame(rdd_buried_area) buried_table.registerTempTable("buriedArea_total") #**************** Finish #**************** Loading Buried Area receptor buried_area_file_name = os.path.join(path_analysis, "summary_buried_areas_receptor.dat") buried_area_file_receptor = sc.textFile(buried_area_file_name) header = buried_area_file_receptor.first() #extract header #Spliting file by \t buried_area_file_receptor_split = buried_area_file_receptor.filter( lambda x: x != header).map(lambda line: line.split("\t")) buried_area_file_receptor = buried_area_file_receptor_split.map( lambda p: Row(buried_area_receptor=float(p[0]), pose=str(p[1]))) #Creating buried Dataframe buried_area_file_receptor_table = sqlCtx.createDataFrame( buried_area_file_receptor) buried_area_file_receptor_table.registerTempTable("buried_area_receptor") #**************** Finish #**************** Loading Buried Area ligand 
buried_area_file_name = os.path.join(path_analysis, "summary_buried_area_ligand.dat") buried_area_file_ligand = sc.textFile(buried_area_file_name) header = buried_area_file_ligand.first() #extract header #Spliting file by \t buried_area_file_ligand_split = buried_area_file_ligand.filter( lambda x: x != header).map(lambda line: line.split("\t")) buried_area_file_ligand = buried_area_file_ligand_split.map( lambda p: Row(buried_area_lig=float(p[0]), buried_area_lig_perc=float(p[1]), buried_area_lig_lig_perc=float(p[2]), pose=str(p[3]))) #Creating buried Dataframe buried_area_file_ligand_table = sqlCtx.createDataFrame( buried_area_file_ligand) buried_area_file_ligand_table.registerTempTable("buried_area_ligand") #**************** Finish #**************** Loading Hydrogen Bond hydrogen_bond_num_pose_file_name = os.path.join( path_analysis, "summary_hbonds_4.0A_30.0deg.dat") rdd_hydrogen_bond = load_file_summary_hbonds( sc, hydrogen_bond_num_pose_file_name) #Creating buried Dataframe hydrogen_bond_table = create_df_hydrogen_bond(sqlCtx, rdd_hydrogen_bond) #**************** Finish #Creating SQL command sql = "" sql = "SELECT vina_lig_efficiency.pose, vina_lig_efficiency.affinity, vina_lig_efficiency.ligand_efficiency" sql += " ,buriedArea_total.buried_area_total" sql += " ,buried_area_receptor.buried_area_receptor" sql += " ,buried_area_ligand.buried_area_lig, buried_area_ligand.buried_area_lig_perc, buried_area_ligand.buried_area_lig_lig_perc " sql += " ,hydrogenbond.numHydroBond " sql += " FROM vina_lig_efficiency" sql += " JOIN buriedArea_total ON buriedArea_total.pose = vina_lig_efficiency.pose" sql += " JOIN buried_area_receptor ON buried_area_receptor.pose = vina_lig_efficiency.pose" sql += " JOIN buried_area_ligand ON buried_area_ligand.pose = vina_lig_efficiency.pose" sql += " LEFT OUTER " sql += " JOIN hydrogenbond ON hydrogenbond.pose = vina_lig_efficiency.pose" sql += " ORDER BY vina_lig_efficiency.pose" #Getting all data full_dataRDD = sqlCtx.sql(sql) full_dataRDD = full_dataRDD.map(lambda p: ( p.affinity, p.ligand_efficiency, p.numHydroBond, p.buried_area_lig, p. buried_area_lig_perc, p.buried_area_lig_lig_perc, p.buried_area_total, p.buried_area_receptor, p.pose)).collect() #Saving file save_vs_full_data(path_analysis, full_dataRDD, full_data_file_name) finish_time = datetime.now() save_vs_full_data_analysis_log(finish_time, start_time)
class CommonSparkContext(object): def __init__(self): """ Create a spark context. The spark configuration is taken from $XPATTERNS_HOME/config.ini or from the values set in SparkInitContext.set(). Notes ----- cluster_url : str, optional The url of the spark cluster to use. To use the local spark, give 'local'. To use a spark cluster with its master on a specific IP addredd, give the IP address or the hostname as in the following example: cluster_url=spark://my_spark_host:7077 app_name : str, optional The app name is used on the job monitoring server, and for logging. cores_max : str, optional The maximum number of cores to use for execution. executor_memory : str, optional The amount of main memory to allocate to executors. For example, '2g'. """ env = Environment.create_default() config_context = {'cluster_url': env.get_config('spark', 'cluster_url', default='local'), 'cores_max': env.get_config('spark', 'cores_max', default='8'), 'executor_memory': env.get_config('spark', 'executor_memory', default='8g'), 'app_name': env.get_config('spark', 'app_name', 'xFrame')} config_context.update(SparkInitContext.context) config_pairs = [(k, v) for k, v in config_context.iteritems()] conf = (SparkConf().setAll(config_pairs)) self._sc = SparkContext(conf=conf) self._sqlc = SQLContext(self._sc) self.zip_path = self.build_zip() if self.zip_path: self._sc.addPyFile(self.zip_path) atexit.register(self.close_context) def close_context(self): if self._sc: self._sc.stop() self._sc = None if self.zip_path: os.remove(self.zip_path) def sc(self): return self._sc def sqlc(self): return self._sqlc @staticmethod def build_zip(): if 'XPATTERNS_HOME' not in os.environ: return None # This can fail at writepy if there is something wrong with the files # in xpatterns. Go ahead anyway, but things will probably fail of this job is # distributed try: tf = NamedTemporaryFile(suffix='.zip', delete=False) z = PyZipFile(tf, 'w') z.writepy(os.path.join(os.environ['XPATTERNS_HOME'], 'xpatterns')) z.close() return tf.name except: print 'Zip file distribution failed -- workers will not get xpatterns code.' print 'Check for unexpected files in XPATTERNS_HOME/xpatterns.' return None
import datetime

from pyspark import SparkContext

from myUtils import *
from validation_utils import *


def getDateHour(date_text):
    given_date = datetime.datetime.strptime(date_text, '%Y-%m-%d %H:%M:%S')
    year = given_date.year
    month = given_date.month
    hour = given_date.hour
    day_of_the_week = given_date.isoweekday()
    return (given_date.date(), hour, "Valid")


sc = SparkContext()
sc.addPyFile("myUtils.py")
sc.addPyFile("validation_utils.py")

(taxi_data, prefix) = readFiles2({2016: range(1, 7), 2015: range(1, 13),
                                  2014: range(1, 13), 2013: range(1, 13)}, sc)

field = taxi_data.map(lambda entry: (entry[1], checkPickUpDateValid(entry[1])))
filtered_valid_records = field.filter(lambda x: x[1] == "Valid") \
                              .map(lambda x: getDateHour(x[0])) \
                              .map(lambda x: (str(x[0]) + "\t" + str(x[1]), 1)) \
                              .reduceByKey(lambda x, y: x + y)
tabSeparated = filtered_valid_records.map(lambda x: x[0] + "\t" + str(x[1]))
tabSeparated.saveAsTextFile("pickup_date_and_time_frequency.out")
sc.stop()
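checkPickUpDateValid is shipped to the workers from validation_utils.py via addPyFile and is not shown here. A hypothetical sketch of what such a validator might look like, kept deliberately minimal:

# Hypothetical sketch of a pickup-date validator; the real checkPickUpDateValid
# lives in validation_utils.py and may do more than this.
import datetime

def check_pickup_date_valid(date_text):
    """Return "Valid" if date_text parses as 'YYYY-MM-DD HH:MM:SS', else "Invalid"."""
    try:
        datetime.datetime.strptime(date_text, '%Y-%m-%d %H:%M:%S')
        return "Valid"
    except ValueError:
        return "Invalid"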
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def print_RDD_contents(rdd):
    """
    Print the contents of an RDD on the driver (debugging helper).

    Example usage:
    digests.foreachRDD(print_RDD_contents)
    """
    for x in rdd.collect():
        print x


##########################################
# Spark job
##########################################

# set Spark context
sc = SparkContext(appName="Latency")
sc.setLogLevel("WARN")
sc.addPyFile("../tdigest/tdigest_altered.py")

# import custom tdigest class
from tdigest_altered import TDigest

# microbatch_size, topic, and brokers are assumed to be defined earlier
# (e.g., read from a configuration file).
ssc = StreamingContext(sc, microbatch_size)

# create D-Stream from Kafka topic
kafka_stream = KafkaUtils.createDirectStream(ssc, [topic],
                                             {"metadata.broker.list": brokers})

# extract latency data (combined across devices)
# json schema: {u'device': u'type2', u'latency': 2.487, u'message_num': 189}
latencies = kafka_stream.map(lambda row: row[1])\
                        .map(json.loads)\
                        .map(lambda x: x["latency"])

# compute tdigest of each partition and write to redis
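The snippet ends with the comment announcing the per-partition digest and the Redis write but not the code itself. A minimal sketch of what that step might look like, assuming the altered TDigest class exposes update()/percentile() like the upstream tdigest package, that the redis-py client is installed, and that redis_host and redis_port are configured elsewhere:

# Sketch only: one digest per partition, 99th percentile pushed to Redis per microbatch.
import redis

def digest_partition(iterator):
    digest = TDigest()
    saw_data = False
    for latency in iterator:
        digest.update(latency)
        saw_data = True
    if saw_data:
        yield digest

def write_digests_to_redis(rdd):
    r = redis.StrictRedis(host=redis_host, port=redis_port)  # assumed connection settings
    for digest in rdd.collect():
        r.rpush("latency_p99", digest.percentile(99))

digests = latencies.mapPartitions(digest_partition)
digests.foreachRDD(write_digests_to_redis)

ssc.start()
ssc.awaitTermination()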
class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config('xframes', 'verbose', 'false').lower() == 'true'
        hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
        os.environ['HADOOP_USER_NAME'] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = SparkConf().setAll(config_pairs)
        if verbose:
            print 'Spark Config: {}'.format(config_pairs)
        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split('.')]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None
        if verbose:
            print 'Spark Version: {}'.format(self._sc.version)
            if self.application_id:
                print 'Application Id: {}'.format(self.application_id)
        if not context['spark.master'].startswith('local'):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)
        trace_flag = self._env.get_config('xframes', 'rdd-trace', 'false').lower() == 'true'
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs: str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.
            The python files in each directory are compiled, packed into a zip,
            distributed to each spark slave, and placed in PYTHONPATH.
            This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get('spark.master', 'local').startswith('local'):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
        for zip_path in self.zip_path:
            os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """
        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out: list[int]
            The spark version, as a list of integers.
        """
        return [int(n) for n in self._sc.version.split('.')]

    def jobs(self):
        """
        Get the spark job ID and info for the active jobs.

        This method would normally be called by another thread from the executing job.

        Returns
        -------
        out: dict {job_id: job_info}
            A map of the active job IDs and their corresponding job info.
        """
        # Note: PySpark's StatusTracker spells this method getActiveJobsIds.
        return {job_id: self.status_tracker.getJobInfo(job_id)
                for job_id in self.status_tracker.getActiveJobsIds()}

    def cluster_mode(self):
        """
        Get the cluster mode of the spark cluster.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is
            running on a platform separate from the program.  In practice, cluster mode
            means that file arguments must be located on a network filesystem such as
            HDFS or NFS.
        """
        return not self._config.get('spark.master').startswith('local')

    # noinspection PyBroadException
    @staticmethod
    def build_zip(module_dir):
        # This can fail at writepy if there is something wrong with the files
        # in xframes.  Go ahead anyway, but things will probably fail if this
        # job is distributed.
        try:
            tf = NamedTemporaryFile(suffix='.zip', delete=False)
            z = PyZipFile(tf, 'w')
            z.writepy(module_dir)
            z.close()
            return tf.name
        except:
            logging.warn('Zip file distribution failed -- workers will not get xframes code.')
            logging.warn('Check for unexpected files in xframes directory.')
            return None

    @staticmethod
    def spark_context():
        """
        Returns the spark context.

        Returns
        -------
        out : pyspark.SparkContext
            The SparkContext object from spark.
        """
        return CommonSparkContext().sc()

    @staticmethod
    def spark_config():
        """
        Returns the spark config parameters.

        Returns
        -------
        out : dict
            A dict of the key-value pairs used to initialize the spark context.
        """
        return CommonSparkContext().config()

    @staticmethod
    def spark_sql_context():
        """
        Returns the spark sql context.

        Returns
        -------
        out : pyspark.sql.SQLContext
            The SQLContext object from spark.
        """
        return CommonSparkContext().sqlc()

    @staticmethod
    def hive_context():
        """
        Returns the hive context.

        Returns
        -------
        out : pyspark.sql.HiveContext
            The HiveContext object from spark.
        """
        return CommonSparkContext().hivec()

    @staticmethod
    def spark_version():
        """
        Gets the spark version.

        Returns
        -------
        out: list[int]
            The spark version, as a list of integers.
        """
        return CommonSparkContext().version()

    @staticmethod
    def spark_cluster_mode():
        """
        Gets the cluster mode.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is
            running on a platform separate from the program.  In practice, cluster mode
            means that file arguments must be located on a network filesystem such as
            HDFS or NFS.
        """
        env = Environment.create()
        config = create_spark_config(env)
        return not config.get('spark.master').startswith('local')
from __future__ import print_function

import sys

from pyspark import SparkContext

if __name__ == "__main__":
    print("Hello")
    sc = SparkContext()
    sc.addPyFile("classes.zip")
    from HelperTransformations import HelperTransformations

    text_file = sc.textFile('s3://torstar-datateam-workspace/data/raw/samples/textfile')
    counts = text_file.map(lambda x: HelperTransformations.removeStringSpecialCharacters(x)) \
                      .flatMap(lambda line: line.split(" ")) \
                      .map(lambda word: (word, 1)) \
                      .reduceByKey(lambda a, b: a + b)
    print(counts.take(5))
    counts.saveAsTextFile('s3://torstar-datateam-workspace/data/transformed/samples/textfile_output')
    sc.stop()
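HelperTransformations is shipped to the workers inside classes.zip and is not shown here. A hypothetical sketch of what the helper used above might look like:

# Hypothetical sketch; the real HelperTransformations class in classes.zip may differ.
import re

class HelperTransformations(object):
    @staticmethod
    def removeStringSpecialCharacters(text):
        """Drop every character that is not alphanumeric or whitespace."""
        return re.sub(r'[^A-Za-z0-9\s]', '', text)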
from .ProteinModels import xSeriesWLCe
from collections import OrderedDict
from multiprocessing import Pool
from functools import partial

import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext

# create a spark context
conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
            .set('spark.executor.memory', '2G')
            .set('spark.driver.memory', '8G')
            .set('spark.driver.maxResultSize', '15G'))
sc = SparkContext(conf=conf)

### NOTE: you will need to re-create this zip file every time you want to run this code.
### This is super annoying. We should look into using Dask instead of Spark.
print('NOTE: you will need to re-create this zip file every time you want to run this code. '
      'This is super annoying. We should look into using Dask instead of Spark.')
sc.addPyFile("/home/tbartsch/source/repos/single_molecule_mechanics.zip")


class TimeSeriesLoader(object):
    '''Provides data structures and methods to analyze single-molecule data.'''

    def __init__(self):
        # define some default values
        data = np.empty((3, 2))
        data.fill(np.nan)
        self.properties_mephisto = pd.DataFrame(data, columns=['k', 'tau'])

        data = np.empty((6, 3))
        data.fill(np.nan)
        self.nonlin_correction = pd.DataFrame(data, columns=['coeff_x', 'coeff_y', 'coeff_z'])
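The NOTE above complains about rebuilding the zip by hand before every run. A minimal sketch of how that zip could be rebuilt programmatically before calling addPyFile, assuming the package source lives next to the hard-coded zip path; build_module_zip is a hypothetical helper, not part of the original code:

# Sketch only: rebuild the module zip that addPyFile() ships to the workers.
import os
import zipfile

def build_module_zip(module_dir, zip_path):
    """Pack every .py file under module_dir into zip_path, preserving the package name."""
    with zipfile.ZipFile(zip_path, 'w') as zf:
        for root, _, files in os.walk(module_dir):
            for name in files:
                if name.endswith('.py'):
                    full = os.path.join(root, name)
                    zf.write(full, os.path.relpath(full, os.path.dirname(module_dir)))

# Example (paths assumed from the hard-coded addPyFile call above):
# build_module_zip('/home/tbartsch/source/repos/single_molecule_mechanics',
#                  '/home/tbartsch/source/repos/single_molecule_mechanics.zip')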