Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    # Configure Spark
    conf = SparkConf().setAppName("Citi-AddPyFile")
    sc = SparkContext(conf=conf)
    hiveContext = HiveContext(sc)
    # Ship the additional Python module(s) this job needs to the executors
    sc.addPyFile("hdfs:///data_lake/Spark_Citi/lib/utileriasCiti.py")
    from utileriasCiti import GuardaMiTabla

    # Create a DataFrame from a JSON file
    fileName = "/data_lake/Spark_Citi/config/parametros_citi_destinos.json"
    data = hiveContext.read.format("json").option('encoding', 'UTF-8').load(fileName)
    data.show(100, truncate=False)
Example #2
def main():
    # init
    args = parse_args()
    sc = SparkContext(conf=SparkConf().setAppName(args.app_name))
    sc.setLogLevel("WARN")
    hiveContext = HiveContext(sc)

    # read csv data
    data_df = hiveContext.read.csv(args.path, header=True, inferSchema=True)

    # args.destination is expected to be the string form of a dict
    dest_dict = eval(args.destination)
    assert isinstance(dest_dict, dict), "please input str(dict)"

    save_data.save_df(data_df, dest_dict)
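A safer variant of the destination parsing above, shown as a hedged substitution rather than what the original uses: ast.literal_eval only accepts Python literals, so it cannot execute arbitrary expressions the way eval can.

import ast

def parse_destination(raw):
    """Parse a str(dict) command-line argument without eval."""
    dest = ast.literal_eval(raw)
    assert isinstance(dest, dict), "please input str(dict)"
    return dest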
    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config('xframes', 'verbose',
                                       'false').lower() == 'true'
        hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
        os.environ['HADOOP_USER_NAME'] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = (SparkConf().setAll(config_pairs))
        if verbose:
            print 'Spark Config: {}'.format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split('.')]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print 'Spark Version: {}'.format(self._sc.version)
            if self.application_id:
                print 'Application Id: {}'.format(self.application_id)

        if not context['spark.master'].startswith('local'):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config('xframes', 'rdd-trace',
                                          'false').lower() == 'true'
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)
Example #4
import os
import shutil

import pyspark
from pyspark.sql import HiveContext, SQLContext


def sql(sc):
    try:
        if hasattr(pyspark.sql, 'types'):  # pyspark >= 1.3
            yield HiveContext(sc)
        else:
            yield SQLContext(sc)
    finally:
        # clean up the local Derby metastore files created by HiveContext
        dbpath = 'metastore_db'
        logpath = 'derby.log'
        if os.path.exists(dbpath):
            assert os.path.isdir(dbpath)
            shutil.rmtree(dbpath)
        if os.path.exists(logpath):
            assert os.path.isfile(logpath)
            os.remove(logpath)
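In its original project the generator above is presumably wrapped as a context manager or test fixture; a minimal usage sketch, assuming contextlib (the decorator itself is not part of the snippet):

from contextlib import contextmanager

from pyspark import SparkContext

sql_context = contextmanager(sql)

sc = SparkContext(appName="hive-context-demo")
with sql_context(sc) as ctx:
    ctx.sql("SELECT 1").show()
# on exit, the finally block removes metastore_db/ and derby.log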
Example #5
def process(sc):
    hiveContext = HiveContext(sc)
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1: -2]    
    feature_num = len(columns)
    # type
    #df_tmp = df_raw
    #for k, i in zip(columns, range(feature_num)):
    #    df_tmp = df_tmp.withColumn(k, df_tmp[i + 1] * 1.0)
    # Imputer: fill nulls with the per-column mean
    # (row 1 of describe() is the mean; its values are strings, so cast before filling)
    mean_value = df_raw.describe().collect()[1]
    print mean_value
    df_train = df_raw
    for k, i in zip(columns, range(feature_num)):
        df_train = df_train.na.fill({k: float(mean_value[i + 1])})
    # minmax
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    model = mmScaler.fit(df_b_s)
    df_scaled = model.transform(df_b_s)
    # kmeans
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300, featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)
    # map
    global sensitivity_1, sensitivity_3
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    key_cnt = []
    centers = model.clusterCenters()
    for xx, yy in zip(centers, range(n_clusters_)):
        key_cnt.append([yy, xx[0]])
    sorted_cluster = sorted(key_cnt, key=lambda asd: asd[1])
    split = n_clusters_ / 3
    split_end = n_clusters_ - split
    for xx, yy in zip(sorted_cluster, range(n_clusters_)):
        if yy < split:
            sensitivity_3.append(xx[0])
        elif yy >= split_end:
            sensitivity_1.append(xx[0])
        else:
            sensitivity_2.append(xx[0])
    #result
    df_result.map(result_process).saveAsTextFile("kmeans_cluster_result/pt=%s/" % (pt))
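A small follow-up check, not in the original: inside process(), after df_result is computed, the per-cluster row counts can be inspected through the default 'prediction' column that KMeans adds.

    df_result.groupBy('prediction').count().orderBy('prediction').show(n_clusters_)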
Example #6
def load_dataset():
    '''Return Real Telco customers and labels.'''
    #df = pd.read_excel(ibmxlsxpath)

    conf = SparkConf().setAppName("Telco Churn IRL")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    df = sqlContext.sql("select * from jfletcher.churn_test_3").toPandas()

    df = drop_missing(df).reset_index()
    df.index.name = 'id'
    features, labels = utils.splitdf(df, labelcol)
    features = booleanize_senior_citizen(features)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    labels = (labels == 'Yes')
    return features, labels
Example #7
def run_hive():
    dic = [{'id': '1,2,3'}]
    # df = pd.DataFrame(dic)
    sc = SparkContext()
    # sc.parallelize(dic)

    sql_ctx = HiveContext(sc)
    # sql_ctx.registerDataFrameAsTable(df, "aaa")

    sdf = sql_ctx.createDataFrame(dic)
    sdf.registerTempTable('aaa')

    # sdf.show()

    # explode() cannot be nested inside another expression,
    # so expose the split values through a lateral view first
    df2 = sql_ctx.sql(
        'select collect_list(cast(id_part as string)) '
        'from aaa lateral view explode(split(id, ",")) t as id_part'
    )
    df2.show()
def get_sqlContext():
    import sys
    import os

    try:
        sc.stop()
    except:
        pass

    spark_home = '/opt/cloudera/parcels/CDH/lib/spark/'

    os.environ['SPARK_HOME'] = spark_home

    sys.path.insert(0, os.path.join(spark_home, 'python'))
    sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import HiveContext

    conf = SparkConf().setAppName('drops_finding')\
                      .setMaster('yarn-client')\
                      .setExecutorEnv('PATH', os.environ['PATH'])\
                      .set('spark.executor.cores', '5')\
                      .set('spark.executor.memory', '25g')\
                      .set('spark.driver.cores', '5')\
                      .set('spark.driver.memory', '25g')\
                      .set('spark.yarn.driver.memoryOverhead', '4096')\
                      .set('spark.yarn.executor.memoryOverhead', '4096')\
                      .set('spark.kryoserializer.buffer.max', '2047')\
                      .set('spark.driver.maxResultSize', '8g')\
                      .set('spark.dynamicAllocation.enabled', 'true')\
                      .set('spark.dynamicAllocation.minExecutors', '10')\
                      .set('spark.dynamicAllocation.maxExecutors', '16')\
                      .set('spark.dynamicAllocation.initialExecutors', '10')\
                      .set('spark.dynamicAllocation.executorIdleTimeout', '60s')\
                      .set('spark.dynamicAllocation.schedulerBacklogTimeout', '5s')\
                      .set('spark.dynamicAllocation.sustainedSchedulerBacklogTimeout', '5s')

    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    return sc, sqlContext
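A hypothetical call site for the helper above (the table name is illustrative):

sc, sqlContext = get_sqlContext()
df = sqlContext.sql("select * from some_db.some_table limit 10")
df.show()
sc.stop()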
Example #9
def process_sql():
    filename = '/Users/baoqiang/Downloads/1.txt'

    sc = SparkContext()
    sql_ctx = HiveContext(sc)
    df = sql_ctx.read.json(filename)

    # keywords = ["小包", "小钰"]
    # keywords = ['"{}"'.format(keyword) for keyword in keywords]
    # df = df.where('score > 5 or keyword in ({})'.format(', '.join(keywords)))

    token = '2ZDMkVAQVjN'
    # df = df.where('token = "{}" and get_json_object(share_data, "$.[0].ShareCategory") = 2'.format(token))
    # df = df.where('token = "{}"'.format(token))
    # df.show()

    df.registerTempTable("events")
    q1 = 'SELECT get_json_object(share_data, "$.[0].ShareCategory"),token FROM events where token = "{}"'.format(
        token)

    res = sql_ctx.sql(q1)

    res.show()
Example #10
def process_sql_sample():
    filename = '/Users/baoqiang/Downloads/3.txt'

    sc = SparkContext()
    sql_ctx = HiveContext(sc)
    df = sql_ctx.read.json(filename)

    df.registerTempTable("events")
    # q1 = 'SELECT get_json_object(students, "$.[0].name") as name,* FROM events' \
    #      'lateral view explode(split(userl_ids,"[[[")) snTable as user_id  where id = {}'.format(1)

    q1 = "select explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) as student from events"

    q2 = "select id,concat('{',student,'}') as entities from (select * from events) a " \
         "lateral view explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) b as student"

    q3 = "select id,get_json_object(entities,'$.name') as name from (" \
         "select id,concat('{',student,'}') as entities from (select * from events) a " \
         "lateral view explode(split(substring(students,3,length(students)-4),'\\\\},\\\\{')) b as student )"

    res = sql_ctx.sql(q3)

    res.show()
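A hypothetical record that makes explicit what q2/q3 above assume: `students` holds a JSON-array string, so substring(students, 3, length(students)-4) strips the leading '[{' and trailing '}]', the split on '},{' separates the elements, and concat('{', student, '}') rebuilds each object before get_json_object pulls out the name.

sample = {"id": 1, "students": '[{"name":"Ann"},{"name":"Bob"}]'}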
Example #11
    def __init__(self):
        self.localClusterURL = "local[2]"
        self.clusterMasterURL = "spark://Master:7077"
        self.conf = SparkConf().setAppName('ELT').setMaster(
            self.localClusterURL)
        self.sc = SparkContext.getOrCreate(self.conf)
        self.sqlContext = SQLContext(self.sc)
        self.hc = HiveContext(self.sc)

        self.jdbcURL = "jdbc:mysql://Master:3306/recommend?useUnicode=true&characterEncoding=utf-8&useSSL=false"

        self.prop = {
            'driver': 'com.mysql.jdbc.Driver',
            'user': '******',
            'password': '******'
        }
        # HDFS locations of the users/ratings/links/tags files, i.e. where the raw recommendation inputs are stored
        self.hdfs_data_path = 'hdfs://Master:9000/movie/data/'
        self.movies_path = self.hdfs_data_path + 'movies.txt'
        self.ratings_path = self.hdfs_data_path + 'ratings.txt'
        self.links_path = self.hdfs_data_path + 'links.txt'
        self.tags_path = self.hdfs_data_path + 'tags.txt'

        # MySQL tables that hold the various result datasets
        self.default5Table = 'MovieSizer.operation_default5recommend'
        self.top5Table = 'MovieSizer.oertion_top5recomm'

        self.alsTable = 'MovieSizer.movies_alsTab'
        self.similarTable = 'MovieSizer.movies_movidesimilar'
        self.usesrTable = 'MovieSizer.usesr_userprofile'
        self.ratingTable = 'MovieSizer.operation_rating'

        self.movieTab = 'MovieSizer.movies_movieinfo'
        self.tagTab = 'MovieSizer.movies_movieinfo_typelist'

        # The number of RDD partitions is best set to an integer multiple of the CPU cores allocated to the application.
        self.minPartitions = 8
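The fields above are only wired together elsewhere in the class; two hypothetical helpers (not in the original snippet) sketch how the HDFS paths, the partition hint, and the JDBC settings might be combined, taking the ETL object as a parameter:

def load_ratings(etl):
    # honour the partition guideline noted above
    return etl.sc.textFile(etl.ratings_path, minPartitions=etl.minPartitions)

def save_als_result(etl, als_df):
    # write a result DataFrame to one of the configured MySQL result tables
    als_df.write.jdbc(url=etl.jdbcURL, table=etl.alsTable,
                      mode='append', properties=etl.prop)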
    def _initialize_spark_contexts(gateway):
        java_spark_context = gateway.entry_point.getSparkContext()
        java_spark_conf = java_spark_context.getConf()

        spark_context = SparkContext(
            conf=SparkConf(_jvm=gateway.jvm, _jconf=java_spark_conf),
            gateway=gateway,
            jsc=java_spark_context)

        java_spark_sql_session = gateway.entry_point.getSparkSQLSession()
        spark_version = spark_context.version
        spark_sql_session = None
        if spark_version == "1.6.1":
            from pyspark import HiveContext
            java_sql_context = java_spark_sql_session.getSQLContext()
            spark_sql_session = HiveContext(spark_context, java_sql_context)
        elif spark_version in ["2.0.0", "2.0.1", "2.0.2"]:
            from pyspark.sql import SparkSession
            java_spark_session = java_spark_sql_session.getSparkSession()
            spark_sql_session = SparkSession(spark_context, java_spark_session)
        else:
            raise ValueError("Spark version {} is not supported".format(spark_version))

        return spark_context, spark_sql_session
def hive(string):
    try:
        global sc
        hive_context = HiveContext(sc)
        colunas, nome_tab, condicao, compl = string.split("-")

        if condicao != '0':
            condicao = condicao.replace('_', ' ')
            condicao = 'and ' + condicao
        else:
            condicao = ''

        if compl != '0':
            compl = compl.replace('_', ' ')
            return str(
                hive_context.sql("select " + colunas + " from " + nome_tab +
                                 " where 1=1 " + condicao + compl).collect())
        else:
            return str(
                hive_context.sql("select " + colunas + " from " + nome_tab +
                                 " where 1=1 " + condicao).collect())

    except Exception:
        pass
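A hypothetical call illustrating the 'columns-table-condition-extra' string protocol parsed above; underscores become spaces in the last two parts, and '0' disables a part:

rows = hive("cola,colb-my_db.my_table-cola_>_10-_order_by_cola")
# builds: select cola,colb from my_db.my_table where 1=1 and cola > 10 order by cola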
    def create_context(parameters=None):
        if parameters is None:
            parameters = OrderedDict()
            parameters['spark.app.name'] = 'weta_workflow'
            parameters['spark.master'] = 'local'  # 'yarn'
            parameters["spark.executor.instances"] = "8"
            parameters["spark.executor.cores"] = "8"
            parameters["spark.executor.memory"] = "2g"
            parameters["spark.driver.cores"] = "4"
            parameters["spark.driver.memory"] = "1g"
            parameters["spark.logConf"] = "false"
            parameters["spark.app.id"] = "dummy"
            # parameters['spark.debug.maxToStringFields'] = 100

        cls = SparkEnvironment
        if cls._sc:
            cls._sc.stop()

        for key, parameter in parameters.items():
            cls._conf.set(key, parameter)

        cls._sc = SparkContext(conf=cls._conf)
        cls._sqlContext = SQLContext(cls._sc)
        cls._hc = HiveContext(cls._sc)
Example #15
# coding:utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from a2_week_add_index import *
import pandas as pd
import datetime
import sys

conf = SparkConf()
sc = SparkContext(conf=conf)
hql = HiveContext(sc)


def get_save_table(flag):
    if int(flag) == 0:
        save_table = 'c3_top2_stock_feature_train'
    else:
        save_table = 'c3_top2_stock_feature_test'
    return save_table


def get_is_label(flag):
    if int(flag) == 0:
        is_label = ',label'
    else:
        is_label = ''
    return is_label


args = sys.argv[1:]
if len(args) == 0:
    print 'No flag argument supplied; defaulting to flag=0, i.e. training-set samples'
Example #16
    print(sql)
    hiveCtx.sql(sql).registerTempTable("temp_table")

    insert_sql = """
        insert overwrite table {table_name} partition(dt='{dt}')
        select  * from temp_table
        """.format(table_name=table_name, dt=dt_str)
    print("insert_sql:\n" + insert_sql)
    hiveCtx.sql(insert_sql)


if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp-tfidf")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')
    if len(sys.argv) == 1:
        dt = datetime.datetime.now() + datetime.timedelta(-1)
    else:
        dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date()

    dt_str = dt.strftime("%Y-%m-%d")
    yest_dt = dt + datetime.timedelta(-30)
    yest_str = yest_dt.strftime("%Y-%m-%d")

    hiveCtx.sql("use app")
    create_table(hiveCtx)
    getQuery(hiveCtx)
Example #17
    def __init__(self):
        conf = SparkConf().set('spark.sql.shuffle.partitions', '50').set(
            'spark.jars.packages',
            'ml.combust.mleap:mleap-spark-base_2.11:0.7.0,ml.combust.mleap:mleap-spark_2.11:0.7.0')
        sc = SparkContext(conf=conf)
        sc.setLogLevel('WARN')
        self.hc = HiveContext(sc)
Example #18
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

APP_NAME="read-json"

def main(sc,sqlC):
    df=sqlC.read.json("./data.json")
    df.show()
    df.printSchema()
    df.select("age").show()
    # DataFrame transformation (filter)
    df.filter(df["age"] > 20).show()

    # supported output formats include json, orc and parquet
    df.write.format("orc").saveAsTable("people", mode="overwrite")
    df.groupBy("age").count().show()

if __name__=="__main__":
    conf=SparkConf().setAppName(APP_NAME)
    conf=conf.setMaster("local[*]")
    sc=SparkContext(conf=conf)
    sqlC=HiveContext(sc)

    main(sc,sqlC)
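A short follow-up sketch, not part of the original, that reads back the Hive table the example saves:

def verify(sqlC):
    sqlC.table("people").printSchema()
    sqlC.sql("select age, count(*) as n from people group by age").show()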
Example #19
from __future__ import print_function

try:
    import findspark

    findspark.init()
    import pyspark

    sc = pyspark.SparkContext()
    sc.setLogLevel('WARN')
    print("spark context created")

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SQLContext, HiveContext

    sqlc = SQLContext(sc)
    sqlh = HiveContext(sc)
    sqlh.setConf("spark.sql.parquet.compression.codec", "gzip")

except:
    print("spark context exists")
Example #20
                                                maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)

    for each in labels_and_preds.take(100):
        print each

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print each

    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print mse
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print mse


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.post.index", conf=conf)
    sql_context = HiveContext(sc)
    main(sc, sql_context, is_hive=True)
    sc.stop()
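For comparison only, and as a swapped-in API rather than what the example uses, MLlib's RegressionMetrics computes the same mean squared error; this would sit inside the function above once labels_and_preds is built:

from pyspark.mllib.evaluation import RegressionMetrics

metrics = RegressionMetrics(
    labels_and_preds.map(lambda x: (float(x[1]), float(x[0]))))  # (prediction, label)
print(metrics.meanSquaredError)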
Example #21
def rdd_to_spark_df_or_srdd(rdd, **kwargs):
    return append(HiveContext(rdd.context), rdd, **kwargs)
Example #22
# Validate DateArg (expected format YYYY-MM) using a regex.
matchObj1 = re.match(r'[0-9]{4}[-][0][1-9]|[0-9]{4}[-][1][0-2]', DateArg)
if len(DateArg) == 7 and matchObj1:
    print("Hi, correct argument")
else:
    print('Invalid input, please try again in (YYYY-MM) format. \nTerminating Program..........')
    # Terminate the program when the DateArg value is invalid.
    sys.exit(0)

#Initializing SparkContext and HiveContext.
conf = SparkConf().setAppName('Mini Bridge Table').set("spark.executor.memory", "64g").set("spark.driver.memory", "32g")
sc = SparkContext(conf = conf)
hc = HiveContext(sc)



windowdf = hc.sql("""SELECT DISTINCT rowid_cdh_household,
                            rowid_cdh_party,
                            last_update_date
                     FROM (
                           SELECT   rowid_cdh_household,
                                    rowid_cdh_party,
                                    last_update_date,
                                    rank()
                           OVER (PARTITION BY rowid_cdh_party
                           ORDER BY last_update_date DESC) AS rank
                           FROM t_sda01.c_cdh_household_prty_rel
                           WHERE  last_update_date < '2012-10-09 10:10:01') AS tmp
Example #23
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func
from pyspark.sql.functions import col

# sc is assumed to exist already (e.g. created by the pyspark shell)
hive_context = HiveContext(sc)
procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")
procedure_typecast = procedure.withColumn(
    "score", procedure["score"].cast(DoubleType())).withColumn(
        "sample", procedure["sample"].cast(IntegerType())).withColumn(
            "denominator", procedure["denominator"].cast(IntegerType()))
procedure_hospital = procedure_typecast.join(
    hospitals, procedure_typecast.provider_id == hospitals.provider_id)
# subset for those procedures that have a score not higher than 100 and a sample of at least 50
score_avg = procedure_hospital.where((procedure_hospital['score'] <= 100) & (
    procedure_hospital['sample'] > 50)).groupby('state').agg(func.avg('score'))
# show the 10 best states
best_states = score_avg.sort(score_avg['avg(score)'].desc()).show(10)
MOUNT_NAME = "sparkfish"
dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)
display(dbutils.fs.ls("/mnt/sparkfish"))

# COMMAND ----------

# MAGIC %python
# MAGIC from pyspark.sql import functions as F
# MAGIC from pyspark.sql.functions import datediff, to_date, lit, unix_timestamp,split
# MAGIC from pyspark.sql.types import *
# MAGIC 
# MAGIC # Build DataFrame dataset to work with. 
# MAGIC formatPackage = "csv" if sc.version > '1.6' else "com.databricks.spark.csv"
# MAGIC df = sqlContext.read.format(formatPackage).options(header='true', delimiter = ',').load("dbfs:/mnt/sparkfish/titanic.csv")
# MAGIC data_df=df.withColumn("Age", df["Age"].cast(IntegerType()))
# MAGIC data_df.printSchema()
# MAGIC data_df.write.saveAsTable('sparkfishTable', format='parquet', mode='overwrite',path='dbfs:/mnt/sparkfish/sparkfishTable/')

# COMMAND ----------

from pyspark import SparkContext
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
display(hiveContext.sql("SELECT percentile(Age, 0.75) FROM sparkfishTable"))


# COMMAND ----------

from pyspark import SparkContext
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
display(hiveContext.sql("SELECT avg(Age) FROM sparkfishTable"))
Example #25
def analyze_column(sc, X):
    """analyze column by


    Note:

    Args(object):    Spark DataFrame created MatrixCreator

    Return:
        colInfo(dictionary):
        {
            'version': 'test_version'
            'preprocess':
                {
                    'all': [col1, col2, ... , colN],
                    'singleton': [col1, col2, ... , colN],
                    'string': [col1, col2, ... , colN],
                    'final': [col1, col2, ... , colN]
                }
        }

    """
    hc = HiveContext(sc)
    colInfo = {}
    preprocess = {}

    # 1. Create X matrix from matrixCreator
    # 2. Sample from this X matrix
    # 3. Transform the sampled X matrix to dictionaries
    # 4.
    print 'sampling from matrix'
    if X.is_cached:
        print 'the matrix is cached'
    else:
        print 'the matrix is somehow not cached, WTF!!!'

    sampleX = (X.sample(withReplacement=False, fraction=0.005,
                        seed=42).map(lambda x: x.items))

    df = pd.DataFrame(sampleX.collect())
    # df = df.toPandas()
    print 'sampling done'
    preprocess['all'] = list(df.columns)
    ori_num = len(df.columns)
    dfNu = df.apply(pd.to_numeric, errors='coerce')
    dfNuRM = dfNu.dropna(axis=1, how='all')
    nu_num = len(dfNuRM.columns)
    diff = list(set(df.columns) - set(dfNuRM.columns))

    preprocess['string'] = diff

    remove_count = 0
    colSet = set(dfNuRM.columns)
    for col in dfNuRM.columns:
        if len(dfNuRM[col].value_counts()) == 1:
            del dfNuRM[col]
            remove_count += 1
    print 'There are {} columns being removed'.format(remove_count)

    diff = list(colSet - set(dfNuRM.columns))

    preprocess['singleton'] = diff
    preprocess['final'] = list(dfNuRM.columns)

    colInfo['preprocess'] = preprocess

    try:
        listLen = 0
        listList = []
        for key in colInfo['preprocess'].keys():
            if key != 'all':
                listLen += len(colInfo['preprocess'][key])
                listList += list(colInfo['preprocess'][key])

        if ((listLen == len(colInfo['preprocess']['all'])) &
            (set(list(colInfo['preprocess']['all'])) == set(listList))):
            print 'the number is correct'
    except:
        print 'the number is incorrect'

    return colInfo
Example #26
from pyspark import SparkContext
from pyspark import HiveContext
sc = SparkContext()
hospitals = HiveContext(sc).sql('from hospitals select *')
hospitals.show()
#import pandas as pd
import commands
import ast
import itertools
import pyspark.sql.functions
from pyspark.sql.functions import col
from pyspark.sql.functions import current_date
from datetime import datetime, timedelta
from collections import Counter
import re
#import numpy as np

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
sc = SparkContext.getOrCreate()
sqlContext = HiveContext(sparkContext=sc)
sqlCtx = HiveContext(sparkContext=sc)


def freq(lst):
    d = {}
    for i in lst:
        if d.get(i):
            d[i] += 1
        else:
            d[i] = 1
    return d


def get_nested_keys(a):
    key_list = []
Example #28
def hiveContext(sparkContext):
    return HiveContext(sparkContext)
Example #29
#coding=UTF-8
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_SUN_CREDIT_EXT').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5 and sys.argv[5] == "hive":
    sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for the processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #30
def hive_context(spark_context):
    return HiveContext(spark_context)