Example #1
import csv


def load_csv(spark, table):
    """
    :param spark: spark session
    :param table: table object describing the file path, delimiter, and fields
    :return: data frame in spark
    """
    reader = csv.reader(open(table.path, "r"), delimiter=table.delimiter)
    # Map each configured field name to a converter: float for numeric fields,
    # None to keep the raw string.
    un_order_header = dict()
    for field_name in table.all_fields:
        if field_name not in table:
            un_order_header[field_name] = None
            continue
        field = table[field_name]
        if field.field_type == 'numeric':
            un_order_header[field_name] = float
        else:
            un_order_header[field_name] = None

    # Read only the first row as the header and look up each column's converter.
    header = []
    col_type = []
    for row in reader:
        for r in row:
            if r not in un_order_header:
                raise Exception("column %s not found in configuration" % r)
            header.append(r)
            col_type.append(un_order_header[r])
        break
    col_num = len(header)

    i = 1
    data = list()
    for row in reader:
        if len(row) != col_num:
            raise Exception(
                "data not consistent with header: line %d, expected %d columns, found %d"
                % (i, col_num, len(row)))
        line = list()
        for r, nm, tp in zip(row, header, col_type):
            if tp is None:
                line.append(r)
            else:
                try:
                    r = r.strip()
                    if r == '':
                        line.append(None)
                    else:
                        line.append(tp(r))
                except Exception as e:
                    raise Exception(
                        "line %d, column %s cannot be converted to a number: %s" %
                        (i, nm, r))
        data.append(tuple(line))
        i += 1
    rdd = spark.sparkContext.parallelize(data)
    data = spark.createDataFrame(rdd, header)
    print("%s loaded!" % table.name)
    print(data)
    data.show()
    return data
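A usage sketch for load_csv follows. The Table and Field classes, the path, and the column names are hypothetical stand-ins; they only model the attributes the function actually reads (path, delimiter, all_fields, name, membership checks, and each field's field_type) and do not come from the original.

# Hypothetical stand-ins for the table/field objects; only what load_csv reads
# is modeled here.
from pyspark.sql import SparkSession


class Field(object):
    def __init__(self, field_type):
        self.field_type = field_type


class Table(object):
    def __init__(self, name, path, delimiter, fields):
        self.name = name
        self.path = path
        self.delimiter = delimiter
        self.all_fields = list(fields)
        self._fields = fields

    def __contains__(self, field_name):
        return field_name in self._fields

    def __getitem__(self, field_name):
        return self._fields[field_name]


spark = SparkSession.builder.getOrCreate()
table = Table("users", "/tmp/users.csv", ",",
              {"id": Field("numeric"), "name": Field("string")})
users_df = load_csv(spark, table)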
Example #2
from numpy import array
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


def kmeans_training(master_url):
    sf = SparkConf()\
        .setMaster(master_url) \
        .setAppName("SparkSessionZipsExample") \
        .set("spark.executor.memory", "8g")

    sc = SparkContext(conf=sf)

    data = sc.textFile("hdfs://master32:9000/vectors/word_vector_sh.vec")

    def get_word_vec(line):
        # Parse a "word v1 v2 ... vN" line: skip the leading word token and
        # collect the float components; short lines fall back to a zero vector.
        x = []
        i = 0

        __ = line.split(" ")

        if (len(__) >= 100):
            for _ in __:
                if (i == 0):
                    i = 1
                    continue

                if (_ == ""):
                    continue

                x.append(float(_))
                i = i + 1
        else:
            for i in range(0, 100):
                x.append(float(0))
        return array(x)

    tmp = data.map(get_word_vec)
    # numpy arrays cannot be schema-inferred directly; convert each to a plain
    # list so every vector component becomes a column.
    df = SQLContext(sc).createDataFrame(tmp.map(lambda v: v.tolist()))
    df.show()

    return
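kmeans_training stops after building the vector DataFrame. Below is a minimal sketch of the clustering step its name suggests, using pyspark.ml.clustering.KMeans on placeholder vectors; the value of k, the sample data, and the column name are assumptions, not part of the original.

# Sketch of the KMeans step the function name implies; everything here is a
# placeholder, not taken from the original snippet.
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Stand-in for the 100-dimensional word vectors loaded above.
vectors = spark.sparkContext.parallelize([[0.1] * 100, [0.9] * 100])
features_df = spark.createDataFrame(
    vectors.map(lambda v: (Vectors.dense(v),)), ["features"])

# k is an arbitrary placeholder; pick it for the real vocabulary.
model = KMeans(k=2, seed=1, featuresCol="features").fit(features_df)
print(model.clusterCenters())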
Example #3
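A minimal setup sketch for this example, assuming only what the cells below use: a SparkSession, pyspark.sql.functions imported as F, and a DataFrame df with a date column. The sample rows are placeholders, not the original data.

# Minimal setup assumed by the cells below; the sample rows are placeholders.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("2021-01-01",), ("2021-06-15",)],
    ["date"],
).withColumn("date", F.to_date("date"))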


# In[15]:


df.printSchema()


# In[16]:


df.show()


# In[17]:


df.withColumn("date_sub_10",F.date_sub("date",10)).show()


# In[18]:


df.withColumn("date_add_10",F.date_add("date",20)).show()


# In[ ]: