def convert(value):
    """Convert a JavaObject map or a Python dict into a {key: float} dict.

    Raises:
        TypeError: if value is None or of an unsupported type.
    """
    if value is None:
        raise TypeError("None is not allowed.")
    if isinstance(value, JavaObject):
        keys = value.keySet().toArray()
        return {key: TypeConverters.toFloat(value[key]) for key in keys}
    if isinstance(value, dict):
        return {key: TypeConverters.toFloat(item) for key, item in value.items()}
    raise TypeError("Invalid type.")
def test_list(self):
    """TypeConverters.toList must accept every common list-like and return a plain list."""
    expected = [0, 1]
    list_likes = [
        expected,
        np.array(expected),
        DenseVector(expected),
        SparseVector(len(expected), range(len(expected)), expected),
        array.array('l', expected),
        xrange(2),
        tuple(expected),
    ]
    for candidate in list_likes:
        result = TypeConverters.toList(candidate)
        self.assertEqual(type(result), list)
        self.assertListEqual(result, expected)
def test_list(self):
    """TypeConverters.toList must accept every common list-like and return a plain list."""
    baseline = [0, 1]
    for source in (baseline,
                   np.array(baseline),
                   DenseVector(baseline),
                   SparseVector(len(baseline), range(len(baseline)), baseline),
                   pyarray.array('l', baseline),
                   xrange(2),
                   tuple(baseline)):
        outcome = TypeConverters.toList(source)
        self.assertEqual(type(outcome), list)
        self.assertListEqual(outcome, baseline)
def convert(value):
    """Convert a list-like of values into a list of dense vectors; None passes through."""
    if value is None:
        return None
    converted = []
    # Each element goes through the dense-vector converter factory, one call per element.
    for element in TypeConverters.toList(value):
        converted.append(H2OTypeConverters.toDenseVector()(element))
    return converted
def convert(value):
    """Convert a list-like of values into a list of string pairs.

    Raises:
        TypeError: if value is None.
    """
    if value is None:
        raise TypeError("None is not allowed.")
    converted = []
    # Each element goes through the pair-string converter factory, one call per element.
    for element in TypeConverters.toList(value):
        converted.append(H2OTypeConverters.toPairString()(element))
    return converted
def toStringOrTFTensor(value):
    """Pass tf.Tensor values through unchanged; coerce anything else to str.

    Raises:
        TypeError: if value is neither a tf.Tensor nor convertible to str.
    """
    if isinstance(value, tf.Tensor):
        return value
    try:
        return TypeConverters.toString(value)
    except TypeError as err:
        # Chain explicitly so the original conversion failure stays visible
        # in the traceback instead of the implicit "during handling" notice.
        raise TypeError(
            "Could not convert %s to tensorflow.Tensor or str" % type(value)
        ) from err
def convert(value):
    """Convert a JavaObject or any list-like into a list of strings.

    Raises:
        TypeError: if value is None.
    """
    if value is None:
        raise TypeError("None is not allowed.")
    # JVM-side collections need materializing into a Python list first.
    candidate = list(value) if isinstance(value, JavaObject) else value
    return TypeConverters.toListString(candidate)
def toStringOrTFTensor(value):
    """Return tf.Tensor inputs unchanged, otherwise convert the input to str.

    Raises:
        TypeError: if value is neither a tf.Tensor nor convertible to str.
    """
    if isinstance(value, tf.Tensor):
        return value
    try:
        return TypeConverters.toString(value)
    except TypeError as err:
        # Explicit chaining keeps the underlying converter error in the traceback.
        raise TypeError(
            "Could not convert %s to tensorflow.Tensor or str" % type(value)
        ) from err
def convert(value):
    """Convert value to a boolean.

    Raises:
        TypeError: if value is None.
    """
    if value is None:
        raise TypeError("None is not allowed.")
    return TypeConverters.toBoolean(value)
def convert(value):
    """Convert value to a boolean; None passes through unchanged."""
    return None if value is None else TypeConverters.toBoolean(value)
def convert(value):
    """Convert value to a float; None passes through unchanged."""
    return None if value is None else TypeConverters.toFloat(value)
def convert(value):
    """Convert value to a string; None passes through unchanged."""
    return None if value is None else TypeConverters.toString(value)
def convert(value):
    """Validate value against the Scala enum via the JVM-side helper."""
    # Resolve the Scala companion object ("MODULE$") for H2OAlgoParamsHelper.
    helper = getattr(_jvm().ai.h2o.sparkling.ml.params, "H2OAlgoParamsHelper$")
    companion = helper.__getattr__("MODULE$")
    return companion.getValidatedEnumValue(enumClass, TypeConverters.toString(value))
def train():
    """Bucket sentence vectors LSH-style and dump each bucket to a CSV file.

    Samples one vector v from the data, assigns every row the hash code
    floor(dot(row_vector, v) / r), then writes one CSV per hash bucket plus a
    meta.txt listing v and all bucket names.

    NOTE(review): reconstructed from a whitespace-mangled source — the exact
    nesting of the trailing `with open(...)` block (inside vs. after the loop)
    could not be confirmed; placed after the loop as the single-write of
    meta.txt suggests. Confirm against the original file.
    """
    sparkUrl = 'spark://ubuntu02:7077'
    file_path = 'hdfs://ubuntu02:9000/vectors/sentences_vector.csv'
    hdfs_url = 'http://ubuntu02:50070'  # NOTE(review): unused in this function
    user = '******'  # NOTE(review): unused in this function
    # Bucketing parameter (the smaller it is, the more buckets)
    r = 0.002
    # Collection of all sentence vectors
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_sentence_data_frame(sc, file_path)
    # Randomly sample one vector v
    v = df.sample(False, 0.1, seed=0).rdd.first()['_vector']
    # Compute a hash code for every row: floor(dot(u, v) / r)
    tmp = df.rdd.flatMap(
        lambda x: {
            Row(x['id'], x['sentence'], x['vector'],
                TypeConverters.toInt(np.floor(x['_vector'].dot(v) / r)))
        })
    # Rename the columns
    df = SQLContext(sc).createDataFrame(tmp) \
        .selectExpr("_1 as id", "_2 as sentence", "_3 as vector", "_4 as hash_code")
    # Persist the dataframe to speed things up
    df.persist()
    # Show the size of each bucket
    summary = df.groupby("hash_code").count()
    summary.persist()
    # Collect the names of all buckets
    names = summary.rdd.map(lambda x: x.hash_code).collect()
    # Iterate over every bucket
    for name in names:
        print('save to ' + str(name))
        tmp = df.filter(df['hash_code'] == name)
        # Drop the hash_code column to save space
        tmp = tmp.drop('hash_code')
        # Write out (this operation is extremely slow)
        tmp.toPandas().to_csv('/home/hadoop/new/' + str(name) + '.csv',
                              sep=',', index=False, encoding='utf-8')
    with open('/home/hadoop/new/meta.txt', 'w') as f:
        f.write('vector(v):\n')
        for e in v:
            f.write(str(e) + ',')
        f.write('\nnames:\n')
        for name in names:
            f.write(str(name) + ',')
    print('all done!!')
    return
def _tensor_name(tensor):
    """Return the name of a tf.Tensor, or the input itself coerced to a string."""
    # A tf.Tensor's .name is a plain string, so a single conversion suffices.
    target = tensor.name if isinstance(tensor, tf.Tensor) else tensor
    return TypeConverters.toString(target)