def prepare_data(self):
    if not self.is_prepared:
        self.histogram_list = Utils.get_distribution(self.value_list)
        self.numeric_list = Utils.clean_examples_numeric(self.value_list)
        if self.is_numeric():
            # Sample roughly 100 values; cap the fraction at 1.0, since Spark
            # rejects fractions > 1 when sampling without replacement.
            fraction = min(1.0, 100.0 / len(self.numeric_list))
            self.sample_list = sc.parallelize(self.numeric_list).sample(
                False, fraction).collect()
        else:
            # Concatenate all values into one space-separated text blob.
            self.value_text = sc.parallelize(self.value_list).map(
                lambda x: " %s " % x).reduce(lambda x, y: x + y)
        self.is_prepared = True
def search_types_data(self, index_config, semantic_types):
    result = self.es.search(index=Utils.get_index_name(index_config),
                            doc_type=','.join(semantic_types),
                            body={"query": {"match_all": {}}})
    # Merge every _source field of every hit into one list per
    # (semantic type, field) pair, then regroup into {type: {field: [values]}}.
    # Note: SparkContext is not usable inside a transformation, so the inner
    # sc.parallelize is replaced by a plain list comprehension; the final
    # groupByKey keeps all fields of a type instead of letting collectAsMap
    # retain only the last one.
    return sc.parallelize(result['hits']['hits']).flatMap(
        lambda hit: [((hit['_type'], field),
                      value if isinstance(value, list) else [value])
                     for field, value in hit['_source'].items()]
    ).reduceByKey(lambda x, y: x + y).map(
        lambda x: (x[0][0], (x[0][1], x[1]))
    ).groupByKey().mapValues(dict).collectAsMap()
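# Shape of the map search_types_data returns, with hypothetical semantic
# types and field names (the real keys come from the Elasticsearch index):
#   {'City':  {'values': ['NYC', 'LA'], 'char_length': [3, 2]},
#    'State': {'values': ['CA', 'NY'], 'char_length': [2, 2]}}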
from collections import defaultdict


def get_test_results(train_examples_map, textual_train_map, test_examples_map,
                     is_labeled=False):
    def zip_with_key(key, item_map):
        # Expand one training record into one row per applicable feature test.
        result_list = []
        for data_type, values in item_map.items():
            if data_type not in data_tests_map:
                continue
            for test_name in data_tests_map[data_type]:
                if test_name in feature_list:
                    result_list.append({'name': key,
                                        'data_type': data_type,
                                        'test_name': test_name,
                                        'values': values,
                                        'num': item_map['is_numeric']})
        return result_list

    feature_vectors = defaultdict(lambda: defaultdict(lambda: 0))

    if is_column_based:
        # Score every (semantic type, test) pair against the test column and
        # keep the best score per pair.
        train_data_rdd = sc.parallelize(train_examples_map).map(
            lambda hit: hit['_source']).flatMap(
            lambda hit: zip_with_key("%s" % hit['semantic_type'], hit))
        test_results = train_data_rdd.map(
            lambda row: ((row['name'], row['test_name']), row)
        ).mapValues(
            lambda row: round(feature_tests_map[row['test_name']](
                row['values'], test_examples_map[row['data_type']],
                row['num'], test_examples_map['is_numeric']), 2)
        ).reduceByKey(max).collect()
    else:
        # Pool the training examples of each semantic type into one merged
        # record, then score the merged record against the test column.
        train_data_rdd = sc.parallelize(train_examples_map).map(
            lambda hit: (hit['_source']['semantic_type'], hit['_source'])
        ).flatMap(
            lambda row: [((row[0], x), row[1][x]) for x in
                         ['char_length', 'histogram', 'numeric', 'values']]
        ).reduceByKey(
            lambda val1, val2: val1 + val2 if isinstance(val1, list)
            else val1 + " " + val2)
        test_results = train_data_rdd.flatMap(
            lambda row: [((row[0][0], x),
                          round(feature_tests_map[x](
                              row[1], test_examples_map[row[0][1]]), 2))
                         for x in data_tests_map[row[0][1]]]).collect()

    for result in sorted(test_results):
        feature_vectors[result[0][0]][result[0][1]] = result[1]

    # Fold in the TF-IDF scores from the textual Elasticsearch query,
    # keeping the best score per semantic type.
    for hit in textual_train_map['hits']['hits']:
        source = hit['_source']
        name = "%s" % source['semantic_type']
        score = balance_result(source["is_numeric"],
                               test_examples_map["is_numeric"], False,
                               hit['_score'])
        if feature_vectors[name][TF_IDF_TEST] < score:
            feature_vectors[name][TF_IDF_TEST] = score

    if is_tree_based:
        # Discretize each score into five one-hot 0.2-wide bins; list() guards
        # against mutating the dicts while iterating over them.
        for name in list(feature_vectors.keys()):
            for test in list(feature_vectors[name].keys()):
                for j in range(5):
                    feature_vectors[name][test + str(j)] = (
                        j * 0.2 <= feature_vectors[name][test] < (j + 1) * 0.2)
                del feature_vectors[name][test]

    for name in list(feature_vectors.keys()):
        if TF_IDF_TEST not in feature_vectors[name]:
            if is_tree_based:
                feature_vectors[name][TF_IDF_TEST + "0"] = 1
                for i in range(1, 5):
                    feature_vectors[name][TF_IDF_TEST + str(i)] = 0
            else:
                feature_vectors[name][TF_IDF_TEST] = 0
        feature_vectors[name][IS_NUMERIC] = test_examples_map['is_numeric']
        feature_vectors[name]['name'] = name.encode("utf-8")
        feature_vectors[name]['column_name'] = (
            test_examples_map['name'] + "!" +
            test_examples_map['semantic_type'])
        if is_labeled and name.split("!")[0] == test_examples_map['semantic_type']:
            feature_vectors[name]['label'] = 1
        else:
            feature_vectors[name]['label'] = 0
    return feature_vectors.values()
def get_distribution(data):
    # Frequency-rank histogram scaled to percentages: rank distinct values by
    # count (descending) and emit each rank in proportion to its share of the
    # data, so columns of different sizes become comparable.
    return sc.parallelize(data).map(lambda word: (word, 1)).reduceByKey(
        lambda x, y: x + y).sortBy(lambda x: -x[1]).zipWithIndex().flatMap(
        lambda x: [x[1]] * int(x[0][1] * 100.0 / len(data))).collect()
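# A minimal sketch of the expected output, on hypothetical input: "a" covers
# 75% of the values and ranks 0, "b" covers 25% and ranks 1, so:
#   get_distribution(["a", "a", "a", "b"])
#   -> [0] * 75 + [1] * 25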
def search_all_types(self, index_config):
    # elasticsearch-py spells the keyword doc_type, and hits carry their
    # fields under '_source', as in the other queries in this module.
    result = self.es.search(index=Utils.get_index_name(index_config),
                            doc_type='semantic',
                            body={"query": {"match_all": {}}})
    return sc.parallelize(result['hits']['hits']).map(
        lambda hit: hit['_source']['semantic_type']).collect()
def clean_examples_numeric(examples):
    # Keep only parseable numbers. Map non-numbers to None (not "") and test
    # "is not None", so legitimate zero values survive the filter.
    return sc.parallelize(examples).map(
        lambda x: float(x) if Utils.is_number(x) else None).filter(
        lambda x: x is not None).collect()
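# Usage sketch on hypothetical input; note that 0.0 is kept, which the
# original truthiness filter would have dropped:
#   clean_examples_numeric(["3.5", "abc", "0", "7"]) -> [3.5, 0.0, 7.0]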
def predict(self, test_data):
    # MLlib models accept an RDD of feature vectors and return an RDD of
    # predicted labels.
    return self.model.predict(sc.parallelize(test_data))
from pyspark.mllib.tree import RandomForest


def train(self):
    # trainClassifier requires categoricalFeaturesInfo; an empty dict declares
    # every feature continuous.
    train_data = sc.parallelize(self.generate_train_data(1000))
    self.model = RandomForest.trainClassifier(train_data, numClasses=2,
                                              categoricalFeaturesInfo={},
                                              numTrees=100,
                                              featureSubsetStrategy='auto',
                                              impurity='gini', maxDepth=5,
                                              maxBins=32)
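# A minimal sketch of the training-data format this assumes:
# RandomForest.trainClassifier expects an RDD of LabeledPoint rows, so
# generate_train_data is presumed to yield records like these (hypothetical
# feature values):
#   from pyspark.mllib.regression import LabeledPoint
#   examples = [LabeledPoint(1.0, [0.9, 0.1, 0.8]),
#               LabeledPoint(0.0, [0.2, 0.7, 0.1])]
#   model = RandomForest.trainClassifier(sc.parallelize(examples),
#                                        numClasses=2,
#                                        categoricalFeaturesInfo={},
#                                        numTrees=100)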