Example #1
def prepare_data(self):
    if not self.is_prepared:
        # Build a frequency histogram and a numeric-only view of the values.
        self.histogram_list = Utils.get_distribution(self.value_list)
        self.numeric_list = Utils.clean_examples_numeric(self.value_list)
        if self.is_numeric():
            # Sample roughly 100 numeric values; cap the fraction at 1.0,
            # since Spark rejects sampling fractions above 1.0 when sampling
            # without replacement.
            fraction = min(1.0, 100.0 / len(self.numeric_list))
            self.sample_list = sc.parallelize(self.numeric_list).sample(False, fraction).collect()
        else:
            # Concatenate all values into one whitespace-separated text blob.
            self.value_text = sc.parallelize(self.value_list).map(lambda x: " %s " % x).reduce(lambda x, y: x + y)
        self.is_prepared = True
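The sampling step above is worth a closer look: RDD.sample takes a fraction, not a count, so the result size is only approximately 100. A minimal sketch of that step in isolation, assuming a live SparkContext bound to sc as in all of these examples:

values = [float(i) for i in range(1000)]
fraction = min(1.0, 100.0 / len(values))  # 0.1 here; the cap guards lists shorter than 100
sample = sc.parallelize(values).sample(False, fraction).collect()
print(len(sample))  # roughly 100, not exactly: Bernoulli sampling is probabilistic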
Example #2
    def search_types_data(self, index_config, semantic_types):
        result = self.es.search(index=Utils.get_index_name(index_config), doc_type=','.join(semantic_types),
                                body={"query": {"match_all": {}}})

        # Flatten every hit into ((type, field), [values]) pairs, merge the
        # value lists per (type, field), then fold everything into
        # {type: {field: values}}. The inner loop is a plain comprehension:
        # sc.parallelize cannot be called inside another RDD transformation.
        return (sc.parallelize(result['hits']['hits'])
                .map(lambda hit: (hit['_type'], list(hit['_source'].items())))
                .flatMap(lambda x: [((x[0], field), value if isinstance(value, list) else [value])
                                    for field, value in x[1]])
                .reduceByKey(lambda a, b: a + b)
                .map(lambda x: (x[0][0], {x[0][1]: x[1]}))
                .reduceByKey(lambda d1, d2: dict(d1, **d2))
                .collectAsMap())
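A usage sketch under stated assumptions: indexer stands in for an instance of the class above, index_config is whatever Utils.get_index_name expects, and the semantic types "city" and "state" are made up for illustration:

types_data = indexer.search_types_data(index_config, ["city", "state"])
# Expected shape: {semantic_type: {field: merged list of values}}
for semantic_type, fields in types_data.items():
    print(semantic_type, sorted(fields))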
Example #3
from collections import defaultdict


def get_test_results(train_examples_map,
                     textual_train_map,
                     test_examples_map,
                     is_labeled=False):
    def zip_with_key(key, item_map):
        # Expand one training example into a row per applicable feature test.
        result_list = []
        for data_type, values in item_map.items():
            if data_type not in data_tests_map:
                continue
            for test_name in data_tests_map[data_type]:
                if test_name in feature_list:
                    result_list.append({
                        'name': key,
                        'data_type': data_type,
                        'test_name': test_name,
                        'values': values,
                        'num': item_map['is_numeric']
                    })
        return result_list

    feature_vectors = defaultdict(lambda: defaultdict(lambda: 0))

    if is_column_based:
        # Score each (training column, feature test) pair and keep the best
        # score per pair.
        train_data_rdd = (sc.parallelize(train_examples_map)
                          .map(lambda hit: hit['_source'])
                          .flatMap(lambda hit: zip_with_key("%s" % hit['semantic_type'], hit)))

        test_results = (train_data_rdd
                        .map(lambda row: ((row['name'], row['test_name']), row))
                        .mapValues(lambda row: round(feature_tests_map[row['test_name']](
                            row['values'], test_examples_map[row['data_type']],
                            row['num'], test_examples_map['is_numeric']), 2))
                        .reduceByKey(max)
                        .collect())
    else:
        # Merge all training columns of the same semantic type into one
        # aggregate per property, then score each aggregate against the
        # test column.
        train_data_rdd = (sc.parallelize(train_examples_map)
                          .map(lambda hit: (hit['_source']['semantic_type'], hit['_source']))
                          .flatMap(lambda row: [((row[0], x), row[1][x])
                                                for x in ['char_length', 'histogram',
                                                          'numeric', 'values']])
                          .reduceByKey(lambda val1, val2: val1 + val2
                                       if isinstance(val1, list)
                                       else val1 + " " + val2))

        test_results = train_data_rdd.flatMap(
            lambda row: [((row[0][0], x),
                          round(feature_tests_map[x](row[1], test_examples_map[row[0][1]]), 2))
                         for x in data_tests_map[row[0][1]]]).collect()

    for result in sorted(test_results):
        feature_vectors[result[0][0]][result[0][1]] = result[1]

    for hit in textual_train_map['hits']['hits']:
        source = hit['_source']
        name = "%s" % source['semantic_type']
        # Re-weight the raw TF-IDF score by numeric/textual compatibility and
        # keep the best score per semantic type.
        score = balance_result(source["is_numeric"],
                               test_examples_map["is_numeric"], False, hit['_score'])
        if feature_vectors[name][TF_IDF_TEST] < score:
            feature_vectors[name][TF_IDF_TEST] = score

    if is_tree_based:
        # Discretize every score into five one-hot buckets of width 0.2,
        # iterating over key snapshots since the dicts are mutated in place.
        for name in list(feature_vectors.keys()):
            for test in list(feature_vectors[name].keys()):
                value = feature_vectors[name][test]
                for j in range(5):
                    # Half-open buckets, except the top one is closed so a
                    # perfect 1.0 score still lands somewhere.
                    feature_vectors[name][test + str(j)] = (
                        j * 0.2 <= value < (j + 1) * 0.2 or (j == 4 and value == 1.0))
                del feature_vectors[name][test]

    for name in feature_vectors.keys():
        if is_tree_based:
            # If TF-IDF was never scored (and hence never binned above),
            # default it to the lowest bucket.
            if TF_IDF_TEST + "0" not in feature_vectors[name]:
                feature_vectors[name][TF_IDF_TEST + "0"] = 1
                for i in range(1, 5):
                    feature_vectors[name][TF_IDF_TEST + str(i)] = 0
        elif TF_IDF_TEST not in feature_vectors[name]:
            feature_vectors[name][TF_IDF_TEST] = 0
        feature_vectors[name][IS_NUMERIC] = test_examples_map['is_numeric']
        feature_vectors[name]['name'] = name.encode("utf-8")
        feature_vectors[name]['column_name'] = (test_examples_map['name'] + "!" +
                                                test_examples_map['semantic_type'])
        if is_labeled and name.split("!")[0] == test_examples_map['semantic_type']:
            feature_vectors[name]['label'] = 1
        else:
            feature_vectors[name]['label'] = 0
    return feature_vectors.values()
def get_distribution(data):
    # Count each distinct value, sort by descending frequency, then emit each
    # rank index in proportion to that value's share of the data (in percent).
    return sc.parallelize(data).map(lambda word: (word, 1)).reduceByKey(
        lambda x, y: x + y).sortBy(lambda x: -x[1]).zipWithIndex().flatMap(
            lambda x: [x[1]] * int(x[0][1] * 100.0 / len(data))).collect()
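A small worked example of the distribution helper; the numbers follow directly from the code above (assuming a live SparkContext bound to sc):

# data has 6 values: "a" x3, "b" x2, "c" x1.
# Sorted counts -> [("a", 3), ("b", 2), ("c", 1)], ranks 0, 1, 2.
# Each rank repeats int(count * 100 / 6) times: 50, 33 and 16 times.
distribution = get_distribution(["a", "a", "a", "b", "b", "c"])
assert distribution.count(0) == 50
assert distribution.count(1) == 33
assert distribution.count(2) == 16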
Example #7
    def search_all_types(self, index_config):
        result = self.es.search(index=Utils.get_index_name(index_config), doc_type='semantic',
                                body={"query": {"match_all": {}}})

        # The payload of each hit lives under '_source' in the ES response.
        return sc.parallelize(result['hits']['hits']).map(
            lambda hit: hit['_source']['semantic_type']).collect()
Example #8
def get_distribution(data):
    # Sort (value, count) pairs by descending count, then repeat each rank
    # index in proportion to that value's share of the data (in percent).
    return sc.parallelize(data).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).sortBy(
        lambda x: -x[1]).zipWithIndex().flatMap(
            lambda x: [x[1]] * int(x[0][1] * 100.0 / len(data))).collect()
Example #9
def clean_examples_numeric(examples):
    # Parse numeric-looking values to floats; compare against None explicitly
    # so legitimate 0.0 values survive the filter.
    return sc.parallelize(examples).map(lambda x: float(x) if Utils.is_number(x) else None).filter(
        lambda x: x is not None).collect()
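A quick usage sketch, assuming Utils.is_number returns True exactly for strings parseable as numbers (the helper itself is not shown in this listing):

cleaned = clean_examples_numeric(["3.5", "abc", "0", "42"])
assert cleaned == [3.5, 0.0, 42.0]  # "abc" is dropped; zero survives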
Example #10
def predict(self, test_data):
    # Distribute the feature vectors; MLlib returns an RDD of predictions.
    return self.model.predict(sc.parallelize(test_data))
Example #11
def train(self):
    # Train an MLlib random forest; categoricalFeaturesInfo={} declares all
    # features continuous (the argument is required by trainClassifier).
    train_data = sc.parallelize(self.generate_train_data(1000))
    self.model = RandomForest.trainClassifier(train_data, numClasses=2, categoricalFeaturesInfo={},
                                              numTrees=100, featureSubsetStrategy='auto',
                                              impurity='gini', maxDepth=5, maxBins=32)
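An end-to-end sketch wiring this together with the predict method from Example #10. Classifier is a hypothetical stand-in for the class that defines both methods, and generate_train_data is assumed to yield MLlib LabeledPoint instances with two-element feature vectors:

classifier = Classifier()   # hypothetical; holds the train()/predict() above
classifier.train()          # fits the forest on 1000 generated examples
predictions = classifier.predict([[0.1, 0.9], [0.8, 0.2]])  # RDD of 0/1 labels
print(predictions.collect())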