Example #1
def dsto_norm_labeled_points(
    input_dataset,
    feature_keys,
    preprocess_script="item['CloseDate'] = (datetime.datetime.strptime(item['CloseDate'], '%Y-%m-%d') - datetime.datetime(1970,1,1)).total_seconds()"
):
    import datetime  # needed by the default preprocess_script
    from pyspark.mllib.classification import LabeledPoint
    from cognub.propmixapi.normalizers import NPNormalizer
    datamap = {'targets': [], 'features': []}
    for item in input_dataset:
        features = []
        # Run the caller-supplied preprocessing snippet against this record.
        exec(preprocess_script)
        for column in feature_keys:
            try:
                if item[column] is not None and str(item[column]).replace(
                        '.', '', 1).strip('-').isdigit():
                    features.append(float(item[column]))
                else:
                    raise InvalidFeatureValue()
            except Exception:
                # An invalid value disqualifies the whole record; move on.
                break
        else:
            # Reached only when every feature parsed cleanly (loop not broken).
            datamap['targets'].append(
                float(item['ClosePrice']) / float(item['LivingArea']))
            datamap['features'].append(features)

    datamap['features'] = NPNormalizer().fit(datamap['features']).transform(
        datamap['features'])
    dataset = []
    for index, target in enumerate(datamap['targets']):
        dataset.append(LabeledPoint(target, datamap['features'][index]))
    return dataset
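For context, a minimal usage sketch; the record layout, the feature keys, and the presence of a SparkContext sc are assumptions, not part of the source:

# Hypothetical records; the default preprocess_script converts 'CloseDate' to epoch seconds.
records = [
    {'CloseDate': '2015-06-01', 'Bedrooms': '3', 'Bathrooms': '2',
     'ClosePrice': '250000', 'LivingArea': '1800'},
]
points = dsto_norm_labeled_points(records, ['CloseDate', 'Bedrooms', 'Bathrooms'])
training_rdd = sc.parallelize(points)  # ready for an MLlib regression trainer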
Example #2
    def main(self):
        """
        Execute the process of loading, training and testing features to class them
        Input: Nothing
        Output: Result file named result_classifier.json
        """

        print(
            "#1: Reading the bucket passed as argument and extracting classes and feature values"
        )
        self.load_classes_and_values_from_features()

        print("#2: Processing 1 vs All classification")
        for class1 in self.classes:
            print("#3: Dataframes construction for classifier %s vs All" %
                  (class1))

            print("#4: Loading features values into main dataframe")
            self.load_features_in_dataframe_1_vs_All(class1)

            print("#8: Spliting dataframe into train and test samples")
            self.train_features_df, self.test_features_df = self.features_df.randomSplit(
                [0.5, 0.5])

            print("#8.1: %i training data" % (self.train_features_df.count()))
            print("#8.2: %i testing data" % (self.test_features_df.count()))

            print("#9: Convert strings label into float with an estimator")
            self.convert_labels_to_float()

            print("#10: Convert dataframe into labelpoints RDD")
            self.train_features_labelpoints = self.train_features_df.rdd.map(
                lambda row: LabeledPoint(row.label_index, row.features))
            self.test_features_labelpoints = self.test_features_df.rdd.map(
                lambda row: LabeledPoint(row.label_index, row.features))

            print("#11: Training classifier")
            self.training(class1, "All")

        print(
            "#Final results: writing the best_models dictionary to best_classifiers.json"
        )
        with open("./best_classifiers.json", "w") as out:
            json.dump(self.best_models, out)
Example #3
    def build_regressors(self, split_dataset, split_kmeans_dataset,
                         feature_keys):
        self.logger.info('building regressors')
        mce_tuples = []
        for dataset, kmeans_dataset in zip(split_dataset,
                                           split_kmeans_dataset):
            kmeans_train_set = []
            for item in kmeans_dataset:
                features = [item[column] for column in feature_keys]
                kmeans_train_set.append(array(features))
            self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))
            del kmeans_dataset
            kmeans_train_set = sc.parallelize(kmeans_train_set)
            clusters = KMeans.train(kmeans_train_set,
                                    100,
                                    maxIterations=200,
                                    runs=10,  # 'runs' was removed in Spark 2.0
                                    initializationMode="random")
            del kmeans_train_set
            data = []
            for item in dataset:
                features = []
                for column in feature_keys:
                    features.append(item[column])
                data.append(LabeledPoint(item[self.target_key], features))
            del dataset
            data = sc.parallelize(data)

            def preprocess(observation):
                # Scale the target down so labels are expressed in units of 10k.
                observation.label = float(observation.label / 10000)
                return observation

            data = data.map(preprocess)
            (trainingData, testData) = data.randomSplit([0.7, 0.3])
            # del data
            model = RandomForest.trainRegressor(
                trainingData,
                categoricalFeaturesInfo={},
                numTrees=self.rfr_config['num_trees'],
                featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # e.g. "all"
                impurity='variance',
                maxDepth=self.rfr_config['max_depth'])
            predictions = model.predict(testData.map(lambda x: x.features))
            labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
                predictions)
            # -1 is a sentinel for "MSE could not be computed" (e.g. empty test split).
            testMSE = -1
            try:
                testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
                    float(testData.count())
            except Exception:
                pass
            mce_tuples.append((model, clusters, testMSE))
        self.logger.info('regressors build finished')
        return mce_tuples
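Note that lambda (v, p): ... relies on Python 2 tuple-parameter unpacking, which was removed in Python 3 (PEP 3113); an equivalent form that runs on both would be:

testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() \
    / float(testData.count())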
Example #4
def create_labeled_point(labels_and_features, wanted_category):
    """
    Parses the line using the parser function lambda, and creates a LabeledPoing with
    the 'wanted' category as label
    :param line: the line to parse
    :param parser_function: the lambda function that creates the tuples
    :param line: the string line to parse
    """
    labels = labels_and_features[0]
    features = labels_and_features[1]

    return LabeledPoint(labels[wanted_category], features)
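A minimal usage sketch (the shape of the pair below is an assumption):

pair = ([0.0, 1.0, 2.0], [3.5, 4.1, 0.7])   # (labels, features), hypothetical values
lp = create_labeled_point(pair, wanted_category=1)
print(lp.label)  # 1.0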
Example #5
def preprocess(sc, data, labels=None):
    data = classifier.tolibsvm(data)
    points = []
    for i, wordarr in enumerate(data):
        # Default to label 0 when no labels are supplied (e.g. at prediction time).
        label = labels[i] if labels else 0
        points.append(LabeledPoint(label, wordarr))

    rdd = sc.parallelize(points)
    return rdd
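Usage might look like the following; classifier.tolibsvm is the project's own helper, and the input format shown is an assumption:

# Hypothetical raw documents and matching labels.
rdd = preprocess(sc, ['some text', 'another document'], labels=[1, 0])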
Example #6
def tuple_to_labeled_point(entry, category, l_encoder=None):
    """
    Creates a LabeledPoint from an entry formatted as a tuple
    :param entry: a tuple of format (3, 2, 1, [3,4,4 ..]), where the first
            entries in the tuple are labels, and the last entry is
            a list of features
    :param category: which one of the labels in the tuple to keep for the
            labeled point (0 to 2 for imr dataset)
    :param l_encoder: the label encoder to encode the label (if any)

    :return: a LabeledPoint
    """

    from pyspark.mllib.classification import LabeledPoint
    label = entry[category]
    if l_encoder:
        label = l_encoder.transform(label)
    features = entry[-1]
    return LabeledPoint(label, features)  # return a new LabeledPoint
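A usage sketch based on the tuple format described in the docstring:

entry = (3, 2, 1, [3, 4, 4, 7])   # three labels, then the feature list
lp = tuple_to_labeled_point(entry, category=1)
print(lp.label)  # 2.0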
Example #7
def dsto_labeled_points(dataset, feature_keys):
    from pyspark.mllib.classification import LabeledPoint
    data = []
    for item in dataset:
        features = []
        for column in feature_keys:
            try:
                if item[column] is not None and str(item[column]).replace(
                        '.', '', 1).replace('-', '').isdigit():
                    features.append(float(item[column]))
                else:
                    raise InvalidFeatureValue()
            except Exception:
                # An invalid value disqualifies the whole record; skip it.
                break
        else:
            # Reached only when every feature parsed cleanly (loop not broken).
            data.append(
                LabeledPoint(
                    float(item['ClosePrice']) / float(item['LivingArea']),
                    features))
    return data
Example #8
    def build_classifier(self, dataset, kmeans_dataset, feature_keys):
        self.logger.info('building classifier')
        kmeans_train_set = []
        for item in kmeans_dataset:
            features = [item[column] for column in feature_keys]
            kmeans_train_set.append(array(features))
        self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))
        kmeans_train_set = sc.parallelize(kmeans_train_set)
        clusters = KMeans.train(kmeans_train_set,
                                100,
                                maxIterations=500,
                                runs=10,  # 'runs' was removed in Spark 2.0
                                initializationMode="random")
        del kmeans_dataset
        del kmeans_train_set
        data = []
        for item in dataset:
            features = [item[column] for column in feature_keys]
            data.append(LabeledPoint(int(item['classifier_label']), features))
        del dataset
        data = sc.parallelize(data)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        del data
        model = RandomForest.trainClassifier(
            trainingData,
            numClasses=self.total_splits,
            categoricalFeaturesInfo={},
            numTrees=self.rfc_config['num_trees'],
            featureSubsetStrategy=self.rfc_config['feature_subset_strategy'],  # e.g. "all"
            impurity='gini',
            maxDepth=self.rfc_config['max_depth'],
            maxBins=32)
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
            predictions)
        # Fraction of misclassified test points.
        testErr = labelsAndPredictions.filter(
            lambda (v, p): v != p).count() / float(testData.count())
        self.logger.info('classifier build finished')
        return model, clusters, testErr
Example #9
def main():
	# parameters
	features_dir = sys.argv[1]
	global train_features_lp
	global test_features_lp
	global best_models
	global features_value_list
	features_value_list = []
	best_models = {}
	classes = []
	for feature_file in os.listdir(features_dir):
		new_class = re.sub(r'[0-9]', '', feature_file)
		new_class = new_class[:-9].strip('_')
		classes.append(new_class)
	classes = sorted(list(set(classes)))
	classes_dup = list(classes)  # independent copy of the class list
	# [FEATURES EXTRACTION]
	# subprocess.call(["python", "features_extract.py"])

	# [LOADING FEATURE VALUES] loading feature values into a dictionary
	print(">>>>> Loading features values into list of rows..")
	features_value_list = load_features_value(features_dir)

	# [CLASSIFIER SELECTION] Selecting classifiers (1vs1, 1vsAll)
	# 1vs1 classifiers
	for class1 in classes:
		class2_set = [x for x in classes_dup]
		del class2_set[0:(classes.index(class1)+1)]
		print("classes")
		print(classes)
		print("class2_set")
		print(class2_set)
		for class2 in class2_set:
			print(">>>>> Building dataframes for classifier %s vs. %s.." % (class1,class2))
			# [LOADING FEATURES] loading features values into dataframe
			print("_____ Loading features values into main dataframe")
			features_df = load_features_df_1vs1(features_value_list)
			print("_____ Filtering data within dataframe")
			features_classifier_df = features_df\
									.filter((features_df.label == class1)\
									| (features_df.label ==  class2))
			# [SPLIT DATA] Split data into train & test
			print("_____ Spliting data into training & test data..")
			train_features_df, test_features_df = features_classifier_df.randomSplit([0.8, 0.20])
			train_count = train_features_df.count()
			test_count = test_features_df.count()
			print("%i training data" % (train_count))
			print("%i testing data" % (test_count))
			# [CONVERT LABELS] Convert string labels into floats with an estimator
			print("_____ Converting string labels into floats with an estimator..")
			train_features_df, test_features_df = convert_labels(train_features_df,test_features_df)
			# [CONVERT INTO LABELEDPOINTS]
			print(">>>>> Converting dataframes into LabeledPoint RDDs..")
			train_features_lp = train_features_df.rdd.map(lambda row: LabeledPoint(row.label_index, row.features))
			test_features_lp = test_features_df.rdd.map(lambda row: LabeledPoint(row.label_index, row.features))
			# [BUILD MODEL] Learn classifier on training data
			print(">>>>> Training classifier..")
			training(class1,class2)
	# 1vsAll classifiers
	print("1vsALL---------------------------------------------------------------------------------------")
	print("classes")
	print(classes)
	print("class2_set")
	for class1 in classes:
		print(">>>>> Building dataframes for classifier %s vs. All.." % (class1))
		# [LOADING FEATURES] loading features values into dataframe
		print("_____ Loading features values into main dataframe")
		features_df = load_features_df_1vsAll(features_value_list,class1)
		# [SPLIT DATA] Split data into train & test
		print("_____ Spliting data into training & test data..")
		train_features_df, test_features_df = features_df.randomSplit([0.8, 0.20])
		train_count = train_features_df.count()
		test_count = test_features_df.count()
		print("%i training data" % (train_count))
		print("%i testing data" % (test_count))
		# [CONVERT LABELS] Convert string labels into floats with an estimator
		print("_____ Converting string labels into floats with an estimator..")
		train_features_df, test_features_df = convert_labels(train_features_df,test_features_df)
		# [CONVERT INTO LABELEDPOINTS]
		print(">>>>> Converting dataframes into LabeledPoint RDDs..")
		train_features_lp = train_features_df.rdd.map(lambda row: LabeledPoint(row.label_index, row.features))
		test_features_lp = test_features_df.rdd.map(lambda row: LabeledPoint(row.label_index, row.features))
		# [BUILD MODEL] Learn classifier on training data
		print(">>>>> Training classifier..")
		training(class1,"All")

	# [OUTPUT]
	# For each classifier, send model parameters in best_classifiers.json
	print(">>>>> Sending best model information to \"best_classifiers.json\"..")
	with open("./output/best_classifiers.json", "w") as out:
		json.dump(best_models, out)

	# keep the script alive so it can be inspected in the Spark Web UI (http://localhost:4040)
	raw_input("press ctrl+c to exit")
Example #10
def parsePoint(line):
    # The first value on the line is the label; the rest are features.
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])
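Typically applied to a space-separated text file whose first column is the label (the path below is hypothetical):

data = sc.textFile('data/labeled_points.txt')  # hypothetical path
parsed = data.map(parsePoint)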
Example #11
def clean_feature(rec):
    # The last column holds the label; everything before it is a feature.
    label = int(rec[-1])
    features = [float(x) for x in rec[:-1]]
    return LabeledPoint(label, features)
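With the label in the last column, a CSV-style RDD could be converted like this (the path is hypothetical):

rows = sc.textFile('data/dataset.csv').map(lambda line: line.split(','))  # hypothetical path
points = rows.map(clean_feature)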