def get_table(self, iterator, target_name=None):
    """Convert an iterable of row mappings into feature and target arrays.

    For every row, each name returned by ``self.get_feature_names()`` is
    looked up in the row and cast with ``TypeConverter.cast`` using the
    type recorded in ``self.metadata[name]['type']``.

    Args:
        iterator: Iterable of rows indexable by feature name.
        target_name: Name of the column routed to the target array instead
            of the feature row. With the default ``None`` no name is
            expected to match, so the target array stays empty.

    Returns:
        A ``(X, y)`` tuple of numpy arrays: ``X`` built from one list of
        cast feature values per row, ``y`` from the cast target values.
    """
    columns = self.get_feature_names()
    features = []
    targets = []
    caster = TypeConverter()
    for record in iterator:
        feature_row = []
        for column in columns:
            value = caster.cast(self.metadata[column]['type'], record[column])
            # The target column is kept out of the feature row.
            if column != target_name:
                feature_row.append(value)
            else:
                targets.append(value)
        features.append(feature_row)
    return np.array(features), np.array(targets)
def get_dataset(self, iterator, target_name=None):
    """Vectorize an iterable of row mappings with ``DictVectorizer``.

    For every row, each name returned by ``self.get_feature_names()`` is
    cast with ``TypeConverter.cast`` using ``self.metadata[name]['type']``.
    The non-target values are collected into one dict per row and the
    dicts are fed to ``DictVectorizer.fit_transform``.

    Args:
        iterator: Iterable of rows indexable by feature name.
        target_name: Name of the column collected into ``y`` instead of
            the feature dicts. With the default ``None`` no name is
            expected to match, so ``y`` stays empty.

    Returns:
        A ``(y, data, feature_names)`` tuple: ``y`` is a plain list of
        cast target values, ``data`` is the dense array produced by
        ``toarray()`` on the vectorizer output, and ``feature_names`` is
        the list of names reported by the vectorizer.
    """
    feature_names = self.get_feature_names()
    X = []
    y = []
    converter = TypeConverter()
    for row_value in iterator:
        row = {}
        for name in feature_names:
            value = converter.cast(self.metadata[name]['type'], row_value[name])
            if name == target_name:
                y.append(value)
            else:
                row[name] = value
        X.append(row)

    vec = DictVectorizer()
    dataset = vec.fit_transform(X)
    # get_feature_names() was removed from scikit-learn in 1.2; prefer its
    # replacement when present and keep returning a plain list as before.
    if hasattr(vec, 'get_feature_names_out'):
        vectorized_names = list(vec.get_feature_names_out())
    else:
        vectorized_names = vec.get_feature_names()
    return y, dataset.toarray(), vectorized_names
def combine(self, key, values):
    """Build a summary ("resume") of the values collected for ``key``.

    Every value is first passed through ``TypeConverter.type``; all
    statistics are computed on the converted values.

    Args:
        key: Identifier returned unchanged alongside the summary.
        values: Iterable of raw values for ``key``.

    Returns:
        A ``(key, resume)`` tuple where ``resume`` is a dict with:

        * ``'num-values'``: number of values, empty ones included.
        * ``'frequency'``: result of ``self._freq_count`` with the ``''``
          entry removed (presumably a value -> count mapping).
        * ``'N/A'``: whatever was stored under ``''`` in the frequency
          mapping, or 0 when there was no such entry.
        * ``'type'``: result of ``TypeConverter.get_type``.
        * ``'min'``, ``'max'``, ``'sum'``: only when the type is ``'int'``
          or ``'float'``, computed over the values different from ``''``.
    """
    resume = {}
    tc = TypeConverter()

    # Materialise as a list: the values are measured with len() and walked
    # several times, which a Python 3 ``map`` iterator does not allow.
    to_typed = tc.type
    values = [to_typed(v) for v in values]

    resume['num-values'] = len(values)
    resume['frequency'] = frequency = self._freq_count(values)

    # The empty string marks a missing value: report it separately and
    # keep it out of the frequency table.
    try:
        resume['N/A'] = frequency['']
    except KeyError:
        resume['N/A'] = 0
    else:
        del frequency['']

    resume['type'] = tc.get_type(values)
    if resume['type'] in ('int', 'float'):
        # A list (not a one-shot ``filter`` iterator) so that min, max and
        # sum each see every defined value.
        defined_values = [v for v in values if v != '']
        resume['min'] = min(defined_values)
        resume['max'] = max(defined_values)
        resume['sum'] = sum(defined_values)
    return (key, resume)