def feature_select(self):
    """Reduce the feature matrix to the top percentile of features.

    Scores features with the ANOVA F-test (``f_classif``), keeps the top
    ``self.task.percentile`` percent via ``SelectPercentile``, and stores
    the reduced matrix as a TransformResult under the task uuid.
    """
    # BUG FIX: the original read ``task.percentile`` (an unbound name);
    # every other method here takes its configuration from ``self.task``.
    selector = SelectPercentile(f_classif, percentile=self.task.percentile)
    y = np.array(self.results[self.task.label].data)
    X = np.array(self.results[self.task.features].data)
    data = pd.DataFrame(selector.fit_transform(X, y))
    result = TransformResult(self.task, 1.0, data)
    self.results[self.task.uuid] = result
def intersect(self):
    """Row-wise intersection of all the task's source results.

    Keeps only the rows of the first source whose values also appear in
    every other source, then records the outcome as a TransformResult
    under the task uuid.
    """
    source_results = [self.results[uuid] for uuid in self.task.sources]
    intersected = source_results[0].data
    for other in source_results[1:]:
        membership = intersected.isin(other.data)
        intersected = intersected[membership].dropna(how='all')
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, intersected)
def select(self):
    """Filter the source frame with the task predicate.

    The predicate's boolean keywords are textually rewritten into the
    operator form before being handed to ``DataFrame.query``.
    """
    query_expr = self.task.predicate
    # NOTE(review): plain substring substitution — a literal " and ",
    # " or " or " not " inside a quoted string in the predicate would
    # also be rewritten.
    for keyword, operator in ((' and ', ' & '), (' or ', ' | '), (' not ', ' ~')):
        query_expr = query_expr.replace(keyword, operator)
    source_result = self.results[self.task.source]
    filtered = source_result.data.query(query_expr)
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, filtered)
def difference(self):
    """Keep the rows of the left source that do not appear in the right.

    Stores ``left - right`` (value-wise, via ``isin``) as a
    TransformResult under the task uuid.
    """
    left_frame = self.results[self.task.left].data
    right_frame = self.results[self.task.right].data
    remaining = left_frame[~left_frame.isin(right_frame)]
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, remaining)
def merge(self):
    """Outer-join the predictions (``y_pred``) of every source on index.

    Each source's predictions are wrapped in a DataFrame and merged
    pairwise on the index; the joined frame is stored as a
    TransformResult under the task uuid.
    """
    source_uuids = self.task.sources
    merged = pd.DataFrame(self.get_result(source_uuids[0]).y_pred)
    for uuid in source_uuids[1:]:
        frame = pd.DataFrame(self.get_result(uuid).y_pred)
        merged = merged.merge(frame, left_index=True, right_index=True, how='outer')
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, merged)
def load(self):
    """Read a CSV into a DataFrame, register its schema, store the result.

    When no schema file is given, the schema is inferred as
    ``(column, dtype-string)`` pairs from the frame itself; otherwise it
    is loaded from the JSON schema file.  The schema is catalogued under
    the task name and the frame stored as a TransformResult.
    """
    frame = pd.read_csv(self.task.data_file)
    if self.task.schema_file is None:
        # Infer the schema straight from pandas' dtype detection.
        inferred = frame.dtypes
        schema = [(column, str(inferred[column])) for column in frame.columns]
    else:
        with open(self.task.schema_file) as handle:
            schema = json.load(handle)
    self.catalog[self.task.name] = {'uuid': self.task.uuid, 'schema': schema}
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, frame)
def load(self):
    """Chunked CSV load: register the schema, then stream the data in.

    Reads only the header first (``nrows=0``) to get the columns,
    records the schema in the catalog under the data file's path, then
    reads the body in 1000-row chunks and stores the full frame as a
    TransformResult under the task uuid.
    """
    schema_file = self.task.schema_file
    data_file = self.task.data_file
    header = pd.read_csv(data_file, nrows=0)
    if schema_file is None:
        # NOTE(review): dtypes inferred from a 0-row read are all
        # ``object`` — presumably good enough for the catalog; verify.
        dtypes = header.dtypes
        schema = [(attr, str(dtypes[attr])) for attr in header.columns]
    else:
        with open(schema_file) as f:
            schema = json.load(f)
    self.catalog[data_file] = {'uuid': self.task.uuid, 'schema': schema}
    # BUG FIX: the original discarded DataFrame.append's return value
    # (append does not mutate in place), so ``data`` stayed a zero-row
    # frame.  Collect the chunks and concatenate them instead — append
    # is also removed in pandas >= 2.0.
    chunks = list(pd.read_csv(data_file, chunksize=1000))
    data = pd.concat(chunks, ignore_index=True) if chunks else header
    result = TransformResult(self.task, 1.0, data)  # TODO: actual size
    self.results[self.task.uuid] = result
def frequent_itemsets(self):
    # Apriori-style frequent-itemset mining over the source frame: a
    # column "occurs" in a row when its value is non-zero.  Results are
    # stored as an (items, support) DataFrame under the task uuid.
    source = self.get_result(self.task.source)
    data = source.data
    # Total row count as a float so the support ratio below is fractional
    # (this file uses Python-2 print statements, hence the explicit cast).
    size = float(len(data.index))
    print 'size', size
    itemsets = []
    C = set(data.columns)  # candidate items still worth combining
    k = 1                  # current itemset size
    while len(C) > k:
        C_next = set()
        for c in combinations(C, k):
            # Fraction of rows in which every column of candidate ``c``
            # is non-zero: row-wise product of the boolean frame, summed.
            support = pd.DataFrame(
                data[list(c)] != 0).product(1).sum(0) / size
            if support > self.task.support:
                itemsets.append((str(c), support))
                # Only items that appear in some frequent k-itemset are
                # kept as candidates for round k+1 (loose Apriori-like
                # pruning over individual items, not whole itemsets).
                for x in c:
                    C_next.add(x)
        C = C_next
        k += 1
    itemsets = pd.DataFrame(itemsets, columns=['items', 'support'])
    print 'len:', len(itemsets.index)
    print 'final:', itemsets
    result = TransformResult(self.task, 1.0, itemsets)
    self.results[self.task.uuid] = result
def union(self):
    """Concatenate every source frame and drop duplicate rows.

    The de-duplicated union is stored as a TransformResult under the
    task uuid.
    """
    frames = [self.results[uuid].data for uuid in self.task.sources]
    combined = pd.concat(frames).drop_duplicates()
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, combined)
def project(self):
    """Restrict the source frame to the task's attribute list.

    Waits for the source task, selects ``self.task.attributes`` from its
    frame, and stores the projection as a TransformResult.
    """
    # Block until the source task has produced its result.
    self.wait(self.task.source)
    source_result = self.results[self.task.source]
    projected = pd.DataFrame(source_result.data[self.task.attributes])
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, projected)
def correlate(self):
    """Store the pairwise column-correlation matrix of the source frame.

    The matrix produced by ``DataFrame.corr()`` is recorded as a
    TransformResult under the task uuid.
    """
    source_frame = self.results[self.task.source].data
    correlation = source_frame.corr()
    self.results[self.task.uuid] = TransformResult(self.task, 1.0, correlation)