예제 #1
0
 def feature_select(self):
     """Select the best-scoring feature columns and store the reduced data.

     Reads the label source (``self.task.label``) and the feature source
     (``self.task.features``) from ``self.results``, converts both to NumPy
     arrays, and runs ``SelectPercentile`` with the ``f_classif`` scoring
     function and ``self.task.percentile`` as the percentile setting
     (presumably scikit-learn's selector -- confirm against the file's
     imports). The transformed array is wrapped in a new DataFrame, so the
     original column labels are not carried over.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     # The percentile is read from this instance's task; the bare name
     # `task` is not defined inside this method.
     selector = SelectPercentile(f_classif, percentile=self.task.percentile)
     y = np.array(self.results[self.task.label].data)
     X = np.array(self.results[self.task.features].data)
     data = pd.DataFrame(selector.fit_transform(X, y))
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, data)
예제 #2
0
 def intersect(self):
     """Intersect the data of all task sources and store the outcome.

     Starts from the data of the first entry in ``self.task.sources`` and,
     for every further source, keeps only the cells that ``isin`` reports
     as present in that source's data; rows left entirely empty (all NaN)
     are dropped after each step.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     inputs = [self.results[key] for key in self.task.sources]
     first, rest = inputs[0], inputs[1:]
     data = first.data
     for other in rest:
         mask = data.isin(other.data)
         # Non-matching cells become NaN; discard rows with nothing left.
         data = data[mask].dropna(how='all')
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, data)
예제 #3
0
 def select(self):
     """Filter the source data with the task predicate and store the rows.

     The space-delimited words `` and ``, `` or `` and `` not `` in
     ``self.task.predicate`` are rewritten to ``&``, ``|`` and ``~``
     before the expression is handed to the source data's ``query``
     method. Only occurrences surrounded by spaces are rewritten.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     expr = self.task.predicate
     # Applied in sequence: each substitution sees the previous one's output.
     rewrites = ((' and ', ' & '), (' or ', ' | '), (' not ', ' ~'))
     for word, symbol in rewrites:
         expr = expr.replace(word, symbol)
     frame = self.results[self.task.source].data
     selected = frame.query(expr)
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, selected)
예제 #4
0
 def difference(self):
     """Mask out of the left source everything found in the right source.

     Looks up the ``self.task.left`` and ``self.task.right`` results and
     indexes the left data with the negated ``isin`` mask against the
     right data. No rows are dropped afterwards; the masked frame is
     stored as-is.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     ldata = self.results[self.task.left].data
     rdata = self.results[self.task.right].data
     found_in_right = ldata.isin(rdata)
     remaining = ldata[~found_in_right]
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, remaining)
예제 #5
0
 def merge(self):
     """Outer-join the ``y_pred`` of every task source on the index.

     Each source is fetched with ``self.get_result`` and its ``y_pred``
     attribute is wrapped in a DataFrame. The frames are folded together
     left to right with an index-on-index outer merge.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     keys = self.task.sources
     merged = pd.DataFrame(self.get_result(keys[0]).y_pred)
     for key in keys[1:]:
         frame = pd.DataFrame(self.get_result(key).y_pred)
         merged = pd.merge(
             merged, frame, how='outer', left_index=True, right_index=True)
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, merged)
예제 #6
0
    def load(self):
        """Read the task's CSV file, register its schema and store the data.

        The whole file at ``self.task.data_file`` is read with
        ``pd.read_csv``. The schema is loaded as JSON from
        ``self.task.schema_file`` when one is given; otherwise it is a list
        of ``(column, dtype-name)`` pairs taken from the loaded frame.

        The schema is registered in ``self.catalog`` under the task name,
        and the data is stored in ``self.results`` under ``self.task.uuid``.
        """
        task = self.task
        name = task.name
        schema_path = task.schema_file
        data = pd.read_csv(task.data_file)

        if schema_path is not None:
            with open(task.schema_file) as handle:
                schema = json.load(handle)
        else:
            # No schema supplied: describe the columns pandas inferred.
            column_types = data.dtypes
            schema = [(col, str(column_types[col])) for col in data.columns]

        self.catalog[name] = {'uuid': task.uuid, 'schema': schema}
        self.results[task.uuid] = TransformResult(task, 1.0, data)
예제 #7
0
    def load(self):
        """Read the task's CSV file in chunks, publishing the data as it grows.

        A header-only read (``nrows=0``) provides the column names. The
        schema is loaded as JSON from ``self.task.schema_file`` when one is
        given; otherwise it is a list of ``(column, dtype-name)`` pairs taken
        from that header-only frame.
        NOTE(review): dtypes of a zero-row frame are probably not the real
        column types -- confirm this is what consumers of the catalog expect.

        The schema is registered in ``self.catalog`` under the data file
        path. The file is then read 1000 rows at a time; after each chunk
        the accumulated frame is stored in ``self.results`` under
        ``self.task.uuid``, replacing the previous partial result.
        """
        schema_file = self.task.schema_file
        data_file = self.task.data_file
        data = pd.read_csv(data_file, nrows=0)

        if schema_file is None:
            dtypes = data.dtypes
            schema = [(attr, str(dtypes[attr])) for attr in data.columns]
        else:
            with open(schema_file) as f:
                schema = json.load(f)
        self.catalog[data_file] = {'uuid': self.task.uuid, 'schema': schema}

        published = False
        for chunk in pd.read_csv(data_file, chunksize=1000):
            # Accumulation must rebind `data`: appending/concatenating
            # returns a new frame and leaves the original untouched. The
            # first chunk replaces the empty header frame outright so its
            # placeholder dtypes never take part in a concat.
            if published:
                data = pd.concat([data, chunk], ignore_index=True)
            else:
                data = chunk
            self.results[self.task.uuid] = TransformResult(
                self.task, 1.0, data)
            published = True

        if not published:
            # No chunks were yielded (e.g. header-only file): still publish
            # the empty frame so the task has a result entry.
            self.results[self.task.uuid] = TransformResult(
                self.task, 1.0, data)
예제 #8
0
 def frequent_itemsets(self):
     source = self.get_result(self.task.source)
     data = source.data
     size = float(len(data.index))
     print 'size', size
     itemsets = []
     C = set(data.columns)
     k = 1
     while len(C) > k:
         C_next = set()
         for c in combinations(C, k):
             support = pd.DataFrame(
                 data[list(c)] != 0).product(1).sum(0) / size
             if support > self.task.support:
                 itemsets.append((str(c), support))
                 for x in c:
                     C_next.add(x)
         C = C_next
         k += 1
     itemsets = pd.DataFrame(itemsets, columns=['items', 'support'])
     print 'len:', len(itemsets.index)
     print 'final:', itemsets
     result = TransformResult(self.task, 1.0, itemsets)
     self.results[self.task.uuid] = result
예제 #9
0
 def union(self):
     """Concatenate the data of all task sources and drop duplicate rows.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     frames = [self.results[key].data for key in self.task.sources]
     combined = pd.concat(frames)
     unique_rows = combined.drop_duplicates()
     self.results[self.task.uuid] = TransformResult(
         self.task, 1.0, unique_rows)
예제 #10
0
 def project(self):
     """Keep only the task's attributes from the source data.

     Calls ``self.wait`` on the source key first (presumably to block
     until that result exists -- confirm against ``wait``), then indexes
     the source data with ``self.task.attributes`` and wraps the selection
     in a DataFrame.

     The outcome is stored in ``self.results`` under ``self.task.uuid``.
     """
     self.wait(self.task.source)
     frame = self.results[self.task.source].data
     projected = pd.DataFrame(frame[self.task.attributes])
     self.results[self.task.uuid] = TransformResult(
         self.task, 1.0, projected)
예제 #11
0
 def correlate(self):
     """Store the result of ``.corr()`` on the source data.

     For a pandas DataFrame this is the pairwise column correlation
     matrix. The outcome is stored in ``self.results`` under
     ``self.task.uuid``.
     """
     frame = self.results[self.task.source].data
     self.results[self.task.uuid] = TransformResult(
         self.task, 1.0, frame.corr())