Example #1
    def visit(self, element):
        try:
            featureset_df = element.get_dataframe()

            # collect columns to restore initial column order in the end
            columns = list(featureset_df)

            # expand features, aggregate features, group features
            index_df = featureset_df[self._column] if isinstance(self._column, list) else featureset_df[[self._column]]
            aggregated_groups = []
            for column in featureset_df:
                if featureset_df[column].dtype == "object":
                    if isinstance(featureset_df[column].iloc[0], (list, tuple, np.ndarray)):
                        group_df = self.expand(featureset_df, column, True)
                        group_idx = pd.concat([index_df, group_df], axis=1)
                        group_idx = group_idx.groupby(self._column).aggregate(self.select_numeric_op())
                        group_idx[column] = list(group_idx.values)
                        aggregated_groups.append(group_idx[[column]])

            featureset_df = featureset_df.groupby(self._column).aggregate(self.select_numeric_op())
            aggregated_groups.append(featureset_df)
            featureset_df = pd.concat(aggregated_groups, axis=1)
            featureset_df = featureset_df.reset_index()
            featureset_df = featureset_df.reindex(columns, axis=1)

            element.set_dataframe(featureset_df)

        except Exception as error:
            Util.print_error("Unable to condense Dataframe: " + str(error))
            Util.print_detailed_error()
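
The expand step above flattens an object column holding per-row arrays into one numeric column per array slot so that groupby aggregation can work on it. A minimal sketch of the idea (the project's expand helper is not shown here, so the flattening is done inline with plain pandas):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 1, 2], "vec": [[1, 2], [3, 4], [5, 6]]})
    # one numeric column per array slot, aligned with the original index
    expanded = pd.DataFrame(df["vec"].tolist(), index=df.index,
                            columns=["vec_0", "vec_1"])
    grouped = pd.concat([df[["id"]], expanded], axis=1).groupby("id").mean()
    print(grouped)  # per-slot means for each id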
Example #2
    def execute(self, element):
        try:
            data_type = element.get_dataframe()[self._column].dtype
            if data_type == "object":
                data = element.get_dataframe()[self._column]
                # probe at most the first 1000 rows
                for x in range(min(len(data), 1000)):

                    if isinstance(data.iloc[x], (list, tuple, np.ndarray)):
                        if len(data.iloc[x]) == 0:
                            continue
                        elif isinstance(data.iloc[x][0], str):
                            return "stringarray"
                        elif isinstance(data.iloc[x][0], int):
                            return "intarray"
                        elif isinstance(data.iloc[x][0], float):
                            return "floatarray"
                        else:
                            return "featurevector_" + str(
                                data.iloc[x][0].dtype)
                    elif isinstance(data.iloc[x], str):
                        return "string"
            elif data_type == "string":
                # pandas' nullable string extension dtype; a dtype object is
                # never an instance of str, so the original isinstance check
                # could not match
                return "string"
            else:
                return str(data_type)
        except Exception as error:
            util.print_error("Unable to print data type: " + str(error))
            util.print_detailed_error()
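
For reference, a toy run of the dtype probe above, with the element wrapper omitted and the frame built directly (a sketch, not project code):

    import pandas as pd

    df = pd.DataFrame({
        "words": [["a", "b"], ["c"]],  # object dtype, lists of str -> "stringarray"
        "label": ["x", "y"],           # object dtype, plain str    -> "string"
        "num": [1, 2],                 # int64                      -> "int64"
    })
    for col in df:
        print(col, df[col].dtype)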
Example #3
    def visit(self, element):
        try:
            featureset_df = element.get_dataframe()

            # check if column is single-column or column group
            feature_vector = False
            target_column = featureset_df[self._column]
            if target_column.dtype == "object":
                if isinstance(target_column.iloc[0],
                              (list, tuple, np.ndarray)):
                    feature_vector = True
                    target_column = self.expand(featureset_df, self._column,
                                                True)

            # TODO no option for feature vector with string features yet (compare numbers)
            if self._type == "string":
                # assign back: fillna(inplace=True) on a selected column can
                # operate on a copy and silently leave the frame unchanged
                featureset_df[self._column] = target_column.fillna(self._value)

            if self._type == "number":
                if feature_vector:
                    for column in target_column:
                        self.fill_number_feature_cells(target_column[column])
                    featureset_df[self._column] = list(
                        target_column[list(target_column)].values)
                else:
                    self.fill_number_feature_cells(target_column)

            element.set_dataframe(featureset_df)
        except Exception as error:
            Util.print_error("Unable to Group Features: " + str(error))
            Util.print_detailed_error()
Example #4
 def visit(self, model):
     try:
         result = model.get_metric()
         result['AccScore'] = accuracy_score(self._true, self._predict)
         model.set_metric(result)
     except Exception as error:
         Util.print_error("Unable to set estimator of Model")
         Util.print_error(error)
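
accuracy_score divides the number of matching labels by the total number of labels; a quick standalone check:

    from sklearn.metrics import accuracy_score

    print(accuracy_score([0, 1, 1, 0], [0, 1, 0, 0]))  # 3 of 4 labels match -> 0.75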
Example #5
 def visit(self, model):
     try:
         estimator = model.get_estimator()
         prediction = estimator.predict(self._predict)
         return prediction
     except Exception as error:
         Util.print_error("Unable to predict")
         Util.print_error(error)
Example #6
 def visit(self, model):
     try:
         estimator = model.get_estimator()
         estimator.fit(self._X, self._Y)
         model.set_estimator(estimator)
     except Exception as error:
         Util.print_error("Unable to fit estimator")
         Util.print_error(error)
Example #7
File: join.py Project: nsiegmun/shaperML
 def visit(self, featureset):
     try:
         featureset.set_dataframe_column(
             self._column,
             np.char.join(self._value,
                          featureset.get_column_values(self._column)))
     except Exception as error:
         util.print_error("Unable to add value to array")
         util.print_error(error)
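
Note that np.char.join applies str.join element-wise, i.e. it interleaves the separator between the characters of each string in the array:

    import numpy as np

    print(np.char.join("-", np.array(["abc", "de"])))  # ['a-b-c' 'd-e']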
Example #8
    def visit(self, model):
        try:
            return cross_validate(self._estimator,
                                  model.get_x_train(),
                                  model.get_y_train(),
                                  cv=self._k_fold)

        except Exception as error:
            Util.print_error("Unable to calculate cross validation")
            Util.print_error(error)
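
cross_validate returns a dict of per-fold arrays (fit_time, score_time, test_score, ...); a minimal standalone run using scikit-learn built-ins in place of the model wrapper:

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    X, y = load_iris(return_X_y=True)
    scores = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5)
    print(scores["test_score"])  # one accuracy value per fold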
Example #9
 def visit(self, model):
     try:
         result = model.get_metric()
         result['PrecScore'] = precision_score(self._true,
                                               self._predict,
                                               average=self._average)
         model.set_metric(result)
     except Exception as error:
         Util.print_error("Unable to set estimator of Model")
         Util.print_error(error)
Example #10
 def visit(self, model):
     try:
         if self._return_value:
             return self.get_estimator(self._estimator)
         else:
             model.set_estimator(self.get_estimator(self._estimator))
             model.set_estimator_type(self._learning_type)
     except Exception as error:
         Util.print_error("Unable to set estimator of Model")
         Util.print_error(error)
Example #11
 def visit(self, featureset):
     try:
         data = featureset.get_featureset()
         if self._method == "median":
             data = data.fillna(data.median())
         else:
             data = data.interpolate(method=self._method)
         featureset.set_featureset(data)
     except Exception as error:
         Util.print_error("Unable to mask featureset: " + str(error))
         Util.print_detailed_error()
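
The two branches behave quite differently: fillna(median) imputes one constant per column, while interpolate estimates each gap from its neighbours. On a toy series:

    import pandas as pd

    s = pd.Series([1.0, None, 5.0, 5.0])
    print(s.fillna(s.median()))            # NaN -> 5.0 (column median)
    print(s.interpolate(method="linear"))  # NaN -> 3.0 (midpoint of neighbours)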
Example #12
 def visit(self, element):
     try:
         data = element.get_dataframe()
         if self._mode == "shuffle":
             data = data.sample(frac=1)
         elif self._mode == "column":
             data = data.sort_values(by=self._column)
         elif self._mode == "index":
             data = data.sort_index()
         element.set_dataframe(data)
     except Exception as error:
         Util.print_error("Unable to sort featureset: " + str(error))
         Util.print_detailed_error()
Example #13
 def execute(self, element):
     try:
         processor = LabelBinarizer(self._neg_label, self._pos_label,
                                    self._sparse_output)
         dataframe = element.get_dataframe()
         data = processor.fit_transform(dataframe.values)
         classes = processor.classes_
         element.set_classes(classes)
         # DataFrame.values is read-only, so assigning to it raises an
         # AttributeError; build a new frame with one indicator column per
         # class instead (assumes sparse_output=False and more than two classes)
         element.set_dataframe(
             pd.DataFrame(data, index=dataframe.index, columns=classes))
     except Exception as error:
         Util.print_error("Unable to label binarize Dataframe: " +
                          str(error))
         Util.print_detailed_error()
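
LabelBinarizer maps n classes to n indicator columns (binary input collapses to a single column), which is why the transformed data no longer fits the original frame and a new one has to be built:

    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    onehot = lb.fit_transform(["cat", "dog", "bird", "cat"])
    print(lb.classes_)   # ['bird' 'cat' 'dog']
    print(onehot.shape)  # (4, 3): one indicator column per class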
Example #14
 def visit(self, featureset):
     try:
         _result = []
         for text in featureset.get_column_values(self._column):
             if isinstance(text, list):
                 _preprocessed = []
                 for word in text:
                     _preprocessed.append(word.upper())
                 _result.append(_preprocessed)
             else:
                 _result.append(text.upper())
         featureset.set_dataframe_column(self._column, _result)
     except Exception as error:
         util.print_error("Unable to add value to array")
         util.print_error(error)
Example #15
 def visit(self, featureset):
     try:
         _result = []
         for text in featureset.get_column_values(self._column):
             if isinstance(text, list):
                 _preprocessed = []
                 for word in text:
                     _preprocessed.append(sum(len(x) for x in word))
             else:
                 _preprocessed = sum(len(x) for x in text)
             _result.append(_preprocessed)
         featureset.set_dataframe_column(self._column, np.asarray(list(_result))[:, np.newaxis])
     except Exception as error:
         util.print_error("Unable to create character sum of text")
         util.print_error(error)
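
The [:, np.newaxis] at the end turns the flat list of counts into a column vector, i.e. shape (n,) becomes (n, 1):

    import numpy as np

    counts = np.asarray([3, 5, 2])[:, np.newaxis]
    print(counts.shape)  # (3, 1)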
Example #16
 def visit(self, featureset):
     try:
         _result = []
         for text in featureset.get_column_values(self._column):
             if isinstance(text, list):
                 _preprocessed = []
                 for word in text:
                     _preprocessed.append(1 if word else 0)
             else:
                 _preprocessed = 1 if text else 0
             _result.append(_preprocessed)
         featureset.set_dataframe_column(
             self._column,
             np.asarray(list(_result))[:, np.newaxis])
     except Exception as error:
         util.print_error("Unable to create binary column")
         util.print_error(error)
Example #17
 def visit(self, featureset):
     try:
         _result = []
         for text in featureset.get_column_values(self._column):
             if isinstance(text, list):
                 _preprocessed = []
                 for word in text:
                     _preprocessed.append(self.n_gram(word))
                 _result.append(_preprocessed)
             else:
                 _preprocessed = self.n_gram(text)
                 _result.append(_preprocessed)
         featureset.set_dataframe_column(self._column, _result)
     except Exception as error:
         util.print_error("Unable to Create Word NGrams")
         util.print_error(error)
Example #18
 def visit(self, featureset):
     try:
         _result = []
         for text in featureset.get_column_values(self._column):
             if isinstance(text, list):
                 _preprocessed = []
                 for word in text:
                     _preprocessed.append(self.remove(word))
             else:
                 _preprocessed = self.remove(text)
             _result.append(_preprocessed)
         _new_result = np.asarray(list(_result))[:, np.newaxis]
         _new_result = _new_result.reshape(
             featureset.get_column_values(self._column).shape)
         featureset.set_dataframe_column(self._column, _new_result)
     except Exception as error:
         util.print_error("Unable to tokenize column")
         util.print_error(error)
Example #19
 def visit(self, element):
     try:
         featuresets = {}
         data = element.get_dataframe()
         temp_data = []
         self.create_split_list(data.shape[0])
         for key, value in self._id_split.items():
             if self._mode == "sequential":
                 temp_data = data[:value]
             elif self._mode == "random":
                 temp_data = np.split(data, value)
             data = data.iloc[value:]
             featuresets[key] = temp_data
         return featuresets
     except Exception as error:
         Util.print_error(
             "Unable to split Featureset in multiple Frames: " + str(error))
         Util.print_detailed_error()
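
In sequential mode the split dict is consumed front to back: each key takes the next value rows and the remainder carries over to the next key. A self-contained sketch of that loop (create_split_list and the element wrapper are project-specific, so the split sizes are hard-coded here):

    import pandas as pd

    data = pd.DataFrame({"a": range(10)})
    id_split = {"train": 6, "test": 4}  # hypothetical output of create_split_list
    featuresets = {}
    for key, value in id_split.items():
        featuresets[key] = data.iloc[:value]
        data = data.iloc[value:]
    print({k: len(v) for k, v in featuresets.items()})  # {'train': 6, 'test': 4}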
Example #20
File: mask.py Project: nsiegmun/shaperML
 def visit(self, element):
     try:
         featureset = element.get_dataframe()
         if self._column is None:
             featureset = featureset.mask(eval(self._condition))
         else:
             feature = featureset[self._column]
             if feature.dtype == "object":
                 if isinstance(feature.iloc[0], (list, tuple, np.ndarray)):
                     feature = self.expand(pd.DataFrame(feature),
                                           self._column)
                     feature = feature.mask(eval(self._condition))
                     feature = list(feature.values)
             else:
                 feature = feature.mask(eval(self._condition))
             featureset[self._column] = feature
         element.set_dataframe(featureset)
     except Exception as error:
         Util.print_error("Unable to mask featureset: " + str(error))
         Util.print_detailed_error()
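
DataFrame.mask replaces every cell where the condition holds with NaN and leaves the rest untouched; the eval'd condition string above is expected to produce such a boolean mask:

    import pandas as pd

    s = pd.Series([1, 5, 10])
    print(s.mask(s > 4))  # 1.0, NaN, NaN - cells where the condition holds are masked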
Example #21
    def visit(self, element):
        try:
            featureset_df = element.get_dataframe()
            new_featureset = pd.DataFrame()
            if self._sequential is True:
                temp_dataframe = pd.DataFrame()
                temp_value = None
                temp_index = []
                for index, row in featureset_df.iterrows():
                    # Set the first index-list element
                    if temp_value is None:
                        temp_value = row[self._column]
                        temp_index.append(index)
                    else:
                        # When the column value changes, flush the collected
                        # rows into the new dataframe
                        if temp_value != row[self._column]:
                            # Save the value from the selected column
                            first_value_frame = pd.DataFrame(
                                {self._column: [temp_value]})

                            # Calculate the value for every column
                            second_value_frame = self.select_numeric_feature(
                                temp_dataframe)

                            # Transform the Series into a DataFrame and transpose it
                            second_value_frame = second_value_frame.to_frame(
                            ).transpose()

                            # Add every column to the dataframe
                            for name, value in second_value_frame.items():
                                first_value_frame[name] = value

                            # Add the new row to the dataframe
                            # (DataFrame.append was removed in pandas 2.0)
                            new_featureset = pd.concat(
                                [new_featureset, first_value_frame],
                                ignore_index=True)
                            temp_dataframe = pd.DataFrame()

                            # Add the index to the index list
                            temp_index.append(index)
                            temp_value = row[self._column]
                    # Add the row to the temporary dataframe (loc, not iloc:
                    # iterrows yields index labels, not positions)
                    temp_dataframe = pd.concat(
                        [temp_dataframe, featureset_df.loc[[index]]])

                # Flush the last group: save the value from the selected column
                first_value_frame = pd.DataFrame({self._column: [temp_value]})

                # Calculate the value for every column
                second_value_frame = self.select_numeric_feature(
                    temp_dataframe)

                # Transform the Series into a DataFrame and transpose it
                second_value_frame = second_value_frame.to_frame().transpose()

                # Add every column to the dataframe for the last row
                for name, value in second_value_frame.items():
                    first_value_frame[name] = value

                new_featureset = pd.concat([new_featureset, first_value_frame],
                                           ignore_index=True)
            else:
                accumulate_list = featureset_df[self._column].unique()
                temp_unique_list = accumulate_list
                temp_index = []
                # Create the index list for unique values
                for index, row in featureset_df.iterrows():
                    for value in temp_unique_list:
                        if value == row[self._column]:
                            temp_unique_list = temp_unique_list[
                                temp_unique_list != value]
                            temp_index.append(index)
                            break
                for value in accumulate_list:
                    # Select all rows whose column matches the value
                    temp_dataframe = featureset_df.loc[featureset_df[
                        self._column] == value]

                    # Save the value from the selected column
                    first_value_frame = pd.DataFrame({self._column: [value]})

                    # Calculate the value for every column
                    second_value_frame = self.select_numeric_feature(
                        temp_dataframe)

                    # Transform the Series into a DataFrame and transpose it
                    second_value_frame = second_value_frame.to_frame(
                    ).transpose()

                    # Add every column to the dataframe
                    for name, second_value in second_value_frame.items():
                        first_value_frame[name] = second_value
                    new_featureset = pd.concat(
                        [new_featureset, first_value_frame],
                        ignore_index=True)

            # Update the dataframe index
            if self._save_index:
                new_featureset["#index#"] = temp_index
                new_featureset = new_featureset.set_index("#index#")
                new_featureset.index.name = None

            # Reindex the columns of the new dataframe against the old one
            new_featureset = new_featureset.reindex(
                columns=featureset_df.columns)
            element.set_dataframe(new_featureset)

        except Exception as error:
            Util.print_error("Unable to condense Dataframe: " + str(error))
            Util.print_detailed_error()
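
The non-sequential branch is in effect a hand-rolled groupby: for each unique value it selects the matching rows and reduces the numeric columns. Assuming select_numeric_feature computes a mean, idiomatic pandas gives the same result in one line:

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, 3.0, 5.0]})
    print(df.groupby("g", as_index=False).mean(numeric_only=True))
    #    g    v
    # 0  a  2.0
    # 1  b  5.0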
Example #22
    def visit(self, featureset):
        try:
            # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then)
            tokenizer = None
            if self._extraction_target == "word":
                tokenizer = LemmaTokenizer(LanguageProcessor())
            elif self._extraction_target == "pos":
                tokenizer = POSTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_simple":
                tokenizer = NamedEntityTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_detailed":
                tokenizer = NamedEntityTokenizer(LanguageProcessor(), detailed=True)
            elif self._extraction_target.startswith("wordlist"):
                path = self._extraction_target.split("_")[1]
                tokenizer = WordlistEntryTokenizer(LanguageProcessor(), wordlist=path)

            # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then)
            vectorizer = None
            binary = self._measure == "presence" or self._extraction_type == "presence"
            if self._ngram is None:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
                else:
                    # TODO: this is absolute term frequency - what about relative?
                    #   For n-grams this is not easy:
                    #   - count the number of n-grams in each document and divide each
                    #     feature generated from the document's n-gram counts by that amount
                    #   For named entities:
                    #   - count the words inside named entities (not just the number of
                    #     NEs) and divide by the number of tokens in the document
                    #   ...

                    vectorizer = CountVectorizer(tokenizer=tokenizer, binary=binary)
            else:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=self._ngram)
                else:
                    vectorizer = CountVectorizer(tokenizer=tokenizer, ngram_range=self._ngram, binary=binary)
            temp_column = featureset.get_featureset()[self._column]
            temp_column = temp_column.values

            new_column = []
            "Note: Presence and Count for every(einzeln) feature or for all(alle) feature"
            if self._extraction_type == "bow" or self._extraction_type == "ngram":
                # Return Matrix
                new_column = list(vectorizer.fit_transform(temp_column).toarray())
            elif self._extraction_type == "list":
                # Return String Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(analyzer(row))
            elif self._extraction_type == "presence":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(1 if len(analyzer(row)) > 0 else 0)
                    # new_column.append(len(analyzer(row)) > 0)
            elif self._extraction_type == "count":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(len(analyzer(row)))
            return new_column
        except Exception as error:
            util.print_error("Failed to use Language Processor " + str(error))
            util.print_detailed_error()
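
For the 'bow' and 'ngram' extraction types, the vectorizer produces one numeric column per vocabulary entry; a standalone example with the default tokenizer standing in for the project's custom tokenizers:

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(ngram_range=(1, 2))
    X = vec.fit_transform(["the cat sat", "the dog sat"])
    print(vec.get_feature_names_out())  # unigrams and bigrams
    print(X.toarray())                  # one count column per vocabulary entry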