Пример #1
0
 def addFile(self, dir_entry, file_type, folder_database, parent_path):
     """
     Register one directory entry in the folder database.

     Parameters:
         dir_entry entry whose name supplies both the path and the keywords
         file_type key selecting the per-type section of folder_database
         folder_database database that receives the new file object
         parent_path directory that dir_entry.name is joined onto
     """
     name_keywords = splitPattern(os.path.basename(dir_entry.name))
     full_path = os.path.join(parent_path, dir_entry.name)
     # Description is left empty for now; it may be generated in the future
     new_file = FileObject(full_path, file_type, '', False)
     # Names already registered for this file type are handed to createName
     known_names = folder_database._argument_database[file_type].name_dict.keys()
     entry_name, _, _ = createName(known_names, name_keywords)
     folder_database.add(file_type, name_keywords, new_file, entry_name)
Пример #2
0
 def __init__(self):
     """
     Build the command-type database with one entry per member of
     AbstractCommand.CommandType.
     """
     self.commandtype_database = Database()
     for member in AbstractCommand.CommandType:
         member_name = member.name
         # Keywords are the split name followed by the lower-cased full name
         member_keywords = splitPattern(member_name)
         member_keywords.append(member_name.lower())
         self.commandtype_database.add(member_keywords,
                                       member,
                                       name='.'.join(member_keywords))
Пример #3
0
 def evaluate(self, array_data, target):
     """
     Check every element of array_data.data against the words obtained by
     splitting target.data.

     Returns a ResultObject of type DataType.logical_array holding one
     containsWordList result per element.
     """
     words = splitPattern(target.data)
     matches = [self.containsWordList(item, words) for item in array_data.data]
     outcome = ResultObject(np.array(matches), [], DataType.logical_array,
                            CommandStatus.Success, True)
     outcome.createName(array_data.keyword_list,
                        words,
                        command_name='contains',
                        set_keyword_list=True)
     return outcome
Пример #4
0
    def read(self, file_path, keyword_list, recursive=False, folder_database=None):
        """
        Scan a folder and register its csv/excel and image files.

        Parameters:
            file_path folder location; if it is not a directory as given, it
                is retried relative to the user's home directory
            keyword_list keywords used to describe the folder; when empty
                (or None) they are derived from file_path
            recursive when True, sub-folders are scanned as well
            folder_database database the files are added to; a new
                TypeDatabase is created when None (recursive calls pass the
                shared one along)
        Returns:
            A ResultObject with CommandStatus.Error if the folder cannot be
            found. Otherwise a ResultObject wrapping a FolderObject when this
            call created the database, or False when folder_database was
            supplied by the caller.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        # Only the call that creates the database builds the final result
        create_result = folder_database is None
        if create_result:
            folder_database = TypeDatabase(data_type_list=[DataType.csv, DataType.image])

        if not keyword_list:
            keyword_list = splitPattern(file_path)
        if not os.path.isdir(file_path):
            file_path = os.path.join(os.path.expanduser('~'),
                                     file_path)
            if not os.path.isdir(file_path):
                print("Cannot find folder: ", file_path)
                return result_object
        # The context manager closes the directory handle even if adding a
        # file or a recursive call raises part-way through the listing
        with os.scandir(file_path) as dir_entries:
            for dir_entry in dir_entries:
                if self.checkEndsWith(dir_entry.name, ['.csv', '.xlsx']) and dir_entry.is_file():
                    self.addFile(dir_entry, DataType.csv, folder_database, file_path)
                elif self.checkEndsWith(dir_entry.name, ['.png', '.jpg', '.JPG', '.jpeg']) and dir_entry.is_file():
                    self.addFile(dir_entry, DataType.image, folder_database, file_path)
                if recursive and dir_entry.is_dir():
                    dir_keywords = splitPattern(dir_entry.name)
                    self.read(os.path.join(file_path, dir_entry.name), keyword_list + dir_keywords, True, folder_database)
        if not create_result:
            return False
        folder_object = FolderObject(folder_database, file_path)
        result_object = ResultObject(folder_object, keyword_list, DataType.folder, CommandStatus.Success)
        result_object.createName(keyword_list)
        return result_object
Пример #5
0
 def removeCommonNames(self, input_names):
     """
     Remove the words shared by every name from each of the names.

     Parameters:
         input_names sized collection of strings; each is split into words
             with splitPattern
     Returns:
         (out_names, common_name): out_names has one string per input name
         with the shared words removed, and common_name is the shared words
         joined by spaces in the order they appear in the first name. For a
         single input name both hold that name's words joined by spaces;
         for empty input the result is ([], '').
     """
     if not input_names:
         # set.intersection needs at least one set, so there is nothing to
         # compare here
         return [], ''
     name_list_list = [splitPattern(name) for name in input_names]
     if len(name_list_list) == 1:
         common_name = ' '.join(name_list_list[0])
         return [common_name], common_name
     common_name_set = set(name_list_list[0]).intersection(*name_list_list[1:])
     # Join in first-name order (duplicates collapsed) so the result does
     # not depend on set iteration order, which varies between runs
     common_name = ' '.join(
         word for word in dict.fromkeys(name_list_list[0])
         if word in common_name_set)
     out_names = [
         ' '.join(self.removeFromList(name_list, common_name_set))
         for name_list in name_list_list
     ]
     return out_names, common_name
Пример #6
0
 def add_categories_as_columns(self, uniqVals, col_data, col_split,
                               result_objects, command_status):
     """
         Expand a categorical column into one logical array per unique
         value, appending each to result_objects (which is also returned)
     """
     for unique_value in uniqVals:
         membership = col_data == unique_value
         value_words = splitPattern(str(unique_value))
         # Lower-cased words of the value come first, then the column's own
         keyword_list = [word.lower() for word in value_words] + col_split
         # Multiplying by 1 turns the boolean comparison into 0/1 values
         entry = ResultObject(membership * 1, keyword_list,
                              DataType.logical_array,
                              command_status)
         entry.createName(keyword_list)
         result_objects.append(entry)
     return result_objects
Пример #7
0
    def preRead(self, file_path, keyword_list):
        """
        Load a table from file_path and expose it and its columns as results.

        The file is parsed with pandas.read_csv first and pandas.read_excel
        as a fallback.

        Parameters:
            file_path location of the csv/excel file
            keyword_list keywords used to describe the whole table
        Returns:
            A list whose first element is the ResultObject for the whole
            table, followed by one ResultObject per extracted column (none
            when the table has more than 5000 columns). If the file cannot
            be parsed, a single ResultObject with CommandStatus.Error is
            returned instead of a list.
        """
        command_status = CommandStatus.Success
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate
        try:
            data = pd.read_csv(file_path)
        except Exception:
            try:
                data = pd.read_excel(file_path)
            except Exception:
                return ResultObject("File not found", None, None,
                                    CommandStatus.Error)
        result_object = ResultObject(data,
                                     keyword_list,
                                     DataType.csv,
                                     command_status,
                                     add_to_cache=True)
        result_object.createName(result_object.keyword_list)
        result_objects = [result_object]
        # Too many columns do not extract them individually
        if len(data.columns) > 5000:
            return result_objects
        new_column_names = []
        for column in data.columns:
            # Columns whose header matches col_head_pattern are removed from
            # the table and get no result of their own
            if self.col_head_pattern.match(column):
                data.drop(column, axis=1, inplace=True)
                continue
            col_split = splitPattern(column)
            col_data = data[column].values
            col_keyword_list = col_split

            if col_data.size == 0:
                # No rows: nothing to extract, but keep the header so the
                # rename after the loop still has one name per column
                new_column_names.append(column)
                continue
            if isinstance(col_data[0], str):
                if '%' in col_data[0]:
                    # Percent strings: strip the trailing '%' and store the
                    # column as floats when every value converts
                    try:
                        col_data = data[column].str.rstrip('%').astype(
                            float, copy=False)
                        data[column] = col_data
                        if 'percent' not in col_keyword_list:
                            col_keyword_list.append('percent')
                    except ValueError:
                        pass
                elif '$' in col_data[0] or ',' in col_data[0]:
                    # Currency-like strings are run through currency_dict
                    # (presumably dropping '$' and ',' -- confirm where it
                    # is defined) and stored as floats when they convert
                    try:
                        col_data = data[column].str.translate(
                            self.currency_dict).astype(float, copy=False)
                        data[column] = col_data
                        if '$' not in col_keyword_list:
                            col_keyword_list.append('$')
                    except ValueError:
                        pass
            result_object = ResultObject(col_data,
                                         col_keyword_list,
                                         DataType.array,
                                         command_status,
                                         add_to_cache=True)
            result_object.createName(col_keyword_list)
            new_column_names.append(result_object.name)
            result_objects.append(result_object)
            # NOTE: the per-column unique-value (categorical) search is
            # disabled for now because it is pretty slow
        # Replace columns:
        data.columns = new_column_names
        return result_objects