예제 #1
0
    def __validate_input(self):
        """Validate the configured inputs and initialise ``self.new_df``.

        Checks, in order, that ``self.df`` is a pandas DataFrame, that
        ``self.type`` is one of the supported scaler names, and (when
        ``self.columns`` is given) that every entry is a string naming an
        existing column of ``self.df``.

        Raises:
            ValueError: if ``self.df`` or ``self.type`` is ``None``.
            TypeError: if ``self.df`` is not a DataFrame, ``self.type`` is not
                a string, or a column name is not a string.
            ArgumentsError: if ``self.type`` is not a supported scaler name,
                or a requested column is missing from the dataframe.
        """
        if self.df is None:
            raise ValueError("Feature dataframe should not be of None type")

        # isinstance (instead of an exact type match) also accepts DataFrame
        # subclasses, which behave like dataframes for every check below.
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError(
                "Feature dataframe is not a valid dataframe.\nExpected object type: pandas.core.frame.DataFrame"
            )

        if self.type is None:
            raise ValueError("Feature type should not be of None type")
        if not isinstance(self.type, str):
            raise TypeError('Expected string value for argument "type" ')
        if self.type not in ("MinMaxScaler", "BinaryScaler", "StandardScaler"):
            raise ArgumentsError(
                f'Allowed argument for type is "MinMaxScaler" or "BinaryScaler" or "StandardScaler", got {self.type}'
            )

        if self.columns is not None:
            # Build the lookup once so each membership test is O(1).
            existing_columns = set(self.df.keys())
            for column in self.columns:
                if not isinstance(column, str):
                    raise TypeError(f"Expected str type column, got {type(column)}")
                if column not in existing_columns:
                    raise ArgumentsError(f"Column {column} does not exist in dataframe")

        # NOTE: this binds the same object, it does not copy the dataframe.
        self.new_df = self.df
예제 #2
0
 def isempty(content):
     """Return True when *content* is an "empty" integer or string.

     An integer is empty when it equals 0. A string is empty when,
     lower-cased, it is one of ``""``, ``"null"`` or ``"[]"``.

     Raises:
         ArgumentsError: if *content* is neither an integer nor a string.
     """
     # ``basestring`` only exists on Python 2 (where it covers both str and
     # unicode); fall back to ``str`` so the check also works on Python 3.
     try:
         string_types = basestring
     except NameError:
         string_types = str

     if isinstance(content, int):
         return content == 0
     if isinstance(content, string_types):
         return content.lower() in ("", "null", "[]")
     raise ArgumentsError()
예제 #3
0
    def speech_recognition(self, content, _attrs=None):
        """Run the external speech-recognition command for a directory.

        Args:
            content: JSON text for the record(s) being processed, or ``None``.
                A single JSON value is wrapped in a one-element list.
            _attrs: attribute mapping. Must contain a non-empty ``"dest_dir"``,
                which is substituted into ``Config.SPEECH_RECOGNITION_CMD``.

        Returns:
            The result of ``ProcessorResponse(...)._print()`` built from the
            parsed content and the command string.

        Raises:
            ArgumentsError: if ``dest_dir`` is missing or empty.
        """
        if content is not None:
            content = json.loads(content)
            if not isinstance(content, list):
                content = [content]

        # Reject a missing OR empty dest_dir. The previous "not in ... and not
        # isempty(...)" form could never raise ArgumentsError: a missing key
        # hit KeyError first, and an empty value passed straight through.
        if (not _attrs
                or "dest_dir" not in _attrs
                or Utils.isempty(_attrs["dest_dir"])):
            raise ArgumentsError("dest_dir Is Not Set In Attributes.")

        cmd = Config.SPEECH_RECOGNITION_CMD.format(_attrs["dest_dir"])
        if not self.is_test_mode:
            result_root = Config.SPEECH_RECOGNITION_RESULT_DIR
            required_dirs = [
                result_root,
                os.path.join(result_root, "voiceconflict"),
                os.path.join(result_root, "voiceresult"),
                os.path.join(result_root, "voicesence"),
            ]
            for required_dir in required_dirs:
                if not os.path.isdir(required_dir):
                    # 0o755 is the octal spelling accepted by Python 2.6+ and 3.
                    os.makedirs(required_dir, 0o755)

            self.logger.info("Speech Recognition Command: {0}".format(cmd))
            # NOTE(review): cmd goes through the shell with dest_dir
            # interpolated; confirm dest_dir can never be attacker-controlled.
            os.system(cmd)
            time.sleep(self.cmd_sleep)

        return ProcessorResponse(corrects=content, attributes={
            "command": cmd
        })._print()
예제 #4
0
    def create_partition(self, content, _attrs=None):
        """Derive partition attributes from the first record and create the partition.

        Args:
            content: JSON text for the record(s), or ``None``. A single JSON
                value is wrapped in a one-element list. The first record must
                contain ``"start_time"`` and ``"area_of_job"``.
            _attrs: unused by this method.

        Returns:
            The result of ``ProcessorResponse(...)._print()`` built from the
            parsed content and the derived attributes.

        Raises:
            ArgumentsError: if the first record lacks ``start_time`` or
                ``area_of_job``, or if any derived attribute is still an empty
                string (which includes the case of no records at all).
        """
        attrs = {
            "recorddate": "",
            "filename": "",
            "mysql_put_date": "",
            "area_of_job": "",
            "date": "",
            "batch": "",
            "command": "",
        }

        if content is not None:
            content = json.loads(content)
            if not isinstance(content, list):
                content = [content]

        # Only the first record is inspected: it either supplies the partition
        # fields or the call is rejected. ``content or []`` lets a missing
        # payload reach the ArgumentsError below instead of a TypeError here.
        for item in content or []:
            if "start_time" not in item or "area_of_job" not in item:
                raise ArgumentsError("start_time or area_of_job is Empty.")

            start_time = item["start_time"]
            (year, mon, day) = Utils.timestamp_to_ymd(start_time)
            attrs["recorddate"] = Utils.timestamp_to_partiton(start_time)
            attrs["date"] = "{0}{1}{2}".format(year, mon, day)
            attrs["filename"] = "{0}_hive".format(attrs["date"])
            attrs["mysql_put_date"] = "{0}-{1}-{2}".format(year, mon, day)
            attrs["area_of_job"] = item["area_of_job"]
            attrs["batch"] = attrs["date"]
            break

        if attrs["recorddate"] != "":
            cmd = Config.CREATE_HIVE_PARTITIONS_COMMAND_PATTERN.format(
                recorddate=attrs["recorddate"]).strip()
            attrs["command"] = cmd
            self.logger.info("Create Partition CMD: {0}".format(cmd))
            if not self.is_test_mode:
                os.system(cmd)
                time.sleep(self.cmd_sleep)

        # Validation deliberately stays after the command, as in the original
        # ordering; every attribute (including "command") must be non-empty.
        if any(value == "" for value in attrs.values()):
            raise ArgumentsError("create_partition attrs value is Empty.")

        return ProcessorResponse(corrects=content, attributes=attrs)._print()
예제 #5
0
    def wav2png(self, content, _attrs=None):
        """Generate waveform images for audio files and attach them to the records.

        Args:
            content: JSON text for the record(s), or ``None``. A single JSON
                value is wrapped in a one-element list.
            _attrs: attribute mapping. When there is content to process it must
                contain a non-empty ``"dest_dir"`` (the input directory). It
                may override ``"ftp_download_root_dir"`` and ``"wavform_root"``.

        Returns:
            The result of ``ProcessorResponse(...)._print()``. Each record with
            a ``"download_path"`` gets ``"id"`` set to the file's base name
            and, outside test mode, either ``"waveform"`` (prefix plus
            base64-encoded PNG) or ``"wav2png_errors"`` (the failure message).

        Raises:
            ArgumentsError: if there is content to process but ``dest_dir`` is
                missing or empty.
        """
        cmd = None
        if content is not None:
            content = json.loads(content)
            if not isinstance(content, list):
                content = [content]

        if not Utils.jsonobj_isempty(content):
            # Reject a missing OR empty dest_dir. The previous "not in ... and
            # not isempty(...)" form could never raise ArgumentsError.
            if (not _attrs
                    or "dest_dir" not in _attrs
                    or Utils.isempty(_attrs["dest_dir"])):
                raise ArgumentsError("dest_dir Is Not Set In Attributes.")

            ftp_dw_root = Utils.use_if_set_else_default(
                "ftp_download_root_dir", _attrs, Config.FTP_DOWNLOAD_ROOT_DIR)
            wavform_root = Utils.use_if_set_else_default(
                "wavform_root", _attrs, Config.WAVFORM_ROOT_DIR)

            # Mirror the part of dest_dir that follows the download root
            # underneath the waveform root.
            input_dir = _attrs["dest_dir"]
            output_dir = os.path.join(
                wavform_root,
                *input_dir.split(ftp_dw_root)[-1].split("/"))

            (cmd, _, _output_dir) = Utils.wav2png(input_dir, output_dir,
                                                  self.is_test_mode)
            self.logger.info("WAV Transform PNG Command: {0}".format(cmd))
            for _file in content:
                if "download_path" not in _file:
                    continue

                fname = os.path.basename(_file["download_path"])
                png_path = os.path.join(_output_dir, "{0}.png".format(fname))
                self.logger.info("Get PNG Result [{0}]".format(png_path))
                if not self.is_test_mode:
                    # Best effort: a failure for one file is recorded on that
                    # record and does not abort the remaining files.
                    try:
                        _content = Utils.get_file_strcontents(png_path)
                        # b64encode returns bytes on Python 3; decode so the
                        # formatted value is plain text instead of "b'...'".
                        encoded = base64.b64encode(_content).decode("ascii")
                        _file["waveform"] = "{0}{1}".format(
                            Config.WAVFORM_PREFIX, encoded)
                    except Exception as e:
                        _file["wav2png_errors"] = str(e)

                _file["id"] = fname

        return ProcessorResponse(corrects=content, attributes={
            "command": cmd
        })._print()
예제 #6
0
    def wav2png(input_dir=None, output_dir=None, is_test_mode=False):
        """Build, and unless in test mode run, the WAV-to-PNG conversion command.

        Both directories are passed through ``Utils.append_suffix_not_exists``
        with ``"/"`` before use. The output directory is created when it is
        not already a directory.

        Returns:
            A ``(command, input_dir, output_dir)`` tuple holding the command
            string and the two suffixed directory paths.

        Raises:
            ArgumentsError: if either directory argument is ``None``.
            FileNotFoundError: if the input directory is not an existing
                directory.
        """
        if any(path is None for path in (input_dir, output_dir)):
            raise ArgumentsError()

        source_dir = Utils.append_suffix_not_exists(input_dir, "/")
        target_dir = Utils.append_suffix_not_exists(output_dir, "/")

        source_ready = os.path.exists(source_dir) and os.path.isdir(source_dir)
        if not source_ready:
            raise FileNotFoundError(source_dir)

        target_ready = os.path.exists(target_dir) and os.path.isdir(target_dir)
        if not target_ready:
            os.makedirs(target_dir)

        command = Config.WAV_TO_PNG_COMMAND.format(
            input_dir=source_dir, output_dir=target_dir)
        if is_test_mode:
            # Test mode only reports the command; nothing is executed.
            return (command, source_dir, target_dir)

        os.system(command)
        return (command, source_dir, target_dir)
예제 #7
0
    def split_flowfiles_for_stt(self, content=None, _attrs=None):
        """Split records into chunks and move each chunk's files to its own directory.

        The speech-recognition module runs as a cluster, so the data has to be
        split up beforehand; and because recognition takes a directory as its
        entry point, the split audio files are moved into separate
        ``CHUNK-<n>`` sub-directories of ``dest_dir``.

        Args:
            content: JSON text for the record(s), or ``None``. A single JSON
                value is wrapped in a one-element list. Each record must have
                a ``"DOCUMENTPATH"`` key.
            _attrs: attribute mapping. Must contain a non-empty ``"dest_dir"``.
                May override ``"split_group_number"``.

        Returns:
            A list with one ``ProcessorResponse(...)._print()`` result per
            chunk (empty when *content* is ``None``). Each chunk's attributes
            are a shallow copy of ``_attrs`` with ``"dest_dir"`` pointing at
            that chunk's directory.

        Raises:
            ArgumentsError: if ``dest_dir`` is missing or empty.
        """
        # Reject a missing OR empty dest_dir. The previous "not in ... and not
        # isempty(...)" form could never raise ArgumentsError.
        if (not _attrs
                or "dest_dir" not in _attrs
                or Utils.isempty(_attrs["dest_dir"])):
            raise ArgumentsError("dest_dir Is Not Set In Attributes.")

        result = []
        if content is None:
            return result

        content = json.loads(content.strip())
        if not isinstance(content, list):
            content = [content]

        split_group_number = int(
            Utils.use_if_set_else_default("split_group_number", _attrs,
                                          Config.DEFAULT_SPLIT_NUMBER))

        dest_dir = _attrs["dest_dir"]
        content_chunks = Utils.groups(content, split_group_number)
        for i, content_chunk in enumerate(content_chunks):
            # Give each chunk its own attribute mapping so the caller's
            # mapping (and the other chunks') keep their own dest_dir.
            chunk_attrs = copy.copy(_attrs)
            chunk_dir = os.path.join(dest_dir, "CHUNK-{0}".format(i + 1))
            if not os.path.isdir(chunk_dir):
                os.makedirs(chunk_dir)

            chunk_attrs["dest_dir"] = chunk_dir
            for _file in content_chunk:
                filename = os.path.basename(_file["DOCUMENTPATH"])
                _src = os.path.join(dest_dir, filename)
                _dest = os.path.join(chunk_dir, filename)
                self.logger.debug("Copy Src:[{0}] To Dest:[{1}]".format(
                    _src, _dest))
                if not self.is_test_mode:
                    # The file is moved, not copied, despite the log wording.
                    shutil.move(_src, _dest)

            result.append(
                ProcessorResponse(corrects=content_chunk,
                                  attributes=chunk_attrs)._print())

        return result