示例#1
0
class OWPreprocess(OWWidget):
    """Widget that builds and runs a text pre-processing pipeline.

    Every class listed in ``preprocessors`` is instantiated as a stage widget
    and stacked in a scrollable frame. On commit, the ``value`` of each stage
    is copied onto a ``preprocess.Preprocessor`` which is then applied, in
    place, to a copy of the corpus received on the input.
    """

    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    inputs = [(Input.CORPUS, Corpus, 'set_data')]
    outputs = [(Output.PP_CORPUS, Corpus)]

    autocommit = settings.Setting(True)

    # Stage classes, in the order they are instantiated and laid out.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    # One settings provider per stage class.
    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data, if you didn't have it "
            "already, was downloaded to: {}".format(Downloader().default_download_dir()),
            "nltk_data"),
    ]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("Problem while loading Stanford POS Tagger\n{}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg('No tokens on output! Please, change configuration.')

    def __init__(self, parent=None):
        """Build the info box, the pipeline stage widgets and the buttons."""
        super().__init__(parent)
        self.corpus = None
        # ngram range of the corpus as it arrived on the input; it is restored
        # before each run because preprocessing happens in place
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        pipeline_frame = QFrame()
        pipeline_frame.setContentsMargins(0, 0, 0, 0)
        pipeline_frame.setFrameStyle(QFrame.Box)
        pipeline_frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        pipeline_layout = QVBoxLayout()
        pipeline_layout.setContentsMargins(0, 0, 0, 0)
        pipeline_layout.setSpacing(0)
        pipeline_frame.setLayout(pipeline_layout)

        # Instantiate every stage, expose it under its own attribute name and
        # re-run the pipeline whenever the stage reports a change.
        self.stages = []
        for stage_cls in self.preprocessors:
            stage_widget = stage_cls(self)
            self.stages.append(stage_widget)
            setattr(self, stage_cls.attribute, stage_widget)
            pipeline_layout.addWidget(stage_widget)
            stage_widget.change_signal.connect(self.settings_invalidated)

        pipeline_layout.addStretch()
        self.scroll = QScrollArea()
        self.scroll.setWidget(pipeline_frame)
        self.scroll.setWidgetResizable(True)
        self.scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        self.scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.scroll.resize(pipeline_layout.sizeHint())
        self.scroll.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(self.scroll)

        # -- BUTTONS --
        self.report_button.setFixedWidth(self.control_area_width)

        commit_button = gui.auto_commit(
            self.buttonsArea, self, 'autocommit', 'Commit', box=False)
        commit_button.setFixedWidth(self.control_area_width - 5)

        self.buttonsArea.layout().addWidget(commit_button)

    def set_data(self, data=None):
        """Store a copy of the incoming corpus (or ``None``) and commit."""
        if data is None:
            self.corpus = None
            self.initial_ngram_range = None
        else:
            self.corpus = data.copy()
            self.initial_ngram_range = data.ngram_range
        self.commit()

    def update_info(self, corpus=None):
        """Show document/token/type counts of ``corpus`` in the info label."""
        if corpus is None:
            self.info_label.setText('No corpus.')
            return

        n_documents = len(corpus)
        n_tokens = sum(len(doc_tokens) for doc_tokens in corpus.tokens)
        n_types = len(corpus.dictionary)
        summary = ('Document count: {}\n'
                   'Total tokens: {}\n'
                   'Total types: {}').format(n_documents, n_tokens, n_types)
        self.info_label.setText(summary)

    def commit(self):
        """Run the pipeline on the stored corpus, or send ``None`` without one."""
        self.Warning.no_token_left.clear()
        if self.corpus is None:
            self.update_info()
            self.send(Output.PP_CORPUS, None)
            return
        self.apply()

    def apply(self):
        """Start preprocessing of the stored corpus."""
        self.preprocess()

    # NOTE(review): ``asynchronous`` is defined outside this class; presumably
    # it runs the call off the GUI thread and dispatches the on_start /
    # callback / on_result hooks registered below - confirm.
    @asynchronous
    def preprocess(self):
        """Copy stage values to the preprocessor and apply it to the corpus."""
        for stage_widget in self.stages:
            setattr(self.preprocessor, stage_widget.attribute, stage_widget.value)
        # the corpus is modified in place, so undo what a previous run set
        self.corpus.pos_tags = None
        self.corpus.ngram_range = self.initial_ngram_range
        return self.preprocessor(
            self.corpus, inplace=True, on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Set the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Update the info label, send the result and finish the progress bar.

        A result whose dictionary is empty raises the ``no_token_left``
        warning and is replaced by ``None`` on the output.
        """
        self.update_info(result)
        nothing_left = result is not None and len(result.dictionary) == 0
        if nothing_left:
            self.Warning.no_token_left()
        self.send(Output.PP_CORPUS, None if nothing_left else result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Make the scroll area at least as wide as the widest enabled stage."""
        enabled_widths = [stage_widget.sizeHint().width()
                          for stage_widget in self.stages
                          if stage_widget.enabled]
        # 250 is the lower bound; 20 extra pixels are added on top
        widest = max([250] + enabled_widths)
        self.scroll.setMinimumWidth(widest + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """React to a stage change: refresh the minimal width and commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Report the preprocessor's settings under the 'Preprocessor' heading."""
        self.report_items('Preprocessor', self.preprocessor.report())
示例#2
0
class OWPreprocess(OWWidget):
    """Text pre-processing widget.

    Receives a corpus, lets the user configure a sequence of stage widgets
    (one per class in ``preprocessors``) and outputs the corpus produced by a
    ``preprocess.Preprocessor`` configured from those stages.
    """

    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Corpus", Corpus)

    autocommit = settings.Setting(True)

    # Stage classes; __init__ creates one widget per entry, in this order.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data was downloaded "
            "to: {}".format(nltk_data_dir()),
            "nltk_data",
        ),
    ]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("Problem while loading Stanford POS Tagger\n{}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg('No tokens on output! Please, change configuration.')

    def __init__(self, parent=None):
        """Create the info box, the scrollable stage list and the buttons."""
        super().__init__(parent)
        self.corpus = None
        # ngram range the input corpus came with; preprocessing is done in
        # place, so this value is written back before every run
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        container = QFrame()
        container.setContentsMargins(0, 0, 0, 0)
        container.setFrameStyle(QFrame.Box)
        container.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        stack = QVBoxLayout()
        stack.setContentsMargins(0, 0, 0, 0)
        stack.setSpacing(0)
        container.setLayout(stack)

        self.stages = []
        for module_cls in self.preprocessors:
            module_widget = module_cls(self)
            self.stages.append(module_widget)
            # make the stage reachable as self.<attribute>
            setattr(self, module_cls.attribute, module_widget)
            stack.addWidget(module_widget)
            module_widget.change_signal.connect(self.settings_invalidated)

        stack.addStretch()
        self.scroll = QScrollArea()
        self.scroll.setWidget(container)
        self.scroll.setWidgetResizable(True)
        self.scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        self.scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.scroll.resize(stack.sizeHint())
        self.scroll.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(self.scroll)

        # -- BUTTONS --
        self.report_button.setFixedWidth(self.control_area_width)

        commit_button = gui.auto_commit(
            self.buttonsArea, self, 'autocommit', 'Commit', box=False)
        commit_button.setFixedWidth(self.control_area_width - 5)

        self.buttonsArea.layout().addWidget(commit_button)

    @Inputs.corpus
    def set_data(self, data=None):
        """Input handler: keep a copy of ``data`` (or ``None``) and commit."""
        if data is None:
            self.corpus = None
            self.initial_ngram_range = None
        else:
            self.corpus = data.copy()
            self.initial_ngram_range = data.ngram_range
        self.commit()

    def update_info(self, corpus=None):
        """Write corpus statistics, or a 'no corpus' note, to the info label."""
        if corpus is None:
            text = 'No corpus.'
        else:
            counts = (
                len(corpus),
                sum(len(tokens) for tokens in corpus.tokens),
                len(corpus.dictionary),
            )
            text = ('Document count: {}\n'
                    'Total tokens: {}\n'
                    'Total types: {}').format(*counts)
        self.info_label.setText(text)

    def commit(self):
        """Preprocess the stored corpus; with no corpus, output ``None``."""
        self.Warning.no_token_left.clear()
        if self.corpus is None:
            self.update_info()
            self.Outputs.corpus.send(None)
            return
        self.apply()

    def apply(self):
        """Kick off a preprocessing run."""
        self.preprocess()

    # NOTE(review): ``asynchronous`` comes from outside this class; it looks
    # like it provides the on_start / callback / on_result registration used
    # below - confirm its threading behaviour against its definition.
    @asynchronous
    def preprocess(self):
        """Configure the preprocessor from the stages and run it in place."""
        for module_widget in self.stages:
            setattr(self.preprocessor, module_widget.attribute, module_widget.value)
        # clear state left on the corpus by an earlier in-place run
        self.corpus.pos_tags = None
        self.corpus.ngram_range = self.initial_ngram_range
        return self.preprocessor(
            self.corpus, inplace=True, on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Move the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Show statistics for ``result`` and send it on the output.

        If the result's dictionary is empty, the ``no_token_left`` warning is
        shown and ``None`` is sent instead.
        """
        self.update_info(result)
        is_empty = result is not None and len(result.dictionary) == 0
        if is_empty:
            self.Warning.no_token_left()
        self.Outputs.corpus.send(None if is_empty else result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Set the scroll area's minimum width from the enabled stages."""
        # start from the 250 px floor and grow to the widest enabled stage
        candidates = [250]
        candidates.extend(module_widget.sizeHint().width()
                          for module_widget in self.stages
                          if module_widget.enabled)
        self.scroll.setMinimumWidth(max(candidates) + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """Slot for stage changes: recompute the width, then commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Add the preprocessor's report to the widget report."""
        self.report_items('Preprocessor', self.preprocessor.report())
示例#3
0
class OWPreprocess(OWWidget):
    """Text pre-processing widget (Chinese-localised user interface).

    One stage widget is created for every class in ``preprocessors``. On
    commit, the ``value`` of each stage is copied onto a
    ``preprocess.Preprocessor`` which is applied, in place, to a copy of the
    corpus received on the input; the result is sent on the output.
    """

    name = '文本预处理'
    description = '构建文本预处理的管道'
    icon = 'icons/TextPreprocess.svg'
    priority = 200

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Corpus", Corpus)

    autocommit = settings.Setting(True)

    # Stage classes; __init__ creates one widget per entry, in this order.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    # The two literals form ONE message text (implicit concatenation, then
    # format); the second argument is the stable persistent id. Previously a
    # stray comma between the literals made the second sentence the id.
    UserAdviceMessages = [
        widget.Message(
            "部分预处理所需要的数据(例如词汇关系、停用词、标点符号规则等)是从NLTK包中获取的,"
            "这些数据可以从{}下载。".format(nltk_data_dir()),
            "nltk_data")
    ]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("无法加载Stanford POS Tagger\n{}")
        stopwords_encoding = Msg("停用词表编码不正确,请使用 UTF-8 再试一次。")
        lexicon_encoding = Msg("词典编码不正确,请使用 UTF-8 再试一次。")
        error_reading_stopwords = Msg("读取文件错误: {}")
        error_reading_lexicon = Msg("读取文件错误: {}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg('没有标记输出,请重新配置')
        udpipe_offline = Msg('没有网络连接,UDPipe 只加载本地模型')
        udpipe_offline_no_models = Msg('没有网络连接,UDPipe无本地模型')

    def __init__(self, parent=None):
        """Create the info box, the scrollable stage list and the buttons."""
        super().__init__(parent)
        self.corpus = None
        # ngram range of the input corpus; restored before every run because
        # preprocessing is done in place
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, '基本信息')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        frame = QFrame()
        frame.setContentsMargins(0, 0, 0, 0)
        frame.setFrameStyle(QFrame.Box)
        frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        frame_layout = QVBoxLayout()
        frame_layout.setContentsMargins(0, 0, 0, 0)
        frame_layout.setSpacing(0)
        frame.setLayout(frame_layout)

        self.stages = []
        for stage in self.preprocessors:
            # named stage_widget so the module-level ``widget`` is not shadowed
            stage_widget = stage(self)
            self.stages.append(stage_widget)
            # make the stage reachable as self.<attribute>
            setattr(self, stage.attribute, stage_widget)
            frame_layout.addWidget(stage_widget)
            # any change in a stage re-runs the pipeline
            stage_widget.change_signal.connect(self.settings_invalidated)

        frame_layout.addStretch()
        self.scroll = QScrollArea()
        self.scroll.setWidget(frame)
        self.scroll.setWidgetResizable(True)
        self.scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        self.scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.scroll.resize(frame_layout.sizeHint())
        self.scroll.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(self.scroll)

        # -- BUTTONS --
        self.report_button.setFixedWidth(self.control_area_width)

        commit_button = gui.auto_commit(self.buttonsArea,
                                        self,
                                        'autocommit',
                                        '提交',
                                        '自动提交',
                                        box=False)
        commit_button.setFixedWidth(self.control_area_width - 5)

        self.buttonsArea.layout().addWidget(commit_button)

    @Inputs.corpus
    def set_data(self, data=None):
        """Input handler: keep a copy of ``data`` (or ``None``) and commit."""
        self.corpus = data.copy() if data is not None else None
        self.initial_ngram_range = data.ngram_range if data is not None else None
        self.commit()

    def update_info(self, corpus=None):
        """Show document/token/type counts of ``corpus`` in the info label.

        With ``corpus=None`` a 'no dataset' note is shown instead.
        """
        if corpus is not None:
            info = '文档数量: {}\n' \
                   '标记数量: {}\n'\
                   '类型数量: {}'\
                   .format(len(corpus), sum(map(len, corpus.tokens)), len(corpus.dictionary))
        else:
            info = '没有数据集'
        self.info_label.setText(info)

    def commit(self):
        """Preprocess the stored corpus; with no corpus, output ``None``."""
        self.Warning.no_token_left.clear()
        if self.corpus is not None:
            self.apply()
        else:
            self.update_info()
            self.Outputs.corpus.send(None)

    def apply(self):
        """Start a preprocessing run."""
        self.preprocess()

    # NOTE(review): ``asynchronous`` is defined outside this class; presumably
    # it runs the call off the GUI thread and dispatches the on_start /
    # callback / on_result hooks registered below - confirm.
    @asynchronous
    def preprocess(self):
        """Configure the preprocessor from the stages and run it in place."""
        for module in self.stages:
            setattr(self.preprocessor, module.attribute, module.value)
        # the corpus is modified in place, so undo what a previous run set
        self.corpus.pos_tags = None
        self.corpus.ngram_range = self.initial_ngram_range
        return self.preprocessor(self.corpus,
                                 inplace=True,
                                 on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Set the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Update the info label, send the result and finish the progress bar.

        A result whose dictionary is empty raises the ``no_token_left``
        warning and is replaced by ``None`` on the output.
        """
        self.update_info(result)
        if result is not None and len(result.dictionary) == 0:
            self.Warning.no_token_left()
            result = None
        self.Outputs.corpus.send(result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Make the scroll area at least as wide as the widest enabled stage."""
        max_width = 250  # lower bound for the stage area
        for stage_widget in self.stages:
            if stage_widget.enabled:
                max_width = max(max_width, stage_widget.sizeHint().width())
        # 20 extra pixels are added on top of the widest stage
        self.scroll.setMinimumWidth(max_width + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """Slot for stage changes: recompute the width, then commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Report the preprocessor's settings under the 'Preprocessor' heading."""
        self.report_items('Preprocessor', self.preprocessor.report())