class OWPreprocess(OWWidget):
    """Widget that builds a text pre-processing pipeline and applies it to a corpus.

    One GUI module per entry of ``preprocessors`` is created; whenever a
    module signals a change, or a new corpus arrives, the pipeline is re-run
    and the processed corpus is sent to the output.
    """

    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    # Signal declarations: one corpus in (handled by ``set_data``), one out.
    inputs = [(Input.CORPUS, Corpus, 'set_data')]
    outputs = [(Output.PP_CORPUS, Corpus)]

    autocommit = settings.Setting(True)

    # Pipeline stages, instantiated in this order by ``__init__``.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data, if you didn't have it "
            "already, was downloaded to: {}".format(Downloader().default_download_dir()),
            "nltk_data")]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("Problem while loading Stanford POS Tagger\n{}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg('No tokens on output! Please, change configuration.')

    def __init__(self, parent=None):
        """Build the info box, the scrollable pipeline of stage widgets and the buttons."""
        super().__init__(parent)
        self.corpus = None
        # n-gram range of the corpus as it arrived; restored before every
        # run because preprocessing is done in place.
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        pipeline_frame = QFrame()
        pipeline_frame.setContentsMargins(0, 0, 0, 0)
        pipeline_frame.setFrameStyle(QFrame.Box)
        pipeline_frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')

        pipeline_layout = QVBoxLayout()
        pipeline_layout.setContentsMargins(0, 0, 0, 0)
        pipeline_layout.setSpacing(0)
        pipeline_frame.setLayout(pipeline_layout)

        self.stages = []
        for stage_cls in self.preprocessors:
            stage_widget = stage_cls(self)
            self.stages.append(stage_widget)
            # Also expose the stage under its own attribute name on the widget.
            setattr(self, stage_cls.attribute, stage_widget)
            pipeline_layout.addWidget(stage_widget)
            stage_widget.change_signal.connect(self.settings_invalidated)
        pipeline_layout.addStretch()

        scroll_area = QScrollArea()
        self.scroll = scroll_area
        scroll_area.setWidget(pipeline_frame)
        scroll_area.setWidgetResizable(True)
        scroll_area.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        scroll_area.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        scroll_area.resize(pipeline_layout.sizeHint())
        scroll_area.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(scroll_area)

        # Buttons area
        self.report_button.setFixedWidth(self.control_area_width)
        commit_button = gui.auto_commit(
            self.buttonsArea, self, 'autocommit', 'Commit', box=False)
        commit_button.setFixedWidth(self.control_area_width - 5)
        self.buttonsArea.layout().addWidget(commit_button)

    def set_data(self, data=None):
        """Store a copy of the incoming corpus (or clear it) and commit."""
        if data is None:
            self.corpus = None
            self.initial_ngram_range = None
        else:
            self.corpus = data.copy()
            self.initial_ngram_range = data.ngram_range
        self.commit()

    def update_info(self, corpus=None):
        """Show document/token/type counts for ``corpus``, or a placeholder if None."""
        if corpus is None:
            info = 'No corpus.'
        else:
            template = 'Document count: {}\nTotal tokens: {}\nTotal types: {}'
            n_documents = len(corpus)
            n_tokens = sum(len(doc_tokens) for doc_tokens in corpus.tokens)
            n_types = len(corpus.dictionary)
            info = template.format(n_documents, n_tokens, n_types)
        self.info_label.setText(info)

    def commit(self):
        """Clear the no-token warning, then run the pipeline or send None if no corpus."""
        self.Warning.no_token_left.clear()
        if self.corpus is None:
            self.update_info()
            self.send(Output.PP_CORPUS, None)
            return
        self.apply()

    def apply(self):
        """Start preprocessing of the stored corpus."""
        self.preprocess()

    # NOTE(review): `asynchronous` is defined outside this block; presumably it
    # runs the call off the GUI thread and dispatches the on_start / callback /
    # on_result hooks registered below — confirm against its implementation.
    @asynchronous
    def preprocess(self):
        """Copy every stage's value onto the preprocessor and run it in place."""
        for stage_widget in self.stages:
            setattr(self.preprocessor, stage_widget.attribute, stage_widget.value)
        corpus = self.corpus
        corpus.pos_tags = None  # reset pos_tags and ngrams_range
        corpus.ngram_range = self.initial_ngram_range
        return self.preprocessor(corpus, inplace=True, on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Set the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Refresh the info label and send ``result``; send None if it has no tokens."""
        self.update_info(result)
        nothing_left = result is not None and len(result.dictionary) == 0
        if nothing_left:
            self.Warning.no_token_left()
        self.send(Output.PP_CORPUS, None if nothing_left else result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Make the scroll area at least as wide as the widest enabled stage (min 250) + 20."""
        enabled_widths = [
            stage_widget.sizeHint().width()
            for stage_widget in self.stages
            if stage_widget.enabled
        ]
        self.scroll.setMinimumWidth(max([250] + enabled_widths) + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """Slot for a stage's change signal: re-fit the width and commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Add the preprocessor's report to the widget report."""
        self.report_items('Preprocessor', self.preprocessor.report())
class OWPreprocess(OWWidget):
    """Widget that assembles text pre-processing stages and applies them to a corpus.

    A GUI module is instantiated for each class in ``preprocessors``. A new
    input corpus or a change signalled by any module triggers a commit, which
    re-runs the preprocessor and emits the result on ``Outputs.corpus``.
    """

    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Corpus", Corpus)

    autocommit = settings.Setting(True)

    # Stage classes, in the order their widgets are created and laid out.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data was downloaded "
            "to: {}".format(nltk_data_dir()),
            "nltk_data")
    ]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("Problem while loading Stanford POS Tagger\n{}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg(
            'No tokens on output! Please, change configuration.')

    def __init__(self, parent=None):
        """Create the info box, the scrollable stack of stage widgets and the buttons."""
        super().__init__(parent)
        self.corpus = None
        # The input corpus' original n-gram range; it is written back before
        # each run because the preprocessor works in place.
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        container = QFrame()
        container.setContentsMargins(0, 0, 0, 0)
        container.setFrameStyle(QFrame.Box)
        container.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')

        stack = QVBoxLayout()
        stack.setContentsMargins(0, 0, 0, 0)
        stack.setSpacing(0)
        container.setLayout(stack)

        self.stages = []
        for module_cls in self.preprocessors:
            module_widget = module_cls(self)
            self.stages.append(module_widget)
            # Make the stage reachable as an attribute named by the module class.
            setattr(self, module_cls.attribute, module_widget)
            stack.addWidget(module_widget)
            module_widget.change_signal.connect(self.settings_invalidated)
        stack.addStretch()

        self.scroll = QScrollArea()
        self.scroll.setWidget(container)
        self.scroll.setWidgetResizable(True)
        self.scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        self.scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.scroll.resize(stack.sizeHint())
        self.scroll.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(self.scroll)

        # Buttons area
        button_width = self.control_area_width
        self.report_button.setFixedWidth(button_width)
        commit_button = gui.auto_commit(
            self.buttonsArea, self, 'autocommit', 'Commit', box=False)
        commit_button.setFixedWidth(button_width - 5)
        self.buttonsArea.layout().addWidget(commit_button)

    @Inputs.corpus
    def set_data(self, data=None):
        """Keep a copy of the input corpus (or None) with its n-gram range, then commit."""
        if data is not None:
            self.corpus = data.copy()
            self.initial_ngram_range = data.ngram_range
        else:
            self.corpus = None
            self.initial_ngram_range = None
        self.commit()

    def update_info(self, corpus=None):
        """Write document, token and type counts of ``corpus`` to the info label."""
        if corpus is None:
            self.info_label.setText('No corpus.')
            return
        counts = (
            len(corpus),
            sum(len(tokens) for tokens in corpus.tokens),
            len(corpus.dictionary),
        )
        summary = 'Document count: {}\nTotal tokens: {}\nTotal types: {}'.format(*counts)
        self.info_label.setText(summary)

    def commit(self):
        """Reset the no-token warning and preprocess, or emit None without a corpus."""
        self.Warning.no_token_left.clear()
        if self.corpus is None:
            self.update_info()
            self.Outputs.corpus.send(None)
        else:
            self.apply()

    def apply(self):
        """Kick off preprocessing of the stored corpus."""
        self.preprocess()

    # NOTE(review): `asynchronous` comes from outside this block; it presumably
    # executes the method in the background and wires up the on_start /
    # callback / on_result hooks declared below — verify in its definition.
    @asynchronous
    def preprocess(self):
        """Push stage values into the preprocessor and run it on the corpus in place."""
        target = self.preprocessor
        for module_widget in self.stages:
            setattr(target, module_widget.attribute, module_widget.value)
        self.corpus.pos_tags = None  # reset pos_tags and ngrams_range
        self.corpus.ngram_range = self.initial_ngram_range
        return target(self.corpus, inplace=True, on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Move the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Update the info label and emit ``result`` (None, with a warning, if token-less)."""
        self.update_info(result)
        if result is not None:
            if len(result.dictionary) == 0:
                self.Warning.no_token_left()
                result = None
        self.Outputs.corpus.send(result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Set the scroll area's minimum width to the widest enabled stage (>= 250) plus 20."""
        widest = 250
        for module_widget in self.stages:
            if not module_widget.enabled:
                continue
            hinted = module_widget.sizeHint().width()
            if hinted > widest:
                widest = hinted
        self.scroll.setMinimumWidth(widest + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """Slot connected to every stage's change signal: resize, then commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Report the preprocessor's own report under the 'Preprocessor' heading."""
        self.report_items('Preprocessor', self.preprocessor.report())
class OWPreprocess(OWWidget):
    """Localised widget that builds a text pre-processing pipeline for a corpus.

    For every class in ``preprocessors`` a GUI module is created and stacked
    in a scroll area. When a corpus arrives or any module emits its change
    signal, the configured preprocessor is run on a copy of the input corpus
    and the result is sent on ``Outputs.corpus``.
    """

    name = '文本预处理'
    description = '构建文本预处理的管道'
    icon = 'icons/TextPreprocess.svg'
    priority = 200

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Corpus", Corpus)

    autocommit = settings.Setting(True)

    # Stage classes, instantiated in this order by ``__init__``.
    preprocessors = [
        TransformationModule,
        TokenizerModule,
        NormalizationModule,
        FilteringModule,
        NgramsModule,
        POSTaggingModule,
    ]

    transformers = settings.SettingProvider(TransformationModule)
    tokenizer = settings.SettingProvider(TokenizerModule)
    normalizer = settings.SettingProvider(NormalizationModule)
    filters = settings.SettingProvider(FilteringModule)
    ngrams_range = settings.SettingProvider(NgramsModule)
    pos_tagger = settings.SettingProvider(POSTaggingModule)

    control_area_width = 250
    buttons_area_orientation = Qt.Vertical

    # The two sentence halves are adjacent literals forming ONE text argument;
    # the second positional argument is the stable persistent id. Previously a
    # stray comma made the second half of the sentence the persistent id, so
    # the shown text was cut off and the id varied with the data directory.
    UserAdviceMessages = [
        widget.Message(
            "部分预处理所需要的数据(例如词汇关系、停用词、标点符号规则等)是从NLTK包中获取的,"
            "这些数据可以从{}下载。".format(nltk_data_dir()),
            "nltk_data")
    ]

    class Error(OWWidget.Error):
        stanford_tagger = Msg("无法加载Stanford POS Tagger\n{}")
        stopwords_encoding = Msg("停用词表编码不正确,请使用 UTF-8 再试一次。")
        lexicon_encoding = Msg("词典编码不正确,请使用 UTF-8 再试一次。")
        error_reading_stopwords = Msg("读取文件错误: {}")
        error_reading_lexicon = Msg("读取文件错误: {}")

    class Warning(OWWidget.Warning):
        no_token_left = Msg('没有标记输出,请重新配置')
        udpipe_offline = Msg('没有网络连接,UDPipe 只加载本地模型')
        udpipe_offline_no_models = Msg('没有网络连接,UDPipe无本地模型')

    def __init__(self, parent=None):
        """Build the info box, the scrollable pipeline of stage widgets and the buttons.

        Args:
            parent: forwarded unchanged to ``OWWidget.__init__``.
        """
        super().__init__(parent)
        self.corpus = None
        # initial range of input corpus — used for inplace
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, '基本信息')
        info_box.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, '')
        self.update_info()

        # -- PIPELINE --
        frame = QFrame()
        frame.setContentsMargins(0, 0, 0, 0)
        frame.setFrameStyle(QFrame.Box)
        frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        frame_layout = QVBoxLayout()
        frame_layout.setContentsMargins(0, 0, 0, 0)
        frame_layout.setSpacing(0)
        frame.setLayout(frame_layout)

        self.stages = []
        for stage in self.preprocessors:
            # Named `stage_widget` so the loop does not shadow the `widget`
            # module used at class level.
            stage_widget = stage(self)
            self.stages.append(stage_widget)
            # Expose the stage on the widget under the module's attribute name.
            setattr(self, stage.attribute, stage_widget)
            frame_layout.addWidget(stage_widget)
            stage_widget.change_signal.connect(self.settings_invalidated)
        frame_layout.addStretch()

        self.scroll = QScrollArea()
        self.scroll.setWidget(frame)
        self.scroll.setWidgetResizable(True)
        self.scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        self.scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        self.scroll.resize(frame_layout.sizeHint())
        self.scroll.setMinimumHeight(500)
        self.set_minimal_width()
        self.mainArea.layout().addWidget(self.scroll)

        # Buttons area
        self.report_button.setFixedWidth(self.control_area_width)
        commit_button = gui.auto_commit(self.buttonsArea, self, 'autocommit',
                                        '提交', '自动提交', box=False)
        commit_button.setFixedWidth(self.control_area_width - 5)
        self.buttonsArea.layout().addWidget(commit_button)

    @Inputs.corpus
    def set_data(self, data=None):
        """Store a copy of the input corpus and its n-gram range, then commit.

        Args:
            data: the incoming corpus, or None to clear the widget.
        """
        self.corpus = data.copy() if data is not None else None
        self.initial_ngram_range = data.ngram_range if data is not None else None
        self.commit()

    def update_info(self, corpus=None):
        """Show document, token and type counts for ``corpus`` in the info label.

        Args:
            corpus: corpus to summarise; when None a placeholder text is shown.
        """
        if corpus is not None:
            info = '文档数量: {}\n' \
                   '标记数量: {}\n' \
                   '类型数量: {}' \
                .format(len(corpus),
                        sum(map(len, corpus.tokens)),
                        len(corpus.dictionary))
        else:
            info = '没有数据集'
        self.info_label.setText(info)

    def commit(self):
        """Clear the no-token warning, then preprocess or send None if there is no corpus."""
        self.Warning.no_token_left.clear()
        if self.corpus is not None:
            self.apply()
        else:
            self.update_info()
            self.Outputs.corpus.send(None)

    def apply(self):
        """Start preprocessing of the stored corpus."""
        self.preprocess()

    # NOTE(review): `asynchronous` is defined outside this block; presumably it
    # runs the call off the GUI thread and dispatches the on_start / callback /
    # on_result hooks registered below — confirm against its implementation.
    @asynchronous
    def preprocess(self):
        """Copy each stage's value onto the preprocessor and run it in place.

        Returns:
            Whatever ``self.preprocessor`` returns for the stored corpus.
        """
        for module in self.stages:
            setattr(self.preprocessor, module.attribute, module.value)
        self.corpus.pos_tags = None  # reset pos_tags and ngrams_range
        self.corpus.ngram_range = self.initial_ngram_range
        return self.preprocessor(self.corpus, inplace=True,
                                 on_progress=self.on_progress)

    @preprocess.on_start
    def on_start(self):
        """Initialise the progress bar."""
        self.progressBarInit(None)

    @preprocess.callback
    def on_progress(self, i):
        """Set the progress bar to ``i``."""
        self.progressBarSet(i, None)

    @preprocess.on_result
    def on_result(self, result):
        """Refresh the info label and send the result.

        A result whose dictionary is empty raises the no-token warning and
        None is sent in its place.
        """
        self.update_info(result)
        if result is not None and len(result.dictionary) == 0:
            self.Warning.no_token_left()
            result = None
        self.Outputs.corpus.send(result)
        self.progressBarFinished(None)

    def set_minimal_width(self):
        """Make the scroll area at least as wide as the widest enabled stage (min 250) + 20."""
        max_width = 250
        for stage_widget in self.stages:
            if stage_widget.enabled:
                max_width = max(max_width, stage_widget.sizeHint().width())
        self.scroll.setMinimumWidth(max_width + 20)

    @pyqtSlot()
    def settings_invalidated(self):
        """Slot for a stage's change signal: re-fit the width and commit."""
        self.set_minimal_width()
        self.commit()

    def send_report(self):
        """Add the preprocessor's report to the widget report."""
        self.report_items('Preprocessor', self.preprocessor.report())