def segmentation_pipeline(p_mergehyphen, p_splitcomma, p_split, p_punctadd):
    """Assemble the segmentation pipeline.

    A fresh ``Pipeline`` is created and six modules are registered on it
    with ``addModule``, in this order:

    1. ``MergeWordHyphenModule(p_mergehyphen)``
    2. ``SplitWithCommaModule(p_splitcomma, ",")``
    3. ``SplitModuleGenerator(p_split)``
    4. ``AddPunctuationModule(p_punctadd, ".")``
    5. ``AddPunctuationModule(p_punctadd / 2, ",")``
    6. ``AddPunctuationModule(p_punctadd / 2, "'")``

    Args:
        p_mergehyphen: Forwarded to ``MergeWordHyphenModule``.
        p_splitcomma: Forwarded to ``SplitWithCommaModule`` together with ",".
        p_split: Forwarded to ``SplitModuleGenerator``.
        p_punctadd: Forwarded as-is to the "." punctuation module and halved
            for the "," and "'" punctuation modules.

    NOTE(review): the ``p_`` prefix suggests these are probabilities --
    confirm against the module implementations.

    Returns:
        The populated ``Pipeline`` instance.
    """
    # Each stage is a zero-argument factory so that a module is only built
    # right before it is registered, one after the other.
    stage_factories = (
        lambda: MergeWordHyphenModule(p_mergehyphen),
        # It might be possible to merge these two modules
        lambda: SplitWithCommaModule(p_splitcomma, ","),
        lambda: SplitModuleGenerator(p_split),
        lambda: AddPunctuationModule(p_punctadd, "."),
        lambda: AddPunctuationModule(p_punctadd / 2, ","),
        lambda: AddPunctuationModule(p_punctadd / 2, "'"),
    )

    segmenter = Pipeline()
    for make_stage in stage_factories:
        segmenter.addModule(make_stage())
    return segmenter
def token_pipeline(p_charsub, sub_data):
    """Assemble the token-level pipeline.

    A fresh ``Pipeline`` is created and a single
    ``CharsSubModule(sub_data, p_charsub)`` is registered on it.

    Args:
        p_charsub: Passed as the second argument of ``CharsSubModule``.
        sub_data: Passed as the first argument of ``CharsSubModule``.

    Note that the module takes its arguments in the opposite order from
    this function's signature.

    Returns:
        The populated ``Pipeline`` instance.
    """
    token_stages = Pipeline()
    char_substituter = CharsSubModule(sub_data, p_charsub)
    token_stages.addModule(char_substituter)
    return token_stages