def __init__(self, language="en", punkt_data_path=None):
    """
    Set up the logger and load the NLTK punkt sentence detector.

    language: Key into self.lang2datapath. Only consulted when
        punkt_data_path is empty or None. Only "en" is registered.
    punkt_data_path: Resource path handed to nltk.data.load. When empty
        or None, the path registered for `language` is used instead.

    Import, lookup and load errors are reported through self.log
    instead of being raised; in those cases self.sent_detector is not
    assigned.
    """
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")

    # Resolve the path on its own so that a KeyError can only mean
    # "unknown language", never an error raised inside the loader.
    if not punkt_data_path:
        try:
            punkt_data_path = self.lang2datapath[language]
        except KeyError:
            self.log.error(
                "No sentence splitter data for language {}.".format(
                    language))
            return

    try:
        self.sent_detector = nltk.data.load(punkt_data_path)
    except Exception:
        # `Exception` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be swallowed. If the import above failed,
        # `nltk` is unbound and that NameError is reported here too.
        # Report the path that was actually tried; looking up
        # lang2datapath[language] here could itself raise when a custom
        # path was combined with an unregistered language.
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                punkt_data_path))
def __init__(self, rouge_dir=None, rouge_args=None):
    """
    Initialise a Rouge155 instance.

    rouge_dir:  Directory in which Rouge-1.5.5.pl is located.
    rouge_args: Arguments handed through to ROUGE in place of the
                default pyrouge arguments.

    """
    self.log = log.get_global_console_logger()
    self.__set_dir_properties()
    self._config_file = None

    # Path of settings.ini inside this module's own directory.
    module_dir = os.path.dirname(__file__)
    self._settings_file = os.path.join(module_dir, 'settings.ini')

    # Order matters from here on: the helpers run after the attributes
    # above have been assigned, exactly as before.
    self.__set_rouge_dir(rouge_dir)
    self.args = self.__clean_rouge_args(rouge_args)

    # Both filename patterns start out unset.
    self._system_filename_pattern = self._model_filename_pattern = None
def __init__(self, rouge_dir=None, rouge_args=None):
    """
    Build a Rouge155 object.

    rouge_dir:  Directory that contains Rouge-1.5.5.pl.
    rouge_args: Arguments passed through to ROUGE when the default
                pyrouge arguments should not be used.

    """
    self.log = log.get_global_console_logger()
    self.__set_dir_properties()
    self._config_file = None

    # settings.ini is addressed relative to this module file.
    package_dir = os.path.dirname(__file__)
    settings_path = os.path.join(package_dir, 'settings.ini')
    self._settings_file = settings_path

    # Helper calls are kept in their original sequence.
    self.__set_rouge_dir(rouge_dir)
    cleaned_args = self.__clean_rouge_args(rouge_args)
    self.args = cleaned_args

    # Filename patterns are left unset at construction time.
    self._system_filename_pattern = None
    self._model_filename_pattern = None
def __init__(self, language="en", punkt_data_path=None):
    """
    Set up the logger and load the NLTK punkt sentence detector.

    language: Key into self.lang2datapath. Only consulted when
        punkt_data_path is empty or None. Only "en" is registered.
    punkt_data_path: Resource path handed to nltk.data.load. When empty
        or None, the path registered for `language` is used instead.

    Import, lookup and load errors are reported through self.log
    instead of being raised; in those cases self.sent_detector is not
    assigned.
    """
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")

    # Resolve the path on its own so that a KeyError can only mean
    # "unknown language", never an error raised inside the loader.
    if not punkt_data_path:
        try:
            punkt_data_path = self.lang2datapath[language]
        except KeyError:
            self.log.error(
                "No sentence splitter data for language {}.".format(
                    language))
            return

    try:
        self.sent_detector = nltk.data.load(punkt_data_path)
    except Exception:
        # `Exception` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be swallowed. If the import above failed,
        # `nltk` is unbound and that NameError is reported here too.
        # Report the path that was actually tried; looking up
        # lang2datapath[language] here could itself raise when a custom
        # path was combined with an unregistered language.
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                punkt_data_path))
def process(input_dir, output_dir, function):
    """
    Apply function to all files in input_dir and save the resulting
    output files in output_dir.

    input_dir:  Directory whose files are read as UTF-8 text.
    output_dir: Directory the results are written to (as UTF-8) under
                the same file names; created if it does not exist.
    function:   Callable that receives the content of one file as a
                string and returns the string to write.

    Entries of input_dir that are not regular files (for example
    subdirectories) are skipped.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger = log.get_global_console_logger()
    logger.info("Processing files in {}.".format(input_dir))
    for input_file_name in os.listdir(input_dir):
        input_file = os.path.join(input_dir, input_file_name)
        if not os.path.isfile(input_file):
            # os.listdir also yields subdirectories; opening one would
            # raise and abort the run for all remaining files.
            logger.info(
                "Skipping {} (not a regular file).".format(input_file_name))
            continue
        logger.info("Processing {}.".format(input_file_name))
        with codecs.open(input_file, "r", encoding="UTF-8") as f:
            input_string = f.read()
        output_string = function(input_string)
        output_file = os.path.join(output_dir, input_file_name)
        with codecs.open(output_file, "w", encoding="UTF-8") as f:
            f.write(output_string)
    logger.info("Saved processed files to {}.".format(output_dir))