Example #1
    def __init__(self, config, trial='trial1'):
        """Initialize MALLET with trial name"""
        if trial not in config.trials:
            raise ValueError("Invalid trail name `{}`.format(trial)")

        self.config = config
        self.trial = trial
        self.config.set_config_attributes(self) # Prefixes keys with cfg_
        self.config.set_config_attributes(self, self.trial)

        # todo: Put this in config.ini
        self.cfg_tw_quantile = 0.8

        # Temporary hack to handle casting
        for key in "num_topics num_iterations optimize_interval num_threads num_top_words".split():
            att = 'cfg_{}'.format(key)
            setattr(self, att, int(getattr(self, att)))
        self.cfg_thresh = float(self.cfg_thresh)

        # Get replacement files
        # todo: Fix order; higher ngrams should go first ... argues for sortable names
        self.replacement_files = self.cfg_replacements
        for filename in os.listdir('corpus'):
            if 'replacements_' in filename:
                self.replacement_files += ' corpus/' + filename

        self.trial_name = self.trial  # HACK
        self.file_prefix = '{}/{}'.format(self.cfg_mallet_out_dir, self.trial_name)
        self.mallet = {'import-file': {}, 'train-topics': {}}
        self.mallet_init()

        dbfile = self.config.generate_model_db_file_path(self.trial)
        PoloDb.__init__(self, dbfile)
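
The `set_config_attributes` call above copies configuration values onto the instance as `cfg_`-prefixed attributes, which is why the "temporary hack" then casts them from strings. Below is a minimal sketch of how such a helper could look; `PoloConfig` and its section layout are assumptions for illustration, not the library's actual implementation.

# Sketch (assumed): a config object that attaches cfg_-prefixed attributes.
# INI values stay strings, which is why __init__ above casts them to int/float.
class PoloConfig:
    def __init__(self, sections):
        # e.g. {'DEFAULT': {'num_topics': '40', 'thresh': '0.05'},
        #       'trial1':  {'num_iterations': '1000'}}
        self.ini = sections
        self.trials = [name for name in sections if name != 'DEFAULT']

    def set_config_attributes(self, obj, section='DEFAULT'):
        # Attach every key in the chosen section to obj, prefixed with cfg_
        for key, val in self.ini.get(section, {}).items():
            setattr(obj, 'cfg_{}'.format(key), val)
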
Example #2
    def __init__(self, config):
        """Initialize corpus object"""

        # Import Configs
        self.config = config
        self.config.set_config_attributes(self)
        if not os.path.isfile(self.cfg_src_file_name):
            raise ValueError(
                "Missing source file. Check value of `src_file_name` in INI file."
            )
        self.dbfile = config.generate_corpus_db_file_path()
        PoloDb.__init__(self, self.dbfile)

        # self.db = PoloDb(self.dbfile) # Why not do this?

        if self.cfg_nltk_data_path:
            nltk.data.path.append(self.cfg_nltk_data_path)

        # For tokenizing into sentences
        # fixme: TOKENIZER ASSUMES ENGLISH -- PARAMETERIZE THIS
        nltk.download('punkt')
        nltk.download('tagsets')
        nltk.download('averaged_perceptron_tagger')
        self.tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
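
The `fixme` above flags that the sentence tokenizer is hard-coded to English. Here is a small sketch of one way to make the Punkt model configurable; the `cfg_language` key is a hypothetical addition, not an existing config option.

# Sketch (assumed): load the Punkt sentence tokenizer for a configurable language.
# NLTK ships Punkt models for several languages (english, german, spanish, ...)
# under tokenizers/punkt/, so the language could come from the config.
import nltk

def load_sentence_tokenizer(language='english'):
    nltk.download('punkt', quiet=True)
    return nltk.data.load('nltk:tokenizers/punkt/{}.pickle'.format(language))

# Usage (cfg_language is hypothetical):
# self.tokenizer = load_sentence_tokenizer(getattr(self, 'cfg_language', 'english'))
# sentences = self.tokenizer.tokenize(doc_text)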