Пример #1
0
    def __init__(self, lt, config_file):
        """
        Reads the page lists named in the section
        <tt>self.lt.language</tt>-wikimedia of the configuration file.

        Parameters looked up in that section:
        - whitelist: the file that contains the list of pages to retain.
        - blacklist: the file that contains the list of pages to discard.

        Both parameters are optional, and it is enough to specify only one
        of them. For each parameter that is present, the named file is
        loaded with read_file_into_set() (passing 'utf-8') and stored in
        the attribute of the same name; an absent parameter leaves its
        attribute set to None.

        @param lt the LanguageTools object.
        @param config_file the configuration file, handed to
                           CascadingConfigParser.
        """
        self.lt = lt

        parser = CascadingConfigParser(config_file)
        section_name = self.lt.language + '-wikimedia'
        settings = dict(parser.items(section_name))

        # Both lists start out as None; only the configured ones are loaded.
        self.whitelist = self.blacklist = None
        for list_name in ('whitelist', 'blacklist'):
            list_file = settings.get(list_name)
            if list_file is not None:
                setattr(self, list_name,
                        read_file_into_set(list_file, 'utf-8'))
Пример #2
0
    def __init__(self, lt, config_file):
        """
        Reads the page lists named in the section
        <tt>self.lt.language</tt>-wikimedia of the configuration file.

        Parameters looked up in that section:
        - whitelist: the file that contains the list of pages to retain.
        - blacklist: the file that contains the list of pages to discard.

        Both parameters are optional, and it is enough to specify only one
        of them. For each parameter that is present, the named file is
        loaded with read_file_into_set() (passing 'utf-8') and stored in
        the attribute of the same name; an absent parameter leaves its
        attribute set to None.

        @param lt the LanguageTools object.
        @param config_file the configuration file, handed to
                           CascadingConfigParser.
        """
        self.lt = lt

        parser = CascadingConfigParser(config_file)
        section_name = self.lt.language + '-wikimedia'
        settings = dict(parser.items(section_name))

        # Both lists start out as None; only the configured ones are loaded.
        self.whitelist = self.blacklist = None
        for list_name in ('whitelist', 'blacklist'):
            list_file = settings.get(list_name)
            if list_file is not None:
                setattr(self, list_name,
                        read_file_into_set(list_file, 'utf-8'))
Пример #3
0
 def read_config_file(self, config_file):
     """Returns the section <tt>self.lang_config</tt> of the configuration
     file as a dict.

     @param config_file the configuration file, handed to
                        CascadingConfigParser.
     @return a dict built from the items of that section."""
     return dict(CascadingConfigParser(config_file).items(self.lang_config))
Пример #4
0
        self.patterns = patterns

    def accept(self, document):
        """
        Decides whether @p document passes this filter.

        @param document an object with a <tt>title</tt> attribute.
        @return False as soon as one entry of <tt>self.patterns</tt> is
                found in the title (tested with <tt>in</tt>); True when
                none is, including when there are no patterns at all.
        """
        return not any(needle in document.title for needle in self.patterns)

# Script entry point.
# Positional arguments: args[0] is the configuration file; args[1:] are
# passed on to ConllIter.read() (presumably the input files -- confirm).
if __name__ == '__main__':
    option_parser = OptionParser()
    option_parser.add_option("-l", "--language", dest="language",
            help="the Wikipedia language code. Default is en.", default="en")
    options, args = option_parser.parse_args()

    # The settings come from the "<language>-wikimedia" section of the
    # configuration file named by the first positional argument.
    # NOTE(review): args[0] is read without a length check, so running the
    # script with no positional argument raises IndexError here.
    config_parser = CascadingConfigParser(args[0])
    config = dict(config_parser.items(options.language + '-wikimedia'))

    # Filters: each option below is a comma-separated list. Every entry is
    # stripped of surrounding whitespace and decoded from UTF-8 before being
    # handed to its filter. A filter is only added to multi_filter when its
    # option is present and non-empty.
    # NOTE(review): calling .decode() on the entries implies the config
    # values are byte strings (Python 2 str) -- confirm before porting.
    multi_filter = MultiFilter()
    templates = config.get('disambig_templates', '')
    if len(templates) > 0:
        template_filter = TemplateFilter([t.strip().decode('utf-8') for t in templates.split(',')])
        multi_filter.add(template_filter)
    titles = config.get('disambig_title', '')
    if len(titles) > 0:
        titles_filter = TitleDisambigFilter([t.strip().decode('utf-8') for t in titles.split(',')])
        multi_filter.add(titles_filter)

    # Set up the reader/iterator pair with 'utf-8' and feed it the remaining
    # positional arguments.
    reader = ConllReader()
    it = ConllIter(reader, 'utf-8')
    it.read(args[1:])
Пример #5
0
 def read_config_file(self, config_file):
     """Returns the section <tt>self.lang_config</tt> of the configuration
     file as a dict.

     @param config_file the configuration file, handed to
                        CascadingConfigParser.
     @return a dict built from the items of that section."""
     return dict(CascadingConfigParser(config_file).items(self.lang_config))