Exemplo n.º 1
0
    def body(self, title, text, seek):
        global verbose

        title = self.translate(title).strip(u'\u200e\u200f')

        restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        # do closer inspection to see if realy restricted
        if restricted:
            (restricted, bad_words) = FilterWords.find_restricted(text)

        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            print '%7.2fs %10d' % (start_time - self.time, self.article_count)
            self.time = start_time

        generate_bigram(title)

        if verbose:
            if restricted:
                print 'Restricted Title:', title.encode('utf-8')
                print '  -->', bad_words
            else:
                print 'Title:', title.encode('utf-8')

        self.offsets[self.article_count] = (self.file_id(), title, seek, len(text))

        if self.set_index(title, (self.article_count, -1, restricted)): # -1 == pfx place holder
            print 'Duplicate Title:', title.encode('utf-8')
Exemplo n.º 2
0
    def body(self, category, key, title, text, seek):
        """Store a template, or count, screen and index an article.

        The title is passed through self.convert() and stripped of
        surrounding U+200E / U+200F direction marks before use.

        When key equals self.KEY_TEMPLATE the entry is written to the
        templates table (unless its title is in self.ignored_templates)
        and nothing else happens.  Otherwise the article is counted,
        checked with FilterWords, the translated forms of its title are
        handed to generate_bigram(), its location is stored in
        self.offsets and the title is registered via self.set_index().
        A duplicate title is logged and sets the global error_flag.

        category -- UTF-8 byte string used as the prefix of template titles
        key      -- compared against self.KEY_TEMPLATE
        title    -- entry title
        text     -- entry text
        seek     -- stored unchanged in the offsets entry
        """
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title in self.ignored_templates:
                return
            prefix = unicode(category, 'utf-8').capitalize()
            title = prefix + ':' + upper_case_first_char(title)
            tidy_body = TidyUp.template(text)
            row = [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + tidy_body]
            self.template_cursor.execute(
                u'insert or replace into templates (title, body) values(?, ?)',
                row)
            self.template_count += 1
            return

        restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        # a hit on the first check is confirmed by a closer inspection of
        # the text; only confirmed hits are counted as restricted
        if restricted:
            restricted, bad_words = FilterWords.find_restricted(text)
            if restricted:
                self.restricted_count += 1

        # progress line every 10000 articles
        if self.article_count % 10000 == 0:
            now = time.time()
            elapsed = now - self.time
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(elapsed, self.article_count))
            self.time = now

        for variant in self.language_processor.translate(title):
            generate_bigram(variant)

        if verbose and restricted:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u'  --> {0:s}'.format(bad_words))
        elif verbose:
            PrintLog.message(u'Title: {0:s}'.format(title))

        text_length = len(text)
        self.total_character_count += text_length
        self.offsets[self.article_count] = (
            self.file_id(), title, seek, text_length, self.total_character_count)

        index_entry = (self.article_count, -1, restricted, False)  # -1 == place holder
        if self.set_index(title, index_entry):
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True
Exemplo n.º 3
0
    def body(self, category, key, title, text, seek):
        """Store a template, or count, screen and index an article.

        The title is passed through self.convert() and stripped of
        surrounding U+200E / U+200F direction marks before use.

        When key equals self.KEY_TEMPLATE the entry is written to the
        templates table (unless its title is in self.ignored_templates)
        and nothing else happens.  Otherwise the article is counted,
        checked with FilterWords, the translated forms of its title are
        handed to generate_bigram(), its location is stored in
        self.offsets and the title is registered via self.set_index().
        A duplicate title is logged and sets the global error_flag.

        category -- UTF-8 byte string used as the prefix of template titles
        key      -- compared against self.KEY_TEMPLATE
        title    -- entry title
        text     -- entry text
        seek     -- stored unchanged in the offsets entry
        """
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title in self.ignored_templates:
                return
            prefix = unicode(category, 'utf-8').capitalize()
            title = prefix + ':' + upper_case_first_char(title)
            tidy_body = TidyUp.template(text)
            row = [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + tidy_body]
            self.template_cursor.execute(
                u'insert or replace into templates (title, body) values(?, ?)',
                row)
            self.template_count += 1
            return

        restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        # a hit on the first check is confirmed by a closer inspection of
        # the text; only confirmed hits are counted as restricted
        if restricted:
            restricted, bad_words = FilterWords.find_restricted(text)
            if restricted:
                self.restricted_count += 1

        # progress line every 10000 articles, suppressed in verbose mode
        if not verbose and self.article_count % 10000 == 0:
            now = time.time()
            elapsed = now - self.time
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(elapsed, self.article_count))
            self.time = now

        for variant in self.language_processor.translate(title):
            generate_bigram(variant)

        if verbose and restricted:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u'  --> {0:s}'.format(bad_words))
        elif verbose:
            PrintLog.message(u'Title: {0:s}'.format(title))

        text_length = len(text)
        self.total_character_count += text_length
        self.offsets[self.article_count] = (
            self.file_id(), title, seek, text_length, self.total_character_count)

        index_entry = (self.article_count, -1, restricted, False)  # -1 == place holder
        if self.set_index(title, index_entry):
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True
Exemplo n.º 4
0
def filter(title, text):
    """Return the restricted flag FilterWords reports for title + text.

    The title and text are concatenated and passed to
    FilterWords.find_restricted().  When the returned flag is set, a line
    naming the title and the reported match is printed.

    NOTE: the name shadows the builtin filter() inside this module.
    """
    combined = title + text
    restricted, contains = FilterWords.find_restricted(combined)

    if not restricted:
        return restricted

    report = 'TITLE: "{0:s}" restricted: [{1:s}]'.format(title, contains)
    print(report)
    return restricted
Exemplo n.º 5
0
def filter(title, text):
    """Return the restricted flag FilterWords reports for title + text.

    The title and text are concatenated and passed to
    FilterWords.find_restricted().  When the returned flag is set, a line
    naming the title and the reported match is printed.

    NOTE: the name shadows the builtin filter() inside this module.
    """
    combined = title + text
    restricted, contains = FilterWords.find_restricted(combined)

    if not restricted:
        return restricted

    report = 'TITLE: "{0:s}" restricted: [{1:s}]'.format(title, contains)
    print(report)
    return restricted
Exemplo n.º 6
0
    def body(self, category, key, title, text, seek):
        """Count one article and report whether it trips the word filter.

        FilterWords.is_restricted() is applied to the title and to the
        text separately; a hit on either counts the article in
        self.restricted_count.  When the text was hit,
        FilterWords.find_restricted() is run on it as well and
        self.unrestricted_count is incremented if that second check
        returns a false flag.

        Output goes through PrintLog.message(): a progress line every
        10000 articles when not verbose, the title when verbose, and the
        restriction details when the global show_restricted is set.

        category, key, seek -- accepted for the caller's interface, unused here
        title -- article title
        text  -- article text
        """
        global verbose, show_restricted

        restricted_title = FilterWords.is_restricted(title)
        restricted_text = FilterWords.is_restricted(text)
        restricted = restricted_title or restricted_text

        self.article_count += 1
        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message('{0:7.2f}s {1:10d}'.format(
                start_time - self.time, self.article_count))
            self.time = start_time

        if verbose:
            PrintLog.message('Title: {0:s}'.format(title))

        if not restricted:
            return

        t_state = ' Title' if restricted_title else ''

        # bind the detail values up front so every path below sees them;
        # they are only filled in when the text itself was flagged
        b_state = ''
        flag = None
        contains = None
        if restricted_text:
            b_state = ' Text'
            (flag, contains) = FilterWords.find_restricted(text)
            if not flag:
                self.unrestricted_count += 1

        if show_restricted:
            PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'.format(
                self.restricted_count, t_state, b_state, title))
            # identity test: None is a singleton, so do not rely on !=
            if contains is not None:
                PrintLog.message('        -> {0!s:s} {1:s}'.format(
                    flag, contains))
Exemplo n.º 7
0
    def body(self, title, text, seek):
        global verbose, show_restricted

        title = self.translate(title).strip(u"\u200e\u200f")

        restricted_title = FilterWords.is_restricted(title)
        restricted_text = FilterWords.is_restricted(text)
        restricted = restricted_title or restricted_text

        self.article_count += 1
        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            print "%7.2fs %10d" % (start_time - self.time, self.article_count)
            self.time = start_time

        if verbose:
            print "Title:", title.encode("utf-8")

        if restricted:
            if restricted_title:
                t_state = " Title"
            else:
                t_state = ""

            if restricted_text:
                b_state = " Text"
                (flag, contains) = FilterWords.find_restricted(text)
                if not flag:
                    self.unrestricted_count += 1
            else:
                b_state = ""
                contains = None
            if show_restricted:
                print "%10d Restricted%s%s: %s" % (self.restricted_count, t_state, b_state, title.encode("utf-8"))
                if None != contains:
                    print "        ->", flag, contains
Exemplo n.º 8
0
    def body(self, category, key, title, text, seek):
        """Count one article and report whether it trips the word filter.

        FilterWords.is_restricted() is applied to the title and to the
        text separately; a hit on either counts the article in
        self.restricted_count.  When the text was hit,
        FilterWords.find_restricted() is run on it as well and
        self.unrestricted_count is incremented if that second check
        returns a false flag.

        Output goes through PrintLog.message(): a progress line every
        10000 articles when not verbose, the title when verbose, and the
        restriction details when the global show_restricted is set.

        category, key, seek -- accepted for the caller's interface, unused here
        title -- article title
        text  -- article text
        """
        global verbose, show_restricted

        restricted_title = FilterWords.is_restricted(title)
        restricted_text = FilterWords.is_restricted(text)
        restricted = restricted_title or restricted_text

        self.article_count += 1
        if restricted:
            self.restricted_count += 1

        if not verbose and self.article_count % 10000 == 0:
            start_time = time.time()
            PrintLog.message('{0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
            self.time = start_time

        if verbose:
            PrintLog.message('Title: {0:s}'.format(title))

        if not restricted:
            return

        t_state = ' Title' if restricted_title else ''

        # bind the detail values up front so every path below sees them;
        # they are only filled in when the text itself was flagged
        b_state = ''
        flag = None
        contains = None
        if restricted_text:
            b_state = ' Text'
            (flag, contains) = FilterWords.find_restricted(text)
            if not flag:
                self.unrestricted_count += 1

        if show_restricted:
            PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'
                             .format(self.restricted_count, t_state, b_state, title))
            # identity test: None is a singleton, so do not rely on !=
            if contains is not None:
                PrintLog.message('        -> {0!s:s} {1:s}'.format(flag, contains))