def body(self, title, text, seek): global verbose title = self.translate(title).strip(u'\u200e\u200f') restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text) self.article_count += 1 # do closer inspection to see if realy restricted if restricted: (restricted, bad_words) = FilterWords.find_restricted(text) if restricted: self.restricted_count += 1 if not verbose and self.article_count % 10000 == 0: start_time = time.time() print '%7.2fs %10d' % (start_time - self.time, self.article_count) self.time = start_time generate_bigram(title) if verbose: if restricted: print 'Restricted Title:', title.encode('utf-8') print ' -->', bad_words else: print 'Title:', title.encode('utf-8') self.offsets[self.article_count] = (self.file_id(), title, seek, len(text)) if self.set_index(title, (self.article_count, -1, restricted)): # -1 == pfx place holder print 'Duplicate Title:', title.encode('utf-8')
def body(self, category, key, title, text, seek):
    """Handle one parsed page: store it as a template or index it as an article.

    category -- byte string decoded as UTF-8 and capitalised to prefix a
                template title
    key      -- compared against self.KEY_TEMPLATE to select template handling
    title    -- page title; run through self.convert() and stripped of
                U+200E / U+200F marks before it is used
    text     -- page text
    seek     -- stored unchanged in the offsets entry

    Sets the global error_flag when set_index() reports the title as
    already present.
    """
    global verbose
    global error_flag

    title = self.convert(title).strip(u'\u200e\u200f')

    # Templates go to the template database and are never indexed as articles.
    if self.KEY_TEMPLATE == key:
        if title in self.ignored_templates:
            return
        title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
        t_body = TidyUp.template(text)
        row = [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + t_body]
        self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)', row)
        self.template_count += 1
        return

    bad_words = None
    restricted = (FilterWords.is_restricted(title)
                  or FilterWords.is_restricted(text))
    self.article_count += 1

    if restricted:
        # do closer inspection to see if really restricted
        restricted, bad_words = FilterWords.find_restricted(text)
        if restricted:
            self.restricted_count += 1

    # progress line every 10000 articles (not gated by verbose in this version)
    if self.article_count % 10000 == 0:
        now = time.time()
        PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(now - self.time, self.article_count))
        self.time = now

    for translated in self.language_processor.translate(title):
        generate_bigram(translated)

    if verbose:
        if not restricted:
            PrintLog.message(u'Title: {0:s}'.format(title))
        else:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u' --> {0:s}'.format(bad_words))

    text_length = len(text)
    self.total_character_count += text_length
    self.offsets[self.article_count] = (self.file_id(), title, seek,
                                        text_length, self.total_character_count)

    index_value = (self.article_count, -1, restricted, False)  # -1 == place holder
    if self.set_index(title, index_value):
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True
def body(self, category, key, title, text, seek):
    """Process one parsed page.

    Pages whose key equals self.KEY_TEMPLATE are written to the templates
    table (unless their title is in self.ignored_templates) and nothing
    else happens.  Every other page is counted as an article, checked
    against the restricted-word filter, has its translated title(s) fed to
    generate_bigram() and is recorded in self.offsets and the title index.

    category -- byte string decoded as UTF-8 and capitalised to prefix a
                template title
    key      -- page kind, compared against self.KEY_TEMPLATE
    title    -- page title; converted with self.convert() and stripped of
                U+200E / U+200F marks
    text     -- page text
    seek     -- stored unchanged in the offsets entry

    The global error_flag is set when set_index() returns a truthy value.
    """
    global verbose
    global error_flag

    title = self.convert(title).strip(u'\u200e\u200f')

    is_template = (self.KEY_TEMPLATE == key)
    if is_template and title not in self.ignored_templates:
        title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
        tidy_text = TidyUp.template(text)
        stored_title = u'~{0:d}~{1:s}'.format(self.file_id(), title)
        self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
                                     [stored_title, u'~' + tidy_text])
        self.template_count += 1
    if is_template:
        return

    restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)
    self.article_count += 1

    # do closer inspection to see if really restricted
    if restricted:
        (restricted, bad_words) = FilterWords.find_restricted(text)
        if restricted:
            self.restricted_count += 1

    # progress line every 10000 articles, suppressed in verbose mode
    if self.article_count % 10000 == 0 and not verbose:
        current = time.time()
        progress = u'Index: {0:7.2f}s {1:10d}'.format(current - self.time, self.article_count)
        PrintLog.message(progress)
        self.time = current

    for variant in self.language_processor.translate(title):
        generate_bigram(variant)

    if verbose and restricted:
        PrintLog.message(u'Restricted Title: {0:s}'.format(title))
        PrintLog.message(u' --> {0:s}'.format(bad_words))
    elif verbose:
        PrintLog.message(u'Title: {0:s}'.format(title))

    size = len(text)
    self.total_character_count += size
    self.offsets[self.article_count] = (self.file_id(), title, seek, size, self.total_character_count)

    already_present = self.set_index(title, (self.article_count, -1, restricted, False))  # -1 == place holder
    if already_present:
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True
def filter(title, text):
    """Check an article's title and text for restricted words.

    title -- article title (also shown in the diagnostic message)
    text  -- article text

    Returns the first element of the pair returned by
    FilterWords.find_restricted(); when it is truthy a diagnostic line
    with the title and the second element of the pair is printed.
    """
    # Join with a space: plain concatenation fused the last word of the
    # title with the first word of the text into a single token.
    # NOTE(review): assumes find_restricted() works on word boundaries --
    # confirm against FilterWords.
    (restricted, contains) = FilterWords.find_restricted(title + ' ' + text)
    if restricted:
        print('TITLE: "{0:s}" restricted: [{1:s}]'.format(title, contains))
    return restricted
def body(self, category, key, title, text, seek):
    """Count one article and report whether the restricted-word filter hits it.

    category, key, seek -- not used by this method
    title -- article title, checked with FilterWords.is_restricted()
    text  -- article text, checked with FilterWords.is_restricted() and,
             when that matches, inspected with FilterWords.find_restricted()

    Updates self.article_count, self.restricted_count and
    self.unrestricted_count; output goes through PrintLog.message().
    """
    global verbose, show_restricted

    restricted_title = FilterWords.is_restricted(title)
    restricted_text = FilterWords.is_restricted(text)
    restricted = restricted_title or restricted_text

    self.article_count += 1
    if restricted:
        self.restricted_count += 1

    # progress line every 10000 articles, suppressed in verbose mode
    if not verbose and self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message('{0:7.2f}s {1:10d}'.format(
            start_time - self.time, self.article_count))
        self.time = start_time

    if verbose:
        PrintLog.message('Title: {0:s}'.format(title))

    if not restricted:
        return

    t_state = ' Title' if restricted_title else ''

    # Defaults for the case where only the title matched; flag is read
    # only when contains has been set by find_restricted().
    b_state = ''
    flag = None
    contains = None
    if restricted_text:
        b_state = ' Text'
        (flag, contains) = FilterWords.find_restricted(text)
        if not flag:
            # quick check matched but the closer inspection did not
            self.unrestricted_count += 1

    if show_restricted:
        PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'.format(
            self.restricted_count, t_state, b_state, title))
        # identity test: None is a singleton and `!=` may be overridden
        if contains is not None:
            PrintLog.message('        -> {0!s:s}  {1:s}'.format(
                flag, contains))
def body(self, title, text, seek): global verbose, show_restricted title = self.translate(title).strip(u"\u200e\u200f") restricted_title = FilterWords.is_restricted(title) restricted_text = FilterWords.is_restricted(text) restricted = restricted_title or restricted_text self.article_count += 1 if restricted: self.restricted_count += 1 if not verbose and self.article_count % 10000 == 0: start_time = time.time() print "%7.2fs %10d" % (start_time - self.time, self.article_count) self.time = start_time if verbose: print "Title:", title.encode("utf-8") if restricted: if restricted_title: t_state = " Title" else: t_state = "" if restricted_text: b_state = " Text" (flag, contains) = FilterWords.find_restricted(text) if not flag: self.unrestricted_count += 1 else: b_state = "" contains = None if show_restricted: print "%10d Restricted%s%s: %s" % (self.restricted_count, t_state, b_state, title.encode("utf-8")) if None != contains: print " ->", flag, contains
def body(self, category, key, title, text, seek):
    """Tally one article and log the outcome of the restricted-word checks.

    category, key, seek -- not used by this method
    title -- article title, checked with FilterWords.is_restricted()
    text  -- article text, checked with FilterWords.is_restricted() and,
             when that matches, inspected with FilterWords.find_restricted()

    Updates self.article_count, self.restricted_count and
    self.unrestricted_count; output goes through PrintLog.message().
    """
    global verbose, show_restricted

    restricted_title = FilterWords.is_restricted(title)
    restricted_text = FilterWords.is_restricted(text)
    restricted = restricted_title or restricted_text

    self.article_count += 1
    if restricted:
        self.restricted_count += 1

    # progress line every 10000 articles, suppressed in verbose mode
    if not verbose and self.article_count % 10000 == 0:
        start_time = time.time()
        PrintLog.message('{0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
        self.time = start_time

    if verbose:
        PrintLog.message('Title: {0:s}'.format(title))

    if not restricted:
        return

    t_state = ' Title' if restricted_title else ''

    # Defaults cover a title-only match; flag is read only after
    # find_restricted() has supplied contains.
    b_state = ''
    flag = None
    contains = None
    if restricted_text:
        b_state = ' Text'
        (flag, contains) = FilterWords.find_restricted(text)
        if not flag:
            # quick check matched but the closer inspection did not
            self.unrestricted_count += 1

    if not show_restricted:
        return

    PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'
                     .format(self.restricted_count, t_state, b_state, title))
    # identity test: None is a singleton and `!=` may be overridden
    if contains is not None:
        PrintLog.message('        -> {0!s:s}  {1:s}'.format(flag, contains))