def _rewrite_mapping(key, value, node):
    """Swap the text of ``node``'s value element for its mapped string.

    ``mapping``, ``filter_string`` and ``RESX`` are resolved from the
    surrounding scope. The raw ``value`` is tried as a key first; if it is
    not present, ``filter_string(value)`` is tried instead. When neither is
    a key of ``mapping`` the node is left untouched. ``key`` is accepted but
    not used.
    """
    if value in mapping:
        replacement = mapping[value]
    else:
        filtered_value = filter_string(value)
        if filtered_value not in mapping:
            return
        replacement = mapping[filtered_value]

    node.find(RESX.value_tag).text = replacement
def clean_mp3_id3_tags(audiofile):
    """Scrub unwanted text from an audio file's ID3 tag and save the tag.

    The artist, title and album fields are each passed through
    ``filter_string`` together with the list from ``get_filter_list()``,
    and the ``text`` and ``data`` of every comment are overwritten with a
    fixed notice. Any exception raised while doing so is reported through
    ``debug`` and otherwise ignored, so the call is best-effort.

    Returns the same ``audiofile`` object that was passed in.
    """
    filter_list = get_filter_list()
    notice = u'downloaded from themixtapesite.com'

    try:
        tag = audiofile.tag

        for field in ('artist', 'title', 'album'):
            cleaned = filter_string(getattr(tag, field), filter_list)
            setattr(tag, field, cleaned)

        for comment in tag.comments:
            comment.text = notice
            comment.data = notice

        tag.save()
    except Exception as exc:
        # Best-effort cleanup: report the failure and still hand the file back.
        debug('Caught exception trying to clean id3 tags for "%s"' % audiofile)
        debug('Exception: %s' % exc)

    return audiofile
def extract_strings_from_files(self, filenames, strings_to_ignore=None):
    """Collect the filtered strings extracted from a group of files.

    Args:
        filenames: Candidate file names. They are passed through
            ``self._filter_filenames`` and de-duplicated before each one is
            handed to ``self.extract_strings_from_filename``.
        strings_to_ignore: Optional container of strings to leave out. A
            string is skipped when the first element returned by
            ``replace_params(string)`` is in this container. ``None`` (the
            default) ignores nothing.

    Returns:
        A set containing ``filter_string(string)`` for every extracted
        string that was not ignored.
    """
    # A None sentinel replaces the former mutable ``[]`` default.
    if strings_to_ignore is None:
        strings_to_ignore = ()

    logger = logging.getLogger(burton.logger_name)

    raw_strings = set()
    for filename in set(self._filter_filenames(filenames)):
        # Lazy %-style arguments: the message is only built if it is emitted.
        logger.debug("Extracting strings from %s", filename)
        raw_strings.update(self.extract_strings_from_filename(filename))

    filtered_strings = set()
    for raw_string in raw_strings:
        # replace_params returns a pair; only its first element is needed.
        replaced_string, _ = replace_params(raw_string)
        if replaced_string not in strings_to_ignore:
            # Filter only the strings that are actually kept.
            filtered_strings.add(filter_string(raw_string))

    return filtered_strings
def extract_strings_from_files(self, filenames, strings_to_ignore=None):
    """Collect the filtered strings extracted from a group of files.

    Args:
        filenames: Candidate file names. They are passed through
            ``self._filter_filenames`` and de-duplicated before each one is
            handed to ``self.extract_strings_from_filename``.
        strings_to_ignore: Optional container of strings to leave out. A
            string is skipped when the first element returned by
            ``replace_params(string)`` is in this container. ``None`` (the
            default) ignores nothing.

    Returns:
        A set containing ``filter_string(string)`` for every extracted
        string that was not ignored.
    """
    # A None sentinel replaces the former mutable ``[]`` default.
    if strings_to_ignore is None:
        strings_to_ignore = ()

    logger = logging.getLogger(burton.logger_name)

    raw_strings = set()
    for filename in set(self._filter_filenames(filenames)):
        # Lazy %-style arguments: the message is only built if it is emitted.
        logger.debug("Extracting strings from %s", filename)
        raw_strings.update(self.extract_strings_from_filename(filename))

    filtered_strings = set()
    for raw_string in raw_strings:
        # replace_params returns a pair; only its first element is needed.
        replaced_string, _ = replace_params(raw_string)
        if replaced_string not in strings_to_ignore:
            # Filter only the strings that are actually kept.
            filtered_strings.add(filter_string(raw_string))

    return filtered_strings
def maybe_hub(self, url, tree): if self.match_filter_url(url): return False, [] block, matched_a, paths = self.get_hub_block(url, tree) tree = self.remove_p_aside_a(block, tree, matched_a) content_tree = self.get_readability_content(url, tree) content = unicode(content_tree.text_content().strip()) content = re.sub(ur'\s', u'', content) chinese_content = filter_string(content, False, True, True) a_content = sum([len(a.text.strip()) for a in matched_a]) ratio = len(chinese_content)*1.0/(a_content or 0.001) print 'url:%s matched_a:%d match content/link:%f' % (url, len(matched_a), ratio) print len(chinese_content), content.encode('utf-8') #import pdb;pdb.set_trace() if len(matched_a) > 20 and len(chinese_content) < 200 and ratio < 0.2: return True, paths else: return False, paths
def _rewrite_mapping(key, value, node):
    """Replace ``node.text`` with the mapped string for ``value``.

    ``mapping`` and ``filter_string`` are resolved from the surrounding
    scope. Nothing happens unless the raw ``value`` is a key of ``mapping``.
    When it is, the entry stored under ``filter_string(value)`` is preferred
    and the entry stored under the raw ``value`` is used as a fallback.
    ``key`` is accepted but not used.
    """
    if value not in mapping:
        return

    filtered_value = filter_string(value)
    # The membership test above is on the raw value, so the filtered form is
    # not guaranteed to be a key; fall back to the raw entry rather than
    # raising KeyError.
    lookup_key = filtered_value if filtered_value in mapping else value
    node.text = mapping[lookup_key]