Example #1
def main(search):
    '''
        executes a search for a car on cars.com using make-model-year
        params: search is the reddit message/comment body that should
            contain the make model year information that we would like to
            look up
    '''
    search = search.lower()
    #clean up info to create query
    output = ''
    sanitizer = Sanitizer()
    query = sanitizer.sanitize_input(search)

    #verify that we received data back, and make a request
    if query:
        print('making request with', query)
        req = requests.get('http://www.cars.com/research/' + query)
    else:
        return output

    #parse html response using beautiful soup https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    soup = BeautifulSoup(req.text,
                         'html.parser')  #get the entire html of the site
    specs = soup.findAll(
        'div', {'class': 'mmy-spec'})  #find the make-model-year spec blocks
    other_trims = soup.findAll('div',
                               {'class': 'trim_listing'})  #find other trims

    #print info
    if bool(specs) or bool(other_trims):
        output = output + print_information(specs)
        output = output + '\n\n---\n\n '\
        'in order to not be annoying I am not printing all trims!'
        # output = output + print_trims(other_trims, output)
    return output
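A hypothetical usage sketch; the slug format produced by the sanitizer is an assumption, not taken from the snippet:

# Assumption: Sanitizer.sanitize_input() turns free text such as "2004 honda civic"
# into a slug like "honda-civic-2004"; main() would then request
# http://www.cars.com/research/honda-civic-2004 and return the formatted spec
# text, or '' if nothing usable was found in the message.
reply = main("2004 honda civic")
print(reply)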
Example #2
    def add_sanitizer(self,
                      method: Method,
                      sink: Sink,
                      sink_method_idx: int,
                      sanitizer: dict,
                      level: int = 0):
        """Add a new sanitizer to a sink.
        New sanitizers are only relevant if a method's parameters are used in a sanitizer.

        Parameters
        ----------
        method : Method
            The method where the sanitizer was added
        sink : Sink
            The sink to add the sanitizer to
        sink_method_idx : int
            The index of the method for which to add the new sanitizer
        sanitizer : dict
            A dictionary definition of the sanitizer as defined in the ruleset
        level : int, optional
            The depth of the nesting before the sanitizer is reached (sanitizers defined in the
            rules get a level of 0, sanitizers that call those get a level of 1, sanitizers that
            call those get a level of 2 and so on)
        """
        new_sanitizer = Sanitizer(sanitizer, level)
        duplicate = False
        for existing_sanitizer in sink.methods[sink_method_idx]['Sanitizers']:
            if existing_sanitizer.object_name != new_sanitizer.object_name:
                continue
            if new_sanitizer.methods == existing_sanitizer.methods:
                duplicate = True
                break
        if not duplicate:
            sink.methods[sink_method_idx]['Sanitizers'].append(new_sanitizer)
            self.notify_observers(method, changed_sanitizer=True)
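The duplicate check above compares only a sanitizer's object name and method list. A minimal self-contained sketch of that rule, using a hypothetical stand-in instead of the real Sanitizer/Sink/Method classes:

class _FakeSanitizer:
    # Stand-in: the real Sanitizer exposes at least object_name and methods,
    # which is all the duplicate check relies on.
    def __init__(self, object_name, methods):
        self.object_name = object_name
        self.methods = methods


def _is_duplicate(candidate, existing_sanitizers):
    # A candidate is a duplicate only if an existing entry for the same object
    # covers exactly the same methods.
    return any(s.object_name == candidate.object_name
               and s.methods == candidate.methods
               for s in existing_sanitizers)


existing = [_FakeSanitizer('escape', ['quote'])]
print(_is_duplicate(_FakeSanitizer('escape', ['quote']), existing))    # True
print(_is_duplicate(_FakeSanitizer('escape', ['unquote']), existing))  # False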
Example #3
 def prep_sanits(self):
     #show how many bottles left
     self.sanits = Group()
     for sanit_number in range(self.stats.sanits_left):
         sanit = Sanitizer(self.s, self.screen)
         sanit.rect.x = 10 + sanit_number * sanit.rect.width
         sanit.rect.y = 10
         self.sanits.add(sanit)
Example #4
	def __init__(self, ifile):
		self.content = ifile
		self.sanitizer = Sanitizer()
		name = ifile.split('_')
		self.content_id = name[0]
		self.content_type = name[1]
		self.content_date = name[2]
		self.header = {'User-Agent': 'Mozilla/5.0'}
Example #5
def rungame():

    # initializing pygame, settings, and screen objects
    pygame.init()

    s = Settings()
    
    screen = pygame.display.set_mode((s.screen_width,s.screen_height))
    pygame.display.set_caption("CORONA INVASION!")

    #make the play button
    play_button = Button(s,screen,"Play")

    #create instance to store game statistics and create a scoreboard
    stats = GameStats(s)
    sb = Scoreboard(s,screen,stats)


    # time to make a ship
    sanit = Sanitizer(s, screen)
    # make a virus
    # coro = Coro(s,screen)  # optional for now
    #making a group to store bullets in
    bubbles = Group()
    coros = Group()
    #create fleet of viruses
    f.create_fleet(s,screen,sanit,coros)
  

    #main loop for the game
    while True:
        f.check_events(s,screen,stats,sb,play_button,sanit,coros,bubbles)

        bubbles.update()

        if stats.game_active:
            sanit.update()
            f.update_bubbles(s,screen,stats,sb,sanit,coros,bubbles)
            f.update_coros(s,screen,stats,sb,sanit,coros,bubbles)
            f.update_screen(s,screen,stats,sb,sanit,coros,bubbles,play_button)
Example #6
    def get_identifier_flow(self, identifier):
        if identifier in self.variable_flows:
            # get existing flow
            flow = self.variable_flows[identifier]
        else:
            # new variable: check if source/sink/sanitizer
            flows = []
            flows.append(Source(identifier, self.is_source(identifier)))
            flows.append(Sink(identifier, self.is_sink(identifier)))
            flows.append(Sanitizer(identifier, self.is_sanitizer(identifier)))
            flow = Flow(flows)

        return flow
Example #7
    def __init__(self, definition: dict):
        """Constructor for class `Sink`.  Load definitions.

        Parameters
        ----------
        definition : dict
            Definitions for the object name, methods and sanitizers
        """
        self.object_name = next(iter(definition))
        self.methods = definition[self.object_name]['Methods']
        for idx, method in enumerate(self.methods):
            original_sanitizers = copy.deepcopy(self.methods[idx].get(
                'Sanitizers', []))
            method['Sanitizers'] = list()
            for sanitizer in original_sanitizers:
                method['Sanitizers'].append(Sanitizer(sanitizer))

        # Make sure that the definition has a valid format
        assert all(['Methodname' in method for method in self.methods])
        assert all(['Parameters' in method for method in self.methods])
        assert all(['Comment' in method for method in self.methods])
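From the constructor and the assertions above, the expected shape of the definition dict can be read off directly; the concrete object and method names below are only illustrative:

# Illustrative definition only; the structure (a single object-name key, a
# 'Methods' list whose entries carry 'Methodname', 'Parameters', 'Comment' and
# optionally 'Sanitizers') is what the constructor and asserts require.
definition = {
    'database': {
        'Methods': [
            {
                'Methodname': 'execute',    # required
                'Parameters': [],           # required key (contents not checked here)
                'Comment': 'illustrative',  # required
                'Sanitizers': []            # optional; each entry is wrapped in Sanitizer()
            }
        ]
    }
}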
Example #8
    def calculate_grade_level(content):
        words = []
        tokens = content.split()
        garbage_words = ["", ";", ",", "!", "?", "."]

        number_of_syllables = 0

        for token in tokens:
            sanitized_word = Sanitizer.sanitize_word(token)

            if sanitized_word in garbage_words:
                continue
            else:
                number_of_syllables += syllables_en.count_syllables(sanitized_word)
                words.append(sanitized_word)

        token_arrays = re.split(r'\.|!|\?', content)
        sentences = []

        for sentence in token_arrays:
            if sentence not in garbage_words:
                sentences.append(sentence)

        number_of_sentences = len(sentences)
        number_of_words = len(words)

        print "Syllables: " + str(number_of_syllables)
        print "Words: " + str(number_of_words)
        print str(words)
        print "Sentences: " + str(number_of_sentences)
        print str(sentences)

        grade_level = 0.39 * (float(number_of_words) / float(number_of_sentences)) +\
                      11.8 * (float(number_of_syllables) / float(number_of_words)) - 15.59

        return grade_level
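The final expression is the Flesch-Kincaid grade-level formula; a quick check with made-up counts shows the scale of the result:

# Made-up counts for illustration: 100 words, 5 sentences, 130 syllables.
words, sentences, syllables = 100, 5, 130
grade = 0.39 * (float(words) / float(sentences)) + \
        11.8 * (float(syllables) / float(words)) - 15.59
print(grade)  # 0.39*20 + 11.8*1.3 - 15.59 = 7.55, i.e. roughly a 7th-to-8th-grade text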
Example #9
 def test_sanitizer(self):
     sanitizer = Sanitizer()
     file_removed = sanitizer.clean(
         '/Users/cv/.m2/repository/ant/ant/1.6.5/ant-1.6.5.jar')
     print(file_removed)
Example #10
import nltk
import csv
from sanitizer import Sanitizer
from multiprocessing import Pool

csv.field_size_limit(2**30)
sanitizer = Sanitizer()
filename = 'dataset.csv'


def generate_training_data(row):
    document_id, title, content, date_posted, court = row
    judgement = sanitizer.extract_judgement(content)
    sentences = nltk.sent_tokenize(unicode(judgement, errors='ignore'))
    return map(sanitizer.tokenize, sentences)


if __name__ == "__main__":
    reader = csv.reader(open(filename, 'rb'))
    reader.next()  # skip the CSV header row
    
    pool = Pool()
    results = pool.map(generate_training_data, reader)
    sentences = []
    for sent in results:
        sentences.extend(sent)

    with open('sentences.txt', 'w') as f:
        for sentence in sentences:
            f.write(' '.join(sentence) + '\n')
Example #11
def makeAbove(arguments):
    global template_path_g, output_path_g, script_dir_g

    mySanitizer = Sanitizer()  # default sanitizer for mAm. Clears JS and CSS, leaves html in.

    # for every argument, check if set and handle accordingly
    with open(template_path_g, 'r') as templateFile:
        template = Template(templateFile.read())

    # set title if there should be one
    if arguments['--title'] is not None:
        title = arguments['--title']
    else:
        title = ""
    # clean title
    title = mySanitizer.cleanHTML(title)

    image = ""
    if arguments['--image'] is not None:
        image = arguments['--image']
    image = mySanitizer.cleanHTML(image)

    # create all tags and store them in one long string
    global TAG_HTML_TEMPLATE_STRING
    alltags = ""
    for tag in arguments['--tag']:
        alltags += TAG_HTML_TEMPLATE_STRING.substitute(
            {'tagtext': mySanitizer.cleanHTML(tag)})

    # for the line with points and comments
    global COMMENTLINE_TEMPLATE_STRING
    if arguments['-C'] is not None:
        argsC = mySanitizer.cleanHTML(arguments['-C'])
        commentline = '<a href="" class="C">{0}</a>'.format(argsC)
    elif (arguments['--comments'] is None) and (arguments['--points'] is None):
        commentline = ""
    else:
        comments = 0 if arguments['--comments'] is None else arguments[
            '--comments']
        points = 0 if arguments['--points'] is None else arguments['--points']
        comments = mySanitizer.cleanHTML(comments)
        points = mySanitizer.cleanHTML(points)
        subC = "{0} comments".format(comments)
        subP = "{0} points".format(points)
        commentline = COMMENTLINE_TEMPLATE_STRING.substitute({
            'points': subP,
            'comments': subC
        })

    # set text if there should be
    global TXT_TEMPLATE_STRING
    text = ''
    if arguments['--text'] is not None:
        text = arguments['--text']
        text = mySanitizer.cleanHTML(text)
        text = TXT_TEMPLATE_STRING.substitute(
            {'text': text})  # write text into html-string

    substDir = {
        'title': title,
        'image': image,
        'tags': alltags,
        'commentline': commentline,
        'text': text
    }
    tempStr = template.substitute(substDir)

    # write result to temp file
    (fd, filename) = tempfile.mkstemp(
        suffix='.html', dir=script_dir_g
    )  # create the tempfile in the script-containing directory
    try:
        tfile = os.fdopen(fd, "w")
        tfile.write(tempStr)
        tfile.close()
        if not arguments['-X']:
            webkitres = subprocess.check_output([
                "webkit2png", filename, "-o", output_path_g, "-x", "70", "1000"
            ])
        else:
            webkitres = subprocess.check_output(
                ["webkit2png", filename, "-o", output_path_g])
        print("Called webkit2png with filename {0} and output path {1}".format(
            filename, output_path_g))
    except subprocess.CalledProcessError as e:
        print("webkit2png failed. DO SOMETHING."
              )  # handle error of webkit2png? I don't know how, so not my job
        exit(2)
    finally:
        os.remove(filename)
Example #12
"dziwi bo tak się zawsze robi to jest taka praktyka"

class m_news(object):
  bodies = ['dsa dsa fdasfsd gdfg dfg dfg fdgerg ghrhy etg ger gre ger',
  'dsa dsa fdasfsd gdfg dfg ee fdgerg fd etg ger gre asd',
  'dsa rer fdasfsd azz ghrhy etg ger gre ger',
  'dsa dsa fdas dfg dfg dfg  ger gre ger',
  'dsa fda ghrhy gdfg ger gre dsa',]
  def __init__(self):
    pass
  def __call__(self, arg):
    return News(title='abc', body=m_news.bodies[arg], clean_body=m_news.bodies[arg], url="http://dfsd.fd.com", date=int(time()))
  
from newsgroup import NewsGroup 
from sanitizer import Sanitizer
s=Sanitizer()
ng = NewsGroup()
m_news = m_news()
nr = [ng.quantity_reduce(m_news(i)) for i in range(0, len(m_news.bodies))]

print s.cleanup_news(dirty_news)
for i in range(0, len(m_news.bodies)):
  for j in range(i, len(m_news.bodies)):
    print "("+str(i)+","+str(j)+")"+str(ng.cosine_distance(nr[i], nr[j]))
print("jaccard")
nr = [ng.binary_reduce(m_news(i)) for i in range(0, len(m_news.bodies))]

for i in range(0, len(m_news.bodies)):
  for j in range(i, len(m_news.bodies)):
    print "("+str(i)+","+str(j)+")"+str(ng.jaccard_index(nr[i], nr[j]))
   
Example #13
    ]

    def __init__(self):
        pass

    def __call__(self, arg):
        return News(title='abc',
                    body=m_news.bodies[arg],
                    clean_body=m_news.bodies[arg],
                    url="http://dfsd.fd.com",
                    date=int(time()))


from newsgroup import NewsGroup
from sanitizer import Sanitizer
s = Sanitizer()
ng = NewsGroup()
m_news = m_news()
nr = [ng.quantity_reduce(m_news(i)) for i in range(0, len(m_news.bodies))]

print s.cleanup_news(dirty_news)
for i in range(0, len(m_news.bodies)):
    for j in range(i, len(m_news.bodies)):
        print "(" + str(i) + "," + str(j) + ")" + str(
            ng.cosine_distance(nr[i], nr[j]))
print("jaccard")
nr = [ng.binary_reduce(m_news(i)) for i in range(0, len(m_news.bodies))]

for i in range(0, len(m_news.bodies)):
    for j in range(i, len(m_news.bodies)):
        print "(" + str(i) + "," + str(j) + ")" + str(
Example #14
class OHHLAScraper:
    OHHLA_URL = "http://ohhla.com/"
    ALL_ARTIST_SITES = ["http://ohhla.com/all.html",
                        "http://ohhla.com/all_two.html",
                        "http://ohhla.com/all_three.html",
                        "http://ohhla.com/all_four.html",
                        "http://ohhla.com/all_five.html"]
    TOP_ARTIST_SITES = ["http://ohhla.com/favorite.html"]
    EXCLUDED_ARTISTS = {'113'}

    def __init__(self, output_directory):
        self.output_directory = output_directory
        self.sanitizer = Sanitizer()

    def scrape_all_artists(self):
        for url in self.ALL_ARTIST_SITES:
            self._scrape_all_artists_page(url)

    def scrape_top_artists(self):
        for url in self.TOP_ARTIST_SITES:
            self._scrape_top_artists_page(url)

    def _scrape_all_artists_page(self, url):
        dom = self._extract_dom(url)
        artist_refs = dom.xpath("//pre/a[@href]/@href")

        for artist_ref in artist_refs:
            ref_split = artist_ref.rsplit('/')
            if not artist_ref or self._is_parent_ref(url, artist_ref) or len(ref_split) < 2:
                continue

            artist_name = ref_split[-2]
            if not artist_name or artist_name in self.EXCLUDED_ARTISTS:
                continue

            artist_url = self.OHHLA_URL + artist_ref
            artist_file_name = '{}/{}.txt'.format(self.output_directory, artist_name)

            with open(artist_file_name, 'w') as output_file:
                self._scrape_artist_page(artist_url, output_file)
                output_file.write('\n')

    def _scrape_top_artists_page(self, url):
        dom = self._extract_dom(url)
        artist_refs = dom.xpath("//td/a[@href]/@href")

        for artist_ref in artist_refs:
            artist_name = artist_ref.replace('YFA_', '').replace('.html', '')
            if not artist_name or artist_name in self.EXCLUDED_ARTISTS:
                continue

            artist_url = self.OHHLA_URL + artist_ref
            artist_file_name = '{}/{}.txt'.format(self.output_directory, artist_name)

            with open(artist_file_name, 'w') as output_file:
                self._scrape_top_artist_page(artist_url, output_file)
                output_file.write('\n')

    def _scrape_artist_page(self, url, output_file):
        try:
            dom = self._extract_dom(url)
        except:
            return

        album_refs = dom.xpath("//tr/td/a[@href]/@href")
        for album_ref in album_refs:
            if not album_ref or self._is_parent_ref(url, album_ref):
                continue
            album_url = url + album_ref
            self._scrape_album_page(album_url, output_file)

    def _scrape_album_page(self, url, output_file):
        try:
            dom = self._extract_dom(url)
        except:
            return

        song_refs = dom.xpath("//tr/td/a[@href]/@href")
        for song_ref in song_refs:
            if not song_ref or self._is_parent_ref(url, song_ref):
                continue
            song_url = url + song_ref
            self._scrape_song_page(song_url, output_file)

    def _scrape_top_artist_page(self, url, output_file, recurse=True):
        try:
            dom = self._extract_dom(url)
        except:
            return

        song_refs = dom.xpath("//tr/td/a[@href]/@href")
        for song_ref in song_refs:
            if not song_ref:
                continue
            elif song_ref.endswith('.txt'):
                song_url = self.OHHLA_URL + song_ref
                self._scrape_song_page(song_url, output_file)
            elif song_ref.endswith('html') and recurse:
                next_url = self.OHHLA_URL + song_ref
                self._scrape_top_artist_page(next_url, output_file, recurse=False)

    def _scrape_song_page(self, url, output_file):
        try:
            opened_url = urllib.request.urlopen(url)
            dom = opened_url.read()
        except:
            return

        if re.match(r'^b[\'\"]<!DOCTYPE.*?>', str(dom)) is not None:
            song_html = html.fromstring(dom)
            try:
                lyrics = song_html.xpath("//pre/text()")[0]
            except:
                return
        else:
            lyrics = dom.decode("utf-8", "ignore")

        cleaned_lyrics = self.sanitizer.clean_lyrics(lyrics)
        output_file.write(cleaned_lyrics)
        output_file.write('\n')

    def _is_parent_ref(self, url, ref):
        start_of_relative_ref = len(self.OHHLA_URL) - 1
        end_of_relative_ref = url.rindex('/', 0, len(url) - 1) + 1
        relative_ref = url[start_of_relative_ref:end_of_relative_ref]
        return relative_ref == ref

    @staticmethod
    def _extract_dom(url):
        opened_url = urllib.request.urlopen(url)
        return html.fromstring(opened_url.read())
Example #15
 def __init__(self, output_directory):
     self.output_directory = output_directory
     self.sanitizer = Sanitizer()
Example #16
class contentParser:
	
	def __init__(self, ifile):
		self.content = ifile
		self.sanitizer = Sanitizer()
		name = ifile.split('_')
		self.content_id = name[0]
		self.content_type = name[1]
		self.content_date = name[2]
		self.header = {'User-Agent': 'Mozilla/5.0'}

	def parse(self):
		conn = db.connect(**db_config)
		cursor = conn.cursor()
		with open(self.content, 'r') as fp:
			content = json.load(fp)
			print json.dumps(content, indent=4, sort_keys=True)
			for item in content:
				time.sleep(2)
				attack_type = item['attack_type']
				attack_category = item['attack_category']
				author = item['author']
				country = item['country']
				date = item['date']
				target = item['target']
				target_category = item['target_category']
				my_count = 0
				for url in item['link']:
					res = ''
					if 'video' in url:
						continue
					elif 'image' in url:
						continue
					elif 'www.pravdareport.com' in url:
						res = self.parse_pravda(url)
						media = 'pravda'
					elif 'www.rt.com' in url:
						res = self.parse_rt(url)
						media = 'rt'
					elif 'www.washingtonpost.com' in url:
						res = self.parse_wp(url)
						media = 'washingtonpost'
					elif 'www.nytimes.com' in url:
						res = self.parse_nyt(url)
						media = 'nytimes'
					elif 'www.japantimes.co.jp' in url:
						res = self.parse_jpt(url)
						media = 'japantimes'
					elif 'www.nbcnews.com' in url:
						res = self.parse_nbc(url)
						media = 'nbcnews'
					elif 'www.theguardian.com' in url:
						res = self.parse_guardian(url)
						media = 'theguardian'
					elif 'www.bbc.com' in url:
						res = self.parse_bbc(url)
						media = 'bbc'
					elif 'news.yahoo.com' in url:
						res = self.parse_yahoo(url)
						media = 'yahoo'
					elif 'www.foxnews.com' in url:
						res = self.parse_fox(url)
						media = 'foxnews'
					elif 'www3.nhk.or.jp' in url:
						res = self.parse_nhk(url)
						media = 'nhk'
					elif 'www.chinadaily.com.cn' in url:
						res = self.parse_cndaily(url)
						media = 'chinadaily'
					elif 'www.aljazeera.com' in url:
						res = self.parse_alj(url)
						media = 'aljazeera'
					elif 'www.moscowtimes.com' in url:
						res = self.parse_moscowt(url)
						media = 'moscowtimes'
					elif 'www.shanghaidaily.com' in url:
						res = self.parse_shanghaid(url)
						media = 'shanghaidaily'
					my_count += 1
					if res:
						res = self.sanitizer.sanitize(res)
						print self.content_id,self.content_type,url
						print res
						query = ("INSERT INTO ca_analyze "
										"(attack_id, target, date, author, attack_type, attack_category, target_category, country, flag, count, content, media, url) "
										    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
										    "ON DUPLICATE KEY UPDATE author=%s, attack_type=%s, attack_category=%s, target_category=%s, country=%s, "
										    "flag=%s, count=%s, content=%s"
										    )
                                                cursor.execute(query, (self.content_id, target, self.content_date, author, attack_type, attack_category, target_category, country, self.content_type, my_count, res, media, url, author, attack_type, attack_category, target_category, country, self.content_type, my_count, res, )) 
	
		conn.close()

	def parse_init(self, url):
		cj = CookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1')]
		html = opener.open(url, timeout=100)
		return bs(html.read(), 'html.parser')

	def parse_init_off(self, url):
		req = urllib2.Request(url, headers=self.header)
		html = urllib2.urlopen(req, timeout=100)
		return bs(html.read(), 'html.parser')

	def parse_pravda(self, url):
		res = ''
		try:
			# fall back to the plain request if the cookie-based opener fails;
			# if both fail, give up and return the empty result
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'id': 'article'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_rt(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			res = text.body.find('div', attrs={'class': 'article__summary'}).get_text()
			article = text.body.find('div', attrs={'class': 'article__text'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_yahoo(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'class': 'yom-art-content'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_nyt(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'id': 'story-body'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_wp(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('article', attrs={'itemprop': 'articleBody'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_shanghaid(self, url):
		res = ''
		try:
			httplib.HTTPConnection.debuglevel = 1
			request = urllib2.Request(url)
			request.add_header('Accept-encoding', 'gzip') 
			opener = urllib2.build_opener()
			html = opener.open(request)
			data = html.read()
			data = StringIO.StringIO(data)
			gzipper = gzip.GzipFile(fileobj=data)
			text = gzipper.read()
			text = bs(text, 'html.parser')
			article = text.find('div', attrs={'class': 'detail_content'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res
		except Exception:
			return res 

	def parse_jpt(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'id': 'jtarticle'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_nbc(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'itemprop': 'articleBody'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_bbc(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'property': 'articleBody'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_fox(self, url):
		res = ''
		try:
			# fall back to the cookie-based opener if the plain request fails;
			# if both fail, give up and return the empty result
			try:
				text = self.parse_init_off(url)
			except Exception:
				text = self.parse_init(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'itemprop': 'articleBody'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_nhk(self, url):
		res = ''
		try:
			try:
				text = self.parse_init(url)
			except Exception:
				text = self.parse_init_off(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'class': 'content'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_cndaily(self, url):
		res = ''
		try:
			try:
				text = self.parse_init_off(url)
			except Exception:
				text = self.parse_init(url)
		except Exception:
			return res
		else:
			article = text.find('div', attrs={'id': 'Content'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_alj(self, url):
		res = ''
		try:
			try:
				text = self.parse_init_off(url)
			except Exception:
				text = self.parse_init(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'id': 'article-body'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_moscowt(self, url):
		res = ''
		try:
			try:
				text = self.parse_init_off(url)
			except Exception:
				text = self.parse_init(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'class': 'article_text'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res 

	def parse_guardian(self, url):
		res = ''
		try:
			try:
				text = self.parse_init_off(url)
			except Exception:
				text = self.parse_init(url)
		except Exception:
			return res
		else:
			article = text.body.find('div', attrs={'itemprop': 'articleBody'})
			if article:
				article = article.find_all("p")
				for p in article:
					res += p.get_text()
					res += ' '
			return res