Example #1
def corpus_generator(self):
    # Requires: import logging, re; from bs4 import UnicodeDammit
    with open(self.corpus_path, 'rb') as f:
        i = 0
        for line in f:
            # Decode raw bytes to unicode, whatever the source encoding.
            line = UnicodeDammit(line.strip()).unicode_markup
            if line:
                if self.lower:
                    line = line.lower()
                i += 1
                if i % 100000 == 0:
                    logging.info('Read {} nonblank lines'.format(i))
                # Yield whitespace-separated tokens one at a time.
                for tok in re.split(r'\s+', line):
                    yield tok
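
Common to every example on this page: the file is opened in binary mode ('rb') and UnicodeDammit guesses each line's encoding, so mixed or unknown encodings don't crash the reader. A minimal standalone sketch of just that decode step, assuming bs4 is installed ('corpus.txt' is a hypothetical file):

from bs4 import UnicodeDammit

with open('corpus.txt', 'rb') as f:
    for raw in f:
        # UnicodeDammit accepts raw bytes and detects the encoding;
        # unicode_markup is the decoded str.
        line = UnicodeDammit(raw.strip()).unicode_markup
        if line:
            print(line)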
Example #2
def document_generator(path, lower=False):
    '''
    Default document reader.  Takes a path to a file with one document per line,
    with tokens separated by whitespace, and yields lists of tokens per document.
    This could be replaced by any function that yields lists of tokens.
    See main() for how it is called.

    Note: this uses BeautifulSoup's UnicodeDammit to convert to unicode.
    '''
    with open(path, 'rb') as f:
        i = 0
        for line in f:
            line = UnicodeDammit(line.strip()).unicode_markup
            if line:
                if lower:
                    line = line.lower()
                i += 1
                if i % 100000 == 0:
                    logging.info('Read {} nonblank lines'.format(i))
                yield re.split(r'\s+', line)
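
The docstring notes that any generator yielding token lists can replace this reader. A hedged usage sketch ('/data/docs.txt' is a hypothetical corpus path):

# Hypothetical: stream token lists, one per document line.
for tokens in document_generator('/data/docs.txt', lower=True):
    print(len(tokens), tokens[:5])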
Example #3
    def clean_google_title(self, title):
        has_dot = False

        titleCleaned = UnicodeDammit(title).unicode_markup
        # clean step 1
        # BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext..."
        titleCleaned = re.sub(r"(<(.*?)>)", "", titleCleaned)
        re_hasdot = re.compile(r"(\.\.\.|&hellip;)", re.I)
        match = re_hasdot.search(title)
        if match is not None:
            has_dot = True
        # clean step 2, here the title is readable
        titleCleaned = re.sub(r"(&nbsp;|&#x25ba;|&hellip;)", "", titleCleaned)
        titleCleaned = re.sub(r"(&#.+?;|&.+?;)", "", titleCleaned)
        titleCleaned = titleCleaned.strip()
        readableTitle = titleCleaned
        # Shrink: keep only word characters (letters, digits, underscore)
        titleCleaned = re.sub(r"\W", "", titleCleaned)
        titleCleaned = titleCleaned.lower()
        return (readableTitle, titleCleaned, has_dot)
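
A hedged illustration of the return value; `scraper` stands for any instance of the enclosing class, and the sample title is made up:

# Hypothetical call tracing the steps above.
readable, shrunk, has_dot = scraper.clean_google_title('OQL[C++]: Ext&hellip;')
print(readable)  # 'OQL[C++]: Ext'
print(shrunk)    # 'oqlcext'
print(has_dot)   # True ('&hellip;' marks a truncated title)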
Example #4
File: afrab0t.py Project: Akendo/afrab0t
	def on_pubmsg(self, c, e):
		nick = e.source.nick
		target = e.target if is_channel(e.target) else nick
		def reply(msg):
			self.send(target, msg)
		def dm(msg):
			self.send(nick, msg)
		line = UnicodeDammit(e.arguments[0]).unicode_markup
		log('   \033[37m{}→{}\033[0m'.format(nick, line))
		a = line.split(":", 1)
		if len(a) > 1 and a[0].lower() == self.nick:
			self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
			return

		# zeltofilter
		if 'zeltoph' in nick:
			return

		foo = settings.VIPS.get(nick, 0)
		if random() < foo:
			self.kick(nick)
	
		match = re.match('.*┻━┻.*', line)
		if match:
			reply('┬─┬ノ(ಠ_ಠノ)')
			return

		match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
		if match:
			newcs = match.group(3)
			self.chaossternchen.append(newcs)
			self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
			return

		if line.startswith('.wiki '):
			wikipage = line[len('.wiki '):].strip()
			if re.match(r'^[-_+\w]+$', wikipage):
				wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
				if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
					reply("I'm sorry, I can't find a wiki page with that name.")
				else:
					reply(wikiurl)
			else:
				reply('Try to troll somebot else.')
			return

		if line == 'wat?':
			reply("I don't have a clue.")
			return
		if re.match('^hail eris[.!]* ', line.lower()):
			reply("All Hail Discordia!")
			return
		m = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
				self.kick(nick, "It's spelled Gandhi")
				return
		if re.search(r'https?://[-a-z0-9.]*facebook\.com', line.lower()):
			reply('A facebook link? srsly? Get some self-respect!')
			return
		match = re.search(r'https?://pr0gramm\.com/#(newest/\*/[0-9/]*)', line.lower())
		if match:
			reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
			return
		if line == 'moin':
			self.moincount += 1
			if self.moincount == 5:
				reply('moin')
			return
		else:
			self.moincount = 0
		if line.lstrip('.!#').startswith('eta '):
			# Slice off the prefix after stripping leading '.', '!', '#'.
			eta = line.lstrip('.!#')[4:].strip()
			with self.db as db:
				db.execute("DELETE FROM etas WHERE nick=?", (nick,))
				if eta:
					db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
			dm('ETA registered. Thanks!')
			return
		m = re.findall(URL_REGEX, line.lower())
		for url,*_ in m:
			res = requests.get(url)
			if res.status_code == requests.codes.ok:
				soup = BeautifulSoup(res.text, 'html.parser')
				reply(soup.title.string)
		m = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if match != 'AfRA' and match != 'afra' and random() < 0.1:
				reply("I'm sure you meant AfRA, not "+match)
				return
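
The opening block of on_pubmsg implements the usual IRC convention that a message of the form 'botnick: command' is addressed to the bot. A reduced, hypothetical sketch of that dispatch:

# Hypothetical standalone version of the 'nick: command' check.
def parse_addressed(line, botnick):
    a = line.split(':', 1)
    if len(a) > 1 and a[0].lower() == botnick:
        return a[1].strip().lower()
    return None

print(parse_addressed('afrab0t: help', 'afrab0t'))  # -> 'help'
print(parse_addressed('just chatting', 'afrab0t'))  # -> None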