Example #1
def corpus_generator(self):
    # Requires: import logging, re; from bs4 import UnicodeDammit
    with open(self.corpus_path, 'rb') as f:
        i = 0
        for line in f:
            # Decode raw bytes to unicode, whatever the source encoding.
            line = UnicodeDammit(line.strip()).unicode_markup
            if line:
                if self.lower:
                    line = line.lower()
                i += 1
                if i % 100000 == 0:
                    logging.info('Read {} nonblank lines'.format(i))
                # Yield whitespace-separated tokens one at a time.
                for tok in re.split(r'\s+', line):
                    yield tok
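
Common to every example on this page: the file is opened in binary mode ('rb') and UnicodeDammit guesses each line's encoding, so mixed or unknown encodings don't crash the reader. A minimal standalone sketch of just that decode step, assuming bs4 is installed ('corpus.txt' is a hypothetical file):

from bs4 import UnicodeDammit

with open('corpus.txt', 'rb') as f:
    for raw in f:
        # UnicodeDammit accepts raw bytes and detects the encoding;
        # unicode_markup is the decoded str.
        line = UnicodeDammit(raw.strip()).unicode_markup
        if line:
            print(line)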
Example #2
def document_generator(path, lower=False):
    '''
    Default document reader.  Takes a path to a file with one document per line,
    with tokens separated by whitespace, and yields lists of tokens per document.
    This could be replaced by any function that yields lists of tokens.
    See main() for how it is called.

    Note: this uses BeautifulSoup's UnicodeDammit to convert to unicode.
    '''
    with open(path, 'rb') as f:
        i = 0
        for line in f:
            line = UnicodeDammit(line.strip()).unicode_markup
            if line:
                if lower:
                    line = line.lower()
                i += 1
                if i % 100000 == 0:
                    logging.info('Read {} nonblank lines'.format(i))
                yield re.split(r'\s+', line)
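
The docstring notes that any generator yielding token lists can replace this reader. A hedged usage sketch ('/data/docs.txt' is a hypothetical corpus path):

# Hypothetical: stream token lists, one per document line.
for tokens in document_generator('/data/docs.txt', lower=True):
    print(len(tokens), tokens[:5])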
Example #3
    def clean_google_title(self, title):
        has_dot = False

        titleCleaned = UnicodeDammit(title).unicode_markup
        # clean step 1
        # BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext..."
        titleCleaned = re.sub(r"(<(.*?)>)", "", titleCleaned)
        re_hasdot = re.compile(r"(\.\.\.|&hellip;)", re.I)
        match = re_hasdot.search(title)
        if match is not None:
            has_dot = True
        # clean step 2, here the title is readable
        titleCleaned = re.sub(r"(&nbsp;|&#x25ba;|&hellip;)", "", titleCleaned)
        titleCleaned = re.sub(r"(&#.+?;|&.+?;)", "", titleCleaned)
        titleCleaned = titleCleaned.strip()
        readableTitle = titleCleaned
        # Shrink: keep only word characters (letters, digits, underscore)
        titleCleaned = re.sub(r"\W", "", titleCleaned)
        titleCleaned = titleCleaned.lower()
        return (readableTitle, titleCleaned, has_dot)
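
A hedged illustration of the return value; `scraper` stands for any instance of the enclosing class, and the sample title is made up:

# Hypothetical call tracing the steps above.
readable, shrunk, has_dot = scraper.clean_google_title('OQL[C++]: Ext&hellip;')
print(readable)  # 'OQL[C++]: Ext'
print(shrunk)    # 'oqlcext'
print(has_dot)   # True ('&hellip;' marks a truncated title)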
Example #4
File: afrab0t.py Project: Akendo/afrab0t
	def on_pubmsg(self, c, e):
		nick = e.source.nick
		target = e.target if is_channel(e.target) else nick
		def reply(msg):
			self.send(target, msg)
		def dm(msg):
			self.send(nick, msg)
		line = UnicodeDammit(e.arguments[0]).unicode_markup
		log('   \033[37m{}→{}\033[0m'.format(nick, line))
		a = line.split(":", 1)
		if len(a) > 1 and a[0].lower() == self.nick:
			self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
			return

		# zeltofilter
		if 'zeltoph' in nick:
			return

		foo = settings.VIPS.get(nick, 0)
		if random() < foo:
			self.kick(nick)
	
		match = re.match('.*┻━┻.*', line)
		if match:
			reply('┬─┬ノ(ಠ_ಠノ)')
			return

		match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
		if match:
			newcs = match.group(3)
			self.chaossternchen.append(newcs)
			self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
			return

		if line.startswith('.wiki '):
			wikipage = line[len('.wiki '):].strip()
			if re.match(r'^[-_+\w]+$', wikipage):
				wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
				if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
					reply("I'm sorry, I can't find a wiki page with that name.")
				else:
					reply(wikiurl)
			else:
				reply('Try to troll somebot else.')
			return

		if line == 'wat?':
			reply("I don't have a clue.")
			return
		if re.match('^hail eris[.!]* ', line.lower()):
			reply("All Hail Discordia!")
			return
		m = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
				self.kick(nick, "It's spelled Gandhi")
				return
		if re.search(r'https?://[-a-z0-9.]*facebook\.com', line.lower()):
			reply('A facebook link? srsly? Get some self-respect!')
			return
		match = re.search(r'https?://pr0gramm\.com/#(newest/\*/[0-9/]*)', line.lower())
		if match:
			reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
			return
		if line == 'moin':
			self.moincount += 1
			if self.moincount == 5:
				reply('moin')
			return
		else:
			self.moincount = 0
		if line.lstrip('.!#').startswith('eta '):
			# Slice off the prefix after stripping leading '.', '!', '#'.
			eta = line.lstrip('.!#')[4:].strip()
			with self.db as db:
				db.execute("DELETE FROM etas WHERE nick=?", (nick,))
				if eta:
					db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
			dm('ETA registered. Thanks!')
			return
		m = re.findall(URL_REGEX, line.lower())
		for url,*_ in m:
			res = requests.get(url)
			if res.status_code == requests.codes.ok:
				soup = BeautifulSoup(res.text, 'html.parser')
				reply(soup.title.string)
		m = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if match != 'AfRA' and match != 'afra' and random() < 0.1:
				reply("I'm sure you meant AfRA, not "+match)
				return
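
The opening block of on_pubmsg implements the usual IRC convention that a message of the form 'botnick: command' is addressed to the bot. A reduced, hypothetical sketch of that dispatch:

# Hypothetical standalone version of the 'nick: command' check.
def parse_addressed(line, botnick):
    a = line.split(':', 1)
    if len(a) > 1 and a[0].lower() == botnick:
        return a[1].strip().lower()
    return None

print(parse_addressed('afrab0t: help', 'afrab0t'))  # -> 'help'
print(parse_addressed('just chatting', 'afrab0t'))  # -> None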