def corpus_generator(self):
    """Yield the corpus at ``self.corpus_path`` one token at a time.

    The file is read in binary mode, one line at a time. Each line is
    converted to text with UnicodeDammit, lower-cased when ``self.lower``
    is truthy, and split on whitespace. Lines that contain no tokens are
    skipped. Progress is logged every 100000 non-blank lines.

    Yields:
        str: the next whitespace-delimited token.
    """
    with open(self.corpus_path, 'rb') as f:
        nonblank = 0
        for raw in f:
            text = UnicodeDammit(raw.strip()).unicode_markup
            if not text:
                continue
            if self.lower:
                text = text.lower()
            # bytes.strip() only removes ASCII whitespace, so the decoded
            # text may still start or end with (or consist solely of)
            # non-ASCII whitespace.  str.split() without a separator never
            # produces empty strings, unlike re.split(r'\s+', ...).
            tokens = text.split()
            if not tokens:
                continue
            nonblank += 1
            if nonblank % 100000 == 0:
                logging.info('Read %d nonblank lines', nonblank)
            for tok in tokens:
                yield tok
def document_generator(path, lower=False):
    '''
    Default document reader. Takes a path to a file with one document per
    line, with tokens separated by whitespace, and yields lists of tokens
    per document. This could be replaced by any function that yields lists
    of tokens. See main() for how it is called.

    Lines that contain no tokens are skipped. If ``lower`` is true, each
    line is lower-cased before it is split. Progress is logged every
    100000 non-blank lines.

    Note: this uses BeautifulSoup's UnicodeDammit to convert to unicode.
    '''
    with open(path, 'rb') as f:
        nonblank = 0
        for raw in f:
            text = UnicodeDammit(raw.strip()).unicode_markup
            if not text:
                continue
            if lower:
                text = text.lower()
            # bytes.strip() only removes ASCII whitespace, so the decoded
            # text may still carry non-ASCII whitespace at its edges.
            # str.split() without a separator never produces empty tokens,
            # unlike re.split(r'\s+', ...).
            tokens = text.split()
            if not tokens:
                continue
            nonblank += 1
            if nonblank % 100000 == 0:
                logging.info('Read %d nonblank lines', nonblank)
            yield tokens
def clean_google_title(self, title):
    """Clean a title and derive a compact, lower-cased form of it.

    Args:
        title: The title to clean. It is passed through UnicodeDammit, so
            it may be text or encoded bytes.

    Returns:
        tuple: ``(readable_title, title_key, has_dot)`` where

        * ``readable_title`` is the title with ``<...>`` spans, the filler
          characters of clean step 2 and ``&...;`` entities removed, then
          stripped of surrounding whitespace;
        * ``title_key`` is ``readable_title`` with every non-word
          character removed, lower-cased;
        * ``has_dot`` is True when the title contains ``...`` or ``…``.
    """
    decoded = UnicodeDammit(title).unicode_markup

    # Search the decoded text rather than the raw argument: a str pattern
    # cannot be applied to bytes, and for str input the two are the same.
    has_dot = re.search(r"(\.\.\.|…)", decoded) is not None

    # clean step 1
    # BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext...'
    cleaned = re.sub(r"(<(.*?)>)", "", decoded)

    # clean step 2, here title is readable
    cleaned = re.sub("( |►|…)", "", cleaned)
    cleaned = re.sub("(&#.+?;|&.+?;)", "", cleaned)
    readable_title = cleaned.strip()

    # Shrink: drop every non-word character (\W), then lower-case.
    title_key = re.sub(r"\W", "", readable_title).lower()
    return (readable_title, title_key, has_dot)
def on_pubmsg(self, c, e):
    """Handle one incoming message: run a bot command or a chat trigger.

    The message text is ``e.arguments[0]``, decoded with UnicodeDammit.
    Replies go to ``e.target`` when that is a channel, otherwise to the
    sender. A message of the form ``<bot nick>: <command>`` is forwarded to
    ``self.do_command``; everything else is checked against the triggers
    below in order, and most triggers stop processing once they fire.

    Args:
        c: Not used by this method (presumably the connection object
            supplied by the IRC library -- confirm against the caller).
        e: The event; ``e.source.nick``, ``e.target`` and
            ``e.arguments[0]`` are read.
    """
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick

    def reply(msg):
        self.send(target, msg)

    def dm(msg):
        self.send(nick, msg)

    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))

    # "<bot nick>: <command>" is an explicit command for the bot.
    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick:
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return

    # zeltofilter
    if 'zeltoph' in nick:
        return

    # Nicks listed in settings.VIPS are kicked with the configured
    # probability; processing continues afterwards.
    foo = settings.VIPS.get(nick, 0)
    if random() < foo:
        self.kick(nick)

    if re.match('.*┻━┻.*', line):
        reply('┬─┬ノ(ಠ_ಠノ)')
        return

    # The nick is escaped so that regex metacharacters in it are matched
    # literally.
    match = re.match(
        '^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(
            re.escape(self.nick)),
        line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(
            len(self.chaossternchen), newcs))
        return

    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        # Only plain page names are looked up.
        if re.match(r'^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return

    if line == 'wat?':
        reply("I don't have a clue.")
        return

    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return

    # Kick on any Gandhi look-alike that is not spelled "gandhi".
    m = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return

    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        # NOTE(review): in the reviewed text the character between
        # "srsly? " and "Get" was visible only as a line break; \x1d (the
        # IRC italics control code) is assumed here -- confirm against the
        # original file before merging.
        reply('A facebook link? srsly? \x1dGet some self-respect!')
        return

    match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
        return

    # Answer only the fifth consecutive "moin"; any other line resets the
    # counter.
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    else:
        self.moincount = 0

    # Take the ETA text from the same prefix-stripped string that the
    # "eta " test is applied to; slicing the unstripped line cut into the
    # keyword when more than one prefix character was present.
    eta_line = line.lstrip('.!#')
    if eta_line.startswith('eta '):
        eta = eta_line[len('eta '):].strip()
        with self.db as db:
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
        dm('ETA registered. Thanks!')
        return

    # Announce the page title of every URL found in the message.
    m = re.findall(URL_REGEX, line.lower())
    for url, *_ in m:
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            # Pages without a <title> (or with an empty/nested one) have
            # nothing to announce.
            title_tag = soup.title
            title_text = title_tag.string if title_tag is not None else None
            if title_text:
                reply(title_text)

    m = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not "+match)
            return