class ChineseTokenizer():
    """Wrapper that exposes a SEG segmenter through a ``tokenize`` method.

    It exists so that callers relying on duck typing can use SEG wherever
    an object with ``tokenize(text)`` is expected.
    """

    def __init__(self):
        self._seg = SEG()

    def tokenize(self, text):
        """Return whatever ``SEG.cut`` produces for ``text``."""
        # The result is intentionally not reversed: under a bag-of-words
        # assumption the order of the words does not matter.
        return self._seg.cut(text)
def get_data(posPath='/home/zhouxc/skindetector/AdultWebsiteText/',
             negPath='/home/zhouxc/skindetector/NormalWebsiteText/'):
    """Load the positive and negative text samples and segment them.

    Every file in ``posPath`` is labelled ``'Positive'`` and every file in
    ``negPath`` is labelled ``'Negative'``. Each file is segmented with SEG
    (configured with the module-level ``dic``); only the UTF-8 encoded words
    that are found in the module-level ``pornDict`` are kept.

    Args:
        posPath: folder holding the positive samples.
        negPath: folder holding the negative samples.

    Returns:
        A pair ``(trainingData, trainingData)``: the very same list of
        ``(words, label)`` tuples is returned twice.
        NOTE(review): presumably the second element is meant to be the test
        set; no separate test set is built here.
    """
    # Both folders are listed before anything is read, so a missing folder
    # fails before any file is processed.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)
    seg = SEG()
    seg.set(dic)

    def load_folder(folder, fileNames, label, progressTag):
        """Segment every file of one folder and pair the words with ``label``."""
        loaded = []
        for c, fileName in enumerate(fileNames, 1):
            print(progressTag + str(c))
            # The context manager closes the handle; the original leaked one
            # open file per sample.
            with open(os.path.join(folder, fileName)) as handle:
                raw = handle.read()
            # Encode each word once instead of once for the filter and once
            # more for the result.
            encoded = [word.encode('utf-8') for word in seg.cut(raw)]
            loaded.append(([w for w in encoded if w in pornDict], label))
        return loaded

    trainingData = []
    print('---------------------Read Positive DataSet-----------------')
    trainingData.extend(
        load_folder(posPath, posFiles, 'Positive', "PositiveData"))
    print('---------------------Positive DataSet done-----------------')
    print('---------------------Read Negative DataSet-----------------')
    trainingData.extend(
        load_folder(negPath, negFiles, 'Negative', "NegativeData"))
    print('--------Negative DataSet done-----------------------------------')
    return trainingData, trainingData
class SearchChinese(SearchLanguage):
    """Search-language support for Chinese ('zh') backed by a SEG segmenter."""

    lang = 'zh'

    def init(self, options):
        """Create the segmenter; ``options`` is accepted but not used here."""
        print("reading Chiniese dictionary")
        self.seg = SEG()

    def split(self, input):
        """Encode ``input`` as UTF-8 and return the segments SEG cuts from it."""
        utf8_text = input.encode("utf8")
        return self.seg.cut(utf8_text)

    def word_filter(self, stemmed_word):
        """Accept only words whose length is greater than one."""
        return 1 < len(stemmed_word)
class SearchChinese(SearchLanguage):
    """Chinese ('zh') search language that tokenises text with SEG."""

    lang = 'zh'

    def init(self, options):
        """Announce the dictionary load and build the SEG instance.

        ``options`` is part of the signature but is not read.
        """
        print("reading Chiniese dictionary")
        self.seg = SEG()

    def split(self, input):
        """Return ``SEG.cut`` applied to the UTF-8 encoding of ``input``."""
        as_utf8 = input.encode("utf8")
        segments = self.seg.cut(as_utf8)
        return segments

    def word_filter(self, stemmed_word):
        """Tell whether ``stemmed_word`` is longer than a single unit."""
        word_length = len(stemmed_word)
        return word_length > 1
def seg(text):
    """Count how often each word of length two or more occurs in ``text``.

    ``text`` is segmented with a fresh SEG instance; segments shorter than
    two units are ignored.

    Returns:
        ``dict.items()`` of the counts, i.e. ``(word, count)`` pairs.
    """
    segmenter = SEG()
    word_nums = {}
    for w in segmenter.cut(text):
        if len(w) < 2:
            continue
        # dict.has_key() no longer exists in Python 3; get() with a default
        # does the same job on every Python version.
        word_nums[w] = word_nums.get(w, 0) + 1
    return word_nums.items()
def get_data(posPath='/home/zhouxc/skindetector/AdultWebsiteText/',
             negPath='/home/zhouxc/skindetector/NormalWebsiteText/'):
    """Read the positive and negative sample folders and segment every file.

    Files under ``posPath`` become ``'Positive'`` samples, files under
    ``negPath`` become ``'Negative'`` samples. Each file is cut with SEG
    (loaded with the module-level ``dic``) and reduced to the UTF-8 encoded
    words that appear in the module-level ``pornDict``.

    Args:
        posPath: folder with the positive samples.
        negPath: folder with the negative samples.

    Returns:
        ``(trainingData, trainingData)``: one list of ``(words, label)``
        tuples, returned twice.
        NOTE(review): the same object serves as both elements; confirm
        whether a distinct test set was intended.
    """
    # List both folders first so that a missing folder is reported before
    # any file is read.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)
    trainingData = []
    seg = SEG()
    seg.set(dic)

    batches = (
        (posPath, posFiles, 'Positive',
         '---------------------Read Positive DataSet-----------------',
         '---------------------Positive DataSet done-----------------'),
        (negPath, negFiles, 'Negative',
         '---------------------Read Negative DataSet-----------------',
         '--------Negative DataSet done-----------------------------------'),
    )
    for folder, fileNames, label, opening, closing in batches:
        print(opening)
        for c, fileName in enumerate(fileNames, 1):
            print(label + "Data" + str(c))
            # Close each sample file as soon as it has been read; the
            # original never closed the handles it opened.
            with open(os.path.join(folder, fileName)) as sample:
                content = sample.read()
            # One encode per word instead of two.
            candidates = [word.encode('utf-8') for word in seg.cut(content)]
            text = [word for word in candidates if word in pornDict]
            trainingData.append((text, label))
        print(closing)
    return trainingData, trainingData
def seg(text):
    """Return the frequency of every segment of ``text`` that is 2+ long.

    A new SEG instance cuts ``text``; segments with a length below two are
    skipped and the remaining ones are tallied.

    Returns:
        The ``items()`` of the tally dict: ``(word, count)`` pairs.
    """
    cutter = SEG()
    word_nums = {}
    for w in cutter.cut(text):
        if len(w) < 2:
            continue
        # ``in`` replaces dict.has_key(), which Python 3 removed.
        if w in word_nums:
            word_nums[w] += 1
        else:
            word_nums[w] = 1
    return word_nums.items()
def get(self):
    """Handle one cron request: post a scheduled tweet, then work the timeline.

    Depending on the current time (UTC shifted by eight hours) or on the
    ``debug`` request parameter, a morning greeting, a good-night message
    or a command-line tip is tweeted. Afterwards the home timeline is
    fetched, and every tweet containing a word from ``config.RT_LIST`` is
    either answered through ``TalkBot`` (when it matches ``config.TALK``
    and the bot produced a non-empty reply) or retweeted.
    """
    # NOTE(review): the flag starts out True, so the header check below can
    # never reject a request. It is left as found because the ``debug``
    # parameter implies that manual triggering is wanted; confirm whether it
    # should start out False.
    Access_CronJob = True
    headers = self.request.headers.items()
    for key, value in headers:
        if (key == 'X-Appengine-Cron') and (value == 'true'):
            Access_CronJob = True
            break
    # A request that does not come from the cron job is logged and dropped.
    if not Access_CronJob:
        logging.debug('CronJobCheck() access denied!')
        logging.critical(
            '如果这个请求不是由你手动触发的话,这意味者你的CronJobKey已经泄漏!请立即修改CronJobKey以防被他人利用')
        return

    mydate = datetime.utcnow() + timedelta(hours=+8)
    ts_hour = mydate.time().hour
    ts_min = mydate.time().minute
    dbug = self.request.get('debug')
    logging.debug(dbug)

    if ((ts_hour == 7) and (30 <= ts_min <= 32)) or (dbug == 'morning'):
        # Morning greeting (7:30-7:32), with the weather when it is available.
        error = False
        try:
            wther = weather.weather()
        except weather.FetchError:
            logging.error("Weather Fetch Error!")
            error = True
        msg_idx = random.randint(0, len(config.MSG_GET_UP) - 1)
        if error:
            msg = '%s%s' % (config.MSG_GET_UP[msg_idx], config.BOT_HASHTAG)
        else:
            msg = '%s 今天%s的天气是:%s %s' % (
                config.MSG_GET_UP[msg_idx], config.CITY, wther,
                config.BOT_HASHTAG)
        OAuth_UpdateTweet(msg)
        # Log the message only: ``wther`` is unbound after a failed fetch,
        # and the text already contains the weather when there is one.
        logging.info(msg)
    elif (ts_hour == 23) and (30 <= ts_min <= 32):
        # Good-night message (23:30-23:32).
        msg_idx = random.randint(0, len(config.MSG_SLEEP) - 1)
        msg = '%s%s' % (config.MSG_SLEEP[msg_idx], config.BOT_HASHTAG)
        OAuth_UpdateTweet(msg)
        logging.info(msg)
    elif ((7 <= ts_hour <= 23) and (15 <= ts_min <= 17)) or (dbug == 'cli'):
        # One command-line tip per hour (minutes 15-17).
        msg = command.random()
        if msg is not None:
            msg = msg.replace(
                "# commandlinefu.com by David Winterbottom\n\n#", "//")
            msg = '%s %s' % ("叮咚!小bot教CLI时间到了!", msg[:-1])
            msg += "#commandlinefu #xdlinux"
            logging.info(msg)
            OAuth_UpdateTweet(msg)

    # Scan the home timeline and retweet or reply.
    auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
    auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_SECRET)
    api = tweepy.API(auth)

    # NOTE(review): nothing in this block stores a new since_id; confirm
    # that it is persisted elsewhere.
    tweetid = SinceID.all().get()
    logging.info(tweetid)
    if tweetid is None:
        logging.warning("Initial!")
        tweetid = SinceID()
        timeline = api.home_timeline()
    else:
        logging.info("Since ID is: %d" % tweetid.since_id)
        timeline = api.home_timeline(since_id=tweetid.since_id)

    # ``regx`` is compiled as in the original but is not referenced below.
    regx = re.compile(config.RT_REGEX, re.I | re.M)
    mgc = re.compile(config.MGC, re.I | re.M)
    talk_to_me = re.compile(config.TALK, re.I | re.M)

    tweets = timeline[::-1]  # the timeline arrives in reverse order
    if not tweets:
        logging.info("no new tweets!")
        return

    seg = SEG()
    for tweet in tweets:
        # Start every tweet without a pending reply; otherwise a reply built
        # for an earlier tweet would be sent again instead of retweeting.
        msg = None
        user = tweet.user.screen_name
        if user == 'xdtuxbot':
            continue
        text = tweet.text
        if mgc.search(text) is not None:
            continue
        t = talk_to_me.search(text)
        # startswith() also copes with an empty text, where text[0] raised.
        if (not t) and text.startswith('@'):
            continue
        wlist = seg.cut(text.encode('utf-8'))
        logging.info(' '.join(wlist))
        # Only tweets that contain at least one RT_LIST word are handled.
        if not any(w in config.RT_LIST for w in wlist):
            continue
        if t:
            bot = TalkBot()
            reply = bot.respond(talk_to_me.sub("", text)).decode('UTF-8')
            if reply != '':
                msg = u"@%s %s" % (user, reply)
        try:
            if msg:
                OAuth_UpdateTweet(msg, reply=tweet.id)  # send to Twitter
                logging.info('Send Tweet: %s' % (msg))
            else:
                api.retweet(tweet.id)
        except tweepy.TweepError as e:
            logging.error('Tweepy Error:%s' % e)
        except Exception as e:
            logging.error('Uknow Error:%s' % e)
return math.log( len(documentList) / (0.01 + numDocsContaining(word, documentList))) def tfidf(document, documentList): retdict = {} for word in document: retdict[word] = document.count(word) / float(len(document)) * idf( word, documentList) return retdict if __name__ == '__main__': seg = SEG() documentList = [] documentList.append( seg.cut( """新华网布鲁塞尔3月24日电 (记者张伟)北约秘书长拉斯穆森24日晚宣布,北约成员国当天决定在利比亚设立禁飞区,北约将在数天内从美国手中接管对利比亚军事行动指挥权。 当天,北约28个成员国大使在布鲁塞尔举行会议,拉斯穆森在会后发表声明宣布了上述决定。 声明说,北约所采取的行动是“广泛国际行动的一部分”,旨在保护利比亚平民的安全。声明还说,北约成员国均致力于履行联合国安理会决议所规定的义务,“这也是北约决定承担禁飞区责任的原因”。 本月17日,联合国安理会通过第1973号决议,同意在利比亚设立禁飞区。从19日开始,法国、美国和英国等国对利比亚展开军事行动。目前,这一行动由美国指挥,但美方已明确表示希望在本周末把指挥权移交出去。 拉斯穆森24日晚在接受美国有线电视新闻网采访时说,北约已做好必要的准备,将在“未来数天内”从美国手中接管禁飞区的行动指挥权,行动将统归北约最高军事长官、欧洲盟军最高司令詹姆斯·斯塔夫里迪斯指挥。 拉斯穆森解释说,北约成员国目前只是决定执行设立禁飞区的任务,并正在考虑承担“更为广泛的责任”,但目前尚未做出决定。 北约22日决定对利比亚实施武器禁运,来自北约7个成员国的16艘海军舰艇参与这一行动。此前,土耳其、法国和德国一直反对北约在利比亚设立禁飞区,谈判一度陷入僵局。""" )) words = {} for document in documentList: words = tfidf(document, documentList) for item in sorted(words.items(), key=itemgetter(1)): print "%s : %f" % (item[0], item[1])
if word in document: count += 1 return count def idf(word, documentList): return math.log(len(documentList) / (0.01 + numDocsContaining(word,documentList))) def tfidf(document, documentList): retdict = {} for word in document: retdict[word] = document.count(word) / float(len(document)) * idf(word,documentList) return retdict if __name__ == '__main__': seg = SEG() documentList = [] documentList.append(seg.cut("""新华网布鲁塞尔3月24日电 (记者张伟)北约秘书长拉斯穆森24日晚宣布,北约成员国当天决定在利比亚设立禁飞区,北约将在数天内从美国手中接管对利比亚军事行动指挥权。 当天,北约28个成员国大使在布鲁塞尔举行会议,拉斯穆森在会后发表声明宣布了上述决定。 声明说,北约所采取的行动是“广泛国际行动的一部分”,旨在保护利比亚平民的安全。声明还说,北约成员国均致力于履行联合国安理会决议所规定的义务,“这也是北约决定承担禁飞区责任的原因”。 本月17日,联合国安理会通过第1973号决议,同意在利比亚设立禁飞区。从19日开始,法国、美国和英国等国对利比亚展开军事行动。目前,这一行动由美国指挥,但美方已明确表示希望在本周末把指挥权移交出去。 拉斯穆森24日晚在接受美国有线电视新闻网采访时说,北约已做好必要的准备,将在“未来数天内”从美国手中接管禁飞区的行动指挥权,行动将统归北约最高军事长官、欧洲盟军最高司令詹姆斯·斯塔夫里迪斯指挥。 拉斯穆森解释说,北约成员国目前只是决定执行设立禁飞区的任务,并正在考虑承担“更为广泛的责任”,但目前尚未做出决定。 北约22日决定对利比亚实施武器禁运,来自北约7个成员国的16艘海军舰艇参与这一行动。此前,土耳其、法国和德国一直反对北约在利比亚设立禁飞区,谈判一度陷入僵局。""")) words = {} for document in documentList: words = tfidf(document,documentList) for item in sorted(words.items(), key=itemgetter(1)): print "%s : %f" % (item[0], item[1])
#encoding=utf-8
# Speed benchmark for smallseg: cut the content of text.txt i times for
# i = 1..100 and print how long every round took.
try:
    import psyco
    psyco.full()
except Exception:
    # psyco is an optional accelerator; carry on without it when it cannot
    # be used. Unlike a bare ``except``, this lets Ctrl-C through.
    pass

# Read both input files through context managers so the handles are closed.
with open("text.txt") as text_file:
    s3 = text_file.read()
with open("main.dic") as dict_file:
    words = [x.rstrip() for x in dict_file]

from smallseg import SEG
seg = SEG()
print('Load dict...')
seg.set(words)
print("Dict is OK.")

from time import time
for i in range(1, 101):
    start = time()
    for j in range(0, i):
        A = seg.cut(s3)
    cost = time() - start
    print("%s times, cost: %s" % (i, cost))
    # NOTE(review): the collapsed source does not show whether this
    # separator is printed after every round or once at the end; per round
    # is assumed here.
    print("********************************")
# May 12: tokenize commenters with more than 1k.
# Jun 23: tokenize all renewed commenters.
# Keep the previous segmentation of celebrity users for future use.
import os
from smallseg import SEG
seg = SEG()
all_user_folder = "../CommentUser/"
for user_name in os.listdir(all_user_folder):
    user_file = os.path.join(all_user_folder, user_name)
    user_all_text = user_name + '_text'  # not used below
    user_tokenized_text = user_name + '_tokenized'
    tokenized_path = os.path.join("../CommentUserTokenized",
                                  user_tokenized_text)
    with open(user_file, 'r') as user_text_corpus:
        # Open the output once per user and let ``with`` close it. The
        # original reopened it in append mode for every line and never
        # closed any of the handles. The appended content is the same; the
        # only difference is that an empty input now leaves an empty output
        # file behind.
        with open(tokenized_path, 'a') as fout:
            for line in user_text_corpus:
                # Every line of tweets yields a list of words.
                wlist = seg.cut(line)
                # presumably cut() returns the words back to front - confirm
                wlist.reverse()
                tmp = " ".join(wlist)
                fout.write(tmp.encode('utf-8'))
def get(self):
    """Serve one cron tick: tweet what is due, then process the home timeline.

    A morning greeting, a good-night message or a command-line tip is
    tweeted when the current time (UTC plus eight hours) falls into the
    matching window or when the ``debug`` request parameter asks for it.
    The home timeline is then fetched; each tweet that contains a word from
    ``config.RT_LIST`` is answered via ``TalkBot`` (if it matches
    ``config.TALK`` and a non-empty reply came back) or retweeted otherwise.
    """
    # NOTE(review): because the flag is initialised to True, the header
    # check below never denies a request. It is kept unchanged because the
    # ``debug`` parameter suggests manual calls are intended; confirm
    # whether the initial value should be False.
    Access_CronJob = True
    headers = self.request.headers.items()
    for key, value in headers:
        if (key == 'X-Appengine-Cron') and (value == 'true'):
            Access_CronJob = True
            break
    # Requests that did not originate from the cron job are logged and
    # abandoned.
    if not Access_CronJob:
        logging.debug('CronJobCheck() access denied!')
        logging.critical('如果这个请求不是由你手动触发的话,这意味者你的CronJobKey已经泄漏!请立即修改CronJobKey以防被他人利用')
        return

    mydate = datetime.utcnow() + timedelta(hours=+8)
    ts_hour = mydate.time().hour
    ts_min = mydate.time().minute
    dbug = self.request.get('debug')
    logging.debug(dbug)

    if ((ts_hour == 7) and (30 <= ts_min <= 32)) or (dbug == 'morning'):
        # Good-morning tweet (7:30-7:32), including the weather if fetched.
        error = False
        try:
            wther = weather.weather()
        except weather.FetchError:
            logging.error("Weather Fetch Error!")
            error = True
        msg_idx = random.randint(0, len(config.MSG_GET_UP) - 1)
        if error:
            msg = '%s%s' % (config.MSG_GET_UP[msg_idx], config.BOT_HASHTAG)
        else:
            msg = '%s 今天%s的天气是:%s %s' % (
                config.MSG_GET_UP[msg_idx], config.CITY, wther,
                config.BOT_HASHTAG)
        OAuth_UpdateTweet(msg)
        # ``wther`` does not exist after a failed fetch, so only the message
        # (which already embeds the weather) is logged.
        logging.info(msg)
    elif (ts_hour == 23) and (30 <= ts_min <= 32):
        # Good-night tweet (23:30-23:32).
        msg_idx = random.randint(0, len(config.MSG_SLEEP) - 1)
        msg = '%s%s' % (config.MSG_SLEEP[msg_idx], config.BOT_HASHTAG)
        OAuth_UpdateTweet(msg)
        logging.info(msg)
    elif ((7 <= ts_hour <= 23) and (15 <= ts_min <= 17)) or (dbug == 'cli'):
        # Hourly command-line tip (minutes 15-17).
        msg = command.random()
        if msg is not None:
            msg = msg.replace(
                "# commandlinefu.com by David Winterbottom\n\n#", "//")
            msg = '%s %s' % ("叮咚!小bot教CLI时间到了!", msg[:-1])
            msg += "#commandlinefu #xdlinux"
            logging.info(msg)
            OAuth_UpdateTweet(msg)

    # Walk the home timeline and retweet or reply.
    auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
    auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_SECRET)
    api = tweepy.API(auth)

    # NOTE(review): this block never writes a new since_id; confirm that it
    # is saved somewhere else.
    tweetid = SinceID.all().get()
    logging.info(tweetid)
    if tweetid is None:
        logging.warning("Initial!")
        tweetid = SinceID()
        timeline = api.home_timeline()
    else:
        logging.info("Since ID is: %d" % tweetid.since_id)
        timeline = api.home_timeline(since_id=tweetid.since_id)

    # ``regx`` is compiled as before although nothing below refers to it.
    regx = re.compile(config.RT_REGEX, re.I | re.M)
    mgc = re.compile(config.MGC, re.I | re.M)
    talk_to_me = re.compile(config.TALK, re.I | re.M)

    tweets = timeline[::-1]  # the timeline is delivered in reverse order
    if not tweets:
        logging.info("no new tweets!")
        return

    seg = SEG()
    for tweet in tweets:
        # Reset the pending reply for every tweet so that a reply composed
        # for an earlier tweet is not sent again in place of a retweet.
        msg = None
        user = tweet.user.screen_name
        if user == 'xdtuxbot':
            continue
        text = tweet.text
        if mgc.search(text) is not None:
            continue
        t = talk_to_me.search(text)
        # startswith() is safe on an empty text, unlike text[0].
        if (not t) and text.startswith('@'):
            continue
        wlist = seg.cut(text.encode('utf-8'))
        logging.info(' '.join(wlist))
        # Skip tweets that contain none of the RT_LIST words.
        if not any(w in config.RT_LIST for w in wlist):
            continue
        if t:
            bot = TalkBot()
            reply = bot.respond(talk_to_me.sub("", text)).decode('UTF-8')
            if reply != '':
                msg = u"@%s %s" % (user, reply)
        try:
            if msg:
                OAuth_UpdateTweet(msg, reply=tweet.id)  # send to Twitter
                logging.info('Send Tweet: %s' % (msg))
            else:
                api.retweet(tweet.id)
        except tweepy.TweepError as e:
            logging.error('Tweepy Error:%s' % e)
        except Exception as e:
            logging.error('Uknow Error:%s' % e)