Пример #1
0
def seg(text):
    """Count how often each segmented word of ``text`` occurs.

    ``text`` is cut with a freshly created ``SEG`` instance. Words whose
    length is below 2 are ignored.

    Returns:
        The ``items()`` of a dict that maps every remaining word to the
        number of times it was produced by the segmenter.
    """
    segmenter = SEG()
    word_nums = {}

    for w in segmenter.cut(text):
        if len(w) < 2:
            continue
        # dict.get() instead of dict.has_key(): has_key() is deprecated and
        # does not exist on Python 3 dictionaries.
        word_nums[w] = word_nums.get(w, 0) + 1

    return word_nums.items()
Пример #2
0
def seg(text):
    """Build a word-frequency table for ``text``.

    The text is segmented by a new ``SEG`` object; every word of length 2
    or more is counted.

    Returns:
        ``word_nums.items()`` -- the (word, count) pairs of the table.
    """
    cutter = SEG()
    word_nums = {}

    for w in cutter.cut(text):
        if len(w) < 2:
            continue
        # Avoid dict.has_key(): it is deprecated and was removed in
        # Python 3. get() with a default does the same job in one lookup.
        word_nums[w] = word_nums.get(w, 0) + 1

    return word_nums.items()
 class ChineseTokenizer():
     """Adapter that exposes a ``tokenize`` method on top of a ``SEG`` object.

     It exists so the segmenter can be handed to code that only expects an
     object with ``tokenize(text)`` (duck typing).
     """

     def __init__(self):
         # One segmenter per tokenizer instance.
         self._seg = SEG()

     def tokenize(self, text):
         """Return the result of ``self._seg.cut(text)`` unchanged."""
         # The word list is intentionally not reversed: the original author
         # assumed a bag-of-words model, where order does not matter.
         return self._seg.cut(text)
Пример #4
0
class SearchChinese(SearchLanguage):
    """``SearchLanguage`` subclass for Chinese, backed by a ``SEG`` segmenter."""

    lang = 'zh'

    def init(self, options):
        """Create the segmenter; ``options`` is accepted but not used here."""
        print("reading Chiniese dictionary")
        self.seg = SEG()

    def split(self, input):
        """Encode ``input`` as UTF-8 and return the segmenter's cut of it."""
        utf8_input = input.encode("utf8")
        return self.seg.cut(utf8_input)

    def word_filter(self, stemmed_word):
        """Accept only words whose length is at least 2."""
        return len(stemmed_word) >= 2
Пример #5
0
class SearchChinese(SearchLanguage):
    """Chinese language support that segments text with ``SEG``."""

    lang = 'zh'

    def init(self, options):
        """Set up ``self.seg``; the ``options`` argument is not read."""
        print("reading Chiniese dictionary")
        self.seg = SEG()

    def split(self, input):
        """Return ``self.seg.cut`` applied to the UTF-8 encoding of ``input``."""
        encoded = input.encode("utf8")
        words = self.seg.cut(encoded)
        return words

    def word_filter(self, stemmed_word):
        """Tell whether ``stemmed_word`` is longer than one element."""
        return len(stemmed_word) >= 2
def get_data():
    '''
    Load the labelled text corpus from two local folders.

    Every file in the positive folder and in the negative folder is read,
    segmented with the smallseg ``SEG`` tool (configured with the
    module-level ``dic``) and reduced to the words found in the
    module-level ``pornDict``.

    Returns:
        A 2-tuple ``(trainingData, trainingData)`` -- the very same list
        object twice. Each list entry is ``(words, label)``, where ``words``
        is a list of UTF-8 encoded words and ``label`` is ``'Positive'`` or
        ``'Negative'``.
    '''
    posPath = '/home/zhouxc/skindetector/AdultWebsiteText/'
    negPath = '/home/zhouxc/skindetector/NormalWebsiteText/'
    # Both folders are listed before any file is read, so a missing folder
    # fails immediately.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    trainingData = []
    seg = SEG()
    seg.set(dic)

    # One row per dataset:
    # (folder, file names, label, progress prefix, start banner, end banner)
    datasets = (
        (posPath, posFiles, 'Positive', "PositiveData",
         '---------------------Read Positive DataSet-----------------',
         '---------------------Positive DataSet done-----------------'),
        (negPath, negFiles, 'Negative', "NegativeData",
         '---------------------Read Negative DataSet-----------------',
         '--------Negative DataSet  done-----------------------------------'),
    )

    for folder, fileNames, label, prefix, startBanner, doneBanner in datasets:
        print(startBanner)
        for c, fileName in enumerate(fileNames, 1):
            print(prefix + str(c))
            # The with-block closes each file as soon as it has been read.
            with open(os.path.join(folder, fileName)) as handle:
                data = seg.cut(handle.read())
            # Encode every word once, then keep only the dictionary words.
            encoded = [word.encode('utf-8') for word in data]
            text = [word for word in encoded if word in pornDict]
            trainingData.append((text, label))
        print(doneBanner)

    return trainingData, trainingData
Пример #7
0
def get_data():
    '''
    Read the positive and negative text folders into one labelled list.

    Each file is read, cut into words by a smallseg ``SEG`` instance
    (set up with the module-level ``dic``), and filtered down to the words
    contained in the module-level ``pornDict``.

    Returns:
        ``(trainingData, trainingData)``: the same list is returned for
        both positions. Its entries are ``(words, label)`` tuples with
        ``words`` a list of UTF-8 encoded words and ``label`` either
        ``'Positive'`` or ``'Negative'``.
    '''
    posPath = '/home/zhouxc/skindetector/AdultWebsiteText/'
    negPath = '/home/zhouxc/skindetector/NormalWebsiteText/'
    # List both folders first so that a missing folder is reported before
    # any file has been processed.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    trainingData = []
    seg = SEG()
    seg.set(dic)

    def load(folder, fileNames, label, prefix):
        # Append one (words, label) entry per file of a single dataset.
        for c, fileName in enumerate(fileNames, 1):
            print(prefix + str(c))
            # Close the file right after reading instead of leaking it.
            with open(os.path.join(folder, fileName)) as handle:
                data = seg.cut(handle.read())
            # A single encode per word; the filter reuses the encoded form.
            encoded = [word.encode('utf-8') for word in data]
            text = [word for word in encoded if word in pornDict]
            trainingData.append((text, label))

    print('---------------------Read Positive DataSet-----------------')
    load(posPath, posFiles, 'Positive', "PositiveData")
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    load(negPath, negFiles, 'Negative', "NegativeData")
    print('--------Negative DataSet  done-----------------------------------')

    return trainingData, trainingData
Пример #8
0
 def init(self, options):
     """Create a ``SEG`` segmenter and store it on ``self.seg``.

     ``options`` is accepted but not read by this method.
     """
     print("reading Chiniese dictionary")
     self.seg = SEG()
Пример #9
0
def idf(word, documentList):
    """Return the smoothed inverse document frequency of ``word``.

    The value is ``log(N / (0.01 + n))`` with ``N`` the number of documents
    and ``n`` the result of ``numDocsContaining(word, documentList)``.
    """
    total_docs = len(documentList)
    matching_docs = numDocsContaining(word, documentList)
    # The 0.01 offset keeps the divisor non-zero when no document matches.
    ratio = total_docs / (0.01 + matching_docs)
    return math.log(ratio)


def tfidf(document, documentList):
    """Compute a TF-IDF weight for every distinct word of ``document``.

    Args:
        document: sequence of words (one segmented document).
        documentList: collection of documents, passed through to ``idf``.

    Returns:
        dict mapping each distinct word to
        ``(occurrences / len(document)) * idf(word, documentList)``.
        An empty document yields an empty dict.
    """
    # Count all words in a single pass. Calling document.count(word) for
    # every word would rescan the whole document each time (quadratic).
    counts = {}
    for word in document:
        counts[word] = counts.get(word, 0) + 1

    total = float(len(document))
    retdict = {}
    # idf() is evaluated once per distinct word rather than per occurrence.
    for word, count in counts.items():
        retdict[word] = count / total * idf(word, documentList)
    return retdict


if __name__ == '__main__':
    seg = SEG()
    documentList = []
    documentList.append(
        seg.cut(
            """新华网布鲁塞尔3月24日电 (记者张伟)北约秘书长拉斯穆森24日晚宣布,北约成员国当天决定在利比亚设立禁飞区,北约将在数天内从美国手中接管对利比亚军事行动指挥权。
  当天,北约28个成员国大使在布鲁塞尔举行会议,拉斯穆森在会后发表声明宣布了上述决定。
  声明说,北约所采取的行动是“广泛国际行动的一部分”,旨在保护利比亚平民的安全。声明还说,北约成员国均致力于履行联合国安理会决议所规定的义务,“这也是北约决定承担禁飞区责任的原因”。
  本月17日,联合国安理会通过第1973号决议,同意在利比亚设立禁飞区。从19日开始,法国、美国和英国等国对利比亚展开军事行动。目前,这一行动由美国指挥,但美方已明确表示希望在本周末把指挥权移交出去。
  拉斯穆森24日晚在接受美国有线电视新闻网采访时说,北约已做好必要的准备,将在“未来数天内”从美国手中接管禁飞区的行动指挥权,行动将统归北约最高军事长官、欧洲盟军最高司令詹姆斯·斯塔夫里迪斯指挥。
  拉斯穆森解释说,北约成员国目前只是决定执行设立禁飞区的任务,并正在考虑承担“更为广泛的责任”,但目前尚未做出决定。
  北约22日决定对利比亚实施武器禁运,来自北约7个成员国的16艘海军舰艇参与这一行动。此前,土耳其、法国和德国一直反对北约在利比亚设立禁飞区,谈判一度陷入僵局。"""
        ))
    words = {}
    for document in documentList:
        words = tfidf(document, documentList)
    for item in sorted(words.items(), key=itemgetter(1)):
Пример #10
0
 def init(self, options):
     """Create a ``SEG`` segmenter and keep it as ``self.seg``.

     The ``options`` argument is not used inside this method.
     """
     print ("reading Chiniese dictionary")
     self.seg = SEG() 
Пример #11
0
                    + str(i) \
                    + '''')" href="'''\
                    + url \
                    + '''" target="_blank"><font size="3">''' \
                    + arrowscript \
                    + title \
                    + '''</font></a><br /><font size="-1">''' \
                    + snippet \
                    + '''<br /><font color="#008000">''' \
                    + url \
                    + '''<br /></font></font></td></tr></table>\n'''
        pageStr += resultStr
        i += 1

    return pageStr


if __name__ == '__main__':
    # Disabled manual test input: resultsList used to hold four short
    # Chinese sample sentences.
    # Create the segmenter and hand it the dictionary.
    seg = SEG()
    #print 'Load dict...'
    # NOTE(review): seg.set() is given the file name itself, not the words
    # read from that file -- confirm this is what SEG.set expects.
    words = "main.dic"
    seg.set(words)
    #print "Dict is OK."

    #print psudorerank(resultsList, 2)
    username = "******"
    # NOTE(review): `request` is not defined anywhere in the visible code;
    # presumably it is supplied by a web framework -- verify.
    engine = request.GET.get("engine", "")
    # Pick the results table for the selected engine and re-rank it.
    resultsTable = ResultInfoTable[engine]
    [query, pagecontent] = userFeedbackRerank(username, resultsTable, seg)
Пример #12
0
#encoding=utf-8
#import psyco
#psyco.full()

import codecs
from smallseg import SEG
# Module-level segmenter instance; cuttest() below calls seg.cut() on it.
seg = SEG()


def cuttest(text):
    """Segment ``text`` and print the words in reverse of the returned order.

    The words are joined with single spaces on one line, followed by a
    separator line.
    """
    tokens = seg.cut(text)
    tokens.reverse()
    print(" ".join(tokens))
    print("================================")


if __name__ == "__main__":
    # Read the whole sample file. The with-block closes the handle, which
    # ''.join(open(...)) never did.
    with open('wiki_test', 'r') as wiki_file:
        text = wiki_file.read()
    cuttest(text)
Пример #13
0
#encoding=utf-8
try:
    import psyco
    psyco.full()
except:
    pass

s3 = file("text.txt").read()
words = [x.rstrip() for x in file("main.dic") ]
from smallseg import SEG
seg = SEG()
print 'Load dict...'
seg.set(words)
print "Dict is OK."
from time import time

for i in xrange(1,101):
    start = time()
    for j in xrange(0,i):
        A = seg.cut(s3)
    cost = time()-start
    print i,"times, cost:",cost

print "********************************"

Пример #14
0
#May12 Tokenize Commenters bigger then 1k
#Jun23 Tokenize all renewed Commenters
#keep previous segmentation on celebrity users for future usage
import os
from smallseg import SEG
seg = SEG()

all_user_folder = "../CommentUser/"
for user_name in os.listdir(all_user_folder):
    user_file = os.path.join(all_user_folder, user_name)
    # NOTE(review): user_all_text is not used in the visible code.
    user_all_text = user_name + '_text'
    user_tokenized_text = user_name + '_tokenized'
    tokenized_path = os.path.join("../CommentUserTokenized", user_tokenized_text)
    with open(user_file,'r') as user_text_corpus:
        # Open the output once per user (append mode) and close it when the
        # user is done. Opening a new handle for every input line leaked
        # one file descriptor per line. Side effect: a user file with no
        # lines now produces an empty output file.
        with open(tokenized_path, 'a') as fout:
            for line in user_text_corpus:
                # Each input line is segmented into a word list.
                wlist = seg.cut(line)
                wlist.reverse()
                tmp = " ".join(wlist)
                fout.write(tmp.encode('utf-8'))
 def __init__(self):
     """Create the ``SEG`` segmenter and keep it on the instance."""
     self._seg = SEG()
Пример #16
0
    def get(self):
        Access_CronJob = True
        headers = self.request.headers.items()

        for key, value in headers:
            if (key == 'X-Appengine-Cron') and (value == 'true'):
                Access_CronJob = True
                break
        # 如果不是CronJob来源的请求,记录日志并放弃操作
        if (not Access_CronJob):
            logging.debug('CronJobCheck() access denied!')
            logging.critical(
                '如果这个请求不是由你手动触发的话,这意味者你的CronJobKey已经泄漏!请立即修改CronJobKey以防被他人利用')
            return

        mydate = datetime.utcnow() + timedelta(hours=+8)
        ts_hour = mydate.time().hour
        ts_min = mydate.time().minute

        dbug = self.request.get('debug')
        logging.debug(dbug)

        # 7:00早安世界
        if (((ts_hour == 7) and (30 <= ts_min <= 32))
                or (dbug == 'morning')):  # 7:00
            error = False
            try:
                wther = weather.weather()
            except weather.FetchError:
                logging.error("Weather Fetch Error!")
                error = True
            msg_idx = random.randint(0, len(config.MSG_GET_UP) - 1)
            if error:
                msg = '%s%s' % (config.MSG_GET_UP[msg_idx], config.BOT_HASHTAG)
            else:
                msg = '%s 今天%s的天气是:%s %s' % \
                    (config.MSG_GET_UP[msg_idx], config.CITY, wther, config.BOT_HASHTAG)

            OAuth_UpdateTweet(msg)  # 早安世界
            logging.info("%s:%d" % (msg, wther))

        # 23:30 晚安世界
        elif ((ts_hour == 23) and (30 <= ts_min <= 32)):  # 23:30
            msg_idx = random.randint(0, len(config.MSG_SLEEP) - 1)
            msg = '%s%s' % (config.MSG_SLEEP[msg_idx], config.BOT_HASHTAG)
            OAuth_UpdateTweet(msg)  # 晚安世界
            logging.info(msg)

        # 每小时一条命令
        elif (((7 <= ts_hour <= 23) and (15 <= ts_min <= 17))
              or (dbug == 'cli')):
            msg = command.random()
            if msg != None:
                msg = msg.replace(
                    "# commandlinefu.com by David Winterbottom\n\n#", "//")
                msg = '%s %s' % ("叮咚!小bot教CLI时间到了!", msg[:-1])
                msg += "#commandlinefu #xdlinux"
                logging.info(msg)
                OAuth_UpdateTweet(msg)

        # 扫TL,转推
        auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
        auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_SECRET)
        api = tweepy.API(auth)

        #since id
        tweetid = SinceID.all().get()
        logging.info(tweetid)
        if (tweetid == None):
            logging.warning("Initial!")
            tweetid = SinceID()
            timeline = api.home_timeline()
        else:
            logging.info("Since ID is: %d" % tweetid.since_id)
            timeline = api.home_timeline(since_id=tweetid.since_id)

        #self.response.out.write('GETTING TIMELINE<br />')
        regx = re.compile(config.RT_REGEX, re.I | re.M)
        mgc = re.compile(config.MGC, re.I | re.M)
        talk_to_me = re.compile(config.TALK, re.I | re.M)
        tweets = timeline[::-1]  # 时间是倒序的
        if tweets == []:
            logging.info("no new tweets!")
            return

        msg = None
        seg = SEG()
        for tweet in tweets:
            user = tweet.user.screen_name
            if user == 'xdtuxbot':
                continue
            text = tweet.text
            n = mgc.search(text)
            if n != None:
                continue

            t = talk_to_me.search(text)
            if (not t) and text[0] == '@':
                continue

            wlist = seg.cut(text.encode('utf-8'))
            logging.info(' '.join(wlist))
            for w in wlist:
                if w in config.RT_LIST:
                    break
            else:
                continue

            if t:
                bot = TalkBot()
                reply = bot.respond(talk_to_me.sub("", text)).decode('UTF-8')
                if reply != '':
                    msg = u"@%s %s" % (user, reply)

            try:
                if msg:
                    OAuth_UpdateTweet(msg, reply=tweet.id)  # 发送到Twitter
                    logging.info('Send Tweet: %s' % (msg))
                else:
                    api.retweet(tweet.id)
            except tweepy.TweepError, e:
                logging.error('Tweepy Error:%s' % e)
            except Exception, e:
                logging.error('Uknow Error:%s' % e)
Пример #17
0
        if word in document:
            count += 1
    return count

def idf(word, documentList):
    """Return ``log(N / (0.01 + n))`` for ``word``.

    ``N`` is ``len(documentList)`` and ``n`` is the value returned by
    ``numDocsContaining(word, documentList)``.
    """
    doc_total = len(documentList)
    docs_with_word = numDocsContaining(word, documentList)
    # Adding 0.01 keeps the divisor away from zero.
    divisor = 0.01 + docs_with_word
    return math.log(doc_total / divisor)

def tfidf(document, documentList):
    """Return the TF-IDF weight of each distinct word in ``document``.

    Args:
        document: sequence of words making up one document.
        documentList: the documents handed on to ``idf``.

    Returns:
        dict of ``word -> (count / len(document)) * idf(word, documentList)``.
        The dict is empty when ``document`` is empty.
    """
    # Tally the words in one pass rather than calling document.count(word)
    # per word, which rescans the document every time (quadratic).
    tally = {}
    for word in document:
        tally[word] = tally.get(word, 0) + 1

    doc_length = float(len(document))
    retdict = {}
    # Each distinct word needs its idf() only once.
    for word, occurrences in tally.items():
        retdict[word] = occurrences / doc_length * idf(word, documentList)
    return retdict
        

if __name__ == '__main__':
    # Demo: segment one Chinese news article and print the TF-IDF weight of
    # every word, lowest weight first.
    seg = SEG()
    documentList = [seg.cut("""新华网布鲁塞尔3月24日电 (记者张伟)北约秘书长拉斯穆森24日晚宣布,北约成员国当天决定在利比亚设立禁飞区,北约将在数天内从美国手中接管对利比亚军事行动指挥权。
  当天,北约28个成员国大使在布鲁塞尔举行会议,拉斯穆森在会后发表声明宣布了上述决定。
  声明说,北约所采取的行动是“广泛国际行动的一部分”,旨在保护利比亚平民的安全。声明还说,北约成员国均致力于履行联合国安理会决议所规定的义务,“这也是北约决定承担禁飞区责任的原因”。
  本月17日,联合国安理会通过第1973号决议,同意在利比亚设立禁飞区。从19日开始,法国、美国和英国等国对利比亚展开军事行动。目前,这一行动由美国指挥,但美方已明确表示希望在本周末把指挥权移交出去。
  拉斯穆森24日晚在接受美国有线电视新闻网采访时说,北约已做好必要的准备,将在“未来数天内”从美国手中接管禁飞区的行动指挥权,行动将统归北约最高军事长官、欧洲盟军最高司令詹姆斯·斯塔夫里迪斯指挥。
  拉斯穆森解释说,北约成员国目前只是决定执行设立禁飞区的任务,并正在考虑承担“更为广泛的责任”,但目前尚未做出决定。
  北约22日决定对利比亚实施武器禁运,来自北约7个成员国的16艘海军舰艇参与这一行动。此前,土耳其、法国和德国一直反对北约在利比亚设立禁飞区,谈判一度陷入僵局。""")]
    words = {}
    # With several documents only the weights of the last one are kept.
    for document in documentList:
        words = tfidf(document,documentList)
    ranked = sorted(words.items(), key=itemgetter(1))
    for item in ranked:
        print("%s : %f" % (item[0], item[1]))
     
Пример #18
0
  def get(self):
    Access_CronJob = True
    headers = self.request.headers.items()
     
    for key, value in headers:
      if (key == 'X-Appengine-Cron') and (value == 'true'):
        Access_CronJob = True
        break
    # 如果不是CronJob来源的请求,记录日志并放弃操作
    if (not Access_CronJob):
      logging.debug('CronJobCheck() access denied!')
      logging.critical('如果这个请求不是由你手动触发的话,这意味者你的CronJobKey已经泄漏!请立即修改CronJobKey以防被他人利用')
      return
    
    mydate = datetime.utcnow() + timedelta(hours=+8)
    ts_hour = mydate.time().hour
    ts_min = mydate.time().minute
    
    dbug = self.request.get('debug')
    logging.debug(dbug)
    
    # 7:00早安世界
    if (((ts_hour == 7) and ( 30 <= ts_min <= 32)) or (dbug=='morning')): # 7:00
        error = False
        try:
            wther=weather.weather()
        except weather.FetchError:
            logging.error("Weather Fetch Error!")
            error = True
        msg_idx=random.randint(0,len(config.MSG_GET_UP)-1)
        if error:
            msg = '%s%s' % (config.MSG_GET_UP[msg_idx],config.BOT_HASHTAG)
        else:
            msg = '%s 今天%s的天气是:%s %s' % \
                (config.MSG_GET_UP[msg_idx], config.CITY, wther, config.BOT_HASHTAG)
        
        OAuth_UpdateTweet(msg)                        # 早安世界
        logging.info("%s:%d" % (msg,wther))
   

    # 23:30 晚安世界
    elif ((ts_hour == 23) and (30 <= ts_min <=32)):    # 23:30
        msg_idx=random.randint(0,len(config.MSG_SLEEP)-1)
        msg = '%s%s' % (config.MSG_SLEEP[msg_idx], config.BOT_HASHTAG)
        OAuth_UpdateTweet(msg)                        # 晚安世界
        logging.info(msg)
  
    # 每小时一条命令
    elif (((7<=ts_hour<=23) and (15<=ts_min<=17)) or (dbug=='cli')):
        msg = command.random()
        if msg != None:
            msg = msg.replace("# commandlinefu.com by David Winterbottom\n\n#","//")
            msg = '%s %s' % ( "叮咚!小bot教CLI时间到了!", msg[:-1])
            msg +="#commandlinefu #xdlinux"
            logging.info(msg)
            OAuth_UpdateTweet(msg)

    # 扫TL,转推
    auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
    auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_SECRET)
    api = tweepy.API(auth)
    
    #since id
    tweetid=SinceID.all().get()
    logging.info(tweetid)
    if ( tweetid == None ):
        logging.warning("Initial!")
        tweetid=SinceID()
        timeline = api.home_timeline()
    else:
        logging.info("Since ID is: %d" % tweetid.since_id)
        timeline = api.home_timeline(since_id=tweetid.since_id)
    
    #self.response.out.write('GETTING TIMELINE<br />')
    regx=re.compile(config.RT_REGEX,re.I|re.M)
    mgc = re.compile(config.MGC,re.I|re.M)
    talk_to_me = re.compile(config.TALK,re.I|re.M)
    tweets=timeline[::-1]   # 时间是倒序的
    if tweets == []:
        logging.info("no new tweets!")
        return

    msg=None 
    seg = SEG()
    for tweet in tweets:
        user = tweet.user.screen_name
        if user == 'xdtuxbot':
            continue
        text = tweet.text
        n = mgc.search(text)
        if n != None:
            continue
        
        t = talk_to_me.search(text)
        if (not t) and text[0]=='@':
            continue

        wlist = seg.cut(text.encode('utf-8')) 
        logging.info( ' '.join(wlist) ) 
        for w in wlist:
            if w in config.RT_LIST:
                break
        else:
            continue

        if t:
            bot = TalkBot()
            reply = bot.respond( talk_to_me.sub("",text) ).decode('UTF-8')
            if reply != '': 
                msg = u"@%s %s" % (user, reply)    
         
        try:
            if msg:
                OAuth_UpdateTweet(msg,reply=tweet.id)           # 发送到Twitter
                logging.info('Send Tweet: %s' % (msg))
            else:
                api.retweet(tweet.id) 
        except tweepy.TweepError, e:
            logging.error('Tweepy Error:%s' % e)
        except Exception, e:
            logging.error('Uknow Error:%s' % e)