Пример #1
0
def seg(text):
    """Count the occurrences of each segmented word in ``text``.

    ``text`` is split with ``SEG().cut``; words whose ``len()`` is below 2
    are ignored.

    Returns:
        The ``items()`` of a dict mapping each remaining word to the number
        of times it occurred.
    """
    # Named ``segmenter`` so the local does not shadow this function's name.
    segmenter = SEG()
    word_nums = {}

    for w in segmenter.cut(text):
        if len(w) < 2:
            continue
        # dict.get replaces dict.has_key, which no longer exists in Python 3.
        word_nums[w] = word_nums.get(w, 0) + 1

    return word_nums.items()
Пример #2
0
def _load_labelled_texts(seg, dirPath, fileNames, label, progressTag):
    """Segment each listed file in ``dirPath`` and label the result.

    Args:
        seg: segmenter object exposing ``cut(text)``.
        dirPath: directory that contains ``fileNames``.
        fileNames: names of the files to read, relative to ``dirPath``.
        label: label stored with every sample.
        progressTag: prefix of the per-file progress line that is printed.

    Returns:
        A list of ``(words, label)`` tuples, one per file, where ``words``
        holds the UTF-8 encoded segments that are present in ``pornDict``.
    """
    samples = []
    for c, fileName in enumerate(fileNames, 1):
        print(progressTag + str(c))
        # Close every file right after reading it so that large corpora do
        # not exhaust the process's file descriptors.
        with open(os.path.join(dirPath, fileName)) as f:
            data = seg.cut(f.read())
        # Encode each word once, then keep only the dictionary words.
        encoded = (word.encode('utf-8') for word in data)
        text = [word for word in encoded if word in pornDict]
        samples.append((text, label))
    return samples


def get_data():
    """Load the positive and negative text corpora from local folders.

    Positive and negative samples are stored in different folders. Every
    file is segmented with the ``SEG`` tool (configured with ``dic``) and
    reduced to the words found in ``pornDict``.

    Returns:
        A 2-tuple whose two elements are the *same* list of
        ``(words, label)`` samples, positives ('Positive') first, then
        negatives ('Negative').
        NOTE(review): returning one list twice looks like a placeholder for
        a train/test split -- confirm with the callers.
    """
    posPath = '/home/zhouxc/skindetector/AdultWebsiteText/'
    negPath = '/home/zhouxc/skindetector/NormalWebsiteText/'
    # Both folders are listed up front, so a missing folder fails before any
    # file is processed.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    seg = SEG()
    seg.set(dic)

    print('---------------------Read Positive DataSet-----------------')
    trainingData = _load_labelled_texts(
        seg, posPath, posFiles, 'Positive', "PositiveData")
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    trainingData.extend(_load_labelled_texts(
        seg, negPath, negFiles, 'Negative', "NegativeData"))
    print('--------Negative DataSet  done-----------------------------------')

    return trainingData, trainingData
Пример #3
0
    def get(self):
        """Run one scheduled pass of the bot: timed tweets, then a timeline scan.

        The current time is computed as UTC+8. Depending on it (or on the
        ``debug`` request parameter) at most one scheduled tweet is posted:

        * 07:30-07:32 or ``debug=morning``: a random ``config.MSG_GET_UP``
          greeting, with the weather appended when it could be fetched.
        * 23:30-23:32: a random ``config.MSG_SLEEP`` message.
        * minutes 15-17 of the hours 7-23 or ``debug=cli``: a command
          returned by ``command.random()``.

        Afterwards the home timeline is fetched (only tweets newer than the
        stored ``SinceID`` when one exists). Tweets are skipped when they
        come from 'xdtuxbot', match ``config.MGC``, start with '@' without
        matching ``config.TALK``, or contain no segmented word listed in
        ``config.RT_LIST``. Every remaining tweet is answered through
        ``TalkBot`` when it matches ``config.TALK`` and the reply is not
        empty; otherwise it is retweeted.
        """
        # NOTE(review): the flag starts as True, so the denial branch below
        # can never run and the X-Appengine-Cron check is effectively
        # disabled. Starting from False would enforce it, but would also
        # reject manual ``debug=...`` requests -- confirm the intended policy.
        Access_CronJob = True
        headers = self.request.headers.items()

        for key, value in headers:
            if (key == 'X-Appengine-Cron') and (value == 'true'):
                Access_CronJob = True
                break
        # If the request did not come from the cron job, log it and give up.
        if not Access_CronJob:
            logging.debug('CronJobCheck() access denied!')
            logging.critical(
                '如果这个请求不是由你手动触发的话,这意味者你的CronJobKey已经泄漏!请立即修改CronJobKey以防被他人利用')
            return

        mydate = datetime.utcnow() + timedelta(hours=+8)
        ts_hour = mydate.time().hour
        ts_min = mydate.time().minute

        dbug = self.request.get('debug')
        logging.debug(dbug)

        # Good-morning tweet (the window checked is 07:30-07:32).
        if (((ts_hour == 7) and (30 <= ts_min <= 32))
                or (dbug == 'morning')):
            error = False
            # Pre-bind so the log line below cannot hit an unbound name when
            # the weather fetch fails.
            wther = None
            try:
                wther = weather.weather()
            except weather.FetchError:
                logging.error("Weather Fetch Error!")
                error = True
            greeting = random.choice(config.MSG_GET_UP)
            if error:
                msg = '%s%s' % (greeting, config.BOT_HASHTAG)
            else:
                msg = '%s 今天%s的天气是:%s %s' % (
                    greeting, config.CITY, wther, config.BOT_HASHTAG)

            OAuth_UpdateTweet(msg)  # good morning, world
            # %s instead of %d: the weather is formatted as text in the tweet
            # above, and %d raises TypeError for non-numeric values.
            logging.info("%s:%s" % (msg, wther))

        # Good-night tweet at 23:30.
        elif (ts_hour == 23) and (30 <= ts_min <= 32):
            msg = '%s%s' % (random.choice(config.MSG_SLEEP),
                            config.BOT_HASHTAG)
            OAuth_UpdateTweet(msg)  # good night, world
            logging.info(msg)

        # One command-line tip per hour.
        elif (((7 <= ts_hour <= 23) and (15 <= ts_min <= 17))
              or (dbug == 'cli')):
            msg = command.random()
            if msg is not None:
                msg = msg.replace(
                    "# commandlinefu.com by David Winterbottom\n\n#", "//")
                msg = '%s %s' % ("叮咚!小bot教CLI时间到了!", msg[:-1])
                msg += "#commandlinefu #xdlinux"
                logging.info(msg)
                OAuth_UpdateTweet(msg)

        # Scan the timeline and retweet / reply.
        auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
        auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_SECRET)
        api = tweepy.API(auth)

        # Only fetch tweets newer than the stored since-id, when there is one.
        # NOTE(review): nothing in this method assigns ``since_id`` or saves
        # the entity, so the same tweets may be handled again on the next
        # run -- confirm whether that is done elsewhere.
        tweetid = SinceID.all().get()
        logging.info(tweetid)
        if tweetid is None:
            logging.warning("Initial!")
            tweetid = SinceID()
            timeline = api.home_timeline()
        else:
            logging.info("Since ID is: %d" % tweetid.since_id)
            timeline = api.home_timeline(since_id=tweetid.since_id)

        mgc = re.compile(config.MGC, re.I | re.M)
        talk_to_me = re.compile(config.TALK, re.I | re.M)
        tweets = timeline[::-1]  # the timeline is newest-first; go oldest-first
        if not tweets:
            logging.info("no new tweets!")
            return

        seg = SEG()
        for tweet in tweets:
            # Reset per tweet: otherwise a reply built for an earlier tweet
            # would be re-sent to later tweets instead of retweeting them.
            msg = None

            user = tweet.user.screen_name
            if user == 'xdtuxbot':
                continue
            text = tweet.text
            if mgc.search(text) is not None:
                continue

            t = talk_to_me.search(text)
            # startswith() is safe on empty text, unlike text[0].
            if (not t) and text.startswith('@'):
                continue

            wlist = seg.cut(text.encode('utf-8'))
            logging.info(' '.join(wlist))
            # Only handle tweets that contain at least one listed keyword.
            if not any(w in config.RT_LIST for w in wlist):
                continue

            if t:
                bot = TalkBot()
                reply = bot.respond(talk_to_me.sub("", text)).decode('UTF-8')
                if reply != '':
                    msg = u"@%s %s" % (user, reply)

            try:
                if msg:
                    OAuth_UpdateTweet(msg, reply=tweet.id)  # send to Twitter
                    logging.info('Send Tweet: %s' % (msg))
                else:
                    api.retweet(tweet.id)
            except tweepy.TweepError as e:
                logging.error('Tweepy Error:%s' % e)
            except Exception as e:
                # Best effort: one failing tweet must not stop the scan.
                logging.error('Uknow Error:%s' % e)
Пример #4
0
 def init(self, options):
     """Print a progress message and store a new ``SEG`` instance on ``self.seg``.

     ``options`` is accepted but not used by this method.
     """
     # NOTE(review): the message suggests SEG() loads its dictionary on
     # construction -- confirm against the SEG implementation.
     print("reading Chiniese dictionary")
     self.seg = SEG()
Пример #5
0
def idf(word, documentList):
    """Return the inverse document frequency of ``word`` in ``documentList``.

    The value is the natural logarithm of the number of documents divided
    by ``0.01`` plus the result of ``numDocsContaining(word, documentList)``;
    the ``0.01`` keeps the divisor non-zero.
    """
    total_docs = len(documentList)
    smoothed_hits = 0.01 + numDocsContaining(word, documentList)
    return math.log(total_docs / smoothed_hits)


def tfidf(document, documentList):
    """Compute the TF-IDF weight of every distinct word in ``document``.

    Args:
        document: sequence of words (one entry per occurrence).
        documentList: the corpus that is passed on to ``idf``.

    Returns:
        A dict mapping each distinct word to its share of ``document``
        (occurrences / length) multiplied by ``idf(word, documentList)``.
        An empty ``document`` yields an empty dict.
    """
    from collections import Counter

    total = float(len(document))
    retdict = {}
    # Count all words in one pass instead of calling document.count() for
    # every token, and evaluate idf() once per distinct word.
    for word, occurrences in Counter(document).items():
        retdict[word] = occurrences / total * idf(word, documentList)
    return retdict


if __name__ == '__main__':
    seg = SEG()
    documentList = []
    documentList.append(
        seg.cut(
            """新华网布鲁塞尔3月24日电 (记者张伟)北约秘书长拉斯穆森24日晚宣布,北约成员国当天决定在利比亚设立禁飞区,北约将在数天内从美国手中接管对利比亚军事行动指挥权。
  当天,北约28个成员国大使在布鲁塞尔举行会议,拉斯穆森在会后发表声明宣布了上述决定。
  声明说,北约所采取的行动是“广泛国际行动的一部分”,旨在保护利比亚平民的安全。声明还说,北约成员国均致力于履行联合国安理会决议所规定的义务,“这也是北约决定承担禁飞区责任的原因”。
  本月17日,联合国安理会通过第1973号决议,同意在利比亚设立禁飞区。从19日开始,法国、美国和英国等国对利比亚展开军事行动。目前,这一行动由美国指挥,但美方已明确表示希望在本周末把指挥权移交出去。
  拉斯穆森24日晚在接受美国有线电视新闻网采访时说,北约已做好必要的准备,将在“未来数天内”从美国手中接管禁飞区的行动指挥权,行动将统归北约最高军事长官、欧洲盟军最高司令詹姆斯·斯塔夫里迪斯指挥。
  拉斯穆森解释说,北约成员国目前只是决定执行设立禁飞区的任务,并正在考虑承担“更为广泛的责任”,但目前尚未做出决定。
  北约22日决定对利比亚实施武器禁运,来自北约7个成员国的16艘海军舰艇参与这一行动。此前,土耳其、法国和德国一直反对北约在利比亚设立禁飞区,谈判一度陷入僵局。"""
        ))
    words = {}
    for document in documentList:
        words = tfidf(document, documentList)
    for item in sorted(words.items(), key=itemgetter(1)):