示例#1
0
def get_data():
    '''
    Load the labelled text corpus from the local positive/negative folders.

    Every file in the positive and in the negative directory is read,
    segmented with a ``SEG`` instance configured through ``seg.set(dic)``,
    and reduced to the UTF-8 encoded words that are present in ``pornDict``.

    Returns:
        A 2-tuple ``(trainingData, trainingData)``. Both elements are the
        *same* list object, holding one ``(words, label)`` pair per file,
        where ``label`` is ``'Positive'`` or ``'Negative'``. No separate
        test split is produced here.
    '''
    posPath = '/home/zhouxc/skindetector/AdultWebsiteText/'
    negPath = '/home/zhouxc/skindetector/NormalWebsiteText/'
    # List both folders up front so a missing directory fails before any
    # file is processed.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    seg = SEG()
    seg.set(dic)

    def _load_folder(folder, fileNames, label, progressTag):
        # Read, segment and filter every file of one class.
        samples = []
        for count, fileName in enumerate(fileNames, 1):
            # Single-argument parenthesised print behaves the same whether
            # print is a statement or a function.
            print(progressTag + str(count))
            # The context manager closes the handle right after reading;
            # the corpus may contain many files.
            with open(folder + fileName) as handle:
                raw = handle.read()
            # Encode each word once and reuse it for both the membership
            # test and the stored value.
            encoded = (word.encode('utf-8') for word in seg.cut(raw))
            samples.append(([w for w in encoded if w in pornDict], label))
        return samples

    trainingData = []
    print('---------------------Read Positive DataSet-----------------')
    trainingData.extend(
        _load_folder(posPath, posFiles, 'Positive', "PositiveData"))
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    trainingData.extend(
        _load_folder(negPath, negFiles, 'Negative', "NegativeData"))
    print('--------Negative DataSet  done-----------------------------------')

    return trainingData, trainingData
def get_data():
    '''
    Build the labelled dataset from the positive and negative text folders.

    Each file is read, segmented by a ``SEG`` instance that was configured
    with ``seg.set(dic)``, and only the UTF-8 encoded words found in
    ``pornDict`` are kept.

    Returns:
        ``(trainingData, trainingData)`` -- the same list object twice.
        The list holds one ``(words, label)`` tuple per file, with
        ``label`` being ``'Positive'`` or ``'Negative'``.
    '''
    posPath = '/home/zhouxc/skindetector/AdultWebsiteText/'
    negPath = '/home/zhouxc/skindetector/NormalWebsiteText/'
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    trainingData = []
    seg = SEG()
    seg.set(dic)

    print('---------------------Read Positive DataSet-----------------')
    for c, fileName in enumerate(posFiles, 1):
        print("PositiveData" + str(c))
        # Close every file as soon as it has been read instead of leaving
        # the handle to the garbage collector.
        with open(posPath + fileName) as f:
            data = seg.cut(f.read())
        text = []
        for word in data:
            encoded = word.encode('utf-8')  # encode once, test and store
            if encoded in pornDict:
                text.append(encoded)
        trainingData.append((text, 'Positive'))
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    for c, fileName in enumerate(negFiles, 1):
        print("NegativeData" + str(c))
        with open(negPath + fileName) as f:
            data = seg.cut(f.read())
        text = []
        for word in data:
            encoded = word.encode('utf-8')
            if encoded in pornDict:
                text.append(encoded)
        trainingData.append((text, 'Negative'))
    print('--------Negative DataSet  done-----------------------------------')

    return trainingData, trainingData
示例#3
0
                    + str(i) \
                    + '''')" href="'''\
                    + url \
                    + '''" target="_blank"><font size="3">''' \
                    + arrowscript \
                    + title \
                    + '''</font></a><br /><font size="-1">''' \
                    + snippet \
                    + '''<br /><font color="#008000">''' \
                    + url \
                    + '''<br /></font></font></td></tr></table>\n'''
        pageStr += resultStr
        i += 1

    return pageStr


if __name__ == '__main__':
    # Script entry point: set up the segmenter, pick the result table for
    # the requested engine, and rerank it using the user's feedback.
    # Debug aid: psudorerank(resultsList, 2) can be tried on a short list of
    # sample sentences instead of the live result table.
    seg = SEG()
    words = "main.dic"
    seg.set(words)

    username = "******"
    # "engine" comes from the request's GET parameters; empty string when
    # the parameter is absent.
    engine = request.GET.get("engine", "")
    resultsTable = ResultInfoTable[engine]
    query, pagecontent = userFeedbackRerank(username, resultsTable, seg)
示例#4
0
文件: benchmark.py 项目: Nuos/lab
#encoding=utf-8
# Benchmark script: loads text.txt and main.dic, then times SEG.cut over the
# text for 1, 2, ..., 100 consecutive repetitions.
try:
    # psyco is optional; the benchmark simply runs without it when it is
    # unavailable or fails to initialise.
    import psyco
    psyco.full()
except Exception:
    # Unlike a bare "except:", this still lets KeyboardInterrupt and
    # SystemExit propagate.
    pass

# Read both inputs through context managers so the handles are closed.
with open("text.txt") as text_file:
    s3 = text_file.read()
with open("main.dic") as dict_file:
    words = [x.rstrip() for x in dict_file]
from smallseg import SEG
seg = SEG()
print('Load dict...')
seg.set(words)
print("Dict is OK.")
from time import time

for i in xrange(1, 101):
    start = time()
    # Segment the same text i times so the cost grows with i.
    for j in xrange(0, i):
        A = seg.cut(s3)
    cost = time() - start
    # Same output as the statement form "print i, 'times, cost:', cost".
    print("%s times, cost: %s" % (i, cost))

print("********************************")