Пример #1
0
def getSample(path, langPercent):
    """Build randomly sampled word counts for every problem under *path*.

    For each problem the texts of all its files are concatenated and passed
    to ``dr.bow``; a random subset of the first element of that result is
    drawn, and the same subset is then applied (through ``getSelection``) to
    the concatenated text and to every individual file.

    Args:
        path: Location handed to ``dr.dirproblems``.
        langPercent: Callable that receives a problem directory name and
            returns the fraction of the vocabulary to sample.

    Returns:
        dict mapping each directory name to a dict holding a ``'total'``
        entry plus one entry per file, keyed by the last ``/``-separated
        component of the file's name.

    Side effects:
        Prints the resulting dict to standard output before returning it.
    """
    problems = dr.problems(dr.dirproblems(path))
    data = {}
    for dirname, (files, _unknown) in problems:
        percent = langPercent(dirname)

        # Texts are joined with no separator, exactly as the previous
        # repeated "+" concatenation did, but in a single pass.
        docs = "".join(doc[1] for doc in files)
        vocabulary = dr.bow(docs)[0]

        # Number of words to draw, based on the percentage for the language.
        sample_size = int(round(float(len(vocabulary)) * float(percent)))

        # random.sample only accepts sequences on Python 3; vocabulary is
        # presumably a mapping of word -> count (TODO confirm), so sample
        # from an explicit list of its items/keys.
        selection = random.sample(list(vocabulary), sample_size)

        # Restrict the overall counts to the sampled words.
        entry = {'total': getSelection(vocabulary, selection)}

        # Apply the very same selection to each file's own counts.
        for doc in files:
            namefile = doc[0].split("/")[-1]
            entry[namefile] = getSelection(dr.bow(doc[1])[0], selection)

        data[dirname] = entry

    # Single-argument form prints the same under Python 2 and Python 3.
    print(data)
    return data
Пример #2
0
 def do_info(self,args):
     "Shows info of the problem"
     # NOTE(review): the docstring above is kept verbatim; if the enclosing
     # class is a cmd.Cmd command set it is the text shown by "help info" --
     # confirm before rewording it.
     #
     # Prints the id, gold answer, prediction and per-file word totals of the
     # problem selected by self.doc. `args` is not used.
     problem = problems[self.doc]
     problem_id = problem[0]
     known = problem[1][0]

     # Each print receives one pre-formatted string so the output is the same
     # under the Python 2 print statement and the Python 3 print function.
     print("Problem Id      :  %s" % (problem_id,))
     print("Known documents :  %s" % (len(known),))
     print("Answer          :  %s" % (gs[problem_id],))
     print("Predction       :  %s" % (sys[problem_id],))
     print("Known files     : ")
     for i, doc in enumerate(known):
         bow = docread.bow(doc[1])[0]
         print("    [%s] %s (%s)" % (i, doc[0], sum(bow.values())))
     print("Unknown file   : ")
     # Numbering continues after the known files. Starting enumerate at
     # len(known) avoids reusing the previous loop variable, which was
     # unbound (NameError) whenever there were no known files.
     for i, doc in enumerate(problem[1][1], len(known)):
         bow = docread.bow(doc[1])[0]
         print("    [%s] %s (%s)" % (i, doc[0], sum(bow.values())))
Пример #3
0
 def do_info(self,args):
     "Shows info of the problem"
     # NOTE(review): the docstring above is kept verbatim; if the enclosing
     # class is a cmd.Cmd command set it is the text shown by "help info" --
     # confirm before rewording it.
     #
     # Prints the id, gold answer, optional prediction and per-file sizes of
     # the problem selected by self.doc. `args` is not used.
     problem = problems[self.doc]
     problem_id = problem[0]
     known = problem[1][0]

     # Each print receives one pre-formatted string so the output is the same
     # under the Python 2 print statement and the Python 3 print function.
     print("Problem Id      :  %s" % (problem_id,))
     print("Known documents :  %s" % (len(known),))
     print("Answer          :  %s" % (gs[problem_id],))
     # The prediction line is shown only when `sy` is truthy.
     if sy:
         print("Predction       :  %s" % (sy[problem_id],))
     print("Known files     : ")
     for i, doc in enumerate(known):
         bow = docread.bow(doc[1])
         print("    [%s] %s (%s)" % (i, doc[0], len(bow)))
     print("Unknown file   : ")
     # Numbering continues after the known files. Starting enumerate at
     # len(known) avoids reusing the previous loop variable, which was
     # unbound (NameError) whenever there were no known files.
     for i, doc in enumerate(problem[1][1], len(known)):
         bow = docread.bow(doc[1])
         print("    [%s] %s (%s)" % (i, doc[0], len(bow)))
Пример #4
0
def getFromFile(idwords, path):
    """Apply ``getSelection`` with *idwords* to the document stored at *path*.

    The document is loaded with ``dr.readdoc``, passed to ``dr.bow``, and the
    first element of that result is handed to ``getSelection``.
    """
    document = dr.readdoc(path)
    bag = dr.bow(document)[0]
    return getSelection(bag, idwords)
Пример #5
0
def getFromText(idwords, text):
    """Apply ``getSelection`` with *idwords* to an in-memory *text*.

    The text is passed to ``dr.bow`` and the first element of that result is
    handed to ``getSelection`` together with *idwords*.
    """
    return getSelection(dr.bow(text)[0], idwords)
Пример #6
0
def getIdsToSample(text, selected_lang, percent):
    """Draw a random subset of the vocabulary found in *text*.

    Args:
        text: Text handed to ``dr.bow``.
        selected_lang: Not used by this function; kept so existing callers
            keep working.
        percent: Fraction of the vocabulary to draw; the resulting count is
            rounded up with ``math.ceil``.

    Returns:
        list of items sampled without replacement from the first element of
        ``dr.bow(text)``.

    Raises:
        ValueError: From ``random.sample`` when the computed sample size is
            negative or larger than the vocabulary.
    """
    vocabulary = dr.bow(text)[0]
    sample_size = int(math.ceil(len(vocabulary) * percent))
    # random.sample only accepts sequences on Python 3; vocabulary is
    # presumably a mapping of word -> count (TODO confirm), so sample from an
    # explicit list of its items/keys.
    return random.sample(list(vocabulary), sample_size)