def getSample(path, langPercent):
    """Build per-problem word selections from a random vocabulary sample.

    For every problem yielded by ``dr.problems(dr.dirproblems(path))`` the
    texts of its known files are concatenated and passed to ``dr.bow``.  A
    random subset of the entries in element 0 of that result is drawn with
    ``random.sample``; its size is the number of entries multiplied by
    ``langPercent(dirname)``, rounded to an integer.  The same subset is
    then applied through ``getSelection`` to the combined text and to each
    known file on its own.

    Args:
        path: Location handed to ``dr.dirproblems``.
        langPercent: Callable that receives a problem directory name and
            returns a value accepted by ``float()``.

    Returns:
        A dict keyed by problem directory name.  Each value is a dict with
        the selection for the combined text under ``'total'`` plus one entry
        per known file, keyed by the last ``/``-separated component of the
        file's name.

    Side effects:
        The resulting dict is printed to standard output before returning.
    """
    # An earlier variant also passed the pattern r".*\.txt" to
    # dr.dirproblems; the listing is currently unfiltered.
    data = {}
    for dirname, (known, _unknown) in dr.problems(dr.dirproblems(path)):
        ratio = langPercent(dirname)

        # Vocabulary of every known document of this problem taken together.
        merged_text = "".join([entry[1] for entry in known])
        vocabulary = dr.bow(merged_text)[0]

        # Number of words to draw, as a share of the vocabulary size.
        wanted = int(round(float(len(vocabulary)) * float(ratio)))
        chosen = random.sample(vocabulary, wanted)

        # The same sampled words are used for the total and for each file so
        # that all entries of a problem are restricted to one selection.
        selections = {'total': getSelection(vocabulary, chosen)}
        for entry in known:
            short_name = entry[0].split("/")[-1]
            selections[short_name] = getSelection(dr.bow(entry[1])[0], chosen)
        data[dirname] = selections

    # Single-argument form: prints the same text as the bare print statement.
    print(data)
    return data
def do_info(self, args):
    "Shows info of the problem"
    # The docstring above is deliberately left as-is: this looks like a
    # cmd.Cmd-style command handler, where the docstring would double as the
    # user-visible help text (assumption -- the enclosing class is not
    # visible from here).
    #
    # Prints, for the problem selected by self.doc: its id, the number of
    # known documents, the entries stored for that id in `gs` and `sys`, and
    # a numbered listing of the known and unknown files.  Each listed file is
    # followed by the sum of the values of element 0 of docread.bow(text).
    # `args` is not used.
    #
    # NOTE: `sys` is subscripted here, so it is a module-level mapping keyed
    # by problem id, not the standard-library `sys` module.
    problem = problems[self.doc]
    problem_id = problem[0]

    # Every line is formatted into one string so that print(...) with a
    # single argument emits the same text under the print statement.  The
    # double space after each label is intentional: the label ends with a
    # space and the value is separated from it by another one.
    print("Problem Id :  {0}".format(problem_id))
    known_docs = problem[1][0]
    print("Known documents :  {0}".format(len(known_docs)))
    print("Answer :  {0}".format(gs[problem_id]))
    print("Predction :  {0}".format(sys[problem_id]))

    print("Known files : ")
    for i, doc in enumerate(known_docs):
        bow = docread.bow(doc[1])[0]
        print(" [{0}] {1} ({2})".format(i, doc[0], sum(bow.values())))

    print("Unknown file : ")
    # Numbering continues after the known files.  Starting the enumeration at
    # len(known_docs) avoids relying on the loop variable left over from the
    # loop above, which is unbound when there are no known files.
    for i, doc in enumerate(problem[1][1], len(known_docs)):
        bow = docread.bow(doc[1])[0]
        print(" [{0}] {1} ({2})".format(i, doc[0], sum(bow.values())))
def do_info(self, args):
    "Shows info of the problem"
    # The docstring above is deliberately left as-is: this looks like a
    # cmd.Cmd-style command handler, where the docstring would double as the
    # user-visible help text (assumption -- the enclosing class is not
    # visible from here).
    #
    # Prints, for the problem selected by self.doc: its id, the number of
    # known documents, the entry stored for that id in `gs`, the entry in
    # `sy` (only when `sy` is truthy), and a numbered listing of the known
    # and unknown files.  Each listed file is followed by
    # len(docread.bow(text)).
    # `args` is not used.
    problem = problems[self.doc]
    problem_id = problem[0]

    # Every line is formatted into one string so that print(...) with a
    # single argument emits the same text under the print statement.  The
    # double space after each label is intentional: the label ends with a
    # space and the value is separated from it by another one.
    print("Problem Id :  {0}".format(problem_id))
    known_docs = problem[1][0]
    print("Known documents :  {0}".format(len(known_docs)))
    print("Answer :  {0}".format(gs[problem_id]))
    if sy:
        print("Predction :  {0}".format(sy[problem_id]))

    print("Known files : ")
    for i, doc in enumerate(known_docs):
        bow = docread.bow(doc[1])
        print(" [{0}] {1} ({2})".format(i, doc[0], len(bow)))

    print("Unknown file : ")
    # Numbering continues after the known files.  Starting the enumeration at
    # len(known_docs) avoids relying on the loop variable left over from the
    # loop above, which is unbound when there are no known files.
    for i, doc in enumerate(problem[1][1], len(known_docs)):
        bow = docread.bow(doc[1])
        print(" [{0}] {1} ({2})".format(i, doc[0], len(bow)))
def getFromFile(idwords, path):
    """Apply the word selection *idwords* to the document stored at *path*.

    The document is loaded with ``dr.readdoc``, run through ``dr.bow``, and
    element 0 of that result is handed to ``getSelection`` together with
    *idwords*.

    Args:
        idwords: Selection passed through unchanged to ``getSelection``.
        path: Location of the document, as accepted by ``dr.readdoc``.

    Returns:
        Whatever ``getSelection`` returns for this document.
    """
    document = dr.readdoc(path)
    return getSelection(dr.bow(document)[0], idwords)
def getFromText(idwords, text):
    """Apply the word selection *idwords* to an in-memory *text*.

    *text* is run through ``dr.bow`` and element 0 of that result is handed
    to ``getSelection`` together with *idwords*.

    Args:
        idwords: Selection passed through unchanged to ``getSelection``.
        text: Document content, as accepted by ``dr.bow``.

    Returns:
        Whatever ``getSelection`` returns for this text.
    """
    return getSelection(dr.bow(text)[0], idwords)
def getIdsToSample(text, selected_lang, percent):
    """Draw a random sample of the entries that ``dr.bow`` finds in *text*.

    The sample size is the number of entries in element 0 of
    ``dr.bow(text)`` multiplied by *percent*, rounded up to an integer.

    Args:
        text: Document content, as accepted by ``dr.bow``.
        selected_lang: Not used by the current implementation.  A
            commented-out line in an earlier revision looked the percentage
            up as ``lang[selected_lang]['percent']``.
        percent: Multiplier applied to the number of entries.

    Returns:
        The list produced by ``random.sample``.
    """
    vocabulary = dr.bow(text)[0]
    how_many = int(math.ceil(len(vocabulary) * percent))
    return random.sample(vocabulary, how_many)