示例#1
0
def getSample(path, langPercent):
    """Build per-problem word selections from a random vocabulary sample.

    For every problem yielded by ``dr.problems(dr.dirproblems(path))`` the
    texts of the first group of files are joined, counted with ``dr.bow`` and
    a random subset of the resulting entries is drawn.  The subset size is the
    number of entries multiplied by ``langPercent(dirname)``, rounded to the
    nearest integer.  The second group of each problem (presumably the
    unknown documents) is not used here.

    Args:
        path: Directory handed to ``dr.dirproblems``.
        langPercent: Callable that receives the problem name and returns the
            fraction of entries to sample (anything ``float()`` accepts).

    Returns:
        dict mapping each problem name to a dict holding the result of
        ``getSelection`` for the joined text under ``'total'`` and for each
        individual file under its base file name.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed
            sample size is negative or larger than the number of entries.
    """
    #problems = dr.problems(dr.dirproblems(path,r".*\.txt"))
    problems = dr.problems(dr.dirproblems(path))
    data = {}
    for dirname, (files, _unknown) in problems:
        percent = langPercent(dirname)

        # Each item is indexed as (file path, file text); join the texts in
        # one pass instead of growing a string inside a loop.
        docs = "".join(doc[1] for doc in files)
        vocabulary = dr.bow(docs)[0]

        # Sample size: share of the entries given by the language percentage.
        sample = int(round(float(len(vocabulary)) * float(percent)))
        # Random subset drawn from the entries of all documents together.
        # NOTE(review): random.sample only accepts sequences on Python 3.11+;
        # confirm the type returned by dr.bow(...)[0] before porting.
        selection = random.sample(vocabulary, sample)

        # Restrict the overall counts to the sampled entries.
        entry = {'total': getSelection(vocabulary, selection)}
        for doc in files:
            namefile = doc[0].split("/")[-1]
            # Apply the very same selection to the counts of each file.
            entry[namefile] = getSelection(dr.bow(doc[1])[0], selection)
        data[dirname] = entry

    # Output kept from the original; the parenthesised form prints the same
    # text on Python 2 and is valid syntax on Python 3.
    print(data)
    return data
示例#2
0
                _ignore.append(line.strip())


    # Load the stopword list if the configured file exists; otherwise keep
    # the empty list and only report that the file was not found.
    stopwords=[]
    if os.path.exists(opts.stopwords):
        verbose('Loading stopwords: ',opts.stopwords)
        stopwords=docread.readstopwords(opts.stopwords)
    else:
        info('Stopwords file not found assuming, emtpy',opts.stopwords)

    # Loading main files -------------------------------------------------
    # Load the problem(s) found under dirname. The known/unknown patterns and
    # the ignore list are passed to docread.dirproblems; the code value is
    # looked up by language and genre.
    verbose('Loading files')
    problems=docread.problems(
        docread.dirproblems(dirname,known_pattern,unknown_pattern,_ignore,
                    code=codes[opts.language][opts.genre]))


    # The answers file is loaded only when the mode starts with "train" or
    # "devel" (training or development mode).
    if opts.mode.startswith("train") or opts.mode.startswith("devel"):
        # An explicit opts.Answers path takes precedence; otherwise the file
        # named by opts.answers inside dirname is used.
        if opts.Answers:
            answers_file=opts.Answers
        else:
            answers_file="{0}/{1}".format(dirname,opts.answers)
        verbose('Loading answer file: {0}'.format(answers_file))
        answers = docread.loadanswers(answers_file,_ignore,
                code=codes[opts.language][opts.genre])

        # Checking for consistency
        if not len(problems) == len(answers):
            p.error("Not match for number of problems({0}) and \
示例#3
0
def getFiles(path):
    """Read every document of every problem found under ``path``.

    Args:
        path: Directory handed to ``dr.dirproblems``.

    Returns:
        dict mapping each problem identifier to a dict with two lists,
        ``'known'`` and ``'unknown'``, holding the ``dr.readdoc`` result for
        each entry of the first and second group of that problem.
    """
    problems = {}
    for problem_id, (known_entries, unknown_entries) in dr.dirproblems(path):
        # Known documents are read first, then the unknown ones.
        known_docs = [dr.readdoc(entry) for entry in known_entries]
        unknown_docs = [dr.readdoc(entry) for entry in unknown_entries]
        problems[problem_id] = {'known': known_docs, 'unknown': unknown_docs}
    return problems
示例#4
0
        try:
            # NOTE(review): the file is opened in the default read mode even
            # though the option is called "output" -- confirm this is intended.
            out = open(opts.output)
        except (IOError, OSError):
            # Only file-system failures are reported as a usage error; a bare
            # "except:" would also swallow KeyboardInterrupt and SystemExit.
            p.error('Output parameter could not been open: {0}'
                    .format(opts.output))

    # Load the list of entries to ignore if a .ignore file exists in the
    # current working directory; otherwise the list stays empty.
    _ignore=[]
    if os.path.exists('.ignore'):
        verbose('Loading files to ignore frm: .ignore')
        with open('.ignore') as file:
            # file.read() returns a str, which has no readlines() method, so
            # the original call raised AttributeError. Iterate over the file
            # instead and strip the line terminator from each entry.
            _ignore=[line.strip() for line in file]

        
    # Load the problem(s) found under dirname, passing the known/unknown
    # options and the ignore list to docread.dirproblems.
    problems=docread.dirproblems(dirname,opts.known,opts.unknown,_ignore)

    # TRAINING MODE
    if opts.mode.startswith("train"):

        # The answers file is the second positional argument and is
        # mandatory in this mode.
        if len(args) != 2:
            p.error("Answers needed for train mode")
        verbose('Loading answer file: {0}'.format(args[1]))
        answers = docread.loadanswers(args[1])

        # Checking for consistency: one answer is expected per problem.
        if len(problems) != len(answers):
            # Adjacent literals are used instead of a backslash continuation
            # inside the string, which embedded the indentation of the next
            # source line in the reported message.
            p.error("Not match for number of problems({0}) and "
                    "answers({1})".format(len(problems),len(answers)))