outputdir = sys.argv[2] threads = int(sys.argv[3]) except: print >>sys.stderr,"Syntax: sonar_postproc.py inputdir outputdir threads" sys.exit(2) cat_freqlist_word = FrequencyList() cat_freqlist_lemma = FrequencyList() cat_freqlist_lemmapos = FrequencyList() maxtasksperchild = 10 preindex = True prevcategory = None print >>sys.stderr,"Initialising (indexing)..." processor = folia.CorpusProcessor(inputdir, process, threads, 'folia.xml',"",lambda x: True, maxtasksperchild,preindex) print >>sys.stderr,"Processing..." for i, data in enumerate(processor): filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos = data if filepath: category = None for e in filepath.split('/'): if e[-4:] != '.xml' and e[:3] == 'WR-' or e[:3] == 'WS-': category = e if not category: print >>sys.stderr, "No category found for: " + filepath sys.exit(2) if category != prevcategory: if prevcategory: print >>sys.stderr,"Saving frequency lists for ", prevcategory
print >> sys.stderr, "Unable to write file " + outputfile except Exception as e: print >> sys.stderr, "ERROR: Got exception curating " + filepath + ": ", repr( e) if __name__ == '__main__': try: inputdir = sys.argv[1] outputdir = sys.argv[2] threads = int(sys.argv[3]) except: print >> sys.stderr, "Syntax: sonar_postproc.py inputdir outputdir threads" sys.exit(2) maxtasksperchild = 10 preindex = True prevcategory = None print >> sys.stderr, "Initialising (indexing)..." processor = folia.CorpusProcessor( inputdir, process, threads, 'folia.xml', "", lambda x: not os.path.exists(x.replace(inputdir, outputdir)), maxtasksperchild, preindex) l = len(processor.index) print >> sys.stderr, "Indexed " + str(l) + " files for curation" print >> sys.stderr, "Processing..." for i, _ in enumerate(processor.run()): progress = round((i + 1) / float(l) * 100, 1) print "#" + str(i) + " - " + str(progress) + '%'
#Let XSLT do the basic conversion to HTML xslt = lxml.etree.parse(xsltfile) dcoitofoliatransformer = lxml.etree.XSLT(xslt) schema = lxml.etree.RelaxNG(folia.relaxng()) if foliadir[-1] != '/': foliadir += '/' try: os.mkdir(foliadir[:-1]) except: pass maxtasksperchild = 10 preindex = True processor = folia.CorpusProcessor(sonardir, process, threads, 'pos', "", lambda x: True, maxtasksperchild, preindex) for i, _ in enumerate(processor): progress = round((i + 1) / float(len(processor.index)) * 100, 1) print "#" + str(i) + " - " + str(progress) + '%' #print "Building index..." #index = list(enumerate([ x for x in sonar.CorpusFiles(sonardir,'pos', "", lambda x: True, True) if not outputexists(x, sonardir, foliadir) ])) #indexlength = len(index) #print str(indexlength) + " documents found in " + sonardir #print "Processing..." #p = Pool(threads) #p.map(process, index ) print "All done."