def __init__(self, corpus, field):
    """Load every token of a corpus into one combined Document.

    Parameters:
        corpus -- one of:
                    * a list (or list subclass) of file paths, used as is;
                    * the path of a single existing file;
                    * anything else, which is handed to utils.fileList()
                      and treated as a directory of corpus files.
        field  -- passed through unchanged to Document.__init__.

    Every line of every corpus file is evaluated with eval() and the
    results are appended, in file order, to one flat list that is then
    given to Document.__init__ together with the feature settings read
    from the config module.

    Attributes set here:
        self._corpus_files -- the files that were read, in order.
        self._doc_index    -- for each file, the offset in the combined
                              token list at which that file's tokens begin.

    Raises whatever open() or eval() raise for an unreadable file or a
    malformed line; the file being read is closed in either case.
    """
    # Assigned before Document.__init__ runs, as in the original code, so
    # the attribute already exists while the base class initialises.
    self._corpus_files = []

    # isinstance() also accepts list subclasses, which the previous exact
    # type comparison sent on to os.path.isfile(), where they failed.
    if isinstance(corpus, list):
        corpus_files = corpus
    elif os.path.isfile(corpus):
        corpus_files = [corpus]
    else:
        # corpus is a directory
        corpus_files = utils.fileList(corpus, list_suffix=1, list_non_suffix=0)

    doc = []
    doc_index = []
    for f in corpus_files:
        # Remember where this file's tokens start inside the combined list.
        doc_index.append(len(doc))
        log(2, '. ')
        log(4, ' ' + f + '\n')
        d = open(f)
        try:
            for tok in d:
                # NOTE(review): eval() executes each corpus line as Python
                # code, so corpus files must come from a trusted source.
                # If tokens are always plain literals, ast.literal_eval may
                # be a safer substitute -- confirm the token format first.
                doc.append(eval(tok, globals()))
        finally:
            # Close the file even when a line fails to evaluate.
            d.close()

    # Need to record the breaks between documents!
    # Creating a single Document saves speed, but will result in some
    # dubious features, e.g. x_1 in the last instance of one document where
    # x is first in the next doc. These should be irrelevant and filtered
    # out, though.
    Document.__init__(
        self, doc, field,
        window=config.window,
        stem=config.stem,
        suffix=config.suffix,
        token=config.token,
        pos=config.pos,
        types=config.types,
        gaz=config.gaz,
        chunk=config.chunk,
        erc=config.erc)

    self._corpus_files = corpus_files
    self._doc_index = doc_index
options['DATADIR'] = arg if opt == '-n': options['ntrials'] = int(arg) if opt == '-p': options['t_prop'] = float(arg) if not options.has_key('FIELD') or not options.has_key('TRAINDIR') or not options.has_key('DATADIR'): usage() sys.exit(2) #test_corpus = utils.fileList(sys.argv[3],list_suffix=1,list_non_suffix=0) #exper = Experiment(sys.argv[2],test_corpus, sys.argv[1]) # need to pass fields to experiment if options.has_key('SPLITBASE'): import glob splitfiles = glob.glob(options['SPLITBASE']+'*.split') print >>sys.stderr,"Experiment using the following pre-defined train/test splits:" for sf in splitfiles: print >>sys.stderr,"\t",sf exper = definedSplitExperiment(options['TRAINDIR'],options['DATADIR'],options['FIELD'], splitfiles) elif not options.has_key('TESTDIR'): corpus = utils.fileList(options['TRAINDIR'],list_suffix=1,list_non_suffix=0) exper = randomSplitExperiment(corpus,options['DATADIR'],options['FIELD'],ntrials=options['ntrials'],train_proportion=options['t_prop']) else: train = utils.fileList(options['TRAINDIR'],list_suffix=1,list_non_suffix=0) test = utils.fileList(options['TESTDIR'],list_suffix=1,list_non_suffix=0) exper = Experiment(train,test,options['DATADIR'],options['FIELD']) exper.run() #exper.results.write_bwi_scorer(os.path.join(exper.datadir,'bwiscore'))