示例#1
0
文件: instances.py 项目: aidanf/Elie
    def __init__(self, corpus, field):
        """Load every file of a corpus into one combined Document.

        corpus -- one of:
                    * a list of file paths,
                    * the path of a single existing file, or
                    * anything else, which is treated as a directory and
                      expanded with utils.fileList.
        field  -- passed through unchanged to Document.__init__.

        Every line of every corpus file is evaluated as a Python expression
        and the results are concatenated into one list, which is handed to
        Document.__init__ together with the feature switches read from
        config.  The list of files is stored in self._corpus_files, and
        self._doc_index holds, for each file, the offset in the combined
        list at which that file's entries begin.
        """
        doc_index = []
        self._corpus_files = []
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of list.
        if isinstance(corpus, list):
            corpus_files = corpus
        elif os.path.isfile(corpus):
            corpus_files = [corpus]
        else:
            # corpus is a directory
            corpus_files = utils.fileList(corpus,list_suffix=1,list_non_suffix=0)

        doc = []
        for f in corpus_files:
            # Offset of this file's first entry in the combined list.
            doc_index.append(len(doc))
            log(2,'. ')
            log(4,' '+f+'\n')
            d = open(f)
            # try/finally guarantees the handle is released even when a
            # malformed line makes eval raise.
            try:
                for tok in d:
                    # SECURITY: eval executes arbitrary code found in the
                    # corpus file; only load corpora from trusted sources.
                    doc.append(eval(tok,globals()))
            finally:
                d.close()
        # TODO (original author): need to record the breaks between documents.
        # Creating a single Document saves speed, but will result in some
        # dubious features, e.g. x_1 in the last instance of one document
        # where x is the first token of the next document.  These should be
        # irrelevant and filtered out, though.
        Document.__init__(
            self, doc, field,
            window=config.window,
            stem=config.stem,
            suffix=config.suffix,
            token=config.token,
            pos=config.pos,
            types=config.types,
            gaz=config.gaz,
            chunk=config.chunk,
            erc=config.erc)
        self._corpus_files = corpus_files
        self._doc_index = doc_index
示例#2
0
文件: evaluation.py 项目: aidanf/Elie
            options['DATADIR'] = arg
        if opt == '-n':
            options['ntrials'] = int(arg)
        if opt == '-p':
            options['t_prop'] = float(arg)
    if not options.has_key('FIELD') or not options.has_key('TRAINDIR') or not options.has_key('DATADIR'):
        usage()
        sys.exit(2)
 


    #test_corpus = utils.fileList(sys.argv[3],list_suffix=1,list_non_suffix=0)
    #exper = Experiment(sys.argv[2],test_corpus, sys.argv[1])
    # need to pass fields to experiment
    if options.has_key('SPLITBASE'):
        import glob
        splitfiles = glob.glob(options['SPLITBASE']+'*.split')
        print >>sys.stderr,"Experiment using the following pre-defined train/test splits:"
        for sf in splitfiles:
            print >>sys.stderr,"\t",sf
        exper = definedSplitExperiment(options['TRAINDIR'],options['DATADIR'],options['FIELD'], splitfiles)
    elif not options.has_key('TESTDIR'):        
        corpus = utils.fileList(options['TRAINDIR'],list_suffix=1,list_non_suffix=0)
        exper = randomSplitExperiment(corpus,options['DATADIR'],options['FIELD'],ntrials=options['ntrials'],train_proportion=options['t_prop'])
    else:
        train = utils.fileList(options['TRAINDIR'],list_suffix=1,list_non_suffix=0)
        test = utils.fileList(options['TESTDIR'],list_suffix=1,list_non_suffix=0)
        exper = Experiment(train,test,options['DATADIR'],options['FIELD'])
    exper.run()
    #exper.results.write_bwi_scorer(os.path.join(exper.datadir,'bwiscore'))