def __init__(self,corpus,datadir,field,ntrials=10,train_proportion=0.5):
    """Create ``ntrials`` experiments from random train/test splits of ``corpus``.

    For every trial the corpus is shuffled, cut at
    ``ceil(len(corpus) * train_proportion)``, and the resulting split is
    recorded in ``<datadir>/elie.<field>.<n>.split`` before an
    ``Experiment`` is built from it.  A description of the run is written
    to ``<datadir>/parameters.txt`` via ``config.describe``.

    Split file layout: one training file basename per line, a line of
    20 asterisks, then one test file basename per line.

    Args:
        corpus: mutable sequence of document paths.  It is stored as
            ``self.corpus`` and shuffled IN PLACE, so the caller's list
            is reordered as a side effect.
        datadir: directory that receives ``parameters.txt`` and the
            ``.split`` files; also passed on to each ``Experiment``.
        field: field name; used in the split file name and passed on to
            each ``Experiment``.
        ntrials: number of random splits (and experiments) to create.
        train_proportion: fraction of the corpus placed in the training
            set (rounded up to a whole document).
    """
    self.datadir = datadir
    self.field = field
    self.corpus = corpus
    self.experiments = []

    # Record the run parameters; close the file even if describe() fails.
    desc_file = open(os.path.join(self.datadir, "parameters.txt"), 'w')
    try:
        config.describe("%s trials of %s split"%(ntrials,train_proportion),
                        outstream=desc_file)
    finally:
        desc_file.close()

    for n in range(ntrials):
        # The original shuffles twice per trial; both calls are kept so
        # that runs with a seeded RNG keep producing the same splits.
        random.shuffle(self.corpus)
        random.shuffle(self.corpus)

        split_point = int(ceil(len(self.corpus) * train_proportion))
        # Slices are copies, so later shuffles do not disturb them.
        train = self.corpus[:split_point]
        test = self.corpus[split_point:]

        split_path = os.path.join(
            datadir, 'elie.' + field + '.' + str(n) + '.split')
        split_file = open(split_path, 'w')
        try:
            # Only basenames are logged, not the full corpus paths.
            for t in train:
                split_file.write(os.path.split(t)[-1] + '\n')
            split_file.write(20 * '*' + '\n')
            for t in test:
                split_file.write(os.path.split(t)[-1] + '\n')
        finally:
            # Close (and therefore flush) before the Experiment is built,
            # so the split file on disk is complete from here on.
            split_file.close()

        new_exper = Experiment(train, test, self.datadir, self.field, number=n)
        self.experiments.append(new_exper)
def __init__(self,corpusdir, datadir, field, splitfiles):
    """Create one experiment per pre-defined split file.

    Each file in ``splitfiles`` is expected to hold training document
    names (one per line), a separator of 20 asterisks, then test document
    names (one per line).  Every name is joined onto ``corpusdir`` and
    given an ``.elie.pre`` suffix if it does not already end with one.
    The split actually used is re-logged to
    ``<datadir>/elie.<field>.<n>.split`` and an ``Experiment`` is built
    from it.  A description listing the split files is written to
    ``<datadir>/parameters.txt`` via ``config.describe``.

    Args:
        corpusdir: directory prepended to every document name read from
            a split file.
        datadir: directory that receives ``parameters.txt`` and the
            re-logged ``.split`` files; also passed to each ``Experiment``.
        field: field name; used in the logged split file name and passed
            to each ``Experiment``.
        splitfiles: paths of the split files to load.  Iterated twice
            (once for the description, once to load), so it must be
            re-iterable.

    Raises:
        IndexError: if a split file's basename has fewer than three
            dot-separated components (the third is used as the run number).
        ValueError: if a split file does not contain exactly one
            20-asterisk separator.
    """
    self.datadir = datadir
    self.field = field
    self.corpusdir = corpusdir
    self.splitfiles = splitfiles
    self.experiments = []

    desc_str = "Using pre-defined splits:\n" + "".join(
        [sf + "\n" for sf in self.splitfiles])
    # Record the run parameters; close the file even if describe() fails.
    desc_file = open(os.path.join(self.datadir, "parameters.txt"), 'w')
    try:
        config.describe(desc_str, outstream=desc_file)
    finally:
        desc_file.close()

    def corpus_paths(section):
        # Turn one half of a split file into full '.elie.pre' corpus paths,
        # skipping blank lines.
        paths = []
        for name in section.split('\n'):
            name = name.strip()
            if not name:
                continue
            path = os.path.join(self.corpusdir, name)
            if not path.endswith('.elie.pre'):
                path = path + '.elie.pre'
            paths.append(path)
        return paths

    for sf in self.splitfiles:
        # Run number is the third dot-separated part of the file name;
        # presumably names look like 'elie.<field>.<n>.split' (that is the
        # pattern written below) -- confirm for fields containing dots.
        n = os.path.split(sf)[-1].split('.')[2]

        f = open(sf)
        try:
            contents = f.read()
        finally:
            f.close()

        # Exactly one separator is required: the two-way unpack raises
        # ValueError otherwise.
        train_section, test_section = contents.split(20 * '*')
        train = corpus_paths(train_section)
        test = corpus_paths(test_section)

        split_path = os.path.join(
            datadir, 'elie.' + field + '.' + str(n) + '.split')
        split_log = open(split_path, 'w')
        try:
            # Only basenames are logged, not the full corpus paths.
            for t in train:
                split_log.write(os.path.split(t)[-1] + '\n')
            split_log.write(20 * '*' + '\n')
            for t in test:
                split_log.write(os.path.split(t)[-1] + '\n')
        finally:
            # Close (and therefore flush) before the Experiment is built,
            # so the split log on disk is complete from here on.
            split_log.close()

        new_exper = Experiment(train, test, self.datadir, self.field, number=n)
        self.experiments.append(new_exper)