def labeling(line, modelname, options):
    """
    Label an input string and return the annotated XML text.
    Called by simpleLabeling or detailLabeling.

    line      : text to label; it is wrapped in a <bibl> or <note> element
    modelname : model name handed to the Bilbo constructor
    options   : parsed options; options.t ("bibl" or "note"), options.m and
                options.e are read here

    Returns the content of 'tmp.xml' found in the current working directory
    after annotation (presumably written there by bilbo.annotate -- confirm).
    That result file is not removed by this function.

    Raises ValueError when options.t is neither "bibl" nor "note".
    """
    import shutil  # local import: only needed for the scratch-directory cleanup

    # Validate the corpus type first, before anything is created on disk.
    dtype = options.t
    if dtype == "bibl":
        typeCorpus = 1
    elif dtype == "note":
        typeCorpus = 2
    else:
        raise ValueError("unsupported data type %r: expected 'bibl' or 'note'" % (dtype,))

    tmpDir = rootDir + '/simpletmp'
    resDir = os.getcwd()  # current working directory
    dirModel = os.path.join(rootDir, 'model/corpus') + str(typeCorpus) + "/" + options.m + "/"

    bilbo = Bilbo(resDir, options, modelname)

    if not os.path.exists(tmpDir):
        os.makedirs(tmpDir)
    else:
        # Delete files left over from a previous call.
        for dir_name, sub_dirs, files in os.walk(tmpDir):
            for f in files:
                os.unlink(os.path.join(dir_name, f))

    filename = os.path.join(tmpDir, 'tmp.xml')
    try:
        # Temporary input file: a single entry wrapped in its list element.
        with open(filename, "w") as tmpFile:
            tmpFile.write('<list' + dtype.title() + '>\n')
            tmpFile.write('<' + dtype + '> ' + str(line) + ' </' + dtype + '>')
            tmpFile.write('\n</list' + dtype.title() + '>\n')

        if dtype == "note" and options.e:
            bilbo.annotate(tmpDir, dirModel, typeCorpus, 1)
        else:
            bilbo.annotate(tmpDir, dirModel, typeCorpus)

        with open(os.path.join(resDir, 'tmp.xml')) as resFile:
            tmp_str = resFile.read()
    finally:
        # Best-effort cleanup: runs on failure too, and a directory that is
        # not empty must neither hide the real error nor discard the result.
        shutil.rmtree(tmpDir, ignore_errors=True)

    return tmp_str
def annoterCorpus(corpus, request):
    """
    Annotate the files under 'tmp/in' with the "revues" model of the given
    corpus; Bilbo is created with 'tmp/out' as its directory.

    corpus  : corpus number; 2 selects the "note" type, anything else "bibl"
    request : any object; when it has a 'doi' attribute the '-d' option is added

    All paths are resolved against the current working directory.
    NOTE(review): '-T' is always passed although annotate() is what gets
    called -- confirm this is intended.
    """
    model_path = os.path.abspath('../../model/corpus' + str(corpus) + "/revues/") + "/"
    input_path = os.path.abspath('tmp/in') + "/"
    output_path = os.path.abspath('tmp/out') + "/"

    # Same tokens the original obtained by splitting its option string.
    cli_args = ['-T', '-t', 'note' if corpus == 2 else 'bibl']
    if hasattr(request, 'doi'):
        cli_args.append('-d')

    options, _leftover = defaultOptions().parse_args(cli_args)

    Bilbo(output_path, options, "crf_model_simple").annotate(input_path, model_path, corpus)
def annotate(self):
    """
    For every partition directory: annotate its data, then extract CRF
    training data from its test directory into "evaldata_CRF.txt".
    """
    for partition in self.dirPartitions:
        partition_dirs = self.partitions.getDirTestNames(partition)
        (annotateDir, testDir, trainDir, modelDir, resultDir) = partition_dirs

        # Step 1: label the data in annotateDir (the test data with its tags
        # stripped, per the original comment); Bilbo is given resultDir.
        self._setBilboAnnotate()
        self._del_tmp_file(resultDir)
        annotator = Bilbo(resultDir, self.bilboOptions, "crf_model_simple")
        annotator.annotate(annotateDir, modelDir, 1)

        # Step 2: prepare training data from the test set, used for evaluation.
        self._setBilboTrain()
        self._del_tmp_file(trainDir)
        # NOTE(review): the original comment says temp files are saved in
        # testDir, yet Bilbo is given trainDir -- confirm which is meant.
        trainer = Bilbo(trainDir, self.bilboOptions, "crf_model_simple")
        test_corpus = Corpus(testDir, self.bilboOptions)
        test_corpus.extract(1, "bibl")
        # CRF training data extraction
        trainer.crf.prepareTrain(test_corpus, 1, "evaldata_CRF.txt", 1, 1)
def labeling(line, modelname, options):
    """
    Label an input string and return the annotated XML as unicode text.
    Called by simpleLabeling or detailLabeling.

    line      : text to label; unicode is encoded to UTF-8, a byte string is
                written unchanged (assumed to be UTF-8 already -- confirm
                with callers)
    modelname : model name handed to the Bilbo constructor
    options   : parsed options; options.t ("bibl" or "note"), options.m and
                options.e are read here

    Each call works in its own pair of temporary directories, so concurrent
    calls do not share files. Both directories are removed before returning,
    whether or not annotation succeeded.

    Raises ValueError when options.t is neither "bibl" nor "note".
    """
    # Validate the corpus type first, before anything is created on disk.
    dtype = options.t
    if dtype == "bibl":
        typeCorpus = 1
    elif dtype == "note":
        typeCorpus = 2
    else:
        raise ValueError("unsupported data type %r: expected 'bibl' or 'note'" % (dtype,))

    dirModel = os.path.join(rootDir, 'model/corpus') + str(typeCorpus) + "/" + options.m + "/"

    # It's better to have secure tmp dirs for multiple threads.
    tmpDir = tempfile.mkdtemp(prefix='bilbo_labeling_tmp')
    resDir = tempfile.mkdtemp(prefix='bilbo_labeling_res_dir')
    try:
        bilbo = Bilbo(resDir, options, modelname)

        # Calling .encode() on a non-ASCII byte string makes Python 2 decode
        # it as ASCII first and fail, so only unicode input is encoded.
        if isinstance(line, unicode):
            payload = line.encode("utf8")
        else:
            payload = line

        # Temporary input file: a single entry wrapped in its list element.
        filename = os.path.join(tmpDir, 'tmp.xml')
        with open(filename, "w") as tmpFile:
            tmpFile.write('<list' + dtype.title() + '>\n')
            tmpFile.write('<' + dtype + '> ')
            tmpFile.write(payload)
            tmpFile.write(' </' + dtype + '>')
            tmpFile.write('\n</list' + dtype.title() + '>\n')

        if dtype == "note" and options.e:
            bilbo.annotate(tmpDir, dirModel, typeCorpus, 1)
        else:
            bilbo.annotate(tmpDir, dirModel, typeCorpus)

        # The result is read from 'tmp.xml' in resDir, decoded as UTF-8.
        with codecs.open(os.path.join(resDir, 'tmp.xml'), encoding='utf8') as resFile:
            tmp_str = resFile.read()
    finally:
        # Best-effort cleanup: either directory may still hold files, and a
        # cleanup problem must not hide the real error or discard the result.
        shutil.rmtree(tmpDir, ignore_errors=True)
        shutil.rmtree(resDir, ignore_errors=True)

    return tmp_str
print "\t input data folder where the data files are (training or labeling)" print " arg2 : <string>" print "\t output data folder where the result files are saved\n" else: if options.g == "simple": bilbo = Bilbo(str(args[1]), options, "crf_model_simple") elif options.g == "detail": bilbo = Bilbo(str(args[1]), options, "crf_model_detail") dtype = options.t if dtype == "bibl": typeCorpus = 1 elif dtype == "note": typeCorpus = 2 dirModel = os.path.join(rootDir, "model/corpus") + str(typeCorpus) + "/" + options.m + "/" if not os.path.exists(dirModel): os.makedirs(dirModel) if options.T: # training bilbo.train(str(args[0]), dirModel, typeCorpus) elif options.L: # labeling if dtype == "note" and options.e: bilbo.annotate(str(args[0]), dirModel, typeCorpus, 1) else: bilbo.annotate(str(args[0]), dirModel, typeCorpus) else: print "Please choose training(-T option) or labeling(-L option)" # simpleLabeling("Y.-M. KIM et al., An Extension of PLSA for Document Clustering, In Proceedings of ACM 17th Conference on Information and Knowledge Management, 2008.")
def annotate(self):
    """
    Turn the T option off and the L option on in self.bilboOptions, then
    annotate self.dirLabel with the model in self.dirModel; Bilbo is created
    with self.dirResult as its directory.
    """
    opts = self.bilboOptions
    opts.T = False
    opts.L = True

    labeler = Bilbo(self.dirResult, opts, "crf_model_simple")
    labeler.annotate(self.dirLabel, self.dirModel, 1)