def NGramLangModel():
    cl = Loader(MAIN_DIR + DS_DIR)
    f = cl.loadLarge('tb_kota_bywiki.txt', lazy_load=True)  # alternatives: tb_berita_onlinemedia, tb_kota_bywiki
    w = cl.processRaw(f, to_lower=True)
    r = cl.rawForLangmodel(w, punct_remove=True, to_token=True)

    lms = NGramModels(ngram=2)
    # The njump parameter cannot be used with the modkn optimizer yet.
    models = lms.train(r, optimizer='modkn',
                       separate=False, njump=0, verbose=False)
    print "##########################################################"
def NGramLangModel():
    cl = Loader('C:\\BimaNLP\\dataset\\')
    f = cl.loadLarge('tb_kota_bywiki.txt', lazy_load=True)  # alternatives: tb_berita_onlinemedia, tb_kota_bywiki
    w = cl.processRaw(f, to_lower=True)
    r = cl.rawForLangmodel(w, punct_remove=True, to_token=True)

    # This variant trains on a small inline toy dataset; the corpus loaded above (r) is not used.
    dataset = [['saya', 'suka', 'kamu'],
               ['kamu', 'suka', 'saya'],
               ['saya', 'tidak', 'suka', 'jika', 'kamu', 'pergi', 'dengan', 'dia']]

    lms = NGramModels(ngram=2)
    # The njump parameter cannot be used with the modkn optimizer yet.
    models = lms.train(dataset, optimizer='modkn',
                       separate=False, njump=0, verbose=True)
    print "##########################################################"
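Both listings train a bigram model with BimaNLP's NGramModels, using the modified Kneser-Ney ('modkn') optimizer, on sentences represented as lists of tokens. As a rough, self-contained illustration of what such a model estimates from the toy dataset, the sketch below computes plain maximum-likelihood bigram probabilities; it deliberately omits the Kneser-Ney smoothing that NGramModels applies, and none of its names belong to the library.

from collections import defaultdict

def bigram_mle(sentences):
    # Count histories and bigrams, padding each sentence with <s> and </s>.
    uni, bi = defaultdict(int), defaultdict(int)
    for sent in sentences:
        toks = ['<s>'] + sent + ['</s>']
        for w in toks[:-1]:
            uni[w] += 1
        for a, b in zip(toks[:-1], toks[1:]):
            bi[(a, b)] += 1
    # Maximum-likelihood estimate P(b | a) = count(a, b) / count(a).
    return dict(((a, b), float(c) / uni[a]) for (a, b), c in bi.items())

dataset = [['saya', 'suka', 'kamu'],
           ['kamu', 'suka', 'saya'],
           ['saya', 'tidak', 'suka', 'jika', 'kamu', 'pergi', 'dengan', 'dia']]
probs = bigram_mle(dataset)
print(probs[('saya', 'suka')])  # 1/3: 'saya' occurs 3 times as a history, once followed by 'suka'
print(probs[('suka', 'kamu')])  # 1/3: 'suka' occurs 3 times as a history, once followed by 'kamu'

Smoothing methods such as modified Kneser-Ney redistribute probability mass to unseen n-grams, which is why the library exposes them as optimizers instead of relying on raw counts like this sketch.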
import re
import sys
import random
from collections import defaultdict
from operator import itemgetter
# Loader, IntentClassifier, and intentReqParamLoader are provided by the BimaNLP package.

def fit(sentence, method, dataset_folder=None, json_data=None, json_ent=None, verbose=False):
    if dataset_folder:
        train_dir = dataset_folder + 'classifier/'

    # Read the data
    train_data = []
    train_labels = []
    classes = []

    test_data = [sentence]
    test_labels = ['NONE']

    if verbose:
        print "Begin training to classify the sentence..."

    if json_data:
        classes = json_data.keys()
        if verbose:
            print "Using JSON as training data, so classes become:", classes
    else:
        classes = ['int_greetings', 'int_ask']

    ## As of 17 October 2016: if the user adds the special character @ to a trained
    ## sentence, the system automatically repeats that sentence once for every
    ## matching entity value.
    regex = re.compile(r'@\w+')

    for z in classes:
        if json_data:
            if z.lower() != 'none':  # Don't process the 'none' class
                f = json_data[z]["trained_data"]
        else:
            ld = Loader(train_dir)
            f = ld.loadLarge(z + '.txt', lazy_load=True)

        if z.lower() != 'none':
            label = z
            ttl = len(f)
            i = 0
            for x in f:
                i += 1
                #### Sub-routine to auto-generate additional trained data, v0.1 ####
                regex_string = re.search(regex, x.lower())
                if regex_string:
                    xx = list(set(re.findall(regex, x.lower())))
                    ents = defaultdict(list)
                    for ii in range(len(xx)):
                        ent, ent_type = intentReqParamLoader(xx[ii][1:], json_ent)
                        for k, v in ent.iteritems():
                            for it in v:
                                if it not in ents[xx[ii][1:]]:  # avoid duplicate entity values
                                    ents[xx[ii][1:]].append(it)
                    for ii in ents.keys():
                        for iii in range(len(ents[ii])):
                            # Substitute a randomly chosen entity value for the @placeholder.
                            random.shuffle(ents[ii])
                            train_data.append(re.sub(r'@' + ii, ents[ii][0], x))
                            train_labels.append(label)
                #### End sub ####
                else:
                    if verbose:
                        msg = "Processing train data {} of {}".format(i, ttl)
                        sys.stdout.write("\r {:<10}".format(msg))
                        sys.stdout.flush()
                    sen = x
                    if len(sen) >= 1:
                        train_data.append(sen.lower())
                        train_labels.append(label)

    if verbose:
        print "\n"

    ######################## Begin training to classify data ########################
    print "Solving intent using classifier:", method
    model = IntentClassifier(solver_algo=method)
    models = model.train(train_data, train_labels, max_df=1.0, minword=1)

    predicted_label = [models.predict(test_data)[0]]
    predict_proba = sorted(zip(models.clf.classes_, models.predict_proba(test_data)[0]),
                           key=itemgetter(1), reverse=True)
    ##################################################################################

    if verbose:
        print "Classification result for sentence: %s is: %s" % (sentence, predicted_label)
        print "\n"

    return predicted_label, predict_proba
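The heavy lifting in fit() happens inside IntentClassifier, whose internals are not shown here. Conceptually, the training step vectorizes the labelled sentences and fits a probabilistic classifier over them, and predict / predict_proba are then called on that model. The sketch below reproduces that flow with scikit-learn, assuming a TF-IDF vectorizer plus multinomial Naive Bayes; the pipeline choice and the toy sentences are illustrative assumptions, not the actual IntentClassifier implementation or BimaNLP training data.

# Illustrative only: the real solver depends on the `method` argument passed to fit().
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

train_data = ['halo apa kabar', 'selamat pagi',                      # int_greetings examples
              'dimana letak kota bandung', 'apa ibukota indonesia']  # int_ask examples
train_labels = ['int_greetings', 'int_greetings', 'int_ask', 'int_ask']

vec = TfidfVectorizer()
X = vec.fit_transform(train_data)            # sentences -> sparse TF-IDF matrix
clf = MultinomialNB().fit(X, train_labels)   # fit the classifier on the labelled vectors

test = vec.transform(['selamat pagi semua'])
print(clf.predict(test)[0])                  # most likely intent label
# Per-class probabilities, highest first, mirroring the predict_proba sorting in fit().
print(sorted(zip(clf.classes_, clf.predict_proba(test)[0]), key=lambda p: p[1], reverse=True))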