Example #1
def NGramLangModel():
    # Load the raw corpus lazily and normalize it for language modelling.
    cl = Loader(MAIN_DIR + DS_DIR)
    f = cl.loadLarge('tb_kota_bywiki.txt', lazy_load=True)  # other available corpora: tb_berita_onlinemedia, tb_kota_bywiki
    w = cl.processRaw(f, to_lower=True)
    r = cl.rawForLangmodel(w, punct_remove=True, to_token=True)

    lms = NGramModels(ngram=2)
    # The njump parameter cannot yet be used with the modkn optimizer.
    models = lms.train(r, optimizer='modkn',
                       separate=False, njump=0, verbose=False)

    print("##########################################################")
Example #2
def NGramLangModel():
    cl = Loader('C:\\BimaNLP\\dataset\\')
    f = cl.loadLarge('tb_kota_bywiki.txt', lazy_load=True)  # other available corpora: tb_berita_onlinemedia, tb_kota_bywiki
    w = cl.processRaw(f, to_lower=True)
    r = cl.rawForLangmodel(w, punct_remove=True, to_token=True)

    # Toy corpus of pre-tokenized sentences; note that the corpus loaded
    # above (r) goes unused here and training runs on this dataset instead.
    dataset = [['saya', 'suka', 'kamu'],
               ['kamu', 'suka', 'saya'],
               ['saya', 'tidak', 'suka', 'jika', 'kamu', 'pergi', 'dengan', 'dia']]

    lms = NGramModels(ngram=2)
    # The njump parameter cannot yet be used with the modkn optimizer.
    models = lms.train(dataset, optimizer='modkn',
                       separate=False, njump=0, verbose=True)

    print("##########################################################")
Example #3
import random
import re
import sys
from collections import defaultdict
from operator import itemgetter

# Loader, intentReqParamLoader, and IntentClassifier are assumed to be
# imported from the surrounding BimaNLP package.


def fit(sentence,
        method,
        dataset_folder=None,
        json_data=None,
        json_ent=None,
        verbose=False):
    # Either json_data or dataset_folder must be supplied; the text-file
    # branch below reads its training data from <dataset_folder>/classifier/.
    if dataset_folder:
        train_dir = dataset_folder + 'classifier/'

    # Read the data
    train_data = []
    train_labels = []
    classes = []
    test_data = [sentence]
    test_labels = ['NONE']

    if verbose:
        print "Begin train to classifying sentence..."

    if json_data:
        classes = list(json_data.keys())

        if verbose:
            print("Using JSON as training data, so new classes were added; classes are now:\n", classes)
    else:
        classes = ['int_greetings', 'int_ask']

    ## As of 17 October 2016:
    ##  if the user adds the special character @ to the trained data, the system
    ##  automatically duplicates that sentence once per value of the referenced entity.
    regex = re.compile(r'@\w+')

    for z in classes:
        if json_data:
            if z.lower() != 'none':  # don't process the 'none' class
                f = json_data[z]["trained_data"]
        else:
            ld = Loader(train_dir)
            f = ld.loadLarge(z + '.txt', lazy_load=True)

        if z.lower() != 'none':
            label = z

            ttl = len(f)
            i = 0

            txtre = []
            for x in f:
                i += 1

                #### subroutine for auto-generating additional training data (v0.1) ####
                regex_string = re.search(regex, x.lower())

                if regex_string:
                    # Collect every distinct @placeholder in the sentence and
                    # gather the known values for each referenced entity.
                    xx = list(set(re.findall(regex, x.lower())))
                    ents = defaultdict(list)
                    for ii in range(len(xx)):
                        ent, ent_type = intentReqParamLoader(xx[ii][1:], json_ent)
                        for k, v in ent.items():
                            for it in v:
                                if it not in ents[xx[ii][1:]]:  # skip duplicate values
                                    ents[xx[ii][1:]].append(it)

                    # Emit one substituted training sentence per entity value,
                    # picking a random value each round via the shuffle.
                    for ii in ents.keys():
                        for iii in range(len(ents[ii])):
                            random.shuffle(ents[ii])

                            train_data.append(
                                re.sub(r'@' + ii, ents[ii][0], x))
                            train_labels.append(label)

                #### end subroutine ####
                else:
                    if verbose:
                        msg = "Processing train data {} of {}".format(i, ttl)
                        sys.stdout.write("\r {:<10}".format(msg))
                        sys.stdout.flush()

                    sen = x

                    if len(sen) >= 1:
                        train_data.append(sen.lower())
                        train_labels.append(label)
        if verbose:
            print "\n"

    ######################## Begin Training to Classifying Data ########################
    print "solvin intent using classfier:", method
    model = IntentClassifier(solver_algo=method)

    models = model.train(train_data, train_labels, max_df=1.0, minword=1)
    predicted_label = [models.predict(test_data)[0]]

    # Rank the class probabilities for the input sentence, highest first
    # (itemgetter is imported at the top of the module).
    predict_proba = sorted(zip(models.clf.classes_,
                               models.predict_proba(test_data)[0]),
                           key=itemgetter(1),
                           reverse=True)
    ####################################################################################

    if verbose:
        print "Hasil klasifikasi kalimat: %s , adalah: %s" % (sentence,
                                                              predicted_label)
        print "\n"

    return predicted_label, predict_proba
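A hedged usage sketch for fit(). Only the json_data layout with a 'trained_data' list per class is taken from the code above; the class names, sentences, and the 'svm' solver name are illustrative assumptions, so pass whatever algorithm name IntentClassifier(solver_algo=...) actually accepts:

# Hypothetical training data following the json_data[z]['trained_data'] schema.
json_data = {
    'int_greetings': {'trained_data': ['halo', 'selamat pagi', 'hai apa kabar']},
    'int_ask': {'trained_data': ['berapa harga tiketnya', 'dimana lokasi stasiun']},
}

label, proba = fit('selamat pagi semua',
                   method='svm',  # assumed solver name
                   json_data=json_data,
                   verbose=True)

print(label)  # e.g. ['int_greetings']
print(proba)  # (class, probability) pairs sorted from most to least likely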