def test_essays():
    # Print the number of TRAINING and VALIDATION essays in each essay CSV.
    essays_paths = glob.glob("data/csv/*.csv")
    for essaypath in sorted(essays_paths):
        essayname = essaypath.split("/")[-1].split(".")[0]
        essays = EssayCollection(essaypath, essayname)
        essay_types = essays.meta_data()["essay_type"]
        print essayname, "TR", (essay_types == "TRAINING").sum(), "VA", (
            essay_types == "VALIDATION").sum()
Example no. 3
      <Student_Test_List>
         <Student_Test_Details Student_Test_ID="%(test_id)s" Grade="%(final_score)s" Total_CR_Item_Count="1">
            <Item_DataPoint_List>
               <Item_DataPoint_Details Item_ID="%(item_id)s" Data_Point="" Item_No="1" Final_Score="%(final_score)d">
                  <Read_Details Read_Number="1" Score_Value="%(final_score)s" Reader_ID="1" Date_Time="20141026134100" />
               </Item_DataPoint_Details>
            </Item_DataPoint_List>
         </Student_Test_Details>
      </Student_Test_List>
   </Student_Details>
"""

for ensemblepath in glob.glob("ensemble/*.csv"):
    preds = pd.read_csv(ensemblepath)
    item_id = os.path.split(ensemblepath)[-1][:5]
    essays = EssayCollection("data/csv/" + item_id + "_1.csv")
    realscores = essays.meta_data()["score3"]
    scores = [col for col in preds.columns if col.find("prob") > 0]

    # optimize probability
    optpar = np.array([0.0, 0.0])
    for grade, col in enumerate(scores):
        trainpred = np.array(preds.ix[preds.essay_type == "TRAINING", col])
        trainreal = np.array(
            (realscores[preds.essay_type == "TRAINING"] == grade).map(int))

        def fnopt(par):
            loss = log_loss(trainreal, logitinv(par[0] * trainpred + par[1]))
            return loss

        opt = fmin(fnopt, np.array([0.0, 0.0]))
        optpar += opt
    
    optpar /= len(scores)
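
# A minimal sketch of the inverse-logit helper used in the calibration above.
# The project's own lib.utils implementation is not shown in this listing, so
# this is an assumption about its behavior, not the original definition.
def logitinv(x):
    # standard inverse logit (sigmoid): maps a real-valued score into (0, 1)
    return 1.0 / (1.0 + np.exp(-x))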
import glob
import os
import numpy as np
import pandas as pd

from essay.essay import EssayCollection
from lib import kappa
from lib.utils import logit
from scipy.optimize import fmin, fmin_bfgs, fmin_cg, fmin_ncg
from sklearn import cross_validation
from sklearn.linear_model import ElasticNet

results = open("results.csv", "w")
results.write("essay,model,score\n")
for essaypath in sorted(glob.glob("data/csv/*.csv")):
    print essaypath,
    essays = EssayCollection(essaypath)

    essayname = os.path.split(essaypath)[-1][:-4]
    predictions_all_list = []
    trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
    for modelpath in glob.glob("models/*" + essayname + "*"):
        modelname = os.path.split(modelpath)[-1]
        predictions = pd.read_csv(modelpath)
        predictions["modelname"] = modelname
        results.write("%s,%s,%.6f\n" %
                      (essayname, modelname[8:],
                       kappa.quadratic_weighted_kappa(
                           essays.meta_data()["score3"][trainset],
                           predictions["pred_scorer_3"][trainset])))
        predictions_all_list.append(predictions.copy())

    results.write("%s,%s,%.6f\n" % (essayname, "HUMAN", kappa.quadratic_weighted_kappa(essays.meta_data()["score1"][trainset],essays.meta_data()["score2"][trainset])))
    
    predictions_all = pd.concat(predictions_all_list)
    scores = [col for col in predictions_all.columns if col.find("grade") > 0]    
def model_generic_1ofK_clas(pipeline,
                            model_name,
                            model_f,
                            essays_paths,
                            parallel=False):
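    # For each essay set: build the feature matrix, then fit one binary
    # (one-vs-rest) model per grade of scorer 3 using model_f, with 7-fold
    # cross-validation on the TRAINING rows plus a final TRAINING->VALIDATION
    # split, and save the per-grade predictions and the argmax score to the
    # MODEL_PATHS file for that essay set and pipeline.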
    print model_name
    for essaypath in sorted(essays_paths):
        print essaypath,
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline[
            "name"] + "_" + model_name
        if os.path.exists(save_as):
            print "Skipping"
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)

        predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])})
        trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
        testset = np.where(essays.meta_data().essay_type == "")[0]

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        y_all = np.array(essays.meta_data()["score3"].map(int), dtype=np.int32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))

        print "orig dimensions", X_all.shape[1],
        X_all = X_all[:, non_duplicated]
        print "reduced dimensions", X_all.shape[1],

        predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])})
        trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
        testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0]

        for scorer in [3]:
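            # 7-fold CV over the TRAINING rows, plus one extra split that
            # trains on all TRAINING rows and predicts the VALIDATION rows.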
            kf = cross_validation.KFold(len(trainset),
                                        n_folds=7,
                                        random_state=0)
            kf = [(trainset[tr], trainset[te])
                  for tr, te in kf] + [(trainset, testset)]

            for grade in sorted(essays.meta_data()["score%d" %
                                                   (scorer)].unique()):
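                # Binary one-vs-rest target: 1 where scorer 3 gave this
                # grade, 0 otherwise.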
                y_all = np.array(essays.meta_data()["score%d" % (scorer)].map(
                    lambda y: 1 if int(y) == int(grade) else 0))

                pred_name = "scorer_%d_grade_%d" % (scorer, grade)
                predictions[pred_name] = 0

                if parallel:
                    pool = Pool(processes=4)
                    essay_sets = pool.map(cv, [[kf, X_all, y_all, model_f, n]
                                               for n in range(8)])
                    pool.close()

                    for n, essay_set in enumerate(essay_sets):
                        te_ind = kf[n][1]
                        predictions.ix[te_ind, pred_name] = essay_set[:, 1]
                else:
                    for n, (tr_ind, te_ind) in enumerate(kf):
                        predictions.ix[te_ind, pred_name] = model_f(
                            X_all[tr_ind, :],
                            y_all[tr_ind],
                            X_all[te_ind, :],
                            feature_names=essays.feature_names)

        predictions = predictions.ix[:, sorted(predictions.columns)]
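        # Final predicted score = argmax over the sorted scorer_3
        # grade-probability columns.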
        predictions["pred_scorer_3"] = np.array(
            predictions.
            ix[:, [c for c in predictions.columns
                   if c.startswith("scorer_3")]]).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print kappa.quadratic_weighted_kappa(
            essays.meta_data()["score3"][trainset],
            predictions["pred_scorer_3"][trainset])
Example no. 10
def model_generic_DBN(pipeline,
                      model_name,
                      model_f,
                      essays_paths,
                      parallel=False):
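    # Same per-essay-set layout as model_generic_1ofK_clas, but fits a single
    # multi-class network (model_f) on standardized, rescaled features and
    # saves its predict_proba output as the per-grade probability columns.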
    print model_name
    for essaypath in sorted(essays_paths):
        print essaypath,
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline[
            "name"] + "_" + model_name
        if os.path.exists(save_as):
            print "Skipping"
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)

        predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])})
        trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
        testset = np.where(essays.meta_data().essay_type == "")[0]

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        y_all = np.array(essays.meta_data()["score3"].map(int), dtype=np.int32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))

        print "orig dimensions", X_all.shape[1],
        X_all = X_all[:, non_duplicated]
        print "reduced dimensions", X_all.shape[1],

        predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])})
        trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0]
        testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0]

        scorer = 3
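        # 7-fold CV over the TRAINING rows, plus one extra split that trains
        # on all TRAINING rows and predicts the VALIDATION rows.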
        kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0)
        kf = [(trainset[tr], trainset[te])
              for tr, te in kf] + [(trainset, testset)]

        scores = sorted(essays.meta_data()["score%d" % (scorer)].unique())
        predictions = np.zeros((essays.meta_data().shape[0], len(scores)))

        model_f.layer_sizes[0] = X_all.shape[1]
        model_f.layer_sizes[2] = len(scores)
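        # layer_sizes is assumed to be [input, hidden, output]: resize the
        # input layer to the feature count and the output layer to the number
        # of distinct grades.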

        try:
            for n, (tr_ind, te_ind) in enumerate(kf):
                print n
                scaler = StandardScaler()
                _ = scaler.fit(X_all[tr_ind, :])
                X_tr = scaler.transform(X_all[tr_ind, :]) / 50.0
                X_te = scaler.transform(X_all[te_ind, :]) / 50.0
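                # Features are standardized with training-fold statistics and
                # shrunk by a factor of 50 before fitting the network.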
                model_f.fit(X_tr, y_all[tr_ind])
                print kappa.quadratic_weighted_kappa(
                    essays.meta_data()["score3"][tr_ind],
                    model_f.predict(X_tr))
                print kappa.quadratic_weighted_kappa(
                    essays.meta_data()["score3"][te_ind],
                    model_f.predict(X_te))
                predictions[te_ind, :] = model_f.predict_proba(X_te)
        except Exception:
            # If training fails on a fold, keep the zero-filled probability
            # rows and fall through to saving whatever was computed.
            pass

        predictions = pd.DataFrame(predictions)
        predictions.columns = [
            "scorer_%d_grade_%d" % (scorer, grade) for grade in scores
        ]
        predictions["pred_scorer_3"] = np.array(predictions).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print kappa.quadratic_weighted_kappa(
            essays.meta_data()["score3"][trainset],
            predictions["pred_scorer_3"][trainset])