예제 #1
0
def loadData(params, withhold, ffs, trainfile="train.xml", testfile="testcases.xml"):
    """
    loads the movie data

    arguments:
        params      : dict with several keys:
            load        : loading mode; either: 'extract' to load from
                        `params['extractFile']`, 'split' to load from
                        `params['splitFile']`, or None to extract features and
                        save to `params['extractFile']` and/or
                        `params['splitFile']`.
            extractFile : file to load/save extracted features to/from,
                        depending on loading mode (None disables saving)
            splitFile   : file to load/save split data to/from,
                        depending on loading mode (None disables saving)
            splitMethod : forwarded unchanged to `splitData`; only read when
                        `withhold` is non-zero
        withhold    : number of data points to withhold for cross-validation;
                      0 means the test set is read from `testfile` instead
        ffs         : list of feature functions
        trainfile   : path to training file (train.xml)
        testfile    : path to test cases file

    returns:
        tuple (X_train, y_train, train_ids, X_test, y_test, test_ids).
        When `withhold` is 0 and the data is not loaded from the split file,
        `train_ids` is returned as an empty list.

    raises:
        ValueError  : if `params['load']` is not 'split', 'extract' or None
    """
    mode = params['load']

    # Reject unknown modes up front; otherwise no branch below binds `fds`
    # and the function fails later with an unrelated NameError.
    if mode not in ('split', 'extract', None):
        raise ValueError(
            "params['load'] must be 'split', 'extract' or None, got %r" % (mode,))

    # NOTE: single-argument print(...) produces the same output as the print
    # statement on Python 2 and is also valid Python 3.

    # load data from `params['splitFile']`
    if mode == 'split':
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(params['splitFile'])
        print("loaded %d fds" % len(train_ids))
        print("withholding %d of %d fds" % (len(test_ids), len(train_ids)))
        return X_train, y_train, train_ids, X_test, y_test, test_ids

    if mode is None:
        # load data from scratch
        fds, targets, train_ids = regress.extract_feats_helper(ffs, trainfile)
        print("loaded %d fds" % len(fds))
        if params['extractFile'] is not None:
            pickle((fds, targets, train_ids), params['extractFile'])
    else:
        # mode == 'extract': load data from `params['extractFile']`,
        # but split it anew
        fds, targets, train_ids = unpickle(params['extractFile'])

    if withhold == 0:
        # load the test data from the testcases file
        X_train, feat_dict = regress.make_design_mat(fds)
        y_train = np.array(targets)
        # reuse the training feature dict when extracting the test features
        X_test, _, y_test, test_ids = regress.extract_feats(
            ffs, testfile, global_feat_dict=feat_dict)
        # NOTE(review): the training ids loaded above are discarded here;
        # confirm this is intentional before relying on `train_ids`.
        train_ids = []
    else:
        # withhold some of the training data into test data
        fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
            fds, targets, train_ids, withhold, params['splitMethod'])
        X_train, feat_dict = regress.make_design_mat(fds)
        # build the test matrix against the training feature dict
        X_test, _ = regress.make_design_mat(fdsTest, feat_dict)
        y_train = np.array(targets)
        y_test = np.array(targetsTest)

    if params['splitFile'] is not None:
        pickle((X_train, y_train, train_ids, X_test, y_test, test_ids), params['splitFile'])

    return X_train, y_train, train_ids, X_test, y_test, test_ids
예제 #2
0
 def postproc(feat_dict):
     """Pass `feat_dict` to `rs.make_design_mat` and return its result unchanged."""
     design = rs.make_design_mat(feat_dict)
     return design
예제 #3
0
파일: misc.py 프로젝트: rchen152/CS181
                    assert False
                else:
                    curr_inst = [line]
                    in_instance = True
            elif end_tag in line:
                curr_inst.append(line)
                movies.append(util.MovieData(ET.fromstring("".join(curr_inst))))
                curr_inst = []
                in_instance = False
            elif in_instance:
                curr_inst.append(line)
    return movies

# Scratch check: run `regression_starter.make_design_mat` on a small input
# and print both values it returns.

# Earlier experiment, kept for reference (prints features of one real movie):
# movies = get_movies('train.xml')
# print regression_starter.metadata_feats(movies[0])
# print '-------'
# print regression_starter.unigram_feats(movies[0])

# Two hand-written feature dicts with partially overlapping keys.
fds = [{'hi':1,'bye':0,'foo':3},{'hi':1,'hello':1,'foo':2,'bar':0}]
# Alternative input built from real data, kept for reference:
# movies = get_movies('train.xml')
# fd1 = regression_starter.metadata_feats(movies[0])
# fd2 = regression_starter.metadata_feats(movies[1])
# print fd1
# print fd2
# print '------'
# fds = [fd1,fd2]

# Bind the second result to `feat_dict` rather than `dict`, so the builtin
# `dict` is not shadowed for the rest of the module.
X, feat_dict = regression_starter.make_design_mat(fds)
# Single-argument print(...) gives the same output on Python 2 and also
# parses on Python 3.
print(X)
print('------')
print(feat_dict)