Пример #1
0
	def train(self, sentences, labels, cross_validation = False):
		"""Build per-word samples from labelled sentences and train on them.

		Every word is lower-cased and handed to
		``self._construct_featurespace`` together with a window of the
		preceding (word, label) pairs of the same sentence; the window
		never holds more than ``self.chain_len`` pairs.  ``labels[i][j]``
		is the label of word ``j`` in sentence ``i``.

		With ``cross_validation`` set, ``-v 4`` is part of the options and
		the result of ``svm.train`` is discarded; otherwise the result is
		stored in ``self._svm_model``.
		"""
		samples = []
		targets = []

		for i, sentence in enumerate(sentences):
			history = []
			for j, word in enumerate(sentence):
				token = word.lower()
				space = self._construct_featurespace(token, history)

				# Slide the window: add the current pair, drop the oldest.
				history.append((token, labels[i][j]))
				if len(history) > self.chain_len:
					history.pop(0)

				samples.append(space.featureset)

			targets.extend(labels[i])

		prob = svm.problem(targets, samples)
		options = '-c 1 -v 4 -s 4' if cross_validation else '-c 1 -s 4'
		result = svm.train(prob, svm.parameter(options))
		if not cross_validation:
			self._svm_model = result
Пример #2
0
    def train(self, sentences, labels, cross_validation=False):
        """Build per-word samples from labelled sentences and train on them.

        Every word is lower-cased and handed to
        ``self._construct_featurespace`` together with a window of the
        preceding (word, label) pairs of the same sentence; the window
        never holds more than ``self.chain_len`` pairs.  ``labels[i][j]``
        is the label of word ``j`` in sentence ``i``.

        With ``cross_validation`` set, ``-v 4`` is part of the options and
        the result of ``svm.train`` is discarded; otherwise the result is
        stored in ``self._svm_model``.
        """
        samples = []
        targets = []

        for i, sentence in enumerate(sentences):
            history = []
            for j, word in enumerate(sentence):
                token = word.lower()
                space = self._construct_featurespace(token, history)

                # Slide the window: add the current pair, drop the oldest.
                history.append((token, labels[i][j]))
                if len(history) > self.chain_len:
                    history.pop(0)

                samples.append(space.featureset)

            targets.extend(labels[i])

        prob = svm.problem(targets, samples)
        options = '-c 1 -v 4 -s 4' if cross_validation else '-c 1 -s 4'
        result = svm.train(prob, svm.parameter(options))
        if not cross_validation:
            self._svm_model = result
Пример #3
0
    def train(self, x, y, biased=False):
        """Train a liblinear model on samples ``x`` with labels ``y``.

        Each sample is a mapping from feature name to value.  Feature names
        and labels are replaced by the ids returned by
        ``self._features.setId`` and ``self._labels.setId``.  When
        ``self._labels.count()`` is 2, label id 1 becomes +1 and every other
        id -1, and the options use ``-s 2``; otherwise ``-s 4`` is used.

        ``biased`` controls the ``-B`` option: a falsy value adds nothing,
        ``True`` adds ``-B 1`` and any other truthy value is passed through
        as the bias value.  The trained model is stored in ``self._model``.
        """
        data = [
            {self._features.setId(name): sample[name] for name in sample}
            for sample in x
        ]
        labels = [self._labels.setId(label) for label in y]

        # ``-B`` expects a number.  Formatting the boolean ``True`` directly
        # would produce "-B True", which is not a valid bias value.
        bias_option = ""
        if biased:
            bias_option = " -B {0}".format(1 if biased is True else biased)

        if self._labels.count() == 2:
            labels = [1 if label == 1 else -1 for label in labels]
            solver_options = "-c 1 -s 2 -q"
        else:
            solver_options = "-c 1 -s 4 -q"

        param = liblinear.parameter(solver_options + bias_option)
        prob = liblinear.problem(labels, data)
        self._model = liblinear.train(prob, param)
Пример #4
0
    def train(self, x, y, biased=False):
        """Train a liblinear model on samples ``x`` with labels ``y``.

        Each sample is a mapping from feature name to value.  Feature names
        and labels are replaced by the ids returned by
        ``self._features.setId`` and ``self._labels.setId``.  When
        ``self._labels.count()`` is 2, label id 1 becomes +1 and every other
        id -1, and the options use ``-s 2``; otherwise ``-s 4`` is used.

        ``biased`` controls the ``-B`` option: a falsy value adds nothing,
        ``True`` adds ``-B 1`` and any other truthy value is passed through
        as the bias value.  The trained model is stored in ``self._model``.
        """
        data = [
            {self._features.setId(name): sample[name] for name in sample}
            for sample in x
        ]
        labels = [self._labels.setId(label) for label in y]

        # ``-B`` expects a number.  Formatting the boolean ``True`` directly
        # would produce '-B True', which is not a valid bias value.
        bias_option = ''
        if biased:
            bias_option = ' -B {0}'.format(1 if biased is True else biased)

        if self._labels.count() == 2:
            labels = [1 if label == 1 else -1 for label in labels]
            solver_options = '-c 1 -s 2 -q'
        else:
            solver_options = '-c 1 -s 4 -q'

        param = liblinear.parameter(solver_options + bias_option)
        prob = liblinear.problem(labels, data)
        self._model = liblinear.train(prob, param)
Пример #5
0
    def _complete_training(self, debug=False):
        """ Train liblinear on the collected samples and extract the classifier

        If ``self.str_label_function`` is set, it is evaluated with ``eval``
        and called to produce ``self.labels``.  The option string is built
        from ``self.complexity``, ``self.tolerance``, ``self.alg_num`` and
        ``self.offset`` plus one ``-w<i>`` entry per element of
        ``self.weight``; ``-q`` is appended unless ``self.debug`` is set.
        The trained model is handed to
        ``self.calculate_classification_vector``.

        NOTE(review): the ``debug`` argument is never read, only
        ``self.debug`` is.
        NOTE(review): ``eval`` executes arbitrary code, so
        ``str_label_function`` must come from a trusted source.
        """
        if self.str_label_function is not None:
            self.label_function = eval(self.str_label_function)
            self.labels = self.label_function()

        solver_values = (self.complexity, self.tolerance, self.alg_num,
                         self.offset)
        options = "-c %.42f  -e %.42f -s %d -B %d" % solver_values
        # One " -w<index> <weight>" entry per class weight.
        options = options + "".join(
            " -w%d %.42f" % (index, weight)
            for index, weight in enumerate(self.weight))
        if not self.debug:
            options = options + " -q"
            self._log("Liblinear is now quiet!")

        import liblinearutil

        liblinear_param = liblinearutil.parameter(options)
        liblinear_problem = liblinearutil.problem(self.labels, self.samples)
        trained = liblinearutil.train(liblinear_problem, liblinear_param)

        self.calculate_classification_vector(trained)
        if self.debug:
            print(self.print_w)
            print(self.b)
Пример #6
0
 def train(self):
     """Train ``self.model`` from ``self.labels`` and ``self.contexts``.

     ``self.parameters`` is the option string given to ``parameter``;
     progress messages are written to stderr.
     """
     sys.stderr.write('creating training problem...')
     training_problem = problem(self.labels, self.contexts)
     progress = 'done\ntraining with option(s) "' + self.parameters + '"...'
     sys.stderr.write(progress)
     options = parameter(self.parameters)
     self.model = train(training_problem, options)
     sys.stderr.write('done\n')
Пример #7
0
def _lib_train_liblinear(user_tfidf, num_pos, num_neg, ignore):
    """Train with options "-s 0" on the converted tf-idf rows.

    ``_convert_to_sparse_matrix`` returns the rows together with the
    positive and negative counts to use; the label list is that many +1
    entries followed by that many -1 entries.  Returns the trained model.
    """
    param = parameter("-s 0")
    converted = _convert_to_sparse_matrix(user_tfidf, num_pos, num_neg, ignore)
    sparse_rows, positives, negatives = converted
    targets = [1] * positives + [-1] * negatives
    return train(problem(targets, sparse_rows), param)
Пример #8
0
    def train(self, data_train, data_dev):
        """Train Minitagger on ``data_train``; optionally score ``data_dev``.

        Features are extracted for the labeled instances of ``data_train``
        and passed to liblinear with the ``-q`` option.  Afterwards the
        feature extractor is switched out of training mode.

        Unless ``self.quiet`` is set, data statistics and the training time
        are printed and, when ``data_dev`` is not None, the accuracy returned
        by ``self.predict(data_dev)``.
        """
        start_time = time.time()
        assert self.__feature_extractor.is_training  # Assert untrained

        # Extract features (only labeled instances) and pass them to liblinear.
        label_list, features_list, _ = \
            self.__feature_extractor.extract_features(data_train, False, [])
        if not self.quiet:
            print("{0} labeled instances (out of {1})".format(
                len(label_list), data_train.num_instances))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} observation types".format(
                len(data_train.observation_count)))
            print("\"{0}\" feature template".format(
                self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                self.__feature_extractor.num_feature_types()))
        problem = liblinearutil.problem(label_list, features_list)
        self.__liblinear_model = \
            liblinearutil.train(problem, liblinearutil.parameter("-q"))
        self.__feature_extractor.is_training = False

        if self.quiet:
            return

        num_seconds = int(math.ceil(time.time() - start_time))
        print("Training time: {0}".format(
            str(datetime.timedelta(seconds=num_seconds))))
        if data_dev is not None:
            # Silence predict() while scoring the dev set.
            quiet_value = self.quiet
            self.quiet = True
            try:
                _, acc = self.predict(data_dev)
            finally:
                # Restore the flag even if prediction fails, so the tagger
                # is not left silenced.
                self.quiet = quiet_value
            print("Dev accuracy: {0:.3f}%".format(acc))
Пример #9
0
    def train(self, data_train, data_dev):
        """Train Minitagger on ``data_train``; optionally score ``data_dev``.

        Features are extracted for the labeled instances of ``data_train``
        and passed to liblinear with the ``-q`` option.  Afterwards the
        feature extractor is switched out of training mode.

        Unless ``self.quiet`` is set, data statistics and the training time
        are printed and, when ``data_dev`` is not None, the accuracy returned
        by ``self.predict(data_dev)``.
        """
        start_time = time.time()
        assert self.__feature_extractor.is_training  # Assert untrained

        # Extract features (only labeled instances) and pass them to liblinear.
        label_list, features_list, _ = \
            self.__feature_extractor.extract_features(data_train, False, [])
        if not self.quiet:
            print("{0} labeled instances (out of {1})".format(
                    len(label_list), data_train.num_instances))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} observation types".format(
                    len(data_train.observation_count)))
            print("\"{0}\" feature template".format(
                    self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                    self.__feature_extractor.num_feature_types()))
        problem = liblinearutil.problem(label_list, features_list)
        self.__liblinear_model = \
            liblinearutil.train(problem, liblinearutil.parameter("-q"))
        self.__feature_extractor.is_training = False

        if self.quiet:
            return

        num_seconds = int(math.ceil(time.time() - start_time))
        print("Training time: {0}".format(
                str(datetime.timedelta(seconds=num_seconds))))
        if data_dev is not None:
            # Silence predict() while scoring the dev set.
            quiet_value = self.quiet
            self.quiet = True
            try:
                _, acc = self.predict(data_dev)
            finally:
                # Restore the flag even if prediction fails, so the tagger
                # is not left silenced.
                self.quiet = quiet_value
            print("Dev accuracy: {0:.3f}%".format(acc))
Пример #10
0
def train_liblinear(args):
    """Train a model with options '-s 0' and save it.

    ``args`` is ``[model_name, gold_dir, dir1, dir2, ...]``.  Vectors and
    predicates come from ``get_data(gold_dir, dirs)``; every predicate is
    converted with ``num_to_class`` to obtain the label list.  The trained
    model is written with ``save_model(model_name, model)``.
    """
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # liblinear's ``problem`` cannot take as the label sequence.
    classes = [num_to_class(predicate) for predicate in predicates]
    prob = problem(classes, vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
Пример #11
0
    def _complete_training(self, debug=False):
        """ Train liblinear on the collected samples and extract the classifier

        If ``self.str_label_function`` is set, it is evaluated with ``eval``
        and called to produce ``self.labels``.  The option string is built
        from ``self.complexity``, ``self.tolerance``, ``self.alg_num`` and
        ``self.offset`` plus one ``-w<i>`` entry per element of
        ``self.weight``; ``-q`` is appended unless ``self.debug`` is set.
        The trained model is handed to
        ``self.calculate_classification_vector``.

        NOTE(review): the ``debug`` argument is never read, only
        ``self.debug`` is.
        NOTE(review): ``eval`` executes arbitrary code, so
        ``str_label_function`` must come from a trusted source.
        """
        if self.str_label_function is not None:
            self.label_function = eval(self.str_label_function)
            self.labels = self.label_function()

        solver_values = (self.complexity, self.tolerance, self.alg_num,
                         self.offset)
        options = "-c %.42f  -e %.42f -s %d -B %d" % solver_values
        # One " -w<index> <weight>" entry per class weight.
        options = options + "".join(
            " -w%d %.42f" % (index, weight)
            for index, weight in enumerate(self.weight))
        if not self.debug:
            options = options + " -q"
            self._log("Liblinear is now quiet!")

        import liblinearutil

        liblinear_param = liblinearutil.parameter(options)
        liblinear_problem = liblinearutil.problem(self.labels, self.samples)
        trained = liblinearutil.train(liblinear_problem, liblinear_param)

        self.calculate_classification_vector(trained)
        if self.debug:
            print(self.print_w)
            print(self.b)
Пример #12
0
def train_liblinear(args):
    """Train a model with options '-s 0' and save it.

    ``args`` is ``[model_name, gold_dir, dir1, dir2, ...]``.  Vectors and
    predicates come from ``get_data(gold_dir, dirs)``; every predicate is
    converted with ``num_to_class`` to obtain the label list.  The trained
    model is written with ``save_model(model_name, model)``.
    """
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # liblinear's ``problem`` cannot take as the label sequence.
    classes = [num_to_class(predicate) for predicate in predicates]
    prob = problem(classes, vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
Пример #13
0
    def train(self, data_train, data_test):
        """
        Train Minitagger on the given train data and, if test data is given,
        evaluate the trained model on it.

        @type data_train: SequenceData
        @param data_train: the training data set
        @type data_test: SequenceData
        @param data_test: the test data set, or None to skip the evaluation
        @return: the (f1score, precision, recall) tuple produced by
            report_fscore for the test predictions, or (None, None, None)
            when data_test is None
        """

        # keep the training start timestamp
        start_time = time.time()
        assert (self.__feature_extractor.is_training
                ), "In order to train, is_training flag should be True"

        # Extract features only for labeled instances from data_train
        label_list, features_list, _ = \
            self.__feature_extractor.extract_features(data_train, False, [])
        # print some useful information about the data
        if not self.quiet:
            print("{0} labeled words (out of {1})".format(
                len(label_list), data_train.num_of_words))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} word types".format(len(data_train.word_count)))
            print("\"{0}\" feature template".format(
                self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                self.__feature_extractor.num_feature_types()))
        # define the problem from the extracted labels and features
        problem = liblinearutil.problem(label_list, features_list)
        # train the model (-q puts liblinear in quiet mode)
        self.__liblinear_model = liblinearutil.train(
            problem, liblinearutil.parameter("-q"))
        # training is done, set is_training to False, so that prediction can
        # be done
        self.__feature_extractor.is_training = False

        if not self.quiet:
            # report how long the training lasted
            num_seconds = int(math.ceil(time.time() - start_time))
            print("Training time: {0}".format(
                str(datetime.timedelta(seconds=num_seconds))))

        if data_test is None:
            # Nothing to evaluate.  The scores used to be returned without
            # ever being assigned here, which raised UnboundLocalError.
            return None, None, None

        # perform prediction on data_test with predict() silenced
        quiet_value = self.quiet
        self.quiet = True
        try:
            pred_labels, acc = self.predict(data_test)
        finally:
            # restore the flag even if prediction fails
            self.quiet = quiet_value

        self.__save_prediction_to_file(data_test, pred_labels)
        f1score, precision, recall = report_fscore(self.prediction_path +
                                                   "/predictions.txt",
                                                   wikiner=self.wikiner)
        print("Accuracy: ", acc)
        # create some files useful for debugging
        if self.debug:
            self.__debug(data_test, pred_labels)
        return f1score, precision, recall
Пример #14
0
    def train_regression(self, x, y):
        """Train ``self._model`` on samples ``x`` and targets ``y``.

        Every sample is a mapping from feature name to value; the names are
        replaced by the ids from ``self._features.setId``.  Sets
        ``self._regression`` to True and trains with options "-c 1 -s 0".
        """
        encoded = [
            {self._features.setId(name): sample[name] for name in sample}
            for sample in x
        ]

        self._regression = True
        options = liblinear.parameter("-c 1 -s 0")
        self._model = liblinear.train(liblinear.problem(y, encoded), options)
Пример #15
0
    def train_regression(self, x, y):
        """Train ``self._model`` on samples ``x`` and targets ``y``.

        Every sample is a mapping from feature name to value; the names are
        replaced by the ids from ``self._features.setId``.  Sets
        ``self._regression`` to True and trains with options '-c 1 -s 0'.
        """
        encoded = [
            {self._features.setId(name): sample[name] for name in sample}
            for sample in x
        ]

        self._regression = True
        options = liblinear.parameter('-c 1 -s 0')
        self._model = liblinear.train(liblinear.problem(y, encoded), options)
Пример #16
0
def train_log_regr_liblinear(features, responses):
    """Train liblinear on ``features``/``responses`` and return the model.

    Options used:
      -s 0  L2-regularized logistic regression (primal)
      -B 1  fit a bias term
      -q    quiet mode
    """
    echo('Training with Liblinear')
    training_problem = liblinearutil.problem(responses, features)
    options = liblinearutil.parameter('-s 0 -B 1 -q')
    return liblinearutil.train(training_problem, options)
Пример #17
0
def train_SVR_liblinear(features, responses):
    """Train liblinear on ``features``/``responses`` and return the model.

    Options used:
      -s 11  L2-regularized L2-loss support vector regression (primal)
      -B 1   fit a bias term
      -q     quiet mode
    """
    echo('Training with Liblinear')
    training_problem = liblinearutil.problem(responses, features)
    options = liblinearutil.parameter('-s 11 -B 1 -q')
    return liblinearutil.train(training_problem, options)
Пример #18
0
def parallel_train_predict(args):
    """Train on one sub-problem, predict the test set and return the values.

    ``args`` is the tuple ``(x_train, y_train, x_test, y_test)``.  A model is
    trained with options '-s 0 -c 1', then ``liblinearutil.predict`` is run
    on the test data with '-b 0'.  The elapsed time of both steps is
    printed.  Returns the third element of the predict result (``p_val``).
    """
    # time.clock() was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    print("A process begins.")
    x_train, y_train, x_test, y_test = args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')

    time_start = timer()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f." % (timer() - time_start))

    time_start = timer()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test, model, '-b 0')
    print("A process predicting finished in %f." % (timer() - time_start))
    return p_val
Пример #19
0
def train(train_data, features, c):
    """Train a liblinear model with options '-q -c <c>' and return it.

    For every key of ``train_data`` the label is
    ``train_data[key]['class']`` and the sample is ``features[key]``.
    """
    keys = list(train_data)
    targets = [train_data[key]['class'] for key in keys]
    samples = [features[key] for key in keys]

    prob = liblinearutil.problem(targets, samples)
    options = '-q -c ' + str(c)
    return liblinearutil.train(prob, liblinearutil.parameter(options))
Пример #20
0
def eval_SVM(X, y, Xhat, yhat):
    """Train on (X, y), predict (Xhat, yhat) and return the accuracy.

    Training uses options '-s 3 -c 10 -q -B 1'; prediction runs with '-q'.
    The accuracy is the first value returned by
    ``liblinearutil.evaluations(yhat, predicted_labels)``.
    """
    training_set = liblinearutil.problem(y, X)
    options = liblinearutil.parameter('-s 3 -c 10 -q -B 1')
    classifier = liblinearutil.train(training_set, options)

    # Only the predicted labels are needed for the evaluation below.
    predicted, _stats, _values = liblinearutil.predict(
        yhat, Xhat, classifier, '-q')

    accuracy, _mse, _scc = liblinearutil.evaluations(yhat, predicted)
    return accuracy
Пример #21
0
 def best_C(self,x,y):
     """
     Search for the best C on labels y (list) and samples x (dicts).

     The option string is "-s 2 -C -B <bias> -p <p> -e <eps>", filled from
     self.bias, self.p and self.eps; it is printed before the search.
     Returns the (best_C, best_rate) pair produced by lu.train.

     As before, self.model is set to the result of the search; with -C
     that result is the (best_C, best_rate) pair, not a model.
     """
     prob = lu.problem(y, x)
     para = "-s 2 -C -B %f -p %f -e %f" % (self.bias,
                                           self.p,
                                           self.eps)
     print(para)
     # Run the search once.  It used to be executed twice with identical
     # inputs: once for self.model and once for the return value.
     result = lu.train(prob, lu.parameter(para))
     self.model = result
     best_C, best_rate = result
     return best_C, best_rate
Пример #22
0
def main():
    if __name__ == "__main__":
        y, x = svm_read_problem(feature_file, return_scipy=True)
        # train:test = 7:3
        train_X = x[:14000]
        train_y = y[:14000]
        test_X = x[14000:]
        test_y = y[14000:]

        prob = problem(train_y, train_X)
        param = parameter("-c 1 -s 2")
        model = train(prob, param)
        p_labs, p_acc, p_vals = predict(test_y, test_X, model)
        accuracy, precision, recall = metrics_result(test_y, p_labs)
        print
        print "accuracy: ", accuracy
        print "precision: ", precision
        print "recall: ", recall
Пример #23
0
 def train(self,x,y):
     """
     Train on labels y (list) and samples x (dicts); always returns True.

     The option string is built from self.L, self.c, self.bias, self.p and
     self.eps; "-v <v>" is added when self.v is non-zero and "-q" when
     self.q is non-zero.  The string is printed and the result of lu.train
     is stored in self.model.
     """
     prob = lu.problem(y, x)
     values = (self.L, self.c, self.bias, self.p, self.eps)
     options = "-s %d -c %f -B %f -p %f -e %f" % values
     if self.v != 0:
         options = options + " -v %d" % self.v
     if self.q != 0:
         options = options + " -q"
     print(options)
     self.model = lu.train(prob, lu.parameter(options))
     return True
Пример #24
0
def _collect_rows(rows):
    """Split dataset rows into feature dicts, class labels and professor ids.

    For every row the 'class' value is appended to the labels as an int and
    the 'prof_id' value to the ids.  Every other value for which isNan() is
    false is stored as a float under the integer form of its key.
    """
    features, classes, prof_ids = [], [], []
    for row in rows:
        x_dict = dict()
        for key in row:
            if key == 'class':
                classes.append(int(row[key]))
            elif key == 'prof_id':
                prof_ids.append(row[key])
            elif isNan(row[key]):
                continue
            else:
                x_dict[int(key)] = float(row[key])
        features.append(x_dict)
    return features, classes, prof_ids


def _read_word_vectors(path):
    """Return the (male_vector, female_vector) pair stored as JSON in path."""
    from json import loads

    with open(path, 'r') as f:
        male_vector, female_vector = loads(f.read())
    return male_vector, female_vector


def _add_word_counts(profs, prof_ids, features, words, progress):
    """Add per-professor counts of ``words`` to ``features`` in place.

    ``profs`` is the collection queried for each professor's comments and
    ``progress`` wraps the index range (classify passes tqdm).  Punctuation
    is stripped and tokens are lower-cased before counting.  Word number j
    (0-based) is stored under feature id 54 + j, and only when its count is
    non-zero.
    """
    from string import punctuation
    from bson.objectid import ObjectId

    exclude = set(punctuation)
    for i in progress(range(len(prof_ids))):
        counts = dict.fromkeys(words, 0)

        prof_comments = profs.find_one(
            {'_id': ObjectId(prof_ids[i])}, {
                '_id': 0,
                'all comments.rComments': 1
            })
        for comment in prof_comments['all comments']:
            text = comment['rComments']
            no_punc_text = ''.join(c for c in text if c not in exclude)
            for tok in no_punc_text.split():
                tok = tok.lower()
                if tok in counts:
                    counts[tok] += 1

        # Word features start right after variable_lookup['53'].
        for feature_id, word in enumerate(words, 54):
            if counts[word] != 0:
                features[i][feature_id] = counts[word]


def classify(ds_cur=None):
    """Interactively build train/test sets, train liblinear and test it.

    The user is prompted for the fields to exclude and for whether
    word-count features should be added.  The training rows come from
    ``ds_cur`` or, when it is None, from the 'dataset_profs_ten_over'
    collection; the test rows always come from
    'dataset_profs_five_over_less_ten'.  A 10-fold cross validation
    ('-s 6 -v 10') is run through the Python binding and through the
    liblinear command line tool, then a model is trained with '-s 6 -q' and
    applied to the test set.

    Returns ``(model, p_acc, contingency_mat)`` where ``contingency_mat`` is
    a 2x2 list indexed as [actual class][predicted class].
    """
    from os import chdir, system
    # NOTE(review): presumably the chdir makes the bundled liblinearutil
    # importable from the working directory -- confirm against sys.path.
    chdir('./liblinear-2.1/python/')
    from liblinearutil import problem, parameter, train, predict
    chdir('../../')
    from pdb import set_trace
    from tqdm import tqdm
    from pymongo import MongoClient
    from json import dumps

    # NOTE(review): unconditional debugger breakpoint kept from the
    # original; remove it once interactive debugging is no longer needed.
    set_trace()

    dont_include = {'_id': 0}
    print('List of variables:\n')
    for key in variable_lookup:
        print(key[1])
    ch1 = raw_input(
        'Input "s" to select custom fields (default selection - all fields):')
    if ch1 == 's':
        print('Please input 0 for fields you would like to exclude, any other input would include it.')
        for key in variable_lookup:
            if key[0] == 'class':
                continue
            ch2 = raw_input(key[1] + ':')
            if ch2 == '0':
                dont_include[key[0]] = 0

    rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
    if ds_cur is None:
        ds_cur = rmpdb['dataset_profs_ten_over'].find(
            filter={}, projection=dont_include)
    # The test cursor is created unconditionally: it used to be defined only
    # when ds_cur was None, so passing ds_cur ended in a NameError.
    ds_cur2 = rmpdb['dataset_profs_five_over_less_ten'].find(
        filter={}, projection=dont_include)

    print('Building training set according to selection..')
    X, Y, ids = _collect_rows(tqdm(ds_cur))
    X2, Y2, ids2 = _collect_rows(tqdm(ds_cur2))

    ch = raw_input(
        'Include top words for males and females as features? (y/n) [n]: ')
    if ch == 'y':
        from glob import glob

        vec_files = glob('../logs/*.vec')
        if vec_files:
            print('Word vector files found in ../logs: \n')
            print(vec_files)
            fch = raw_input(
                'Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/'
            )
            if fch == '0':
                male_vector, female_vector = build_vector()
            else:
                try:
                    male_vector, female_vector = _read_word_vectors(
                        '../logs/' + fch + '.vec')
                except (IOError, OSError, ValueError, TypeError):
                    # Unreadable or malformed file: use the default one.
                    male_vector, female_vector = _read_word_vectors(
                        '../logs/trial0.vec')
        else:
            male_vector, female_vector = build_vector()

        print('Male vectors as (word, count)')
        print(male_vector)
        print("=============================================")
        print('Female vectors as (word, count)')
        print(female_vector)
        print("=============================================")
        print('Calculating word features for all professors in dataset. This shall take some time.')
        print('Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..')

        male_words = [tup[0] for tup in male_vector]
        female_words = [tup[0] for tup in female_vector]

        union_words = list(set(male_words).union(set(female_words)))
        final_words = list()
        print('Select words you want to remove by entering "x".')
        for word in union_words:
            if raw_input(word + ':') != 'x':
                final_words.append(word)

        _add_word_counts(rmpdb['profs'], ids, X, final_words, tqdm)

        print("Building test set..")
        _add_word_counts(rmpdb['profs'], ids2, X2, final_words, tqdm)

        print('Words used:')
        print(final_words)

    print('Writing temp files for AUC calculation..')
    build_svm_file(X, Y)
    print('Temp file written..')
    print('Features used:')
    fstr = [
        key[1] for key in variable_lookup
        if not (key[0] in dont_include or key[0] == 'class')
    ]
    print(dumps(fstr))
    print('======================================\n')
    prob = problem(Y, X)
    # With -v the call only reports the cross-validation result.
    train(prob, parameter('-s 6 -v 10'))
    print('Evaluating..\n')
    system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
    model = train(prob, parameter('-s 6 -q'))
    #system('rm liblinear-2.1/temp_ds')

    print('Testing model on test set..')
    p_Y2, p_acc, p_vals = predict(Y2, X2, model)

    contingency_mat = [[0, 0], [0, 0]]
    for i in range(len(Y2)):
        if (Y2[i] == 0) and (p_Y2[i] == 0):
            contingency_mat[0][0] += 1
        elif (Y2[i] == 0) and (p_Y2[i] == 1):
            contingency_mat[0][1] += 1
        elif (Y2[i] == 1) and (p_Y2[i] == 0):
            contingency_mat[1][0] += 1
        else:
            # Every remaining combination is counted as actual 1 / predicted 1.
            contingency_mat[1][1] += 1

    return (model, p_acc, contingency_mat)
def _collect_rows(rows):
	"""Split dataset rows into feature dicts, class labels and professor ids.

	For every row the 'class' value is appended to the labels as an int and
	the 'prof_id' value to the ids.  Every other value for which isNan() is
	false is stored as a float under the integer form of its key.
	"""
	features, classes, prof_ids = [], [], []
	for row in rows:
		x_dict = dict()
		for key in row:
			if key == 'class':
				classes.append(int(row[key]))
			elif key == 'prof_id':
				prof_ids.append(row[key])
			elif isNan(row[key]):
				continue
			else:
				x_dict[int(key)] = float(row[key])
		features.append(x_dict)
	return features, classes, prof_ids


def _read_word_vectors(path):
	"""Return the (male_vector, female_vector) pair stored as JSON in path."""
	from json import loads

	with open(path, 'r') as f:
		male_vector, female_vector = loads(f.read())
	return male_vector, female_vector


def _add_word_counts(profs, prof_ids, features, words, progress):
	"""Add per-professor counts of ``words`` to ``features`` in place.

	``profs`` is the collection queried for each professor's comments and
	``progress`` wraps the index range (classify passes tqdm).  Punctuation
	is stripped and tokens are lower-cased before counting.  Word number j
	(0-based) is stored under feature id 54 + j, and only when its count is
	non-zero.
	"""
	from string import punctuation
	from bson.objectid import ObjectId

	exclude = set(punctuation)
	for i in progress(range(len(prof_ids))):
		counts = dict.fromkeys(words, 0)

		prof_comments = profs.find_one({'_id' : ObjectId(prof_ids[i])}, {'_id' : 0, 'all comments.rComments' : 1})
		for comment in prof_comments['all comments']:
			text = comment['rComments']
			no_punc_text = ''.join(c for c in text if c not in exclude)
			for tok in no_punc_text.split():
				tok = tok.lower()
				if tok in counts:
					counts[tok] += 1

		# Word features start right after variable_lookup['53'].
		for feature_id, word in enumerate(words, 54):
			if counts[word] != 0:
				features[i][feature_id] = counts[word]


def classify(ds_cur = None):
	"""Interactively build train/test sets, train liblinear and test it.

	The user is prompted for the fields to exclude and for whether
	word-count features should be added.  The training rows come from
	``ds_cur`` or, when it is None, from the 'dataset_profs_ten_over'
	collection; the test rows always come from
	'dataset_profs_five_over_less_ten'.  A 10-fold cross validation
	('-s 6 -v 10') is run through the Python binding and through the
	liblinear command line tool, then a model is trained with '-s 6 -q' and
	applied to the test set.

	Returns ``(model, p_acc, contingency_mat)`` where ``contingency_mat`` is
	a 2x2 list indexed as [actual class][predicted class].
	"""
	from os import chdir, system
	# NOTE(review): presumably the chdir makes the bundled liblinearutil
	# importable from the working directory -- confirm against sys.path.
	chdir('./liblinear-2.1/python/')
	from liblinearutil import problem, parameter, train, predict
	chdir('../../')
	from pdb import set_trace
	from tqdm import tqdm
	from pymongo import MongoClient
	from json import dumps

	# NOTE(review): unconditional debugger breakpoint kept from the
	# original; remove it once interactive debugging is no longer needed.
	set_trace()

	dont_include = {'_id' : 0}
	print('List of variables:\n')
	for key in variable_lookup:
		print(key[1])
	ch1 = raw_input('Input "s" to select custom fields (default selection - all fields):')
	if ch1 == 's':
		print('Please input 0 for fields you would like to exclude, any other input would include it.')
		for key in variable_lookup:
			if key[0] == 'class':
				continue
			ch2 = raw_input(key[1] + ':')
			if ch2 == '0':
				dont_include[key[0]] = 0

	rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
	if ds_cur is None:
		ds_cur = rmpdb['dataset_profs_ten_over'].find(filter = {}, projection = dont_include)
	# The test cursor is created unconditionally: it used to be defined only
	# when ds_cur was None, so passing ds_cur ended in a NameError.
	ds_cur2 = rmpdb['dataset_profs_five_over_less_ten'].find(filter = {}, projection = dont_include)

	print('Building training set according to selection..')
	X, Y, ids = _collect_rows(tqdm(ds_cur))
	X2, Y2, ids2 = _collect_rows(tqdm(ds_cur2))

	ch = raw_input('Include top words for males and females as features? (y/n) [n]: ')
	if ch == 'y':
		from glob import glob

		vec_files = glob('../logs/*.vec')
		if vec_files:
			print('Word vector files found in ../logs: \n')
			print(vec_files)
			fch = raw_input('Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/')
			if fch == '0':
				male_vector, female_vector = build_vector()
			else:
				try:
					male_vector, female_vector = _read_word_vectors('../logs/' + fch + '.vec')
				except (IOError, OSError, ValueError, TypeError):
					# Unreadable or malformed file: use the default one.
					male_vector, female_vector = _read_word_vectors('../logs/trial0.vec')
		else:
			male_vector, female_vector = build_vector()

		print('Male vectors as (word, count)')
		print(male_vector)
		print("=============================================")
		print('Female vectors as (word, count)')
		print(female_vector)
		print("=============================================")
		print('Calculating word features for all professors in dataset. This shall take some time.')
		print('Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..')

		male_words = [tup[0] for tup in male_vector]
		female_words = [tup[0] for tup in female_vector]

		union_words = list(set(male_words).union(set(female_words)))
		final_words = list()
		print('Select words you want to remove by entering "x".')
		for word in union_words:
			if raw_input(word + ':') != 'x':
				final_words.append(word)

		_add_word_counts(rmpdb['profs'], ids, X, final_words, tqdm)

		print("Building test set..")
		_add_word_counts(rmpdb['profs'], ids2, X2, final_words, tqdm)

		print('Words used:')
		print(final_words)

	print('Writing temp files for AUC calculation..')
	build_svm_file(X, Y)
	print('Temp file written..')
	print('Features used:')
	fstr = [
		key[1] for key in variable_lookup
		if not (key[0] in dont_include or key[0] == 'class')
	]
	print(dumps(fstr))
	print('======================================\n')
	prob = problem(Y, X)
	# With -v the call only reports the cross-validation result.
	train(prob, parameter('-s 6 -v 10'))
	print('Evaluating..\n')
	system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
	model = train(prob, parameter('-s 6 -q'))
	#system('rm liblinear-2.1/temp_ds')

	print('Testing model on test set..')
	p_Y2, p_acc, p_vals = predict(Y2, X2, model)

	contingency_mat = [[0, 0], [0, 0]]
	for i in range(len(Y2)):
		if (Y2[i] == 0) and (p_Y2[i] == 0):
			contingency_mat[0][0] += 1
		elif (Y2[i] == 0) and (p_Y2[i] == 1):
			contingency_mat[0][1] += 1
		elif (Y2[i] == 1) and (p_Y2[i] == 0):
			contingency_mat[1][0] += 1
		else:
			# Every remaining combination is counted as actual 1 / predicted 1.
			contingency_mat[1][1] += 1

	return (model, p_acc, contingency_mat)
def simpleLibLinear(X_train, Y_train):
    """Train on (X_train, Y_train) with '-c <c>' and return the model.

    ``c`` is read from the enclosing module scope.
    """
    training_problem = ll.problem(Y_train, X_train)
    options = ll.parameter('-c ' + str(c))
    return ll.train(training_problem, options)
Пример #27
0
    return problems
def parallel_train_predict(args):
    """Train on one sub-problem, predict the test set and return the values.

    ``args`` is the tuple ``(x_train, y_train, x_test, y_test)``.  A model is
    trained with options '-s 0 -c 1', then ``liblinearutil.predict`` is run
    on the test data with '-b 0'.  The elapsed time of both steps is
    printed.  Returns the third element of the predict result (``p_val``).
    """
    # time.clock() was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    print("A process begins.")
    x_train, y_train, x_test, y_test = args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')

    time_start = timer()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f." % (timer() - time_start))

    time_start = timer()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test, model, '-b 0')
    print("A process predicting finished in %f." % (timer() - time_start))
    return p_val
if parallel_computing==0:
    problems = divide_problem(20000,20000,data_only=1)
    parameter = liblinearutil.parameter('-s 0 -c 1')
    models=[]
    time_start = time.clock()
    for i in range(len(problems)):
        t=[]
        for j in range(len(problems[i])):
            t.append(train(problems[i][j][1],problems[i][j][0]))
        models.append(t)
    print("Exercise 3 with MLP training finished in %f."%(time.clock()-time_start))
    time_start = time.clock()
    p_val=[-1e100]*len(data.x_test)
    for i in range(len(problems)):
        t=[1e100]*len(data.x_test)
        for j in range(len(problems[i])):
            p_val_ij=predict(data.x_test, data.y_test,models[i][j])
            for k in range(len(t)):
Пример #28
0
 def train(self):
     """Train ``self.model`` from ``self.labels`` and ``self.contexts``.

     ``self.parameters`` is the option string given to ``parameter``;
     progress messages are written to stderr.
     """
     sys.stderr.write('creating training problem...')
     training_problem = problem(self.labels, self.contexts)
     progress = 'done\ntraining with option(s) "' + self.parameters + '"...'
     sys.stderr.write(progress)
     options = parameter(self.parameters)
     self.model = train(training_problem, options)
     sys.stderr.write('done\n')
def simpleLibLinear(X_train,Y_train):
	"""Train on (X_train, Y_train) with '-c <c>' and return the model.

	``c`` is read from the enclosing module scope.
	"""
	training_problem = ll.problem(Y_train, X_train)
	options = ll.parameter('-c ' + str(c))
	return ll.train(training_problem, options)