예제 #1
0
파일: bagging.py 프로젝트: ysperetz/thesis
def main(filename):
    """Evaluate a BaggingSVM on TF-IDF features over repeated random splits.

    Loads labels and comments from *filename* via ``util.get_comments_data``,
    vectorizes the comments with unigram+bigram TF-IDF, then runs five
    train/test splits (30% held out, seeded with the iteration index).
    For each split a ``BaggingSVM`` built with nine models is fitted and
    scored on the held-out part. The list of scores and their mean are
    printed at the end; nothing is returned.
    """
    # load the data; the middle element of the returned triple is not used
    labels, _, comments = util.get_comments_data(filename)
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        strip_accents=None,
        charset_error='ignore',
        stop_words=None,
        min_df=2,
    )
    labels = numpy.array(labels)
    instances = tfidf.fit_transform(comments)
    random.seed(0)

    # repeated hold-out evaluation
    n_models = 9
    n_rounds = 5
    scores = []
    for round_no in xrange(n_rounds):
        print("Iteration #" + str(round_no) + "...")

        # a fresh split per round; the round index doubles as the split seed
        x_training, x_testing, y_training, y_testing = \
            cross_validation.train_test_split(
                instances, labels, test_size=0.3, random_state=round_no)

        # train the ensemble on the training part
        model = BaggingSVM(n_models)
        model.fit(x_training, y_training)

        # record the score on the held-out part
        scores.append(model.score(x_testing, y_testing))

    print("Scores => " + str(scores))
    print("Mean   => " + str(numpy.mean(scores)))
예제 #2
0
def main(infile, outfile):
    """Convert the comments in *infile* to sparse TF-IDF text lines in *outfile*.

    Labels and comments are read with ``get_comments_data``; the comments are
    vectorized with unigram+bigram TF-IDF. One line is written per comment:
    the label as a float, followed by ``column:value`` pairs (value with three
    decimals) for every stored entry of that comment's row, in ascending
    column order. Progress (the row number) is printed every 100 rows.

    :param infile: path handed to ``get_comments_data``.
    :param outfile: path of the text file to create or overwrite.
    """
    print("Reading from %s..." % infile)
    # the second element of the returned triple (timestamps) is not needed
    labels, _, comments = get_comments_data(infile)
    print("Parsing...")
    vec = TfidfVectorizer(ngram_range=(1, 2),
                          strip_accents=None,
                          charset_error="ignore",
                          stop_words=None,
                          min_df=2)
    vec.fit(comments)
    print("Processing and writing to %s..." % outfile)
    # transform before opening the output so a failure here does not leave
    # behind a truncated, empty file
    comments = vec.transform(comments)
    rows = comments.get_shape()[0]
    # the context manager closes the file even if a row fails to serialize
    with open(outfile, 'w') as f:
        for row in xrange(0, rows):
            if row % 100 == 0:
                print(row)
            # column indices of the entries stored for this row, ascending
            indices = sorted(comments[row].indices.tolist())
            buf = [str(float(labels[row]))]
            for col in indices:
                buf.append("%d:%.3f" % (col, comments[row, col]))
            # joining with the newline as the last element keeps the original
            # output format, including the space before the line break
            buf.append("\n")
            f.write(" ".join(buf))
    print("Done!")
예제 #3
0
파일: convert.py 프로젝트: RITUZZ/thesis
def main(infile, outfile):
    """Write the comments from *infile* as sparse TF-IDF text lines to *outfile*.

    ``get_comments_data`` supplies the labels and comments; a unigram+bigram
    TF-IDF vectorizer is fitted on the comments and used to transform them.
    Each output line holds the label as a float and then one
    ``column:value`` pair (value with three decimals) per stored entry of the
    corresponding row, ordered by ascending column index. The current row
    number is printed every 100 rows as a progress indicator.

    :param infile: path handed to ``get_comments_data``.
    :param outfile: path of the text file to create or overwrite.
    """
    print("Reading from %s..." % infile)
    # timestamps (second element of the triple) are not used by the conversion
    labels, _, comments = get_comments_data(infile)
    print("Parsing...")
    vec = TfidfVectorizer(
        ngram_range=(1, 2),
        strip_accents=None,
        charset_error="ignore",
        stop_words=None,
        min_df=2
    )
    vec.fit(comments)
    print("Processing and writing to %s..." % outfile)
    # vectorize first: if this raises, the output file is never truncated
    comments = vec.transform(comments)
    rows = comments.get_shape()[0]
    # 'with' guarantees the handle is closed even when writing a row fails
    with open(outfile, 'w') as f:
        for row in xrange(0, rows):
            if row % 100 == 0:
                print(row)
            # stored column indices for this row, in ascending order
            indices = sorted(comments[row].indices.tolist())
            buf = [str(float(labels[row]))]
            for col in indices:
                buf.append("%d:%.3f" % (col, comments[row, col]))
            # the newline is joined like any other field, so the line ends
            # with a space followed by the line break (format kept as-is)
            buf.append("\n")
            f.write(" ".join(buf))
    print("Done!")