示例#1
0
def sampling():
    # EasyEnsemble sampling: reuse the FULL positive set in every output
    # file and pair it with successive, disjoint chunks of negatives at a
    # fixed 1:10 positive/negative ratio, writing one shuffled training
    # file per chunk under training_set/.
    cutoffLine('*')
    print 'Sampling using EasyEnsemble method'
    start_time = time.time()

    TRAIN_SET = 'training_set'
    if not os.path.exists(TRAIN_SET): os.mkdir(TRAIN_SET)
    propotion = 10  # (sic) negatives per positive in each generated file
    negative_size = POSITIVE * propotion
    r_file = file(PRE_DIR + '/negative_set.csv', 'r')
    reader = csv.reader(r_file)

    positive_set = readCSV(PRE_DIR + '/positive_set.csv', int)
    negative_set = []
    set_count = 0
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        line = map(int, line)
        # Defensive label check: negative_set.csv is expected to hold only
        # label-0 rows, but stray label-1 rows are folded into positives.
        if line[-1] == 1: positive_set.append(line)
        if line[-1] == 0: negative_set.append(line)
        # Flush one training file when enough negatives are collected, or
        # on the last input row (possibly a smaller final chunk).
        if len(negative_set) == negative_size or reader.line_num == NEGATIVE:
            set_count += 1
            training_set = positive_set + negative_set
            random.shuffle(training_set)
            file_name =  TRAIN_SET + '/' + '%d.csv'%set_count
            writeCSV(training_set, file_name)
            negative_set = []

    r_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to sampling' % duration
示例#2
0
def drop_no_buy_user():
    cutoffLine('-')
    rfile = file('data/nuser.csv', 'r')
    reader = csv.reader(rfile)
    buyed_user = set()
    print 'user behavior stat'
    for line in reader:
        doneCount(reader.line_num)
        if int(line[2]) == 4: buyed_user.add(int(line[0]))
    rfile.close()
    print '\ndrop...'
    rfile = file('data/nuser.csv', 'r')
    wfile = file('data/nuser_cleaned', 'w')
    reader = csv.reader(rfile)
    writer = csv.writer(wfile)

    count = 0
    for line in reader:
        doneCount(reader.line_num)
        if int(line[0]) in buyed_user:
            writer.writerow(line)
            count += 1
    cutoffLine('-')
    print count
    rfile.close()
    wfile.close()
示例#3
0
def global_feature():
    # Build (or load from a pickle cache) ci_sale: per-category, per-item
    # daily purchase counts, later used to rank an item inside its own
    # category.  Counts are bucketed by day so future days are not mixed in.
    cutoffLine('-')
    print 'Generate global feature'
    # Count daily sales of every item grouped by category, so per-category
    # rank features can be computed without using future information.
    global ci_sale
    if os.path.exists('data/ci_sale.pkl'):
        # Cached from a previous run: ci_sale[category][item] is a list of
        # length TOTAL_DAY+1 indexed by day number.
        ci_sale_file = open('data/ci_sale.pkl', 'rb')
        ci_sale = pickle.load(ci_sale_file)
        # for c in ci_rank: print ci_rank[c]
        ci_sale_file.close()
    else:
        u_file = file('data/nuser.csv', 'r')
        u_reader = csv.reader(u_file)
        ci_sale = {}
        for line in u_reader:
            doneCount(u_reader.line_num)
            item = int(line[1])
            behavior = int(line[2])
            category = int(line[4])
            date = int(line[5])
            if not ci_sale.has_key(category): ci_sale[category] = {}
            if behavior == 4:  # behavior code 4 == purchase
                if not ci_sale[category].has_key(item): ci_sale[category][item] = [0]*(TOTAL_DAY+1)
                ci_sale[category][item][date] += 1

        # Persist for later runs.
        ci_sale_file = open('data/ci_sale.pkl', 'wb')
        pickle.dump(ci_sale, ci_sale_file)
        ci_sale_file.close()
        u_file.close()
示例#4
0
def drop_no_buy_user():
    # Filter data/nuser.csv down to rows whose user has made at least one
    # purchase (behavior code 4); survivors go to data/nuser_cleaned.
    cutoffLine('-')
    rfile = file('data/nuser.csv','r')
    reader = csv.reader(rfile)
    buyed_user = set()
    print 'user behavior stat'
    # Pass 1: collect the ids of every user who ever bought something.
    for line in reader:
        doneCount(reader.line_num)
        if int(line[2]) == 4: buyed_user.add(int(line[0]))
    rfile.close()
    print '\ndrop...'
    # Pass 2: re-read the file and keep only buyers' rows.
    rfile = file('data/nuser.csv','r')
    wfile = file('data/nuser_cleaned','w')  # NOTE(review): no .csv suffix — confirm intentional
    reader = csv.reader(rfile)
    writer = csv.writer(wfile)

    count = 0
    for line in reader:
        doneCount(reader.line_num)
        if int(line[0]) in buyed_user:
            writer.writerow(line)
            count += 1
    cutoffLine('-')
    print count
    rfile.close()
    wfile.close()
示例#5
0
def LR(X, y):
    cutoffLine('-')
    print 'Training...'
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    c_set = [0.01, 0.5, 0.1] + map(lambda x: x / 100.0, range(50, 1001, 50))
    # c_set = [1]
    min_error = 100000
    best_model = 1
    best_c = -1
    for c in c_set:
        LR_model = LogisticRegression(C=c,
                                      penalty='l1',
                                      tol=0.001,
                                      max_iter=20000)
        LR_model.fit(X, y)
        y_pred = LR_model.predict(X_test)
        error = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        if error < min_error:
            min_error = error
            best_model = LR_model
            best_c = c
    print "best C is %f, error is %f" % (best_c, min_error)
    print 'coefs below:'
    print best_model.coef_[0]
    return best_model
示例#6
0
def SVM(X, y):
    cutoffLine('-')
    print 'Training...'
    X = preprocessing.scale(X)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
    SVM_model = SVC()
    SVM_model.fit(X, y)
    return SVM_model
示例#7
0
def predict(window, model, item_subset, proportion, algo, confidence):
    cutoffLine('-')
    print 'Generate result set with confidence %f' % confidence
    feature_file = file('splited_data_%d/set_for_prediction.csv'%window, 'r')
    result_file = file('data/tianchi_mobile_recommendation_predict_%d_%s_%d_%s.csv'%\
                                        (window, algo, proportion, str(confidence)), 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id','item_id'])
    predict_set = set()
    UI = []
    X = []
    each_time = 500000
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        UI.append(tuple(line[0:2]))
        X.append(line[3:])
        if f_reader.line_num % each_time == 0:
            if algo == 'lr' or algo == 'svm': X = preprocessing.scale(X)
            if algo == 'lr' or algo == 'rf':
                y_pred = model.predict_proba(X)
                print y_pred
                for index, y in enumerate(y_pred):
                    if y[1] > confidence: predict_set.add(UI[index])
            if algo == 'svm':
                y_pred = model.predict(X)
                for index, y in enumerate(y_pred):
                    if y == 1: predict_set.add(UI[index])
            UI = []
            X = []
    if len(UI) > 0:
        if algo == 'lr' or algo == 'svm': X = preprocessing.scale(X)
        if algo == 'lr' or algo == 'rf':
            y_pred = model.predict_proba(X)
            for index, y in enumerate(y_pred):
                if y[1] > confidence: predict_set.add(UI[index])
        if algo == 'svm':
            y_pred = model.predict(X)
            for index, y in enumerate(y_pred):
                if y == 1: predict_set.add(UI[index])
        UI = []
        X = []

    cutoffLine('-')
    print "Prediction set size before drop: %d" % len(predict_set)
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    r_writer.writerows(predict_set)
    print "Prediction set size after drop: %d" % len(predict_set)

    feature_file.close()
    result_file.close()

    return len(predict_set)
示例#8
0
def predict(model, index):
    cutoffLine('-')
    print 'Generate result set %d' % index
    feature_file = file('splited_data/set_for_prediction.csv', 'r')
    result_file = file(TRAIN_SET_DIR + '/' + 'lr_result_%d.csv' % index, 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id','item_id'])
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        if model.predict([line[2:]])[0] == 1: r_writer.writerow(line[0:2])

    feature_file.close()
    result_file.close()
示例#9
0
def splitData():
    # Slide a WINDOW-day span over the raw data set.  For each start day i
    # the rows whose date falls in [i, i+WINDOW) become one training file
    # and the purchases (behavior 4) of day i+WINDOW become the matching
    # "result" (label) file.  The window whose label day equals TOTAL_DAY
    # is named test.csv; the final window, which has no label day at all,
    # becomes for_prediction.csv.  Row counts per file go to stat.csv.
    cutoffLine('*')
    print 'Start split data with window %d' % WINDOW
    start_time = time.time()

    stat_file = file(PRE_DIR + '/stat.csv','w')
    stat_writer = csv.writer(stat_file)
    for i in range(1,FILES+1):
        cutoffLine('-')
        print 'Split dataset %d/%d: ' % (i, FILES)
        # The raw data set is re-read from the start for every window.
        rfile = file(DATA_SET,'r')
        reader = csv.reader(rfile)
        j = i + WINDOW  # label day for this window
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                # Second-to-last window: reserved as the offline test split.
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv'%i
                result_file_name = '%s_%d.csv'%('result',i)
            train_file = file(PRE_DIR + '/' + train_file_name,'w')
            result_file = file(PRE_DIR + '/' + result_file_name,'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                # line[5] is the day number, line[2] the behavior code.
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0],line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            # Last window: no label day exists, so only feature rows are kept.
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name,'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num,DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()

    stat_file.close()
    end_time = time.time()
    duration = timekeeper(start_time,end_time)
    cutoffLine('-')
    print 'It takes ' + duration + ' to split dataset.'
    cutoffLine('*')
示例#10
0
def stat():
    # Summarize the raw user and item files into data/stat.txt: row, user,
    # item and category counts, how many subset items were ever interacted
    # with, and how many rows carry geo information.
    cutoffLine('-')
    print 'stat some information...'
    user_file = file('data/nuser.csv','r')
    item_file = file('data/item.csv','r')
    stat_file = open('data/stat.txt','w')


    row_count = 0
    user_set = set()
    sub_item_set = set()  # items listed in item.csv (the target subset)
    all_item_set = set()  # items seen in any user-behavior row
    category_set = set()
    user_geo_count = 0
    item_geo_count = 0

    # Pass over item.csv, skipping its header row.
    reader = csv.reader(item_file)
    for line in reader:
        doneCount(reader.line_num)
        if reader.line_num == 1: continue
        if line[1]: item_geo_count += 1  # non-empty geo column
        category_set.add(line[2])
        sub_item_set.add(line[0])

    # Pass over the user-behavior file.
    reader = csv.reader(user_file)
    for line in reader:
        doneCount(reader.line_num)
        row_count += 1
        user_set.add(line[0])
        all_item_set.add(line[1])
        if line[3]: user_geo_count += 1

    # Subset items that appear at least once in the behavior log.
    interact_item_set = all_item_set & sub_item_set

    stat_file.write('%s : %s\n'%(u'Total Count',row_count))
    stat_file.write('%s : %s\n'%(u'User Count',len(user_set)))
    stat_file.write('%s : %s\n'%(u'All Item Count',len(all_item_set)))
    stat_file.write('%s : %s\n'%(u'Sub Item Count',len(sub_item_set)))
    stat_file.write('%s : %s %f\n'%(u'Interact Item Count',
                                    len(interact_item_set),
                                    float(len(interact_item_set))/len(sub_item_set)))
    stat_file.write('%s : %s\n'%(u'Category Count',len(category_set)))
    stat_file.write('%s : %s\n'%(u'User Geo Count',user_geo_count))
    stat_file.write('%s : %s\n'%(u'Item Geo Count',item_geo_count))

    stat_file.close()
    user_file.close()
    item_file.close()
示例#11
0
def splitData():
    # Slide a WINDOW-day span over the raw data set.  For each start day i
    # the rows dated in [i, i+WINDOW) become one training file and the
    # purchases (behavior 4) of day i+WINDOW its label ("result") file.
    # The window whose label day equals TOTAL_DAY becomes test.csv; the
    # final window (no label day) becomes for_prediction.csv.  Row counts
    # per generated file are recorded in stat.csv.
    cutoffLine('*')
    print 'Start split data with window %d' % WINDOW
    start_time = time.time()

    stat_file = file(PRE_DIR + '/stat.csv', 'w')
    stat_writer = csv.writer(stat_file)
    for i in range(1, FILES + 1):
        cutoffLine('-')
        print 'Split dataset %d/%d: ' % (i, FILES)
        # Re-read the raw data set from the start for every window.
        rfile = file(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + WINDOW  # label day for this window
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                # Second-to-last window: the offline test split.
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv' % i
                result_file_name = '%s_%d.csv' % ('result', i)
            train_file = file(PRE_DIR + '/' + train_file_name, 'w')
            result_file = file(PRE_DIR + '/' + result_file_name, 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                # line[5] is the day number, line[2] the behavior code.
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            # Last window: no label day exists; only feature rows are kept.
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name, 'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()

    stat_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('-')
    print 'It takes ' + duration + ' to split dataset.'
    cutoffLine('*')
示例#12
0
def evaluate_model(model, index):
    cutoffLine('-')
    print 'offline evaluate RF model %d' % index
    test_file = file('splited_data/set_test.csv', 'r')
    test_reader = csv.reader(test_file)
    predict_set = set()
    real_set = set()
    for line in test_reader:
        doneCount(test_file.line_num)
        line = map(int, line)
        if line[-1] == 1 : real_set.add((line[0],line[1]))
        if model.predict([line[2:-1]])[0] == 1: predict_set.add((line[0],line[1]))
    import evaluate
    P, R, F = evaluate.evaluate(predict_set, real_set)
    test_file.close()
    return P, R, F
示例#13
0
def stat():
    # Summarize the raw user and item files into data/stat.txt: row, user,
    # item and category counts, the share of subset items actually
    # interacted with, and how many rows carry geo information.
    cutoffLine('-')
    print 'stat some information...'
    user_file = file('data/nuser.csv', 'r')
    item_file = file('data/item.csv', 'r')
    stat_file = open('data/stat.txt', 'w')

    row_count = 0
    user_set = set()
    sub_item_set = set()  # items listed in item.csv (the target subset)
    all_item_set = set()  # items seen in any user-behavior row
    category_set = set()
    user_geo_count = 0
    item_geo_count = 0

    # Pass over item.csv, skipping its header row.
    reader = csv.reader(item_file)
    for line in reader:
        doneCount(reader.line_num)
        if reader.line_num == 1: continue
        if line[1]: item_geo_count += 1  # non-empty geo column
        category_set.add(line[2])
        sub_item_set.add(line[0])

    # Pass over the user-behavior file.
    reader = csv.reader(user_file)
    for line in reader:
        doneCount(reader.line_num)
        row_count += 1
        user_set.add(line[0])
        all_item_set.add(line[1])
        if line[3]: user_geo_count += 1

    # Subset items that appear at least once in the behavior log.
    interact_item_set = all_item_set & sub_item_set

    stat_file.write('%s : %s\n' % (u'Total Count', row_count))
    stat_file.write('%s : %s\n' % (u'User Count', len(user_set)))
    stat_file.write('%s : %s\n' % (u'All Item Count', len(all_item_set)))
    stat_file.write('%s : %s\n' % (u'Sub Item Count', len(sub_item_set)))
    stat_file.write('%s : %s %f\n' %
                    (u'Interact Item Count', len(interact_item_set),
                     float(len(interact_item_set)) / len(sub_item_set)))
    stat_file.write('%s : %s\n' % (u'Category Count', len(category_set)))
    stat_file.write('%s : %s\n' % (u'User Geo Count', user_geo_count))
    stat_file.write('%s : %s\n' % (u'Item Geo Count', item_geo_count))

    stat_file.close()
    user_file.close()
    item_file.close()
示例#14
0
def evaluate_model(algo, window, model, item_subset, confidence):
    # Offline evaluation of one trained model.  Streams the labeled test
    # split for this window in 500000-row batches (to bound memory),
    # collects the predicted-positive and truly-bought (user_id, item_id)
    # pairs, restricts both to item_subset and returns the P/R/F1 triple.
    cutoffLine('-')
    print 'offline evaluate model with confidence %f' % confidence
    test_file = file('splited_data_%d/set_test.csv'%window, 'r')
    test_reader = csv.reader(test_file)
    predict_set = set()
    real_set = set()
    UI = []  # (user_id, item_id) keys for the current batch
    X = []   # feature rows for the current batch (label column dropped)
    each_time = 500000
    for line in test_reader:
        doneCount(test_reader.line_num)
        line = map(int, line)
        UI.append(tuple(line[0:2]))
        X.append(line[3:-1])
        if line[-1] == 1 : real_set.add((line[0],line[1]))
        if test_reader.line_num % each_time == 0:
            # lr/svm were trained on scaled features; rf was not.
            if algo == 'lr' or algo == 'svm': X = preprocessing.scale(X)
            if algo == 'lr' or algo == 'rf':
                y_pred = model.predict_proba(X)
                for index, y in enumerate(y_pred):
                    # y[1] is the positive-class probability.
                    if y[1] > confidence: predict_set.add(UI[index])
            if algo == 'svm':
                y_pred = model.predict(X)
                for index, y in enumerate(y_pred):
                    if y == 1: predict_set.add(UI[index])
            UI = []
            X = []
    if len(UI) > 0:
        # Final partial batch, scored with the same logic as above.
        if algo == 'lr' or algo == 'svm': X = preprocessing.scale(X)
        if algo == 'lr' or algo == 'rf':
            y_pred = model.predict_proba(X)
            for index, y in enumerate(y_pred):
                if y[1] > confidence: predict_set.add(UI[index])
        if algo == 'svm':
            y_pred = model.predict(X)
            for index, y in enumerate(y_pred):
                if y == 1: predict_set.add(UI[index])
        UI = []
        X = []

    # Only pairs whose item belongs to the competition subset count.
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    real_set = dropItemsNotInSet(real_set, item_subset)
    import evaluate
    P, R, F = evaluate.evaluate(predict_set, real_set)
    test_file.close()
    return P, R, F
示例#15
0
文件: split.py 项目: boke168/tianchi
def splitData():
    # Window split with the window length hard-coded to 10 days.  For each
    # start day i, rows dated in [i, i+10) form a training file and the
    # purchases of day i+10 its label file; the window whose label day is
    # TOTAL_DAY becomes test.csv and the last window (no label day)
    # becomes for_prediction.csv.  Counts go to splited_data/stat.csv.
    stat_file = file('splited_data/stat.csv','w')
    stat_writer = csv.writer(stat_file)
    for i in range(1,FILES+1):
        cutoffLine('-')
        print 'Split dataset %d: ' % i
        # Re-read the raw data set from the start for every window.
        rfile = file(DATA_SET,'r')
        reader = csv.reader(rfile)
        j = i + 10  # label day for this window (fixed 10-day window)
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv'%i
                result_file_name = '%s_%d.csv'%('result',i)
            train_file = file(PRE_DIR + '/' + train_file_name,'w')
            result_file = file(PRE_DIR + '/' + result_file_name,'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                # line[5] is the day number, line[2] the behavior code.
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0],line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            # Last window: no label day; only feature rows are written.
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name,'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num,DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()
示例#16
0
def evaluate(prediction, result):
    cutoffLine('-')
    print 'Prediction set size: %d' % len(prediction)
    print 'Result set size: %d' % len(result)
    prediction = set(prediction)
    result = set(result)

    intersection = prediction & result

    precision = float(len(intersection)) / len(prediction) * 100
    recall = float(len(intersection)) / len(result) * 100

    F1 = 2 * precision * recall / (precision + recall)

    print 'P : %2f' % precision
    print 'R : %2f' % recall
    print 'F1: %2f' % F1
    return precision, recall, F1
示例#17
0
def evaluate(prediction,result):
    cutoffLine('-')
    print 'Prediction set size: %d' % len(prediction)
    print 'Result set size: %d' % len(result)
    prediction = set(prediction)
    result = set(result)

    intersection = prediction & result

    precision = float(len(intersection))/len(prediction)*100
    recall = float(len(intersection))/len(result)*100

    F1 = 2 * precision * recall / (precision + recall)

    print 'P : %2f' % precision
    print 'R : %2f' % recall
    print 'F1: %2f' % F1
    return precision, recall, F1
示例#18
0
def merge_training_set():
    # Concatenate the per-window feature files set_1..set_{FILES-2} into a
    # single train_set.csv, while also splitting the rows by label into
    # positive_set.csv (label 1) and negative_set.csv (label 0).
    cutoffLine('*')
    print 'Merge training set'
    start_time = time.time()

    positive_count = 0
    negative_count = 0
    total_count = 0

    total_file = file(PRE_DIR + '/' + 'train_set.csv', 'w')
    pos_file = file(PRE_DIR + '/' + 'positive_set.csv', 'w')
    neg_file = file(PRE_DIR + '/' + 'negative_set.csv', 'w')
    total_writer = csv.writer(total_file)
    pos_writer = csv.writer(pos_file)
    neg_writer = csv.writer(neg_file)

    # The last two windows (test and for_prediction) are excluded.
    for i in range(1, FILES-1):
        cutoffLine('-')
        print 'load train set %d' % i

        r_file  = file(PRE_DIR + '/' + 'set_%d.csv' % i)
        reader = csv.reader(r_file)
        for line in reader:
            doneCount(reader.line_num)
            line = map(int, line)
            if line[-1] == 1:
                positive_count += 1
                pos_writer.writerow(line)
            if line[-1] == 0:
                negative_count += 1
                neg_writer.writerow(line)
            total_count += 1
            total_writer.writerow(line)
        r_file.close()

    total_file.close()
    pos_file.close()
    neg_file.close()

    cutoffLine('-')
    # 44114  (count observed on a past run)
    print 'Positive Example: %d' % positive_count
    # 59373295
    print 'Negative Example: %d' % (total_count - positive_count)
    # 59417409
    print 'Total Example: %d' % total_count
    # Consistency check: every row must be either positive or negative.
    print 'Is right? %s'%('Yes' if positive_count + negative_count == total_count else 'No')

    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to merge training set and backup negative and positive set' % duration
示例#19
0
def generate_training_set():
    start_time = time.time()
    ## load the information of data set

    line_count = {}
    rfile = file(PRE_DIR + '/stat.csv','r')
    reader = csv.reader(rfile)
    for line in reader:
        line_count[line[0]] = int(line[1])
    rfile.close()

    cutoffLine('*')
    print 'Generate training set'

    for i in range(1,FILES + 1):
        cutoffLine('-')
        if i == FILES:
            file_name = 'for_prediction.csv'
            print 'Extract feature from %s'%file_name
            extract_feature(file_name, line_count[file_name], i)
        elif i == FILES - 1:
            file_name = 'test.csv'
            print 'Extract feature from %s'%file_name
            result_name = 'result_%s'%file_name
            extract_feature(file_name, line_count[file_name], i, result_name)
        else:
            file_name = '%d.csv' % i
            print 'Extract feature from %s and tag it'%file_name
            result_name = 'result_%d.csv' % i
            extract_feature(file_name, line_count[file_name], i, result_name)
    end_time = time.time()

    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to generate training set' % duration
示例#20
0
def merge_training_set():
    # Concatenate the per-window feature files set_1..set_{FILES-2} into a
    # single train_set.csv, while also splitting the rows by label into
    # positive_set.csv (label 1) and negative_set.csv (label 0).
    cutoffLine('*')
    print 'Merge training set'
    start_time = time.time()

    positive_count = 0
    negative_count = 0
    total_count = 0

    total_file = file(PRE_DIR + '/' + 'train_set.csv', 'w')
    pos_file = file(PRE_DIR + '/' + 'positive_set.csv', 'w')
    neg_file = file(PRE_DIR + '/' + 'negative_set.csv', 'w')
    total_writer = csv.writer(total_file)
    pos_writer = csv.writer(pos_file)
    neg_writer = csv.writer(neg_file)

    # The last two windows (test and for_prediction) are excluded.
    for i in range(1, FILES - 1):
        cutoffLine('-')
        print 'load train set %d' % i

        r_file = file(PRE_DIR + '/' + 'set_%d.csv' % i)
        reader = csv.reader(r_file)
        for line in reader:
            doneCount(reader.line_num)
            line = map(int, line)
            if line[-1] == 1:
                positive_count += 1
                pos_writer.writerow(line)
            if line[-1] == 0:
                negative_count += 1
                neg_writer.writerow(line)
            total_count += 1
            total_writer.writerow(line)
        r_file.close()

    total_file.close()
    pos_file.close()
    neg_file.close()

    cutoffLine('-')
    print 'Positive Example: %d' % positive_count
    print 'Negative Example: %d' % (total_count - positive_count)
    print 'Total Example: %d' % total_count
    # Consistency check: every row must be either positive or negative.
    print 'Is right? %s' % ('Yes' if positive_count +
                            negative_count == total_count else 'No')

    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to merge training set and backup negative and positive set' % duration
示例#21
0
def train_RF():
    # Train one random-forest model per EasyEnsemble training file,
    # evaluate it offline and write its prediction file.
    # NOTE(review): the loop is currently hard-wired to a single model
    # (range(1, 1 + 1)); the commented line restores the full sweep.
    start_time = time.time()
    cutoffLine('*')
    print 'Use RF model to train %d models'%TRAIN_SET_FILES
    for i in range(1, 1 + 1):
    #for i in range(1, TRAIN_SET_FILES + 1):
        cutoffLine('-')
        print 'model %d'%i
        t_file = file(TRAIN_SET_DIR + '/%d.csv'%i, 'r')
        t_reader = csv.reader(t_file)
        X = []
        y = []
        for line in t_reader:
            line = map(int, line)
            X.append(line[2:-1])  # drop user_id/item_id and the label column
            y.append(line[-1])
        model = RF(X, y)
        P ,R ,F = evaluate_model(model, i)  # NOTE(review): P/R/F unused here
        predict(model, i)
        # NOTE(review): `models` is not defined in this function — it is
        # presumably a module-level list; confirm before reuse.
        models.append(model)
        t_file.close()

    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'I takes %s to train , evaluate model and generate result'% duration
示例#22
0
def predict(model, item_subset):
    cutoffLine('-')
    print 'Generate result set'
    feature_file = file('splited_data/set_for_prediction.csv', 'r')
    result_file = file('data/prediction_lr.csv', 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id','item_id'])
    predict_set = set()
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        if model.predict([line[2:]])[0] == 1: predict_set.add((line[0], line[1]))

    cutoffLine('-')
    print "Prediction set size before drop: %d" % len(predict_set)
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    r_writer.writerows(predict_set)
    print "Prediction set size after drop: %d" % len(predict_set)

    feature_file.close()
    result_file.close()
示例#23
0
def LR(X, y):
    cutoffLine('-')
    print 'Training...'
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
    c_set = [0.01, 0.5, 0.1] + map(lambda x: x/100.0, range(50,1001,50))
    # c_set = [1]
    min_error = 100000
    best_model = 1
    best_c = -1
    for c in c_set:
        LR_model = LogisticRegression(C=c, penalty = 'l1', tol = 0.001, max_iter = 20000)
        LR_model.fit(X, y)
        y_pred = LR_model.predict(X_test)
        error = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        if error < min_error:
            min_error = error
            best_model = LR_model
            best_c = c
    print "best C is %f, error is %f" % (best_c, min_error)
    print 'coefs below:'
    print best_model.coef_[0]
    return best_model
示例#24
0
def train(window, proportion, algo, confidence):
    # Train (or load a cached) model for one (algo, window, proportion)
    # combination, evaluate it offline, generate its submission file and
    # append the P/R/F1 numbers to data/model_evaluate_record.txt.
    start_time = time.time()
    cutoffLine('*')
    print '%s model training with sample proportion 1:%d...' %(algo, proportion)
    t_file = file('data/training_set_%d_%d.csv' % (window, proportion), 'r')
    t_reader = csv.reader(t_file)
    X = []
    y = []
    for line in t_reader:
        doneCount(t_reader.line_num)
        line = map(int, line)
        X.append(line[3:-1])  # drop leading id columns and the trailing label
        y.append(line[-1])

    # Reuse a previously trained model when one is cached on disk.
    model_name = 'data/model/%s_%d_%d.model'%(algo, window, proportion)
    if os.path.exists(model_name): model = joblib.load(model_name)
    else:
        if algo == 'lr': model = LR(X, y)
        if algo == 'rf': model = RF(X, y)
        if algo == 'svm': model = SVM(X, y)
        joblib.dump(model, model_name)
    cutoffLine('-')
    print model.classes_
    item_subset = loadItemSubset()

    # Evaluate offline, emit the submission file and log the metrics.
    record_file = open('data/model_evaluate_record.txt','a')
    P, R, F= evaluate_model(algo, window, model, item_subset, confidence)
    predict_set_size = predict(window, model, item_subset, proportion, algo, confidence)
    record_file.write('window %d '%window + algo+' %d'%proportion + ' %.2f\n'%confidence)
    record_file.write('\tP: %f\n'%P)
    record_file.write('\tR: %f\n'%R)
    record_file.write('\tF1: %f\n'%F)
    record_file.write('Predict Set Size: %d\n'%predict_set_size)
    record_file.write('-'*30+'\n')
    record_file.close()



    t_file.close()
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'I takes %s to train , evaluate model and generate result' % duration
示例#25
0
def generate_training_set(window):
    # Run feature extraction for every split of the given window length.
    # Split i == FILES is the unlabeled prediction set, i == FILES - 1 is
    # the held-out test set; all earlier splits are extracted together
    # with their label files.  Mutates module-level PRE_DIR / FILES.
    start_time = time.time()
    global PRE_DIR, FILES
    PRE_DIR = 'splited_data_%d' % window
    FILES = TOTAL_DAY - window + 1

    ## load the information of data set
    line_count = {}
    rfile = file(PRE_DIR + '/stat.csv', 'r')
    reader = csv.reader(rfile)
    for line in reader:
        line_count[line[0]] = int(line[1])
    rfile.close()

    cutoffLine('*')
    print 'Generate training set with window %d' % window

    for i in range(1, FILES + 1):
        cutoffLine('-')
        if i == FILES:
            # Last split: features only, no label file exists.
            file_name = 'for_prediction.csv'
            print 'Extract feature from %s' % file_name
            extract_feature(window, i + window, file_name,
                            line_count[file_name], i)
        elif i == FILES - 1:
            file_name = 'test.csv'
            print 'Extract feature from %s' % file_name
            result_name = 'result_%s' % file_name
            extract_feature(window, i + window, file_name,
                            line_count[file_name], i, result_name)
        else:
            file_name = '%d.csv' % i
            print 'Extract feature from %s and tag it' % file_name
            result_name = 'result_%d.csv' % i
            extract_feature(window, i + window, file_name,
                            line_count[file_name], i, result_name)
    end_time = time.time()

    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to generate training set' % duration
示例#26
0
def train_LR():
    start_time = time.time()
    cutoffLine('*')
    print 'LR model training...'
    cutoffLine('-')
    t_file = file('data/training_set_10.csv', 'r')
    t_reader = csv.reader(t_file)
    X = []
    y = []
    for line in t_reader:
        line = map(int, line)
        X.append(line[2:-1])
        y.append(line[-1])
    model = logRes(X,y)
    item_subset = loadItemSubset()
    evaluate_model(model, item_subset)
    predict(model, item_subset)
    t_file.close()

    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'I takes %s to train , evaluate model and generate result' % duration
示例#27
0
def sampling(proportion):
    cutoffLine('*')
    start_time = time.time()
    print 'sampling with propotion %d...'%proportion
    negative_needed = POSITIVE * proportion
    sample_times = 10
    mod = NEGATIVE / sample_times
    negative_eachtime = negative_needed / sample_times

    training_set = readCSV(PRE_DIR + '/positive_set.csv', int)

    ## sampling negative example
    rfile = file(PRE_DIR + '/' + 'negative_set.csv', 'r')
    reader = csv.reader(rfile)
    negative_tmp = []
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        negative_tmp.append(map(int, line))
        if reader.line_num % mod == 0:
            random.shuffle(negative_tmp)
            training_set = training_set + negative_tmp[0:negative_eachtime]
            negative_tmp = []
    rfile.close()

    wfile = file('data/training_set_%d.csv'%proportion, 'w')
    writer = csv.writer(wfile)
    random.shuffle(training_set)
    writer.writerows(training_set)
    wfile.close()

    cutoffLine('-')
    print "Real proportion: %f" %((len(training_set)-POSITIVE) / float(POSITIVE))
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to sampling with proportion %d'%(duration, proportion)
示例#28
0
def sampling(window, proportion):
    cutoffLine('*')
    start_time = time.time()
    print 'sampling with propotion %d...' % proportion
    exec('negative_needed = POSITIVE_%d * propotion' % window)
    sample_times = 20
    exec('mod = NEGATIVE_%d / sample_times' % window)
    exec('negative_eachtime = negative_needed / sample_times')
    training_set = readCSV(PRE_DIR + '/positive_set.csv', int)

    ## sampling negative example
    rfile = file(PRE_DIR + '/' + 'negative_set.csv', 'r')
    reader = csv.reader(rfile)
    negative_tmp = []
    for line in reader:
        exec('progressBar(reader.line_num, NEGATIVE_%d)' % window)
        negative_tmp.append(map(int, line))
        if reader.line_num % mod == 0:
            random.shuffle(negative_tmp)
            training_set.extend(negative_tmp[0:negative_eachtime])
            negative_tmp = []
    rfile.close()

    wfile = file('data/training_set_%d_%d.csv' % (window, propotion), 'w')
    writer = csv.writer(wfile)
    random.shuffle(training_set)
    writer.writerows(training_set)
    wfile.close()

    cutoffLine('-')
    exec('real_proportion = (len(training_set)- POSITIVE_%d) / float(POSITIVE_%d)'%(window, window))
    print "Real proportion: %f" % real_proportion
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to sampling with proportion %d'%(duration, proportion)
示例#29
0
文件: split.py 项目: boke168/tianchi
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0],line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name,'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num,DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()

if __name__ == '__main__':
    print 'Start split data'
    cutoffLine('*')
    start_time = time.time()
    splitData()
    end_time = time.time()
    duration = timekeeper(start_time,end_time)
    cutoffLine('*')
    print 'It takes ' + duration + ' to split dataset.'
示例#30
0
def RF(X, y):
    cutoffLine('-')
    print 'Training...'
    model = RandomForestClassifier(n_estimators = 100)
    model.fit(X, y)
    return model