Пример #1
0
def Estimate(filename, old_files=None, n=10, learner='lr'):
    """Predict the held-out project's positive count ``n`` times.

    Builds a per-project table from ``read.body`` with the number of rows
    ("total") and the number of rows labelled ``'yes'`` ("pos"), fits a
    classifier mapping total -> pos on every project except the one named
    after ``filename`` (without its last extension), and predicts the
    held-out project's value from its total.

    Args:
        filename: Data file handed to ``MAR.create``; its stem is also the
            ``projectname`` key of the held-out project.
        old_files: Passed through to ``MAR.create``. ``None`` means an
            empty list.
        n: Number of repetitions; repetition ``i`` seeds NumPy and the
            classifier with ``i``.
        learner: One of ``'lr'``, ``'dt'``, ``'svm_linear'``, ``'nbm'``.

    Returns:
        A list with one prediction per repetition.

    Raises:
        ValueError: If ``learner`` is not one of the supported names.
    """
    if learner not in ('lr', 'dt', 'svm_linear', 'nbm'):
        # Previously an unknown name surfaced later as an unbound `clf`.
        raise ValueError("unknown learner: %r" % (learner,))
    old_files = [] if old_files is None else old_files

    read = MAR()
    read = read.create(filename, old_files)

    # The per-project counts do not depend on the seed, so build them once.
    a = read.body[['projectname', 'label']]
    b = a.loc[a['label'] == 'yes']
    total_df = a.groupby(['projectname']).count()
    yes_df = b.groupby(['projectname']).count()
    df = pd.DataFrame()
    df[['total']] = total_df[['label']]
    df[['pos']] = yes_df[['label']]

    test_file = filename.rsplit('.', 1)[0]
    test_series = df.loc[test_file]
    train_df = df.drop([test_file])
    x_train = np.reshape(list(train_df.total.values), (-1, 1))
    y_train = list(train_df.pos.values)
    # predict() expects a 2-D (n_samples, n_features) array, not a scalar.
    x_test = np.reshape(test_series['total'], (1, 1))

    result = []
    for i in range(n):
        np.random.seed(i)
        if learner == 'lr':
            clf = LogisticRegression(random_state=i)
        elif learner == 'dt':
            clf = DecisionTreeClassifier(random_state=i)
        elif learner == 'svm_linear':
            clf = svm.SVC(kernel='linear', random_state=i)
        else:  # 'nbm'
            clf = MultinomialNB(alpha=1)

        clf.fit(x_train, y_train)
        result.append(clf.predict(x_test)[0])
    print(test_file, result)
    return result
Пример #2
0
def START_AUTO(filename):
    """Code documents until the positive count stalls, then return the MAR.

    While no positive has been found, batches come from ``read.random()``;
    afterwards they come from the third value returned by ``read.train()``.
    Every id is coded with the value stored in ``read.body["label"]``.
    Once at least 10 positives exist, three consecutive rounds without a
    change in the positive count end the loop.
    """
    max_patience = 3
    reader = MAR().create(filename)
    patience = max_patience
    previous_pos = 0
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= 10:
            if pos != previous_pos:
                patience = max_patience
            else:
                patience -= 1
                if patience == 0:
                    break
        if pos == 0:
            batch = reader.random()
        else:
            _, _, batch, _ = reader.train()
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
        previous_pos = pos
    return reader
Пример #3
0
def REUSE_RANDOM(filename, old):
    """Code batches from ``train_reuse_random`` until 90% of positives are found.

    Creates a MAR for ``filename``, attaches ``old`` through ``create_old``
    and keeps coding the ids in the third value returned by
    ``train_reuse_random()`` with the value in ``read.body["label"]``.
    Returns the MAR once ``pos`` reaches ``int(get_allpos() * 0.9)``.
    """
    stop = 0.9

    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, _neg, _total = reader.get_numbers()
        if pos >= target:
            return reader
        _, _, batch, _ = reader.train_reuse_random()
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #4
0
def UPDATE(filename, old, pne=False, cl="RF"):
    """Code batches from ``train`` until every positive has been found.

    Creates a MAR for ``filename``, attaches ``old`` through ``create_old``
    and repeatedly codes the ids in the third value returned by
    ``train(pne=pne, cl=cl)`` with the value in ``read.body["label"]``.
    Progress ("pos/ coded") is printed each round. Returns the MAR once
    ``pos`` reaches ``int(get_allpos() * 1)``.
    """
    stop = 1

    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        _, _, batch, _ = reader.train(pne=pne, cl=cl)
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #5
0
def LINEAR(filename):
    """Code ``read.random()`` batches until fewer than 10 items remain.

    "Remaining" is ``total - (pos + neg)`` as reported by
    ``get_numbers()``. Each id is coded with the value stored in
    ``read.body["label"]``. Returns the MAR instance.
    """
    reader = MAR().create(filename)
    while True:
        pos, neg, total = reader.get_numbers()
        remaining = total - (pos + neg)
        if remaining < 10:
            return reader
        for doc_id in reader.random():
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #6
0
def LOC(filename):
    """Code ``read.loc_sort()`` batches until every positive has been found.

    Prints "pos, coded" each round, codes each id with the value stored in
    ``read.body["label"]`` and returns the MAR once ``pos`` reaches
    ``int(get_allpos() * 1)``.
    """
    stop = 1

    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        for doc_id in reader.loc_sort():
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #7
0
def UPDATE_AUTO(filename, old, pne=True):
    """Code ``train`` batches until the positive count stalls for 5 rounds.

    Creates a MAR for ``filename``, attaches ``old`` through ``create_old``
    and each round codes the ids in the third value returned by
    ``train(pne)`` with the value in ``read.body["label"]``. The loop ends
    after five consecutive rounds in which ``pos`` did not change.
    Returns the MAR instance.
    """
    max_patience = 5

    reader = MAR().create(filename)
    reader.create_old(old)
    patience = max_patience
    previous_pos = -1  # never equals a real count, so round one resets patience
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos != previous_pos:
            patience = max_patience
        else:
            patience -= 1
            if patience == 0:
                break
        _, _, batch, _ = reader.train(pne)
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
        previous_pos = pos
    return reader
Пример #8
0
from __future__ import print_function, division

import os
import sys

# Build the path "<cwd up to the first 'src'>src/src/util" and add it to the
# import search path so that `from mar import MAR` below can be resolved.
# NOTE(review): this depends on the process being started from a directory
# whose path contains "src" — confirm how the app is launched.
root = os.getcwd().split("src")[0] + "src/src/util"
sys.path.append(root)

from flask import Flask, url_for, render_template, request, jsonify, Response, json
from pdb import set_trace
from mar import MAR

# Flask application object; static files are exposed under the /static URL path.
app = Flask(__name__, static_url_path='/static')

# Module-level state shared by the request handlers.
# NOTE: `global` at module scope has no effect; these names are already
# module globals. The handlers that rebind them need their own `global`.
global target
target = MAR()
global clf
clf = []


@app.route('/hello/')
def hello():
    """Render and return the ``hello.html`` template for ``/hello/``."""
    page = render_template('hello.html')
    return page


@app.route('/load', methods=['POST'])
def load():
    """Handle ``POST /load``: rebuild the shared MAR from the posted file name.

    Reads the ``file`` form field, replaces the module-level ``target``
    with ``target.create(file)`` and fetches its (pos, neg, total) counts.

    NOTE(review): the function appears truncated in this view. Nothing is
    returned after the counts are read, which a Flask view needs — confirm
    against the full source.
    """
    global target
    file = request.form['file']
    target = target.create(file)
    pos, neg, total = target.get_numbers()
Пример #9
0
def START(filename, cl="linear"):
    """Code documents until all bugs reported by ``get_allbugs`` are found.

    While nothing has been found, or the cost is below 40, batches come
    from ``read.loc_sort()``; afterwards they come from the first value
    returned by ``read.train(cl=cl)``. Each id is coded with the value
    stored in ``read.body["label"]``. When the target is reached the
    result is plotted with ``read.plot()``.

    Args:
        filename: Data file handed to ``MAR.create``.
        cl: Classifier name forwarded to ``read.train``.

    Returns:
        The MAR instance.
    """
    stop = 1
    thres = 40

    read = MAR()
    read = read.create(filename)
    target = int(read.get_allbugs() * stop)
    while True:
        found, cost, total = read.get_numbers()
        try:
            print("%d, %d" % (found, cost))
        except Exception:
            # Progress output is best-effort, but Ctrl-C / SystemExit must
            # still propagate (a bare `except:` swallowed them).
            pass
        if found >= target:
            break

        if found == 0 or cost < thres:
            ids = read.loc_sort()
        else:
            ids, c = read.train(cl=cl)
        for doc_id in ids:
            read.code(doc_id, read.body["label"][doc_id])
    read.plot()
    # NOTE(review): leftover debugger breakpoint; it blocks non-interactive
    # runs. Kept because removing it changes runtime behaviour — confirm.
    set_trace()
    return read
Пример #10
0
def test_semi(filename="Hall.csv"):
    """Seed a MAR with 20 "yes" and 200 "no" samples, train, then break.

    Randomly picks (without replacement) 20 positions labelled "yes" and
    200 labelled "no" from ``read.body['label']``, passes each to
    ``read.code_error``, enables ``enable_est``, refreshes the counts and
    calls ``read.train()``. Finally drops into the debugger so the four
    training results ``a, b, c, d`` can be inspected.
    """
    num = 20
    read = MAR().create(filename)
    labels = np.array(read.body['label'])
    # Draw positives first, then negatives, to keep the RNG call order.
    pos_sel = np.random.choice(np.where(labels == "yes")[0], num, replace=False)
    neg_sel = np.random.choice(np.where(labels == "no")[0], num * 10, replace=False)

    for selection in (pos_sel, neg_sel):
        for doc_id in selection:
            read.code_error(doc_id)
    read.enable_est = True
    read.get_numbers()
    a, b, c, d = read.train()
    set_trace()
Пример #11
0
def test(filename):
    """Print the wall-clock time of ``train_para()`` and ``train()`` for 10 sizes.

    For ``i`` in 0..9 a fresh MAR is created with 5 partitions;
    ``10 * (i + 1)`` positions labelled "yes" and ten times as many
    labelled "no" are picked at random (without replacement) and passed
    to ``read.code_error``. The duration in seconds of ``train_para()``
    is printed first, then that of ``train()``.

    Args:
        filename: Data file handed to ``MAR.create``.
    """
    p = 5

    # `range` instead of `xrange`: identical on Python 2 for 10 items and
    # the only spelling that exists on Python 3.
    for i in range(10):
        num = 10 * (i + 1)
        read = MAR()
        read = read.create(filename, partitions=p)
        labels = np.array(read.body['label'])
        poses = np.where(labels == "yes")[0]
        negs = np.where(labels == "no")[0]
        pos_sel = np.random.choice(poses, num, replace=False)
        neg_sel = np.random.choice(negs, num * 10, replace=False)

        for doc_id in pos_sel:
            read.code_error(doc_id)
        for doc_id in neg_sel:
            read.code_error(doc_id)

        read.get_numbers()
        start = time.time()
        read.train_para()  # result unused; only the duration matters
        print(time.time() - start)

        read.get_numbers()
        start = time.time()
        read.train()
        print(time.time() - start)
Пример #12
0
def TEST_AL(filename,
            old_files=None,
            stop='est',
            stopat=1,
            error='none',
            interval=100000,
            starting=1,
            seed=0,
            step=10):
    """Run the coding loop with ``code_error`` until a stop rule fires.

    Until ``starting`` positives exist, batches come from
    ``read.random()``; afterwards they come from the third value returned
    by ``read.train(weighting=True, pne=True)``.

    Args:
        filename: Data file handed to ``MAR.create``.
        old_files: Passed through to ``MAR.create``. ``None`` means an
            empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else
            to False.
        stopat: Fraction (converted to float) applied to both
            ``get_allpos()`` and ``read.est_num`` in the stop rule.
        error: Forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Minimum positive count before training starts.
        seed: Seed for ``np.random``.
        step: Stored on ``read.step``.

    Returns:
        The MAR instance, once everything is coded or once
        ``pos >= target`` and ``read.est_num * stopat <= pos``.
    """
    stopat = float(stopat)
    thres = 0
    np.random.seed(seed)
    old_files = [] if old_files is None else old_files

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Fall back to the short form when est_num cannot be printed.
            # `Exception` (not a bare except) lets Ctrl-C through.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.random():
                read.code_error(doc_id, error=error)
        else:
            a, b, c, d = read.train(weighting=True, pne=True)
            if pos >= target and read.est_num * stopat <= pos:
                break
            for doc_id in c:
                read.code_error(doc_id, error=error)
    return read
Пример #13
0
def Codes(filename, code):
    """Run a coding strategy selected by the letters in ``code``.

    Letters read from ``code``:
        "P": require 5 positives (instead of 1) before training starts.
        "W" or "M": pass ``weighting=True`` to ``read.train``.
        "U": while fewer than 30 positives exist, code the first value
            returned by ``read.train``.
        "S": code the fifth value returned by ``read.train`` in slices of
            ``read.step`` until the target is reached.
        Neither "A" nor "M": set ``read.enough`` to 100000.

    Each id is coded with the value stored in ``read.body["label"]``.

    Args:
        filename: Data file handed to ``MAR.create``.
        code: String of strategy letters.

    Returns:
        The MAR instance once ``pos`` reaches ``int(get_allpos() * 0.95)``.
    """
    stop = 0.95
    thres = 0
    starting = 5 if "P" in code else 1

    weighting = "W" in code or "M" in code
    uncertain = "U" in code
    stopping = "S" in code

    # A first instance is created only to call restart(); the run itself
    # uses a second, fresh instance.
    read = MAR()
    read = read.create(filename)
    read.restart()
    read = MAR()
    read = read.create(filename)
    if not ("A" in code or "M" in code):
        read.enough = 100000
    target = int(read.get_allpos() * stop)
    while True:
        pos, neg, total = read.get_numbers()
        if pos >= target:
            break
        if pos < starting or pos + neg < thres:
            for doc_id in read.random():
                read.code(doc_id, read.body["label"][doc_id])
            continue

        a, b, c, d, e = read.train(weighting=weighting)
        if pos < 30 and uncertain:
            for doc_id in a:
                read.code(doc_id, read.body["label"][doc_id])
        elif stopping:
            now = 0
            while pos < target:
                batch = e[now:now + read.step]
                if len(batch) == 0:
                    # `e` is exhausted: without this guard nothing more
                    # would be coded and the loop would spin forever.
                    # Fall back to the outer loop and retrain.
                    break
                for doc_id in batch:
                    read.code(doc_id, read.body["label"][doc_id])
                pos, neg, total = read.get_numbers()
                now = now + read.step
        else:
            for doc_id in c:
                read.code(doc_id, read.body["label"][doc_id])
    return read
Пример #14
0
def _recode_knee_errors(read, error):
    """Pass labels that disagree with the knee split to ``code_error`` again.

    Coded items are ordered by ``read.body['time']``. Items before
    ``read.kneepoint * read.step`` that are coded "no", and items from
    that position on that are coded "yes", are passed to
    ``read.code_error``.
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Index through np.array so positional indices also work when
    # body['time'] is a plain list.
    seq = coded[np.argsort(np.array(read.body['time'])[coded])]
    cut = read.kneepoint * read.step
    part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
    part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
    for doc_id in part1 | part2:
        read.code_error(doc_id, error=error)


def active_learning(filename,
                    query='',
                    stop='true',
                    stopat=0.95,
                    error='none',
                    interval=100000,
                    seed=0):
    """Run the BM25-seeded coding loop with a selectable stop rule.

    Until one positive exists, batches come from ``read.BM25_get()``;
    afterwards ``read.train(weighting=True, pne=True)`` is called and its
    first value (while fewer than 10 positives exist) or third value is
    coded through ``read.code_error``.

    Args:
        filename: Data file handed to ``MAR.create``.
        query: Terms joined by ``'_'``; split and passed to ``read.BM25``.
        stop: Stop rule.
            ``'est'``: stop when ``stopat * read.est_num <= pos``.
            ``'soft'``: stop after 5 consecutive rounds with at least 10
            positives and no change in ``pos``.
            ``'knee'``: stop when ``read.knee()`` is true with at least
            10 positives.
            Anything else: stop when ``pos`` reaches
            ``int(get_allpos() * stopat)``.
        stopat: Fraction (converted to float) used by the stop rules.
        error: Forwarded to ``read.code_error``. With ``'random'`` and
            the knee rule, labels around the knee are re-coded before
            stopping.
        interval: Stored on ``read.interval``.
        seed: Seed for ``np.random``.

    Returns:
        The MAR instance.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)
    read = MAR()
    read = read.create(filename)
    read.interval = interval

    read.BM25(query.strip().split('_'))

    target = int(read.get_allpos() * stopat)
    print("number of target, true/close here:", target)

    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C propagates.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        a, b, c, d = read.train(weighting=True, pne=True)
        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_knee_errors(read, error)
                break
        elif pos >= target:
            break

        for doc_id in (a if pos < 10 else c):
            read.code_error(doc_id, error=error)
    # NOTE(review): leftover debugger breakpoint; it blocks non-interactive
    # runs. Kept because removing it changes runtime behaviour — confirm.
    set_trace()
    return read
Пример #15
0
def ERROR(filename):
    """Create a MAR for ``filename``, then call ``lda()`` and ``syn_error()`` on it."""
    reader = MAR().create(filename)
    reader.lda()
    reader.syn_error()
Пример #16
0
def REUSE(filename, old, pne=True):
    """Code batches until 90% of positives are found, switching trainers.

    Creates a MAR for ``filename`` and attaches ``old`` through
    ``create_old``. While fewer than 5 positives exist the batch is the
    third value returned by ``train(pne)``; from then on it comes from
    ``train_reuse(pne)``. Each id is coded with the value stored in
    ``read.body["label"]``. Progress ("pos/ coded") is printed each round.
    Returns the MAR instance.
    """
    stop = 0.9
    thres = 5

    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        trainer = reader.train if pos < thres else reader.train_reuse
        _, _, batch, _ = trainer(pne)
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #17
0
def Supervised(filename,
               old_files=None,
               stop='est',
               stopat=1,
               error='none',
               interval=100000,
               starting=1,
               seed=0,
               step=10):
    """Train once with ``train_supervised`` and code ``query_supervised`` batches.

    After training, the first ``read.step`` ids returned by
    ``read.query_supervised()`` are passed to ``read.code_error`` each
    round.

    Args:
        filename: Data file handed to ``MAR.create``.
        old_files: Passed through to ``MAR.create``. ``None`` means an
            empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else
            to False.
        stopat: Fraction (converted to float) applied to both
            ``get_allpos()`` and ``read.est_num`` in the stop rule.
        error: Forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Accepted for signature compatibility; not used here.
        seed: Seed for ``np.random``.
        step: Stored on ``read.step``; also the batch size.

    Returns:
        The MAR instance, once everything is coded or once
        ``pos >= target`` and ``read.est_num * stopat <= pos``.
    """
    stopat = float(stopat)
    np.random.seed(seed)
    old_files = [] if old_files is None else old_files

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    read.train_supervised()
    read.get_numbers()

    # Run one query before the loop, then store est_num as the first
    # entry of the 'est' record.
    read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C propagates.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos >= target and read.est_num * stopat <= pos:
            break
        for doc_id in read.query_supervised()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
Пример #18
0
def TIME_START(filename):
    """Code batches until 90% of positives are found, training via ``train_kept``.

    While no positive exists, batches come from ``read.random()``;
    afterwards they are the third value returned by ``read.train_kept()``.
    Each id is coded with the value stored in ``read.body["label"]``.
    Returns the MAR instance.
    """
    stop = 0.9

    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, _neg, _total = reader.get_numbers()
        if pos >= target:
            return reader
        if pos == 0:
            batch = reader.random()
        else:
            _, _, batch, _ = reader.train_kept()
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #19
0
def START_est(filename):
    """Code batches until 90% of positives are found, caching the estimate once.

    A first MAR is created only to call ``restart()``; the run uses a
    second, fresh instance. While no positive exists, or fewer than 40
    items are coded, batches come from ``read.random()``. Afterwards they
    are the third value returned by ``read.train(pne=True)``. The first
    time training runs with at least 60 positives, ``read.cache_est()``
    is called. Returns the MAR instance.
    """
    stop = 0.90
    thres = 40

    reader = MAR().create(filename)
    reader.restart()
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    est_pending = True
    while True:
        pos, neg, _total = reader.get_numbers()
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < thres:
            for doc_id in reader.random():
                reader.code(doc_id, reader.body["label"][doc_id])
            continue

        _, _, batch, _ = reader.train(pne=True)
        if est_pending and pos >= 60:
            reader.cache_est()
            est_pending = False
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #20
0
def export(file):
    """Create a MAR via ``create_lda(file)`` and call ``export_feature()`` on it."""
    reader = MAR().create_lda(file)
    reader.export_feature()
Пример #21
0
def START_LOC(filename, cl="SVM-linear"):
    """Code batches until every positive is found, starting from ``loc_sort``.

    While no positive exists, or fewer than 40 items are coded, batches
    come from ``read.loc_sort()``. Afterwards they are the third value
    returned by ``read.train(cl=cl)``. Each id is coded with the value
    stored in ``read.body["label"]``. Progress ("pos, coded") is printed
    each round. Returns the MAR instance.
    """
    stop = 1

    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < 40:
            batch = reader.loc_sort()
        else:
            _, _, batch, _ = reader.train(cl=cl)
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #22
0
def START_DOC2VEC(filename):
    """Code batches until 95% of positives are found, training with weighting.

    A first MAR is created only to call ``restart()``; the run uses a
    second, fresh instance. While no positive exists, or fewer than 40
    items are coded, batches come from ``read.random()``. Afterwards they
    are the third of the five values returned by
    ``read.train(weighting=True)``. Progress ("pos, coded") is printed
    each round. Returns the MAR instance.
    """
    stop = 0.95
    thres = 40

    reader = MAR().create(filename)
    reader.restart()
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _total = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < thres:
            batch = reader.random()
        else:
            _, _, batch, _, _ = reader.train(weighting=True)
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #23
0
def UPDATE_REUSE(filename, old):
    """Code ``train`` batches, switching to ``train_reuse`` once progress stalls.

    Creates a MAR for ``filename`` and attaches ``old`` through
    ``create_old``. A life counter is reset to ``lifes`` whenever ``pos``
    changed since the previous round and decremented otherwise. Once at
    least 5 positives exist and the counter is below 1, ``lifes`` is set
    to 0 and the batch comes from ``train_reuse()``; otherwise it comes
    from ``train()``. Returns the MAR once ``pos`` reaches
    ``int(get_allpos() * 0.9)``.
    """
    stop = 0.9
    thres = 5
    lifes = 2

    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)

    life = lifes
    last_pos = 0
    while True:
        pos, _neg, _total = reader.get_numbers()

        life = lifes if pos != last_pos else life - 1
        last_pos = pos

        if pos >= target:
            return reader
        if pos >= thres and life < 1:
            # After the first switch the reset value stays 0.
            lifes = 0
            _, _, batch, _ = reader.train_reuse()
        else:
            _, _, batch, _ = reader.train()
        for doc_id in batch:
            reader.code(doc_id, reader.body["label"][doc_id])
Пример #24
0
def _recode_knee_errors(read, error):
    """Pass labels that disagree with the knee split to ``code_error`` again.

    Coded items are ordered by ``read.body['time']``. Items before
    ``read.kneepoint * read.step`` that are coded "no", and items from
    that position on that are coded "yes", are passed to
    ``read.code_error``.
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Index through np.array so positional indices also work when
    # body['time'] is a plain list.
    seq = coded[np.argsort(np.array(read.body['time'])[coded])]
    cut = read.kneepoint * read.step
    part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
    part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
    for doc_id in part1 | part2:
        read.code_error(doc_id, error=error)


def active_learning(filename,
                    query='',
                    stop='true',
                    stopat=1.00,
                    error='none',
                    interval=100000,
                    seed=0):
    """Run the BM25-seeded coding loop with a selectable stop rule.

    Until one positive exists, batches come from ``read.BM25_get()``;
    afterwards ``read.train(weighting=True, pne=True)`` is called and its
    first value (while fewer than 10 positives exist) or third value is
    coded through ``read.code_error``.

    Args:
        filename: Data file handed to ``MAR.create``.
        query: Terms joined by ``'_'``; split and passed to ``read.BM25``.
        stop: Stop rule.
            ``'est'``: stop when ``stopat * read.est_num <= pos``.
            ``'soft'``: stop after 5 consecutive rounds with at least 10
            positives and no change in ``pos``.
            ``'knee'``: stop when ``read.knee()`` is true with at least
            10 positives.
            Anything else: stop when ``pos`` reaches
            ``int(get_allpos() * stopat)``.
        stopat: Fraction (converted to float) used by the stop rules.
        error: Forwarded to ``read.code_error``. With ``'random'`` and
            the knee rule, labels around the knee are re-coded before
            stopping.
        interval: Accepted but not used by this function.
        seed: Seed for ``np.random``.

    Returns:
        The MAR instance.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.BM25(query.strip().split('_'))

    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C propagates.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        a, b, c, d = read.train(weighting=True, pne=True)
        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_knee_errors(read, error)
                break
        elif pos >= target:
            break

        for doc_id in (a if pos < 10 else c):
            read.code_error(doc_id, error=error)
    return read
Пример #25
0
def Boosting(filename, old_files=None, stop='', stopat=1, error='none', interval=100000, starting=1, seed=0, step=10):
    """Run ``util.vote`` on a MAR, then code ``query_boost`` batches.

    Each round the first ``read.step`` ids returned by
    ``read.query_boost()`` are passed to ``read.code_error``.

    Args:
        filename: Data file handed to ``MAR.create``.
        old_files: Passed through to ``MAR.create``. ``None`` means an
            empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else
            to False.
        stopat: Fraction (converted to float) applied to
            ``read.est_num`` in the stop rule.
        error: Forwarded to ``read.code_error``.
        interval: Stored on ``read.interval``.
        starting: Accepted for signature compatibility; not used here.
        seed: Seed for ``np.random``.
        step: Stored on ``read.step``; also the batch size.

    Returns:
        The MAR instance, once everything is coded or, with estimation
        enabled, once ``read.est_num * stopat <= pos``.
    """
    old_files = [] if old_files is None else old_files
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    util.vote(read)

    # Not consulted by the loop below; the call is kept in case
    # get_allpos() has side effects.
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    read.get_numbers()

    # Run one query before the loop, then store est_num as the first
    # entry of the 'est' record.
    read.query_boost()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C propagates.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if read.enable_est and read.est_num * stopat <= pos:
            break
        for doc_id in read.query_boost()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
Пример #26
0
def Supervised(filename, old_files=None, stop='', stopat=1, error='none', interval=100000, starting=1, seed=0,
               step=10, learner='svm_linear', boost=None):
    """Code every item, ordered by a supervised or a boosted query.

    With ``boost`` set, ``util.vote`` is run first and batches come from
    ``read.query_boost()``. Otherwise ``read.train_supervised(learner,
    seed)`` is run (only when ``boost`` is None) and batches come from
    ``read.query_supervised()``. Each round the first ``read.step`` ids
    are passed to ``read.code_batch``.

    NOTE(review): the original had an unconditional ``return`` right after
    the ``if boost:`` block, which made everything below unreachable and
    the function always return None. It is removed here on the assumption
    that it was a debugging leftover — confirm.

    Args:
        filename: Data file handed to ``MAR.create``.
        old_files: Passed through to ``MAR.create``. ``None`` means an
            empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else
            to False.
        stopat: Converted to float; only used to compute ``target``.
        error: Accepted for signature compatibility; not used here.
        interval: Stored on ``read.interval``.
        starting: Accepted for signature compatibility; not used here.
        seed: Seed for ``np.random``; also stored on ``read.seed``.
        step: Stored on ``read.step``; also the batch size.
        learner: Forwarded to ``read.train_supervised``.
        boost: Classifier name forwarded to ``util.vote``. A truthy value
            selects the boosted path.

    Returns:
        The MAR instance once ``pos + neg >= total``.
    """
    old_files = [] if old_files is None else old_files
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    read.seed = seed

    if boost:
        util.vote(read, clf_name=boost, seed=seed, all=False, temp=str(seed) + filename)

    # Not consulted by the loop below; the call is kept in case
    # get_allpos() has side effects.
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    if boost is None:
        read.train_supervised(learner, seed)
    read.get_numbers()

    query = read.query_boost if boost else read.query_supervised

    # Run one query before the loop, then store est_num as the first
    # entry of the 'est' record.
    query()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        if pos + neg >= total:
            break
        read.code_batch(query()[:read.step])
    return read
Пример #27
0
def _simulate_human_label(true_label, human_error):
    """Return ``true_label``, flipped at random to imitate reviewer error.

    A "no" becomes "yes" with probability ``human_error ** 2``; a "yes"
    becomes "no" with probability ``2 * (human_error - human_error ** 2)``.
    Any other value is returned unchanged and draws no random number.
    """
    if true_label == "no":
        return "yes" if random.random() < human_error ** 2 else "no"
    if true_label == "yes":
        flip = random.random() < 2 * (human_error - human_error ** 2)
        return "no" if flip else "yes"
    # Previously such a label reused the preceding item's result, or
    # raised on the very first item.
    return true_label


def START_ERROR(filename):
    """Code documents with simulated human error until positives stall.

    While no positive exists, batches come from ``read.random()``;
    afterwards they are the third value returned by ``read.train()``.
    Each id is coded with its ``read.body["label"]`` value passed through
    ``_simulate_human_label`` with an error rate of 0.2. Once at least 10
    positives exist, three consecutive rounds without a change in ``pos``
    end the loop. ``read.export()`` is then called.

    Args:
        filename: Data file handed to ``MAR.create``.

    Returns:
        The MAR instance.
    """
    read = MAR()
    read = read.create(filename)
    pos_last = 0
    full_life = 3
    human_error = 0.2
    life = full_life
    while True:
        pos, neg, total = read.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= 10:
            if pos == pos_last:
                life = life - 1
                if life == 0:
                    break
            else:
                life = full_life
        if pos == 0:
            ids = read.random()
        else:
            a, b, ids, c = read.train()
        for doc_id in ids:
            hl = _simulate_human_label(read.body["label"][doc_id], human_error)
            read.code(doc_id, hl)
        pos_last = pos
    read.export()
    return read
Пример #28
0
from __future__ import print_function, division
import sys, os

root = os.getcwd().split("src")[0] + "src/src/util"
sys.path.append(root)
from mar import MAR
from pdb import set_trace

if __name__ == "__main__":
    data_path = "Hall.csv"
    target_recall = 0.95
    thres = 10
    query = "defect prediction"
    read = MAR()
    read = read.create(data_path)
    read.enable_est = True
    if query:
        read.BM25(query.split())
    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except:
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos < 1:
            if query:
                ids, scores = read.BM25_get()