Пример #1
0
    def setup(self, argv):
        self.set_parameters(argv[1:])
        print self.set1, self.set2
        print self.__dict__.items()

        self.data = make_d.read_data(open(argv[0]))
        self.data = make_d.assign_classes(self.data)
        self.data = make_d.prepare_data(self.data)
        print self.data.keys(), [len(v) for v in self.data.values()]
        pass
Пример #2
0
def main(argv):

    fn = argv[0]
    data = make_d.prepare_data(make_d.read_data(open(fn)))
    N = 10000

    xdata = []
    classes = []
    for x in data:
        xdata.extend([y for y in data[x]])
        classes.extend([x for y in data[x]])
    
    mic = compute_mic_pos_class(xdata, classes)
    print mic
    cmp_mic = mic
    
    rnd_mic = [0.0 for x in mic]
    for i in xrange(N):
        data = make_d.prepare_data(make_d.read_data(open(fn)),
                                   randomize='class-shuffle')
        xdata = []
        classes = []
        for x in data:
            xdata.extend([y for y in data[x]])
            classes.extend([x for y in data[x]])
        mic = compute_mic_pos_class(xdata, classes)
        rnd_mic = map(sum, zip(rnd_mic, mic))
    rnd_mic = [x/N for x in rnd_mic]

    print cmp_mic
    print rnd_mic

    fo = open('mic.csv', 'w')
    fo.write('MIC_computed,MIC_random\n')
    for xy in zip(cmp_mic, rnd_mic):
        print xy
        fo.write('%f,%f\n' % xy)
    fo.close()
    return None
Пример #3
0
def main(argv):
    """Build train/test splits for repeated SVM runs on the file argv[0].

    argv[0] is the input path and argv[1:] is passed to init().  The data is
    read and prepared through make_d, an svm_parameter object is configured
    according to KERNEL_TYPE, and a CSV log file named after TIMESTAMP, the
    kernel type, the RANDOMIZE_DATA flag and the input (or the SET1/SET2
    members) is opened.  Each loop iteration calls make_d.make_set with
    training_fraction=0.75 to obtain label/sample lists.

    NOTE(review): the visible listing stops inside the ``while`` loop; the
    training step, the increment of ``i`` and the closing of ``logfile`` are
    not shown here.
    """

    # Only GAMMA_RANGE is rebound in the visible code; the other globals are
    # read (SET1, SET2) or not touched here (C_RANGE).
    global C_RANGE
    global GAMMA_RANGE
    global SET1
    global SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    # Presumably parses the remaining options into module globals such as
    # SET1/SET2 -- confirm against init().
    init(argv[1:])    
    print SET1, SET2
    # return None

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    dataset = make_d.assign_classes(dataset)
    
    data = make_d.prepare_data(dataset)
    # Show the class keys and the number of entries per class.
    print data.keys(), [len(v) for v in data.values()]

    # '-b 1' is the LIBSVM option string that enables probability estimates.
    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE == 'LINEAR':
        param.kernel_type = svm.LINEAR
        # Presumably a (start, stop, step) triple that narrows the gamma
        # search for the linear kernel -- confirm against the grid search.
        GAMMA_RANGE = 1, 0, -2
    else:
        param.kernel_type = svm.RBF

    # Cross-validation settings; not used in the visible part of the loop.
    cvfunc = leave_one_out
    n_cv = None

    # Restrict the run to two groups of classes only when both are given.
    use_sets = not SET1 is None and not SET2 is None

    # Log name component: input basename without its FASTA extension, or the
    # int-converted members of SET1 and SET2 joined by 'vs'.
    outfile = os.path.basename(fn)
    outfile = outfile.replace('.fasta', '')
    outfile = outfile.replace('.fas', '')
    if use_sets:
        outfile = ''.join(map(str, map(int, SET1))) + 'vs' + ''.join(map(str, map(int, SET2)))

    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, 
                                    KERNEL_TYPE,
                                    int(RANDOMIZE_DATA),
                                    outfile)
    logfile = open(log_name, 'w')                                    

    while i < N_RUNS:
        # Progress indicator: run index, flushed immediately.
        sys.stdout.write('%i ' % i)
        sys.stdout.flush()


        if use_sets:
            """ TODO: If set sizes are reasonably large, do not use the complete smallest set. """
            # Keep only the classes listed in SET1 / SET2.
            set1 = dict([item for item in data.items() 
                         if item[0] in SET1])
            set2 = dict([item for item in data.items()
                         if item[0] in SET2])
            set1 = make_d.make_set(set1, training_fraction=1.0)
            set2 = make_d.make_set(set2, training_fraction=1.0)
            # Index 1 is the second element of make_set's result (train_x in
            # the unpacking below); the two groups become classes 1.0 / -1.0.
            new_sets = {1.0: set1[1], -1.0: set2[1]}
            sets = make_d.make_set(new_sets, training_fraction=0.75)
        else:
            sets = make_d.make_set(data, training_fraction=0.75)
        train_y, train_x, test_y, test_x = sets

        if RANDOMIZE_DATA:
            # Shuffles the label lists in place, independently of the sample
            # lists, so labels no longer line up with their samples
            # (presumably a permutation control -- confirm).
            random.shuffle(train_y)
            random.shuffle(test_y)
            pass
        
        # Sizes of the four lists returned by make_set.
        print [len(x) for x in sets]
Пример #4
0
def main(argv):
    """Build train/test splits for repeated SVM runs on the file argv[0].

    argv[0] is the input path and argv[1:] is passed to init().  The data is
    read and prepared through make_d, an svm_parameter object is configured
    according to KERNEL_TYPE, and a CSV log file named after TIMESTAMP, the
    kernel type, the RANDOMIZE_DATA flag and the input (or the SET1/SET2
    members) is opened.  When both SET1 and SET2 are set, the classes are
    first combined by make_d.merge_multiclasses before each split.

    NOTE(review): the visible listing stops inside the ``while`` loop; the
    training step, the increment of ``i`` and the closing of ``logfile`` are
    not shown here.
    """

    # Only GAMMA_RANGE is rebound in the visible code; the other globals are
    # read (SET1, SET2) or not touched here (C_RANGE).
    global C_RANGE
    global GAMMA_RANGE
    global SET1
    global SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    # Presumably parses the remaining options into module globals such as
    # SET1/SET2 -- confirm against init().
    init(argv[1:])    
    print SET1, SET2

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    dataset = make_d.assign_classes(dataset)
    data = make_d.prepare_data(dataset)
    # Show the class keys and the number of entries per class.
    print data.keys(), [len(v) for v in data.values()]

    # '-b 1' is the LIBSVM option string that enables probability estimates.
    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE == 'LINEAR':
        param.kernel_type = svm.LINEAR
        # Presumably a (start, stop, step) triple that narrows the gamma
        # search for the linear kernel -- confirm against the grid search.
        GAMMA_RANGE = 1, 0, -2
    else:
        param.kernel_type = svm.RBF

    # Cross-validation settings; not used in the visible part of the loop.
    cvfunc = svmfun.leave_one_out
    n_cv = None

    # Restrict the run to two groups of classes only when both are given.
    limit_sets = not SET1 is None and not SET2 is None

    # Log name component: input basename without its FASTA extension, or the
    # int-converted members of SET1 and SET2 joined by 'vs'.
    outfile = os.path.basename(fn)
    outfile = outfile.replace('.fasta', '')
    outfile = outfile.replace('.fas', '')
    if limit_sets:
        outfile = ''.join(map(str, map(int, SET1))) + 'vs'
        outfile += ''.join(map(str, map(int, SET2)))

    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, 
                                    KERNEL_TYPE,
                                    int(RANDOMIZE_DATA),
                                    outfile)
    logfile = open(log_name, 'w')                                    

    while i < N_RUNS:
        # Progress indicator: run index, flushed immediately.
        sys.stdout.write('%i ' % i)
        sys.stdout.flush()

        if limit_sets:
            # Presumably collapses the SET1 and SET2 classes into a
            # two-class mapping -- confirm against merge_multiclasses.
            new_sets = make_d.merge_multiclasses(data, SET1, SET2)
            sets = make_d.make_set(new_sets, training_fraction=0.75)
        else:
            sets = make_d.make_set(data, training_fraction=0.75)
        train_y, train_x, test_y, test_x = sets

        if RANDOMIZE_DATA:
            # Shuffles the label lists in place, independently of the sample
            # lists, so labels no longer line up with their samples
            # (presumably a permutation control -- confirm).
            random.shuffle(train_y)
            random.shuffle(test_y)
            pass
        
        # Sizes of the four lists returned by make_set.
        print [len(x) for x in sets]
Пример #5
0
def main(argv):
    """Prepare a training set from argv[0] and a separate test set from argv[1].

    argv[2:] is passed to init().  Both files are read and prepared through
    make_d after the first element of every record's second field has been
    dropped.  Training data is limited to the classes 1.0 and 0.0.  In the
    test data class 0.0 is relabelled to -1.0 and all samples are encoded
    with make_d.encode.  Each loop iteration rebuilds a two-class
    (1.0 / -1.0) training set from the training data.

    NOTE(review): the visible listing stops inside the ``while`` loop; the
    training/prediction step, the increment of ``i`` and the closing of
    ``logfile`` are not shown here.
    """

    # Only GAMMA_RANGE is rebound in the visible code; the other globals are
    # read (SET1, SET2) or not touched here (C_RANGE).
    global C_RANGE
    global GAMMA_RANGE
    global SET1
    global SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    # argv[0] and argv[1] are the two data files; the rest are options.
    init(argv[2:])
    print SET1, SET2

    # "-b 1" is the LIBSVM option string that enables probability estimates.
    param = svm.svm_parameter("-b 1")
    if KERNEL_TYPE == "LINEAR":
        param.kernel_type = svm.LINEAR
        # Presumably a (start, stop, step) triple that narrows the gamma
        # search for the linear kernel -- confirm against the grid search.
        GAMMA_RANGE = 1, 0, -2
    else:
        param.kernel_type = svm.RBF

    # Cross-validation settings; not used in the visible part of the loop.
    cvfunc = leave_one_out
    n_cv = None

    # Only affects the log file name in the visible code.
    use_sets = not SET1 is None and not SET2 is None

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    data = make_d.assign_classes(dataset)

    # Drop the first element of each record's second field (presumably the
    # first character of the sequence -- confirm).
    data = [(d[0], d[1][1:]) for d in data]

    data = make_d.prepare_data(data)
    """ Next line is just for testing. """
    # Keep only the classes 1.0 and 0.0.
    data = {1.0: data[1.0], 0.0: data[0.0]}
    print data.keys(), [len(v) for v in data.values()]

    testdata = make_d.read_data(open(argv[1]))
    testset = make_d.assign_classes(testdata)

    # Same trimming as for the training records above.
    testset = [(d[0], d[1][1:]) for d in testset]

    testset = make_d.prepare_data(testset)

    # Map each trimmed test value to the integers parsed from the last
    # '_'-separated part of the keys that carry it.
    precursor = {}
    for k, v in testdata.items():
        v = v[1:]
        precursor[v] = precursor.get(v, []) + [int(k.split("_")[-1])]
    print precursor

    # Log name component: input basename without its FASTA extension, or the
    # int-converted members of SET1 and SET2 joined by "vs".
    outfile = os.path.basename(fn)
    outfile = outfile.replace(".fasta", "")
    outfile = outfile.replace(".fas", "")
    if use_sets:
        outfile = "".join(map(str, map(int, SET1))) + "vs" + "".join(map(str, map(int, SET2)))

    log_name = "%s-%s-%i-%s.csv" % (TIMESTAMP, KERNEL_TYPE, int(RANDOMIZE_DATA), outfile)
    logfile = open(log_name, "w")

    """ Prepare test set (precursor fragments). """
    # Relabel class 0.0 as -1.0 so the test labels match the 1.0 / -1.0
    # labels of the training sets built in the loop below.
    testset[-1.0] = copy.deepcopy(testset[0.0])
    del testset[0.0]
    testset = make_d.make_set(testset, balanced_set=False, training_fraction=1.0)
    """ 'Training' and 'Test' sets flipped """
    # With training_fraction=1.0 the first two elements of make_set's result
    # are taken as the test labels and samples.
    test_y, test_x = testset[:2]
    encoded_x = [make_d.encode(x, make_d.encode_dic) for x in test_x]

    # logfile.write(',%s\n' % ','.join(map(str, map(int, test_y))))

    """ Train and predict """
    # One float slot per test sample; its use lies beyond the visible code.
    row = [0.0 for x in test_x]
    while i < N_RUNS:
        # Progress indicator: run index, flushed immediately.
        sys.stdout.write("%i " % i)
        sys.stdout.flush()

        # Build a two-class problem: class 1.0 stays 1.0, class 0.0 becomes
        # -1.0.  Index 1 is the second element of make_set's result
        # (train_x in the unpacking below).
        set1 = dict([item for item in data.items() if item[0] == 1.0])
        set2 = dict([item for item in data.items() if item[0] == 0.0])
        set1 = make_d.make_set(set1, training_fraction=1.0)
        set2 = make_d.make_set(set2, training_fraction=1.0)
        new_sets = {1.0: set1[1], -1.0: set2[1]}
        sets = make_d.make_set(new_sets, training_fraction=1.0)
        train_y, train_x, dummy_y, dummy_x = sets

        # Sizes of the four lists returned by make_set.
        print [len(x) for x in sets]
Пример #6
0
def main(argv):

    global C_RANGE
    global GAMMA_RANGE

    init(argv[2:])    

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    items = dataset.items()
    keys = [float(x[0].split('_')[0][3:]) for x in items]
    dataset = zip(keys, [v[1] for v in items])

    data = make_d.prepare_data(dataset)
    print data.keys(), [len(v) for v in data.values()]

    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE == 'LINEAR':
        param.kernel_type = svm.LINEAR
        GAMMA_RANGE = 1, 0, -2
    else:
        param.kernel_type = svm.RBF

    fn_test = argv[1]
    testdata = make_d.read_data(open(fn_test))
    testitems = testdata.items()
    testkeys = [float(x[0].split('_')[0][3:]) for x in testitems]
    testdataset = zip(testkeys, [v[1] for v in testitems])
    testdata = make_d.prepare_data(testdataset)
    

    cvfunc = leave_one_out
    n_cv = None

    outfile = os.path.basename(fn)
    outfile = outfile.replace('.fasta', '')
    outfile = outfile.replace('.fas', '')

    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, 
                                    KERNEL_TYPE,
                                    int(RANDOMIZE_DATA),
                                    outfile)
    logfile = open(log_name, 'w')
                                    

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    sets = make_d.make_set(data, balanced_set=False, training_fraction=1.0)
    train_y, train_x, test_y, test_x = sets
    train_x = [make_d.encode(x, make_d.encode_dic) for x in train_x]
    
    testsets = make_d.make_set(testdata, balanced_set=False, 
                               training_fraction=0.0)
    dummy0, dummy1, test_y, test_x = testsets
    test_x  = [make_d.encode(x, make_d.encode_dic) for x in test_x]
        
    param_grid = {}
    param_grid = grid_search(train_y, train_x, param, param_grid,
                             leave_one_out, n_cv, C_RANGE, GAMMA_RANGE)
    ranking = []
    for k, v in param_grid.items():
        recognized = [v_i[0][0] == v_i[3] for v_i in v]
        recog_rate = sum(map(int, recognized))/float(len(recognized))
        ranking.append((recog_rate, k))
    ranking.sort()
    
    param.C, param.gamma = map(lambda x: 2**x, ranking[-1][1])
    problem = svm.svm_problem(train_y, train_x)
    model = svmutil.svm_train(problem, param, '-q')
    result = svmutil.svm_predict(test_y, test_x, model, '-b 1')
    print result
    
    """
    cur_result = zip(result[0], test_y)
    cur_acc = compute_accuracy(cur_result)
    
    results.extend(cur_result)
    total_acc = compute_accuracy(results)

    sum_acc += cur_acc
    mean_acc = sum_acc/(i+1)
    # print cur_acc, mean_acc, total_acc
    
    logfile.write('%f,%f,%f\n' % (cur_acc, mean_acc, total_acc))

    
    print 'ACC', compute_accuracy(results)
    """
    logfile.close()
    return None
Пример #7
0
def main(argv):

    do_write = True

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    negset = make_d.read_data(open(argv[1]))
    
    items = dataset.items()
    keys = [float(x[0].split('_')[0][3]) for x in items]
    dataset = zip(keys, [v[1] for v in items])

    negitems = negset.items()
    negkeys = [float(x[0].split('_')[0][3]) for x in negitems]
    negset = zip(negkeys, [v[1] for v in negitems])               

    set1 = [x for x in dataset if x[0] == 1.0]
    print 'Set1:', len(set1), len(set(set1))
    set2 = [x for x in dataset if x[0] == 2.0]
    print 'Set2:', len(set2), len(set(set2))
    set4 = [x for x in dataset if x[0] == 4.0]
    print 'Set4:', len(set4), len(set(set4))
    set0 = [x for x in negset]
    print 'Set0:', len(set0)
    
        
    set1_2 = [(1.0, x[1]) for x in set1] + [(-1.0, x[1]) for x in set2]
    print 'Set1_2:', len(set1_2)
    if do_write: write_set(set1_2, open('1vs2.fas', 'w'))
    set1_4 = [(1.0, x[1]) for x in set1] + [(-1.0, x[1]) for x in set4]
    print 'Set1_4:', len(set1_4)
    if do_write: write_set(set1_4, open('1vs4.fas', 'w'))
    set2_4 = [(1.0, x[1]) for x in set2] + [(-1.0, x[1]) for x in set4]
    print 'Set2_4:', len(set2_4)
    if do_write: write_set(set2_4, open('2vs4.fas', 'w'))

    set12_4 = [(1.0, x[1]) for x in set1 + set2] + [(-1.0, x[1]) for x in set4]
    print 'Set12_4:', len(set12_4)
    if do_write: write_set(set12_4, open('12vs4.fas', 'w'))
    set14_2 = [(1.0, x[1]) for x in set1 + set4] + [(-1.0, x[1]) for x in set2]
    print 'Set14_2:', len(set14_2)
    if do_write: write_set(set14_2, open('14vs2.fas', 'w'))
    set24_1 = [(1.0, x[1]) for x in set2 + set4] + [(-1.0, x[1]) for x in set1]
    print 'Set24_1:', len(set24_1)
    if do_write: write_set(set24_1, open('24vs1.fas', 'w'))

    set_all = set1 + set2 + set4 + set0
    print 'Set_all:', len(set_all)
    if do_write: write_set(set_all, open('1vs2vs4vs0.fas', 'w'))

    set124_0 = [(1.0, x[1]) for x in set1 + set2 + set4] + [(-1.0, x[1]) for x in set0]
    print 'Set124_0:', len(set124_0)
    if do_write: write_set(set124_0, open('124vs0.fas', 'w'))
    
    set1_0 = [(1.0, x[1]) for x in set1] + [(-1.0, x[1]) for x in set0]
    print 'Set1_0:', len(set1_0)
    if do_write: write_set(set1_0, open('1vs0.fas', 'w'))
    set2_0 = [(1.0, x[1]) for x in set2] + [(-1.0, x[1]) for x in set0]
    print 'Set2_0:', len(set2_0)
    if do_write: write_set(set2_0, open('2vs0.fas', 'w'))
    set4_0 = [(1.0, x[1]) for x in set4] + [(-1.0, x[1]) for x in set0]
    print 'Set4_0:', len(set4_0)
    if do_write: write_set(set4_0, open('4vs0.fas', 'w'))

    set12_0 = [(1.0, x[1]) for x in set1 + set2] + [(-1.0, x[1]) for x in set0]
    print 'Set12_0:', len(set12_0)
    if do_write: write_set(set12_0, open('12vs0.fas', 'w'))
    set14_0 = [(1.0, x[1]) for x in set1 + set4] + [(-1.0, x[1]) for x in set0]
    print 'Set14_0:', len(set14_0)
    if do_write: write_set(set14_0, open('14vs0.fas', 'w'))
    set24_0 = [(1.0, x[1]) for x in set2 + set4] + [(-1.0, x[1]) for x in set0]
    print 'Set24_0:', len(set24_0)
    if do_write: write_set(set24_0, open('24vs0.fas', 'w'))
    
    

    return None