Example #1
def get_best_polyrbf_params(aizko_svm,
                            trainfeatsf,
                            kernel,
                            cgrid,
                            paramgrid,
                            workdir,
                            expname,
                            ntimes=3,
                            stratified=False,
                            rocarea_opt=False,
                            svmargs=''):

    bestc = cgrid[0]
    bestp = paramgrid[0]
    nfolds = 2
    rate = 0
    rate_idx = 8  #brier-score

    if kernel == 1:
        suffix = '.poly'
        param = 'd'
    elif kernel == 2:
        suffix = '.rbf'
        param = 'g'

    if rocarea_opt:
        suffix += '.rocarea'
    else:
        suffix += '.errorrate'

    suffix += '.gridsearch'

    redoing = False

    f = open(trainfeatsf)
    data = f.readlines()
    nlines = len(data)
    f.close()

    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]

    data = np.array(data)

    # class labels are needed below for the stratified partition
    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)

    for i in np.arange(ntimes):
        #create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            # stratified: partition each class separately and merge
            partition = np.empty([nlines, nfolds], dtype=bool)
            for lab in classes:
                gsiz = np.sum(testlabels == lab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == lab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0]
        trainf, testf = twofold_file_split(basefname, data, partition)

        for cval in cgrid:
            for pval in paramgrid:
                fail_count = 0
                done = False
                while not done:
                    texpname = expname + '_c' + str(cval) + '_' + param + str(
                        pval) + suffix
                    try:
                        results = svm_polyrbf_test(aizko_svm, trainf, testf,
                                                   texpname, workdir, cval,
                                                   pval, redoing, rocarea_opt,
                                                   svmargs)
                        done = True
                    except Exception:
                        log.debug('Failed. Repeating...')
                        partition = cvpartition(nlines, 2)
                        trainf, testf = twofold_file_split(basefname, data,
                                                           partition)
                        fail_count += 1
                        if fail_count >= 10:
                            log.error('Unexpected error: ' +
                                      str(sys.exc_info()))
                            log.debug('Failed too many times.')
                            raise

                    if done:
                        log.debug(results)
                        new_rate = results[rate_idx]
                        if rate < new_rate:
                            rate = new_rate
                            bestc = cval
                            bestp = pval

        remove_all(find(os.listdir(workdir), 'gridsearch'), workdir)

    return bestc, bestp
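
A minimal usage sketch for the function above. The binary name, file paths and grids below are hypothetical placeholders, not values taken from the original project:

# Hypothetical call: grid-search an RBF kernel (kernel == 2) over a small C/gamma grid.
# 'svm_perf_learn', 'features.svmperf' and 'gridsearch_workdir' are made-up names.
cgrid     = [0.01, 0.1, 1, 10, 100]
gammagrid = [0.001, 0.01, 0.1, 1]

bestc, bestg = get_best_polyrbf_params('svm_perf_learn',
                                       'features.svmperf',
                                       kernel=2,
                                       cgrid=cgrid,
                                       paramgrid=gammagrid,
                                       workdir='gridsearch_workdir',
                                       expname='myexp',
                                       stratified=True)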
Example #2
def get_best_c_param(aizko_svm,
                     trainfeatsf,
                     cgrid,
                     workdir,
                     expname,
                     ntimes=3,
                     stratified=False,
                     rocarea_opt=False,
                     svmargs=''):

    bestc = cgrid[0]
    rate = 0
    nfolds = 2
    #rate_idx = 8 #brier
    rate_idx = 0  #accuracy

    log.debug('Grid search optimization index: ' + str(rate_idx))

    if rocarea_opt:
        suffix = '.linear.rocarea.gridsearch'
    else:
        suffix = '.linear.errorrate.gridsearch'

    redoing = False

    f = open(trainfeatsf)
    data = f.readlines()
    nlines = len(data)
    f.close()

    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]

    data = np.array(data)
    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)
    classnum = len(classes)

    for i in np.arange(ntimes):
        #create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            # stratified: partition each class separately and merge
            partition = np.empty([nlines, nfolds], dtype=int)
            for lab in classes:
                gsiz = np.sum(testlabels == lab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == lab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0] + '.' + expname
        trainf, testf = twofold_file_split(basefname, data, partition)

        #evaluate best parameter
        for cval in cgrid:
            fail_count = 0
            done = False
            texpname = expname + '_c' + str(cval) + suffix
            while not done:
                try:
                    results = svm_linear_test(aizko_svm, trainf, testf,
                                              texpname, workdir, cval, redoing,
                                              rocarea_opt, svmargs)
                    done = True
                except Exception:
                    log.error('Unexpected error: ' + str(sys.exc_info()))
                    log.debug('Failed. Repeating...')
                    partition = cvpartition(nlines, 2)
                    trainf, testf = twofold_file_split(basefname, data,
                                                       partition)
                    fail_count += 1
                    if fail_count >= 10:
                        log.error('Unexpected error: ' + str(sys.exc_info()))
                        log.debug('Failed too many times.')
                        raise

            if done:
                log.debug(results)
                new_rate = results[rate_idx]
                if rate < new_rate:
                    rate = new_rate
                    bestc = cval

        remove_all(find(os.listdir(workdir), suffix), workdir)

    return bestc
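
Both helpers above (and main() below) call a cvpartition(n, nfolds) routine that is not listed in these examples. The sketch below is only an assumption inferred from how its result is indexed: an n-by-nfolds 0/1 matrix whose column f marks the test items of fold f.

import numpy as np

def cvpartition(n, nfolds):
    """Hypothetical sketch, not the original helper: return an (n, nfolds)
    0/1 matrix where a 1 in column f means the item belongs to the test set
    of fold f (and 0 means it belongs to the training set)."""
    idx = np.random.permutation(n)            # shuffle the item indices
    part = np.zeros((n, nfolds), dtype=int)
    for f in range(nfolds):
        # every nfolds-th shuffled item becomes a test item of fold f
        part[idx[f::nfolds], f] = 1
    return part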
Example #3
def main(argv=None):

    parser = argparse.ArgumentParser(description='Creates text files with the same number of lines as the subjs file, with 0s and 1s indicating which subjects go to the training set (0) or the test set (1).')
    parser.add_argument('-c','--classes', dest='classes', required=True, help='Class label file, one line per class: <class_label>,<class_name>.')
    parser.add_argument('-s','--subjs',  dest='subjs', required=True, help='List file with the subjects for the analysis. Each line: <class_label>,<subject_file>.')
    parser.add_argument('-k','--folds', dest='folds', type=int, default=10, required=False, help='Number of folds to separate the data. Set to 0 for leave-one-out.')
    parser.add_argument('-o','--out', dest='outdir', required=True, help='Name of the output directory where the results will be put.')
    parser.add_argument('-b','--balanced', dest='balanced', default='1', choices=['1','0'], required=False, help='If 1, a proportional number of subjects is taken from each class; otherwise subjects are picked randomly from the list (default: 1).')

    args     = parser.parse_args()

    classf   = args.classes.strip()
    subjsf   = args.subjs.strip()
    outdir   = args.outdir.strip()
    folds    = args.folds
    balanced = args.balanced.strip()

    #reading label file
    labels     = []
    classnames = []

    labfile = open(classf, 'r')
    for l in labfile:
        line = l.strip().split(',')
        labels    .append (int(line[0]))
        classnames.append (line[1])

    labfile.close()

    labels     = np.array (labels)
    classnames = np.array (classnames)

    #reading subjects list
    subjlabidx = []
    subjs      = []
    subjfile   = open(subjsf, 'r')
    for s in subjfile:
        line = s.strip().split(',')
        lab = int(line[0])
        idx = np.where(labels == lab)[0]
        subjlabidx.append(idx[0])
        subjs.append     (line[1])

    subjfile.close()

    #transforming from list to vector
    subjlabidx = np.array (subjlabidx)
    subjs      = np.array (subjs)

    classnum = labels.size
    subjsnum = subjlabidx.size

    #if output dir does not exist, create
    if not(os.path.exists(outdir)):
        os.mkdir(outdir)

    #copying input files to outdir
    #shutil.copy (subjsf, outdir + os.path.sep + os.path.basename(subjsf))
    #shutil.copy (classf, outdir + os.path.sep + os.path.basename(classf))

    #saving the input files to the output folder
    #outf_subjs  = outdir + os.path.sep + 'subjects'
    #outf_labels = outdir + os.path.sep + 'labels'
    #np.savetxt(outf_subjs,  subjs,      fmt='%s')
    #np.savetxt(outf_labels, subjlabels, fmt='%i')

    #generating partitions
    if balanced == '1':
        #gsiz[i] has number of subjects of group with label in idx i
        #gcvpart will have the partition for group i
        #cvparts will be iteratively filled with partition information for each group
        cvparts = np.empty([subjsnum, folds], dtype=int)
        for i in range(classnum):
            gsiz    = sum(subjlabidx == i)
            gcvpart = cvpartition (gsiz, folds)
            for f in range(folds):
                cvparts[subjlabidx == i,f] = gcvpart[:,f]

    else:
        cvparts = cvpartition(subjsnum, folds)

    #generating files
    np.savetxt(outdir + '/all.txt', np.column_stack([labels[subjlabidx],subjs]), fmt='%s,%s')

    for i in range(folds):
        part   = cvparts[:,i]
        fname = outdir + '/fold_'  + str(i+1).zfill(4) + '.txt'

        f = open(fname, 'w')
        f.write ('#subjects file name: ' + subjsf)
        f.write ('\n')
        f.write ('#number of subjects: ' + str(len(part)))
        f.write ('\n')
        f.write ('#fold number: ' + str(i+1))
        f.write ('\n')
        f.write ('#training set size: ' + str(sum(part==0)))
        f.write ('\n')
        f.write ('#training set label: 0')
        f.write ('\n')
        f.write ('#test set size: ' + str(sum(part==1)))
        f.write ('\n')
        f.write ('#test set label: 1')
        f.write ('\n')

        np.savetxt(f, part, fmt='%i')

        f.close()

    return 0
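
Each fold file written above contains a short '#' header followed by one 0/1 flag per subject, in the same order as all.txt. A small reading sketch; 'folds_out' stands for the -o output directory and is only an illustrative name:

import numpy as np

# subjects are stored in all.txt as '<label>,<subject_file>' lines
with open('folds_out/all.txt') as fin:
    subjects = np.array([line.strip().split(',')[1] for line in fin])

# '#' header lines are skipped by np.loadtxt; one 0/1 flag per subject remains
part = np.loadtxt('folds_out/fold_0001.txt', dtype=int)

train_subjects = subjects[part == 0]   # flag 0: training set
test_subjects  = subjects[part == 1]   # flag 1: test set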
Example #4
def get_best_polyrbf_params (aizko_svm, trainfeatsf, kernel, cgrid, paramgrid, workdir, expname, ntimes=3, stratified=False, rocarea_opt=False, svmargs=''):

   bestc    = cgrid[0]
   bestp    = paramgrid[0]
   nfolds   = 2
   rate     = 0 
   rate_idx = 8 #brier-score

   if   kernel == 1:
      suffix = '.poly'
      param  = 'd'
   elif kernel == 2:
      suffix = '.rbf'
      param  = 'g'

   if rocarea_opt:
      suffix += '.rocarea'
   else:
      suffix += '.errorrate'

   suffix += '.gridsearch'

   redoing = False

   f      = open(trainfeatsf)
   data   = f.readlines()
   nlines = len(data)
   f.close()

   if data[0][0] == '#':
      nlines -= 1
      data    = data[1:]

   data = np.array(data)

   #class labels are needed below for the stratified partition
   testlabels = read_labels_from_svmperf_file (trainfeatsf)
   classes    = np.unique(testlabels)

   for i in np.arange(ntimes):
      #create partitions
      if not stratified:
         partition = cvpartition (nlines, nfolds)
      else:
         #stratified: partition each class separately and merge
         partition = np.empty([nlines, nfolds], dtype=bool)
         for lab in classes:
            gsiz    = np.sum(testlabels == lab)
            gcvpart = cvpartition (gsiz, nfolds)
            for fold in np.arange(nfolds):
               partition[testlabels == lab,fold] = gcvpart[:,fold]
         partition = np.bool_(partition)

      basefname       = os.path.splitext(trainfeatsf)[0]
      [trainf, testf] = twofold_file_split (basefname, data, partition)

      for cval in cgrid:
         for pval in paramgrid:
            fail_count = 0
            done = False
            while not done:
               texpname = expname + '_c' + str(cval) + '_' + param + str(pval) + suffix
               try:
                  results = svm_polyrbf_test(aizko_svm, trainf, testf, texpname, workdir, cval, pval, redoing, rocarea_opt, svmargs)
                  done = True
               except Exception:
                  log.debug ('Failed. Repeating...')
                  partition       = cvpartition (nlines, 2)
                  basefname       = os.path.splitext(trainfeatsf)[0]
                  [trainf, testf] = twofold_file_split (basefname, data, partition)
                  fail_count += 1
                  if fail_count >= 10:
                     log.error ('Unexpected error: ' + str(sys.exc_info()))
                     log.debug ('Failed too many times.')
                     raise

               if done:
                  log.debug(results)
                  new_rate = results[rate_idx]
                  if rate < new_rate:
                     rate   = new_rate
                     bestc  = cval
                     bestp  = pval

      remove_all(find(os.listdir(workdir), 'gridsearch'), workdir)

   return bestc, bestp
Example #5
def get_best_c_param (aizko_svm, trainfeatsf, cgrid, workdir, expname, ntimes=3, stratified=False, rocarea_opt=False, svmargs=''):

   bestc    = cgrid[0]
   rate     = 0
   nfolds   = 2
   #rate_idx = 8 #brier
   rate_idx = 0 #accuracy

   log.debug('Grid search optimization index: ' + str(rate_idx))

   if rocarea_opt:
      suffix = '.linear.rocarea.gridsearch'
   else:
      suffix = '.linear.errorrate.gridsearch'

   redoing = False

   f      = open(trainfeatsf)
   data   = f.readlines()
   nlines = len(data)
   f.close()

   if data[0][0] == '#':
      nlines -= 1
      data    = data[1:]

   data        = np.array(data)
   testlabels  = read_labels_from_svmperf_file (trainfeatsf)
   classes     = np.unique(testlabels)
   classnum    = len(classes)

   for i in np.arange(ntimes):
      #create partitions
      if not stratified:
         partition = cvpartition (nlines, nfolds)
      else:
         #stratified: partition each class separately and merge
         partition = np.empty([nlines, nfolds], dtype=int)
         for lab in classes:
            gsiz    = np.sum(testlabels == lab)
            gcvpart = cvpartition (gsiz, nfolds)
            for fold in np.arange(nfolds):
               partition[testlabels == lab,fold] = gcvpart[:,fold]
         partition = np.bool_(partition)

      basefname       = os.path.splitext(trainfeatsf)[0] + '.' + expname
      [trainf, testf] = twofold_file_split (basefname, data, partition)

      #evaluate best parameter
      for cval in cgrid:
         fail_count = 0
         done = False
         texpname = expname + '_c' + str(cval) + suffix
         while not done:
            try:
               results = svm_linear_test (aizko_svm, trainf, testf, texpname, workdir, cval, redoing, rocarea_opt, svmargs)
               done = True
            except Exception:
               log.error ('Unexpected error: ' + str(sys.exc_info()))
               log.debug ('Failed. Repeating...')
               partition       = cvpartition (nlines, 2)
               [trainf, testf] = twofold_file_split (basefname, data, partition)
               fail_count += 1
               if fail_count >= 10:
                  log.error ('Unexpected error: ' + str(sys.exc_info()))
                  log.debug ('Failed too many times.')
                  raise

         if done:
            log.debug (results)
            new_rate = results[rate_idx]
            if rate < new_rate:
               rate   = new_rate
               bestc  = cval

      remove_all(find(os.listdir(workdir), suffix), workdir)

   return bestc
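
The grid-search helpers above also depend on a twofold_file_split(basefname, data, partition) routine that is not listed here. The following is a hypothetical sketch, assuming that the first partition column marks the test lines and that the helper returns the names of the two files it writes (both inferred from the call sites, not confirmed by the source):

import numpy as np

def twofold_file_split(basefname, data, partition):
    """Hypothetical sketch, not the original helper: split the lines in
    'data' into a training file and a test file using the first column of
    'partition' (True/1 = test line) and return both file names."""
    testmask = np.bool_(partition)[:, 0]

    trainf = basefname + '.train'
    testf  = basefname + '.test'

    with open(trainf, 'w') as fout:
        fout.writelines(data[~testmask])
    with open(testf, 'w') as fout:
        fout.writelines(data[testmask])

    return trainf, testf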
Example #6
def main(argv=None):

    parser = argparse.ArgumentParser(
        description=
        'Creates text files with the same number of lines as the subjs file, with 0s and 1s indicating which subjects go to the training set (0) or the test set (1).'
    )
    parser.add_argument(
        '-c',
        '--classes',
        dest='classes',
        required=True,
        help='Class label file, one line per class: <class_label>,<class_name>.'
    )
    parser.add_argument(
        '-s',
        '--subjs',
        dest='subjs',
        required=True,
        help=
        'List file with the subjects for the analysis. Each line: <class_label>,<subject_file>.'
    )
    parser.add_argument(
        '-k',
        '--folds',
        dest='folds',
        type=int,
        default=10,
        required=False,
        help=
        'Number of folds to separate the data. Set to 0 for leave-one-out.'
    )
    parser.add_argument(
        '-o',
        '--out',
        dest='outdir',
        required=True,
        help='Name of the output directory where the results will be put.')
    parser.add_argument(
        '-b',
        '--balanced',
        dest='balanced',
        default='1',
        choices=['1', '0'],
        required=False,
        help=
        'If 1, a proportional number of subjects is taken from each class; otherwise subjects are picked randomly from the list (default: 1).'
    )

    args = parser.parse_args()

    classf = args.classes.strip()
    subjsf = args.subjs.strip()
    outdir = args.outdir.strip()
    folds = args.folds
    balanced = args.balanced.strip()

    #reading label file
    labels = []
    classnames = []

    labfile = open(classf, 'r')
    for l in labfile:
        line = l.strip().split(',')
        labels.append(int(line[0]))
        classnames.append(line[1])

    labfile.close()

    labels = np.array(labels)
    classnames = np.array(classnames)

    #reading subjects list
    subjlabidx = []
    subjs = []
    subjfile = open(subjsf, 'r')
    for s in subjfile:
        line = s.strip().split(',')
        lab = int(line[0])
        idx = np.where(labels == lab)[0]
        subjlabidx.append(idx[0])
        subjs.append(line[1])

    subjfile.close()

    #transforming from list to vector
    subjlabidx = np.array(subjlabidx)
    subjs = np.array(subjs)

    classnum = labels.size
    subjsnum = subjlabidx.size

    #if output dir does not exist, create
    if not (os.path.exists(outdir)):
        os.mkdir(outdir)

    #copying input files to outdir
    #shutil.copy (subjsf, outdir + os.path.sep + os.path.basename(subjsf))
    #shutil.copy (classf, outdir + os.path.sep + os.path.basename(classf))

    #saving the input files to the output folder
    #outf_subjs  = outdir + os.path.sep + 'subjects'
    #outf_labels = outdir + os.path.sep + 'labels'
    #np.savetxt(outf_subjs,  subjs,      fmt='%s')
    #np.savetxt(outf_labels, subjlabels, fmt='%i')

    #generating partitions
    if balanced == '1':
        #gsiz[i] has number of subjects of group with label in idx i
        #gcvpart will have the partition for group i
        #cvparts will be iteratively filled with partition information for each group
        cvparts = np.empty([subjsnum, folds], dtype=int)
        for i in range(classnum):
            gsiz = sum(subjlabidx == i)
            gcvpart = cvpartition(gsiz, folds)
            for f in range(folds):
                cvparts[subjlabidx == i, f] = gcvpart[:, f]

    else:
        cvparts = cvpartition(subjsnum, folds)

    #generating files
    np.savetxt(outdir + '/all.txt',
               np.column_stack([labels[subjlabidx], subjs]),
               fmt='%s,%s')

    for i in range(folds):
        part = cvparts[:, i]
        fname = outdir + '/fold_' + str(i + 1).zfill(4) + '.txt'

        f = open(fname, 'w')
        f.write('#subjects file name: ' + subjsf)
        f.write('\n')
        f.write('#number of subjects: ' + str(len(part)))
        f.write('\n')
        f.write('#fold number: ' + str(i + 1))
        f.write('\n')
        f.write('#training set size: ' + str(sum(part == 0)))
        f.write('\n')
        f.write('#training set label: 0')
        f.write('\n')
        f.write('#test set size: ' + str(sum(part == 1)))
        f.write('\n')
        f.write('#test set label: 1')
        f.write('\n')

        np.savetxt(f, part, fmt='%i')

        f.close()

    return 0