示例#1
0
def mainOptions(options, convertFunction=lambda x: x):
    try:
        #options.saveFile = getSaveFilename(options)
        filename = combineConfigs(options.saveFile, options, convertFunction)
        targetData = getFilename(options.targetBase, options.student, TRAIN)
        if (options.numSource == 0) or not (options.useSource):
            sourceData = []
        else:
            sourceData = [
                getFilename(options.sourceBase, s, TRAIN)
                for s in options.otherStudents
            ]
        # run the cmd
        print 'save config will be at', options.saveConfigFilename
        cmd = [
            'bin/%s/trainClassifier' % getArch(), filename, options.saveFile,
            targetData
        ] + sourceData
        if options.fracSourceData is not None:
            cmd += ['--fracSourceData', str(options.fracSourceData)]
        if options.debug:
            print ' '.join(cmd)
        if options.catchOutput:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            output, error = p.communicate()
        else:
            subprocess.check_call(cmd)
    finally:
        if not (options.debug):
            try:
                os.remove(filename)
            except:
                pass
    if options.catchOutput:
        return output, error
示例#2
0
def mainOptions(options, convertFunction=lambda x: x):
    try:
        # options.saveFile = getSaveFilename(options)
        filename = combineConfigs(options.saveFile, options, convertFunction)
        targetData = getFilename(options.targetBase, options.student, TRAIN)
        if (options.numSource == 0) or not (options.useSource):
            sourceData = []
        else:
            sourceData = [getFilename(options.sourceBase, s, TRAIN) for s in options.otherStudents]
        # run the cmd
        print "save config will be at", options.saveConfigFilename
        cmd = ["bin/%s/trainClassifier" % getArch(), filename, options.saveFile, targetData] + sourceData
        if options.fracSourceData is not None:
            cmd += ["--fracSourceData", str(options.fracSourceData)]
        if options.debug:
            print " ".join(cmd)
        if options.catchOutput:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            output, error = p.communicate()
        else:
            subprocess.check_call(cmd)
    finally:
        if not (options.debug):
            try:
                os.remove(filename)
            except:
                pass
    if options.catchOutput:
        return output, error
示例#3
0
def main(args=sys.argv[1:]):
    """Run the runClassifier binary with ``--notrain`` on one student's file.

    ``args`` is handed to ``parseArgs``; the resulting options supply the
    saved config file, the data file location and the number of tests.

    Raises:
        subprocess.CalledProcessError: if the binary exits non-zero.
    """
    opts = parseArgs(args)
    dataPath = getFilename(opts.testBase, opts.student, TRAIN)
    binary = 'bin/%s/runClassifier' % getArch()
    # Positional order expected on the command line: config, data, count.
    subprocess.check_call(
        [binary, opts.saveConfigFilename, dataPath, str(opts.numTest),
         '--notrain'])
示例#4
0
def main(basename,dataDir,stayWeight,treeOptions,options):
  students = ['gr','ta','gp','pd']
  
  for i,student in enumerate(students):
    if (options.studentInd is not None) and (i != options.studentInd):
      continue
    print '-------------------'
    print student
    print '-------------------'
    dataFile = getFilename(dataDir,student,TRAIN)
    createDT(dataFile,basename,'only-%s' % student,stayWeight,treeOptions,options.useWeka,options.numInstances,options.randomTree,options.numRandomTrees,options.featureFrac,options.resampleFrac,options.randomTreeInd)
示例#5
0
def makeTree(data,useWeka,stayWeight,base,name,treeOptions,randomTree,featureFrac):
  descFile = getFilename(base,name,DESC)
  #unweightedFile = getFilename(base,name,UNWEIGHTED)
  weightedFile = getFilename(base,name,WEIGHTED)
  if (stayWeight is not None) and (abs(stayWeight - 1.0) > 0.0001):
    print 'Adding stay weights'
    addARFFWeights(data,data,stayWeight)
  if useWeka:
    print 'Running weka to create initial tree'
    createTree(data,descFile,treeOptions,randomTree,featureFrac)
    print 'Extracting tree from weka output'
    # NOTE: changed weka to output class distributions, no longer need to add my own weights
    #if randomTree:
      #extractTree(data,descFile,unweightedFile)
      #print 'Adding class weights to tree'
      #weightTree(unweightedFile,data,weightedFile)
    #else:
    extractTree(data,descFile,weightedFile)
  else:
    print 'Running buildDT to create a weighted tree'
    buildDT(data,weightedFile,treeOptions,randomTree)
示例#6
0
def makeTree(data, useWeka, stayWeight, base, name, treeOptions, randomTree,
             featureFrac):
    descFile = getFilename(base, name, DESC)
    #unweightedFile = getFilename(base,name,UNWEIGHTED)
    weightedFile = getFilename(base, name, WEIGHTED)
    if (stayWeight is not None) and (abs(stayWeight - 1.0) > 0.0001):
        print 'Adding stay weights'
        addARFFWeights(data, data, stayWeight)
    if useWeka:
        print 'Running weka to create initial tree'
        createTree(data, descFile, treeOptions, randomTree, featureFrac)
        print 'Extracting tree from weka output'
        # NOTE: changed weka to output class distributions, no longer need to add my own weights
        #if randomTree:
        #extractTree(data,descFile,unweightedFile)
        #print 'Adding class weights to tree'
        #weightTree(unweightedFile,data,weightedFile)
        #else:
        extractTree(data, descFile, weightedFile)
    else:
        print 'Running buildDT to create a weighted tree'
        buildDT(data, weightedFile, treeOptions, randomTree)
示例#7
0
def main(basename, dataDir, stayWeight, treeOptions, options):
    students = ['gr', 'ta', 'gp', 'pd']

    for i, student in enumerate(students):
        if (options.studentInd is not None) and (i != options.studentInd):
            continue
        print '-------------------'
        print student
        print '-------------------'
        dataFile = getFilename(dataDir, student, TRAIN)
        createDT(dataFile, basename, 'only-%s' % student, stayWeight,
                 treeOptions, options.useWeka, options.numInstances,
                 options.randomTree, options.numRandomTrees,
                 options.featureFrac, options.resampleFrac,
                 options.randomTreeInd)
示例#8
0
def main(args=sys.argv[1:]):
    usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
    balanced = False
    if '--balanced' in args:
        args.remove('--balanced')
        balanced = True
    if (len(args) < 3) or (len(args) > 4):
        print >> sys.stderr, usage
        sys.exit(1)
    numClassifiers = 1000
    numTarget = int(args[0])
    numSource = int(args[1])
    jobStart = int(args[2])
    if len(args) >= 4:
        numJobs = int(args[3])
    else:
        numJobs = 1
    jobStart *= numJobs
    base = 'studentsNew29-unperturbed-%i'
    outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (
        numTarget, numSource)
    if not os.path.exists('data/dt/' + outputBase + '/desc'):
        os.makedirs('data/dt/' + outputBase + '/desc')
    if not os.path.exists('data/dt/' + outputBase + '/weighted'):
        os.makedirs('data/dt/' + outputBase + '/weighted')

    for jobOffset in range(numJobs):
        jobNum = jobStart + jobOffset
        studentInd = jobNum / numClassifiers
        classifierInd = jobNum % numClassifiers
        print 'job,student,classifier:', jobNum, studentInd, classifierInd

        students = getUniqueStudents()
        if studentInd >= len(students):
            return
        if balanced:
            probs = calcBalancedProbs(students, numSource, numTarget,
                                      studentInd)
        else:
            probs = calcProbs(students, numSource, numTarget, studentInd)
        print 'probs:', probs, sum(probs)

        counts = [0 for s in students]
        for i in range(numTarget):
            r = random.random()
            total = 0
            for j, p in enumerate(probs):
                total += p
                if r < total:
                    ind = j
                    break
            else:
                ind = len(probs) - 1
            counts[ind] += 1
        eps = 1e-10
        props = [
            eps + float(c) / (numTarget if i == studentInd else numSource)
            for i, c in enumerate(counts)
        ]

        try:
            arffFilename = makeTemp('.arff')
            #arffFilenameFilt = makeTemp('.arff')
            tempFile = makeTemp('.arff')
            with open(arffFilename, 'w') as arffFile:
                for i, (student, prop) in enumerate(zip(students, props)):
                    inFile = getFilename(
                        base % (numTarget if student == students[studentInd]
                                else numSource), student, TRAIN)
                    print 'resampling', student
                    resample(inFile, tempFile, prop)
                    with open(tempFile, 'r') as f:
                        if i != 0:
                            for line in f:
                                if line.strip() == '@data':
                                    break
                        for line in f:
                            arffFile.write(line)
            print 'removing trial step'
            #removeTrialStep(arffFilename,arffFilenameFilt)
            if balanced:
                name = 'trBagg-balanced-%s-%i' % (students[studentInd],
                                                  classifierInd)
            else:
                name = 'trBagg-%s-%i' % (students[studentInd], classifierInd)
            makeTree(arffFilename, True, None, outputBase, name, [], False,
                     1.0)
            #makeTree(arffFilenameFilt,True,None,outputBase,'trBagg-%s-%i' % (students[studentInd],classifierInd),[],False,1.0)
        finally:
            os.remove(arffFilename)
            #os.remove(arffFilenameFilt)
            os.remove(tempFile)
            os.remove(getFilename(outputBase, name, DESC))
示例#9
0
def main(args=sys.argv[1:]):
  usage = 'createTrBagg [--balanced] numTarget numSource jobStart [numJobs]'
  balanced = False
  if '--balanced' in args:
    args.remove('--balanced')
    balanced = True
  if (len(args) < 3) or (len(args) > 4):
    print >>sys.stderr,usage
    sys.exit(1)
  numClassifiers = 1000
  numTarget = int(args[0])
  numSource = int(args[1])
  jobStart = int(args[2])
  if len(args) >= 4:
    numJobs = int(args[3])
  else:
    numJobs = 1
  jobStart *= numJobs
  base = 'studentsNew29-unperturbed-%i'
  outputBase = 'studentsNew29-unperturbed-transfer/target%i-source%i' % (numTarget,numSource)
  if not os.path.exists('data/dt/' + outputBase + '/desc'):
    os.makedirs('data/dt/' + outputBase + '/desc')
  if not os.path.exists('data/dt/' + outputBase + '/weighted'):
    os.makedirs('data/dt/' + outputBase + '/weighted')

  for jobOffset in range(numJobs):
    jobNum = jobStart + jobOffset
    studentInd = jobNum / numClassifiers
    classifierInd = jobNum % numClassifiers
    print 'job,student,classifier:',jobNum,studentInd,classifierInd

    students = getUniqueStudents()
    if studentInd >= len(students):
      return
    if balanced:
      probs = calcBalancedProbs(students,numSource,numTarget,studentInd)
    else:
      probs = calcProbs(students,numSource,numTarget,studentInd)
    print 'probs:',probs,sum(probs)

    counts = [0 for s in students]
    for i in range(numTarget):
      r = random.random()
      total = 0
      for j,p in enumerate(probs):
        total += p
        if r < total:
          ind = j
          break
      else:
        ind = len(probs) - 1
      counts[ind] += 1
    eps = 1e-10
    props = [eps + float(c) / (numTarget if i == studentInd else numSource) for i,c in enumerate(counts)]

    try:
      arffFilename = makeTemp('.arff')
      #arffFilenameFilt = makeTemp('.arff')
      tempFile = makeTemp('.arff')
      with open(arffFilename,'w') as arffFile:
        for i,(student,prop) in enumerate(zip(students,props)):
          inFile = getFilename(base % (numTarget if student == students[studentInd] else numSource),student,TRAIN)
          print 'resampling',student
          resample(inFile,tempFile,prop)
          with open(tempFile,'r') as f:
            if i != 0:
              for line in f:
                if line.strip() == '@data':
                  break
            for line in f:
              arffFile.write(line)
      print 'removing trial step'
      #removeTrialStep(arffFilename,arffFilenameFilt)
      if balanced:
        name = 'trBagg-balanced-%s-%i' % (students[studentInd],classifierInd)
      else:
        name = 'trBagg-%s-%i' % (students[studentInd],classifierInd)
      makeTree(arffFilename,True,None,outputBase,name,[],False,1.0)
      #makeTree(arffFilenameFilt,True,None,outputBase,'trBagg-%s-%i' % (students[studentInd],classifierInd),[],False,1.0)
    finally:
      os.remove(arffFilename)
      #os.remove(arffFilenameFilt)
      os.remove(tempFile)
      os.remove(getFilename(outputBase,name,DESC))
示例#10
0
def main(args = sys.argv[1:]):
  """Run the runClassifier binary with ``--notrain`` on one student's file.

  ``args`` is handed to ``parseArgs``; the resulting options supply the saved
  config file, the data file location and the number of tests.

  Raises:
    subprocess.CalledProcessError: if the binary exits non-zero.
  """
  opts = parseArgs(args)
  dataPath = getFilename(opts.testBase,opts.student,TRAIN)
  binary = 'bin/%s/runClassifier' % getArch()
  # Positional order expected on the command line: config, data, count.
  subprocess.check_call(
    [binary,opts.saveConfigFilename,dataPath,str(opts.numTest),'--notrain'])
 def newSubject(self, picturePath):
     """Look up the subject encoded in a picture's file name.

     The file name obtained from ``picturePath`` is decoded into a name and
     an email; the name is passed to ``doBuscarCUITSearch`` and the email to
     ``doPiplSearch``.
     NOTE(review): the enclosing class and the helpers are not visible here;
     what the searches do with their results should be confirmed there.
     """
     filename = getFilename(picturePath)
     name, email = decodeSubjectPictureName(filename)
     # FullContact lookup is currently disabled.
     # doFullContactSearch(email)
     doBuscarCUITSearch(name)
     doPiplSearch(email)