예제 #1
0
파일: movies.py 프로젝트: tgadf/movies
def processWikipediaOscarFiles(procYear = None):
    outdir = getWikipediaDir()
    if procYear == None:
        files = findSubExt(outdir, "data", ext=".p")
        #files = glob(join(outdir, "data", "*.p"))
    else:
        files = findSubPatternExt(outdir, "data", pattern=str(procYear), ext=".p")
        #files = glob(join(outdir, "data", str(procYear)+".p"))

    movies = OrderedDict()    
    for ifile in files:
        print ifile
        year    = getBaseFilename(ifile)
        print year
        #if year == "1985": continue
        htmldata = get(ifile)
        bsdata   = getHTML(htmldata)
        if int(year) <= 1984:
            results = parseWikipediaOscarDataPre1985(bsdata, True)
        elif int(year) >= 1986:
            results = parseWikipediaOscarDataPost1987(bsdata, True)
        else:
            results = parseWikipediaOscarData1985(debug = True)
        movies[year] = results
        for k,v in results.iteritems():
            print "====>",year,'\t',k
            print "      Winner  :",results[k]["Winner"]
            print "      Nominees:",results[k]["Nominees"]
            print ""

    savename = setFile(outdir, "oscars.yaml")
    print "Saving",len(movies),"years of wikipedia oscar data to",savename
    save(savename, movies)
예제 #2
0
파일: movies.py 프로젝트: tgadf/movies
def getOscarData():    
    filename   = setFile(getOscarDir(), "oscars.yaml")
    data       = get(filename)
    yearlyData = {}
    for year,ydata in data.iteritems():
        
        movies = {}
        for category,categorydata in ydata.iteritems():
            if category.find("Song") != -1:
                continue
            sf = 1
            if category.find("Song") != -1:
                sf = 0
            elif category.find("Picture") != -1:
                sf = 40
            elif category.find("Animated Feature") != -1:
                sf = 35
            elif category.find("Director") != -1:
                sf = 30
            elif category.find("Actor") != -1 or category.find("Actress") != -1:
                sf = 25
            elif category.find("Screenplay") != -1:
                sf = 20
            winner = categorydata.get("Winner")
            if winner:
                #print category,'\t',winner
                if isinstance(winner, list):                    
                    movie = winner[0]
                else:
                    movie = winner
                    
                #print category,'\t',10*sf,'\t',winner
                if movies.get(movie) == None:
                    movies[movie] = 10*sf
                else:
                    movies[movie] = max(10*sf, movies[movie])
        
            nominees = categorydata.get("Nominees")
            if nominees:
                for nominee in nominees:
                    if isinstance(nominee, list):
                        movie = nominee[0]
                    else:
                        movie = nominee
                    
                    #print category,'\t',sf,'\t',winner
                    if movies.get(movie) == None:
                        movies[movie] = sf
                    else:
                        movies[movie] = max(sf, movies[movie])
        
        yearlyData[year] = sorted(movies.items(), key=operator.itemgetter(1), reverse=True)
        print "---->",year,"<----"
        for item in yearlyData[year][:15]:
            print item
        print '\n'
        
    savename = setFile(getOscarDir(), "oscars.json")
    print "Saving",len(yearlyData),"yearly results to",savename
    save(savename, yearlyData)
예제 #3
0
파일: movies.py 프로젝트: tgadf/movies
def processBoxOfficeMojo():
    outdir   = getBoxOfficeDir()
    savename = setFile(outdir, "results.json")
    
    data = get(savename)
    movies = {}
    yearlyData = {}
    for i,year in enumerate(data.keys()):
        movies[year] = {}
        ydata = data[year]
        for wdata in ydata:
            for mdata in wdata:
                movie  = mdata[2]
                retval = search("\((\d+)\)",movie)
                if retval:
                    stryear  = retval.group()
                    movie = movie.replace(stryear, "").strip()

                gross  = convertCurrency(mdata[9])
                weekly = convertCurrency(mdata[4])
                money  = max(gross, weekly)
                if movies[year].get(movie) == None:
                    movies[year][movie] = money
                else:                    
                    movies[year][movie] = max(money, movies[year][movie])

        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=True)
        print "---->",year,"<----"
        for item in yearlyData[year][:25]:
            print item
        print '\n'
        
    savename = setFile(outdir, "boxofficemojo.json")
    print "Saving",len(yearlyData),"yearly results to",savename
    save(savename, yearlyData)
예제 #4
0
def woerter_lernen(indizes, tdn, wbn):
	"""Learn the words selected by ``indizes`` and reopen the line dialog.

	The word book is loaded with ``fileio.wb_laden(wbn)``, extended by the
	words that ``fehlerabschnitt.fa.worte_finden(indizes)`` returns and
	written back with ``fileio.save``.  The learned words are then removed
	from ``fehlerabschnitt.fa.fehlerworte`` before the dialog is shown.
	"""
	woerterbuch = fileio.wb_laden(wbn)
	gelernt = fehlerabschnitt.fa.worte_finden(indizes)
	woerterbuch.extend(gelernt)
	fileio.save(wbn, woerterbuch)

	# Keep only the error words that were not just learned.
	verbleibend = []
	for wort in fehlerabschnitt.fa.fehlerworte:
		if wort not in gelernt:
			verbleibend.append(wort)
	fehlerabschnitt.fa.fehlerworte = verbleibend

	zeilendialog.anzeigen(fa = fehlerabschnitt.fa, tdn = tdn, wbn = wbn)
예제 #5
0
def new_task(args):
    """Create a task from parsed arguments, persist it and print the table.

    Reads ``args.name`` (sequence of words), ``args.date`` (optional date
    string) and ``args.est_hours``.  An empty name prints
    ``INVALID_NAME_MSG`` and a date that ``interpret_date`` rejects with
    ``ValueError`` prints the error; in both cases nothing is created.
    """
    task_name = ' '.join(args.name)
    if not task_name:
        print(INVALID_NAME_MSG)
        return

    # No date string means the task is created without a due date.
    due_date = None
    raw_date = args.date
    if raw_date:
        try:
            due_date = interpret_date(raw_date)
        except ValueError as err:
            print(err)
            return

    hours = float(args.est_hours)

    manager = TaskManager(fileio.load())
    new_id = manager.new_task(task_name, hours, due_date)
    fileio.save(manager.task_dict)

    print_table(manager, new_id)
예제 #6
0
파일: movies.py 프로젝트: tgadf/movies
def correctOscarData():
    print "Checking for unparsed oscar data."
    backupfilename = setFile(getWikipediaDir(), "oscars.yaml.backup")    
    filename = setFile(getWikipediaDir(), "oscars.yaml")
    copyFile(filename, backupfilename)
    data     = get(filename)
    #fixes    = {}
    for year,ydata in data.iteritems():
        print "\n==>",year
        for cat,catdata in ydata.iteritems():
            
            winner = catdata["Winner"]
            if isinstance(winner, list):
                if winner[0].find(",") != -1:
                    print "\t",cat,"\t",winner[0]

            nominees = catdata["Nominees"]
            for nominee in nominees:
                if isinstance(nominee, list):
                    if nominee[0].find(",") != -1:
                        print "\t",cat,"\t",nominee[0]


    savename = setFile(getOscarDir(), "oscars.yaml")
    print "Saving",len(data),"yearly results to",savename
    save(savename, data)
예제 #7
0
def woerter_lernen(indizes, tdn, wbn):
    """Add the selected error words to the word book and refresh the dialog.

    The words returned by ``fehlerabschnitt.fa.worte_finden(indizes)`` are
    appended to the word book loaded from ``wbn``, which is saved again.
    They are then filtered out of ``fehlerabschnitt.fa.fehlerworte`` and the
    line dialog is shown with the updated state.
    """
    woerterbuch = fileio.wb_laden(wbn)
    neue_worte = fehlerabschnitt.fa.worte_finden(indizes)
    woerterbuch.extend(neue_worte)
    fileio.save(wbn, woerterbuch)

    # Words that were just learned no longer count as errors.
    offene_worte = []
    for eintrag in fehlerabschnitt.fa.fehlerworte:
        if eintrag in neue_worte:
            continue
        offene_worte.append(eintrag)
    fehlerabschnitt.fa.fehlerworte = offene_worte

    zeilendialog.anzeigen(fa=fehlerabschnitt.fa, tdn=tdn, wbn=wbn)
예제 #8
0
    def save(self, fname='save.dcb'):
        '''Stop an active recording and write the current lecture to fname.

        The lecture (self.lec) is handed to fileio.save together with fname,
        after which the canvas is marked as having no unsaved changes.

        NOTE(review): no GUI prompt for a file name happens in this method;
        the default name 'save.dcb' is used when none is passed.
        '''

        # TODO check if this is dirty.
        recording = self.is_recording()
        if recording:
            # Passing False switches the recording off before saving.
            self.record(False)
        fileio.save(fname, self.lec)
        self.gui.canvas.dirty = False
예제 #9
0
  def save(self, fname = 'save.dcb'):
    '''Stop an active recording and write the current lecture to fname.

    The lecture (self.lec) is handed to fileio.save together with fname,
    after which the canvas is marked as having no unsaved changes.

    NOTE(review): no GUI prompt for a file name happens in this method;
    the default name 'save.dcb' is used when none is passed.
    '''

    # TODO check if this is dirty.
    recording = self.is_recording()
    if recording:
      # Passing False switches the recording off before saving.
      self.record(False)
    fileio.save(fname, self.lec)
    self.gui.canvas.dirty = False
예제 #10
0
파일: movies.py 프로젝트: tgadf/movies
def mergeBoxOfficeMojoResults():
    outdir = getBoxOfficeDir()
    retval = {}
    files  = findSubExt(outdir, "results", ext=".json")
    for ifile in files:
        year = getBaseFilename(ifile)
        data = get(ifile)
        retval[year] = data
              
    savename = setFile(outdir, "results.json")
    print "Saving",len(retval),"years of movie data to",savename
    save(savename, retval)
예제 #11
0
파일: movies.py 프로젝트: tgadf/movies
def mergeYearlyMovies(outdir, yearlyMovies):
    data = OrderedDict()
    for year,ymovies in yearlyMovies.iteritems():
        data[year] = {}
        for movie,movieType in ymovies.iteritems():
            if data[year].get(movieType) == None:
                data[year][movieType] = {}
            data[year][movieType][movie] = None

    savename = setFile(outdir, "officialMovies.yaml")
    print "Saving",len(data),"to",savename
    save(savename, data)
예제 #12
0
def clock(args):
    """Deduct hours from a task's remaining time, then save and print it.

    :param args: parsed arguments providing ``id`` (the task id) and
                 ``to_deduct`` (the amount subtracted from
                 ``hours_remaining``).

    Prints ``INVALID_ID_MSG`` and changes nothing when the id is not present
    in the manager's task dict.
    """
    manager = TaskManager(fileio.load())
    task_id = args.id
    to_deduct = args.to_deduct
    # TODO Complete if less than/eq 0?

    # Only the lookup is guarded, so a KeyError raised while saving or
    # printing is not misreported as an invalid task id.
    try:
        old_task: task.Task = manager.task_dict[task_id]
    except KeyError:
        print(INVALID_ID_MSG)
        return

    old_task.hours_remaining -= to_deduct

    fileio.save(manager.task_dict)
    print_table(manager, task_id)
예제 #13
0
파일: movies.py 프로젝트: tgadf/movies
def parseBoxOfficeMojoResults(startYear = 1982, endYear = 2017):
    outdir   = getBoxOfficeDir()
    if endYear == None: endYear = startYear
    years    = range(int(startYear), int(endYear)+1)
    for year in years:
        retval = []
        files  = findSubPatternExt(outdir, "data", pattern=str(year), ext=".p")
        for ifile in files:
            result = parseBoxOfficeMojo(ifile)
            retval.append(result)

        savename = setSubFile(outdir, "results", str(year)+".json")
        print "Saving",len(retval),"weekends of movie data to",savename
        save(savename, retval)
예제 #14
0
def complete_task(args):
    """Mark the tasks named by ``args.id`` as complete and report the result.

    Each id is completed and the task dict saved immediately afterwards; an
    unknown id prints ``INVALID_ID_MSG`` and processing continues with the
    next id.  Finally the task table and the completed tasks are printed.
    With no ids at all only ``INVALID_ID_MSG`` is printed.
    """
    manager = TaskManager(fileio.load())
    task_ids = args.id
    if not task_ids:
        print(INVALID_ID_MSG)
        return

    finished = {}
    for task_id in task_ids:
        try:
            # Remember the task object before the manager completes it.
            finished[task_id] = manager.task_dict[task_id]
            manager.complete(task_id)
            fileio.save(manager.task_dict)
        except KeyError:
            print(INVALID_ID_MSG)

    print_table(manager)
    print()
    print_completed_tasks(finished)
예제 #15
0
def modify_task(args):
    """Replace an existing task with updated name, date and hours.

    Fields not supplied in ``args`` fall back to the existing task's values.
    The due date becomes None when ``args.floating`` is set or the existing
    task is a ``task.FloatingTask`` (unless ``args.date`` is given).  The
    task is re-created under the same id, saved and printed.

    Prints ``INVALID_ID_MSG`` on ``KeyError`` and the error itself on
    ``ValueError``.
    """
    manager = TaskManager(fileio.load())
    task_id = args.id
    try:
        current: task.Task = manager.task_dict[task_id]

        name = args.name or current.name

        if args.date:
            due = interpret_date(args.date)
        elif args.floating or isinstance(current, task.FloatingTask):
            due = None
        else:
            due = current.due_date

        hours = args.est_hours or current.hours_remaining

        manager.new_task(name, hours, due, task_id)
        fileio.save(manager.task_dict)
        print_table(manager, task_id)
    except KeyError:
        print(INVALID_ID_MSG)
    except ValueError as err:
        print(err)
예제 #16
0
def estimate(respfile,
             covfile,
             maskfile=None,
             cvfolds=None,
             testcov=None,
             testresp=None,
             saveoutput=True,
             outputsuffix=None):
    """ Estimate a normative model

    This will estimate a model in one of two settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
    * estimating a training dataset then applying to a second test dataset

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(respfile, covfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.
    If test covariates are given, any cvfolds value is ignored and a single
    train/test split is used.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error
              * Hyp_<fold> - estimated hyperparameters, one file per fold

    :returns: nothing when saveoutput is true; otherwise the tuple
              (Yhat, S2, Z, Rho, pRho, RMSE, SMSE)

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)
    """

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    # promote 1-d inputs to single-column 2-d arrays
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        Yte, testmask = load_response_vars(testresp, maskfile)
        # the test rows are appended after the training rows below, so their
        # indices start where the training data ends
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])

        if len(Yte.shape) == 1:
            Yte = Yte[:, np.newaxis]
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]), ), (testids, ))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

        # force the number of cross-validation folds to 1
        if cvfolds is not None and cvfolds != 1:
            print("Ignoring cross-valdation specification (test data given)")
        cvfolds = 1
    else:
        # we are running under cross-validation
        # NOTE(review): if cvfolds was left at None it is passed on unchanged
        # here and used as an array dimension below -- looks like this path
        # requires cvfolds to be set; confirm.
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    # nz holds the column indices with at least one finite value and
    # non-zero variance
    nz = np.where(
        np.bitwise_and(np.isfinite(Y).any(axis=0),
                       np.var(Y, axis=0) != 0))[0]

    # starting hyperparameters. Could also do random restarts here
    covfunc = CovSum(X, ('CovLin', 'CovSqExpARD'))
    hyp0 = np.zeros(covfunc.get_n_params() + 1)

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))
    Hyp = np.zeros((Nmod, len(hyp0), cvfolds))
    for idx in enumerate(splits.split(X)):
        # idx is (fold number, (train indices, test indices))
        fold = idx[0]
        tr = idx[1][0]
        te = idx[1][1]

        # standardize responses and covariates, ignoring invalid entries
        # (mean and std come from the training rows only)
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X[tr, :], axis=0)
        sX = np.std(X[tr, :], axis=0)
        Xz = (X - mX) / sX

        # estimate the models for all subjects
        for i in range(0, len(nz)):  # range(0, Nmod):
            print("Estimating model ", i + 1, "of", len(nz))
            gpr = GPR(hyp0, covfunc, Xz[tr, :], Yz[tr, nz[i]])
            Hyp[nz[i], :, fold] = gpr.estimate(hyp0, covfunc, Xz[tr, :],
                                               Yz[tr, nz[i]])

            yhat, s2 = gpr.predict(Hyp[nz[i], :, fold], Xz[tr, :],
                                   Yz[tr, nz[i]], Xz[te, :])

            # undo the standardisation; mY and sY are indexed by position
            # within nz (i), the output arrays by original column (nz[i])
            Yhat[te, nz[i]] = yhat * sY[i] + mY[i]
            # presumably s2 is a square matrix of which only the diagonal is
            # kept -- confirm against GPR.predict
            S2[te, nz[i]] = np.diag(s2) * sY[i]**2
            Z[te, nz[i]] = (Y[te, nz[i]] - Yhat[te, nz[i]]) / \
                           np.sqrt(S2[te, nz[i]])
            nlZ[nz[i], fold] = gpr.nlZ

    # compute performance metrics
    MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
    RMSE = np.sqrt(MSE)
    # for the remaining variables, we need to ignore zero variances
    SMSE = np.zeros_like(MSE)
    Rho = np.zeros(Nmod)
    pRho = np.ones(Nmod)
    iy, jy = np.ix_(testids, nz)  # ids for tested samples with nonzero values
    SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
    Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])

    # Set writing options
    if saveoutput:
        print("Writing output ...")
        # neuroimaging outputs are written using the response file as example
        if fileio.file_type(respfile) == 'cifti' or \
           fileio.file_type(respfile) == 'nifti':
            exfile = respfile
        else:
            exfile = None
        if outputsuffix is not None:
            ext = str(outputsuffix) + fileio.file_extension(respfile)
        else:
            ext = fileio.file_extension(respfile)

        # Write output
        fileio.save(Yhat[testids, :].T,
                    'yhat' + ext,
                    example=exfile,
                    mask=maskvol)
        fileio.save(S2[testids, :].T,
                    'ys2' + ext,
                    example=exfile,
                    mask=maskvol)
        fileio.save(Z[testids, :].T, 'Z' + ext, example=exfile, mask=maskvol)
        fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
        fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
        fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
        fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
        # NOTE(review): cvfolds was already used as an array dimension above,
        # so the None branch looks unreachable in practice -- confirm.
        if cvfolds is None:
            fileio.save(Hyp, 'Hyp' + ext, example=exfile, mask=maskvol)
        else:
            # one hyperparameter file per fold, numbered from 1
            for idx in enumerate(splits.split(X)):
                fold = idx[0]
                fileio.save(Hyp[:, :, fold],
                            'Hyp_' + str(fold + 1) + ext,
                            example=exfile,
                            mask=maskvol)
    else:
        output = (Yhat, S2, Z, Rho, pRho, RMSE, SMSE)
        return output
예제 #17
0
    config = parse_args(sys.argv[1:])

    if config.help_req:
        Configuration.print_usage()
        sys.exit(0)

    if config.export_fmt is not None:
        if config.export_fmt == 'swf':
            lec = fileio.load(config.file_to_load)
            exporter.to_swf(lec, lec.adats, config.file_to_load[:-4] + '.swf')
        elif config.export_fmt == 'pdf':
            lec = fileio.load(config.file_to_load)
            exporter.to_pdf(lec, config.file_to_load[:-4] + '.swf')
        elif config.export_fmt in ['dcd', 'dcb', 'dcx', 'dar', 'dct']:
            lec = fileio.load(config.file_to_load)
            fileio.save(config.file_to_load[:-3] + config.export_fmt, lec,
                        lec.adats)
        else:
            print 'Unknown flag "--exp-%s"' % config.export_fmt
        sys.exit(0)

    # Something was passed, so use that to
    if config.audio_module is not None:
        try:
            Audio = __import__(config.audio_module).Audio
        except AttributeError:
            config.audio_module = None
            print 'audio module "%s" not found' % config.audio_module

    if config.audio_module is None:
        for a in Configuration.VALID_AV_MODULES:
            try:
예제 #18
0
  config = parse_args(sys.argv[1:])

  if config.help_req:
    Configuration.print_usage()
    sys.exit(0)

  if config.export_fmt is not None:
    if config.export_fmt == 'swf':
      lec = fileio.load(config.file_to_load)
      exporter.to_swf(lec, lec.adats, config.file_to_load[:-4] + '.swf')
    elif config.export_fmt == 'pdf':
      lec = fileio.load(config.file_to_load)
      exporter.to_pdf(lec, config.file_to_load[:-4] + '.swf')
    elif config.export_fmt in ['dcd', 'dcb', 'dcx', 'dar', 'dct']:
      lec = fileio.load(config.file_to_load)
      fileio.save(config.file_to_load[:-3] + config.export_fmt, lec, lec.adats)
    else:
      print 'Unknown flag "--exp-%s"' % config.export_fmt
    sys.exit(0)

  # Something was passed, so use that to 
  if config.audio_module is not None:
    try:
      Audio = __import__(config.audio_module).Audio
    except AttributeError:
      config.audio_module = None
      print 'audio module "%s" not found' % config.audio_module

  if config.audio_module is None:
    for a in Configuration.VALID_AV_MODULES:
      try:
예제 #19
0
 def doSave(self):
     """Ask for a target file via a save dialog and write the processed values to it."""
     values = self.__processedValues
     # The dialog result is converted to unicode before being used as the path.
     target = unicode(QtGui.QFileDialog.getSaveFileName())
     fileio.save(values, target)
예제 #20
0
def estimate(respfile, covfile, maskfile=None, cvfolds=None,
             testcov=None, testresp=None, alg='gpr', configparam=None,
             saveoutput=True, outputsuffix=None):
    """ Estimate a normative model

    This will estimate a model in one of three settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
        required settings 1) respfile 2) covfile 3) cvfolds>2
    * estimating a training dataset then applying to a second test dataset
        required settings 1) respfile 2) covfile 3) testcov 4) testresp
    * estimating on a training dataset, output of forward maps mean and se
        required settings 1) respfile 2) covfile 3) testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(respfile, covfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.
    If test covariates are given, any cvfolds value is ignored and a single
    train/test split is used.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * Hyp - hyperparameters
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    When both testcov and testresp are given, each of these is also written
    for the training rows with a ``_controls`` suffix. When only testcov is
    given, only yhat, ys2 and Hyp are produced.

    :returns: nothing when saveoutput is true; otherwise
              (Yhat, S2, Hyp) when testcov is given without testresp, else
              (Yhat, S2, Hyp, Z, Rho, pRho, RMSE, SMSE)

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)
    """

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    # promote 1-d inputs to single-column 2-d arrays
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        # the test rows are appended after the training rows below
        testids = range(X.shape[0], X.shape[0]+Xte.shape[0])
        trainids = range(0, X.shape[0])
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            # no test responses: use zeros as placeholders so the arrays
            # keep matching shapes
            sub_te = Xte.shape[0]
            Yte = np.zeros([sub_te, Nmod])

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]),), (testids,))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

        # force the number of cross-validation folds to 1
        if cvfolds is not None and cvfolds != 1:
            print("Ignoring cross-valdation specification (test data given)")
        cvfolds = 1
    else:
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # Z scores need true responses: available under cross-validation or
    # when test responses were supplied
    have_responses = testcov is None or testresp is not None

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    # Initialise normative model (used here for its parameter count)
    nm = norm_init(X, alg=alg, configparam=configparam)

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Hyp = np.zeros((Nmod, nm.n_params, cvfolds))

    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    for idx in enumerate(splits.split(X)):
        # idx is (fold number, (train indices, test indices))
        fold = idx[0]
        tr = idx[1][0]
        te = idx[1][1]

        # standardize responses and covariates, ignoring invalid entries
        # (mean and std come from the training rows only)
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X[tr, :], axis=0)
        sX = np.std(X[tr, :],  axis=0)
        Xz = (X - mX) / sX

        # estimate the models for all subjects
        for i in range(0, len(nz)):  # range(0, Nmod):
            print("Estimating model ", i+1, "of", len(nz))
            try:
                nm = norm_init(Xz[tr, :], Yz[tr, nz[i]], alg=alg, configparam=configparam)
                Hyp[nz[i], :, fold] = nm.estimate(Xz[tr, :], Yz[tr, nz[i]])

                # Work around to get stats for subjects both in and out of
                # the model: predict for all rows instead of only te
                yhat, s2 = nm.predict(Xz[tr, :], Yz[tr, nz[i]], Xz, Hyp[nz[i], :, fold])

                # undo the standardisation; mY and sY are indexed by
                # position within nz (i), the outputs by column (nz[i])
                Yhat[:, nz[i]] = yhat * sY[i] + mY[i]
                S2[:, nz[i]] = s2 * sY[i] ** 2
                nlZ[nz[i], fold] = nm.neg_log_lik
                if have_responses:
                    Z[:, nz[i]] = (Y[:, nz[i]] - Yhat[:, nz[i]]) / np.sqrt(S2[:, nz[i]])

            except Exception as e:
                # best effort: a failing model is reported and its outputs
                # are filled with NaN so the remaining models still run
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i+1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)
                Hyp[nz[i], :, fold] = float('nan')

                Yhat[te, nz[i]] = float('nan')
                S2[te, nz[i]] = float('nan')
                nlZ[nz[i], fold] = float('nan')
                if have_responses:
                    Z[te, nz[i]] = float('nan')

    # compute performance metrics
    if testcov is None:
        MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
        RMSE = np.sqrt(MSE)
        # for the remaining variables, we need to ignore zero variances
        SMSE = np.zeros_like(MSE)
        Rho = np.zeros(Nmod)
        pRho = np.ones(Nmod)
        iy, jy = np.ix_(testids, nz)  # ids for tested samples nonzero values
        SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
        Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])
    else:
        if testresp is not None:
            MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
            # X and Y now hold train and test rows, so the training metrics
            # must be restricted to trainids
            MSE_tr = np.mean((Y[trainids, :] - Yhat[trainids, :]) ** 2, axis=0)

            RMSE = np.sqrt(MSE)
            RMSE_tr = np.sqrt(MSE_tr)

            # for the remaining variables, we need to ignore zero variances
            SMSE = np.zeros_like(MSE)
            SMSE_tr = np.zeros_like(RMSE_tr)

            Rho = np.zeros(Nmod)
            Rho_tr = np.zeros(Nmod)

            pRho = np.ones(Nmod)
            pRho_tr = np.ones(Nmod)

            iy, jy = np.ix_(testids, nz)  # ids tested samples nonzero values
            iy_tr, jy_tr = np.ix_(trainids, nz)

            SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
            SMSE_tr[nz] = MSE_tr[nz] / np.var(Y[iy_tr, jy_tr], axis=0)

            Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])
            Rho_tr[nz], pRho_tr[nz] = compute_pearsonr(Y[iy_tr, jy_tr], Yhat[iy_tr, jy_tr])

    # Set writing options
    if saveoutput:
        print("Writing output ...")
        # neuroimaging outputs are written using the response file as example
        if fileio.file_type(respfile) == 'cifti' or \
           fileio.file_type(respfile) == 'nifti':
            exfile = respfile
        else:
            exfile = None
        if outputsuffix is not None:
            ext = str(outputsuffix) + fileio.file_extension(respfile)
        else:
            ext = fileio.file_extension(respfile)

        # Write output
        if testcov is None:
            fileio.save(Yhat[testids, :].T, 'yhat' + ext,
                        example=exfile, mask=maskvol)
            fileio.save(S2[testids, :].T, 'ys2' + ext,
                        example=exfile, mask=maskvol)
            fileio.save(Z[testids, :].T, 'Z' + ext, example=exfile,
                        mask=maskvol)
            fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
            fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
            fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
            fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
            if cvfolds is None:
                fileio.save(Hyp[:,:,0], 'Hyp' + ext, example=exfile, mask=maskvol)
            else:
                # one hyperparameter file per fold, numbered from 1
                for idx in enumerate(splits.split(X)):
                    fold = idx[0]
                    fileio.save(Hyp[:, :, fold], 'Hyp_' + str(fold+1) +
                                ext, example=exfile, mask=maskvol)
        else:
            if testresp is None:
                # forward model only: no responses, hence no Z or metrics
                fileio.save(Yhat[testids, :].T, 'yhat' + ext,
                            example=exfile, mask=maskvol)
                fileio.save(S2[testids, :].T, 'ys2' + ext,
                            example=exfile, mask=maskvol)
                fileio.save(Hyp[:,:,0], 'Hyp' + ext,
                            example=exfile, mask=maskvol)
            else:
                fileio.save(Yhat[testids, :].T, 'yhat' + ext, example=exfile, mask=maskvol)
                fileio.save(Yhat[trainids, :].T, 'yhat_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(S2[testids, :].T, 'ys2' + ext, example=exfile, mask=maskvol)
                fileio.save(S2[trainids, :].T, 'ys2_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(Z[testids, :].T, 'Z' + ext, example=exfile, mask=maskvol)
                fileio.save(Z[trainids, :].T, 'Z_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
                fileio.save(Rho_tr, 'Rho_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
                fileio.save(pRho_tr, 'pRho_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
                fileio.save(RMSE_tr, 'rmse_controls' + ext, example=exfile, mask=maskvol)

                fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
                fileio.save(SMSE_tr, 'smse_controls' + ext, example=exfile, mask=maskvol)
                if cvfolds is None:
                    fileio.save(Hyp[:,:,0], 'Hyp' + ext,
                                example=exfile, mask=maskvol)
                else:
                    for idx in enumerate(splits.split(X)):
                        fold = idx[0]
                        fileio.save(Hyp[:, :, fold], 'Hyp_'+ str(fold+1) +
                                    ext, example=exfile, mask=maskvol)
    else:
        if testcov is None:
            output = (Yhat, S2, Hyp, Z, Rho, pRho, RMSE, SMSE)
        else:
            if testresp is None:
                output = (Yhat, S2, Hyp)
            else:
                output = (Yhat, S2, Hyp, Z, Rho, pRho, RMSE, SMSE)
        return output
예제 #21
0
파일: movies.py 프로젝트: tgadf/movies
# Award categories whose winner text reads "<person> for <film>"; only the
# film title is kept for these.
_SUNDANCE_CREDITED_CATEGORIES = frozenset([
    "World Cinema Dramatic Screenwriting Award",
    "Sundance Institute/Mahindra Global Filmmaking Awards",
    "World Cinema Documentary Editing Award",
    "Excellence in Cinematography Award: Documentary",
    "Excellence in Cinematography Award: Dramatic",
    "World Cinema Cinematography Award: Documentary",
    "World Cinema Cinematography Award: Dramatic",
    "World Cinema Directing Award: Dramatic",
    "World Cinema Directing Award: Documentary",
    "World Dramatic Special Jury Prizes for Breakout Performances",
    "Dramatic Special Jury Prize for Breakout Performance",
    "Excellence in Cinematography Award Dramatic",
    "Excellence in Cinematography Award Documentary",
    # NOTE(review): presumably a typo for the entry above; kept so that any
    # input that really carries this spelling is still handled as before.
    "xcellence in Cinematography Award Documentary",
    "Documentary Editing Award",
    "Waldo Salt Screenwriting Award: Dramatic",
    "World Cinema Screenwriting Award",
    "Directing Award Documentary",
    "Directing Award Dramatic",
])


def _toNativeStr(value):
    """Return ``str(value)``, or ``value`` itself when that conversion fails.

    On Python 2, ``str()`` of a unicode string containing non-ASCII
    characters raises UnicodeEncodeError; such values are kept as unicode.
    """
    try:
        return str(value)
    except UnicodeError:
        return value


def _parseSundanceYears(bsdata):
    """Return, in document order, the integer years found in <h2> headings."""
    years = []
    for h2 in bsdata.findAll("h2"):
        span = h2.find("span")
        try:
            years.append(int(span.string))
        except (AttributeError, TypeError, ValueError):
            # Heading without a <span>, or whose span text is not an integer.
            continue
    return years


def _extractSundanceFilm(cat, movie):
    """Reduce the winner text of one award to the film title.

    Returns None when a "<person> for <film>" category contains more than
    one " for ", in which case the film cannot be isolated.
    """
    if cat in _SUNDANCE_CREDITED_CATEGORIES:
        parts = movie.split(" for ")
        if len(parts) > 2:
            return None
        # One part: no credit prefix at all; two parts: "<person> for <film>".
        movie = parts[-1]

    if cat == "Special Jury Prize for Acting":
        movie = movie.replace("for her performance ", "")
        # Applied in this order: first "<actor> in <film>", then
        # "<actor> for <film>", each only when the split is unambiguous.
        for separator in (" in ", " for "):
            parts = movie.split(separator)
            if len(parts) == 2:
                movie = parts[1]

    # Test for the same token that is split on, so "retitled" without a
    # trailing space no longer raises IndexError.
    if "retitled " in movie:
        # Keep the text after "retitled " and drop its final character.
        movie = movie.split("retitled ")[1][:-1]

    movie = movie.replace(" (tie)", "")

    if " director of " in movie:
        movie = movie.split(" director of ")[1]

    return movie


def processSundanceData():
    """Parse the saved Sundance pages and write the winners per year.

    Every file returned by ``findSubExt(getSundanceDir(), "data", ext=".p")``
    is loaded with ``get`` and parsed with ``getHTML``.  Year headings are
    read from the <h2> elements and paired positionally with the <ul>
    elements of the same page; each <li> is split into an award category and
    a winner text, and the winner text is reduced to a film title.  The
    resulting ``{year: {category: film}}`` mapping is passed to ``save``
    under the name ``winners.yaml`` in the Sundance directory.

    Raises:
        ValueError: if a list item does not split into exactly a category
            and a winner.
    """
    files = findSubExt(getSundanceDir(), "data", ext=".p")
    data = OrderedDict()
    for ifile in files:
        bsdata = getHTML(get(ifile))
        years = _parseSundanceYears(bsdata)

        # The j-th <ul> is paired with the j-th year heading; zip stops as
        # soon as the year headings run out.
        for year, ul in zip(years, bsdata.findAll("ul")):
            data[year] = {}
            for li in ul.findAll("li"):
                try:
                    txt = li.text
                    # Normalise the dash variants (including the mis-decoded
                    # UTF-8 byte sequence) to one explicit separator.
                    txt = re.sub("\xe2\x80\x93", " :: ", txt)
                    txt = re.sub(u"(\u2018|\u2013)", " :: ", txt)
                except Exception:
                    # Best effort: report the item and move on.
                    print("Error with %s" % (li,))
                    continue

                vals = txt.split(" :: ")
                if len(vals) > 2:
                    # Extra separators belong to the winner text.
                    vals = [vals[0], "-".join(vals[1:])]
                vals = [x.strip() for x in vals]
                if len(vals) != 2:
                    raise ValueError(vals)
                cat, movie = vals

                if "Piper-Heidsieck" in cat:
                    continue

                if "Alfred P. Sloan" in cat:
                    cat = "Alfred P. Sloan Prize"

                film = _extractSundanceFilm(cat, movie)
                if film is None:
                    print("Error in %s ===> %s" % (cat, movie))
                    continue

                print("%s \t%s \t\t%s \t\t" % (year, cat, film))
                # Convert category and title independently so a non-ASCII
                # category cannot crash the fallback path.
                data[year][_toNativeStr(cat)] = _toNativeStr(film)

    savename = setFile(getSundanceDir(), "winners.yaml")
    print("Saving %s yearly results to %s" % (len(data), savename))
    save(savename, data)