예제 #1
0
파일: SWAP.py 프로젝트: rgavazzi/SpaceWarps
    tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk']
    subprocess.call(["mkdir","-p",tonights.parameters['dir']])

    # ------------------------------------------------------------------
    # Pickle the bureau, sample, and database, if required. If we do
    # this, it's because we want to pick up from where we left off
    # (i.e. with SWAPSHOP) - so save the pickles in the $cwd. This is
    # taken care of in io.py. Note that we update the parameters as
    # we go - this will be useful later when we write update.config.

    if tonights.parameters['repickle'] and count > 0:

        new_bureaufile = swap.get_new_filename(tonights.parameters,'bureau')
        print "SWAP: saving agents to "+new_bureaufile
        swap.write_pickle(bureau,new_bureaufile)
        tonights.parameters['bureaufile'] = new_bureaufile

        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "SWAP: saving subjects to "+new_samplefile
        swap.write_pickle(sample,new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile

        if practise:
            new_dbfile = swap.get_new_filename(tonights.parameters,'database')
            print "SWAP: saving database to "+new_dbfile
            swap.write_pickle(db,new_dbfile)
            tonights.parameters['dbfile'] = new_dbfile

    # ------------------------------------------------------------------
예제 #2
0
파일: SWAP.py 프로젝트: kapadia/SpaceWarps
    tonights.parameters['dir'] = os.getcwd(
    ) + '/' + tonights.parameters['trunk']
    subprocess.call(["mkdir", "-p", tonights.parameters['dir']])

    # ------------------------------------------------------------------
    # Pickle the bureau, sample, and database, if required. If we do
    # this, it's because we want to pick up from where we left off
    # (i.e. with SWAPSHOP) - so save the pickles in the $cwd. This is
    # taken care of in io.py. Note that we update the parameters as
    # we go - this will be useful later when we write update.config.

    if tonights.parameters['repickle'] and count > 0:

        new_bureaufile = swap.get_new_filename(tonights.parameters, 'bureau')
        print "SWAP: saving agents to " + new_bureaufile
        swap.write_pickle(bureau, new_bureaufile)
        tonights.parameters['bureaufile'] = new_bureaufile

        new_samplefile = swap.get_new_filename(tonights.parameters,
                                               'collection')
        print "SWAP: saving subjects to " + new_samplefile
        swap.write_pickle(sample, new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile

        if practise:
            new_dbfile = swap.get_new_filename(tonights.parameters, 'database')
            print "SWAP: saving database to " + new_dbfile
            swap.write_pickle(db, new_dbfile)
            tonights.parameters['dbfile'] = new_dbfile

    # ------------------------------------------------------------------
예제 #3
0
def make_offline_reports(args):
    """
    NAME
        make_offline_reports

    PURPOSE
        Read a pickled bureau and collection and produce the reports
        that are made at the end of SWAP. With do_offline set, the
        offline (EM) analysis is run first and the bureau and sample
        are updated with its results before the reports are made.

    COMMENTS
        args is a dict. A key that names one of the defaults in `flags`
        below overrides that default; 'config', 'collection' and
        'bureau' name the input files. Any other key is reported as
        unrecognized and ignored.

    FLAGS
        -h              Print this message
        --out           Output directory, otherwise is '.'
        --do_offline    Do offline analysis?

    INPUTS
        config      Plain text file containing SW experiment configuration
        bureau      Bureau pickle
        collection  Collection pickle

        All three are required; a ValueError is raised if any is missing.

    OUTPUTS
        Lists, catalogs, plots and a report, with file names supplied by
        swap.get_new_filename. With do_offline, also offline.pickle,
        collection_offline.pickle and bureau_offline.pickle in the
        output directory.

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the MIT license by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2014-09-16  started Davis (KIPAC)
    """
    # ------------------------------------------------------------------
    # Some defaults:

    # default settings are for offline using only exact training info
    flags = {'do_offline': False,
             'output_directory': '.',
             'PL0': 0.5,  # initial PL guess
             'PD0': 0.5,  # initial PD guess
             'pi': 4e-2,  # initial lens probability
             'n_min_assessment': 0,  # minimum number of assessments before included in analysis
             'use_training_info': True,
             'exclude_test_info': True,
             'exclude_training_info': False,
             'N_min': 10,  # min number of EM steps required
             'N_max': 100,  # max number of EM steps
             'epsilon_min': 1e-6,  # escape condition
             }

    # The three input files. They start out as None so that a missing
    # one is reported clearly instead of surfacing later as an unbound
    # local variable.
    inputs = {'config': None, 'collection': None, 'bureau': None}

    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg in inputs:
            inputs[arg] = args[arg]
        else:
            print("make_offline_reports: unrecognized flag  %s" % (arg,))

    missing = sorted(key for key in inputs if inputs[key] is None)
    if missing:
        raise ValueError("make_offline_reports: missing required argument(s): "
                         + ", ".join(missing))

    configfile = inputs['config']
    collectionfile = inputs['collection']
    bureaufile = inputs['bureau']

    out_dir = flags['output_directory']

    # ------------------------------------------------------------------
    # Read in run configuration:
    tonights = swap.Configuration(configfile)
    # TODO: do this correctly
    tonights.parameters['finish'] = 'now'
    tonights.parameters['start'] = 'now'
    tonights.parameters['trunk'] = \
        tonights.parameters['survey']+'_'+tonights.parameters['finish']
    tonights.parameters['dir'] = out_dir
    # How will we make decisions based on probability?
    thresholds = {}
    thresholds['detection'] = tonights.parameters['detection_threshold']
    thresholds['rejection'] = tonights.parameters['rejection_threshold']

    t = -1  # for now?!

    # ------------------------------------------------------------------
    # Read in the bureau of agents who represent the volunteers:

    bureau = swap.read_pickle(bureaufile, 'bureau')

    # ------------------------------------------------------------------
    # Read in the object representing the candidate list:

    sample = swap.read_pickle(collectionfile, 'collection')

    # ------------------------------------------------------------------
    # if do_offline, run offline analysis here:

    if flags['do_offline']:
        PL0 = flags['PL0']
        PD0 = flags['PD0']
        pi = flags['pi']
        n_min_assessment = flags['n_min_assessment']
        use_training_info = flags['use_training_info']
        exclude_test_info = flags['exclude_test_info']
        exclude_training_info = flags['exclude_training_info']
        N_min = flags['N_min']
        N_max = flags['N_max']
        epsilon_min = flags['epsilon_min']

        # initialize offline params
        bureau_offline = {}
        probabilities = {}
        online_probabilities = {}
        training_IDs = {}  # which entries in collection are training
        set_aside_subject = {}  # which subjects do we set aside? Here we set aside none
        set_aside_agent = {}  # which agents do we set aside? Here we set aside none

        collection = {}
        for ID in sample.list():
            if ID not in set_aside_subject:
                collection[ID] = sample.member[ID]

        for ID, subject in collection.items():
            n_assessment = len(subject.annotationhistory['ItWas'])
            # Only subjects with strictly more than n_min_assessment
            # assessments enter the offline analysis.
            if n_assessment <= n_min_assessment:
                continue

            if subject.category == 'training':
                if use_training_info:
                    training_IDs[ID] = {'LENS': 1, 'NOT': 0}[subject.truth]
                if exclude_training_info:
                    # when doing M step, don't use these to update parameters
                    # (this overrides the truth value set just above)
                    training_IDs[ID] = -1
            elif subject.category == 'test':
                if exclude_test_info:
                    # when doing M step, don't use these to update parameters
                    training_IDs[ID] = -1

            probabilities[ID] = pi
            online_probabilities[ID] = subject.mean_probability

            for agent_i, name in enumerate(subject.annotationhistory['Name']):
                if name in set_aside_agent:
                    continue
                xij = subject.annotationhistory['ItWas'][agent_i]
                if name not in bureau_offline:
                    bureau_offline[name] = {
                        'PD': PD0, 'PL': PL0,
                        'PL_in': bureau.member[name].PL,
                        'PD_in': bureau.member[name].PD,
                        'Pi': pi,
                        'Subjects': {ID: xij}}
                else:
                    bureau_offline[name]['Subjects'][ID] = xij

        # Run EM Algorithm

        bureau_offline, pi, probabilities, information_dict = EM_algorithm(
                bureau_offline, pi, probabilities, training_IDs,
                N_min=N_min, N_max=N_max, epsilon_min=epsilon_min,
                return_information=True)

        tup = (bureau_offline, pi, probabilities, information_dict)
        offlinefile = out_dir + '/offline.pickle'
        swap.write_pickle(tup, offlinefile)

        # ------------------------------------------------------------------
        # Now replace sample member probabilities with offline probabilities
        # Also update bureau with offline results
        # (iterate over a snapshot: members are popped inside the loop)
        for ID in list(sample.list()):
            # just in case any IDs didn't get into offline somehow?!
            if ID not in probabilities:
                sample.member.pop(ID)
                continue
            # This is a bit hackish: update mean_probability,
            # median_probability, and do the rejection threshold stuff
            subject = sample.member[ID]
            subject.mean_probability = probabilities[ID]
            subject.median_probability = probabilities[ID]
            # ripped from subject.py
            if subject.mean_probability < subject.rejection_threshold:
                subject.status = 'rejected'
                if subject.kind == 'test':
                    subject.state = 'inactive'
                    subject.retirement_time = -1  # at_time
                    subject.retirement_age = subject.exposure

            elif subject.mean_probability > subject.detection_threshold:
                subject.status = 'detected'
                # Let's keep the detections live! (test subjects are
                # deliberately not retired here.)

            else:
                # Keep the subject alive! This code is only reached if
                # we are not being hasty.
                subject.status = 'undecided'
                if subject.kind == 'test':
                    subject.state = 'active'
                    subject.retirement_time = 'not yet'
                    subject.retirement_age = 0.0

            # I don't think this is necessary, but just in case
            sample.member[ID] = subject

        for kind in ['sim', 'dud', 'test']:
            sample.collect_probabilities(kind)

        # now save
        # NOTE(review): this pickles `collection`, the plain dict of
        # subjects built above, not the updated `sample` object (compare
        # the bureau, which is saved as the object itself). Confirm
        # which one readers of collection_offline.pickle expect.
        collectionfile = out_dir + '/collection_offline.pickle'
        swap.write_pickle(collection, collectionfile)

        # now update bureau
        # (iterate over a snapshot: members are popped inside the loop)
        for ID in list(bureau.list()):
            # just in case any IDs didn't make it to offline?
            if ID not in bureau_offline:
                bureau.member.pop(ID)
                continue
            # update PL, PD, then update_skill
            agent = bureau.member[ID]
            agent.PL = bureau_offline[ID]['PL']
            agent.PD = bureau_offline[ID]['PD']
            agent.update_skill()

            # I don't think this is necessary, but just in case
            bureau.member[ID] = agent

        bureau.collect_probabilities()

        # now save
        bureaufile = out_dir + '/bureau_offline.pickle'
        swap.write_pickle(bureau, bureaufile)

    # ------------------------------------------------------------------
    # now we can pretend we're in SWAP.py

    # Lists of subjects: retirees, detections (these are urls of
    # images), and the training images, for inspection.
    # Each entry is (file flavour, description for the log, list item).
    subject_lists = [
        ('retire_these', 'retiree subject Zooniverse IDs', 'retired_subject'),
        ('candidates', 'lens candidates', 'candidate'),
        ('training_true_positives', 'true positives', 'true_positive'),
        ('training_false_positives', 'false positives', 'false_positive'),
        ('training_false_negatives', 'false negatives', 'false_negative'),
    ]
    for flavour, description, item in subject_lists:
        listfile = swap.get_new_filename(tonights.parameters, flavour)
        print("make_offline_reports: saving " + description + "...")
        N = swap.write_list(sample, listfile, item=item)
        print("make_offline_reports: "+str(N)+" lines written to "+listfile)

    # Also write out catalogs of subjects, including the ZooID, subject ID,
    # how many classifications, and probability.
    # Each entry is (file flavour, subject kind, description for the log).
    subject_catalogs = [
        ('candidate_catalog', 'test', "candidates"),
        ('sim_catalog', 'sim', "sim 'candidates'"),
        ('dud_catalog', 'dud', "dud 'candidates'"),
    ]
    for flavour, kind, description in subject_catalogs:
        catalog = swap.get_new_filename(tonights.parameters, flavour)
        print("make_offline_reports: saving catalog of high probability subjects...")
        Nwritten, Nsubjects = swap.write_catalog(sample, catalog, thresholds, kind=kind)
        print("make_offline_reports: From "+str(Nsubjects)+" subjects classified,")
        print("make_offline_reports: "+str(Nwritten)+" "+description+" (with P > rejection) written to "+catalog)

    # ------------------------------------------------------------------

    # Make plots! Can't plot everything - plot at most 200 agents and
    # at most 500 subjects.

    # Agent histories:

    fig1 = bureau.start_history_plot()
    pngfile = swap.get_new_filename(tonights.parameters, 'histories')
    Nc = np.min([200, bureau.size()])
    print("make_offline_reports: plotting "+str(Nc)+" agent histories in "+pngfile)

    for Name in bureau.shortlist(Nc):
        bureau.member[Name].plot_history(fig1)

    bureau.finish_history_plot(fig1, t, pngfile)
    tonights.parameters['historiesplot'] = pngfile

    # Agent probabilities:

    pngfile = swap.get_new_filename(tonights.parameters, 'probabilities')
    print("make_offline_reports: plotting "+str(Nc)+" agent probabilities in "+pngfile)
    bureau.plot_probabilities(Nc, t, pngfile)
    tonights.parameters['probabilitiesplot'] = pngfile

    # Subject trajectories:

    fig3 = sample.start_trajectory_plot()
    pngfile = swap.get_new_filename(tonights.parameters, 'trajectories')

    # Random 500  for display purposes:
    Ns = np.min([500, sample.size()])
    print("make_offline_reports: plotting "+str(Ns)+" subject trajectories in "+pngfile)

    for ID in sample.shortlist(Ns):
        sample.member[ID].plot_trajectory(fig3)

    # To plot only false negatives, or only true positives:
    # for ID in sample.shortlist(Ns,kind='sim',status='rejected'):
    #     sample.member[ID].plot_trajectory(fig3)
    # for ID in sample.shortlist(Ns,kind='sim',status='detected'):
    #     sample.member[ID].plot_trajectory(fig3)

    sample.finish_trajectory_plot(fig3, pngfile, t=t)
    tonights.parameters['trajectoriesplot'] = pngfile

    # Candidates! Plot all undecideds or detections:

    fig4 = sample.start_trajectory_plot(final=True)
    pngfile = swap.get_new_filename(tonights.parameters, 'sample')

    # BigN = 100000 # Would get them all...
    BigN = 500      # Can't see them all!
    # The shortlists are drawn in this fixed order (test, sim, dud) and
    # plotted in a different one (sim, dud, test); both orders are kept
    # as they were.
    candidates = []
    candidates += sample.shortlist(BigN, kind='test', status='detected')
    candidates += sample.shortlist(BigN, kind='test', status='undecided')
    sims = []
    sims += sample.shortlist(BigN, kind='sim', status='detected')
    sims += sample.shortlist(BigN, kind='sim', status='undecided')
    duds = []
    duds += sample.shortlist(BigN, kind='dud', status='detected')
    duds += sample.shortlist(BigN, kind='dud', status='undecided')

    print("make_offline_reports: plotting "+str(len(sims))+" sims in "+pngfile)
    for ID in sims:
        sample.member[ID].plot_trajectory(fig4)
    print("make_offline_reports: plotting "+str(len(duds))+" duds in "+pngfile)
    for ID in duds:
        sample.member[ID].plot_trajectory(fig4)
    print("make_offline_reports: plotting "+str(len(candidates))+" candidates in "+pngfile)
    for ID in candidates:
        sample.member[ID].plot_trajectory(fig4)

    # They will all show up in the histogram though:
    sample.finish_trajectory_plot(fig4, pngfile, final=True)
    tonights.parameters['candidatesplot'] = pngfile

    # ------------------------------------------------------------------
    # Finally, write a PDF report:

    swap.write_report(tonights.parameters, bureau, sample)
예제 #4
0
        equal = N1+N2==N3
        print "SWAP: SANITY CHECK! %i + %i = %i? %s"%(N1,N2,N3, equal)


    # ------------------------------------------------------------------
    # Pickle the bureau, sample, and metadata, if required. If we do
    # this, it's because we want to pick up from where we left off
    # (i.e. with SWAPSHOP) - so save the pickles in the $cwd. This is
    # taken care of in io.py. Note that we update the parameters as
    # we go - this will be useful later when we write update.config.
    
    if tonights.parameters['repickle'] and count > 0:

        new_bureaufile = swap.get_new_filename(tonights.parameters,'bureau')
        print "SWAP: saving agents to "+new_bureaufile
        swap.write_pickle(bureau,new_bureaufile)
        tonights.parameters['bureaufile'] = new_bureaufile

        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "SWAP: saving subjects to "+new_samplefile
        swap.write_pickle(sample,new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile
        
        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "SWAP: saving metadata to "+metadatafile
        swap.write_pickle(storage,metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # ------------------------------------------------------------------
    # If there is more to do we need to update the config file for the next day
예제 #5
0
def make_lens_catalog(args):
    """
    NAME
        make_lens_catalog

    PURPOSE
        Given location of collection pickle, this script clusters the
        markers placed on each subject and writes a catalog with one
        row per cluster, plus one row per subject that summarises the
        classifications in which no marker was placed.

    COMMENTS
        args is a dict. A key that names one of the defaults in `flags`
        below overrides that default; 'collection_path' is required.
        Any other key is reported as unrecognized and ignored.

    FLAGS
        -h              Print this message

        --skill         Weight by skill


    INPUTS
        collection.pickle

    OUTPUTS
        The file output_directory + output_name ('./catalog.dat' by
        default; nothing is written if output_name is empty), comma
        separated, with header:
            id,kind,x,y,prob,n0,skill,dist

            Here:
            id = Space Warps subject ID
            kind = Space Warps subject type (sim, dud, test)
            x,y = object (cluster) centroid, in pixels
                  (-1,-1 on the no-marker summary row)
            prob = Space Warps subject probability
            n0 = number of markers in the cluster
            skill = total skill per cluster, summed over markers
            dist = biggest distance within cluster

        If update_collection is set, a collection whose subjects carry
        the per-marker cluster labels is pickled to that path.

        Returns a dict keyed by subject ID holding that subject's
        marker positions ('x', 'y'), the indices of classifications
        without markers ('agents_reject') and the cluster label of each
        marker ('agents_labels').

    RAISES
        ValueError if 'collection_path' is not supplied.

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16  started Davis (KIPAC)
    """

    # ------------------------------------------------------------------
    # Some defaults:

    flags = {'skill': False,
             'output_directory': './',
             'output_name': 'catalog.dat',
             'image_y_size': 440,
             'catalog_path': '',
             'update_collection': '',}

    # ------------------------------------------------------------------
    # Read in options:

    collection_path = None
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection_path':
            collection_path = args[arg]
        else:
            print("make_lens_catalog: unrecognized flag  %s" % (arg,))

    if collection_path is None:
        raise ValueError(
            "make_lens_catalog: missing required argument: collection_path")

    print("make_lens_catalog: illustrating behaviour captured in collection file: ")
    print("make_lens_catalog:  %s" % (collection_path,))

    memory = joblib.Memory(cachedir=flags['output_directory'])
    memory.clear()

    write_rows = len(flags['output_name']) > 0
    row_format = '{0},{1},{2},{3},{4},{5},{6},{7}\n'

    F = None
    if write_rows:
        F = open(flags['output_directory'] + flags['output_name'], 'w')

    # Everything that writes to F lives inside the try so that the file
    # is closed even if reading or clustering fails part-way through.
    try:
        if write_rows:
            F.write('id,kind,x,y,prob,n0,skill,dist\n')

        # --------------------------------------------------------------
        # Read in files:

        collection = swap.read_pickle(collection_path, 'collection')
        ID_list = collection.list()
        print("make_lens_catalog: collection numbers  %s" % (len(ID_list),))

        if flags['catalog_path'] != '':
            print("make_lens_catalog: filtering from catalog  %s"
                  % (flags['catalog_path'],))
            catalog_in = csv2rec(flags['catalog_path'])
            ID_list = np.unique(catalog_in['id'])

        # --------------------------------------------------------------
        # Run through data:

        catalog = {}
        for ID in ID_list:

            subject = collection.member[ID]
            kind = subject.kind
            P = subject.mean_probability

            # One entry per classification; each entry is the list of
            # marker coordinates placed in that classification.
            x_all = subject.annotationhistory['At_X']
            y_all = subject.annotationhistory['At_Y']

            x_markers = np.array([xi for xj in x_all for xi in xj])
            y_markers = np.array([yi for yj in y_all for yi in yj])

            catalog.update({ID: {'agents_reject': [],
                                 'x': x_markers,
                                 'y': y_markers,}})
            PL_all = subject.annotationhistory['PL']
            PD_all = subject.annotationhistory['PD']

            # Give every marker the PL/PD of the classification it came
            # from, and set the empty clicks (no markers) aside.
            PL_marked, PD_marked = [], []
            PL_nots, PD_nots = [], []
            for i, xj in enumerate(x_all):
                n_markers = len(xj)
                if n_markers == 0:
                    PL_nots.append(PL_all[i])
                    PD_nots.append(PD_all[i])
                    catalog[ID]['agents_reject'].append(i)
                else:
                    PL_marked.extend([PL_all[i]] * n_markers)
                    PD_marked.extend([PD_all[i]] * n_markers)
            PL = np.array(PL_marked)
            PD = np.array(PD_marked)
            PL_nots = np.array(PL_nots)
            PD_nots = np.array(PD_nots)

            skill = swap.expectedInformationGain(0.5, PL, PD)  # skill

            # it is only fair to write out the NOTs, too
            # do the empty guys
            skill_nots = swap.expectedInformationGain(0.5, PL_nots, PD_nots)  # skill

            x, y = -1, -1
            N0 = len(skill_nots)
            S = np.sum(skill_nots)
            D = 0

            if len(catalog) % 500 == 0:
                print(len(catalog))
            if write_rows:
                F.write(row_format.format(ID, kind, x, y, P, N0, S, D))

            if len(x_markers) == 0:
                # apparently everyone was a not...
                catalog[ID]['agents_labels'] = np.array([])
                continue

            # ----------------------------------------------------------
            # cluster
            print('make_lens_catalog: subject ID =  %s' % (ID,))
            weights = skill if flags['skill'] else None
            cluster_centers, cluster_center_labels, cluster_labels, \
                n_clusters, dist_within = outlier_clusters(
                    x_markers, y_markers, weights, memory=memory)
            # need to get: x, y, N0, S

            catalog[ID]['agents_labels'] = cluster_labels

            for cluster_center_label in cluster_center_labels:
                cluster_center = cluster_centers[cluster_center_label]
                members = (cluster_labels == cluster_center_label)

                x, y = cluster_center
                # convert y to catalog convention
                y = flags['image_y_size'] - y
                N0 = np.sum(members)
                S = np.sum(skill[members])
                D = dist_within[cluster_center_label]

                if cluster_center_label == -1:
                    # outlier cluster
                    # so really every point is its own cluster...
                    D = 0
                # TODO: make some requirement to be included (exclude outliers)
                if write_rows:
                    F.write(row_format.format(ID, kind, x, y, P, N0, S, D))

        print('make_lens_catalog: Clearing memory')
        # clear memory
        memory.clear()

    finally:
        if F is not None:
            print('make_lens_catalog: closing file!')
            F.close()

    if len(flags['update_collection']) > 0:
        print('make_lens_catalog: writing updated collection to %s'
              % (flags['update_collection'],))

        # TODO: get the other params correct!!!!
        collection_fat = swap.collection.Collection()
        for ID in catalog:
            subject = collection.member[ID]
            # agents_labels is flat (one label per marker); regroup it
            # to mirror At_X (one list of labels per classification).
            labels_in = list(catalog[ID]['agents_labels'])
            labels_fat = []
            start = 0
            for atx_i in subject.annotationhistory['At_X']:
                stop = start + len(atx_i)
                # index one by one so a shortfall of labels still
                # raises IndexError rather than being silently truncated
                labels_fat.append([labels_in[k] for k in range(start, stop)])
                start = stop
            subject.annotationhistory.update({'labels': labels_fat})
            collection_fat.member.update({ID: subject})
        swap.write_pickle(collection_fat, flags['update_collection'])

    print('make_lens_catalog: All done!')

    return catalog
예제 #6
0
def _ensure_metadata_columns(storage):
    """Add the columns that old metadata files lack to storage.subjects."""
    subjects = storage.subjects

    if 'retired_date' not in subjects.colnames:
        subjects['retired_date'] = '2016-09-10'

    if 'valid' not in np.unique(subjects['MLsample']):
        expert = (subjects['Expert_label'] != -1)
        subjects['MLsample'][expert] = 'valid'


def MachineShop(args):
    """
    Sometimes you just need to run the Machine on a bunch of already
    made SWAP-runs / simulations.
    If so, this script is for you!

    Steps one day at a time through the retired-subject catalogs of the
    fiducial SWAP simulation, starting at the configured 'start' date.
    For each day the subjects newly retired by SWAP are marked as the
    'train' sample in the metadata pickle, the metadata is saved, and
    MachineClassifier.py is run on it when required.

    args must provide `config`, the path of the SWAP configuration file.

    The process is terminated (SystemExit) once the date reaches the
    configured 'end'; the function returns normally if the list of
    catalogs runs out first.
    """
    import subprocess

    # Buh. I never built in the ability to change directories on the fly
    #machine_sim_directory = 'sims_Machine/redo_with_circular_morphs'

    # Get parameters from the SWAP run of interest
    the = swap.Configuration(args.config)
    params = the.parameters

    # This pulls up the FIDUCIAL SWAP simulation
    sim = Simulation(config=args.config,
                     directory='sims_SWAP/S_PLPD5_p5_ff_norand',
                     variety='feat_or_not')

    # this was originally set to 2/17/09 which is WRONG
    # NOTE(review): 11/2/17 - the reason for that is not recorded; the
    # choice of first_day should be confirmed.
    first_day = dt.datetime(2009, 2, 12)
    today = dt.datetime.strptime(params['start'], '%Y-%m-%d_%H:%M:%S')
    # start_day is only referenced by the disabled pick-up code below
    start_day = dt.datetime(2009, 2, 17)
    last_day = dt.datetime.strptime(params['end'], '%Y-%m-%d_%H:%M:%S')

    # NOTE(review): run_machine is never reset inside the loop, so once
    # set the Machine runs on every following day - confirm intended.
    run_machine = False
    SWAP_retired = 0
    notfound = 0
    last_night = None

    # One catalog per day; skip the days before `today`.
    for filename in sim.retiredFileList[(today - first_day).days:]:
        print("")
        print("----------------------- The Machine Shop ----------------------------")
        print("Today is {}".format(today))

        if today >= last_day:
            print("Get outta the machine shop!")
            raise SystemExit

        # ---------------------------------------------------------------------
        #  OPEN METADATA PICKLE (updated each time MachineClassifier is run)
        # ---------------------------------------------------------------------

        backup_meta_file = params['metadatafile'].replace(
            '.pickle', '_orig.pickle')

        if today == first_day:
            try:
                storage = swap.read_pickle(backup_meta_file, 'metadata')
            except Exception:
                print("MachineShop: Backup metadata pickle not yet created.")
                print("MachineShop: Opening original metadata pickle file instead")
                storage = swap.read_pickle(params['metadatafile'], 'metadata')

                # the backup is written with the extra columns in place
                _ensure_metadata_columns(storage)

                # save an untouched copy for reference later
                print("MachineShop: Creating a backup metadata pickle")
                swap.write_pickle(storage, backup_meta_file)
        else:
            storage = swap.read_pickle(params['metadatafile'], 'metadata')

        # Regardless of which metadata you open, make sure it has these columns
        #       (old metadata files WON'T have them!)
        _ensure_metadata_columns(storage)

        subjects = storage.subjects

        # I just need to know what was retired TONIGHT --
        # compare what's retired UP TILL tonight with what was
        # retired up till LAST NIGHT
        SWAP_retired_by_tonight = sim.fetchCatalog(filename)

        # If we're picking up where we left off, grab previous training sample
        #if today>start_day and last_night is None:
        #    print 'MachineShop: getting previous training sample'
        #    last_night = subjects[subjects['MLsample']=='train']
        #    last_night['zooid'] = last_night['SDSS_id']

        ids_retired_tonight = set(SWAP_retired_by_tonight['zooid'])
        if last_night is not None:
            ids_retired_tonight -= set(last_night['zooid'])

        print("Newly retired subjects: {}".format(len(ids_retired_tonight)))

        # Now that I have the ids from the previous night, adjust the
        # metadata file to reflect what was retired / add SWAP info
        for ID in ids_retired_tonight:

            # Locate this subject in the metadata file
            mask = subjects['SDSS_id'] == int(ID)

            # Update them in metadata file as training sample for MC
            # DOUBLE CHECK THAT IT HAS NOT BEEN RETIRED BY MACHINE!!!
            # NOTE(review): the test below assumes exactly one metadata
            # row matches the ID - TODO confirm.
            #if subjects['MLsample'][mask] == 'test ':
            if subjects['MLsample'][mask] == 'test':
                SWAP_retired += 1

                subjects['MLsample'][mask] = 'train'
                subjects['retired_date'][mask] = dt.datetime.strftime(
                    today, '%Y-%m-%d')
                subjects['SWAP_prob'][mask] = SWAP_retired_by_tonight['P'][
                    SWAP_retired_by_tonight['zooid'] == ID]

                run_machine = True

            else:
                notfound += 1

        if len(subjects[subjects['MLsample'] == 'train']) >= 10000:
            run_machine = True

        last_night = SWAP_retired_by_tonight

        print("Retired by this day: %s" % (len(last_night),))

        print("")
        print("MachineShop: Found {0} subjects retired by SWAP on {1}"
              .format(SWAP_retired, today))

        print("MachineShop: {0} total subjects retired so far"
              .format(np.sum(subjects['MLsample'] == 'train')))

        print("MachineShop: Found {0} subjects retired by Machine."
              .format(np.sum(subjects['MLsample'] == 'mclas')))

        print("MachineShop: Saving updated StorageLocker.")

        params['dir'] = os.getcwd()
        # Save our new metadata file -- MC needs this -- save to NOT the original
        params['metadatafile'] = params['dir'] + '/' + params[
            'survey'] + '_metadata.pickle'
        swap.write_pickle(storage, params['metadatafile'])

        if run_machine:
            # Need to doctor the config to reflect the "correct date"
            params['start'] = today.strftime('%Y-%m-%d_%H:%M:%S')
            swap.write_config(args.config, params)

            # Run MachineClassifier.py using this subject file. The
            # command is passed as a list (no shell) so that a config
            # path containing spaces still reaches the script intact.
            subprocess.call(
                ["python", "MachineClassifier.py", "-c", args.config])
            # Alternative: python test_Machine.py -c <config>

            # MachineClassifier updates the configfile so now we need to open the NEW one
            the = swap.Configuration(args.config)
            params = the.parameters

        # Update date (since we're not running SWAP)
        today += dt.timedelta(days=1)
예제 #7
0
def MachineClassifier(options, args):

    try: config = options.configfile
    except: pdb.set_trace()

    tonights = swap.Configuration(config)

    #"""
    # Read the pickled random state file
    random_file = open(tonights.parameters['random_file'],"r");
    random_state = cPickle.load(random_file);
    random_file.close();
    np.random.set_state(random_state);
    #"""

    # Get the machine threshold (make retirement decisions)
    threshold = tonights.parameters['machine_threshold']

    # Get list of evaluation metrics and criteria   
    eval_metrics = tonights.parameters['evaluation_metrics']

    survey = tonights.parameters['survey']
    subdir = 'sup_run4'

    #----------------------------------------------------------------------
    # read in the metadata for all subjects (Test or Training sample?)
    subjects = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata')

    #----------------------------------------------------------------------
    # read in the SWAP collection
    sample = swap.read_pickle(tonights.parameters['samplefile'],'collection')

    #----------------------------------------------------------------------
    # read in or create the ML collection
    MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'],
                                'MLcollection')

    # read in or create the ML bureau for machine agents (history)
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'], 'MLbureau')

    #-----------------------------------------------------------------------    
    #        DETERMINE IF THERE IS A TRAINING SAMPLE TO WORK WITH 
    #-----------------------------------------------------------------------
    # TO DO: training sample should only select those which are NOT part of 
    # validation sample (Nair catalog objects) 2/22/16

    # IDENTIFY TRAINING SAMPLE
    train_sample = subjects[subjects['MLsample']=='train']
    train_meta, train_features = ml.extract_training(train_sample)
    train_labels = np.array([1 if p > 0.3 else 0 \
                             for p in train_meta['SWAP_prob']])

    # IDENTIFY VALIDATION SAMPLE (FINAL) 
    valid_sample = subjects[subjects['MLsample']=='valid']
    valid_meta, valid_features = ml.extract_training(valid_sample)
    valid_labels = valid_meta['Expert_label'].filled()

    #if len(train_sample) >= 100: 
    # TO DO: LOOP THROUGH DIFFERENT MACHINES? HOW MANY MACHINES?
    for metric in eval_metrics:
        
        # REGISTER Machine Classifier
        # Construct machine name --> Machine+Metric? For now: KNC
        machine = 'KNC'
        Name = machine+'_'+metric
        
        # register an Agent for this Machine
        try: 
            test = MLbureau.member[Name]
        except: 
            MLbureau.member[Name] = swap.Agent_ML(Name, metric)
            

        #---------------------------------------------------------------    
        #     TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
        #---------------------------------------------------------------        

        # Now we run the machine -- need cross validation on whatever size 
        # training sample we have .. 
        
        # For now this will be fixed until we build in other machine options
        params = {'n_neighbors':np.arange(1, 2*(len(train_sample)-1) / 3, 2), 
                  'weights':('uniform','distance')}
        
        # Create the model 
        general_model = GridSearchCV(estimator=KNC(), param_grid=params,
                                     error_score=0, scoring=metric)        

        # Train the model -- k-fold cross validation is embedded
        trained_model = general_model.fit(train_features, train_labels)

        # Test "accuracy" (metric of choice) on validation sample
        score = trained_model.score(valid_features, valid_labels)

        MLbureau.member[Name].record_training(\
                            model_described_by=trained_model.best_estimator_, 
                            with_params=trained_model.best_params_, 
                            trained_on=len(train_features), 
                            at_time=TIME, 
                            with_train_acc=traineed_model.best_score_,
                            and_valid_acc=trained_model.score(valid_features,
                                                              valid_labels))

        # Store the trained machine
        MLbureau.member[Name].model = trained_model

        
        # Compute / store confusion matrix as a function of threshold
        # produced by this machine on the Expert Validation sample

        fps, tps, thresh = mtrx._binary_clf_curve(valid_labels,
                            trained_model.predict_proba(valid_features)[:,1])
        metric_list = mtrx.compute_binary_metrics(fps, tps)
        ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list
        
        MLbureau.member[Name].record_evaluation(accuracy=ACC, 
                                                completeness_s=TPR,
                                                contamination_s=FDR,
                                                completeness_f=TNR,
                                                contamination_f=NPV)

        pdb.set_trace()



        
        # 3. compare the metric of choice with the evaluation criterion to
        # see if this machine has sufficiently learned? 
        # ... what if my criterion is simply "Maximize Accuracy"? 
        # ... or minimize feature contamination? these require that we 
        # compare tonight's machine with the previous night's machine 
        # But if my criterion is simply "have feature contam less than 20%"
        # then it's easy.... 
        
        # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION .... 
        if MLbureau.member[Name].evaluate():
            #---------------------------------------------------------------    
            #                 APPLY MACHINE TO TEST SAMPLE
            #--------------------------------------------------------------- 
            # This requires that my runKNC function returns the Machine Object
            shitski=5
      
            #---------------------------------------------------------------    
            #                    PROCESS PREDICTIONS/PROBS
            #---------------------------------------------------------------
            for s,p,l in zip(test_meta, probas, predictions):
                ID = str(s['id'])

                descriptions = Nair_or_Not(s)
                category, kind, flavor, truth = descriptions

                # LOAD EACH TEST SUBJECT INTO MACHINE COLLECTION
                # -------------------------------------------------------------
                try: 
                    test = MLsample.member[ID]
                except: MLsample.member[ID] = swap.Subject_ML(ID,
                                            str(s['name']), category, kind,
                                            truth,threshold,s['external_ref'])
                
                tstring = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                MLsample.member[ID].was_described(by='knn', as_being=1, 
                                                  withp=p, at_time=tstring)

                # NOTE: if subject is Nair (training) it doesn't get flagged as 
                # inactive but it can be flagged as detected/rejected


                # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION
                # -------------------------------------------------------------
                thresholds = {'detection':0.,'rejection':0.}
                if (p >= threshold) or (1-p >= threshold):
                    print "BOOM! WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:"
                    print "Probability:",p
                    # Initialize the subject in SWAP Collection
                    sample.member[ID] = swap.Subject(ID, str(s['name']), 
                                            category, kind,flavor,truth,
                                            thresholds, s['external_ref'],0.) 
                    sample.member[ID].retiredby = 'machine'
                
                    # Flag subject as 'INACTIVE' / 'DETECTED' / 'REJECTED'
                    # ----------------------------------------------------------
                    if p >= threshold:
                        sample.member[str(s['id'])].state = 'inactive'
                    elif 1-p >= threshold:
                        sample.member[str(s['id'])].status = 'rejected' 

                        
            #---------------------------------------------------------------    
            #                 SAVE MACHINE METADATA? 
            #---------------------------------------------------------------
            print "Size of SWAP sample:", sample.size()
            print "Size of ML sample:", MLsample.size()

      
            if tonights.parameters['report']:
                
                # Output list of subjects to retire, based on this batch of
                # classifications. Note that what is needed here is the ZooID,
                # not the subject ID:
            
                new_retirementfile = swap.get_new_filename(tonights.parameters,\
                                                   'retire_these', source='ML')
                print "SWAP: saving Machine-retired subject Zooniverse IDs..."
                N = swap.write_list(MLsample,new_retirementfile,
                                    item='retired_subject', source='ML')
                print "SWAP: "+str(N)+" lines written to "+new_retirementfile
            
                # write catalogs of smooth/not over MLthreshold
                # -------------------------------------------------------------
                catalog = swap.get_new_filename(tonights.parameters,
                                            'retired_catalog', source='ML')
                print "SWAP: saving catalog of Machine-retired subjects..."
                Nretired, Nsubjects = swap.write_catalog(MLsample,bureau,
                                                catalog, threshold,
                                                kind='rejected', source='ML')
                print "SWAP: From "+str(Nsubjects)+" subjects classified,"
                print "SWAP: "+str(Nretired)+" retired (with P < rejection) "\
                    "written to "+catalog
            
                catalog = swap.get_new_filename(tonights.parameters,
                                            'detected_catalog', source='ML')
                print "SWAP: saving catalog of Machine detected subjects..."
                Ndetected, Nsubjects = swap.write_catalog(MLsample, bureau,
                                                catalog, threshold, 
                                                kind='detected', source='ML')
                print "SWAP: From "+str(Nsubjects)+" subjects classified,"
                print "SWAP: %i detected (with P > MLthreshold) "\
                "written to %s"%(Ndetected, catalog)    


    

    # If is hasn't been done already, save the current directory
    # ---------------------------------------------------------------------
    tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk']
    
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])


    # Repickle all the shits
    # -----------------------------------------------------------------------
    if tonights.parameters['repickle']:

        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "SWAP: saving SWAP subjects to "+new_samplefile
        swap.write_pickle(sample,new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile
        
        new_samplefile=swap.get_new_filename(tonights.parameters,'MLcollection')
        print "SWAP: saving test sample subjects to "+new_samplefile
        swap.write_pickle(MLsample,new_samplefile)
        tonights.parameters['MLsamplefile'] = new_samplefile

        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "SWAP: saving metadata to "+metadatafile
        swap.write_pickle(subjects,metadatafile)
        tonights.parameters['metadatafile'] = metadatafile
       

    # Update the time increment for SWAP's next run
    # -----------------------------------------------------------------------
    t2 = datetime.datetime.strptime(tonights.parameters['start'],
                                    '%Y-%m-%d_%H:%M:%S') + \
         datetime.timedelta(days=tonights.parameters['increment'])
    tstop = datetime.datetime.strptime(tonights.parameters['end'],
                                    '%Y-%m-%d_%H:%M:%S')
    if t2 == tstop: 
        plots = True
    else:
        tonights.parameters['start'] = t2.strftime('%Y-%m-%d_%H:%M:%S')
                

    # Update configfile to reflect Machine additions
    # -----------------------------------------------------------------------
    configfile = 'update.config'

    random_file = open(tonights.parameters['random_file'],"w");
    random_state = np.random.get_state();
    cPickle.dump(random_state,random_file);
    random_file.close();
    swap.write_config(configfile, tonights.parameters)

    pdb.set_trace()
예제 #8
0
파일: SWAP.py 프로젝트: willettk/SpaceWarps
        print "SWAP: "+str(Ndetected)+" detected (with P > acceptance) "\
            "written to "+catalog        


    # ------------------------------------------------------------------
    # Pickle the bureau, sample, and database, if required. If we do
    # this, it's because we want to pick up from where we left off
    # (ie with SWAPSHOP) - so save the pickles in the $cwd. This is
    # taken care of in io.py. Note that we update the parameters as
    # we go - this will be useful later when we write update.config.
    
    # Only re-pickle when the 'repickle' parameter is set and `count` is
    # positive (presumably the number of classifications handled in this
    # run -- confirm against the enclosing function).
    if tonights.parameters['repickle'] and count > 0:

        # Agents: write the bureau and remember the new filename.
        new_bureaufile = swap.get_new_filename(tonights.parameters,'bureau')
        print "SWAP: saving agents to "+new_bureaufile
        swap.write_pickle(bureau,new_bureaufile)
        tonights.parameters['bureaufile'] = new_bureaufile

        # Subjects: write the collection and remember the new filename.
        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "SWAP: saving subjects to "+new_samplefile
        swap.write_pickle(sample,new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile
        
        # Metadata: here the `subjects` object itself is pickled.
        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "SWAP: saving metadata to "+metadatafile
        swap.write_pickle(subjects,metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # ------------------------------------------------------------------
    # If there is more to do we need to update the config file for the next day
예제 #9
0
    # Pickle the bureau, sample, and database, if required. If we do
    # this, it's because we want to pick up from where we left off
    # (ie with SWAPSHOP) - so save the pickles in the $cwd. This is
    # taken care of in io.py. Note that we update the parameters as
    # we go - this will be useful later when we write update.config.
    
    # Only re-pickle when the 'repickle' parameter is set and `count` is
    # positive (presumably the number of classifications handled in this
    # run -- confirm against the enclosing function).
    if tonights.parameters['repickle'] and count > 0:

        # 8-4-16: MB just learned that constantly appending numpy
        #         arrays is vastly more expensive and slow! 
        #         Instead -- make agent and subject histories python lists
        #         If we want to pickle the object, THEN cast them as numpy arrays

        # Agents: write the bureau and remember the new filename.
        new_bureaufile = swap.get_new_filename(tonights.parameters,'bureau')
        print "SWAP: saving agents to "+new_bureaufile
        swap.write_pickle(bureau,new_bureaufile)
        tonights.parameters['bureaufile'] = new_bureaufile

        # Subjects: write the collection and remember the new filename.
        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "SWAP: saving subjects to "+new_samplefile
        swap.write_pickle(sample,new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile
        
        # Metadata: here the whole `storage` object is pickled.
        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "SWAP: saving metadata to "+metadatafile
        swap.write_pickle(storage,metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # ------------------------------------------------------------------
    # If there is more to do we need to update the config file for the next day
예제 #10
0
def MachineClassifier(options, args):
    """
    NAME
        MachineClassifier.py

    PURPOSE
        Machine learning component of Galaxy Zoo Express

        Read in a training sample generated by human users (which have 
        preferentially been analyzed by SWAP).
        Learn on the training sample and moniter progress. 
        Once "fully trained", apply learned model to test sample. 

    COMMENTS
        Lots I'm sure. 

    FLAGS
        -h            Print this message
        -c            config file name 
    """

    # Check for setup file in array args:
    if (len(args) >= 1) or (options.configfile):
        if args: config = args[0]
        elif options.configfile: config = options.configfile
        print swap.doubledashedline
        print swap.ML_hello
        print swap.doubledashedline
        print "ML: taking instructions from",config
    else:
        print MachineClassifier.__doc__
        return

    tonights = swap.Configuration(config)
    
    # Read the pickled random state file
    random_file = open(tonights.parameters['random_file'],"r");
    random_state = cPickle.load(random_file);
    random_file.close();
    np.random.set_state(random_state);


    time = tonights.parameters['start']
    print time

    # Get the machine threshold (make retirement decisions)
    threshold = tonights.parameters['machine_threshold']
    prior = tonights.parameters['prior']

    # Get list of evaluation metrics and criteria   
    eval_metrics = tonights.parameters['evaluation_metrics']
    
    # How much cross-validation should we do? 
    cv = tonights.parameters['cross_validation']

    survey = tonights.parameters['survey']

    #----------------------------------------------------------------------
    # read in the metadata for all subjects (Test or Training sample?)
    storage = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata')
    subjects = storage.subjects

    #----------------------------------------------------------------------
    # read in the SWAP collection
    sample = swap.read_pickle(tonights.parameters['samplefile'],'collection')

    #----------------------------------------------------------------------
    # read in or create the ML collection
    MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'],
                                'MLcollection')

    # read in or create the ML bureau for machine agents (history)
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau')
    #if not tonights.parameters['MLbureaufile']:
    #    MLbureaufile = swap.get_new_filename(tonights.parameters,'bureau','ML')
    #    tonights.parameters['MLbureaufile'] = MLbureaufile

    #MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau')


    #-----------------------------------------------------------------------    
    #                 SELECT TRAINING & VALIDATION SAMPLES  
    #-----------------------------------------------------------------------
    # TO DO: training sample should only select those which are NOT part of 
    # validation sample (Nair catalog objects) 2/22/16

    train_sample = storage.fetch_subsample(sample_type='train',
                                           class_label='GZ2_label')
    """ Notes about the training sample:
    # this will select only those which have my morphology measured for them
    # AND which have a true "answer" according to GZ2
    # Eventually we could open this up to include the ~10k that aren't in the 
    # GZ Main Sample but I think, for now, we should reduce ourselves to this
    # stricter sample so that we always have back-up "truth" for each galaxy.
    """

    try:
        train_meta, train_features = ml.extract_features(train_sample)
        original_length = len(train_meta)

    except TypeError:
        print "ML: can't extract features from subsample."
        print "ML: Exiting MachineClassifier.py"
        sys.exit()

    else:
        # TODO: consider making this part of SWAP's duties? 
        # 5/18/16: Only use those subjects which are no longer on the prior
        off_the_fence = np.where(train_meta['SWAP_prob']!=prior)
        train_meta = train_meta[off_the_fence]
        train_features = train_features[off_the_fence]
        train_labels = np.array([1 if p > prior else 0 for p in 
                                 train_meta['SWAP_prob']])

        #train_labels = train_meta['Nair_label'].filled()

        shortened_length = len(train_meta)
        print "ML: found a training sample of %i subjects"%shortened_length
        removed = original_length - shortened_length
        print "ML: %i subjects had prior probability and were removed"%removed
    

    valid_sample = storage.fetch_subsample(sample_type='valid',
                                           class_label='Expert_label')
    try:
        valid_meta, valid_features = ml.extract_features(valid_sample)
    except:
        print "ML: there are no subjects with the label 'valid'!"
    else:
        valid_labels = valid_meta['Expert_label'].filled()
        print "ML: found a validation sample of %i subjects"%len(valid_meta)

    # ---------------------------------------------------------------------
    # Require a minimum size training sample [Be reasonable, my good man!]
    # ---------------------------------------------------------------------
    if len(train_sample) < 10000: 
        print "ML: training sample is too small to be worth anything."
        print "ML: Exiting MachineClassifier.py"
        sys.exit()
        
    else:
        print "ML: training sample is large enough to give it a shot."

        # TODO: LOOP THROUGH DIFFERENT MACHINES? 
        # 5/12/16 -- no... need to make THIS a class and create multiple 
        #            instances? Each one can be passed an instance of a machine?

        # Machine can be trained to maximize/minimize different metrics
        # (ACC, completeness, purity, etc. Have a list of acceptable ones.)
        # Minimize a Loss function (KNC doesn't have a loss fcn). 
        for metric in eval_metrics:
        
            # REGISTER Machine Classifier
            # Construct machine name --> Machine+Metric? For now: KNC
            machine = 'KNC'
            machine = 'RF'
            Name = machine+'_'+metric
        
            # register an Agent for this Machine
            # This "Agent" doesn't behave like a SW agent... at least not yet

            try: 
                test = MLbureau.member[Name]
            except: 
                MLbureau.member[Name] = swap.Agent_ML(Name, metric)
                
            MLagent = MLbureau.member[Name]

            #---------------------------------------------------------------    
            #     TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
            #---------------------------------------------------------------

            # Now we run the machine -- need cross validation on whatever size 
            # training sample we have .. 
        
            # Fixed until we build in other machine options
            # Need to dynamically determine appropriate parameters...

            #max_neighbors = get_max_neighbors(train_features, cv)
            #n_neighbors = np.arange(1, (cv-1)*max_neighbors/cv, 5, dtype=int)
            #params = {'n_neighbors':n_neighbors, 
            #          'weights':('uniform','distance')}

            num_features = train_features.shape[1]
        
            min_features = int(round(np.sqrt(num_features)))
            params = {'max_features':np.arange(min_features, num_features+1),
                      'max_depth':np.arange(2,16)}

            # Create the model 
            # for "estimator=XXX" all you need is an instance of a machine -- 
            # any scikit-learn machine will do. However, non-sklearn machines..
            # That will be a bit trickier! (i.e. Phil's conv-nets)
            general_model = GridSearchCV(estimator=RF(n_estimators=30), 
                                         param_grid=params, n_jobs=-1,
                                         error_score=0, scoring=metric, cv=cv) 
            
            # Train the model -- k-fold cross validation is embedded
            print "ML: Searching the hyperparameter space for values that "\
                "optimize the %s."%metric
            trained_model = general_model.fit(train_features, train_labels)

            MLagent.model = trained_model

            # Test "accuracy" (metric of choice) on validation sample
            score = trained_model.score(valid_features, valid_labels)

            ratio = np.sum(train_labels==1) / len(train_labels)

            MLagent.record_training(model_described_by=
                                    trained_model.best_estimator_, 
                                    with_params=trained_model.best_params_, 
                                    trained_on=len(train_features), 
                                    with_ratio=ratio,
                                    at_time=time, 
                                    with_train_score=trained_model.best_score_,
                                    and_valid_score=trained_model.score(
                                        valid_features, valid_labels))

            fps, tps, thresh = mtrx.roc_curve(valid_labels, 
                            trained_model.predict_proba(valid_features)[:,1])

            metric_list = compute_binary_metrics(fps, tps)
            ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list
        
            MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV,
                                      false_pos=FPR, completeness_f=TNR,
                                      contamination_f=NPV)
            
            #MLagent.plot_ROC()

            # ---------------------------------------------------------------
            # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION ....
            # ---------------------------------------------------------------
            if MLagent.is_trained(metric):
                print "ML: %s has successfully trained and will be applied "\
                    "to the test sample."

                # Retrieve the test sample 
                test_sample = storage.fetch_subsample(sample_type='test',
                                                      class_label='GZ2_label')
                """ Notes on test sample:
                The test sample will, in real life, be those subjects for which
                we don't have an answer a priori. However, for now, this sample
                is how we will judge, in part, the performance of the overall
                method. As such, we only include those subjects which have 
                GZ2 labels in the Main Sample.
                """

                try:
                    test_meta, test_features = ml.extract_features(test_sample)
                except:
                    print "ML: there are no subjects with the label 'test'!"
                    print "ML: which means there's nothing more to do!"
                else:
                    print "ML: found test sample of %i subjects"%len(test_meta)

                #-----------------------------------------------------------    
                #                 APPLY MACHINE TO TEST SAMPLE
                #----------------------------------------------------------- 
                predictions = MLagent.model.predict(test_features)
                probabilities = MLagent.model.predict_proba(test_features)

                print "ML: %s has finished predicting labels for the test "\
                    "sample."%Name
                print "ML: Generating performance report on the test sample:"

                test_labels = test_meta['GZ2_label'].filled()
                print mtrx.classification_report(test_labels, predictions)

                test_accuracy=mtrx.accuracy_score(test_labels,predictions)
                test_precision=mtrx.precision_score(test_labels,predictions)
                test_recall=mtrx.recall_score(test_labels,predictions)

                MLagent.record_evaluation(accuracy_score=test_accuracy,
                                          precision_score=test_precision,
                                          recall_score=test_recall,
                                          at_time=time)
                #pdb.set_trace()
                
                # ----------------------------------------------------------
                # Save the predictions and probabilities to a new pickle

                test_meta['predictions'] = predictions
                test_meta['probability_of_smooth'] = probabilities[:,1]
                
                filename=tonights.parameters['trunk']+'_'+Name+'.pickle'
                swap.write_pickle(test_meta, filename)



                """
                for thing, pred, p in zip(test_meta, predictions,
                                          probabitilies):
                    
                    # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION
                    # --------------------------------------------------------
                    if (p >= threshold) or (1-p >= threshold):
                        print "BOOM! WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:"
                        print "Probability:", p
                        # Initialize the subject in SWAP Collection
                        ID = thing['asset_id']
                        sample.member[ID] = swap.Subject(ID, str(s['SDSS_id']), 
                                            location=s['external_ref']) 
                    sample.member[ID].retiredby = 'machine'
                
                    # Flag subject as 'INACTIVE' / 'DETECTED' / 'REJECTED'
                    # ----------------------------------------------------------
                    if p >= threshold:
                        sample.member[str(s['id'])].state = 'inactive'
                    elif 1-p >= threshold:
                        sample.member[str(s['id'])].status = 'rejected' 

                #"""
    
    
    # If is hasn't been done already, save the current directory
    # ---------------------------------------------------------------------
    tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk']
    
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])


    # Repickle all the shits
    # -----------------------------------------------------------------------
    if tonights.parameters['repickle']:

        new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        print "ML: saving SWAP subjects to "+new_samplefile
        swap.write_pickle(sample, new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile
        
        new_samplefile=swap.get_new_filename(tonights.parameters,'MLcollection')
        print "ML: saving test sample subjects to "+new_samplefile
        swap.write_pickle(MLsample,new_samplefile)
        tonights.parameters['MLsamplefile'] = new_samplefile

        new_bureaufile=swap.get_new_filename(tonights.parameters,'bureau','ML')
        print "ML: saving MLbureau to "+new_bureaufile
        swap.write_pickle(MLbureau, new_bureaufile)
        tonights.parameters['MLbureaufile'] = new_bureaufile

        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "ML: saving metadata to "+metadatafile
        swap.write_pickle(storage, metadatafile)
        tonights.parameters['metadatafile'] = metadatafile


    # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe) new day
    # ----------------------------------------------------------------------
    configfile = config.replace('startup','update')

    # Random_file needs updating, else we always start from the same random
    # state when update.config is reread!
    random_file = open(tonights.parameters['random_file'],"w");
    random_state = np.random.get_state();
    cPickle.dump(random_state,random_file);
    random_file.close();
    swap.write_config(configfile, tonights.parameters)

    return
예제 #11
0
def MachineClassifier(options, args):
    """
    NAME
        MachineClassifier.py

    PURPOSE
        Machine learning component of Galaxy Zoo Express

        Read in a training sample generated by human users (which have 
        previously been analyzed by SWAP).
        Learn on the training sample and moniter progress. 
        Once "fully trained", apply learned model to test sample. 

    COMMENTS
        Lots I'm sure. 

    FLAGS
        -h            Print this message
        -c            config file name 
    """


    #-----------------------------------------------------------------------    
    #                 LOAD CONFIG FILE PARAMETERS  
    #-----------------------------------------------------------------------
    # Check for config file in array args:
    if (len(args) >= 1) or (options.configfile):
        if args: config = args[0]
        elif options.configfile: config = options.configfile
        print swap.doubledashedline
        print swap.ML_hello
        print swap.doubledashedline
        print "ML: taking instructions from",config
    else:
        print MachineClassifier.__doc__
        return

    machine_sim_directory = 'sims_Machine/redo_with_circular_morphs/'

    tonights = swap.Configuration(config)
    
    # Read the pickled random state file
    random_file = open(tonights.parameters['random_file'],"r");
    random_state = cPickle.load(random_file);
    random_file.close();
    np.random.set_state(random_state)

    time = tonights.parameters['start']

    # Get the machine threshold (to make retirement decisions)
    swap_thresholds = {}
    swap_thresholds['detection'] = tonights.parameters['detection_threshold']  
    swap_thresholds['rejection'] = tonights.parameters['rejection_threshold']
    threshold = tonights.parameters['machine_threshold']
    prior = tonights.parameters['prior']

    # Get list of evaluation metrics and criteria   
    eval_metrics = tonights.parameters['evaluation_metrics']
    
    # How much cross-validation should we do? 
    cv = tonights.parameters['cross_validation']

    survey = tonights.parameters['survey']

    # To generate training labels based on the subject probability, 
    # we need to know what should be considered the positive label: 
    # i.e., GZ2 has labels (in metadatafile) Smooth = 1, Feat = 0
    # Doing a Smooth or Not run, the positive label is 1
    # Doing a Featured or Not run, the positive label is 0
    pos_label = tonights.parameters['positive_label']

    #----------------------------------------------------------------------
    # read in the metadata for all subjects
    storage = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata')

    # 11TH HOUR QUICK FIX CUZ I F****D UP. MB 10/27/16
    if 'GZ2_raw_combo' not in storage.subjects.colnames:
        gz2_metadata = Table.read('metadata_ground_truth_labels.fits')
        storage.subjects['GZ2_raw_combo'] = gz2_metadata['GZ2_raw_combo']
        swap.write_pickle(storage, tonights.parameters['metadatafile'])

    subjects = storage.subjects

    #----------------------------------------------------------------------
    # read in the PROJECT COLLECTION -- (shared between SWAP/Machine)
    #sample = swap.read_pickle(tonights.parameters['samplefile'],'collection')

    # read in or create the ML bureau for machine agents (history for Machines)
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau')



    #-----------------------------------------------------------------------    
    #                 FETCH TRAINING & VALIDATION SAMPLES  
    #-----------------------------------------------------------------------
    train_sample = storage.fetch_subsample(sample_type='train',
                                           class_label='GZ2_raw_combo')
    """ Notes about the training sample:
    # this will select only those which have my morphology measured for them
    # AND which have "ground truth" according to GZ2
    # Eventually we could open this up to include the ~10k that aren't in the 
    # GZ Main Sample but I think, for now, we should reduce ourselves to this
    # stricter sample so that we always have back-up "truth" for each galaxy.
    """

    try:
        train_meta, train_features = ml.extract_features(train_sample, 
                                        keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr'])
        original_length = len(train_meta)

    except TypeError:
        print "ML: can't extract features from subsample."
        print "ML: Exiting MachineClassifier.py"
        sys.exit()

    else:
        # TODO: consider making this part of SWAP's duties? 
        # 5/18/16: Only use those subjects which are no longer on the prior
        off_the_fence = np.where(train_meta['SWAP_prob']!=prior)
        train_meta = train_meta[off_the_fence]
        train_features = train_features[off_the_fence]
        train_labels = np.array([pos_label if p > prior else 1-pos_label 
                                 for p in train_meta['SWAP_prob']])


        shortened_length = len(train_meta)
        print "ML: found a training sample of %i subjects"%shortened_length
        removed = original_length - shortened_length
        print "ML: %i subjects removed to create balanced training sample"%removed
    

    valid_sample = storage.fetch_subsample(sample_type='valid',
                                           class_label='Expert_label')
    try:
        valid_meta, valid_features = ml.extract_features(valid_sample,
                                        keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr'])
    except:
        print "ML: there are no subjects with the label 'valid'!"
    else:
        valid_labels = valid_meta['Expert_label'].filled()
        print "ML: found a validation sample of %i subjects"%len(valid_meta)

    # ---------------------------------------------------------------------
    # Require a minimum size training sample [Be reasonable, my good man!]
    # ---------------------------------------------------------------------
    if len(train_sample) < 10000: 
        print "ML: training sample is too small to be worth anything."
        print "ML: Exiting MachineClassifier.py"
        sys.exit()
        
    else:
        print "ML: training sample is large enough to give it a shot."

        # TODO: LOOP THROUGH DIFFERENT MACHINES? 
        # 5/12/16 -- no... need to make THIS a class and create multiple 
        #            instances? Each one can be passed an instance of a machine?

        # Machine can be trained to optimize different metrics
        # (ACC, completeness, purity, etc. Have a list of acceptable ones.)
        # Minimize a Loss function. 
        for metric in eval_metrics:
        
            # REGISTER Machine Classifier
            # Construct machine name --> Machine+Metric
            machine = 'RF'
            Name = machine+'_'+metric
        
            # register an Agent for this Machine
            try: 
                test = MLbureau.member[Name]
            except: 
                MLbureau.member[Name] = swap.Agent_ML(Name, metric)
                
            MLagent = MLbureau.member[Name]

            #---------------------------------------------------------------    
            #     TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
            #---------------------------------------------------------------

            # Now we run the machine -- need cross validation on whatever size 
            # training sample we have .. 
        
            # Fixed until we build in other machine options
            # Need to dynamically determine appropriate parameters...

            #max_neighbors = get_max_neighbors(train_features, cv)
            #n_neighbors = np.arange(1, (cv-1)*max_neighbors/cv, 5, dtype=int)
            #params = {'n_neighbors':n_neighbors, 
            #          'weights':('uniform','distance')}

            num_features = train_features.shape[1]
        
            min_features = int(round(np.sqrt(num_features)))
            params = {'max_features':np.arange(min_features, num_features+1),
                      'max_depth':np.arange(2,16)}

            # Create the model 
            # for "estimator=XXX" all you need is an instance of a machine -- 
            # any scikit-learn machine will do. However, non-sklearn machines..
            # That will be a bit trickier! (i.e. Phil's conv-nets)
            general_model = GridSearchCV(estimator=RF(n_estimators=30), 
                                         param_grid=params, n_jobs=31,
                                         error_score=0, scoring=metric, cv=cv) 
            
            # Train the model -- k-fold cross validation is embedded
            print "ML: Searching the hyperparameter space for values that "\
                  "optimize the %s."%metric

            trained_model = general_model.fit(train_features, train_labels)
            MLagent.model = trained_model

            # Test accuracy (metric of choice) on validation sample
            score = trained_model.score(valid_features, valid_labels)

            ratio = np.sum(train_labels==pos_label) / len(train_labels)

            MLagent.record_training(model_described_by=
                                    trained_model.best_estimator_, 
                                    with_params=trained_model.best_params_, 
                                    trained_on=len(train_features), 
                                    with_ratio=ratio,
                                    at_time=time, 
                                    with_train_score=trained_model.best_score_,
                                    and_valid_score=trained_model.score(
                                        valid_features, valid_labels))

            valid_prob_thresh = trained_model.predict_proba(valid_features)[:,pos_label]
            fps, tps, thresh = mtrx.roc_curve(valid_labels,valid_prob_thresh, pos_label=pos_label)

            metric_list = compute_binary_metrics(fps, tps)
            ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list
        
            MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV,
                                      false_pos=FPR, completeness_f=TNR,
                                      contamination_f=NPV)
            
            #MLagent.plot_ROC()

            # ---------------------------------------------------------------
            # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION ....
            # ---------------------------------------------------------------
            if MLagent.is_trained(metric) or MLagent.trained:
                print "ML: %s has successfully trained and will be applied "\
                      "to the test sample."%Name

                # Retrieve the test sample 
                test_sample = storage.fetch_subsample(sample_type='test',
                                                      class_label='GZ2_raw_combo')
                """ Notes on test sample:
                The test sample will, in real life, be those subjects for which
                we don't have an answer a priori. However, for now, this sample
                is how we will judge, in part, the performance of the overall
                method. As such, we only include those subjects which have 
                GZ2 labels in the Main Sample.
                """

                try:
                    test_meta, test_features = ml.extract_features(test_sample,
                                                keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr'])
                except:
                    print "ML: there are no subjects with the label 'test'!"
                    print "ML: Either there is nothing more to do or there is a BIG mistake..."
                else:
                    print "ML: found test sample of %i subjects"%len(test_meta)

                #-----------------------------------------------------------    
                #                 APPLY MACHINE TO TEST SAMPLE
                #----------------------------------------------------------- 
                predictions = MLagent.model.predict(test_features)
                probabilities = MLagent.model.predict_proba(test_features)[:,pos_label]

                print "ML: %s has finished predicting labels for the test "\
                      "sample."%Name
                print "ML: Generating performance report on the test sample:"

                test_labels = test_meta['GZ2_raw_combo'].filled()
                print mtrx.classification_report(test_labels, predictions)

                test_accuracy = mtrx.accuracy_score(test_labels,predictions)
                test_precision = mtrx.precision_score(test_labels,predictions,pos_label=pos_label)
                test_recall = mtrx.recall_score(test_labels,predictions,pos_label=pos_label)

                MLagent.record_evaluation(accuracy_score=test_accuracy,
                                          precision_score=test_precision,
                                          recall_score=test_recall,
                                          at_time=time)
                
                # ----------------------------------------------------------
                # Save the predictions and probabilities to a new pickle

                test_meta['predictions'] = predictions
                test_meta['machine_probability'] = probabilities

                # If is hasn't been done already, save the current directory
                # ---------------------------------------------------------------------
                tonights.parameters['trunk'] = survey+'_'+tonights.parameters['start']
                # This is the standard directory... 
                #tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk']

                # This is to put files into the sims_Machine/... directory. 
                tonights.parameters['dir'] = os.getcwd()
                filename=tonights.parameters['dir']+'/'+tonights.parameters['trunk']+'_'+Name+'.fits'
                test_meta.write(filename)

                count=0
                noSWAP=0
                for sub, pred, prob in zip(test_meta, predictions, probabilities):
                    
                    # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION
                    # --------------------------------------------------------
                    if (prob >= threshold) or (1-prob >= threshold):

                        # Flip the set label in the metadata file -- 
                        #   don't want to use this as a training sample!
                        idx = np.where(subjects['asset_id'] == sub['asset_id'])
                        
                        storage.subjects['MLsample'][idx] = 'mclass'
                        storage.subjects['retired_date'][idx] = time
                        count+=1

                print "MC: Machine classifed {0} subjects with >= 90% confidence".format(count)
                print "ML: Of those, {0} had never been seen by SWAP".format(noSWAP)

    
    tonights.parameters['trunk'] = survey+'_'+tonights.parameters['start']
    tonights.parameters['dir'] = os.getcwd()
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])


    # Repickle all the shits
    # -----------------------------------------------------------------------
    if tonights.parameters['repickle']:

        #new_samplefile = swap.get_new_filename(tonights.parameters,'collection')
        #print "ML: saving SWAP subjects to "+new_samplefile
        #swap.write_pickle(sample, new_samplefile)
        #tonights.parameters['samplefile'] = new_samplefile
        
        new_bureaufile=swap.get_new_filename(tonights.parameters,'bureau','ML')
        print "ML: saving MLbureau to "+new_bureaufile
        swap.write_pickle(MLbureau, new_bureaufile)
        tonights.parameters['MLbureaufile'] = new_bureaufile

        metadatafile = swap.get_new_filename(tonights.parameters,'metadata')
        print "ML: saving metadata to "+metadatafile
        swap.write_pickle(storage, metadatafile)
        tonights.parameters['metadatafile'] = metadatafile


    # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe) new day
    # ----------------------------------------------------------------------
    configfile = config.replace('startup','update')

    # Random_file needs updating, else we always start from the same random
    # state when update.config is reread!
    random_file = open(tonights.parameters['random_file'],"w");
    random_state = np.random.get_state();
    cPickle.dump(random_state,random_file);
    random_file.close();

    swap.write_config(configfile, tonights.parameters)

    return
예제 #12
0
def make_lens_catalog(args):
    """
    NAME
        make_lens_catalog

    PURPOSE
        Given location of collection pickle, this script produces a set of
        annotated images of lenses (heatmaps for lens locations, markers for
        where clicks were, etc).

    COMMENTS
        You have to download the file so it chooses whever your output
        directory is to also download the raw images.
        This should be pretty customizable.

    FLAGS
        -h              Print this message

        --skill         Weight by skill


    INPUTS
        collection.pickle

    OUTPUTS
        lens.dat
            Assumed format:
            ID   kind   x   y    Prob     N0   Skill   Dist

            Here:
            ID = Space Warps subject ID
            kind = Space Warps subject type (sim, dud, test)
            x,y = object (cluster) centroid, in pixels
            P = Space Warps subject probability
            N0 = number of markers in the cluster
            S = total skill per cluster, summed over markers
            D = biggest distance within cluster

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16  started Davis (KIPAC)
    """

    # ------------------------------------------------------------------
    # Some defaults:

    flags = {
        'skill': False,
        'output_directory': './',
        'output_name': 'catalog.dat',
        'image_y_size': 440,
        'catalog_path': '',
        'update_collection': '',
    }

    # ------------------------------------------------------------------
    # Read in options:

    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection_path':
            collection_path = args[arg]
        else:
            print "make_lens_atlas: unrecognized flag ", arg

    print "make_lens_catalog: illustrating behaviour captured in collection file: "
    print "make_lens_catalog: ", collection_path

    memory = joblib.Memory(cachedir=flags['output_directory'])
    memory.clear()

    catalog_path = flags['output_directory'] + flags['output_name']
    if len(flags['output_name']) > 0:
        F = open(catalog_path, 'w')
        F.write('id,kind,x,y,prob,n0,skill,dist\n')

    # ------------------------------------------------------------------
    # Read in files:

    collection = swap.read_pickle(collection_path, 'collection')
    ID_list = collection.list()
    print "make_lens_catalog: collection numbers ", len(ID_list)

    if flags['catalog_path'] != '':
        print "make_lens_catalog: filtering from catalog ", flags[
            'catalog_path']
        catalog_in = csv2rec(flags['catalog_path'])
        ID_list = np.unique(catalog_in['id'])

    # ------------------------------------------------------------------
    # Run through data:

    catalog = {}
    for ID in ID_list:

        subject = collection.member[ID]
        kind = subject.kind
        P = subject.mean_probability

        itwas = subject.annotationhistory['ItWas']
        x_all = subject.annotationhistory['At_X']
        y_all = subject.annotationhistory['At_Y']

        x_markers = np.array([xi for xj in x_all for xi in xj])
        y_markers = np.array([yi for yj in y_all for yi in yj])

        catalog.update(
            {ID: {
                'agents_reject': [],
                'x': x_markers,
                'y': y_markers,
            }})
        PL_all = subject.annotationhistory['PL']
        PD_all = subject.annotationhistory['PD']

        # filter out the empty clicks
        PL_list = []
        PL_nots = []
        for i, xj in enumerate(x_all):
            # len(xj) of empty = 0
            PL_list.append([PL_all[i]] * len(xj))
            if len(xj) == 0:
                PL_nots.append(PL_all[i])
        PL = np.array([PLi for PLj in PL_list for PLi in PLj])
        PL_nots = np.array(PL_nots)

        # filter out the empty clicks
        PD_list = []
        PD_nots = []
        for i, xj in enumerate(x_all):
            PD_list.append([PD_all[i]] * len(xj))
            if len(xj) == 0:
                PD_nots.append(PD_all[i])
                catalog[ID]['agents_reject'].append(i)
        PD = np.array([PDi for PDj in PD_list for PDi in PDj])
        PD_nots = np.array(PD_nots)

        skill = swap.expectedInformationGain(0.5, PL, PD)  # skill

        # it is only fair to write out the NOTs, too
        # do the empty guys
        skill_nots = swap.expectedInformationGain(0.5, PL_nots,
                                                  PD_nots)  # skill

        x, y = -1, -1
        N0 = len(skill_nots)
        S = np.sum(skill_nots)
        D = 0

        ## catalog.append((ID, kind, x, y, P, N0, S, D))
        if len(catalog) % 500 == 0:
            print len(catalog)
        if len(flags['output_name']) > 0:
            F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                ID, kind, x, y, P, N0, S, D))

        if len(x_markers) == 0:
            # apparently everyone was a not...
            catalog[ID]['agents_labels'] = np.array([])
            continue

        # ------------------------------------------------------------------
        # cluster
        print 'make_lens_catalog: subject ID = ', ID
        if flags['skill']:
            cluster_centers, cluster_center_labels, cluster_labels, \
                    n_clusters, dist_within = outlier_clusters(x_markers, y_markers, skill, memory=memory)
        else:
            cluster_centers, cluster_center_labels, cluster_labels, \
                    n_clusters, dist_within = outlier_clusters(x_markers, y_markers, None, memory=memory)
        # need to get: x, y, N0, S

        catalog[ID]['agents_labels'] = cluster_labels

        for cluster_center_label in cluster_center_labels:
            cluster_center = cluster_centers[cluster_center_label]
            members = (cluster_labels == cluster_center_label)

            x, y = cluster_center
            # convert y to catalog convention
            y = flags['image_y_size'] - y
            N0 = np.sum(members)
            S = np.sum(skill[members])
            D = dist_within[cluster_center_label]

            if cluster_center_label == -1:
                # outlier cluster
                # so really every point is its own cluster...
                D = 0
            ## catalog.append((ID, kind, x, y, P, N0, S, D))
            ## if len(catalog)%500 == 0:
            ##     print len(catalog)
            # TODO: make some requirement to be included (exclude outliers)
            if len(flags['output_name']) > 0:
                F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                    ID, kind, x, y, P, N0, S, D))

    print 'make_lens_catalog: Clearing memory'
    # clear memory
    memory.clear()

    if len(flags['output_name']) > 0:
        print 'make_lens_catalog: closing file!'
        F.close()

    if len(flags['update_collection']) > 0:
        print 'make_lens_catalog: writing updated collection to', flags[
            'update_collection']

        # TODO: get the other params correct!!!!
        collection_fat = swap.collection.Collection()
        for ID in catalog:
            subject = collection.member[ID]
            atx = subject.annotationhistory['At_X']
            labels_in = list(catalog[ID]['agents_labels'])
            labels_fat = []
            for atx_i in atx:
                labels_fat.append([])
                for atx_ij in atx_i:
                    labels_fat[-1].append(labels_in.pop(0))
            subject.annotationhistory.update({'labels': labels_fat})
            collection_fat.member.update({ID: subject})
        swap.write_pickle(collection_fat, flags['update_collection'])

    print 'make_lens_catalog: All done!'

    return catalog