예제 #1
0
    def populate_master_summarized(self):
        from itertools import groupby
        # Construct the insert query
        taxa_ranks = ["class", "order", "family", "genus", "species"]
        trait_names = [trait for (trait, _, _) in self.SCHEMA.TRAITS]
        field_names = taxa_ranks + trait_names
        field_refs = ['"%s"' % f for f in field_names] 
        fields_spec = ",   ".join(field_refs)
        # value_refs = ['%%(%s)s' % f for f in field_names ]
        value_refs = ["%s" for _ in field_names ]
        values_spec = ", ".join(value_refs)
        q_out = '''
insert into master_summarized ( %s )
                       values ( %s ) ;
        ''' % (fields_spec, values_spec)
        print q_out
        cur_out = self.conn.cursor()

        # Get input data stream         
        q_in = '''
select * 
from master_raw
order by "class", "order", "family", "genus", "species"         
        '''
        cur_in = self.conn.cursor()
        cur_in.execute(q_in)
        records_in = DB.generate_cur2dicts(cur_in)
        groups_in = groupby(records_in, 
                                      lambda r : {"class":r["class"], "order":r["order"], "family":r["family"], 
                                                  "genus":r["genus"], "species":r["species"]})

        from visualwg.core.aggregators import * 
        
        aggs = AggregatorArray(self.SCHEMA.TRAITS) 
        for (taxon, datagroup) in groups_in:
            #!! print (taxon, datagroup)
            # Compute data summary for each trait
            aggs.reset()  
            for datarow in datagroup:
                #-- aggs.start_row()
                for trait in trait_names: 
                    agg = aggs[trait]
                    agg.add(datarow[trait])
                if aggs.invalidity_in_row():
                    #TODO: (1) log the problem; (2) emit a row into the problem spreadsheet
                    print "Invalid points in submission %s, row %s: %s" % (datarow['Submission'], datarow['Linenum'], aggs.get_invalid_row_part())
            #TODO: Insert a new record into cur_out  
            #!! print "Aggregate data:"
            #!! print taxon,;  print aggs.get_aggregate_row()
            summarized = util.merge_dicts([taxon, aggs.get_aggregate_row()])
            summarized_list = [ summarized[f]  for f in field_names]    #cannot use the summarized dict for "DB dict insert", 
                                                                        # since some fields contain ")" !
            cur_out.execute(q_out, summarized_list)
        
        cur_out.close()
        cur_in.close()
        self.conn.commit()

        #Download to master_summarized spreadsheet
        q_download = '''
select %s 
from master_summarized
order by "class", "order", "family", "genus", "species" 
        ''' % fields_spec
        fname_download = os.path.join(conf.results_dir, "master_summarized.csv")
        self.query2file(q_download, fname_download) 
        
예제 #2
0
    def decide_actions(self):
        ''' Determines actions for the taxa and the data rows, to be used during the later merge. 
            The actions are based on the raw matches in "taxamatch", according to the rules in taxa-matching.txt 

            For every submitted taxon a diagnosis is produced with the fields
              binom_match        : "notfound" | "exact" | "ambig"
              fam_match          : "n/a" | "nopub" | "notfound" | "exact" | "ambig"
              pub_auth_agreement : "n/a" | "nopub" | "agree" | "disagree"
              taxon_action       : "notfound" | "writein" | "ok" | "override" | "foundmany" | "wg"
              data_action        : "accept" | "ignore"
              auth_used          : authority_id of the authority entry used, or None
            A match whose "sourceid" starts with "WG:" overrides the computed taxon_action,
            auth_used and data_action.

            Results are written into the tab-delimited file "taxaaction.tab" in
            conf.subm_reports_dir: one row per considered authority match, with the diagnosis
            printed alongside the match that was used, plus one extra row (with empty
            authority data) when several matches existed but none was used.
            The file and the DB cursor are closed even if processing fails.
        '''
        subm_columns = ['upload_id'] + self.submtaxa_columns
        match_columns = ['authority_id'] + TABLES["authority"]
        diagnosis_columns = ["binom_match", "fam_match", "pub_auth_agreement", "taxon_action", "data_action", "auth_used"]
        taxaaction_columns = subm_columns + diagnosis_columns + match_columns

        # Submitted ("Pub.") column holding the current name for each authority rank
        pub_column = {"class": "Pub. Class", "order": "Pub. Order", "family": "Pub. Family",
                      "genus": "Pub. Genus", "species": "Pub. Species"}

        def make_diagnosis(binom_match, fam_match, agreement, taxon_action, data_action, auth_used):
            # Always a fresh dict: the WG override below modifies it in place.
            return {"binom_match": binom_match, "fam_match": fam_match, "pub_auth_agreement": agreement,
                    "taxon_action": taxon_action, "data_action": data_action, "auth_used": auth_used}

        def diagnose_resolved(subm_taxon, mtx, binom_match, fam_match, higher_ranks):
            # subm_taxon was resolved to the single authority entry mtx; compare the
            # submitted current naming with it. higher_ranks are the ranks above genus
            # that still have to be checked.
            if all(subm_taxon[pub_column[rank]] is None for rank in higher_ranks):
                # No current naming was submitted for these ranks => write-in from authority
                agreement, taxon_action = "nopub", "writein"
            elif all(subm_taxon[pub_column[rank]] == mtx[rank]
                     for rank in higher_ranks + ["genus", "species"]):
                # Submitted current naming agrees with that in the authority
                agreement, taxon_action = "agree", "ok"
            else:
                # Submitted current naming does not agree with the authority => override with authority
                agreement, taxon_action = "disagree", "override"
            return make_diagnosis(binom_match, fam_match, agreement, taxon_action, "accept", mtx['authority_id'])

        # Placeholders for rows that carry no diagnosis / no authority data
        dummy_diagnosis = make_diagnosis("", "", "", "", "", None)
        dummy_auth_taxon = dict((colname, None) for colname in match_columns)

        #Set up the file to write the results into
        taxaaction_fname = os.path.join(conf.subm_reports_dir, "taxaaction.tab")
        taxaaction_file = open(taxaaction_fname, "w")
        try:
            # Missing fields are written as "###"; unexpected keys in a row raise ValueError
            taxaaction_tab = csv.DictWriter(taxaaction_file, taxaaction_columns, "###", 'raise', 'excel-tab')
            taxaaction_tab.writerow(dict((c, c) for c in taxaaction_columns))

            #DB query for the join with matches 
            cur = self.conn.cursor()
            try:
                cur.execute('select * from taxamatch;')

                #Go over all the grouped matches
                for (upload_id, subm_taxon, match_taxa) in self.generate_grouped_matches(cur):
                    #Determine the diagnosis 
                    if len(match_taxa) == 1 and match_taxa[0]["authority_id"] is None:
                        # subm_taxon was not matched in the authority table 
                        diagnosis = make_diagnosis("notfound", "n/a", "n/a", "notfound", "ignore", None)
                    elif len(match_taxa) == 1:
                        # subm_taxon matches exactly one entry in the authority
                        diagnosis = diagnose_resolved(subm_taxon, match_taxa[0], "exact", "n/a",
                                                      ["class", "order", "family"])
                    elif subm_taxon["Pub. Family"] is None:
                        # Several matches, and no Family was submitted to resolve them
                        diagnosis = make_diagnosis("ambig", "nopub", "n/a", "foundmany", "ignore", None)
                    else:
                        # Several matches: try to use the submitted Family for further resolution
                        match_taxa_wfamily = [m for m in match_taxa if m["family"] == subm_taxon["Pub. Family"]]
                        if len(match_taxa_wfamily) == 1:
                            # The Family helped to resolve ambiguity
                            diagnosis = diagnose_resolved(subm_taxon, match_taxa_wfamily[0], "ambig", "exact",
                                                          ["class", "order"])
                        elif len(match_taxa_wfamily) == 0:
                            # Submitted family is not among those in the authority
                            diagnosis = make_diagnosis("ambig", "notfound", "n/a", "foundmany", "ignore", None)
                        else:
                            # Even with Family, there are still several possibilities in the authority
                            diagnosis = make_diagnosis("ambig", "ambig", "n/a", "foundmany", "ignore", None)

                    #Now, if there is an overriding WG correction, apply it to the diagnosis
                    # (when several WG matches exist, the last one wins)
                    wg_match = None
                    for m in match_taxa:
                        if m["sourceid"] and m["sourceid"].split(":")[0] == "WG":
                            wg_match = m
                    if wg_match:
                        diagnosis["taxon_action"] = "wg"
                        diagnosis["auth_used"] = wg_match['authority_id']
                        diagnosis["data_action"] = "accept"

                    #Write results (subm_taxon, match_taxa and diagnosis) to the file 
                    #emit an extra row (with empty authority data) when there were multiple authority matches, but none "worked"
                    if diagnosis["auth_used"] is None and len(match_taxa) > 1:
                        taxaaction_tab.writerow(util.merge_dicts([subm_taxon, diagnosis, dummy_auth_taxon]))
                    #in every case, emit a row for each authority match that was considered, 
                    #  printing the diagnosis alongside the successful match   
                    for auth_taxon in match_taxa:
                        if auth_taxon['authority_id'] == diagnosis["auth_used"]:
                            row_diagnosis = diagnosis
                        else:
                            row_diagnosis = dummy_diagnosis
                        taxaaction_tab.writerow(util.merge_dicts([subm_taxon, row_diagnosis, auth_taxon]))
            finally:
                cur.close()
        finally:
            taxaaction_file.close()