def populate_master_summarized(self):
    '''
    Summarizes the "master_raw" table into "master_summarized".

    Rows of master_raw are read ordered by class/order/family/genus/species
    and grouped by that taxon key.  For every group, each trait listed in
    self.SCHEMA.TRAITS is fed into an AggregatorArray, and one row (taxon
    plus the aggregate row) is inserted into master_summarized.  The
    transaction is committed once all groups have been inserted.

    Input rows for which the aggregators report invalid data points are
    reported on stdout.

    Finally, master_summarized is exported through self.query2file() into
    "master_summarized.csv" inside conf.results_dir.
    '''
    from itertools import groupby
    from operator import itemgetter
    # Explicit import: "import *" inside a function is only tolerated (with a
    # warning) by Python 2, and AggregatorArray is the only name needed here.
    from visualwg.core.aggregators import AggregatorArray

    # Construct the insert query
    taxa_ranks = ["class", "order", "family", "genus", "species"]
    trait_names = [trait for (trait, _, _) in self.SCHEMA.TRAITS]
    field_names = taxa_ranks + trait_names
    fields_spec = ", ".join(['"%s"' % f for f in field_names])
    ranks_spec = ", ".join(['"%s"' % r for r in taxa_ranks])
    # Positional placeholders are used on purpose: the "DB dict insert" style
    # (named placeholders) cannot be used, since some field names contain ")".
    values_spec = ", ".join(["%s"] * len(field_names))
    q_out = '''
        insert into master_summarized ( %s )
        values ( %s )
        ;
        ''' % (fields_spec, values_spec)
    print(q_out)

    # Input data stream; the ordering must match the grouping key below,
    # because groupby() only merges adjacent records.
    q_in = '''
        select *
        from master_raw
        order by %s
        ''' % ranks_spec

    cur_out = self.conn.cursor()
    cur_in = self.conn.cursor()
    try:
        cur_in.execute(q_in)
        records_in = DB.generate_cur2dicts(cur_in)
        groups_in = groupby(records_in, itemgetter(*taxa_ranks))

        aggs = AggregatorArray(self.SCHEMA.TRAITS)
        for (taxon_key, datagroup) in groups_in:
            taxon = dict(zip(taxa_ranks, taxon_key))

            # Compute data summary for each trait
            aggs.reset()
            for datarow in datagroup:
                # NOTE(review): this call is disabled in the original source;
                # confirm that per-row invalidity tracking does not need it.
                #-- aggs.start_row()
                for trait in trait_names:
                    aggs[trait].add(datarow[trait])
                if aggs.invalidity_in_row():
                    #TODO: (1) log the problem; (2) emit a row into the problem spreadsheet
                    print("Invalid points in submission %s, row %s: %s" % (datarow['Submission'], datarow['Linenum'], aggs.get_invalid_row_part()))

            # Insert the summary record for this taxon
            summarized = util.merge_dicts([taxon, aggs.get_aggregate_row()])
            summarized_list = [summarized[f] for f in field_names]
            cur_out.execute(q_out, summarized_list)
    finally:
        # Release both cursors even when a query or an aggregator fails
        cur_out.close()
        cur_in.close()
    self.conn.commit()

    #Download to master_summarized spreadsheet
    q_download = '''
        select %s
        from master_summarized
        order by %s
        ''' % (fields_spec, ranks_spec)
    fname_download = os.path.join(conf.results_dir, "master_summarized.csv")
    self.query2file(q_download, fname_download)
def decide_actions(self):
    '''
    Determines actions for the taxa (take, rename, ignore) and the data rows
    (ignore vs accept), to be used during the later merge.

    The actions are based on the raw matches in "taxamatch", according to the
    rules in taxa-matching.txt

    Results are written into a tab-delimited file ("taxaaction.tab" inside
    conf.subm_reports_dir).  For every submitted taxon one row is written per
    authority match that was considered; the diagnosis is printed alongside
    the match that was actually used, the other rows carry an empty
    diagnosis.  When several authority matches exist but none was usable, an
    extra row with the diagnosis and empty authority data is written first.
    '''
    # (submitted column, authority column) pairs, from the highest rank down
    rank_pairs = [("Pub. Class", "class"), ("Pub. Order", "order"),
                  ("Pub. Family", "family"),
                  ("Pub. Genus", "genus"), ("Pub. Species", "species")]
    diagnosis_columns = ["binom_match", "fam_match", "pub_auth_agreement",
                         "taxon_action", "data_action", "auth_used"]

    def make_diagnosis(*values):
        # Builds a diagnosis dict; values are given in diagnosis_columns order
        return dict(zip(diagnosis_columns, values))

    def resolved(subm_taxon, mtx, binom_match, fam_match, higher_ranks, compared_ranks):
        # Diagnosis for a taxon resolved to the single authority entry mtx
        if all(subm_taxon[pub] is None for (pub, _) in higher_ranks):
            # No current higher-rank naming was submitted => write-in from authority
            agreement, taxon_action = "nopub", "writein"
        elif all(subm_taxon[pub] == mtx[auth] for (pub, auth) in compared_ranks):
            # Submitted current naming agrees with that in the authority
            agreement, taxon_action = "agree", "ok"
        else:
            # Submitted current naming does not agree with that in the authority
            # => override with authority
            agreement, taxon_action = "disagree", "override"
        return make_diagnosis(binom_match, fam_match, agreement, taxon_action,
                              "accept", mtx['authority_id'])

    def diagnose(subm_taxon, match_taxa):
        # Applies the matching rules to one submitted taxon and its raw matches
        if len(match_taxa) == 1:
            mtx = match_taxa[0]
            if mtx["authority_id"] is None:
                # subm_taxon was not matched in the authority table
                return make_diagnosis("notfound", "n/a", "n/a", "notfound", "ignore", None)
            # subm_taxon matches exactly one entry in the authority:
            # C-O-F decide about write-in, all five ranks about agreement
            return resolved(subm_taxon, mtx, "exact", "n/a", rank_pairs[:3], rank_pairs)

        # There is more than one match for subm_taxon in the authority
        # Try to use current Family, if submitted, for further resolution
        pub_family = subm_taxon["Pub. Family"]
        if pub_family is None:
            # The Family was not submitted for subm_taxon
            return make_diagnosis("ambig", "nopub", "n/a", "foundmany", "ignore", None)
        match_taxa_wfamily = [mtx for mtx in match_taxa if mtx["family"] == pub_family]
        if not match_taxa_wfamily:
            # Submitted family is not among those in the authority
            return make_diagnosis("ambig", "notfound", "n/a", "foundmany", "ignore", None)
        if len(match_taxa_wfamily) > 1:
            # Even with Family, there are still several possibilities in the authority
            return make_diagnosis("ambig", "ambig", "n/a", "foundmany", "ignore", None)
        # The Family helped to resolve ambiguity; it is already known to
        # agree, so only C-O and the binomial are examined further
        return resolved(subm_taxon, match_taxa_wfamily[0], "ambig", "exact",
                        rank_pairs[:2], rank_pairs[:2] + rank_pairs[3:])

    #Set up the file to write the results into
    subm_columns = ['upload_id'] + self.submtaxa_columns
    match_columns = ['authority_id'] + TABLES["authority"]
    taxaaction_columns = subm_columns + diagnosis_columns + match_columns
    taxaaction_header = dict([(c, c) for c in taxaaction_columns])

    # Placeholder parts for rows that carry no diagnosis / no authority data
    dummy_diagnosis = make_diagnosis("", "", "", "", "", None)
    dummy_auth_taxon = dict([(colname, None) for colname in match_columns])

    taxaaction_fname = os.path.join(conf.subm_reports_dir, "taxaaction.tab")
    taxaaction_file = open(taxaaction_fname, "w")
    try:
        taxaaction_tab = csv.DictWriter(taxaaction_file, taxaaction_columns,
                                        restval="###", extrasaction='raise',
                                        dialect='excel-tab')
        taxaaction_tab.writerow(taxaaction_header)

        #DB query for the join with matches
        cur = self.conn.cursor()
        try:
            cur.execute('select * from taxamatch;')

            #Go over all the grouped matches
            for (upload_id, subm_taxon, match_taxa) in self.generate_grouped_matches(cur):
                #Determine the diagnosis
                diagnosis = diagnose(subm_taxon, match_taxa)

                #Now, if there is an overriding WG correction, apply it to the diagnosis
                # Find WG taxon decision, if available (the last one wins)
                wg_match = None
                for m in match_taxa:
                    if m["sourceid"] and m["sourceid"].split(":")[0] == "WG":
                        wg_match = m
                if wg_match:
                    diagnosis["taxon_action"] = "wg"
                    diagnosis["auth_used"] = wg_match['authority_id']
                    diagnosis["data_action"] = "accept"

                #Write results (subm_taxon, match_taxa and diagnosis) to a file
                #emit an extra row (with empty authority data) when there were
                # multiple authority matches, but none "worked"
                if diagnosis["auth_used"] is None and len(match_taxa) > 1:
                    act_row = util.merge_dicts([subm_taxon, diagnosis, dummy_auth_taxon])
                    taxaaction_tab.writerow(act_row)
                #in every case, emit a row for each authority match that was considered,
                # printing the diagnosis alongside the successful match
                for auth_taxon in match_taxa:
                    if auth_taxon['authority_id'] == diagnosis["auth_used"]:
                        act_row = util.merge_dicts([subm_taxon, diagnosis, auth_taxon])
                    else:
                        act_row = util.merge_dicts([subm_taxon, dummy_diagnosis, auth_taxon])
                    taxaaction_tab.writerow(act_row)
        finally:
            cur.close()
    finally:
        # Close the report even when the query or a row write fails
        taxaaction_file.close()