# Command-line script: looks up taxa in the taxonomy database whose names
# match a search string, then starts grouping the matches by the ID of the
# taxonomy each one belongs to.
import sys
from argparse import ArgumentParser

# taxodatabase provides getDBCursor(), which is called below.
# NOTE(review): no import of it is visible in this part of the file; importing
# it here is harmless if an earlier import already exists.
from taxolib import taxodatabase
from taxolib.taxacomponents import RankTable, Taxon
from taxolib.taxoconfig import ConfigError

argp = ArgumentParser(
    description='Searches for taxa in the taxonomy database by matching the '
                'taxon name string. "%" can be used as a wildcard character '
                'in the search string.'
)
argp.add_argument(
    '-d', '--dbconf',
    help='the SQLite database file ("database.sqlite" by default)'
)
argp.add_argument(
    '-s', '--nosynonyms', action='store_true',
    help='do not search synonyms for taxa names'
)
argp.add_argument('search_string', help='the name search string')
# numtaxa and maxdepth have no matching command-line option here; they are
# only set as fixed attributes on the parsed namespace.
argp.set_defaults(dbconf='database.sqlite', numtaxa=-1, maxdepth=-1)
args = argp.parse_args()

# Get a cursor for the taxonomy database.
try:
    pgcur = taxodatabase.getDBCursor(args.dbconf)
except ConfigError as e:
    # sys.exit() is used instead of the site-provided exit() helper, which is
    # intended for interactive sessions and is absent under "python -S".
    sys.exit('\n' + str(e) + '\n')

# Initialize the rank table from the database.
ranktable = RankTable()
ranktable.loadFromDB(pgcur)

# NOTE(review): args.nosynonyms is parsed above but is not passed to the
# search here -- confirm whether Taxon.find() should receive it.
taxa = Taxon.find(pgcur, args.search_string, ranktable)

# Organize the taxa by their source taxonomies.
# Create a dictionary mapping taxonomy IDs to lists of taxa.
taxonomy_taxa = {}
for taxon in taxa:
    if taxon.taxonomy_id not in taxonomy_taxa:
        taxonomy_taxa[taxon.taxonomy_id] = []
#!/usr/bin/python

# This program provides a simple example of how to use the approximate string
# matching search library.  Note that this requires only 2 lines of code: 1 to
# instantiate a matcher object, and 1 to actually perform the match.
#
# Usage: pass exactly one argument, the name to search for.

import sys

import approxmatch

# A hack for now to get the local taxonomy package to import.
sys.path.append('../')
from taxolib import taxodatabase, taxoconfig

# Get a cursor for the taxonomy database.
try:
    pgcur = taxodatabase.getDBCursor('../database.conf')
except taxoconfig.ConfigError as e:
    # sys.exit() is used instead of the site-provided exit() helper, which is
    # intended for interactive sessions and is absent under "python -S".
    sys.exit('\n' + str(e) + '\n')

# Exactly one command-line argument (the search string) is required.
if len(sys.argv) != 2:
    sys.exit('\nPlease provide a name to search for in the names table.\n')

searchstr = sys.argv[1]

# Specify the database table and column names.
#tablename = 'names'
tablename = 'ftest_genus_names'
colname = 'namestr'

# Instantiate a q-gram/DL hybrid algorithm matcher.
matcher = approxmatch.HybridMatcher(tablename, colname, pgcur)
# Remaining command-line options for the matcher test driver.  The parser
# object (argp) is created earlier in the script.
argp.add_argument(
    '-tr', '--timer_runs', type=int,
    help='The number of complete search runs to execute when running in '
         'timer mode. The best time among all runs is taken as the final '
         'run time. The default is 3.'
)
argp.add_argument(
    '-m', '--method',
    help='the matching method to use ("exact", "qgram", "neighbor", '
         '"wcneighbor", "dmetaphone", "soundex", or "hybrid")'
)
argp.add_argument(
    '-qgt', '--qgram_threshold', type=float,
    help='The similarity threshold to use for qgram-based matching. '
         'The default is 0.3.'
)
argp.add_argument(
    '-fo', '--output_format',
    help='The format for reporting results, either "text" [the default] '
         'or "json".'
)
argp.add_argument('csv_file', help='the input CSV file')
argp.set_defaults(
    dbconf='../database.conf',
    table='ftest_genus_names',
    write_failed='',
    timer_runs=3,
    method='qgram',
    qgram_threshold=0.3,
    output_format='text'
)
args = argp.parse_args()

# Get a cursor for the taxonomy database.
try:
    pgcur = taxodatabase.getDBCursor(args.dbconf)
except taxoconfig.ConfigError as e:
    # Raise SystemExit directly rather than calling the site-provided exit()
    # helper, which is intended for interactive sessions and is absent under
    # "python -S".  The message is reported and the exit status is the same.
    raise SystemExit('\n' + str(e) + '\n')

#nhoodMatch(pgcur, 'Anas')
#wcNhoodMatch(pgcur, 'Ictaluris')
#qgramMatch(pgcur, 'Anas')
#exit()

# Instantiate a matcher object for the requested match strategy.
if args.method == 'qgram':
    matcher = approxmatch.QgramMatcher(args.table, 'namestr', pgcur)
    # Set the qgram matching similarity threshold.
    matcher.setSimilarityCutoff(args.qgram_threshold)
elif args.method == 'exact':
    matcher = approxmatch.ExactMatcher(args.table, 'namestr', pgcur)