def query_lowlevel(db, param_dict, sort_order):
    """Select ``philo_id`` rows from ``toms`` matching the metadata criteria.

    Every value of every column in ``param_dict`` is parsed, grouped,
    expanded against that column's normalized frequency file and turned
    into one SQL clause; all clauses are ANDed together.

    Parameters:
        db: object exposing ``path``, ``locals`` (read for ``'debug'``) and
            ``dbh`` (whose ``execute`` runs the query).
        param_dict: mapping of column name -> iterable of query strings.
        sort_order: sequence of column names for ORDER BY; a falsy value
            falls back to ``["rowid"]``.

    Returns:
        Whatever ``db.dbh.execute`` returns for the assembled query.
    """
    def trace(*items):
        # The debug flag is looked up at every call, exactly where the
        # individual checks used to sit.
        if db.locals['debug']:
            for item in items[:-1]:
                print >> sys.stderr, item,
            print >> sys.stderr, items[-1]

    bind_values = []  # never filled here; passed to execute() as-is
    where_parts = []
    for column, values in param_dict.items():
        norm_path = db.path + "/frequencies/normalized_" + column + "_frequencies"
        for raw_value in values:
            tokens = parse_query(raw_value)
            trace("METADATA_TOKENS:", tokens)
            grouped = group_terms(tokens)
            trace("METADATA_SYNTAX GROUPED:", grouped)
            expanded = expand_grouped_query(grouped, norm_path)
            trace("METADATA_SYNTAX EXPANDED:", expanded)
            sql_clause = make_grouped_sql_clause(expanded, column)
            trace("SQL_SYNTAX:", sql_clause)
            where_parts.append(sql_clause)

    if not sort_order:
        sort_order = ["rowid"]

    query = "SELECT philo_id FROM toms"
    if where_parts:
        query += " WHERE " + " AND ".join("(%s)" % part for part in where_parts)
    query += " order by %s;" % ", ".join(sort_order)

    trace("INNER QUERY: ", "%s %% %s" % (query, bind_values), sort_order)
    return db.dbh.execute(query, bind_values)
def query_lowlevel(db,param_dict):
    """Select ``philo_id`` rows from ``toms`` matching the metadata criteria.

    Each query string in ``param_dict`` is parsed, grouped, expanded against
    the normalized frequency file of its column (located under ``db.path``)
    and converted to one SQL clause.  The clauses are ANDed and the rows are
    ordered by ``rowid``.  Every intermediate stage is traced to stderr.

    Parameters:
        db: object exposing ``path`` and ``dbh``.
        param_dict: mapping of column name -> iterable of query strings.

    Returns:
        Whatever ``db.dbh.execute`` returns for the assembled query.
    """
    bind_values = []  # stays empty; the clauses carry their values inline
    where_parts = []
    for column, values in param_dict.items():
        norm_path = db.path + "/frequencies/normalized_" + column + "_frequencies"
        for raw_value in values:
            tokens = parse_query(raw_value)
            print >> sys.stderr, "METADATA_TOKENS:", tokens
            grouped = group_terms(tokens)
            print >> sys.stderr, "METADATA_SYNTAX:", grouped
            expanded = expand_grouped_query(grouped, norm_path)
            print >> sys.stderr, "METADATA_SYNTAX:", expanded
            sql_clause = make_grouped_sql_clause(expanded, column)
            print >> sys.stderr, "SQL_SYNTAX:", sql_clause
            where_parts.append(sql_clause)

    query = "SELECT philo_id FROM toms"
    if where_parts:
        query += " WHERE " + " AND ".join("(%s)" % part for part in where_parts)
    query += " order by rowid;"

    print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (query, bind_values)
    return db.dbh.execute(query, bind_values)
def query_lowlevel(db, param_dict):
    """Run the metadata query described by ``param_dict`` against ``toms``.

    For each column, the normalized frequency file is looked up beneath
    ``db.locals["db_path"]``; every query string for that column goes
    through parse -> group -> expand -> SQL clause, with each stage traced
    to stderr.  All clauses are joined with AND and the result is ordered
    by ``rowid``.

    Parameters:
        db: object exposing ``locals`` (read for ``"db_path"``) and ``dbh``.
        param_dict: mapping of column name -> iterable of query strings.

    Returns:
        Whatever ``db.dbh.execute`` returns for the assembled query.
    """
    sql_args = []  # stays empty; the clauses carry their values inline
    conditions = []
    for column, values in param_dict.items():
        norm_path = (db.locals["db_path"] + "/frequencies/normalized_"
                     + column + "_frequencies")
        for query_string in values:
            parsed = parse_query(query_string)
            print >> sys.stderr, "METADATA_TOKENS:", parsed
            grouped = group_terms(parsed)
            print >> sys.stderr, "METADATA_SYNTAX:", grouped
            expanded = expand_grouped_query(grouped, norm_path)
            print >> sys.stderr, "METADATA_SYNTAX:", expanded
            condition = make_grouped_sql_clause(expanded, column)
            print >> sys.stderr, "SQL_SYNTAX:", condition
            conditions.append(condition)

    if not conditions:
        query = "SELECT philo_id FROM toms order by rowid;"
    else:
        where = " AND ".join("(%s)" % condition for condition in conditions)
        query = "SELECT philo_id FROM toms WHERE " + where + " order by rowid;"

    print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (query, sql_args)
    return db.dbh.execute(query, sql_args)
def format_query(qstring,db):
    """Parse ``qstring`` and return the command built by ``format_parsed_query``.

    ``QUOTE`` tokens are broken up before formatting: the first and last
    characters of the token are dropped, the remainder is split on single
    spaces, and each non-empty piece becomes its own ``QUOTE_S`` token.
    Every other token is passed through untouched.  The resulting command
    is echoed (as ``repr``) to stderr before being returned.
    """
    flattened = []
    for label, token in parse_query(qstring):
        if label != "QUOTE":
            flattened.append((label, token))
            continue
        for piece in token[1:-1].split(" "):
            # Consecutive spaces yield empty pieces; those are skipped.
            if piece:
                flattened.append(("QUOTE_S", piece))

    command = format_parsed_query(flattened, db)
    print >> sys.stderr, "QUERY_COMMAND",repr(command)
    return command
def query_lowlevel(db,param_dict):
    """Select ``philo_id`` rows from ``toms`` matching the metadata criteria.

    Each query string in ``param_dict`` is parsed and handed to
    ``make_clause`` together with its column name and the path of that
    column's normalized frequency file (under ``db.locals["db_path"]``).
    ``make_clause`` yields a SQL clause plus the values bound to it; the
    clauses are ANDed and the bound values are collected in order.

    Parameters:
        db: object exposing ``locals`` (read for ``"db_path"``) and ``dbh``.
        param_dict: mapping of column name -> iterable of query strings.

    Returns:
        Whatever ``db.dbh.execute`` returns for the assembled query.
    """
    bind_values = []
    conditions = []
    for column, values in param_dict.items():
        norm_path = db.locals["db_path"] + "/frequencies/normalized_" + column + "_frequencies"
        for raw_value in values:
            condition, condition_values = make_clause(column, parse_query(raw_value), norm_path)
            conditions.append(condition)
            bind_values.extend(condition_values)

    query = "SELECT philo_id FROM toms"
    if conditions:
        query += " WHERE " + " AND ".join("(%s)" % condition for condition in conditions)
    query += ";"

    return db.dbh.execute(query, bind_values)
def query(db,terms,corpus_file=None,corpus_size=0,method=None,method_arg=None,limit=3000,filename="", query_debug=False):
    """Start a detached ``corpus_search`` run for ``terms`` and return its hit list.

    The query is parsed, grouped and split in the calling process.  The
    process then double-forks: the caller returns a ``HitList`` over the
    output file straight away, while the detached grandchild expands the
    terms (via ``tee``, which also records them in ``<filename>.terms``)
    into ``corpus_search`` and writes ``<filename>.done`` once the search
    process has exited.  ``<filename>.error`` is created when the search
    process returns -11.

    Parameters:
        db: object exposing ``path``; also handed to ``HitList``.
        terms: query string.
        corpus_file: optional corpus file, passed as ``-c``.
        corpus_size: accepted for compatibility; not used by this version.
        method, method_arg: when both are truthy, passed as ``-m`` / ``-a``.
        limit: accepted for compatibility; not used by this version.
        filename: hit list path; defaults to ``<pid>.hitlist`` inside
            ``db.path + "/hitlists/"``.
        query_debug: emit extra tracing on stderr.

    Returns:
        ``HitList.HitList(filename, words_per_hit, db)`` (in the caller).
    """
    # Flush before forking; presumably so buffered output is not emitted
    # a second time by the child processes.
    sys.stdout.flush()
    tstart = datetime.now()

    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)
    words_per_hit = len(split)

    # Bug fix: the working directory used to be bound only when no filename
    # was supplied, so an explicit filename made the forked child die with
    # UnboundLocalError at os.chdir().  It is now always defined.
    # NOTE(review): the hitlists directory is assumed to be the right
    # working directory for caller-supplied filenames too -- confirm.
    hitlist_dir = db.path + "/hitlists/"
    if not filename:
        filename = hitlist_dir + str(os.getpid()) + ".hitlist"

    hl = open(filename, "w")
    err = open("/dev/null", "w")
    freq_file = db.path+"/frequencies/normalized_word_frequencies"

    if query_debug:
        print >> sys.stderr, "FORKING"
    pid = os.fork()
    if pid != 0:
        # Caller: hand back a HitList over the file that is being filled.
        # NOTE(review): the intermediate child is never waited on here.
        hl.close()
        return HitList.HitList(filename, words_per_hit, db)

    # Everything below runs in a forked child, which must never return into
    # the caller's stack: any failure is reported and ends in os._exit().
    try:
        os.umask(0)
        os.chdir(hitlist_dir)
        os.setsid()
        if os.fork() > 0:
            # Intermediate child: exit at once so the worker is detached.
            os._exit(0)

        # now we're detached from the parent, and can do our work.
        if query_debug:
            print >> sys.stderr, "WORKER DETACHED at ", datetime.now() - tstart

        args = ["corpus_search"]
        if corpus_file:
            args.extend(("-c", corpus_file))
        if method and method_arg:
            args.extend(("-m",method,"-a",str(method_arg)))
        args.extend(("-o","binary",db.path,))

        worker = subprocess.Popen(args,stdin=subprocess.PIPE,stdout=hl,stderr=err)

        # The expanded terms go through tee, which copies them to the
        # .terms file and forwards them to the search process.
        query_log_fh = filename + ".terms"
        print >> sys.stderr, "LOGGING TERMS to " + filename + ".terms"
        logger = subprocess.Popen(["tee",query_log_fh],stdin=subprocess.PIPE,stdout = worker.stdin)
        expand_query_not(split,freq_file,logger.stdin)
        logger.stdin.close()
        worker.stdin.close()

        returncode = worker.wait()
        if returncode == -11:
            # Negative return codes mean "killed by signal"; 11 is SIGSEGV.
            print >> sys.stderr, "SEGFAULT"
            seg_flag = open(filename + ".error","w")
            seg_flag.close()

        # Mark the query as finished and record the command that was run.
        flag = open(filename + ".done","w")
        flag.write(" ".join(args) + "\n")
        flag.close()
    except Exception:
        import traceback
        traceback.print_exc()
        os._exit(1)
    os._exit(0)
# NOTE(review): the three indented statements below are the tail of a function
# whose ``def`` line lies above this chunk; the 4-space indent is assumed --
# confirm against the preceding lines before applying.
    print >> sys.stderr, grep_command
    grep_proc = subprocess.Popen(grep_command,stdin=in_fh,stdout=dest_fh)
    return grep_proc

def invert_grep_exact(token, in_fh, dest_fh):
    """Spawn an ``egrep -v`` that drops lines ending in the exact token.

    The pattern is ``[[:blank:]]<token>$`` where ``<token>`` is ``token``
    without its first and last characters (presumably the surrounding
    quotes -- confirm against the parser).  The token text is inserted into
    the pattern as-is.  The command is echoed to stderr.

    Parameters:
        token: token text whose first and last characters are discarded.
        in_fh: file handle used as the process's stdin.
        dest_fh: file handle used as the process's stdout.

    Returns:
        The running ``subprocess.Popen`` object; it is not waited on.
    """
    # don't strip accent or case, exact match only.
    grep_command = ["egrep", "-v", "[[:blank:]]%s$" % token[1:-1]]
    print >> sys.stderr, grep_command
    grep_proc = subprocess.Popen(grep_command,stdin=in_fh,stdout=dest_fh)
    # can't wait because input isn't ready yet.
    return grep_proc

# Command-line driver: argv[1] is the database root, the remaining arguments
# are joined with spaces into one query.  Each parsing stage is echoed to
# stderr and the expanded query is written to stdout.
if __name__ == "__main__":
    path = sys.argv[1]
    terms = sys.argv[2:]
    parsed = parse_query(" ".join(terms))
    print >> sys.stderr, "PARSED:", parsed
    grouped = group_terms(parsed)
    print >> sys.stderr, "GROUPED:", grouped
    split = split_terms(grouped)
    print >> sys.stderr, "parsed %d terms:" % len(split), split
    # Minimal stand-in for a database object; it is populated below but not
    # passed to anything in this script.
    class Fake_DB: pass
    fake_db = Fake_DB()
    fake_db.locals = {"db_path":path + "/data/"}
    fake_db.path = path + "/data/"
    fake_db.encoding = "utf-8"
    freq_file = path + "/data/frequencies/normalized_word_frequencies"
    # freq_file = "/Library/WebServer/Documents/philologic/plain_text_test/data/frequencies/normalized_word_frequencies"
    expand_query_not(split,freq_file,sys.stdout)
def query(db, terms, corpus_file=None, corpus_size=0, method=None, method_arg=None, limit=3000, filename="", query_debug=False):
    """Start a detached ``search4`` run for ``terms`` and return its hit list.

    The query is parsed, grouped and split in the calling process.  The
    process then double-forks: the caller returns a ``HitList`` over the
    output file straight away, while the detached grandchild feeds the
    expanded terms to ``search4`` and writes ``<filename>.done`` once the
    search process has exited.  ``<filename>.error`` is created when the
    search process returns -11.

    Parameters:
        db: object exposing ``path`` and ``locals`` (read for
            ``"db_path"``); also handed to ``HitList``.
        terms: query string.
        corpus_file, corpus_size: when both are truthy, passed as
            ``--corpusfile`` / ``--corpussize``.
        method, method_arg: when both are truthy, appended to the command.
        limit: passed as ``--limit``.
        filename: hit list path; defaults to ``<pid>.hitlist`` inside
            ``db.locals["db_path"] + "/hitlists/"``.
        query_debug: when equal to ``True``, the expanded terms are also
            copied to ``<filename>.terms`` through ``tee``.

    Returns:
        ``HitList.HitList(filename, words_per_hit, db)`` (in the caller).
    """
    # Flush before forking; presumably so buffered output is not emitted
    # a second time by the child processes.
    sys.stdout.flush()
    tstart = datetime.now()

    parsed = parse_query(terms)
    grouped = group_terms(parsed)
    split = split_terms(grouped)
    words_per_hit = len(split)

    # Bug fix: the working directory used to be bound only when no filename
    # was supplied, so an explicit filename made the forked child die with
    # UnboundLocalError at os.chdir().  It is now always defined.
    # NOTE(review): the hitlists directory is assumed to be the right
    # working directory for caller-supplied filenames too -- confirm.
    hitlist_dir = db.locals["db_path"] + "/hitlists/"
    if not filename:
        filename = hitlist_dir + str(os.getpid()) + ".hitlist"

    hl = open(filename, "w")
    err = open("/dev/null", "w")
    freq_file = db.locals["db_path"] + "/frequencies/normalized_word_frequencies"

    pid = os.fork()
    if pid != 0:
        # Caller: hand back a HitList over the file that is being filled.
        # NOTE(review): the intermediate child is never waited on here.
        hl.close()
        return HitList.HitList(filename, words_per_hit, db)

    # Everything below runs in a forked child, which must never return into
    # the caller's stack: any failure is reported and ends in os._exit().
    try:
        os.umask(0)
        os.chdir(hitlist_dir)
        os.setsid()
        if os.fork() > 0:
            # Intermediate child: exit at once so the worker is detached.
            os._exit(0)

        # now we're detached from the parent, and can do our work.
        print >> sys.stderr, "WORKER DETACHED at ", datetime.now() - tstart

        args = ["search4", db.path, "--limit", str(limit)]
        if corpus_file and corpus_size:
            args.extend(("--corpusfile", corpus_file, "--corpussize", str(corpus_size)))
        if method and method_arg:
            args.extend((method, str(method_arg)))

        worker = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=hl, stderr=err)
        print >> sys.stderr, "WORKER STARTED"

        # The comparison with True is kept deliberately: truthy values that
        # do not compare equal to True have never enabled term logging.
        if query_debug == True:
            print >> sys.stderr, "DEBUGGING"
            query_log_fh = filename + ".terms"
            print >> sys.stderr, "LOGGING to " + filename + ".terms"
            # tee copies the expanded terms to the .terms file and forwards
            # them to the search process.
            logger = subprocess.Popen(["tee", query_log_fh], stdin=subprocess.PIPE, stdout=worker.stdin)
            print >> sys.stderr, "EXPANDING"
            expand_query(split, freq_file, logger.stdin)
            logger.stdin.close()
        else:
            expand_query(split, freq_file, worker.stdin)
        worker.stdin.close()

        returncode = worker.wait()
        if returncode == -11:
            # Negative return codes mean "killed by signal"; 11 is SIGSEGV.
            print >> sys.stderr, "SEGFAULT"
            seg_flag = open(filename + ".error", "w")
            seg_flag.close()

        # Mark the query as finished and record the command that was run.
        flag = open(filename + ".done", "w")
        flag.write(" ".join(args) + "\n")
        flag.close()
    except Exception:
        import traceback
        traceback.print_exc()
        os._exit(1)
    os._exit(0)