Exemplo n.º 1
0
def query_lowlevel(db, param_dict, sort_order):
    """Select ``philo_id`` values from the ``toms`` table for a metadata query.

    ``param_dict`` maps a column name to an iterable of query strings.  Each
    query string is run through ``parse_query`` -> ``group_terms`` ->
    ``expand_grouped_query`` -> ``make_grouped_sql_clause``; the resulting
    clauses are parenthesised and ANDed together in the WHERE part.  When no
    clause is produced, every row of ``toms`` is selected.

    ``sort_order`` is a sequence of ORDER BY expressions; a falsy value
    falls back to ``["rowid"]``.

    Returns whatever ``db.dbh.execute`` returns.

    NOTE: clauses and sort expressions are spliced into the SQL text
    directly; the bind-parameter list handed to ``execute`` is always empty.
    When ``db.locals['debug']`` is true, each stage is traced on stderr.
    """
    bind_values = []
    where_parts = []
    for field, patterns in param_dict.items():
        # Per-column normalized frequency file used to expand the terms.
        freq_path = db.path + "/frequencies/normalized_" + field + "_frequencies"
        for pattern in patterns:
            tokens = parse_query(pattern)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_TOKENS:", tokens
            term_groups = group_terms(tokens)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_SYNTAX GROUPED:", term_groups
            expansion = expand_grouped_query(term_groups, freq_path)
            if db.locals['debug']:
                print >> sys.stderr, "METADATA_SYNTAX EXPANDED:", expansion
            clause = make_grouped_sql_clause(expansion, field)
            if db.locals['debug']:
                print >> sys.stderr, "SQL_SYNTAX:", clause
            where_parts.append(clause)

    if not sort_order:
        sort_order = ["rowid"]

    # Assemble the statement piecewise: SELECT [WHERE ...] order by ...;
    query = "SELECT philo_id FROM toms"
    if where_parts:
        query += " WHERE " + " AND ".join(["(%s)" % part for part in where_parts])
    query += " order by %s;" % ", ".join(sort_order)

    if db.locals['debug']:
        print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (query, bind_values), sort_order
    return db.dbh.execute(query, bind_values)
Exemplo n.º 2
0
def query_lowlevel(db, param_dict):
    """Select ``philo_id`` values from the ``toms`` table for a metadata query.

    ``param_dict`` maps a column name to an iterable of query strings.  Each
    query string is run through ``parse_query`` -> ``group_terms`` ->
    ``expand_grouped_query`` -> ``make_grouped_sql_clause``; the resulting
    clauses are parenthesised and ANDed together.  With no clauses, every
    row is selected.  Rows are always ordered by ``rowid``.

    Every intermediate stage and the final statement are traced on stderr.

    Returns whatever ``db.dbh.execute`` returns.

    NOTE: clauses are spliced into the SQL text directly; the bind-parameter
    list handed to ``execute`` is always empty.
    """
    bind_values = []
    where_parts = []
    # Idea noted by the original author: a `_philo_id` column could be
    # answered with a special-case query here.
    for field, patterns in param_dict.items():
        # Per-column normalized frequency file used to expand the terms.
        freq_path = db.path + "/frequencies/normalized_" + field + "_frequencies"
        for pattern in patterns:
            tokens = parse_query(pattern)
            print >> sys.stderr, "METADATA_TOKENS:", tokens
            term_groups = group_terms(tokens)
            print >> sys.stderr, "METADATA_SYNTAX:", term_groups
            expansion = expand_grouped_query(term_groups, freq_path)
            print >> sys.stderr, "METADATA_SYNTAX:", expansion
            clause = make_grouped_sql_clause(expansion, field)
            print >> sys.stderr, "SQL_SYNTAX:", clause
            where_parts.append(clause)

    # Assemble the statement piecewise: SELECT [WHERE ...] order by rowid;
    sql = "SELECT philo_id FROM toms"
    if where_parts:
        sql += " WHERE " + " AND ".join(["(%s)" % part for part in where_parts])
    sql += " order by rowid;"

    print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (sql, bind_values)
    return db.dbh.execute(sql, bind_values)
Exemplo n.º 3
0
def query_lowlevel(db, param_dict):
    """Select ``philo_id`` values from the ``toms`` table for a metadata query.

    ``param_dict`` maps a column name to an iterable of query strings.  Each
    query string is run through ``parse_query`` -> ``group_terms`` ->
    ``expand_grouped_query`` -> ``make_grouped_sql_clause``; the resulting
    clauses are parenthesised and ANDed together.  With no clauses, every
    row is selected.  Rows are always ordered by ``rowid``.

    The frequency files are looked up under ``db.locals["db_path"]``.
    Every intermediate stage and the final statement are traced on stderr.

    Returns whatever ``db.dbh.execute`` returns.

    NOTE: clauses are spliced into the SQL text directly; the bind-parameter
    list handed to ``execute`` is always empty.
    """
    bind_values = []
    where_parts = []
    # Idea noted by the original author: a `_philo_id` column could be
    # answered with a special-case query here.
    for field, patterns in param_dict.items():
        # Per-column normalized frequency file used to expand the terms.
        freq_path = (db.locals["db_path"] + "/frequencies/normalized_" +
                     field + "_frequencies")
        for pattern in patterns:
            tokens = parse_query(pattern)
            print >> sys.stderr, "METADATA_TOKENS:", tokens
            term_groups = group_terms(tokens)
            print >> sys.stderr, "METADATA_SYNTAX:", term_groups
            expansion = expand_grouped_query(term_groups, freq_path)
            print >> sys.stderr, "METADATA_SYNTAX:", expansion
            clause = make_grouped_sql_clause(expansion, field)
            print >> sys.stderr, "SQL_SYNTAX:", clause
            where_parts.append(clause)

    # Assemble the statement piecewise: SELECT [WHERE ...] order by rowid;
    sql = "SELECT philo_id FROM toms"
    if where_parts:
        sql += " WHERE " + " AND ".join(["(%s)" % part for part in where_parts])
    sql += " order by rowid;"

    print >> sys.stderr, "INNER QUERY: ", "%s %% %s" % (sql, bind_values)
    return db.dbh.execute(sql, bind_values)
Exemplo n.º 4
0
def query(db,terms,corpus_file=None,corpus_size=0,method=None,method_arg=None,limit=3000,filename="", query_debug=False):
    """Start a detached ``corpus_search`` worker for *terms* and return a HitList.

    The terms are parsed, grouped and split; the number of split groups is
    passed to the HitList as ``words_per_hit``.  A double fork detaches a
    grandchild process which:

    * runs ``corpus_search [-c corpus_file] [-m method -a method_arg]
      -o binary <db.path>`` with stdout written to the hitlist file and
      stderr sent to ``/dev/null``;
    * feeds it the expanded query (``expand_query_not``) through
      ``tee <filename>.terms`` so the expanded terms are also logged;
    * creates ``<filename>.error`` if the worker's return code is -11
      (reported as "SEGFAULT"), and always creates ``<filename>.done``
      containing the worker command line.

    Parameters:
        db: object exposing ``path``; passed through to the HitList.
        terms: query string handed to ``parse_query``.
        corpus_file: optional file passed to the worker with ``-c``.
        corpus_size: accepted but not used by this implementation.
        method, method_arg: passed as ``-m``/``-a`` when both are truthy.
        limit: accepted but not used by this implementation.
        filename: hitlist output path; defaults to
            ``<db.path>/hitlists/<pid>.hitlist``.
        query_debug: when true, extra progress messages go to stderr.

    Returns:
        ``HitList.HitList(filename, words_per_hit, db)``, created right after
        the fork, without waiting for the worker to finish.

    NOTE(review): an exception raised inside either child before
    ``os._exit`` would propagate into the caller's code in that child
    process; this is unchanged from the previous behaviour.
    """
    # Flush before forking (presumably so that buffered output is not
    # carried over into the child processes).
    sys.stdout.flush()
    tstart = datetime.now()

    split = split_terms(group_terms(parse_query(terms)))
    words_per_hit = len(split)

    hitlist_dir = db.path + "/hitlists/"
    if not filename:
        filename = hitlist_dir + str(os.getpid()) + ".hitlist"
    hl = open(filename, "w")
    err = open("/dev/null", "w")
    freq_file = db.path + "/frequencies/normalized_word_frequencies"
    if query_debug:
        print >> sys.stderr, "FORKING"

    pid = os.fork()
    if pid == 0:
        # First child: start a new session, fork again and exit, so the
        # grandchild is detached from the original process.
        os.umask(0)
        os.chdir(hitlist_dir)
        os.setsid()
        if os.fork() > 0:
            os._exit(0)

        # Grandchild: now we're detached from the parent, and can do our work.
        if query_debug:
            print >> sys.stderr, "WORKER DETACHED at ", datetime.now() - tstart
        args = ["corpus_search"]
        if corpus_file:
            args.extend(("-c", corpus_file))
        if method and method_arg:
            args.extend(("-m", method, "-a", str(method_arg)))
        args.extend(("-o", "binary", db.path))

        worker = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=hl, stderr=err)

        # Route the expanded query through `tee` so it is both logged to
        # <filename>.terms and delivered to the worker's stdin.
        query_log_fh = filename + ".terms"
        print >> sys.stderr, "LOGGING TERMS to " + filename + ".terms"
        logger = subprocess.Popen(["tee", query_log_fh], stdin=subprocess.PIPE, stdout=worker.stdin)
        expand_query_not(split, freq_file, logger.stdin)
        logger.stdin.close()

        # Drop our own handle on the worker's stdin so it can see EOF once
        # `tee` is done.
        worker.stdin.close()

        returncode = worker.wait()

        # A negative return code means the worker was killed by that signal.
        if returncode == -11:
            print >> sys.stderr, "SEGFAULT"
            seg_flag = open(filename + ".error", "w")
            seg_flag.close()
        # Mark the query as finished.
        flag = open(filename + ".done", "w")
        flag.write(" ".join(args) + "\n")
        flag.close()
        os._exit(0)

    # Parent: release our copies of the output handles.
    hl.close()
    err.close()
    # Reap the short-lived first child; without this every call leaves a
    # zombie behind for as long as the parent process lives.
    try:
        os.waitpid(pid, 0)
    except OSError:
        # Already reaped elsewhere (e.g. SIGCHLD ignored) or interrupted.
        pass
    return HitList.HitList(filename, words_per_hit, db)
Exemplo n.º 5
0
    return grep_proc

def invert_grep_exact(token, in_fh, dest_fh):
    """Spawn ``egrep -v`` to drop lines ending in a blank followed by the token.

    The first and last characters of ``token`` are discarded (presumably
    surrounding quote marks -- confirm against the caller) and the remainder
    is placed in the pattern unescaped.  No accent or case stripping is
    applied: this is the exact-match variant.

    The process reads from ``in_fh`` and writes to ``dest_fh``.  It is
    returned without being waited on, because its input is not ready yet
    when this function is called.
    """
    pattern = "[[:blank:]]%s$" % token[1:-1]
    argv = ["egrep", "-v", pattern]
    print >> sys.stderr, argv
    return subprocess.Popen(argv, stdin=in_fh, stdout=dest_fh)

if __name__ == "__main__":
    # Command-line smoke test: <script> DB_ROOT TERM [TERM ...]
    # Traces each parsing stage on stderr, writes the expanded query to
    # stdout, then runs a real query against DB_ROOT/data/ in debug mode.
    path = sys.argv[1]
    terms = sys.argv[2:]
    term_string = " ".join(terms)

    parsed = parse_query(term_string)
    print >> sys.stderr, "PARSED:", parsed
    grouped = group_terms(parsed)
    print >> sys.stderr, "GROUPED:", grouped
    split = split_terms(grouped)
    print >> sys.stderr, "parsed %d terms:" % len(split), split

    # Minimal stand-in carrying only the attributes set below.
    class Fake_DB: pass

    data_dir = path + "/data/"
    fake_db = Fake_DB()
    fake_db.path = data_dir
    fake_db.locals = {"db_path": data_dir}
    fake_db.encoding = "utf-8"

    freq_file = data_dir + "frequencies/normalized_word_frequencies"
    expand_query_not(split, freq_file, sys.stdout)
    hits = query(fake_db, term_string, query_debug=True)
Exemplo n.º 6
0
def query(db,
          terms,
          corpus_file=None,
          corpus_size=0,
          method=None,
          method_arg=None,
          limit=3000,
          filename="",
          query_debug=False):
    """Start a detached ``search4`` worker for *terms* and return a HitList.

    The terms are parsed, grouped and split; the number of split groups is
    passed to the HitList as ``words_per_hit``.  A double fork detaches a
    grandchild process which:

    * runs ``search4 <db.path> --limit <limit> [--corpusfile corpus_file
      --corpussize corpus_size] [method method_arg]`` with stdout written to
      the hitlist file and stderr sent to ``/dev/null``;
    * feeds it the expanded query (``expand_query``), routed through
      ``tee <filename>.terms`` when ``query_debug`` is true;
    * creates ``<filename>.error`` if the worker's return code is -11
      (reported as "SEGFAULT"), and always creates ``<filename>.done``
      containing the worker command line.

    Parameters:
        db: object exposing ``path`` and ``locals["db_path"]``; passed
            through to the HitList.
        terms: query string handed to ``parse_query``.
        corpus_file, corpus_size: passed to the worker when both are truthy.
        method, method_arg: appended to the worker command line when both
            are truthy.
        limit: value for the worker's ``--limit`` option.
        filename: hitlist output path; defaults to
            ``<db_path>/hitlists/<pid>.hitlist``.
        query_debug: when true, the expanded terms are also logged to
            ``<filename>.terms``.

    Returns:
        ``HitList.HitList(filename, words_per_hit, db)``, created right after
        the fork, without waiting for the worker to finish.

    NOTE(review): the worker is given ``db.path`` while the other paths use
    ``db.locals["db_path"]`` -- confirm both always agree.
    NOTE(review): an exception raised inside either child before
    ``os._exit`` would propagate into the caller's code in that child
    process; this is unchanged from the previous behaviour.
    """
    # Flush before forking (presumably so that buffered output is not
    # carried over into the child processes).
    sys.stdout.flush()
    tstart = datetime.now()

    split = split_terms(group_terms(parse_query(terms)))
    words_per_hit = len(split)

    hitlist_dir = db.locals["db_path"] + "/hitlists/"
    if not filename:
        filename = hitlist_dir + str(os.getpid()) + ".hitlist"
    hl = open(filename, "w")
    err = open("/dev/null", "w")
    freq_file = db.locals[
        "db_path"] + "/frequencies/normalized_word_frequencies"

    pid = os.fork()
    if pid == 0:
        # First child: start a new session, fork again and exit, so the
        # grandchild is detached from the original process.
        os.umask(0)
        os.chdir(hitlist_dir)
        os.setsid()
        if os.fork() > 0:
            os._exit(0)

        # Grandchild: now we're detached from the parent, and can do our work.
        print >> sys.stderr, "WORKER DETACHED at ", datetime.now() - tstart
        args = ["search4", db.path, "--limit", str(limit)]
        if corpus_file and corpus_size:
            args.extend(("--corpusfile", corpus_file, "--corpussize",
                         str(corpus_size)))
        if method and method_arg:
            args.extend((method, str(method_arg)))

        worker = subprocess.Popen(args,
                                  stdin=subprocess.PIPE,
                                  stdout=hl,
                                  stderr=err)
        print >> sys.stderr, "WORKER STARTED"
        if query_debug:
            # Route the expanded query through `tee` so it is both logged
            # to <filename>.terms and delivered to the worker's stdin.
            print >> sys.stderr, "DEBUGGING"
            query_log_fh = filename + ".terms"
            print >> sys.stderr, "LOGGING to " + filename + ".terms"
            logger = subprocess.Popen(["tee", query_log_fh],
                                      stdin=subprocess.PIPE,
                                      stdout=worker.stdin)
            print >> sys.stderr, "EXPANDING"
            expand_query(split, freq_file, logger.stdin)
            logger.stdin.close()
        else:
            expand_query(split, freq_file, worker.stdin)

        # Drop our own handle on the worker's stdin so it can see EOF.
        worker.stdin.close()

        returncode = worker.wait()

        # A negative return code means the worker was killed by that signal.
        if returncode == -11:
            print >> sys.stderr, "SEGFAULT"
            seg_flag = open(filename + ".error", "w")
            seg_flag.close()
        # Mark the query as finished.
        flag = open(filename + ".done", "w")
        flag.write(" ".join(args) + "\n")
        flag.close()
        os._exit(0)

    # Parent: release our copies of the output handles.
    hl.close()
    err.close()
    # Reap the short-lived first child; without this every call leaves a
    # zombie behind for as long as the parent process lives.
    try:
        os.waitpid(pid, 0)
    except OSError:
        # Already reaped elsewhere (e.g. SIGCHLD ignored) or interrupted.
        pass
    return HitList.HitList(filename, words_per_hit, db)