def concordance_results(request, config):
    """Fetch concordance results for the current query.

    Parameters:
        request: WSGIHandler-style request object; read both as a mapping
            (request["q"], request["method"], ...) and via attributes
            (request.start, request.metadata, ...).
        config: WebConfig instance holding db_path and display settings
            (concordance_length, concordance_formatting_regex, ...).

    Returns:
        dict with "description", "query", "default_object", "results",
        "results_length" and "query_done" keys.
    """
    db = DB(config.db_path + "/data/")
    if request.collocation_type:
        # Concordance view of a collocation: combine the hits for the main
        # term with the hits for the term stored under request["left"].
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
    # page_interval clamps the requested window to the hitlist's real length.
    start, end, page_num = page_interval(request["results_per_page"], hits, request.start, request.end)
    concordance_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    # Pre-compile the optional display regexes once, outside the hit loop.
    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r"%s" % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1 : end]:  # start/end are 1-based; slicing is 0-based
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            # Apply each configured substitution, in order, to the extracted text.
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r"%s" % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes,
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object["results_length"] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
def term_group(environ, start_response):
    """WSGI endpoint: break the search query string into its term groups."""
    start_response(
        "200 OK",
        [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")],
    )
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    if not request["q"]:
        payload = {"original_query": "", "term_groups": []}
    else:
        # NOTE(review): the hitlist is never read here; the call is kept
        # because db.query presumably has side effects (launching the
        # search) -- confirm before removing.
        db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
        term_groups = []
        for grouped_terms in split_terms(group_terms(parse_query(request.q))):
            pieces = []
            negation_seen = False
            for kind, term in grouped_terms:
                if kind == "NOT":
                    # Emit the NOT separator only once per group.
                    if not negation_seen:
                        negation_seen = True
                        pieces.append(" NOT ")
                elif kind == "OR":
                    pieces.append("|")
                elif kind in ("TERM", "QUOTE"):
                    pieces.append(" %s " % term)
            term_groups.append("".join(pieces).strip())
        payload = {"term_groups": term_groups, "original_query": request.original_q}
    yield json.dumps(payload).encode("utf8")
def lookup_word_service(environ, start_response):
    """WSGI endpoint: look up the word selected by the user and stream the
    result as JSON.

    The byte window to search is derived either from a concordance hit
    (request.report == "concordance") or from a text object
    (request.report == "navigation").
    """
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        # Window is three times the configured concordance snippet length.
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes  # NOTE(review): shadows the builtin `bytes`
        hit_span = hit.bytes[-1] - hit.bytes[0]
        # Full window: context on each side plus the span of the hit itself.
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        # NOTE(review): any other report value leaves token/start_byte/
        # end_byte/filename unbound, so the yield below would raise
        # NameError -- confirm no other report values reach this endpoint.
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename).encode('utf8')
def bibliography_results(request, config):
    """Fetch bibliography results

    Returns a (bibliography_object, hits) tuple: a serializable result dict
    plus the underlying hitlist.
    """
    db = DB(config.db_path + "/data/")
    if request.no_metadata:
        hits = db.get_all(db.locals["default_object_level"], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    if (
        request.simple_bibliography == "all"
    ):  # request from simple landing page report which gets all biblio in load order
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        # Clamp the requested window to the hitlist's real length.
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {"start": start, "end": end, "n": page_num, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    results = []
    result_type = "doc"  # kept after the loop: reflects the last hit's type
    for hit in hits[start - 1 : end]:  # start/end are 1-based; slicing is 0-based
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "object_type": result_type,
                }
            )
        else:
            # Dictionary-style bibliography for sub-document objects: also
            # include the object's text content.
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "context": context,
                    "object_type": result_type,
                }
            )
    bibliography_object["results"] = results
    bibliography_object["results_length"] = len(hits)
    bibliography_object["query_done"] = hits.done
    bibliography_object["result_type"] = result_type
    return bibliography_object, hits
def kwic_results(request, config):
    """Fetch KWIC results"""
    database = DB(config.db_path + "/data/")
    hitlist = database.query(request["q"], request["method"], request["arg"], **request.metadata)
    # Clamp the requested page to the hitlist's actual length.
    first, last, _ = page_interval(request.results_per_page, hitlist, request.start, request.end)
    return {
        "description": {"start": first, "end": last, "results_per_page": request.results_per_page},
        "query": dict(i for i in request),
        "results": [kwic_hit_object(hit, config, database) for hit in hitlist[first - 1 : last]],
        "results_length": len(hitlist),
        "query_done": hitlist.done,
    }
def get_start_end_date(environ, start_response):
    """WSGI endpoint: resolve the requested date range and report the hit
    count within it as JSON."""
    start_response(
        "200 OK",
        [("Content-type", "text/html; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")],
    )
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db, config, start_date=request.start_date, end_date=request.end_date)
    # Search on the resolved year range rather than the raw start/end params.
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    payload = {"start_date": start_date, "end_date": end_date, "total_results": len(hits)}
    yield json.dumps(payload).encode("utf8")
def term_list(environ, start_response):
    """WSGI endpoint: return the expanded form of the first query term group
    as JSON."""
    response_headers = [
        ("Content-type", "application/json; charset=UTF-8"),
        ("Access-Control-Allow-Origin", "*"),
    ]
    start_response("200 OK", response_headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    yield json.dumps(get_expanded_query(hits)[0]).encode("utf8")
def generate_word_frequency(request, config):
    """Read through a hitlist, look up request["field"] on each hit, and
    tally how often each distinct value of that field occurs.

    Counting stops after ~5 seconds; progress is reported via "hits_done"
    and "more_results" so the client can resume.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    frequency_object = {}
    tally = {}
    started = timeit.default_timer()
    last_hit_done = request.start
    try:
        for hit in hits[request.start:]:
            value = get_word_attrib(hit, field, db)
            if not value:
                # NULL is a magic value for queries, don't change it recklessly.
                value = "NULL"
            tally[value] = tally.get(value, 0) + 1
            last_hit_done += 1
            if timeit.default_timer() - started > 5:
                # Time budget exhausted: stop and let the client ask for more.
                break
        frequency_object["results"] = {
            value: {
                "count": total,
                "url": make_absolute_query_link(
                    config,
                    request,
                    start="0",
                    end="0",
                    report="word_property_filter",
                    word_property=field,
                    word_property_value=value,
                ),
            }
            for value, total in tally.items()
        }
        frequency_object["hits_done"] = last_hit_done
        frequency_object["more_results"] = last_hit_done != len(hits)
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict(i for i in request)
    return frequency_object
def get_total_results(environ, start_response):
    """WSGI endpoint: report the total number of hits for a query as JSON."""
    start_response(
        '200 OK',
        [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")],
    )
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request.no_q:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    elif request.no_metadata:
        # No word query and no metadata: count every default-level object.
        hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    hits.finish()
    yield json.dumps(len(hits)).encode('utf8')
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and
    builds up a list of unique values and their frequencies.

    Stops after ~5 seconds and reports progress through "hits_done" and
    "more_results" so the client can resume from where it left off.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start :]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                # Time budget exhausted: stop and let the client ask for more.
                break
        table = {}
        for k, v in counts.items():
            # Each value links to a concordance filtered down to that value.
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}
        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        # Slicing past the end of the hitlist: report an empty result set.
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    return frequency_object
def term_list(environ, start_response):
    """WSGI endpoint: return the expanded form of the first group of query
    terms as JSON."""
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode("utf8")
def get_more_context(environ, start_response):
    """WSGI endpoint: return a wider concordance context (three times the
    configured length) for a single hit."""
    start_response(
        '200 OK',
        [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")],
    )
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    selected_hit = hits[int(request.hit_num)]
    wide_context = get_concordance_text(db, selected_hit, config.db_path, config['concordance_length'] * 3)
    yield json.dumps(wide_context).encode('utf8')
def get_total_results(environ, start_response):
    """WSGI endpoint: return the total number of hits for a query as JSON."""
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        # No word query: pure metadata (or full-database) count.
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)
    yield json.dumps(total_results).encode('utf8')
def get_more_context(environ, start_response):
    """WSGI endpoint: return a wider concordance context for a single hit.

    The context window is three times the configured concordance length.
    """
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path, context_size)
    yield json.dumps(hit_context).encode('utf8')
def get_start_end_date(environ, start_response):
    """WSGI endpoint: resolve the requested date range and return it along
    with the total number of hits inside that range."""
    status = "200 OK"
    headers = [("Content-type", "text/html; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db, config, start_date=request.start_date, end_date=request.end_date)
    # Search on the resolved year range rather than the raw start/end params.
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)
    yield json.dumps({"start_date": start_date, "end_date": end_date, "total_results": total_results}).encode("utf8")
def kwic_results(request, config):
    """Fetch KWIC results

    Builds one page of keyword-in-context entries plus pagination metadata.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    # Clamp the requested window to the hitlist's actual length.
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    kwic_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
    }
    kwic_object["results"] = []
    for hit in hits[start - 1 : end]:  # start/end are 1-based; slicing is 0-based
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object["results"].append(kwic_result)
    kwic_object["results_length"] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
def lookup_word_service(environ, start_response):
    """WSGI endpoint: look up the word selected by the user and stream the
    result as JSON.

    The byte window to search comes either from a concordance hit
    (request.report == "concordance") or from a text object
    (request.report == "navigation").
    """
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        # Window is three times the configured concordance snippet length.
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes  # NOTE(review): shadows the builtin `bytes`
        hit_span = hit.bytes[-1] - hit.bytes[0]
        # Full window: context on each side plus the span of the hit itself.
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        # NOTE(review): any other report value leaves token/start_byte/
        # end_byte/filename unbound, so the yield below would raise
        # NameError -- confirm no other report values reach this endpoint.
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename).encode('utf8')
def get_neighboring_words(environ, start_response):
    """WSGI endpoint: for each hit, return the ten words on either side.

    Streams a JSON object {"results": [...], "hits_done": n}. The loop stops
    once request.max_time seconds have elapsed so the client can issue a
    follow-up request starting at hits_done.
    """
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    try:
        index = int(request.hits_done)
    except (TypeError, ValueError):  # was a bare except: only int() failures are expected here
        index = 0
    max_time = int(request.max_time)
    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    cursor = db.dbh.cursor()
    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        # Parameterized for consistency with the left/right queries below
        # (the old code interpolated word_id directly into the SQL string).
        cursor.execute('select rowid, philo_name, parent from words where philo_id=? limit 1', (word_id,))
        results = cursor.fetchone()
        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()
        result_obj = {"left": "", "right": "", "index": index, "q": highlighted_text}
        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10
        # Ten words to the left: fetched in rowid order, then reversed so the
        # joined string reads left-to-right toward the hit.
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (left_rowid, results['rowid'] - 1))
        result_obj["left"] = []
        for i in cursor:
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])
        # Ten words to the right of the hit.
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (results['rowid'] + 1, right_rowid))
        result_obj["right"] = []
        for i in cursor:
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])
        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()
        kwic_words.append(result_obj)
        index += 1
        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than max_time seconds has been spent in the loop
            break
    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')
def filter_words_by_property(request, config):
    """Filter words by property

    Walks the hitlist, keeps only hits whose word attribute `word_property`
    equals `word_property_value`, and returns one page of concordance-style
    results for the matching hits.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}
    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]
    new_hitlist = []
    results = []
    position = 0
    more_pages = False
    if request.start == 0:
        start = 1
    else:
        start = request.start
    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)
        if hit_val == word_property_value:
            position += 1
            if position < start:
                # Matching hit, but before the requested page: skip it.
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals["metadata_fields"]:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1,
            }
            results.append(result_obj)
            if len(new_hitlist) == (request.results_per_page):
                # Page is full; signal that more pages may exist.
                more_pages = True
                break
    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        # Hitlist exhausted: the running total is exact.
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object["results"] = results
    concordance_object["query_done"] = hits.done
    concordance_object["results_length"] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages,
    }
    return concordance_object
def collocation_results(request, config):
    """Fetch collocation results.

    Runs the query twice: once to expand the search terms (so they can be
    filtered out of the collocate counts) and once with raw_results to walk
    every word occurrence. Collocates are counted per sentence, restricted to
    collocate_distance words around the hit when a distance is given.
    """
    db = DB(config.db_path + "/data/")
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}
    try:
        collocate_distance = int(request["collocate_distance"])
    except ValueError:  # Getting an empty string since the keyword is not specificed in the URL
        collocate_distance = None
    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object["filter_list"] = filter_list
    filter_list = set(filter_list)
    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', "")
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)
    # Re-run the query with raw results. BUG FIX: the no-distance branch used
    # "proxy" here while the initial query above used "cooc", so the two
    # passes could disagree; use "cooc" in both.
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], raw_results=True, **request.metadata)
    hits.finish()
    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {"count": 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            # philo_id of the hit word: first six components plus the word offset.
            word_id = " ".join([str(i) for i in hit[:6]]) + " " + str(hit[7])
            cursor.execute("select parent, rowid from words where philo_id=? limit 1", (word_id,))
            result = cursor.fetchone()
            parent = result["parent"]
            if parent != stored_sentence_id:
                # New sentence: rebuild the per-sentence collocate counts.
                rowid = int(result["rowid"])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    cursor.execute(
                        "select philo_name from words where parent=? and rowid between ? and ?",
                        (parent, begin_rowid, end_rowid),
                    )
                else:
                    cursor.execute("select philo_name from words where parent=?", (parent,))
                for i in cursor:
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            # Only count collocates occurring at least as often as the query
            # term has occurred in this sentence.
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]["count"] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in
            # seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object["hits_done"] = len(hits)
    collocation_object["collocates"] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object["more_results"] = True
        collocation_object["hits_done"] = hits_done
    else:
        collocation_object["more_results"] = False
        collocation_object["hits_done"] = collocation_object["results_length"]
    return collocation_object
def generate_time_series(request, config):
    """Compute hit counts per date interval for the time-series report.

    Returns a dict with per-interval absolute counts, per-interval total word
    counts, the total number of hits processed, and pagination info
    ("more_results"/"new_start_date") when the time budget ran out before all
    intervals were counted.
    """
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}
    # Invalid date range: return an empty result set immediately.
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object
    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )
    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))
    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}
        # Get date total count.
        # BUG FIX: `interval` is an int, so the old test `interval != "1"` was
        # always true and the single-year branch was unreachable; compare
        # against the integer instead.
        if interval != 1:
            end_range = start_range + (int(request["year_interval"]) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break
    time_series_object["results_length"] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = last_date_done + int(request.year_interval)
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}
    return time_series_object
def frequency_results(request, config, sorted_results=False):
    """reads through a hitlist. looks up request.frequency_field in each hit, and
    builds up a list of unique values and their frequencies.

    Parameters:
        request: WSGIHandler-style request object.
        config: WebConfig instance for the current database.
        sorted_results: when True, process the whole hitlist (no time budget)
            and return "results" as a list of (value, data) pairs sorted by
            descending count instead of a dict.

    Returns:
        dict with "results", "hits_done", "more_results", "results_length"
        and "query" keys.
    """
    db = DB(config.db_path + "/data/")
    biblio_search = False
    if request.q == "" and request.no_q:
        # Pure metadata (bibliography) search.
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals["default_object_level"], sort_order=["rowid"], raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"], raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
    if sorted_results is True:
        hits.finish()
    cursor = db.dbh.cursor()
    # NOTE(review): frequency_field is interpolated into the SQL; it cannot be
    # bound as a parameter (it is a column name) and is assumed to be a valid
    # metadata field -- confirm it is validated upstream.
    cursor.execute("select philo_id, %s from toms where %s is not null" % (request.frequency_field, request.frequency_field))
    # Map philo_id prefix (non-zero components only) -> metadata value.
    metadata_dict = {}
    for i in cursor:
        philo_id, field = i
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    # Number of philo_id components identifying each object type.
    obj_dict = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5, "sent": 6, "word": 7}
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div": handled via the div1/div2/div3 fallbacks below.
        pass
    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                # Raw word results: drop the sentence component, keep the word offset.
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                # Deepest div level carrying the field wins (later assignments overwrite).
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                while not key:
                    # Single-pass fallback: the while + break chain acts as if/elif.
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                        break
                    if philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                        break
                    break
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except KeyError:  # was a bare except; only a missing prefix is expected here
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {"count": 0, "metadata": {request.frequency_field: key}}
                counts[key]["url"] = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"%s"' % key},
                )
                if not biblio_search:
                    # Total word count for this value, used for relative frequencies.
                    query_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1
            # avoid timeouts by splitting the query if more than
            # request.max_time (in seconds) has been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and sorted_results is False:
                break
        frequency_object["results"] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            # All hits counted: add a "NULL" bucket for objects where the
            # field is unset.
            new_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == "" and request.no_q:
                new_hits = db.query(sort_order=["rowid"], raw_results=True, **new_metadata)
            else:
                new_hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"NULL"'},
                )
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {request.frequency_field: '"NULL"'},
                        "total_word_count": local_hits.get_total_word_count(),
                    }
                else:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {request.frequency_field: '"NULL"'},
                    }
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    if sorted_results is True:
        frequency_object["results"] = sorted(
            frequency_object["results"].items(), key=lambda x: x[1]["count"], reverse=True
        )
    return frequency_object
def generate_time_series(request, config):
    """Build a time series of hit counts bucketed by year interval.

    Splits the corpus date span into ``request.year_interval``-sized buckets,
    runs the query once per bucket, and records both the hit count and the
    total word count per bucket. If the loop runs longer than
    ``request.max_time`` seconds it stops early; ``new_start_date`` tells the
    caller where to resume.

    :param request: WSGIHandler-style request object (query terms, dates,
        interval, metadata).
    :param config: WebConfig with ``db_path`` and ``time_series_year_field``.
    :return: dict with ``results`` (absolute_count / date_count),
        ``results_length``, ``more_results`` and ``new_start_date``.
    """
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}

    # Invalid date range: bail out with an empty result set.
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object

    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )

    # Generate the (start_year, "start-end") bucket list.
    interval = int(request.year_interval)
    date_ranges = []
    # end_date + 1 makes sure the last date is included in the range below.
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    # Hoisted out of the loop: one cursor is enough for all buckets.
    cursor = db.dbh.cursor()
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}
        # Get the bucket's total word count.
        # BUG FIX: `interval` is an int, so the previous `interval != "1"`
        # comparison was always true and the single-year branch below was
        # unreachable. Compare against the integer 1 instead.
        if interval != 1:
            end_range = start_range + (interval - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # Avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop.
        if elapsed > int(max_time):
            break

    time_series_object["results_length"] = total_hits
    if (last_date_done + interval) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
    # Always reported so the client knows where a follow-up request resumes.
    time_series_object["new_start_date"] = last_date_done + interval
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}
    return time_series_object
def get_neighboring_words(environ, start_response):
    """WSGI generator yielding the words surrounding each query hit.

    For every hit past ``request.hits_done``, looks up the matched word's
    rowid in the ``words`` table and fetches the 10 words on each side, plus
    any metadata fields configured for KWIC sorting. Stops early after
    ``request.max_time`` seconds and reports ``hits_done`` so the client can
    resume from where it left off.

    :param environ: WSGI environment dict.
    :param start_response: WSGI start_response callable.
    :yields: a single UTF-8 JSON payload with ``results`` and ``hits_done``.
    """
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    try:
        index = int(request.hits_done)
    except (TypeError, ValueError):
        # Was a bare `except:`; only conversion failures should reset the index.
        index = 0
    max_time = int(request.max_time)
    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    cursor = db.dbh.cursor()
    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        # Parameterized to match the `rowid between ? and ?` queries below
        # and avoid interpolating the id straight into the SQL string.
        cursor.execute('select rowid, philo_name, parent from words where philo_id=? limit 1', (word_id,))
        results = cursor.fetchone()
        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()
        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }
        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10
        # Left context: rows come back in ascending rowid order, so reverse
        # to read outward from the hit before joining.
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (left_rowid, results['rowid'] - 1),
        )
        left_words = [row['philo_name'] for row in cursor]
        left_words.reverse()
        result_obj["left"] = ' '.join(left_words)
        # Right context: already in reading order.
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (results['rowid'] + 1, right_rowid),
        )
        result_obj["right"] = ' '.join(row['philo_name'] for row in cursor)
        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()
        kwic_words.append(result_obj)
        index += 1
        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:
            # Avoid timeouts by splitting the work across requests.
            break
    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')