def generate_word_frequency(request, config):
    """Reads through a hitlist. Looks up request["field"] in each hit, and
    builds up a list of unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break
        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}
        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    return frequency_object
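# A minimal, self-contained sketch of the pattern used above: count values with
# a wall-clock budget and report how far we got, so a follow-up call can resume
# from "hits_done". The names here (_count_with_budget, toy_hits, budget) are
# illustrative only and not part of this module.
def _count_with_budget(toy_hits, start=0, budget=5.0):
    counts, done = {}, start
    begin = timeit.default_timer()
    for value in toy_hits[start:]:
        counts[value] = counts.get(value, 0) + 1
        done += 1
        if timeit.default_timer() - begin > budget:
            break  # partial results; the caller resumes at `done`
    return {"results": counts, "hits_done": done, "more_results": done < len(toy_hits)}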
def cite_linker(hit, citation_object, citation_hrefs, config, report):
    """Get links"""
    href = None
    if citation_object["link"]:
        if citation_object["object_level"] == "doc":
            if citation_object["field"] == "title" or citation_object["field"] == "filename":
                href = citation_hrefs["doc"]
            elif report == "bibliography" and citation_object["field"] == "head":
                href = make_absolute_object_link(config, hit.philo_id)
            else:
                params = [
                    ("report", "bibliography"),
                    (citation_object["field"], '"%s"' % hit[citation_object["field"]]),
                ]
                href = make_absolute_query_link(config, params)
        else:
            href = citation_hrefs[citation_object["object_level"]]
    return href
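# Illustration only: cite_linker quotes the metadata value so the bibliography
# report matches it exactly. Assuming make_absolute_query_link ultimately
# URL-encodes its (key, value) pairs, the generated href would look like the
# hypothetical helper below (base_url is made up for the example).
from urllib.parse import urlencode

def _example_biblio_href(base_url, field, value):
    # _example_biblio_href("https://example.org/query", "author", "Voltaire")
    # -> 'https://example.org/query?report=bibliography&author=%22Voltaire%22'
    return base_url + "?" + urlencode([("report", "bibliography"), (field, '"%s"' % value)])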
def frequency_results(request, config, sorted_results=False):
    """Reads through a hitlist. Looks up request.frequency_field in each hit, and
    builds up a list of unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    biblio_search = False
    if request.q == "" and request.no_q:
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals["default_object_level"], sort_order=["rowid"], raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"], raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
    if sorted_results is True:
        hits.finish()
    cursor = db.dbh.cursor()
    cursor.execute(
        "select philo_id, %s from toms where %s is not null"
        % (request.frequency_field, request.frequency_field)
    )
    # Map each object's philo_id (as a tuple of ints) to its metadata value
    metadata_dict = {}
    for philo_id, field in cursor:
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    obj_dict = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5, "sent": 6, "word": 7}
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div": handled by the fallback logic below
        pass
    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                while not key:
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                        break
                    if philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                        break
                    break
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except KeyError:
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {"count": 0, "metadata": {request.frequency_field: key}}
                counts[key]["url"] = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"%s"' % key},
                )
                if not biblio_search:
                    query_metadata = {k: v for k, v in request.metadata.items() if v}
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1
            # avoid timeouts by splitting the query if more than
            # request.max_time (in seconds) has been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and sorted_results is False:
                break
        frequency_object["results"] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            # Done with the whole hitlist: add the count of hits with no value
            # for the frequency field, reported under the magic "NULL" key.
            new_metadata = {k: v for k, v in request.metadata.items() if v}
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == "" and request.no_q:
                new_hits = db.query(sort_order=["rowid"], raw_results=True, **new_metadata)
            else:
                new_hits = db.query(
                    request["q"], request["method"], request["arg"], raw_results=True, **new_metadata
                )
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"NULL"'},
                )
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {request.frequency_field: '"NULL"'},
                        "total_word_count": local_hits.get_total_word_count(),
                    }
                else:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {request.frequency_field: '"NULL"'},
                    }
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    if sorted_results is True:
        frequency_object["results"] = sorted(
            frequency_object["results"].items(), key=lambda x: x[1]["count"], reverse=True
        )
    return frequency_object
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Format text objects"""
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # Insert an empty <philoHighlight/> marker at every hit offset
        new_text = b""
        last_offset = 0
        for b in byte_offsets:
            new_text += text[last_offset:b] + b"<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode("utf8", "ignore") + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "h1" or el.tag == "h2":
                el.tag = "b"
                el.attrib["class"] = "headword"
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            if el.tag == "page":
                el.tag = "pb"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib["class"] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib["class"] = "xml-q"
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace("#", "") + "-link-back"
                    # attributes for popover note
                    el.attrib["class"] = "note-ref"
                    el.attrib["tabindex"] = "0"
                    el.attrib["data-toggle"] = "popover"
                    el.attrib["data-container"] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        # fetchone() returns None when there is no match, hence TypeError
                        object_id = c.fetchone()[0]
                    except (IndexError, TypeError):
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    el.attrib["href"] = "navigate/%s" % "/".join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    metadata, metadata_value = el.attrib["target"].split(":")
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith("div"):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib["class"] = "xml-note"
                    link_back = etree.Element("a")
                    c.execute(
                        "select parent from refs where target=? and parent like ?",
                        (el.attrib["id"], str(philo_id[0]) + " %"),
                    )
                    object_id = c.fetchone()[0]
                    link_back.attrib["href"] = "navigate/%s%s" % (
                        "/".join([i for i in object_id.split() if i != "0"]),
                        "#%s-link-back" % el.attrib["id"],
                    )
                    link_back.attrib["class"] = "btn btn-xs btn-default link-back"
                    link_back.attrib["role"] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  # inline notes
                    el.tag = "span"
                    el.attrib["class"] = "note-content"
                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {
                                "class": "note",
                                "tabindex": "0",
                                "data-toggle": "popover",
                                "data-container": "body",
                                "data-placement": "right",
                                "data-trigger": "focus",
                            }
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                # parenthesized to keep the intended precedence: a URL root is always required
                if config.page_images_url_root and ("facs" in el.attrib or "id" in el.attrib):
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img.split()[0])
                    el.append(etree.Element("a"))
                    img_split = img.split()
                    el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    if len(img_split) == 2:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                    else:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        el[-1].attrib["target"] = "_blank"
                    else:
                        el[-1].attrib["class"] = "page-image-link"
                        el[-1].attrib["data-gallery"] = ""
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                grand_parent = el.getparent().getparent()
                if grand_parent.attrib["class"] == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_parent = el.getparent()
                    el_index = el_parent.index(el)
                    great_grand_parent.insert(grand_parent_index + 1, el)
                    # leave the tail where the element used to be, in its original parent
                    el_parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib["data-gallery"] = ""
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    del el.attrib["url"]
            elif el.tag == "ptr":
                if "facs" in el.attrib and config.page_images_url_root:
                    el.tag = "a"
                    el.attrib["href"] = os.path.join(config.page_images_url_root, el.attrib["facs"])
                    el.text = el.attrib["rend"]
                    el.attrib["external-img"] = ""
                    el.attrib["class"] = "external-img"
                    el.attrib["large-img"] = el.attrib["href"]
                    del el.attrib["rend"]
                    del el.attrib["facs"]
                    el.attrib["data-gallery"] = ""
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[: word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            import sys

            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode("utf8", "ignore")
    # remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", "\\1 ", output)
    output = convert_entities(output)
    if note:  # Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})
    # Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
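# Minimal sketch of the in-place tag rewriting performed above, runnable on its
# own (only lxml's etree, already used by this module, is required; the input
# fragment is made up). Renaming el.tag and setting el.attrib while iterating
# rewrites TEI-ish markup into plain HTML without rebuilding the tree.
def _example_tag_rewrite():
    fragment = etree.fromstring("<div><head>On Liberty</head><q>quoted text</q></div>")
    for el in fragment.iter():
        if el.tag == "head":
            el.tag = "b"
            el.attrib["class"] = "headword"
        elif el.tag == "q":
            el.tag = "span"
            el.attrib["class"] = "xml-q"
    return etree.tostring(fragment).decode("utf8")
    # -> '<div><b class="headword">On Liberty</b><span class="xml-q">quoted text</span></div>'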
def generate_time_series(request, config):
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}
    # Invalid date range
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object
    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )
    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure the last date gets included in the loop below by adding one to the last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))
    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    cursor = db.dbh.cursor()
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}
        # Get the total count for the date range: words for word queries,
        # top-level objects for metadata-only queries
        if interval != 1:
            end_range = start_range + (interval - 1)
            if request.q:
                query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                    config.time_series_year_field,
                    start_range,
                    end_range,
                )
            else:
                query = f"SELECT COUNT(*) FROM toms WHERE philo_type='{db.locals.default_object_level}' AND {config.time_series_year_field} BETWEEN {start_range} AND {end_range}"
        else:
            if request.q:
                query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
            else:
                query = f"SELECT COUNT(*) FROM toms WHERE philo_type='{db.locals.default_object_level}' AND {config.time_series_year_field}='{start_range}'"
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break
    time_series_object["results_length"] = total_hits
    if (last_date_done + interval) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
    time_series_object["new_start_date"] = last_date_done + interval
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}
    return time_series_object
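# Self-contained sketch of the date bucketing above: ranging to end_date + 1
# guarantees the final (possibly short) interval is emitted, and each bucket
# is labeled "start-end" with the end clamped to end_date. Toy values below.
def _example_date_ranges(start_date=1700, end_date=1725, interval=10):
    ranges = []
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        ranges.append((start, "%d-%d" % (start, end)))
    return ranges  # -> [(1700, '1700-1709'), (1710, '1710-1719'), (1720, '1720-1725')]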