def generate_word_frequency(request, config):
    """Tally the values of one word attribute across the hits of a query.

    Runs the query described by ``request`` and, starting at hit number
    ``request.start``, looks up ``request["field"]`` for each hit through
    ``get_word_attrib``. The loop stops once more than five seconds have been
    spent in it; ``hits_done`` records how far it got.

    Returns a dict with ``results`` (attribute value -> ``{"count", "url"}``),
    ``hits_done``, ``more_results``, ``results_length`` and ``query``. When an
    ``IndexError`` is raised while processing, ``results`` is empty,
    ``more_results`` is False and ``hits_done`` is not set.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    tally = {}
    frequency_object = {}
    loop_started = timeit.default_timer()
    hits_seen = request.start
    try:
        for hit in hits[request.start:]:
            # NULL is a magic value for queries, don't change it
            # recklessly.
            value = get_word_attrib(hit, field, db) or "NULL"
            tally[value] = tally.get(value, 0) + 1
            hits_seen += 1
            if timeit.default_timer() - loop_started > 5:
                break

        frequency_object["results"] = {
            value: {
                "count": total,
                "url": make_absolute_query_link(
                    config,
                    request,
                    start="0",
                    end="0",
                    report="word_property_filter",
                    word_property=field,
                    word_property_value=value,
                ),
            }
            for value, total in tally.items()
        }
        frequency_object["hits_done"] = hits_seen
        frequency_object["more_results"] = not (hits_seen == len(hits))
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict(pair for pair in request)

    return frequency_object
def generate_word_frequency(request, config):
    """Build a frequency table of a word attribute over a hitlist.

    The hits of the query in ``request`` are read from position
    ``request.start`` onwards. For every hit the attribute named by
    ``request["field"]`` is fetched with ``get_word_attrib`` and counted.
    Reading stops after more than five seconds in the loop, in which case
    ``more_results`` is True and ``hits_done`` gives the resume position.

    The returned dict holds ``results`` (value -> count and filter URL),
    ``hits_done``, ``more_results``, ``results_length`` and ``query``. On an
    ``IndexError`` only an empty ``results`` and ``more_results`` = False are
    set before the last two keys are added.
    """
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    occurrences = {}
    frequency_object = {}
    clock_start = timeit.default_timer()
    position = request.start
    try:
        for word_hit in hits[request.start:]:
            attribute = get_word_attrib(word_hit, field, db)
            # NULL is a magic value for queries, don't change it
            # recklessly.
            attribute = attribute if attribute else "NULL"
            occurrences.setdefault(attribute, 0)
            occurrences[attribute] += 1
            out_of_time = timeit.default_timer() - clock_start > 5
            position += 1
            if out_of_time:
                break

        rows = {}
        for attribute, seen in occurrences.items():
            link_options = {
                "start": "0",
                "end": "0",
                "report": "word_property_filter",
                "word_property": field,
                "word_property_value": attribute,
            }
            rows[attribute] = {
                "count": seen,
                "url": make_absolute_query_link(config, request, **link_options),
            }

        frequency_object["results"] = rows
        frequency_object["hits_done"] = position
        frequency_object["more_results"] = False if position == len(hits) else True

    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict(item for item in request)

    return frequency_object
# Example #3
# 0
def cite_linker(hit, citation_object, citation_hrefs, config, report):
    """Return the link target for one citation element, or None.

    No link is produced when ``citation_object["link"]`` is falsy. Elements
    below document level use the pre-built href for their object level.
    Document-level elements use the document href for the ``title`` and
    ``filename`` fields, an object link for ``head`` in a bibliography
    report, and otherwise a bibliography query on the quoted field value.
    """
    if not citation_object["link"]:
        return None

    level = citation_object["object_level"]
    if level != "doc":
        return citation_hrefs[level]

    field = citation_object["field"]
    if field in ("title", "filename"):
        return citation_hrefs['doc']
    if report == "bibliography" and field == "head":
        return make_absolute_object_link(config, hit.philo_id)

    quoted_value = '"%s"' % hit[field]
    return make_absolute_query_link(config, [("report", "bibliography"), (field, quoted_value)])
# Example #4
# 0
def cite_linker(hit, citation_object, citation_hrefs, config, report):
    """Pick the href for a citation element.

    Returns None when the element is not flagged as a link. For document
    level elements the href depends on the field: ``title``/``filename`` map
    to the document href, ``head`` in a bibliography report maps to an object
    link for the hit, and any other field becomes a bibliography query on
    that field's quoted value. All other levels use the href already
    computed for that level in ``citation_hrefs``.
    """
    if not citation_object["link"]:
        return None

    object_level = citation_object["object_level"]
    if object_level == "doc":
        field_name = citation_object["field"]
        points_at_document = field_name == "title" or field_name == "filename"
        if points_at_document:
            link = citation_hrefs["doc"]
        elif report == "bibliography" and field_name == "head":
            link = make_absolute_object_link(config, hit.philo_id)
        else:
            query_params = [("report", "bibliography")]
            query_params.append((field_name, '"%s"' % hit[field_name]))
            link = make_absolute_query_link(config, query_params)
    else:
        link = citation_hrefs[object_level]
    return link
# Example #5
# 0
def frequency_results(request, config, sorted_results=False):
    """Count hits by the value of ``request.frequency_field``.

    Reads through a hitlist, looks up the frequency field for each hit and
    builds up the unique values and their frequencies. When ``request.q`` is
    empty and ``request.no_q`` is set, the hitlist is a bibliographic one
    (all objects, or the objects matching ``request.metadata``) instead of
    the result of a word query.

    Args:
        request: query request. It is read by attribute (``q``, ``no_q``,
            ``no_metadata``, ``metadata``, ``frequency_field``, ``start``,
            ``report``), by key (``"q"``, ``"method"``, ``"arg"``) and is
            iterated as (key, value) pairs to echo the query back.
        config: configuration object; ``db_path`` locates the database.
        sorted_results: when exactly ``True``, ``hits.finish()`` is called up
            front, the five second time limit is not applied, and
            ``results`` is returned as a list of ``(value, entry)`` pairs
            ordered by descending count.

    Returns:
        dict with ``results`` (value -> entry with ``count``, ``metadata``,
        ``url`` and, for word queries, ``total_word_count``), ``hits_done``,
        ``more_results``, ``results_length`` and ``query``. If an
        ``IndexError`` is raised while reading hits, ``results`` is empty,
        ``more_results`` is False and ``hits_done`` is not set.

    Raises:
        KeyError: if ``request.frequency_field`` is not a key of
            ``db.locals["metadata_types"]``.
    """
    db = DB(config.db_path + "/data/")
    biblio_search = bool(request.q == "" and request.no_q)
    if biblio_search:
        if request.no_metadata:
            hits = db.get_all(db.locals["default_object_level"],
                              sort_order=["rowid"],
                              raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"],
                            raw_results=True,
                            **request.metadata)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        raw_results=True,
                        **request.metadata)

    if sorted_results is True:
        hits.finish()

    frequency_field = request.frequency_field
    # The field name comes from the request and is interpolated into SQL
    # below (column names cannot be bound as parameters). Resolving it
    # against the database's declared metadata fields first means an unknown
    # name raises KeyError before any SQL is built from it.
    metadata_type = db.locals["metadata_types"][frequency_field]

    cursor = db.dbh.cursor()
    cursor.execute("select philo_id, %s from toms where %s is not null" %
                   (frequency_field, frequency_field))
    metadata_dict = {}
    for philo_id, field in cursor:
        # Key on the philo_id with its zero components dropped.
        metadata_dict[tuple(int(s) for s in philo_id.split() if int(s))] = field

    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    obj_dict = {
        "doc": 1,
        "div1": 2,
        "div2": 3,
        "div3": 4,
        "para": 5,
        "sent": 6,
        "word": 7
    }
    # None when the type has no fixed depth (e.g. metadata_type == "div").
    object_level = obj_dict.get(metadata_type)

    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                # The deepest div level that carries a value wins.
                key = ""
                for div in ("div1", "div2", "div3"):
                    ancestor = philo_id[:obj_dict[div]]
                    if ancestor in metadata_dict:
                        key = metadata_dict[ancestor]
                if not key:
                    # Fallback lookups, tried in this order.
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                    elif philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                if not key:
                    last_hit_done += 1
                    continue
            else:
                if object_level is None:
                    # Unknown object type: there is no prefix to look up, so
                    # the hit is skipped (previously an unbound local hidden
                    # by a bare except).
                    last_hit_done += 1
                    continue
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except (KeyError, TypeError):
                    # No value recorded for this object (or an unhashable
                    # raw id): the hit does not count towards any value.
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {
                    "count": 0,
                    "metadata": {
                        frequency_field: key
                    }
                }
                counts[key]["url"] = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{frequency_field: '"%s"' % key})
                if not biblio_search:
                    query_metadata = {
                        k: v for k, v in request.metadata.items() if v
                    }
                    query_metadata[frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1

            # avoid timeouts by splitting the query if more than
            # five seconds have been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and sorted_results is False:
                break

        frequency_object["results"] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            # Every hit has been read: add the count of hits whose field is
            # NULL, which the loop above cannot see.
            new_metadata = {k: v for k, v in request.metadata.items() if v}
            new_metadata[frequency_field] = '"NULL"'
            if biblio_search:
                new_hits = db.query(sort_order=["rowid"],
                                    raw_results=True,
                                    **new_metadata)
            else:
                new_hits = db.query(request["q"],
                                    request["method"],
                                    request["arg"],
                                    raw_results=True,
                                    **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_entry = {
                    "count": len(new_hits),
                    "url": make_absolute_query_link(
                        config,
                        request,
                        frequency_field="",
                        start="0",
                        end="0",
                        report=request.report,
                        script="",
                        **{frequency_field: '"NULL"'}),
                    "metadata": {
                        frequency_field: '"NULL"'
                    },
                }
                if not biblio_search:
                    # Only word queries report a total word count.
                    local_hits = db.query(**new_metadata)
                    null_entry["total_word_count"] = local_hits.get_total_word_count()
                frequency_object["results"]["NULL"] = null_entry
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    if sorted_results is True:
        frequency_object["results"] = sorted(
            frequency_object["results"].items(),
            key=lambda x: x[1]["count"],
            reverse=True)

    return frequency_object
# Example #6
# 0
def frequency_results(request, config, sorted=False):
    """Count hits by the value of ``request.frequency_field``.

    Reads through a hitlist, looks up the frequency field for each hit and
    builds up the unique values and their frequencies. When ``request.q`` is
    empty and ``request.no_q`` is set, the hitlist is a bibliographic one
    (all objects, or the objects matching ``request.metadata``) instead of
    the result of a word query.

    Args:
        request: query request. It is read by attribute (``q``, ``no_q``,
            ``no_metadata``, ``metadata``, ``frequency_field``, ``start``,
            ``report``), by key (``"q"``, ``"method"``, ``"arg"``) and is
            iterated as (key, value) pairs to echo the query back.
        config: configuration object; ``db_path`` locates the database.
        sorted: when truthy, ``hits.finish()`` is called up front, the five
            second time limit is not applied, and ``results`` is returned as
            a list of ``(value, entry)`` pairs ordered by descending count.
            The name shadows the ``sorted`` builtin inside this function,
            which is why the final ordering uses ``list.sort``.

    Returns:
        dict with ``results`` (value -> entry with ``count``, ``metadata``,
        ``url`` and, for word queries, ``total_word_count``), ``hits_done``,
        ``more_results``, ``results_length`` and ``query``. If an
        ``IndexError`` is raised while reading hits, ``results`` is empty,
        ``more_results`` is False and ``hits_done`` is not set.

    Raises:
        KeyError: if ``request.frequency_field`` is not a key of
            ``db.locals["metadata_types"]``.
    """
    db = DB(config.db_path + '/data/')
    biblio_search = bool(request.q == '' and request.no_q)
    if biblio_search:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], sort_order=["rowid"], raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"], raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)

    if sorted:
        hits.finish()

    frequency_field = request.frequency_field
    # The field name comes from the request and is interpolated into SQL
    # below (column names cannot be bound as parameters). Resolving it
    # against the database's declared metadata fields first means an unknown
    # name raises KeyError before any SQL is built from it.
    metadata_type = db.locals["metadata_types"][frequency_field]

    c = db.dbh.cursor()
    c.execute('select philo_id, %s from toms where %s is not null' % (frequency_field, frequency_field))
    metadata_dict = {}
    for philo_id, field in c.fetchall():
        # Key on the philo_id with its zero components dropped.
        metadata_dict[tuple(int(s) for s in philo_id.split() if int(s))] = field

    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    obj_dict = {'doc': 1, 'div1': 2, 'div2': 3, 'div3': 4, 'para': 5, 'sent': 6, 'word': 7}
    # None when the type has no fixed depth (e.g. metadata_type == "div").
    object_level = obj_dict.get(metadata_type)

    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                # The deepest div level that carries a value wins.
                key = ""
                for div in ("div1", "div2", "div3"):
                    ancestor = philo_id[:obj_dict[div]]
                    if ancestor in metadata_dict:
                        key = metadata_dict[ancestor]
                if not key:
                    # Fallback lookups, tried in this order.
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                    elif philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                if not key:
                    last_hit_done += 1
                    continue
            else:
                if object_level is None:
                    # Unknown object type: there is no prefix to look up, so
                    # the hit is skipped (previously an unbound local hidden
                    # by a bare except).
                    last_hit_done += 1
                    continue
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except (KeyError, TypeError):
                    # No value recorded for this object (or an unhashable
                    # raw id): the hit does not count towards any value.
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {"count": 0, 'metadata': {frequency_field: key}}
                counts[key]["url"] = make_absolute_query_link(config,
                                                              request,
                                                              frequency_field="",
                                                              start="0",
                                                              end="0",
                                                              report=request.report,
                                                              script='',
                                                              **{frequency_field: '"%s"' % key})
                if not biblio_search:
                    # .items() exists on both Python 2 and 3 dicts, unlike
                    # .iteritems().
                    query_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
                    query_metadata[frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1

            # avoid timeouts by splitting the query if more than five
            # seconds have been spent in the loop; a sorted result has to
            # cover every hit, so the limit is not applied to it
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and not sorted:
                break

        frequency_object['results'] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            # Every hit has been read: add the count of hits whose field is
            # NULL, which the loop above cannot see.
            new_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
            new_metadata[frequency_field] = '"NULL"'
            if biblio_search:
                new_hits = db.query(sort_order=["rowid"], raw_results=True, **new_metadata)
            else:
                new_hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_entry = {"count": len(new_hits),
                              "url": make_absolute_query_link(config,
                                                              request,
                                                              frequency_field="",
                                                              start="0",
                                                              end="0",
                                                              report=request.report,
                                                              script='',
                                                              **{frequency_field: '"NULL"'}),
                              "metadata": {frequency_field: '"NULL"'}}
                if not biblio_search:
                    # Only word queries report a total word count.
                    local_hits = db.query(**new_metadata)
                    null_entry["total_word_count"] = local_hits.get_total_word_count()
                frequency_object["results"]["NULL"] = null_entry
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True
    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False
    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])

    if sorted:
        # The builtin sorted() is shadowed by the parameter here, so order
        # the (value, entry) pairs with list.sort instead.
        ranked = list(frequency_object['results'].items())
        ranked.sort(key=lambda x: x[1]['count'], reverse=True)
        frequency_object["results"] = ranked

    return frequency_object
# Example #7
# 0
def generate_time_series(request, config):
    """Count the hits of a query per date range.

    The span between the start and end dates (as resolved by
    ``get_start_end_date``) is cut into ranges of ``request.year_interval``
    years. For each range the query is run restricted to that range and the
    summed ``word_count`` of the ``toms`` rows in the range is fetched. The
    loop stops early once more than ``request.max_time`` seconds (10 when
    unset) have elapsed.

    Args:
        request: query request. It is read by attribute (``start_date``,
            ``end_date``, ``year_interval``, ``max_time``, ``metadata``), by
            key (``"q"``, ``"method"``, ``"arg"``) and iterated as
            (key, value) pairs. ``request.metadata`` is updated in place
            with the date range being queried.
        config: configuration object; provides ``db_path`` and
            ``time_series_year_field``.

    Returns:
        dict with ``query``, ``query_done``, ``results_length``,
        ``more_results`` and ``results`` (``absolute_count`` and
        ``date_count``, both keyed by the first year of each range).
        ``new_start_date`` is present when ``more_results`` is True, and is
        0 for an invalid date range.
    """
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}

    # Invalid date range: answer before opening the database, which is not
    # needed for an empty result.
    if request.start_date == 'invalid' or request.end_date == 'invalid':
        time_series_object['results_length'] = 0
        time_series_object['more_results'] = False
        time_series_object['new_start_date'] = 0
        time_series_object['results'] = {'absolute_count': {}, 'date_count': {}}
        return time_series_object

    db = DB(config.db_path + '/data/')
    start_date, end_date = get_start_end_date(db,
                                              config,
                                              start_date=request.start_date or None,
                                              end_date=request.end_date or None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        date_ranges.append((start, "%d-%d" % (start, end)))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = int(request.max_time or 10)
    year_field = config.time_series_year_field
    c = db.dbh.cursor()
    for start_range, date_range in date_ranges:
        request.metadata[year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count. ``interval`` is an int, so it has to be
        # compared with the int 1 for the single-year query to be used.
        if interval != 1:
            end_range = start_range + (interval - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (year_field, start_range,
                                                                                         end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (year_field, start_range)

        c.execute(query)
        date_counts[start_range] = c.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > max_time:
            break

    time_series_object['results_length'] = total_hits
    # A range starting exactly at end_date is part of date_ranges, so the
    # series is only complete once the next start would fall past end_date.
    next_start_date = last_date_done + interval
    if next_start_date > end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
        time_series_object['new_start_date'] = next_start_date
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}

    return time_series_object
# Example #8
# 0
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Format text objects: turn the raw XML bytes of an object into HTML.

    The bytes are wrapped in a ``<div>``, parsed with ``FragmentParserParse``
    and every element is rewritten in place: known TEI-style tags are mapped
    to HTML tags and CSS classes, references and notes become links or
    popovers, page breaks and graphics get image links, and any tag not in
    ``VALID_HTML_TAGS`` is passed to ``xml_to_html_class``. An exception
    raised while rewriting one element is printed to stderr and the loop
    moves on to the next element.

    Args:
        obj: text object; ``philo_id`` and ``db.dbh`` are read from it.
        text: the object's content as bytes.
        config: configuration object (``page_images_url_root``,
            ``page_image_extension``, ``external_page_images``).
        request: query request, passed on when building note links.
        word_regex: pattern matched against the text following each
            highlight marker to find the word to highlight.
        byte_offsets: optional byte positions in ``text`` at which a
            ``<philoHighlight/>`` marker is inserted.
        note: True when formatting the content of a note; no images are
            collected in that case.
        images: when False, no images are collected.

    Returns:
        ``(output, images)`` where ``output`` is the HTML string and
        ``images`` is ``{}`` for notes or when ``images`` is False, and
        otherwise whatever ``page_images`` returns as its second value.
    """
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # Splice a highlight marker in at every offset; joining once avoids
        # rebuilding the growing bytes object on each iteration.
        pieces = []
        last_offset = 0
        for b in byte_offsets:
            pieces.append(text[last_offset:b])
            pieces.append(b"<philoHighlight/>")
            last_offset = b
        pieces.append(text[last_offset:])
        text = b"".join(pieces)
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    # fetchone() returns None (not an empty row) when the
                    # target is unknown; degrade to a plain span then.
                    row = c.fetchone()
                    if row is None:
                        el.tag = "span"
                        continue
                    object_id = row[0]
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    row = c.fetchone()
                    # Without a referencing object there is nothing to link
                    # back to; the note itself is still rendered.
                    if row is not None:
                        object_id = row[0]
                        link_back = etree.Element("a")
                        link_back.attrib['href'] = 'navigate/%s%s' % (
                            '/'.join([i for i in object_id.split() if i != "0"]),
                            '#%s-link-back' % el.attrib['id'])
                        link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                        link_back.attrib['role'] = "button"
                        link_back.text = "Go back to text"
                        el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        note_content(child)
                    # insert an anchor right before this element in its parent
                    parent = el.getparent()
                    attribs = {"class": "note",
                               "tabindex": "0",
                               "data-toggle": "popover",
                               "data-container": "body",
                               "data-placement": "right",
                               "data-trigger": "focus"}
                    new_anchor = etree.Element("a", attrib=attribs)
                    new_anchor.text = "note"
                    parent.insert(parent.index(el), new_anchor)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                if config.page_images_url_root:
                    if "facs" in el.attrib or "id" in el.attrib:
                        if "facs" in el.attrib:
                            img = el.attrib["facs"]
                        else:
                            img = el.attrib["id"]
                        img_split = img.split()
                        current_obj_img.append(img_split[0])
                        el.append(etree.Element("a"))
                        el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        # A second name, when present, is the large version.
                        if len(img_split) == 2:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                        else:
                            el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                        el[-1].text = "[page " + el.attrib["n"] + "]"
                        if config.external_page_images:
                            el[-1].attrib["target"] = "_blank"
                        else:
                            el[-1].attrib['class'] = "page-image-link"
                            el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                # Remember the current parent before the element is moved.
                parent = el.getparent()
                grand_parent = parent.getparent()
                if grand_parent is not None and grand_parent.attrib.get("class") == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = parent.index(el)
                    great_grand_parent.insert(grand_parent_index + 1, el)
                    # The text that followed the page break stays where the
                    # page break used to be.
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # Not every graphic carries a url attribute.
                    el.attrib.pop("url", None)
            elif el.tag == "philoHighlight":
                # A marker at the very end of its parent has no tail text.
                following = el.tail or ""
                word_match = re.match(word_regex, following, re.U)
                if word_match:
                    el.text = following[:word_match.end()]
                    el.tail = following[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            # Best effort: one malformed element must not abort the page.
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
# Example #9
# 0
def format_text_object(obj, text, config, request, word_regex, byte_offsets=None, note=False, images=True):
    """Turn the raw XML bytes of a text object into an HTML fragment.

    The text is wrapped in a ``<div>``, parsed with ``FragmentParserParse`` and
    every element is rewritten in place: source tags are renamed to HTML tags,
    note references and notes get popover/link-back markup, page breaks and
    graphics get image links, and highlight markers are expanded to wrap the
    word that follows them.

    Args:
        obj: text object; ``obj.philo_id`` and ``obj.db.dbh`` are read.
        text: raw bytes of the object (decoded as UTF-8, invalid bytes ignored).
        config: web config; the page image settings are read from it.
        request: current request, forwarded to ``make_absolute_query_link``.
        word_regex: pattern matched (with ``re.U``) against the text following
            each highlight marker to find the word to highlight.
        byte_offsets: optional byte positions in ``text``; an empty
            ``<philoHighlight/>`` marker is inserted at each one.
        note: when true the text is rendered as a footnote and no images are
            collected.
        images: when false no page images are collected.

    Returns:
        A ``(html, images)`` pair. ``images`` is ``{}`` when ``note`` is true
        or ``images`` is false, otherwise whatever ``page_images`` returns.

    Errors raised while processing a single element are printed to stderr and
    the element is left in whatever state it had reached (best effort).
    """
    philo_id = obj.philo_id
    if byte_offsets is not None:
        # Splice a highlight marker in front of every hit offset.
        pieces = []
        last_offset = 0
        for b in byte_offsets:
            pieces.append(text[last_offset:b])
            pieces.append(b"<philoHighlight/>")
            last_offset = b
        pieces.append(text[last_offset:])
        text = b"".join(pieces)
    current_obj_img = []
    current_graphic_img = []
    text = "<div>" + text.decode('utf8', 'ignore') + "</div>"
    xml = FragmentParserParse(text)
    c = obj.db.dbh.cursor()
    for el in xml.iter():
        try:
            if el.tag.startswith("DIV"):
                el.tag = el.tag.lower()
            if el.tag == "h1" or el.tag == "h2":
                el.tag = "b"
                el.attrib["class"] = "headword"
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            # NOTE(review): a <page> renamed to <pb> here does not reach the
            # "pb" branch further down (same if/elif chain) -- confirm intended.
            if el.tag == "page":
                el.tag = "pb"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "table":
                el.tag = "span"
                el.attrib["class"] = "xml-table"
            elif el.tag == "ref" or el.tag == "xref":
                if el.attrib["type"] == "note" or el.attrib["type"] == "footnote":
                    target = el.attrib["target"]
                    link = make_absolute_query_link(config, request, script_name="/scripts/get_notes.py", target=target)
                    if "n" in el.attrib:
                        el.text = el.attrib["n"]
                    else:
                        el.text = "*"
                    if el.text == "":
                        el.text = "*"
                    el.tag = "span"
                    el.attrib["data-ref"] = link
                    el.attrib["id"] = target.replace('#', '') + '-link-back'
                    # attributes for popover note
                    el.attrib['class'] = "note-ref"
                    el.attrib['tabindex'] = "0"
                    el.attrib['data-toggle'] = "popover"
                    el.attrib['data-container'] = "body"
                    el.attrib["data-placement"] = "right"
                    el.attrib["data-trigger"] = "focus"
                    el.attrib["data-html"] = "true"
                    el.attrib["data-animation"] = "true"
                elif el.attrib["type"] == "cross":
                    c.execute("SELECT philo_id FROM toms WHERE id=? LIMIT 1", (el.attrib["target"],))
                    try:
                        object_id = c.fetchone()[0]
                    except (IndexError, TypeError):
                        # fetchone() returns None when no row matches, so the
                        # subscript raises TypeError rather than IndexError.
                        el.tag = "span"
                        continue
                    el.tag = "a"
                    el.attrib["href"] = 'navigate/%s' % '/'.join([i for i in object_id.split() if i != "0"])
                    el.attrib["class"] = "xml-ref-cross"
                    del el.attrib["target"]
                elif el.attrib["type"] == "search":
                    metadata, metadata_value = el.attrib["target"].split(':')
                    params = {metadata: metadata_value, "report": "bibliography"}
                    el.tag = "a"
                    el.attrib["href"] = make_absolute_query_link(config, [], **params)
                    del el.attrib["target"]
            elif el.tag == "note":
                # endnotes: a note is an end note when one of its div ancestors
                # has type="notes"
                in_end_note = False
                for ancestor in el.iterancestors():
                    if ancestor.tag.startswith('div'):
                        if "type" in ancestor.attrib:
                            if ancestor.attrib["type"] == "notes":
                                in_end_note = True
                                break
                if note:  # in footnote
                    el.tag = "div"
                elif in_end_note:  # in end note
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    link_back = etree.Element("a")
                    c.execute('select parent from refs where target=? and parent like ?',
                              (el.attrib['id'], str(philo_id[0]) + " %"))
                    object_id = c.fetchone()[0]
                    link_back.attrib['href'] = 'navigate/%s%s' % ('/'.join([i for i in object_id.split() if i != "0"]),
                                                                  '#%s-link-back' % el.attrib['id'])
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
                else:  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"

                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note",
                                       "tabindex": "0",
                                       "data-toggle": "popover",
                                       "data-container": "body",
                                       "data-placement": "right",
                                       "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "img":
                el.attrib["onerror"] = "this.style.display='none'"
            elif el.tag == "pb" and "n" in el.attrib:
                el.tag = "span"
                el.attrib["class"] = "xml-pb-image"
                # Only link to a page image when an image root is configured
                # AND the page break names an image; without the parentheses
                # `and` binds tighter than `or`, so an "id" alone was enough.
                has_image_ref = "facs" in el.attrib or "id" in el.attrib
                if config.page_images_url_root and has_image_ref:
                    if "facs" in el.attrib:
                        img = el.attrib["facs"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img.split()[0])
                    el.append(etree.Element("a"))
                    img_split = img.split()
                    el[-1].attrib["href"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    if len(img_split) == 2:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[1]) + config.page_image_extension
                    else:
                        el[-1].attrib["large-img"] = os.path.join(config.page_images_url_root, img_split[0]) + config.page_image_extension
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    if config.external_page_images:
                        el[-1].attrib["target"] = "_blank"
                    else:
                        el[-1].attrib['class'] = "page-image-link"
                        el[-1].attrib['data-gallery'] = ''
                else:
                    if el.attrib["n"]:
                        el.text = "--%s--" % el.attrib["n"]
                    else:
                        el.text = "--na--"
                # Remember the current parent before the element is moved: the
                # trailing text has to stay where the page break used to be.
                parent = el.getparent()
                grand_parent = parent.getparent() if parent is not None else None
                if grand_parent is not None and grand_parent.attrib.get("class") == "xml-row":
                    # Move page outside of table row to avoid display issues
                    tail = etree.Element("span")
                    tail.text = el.tail
                    el.tail = ""
                    great_grand_parent = grand_parent.getparent()
                    grand_parent_index = great_grand_parent.index(grand_parent)
                    el_index = parent.index(el)
                    great_grand_parent.insert(grand_parent_index+1, el)
                    parent.insert(el_index, tail)
            if el.tag == "graphic":
                if config.page_images_url_root:
                    imgs = el.attrib["facs"].split()
                    current_graphic_img.append(imgs[0])
                    el.attrib["src"] = os.path.join(config.page_images_url_root, imgs[0])
                    el.tag = "img"
                    el.attrib["class"] = "inline-img"
                    el.attrib['data-gallery'] = ''
                    el.attrib["inline-img"] = ""
                    if len(imgs) > 1:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[1])
                    else:
                        el.attrib["large-img"] = os.path.join(config.page_images_url_root, imgs[0])
                    # "url" is optional; do not fail when it is absent
                    el.attrib.pop("url", None)
            elif el.tag == "ptr":
                if "facs" in el.attrib and config.page_images_url_root:
                    el.tag = "a"
                    el.attrib["href"] = os.path.join(config.page_images_url_root, el.attrib["facs"])
                    el.text = el.attrib["rend"]
                    el.attrib["external-img"] = ""
                    el.attrib["class"] = "external-img"
                    el.attrib["large-img"] = el.attrib["href"]
                    del el.attrib["rend"]
                    del el.attrib["facs"]
                    el.attrib['data-gallery'] = ''
            elif el.tag == "philoHighlight":
                # The marker is empty: pull the word that follows it (its tail)
                # inside the element. A marker with no tail still becomes a
                # highlight span instead of leaking a raw <philoHighlight> tag.
                tail_text = el.tail or ""
                word_match = re.match(word_regex, tail_text, re.U)
                if word_match:
                    el.text = tail_text[:word_match.end()]
                    el.tail = tail_text[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in VALID_HTML_TAGS:
                el = xml_to_html_class(el)
        except Exception as exception:
            # Best effort: report the problem and keep formatting the rest
            import sys
            print(exception, file=sys.stderr)
    output = etree.tostring(xml).decode('utf8', 'ignore')
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output)

    if note:  ## Notes don't need to fetch images
        return (output, {})
    if not images:
        return (output, {})

    ## Page images
    output, images = page_images(config, output, current_obj_img, current_graphic_img, philo_id)
    return output, images
Пример #10
0
def generate_time_series(request, config):
    """Count query hits and total word counts per date range.

    The span between the start and end dates is cut into ranges of
    ``request.year_interval`` years. For each range the query is run restricted
    to that range, and the summed ``word_count`` of the range is read from the
    ``toms`` table. The loop stops early once more than ``request.max_time``
    seconds (default 10) have been spent, so the caller can resume from
    ``new_start_date``.

    Args:
        request: iterable of (key, value) pairs that also exposes the query
            fields (``q``, ``method``, ``arg``, ``metadata``, ``start_date``,
            ``end_date``, ``year_interval``, ``max_time``).
        config: web config; ``db_path`` and ``time_series_year_field`` are read.

    Returns:
        dict with keys ``query``, ``query_done``, ``results_length``,
        ``more_results``, ``results`` (``absolute_count`` and ``date_count``,
        both keyed by range start) and, when ``more_results`` is true or the
        date range is invalid, ``new_start_date``.
    """
    time_series_object = {"query": dict([i for i in request]), "query_done": False}

    # Invalid date range: nothing to look up, so answer before opening the database
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object

    db = DB(config.db_path + "/data/")
    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        date_ranges.append((start, "%d-%d" % (start, end)))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = int(request.max_time or 10)
    cursor = db.dbh.cursor()
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count. `interval` is an int, so it must be compared
        # with the int 1 (comparing with "1" made the single-year branch dead).
        if interval != 1:
            end_range = start_range + interval - 1
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > max_time:
            break

    time_series_object["results_length"] = total_hits
    # A range starting exactly at end_date still exists (the ranges are built
    # with an inclusive upper bound), so only a next start strictly past
    # end_date means everything has been processed.
    next_start_date = last_date_done + interval
    if next_start_date > end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = next_start_date
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}

    return time_series_object
Пример #11
0
def generate_time_series(request, config):
    """Count query hits and a per-range total for each date range.

    The span between the start and end dates is cut into ranges of
    ``request.year_interval`` years. For each range the query is run restricted
    to that range. The per-range total is the summed ``word_count`` from
    ``toms`` when a query string is present, otherwise the number of ``toms``
    rows at the database's default object level. The loop stops early once
    more than ``request.max_time`` seconds (default 10) have been spent, so
    the caller can resume from ``new_start_date``.

    Args:
        request: iterable of (key, value) pairs that also exposes the query
            fields (``q``, ``method``, ``arg``, ``metadata``, ``start_date``,
            ``end_date``, ``year_interval``, ``max_time``).
        config: web config; ``db_path`` and ``time_series_year_field`` are read.

    Returns:
        dict with keys ``query``, ``query_done``, ``results_length``,
        ``more_results``, ``results`` (``absolute_count`` and ``date_count``,
        both keyed by range start) and, when ``more_results`` is true or the
        date range is invalid, ``new_start_date``.
    """
    time_series_object = {
        "query": dict([i for i in request]),
        "query_done": False
    }

    # Invalid date range: nothing to look up, so answer before opening the database
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {
            "absolute_count": {},
            "date_count": {}
        }
        return time_series_object

    db = DB(config.db_path + "/data/")
    start_date, end_date = get_start_end_date(db,
                                              config,
                                              start_date=request.start_date
                                              or None,
                                              end_date=request.end_date
                                              or None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        date_ranges.append((start, "%d-%d" % (start, end)))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = int(request.max_time or 10)
    year_field = config.time_series_year_field
    cursor = db.dbh.cursor()
    for start_range, date_range in date_ranges:
        request.metadata[year_field] = date_range
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        raw_results=True,
                        **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {
            "label": start_range,
            "count": hit_len,
            "url": url
        }

        # Get date total count: word counts when there is a query string,
        # number of default-level objects otherwise
        if interval != 1:
            end_range = start_range + interval - 1
            if request.q:
                query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                    year_field,
                    start_range,
                    end_range,
                )
            else:
                query = (
                    f"SELECT COUNT(*) FROM toms WHERE philo_type='{db.locals.default_object_level}' "
                    f"AND {year_field} BETWEEN {start_range} AND {end_range}"
                )
        else:
            if request.q:
                query = "select sum(word_count) from toms where %s='%s'" % (
                    year_field, start_range)
            else:
                query = (
                    f"SELECT COUNT(*) FROM toms WHERE philo_type='{db.locals.default_object_level}' "
                    f"AND {year_field}='{start_range}'"
                )
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > max_time:
            break

    time_series_object["results_length"] = total_hits
    # A range starting exactly at end_date still exists (the ranges are built
    # with an inclusive upper bound), so only a next start strictly past
    # end_date means everything has been processed.
    next_start_date = last_date_done + interval
    if next_start_date > end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = next_start_date
    time_series_object["results"] = {
        "absolute_count": absolute_count,
        "date_count": date_counts
    }

    return time_series_object