Example #1
def cluster(cur,
            subset=['all'],
            corpora=['dickens'],
            clusterlength=['1'],
            cutoff=None):
    # Defaults / dereference arrays
    book_ids = corpora_to_book_ids(cur, corpora)
    clusterlength = int(clusterlength[0])
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)

    # Choose cutoff
    if cutoff is not None:
        cutoff = int(cutoff[0])
    else:
        cutoff = 5 if len(book_ids) > 1 else 2

    skipped = 0
    wl = get_word_list(cur, book_ids, rclass_ids, clusterlength)

    for term, freq in wl:
        if freq >= cutoff:
            yield (term, freq)
        else:
            skipped += 1

    if skipped > 0:
        yield ('footer', dict(info=dict(
            message='%d clusters with a frequency less than %d are not shown'
                    % (skipped, cutoff))))
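
A minimal usage sketch, assuming cur is an open psycopg2 cursor on a CliC database and that 'quote' is a subset name known to api_subset_lookup; note that parameters arrive as single-element lists, mirroring parsed query-string arguments:

# Hypothetical call: 3-grams inside quotes, hiding anything seen < 5 times
for row in cluster(cur, subset=['quote'], corpora=['dickens'],
                   clusterlength=['3'], cutoff=['5']):
    if row[0] == 'footer':
        print(row[1]['info']['message'])  # the "... are not shown" notice
    else:
        term, freq = row
        print('%s\t%d' % (term, freq))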
Example #2
def keyword(cur,
            clusterlength,
            pvalue,
            subset=['all'],
            corpora=['dickens'],
            refsubset=['all'],
            refcorpora=['dickens']):
    '''
    Main entry function for keyword search
    '''
    # Defaults / dereference arrays
    book_ids = corpora_to_book_ids(cur, corpora)
    refbook_ids = corpora_to_book_ids(cur, refcorpora)
    pvalue = float(pvalue[0])
    clusterlength = int(clusterlength[0])
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    refrclass_ids = tuple(api_subset[s] for s in refsubset)

    wordlist_analysis = facets_to_df(
        get_word_list(cur, book_ids, rclass_ids, clusterlength))
    total_analysis = wordlist_analysis.Count.sum()

    wordlist_reference = facets_to_df(
        get_word_list(cur, refbook_ids, refrclass_ids, clusterlength))
    total_reference = wordlist_reference.Count.sum()

    try:
        keywords = extract_keywords(
            wordlist_analysis,
            wordlist_reference,
            total_analysis,
            total_reference,
            limit_rows=3000,
            p_value=pvalue,
        ).to_records()
    except Exception:
        # TODO: What about the actual error? Log it?
        yield ('header',
               dict(warn=dict(message='''
CliC was not able to generate a keyword list for you. Please check your search settings.
Because short suspensions are limited to 4 tokens, no 5-grams are available for short suspensions.
Please note that the target text/corpus and the reference text/corpus should be different.

It is also possible that there are no keywords with the parameters you specified. In that case
increasing the p-value might be an option.
            '''.strip())))
        return

    # Return header message
    if total_analysis:
        yield ('header',
               dict(info=dict(message='''
The results are limited to 3000 rows. Generally there will be fewer results. Only overused (positive) keywords are displayed.
            '''.strip())))

    for k in keywords:
        # Convert row into something JSON-serializable
        yield tuple(x for x in k)
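
A similar sketch for keyword, with the same cursor assumption; the reference corpus name 'ntc' is hypothetical, and per the warning above the target and reference should differ:

# Hypothetical call: 1-gram keywords of Dickens against a reference corpus
for row in keyword(cur, clusterlength=['1'], pvalue=['0.0001'],
                   corpora=['dickens'], refcorpora=['ntc']):
    if row[0] == 'header':
        print(row[1])  # info or warn message wrapped in a dict
    else:
        print(row)     # one keyword record from extract_keywords()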
Example #3
def count(cur,
          corpora=['dickens'],
          subset=['all', 'shortsus', 'longsus', 'nonquote', 'quote'],
          metadata=[]):
    """
    Get word counts for coprora

    - corpora: List of corpora / book names
    - subset: Subset(s) to return counts for
    - metadata, Array of extra metadata to provide with result, some of
      - 'book_titles' (return dict of book IDs to titles at end of result)
    """
    book_ids = tuple(corpora_to_book_ids(cur, corpora))
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    query = """
        SELECT (SELECT name FROM book WHERE book_id = t.book_id) AS "name"
    """
    params = dict(book_ids=book_ids)
    for r in rclass_ids:
        query += """
             , COUNT(CASE WHEN t.part_of ? '%d' THEN 1 END) is_%d
        """ % (r, r)
    query += """
          FROM token t
         WHERE t.book_id IN %(book_ids)s
      GROUP BY book_id
    """
    cur.execute(query, params)

    for row in cur:
        yield row

    footer = get_book_metadata(cur, book_ids, set(metadata))
    if footer:
        yield ('footer', footer)
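
A sketch of iterating count, under the same assumptions; each data row is the book name followed by one token count per requested subset, in order:

# Hypothetical call: total and quote-only word counts for each Dickens book
for row in count(cur, corpora=['dickens'], subset=['all', 'quote'],
                 metadata=['book_titles']):
    if row[0] == 'footer':
        print(row[1]['book_titles'])  # book name -> [title, author]
    else:
        print(row)  # e.g. (name, tokens in 'all', tokens in 'quote')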
Example #4
def subset(cur,
           corpora=['dickens'],
           subset=['all'],
           contextsize=['0'],
           metadata=[]):
    """
    Main entry function for subset search

    - corpora: List of corpora / book names
    - subset: Subset(s) to search for.
    - contextsize: Size of context window, defaults to none.
    - metadata, Array of extra metadata to provide with result, some of
      - 'book_titles' (return dict of book IDs to titles at end of result)
    """
    book_ids = corpora_to_book_ids(cur, corpora)
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    contextsize = int(contextsize[0])
    metadata = set(metadata)
    book_cur = cur.connection.cursor()
    book = None
    api_subset = api_subset_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    rclass = rclass_id_lookup(cur)

    query = """
        SELECT r.book_id
             , c.full_tokens full_tokens
             , c.is_node is_node
             , r.crange node_crange
             , c.part_of part_of
          FROM region r
          JOIN LATERAL (
              SELECT ARRAY_AGG(t_surrounding.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) full_tokens
                   , ARRAY_AGG(t_surrounding.crange <@ r.crange ORDER BY t_surrounding.book_id, t_surrounding.ordering) is_node
                   , (ARRAY_AGG(t_surrounding.part_of ORDER BY t_surrounding.book_id, t_surrounding.ordering))[1] part_of
                FROM token t_surrounding
               WHERE t_surrounding.book_id = r.book_id
                 AND t_surrounding.crange <@ range_expand(r.crange, %(contextsize)s)
               ) c ON TRUE
          WHERE r.book_id IN %(book_id)s
           AND r.rclass_id IN %(rclass_ids)s
    """
    params = dict(
        book_id=tuple(book_ids),
        contextsize=contextsize * 10,  # TODO: Bodge word -> char
        rclass_ids=rclass_ids,
    )
    cur.execute(query, params)

    for book_id, full_tokens, is_node, node_crange, part_of in cur:
        node_tokens = [
            crange for crange, include in zip(full_tokens, is_node) if include
        ]
        if len(node_tokens) == 0:
            continue  # Ignore empty suspensions
        if not book or book['id'] != book_id:
            book = get_book(book_cur, book_id, content=True)
        yield to_conc(
            book['content'], full_tokens, node_tokens, contextsize) + [
                [book['name'], node_crange.lower, node_crange.upper],
                [
                    int(part_of.get(str(rclass['chapter.text']), -1)),
                    int(part_of.get(str(rclass['chapter.paragraph']), -1)),
                    int(part_of.get(str(rclass['chapter.sentence']), -1)),
                ]
            ]

    book_cur.close()

    footer = get_book_metadata(cur, book_ids, metadata)
    if footer:
        yield ('footer', footer)
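
A sketch driving subset with a five-word context window, again assuming a CliC cursor and the 'quote' subset; each data row is the to_conc output plus the two trailing lists built above:

# Hypothetical call: every quote region in the Dickens corpus, with context
for row in subset(cur, corpora=['dickens'], subset=['quote'],
                  contextsize=['5'], metadata=['book_titles']):
    if row[0] == 'footer':
        print(row[1]['book_titles'])
    else:
        # row[-2] is [book, start, end]; row[-1] is [chapter, para, sentence]
        print(row[-2], row[-1])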
Example #5
def concordance(cur,
                corpora=['dickens'],
                subset=['all'],
                q=[],
                contextsize=['0'],
                metadata=[]):
    """
    Main entry function for concordance search

    - corpora: List of corpora / book names
    - subset: Subset to search within, or 'all'
    - q: Quer(ies) to search for, results will contain one of the given expressions
    - contextsize: Size of context window, defaults to none.
    - metadata, Array of extra metadata to provide with result, some of
      - 'book_titles' (return dict of book IDs to titles at end of result)
    """
    book_ids = tuple(corpora_to_book_ids(cur, corpora))
    if len(book_ids) == 0:
        raise UserError("No books to search", "error")
    api_subset = api_subset_lookup(cur)
    rclass = rclass_id_lookup(cur)
    rclass_ids = tuple(api_subset[s] for s in subset)
    if len(rclass_ids) != 1:
        raise UserError("You must supply exactly one subset", "error")
    like_sets = [parse_query(s) for s in q]
    if len(like_sets) == 0:
        raise UserError("You must supply at least one search term", "error")
    contextsize = int(contextsize[0])
    metadata = set(metadata)
    book = None

    book_cur = cur.connection.cursor()
    try:
        for likes in like_sets:
            # Choose an "anchor". We search for this first to narrow the possible
            # outputs as much as possible, then consider the types around each.
            anchor_offset = find_anchor_offset(*likes)

            query = ""
            params = dict()
            query += """
                 SELECT t.book_id
                      , c.node_start - 1 node_start -- NB: Postgres is 1-indexed
                      , c.cranges full_tokens
                      , t.part_of
                   FROM token t
                   JOIN LATERAL ( -- i.e. for each valid anchor token, get all tokens around it, including context
                       SELECT ARRAY_POSITION(ARRAY_AGG(t_surrounding.ordering = t.ordering ORDER BY book_id, ordering), TRUE) - %(anchor_offset)s node_start
                            , ARRAY_AGG(CASE WHEN t_surrounding.ordering < (t.ordering - %(anchor_offset)s) THEN t_surrounding.ttype -- i.e. part of the context, so rclass irrelevant
                                             WHEN t_surrounding.ordering > (t.ordering - %(anchor_offset)s + %(total_likes)s - 1) THEN t_surrounding.ttype -- i.e. part of the context, so rclass irrelevant
                                             WHEN t_surrounding.part_of ? %(part_of)s THEN t_surrounding.ttype
                                             ELSE NULL -- part of the node, but not in the right rclass, NULL should fail any node checks later on
                                              END ORDER BY book_id, ordering) ttypes
                            , ARRAY_AGG(t_surrounding.crange ORDER BY book_id, ordering) cranges
                         FROM token t_surrounding
                        WHERE t_surrounding.book_id = t.book_id
                          AND t_surrounding.ordering BETWEEN t.ordering - %(anchor_offset)s - %(contextsize)s
                                             AND t.ordering - %(anchor_offset)s + (%(total_likes)s - 1) + %(contextsize)s
                   ) c on TRUE
                 WHERE t.book_id IN %(book_ids)s
                   AND t.part_of ? %(part_of)s
            """
            params['anchor_offset'] = anchor_offset
            params['anchor_like'] = likes[anchor_offset]
            params['book_ids'] = book_ids
            params['contextsize'] = contextsize
            params['total_likes'] = len(likes)
            params['part_of'] = str(rclass_ids[0])

            for i, l in enumerate(likes):
                if i == anchor_offset:
                    # We should check the main token table for the anchor node, so
                    # postgres searches for this first
                    query += "AND t.ttype LIKE %(like_" + str(i) + ")s\n"
                else:
                    query += "AND c.ttypes[c.node_start + " + str(
                        i) + "] LIKE %(like_" + str(i) + ")s\n"
                params["like_" + str(i)] = l

            cur.execute(query, params)
            for book_id, node_start, full_tokens, part_of in cur:
                # Extract portion of tokens that are the node
                node_tokens = full_tokens[node_start:node_start + len(likes)]
                if not book or book['id'] != book_id:
                    book = get_book(book_cur, book_id, content=True)
                yield to_conc(
                    book['content'], full_tokens, node_tokens, contextsize
                ) + [
                    [book['name'], node_tokens[0].lower, node_tokens[-1].upper],
                    [
                        int(part_of.get(str(rclass['chapter.text']), -1)),
                        int(part_of.get(str(rclass['chapter.paragraph']), -1)),
                        int(part_of.get(str(rclass['chapter.sentence']), -1)),
                    ],
                ]
    finally:
        book_cur.close()

    footer = get_book_metadata(cur, book_ids, metadata)
    if footer:
        yield ('footer', footer)
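
A sketch for concordance; per the UserError checks above, exactly one subset and at least one q term are required. Assuming parse_query accepts a plain phrase (its grammar lives elsewhere in the CliC codebase):

# Hypothetical call: find "dense fog" inside quotes, five words of context
for row in concordance(cur, corpora=['dickens'], subset=['quote'],
                       q=['dense fog'], contextsize=['5'],
                       metadata=['book_titles']):
    if row[0] == 'footer':
        print(row[1]['book_titles'])
    else:
        print(row[-2], row[-1])  # [book, start, end], [chapter, para, sentence]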
Example #6
def get_book_metadata(cur, book_ids, metadata):
    """
    Generate dict of metadata that should go in footer of both concordance and subsets

    - book_ids: Array of book IDs to include
    - metadata: Metadata items to include, a set contining some of...
      - 'book_titles': The title / author of each book
      - 'chapter_start': The start character for all chapters, and end of book
      - 'word_count_(subset)': Count of words within (subset)
    """
    def p_params(*args):
        # Build a "?, ?, ..." placeholder string, one entry per item across args
        return ("?, " * sum(len(x) for x in args)).rstrip(', ')

    rclass = rclass_id_lookup(cur)

    out = {}
    for k in metadata:
        out[k] = {}

        if k == 'book_titles':
            cur.execute(
                """
                SELECT b.name
                     , bm.rclass_id
                     , bm.content
                  FROM book b, book_metadata bm
                 WHERE b.book_id = bm.book_id
                   AND b.book_id IN %s
                   AND bm.rclass_id IN %s
            """, (
                    tuple(book_ids),
                    (rclass['metadata.title'], rclass['metadata.author']),
                ))
            for (book_name, rclass_id, content) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = [None, None]
                idx = 0 if rclass_id == rclass['metadata.title'] else 1
                out[k][book_name][idx] = content

        elif k == 'chapter_start':
            cur.execute(
                """
                SELECT b.name
                     , r.rvalue as chapter_num
                     , r.crange crange
                  FROM book b, region r
                 WHERE b.book_id = r.book_id
                   AND r.rclass_id = %s
                   AND b.book_id IN %s
            """, (
                    rclass['chapter.text'],
                    tuple(book_ids),
                ))
            for (book_name, chapter_num, crange) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = dict()
                out[k][book_name][chapter_num] = crange.lower
                out[k][book_name]['_end'] = max(
                    out[k][book_name].get('_end', 0), crange.upper)

        elif k == 'word_count_chapter':
            cur.execute(
                """
                SELECT b.name
                     , bwc.rvalue as chapter_num
                     , bwc.word_count
                  FROM book b, book_word_count bwc
                 WHERE b.book_id = bwc.book_id
                   AND bwc.rclass_id = %s
                   AND b.book_id IN %s
              ORDER BY bwc.book_id, bwc.rvalue
            """, (
                    rclass['chapter.text'],
                    tuple(book_ids),
                ))
            for (book_name, chapter_num, word_total) in cur:
                if book_name not in out[k]:
                    out[k][book_name] = dict(_end=0)
                out[k][book_name][chapter_num] = out[k][book_name]['_end']
                out[k][book_name]['_end'] += int(word_total)

        elif k.startswith('word_count_'):
            api_subset = api_subset_lookup(cur)
            cur.execute(
                """
                SELECT b.name
                     , SUM(bwc.word_count) AS word_count
                  FROM book b, book_word_count bwc
                 WHERE b.book_id = bwc.book_id
                   AND bwc.rclass_id = %s
                   AND b.book_id IN %s
              GROUP BY b.book_id
            """, (
                    api_subset[k.replace('word_count_', '')],
                    tuple(book_ids),
                ))
            for (book_name, word_count) in cur:
                out[k][book_name] = int(word_count)

        else:
            raise ValueError("Unknown metadata item %s" % k)

    return out
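
get_book_metadata can also be called on its own; a sketch with the usual cursor assumption (the 'quote' subset name is assumed to exist for the word_count_ lookup):

# Hypothetical standalone call, reusing helpers from the same codebase
book_ids = corpora_to_book_ids(cur, ['dickens'])
footer = get_book_metadata(
    cur, book_ids, {'book_titles', 'chapter_start', 'word_count_quote'})
print(footer['book_titles'])       # book name -> [title, author]
print(footer['chapter_start'])     # book name -> {chapter: start char, '_end': ...}
print(footer['word_count_quote'])  # book name -> words inside quotes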