Example #1
def tokenize_and_merge_possible_mw_tokens(text, flat_tree):
    """ Tokenize text with bintokenizer and merge multi-word tokens,
        collapsing each one's consecutive terminals in the flat parse
        tree into a single terminal; returns (text_tokens, parse_tokens) """
    mw_tokens = list(bintokenizer.tokenize(text))  # multi-word tokens
    mw_tokens = [
        tok.txt.split(" ") for tok in mw_tokens if tok.txt is not None
    ]
    sw_tokens = [tok for toks in mw_tokens
                 for tok in toks]  # single-word tokens

    parse_tokens = list(flat_tree.split(" "))
    parse_terminals = filter(lambda x: x[1][0].islower(),
                             enumerate(parse_tokens))

    leaf_idx_to_parse_idx = {
        leaf_idx: ptok_idx
        for (leaf_idx, (ptok_idx, ptok)) in enumerate(parse_terminals)
    }

    offset = 0
    merge_list = []
    for mw_token in mw_tokens:
        sw_count = len(mw_token)
        idxed_mw_token = [(idx + offset, token)
                          for (idx, token) in enumerate(mw_token)]
        offset += sw_count
        if sw_count == 1:
            continue
        merge_info = check_merge_candidate(idxed_mw_token, parse_tokens,
                                           leaf_idx_to_parse_idx)
        if merge_info is not None:
            merge_list.append(merge_info)

    parse_toks_out = list(parse_tokens)
    text_toks_out = list(sw_tokens)
    # merge in reverse order so we don't have to compute offsets
    for (pidx, leaf_idx, sw_count) in reversed(merge_list):
        if Settings.DEBUG:
            print("Merging:", pidx, leaf_idx, sw_count)
            print(" ".join(sw_tokens[leaf_idx:leaf_idx + sw_count]).__repr__())
            print(parse_toks_out[pidx:pidx + 1])
        parse_toks_out[pidx:pidx + sw_count] = parse_toks_out[pidx:pidx + 1]
        text_toks_out[leaf_idx:leaf_idx + sw_count] = [
            " ".join(sw_tokens[leaf_idx:leaf_idx + sw_count])
        ]

    return text_toks_out, parse_toks_out
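
The helper check_merge_candidate is not shown above (bintokenizer comes from GreynirPackage, while Settings and check_merge_candidate belong to the surrounding module). Judging from its call site and the unpacking of merge_info into (pidx, leaf_idx, sw_count), a plausible reconstruction is sketched below; this is an assumption for illustration, not the original implementation:

def check_merge_candidate(idxed_mw_token, parse_tokens, leaf_idx_to_parse_idx):
    # Hypothetical reconstruction: a multi-word token can be merged only
    # if every one of its single-word leaves maps to a parse terminal and
    # those terminals occupy consecutive positions in the parse string.
    # parse_tokens is accepted to match the call above; the sketch does
    # not need it.
    leaf_idx = idxed_mw_token[0][0]
    sw_count = len(idxed_mw_token)
    pidxs = [leaf_idx_to_parse_idx.get(leaf_idx + i) for i in range(sw_count)]
    if any(p is None for p in pidxs):
        return None  # a leaf has no corresponding terminal
    if pidxs != list(range(pidxs[0], pidxs[0] + sw_count)):
        return None  # terminals are not contiguous, so no merge
    return (pidxs[0], leaf_idx, sw_count)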
Example #2
@classmethod
def _normalize_sentence(cls, single_sentence):
    """ Preprocess text and normalize for parsing network """
    # Keep only the text of tokens that the BIN parser understands
    return [
        tok.txt for tok in bintokenizer.tokenize(single_sentence)
        if BIN_Token.is_understood(tok)
    ]
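
A minimal standalone sketch of the same filtering step, assuming GreynirPackage is installed and that bintokenizer and BIN_Token live in reynir.bintokenizer and reynir.binparser respectively (the snippet above does not show its imports or its enclosing class):

from reynir import bintokenizer
from reynir.binparser import BIN_Token

def normalize_sentence(single_sentence):
    # Drop tokens the BIN parser cannot understand; keep only the raw text
    return [
        tok.txt for tok in bintokenizer.tokenize(single_sentence)
        if BIN_Token.is_understood(tok)
    ]

print(normalize_sentence("Hún las bókina í gær."))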
Example #3
def wordfreq():
    """ Return word frequency chart data for a given time period. """
    resp: Dict[str, Any] = dict(err=True)
    # Create datetime objects from query string args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from", ""),
                                      date_fmt)
        date_to = datetime.strptime(request.args.get("date_to", ""), date_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # Words param should contain one or more comma-separated word
    # lemmas with optional category specified with :cat suffix
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**resp)

    # Create word/cat pair from token
    def cat4token(t: Tok) -> Tuple[str, str]:
        assert t.kind in (TOK.WORD, TOK.PERSON, TOK.ENTITY)
        # TODO: Use GreynirPackage lemma lookup function for this
        w, cat = t.txt, ""
        if t.kind == TOK.WORD:
            val = list(filter(lambda m: m.stofn == m.ordmynd,
                              t.meanings)) or t.meanings
            cat = val[0].ordfl if len(val) else CAT_UNKNOWN
            w = val[0].stofn if len(val) else t.txt
            # Hack for compound words: strip hyphens inserted by the
            # compounder that are not present in the original token text
            if w.count("-") > t.txt.count("-"):
                san = ""
                txtlen = len(t.txt)
                for i, char in enumerate(w):
                    if char == "-" and i < txtlen and t.txt[i] != "-":
                        continue
                    san += char
                w = san
        elif t.kind == TOK.PERSON:
            cat = "person_" + (t.person_names[0].gender or "hk")
        elif t.kind == TOK.ENTITY:
            cat = "entity"
        return (w, cat)

    # Parse arg string into word/cat tuples
    wds = _str2words(warg)

    # Try to tokenize each item that doesn't have a category
    nwds = []
    for w, c in wds:
        if c is None or c == CAT_UNKNOWN:
            # Try to tokenize
            tokens = list(
                filter(lambda x: x.kind in _VALID_TOKENS, tokenize(w)))
            for t in tokens:
                nwds.append(cat4token(t))
        else:
            nwds.append((w, c))

    # Discard words not in an allowed category and cap the number of words
    words = list(filter(lambda x: x[1] in _VALID_WCATS, nwds))
    words = words[:_MAX_NUM_WORDS]

    # Generate date labels
    now = datetime.utcnow()
    delta = date_to - date_from
    with changedlocale(category="LC_TIME"):
        # Group by week if period longer than 3 months
        label_date_strings: List[Union[str, Tuple[str, str]]] = []
        if delta.days >= _SHOW_WEEKS_CUTOFF:
            timeunit = "week"
            label_dates = [(
                (date_from + timedelta(days=i * 7)),
                (date_from + timedelta(days=(i * 7) + 6)),
            ) for i in range((delta.days + 1) // 7)]
            # Construct compact week date labels without superfluous information
            labels = []
            for (d1, d2) in label_dates:
                if d1.month == d2.month:
                    d1fmt = "%-d."
                    d2fmt = "%-d. %b"
                else:
                    d1fmt = d2fmt = "%-d. %b"
                if d1.year != now.year and d1.year != d2.year:
                    d1fmt += " %Y"
                if d2.year != now.year:
                    d2fmt += " %Y"
                labels.append("{0}-{1}".format(d1.strftime(d1fmt),
                                               d2.strftime(d2fmt)))
            # Convert dates to strings for client-side
            label_date_strings = [(df.strftime("%Y-%m-%d"),
                                   dt.strftime("%Y-%m-%d"))
                                  for df, dt in label_dates]
        # Group by day
        else:
            timeunit = "day"
            label_days = [
                date_from + timedelta(days=i) for i in range(delta.days)
            ]
            labels = [
                d.strftime("%-d. %b")
                if d.year == now.year else d.strftime("%-d. %b %Y")
                for d in label_days
            ]
            label_date_strings = [d.strftime("%Y-%m-%d") for d in label_days]

    # Create datasets for front-end chart
    colors = list(_LINE_COLORS)
    data: Dict[str, Any] = dict(labels=labels,
                                labelDates=label_date_strings,
                                datasets=[])
    with SessionContext(commit=False) as session:
        for w in words:
            # Look up frequency of word for the given period
            (wd, cat) = w
            res = WordFrequencyQuery.frequency(
                wd,
                cat,
                date_from,
                date_to,
                timeunit=timeunit,
                enclosing_session=session,
            )
            # Generate data and config for chart
            label = "{0} ({1})".format(wd, CAT_DESC.get(cat))
            ds: Dict[str, Any] = dict(label=label, fill=False, lineTension=0)
            ds["borderColor"] = ds["backgroundColor"] = colors.pop(0)
            ds["data"] = [r[1] for r in res]
            ds["word"] = "{0}:{1}".format(wd, cat)
            data["datasets"].append(ds)

    # Create response
    resp["err"] = False
    resp["data"] = data
    resp["words"] = _words2str(words)

    return better_jsonify(**resp)
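
The snippet omits the view registration. A hypothetical sketch of how such a handler might be wired up and queried in a Flask app follows; the URL path and query parameter values are assumptions for illustration only:

from flask import Flask

app = Flask(__name__)
# Register the wordfreq view above under an assumed URL path
app.add_url_rule("/wordfreq", view_func=wordfreq)

# Example client request (hypothetical values):
#   GET /wordfreq?date_from=2024-01-01&date_to=2024-03-01&words=hestur:no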