def tokenize_and_merge_possible_mw_tokens(text, flat_tree):
    """Tokenize text and merge multi-word tokens back into single tokens,
    keeping the flattened parse tree aligned with the merged token list."""
    mw_tokens = list(bintokenizer.tokenize(text))  # multi-word tokens
    mw_tokens = [tok.txt.split(" ") for tok in mw_tokens if tok.txt is not None]
    sw_tokens = [tok for toks in mw_tokens for tok in toks]  # single-word tokens

    parse_tokens = list(flat_tree.split(" "))
    parse_terminals = filter(lambda x: x[1][0].islower(), enumerate(parse_tokens))

    leaf_idx_to_parse_idx = {
        leaf_idx: ptok_idx
        for (leaf_idx, (ptok_idx, ptok)) in enumerate(parse_terminals)
    }

    offset = 0
    merge_list = []
    for mw_token in mw_tokens:
        sw_count = len(mw_token)
        idxed_mw_token = [(idx + offset, token) for (idx, token) in enumerate(mw_token)]
        offset += sw_count
        if sw_count == 1:
            continue
        merge_info = check_merge_candidate(
            idxed_mw_token, parse_tokens, leaf_idx_to_parse_idx
        )
        if merge_info is not None:
            merge_list.append(merge_info)

    parse_toks_out = list(parse_tokens)
    text_toks_out = list(sw_tokens)
    # Merge in reverse order so we don't have to compute offsets
    for (pidx, leaf_idx, sw_count) in reversed(merge_list):
        if Settings.DEBUG:
            print("Merging:", pidx, leaf_idx, sw_count)
            print(repr(" ".join(sw_tokens[leaf_idx:leaf_idx + sw_count])))
            print(parse_toks_out[pidx:pidx + 1])
        parse_toks_out[pidx:pidx + sw_count] = parse_toks_out[pidx:pidx + 1]
        text_toks_out[leaf_idx:leaf_idx + sw_count] = [
            " ".join(sw_tokens[leaf_idx:leaf_idx + sw_count])
        ]

    return text_toks_out, parse_toks_out
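# The merge loop above applies its slice replacements in reverse index order so
# that indices recorded earlier remain valid after each splice. The helper below
# is a minimal, self-contained sketch of that technique only; the function name
# and the example tokens are made up for illustration and are not part of the
# Greynir tokenizer or flat-tree output.
def _merge_spans_in_reverse(tokens, spans):
    """Join each (start, count) span of tokens into a single token.

    Spans are applied from the highest start index downwards, so splices made
    further right never shift the start indices still waiting to be applied.
    """
    out = list(tokens)
    for start, count in sorted(spans, reverse=True):
        out[start:start + count] = [" ".join(out[start:start + count])]
    return out

# Example (hypothetical data):
# _merge_spans_in_reverse(
#     ["I", "flew", "from", "New", "York", "to", "San", "Francisco"],
#     [(3, 2), (6, 2)],
# )
# -> ["I", "flew", "from", "New York", "to", "San Francisco"]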
def _normalize_sentence(cls, single_sentence):
    """ Preprocess text and normalize for parsing network """
    return [
        tok.txt
        for tok in bintokenizer.tokenize(single_sentence)
        if BIN_Token.is_understood(tok)
    ]
def wordfreq():
    """ Return word frequency chart data for a given time period. """
    resp: Dict[str, Any] = dict(err=True)

    # Create datetime objects from query string args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from", ""), date_fmt)
        date_to = datetime.strptime(request.args.get("date_to", ""), date_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # Words param should contain one or more comma-separated word
    # lemmas with an optional category specified with a :cat suffix
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**resp)

    # Create word/cat pair from token
    def cat4token(t: Tok) -> Tuple[str, str]:
        assert t.kind in (TOK.WORD, TOK.PERSON, TOK.ENTITY)
        # TODO: Use GreynirPackage lemma lookup function for this
        w, cat = t.txt, ""
        if t.kind == TOK.WORD:
            val = list(filter(lambda m: m.stofn == m.ordmynd, t.meanings)) or t.meanings
            cat = val[0].ordfl if len(val) else CAT_UNKNOWN
            w = val[0].stofn if len(val) else t.txt
            # Hack to fix combined word: remove hyphens added by the combinator
            if w.count("-") > t.txt.count("-"):
                san = ""
                txtlen = len(t.txt)
                for i, char in enumerate(w):
                    if char == "-" and i < txtlen and t.txt[i] != "-":
                        continue
                    san += char
                w = san
        elif t.kind == TOK.PERSON:
            cat = "person_" + (t.person_names[0].gender or "hk")
        elif t.kind == TOK.ENTITY:
            cat = "entity"
        return (w, cat)

    # Parse arg string into word/cat tuples
    wds = _str2words(warg)

    # Try to tokenize each item that doesn't have a category
    nwds = []
    for w, c in wds:
        if c is None or c == CAT_UNKNOWN:
            # Try to tokenize
            tokens = list(filter(lambda x: x.kind in _VALID_TOKENS, tokenize(w)))
            for t in tokens:
                nwds.append(cat4token(t))
        else:
            nwds.append((w, c))

    # Filter out all words not in an allowed category and cap the number of words
    words = list(filter(lambda x: x[1] in _VALID_WCATS, nwds))
    words = words[:_MAX_NUM_WORDS]

    # Generate date labels
    now = datetime.utcnow()
    delta = date_to - date_from
    with changedlocale(category="LC_TIME"):
        label_date_strings: List[Union[str, Tuple[str, str]]] = []
        # Group by week if period longer than 3 months
        if delta.days >= _SHOW_WEEKS_CUTOFF:
            timeunit = "week"
            label_dates = [
                (
                    date_from + timedelta(days=i * 7),
                    date_from + timedelta(days=(i * 7) + 6),
                )
                for i in range(int((delta.days + 1) / 7))
            ]
            # Construct elegant week date labels w. no superfluous information
            labels = []
            for (d1, d2) in label_dates:
                if d1.month == d2.month:
                    d1fmt = "%-d."
                    d2fmt = "%-d. %b"
                else:
                    d1fmt = d2fmt = "%-d. %b"
                if d1.year != now.year and d1.year != d2.year:
                    d1fmt += " %Y"
                if d2.year != now.year:
                    d2fmt += " %Y"
                labels.append("{0}-{1}".format(d1.strftime(d1fmt), d2.strftime(d2fmt)))
            # Convert dates to strings for client-side
            label_date_strings = [
                (df.strftime("%Y-%m-%d"), dt.strftime("%Y-%m-%d"))
                for df, dt in label_dates
            ]
        # Group by day
        else:
            timeunit = "day"
            label_days = [date_from + timedelta(days=i) for i in range(delta.days)]
            labels = [
                d.strftime("%-d. %b") if d.year == now.year else d.strftime("%-d. %b %Y")
                for d in label_days
            ]
            label_date_strings = [d.strftime("%Y-%m-%d") for d in label_days]

    # Create datasets for front-end chart
    colors = list(_LINE_COLORS)
    data: Dict[str, Any] = dict(labels=labels, labelDates=label_date_strings, datasets=[])
    with SessionContext(commit=False) as session:
        for w in words:
            # Look up frequency of word for the given period
            (wd, cat) = w
            res = WordFrequencyQuery.frequency(
                wd,
                cat,
                date_from,
                date_to,
                timeunit=timeunit,
                enclosing_session=session,
            )
            # Generate data and config for chart
            label = "{0} ({1})".format(wd, CAT_DESC.get(cat))
            ds: Dict[str, Any] = dict(label=label, fill=False, lineTension=0)
            ds["borderColor"] = ds["backgroundColor"] = colors.pop(0)
            ds["data"] = [r[1] for r in res]
            ds["word"] = "{0}:{1}".format(wd, cat)
            data["datasets"].append(ds)

    # Create response
    resp["err"] = False
    resp["data"] = data
    resp["words"] = _words2str(words)

    return better_jsonify(**resp)