def extract(self, r):
    # Concatenate the record's title and abstract, stripping any <...> markup,
    # then scan the surface tokens for runs of capitalized words.
    content = (text.strip_enclosed_carrots(r["title"]) + " "
               + text.strip_enclosed_carrots(r["abstract"]))
    ctokens = content.split()
    phrases = []
    current = []
    for ct in ctokens:
        if text.is_capitalized(ct):
            # Extend the current run, dropping tokens that already occur in the query.
            current += [w for w in text.tokenize(ct) if w not in self._querystops]
        elif len(current) > 0:
            # A lowercase token ends the run; keep it as a phrase if it is long enough.
            if len(current) >= self._pl:
                phrases.append(" ".join(current))
            current = []
    # Flush a capitalized run that reaches the end of the content.
    if len(current) >= self._pl:
        phrases.append(" ".join(current))
    return phrases
def __init__(self, query, pl=1):
    # Query tokens act as stopwords: they are excluded from extracted phrases.
    self._querystops = set(text.tokenize(query))
    # Minimum phrase length is derived from the query; note that the pl argument
    # is not used here.
    self._pl = calc_pl(query)
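# Hypothetical sketches of the `text` helpers and calc_pl referenced above; the
# real implementations live elsewhere and may differ. These stand-ins only
# illustrate the behavior the extractor assumes: markup stripping, lowercase
# tokenization, a capitalization test, and a query-derived minimum phrase length.
import re

def strip_enclosed_carrots(s):
    # Remove any <...> spans (e.g. inline markup) from the string.
    return re.sub(r"<[^>]*>", "", s)

def tokenize(s):
    # Lowercase and split into alphanumeric word tokens.
    return re.findall(r"\w+", s.lower())

def is_capitalized(token):
    # True when the surface token starts with an uppercase letter.
    return token[:1].isupper()

def calc_pl(query):
    # Placeholder minimum-phrase-length heuristic; a threshold of 1 keeps every
    # capitalized run, matching the pl=1 default in __init__ above.
    return 1

# With helpers like these, a record such as
#   {"title": "Ranking With <i>BERT</i>", "abstract": "We study Neural Ranking ..."}
# would yield its capitalized runs, minus query terms, as candidate phrases.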