예제 #1
0
 def extract(self, r):
   """Return phrases built from runs of consecutive capitalized tokens.

   The record's "title" and "abstract" fields are cleaned with
   text.strip_enclosed_carrots, joined with a single space and split on
   whitespace. Each capitalized token (per text.is_capitalized) is
   tokenized with text.tokenize and its words, minus any found in
   self._querystops, are added to the current run. A non-capitalized
   token ends the run; the run is kept only if it has at least
   self._pl words.

   Args:
     r: mapping indexed with the keys "title" and "abstract".
       # NOTE(review): presumably both values are strings -- confirm
       # against the caller.

   Returns:
     A list of phrases in order of appearance, each one the words of a
     run joined by single spaces.
   """
   content = text.strip_enclosed_carrots(r["title"]) + " " + text.strip_enclosed_carrots(r["abstract"])
   stops = self._querystops
   min_len = self._pl
   phrases = []
   current = []
   for ct in content.split():
     if text.is_capitalized(ct):
       # A capitalized token whose words are all filtered out adds
       # nothing but does not end the run either.
       current.extend(w for w in text.tokenize(ct) if w not in stops)
       continue
     if current:
       if len(current) >= min_len:
         phrases.append(" ".join(current))
       current = []
   # Flush the run that is still open when the text ends on a capitalized
   # token; without this the final phrase would be silently lost.
   if current and len(current) >= min_len:
     phrases.append(" ".join(current))
   return phrases
예제 #2
0
 def __init__(self, query, pl=1):
   """Set up the extractor state derived from the query.

   Args:
     query: value passed to both text.tokenize and calc_pl.
     pl: accepted but not read by this constructor.
       # NOTE(review): _pl is always taken from calc_pl(query), so a
       # caller-supplied pl has no effect -- confirm this is intended.
   """
   # Tokens of the query, kept as a set for membership tests.
   query_tokens = text.tokenize(query)
   self._querystops = set(query_tokens)
   # Phrase-length threshold, computed from the query itself.
   computed_pl = calc_pl(query)
   self._pl = computed_pl