Пример #1
0
 def _collect_multifield(self, recIDs, termslist):
     """
     Merge the terms computed for each record ID into termslist.

     Each ID in recIDs is passed to self.tokenizing_function; the
     returned terms are combined through list_union with whatever is
     already stored for that ID (an empty list when the ID is new).
     Used together with the multifield tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tokenize = self.tokenizing_function
     for recID in recIDs:
         tokens = tokenize(recID)
         # setdefault stores the empty list for unseen IDs and hands
         # back the stored list either way.
         known_terms = termslist.setdefault(recID, [])
         termslist[recID] = list_union(tokens, known_terms)
     return termslist
 def _collect_multifield(self, recIDs, termslist):
     """
     Accumulate, per record ID, the terms yielded by the tokenizer.

     self.tokenizing_function is called with the record ID itself and
     its output is united (list_union) with the terms termslist
     already holds for that ID; a missing ID starts from an empty
     list.  Used together with the multifield tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tokenize = self.tokenizing_function
     for recID in recIDs:
         fresh_terms = tokenize(recID)
         # Register unseen IDs with an empty list before merging.
         stored_terms = termslist.setdefault(recID, [])
         termslist[recID] = list_union(fresh_terms, stored_terms)
     return termslist
Пример #3
0
 def _collect_recjson(self, recIDs, termslist):
     """
     Merge the terms tokenized from whole records into termslist.

     For every ID in recIDs the record is fetched with get_record;
     falsy results are skipped.  The record is passed to
     self.tokenizing_function and the resulting terms are combined
     through list_union with those already stored for the ID.
     Used together with the recjson tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tokenize = self.tokenizing_function
     for recID in recIDs:
         record = get_record(recID)
         if not record:
             # Nothing to tokenize; termslist is left untouched.
             continue
         tokens = tokenize(record)
         known_terms = termslist.setdefault(recID, [])
         termslist[recID] = list_union(tokens, known_terms)
     return termslist
 def _collect_recjson(self, recIDs, termslist):
     """
     Accumulate, per record ID, the terms tokenized from its record.

     get_record is called for each ID; when it returns a falsy value
     the ID is ignored.  Otherwise self.tokenizing_function receives
     the record and its output is united (list_union) with the terms
     termslist already holds for that ID.
     Used together with the recjson tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tokenize = self.tokenizing_function
     for recID in recIDs:
         record = get_record(recID)
         if not record:
             # Falsy record: no entry is created for this ID.
             continue
         fresh_terms = tokenize(record)
         stored_terms = termslist.setdefault(recID, [])
         termslist[recID] = list_union(fresh_terms, stored_terms)
     return termslist
 def _collect_string(self, recIDs, termslist):
     """
     Merge the terms tokenized from per-tag phrases into termslist.

     For each tag in self.tags the tokenizer is looked up in
     self.special_tags (falling back to self.tokenizing_function) and
     the (recID, phrase) rows come from
     self._get_phrases_for_tokenizing.  Rows whose recID is not in
     recIDs are ignored; the others have their phrase tokenized and
     combined through list_union with the terms already stored.
     Used together with the string tokenizer.

     :param recIDs: collection of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     for tag in self.tags:
         tokenize = self.special_tags.get(tag, self.tokenizing_function)
         for recID, phrase in self._get_phrases_for_tokenizing(tag, recIDs):
             if recID not in recIDs:
                 continue
             # The entry is created before tokenizing, so a record ID
             # is registered even if its phrase yields no terms.
             known_terms = termslist.setdefault(recID, [])
             tokens = tokenize(phrase)
             termslist[recID] = list_union(tokens, known_terms)
     return termslist
Пример #6
0
 def _collect_string(self, recIDs, termslist):
     """
     Accumulate, per record ID, the terms found in each tag's phrases.

     Every tag in self.tags gets its tokenizer from self.special_tags,
     defaulting to self.tokenizing_function.  The (recID, phrase) rows
     returned by self._get_phrases_for_tokenizing are filtered to the
     IDs present in recIDs; each remaining phrase is tokenized and the
     result united (list_union) with the terms already stored.
     Used together with the string tokenizer.

     :param recIDs: collection of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     for tag in self.tags:
         tokenize = self.special_tags.get(tag, self.tokenizing_function)
         rows = self._get_phrases_for_tokenizing(tag, recIDs)
         for recID, phrase in rows:
             if recID not in recIDs:
                 continue
             # Register the ID first, then tokenize and merge.
             stored_terms = termslist.setdefault(recID, [])
             fresh_terms = tokenize(phrase)
             termslist[recID] = list_union(fresh_terms, stored_terms)
     return termslist
 def _collect_string(self, recIDs, termslist):
     """
     Collect terms from specific tags or fields of each record.

     For every ID in recIDs the record is fetched with get_record and,
     for each tag in self.tags, the values found under that tag are
     gathered with get_values_recursively and tokenized.  The
     tokenizer is taken from self.special_tags, falling back to
     self.tokenizing_function.  When at least one term was produced,
     the terms are merged through list_union with those already
     stored for the ID.  IDs for which get_record returns None are
     skipped rather than raising AttributeError.
     Used together with the string tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tags = self.tags
     for recID in recIDs:
         rec = get_record(recID)
         if rec is None:
             # rec.get() below would raise AttributeError on None and
             # abort the whole batch; skip the ID instead.
             continue
         new_words = []
         for tag in tags:
             tokenizing_function = self.special_tags.get(
                 tag, self.tokenizing_function)
             phrases = []
             # get_values_recursively fills `phrases` in place.
             get_values_recursively(rec.get(tag), phrases)
             for phrase in phrases:
                 new_words.extend(tokenizing_function(phrase))
         if new_words:
             # Only records that produced terms get an entry.
             termslist[recID] = list_union(
                 new_words, termslist.setdefault(recID, []))
     return termslist
Пример #8
0
 def _collect_string(self, recIDs, termslist):
     """
     Collect terms from specific tags or fields of each record.

     Each ID in recIDs is resolved with get_record.  For every tag in
     self.tags the values stored under that tag are flattened into a
     list by get_values_recursively and tokenized with the tag's
     entry in self.special_tags, or self.tokenizing_function when the
     tag has none.  If any terms were produced they are united
     (list_union) with the terms termslist already holds for the ID.
     An ID whose get_record result is None is skipped instead of
     failing with AttributeError.
     Used together with the string tokenizer.

     :param recIDs: iterable of record identifiers to process
     :param termslist: dict recID -> list of terms, updated in place
     :return: the same termslist object
     """
     tags = self.tags
     for recID in recIDs:
         rec = get_record(recID)
         if rec is None:
             # Without this guard rec.get() raises AttributeError and
             # no later ID in the batch is processed.
             continue
         new_words = []
         for tag in tags:
             tokenizing_function = self.special_tags.get(
                 tag, self.tokenizing_function)
             phrases = []
             # `phrases` is populated in place by the helper.
             get_values_recursively(rec.get(tag), phrases)
             for phrase in phrases:
                 new_words.extend(tokenizing_function(phrase))
         if new_words:
             # Records yielding no terms leave termslist untouched.
             termslist[recID] = list_union(
                 new_words, termslist.setdefault(recID, []))
     return termslist
 def test_list_union(self):
     """bibindex engine utils - list union"""
     # Two overlapping lists must merge into one list that holds each
     # element exactly once.
     merged = list_union([1, 2, 3], [1, 3, 4])
     self.assertEqual([1, 2, 3, 4], merged)
Пример #10
0
 def test_list_union(self):
     """bibindex engine utils - list union"""
     # Elements shared by both inputs (1 and 3) must appear only once
     # in the union.
     expected = [1, 2, 3, 4]
     actual = list_union([1, 2, 3], [1, 3, 4])
     self.assertEqual(expected, actual)