def _collect_multifield(self, recIDs, termslist):
    """Calculate terms from many fields or tags.

    Used together with the multifield tokenizer: the tokenizing
    function is called with the record ID itself (it resolves the
    fields internally).

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    # Hoist the attribute lookup out of the loop.
    tokenizing_function = self.tokenizing_function
    for recID in recIDs:
        new_words = tokenizing_function(recID)
        # dict.get with a default replaces the original two-step
        # "if recID not in termslist: termslist[recID] = []" init.
        termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def _collect_recjson(self, recIDs, termslist):
    """Collect terms from recjson with use of bibfield.

    Used together with the recjson tokenizer: each record is fetched
    via get_record and the whole record object is tokenized. Records
    that cannot be fetched (falsy result) are skipped silently.

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    tokenizing_function = self.tokenizing_function
    for recID in recIDs:
        record = get_record(recID)
        if not record:
            # Best-effort: unfetchable records are skipped, as before.
            continue
        new_words = tokenizing_function(record)
        # dict.get with a default replaces the original two-step
        # membership-test-then-init pattern.
        termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def _collect_string(self, recIDs, termslist):
    """Collect terms from specific tags or fields.

    Used together with the string tokenizer: for every configured tag,
    fetch the (recID, phrase) rows and tokenize each phrase, merging
    the resulting terms into termslist. Rows whose recID is not in
    recIDs are ignored.

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    # Membership test runs once per returned row; a set makes it O(1)
    # instead of an O(n) list scan per row.
    wanted_recIDs = set(recIDs)
    for tag in self.tags:
        # Some tags carry a dedicated tokenizer; fall back to the default.
        tokenizing_function = self.special_tags.get(tag, self.tokenizing_function)
        for recID, phrase in self._get_phrases_for_tokenizing(tag, recIDs):
            if recID in wanted_recIDs:
                new_words = tokenizing_function(phrase)
                termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def _collect_string(self, recIDs, termslist):
    """Collect terms from specific tags or fields.

    Used together with the string tokenizer: for every configured tag,
    fetch the (recID, phrase) rows and tokenize each phrase, merging
    the resulting terms into termslist. Rows whose recID is not in
    recIDs are ignored.

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    # Membership test runs once per returned row; a set makes it O(1)
    # instead of an O(n) list scan per row.
    wanted_recIDs = set(recIDs)
    for tag in self.tags:
        # Some tags carry a dedicated tokenizer; fall back to the default.
        tokenizing_function = self.special_tags.get(
            tag, self.tokenizing_function)
        for recID, phrase in self._get_phrases_for_tokenizing(tag, recIDs):
            if recID in wanted_recIDs:
                new_words = tokenizing_function(phrase)
                termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def _collect_string(self, recIDs, termslist):
    """Collect terms from specific tags or fields.

    Used together with the string tokenizer (recjson flavour): each
    record is fetched once, every configured tag's values are gathered
    recursively and tokenized, and the accumulated terms are merged
    into termslist. Records producing no terms leave termslist
    untouched, matching the original behavior.

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    for recID in recIDs:
        rec = get_record(recID)
        new_words = []
        for tag in self.tags:
            # Some tags carry a dedicated tokenizer; fall back to the default.
            tokenizing_function = self.special_tags.get(tag, self.tokenizing_function)
            phrases = []
            # NOTE(review): rec.get(tag) may be None for absent tags —
            # get_values_recursively is presumably None-tolerant; confirm.
            get_values_recursively(rec.get(tag), phrases)
            for phrase in phrases:
                new_words.extend(tokenizing_function(phrase))
        # Single guarded merge replaces the original's duplicated
        # "if new_words" checks; dict.get supplies the empty default.
        if new_words:
            termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def _collect_string(self, recIDs, termslist):
    """Collect terms from specific tags or fields.

    Used together with the string tokenizer (recjson flavour): each
    record is fetched once, every configured tag's values are gathered
    recursively and tokenized, and the accumulated terms are merged
    into termslist. Records producing no terms leave termslist
    untouched, matching the original behavior.

    @param recIDs: iterable of record IDs to process
    @param termslist: dict mapping recID -> list of terms; updated
        in place and also returned
    @return: the (mutated) termslist dict
    """
    for recID in recIDs:
        rec = get_record(recID)
        new_words = []
        for tag in self.tags:
            # Some tags carry a dedicated tokenizer; fall back to the default.
            tokenizing_function = self.special_tags.get(
                tag, self.tokenizing_function)
            phrases = []
            # NOTE(review): rec.get(tag) may be None for absent tags —
            # get_values_recursively is presumably None-tolerant; confirm.
            get_values_recursively(rec.get(tag), phrases)
            for phrase in phrases:
                new_words.extend(tokenizing_function(phrase))
        # Single guarded merge replaces the original's duplicated
        # "if new_words" checks; dict.get supplies the empty default.
        if new_words:
            termslist[recID] = list_union(new_words, termslist.get(recID, []))
    return termslist
def test_list_union(self):
    """bibindex engine utils - list union"""
    merged = list_union([1, 2, 3], [1, 3, 4])
    expected = [1, 2, 3, 4]
    self.assertEqual(expected, merged)