def to_bytes(self, value):
    """Convert ``value`` into the bytes representation written to disk.

    The default implementation assumes a unicode value: given a list or
    tuple it takes the first element, and it UTF-8-encodes the result
    unless it is already a bytestring.
    """
    # A sequence value contributes only its first element
    item = value[0] if isinstance(value, (list, tuple)) else value
    if isinstance(item, bytes_type):
        return item
    return utf8encode(item)[0]
def to_labels(key):
    """Takes a string and returns a list of bytestrings, suitable for use as
    a key or path in an FSA/FST graph.
    """
    # NOTE: exact-type test on purpose -- subclasses of tuple/list are not
    # treated as label sequences
    if type(key) in (tuple, list):
        for element in key:
            if not isinstance(element, bytes_type):
                raise TypeError("%r contains a non-bytestring" % key)
        # tuple(t) returns t itself when t is already a tuple, so this is
        # a no-op for tuples and a conversion for lists
        return tuple(key)

    if isinstance(key, bytes_type):
        # Split into length-1 bytestrings; slicing (rather than indexing)
        # keeps the elements as bytes on both Python 2 and 3
        return tuple(key[pos:pos + 1] for pos in xrange(len(key)))

    if isinstance(key, text_type):
        # Encode each character separately so every label is a bytestring
        return tuple(utf8encode(key[pos:pos + 1])[0]
                     for pos in xrange(len(key)))

    raise TypeError("Don't know how to convert %r" % key)
def index(self, value, **kwargs):
    """Yields a (btext, frequency, weight, encoded_value) tuple for each
    unique word in the input value.

    The default implementation tokenizes the value into strings using the
    ``analyzer`` attribute, then encodes each token into bytes using UTF-8.
    """
    if not self.format:
        raise Exception("%s field %r cannot index without a format"
                        % (self.__class__.__name__, self))
    if not isinstance(value, (text_type, list, tuple)):
        raise ValueError("%r is not unicode or sequence" % value)
    assert isinstance(self.format, formats.Format)

    # Default to indexing mode unless the caller specified otherwise
    kwargs.setdefault("mode", "index")

    analyzer = self.analyzer
    for token, frequency, weight, valbytes in self.format.word_values(
            value, analyzer, **kwargs):
        yield (utf8encode(token)[0], frequency, weight, valbytes)
def add_document(self, **fields):
    """Adds a document to the index: posts each indexed field's terms to
    the pool, writes vectors, spelling words, stored values, field
    lengths, and column values.

    Keyword names are field names; keys starting with an underscore
    (e.g. ``_stored_<name>``) are directives rather than field values.
    ``None`` values are skipped entirely.
    """
    self._check_state()
    perdocwriter = self.perdocwriter
    schema = self.schema
    docnum = self.docnum
    add_post = self.pool.add

    docboost = self._doc_boost(fields)
    # Underscore-prefixed keywords are directives, not field values;
    # sort for a deterministic processing order
    fieldnames = sorted(name for name in fields
                        if not name.startswith("_"))
    self._check_fields(schema, fieldnames)

    perdocwriter.start_doc(docnum)
    for fieldname in fieldnames:
        value = fields.get(fieldname)
        if value is None:
            continue
        field = schema[fieldname]

        length = 0
        if field.indexed:
            # TODO: Method for adding progressive field values, ie
            # setting start_pos/start_char?
            fieldboost = self._field_boost(fields, fieldname, docboost)
            # Ask the field to return a list of
            # (text, freq, weight, vbytes) tuples
            items = field.index(value)
            # Only store the length if the field is marked scorable
            scorable = field.scorable
            # Add the terms to the pool
            for tbytes, freq, weight, vbytes in items:
                weight *= fieldboost
                if scorable:
                    length += freq
                add_post((fieldname, tbytes, docnum, weight, vbytes))

        if field.separate_spelling():
            spellfield = field.spelling_fieldname(fieldname)
            for word in field.spellable_words(value):
                word = utf8encode(word)[0]
                # item = (fieldname, tbytes, docnum, weight, vbytes)
                # Spelling postings carry no payload, so post the empty
                # sentinel. (Previously this reused the leftover
                # ``vbytes`` loop variable, which is unbound when the
                # field produced no postings and otherwise stored a
                # stale value.)
                add_post((spellfield, word, 0, 1, emptybytes))

        vformat = field.vector
        if vformat:
            analyzer = field.analyzer
            # Call the format's word_values method to get posting values
            vitems = vformat.word_values(value, analyzer, mode="index")
            # Remove unused frequency field from the tuple
            vitems = sorted((text, weight, vbytes)
                            for text, _, weight, vbytes in vitems)
            perdocwriter.add_vector_items(fieldname, field, vitems)

        # Allow a custom value for stored field/column via the
        # "_stored_<fieldname>" directive
        customval = fields.get("_stored_%s" % fieldname, value)

        # Add the stored value and length for this field to the per-
        # document writer
        sv = customval if field.stored else None
        perdocwriter.add_field(fieldname, field, sv, length)

        column = field.column_type
        if column and customval is not None:
            cv = field.to_column_value(customval)
            perdocwriter.add_column_value(fieldname, column, cv)

    perdocwriter.finish_doc()
    self._added = True
    self.docnum += 1
def add_document(self, **fields):
    """Adds a document to the index: posts each indexed field's terms to
    the pool, writes vectors, spelling words, stored values, field
    lengths, and column values.

    Keyword names are field names; keys starting with an underscore are
    directives (e.g. ``_stored_<name>``), not field values. ``None``
    values are skipped entirely.
    """
    self._check_state()
    perdocwriter = self.perdocwriter
    schema = self.schema
    docnum = self.docnum
    add_post = self.pool.add
    docboost = self._doc_boost(fields)
    # Underscore-prefixed keywords are directives, not field values;
    # sort for a deterministic processing order
    fieldnames = sorted(
        [name for name in fields.keys() if not name.startswith("_")])
    self._check_fields(schema, fieldnames)
    perdocwriter.start_doc(docnum)
    for fieldname in fieldnames:
        value = fields.get(fieldname)
        if value is None:
            continue
        field = schema[fieldname]
        length = 0
        if field.indexed:
            # TODO: Method for adding progressive field values, ie
            # setting start_pos/start_char?
            fieldboost = self._field_boost(fields, fieldname, docboost)
            # Ask the field to return a list of (text, weight, vbytes)
            # tuples
            items = field.index(value)
            # Only store the length if the field is marked scorable
            scorable = field.scorable
            # Add the terms to the pool
            for tbytes, freq, weight, vbytes in items:
                weight *= fieldboost
                if scorable:
                    length += freq
                add_post((fieldname, tbytes, docnum, weight, vbytes))
        if field.separate_spelling():
            # For fields which use different morphemes for spelling,
            # insert fake postings for the spellable words, where
            # docnum=-1 means "this is a spelling word"
            # TODO: think of something less hacktacular
            for word in field.spellable_words(value):
                word = utf8encode(word)[0]
                # weight=-1 and the empty payload mark the sentinel too
                add_post((fieldname, word, -1, -1, emptybytes))
        vformat = field.vector
        if vformat:
            analyzer = field.analyzer
            # Call the format's word_values method to get posting values
            vitems = vformat.word_values(value, analyzer, mode="index")
            # Remove unused frequency field from the tuple
            vitems = sorted((text, weight, vbytes)
                            for text, _, weight, vbytes in vitems)
            perdocwriter.add_vector_items(fieldname, field, vitems)
        # Allow a custom value for stored field/column via the
        # "_stored_<fieldname>" directive
        customval = fields.get("_stored_%s" % fieldname, value)
        # Add the stored value and length for this field to the per-
        # document writer
        sv = customval if field.stored else None
        perdocwriter.add_field(fieldname, field, sv, length)
        column = field.column_type
        if column and customval is not None:
            cv = field.to_column_value(customval)
            perdocwriter.add_column_value(fieldname, column, cv)
    perdocwriter.finish_doc()
    self._added = True
    self.docnum += 1