def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
             tokenizer=None, at=None, queryor=False):
    """
    :param minsize: The minimum length of the N-grams.
    :param maxsize: The maximum length of the N-grams.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param field_boost: A boost factor applied to this field's postings.
    :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
        used to break the text into words.
    :param at: if 'start', only takes N-grams from the start of the word.
        If 'end', only takes N-grams from the end. Otherwise the default
        is to take all N-grams from each word.
    :param queryor: if True, combine the N-grams with an Or query. The
        default is to combine N-grams with an And query.
    """
    ngram_analyzer = NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at)
    self.queryor = queryor
    self.stored = stored
    self.format = Frequency(analyzer=ngram_analyzer,
                            field_boost=field_boost)
def test_readwrite():
    # Write a posting list with FilePostingWriter, read it back with
    # FilePostingReader, and check the (id, frequency) pairs round-trip.
    with TempStorage("readwrite") as st:
        fmt = Frequency()
        postings = make_postings()

        out_file = st.create_file("readwrite")
        writer = FilePostingWriter(out_file, blocklimit=8)
        writer.start(fmt)
        for doc_id, freq in postings:
            writer.write(doc_id, float(freq), fmt.encode(freq), 0)
        writer.finish()
        writer.close()

        in_file = st.open_file("readwrite")
        reader = FilePostingReader(in_file, 0, fmt)
        assert_equal(postings, list(reader.items_as("frequency")))
        in_file.close()
def new_field(self, field_name: str, field_data):
    """
    Add a new field to the current document.

    If the schema has not been defined yet, the field is first registered
    with the writer as a stored KEYWORD field carrying term-frequency
    vectors.

    Args:
        field_name (str): Name of the new field
        field_data: Data to put into the field
    """
    if not self.__schema_defined:
        field_type = KEYWORD(stored=True, vector=Frequency())
        self.__writer.add_field(field_name, field_type)
    self.__doc[field_name] = field_data
def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0):
    """
    :param minsize: The minimum length of the N-grams.
    :param maxsize: The maximum length of the N-grams.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param field_boost: A boost factor applied to this field's postings.
    """
    ngram_analyzer = NgramAnalyzer(minsize, maxsize)
    self.format = Frequency(analyzer=ngram_analyzer,
                            field_boost=field_boost)
    self.stored = stored
    self.scorable = True
def __init__(self, stored=False, lowercase=False, commas=False,
             scorable=False, unique=False, field_boost=1.0):
    """
    :param stored: Whether to store the value of the field with the
        document.
    :param lowercase: Whether to lowercase the tokens (passed through to
        the KeywordAnalyzer).
    :param commas: Whether this is a comma-separated field. If this is
        False (the default), it is treated as a space-separated field.
    :param scorable: Whether this field is scorable.
    :param unique: Whether the value of this field is unique
        per-document.
    :param field_boost: A boost factor applied to this field's postings.
    """
    # Docstring fix: the parameter is ``commas`` (the original docstring
    # documented a non-existent ``comma``); also document the previously
    # undocumented lowercase/unique/field_boost parameters.
    ana = KeywordAnalyzer(lowercase=lowercase, commas=commas)
    self.format = Frequency(analyzer=ana, field_boost=field_boost)
    self.scorable = scorable
    self.stored = stored
    self.unique = unique
def _schema(self):
    """Build the index schema implied by this object's mingram/maxgram.

    For every gram size in [mingram, maxgram] the schema gets three
    fields: ``start<size>``/``end<size>`` (ID fields) and ``gram<size>``
    (a frequency-scored field), plus stored ``word`` and ``score``.
    """
    from whoosh.fields import Schema, FieldType, ID, STORED
    from whoosh.formats import Frequency
    from whoosh.analysis import SimpleAnalyzer

    idtype = ID()
    freqtype = FieldType(Frequency(), SimpleAnalyzer())

    fields = {"word": STORED, "score": STORED}
    for size in xrange(self.mingram, self.maxgram + 1):
        fields["start%s" % size] = idtype
        fields["end%s" % size] = idtype
        fields["gram%s" % size] = freqtype
    return Schema(**fields)
def schema_type(self):
    """Return the field type used for this entry: a stored,
    comma-separated KEYWORD field with term-frequency vectors."""
    field_type = KEYWORD(stored=True, commas=True, vector=Frequency())
    return field_type
def test_frequency_postings():
    # Round-trip a short text through the Frequency posting format and
    # check the resulting (term, count) pairs.
    content = u("alfa bravo charlie bravo alfa alfa")
    expected = [("alfa", 3), ("bravo", 2), ("charlie", 1)]
    assert _roundtrip(content, Frequency(), "frequency") == expected