예제 #1
0
파일: fields.py 프로젝트: oier/Yaki
    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 tokenizer=None, at=None, queryor=False):
        """
        Configure the field to index word N-grams using the ``Frequency``
        format.

        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. This field type generally holds a lot of text, so avoid
            storing it unless you need to, for example to allow fast excerpts
            in the search results.
        :param field_boost: passed through to the ``Frequency`` format as its
            ``field_boost`` argument.
        :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
            used to break the text into words.
        :param at: if 'start', only takes N-grams from the start of the word.
            If 'end', only takes N-grams from the end. Otherwise the default
            is to take all N-grams from each word.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        """

        self.format = Frequency(
            analyzer=NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at),
            field_boost=field_boost)
        self.queryor = queryor
        self.stored = stored
예제 #2
0
def test_readwrite():
    """Round-trip frequency postings through a posting file.

    Writes the postings returned by ``make_postings()`` with a
    ``FilePostingWriter``, reads them back with a ``FilePostingReader`` and
    checks that the items read equal the items written.
    """
    with TempStorage("readwrite") as st:
        # Named ``fmt``/``post_id`` so the ``format`` and ``id`` builtins are
        # not shadowed.
        fmt = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(fmt)
        for post_id, freq in postings:
            fpw.write(post_id, float(freq), fmt.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        try:
            fpr = FilePostingReader(postfile, 0, fmt)
            assert_equal(postings, list(fpr.items_as("frequency")))
        finally:
            # Release the file handle even when the reader or the assertion
            # raises, so it is closed before the ``with`` block exits.
            postfile.close()
예제 #3
0
def test_readwrite():
    """Round-trip frequency postings through a posting file.

    Writes the postings returned by ``make_postings()`` with a
    ``FilePostingWriter``, reads them back with a ``FilePostingReader`` and
    checks that the items read equal the items written.
    """
    with TempStorage("readwrite") as st:
        # Named ``fmt``/``post_id`` so the ``format`` and ``id`` builtins are
        # not shadowed.
        fmt = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(fmt)
        for post_id, freq in postings:
            fpw.write(post_id, float(freq), fmt.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        try:
            fpr = FilePostingReader(postfile, 0, fmt)
            assert_equal(postings, list(fpr.items_as("frequency")))
        finally:
            # Release the file handle even when the reader or the assertion
            # raises, so it is closed before the ``with`` block exits.
            postfile.close()
예제 #4
0
    def new_field(self, field_name: str, field_data):
        """
        Record a value for a field in this object's document mapping.

        While the schema is not yet defined, the field is first registered on the writer as a stored
        KEYWORD field with a Frequency vector.

        Args:
            field_name (str): Name of the new field
            field_data: Data to put into the field
        """
        schema_pending = not self.__schema_defined
        if schema_pending:
            keyword_type = KEYWORD(stored=True, vector=Frequency())
            self.__writer.add_field(field_name, keyword_type)
        self.__doc[field_name] = field_data
예제 #5
0
    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0):
        """
        Configure the field to index N-grams using the ``Frequency`` format.
        The field is always marked as scorable.

        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. This field type generally holds a lot of text, so avoid
            storing it unless you need to, for example to allow fast excerpts
            in the search results.
        :param field_boost: passed through to the ``Frequency`` format as its
            ``field_boost`` argument.
        """

        ngram_analyzer = NgramAnalyzer(minsize, maxsize)
        self.format = Frequency(analyzer=ngram_analyzer,
                                field_boost=field_boost)
        self.stored = stored
        self.scorable = True
예제 #6
0
파일: fields.py 프로젝트: gnuaha7/tagfs
 def __init__(self, stored=False, lowercase=False, commas=False,
              scorable=False, unique=False, field_boost=1.0):
     """
     Configure the field to index keywords using the ``Frequency`` format.

     :param stored: Whether to store the value of the field with the
         document.
     :param lowercase: passed through to ``KeywordAnalyzer`` as its
         ``lowercase`` argument.
     :param commas: Whether this is a comma-separated field. If this is
         False (the default), it is treated as a space-separated field.
     :param scorable: Whether this field is scorable.
     :param unique: saved as the field's ``unique`` attribute.
     :param field_boost: passed through to the ``Frequency`` format as its
         ``field_boost`` argument.
     """

     self.format = Frequency(
         analyzer=KeywordAnalyzer(lowercase=lowercase, commas=commas),
         field_boost=field_boost)
     self.stored = stored
     self.scorable = scorable
     self.unique = unique
예제 #7
0
    def _schema(self):
        """Build a schema from this object's ``mingram``/``maxgram`` attributes.

        The schema has ``word`` and ``score`` fields of type ``STORED`` plus,
        for every size from ``mingram`` through ``maxgram`` inclusive, a
        ``start<size>`` and an ``end<size>`` ID field and a ``gram<size>``
        field that uses the Frequency format with a SimpleAnalyzer.
        """

        from whoosh.fields import Schema, FieldType, ID, STORED
        from whoosh.formats import Frequency
        from whoosh.analysis import SimpleAnalyzer

        # One instance of each field type is shared by all sizes.
        id_field = ID()
        gram_field = FieldType(Frequency(), SimpleAnalyzer())

        fields = {"word": STORED, "score": STORED}
        for n in xrange(self.mingram, self.maxgram + 1):
            fields["start%s" % n] = id_field
            fields["end%s" % n] = id_field
            fields["gram%s" % n] = gram_field

        return Schema(**fields)
예제 #8
0
 def schema_type(self):
     """Return a KEYWORD field type built with ``stored=True``,
     ``commas=True`` and a ``Frequency`` vector."""
     keyword_type = KEYWORD(stored=True, commas=True, vector=Frequency())
     return keyword_type
예제 #9
0
def test_frequency_postings():
    """Check the "frequency" round-trip result for a text with repeated words."""
    text = u("alfa bravo charlie bravo alfa alfa")
    expected = [("alfa", 3), ("bravo", 2), ("charlie", 1)]
    actual = _roundtrip(text, Frequency(), "frequency")
    assert actual == expected