Example #1
@classmethod
def make_from_serializable(cls, obj):
    # Upgrade older serialized formats before reconstructing the object.
    obj = cls.make_serialized_form_compatible_with_newer_version(obj)
    res = IndexingPrePostProcessor(voc_limit=obj["voc_limit"])
    res.indexer = Indexer.make_from_serializable(obj["indexer"])
    # The preprocessor entry is optional in the serialized form.
    if "preprocessor" in obj:
        res.preprocessor = PreProcessor.make_from_serializable(
            obj["preprocessor"])
    res.is_initialized_ = True
    return res
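The method above is one half of a to_serializable / make_from_serializable round trip. The following is a minimal sketch of that pattern with a self-contained stand-in class; MiniProcessor and its fields are illustrative only and are not part of the library above.

class MiniProcessor(object):
    def __init__(self, voc_limit=None):
        self.voc_limit = voc_limit
        self.is_initialized_ = False

    def to_serializable(self):
        # Dump the state needed to rebuild the object later.
        return {"voc_limit": self.voc_limit}

    @classmethod
    def make_from_serializable(cls, obj):
        # Rebuild an initialized instance from the serialized dict.
        res = cls(voc_limit=obj["voc_limit"])
        res.is_initialized_ = True
        return res

restored = MiniProcessor.make_from_serializable(MiniProcessor(1000).to_serializable())
assert restored.voc_limit == 1000 and restored.is_initialized_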
Example #2
@classmethod
def make_serialized_form_compatible_with_newer_version(cls, obj):
    # Upgrade older on-disk formats to the current serialized layout.
    # (Relies on the standard-library `copy` module being imported.)
    if Indexer.check_if_data_indexer(obj):
        # Oldest format: the serialized object is the indexer itself.
        new_obj = cls.make_base_serializable_object()
        new_obj["indexer"] = obj
        new_obj["voc_limit"] = len(obj)
        ss = SimpleSegmenter("word")
        ss.initialize(None)
        new_obj["preprocessor"] = ss.to_serializable()

    elif "processors_list" in obj:
        # Intermediate format: a chain of processors; keep the last one
        # and nest the remaining chain under "preprocessor".
        new_obj = obj["processors_list"][-1]
        if len(obj["processors_list"]) > 1:
            preproc_obj = copy.deepcopy(obj)
            preproc_obj["processors_list"] = preproc_obj[
                "processors_list"][:-1]
            new_obj["preprocessor"] = preproc_obj
    else:
        # Already in the current format.
        new_obj = obj
    return new_obj
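To make the "processors_list" branch concrete, here is a dict-only walk-through of the same transformation. The keys mirror the snippet above; the values ("type", 30000) are made up for illustration.

import copy

# Old chained format: two serialized processors in a list.
old = {"processors_list": [{"type": "segmenter"},
                           {"type": "indexer", "voc_limit": 30000}]}

new_obj = old["processors_list"][-1]
if len(old["processors_list"]) > 1:
    preproc_obj = copy.deepcopy(old)
    preproc_obj["processors_list"] = preproc_obj["processors_list"][:-1]
    new_obj["preprocessor"] = preproc_obj

# new_obj now holds the last processor, with the earlier part of the chain
# nested under "preprocessor", matching the newer single-processor layout.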
Example #3
import collections
import operator

import six


def build_index_from_iterable(iterable, voc_limit=None):
    # Count word occurrences over the whole (tokenized) iterable.
    counts = collections.defaultdict(int)
    for num_ex, line in enumerate(iterable):
        for w in line:
            counts[w] += 1

    # Sort words by decreasing frequency.
    sorted_counts = sorted(six.iteritems(counts),
                           key=operator.itemgetter(1),
                           reverse=True)

    res = Indexer()

    # Keep only the voc_limit most frequent words (all words if None).
    for w, _ in sorted_counts[:voc_limit]:
        res.add_word(w, should_be_new=True)
    res.finalize()

    return res
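A short usage sketch, assuming the Indexer class used above is importable in this context; the toy corpus is made up for illustration.

corpus = [["the", "cat", "sat"],
          ["the", "dog", "sat"],
          ["the", "cat", "ran"]]

indexer = build_index_from_iterable(corpus, voc_limit=3)
# With voc_limit=3 only the three most frequent words are indexed
# ("the", "cat", "sat" here); less frequent words are left out.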