def __init__(self, max_duplicates=2, **kwargs):
    """Set up the cleanup model learner.

    Accepts the standard Extractor options (forwarded unchanged via
    **kwargs), plus:
     - max_duplicates: maximum number of (near) identical documents
       in the set
    """
    Extractor.__init__(self, **kwargs)
    self.max_duplicates = max_duplicates
    ## HTML elements (paths and content) mapped to their counts;
    ## starts out empty
    self.elements = {}
def __init__(self, cleanup_model=None, cleanup_threshold=0.1, **kwargs):
    """Initialize the PageCleaner extractor and load its cleanup model.

    Takes standard parameters of Extractor, plus:
     - cleanup_model: filename of the model to load, or model itself
       (required, despite the None default)
     - cleanup_threshold: 0 means less conservative, 1 means more
       conservative

    Raises AssertionError if cleanup_model is missing (falsy).
    """
    Extractor.__init__(self, **kwargs)
    self.cleanup_model = cleanup_model
    self.cleanup_threshold = cleanup_threshold
    ## Explicit raise rather than an `assert` statement: asserts are
    ## stripped under `python -O`, which would let a missing model reach
    ## load_model(). AssertionError is kept so that existing callers see
    ## the same exception type and message as before.
    if not self.cleanup_model:
        raise AssertionError("PageCleaner extractor requires a cleanup model")
    self.load_model(self.cleanup_model)
def __init__(self, **kwargs):
    """Initialize via Extractor, then start with no pages and no index.

    All keyword arguments are forwarded unchanged to Extractor.__init__.
    """
    Extractor.__init__(self, **kwargs)
    ## empty page mapping; nothing is stored at construction time
    self.pages = dict()
    ## no index exists yet
    self.index = None