예제 #1
0
 def __init__(self, max_duplicates=2, **kwargs):
     """Set up the cleanup model learner on top of the base Extractor.

     Every keyword option understood by Extractor is accepted and
     forwarded unchanged. One extra option is recognised here:
      - max_duplicates: maximum number of (near) identical documents in the set
     """

     # Base-class initialisation runs first, with all remaining options.
     Extractor.__init__(self, **kwargs)

     # HTML elements (paths and content) mapped to their counts;
     # starts out empty.
     self.elements = {}

     self.max_duplicates = max_duplicates
예제 #2
0
 def __init__(self, cleanup_model=None, cleanup_threshold=0.1, **kwargs):
     """Initialize the PageCleaner extractor and load its cleanup model.

     Takes standard parameters of Extractor, plus:
      - cleanup_model: filename of the model to load, or model itself
      - cleanup_threshold: 0 means less conservative, 1 means more conservative

     Raises:
      - AssertionError: if cleanup_model is missing (None or otherwise falsy)
     """

     Extractor.__init__(self, **kwargs)
     self.cleanup_model = cleanup_model
     self.cleanup_threshold = cleanup_threshold

     # Raise explicitly instead of using an `assert` statement: asserts are
     # compiled out under `python -O`, which would let a missing model slip
     # through to load_model(). AssertionError and the message are kept so
     # the failure callers observe is unchanged.
     if not self.cleanup_model:
         raise AssertionError("PageCleaner extractor requires a cleanup model")

     self.load_model(self.cleanup_model)
예제 #3
0
 def __init__(self, **kwargs):
     """Initialize the base Extractor, then reset this instance's state.

     All keyword options are forwarded to Extractor unchanged. Afterwards
     `pages` is an empty dict and `index` is None.
     """
     Extractor.__init__(self, **kwargs)
     self.pages, self.index = {}, None