示例#1
0
 def __init__(self):
     """Initialise the base HTMLParser and give every attribute its starting value."""
     HTMLParser.__init__(self)
     # Tag tracking starts out empty.
     self.current_tag = None
     self.tag_count = 0
     self.hide_output = False
     # Two separate list objects -- they must not alias each other.
     self.buf, self.last_text = [], []
示例#2
0
文件: toc.py 项目: ScxFiction/mkdocs
    def __init__(self):
        """Initialise the base HTMLParser and reset the link/anchor state."""
        HTMLParser.__init__(self)

        # State for the anchor currently being read (none yet).
        self.title = ''
        self.attrs = None
        self.in_anchor = False

        # Starts empty.
        self.links = []
示例#3
0
 def __init__(self, baseURL):
   """Initialise the base HTMLParser and store ``baseURL`` on the instance.

   The stack, the link list and the anchor set all start empty, and the
   ``printed`` flag starts as False.
   """
   HTMLParser.__init__(self)
   self.baseURL = baseURL
   self.printed = False
   self.anchors = set()
   # Two distinct lists; they must not share one object.
   self.stack, self.links = [], []
 def __init__(self, **kwargs):
     """Initialise the base HTMLParser and keep ``kwargs`` for later use.

     The keyword arguments are stored on ``self.kwargs``; they are not
     forwarded to ``HTMLParser.__init__``.
     """
     HTMLParser.__init__(self)
     self.kwargs = kwargs
     self.rows = []
     self.last_content = ""
     self.active = None
     self.found_first_valid_num = False
 def __init__(self, builder=None, encoding=None):
     """Store the builder and encoding, then initialise the base HTMLParser.

     ``builder`` defaults to a new ``ElementTree.TreeBuilder()`` and
     ``encoding`` falls back to "iso-8859-1" when it is falsy.  The
     attributes are set before ``HTMLParser.__init__`` runs.
     """
     self.encoding = encoding if encoding else "iso-8859-1"
     self.__builder = ElementTree.TreeBuilder() if builder is None else builder
     self.__stack = []
     HTMLParser.__init__(self)
示例#6
0
	def __init__(self):
		"""Warn that ParseLinks is deprecated, then set up the parser state."""
		deprecation_msg = "portage.getbinpkg.ParseLinks is deprecated"
		# stacklevel=2 attributes the warning to the code that instantiates the class.
		warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)

		self.PL_anchors = []
		html_parser_HTMLParser.__init__(self)
 def __init__(self):
     """Initialise the base HTMLParser and the collection settings."""
     HTMLParser.__init__(self)
     # Tag name and (attribute, value) pair stored as the parser's target description.
     self.des_tag, self.des_attr = "div", ("id", "content")
     self.bound = 20
     self.collect_data = False
     self.stations_info = None
示例#8
0
 def __init__(self):
     """Initialise the base HTMLParser and start in ``State.NOWHERE``."""
     HTMLParser.__init__(self)
     self.state = State.NOWHERE
     # Nothing has been read yet.
     self.pnpid = self.company = None
     self.data = ""
     self.table = []
示例#9
0
 def __init__(self):
     """Initialise the base HTMLParser and the form-related attributes."""
     HTMLParser.__init__(self)
     # Both form flags start cleared.
     self.in_form = self.form_parsed = False
     self.method = "GET"
     self.url = None
     self.params = {}
示例#10
0
 def __init__(self):
     """Initialise the base HTMLParser and empty all working containers."""
     HTMLParser.__init__(self)
     self.done = False
     self.href = ""
     self.state = []
     # Separate dict objects for `obj` and `index`.
     self.obj, self.index = {}, {}
示例#11
0
 def __init__(self):
     """Initialise the base HTMLParser, the recording flags and the data dict."""
     HTMLParser.__init__(self)
     # Neither recording flag is set at the start.
     self.recordingAuthor = self.recordingBody = False
     # The "body" entry exists from the start as an empty string.
     self.data = {"body": ""}
     self.save_tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5']
示例#12
0
文件: main.py 项目: 0x1p2/spider-py
 def __init__(self, args):
     """Initialise the base HTMLParser and configure the spider from ``args``.

     ``args`` must expose ``URL``, ``depth``, ``time``, ``db``, ``coll``,
     ``sub`` and ``verbose``.  A ``MongoClient()`` is created and indexed by
     ``args.db`` and ``args.coll``.  ``self.queryDB()`` is called last, once
     every attribute exists.
     """
     HTMLParser.__init__(self)

     # --- Settings taken from the arguments ---
     # Original URL passed.
     self.root_url = args.URL
     # Netloc of the URL.
     self.netloc = urllib.parse.urlparse(self.root_url).netloc
     # Distance (pages) to travel.
     self.depth = args.depth
     # Amount of time per page.
     self.timer = args.time
     # Database that stores data.
     self.db = MongoClient()[args.db][args.coll]
     # Subdirectory to set as root of webpage.
     self.sub = args.sub
     # Verbosity setting.
     self.verbose = args.verbose

     # --- Crawl bookkeeping ---
     self.key_terms = ["buy", "sell", "trade", "trading"]
     # Amount of pages processed / amount of posts scanned.
     self.count = self.posts = 0
     # Already completed URLs.
     self.urlBlacklist = []
     # Do-not-use URLs (duplicates).
     self.urlDNU = []
     # List of URLs to scan, seeded with the root URL.
     self.urlList = [self.root_url]
     # Items to look for.
     self.items = []
     # Items discovered + [URLs].
     self.discovered = {}
     # Dictionary containing ThreadID + [URLs].
     self.BigDict = {}

     # --- Parse-state flags ---
     # Start of play contribution, start of the message, start of the quote
     # container, start of QuoteExpand and start of the quote message.
     self.li_main = self.blockquote_main = False
     self.div_quote_main = self.div_quote_xpand = False
     self.blockquote_quote = False
     # Locks the ability to print text or use it.
     self.text_lock = True
     # Name of the original author / name of the person being quoted.
     self.li_name = self.blockquote_name = None

     # Loads the self.items list.
     self.queryDB()
	def __init__(self, strict=False):
		"""Initialise the base HTMLParser and reset every extraction variable.

		Args:
			strict: Forwarded to ``HTMLParser.__init__`` only on Python
				3.0-3.4.  The argument was removed from the base class in
				Python 3.5, so it is accepted but ignored there.
		"""
		import sys

		# Constructor call of parent class.  Passing `strict` to an interpreter
		# whose HTMLParser has no such argument raises TypeError.
		if (3, 0) <= sys.version_info < (3, 5):
			HTMLParser.__init__(self, strict)
		else:
			HTMLParser.__init__(self)

		# Defining variables of this class.
		# There are 3 types of variables.
		# data_variables: these are the required information
		# data_check_variables: boolean values corresponding to each of the
		# 		data_variables, to keep a check on whether the data has
		# 		already been extracted or not.
		# tag_check_variables: used for matching the proper format.
		self.h1 = False
		self.desc = False
		self.description = ' '
		self.sol = False
		self.solution = ' '
		self.p = False
		self.li = False
		self.ref = False
		self.references = ' '
		self.cvss = False
		self.cvss_score = 0.0
		self.cve = False
		self.cve_id = ' '
		self.links = []
		self.prod = False
		self.products = []
		self.last_h6 = ' '
		self.h6 = False
		self.h7 = False
		self.clas = False
		self.attack_from = ' '
		self.attk = False
		self.impact = ' '
		self.impt = False
示例#14
0
 def __init__(self):
     """Initialise the base HTMLParser with character-reference conversion off."""
     HTMLParser.__init__(self)
     # Must be assigned after the base initialiser, which sets its own value.
     self.convert_charrefs = False
     self.in_pre = False
     self.last = "starttag"
     self.output = self.last_tag = ""
示例#15
0
 def __init__(self, results, url, trackers):
     """Initialise the base HTMLParser and store the three arguments as given."""
     HTMLParser.__init__(self)
     self.results, self.url, self.trackers = results, url, trackers
     # No cell counter and no item in progress yet.
     self.td_counter = self.current_item = None
示例#16
0
 def __init__(self, base_href):
     """Initialise the base HTMLParser, store ``base_href`` and reset state.

     ``self._clear_info()`` is called last, after every attribute is set.
     """
     HTMLParser.__init__(self)
     self.base_href = base_href
     self.results = {}
     # Group fields start unset.
     self.group_name = None
     self.group_desc = None
     # The three `in_*` fields start at 0.
     self.in_group_name = 0
     self.in_group_desc = 0
     self.in_activity = 0
     self._clear_info()
示例#17
0
 def __init__(self, url):
     """Initialise the base HTMLParser, store ``url`` and reset item state."""
     HTMLParser.__init__(self)
     self.url = url
     # Object returned by re_compile(","), built once per parser.
     self.size_repl = re_compile(",")
     self.seeds_leech = False
     self.current_item = self.save_data = None
    def __init__(self, zip_file):
        """Initialise the base HTMLParser and keep a reference to ``zip_file``."""
        HTMLParser.__init__(self)
        self._zip_file = zip_file

        # buffer for the processed HTML
        self._html = StringIO()

        # used to exclude the contents of script and object tags
        self._excl_nested_level = 0
示例#19
0
 def __init__(self):
   """Initialise the base parser and start with an empty title list.

   The way the base class is initialised depends on the ``py3`` flag.
   """
   if not py3:
       HTMLParser.__init__(self)
   else:
       super().__init__()
   # a list is used to store literal bytes and escaped Unicode
   self.title = []
示例#20
0
	def __init__(self, strict=False, reps=None, outs=None, sc=True):
		"""Store the collaborators, then initialise the base HTMLParser.

		Args:
			strict: Forwarded to ``HTMLParser.__init__`` only on Python
				3.0-3.4.  The argument was removed in Python 3.5, so it is
				accepted but ignored there.
			reps: Stored as ``self.rep``.  When it is not None, its
				``parser`` attribute is set to this instance.
			outs: Stored as ``self.outStream``.
			sc: Stored as ``self.stripComment``.
		"""
		import sys

		self.rep = reps
		self.outStream = outs
		self.stripComment = sc
		# With the default reps=None there is nothing to link back to; setting
		# an attribute on None would raise AttributeError.
		if reps is not None:
			reps.parser = self

		# Passing `strict` to an interpreter whose HTMLParser has no such
		# argument raises TypeError.
		if (3, 0) <= sys.version_info < (3, 5):
			HTMLParser.__init__(self, strict)
		else:
			HTMLParser.__init__(self)
示例#21
0
    def __init__(self):
        """Initialise the base HTMLParser and reset all parsing state.

        Every recording flag starts as False, the day/meal/station fields
        start as EMPTY_STRING, and each collection starts as its own new
        empty list.
        """
        HTMLParser.__init__(self)

        # Recording flags.
        self._record_name = self._record_meal = False
        self._record_station = self._record_attributes = False

        # Fields that start as EMPTY_STRING.
        self._day = self._meal = self._station = EMPTY_STRING

        # Accumulators; each one gets a separate list object.
        self._name_text = []
        self._station_text = []
        self._attributes = []

        # Hold all the dining hall menus.
        self.menu = []
示例#22
0
 def __init__(self):
     """Initialise the base HTMLParser, enabling ``convert_charrefs`` when available.

     On interpreters older than Python 3.4 the base class is initialised
     without arguments.  Otherwise ``convert_charrefs=True`` is passed and
     the keyword dict is kept on ``self.html_parser_init_kwargs``.
     """
     self.foundGo = False
     # Compare the whole version tuple: looking at `minor` alone would treat
     # e.g. 2.7 as "new enough" and 4.0 as "too old".
     if sys.version_info < (3, 4):
         HTMLParser.__init__(self)
     else:
         self.html_parser_init_kwargs = {'convert_charrefs': True}
         HTMLParser.__init__(self, **self.html_parser_init_kwargs)
示例#23
0
 def __init__(self,
              remove_comments=False,
              remove_empty_space=False,
              remove_all_empty_space=False,
              reduce_empty_attributes=True,
              reduce_boolean_attributes=False,
              remove_optional_attribute_quotes=True,
              keep_pre=False,
              pre_tags=PRE_TAGS,
              pre_attr='pre'):
   """Initialise the base HTMLParser and store the configuration flags.

   Each keyword argument is stored unchanged on the attribute of the same
   name.  The private parsing state (data buffer, tag stack and the
   head/title/doctype flags) starts empty or False.
   """
   # Compare the whole version tuple; checking major and minor separately
   # would wrongly reject a major version above 3 whose minor is below 4.
   if sys.version_info >= (3, 4):
     # convert_charrefs is True by default in Python 3.5.0 and newer. It was
     # introduced in 3.4.
     HTMLParser.__init__(self, convert_charrefs=False)
   else:
     HTMLParser.__init__(self)
   self.keep_pre = keep_pre
   self.pre_tags = pre_tags
   self.remove_comments = remove_comments
   self.remove_empty_space = remove_empty_space
   self.remove_all_empty_space = remove_all_empty_space
   self.reduce_empty_attributes = reduce_empty_attributes
   self.reduce_boolean_attributes = reduce_boolean_attributes
   self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
   self.pre_attr = pre_attr
   self._data_buffer = []
   self._in_pre_tag = 0
   self._in_head = False
   self._in_title = False
   self._after_doctype = False
   self._tag_stack = []
   self._title_newly_opened = False
   self.__title_trailing_whitespace = False
	def __init__(self):
		"""Initialise the base HTMLParser and clear the item state."""
		HTMLParser.__init__(self)
		self.items = []
		self.current_item_url = ""
		# All three flags start cleared.
		self.foundItem = self.br_before = self.checkBr = False
示例#25
0
 def __init__(self,
              remove_comments=False,
              remove_empty_space=False,
              remove_all_empty_space=False,
              reduce_empty_attributes=True,
              reduce_boolean_attributes=False,
              remove_optional_attribute_quotes=True,
              keep_pre=False,
              pre_tags=PRE_TAGS,
              pre_attr='pre'):
   """Initialise the base HTMLParser and record the configuration.

   Every keyword argument is copied unchanged onto the attribute of the
   same name; the private parsing state starts empty or False.
   """
   HTMLParser.__init__(self)

   # Configuration, in parameter order.
   self.remove_comments = remove_comments
   self.remove_empty_space = remove_empty_space
   self.remove_all_empty_space = remove_all_empty_space
   self.reduce_empty_attributes = reduce_empty_attributes
   self.reduce_boolean_attributes = reduce_boolean_attributes
   self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
   self.keep_pre = keep_pre
   self.pre_tags = pre_tags
   self.pre_attr = pre_attr

   # Private parsing state.
   self._data_buffer, self._tag_stack = [], []
   self._in_pre_tag = 0
   self._in_head = self._in_title = self._after_doctype = False
   self._title_newly_opened = False
   self.__title_trailing_whitespace = False
示例#26
0
	def __init__(self, news):
		"""Initialise the base HTMLParser and keep a reference to ``news``."""
		HTMLParser.__init__(self)
		self.news = news
		self.count_a = 0
		self.current_tag = ""
		self.looking_for_testata = False
示例#27
0
 def __init__(self):
     """Initialise the base HTMLParser and create the empty working lists."""
     HTMLParser.__init__(self)
     self.isNumber = 0
     # Four independent lists -- none of them may share an object.
     self.links, self.stack = [], []
     self.day, self.test = [], []
 def __init__(self):
     """Initialise the base HTMLParser plus the star and rating state."""
     HTMLParser.__init__(self)
     # Neither kind of parsing is active at the start.
     self.starParsing = self.ratingParsing = False
     # One Stack instance each, created in this order.
     self.starStack = Stack()
     self.ratingStack = Stack()
     self.ratingDict = {}
示例#29
0
 def __init__(self):
     """Initialise the base HTMLParser and the template bookkeeping."""
     HTMLParser.__init__(self)
     # State of the template currently being read.
     self.current_template = ""
     self.current_template_count = 0
     # Containers, all empty to begin with.
     self.templates = {}
     self.stack, self.template_names = [], []
	def __init__(self):
		"""Set the URL-collection attributes, then initialise the base HTMLParser.

		The attributes are assigned before ``HTMLParser.__init__`` runs.
		"""
		# Tag names and class names stored for later use.
		self.tagList, self.classList = ['li','a'], ['photo-list-padding','pic']
		self.nextUrl = ''
		self.index = 0
		self.urlList = []
		HTMLParser.__init__(self)
示例#31
0
 def __init__(self):
     """Initialise the base HTMLParser; ``recording`` starts at 0."""
     HTMLParser.__init__(self)
     self.link = ""
     self.data = []
     self.recording = 0
 def __init__(self, allows=None):
     """Initialise the base HTMLParser and choose the allowed tags.

     Args:
         allows: Value to store as ``self.allow_tags``.  When it is omitted,
             None or empty, the ``allow_tags`` value already reachable
             through ``self`` is kept (presumably a class attribute --
             confirm against the class definition).
     """
     HTMLParser.__init__(self)
     # A None default replaces the shared mutable `[]`; any falsy value still
     # falls back to the existing allow_tags.
     self.allow_tags = allows or self.allow_tags
     self.result = []
     self.start = []
     self.data = []
示例#33
0
 def __init__(self):
     """Initialise the base HTMLParser; ``title`` starts as None."""
     HTMLParser.__init__(self)
     # No title has been stored yet.
     self.title = None
示例#34
0
 def __init__(self):
     """Initialise the base HTMLParser, reset it, and start an empty ``fed`` list."""
     HTMLParser.__init__(self)
     # Explicit reset in addition to base initialisation, as in the original.
     self.reset()
     # Created after reset().
     self.fed = []
示例#35
0
 def __init__(self, **kw):
     """Forward all keyword arguments to HTMLParser and start an empty ``_fed`` list."""
     HTMLParser.__init__(self, **kw)
     # Created empty on every new instance.
     self._fed = []
示例#36
0
 def __init__(self):
     """Initialise the base HTMLParser, reset it, and start an empty ``HTMLDATA`` list."""
     HTMLParser.__init__(self)
     # Explicit reset in addition to base initialisation, as in the original.
     self.reset()
     # Created after reset().
     self.HTMLDATA = []
示例#37
0
 def __init__(self):
     """Initialise the base HTMLParser; ``AllLinks`` starts as an empty list."""
     HTMLParser.__init__(self)
     # Starts empty on every new instance.
     self.AllLinks = []
示例#38
0
 def __init__(self):
     """Initialise the base HTMLParser with an empty text list and ignoring off."""
     HTMLParser.__init__(self)
     self._ignore = False
     self._texts = []  # type: list
示例#39
0
 def __init__(self):
     """Initialise the base HTMLParser and the row/cell flags."""
     HTMLParser.__init__(self)
     # `first_row` starts True, `in_cell` starts False.
     self.first_row, self.in_cell = True, False
     self.links = []
示例#40
0
 def __init__(self):
     """Initialise the base HTMLParser; ``json_link`` starts as None."""
     HTMLParser.__init__(self)
     # No link has been stored yet.
     self.json_link = None
示例#41
0
 def __init__(self):
     """Initialise the base parser with ``convert_charrefs`` turned off."""
     # Passed explicitly rather than relying on the base-class default.
     BaseHTMLParser.__init__(self, convert_charrefs=False)
示例#42
0
 def __init__(self, ostream):
     """Initialise the base HTMLParser and keep a reference to ``ostream``."""
     HTMLParser.__init__(self)
     self.ostream = ostream
     # All three flags start cleared.
     self.in_pre = self.in_code = self.pygments_fix = False
 def __init__(self):
     """Initialise the base HTMLParser with empty title and content."""
     HTMLParser.__init__(self)
     # Both text fields start as empty strings.
     self.title = self.content = ""
     # Neither flag is set at the start.
     self.is_title = self.is_content = False
示例#44
0
 def __init__(self):
     """Initialise the base HTMLParser; ``out_buffer`` starts as an empty list."""
     HTMLParser.__init__(self)
     # Starts empty on every new instance.
     self.out_buffer = []
示例#45
0
 def __init__(self):
     """Initialise the base HTMLParser and the private text list."""
     HTMLParser.__init__(self)
     # Double-underscore attribute: name-mangled when defined inside a class.
     self.__text = []
 def __init__(self):
     """Initialise the base HTMLParser with empty content and all flags off."""
     HTMLParser.__init__(self)
     self.content = ""
     # All three flags start cleared.
     self.is_content = self.other_content = self.skip = False
示例#47
0
 def __init__(self):
     """Initialise the base HTMLParser and zero the flag fields."""
     HTMLParser.__init__(self)
     # Integer-valued fields start at 0; `divflag` is a boolean.
     self.flag = self.endflag = 0
     self.divflag = False
示例#48
0
 def __init__(self):
     """Initialise the base HTMLParser with no content and type ``self.UNKNOWN``."""
     HTMLParser.__init__(self)
     # UNKNOWN is looked up through `self`; it is not defined in this method.
     self.content_type = self.UNKNOWN
     self.content = None
示例#49
0
 def __init__(self):
     """Initialise the base HTMLParser; ``strings`` starts as an empty list."""
     HTMLParser.__init__(self)
     # Starts empty on every new instance.
     self.strings = []
示例#50
0
 def __init__(self):
     """Initialise the base HTMLParser; ``tag_results`` starts as an empty dict."""
     HTMLParser.__init__(self)
     # Starts empty on every new instance.
     self.tag_results = {}
示例#51
0
    def __init__(self, tag="", attrs=None):
        # Initiate HTMLParser
        HTMLParser.__init__(self)
        self.convert_charrefs = True
        self._root = None  # root element
        self._data = []  # data collector
        self._factory = Etree.Element
        self.enabled = not tag
        self._unw_attrs = []
        self.tag = tag

        # Split attributes into wanted and unwanted attributes
        if attrs:
            self.attrs = attrs
            for key, value in attrs.copy().items():
                if value == 0:
                    self._unw_attrs.append(key)
                    del attrs[key]
        else:
            self.attrs = {}

        # Some tags in html do not require closing tags so thoes tags will need to be auto closed (Void elements)
        # Refer to: https://www.w3.org/TR/html/syntax.html#void-elements
        self._voids = frozenset((
            "area",
            "base",
            "br",
            "col",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            # Only in HTML5
            "embed",
            "keygen",
            "source",
            "track",
            # Not supported in HTML5
            "basefont",
            "frame",
            "isindex",
            # SVG self closing tags
            "rect",
            "circle",
            "ellipse",
            "line",
            "polyline",
            "polygon",
            "path",
            "stop",
            "use",
            "image",
            "animatetransform"))

        # Create temporary root element to protect from badly written sites that either
        # have no html starting tag or multiple top level elements
        elem = self._factory("html")
        self._elem = [elem]
        self._last = elem
        self._tail = 0
示例#52
0
 def __init__(self):
     """Initialise the base HTMLParser with two empty lists and a zero counter."""
     HTMLParser.__init__(self)
     self.x = 0
     # `a` and `n` are separate list objects.
     self.a, self.n = [], []
 def __init__(self, date_url_file):
     """Initialise the base HTMLParser and store ``date_url_file`` as given."""
     HTMLParser.__init__(self)
     self.date_url_file = date_url_file
     self.date_url = False
     # No tag and no href stored yet.
     self.tag = self.href = None
示例#54
0
文件: tangshi.py 项目: cityking/scrap
 def __init__(self):
     """Initialise the base HTMLParser with empty content and ``in_div`` off."""
     HTMLParser.__init__(self)
     self.in_div = False
     self.content = []
示例#55
0
 def __init__(self, druid_module_name, compatible_license_names):
     """Initialise the base HTMLParser and store both arguments as given.

     ``state`` starts as the string "none".
     """
     HTMLParser.__init__(self)
     self.druid_module_name = druid_module_name
     self.compatible_license_names = compatible_license_names
     self.state = "none"
示例#56
0
 def __init__(self):
     """Initialise the base HTMLParser, the div flags, counter and result string."""
     HTMLParser.__init__(self)
     # Both div flags start cleared.
     self.div = self.div_2 = False
     self.cpt = 0
     self.resulte = ""
示例#57
0
 def __init__(self):
     """Initialise the base HTMLParser; ``maxword`` is fixed at 150."""
     HTMLParser.__init__(self)
     # Hard-coded value; not configurable through the constructor.
     self.maxword = 150
示例#58
0
 def __init__(self):
     """Initialise the base HTMLParser; no extra instance state is added."""
     HTMLParser.__init__(self)
示例#59
0
文件: analyser.py 项目: vit-001/fget
 def __init__(self, tag_to_analyse='div'):
     """Initialise the base HTMLParser and store ``tag_to_analyse`` as given."""
     HTMLParser.__init__(self)
     # Starts empty on every new instance.
     self.classes = {}
     self.tag_to_analyse = tag_to_analyse
示例#60
0
 def __init__(self, base, output=None):
     """Initialise the base HTMLParser and store ``base`` and ``output``.

     When ``output`` is None a new empty list is used; any other value is
     stored as-is, without copying.
     """
     HTMLParser.__init__(self)
     self.base = base
     self.output = [] if output is None else output