# Requires: import asyncio, datetime, aiohttp
# and: from xml.etree.ElementTree import XMLPullParser
async def scrape(self):
    """Load the initial XML sitemap, queue its entries, assign workers,
    and run the scrape."""
    start = datetime.datetime.now()
    # Load the initial sitemap index.
    init_xml = await self.load_init_xml('nyTimesSiteMap/sitemap.xml.gz', self.loop)
    # Parse the index and queue every nested sitemap URL.
    parser = XMLPullParser(['start', 'end'])
    parser.feed(init_xml)
    for event, element in parser.read_events():
        # element.text can be None, so guard before the substring test.
        if event == 'start' and element.text and 'sitemap' in element.text:
            self.xml_queue.put_nowait(element.text)
    async with aiohttp.ClientSession() as session:
        tasks = [
            self.handle_task(i, session)
            for i in range(self.num_workers)
        ]
        await asyncio.gather(*tasks)
    stop = datetime.datetime.now()
    time_past = stop - start
    minutes = (time_past.seconds % 3600) // 60
    seconds = time_past.seconds % 60
    print(f'visited {self.pages_scraped} websites in {minutes} minutes '
          f'and {seconds} seconds')
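Note: put_nowait() never blocks and, assuming the default unbounded asyncio.Queue, never raises QueueFull here, since the queue is fully populated before any worker starts; each handle_task worker therefore sees the complete work list from its first iteration.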
# Requires: from xml.etree.ElementTree import XMLPullParser, ParseError
# Feed and FeedMessage are classes from the surrounding module.
def read_feed(self):
    """Parse the data from self.read() into a Feed holding FeedMessage entries."""
    feed = None
    is_feed_header = True
    description = ''
    title = ''
    link = ''
    language = ''
    author = ''
    published_on = ''
    guid = ''
    try:
        data = self.read()
        parser = XMLPullParser(['start', 'end'])
        parser.feed(data)
        for event, elem in parser.read_events():
            if event == 'start':
                local_part = elem.tag
                if local_part == 'item':
                    if is_feed_header:
                        # The first <item> closes the channel header.
                        is_feed_header = False
                        feed = Feed(title, link, description, language,
                                    published_on)
                elif local_part == 'title':
                    title = elem.text
                elif local_part == 'description':
                    description = elem.text
                elif local_part == 'link':
                    link = elem.text
                elif local_part == 'guid':
                    guid = elem.text
                elif local_part == 'language':
                    language = elem.text
                elif local_part == 'author':
                    author = elem.text
                elif local_part == 'published_on':
                    published_on = elem.text
            elif event == 'end':
                # Element has no get_tag(); use the .tag attribute.
                if elem.tag == 'item':
                    feed_message = FeedMessage(title, link, description,
                                               author, guid)
                    feed.entries.append(feed_message)
    except ParseError as pe:
        # ParseError has no get_reason(); str(pe) carries the message.
        print(f'{pe.code}: {pe}')
    return feed
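A minimal document this parser would accept (hypothetical; it matches the custom author and published_on tags the code inspects rather than the standard RSS pubDate):

<rss>
  <channel>
    <title>Example Feed</title>
    <link>https://example.com/</link>
    <description>Feed header</description>
    <language>en</language>
    <published_on>2020-01-01</published_on>
    <item>
      <title>First post</title>
      <link>https://example.com/1</link>
      <description>Post body</description>
      <author>alice@example.com</author>
      <guid>post-1</guid>
    </item>
  </channel>
</rss>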
from xml.etree.ElementTree import XMLPullParser


class RequestParser:
    """Tracks a streamed XML document and reports when its root closes."""

    def __init__(self):
        self._parser = XMLPullParser(['start', 'end'])
        self._root_element = None

    def has_ended(self, data: bytes) -> bool:
        """Feed a chunk; return True once the root element has ended."""
        self._parser.feed(data)
        for event, element in self._parser.read_events():
            if event == 'start' and self._root_element is None:
                self._root_element = element
            elif event == 'end' and self._root_element is not None:
                # Note: a descendant sharing the root's tag would trip this
                # early; comparing identity (element is self._root_element)
                # would be stricter.
                if element.tag == self._root_element.tag:
                    return True
        return False
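A quick usage sketch for the class above, with made-up chunks standing in for pieces arriving off a socket:

rp = RequestParser()
for chunk in (b'<req><body>hi', b'</body></req>'):
    if rp.has_ended(chunk):
        print('document complete')  # fires on the second chunk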
# Inner helper: relies on the enclosing scope for `element_stack` (the list
# of currently open elements), `path` (the tag chain to match), and
# `_simplify()` (strips namespace prefixes from tags).
# Requires: from xml.etree.ElementTree import Element, XMLPullParser
def _handle(parser: XMLPullParser):
    nonlocal element_stack
    for action, elem in parser.read_events():
        elem: Element
        if action == 'start':
            element_stack.append(elem)
        elif action == 'end':
            last_tag = _simplify(element_stack[-1].tag)
            current_tag = _simplify(elem.tag)
            if last_tag != current_tag:
                raise Exception('unmatched tag, start: {}, end: {}'.format(
                    last_tag, current_tag))
            # Yield the element when its chain of open tags matches `path`.
            if path == list(map(lambda x: _simplify(x.tag), element_stack)):
                yield elem
            element_stack = element_stack[:-1]
            # Clear the parent so processed children do not pile up in memory.
            if len(element_stack) > 0:
                element_stack[-1].clear()
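For context, a self-contained sketch of the streaming path-matcher such a helper implements; the name iter_path, the chunked read loop, and the sample document are assumptions, not taken from the source:

import io
from xml.etree.ElementTree import XMLPullParser


def _simplify(tag):
    """Drop a leading '{namespace}' prefix, if any."""
    return tag.rsplit('}', 1)[-1]


def iter_path(fileobj, path):
    """Yield elements whose chain of open tags equals `path`, clearing
    finished parents so memory stays bounded on large documents."""
    element_stack = []
    parser = XMLPullParser(['start', 'end'])
    for chunk in iter(lambda: fileobj.read(65536), ''):
        parser.feed(chunk)
        for action, elem in parser.read_events():
            if action == 'start':
                element_stack.append(elem)
            elif action == 'end':
                if path == [_simplify(e.tag) for e in element_stack]:
                    yield elem
                element_stack.pop()
                if element_stack:
                    element_stack[-1].clear()


doc = io.StringIO('<feed><item>a</item><item>b</item></feed>')
for item in iter_path(doc, ['feed', 'item']):
    print(item.text)  # prints: a, then b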
async def handle_task(self, task_id, session):
    """Async worker: takes a sitemap URL from the queue, extracts recipe
    URLs, fetches their HTML, and hands each page to the parser."""
    while not self.xml_queue.empty():
        xml_url = await self.xml_queue.get()
        print(f'worker {task_id}: fetching file {xml_url}')
        # Fetch and decompress the sitemap that lists recipe pages.
        recipe_xml = await self.load_xml_gz(xml_url, session)
        recipe_xml_parser = XMLPullParser(['start', 'end'])
        recipe_xml_parser.feed(recipe_xml)
        for event, element in recipe_xml_parser.read_events():
            if event == 'start' and 'loc' in element.tag:
                url = element.text
                if url and '/recipes/' in url:
                    html = await self.fetch_url(url, session)
                    await self.parse(html, url)
                    self.pages_scraped += 1
                    if not self.pages_scraped % 50:
                        print(f'{self.pages_scraped} pages scraped so far')
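One subtlety in the loop condition: between a worker's empty() check and its await of get(), another worker can take the last item, leaving this worker awaiting forever and gather() never returning. Since the queue is pre-filled and never refilled, get_nowait() with an asyncio.QueueEmpty handler is the safer drain pattern.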
# coding: utf-8
from xml.etree.ElementTree import XMLPullParser

events = ("start", "end", "start-ns", "end-ns")
parser = XMLPullParser(events=events)

with open('books.xml', 'r') as fd:
    xml_data = fd.read()
parser.feed(xml_data)

# Materialize the event stream as a list.
re_events = list(parser.read_events())
# Grab the XML root element: assuming the document declares no namespaces,
# the first event is the root's 'start'.
root_element = re_events[0][1]


# Walk the element tree down from the root.
def list_tree(element, depth):
    # element.text can be None (e.g., self-closing tags), so guard it.
    text = element.text.strip() if element.text else ''
    print('\t' * depth, element.tag, ":", text)
    # getchildren() was removed in Python 3.9; iterate the element directly.
    for child in element:
        list_tree(child, depth + 1)


list_tree(root_element, 0)
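The script expects a books.xml next to it; a hypothetical minimal one to make it runnable:

sample = """<catalog>
    <book id="bk101">
        <author>Gambardella, Matthew</author>
        <title>XML Developer's Guide</title>
    </book>
</catalog>"""
with open('books.xml', 'w') as f:
    f.write(sample)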
import sys
from xml.etree.ElementTree import XMLPullParser

# `client` is a database handle created earlier; its setup is not shown here.
collection = client.get_collection('UsersLowRep')
UsersFilePath = './Data/Users.xml'
startId = int(sys.argv[1]) if len(sys.argv) > 1 else 0
dbThreshold = int(sys.argv[2]) if len(sys.argv) > 2 else None
nextSwitchId = startId + dbThreshold if dbThreshold is not None else None
reputationThreshold = 100
viewThreshold = 100

parser = XMLPullParser(events=['end'])
with open(file=UsersFilePath) as f:
    Id = 0
    counter = 0
    rep = 0
    for line in f:
        parser.feed(line)
        for event, elem in parser.read_events():
            if elem.tag == 'row':
                Id = int(elem.get('Id'))
                if Id < startId:
                    continue
                # rep += int(elem.get('Reputation'))
                # counter += 1
                reputation = int(elem.get('Reputation'))
                # Default missing attributes to 0 (an assumption; the
                # original left these unbound when an attribute was absent).
                viewCount = int(elem.get('Views', 0))
                upCount = int(elem.get('UpVotes', 0))
                downCount = int(elem.get('DownVotes', 0))
                if reputation <= reputationThreshold and viewCount <= viewThreshold:
                    data_to_save = {}
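One caveat with the snippet above: subscribed only to 'end' events, every parsed <row> stays attached to the implicit root, so memory grows with the file. A hedged sketch of a flat-memory variant (assuming the Stack Exchange dump layout of <users> wrapping self-closing <row/> elements):

from xml.etree.ElementTree import XMLPullParser

parser = XMLPullParser(events=['start', 'end'])
root = None
with open('./Data/Users.xml') as f:
    for line in f:
        parser.feed(line)
        for event, elem in parser.read_events():
            if event == 'start' and root is None:
                root = elem              # the enclosing <users> element
            elif event == 'end' and elem.tag == 'row':
                # ... filter on elem.get('Reputation') etc. here ...
                root.remove(elem)        # detach the row; memory stays flat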
import re
from xml.etree.ElementTree import ParseError, XMLPullParser


class Device(object):
    """Handles a Raven or Emu serial device."""

    def __init__(self, device):
        """Open the Raven or Emu and prepare for parsing."""
        from serial import Serial
        self._dev = Serial(device, 115200, timeout=0)
        self._sanitizer = re.compile(r'[^\sa-zA-Z0-9<>/_-]')
        self._init_parser()

    def _init_parser(self):
        """Reset the XML parser and prime it with a document tag."""
        self._parser = XMLPullParser(['start', 'end'])
        # Add a junk root tag so we constantly get data
        self._parser.feed("<HomeAssistant>\n")
        # Store the root tag so we can clear it to avoid amassing memory
        for (_, elem) in self._parser.read_events():
            self._root = elem
        # Reset data
        self._data = [{}]

    def update(self):
        """Pull and parse new data from the serial device."""
        try:
            serial_data = self._dev.read(1024).decode()
            self._parser.feed(self._sanitizer.sub('', serial_data))
            for (event, elem) in self._parser.read_events():
                if event == 'start':
                    self._data.append({})
                else:
                    data = self._data.pop()
                    data['text'] = elem.text
                    self._data[-1][elem.tag] = data
                    if len(self._data) == 1:
                        # Clear the element from root
                        self._root.remove(elem)
        except ParseError:
            self._init_parser()

    def get(self, field):
        """Return the data accumulated for a given XML tag."""
        return self._data[0][field]

    def query_instantaneous_demand(self):
        """Request updates on instantaneous demand."""
        # Closing tags must match <Name> exactly; XML is case-sensitive.
        self._dev.write(b"<Command>\n"
                        b"  <Name>get_instantaneous_demand</Name>\n"
                        b"  <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()

    def query_summation_delivered(self):
        """Request updates on the various summations."""
        self._dev.write(b"<Command>\n"
                        b"  <Name>get_current_summation_delivered</Name>\n"
                        b"  <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()

    def query_current_price(self):
        """Request updates on pricing."""
        self._dev.write(b"<Command>\n"
                        b"  <Name>get_current_price</Name>\n"
                        b"  <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()
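A hypothetical usage sketch (the device path, sleep interval, and tag name are assumptions; the RAVEn streams fragments such as <InstantaneousDemand> that accumulate under their tag names):

import time

dev = Device('/dev/ttyUSB0')           # assumed serial device path
dev.query_instantaneous_demand()
time.sleep(1)                          # give the meter time to respond
dev.update()                           # drain and parse whatever arrived
print(dev.get('InstantaneousDemand'))  # KeyError if nothing has arrived yet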