示例#1
0
    async def scrape(self):
        """Run the full scrape: load the sitemap index, queue the per-section
        sitemap URLs, fan out the async workers, and report elapsed time.

        Relies on instance attributes set elsewhere: ``self.loop``,
        ``self.xml_queue``, ``self.num_workers``, ``self.pages_scraped``.
        """
        start = datetime.datetime.now()
        # Load the gzipped sitemap index.
        init_xml = await self.load_init_xml('nyTimesSiteMap/sitemap.xml.gz',
                                            self.loop)
        # Parse the index and queue every sitemap URL found in element text.
        parser = XMLPullParser(['start', 'end'])
        parser.feed(init_xml)
        for event, element in parser.read_events():
            # BUG FIX: on 'start' events only the tag and attributes are
            # guaranteed present -- element.text may be None or incomplete,
            # and `'sitemap' in None` raises TypeError.  Read the text on
            # 'end' events and guard against None.
            if event == 'end' and element.text and 'sitemap' in element.text:
                self.xml_queue.put_nowait(element.text)

        async with aiohttp.ClientSession() as session:
            tasks = [
                self.handle_task(i, session) for i in range(self.num_workers)
            ]
            await asyncio.gather(*tasks)

        stop = datetime.datetime.now()
        time_past = stop - start
        minutes = (time_past.seconds % 3600) // 60
        seconds = time_past.seconds % 60
        print(f'visited {self.pages_scraped} websites in {minutes} minutes '
              f'and {seconds} seconds')
示例#2
0
    def read_feed(self):
        """Parse the RSS data returned by ``self.read()`` into a Feed.

        Channel-level fields seen before the first <item> become the Feed
        header; every completed <item> is appended to ``feed.entries`` as a
        FeedMessage.  Returns the Feed, or None if parsing fails before the
        first <item> is reached.
        """
        feed = None
        is_feed_header = True
        # Most recently captured value for each simple text field.
        fields = {'title': '', 'description': '', 'link': '', 'language': '',
                  'author': '', 'published_on': '', 'guid': ''}
        try:
            data = self.read()
            parser = XMLPullParser(['start', 'end'])
            parser.feed(data)
            for event, elem in parser.read_events():
                if event == 'start':
                    # The first <item> marks the end of the channel header;
                    # by now all header fields have been captured below.
                    if elem.tag == 'item' and is_feed_header:
                        is_feed_header = False
                        feed = Feed(fields['title'], fields['link'],
                                    fields['description'], fields['language'],
                                    fields['published_on'])
                # BUG FIX: this branch was nested inside the 'start' branch's
                # elif chain and therefore unreachable (no entry was ever
                # appended), and it called the nonexistent Element.get_tag().
                # Text is captured on 'end' events because it is not
                # guaranteed to be present on 'start' events.
                elif event == 'end':
                    if elem.tag == 'item':
                        feed_message = FeedMessage(fields['title'],
                                                   fields['link'],
                                                   fields['description'],
                                                   fields['author'],
                                                   fields['guid'])
                        feed.entries.append(feed_message)
                    elif elem.tag in fields:
                        fields[elem.tag] = elem.text
        except ParseError as pe:
            # BUG FIX: xml.etree's ParseError has no get_reason(); str(pe)
            # carries the message.
            print(str(pe.code) + ": " + str(pe))

        return feed
示例#3
0
class RequestParser:
    """Incrementally detects when a complete XML document has arrived."""

    def __init__(self):
        self._parser = XMLPullParser(['start', 'end'])
        self._root_element = None

    def has_ended(self, data: bytes) -> bool:
        """Feed *data* to the parser; return True once the root element closes."""
        self._parser.feed(data)

        for kind, node in self._parser.read_events():
            if self._root_element is None:
                # The very first opened element is the document root.
                if kind == 'start':
                    self._root_element = node
            elif kind == 'end' and node.tag == self._root_element.tag:
                return True

        return False
示例#4
0
    def _handle(parser: XMLPullParser):
        """Generator: drain the parser's pending events, maintain the
        enclosing scope's ``element_stack``, and yield each closed element
        whose ancestor tag path equals the enclosing ``path``.

        NOTE(review): depends on ``element_stack``, ``path`` and
        ``_simplify`` from the enclosing function, which are not visible
        here -- confirm their semantics against the surrounding code.
        """
        events = parser.read_events()
        nonlocal element_stack

        for action, elem in events:
            elem: Element
            if action == 'start':
                element_stack.append(elem)
            elif action == 'end':
                # A well-formed document must close the most recently
                # opened element; anything else is a structural error.
                last_tag = _simplify(element_stack[-1].tag)
                current_tag = _simplify(elem.tag)
                if last_tag != current_tag:
                    raise Exception('unmatched tag, start: {}, end: {}'.format(
                        last_tag, current_tag))
                # Yield only when the full stack of open tags matches the
                # target path exactly.
                if path == list(map(lambda x: _simplify(x.tag),
                                    element_stack)):
                    yield elem

                # Pop via slicing: this REBINDS the nonlocal to a new list
                # rather than mutating it in place -- presumably deliberate;
                # confirm before changing to .pop().
                element_stack = element_stack[:-1]
                if len(element_stack) > 0:
                    # Clear the parent so completed children are released
                    # and memory stays bounded on large documents.
                    element_stack[-1].clear()
示例#5
0
 async def handle_task(self, task_id, session):
     """
         Async worker: drain sitemap URLs from self.xml_queue, extract
         recipe-page URLs from each sitemap, fetch the HTML and hand it
         to the parser.  Increments self.pages_scraped per page.
     """
     while True:
         # BUG FIX: the original `while not queue.empty(): await queue.get()`
         # races with sibling workers -- the queue can drain between the
         # check and the get, leaving this worker awaiting forever.
         # get_nowait() makes check-and-take atomic.
         try:
             xml_url = self.xml_queue.get_nowait()
         except asyncio.QueueEmpty:
             break
         print(f'worker {task_id}: fetching file {xml_url}')
         # Fetch the gzipped sitemap listing individual page URLs.
         recipe_xml = await self.load_xml_gz(xml_url, session)
         recipe_xml_parser = XMLPullParser(['start', 'end'])
         recipe_xml_parser.feed(recipe_xml)
         for event, element in recipe_xml_parser.read_events():
             # BUG FIX: read text on 'end' events (on 'start' events the
             # text is not guaranteed present) and guard against None.
             if event == 'end' and 'loc' in element.tag:
                 url = element.text
                 if url and '/recipes/' in url:
                     html = await self.fetch_url(url, session)
                     await self.parse(html, url)
                     self.pages_scraped += 1
                     # Progress heartbeat every 50 pages.
                     if not self.pages_scraped % 50:
                         print(f'{self.pages_scraped} pages scraped',
                               ' so far')
示例#6
0
# coding = utf-8
from xml.etree.ElementTree import XMLPullParser

# Subscribe to all four event types so namespace events are visible too.
events = ("start", "end", "start-ns", "end-ns")
parser = XMLPullParser(events=events)
# BUG FIX: the file handle was never closed; a context manager guarantees
# closure, and an explicit encoding avoids locale-dependent decoding.
with open('books.xml', 'r', encoding='utf-8') as fd:
    xml_data = fd.read()
parser.feed(xml_data)
# Materialize the event stream so it can be indexed.
re_events = list(parser.read_events())
# The element of the first event is the document root.
root_element = re_events[0][1]


# Walk the element tree downward from the given node.
def list_tree(element, depth):
    """Recursively print *element* and its descendants, one indented line each.

    Fixes two defects: Element.getchildren() was removed in Python 3.9
    (iterate the element directly instead), and element.text is None for
    empty/childless elements, which made .strip() raise AttributeError.
    """
    # Print the raw text only when it is non-None and not pure whitespace.
    text = element.text if element.text and element.text.strip() != '' else ''
    print('\t' * depth, element.tag, ":", text)
    for child in element:
        list_tree(child, depth + 1)


# Print the parsed tree starting from the document root.
list_tree(root_element, 0)
# Stream-parse a Stack Exchange Users.xml dump and collect low-reputation
# users into the 'UsersLowRep' collection.
# NOTE(review): `client` is defined elsewhere in this file -- presumably a
# database client; confirm against the surrounding code.
collection = client.get_collection('UsersLowRep')
UsersFilePath = './Data/Users.xml'
# Optional CLI args: resume point and batch-switch threshold.
startId = int(sys.argv[1]) if len(sys.argv) > 1 else 0
dbThreshold = int(sys.argv[2]) if len(sys.argv) > 2 else None
nextSwitchId = startId + dbThreshold if dbThreshold is not None else None
reputationThreshold = 100
viewThreshold = 100

# Only 'end' events are needed: a row's attributes are read once complete.
parser = XMLPullParser(events=['end'])
with open(file=UsersFilePath) as f:
    Id = 0
    counter = 0
    rep = 0
    # Feed line by line so the whole dump never sits in memory at once.
    for line in f:
        parser.feed(line)
        for event, elem in parser.read_events():
            if elem.tag == 'row':
                Id = int(elem.get('Id'))
                if Id < startId:
                    continue
                # rep += int(elem.get('Reputation'))
                # counter += 1
                reputation = int(elem.get('Reputation'))
                # NOTE(review): viewCount/upCount/downCount are assigned only
                # when the attribute exists, so a row missing 'Views' reuses
                # the previous row's value (or raises NameError on the very
                # first row) -- looks like a latent bug; confirm intent.
                if elem.get('Views') is not None:
                    viewCount = int(elem.get('Views'))
                if elem.get('UpVotes') is not None:
                    upCount = int(elem.get('UpVotes'))
                if elem.get('DownVotes') is not None:
                    downCount = int(elem.get('DownVotes'))
                if reputation <= reputationThreshold and viewCount <= viewThreshold:
                    # NOTE(review): snippet is truncated here; also
                    # elem.clear() is never called, so parsed rows
                    # accumulate in memory over the full dump.
                    data_to_save = {}
示例#8
0
class Device(object):
    """Handles a Raven or Emu serial device.

    Streams XML fragments from the serial port, accumulates them into
    nested dicts, and exposes the latest value per tag via ``get``.
    """

    def __init__(self, device):
        """Open the Raven or Emu serial port and prepare the XML parser."""
        from serial import Serial
        self._dev = Serial(device, 115200, timeout=0)
        # Strip any byte the devices should never emit (line noise).
        self._sanitizer = re.compile(r'[^\sa-zA-Z0-9<>/_-]')
        self._init_parser()

    def _init_parser(self):
        """Reset the XML parser and prime it with a synthetic document tag."""
        self._parser = XMLPullParser(['start', 'end'])
        # A junk root tag keeps the parser inside a document forever, so
        # each fragment from the device parses as a child element.
        self._parser.feed("<HomeAssistant>\n")
        # Keep the root so completed children can be removed from it later
        # (avoids unbounded memory growth).
        for (_, elem) in self._parser.read_events():
            self._root = elem
        # Stack of partially-built dicts; index 0 is the accumulated result.
        self._data = [{}]

    def update(self):
        """Pull and parse new data from the serial device."""
        try:
            serial_data = self._dev.read(1024).decode()
            self._parser.feed(self._sanitizer.sub('', serial_data))
            for (event, elem) in self._parser.read_events():
                if event == 'start':
                    self._data.append({})
                else:
                    # Element finished: fold it into its parent's dict.
                    data = self._data.pop()
                    data['text'] = elem.text
                    self._data[-1][elem.tag] = data
                if len(self._data) == 1:
                    # Back at top level: drop the element from the root.
                    self._root.remove(elem)
        except ParseError:
            # Garbled serial data: start over with a fresh parser.
            self._init_parser()

    def get(self, field):
        """Return the data accumulated for a given XML tag."""
        return self._data[0][field]

    def query_instantaneous_demand(self):
        """Request updates on instantaneous demand."""
        # BUG FIX: closing tag was '</name>' -- XML tags are case-sensitive
        # and must match the opening '<Name>' (as query_current_price does).
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_instantaneous_demand</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()

    def query_summation_delivered(self):
        """Request updates on the various summations."""
        # BUG FIX: closing tag was '</name>'; must match '<Name>'.
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_current_summation_delivered</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n" + b"</Command>\n")
        self._dev.flush()

    def query_current_price(self):
        """Request updates on pricing."""
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_current_price</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n" + b"</Command>\n")
        self._dev.flush()