Exemplo n.º 1
0
    async def scrape(self):
        """Drive a full scrape: load the sitemap index, queue every
        sub-sitemap URL, run the worker pool, and print a timing summary.
        """
        started_at = datetime.datetime.now()
        # Fetch the gzipped sitemap index that lists the sub-sitemaps.
        sitemap_index = await self.load_init_xml('nyTimesSiteMap/sitemap.xml.gz',
                                                 self.loop)
        # Pull-parse the index and queue every sitemap URL we find.
        pull_parser = XMLPullParser(['start', 'end'])
        pull_parser.feed(sitemap_index)
        for evt, node in pull_parser.read_events():
            if evt == 'start' and 'sitemap' in node.text:
                self.xml_queue.put_nowait(node.text)

        # One shared HTTP session for all workers.
        async with aiohttp.ClientSession() as session:
            workers = [
                self.handle_task(worker_id, session)
                for worker_id in range(self.num_workers)
            ]
            await asyncio.gather(*workers)

        finished_at = datetime.datetime.now()
        elapsed = finished_at - started_at
        minutes = (elapsed.seconds % 3600) // 60
        seconds = elapsed.seconds % 60
        print(f'visited {self.pages_scraped} websites in {minutes} minutes '
              f'and {seconds} seconds')
Exemplo n.º 2
0
def parse(
    xml_text: Union[bytes, Iterator[bytes]], path: Tuple = ('feed', 'entry')
) -> Iterator[Union[dict, str]]:
    """
    lazy parser for large xml to extract a list of embedded xml elements

    :param xml_text: xml input, bytes or an iterator of bytes
    :param path: a tuple specifying the path of the xml elements to extract
    :return an iterator of dicts of the embedded xml elements
    """

    # Normalize the input: a single bytes object becomes an iterator of
    # fixed-size chunks so the feed loop below handles both forms.
    # (isinstance rather than ``type(...) is`` also accepts bytes subclasses.)
    if isinstance(xml_text, bytes):
        xml_chunks = (xml_text[i:i + _chunk_size]
                      for i in range(0, len(xml_text), _chunk_size))
    else:
        xml_chunks = xml_text

    path = list(path)
    xml_parser = XMLPullParser(events=('start', 'end'))

    # Closure that walks the pull-parser's events and yields matches.
    handle_partial_xml = _get_partial_xml_handler(path)

    # Feed chunk by chunk and lazily yield each extracted element.
    for chunk in xml_chunks:
        xml_parser.feed(chunk)
        for element in handle_partial_xml(xml_parser):
            yield _elem_to_dict_or_str(element)
Exemplo n.º 3
0
    def parse(self, xml_bytes):
        """
        Given *xml_bytes*, return the data as a tree of Python objects.

        Args:
            xml_bytes (bytes): Byte string of XML data.

        Returns:
            Whatever ``self._parser_read_events`` builds from the parser's
            pending events (defined elsewhere in this class).
        """
        assert xml_bytes
        assert isinstance(xml_bytes, bytes)

        parser = XMLPullParser(['start', 'end', 'start-ns', 'end-ns'])  # ignore 'comment' & 'pi'
        # closing() guarantees parser.close() runs even if reading raises.
        with closing(parser) as parser:
            parser.feed(xml_bytes)
            return self._parser_read_events(parser)
Exemplo n.º 4
0
 def _init_parser(self):
     """Reset the XML parser and prime it with a junk document tag."""
     self._parser = XMLPullParser(['start', 'end'])
     # Add a junk root tag so we constantly get data
     self._parser.feed("<HomeAssistant>\n")
     # Store the root tag so we can clear it to avoid amassing memory
     # (only the root's 'start' event is pending, so the loop leaves
     # self._root bound to that root element)
     for (_, elem) in self._parser.read_events():
         self._root = elem
     # Reset data: one top-level dict that accumulates parsed fields
     self._data = [{}]
Exemplo n.º 5
0
    def read_feed(self):
        """Parse the RSS data returned by ``self.read()`` into a ``Feed``.

        Channel-level fields seen before the first ``<item>`` form the feed
        header; every closed ``<item>`` is appended to ``feed.entries`` as a
        ``FeedMessage``.

        :return: the populated ``Feed``, or ``None`` if parsing failed
            before the header was built.
        """
        feed = None
        is_feed_header = True
        description = ''
        title = ''
        link = ''
        language = ''
        author = ''
        published_on = ''
        guid = ''
        try:
            data = self.read()
            parser = XMLPullParser(['start', 'end'])
            parser.feed(data)
            for event, elem in parser.read_events():
                if event == 'start':
                    # The first <item> marks the end of the channel header.
                    if elem.tag == 'item' and is_feed_header:
                        is_feed_header = False
                        feed = Feed(title, link, description, language,
                                    published_on)
                elif event == 'end':
                    # BUGFIX: the original nested ``elif event == 'end'``
                    # inside the ``event == 'start'`` branch, so it was
                    # unreachable and entries were never appended; it also
                    # called the nonexistent ``elem.get_tag()``.  Text is
                    # captured on 'end' events, where Element.text is
                    # guaranteed to be complete.
                    local_part = elem.tag
                    if local_part == 'item':
                        feed_message = FeedMessage(title, link, description,
                                                   author, guid)
                        feed.entries.append(feed_message)
                    elif local_part == 'title':
                        title = elem.text
                    elif local_part == 'description':
                        description = elem.text
                    elif local_part == 'link':
                        link = elem.text
                    elif local_part == 'guid':
                        guid = elem.text
                    elif local_part == 'language':
                        language = elem.text
                    elif local_part == 'author':
                        author = elem.text
                    elif local_part == 'published_on':
                        published_on = elem.text
        except ParseError as pe:
            # BUGFIX: ParseError has no ``get_reason()``; use the
            # exception's own message instead.
            print(str(pe.code) + ": " + str(pe))

        return feed
Exemplo n.º 6
0
class RequestParser:
    """Incremental detector for the end of a top-level XML document."""

    def __init__(self):
        self._parser = XMLPullParser(['start', 'end'])
        self._root_element = None

    def has_ended(self, data: bytes) -> bool:
        """Feed *data* into the parser; return True once the element that
        opened the document has been closed."""
        self._parser.feed(data)

        for kind, node in self._parser.read_events():
            if kind == 'start':
                # Remember the very first element: it is the document root.
                if self._root_element is None:
                    self._root_element = node
            elif kind == 'end':
                root = self._root_element
                if root is not None and node.tag == root.tag:
                    return True

        return False
Exemplo n.º 7
0
    def retrieve_namespaces(self):
        """Return a ``{prefix: uri}`` mapping of the XML namespaces declared
        in ``self.text``.

        When several prefixes map to the same URI, the first one seen wins.

        :raises NotImplementedError: on Python < 3.4, where XMLPullParser
            does not exist.
        """
        if version_info < (3, 4):
            raise NotImplementedError('Python 3.4 or higher is required.')

        from xml.etree.ElementTree import XMLPullParser

        ns = {}
        parser = XMLPullParser(['start-ns'])
        parser.feed(self.text)
        # BUGFIX: use the public read_events() API instead of peeking at
        # the private ``_events_queue`` attribute.  Only 'start-ns' events
        # were requested, so every event carries a (prefix, uri) pair.
        for _event, (prefix, uri) in parser.read_events():
            # Keep the first prefix seen for each URI.
            if uri not in ns:
                ns[uri] = prefix

        # Invert to prefix -> uri for the caller.
        return dict((v, k) for k, v in ns.items())
Exemplo n.º 8
0
    def parse(self, xml_bytes):
        """
        Given *xml_bytes* return the data as a tree of Python objects.

        Args:
            xml_bytes (bytes): Byte string of XML data.

        Returns:
            results
        """
        assert xml_bytes
        assert isinstance(xml_bytes, bytes)

        # 'comment' and 'pi' events are deliberately not requested.
        wanted_events = ['start', 'end', 'start-ns', 'end-ns']
        pull_parser = XMLPullParser(wanted_events)

        # closing() ensures the parser is closed even if reading raises.
        with closing(pull_parser) as parser:
            parser.feed(xml_bytes)
            return self._parser_read_events(parser)
Exemplo n.º 9
0
 async def handle_task(self, task_id, session):
     """Async worker: take sitemap URLs from ``self.xml_queue``, extract
     recipe page URLs from each sitemap, fetch the pages, and hand the
     HTML to ``self.parse``.

     :param task_id: numeric worker id, used only for progress logging
     :param session: shared aiohttp client session
     """
     while not self.xml_queue.empty():
         # BUGFIX: use get_nowait() instead of ``await get()``.  With an
         # awaited get(), another worker could take the last item between
         # the empty() check and the get, leaving this coroutine blocked
         # forever so gather() never finishes.  There is no await between
         # the check and get_nowait(), so the queue cannot change here.
         xml_url = self.xml_queue.get_nowait()
         print(f'worker {task_id}: fetching file {xml_url}')
         # get xml with recipes
         recipe_xml = await self.load_xml_gz(xml_url, session)
         recipe_xml_parser = XMLPullParser(['start', 'end'])
         recipe_xml_parser.feed(recipe_xml)
         for event, element in recipe_xml_parser.read_events():
             # NOTE(review): element.text is only guaranteed complete on
             # 'end' events; this works here because feed() saw the whole
             # document before read_events() -- confirm against the data.
             if 'loc' in element.tag and event == 'start':
                 url = element.text
                 if '/recipes/' in url:
                     html = await self.fetch_url(url, session)
                     await self.parse(html, url)
                     self.pages_scraped += 1
                     # Progress log every 50 pages.
                     if not self.pages_scraped % 50:
                         print(f'{self.pages_scraped} pages scraped',
                               ' so far')
Exemplo n.º 10
0
    def _handle(parser: XMLPullParser):
        """Yield each element whose full stack of open tags equals ``path``.

        Drains the events currently buffered in *parser*.  NOTE(review):
        this is a nested helper -- ``element_stack``, ``path`` and
        ``_simplify`` come from the enclosing function, which is outside
        this excerpt.
        """
        events = parser.read_events()
        nonlocal element_stack

        for action, elem in events:
            elem: Element
            if action == 'start':
                # Opening tag: track our position in the tree.
                element_stack.append(elem)
            elif action == 'end':
                last_tag = _simplify(element_stack[-1].tag)
                current_tag = _simplify(elem.tag)
                # Well-formed XML closes the most recently opened tag.
                if last_tag != current_tag:
                    raise Exception('unmatched tag, start: {}, end: {}'.format(
                        last_tag, current_tag))
                if path == list(map(lambda x: _simplify(x.tag),
                                    element_stack)):
                    # The open-tag path matches the target path: emit it.
                    yield elem

                # Pop the closed element (rebinds the nonlocal stack).
                element_stack = element_stack[:-1]
                if len(element_stack) > 0:
                    # Drop processed children so memory stays bounded.
                    element_stack[-1].clear()
Exemplo n.º 11
0
# coding = utf-8
from xml.etree.ElementTree import XMLPullParser

events = ("start", "end", "start-ns", "end-ns")
parser = XMLPullParser(events=events)
# BUGFIX: the file handle was never closed; ``with`` releases it
# deterministically once the document has been read.
with open('books.xml', 'r') as fd:
    xml_data = fd.read()
parser.feed(xml_data)
# Materialize the event stream as a list
re_events = list(parser.read_events())
# The first event's element is the document root
root_element = re_events[0][1]


# Walk the element tree down from the given node, printing one line per
# element (translated from the original Chinese comment)
def list_tree(element, depth):
    """Recursively print *element* and its descendants, one per line,
    indented with one tab per tree level.

    :param element: an ``xml.etree.ElementTree.Element``
    :param depth: current nesting depth (0 for the root)
    """
    # BUGFIX: guard against element.text being None (elements with no
    # text), which made the original ``element.text.strip()`` raise
    # AttributeError.
    print('\t' * depth, element.tag, ":",
          element.text if element.text and element.text.strip() != '' else '')
    # BUGFIX: Element.getchildren() was removed in Python 3.9; iterating
    # the element directly yields its children.
    for child in element:
        list_tree(child, depth + 1)


list_tree(root_element, 0)
Exemplo n.º 12
0
 def __init__(self):
     """Create the pull parser used to detect a complete XML document."""
     self._parser = XMLPullParser(['start', 'end'])
     # Presumably set to the document's first element once parsing starts
     # -- confirm against the class's other methods, not visible here.
     self._root_element = None
from xml.etree.ElementTree import XMLPullParser
from MongodbClient import MyMongoClient
import sys

client = MyMongoClient()
collection = client.get_collection('UsersLowRep')
UsersFilePath = './Data/Users.xml'
startId = int(sys.argv[1]) if len(sys.argv) > 1 else 0
dbThreshold = int(sys.argv[2]) if len(sys.argv) > 2 else None
nextSwitchId = startId + dbThreshold if dbThreshold is not None else None
reputationThreshold = 100
viewThreshold = 100

parser = XMLPullParser(events=['end'])
with open(file=UsersFilePath) as f:
    Id = 0
    counter = 0
    rep = 0
    for line in f:
        parser.feed(line)
        for event, elem in parser.read_events():
            if elem.tag == 'row':
                Id = int(elem.get('Id'))
                if Id < startId:
                    continue
                # rep += int(elem.get('Reputation'))
                # counter += 1
                reputation = int(elem.get('Reputation'))
                if elem.get('Views') is not None:
                    viewCount = int(elem.get('Views'))
                if elem.get('UpVotes') is not None:
                        '$lt': endId
                    }
                }]
            }, {
                'Id': 1,
                'CommentCount': 1
            }))
print('Found %d entries from DB' % len(res))
entriesNum = len(res)
postIdset = set()
commentCount = dict()
for d in res:
    postIdset.add(d['Id'])
    commentCount[d['Id']] = d['CommentCount']
comments = {}
parser = XMLPullParser(events=['end'])
with open(file=CommentFilePath) as f:
    counter = 0  #Things to fix, something wrong with the parser, we need to put line contraints on it
    for line in f:
        # if counter <= 1:
        #     parser.feed(line)
        counter += 1
        if counter % 1000000 == 0:
            print('At line %d' % counter)
            parser.feed('</comments>')
            parser.close()
            parser = XMLPullParser(events=['end'])
            parser.feed('<comments>')
        # if counter <= 56000000:
        #     continue
        # if counter > 56200000:
Exemplo n.º 15
0
class Device(object):
    """Handles a Raven or Emu serial device.

    Reads XML fragments off the serial line with a pull parser and
    accumulates them into nested dicts keyed by tag name.
    """

    def __init__(self, device):
        """Open the Raven or Emu and prepare for parsing.

        :param device: serial port path passed to ``serial.Serial``.
        """
        from serial import Serial
        self._dev = Serial(device, 115200, timeout=0)
        # Strip any byte that cannot appear in the device's simple XML
        # vocabulary (line noise would otherwise break the parser).
        self._sanitizer = re.compile(r'[^\sa-zA-Z0-9<>/_-]')
        self._init_parser()

    def _init_parser(self):
        """Reset the XML parser and prime it with a junk document tag."""
        self._parser = XMLPullParser(['start', 'end'])
        # Add a junk root tag so we constantly get data
        self._parser.feed("<HomeAssistant>\n")
        # Store the root tag so we can clear it to avoid amassing memory
        for (_, elem) in self._parser.read_events():
            self._root = elem
        # Reset data
        self._data = [{}]

    def update(self):
        """Pull and parse new data from the serial device."""
        try:
            serial_data = self._dev.read(1024).decode()
            self._parser.feed(self._sanitizer.sub('', serial_data))
            for (event, elem) in self._parser.read_events():
                if event == 'start':
                    # New element: push a fresh dict for its children.
                    self._data.append({})
                else:
                    # Element closed: fold its text into the parent dict.
                    data = self._data.pop()
                    data['text'] = elem.text
                    self._data[-1][elem.tag] = data
                if len(self._data) == 1:
                    # Back at the top level: clear the element from root
                    # so memory does not grow without bound.
                    self._root.remove(elem)
        except ParseError:
            # Garbage on the wire: start over with a fresh parser.
            self._init_parser()

    def get(self, field):
        """Return the data accumulated for a given XML tag."""
        return self._data[0][field]

    def query_instantaneous_demand(self):
        """Request updates on instantaneous demand."""
        # BUGFIX: XML is case-sensitive -- the closing tag must be
        # '</Name>' to match '<Name>' (the original sent '</name>',
        # producing a malformed command; compare query_current_price).
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_instantaneous_demand</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()

    def query_summation_delivered(self):
        """Request updates on the various summations."""
        # BUGFIX: '</name>' -> '</Name>' (case-sensitive closing tag).
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_current_summation_delivered</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n" + b"</Command>\n")
        self._dev.flush()

    def query_current_price(self):
        """Request updates on pricing."""
        self._dev.write(b"<Command>\n" +
                        b"  <Name>get_current_price</Name>\n" +
                        b"  <Refresh>Y</Refresh>\n" + b"</Command>\n")
        self._dev.flush()