Пример #1
0
def csviter(obj, delimiter=None, headers=None, encoding=None):
    """Return an iterator of dictionaries built from the given csv object.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields in the given obj;
    when it is not given the csv module's default is used.

    headers is an iterable that, when provided, supplies the keys for the
    returned dictionaries; otherwise the first row is used.

    encoding is used to decode every field. It is ignored when obj is a
    TextResponse (the response's own encoding is used instead) and falls
    back to 'utf-8' when not given.

    Rows whose number of fields differs from the number of headers are
    logged at WARNING level and skipped.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _decode(fields):
        return [str_to_unicode(field, encoding) for field in fields]

    lines = StringIO(body_or_str(obj, unicode=False))
    if delimiter:
        csv_r = csv.reader(lines, delimiter=delimiter)
    else:
        csv_r = csv.reader(lines)

    if not headers:
        # An empty document has no header row. End the generator explicitly
        # instead of letting StopIteration escape from the generator body,
        # which PEP 479 turns into a RuntimeError.
        try:
            headers = _decode(next(csv_r))
        except StopIteration:
            return

    # Iterating the reader with ``for`` stops cleanly at end of input, so no
    # StopIteration has to propagate out of this generator.
    for fields in csv_r:
        row = _decode(fields)
        if len(row) != len(headers):
            log.msg(format="ignoring row %(csvlnum)d (length: %(csvrow)d, should be: %(csvheader)d)",
                    level=log.WARNING, csvlnum=csv_r.line_num, csvrow=len(row), csvheader=len(headers))
            continue
        yield dict(zip(headers, row))
Пример #2
0
def csviter(obj, delimiter=None, headers=None, encoding=None):
    """Return an iterator of dictionaries built from the given csv object.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields in the given obj;
    when it is not given the csv module's default is used.

    headers is an iterable that, when provided, supplies the keys for the
    returned dictionaries; otherwise the first row is used.

    encoding is used to decode every field. It is ignored when obj is a
    TextResponse (the response's own encoding is used instead) and falls
    back to 'utf-8' when not given.

    Rows whose number of fields differs from the number of headers are
    logged at WARNING level and skipped.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _decode(fields):
        return [str_to_unicode(field, encoding) for field in fields]

    lines = StringIO(body_or_str(obj, unicode=False))
    if delimiter:
        csv_r = csv.reader(lines, delimiter=delimiter)
    else:
        csv_r = csv.reader(lines)

    if not headers:
        # An empty document has no header row. End the generator explicitly
        # instead of letting StopIteration escape from the generator body,
        # which PEP 479 turns into a RuntimeError.
        try:
            headers = _decode(next(csv_r))
        except StopIteration:
            return

    # Iterating the reader with ``for`` stops cleanly at end of input, so no
    # StopIteration has to propagate out of this generator.
    for fields in csv_r:
        row = _decode(fields)
        if len(row) != len(headers):
            log.msg("ignoring row %d (length: %d, should be: %d)" % (csv_r.line_num, len(row), len(headers)), log.WARNING)
            continue
        yield dict(zip(headers, row))
Пример #3
0
 def parseList(self, response):
     """Yield a request to ``parse_items`` for every <loc> entry in the response.

     The response text is scanned with a regular expression; the text
     captured between the opening ``<loc`` match and ``</loc>`` is used as
     the request URL.
     """
     tag = 'loc'
     loc_pattern = re.compile(
         r"(<%s[\s>])(.*?)(</%s>)" % (tag, tag), re.DOTALL)
     body = body_or_str(response)
     for found in loc_pattern.finditer(body):
         # Group 2 is the captured content of the element.
         yield scrapy.Request(found.group(2), self.parse_items)
Пример #4
0
	def parse(self, response):
		"""Yield a Request to ``parse_page`` for each non-empty <loc> entry."""
		tag = 'loc'
		loc_re = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (tag, tag), re.DOTALL)
		for found in loc_re.finditer(body_or_str(response)):
			url = found.group(2)
			# Empty <loc></loc> elements carry no URL to follow.
			if url == '':
				continue
			yield Request(url, callback=self.parse_page)
Пример #5
0
    def test_body_or_str_encoding(self):
        # For each kind of input (the dummy response, a plain string literal
        # and a unicode literal) the ``unicode`` flag alone must decide the
        # type of the returned value.
        for source in (self.dummy_response, 'text', u'text'):
            as_str = body_or_str(source, unicode=False)
            self.assertTrue(isinstance(as_str, str))
            as_unicode = body_or_str(source, unicode=True)
            self.assertTrue(isinstance(as_unicode, unicode))
Пример #6
0
    def test_body_or_str_encoding(self):
        # ``unicode=False`` must give back a ``str`` and ``unicode=True`` a
        # ``unicode`` object, whichever kind of input is passed in.
        sources = [self.dummy_response, 'text', u'text']
        flag_to_type = ((False, str), (True, unicode))
        for source in sources:
            for flag, expected_type in flag_to_type:
                result = body_or_str(source, unicode=flag)
                self.assertTrue(isinstance(result, expected_type))
Пример #7
0
 def parse(self, response):
     """Yield a Request to ``parse_detail`` for every <loc> URL containing 'breed'.

     The response text is scanned with a regular expression for
     ``<loc>...</loc>`` elements; the captured content of each element is
     treated as a URL. Requests are created with ``dont_filter=True``.
     """
     nodename = 'loc'
     text = body_or_str(response)
     loc_re = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename), re.DOTALL)
     # Single pass over the matches: no intermediate list is needed (the
     # previous version collected them into a local that shadowed ``list``).
     for match in loc_re.finditer(text):
         post_url = match.group(2)
         # Only URLs that mention 'breed' are followed.
         if 'breed' in post_url:
             # Yielding the Request hands the URL over to scrapy for download;
             # ``parse_detail`` is the callback given for the response.
             yield Request(url=post_url, callback=self.parse_detail, dont_filter=True)
Пример #8
0
def xmliter(obj, nodename):
    """Return an iterator of XPathSelector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each matched node is parsed on its own: the text found before the first
    ``nodename`` opening tag and the text found after the last ``nodename``
    closing tag are wrapped around it before it is handed to the selector.
    """
    # XML names may contain regex metacharacters (e.g. '.'), so the name is
    # escaped to make every pattern below match it literally.
    nodename_patt = re.escape(nodename)
    HEADER_START_RE = re.compile(r"^(.*?)<\s*%s(?:\s|>)" % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r"<\s*/%s\s*>" % nodename_patt, re.S)
    text = body_or_str(obj)

    header_start = HEADER_START_RE.search(text)
    header_start = header_start.group(1).strip() if header_start else ""
    # NOTE(review): re_rsearch's result is indexed with [1] as the slice
    # start -- presumably the end offset of the last match; confirm against
    # the helper.
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1] :].strip() if header_end else ""

    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename_patt, nodename_patt), re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield XmlXPathSelector(text=nodetext).select("//" + nodename)[0]
Пример #9
0
def xmliter(obj, nodename):
    """Return an iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each matched node is parsed on its own: the text found before the first
    ``nodename`` opening tag and the text found after the last ``nodename``
    closing tag are wrapped around it before it is handed to the selector.
    """
    # XML names may contain regex metacharacters (e.g. '.'), so the name is
    # escaped to make every pattern below match it literally.
    nodename_patt = re.escape(nodename)
    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
    text = body_or_str(obj)

    header_start = HEADER_START_RE.search(text)
    header_start = header_start.group(1).strip() if header_start else ''
    # NOTE(review): re_rsearch's result is indexed with [1] as the slice
    # start -- presumably the end offset of the last match; confirm against
    # the helper.
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename_patt, nodename_patt), re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
Пример #10
0
 def test_body_or_str_extraction(self):
     # body_or_str must return 'dummy_response' for the dummy response
     # fixture and give a plain string back unchanged.
     from_response = body_or_str(self.dummy_response)
     self.assertEqual(from_response, 'dummy_response')
     from_string = body_or_str('text')
     self.assertEqual(from_string, 'text')
Пример #11
0
 def test_body_or_str_input(self):
     # The dummy response and a string are accepted and produce a
     # basestring; an int is expected to raise.
     for accepted in (self.dummy_response, 'text'):
         self.assertTrue(isinstance(body_or_str(accepted), basestring))
     self.assertRaises(Exception, body_or_str, 2)
Пример #12
0
 def test_body_or_str_extraction(self):
     # Each pair is (input, value body_or_str is expected to return for it).
     expectations = (
         (self.dummy_response, 'dummy_response'),
         ('text', 'text'),
     )
     for source, expected in expectations:
         self.assertEqual(body_or_str(source), expected)
Пример #13
0
 def test_body_or_str_input(self):
     # Accepted inputs: the result for the dummy response and for a string
     # must both be basestring instances.
     from_response = body_or_str(self.dummy_response)
     self.assertTrue(isinstance(from_response, basestring))
     from_string = body_or_str('text')
     self.assertTrue(isinstance(from_string, basestring))
     # Rejected input: an int is expected to raise.
     self.assertRaises(Exception, body_or_str, 2)
Пример #14
0
 def test_body_or_str_extraction(self):
     # The dummy response fixture must map to "dummy_response"; a plain
     # string must come back equal to itself.
     response_text = body_or_str(self.dummy_response)
     plain_text = body_or_str("text")
     self.assertEqual(response_text, "dummy_response")
     self.assertEqual(plain_text, "text")