return found found_books = set() prev = '' for filename in (i for i in os.listdir(arc_dir) if i.endswith('.arc')): if not filename.startswith('20100412'): continue for url, wire in read_arc(arc_dir + '/' + filename): #print filename, url if url.startswith('file'): continue if not url.startswith('http://www.amazon.com/s?'): continue body = read_body(wire) m = re_title.search(body) if m.group(1) != prev: print(m.group(1)) prev = m.group(1) continue doc = fromstring(body) try: doc.get_element_by_id('noResultsTitle') continue except KeyError: pass rc = doc.find_class('resultCount') if rc: m = re_result_count.match(rc[0].text) if m:
found.append(m.group(1)) return found found_books = set() prev = '' for filename in (i for i in os.listdir(arc_dir) if i.endswith('.arc')): if not filename.startswith('20100412'): continue for url, wire in read_arc(arc_dir +'/' + filename): #print filename, url if url.startswith('file'): continue if not url.startswith('http://www.amazon.com/s?'): continue body = read_body(wire) m = re_title.search(body) if m.group(1) != prev: print(m.group(1)) prev = m.group(1) continue doc = fromstring(body) try: doc.get_element_by_id('noResultsTitle') continue except KeyError: pass rc = doc.find_class('resultCount') if rc: m = re_result_count.match(rc[0].text) if m: