def _getdetail(self):
    # Visit every collected detail-page URL and scrape the title, body text
    # and owner heading, stripping CR/LF so each field fits on one line.
    rdelete = re.compile(r'\r')
    ndelete = re.compile(r'\n')
    for url in self.links.keys():
        ob = gethtml_and_soup.Gethtml_and_soup(url, random.randint(10, 30))
        try:
            html = ob.gethtml()
            soup = ob.getsoup()
            title = ob.soup.find('h1').text
            title = rdelete.sub('', title)
            title = ndelete.sub('', title)
            content = ob.soup.find('div', {'class': 'userCarPhotoMemo'}).text
            content = rdelete.sub('', content)
            content = ndelete.sub('', content)
            owner = ob.soup.find('h2', {'class': 'car_title car_header'}).text
        except Exception as e:
            # Any fetch or parse failure marks the whole record as missing.
            print(e)
            print('url:{url}'.format(url=url))
            title = '=NODATA='
            content = '=NODATA='
            owner = '=NODATA='
        self.links[url]['title'] = title
        self.links[url]['content'] = content
        self.links[url]['ownerdetail'] = owner
def getlinks(self):
    for i in range(1, 50):
        print('page:', i)
        self.url = self.baseurl.format(page=i)
        self.ob = gethtml_and_soup.Gethtml_and_soup(
            self.url, random.randint(10, 30))
        try:
            html = self.ob.gethtml()
        except Exception as e:
            print(e)
            break
        self.ob.getsoup()
        self._spoitlink()
        self._getdetail()
        self.save_contents()
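# Sketch (not part of the original): after getlinks() and _getdetail() run,
# self.links maps each detail-page URL to a dict with the keys 'title',
# 'content' and 'ownerdetail'. The helper below only illustrates persisting
# that shape; the real save_contents() format is not shown here, and the
# file name 'contents.csv' is an assumption.
import csv

def dump_links_to_csv(links, path='contents.csv'):
    # links: {url: {'title': ..., 'content': ..., 'ownerdetail': ...}}
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'title', 'ownerdetail', 'content'])
        for url, row in links.items():
            writer.writerow([url,
                             row.get('title', '=NODATA='),
                             row.get('ownerdetail', '=NODATA='),
                             row.get('content', '=NODATA=')])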
def run(self):
    # Worker thread: pull detail-page URLs from self.queue until a None
    # sentinel arrives, scrape each page, and push the result tuple onto
    # self.queue2.
    rdelete = re.compile(r'\r')
    ndelete = re.compile(r'\n')
    while True:
        url = self.queue.get()
        logging.warning('NAME:{name}---{url}'.format(name=self.getName(), url=url))
        if url is None:
            break
        time.sleep(10)
        ob = gethtml_and_soup.Gethtml_and_soup(url, random.randint(10, 30))
        try:
            html = ob.gethtml()
            soup = ob.getsoup()
        except Exception as e:
            print(e, ':row33')
        try:
            ob.title = ob.soup.find('h1').text
            ob.title = rdelete.sub('', ob.title)
            ob.title = ndelete.sub('', ob.title)
        except Exception as e:
            print(e, ':row39')
            ob.title = '=NO DATA='
        try:
            ob.content = ob.soup.find('div', {'class': 'userCarPhotoMemo'}).text
            ob.content = rdelete.sub('', ob.content)
            ob.content = ndelete.sub('', ob.content)
        except Exception as e:
            print(e, 'NAME:{name}--:row46'.format(name=self.getName()))
            ob.content = '=NO DATA='
        try:
            ob.owner = ob.soup.find('h2', {'class': 'car_title car_header'}).text
        except Exception as e:
            print(e, 'NAME:{name}--:row51'.format(name=self.getName()))
            ob.owner = '=NO DATA='
        logging.debug('{title}({owner})--{url}--{content}'.format(
            title=ob.title, owner=ob.owner, url=ob.url, content=ob.content))
        self.queue2.put((ob.url, ob.title, ob.owner, ob.content))
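# Sketch (not part of the original): run() above reads URLs from self.queue
# until it sees None and pushes (url, title, owner, content) tuples onto
# self.queue2. The wiring below shows the usual way to feed such workers;
# worker_cls stands for the Thread subclass that owns run(), its
# (in_q, out_q) constructor signature and the worker count of 4 are assumptions.
import queue
import threading

def run_workers(urls, worker_cls, num_workers=4):
    in_q, out_q = queue.Queue(), queue.Queue()
    workers = [worker_cls(in_q, out_q) for _ in range(num_workers)]
    for w in workers:
        w.start()
    for url in urls:
        in_q.put(url)
    for _ in workers:
        in_q.put(None)          # one sentinel per worker ends its loop
    for w in workers:
        w.join()
    results = []
    while not out_q.empty():
        results.append(out_q.get())
    return results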
def getlinks(self):
    # Walk the listing pages, collect detail-page URLs via _spoitlink(),
    # then store every link in the sqlite database; duplicates hit the
    # unique constraint on url and are only reported, not stored twice.
    for i in range(1, 50):
        print('page:', i)
        self.url = self.baseurl.format(page=i)
        self.ob = gethtml_and_soup.Gethtml_and_soup(
            self.url, random.randint(10, 30))
        try:
            html = self.ob.gethtml()
            soup = self.ob.getsoup()
            self._spoitlink()
        except Exception as e:
            print(e)
            break
    insertsql = 'insert into links(url) values(?)'
    with db.connect('minkara.db') as con:
        con.execute('create table if not exists links(url text unique)')
        con.commit()
        for k in self.links.keys():
            try:
                con.execute(insertsql, (k, ))
                con.commit()
            except Exception as e:
                print(e, k)
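# Sketch (not part of the original): the loop above stores every scraped
# link in the 'links' table of minkara.db. Assuming 'db' is Python's
# sqlite3 module, the rows can be read back like this (e.g. to feed the
# queue-based workers above).
import sqlite3

def load_links(path='minkara.db'):
    with sqlite3.connect(path) as con:
        try:
            return [row[0] for row in con.execute('select url from links')]
        except sqlite3.OperationalError:
            return []   # table does not exist yet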