def manual_redirect_02():
    """Re-fetch short department pages whose HTML is a meta-refresh redirect.

    Checks for pages with incorrect links that provide redirects that were
    not followed for some reason by urllib2.  1000 seems to be an ideal
    cutoff length for page size; there is a "dead zone" from 1000-3000ish.

    Side effects: updates matching documents in the EduCrawl.Department
    Mongo collection in place (new ``Link`` and ``Page`` fields) and prints
    each updated document's ``_id``.
    """
    dept_conn = connectMon.MongoConn({'db_name': 'EduCrawl',
                                      'coll_name': 'Department'})
    # Suspiciously short pages are usually just a redirect stub.
    dept_conn.query({"$where": "this.Page.length<1000"})
    for i in range(dept_conn.LastQLen):
        cp = dept_conn.LastQ.next()
        if not cp['Page']:
            continue
        soup = soupypages.makeSoup(cp['Page'])
        for m in soup['soup'].find_all('meta'):
            try:
                if m['http-equiv'].lower() == 'refresh':
                    # BUG FIX: the original tested ``content.find('url')``,
                    # which is -1 (truthy) when absent and 0 (falsy) when
                    # the match is at index 0 -- an inverted check.  Use
                    # substring membership instead.
                    if 'url' in m['content'].lower():
                        cp['Link'] = soupypages_helper.prepend_url(
                            cp['Link'],
                            re.split(u'url=', m['content'].lower())[1])
                        new_page = soupypages.makePage(cp['Link'])
                        if new_page['pass']:
                            cp['Page'] = new_page['page']
                            dept_conn.coll.update({"_id": cp["_id"]}, cp)
                            print(cp["_id"])
                            break
            except KeyError:
                # Meta tags lacking http-equiv/content attributes: skip.
                pass
def get_page(obj, overw=False):
    """Fetch ``obj.url`` and attach the result to *obj* via ``view_page``.

    The fetch is skipped when *obj* already carries a non-empty ``page``
    attribute, unless ``overw`` is True.

    :param obj: object with a ``url`` attribute (and optionally ``page``).
    :param overw: force a re-fetch even if a page is already cached.
    :returns: *obj*, possibly replaced by the result of ``view_page``.
    """
    # IDIOM FIX: the original used ``'page' in set(dir(obj))`` plus a
    # separate truthiness test and ``overw == True``; getattr with a None
    # default covers the missing/empty cases in one short-circuit check.
    if not getattr(obj, 'page', None) or overw:
        obj = view_page(obj, soupypages.makePage(obj.url, hdr=True))
    else:
        print('Page already retrieved. To over-write, recall with overw=True')
    return obj
def manual_redirect_02():
    """Re-fetch short department pages whose HTML is a meta-refresh redirect.

    NOTE(review): this appears to be a duplicate of an identical definition
    earlier in the file; the later definition shadows the earlier one at
    import time -- confirm whether one copy should be removed.

    Checks for pages with incorrect links that provide redirects that were
    not followed for some reason by urllib2.  1000 seems to be an ideal
    cutoff length for page size; there is a "dead zone" from 1000-3000ish.

    Side effects: updates matching documents in the EduCrawl.Department
    Mongo collection in place (new ``Link`` and ``Page`` fields) and prints
    each updated document's ``_id``.
    """
    dept_conn = connectMon.MongoConn({
        'db_name': 'EduCrawl',
        'coll_name': 'Department'
    })
    # Suspiciously short pages are usually just a redirect stub.
    dept_conn.query({"$where": "this.Page.length<1000"})
    for i in range(dept_conn.LastQLen):
        cp = dept_conn.LastQ.next()
        if not cp['Page']:
            continue
        soup = soupypages.makeSoup(cp['Page'])
        for m in soup['soup'].find_all('meta'):
            try:
                if m['http-equiv'].lower() == 'refresh':
                    # BUG FIX: the original tested ``content.find('url')``,
                    # which is -1 (truthy) when absent and 0 (falsy) when
                    # the match is at index 0 -- an inverted check.  Use
                    # substring membership instead.
                    if 'url' in m['content'].lower():
                        cp['Link'] = soupypages_helper.prepend_url(
                            cp['Link'],
                            re.split(u'url=', m['content'].lower())[1])
                        new_page = soupypages.makePage(cp['Link'])
                        if new_page['pass']:
                            cp['Page'] = new_page['page']
                            dept_conn.coll.update({"_id": cp["_id"]}, cp)
                            print(cp["_id"])
                            break
            except KeyError:
                # Meta tags lacking http-equiv/content attributes: skip.
                pass