import re

import connectMon
import soupypages
import soupypages_helper


def manual_redirect_02():
    '''
    Check for pages with incorrect links that provide redirects that
    were not followed for some reason by urllib2.
    1000 seems to be an ideal cutoff length for page size.  "Dead zone"
    from 1000-3000ish.
    '''
    dept_conn = connectMon.MongoConn({'db_name': 'EduCrawl',
                                      'coll_name': 'Department'})
    # Select documents whose stored page text is suspiciously short.
    dept_conn.query({"$where": "this.Page.length<1000"})

    for i in range(dept_conn.LastQLen):
        cp = next(dept_conn.LastQ)
        if cp['Page']:
            soup = soupypages.makeSoup(cp['Page'])
            meta = soup['soup'].find_all('meta')
            for m in meta:
                try:
                    if m['http-equiv'].lower() == 'refresh':
                        try:
                            # A meta refresh tag looks like content="0; url=...".
                            if 'url=' in m['content'].lower():
                                cp['Link'] = soupypages_helper.prepend_url(
                                    cp['Link'],
                                    re.split(u'url=', m['content'].lower())[1])
                                new_page = soupypages.makePage(cp['Link'])
                                if new_page['pass']:
                                    cp['Page'] = new_page['page']
                                    dept_conn.coll.update({"_id": cp["_id"]}, cp)
                                    print(cp["_id"])
                                    break
                        except KeyError:
                            pass
                except KeyError:
                    pass
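
# A minimal standalone sketch of the meta-refresh lookup that
# manual_redirect_02 performs.  It assumes soupypages.makeSoup wraps
# BeautifulSoup; the find_meta_refresh helper and the sample HTML below
# are illustrative, not part of the original module.
import re

from bs4 import BeautifulSoup


def find_meta_refresh(html):
    '''Return the redirect target from a <meta http-equiv="refresh"> tag, or None.'''
    soup = BeautifulSoup(html, 'html.parser')
    for m in soup.find_all('meta'):
        if m.get('http-equiv', '').lower() == 'refresh':
            content = m.get('content', '').lower()
            if 'url=' in content:
                return re.split('url=', content, maxsplit=1)[1]
    return None


# Example: a stub page that redirects via meta refresh.
print(find_meta_refresh('<meta http-equiv="refresh" content="0; url=http://example.edu/dept">'))
# -> http://example.edu/dept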
Example #2
def get_page(obj, overw=False):
    # Fetch the page only if it has not been retrieved yet, or if the
    # caller explicitly asks to overwrite the cached copy.
    if not hasattr(obj, 'page') or not obj.page or overw:
        obj = view_page(obj, soupypages.makePage(obj.url, hdr=True))
    else:
        print('Page already retrieved. To overwrite, recall with overw=True')
    return obj
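
# Hedged usage sketch for get_page.  The Crawler stand-in and the stubbed
# view_page below are illustrative assumptions, not part of the original
# modules; they only demonstrate the lazy-fetch / overwrite pattern, and
# the calls still rely on the real soupypages.makePage being importable.
class Crawler(object):
    def __init__(self, url):
        self.url = url
        self.page = None


def view_page(obj, page_result):
    # Stub: the real view_page presumably attaches the fetched page to obj.
    obj.page = page_result
    return obj


c = Crawler('http://example.edu/dept')
c = get_page(c)              # fetches, since c.page is empty
c = get_page(c)              # prints the "already retrieved" notice
c = get_page(c, overw=True)  # forces a re-fetch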