示例#1
0
文件: dark.py 项目: 3bobo/spider
    def imit(self, url):
        """Fetch ``url`` and collect the links on it that share its domain.

        When ``self.keyword`` is non-empty the candidate links come from
        ``self.search(html, self.keyword)``; otherwise every ``http://`` URL
        matched in the page body is used.  Candidates whose ``domain()``
        differs from that of ``url`` are dropped and the remainder is put
        into ``self.s1``.

        If fetching or processing raises an ordinary exception, an empty list
        is put into ``self.s1`` instead (best-effort crawl).

        Returns ``self.s1.li2``.
        """
        links = Slist()
        page_domain = domain(url)
        try:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Release the connection even if read() fails.
                page.close()

            if self.keyword != '':
                links.put(self.search(html, self.keyword))
            else:
                url_re = re.compile(r'http://[\.\w//]+', re.S)
                # A compiled pattern's findall() takes (string, pos, endpos);
                # flags belong to compile() only.  Passing re.S here would be
                # read as pos=16 and skip the start of the page.
                links.put(url_re.findall(html))
            # presumably removes duplicate entries -- confirm against Slist
            links.l_del_sa()

            # Iterate over a snapshot: l_del_da() is expected to remove the
            # item from links.li2, and mutating a list while looping over it
            # skips elements (verify l_del_da semantics against Slist).
            for link in list(links.li2):
                if domain(link) != page_domain:
                    links.l_del_da(link)

            self.s1.put(links.li2)
            self.s1.l_del_sa()
        except Exception:
            # Best-effort: a failed page contributes nothing.  KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            self.s1.put([])

        return self.s1.li2
示例#2
0
def url_parse(url):
    """Fetch ``url`` and return the links on it that share its domain.

    Side effects: sets the module globals ``nadom`` and ``nerror`` to
    ``'thread_<domain>_<ct>'`` and ``'error_<domain>_<ct>'``, and records
    every rejected (foreign-domain) link via ``s.s_wirte(nerror, link)``.

    When the global ``want`` is not 0 the candidate links come from
    ``search(html, want)``; otherwise every ``http://`` URL matched in the
    page body is used.

    Returns ``urld.li2`` (the remaining same-domain links) on success, or
    ``0`` if fetching or processing raises an ordinary exception.
    """
    global nadom, nerror
    adom = domain(url)
    nadom = 'thread_' + str(adom) + '_' + str(ct)
    nerror = 'error_' + str(adom) + '_' + str(ct)
    try:
        page = urllib2.urlopen(url)
        try:
            html = page.read()
        finally:
            # Release the connection even if read() fails.
            page.close()

        if want != 0:
            par_url = search(html, want)
        else:
            url_re = re.compile(r'http://[\.\w//]+', re.S)
            # A compiled pattern's findall() takes (string, pos, endpos);
            # flags belong to compile() only.  Passing re.S here would be
            # read as pos=16 and skip the start of the page.
            par_url = url_re.findall(html)

        urld = Slist()
        urld.put(par_url)
        # presumably removes duplicate entries -- confirm against Slist
        urld.l_del_sa()

        # Iterate over a snapshot: l_del_da() is expected to remove the item
        # from urld.li2, and mutating a list while looping over it skips
        # elements (verify l_del_da semantics against Slist).
        for link in list(urld.li2):
            if domain(link) != adom:
                urld.l_del_da(link)
                # "s_wirte" is the spelling exposed by the external object.
                s.s_wirte(nerror, link)

        return urld.li2
    except Exception:
        # Best-effort: callers receive 0 on failure.  KeyboardInterrupt and
        # SystemExit are deliberately allowed to propagate.
        return 0