Exemplo n.º 1
0
    def __getPageAllLink(self,p):
        """Scan one listing page, fetch every recent listing, and report
        whether the page yielded the expected number of links.

        A listing is accepted when its time text mentions minutes or hours,
        or mentions fewer than 8 days.  The first listing that is 8 or more
        days old ends the scan of this page.  Each accepted link is logged,
        passed to ``getContent`` and followed by a ``self.st`` second pause.

        Args:
            p: HTML of a listing page (anything ``PyQuery`` accepts).

        Returns:
            True when exactly 30 links (kinds "1"/"2") or exactly 35 links
            (any other kind) were accepted, False otherwise.
            NOTE(review): 30/35 look like the site's page sizes and the
            result presumably tells the caller whether to request the next
            page -- confirm against the caller.
        """
        house_layout=self.kind in ("1","2")
        if house_layout:
            lis=PyQuery(p)("div.house")
        else:
            lis=PyQuery(p)("div.qiuzu li")
        links=[]
        for li in lis:
            item=PyQuery(li)
            if house_layout:
                # Unicode literals: mixing a non-ASCII byte string with the
                # unicode text returned by PyQuery fails under Python 2.
                tm=(item("p.time").text() or u"").replace(u"个人",u"")
                href=item("p.housetitle a").attr("href")
            else:
                tm=item("span.li5").text() or u""
                href=item("span.li2 a").attr("href")
            if not href:
                # No anchor in this item: nothing to fetch.
                continue
            link=self.baseurl+href
            if self.kind=="4":
                # Kind "4" skips items whose first column reads "合租".
                if (item("span.li1").text() or u"").strip()==u"合租":
                    continue
            if u"天" in tm:
                try:
                    days=int(tm[:tm.find(u"天")])
                except ValueError:
                    # Day count is not a plain number; age unknown, skip it.
                    continue
                if days>=8:
                    break
                links.append(link)
            elif u"小时" in tm or u"分钟" in tm:
                links.append(link)
            else:
                continue
            # The checkPath de-duplication test is disabled here, so every
            # accepted link is fetched.
            LinkLog.info("%s|%s"%(self.kind,link))
            try:
                getContent(link,self.citycode,self.kind)
            except Exception as e:
                print("ganji getContent Exception %s"%e)
            time.sleep(int(self.st))

        expected=30 if house_layout else 35
        return len(links)==expected
Exemplo n.º 2
0
    def __getAllNeedLinks(self):
        """Walk the numbered listing pages and fetch every listing that is
        not older than ``self.endtime``.

        Pages are requested as ``self.baseUrl + self.urlpath % page_number``
        starting at page 1.  The walk stops when the pager text is missing or
        identical to the previous page's, when a row's date compares lower
        than ``self.endtime``, when ``self.kind`` has no row selector, or
        after several consecutive download failures.

        Each row's link is passed to ``getContent`` unless ``checkPath``
        reports it as already present; a ``self.st`` second pause follows
        every row.  Rows without a link are skipped.  Rows whose time text is
        empty are not subject to the age cut-off.

        NOTE(review): dates are compared as strings built from the current
        year plus the row's text, so ``self.endtime`` is presumably
        "YYYY-MM-DD" and listings from the previous year are mis-dated
        around New Year -- confirm.
        """
        max_failures=5
        update_re=re.compile(u'''更新时间:(.*)''')
        cond=True
        idx=0
        checkit="0"
        failures=0
        while cond:
            url=self.baseUrl+self.urlpath%(str(idx+1))
            req=urllib2.Request(url, None, self.header)
            try:
                p=self.br.open(req).read()
            except Exception as e:
                # Retry the same page, but pause and give up eventually
                # instead of spinning forever on a persistent failure.
                failures+=1
                print("58 fetch Exception %s"%e)
                if failures>=max_failures:
                    break
                time.sleep(int(self.st))
                continue
            failures=0
            check=PyQuery(p)("div.pager strong span").text()
            if check is None or check==checkit:
                break
            checkit=check
            if self.kind=="1" or self.kind=="3":
                links=PyQuery(p)("table.tbimg td.t")
            elif self.kind=="2" or self.kind=="4":
                links=PyQuery(p)("table.tblist tr")
            else:
                # No row selector is defined for any other kind.
                break
            p=None
            for link in links:
                row=PyQuery(link)
                if self.kind=="1" or self.kind=="3":
                    found=update_re.search(row.text() or u"")
                    tm=found.group(1) if found else u""
                else:
                    tm=row("td.tc").eq(2).text() or u""
                tm=tm.strip()
                is_relative=u"今天" in tm or u"小时" in tm or u"分钟" in tm
                if tm and not is_relative:
                    ttt="%s-%s"%(time.strftime('%Y'),tm)
                    if ttt<self.endtime:
                        cond=False
                        break
                lk=row("a.t").attr("href")
                if not lk:
                    continue
                if not checkPath(homepath,self.folder,lk):
                    LinkLog.info("%s|%s"%(self.kind,lk))
                    try:
                        getContent(lk,self.citycode,self.kind,self.upc)
                    except Exception as e:
                        print("58 getContent Exception %s"%e)
                time.sleep(int(self.st))
            # Advance to the next page; without this the same page was
            # requested again and the pager check ended the walk at once.
            idx+=1
Exemplo n.º 3
0
Arquivo: ganji.py Projeto: ptphp/PyLib
    def __getAllNeedLinks(self):
        """Walk the paginated listing pages and fetch every listing found.

        Pages are requested as ``self.baseUrl + self.urlpath % ("f" + offset)``
        with the offset advancing by 32 per page.  The walk stops when the
        pager text is missing or identical to the previous page's, or after
        several consecutive download failures.

        For kinds "3" and "4" each entry's time text is also checked: a
        ``MM-DD`` date is prefixed with the current year and the walk stops
        once it compares lower than ``self.endtime``; text mentioning minutes
        or hours is accepted; any other text stops the walk.

        Each link is passed to ``getContent`` unless ``checkPath`` reports it
        as already present.  Entries without a link are skipped.

        NOTE(review): the date check is a string comparison, so
        ``self.endtime`` is presumably "YYYY-MM-DD" -- confirm.
        """
        max_failures=5
        cond=True
        idx=0
        checkit="0"
        failures=0
        while cond:
            url=self.baseUrl+self.urlpath%("f"+str(idx*32))
            try:
                req=urllib2.Request(url, None, self.header)
                p=self.br.open(req).read()
            except Exception as e:
                # Retry the same page, but pause and give up eventually
                # instead of spinning forever on a persistent failure.
                failures+=1
                print("ganji fetch Exception %s"%e)
                if failures>=max_failures:
                    break
                time.sleep(1)
                continue
            failures=0
            check=PyQuery(p)("ul.pageLink li a.c").text()
            if check is None or check==checkit:
                break
            checkit=check
            links=PyQuery(p)("div.list dl")
            p=None
            for link in links:
                entry=PyQuery(link)
                href=entry(" a.list_title").attr("href")
                if not href:
                    # No title anchor in this entry: nothing to fetch.
                    continue
                lk=self.baseUrl+href
                if self.kind=="3" or self.kind=="4":
                    tm=(entry("dd span.time").text() or u"").strip()
                    if re.match(r'''\d{2}-\d{2}''', tm):
                        tm="%s-%s"%(time.strftime('%Y'),tm)
                        if tm<self.endtime:
                            cond=False
                            break
                    elif u"分钟" in tm or u"小时" in tm:
                        # Unicode literals: a non-ASCII byte string cannot be
                        # tested against unicode text under Python 2.
                        pass
                    else:
                        cond=False
                        break
                if not checkPath(homepath,self.folder,lk):
                    LinkLog.info("%s|%s"%(self.kind,lk))
                    try:
                        getContent(lk,self.citycode,self.kind,self.upc)
                    except Exception as e:
                        print("ganji getContent Exception %s"%e)
            idx=idx+1