예제 #1
0
    def extractDict(self):
        """Scrape the listing at ``self.urls`` and return the collected fields.

        Returns ``None`` without scraping when ``checkPath`` reports a hit
        for this URL.  Otherwise the scraper selected by ``self.kind`` is run
        ("1" -> ``sell``, "2" -> ``rent``, "3" -> ``buy``, anything else ->
        ``require``), ``makePath`` is called for the URL, the posting
        metadata is written into ``self.fd`` and ``self.fd`` is returned.

        A failure while scraping is logged and otherwise ignored, so the
        returned dict may contain only what was collected before the error
        plus the metadata keys set below.
        """
        # NOTE(review): checkPath/makePath look like an "already processed"
        # marker for this URL -- confirm against their definitions.
        if checkPath(homepath, self.folder, self.urls):
            return None

        try:
            if self.kind == "1":
                self.sell(self.urls)
            elif self.kind == "2":
                self.rent(self.urls)
            elif self.kind == "3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            makePath(homepath, self.folder, self.urls)
            # TODO: a disabled check used to skip postings older than seven
            # days here; it multiplied by 36000 instead of 3600, so fix the
            # constant if it is ever restored.
        except Exception:
            # Best effort: a broken page must not stop the crawl, so only
            # record which URL failed.
            msglogger.info("%s 链接采集异常"%self.urls)

        # Metadata added to every record before it is returned.
        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "gj"
        # Single-argument print: the parentheses keep the output identical
        # under the Python 2 print statement.
        print("%s %s %s %s %s" % (
            "%s.soufun.com" % self.citycode,
            self.citycode,
            self.kind,
            time.strftime("%Y-%m-%d %H:%M:%S"),  # defaults to local "now"
            self.urls,
        ))
        return self.fd
예제 #2
0
    def extractDict(self):    
        if checkPath(homepath,self.folder,self.urls):
            pass
        else:
            try:
                self.fd["posttime"] = 0
                if self.kind=="1":
                    self.sell(self.urls)
                elif self.kind=="2":
                    self.rent(self.urls)
                elif self.kind=="3":
                    self.buy(self.urls)
                else:
                    self.require(self.urls)
                self.fd['city'] = urlparse(self.urls)[1].replace('.58.com',"") 
                makePath(homepath,self.folder,self.urls)                
                #超过七天
#                if self.fd["posttime"]:
#                    if (time.time() -self.fd["posttime"]) > 7*24*36000:return
            except Exception,e:
                msglogger.info("%s 链接采集异常"%self.urls)
#                print "%s||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"%self.urls

            if isDEV:
#                self.fd.update(getDefaultVal(4))
                dfv=getDefaultVal(self.kind)
                for item in dfv.items() :
#                    print item[0],item[1]
                    if item[0] not in  self.fd:
                        self.fd[item[0]]=dfv.get(item[0])
                for item in dfv.items() :
                    print item[0],self.fd[item[0]],type(self.fd[item[0]])
                    
                return
            else:
                dfv=getDefaultVal(self.kind)
                for item in dfv.items() :
#                    print item[0],item[1]
                    if item[0] not in  self.fd:
                        self.fd[item[0]]=dfv.get(item[0])
            try:
                if self.fd['city'] == 'su':self.fd['city'] = 'suzhou'
            except:
                self.fd['city'] = 'suzhou'
            self.fd["is_checked"] = 1      
            self.fd["web_flag"]   = "58"
            if self.fd.get('is_ok')==False:
#                print "jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj"
                self.fd={}
            #print "%s %s %s %s %s"%(("%s.soufun.com"% self.citycode),self.citycode, self.kind ,time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())), self.urls)
            return self.fd
        
            if not self.fd["is_checked"]:
                for i in self.fd.items():
                    print i[0],i[1]                        
            req=urllib2.Request("http://site.jjr360.com/app.php", urllib.urlencode(self.fd))
            p=self.br.open(req).read().strip()
            print p.decode('gbk')
예제 #3
0
파일: ganji.py 프로젝트: ptphp/PyLib
    def extractDict(self):        
        if checkPath(homepath,self.folder,self.urls):
            pass
        else:
            try:
                if self.kind=="1":
                    self.sell(self.urls)
                elif self.kind=="2":
                    self.rent(self.urls)
                elif self.kind=="3":
                    self.buy(self.urls)
                else:
                    self.require(self.urls)
                makePath(homepath,self.folder,self.urls)                
                #超过七天
                
#                if (time.time() -self.fd["posttime"]) > 7*24*36000:return
            except Exception,e:
                self.fd['house_title']=None
                msglogger.info("%s 链接采集异常"%self.urls)
#                print "%s||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"%self.urls
            if isDEV:
#                self.fd.update(getDefaultVal(4))
                dfv=getDefaultVal(self.kind)
                for item in dfv.items() :
#                    print item[0],item[1]
                    if item[0] not in  self.fd:
                        self.fd[item[0]]=dfv.get(item[0])
                for item in dfv.items() :
                    print item[0],self.fd[item[0]],type(self.fd[item[0]])
                    
                return
            else:
                dfv=getDefaultVal(self.kind)
                for item in dfv.items() :
#                    print item[0],item[1]
                    if item[0] not in  self.fd:
                        self.fd[item[0]]=dfv.get(item[0])
            self.fd["is_checked"] = 1        
            self.fd["web_flag"]   = "gj"
            #print "%s %s %s %s %s"%(("%s.soufun.com"% self.citycode),self.citycode, self.kind ,time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())), self.urls)
            return self.fd
        
            if not self.fd["is_checked"]:
                for i in self.fd.items():
                    print i[0],i[1]
            print  "*"*80
예제 #4
0
파일: soufun.py 프로젝트: ptphp/PyLib
def getLinks(d):
    """Run the link crawler for one configuration in an endless loop.

    ``d`` is indexed with the keys "citycode", "kind", "st1" and "st2".
    The first three are passed to ``LinkCrawl``; ``d["st2"]`` is converted
    with ``int`` and used as the number of seconds to sleep after each pass.
    The loop has no exit condition, so the function only ends if something
    raises.
    """
    crawler = LinkCrawl(d["citycode"], d["kind"], d["st1"])
    while 1:
        crawler.runme()
        # Empty gc.garbage in place after every pass.
        gc.garbage[:] = []
        # The delay is looked up again on each pass.
        pause = int(d["st2"])
        time.sleep(pause)
def getContent(clinks,citycode,kind):
    """Scrape one listing, submit it with ``postHost`` and return the fields.

    ``clinks``, ``citycode`` and ``kind`` are passed straight to
    ``ContentCrawl``.  Whatever ``extractDict`` returns is handed to
    ``postHost``; the outcome of that call -- or the exception it raised --
    is printed and written to the message log.  The value from
    ``extractDict`` is returned unchanged.
    """
    crawler = ContentCrawl(clinks, citycode, kind)
    fields = crawler.extractDict()
    try:
        outcome = postHost(fields)
    except Exception as err:
        # A failed post is reported, not propagated: the exception itself
        # becomes the outcome that is printed and logged.
        outcome = err
    print(outcome)
    log_line = "%s|%s|%s" % (clinks, outcome, "")
    msglogger.info(log_line)
    return fields

  
# Manual smoke test, executed only when the file is run as a script:
# one link-crawl pass followed by scraping a single hard-coded listing.
if __name__=="__main__":
    
    # Single pass of the link crawler for city "wuxi", kind "4".
    lc=LinkCrawl(citycode="wuxi",kind="4")
    lc.runme()
    
    # One sample listing URL per kind ("1" to "4"); exactly one of the four
    # lines is left active to choose which scraper extractDict() exercises.
    #cc=ContentCrawl("http://esf.wuxi.soufun.com/chushou/1_119888237_-1.htm#p=1",citycode="wuxi",kind="1")
    #cc=ContentCrawl("http://rent.wuxi.soufun.com/chuzu/1_49544277_-1.htm",citycode="wuxi",kind="2")
    cc=ContentCrawl("http://esf.wuxi.soufun.com/qiugou/1_860333_-1.htm",citycode="wuxi",kind="3")
    #cc=ContentCrawl("http://rent.wuxi.soufun.com/qiuzu/1_55103674_-1.htm",citycode="wuxi",kind="4")
    # The returned value is discarded; the call is made for its side effects.
    cc.extractDict()
    
#    lf=file("link.log")
예제 #5
0
파일: tongcheng58.py 프로젝트: ptphp/PyLib
    lc=LinkCrawl(d["citycode"],d["kind"],d["st1"])
    while True:
        lc.runme()
        time.sleep(int(d["st2"]))
        del gc.garbage[:]
def getContent(clinks,citycode,kind,upc):
    """Scrape one listing and submit the result with ``postHost``.

    All four arguments are passed straight to ``ContentCrawl``.  Whatever
    ``extractDict`` returns is handed to ``postHost``; the outcome of that
    call -- or the exception it raised -- is printed and written to the
    message log together with ``clinks`` and ``kind``.  Nothing is returned.
    """
    crawler = ContentCrawl(clinks, citycode, kind, upc)
    fields = crawler.extractDict()
    try:
        outcome = postHost(fields)
    except Exception as err:
        # A failed post is reported, not propagated: the exception itself
        # becomes the outcome that is printed and logged.
        outcome = err
    print(outcome)
    log_line = "%s|%s|%s" % (clinks, outcome, kind)
    msglogger.info(log_line)
    # Empty gc.garbage in place once the listing has been handled.
    gc.garbage[:] = []
# Script entry point: starts one getLinksThread for city "liaoyang", kind "3".
if __name__=="__main__":    
    # Disabled manual checks kept for reference: a single link-crawl pass,
    # and scraping one sample su.58.com listing URL.
#    lc=LinkCrawl(citycode="liaoyang",kind="4")
#    lc.runme()
#    url1 = "http://su.58.com/ershoufang/6432469244037x.shtml"
#    url2 = "http://su.58.com/zufang/6437371140226x.shtml"
#    url3 = "http://su.58.com/ershoufang/6383611408516x.shtml"
#    url4 = "http://su.58.com/qiuzu/6268009935368x.shtml"
    
#    cc=ContentCrawl([url4],citycode="su",kind="4")
#    cc.extractDict()
    # NOTE(review): functions defined above also use ``gc``; unless it is
    # imported at the top of the file as well, they only work when the
    # module is run as a script -- confirm.
    import gc
    gc.enable()
    # NOTE(review): this dict carries only "citycode" and "kind", while the
    # crawl loop earlier in this file also reads d["st1"] and d["st2"] --
    # confirm that getLinksThread supplies them.
    s=getLinksThread({"citycode":"liaoyang","kind":"3"})
    s.start()
예제 #6
0
def main():
    """Start a worker for every entry of ``citylist`` and log each start.

    For each city, ``startCityThread(city)`` is created and its ``start()``
    method is called, then a line naming the city is written to the message
    log.  (Presumably the object is a thread; its definition is not shown
    here.)
    """
    for city_name in citylist:
        worker = startCityThread(city_name)
        worker.start()
        msglogger.info("%s 线程启动"%city_name)