Пример #1
0
def getUrlImgs(url, urlid, soup):
    """Download the images referenced by a page and pass them on for processing.

    Every ``<img>`` tag in ``soup`` whose ``src`` ends in a known image
    extension is fetched into the folder configured under section ``files``,
    option ``img``.  The collected file names are then run through ``f7`` and
    handed to ``imageProcessing`` together with ``urlid``.

    Resolution order for each ``src``:
      1. ``urlHelpObj.isUrl(src)`` is true: download ``src`` as is.
      2. Otherwise replace the path of ``url`` with ``src`` and download it
         if ``httpExists`` reports ``1``.
      3. Otherwise try ``<segment>/<src>`` for every non-empty path segment
         of ``url`` and download each candidate that ``httpExists`` accepts.

    Args:
        url: Address of the page the soup was parsed from.
        urlid: Identifier forwarded unchanged to ``imageProcessing``.
        soup: Parsed document exposing ``findAll("img")``.

    Returns:
        None.  Failures for a single image are logged and skipped.
    """
    config = Config()
    myConfiguration = config.getConfParser()
    out_folder = myConfiguration.get('files', 'img')

    # Built once instead of on every loop iteration.
    allowed_extensions = ('.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif',
                          '.tiff')

    imagelist = []
    urlHelpObj = UrlHelper()
    parsed = list(urlparse.urlparse(url))
    pathParsed = parsed[2]

    for image in soup.findAll("img"):
        try:
            src = image['src']
            if not src:
                continue

            filename = src.split("/")[-1]
            outpath = os.path.join(out_folder, filename)
            extension = os.path.splitext(outpath)[1]
            if extension.lower() not in allowed_extensions:
                continue

            if urlHelpObj.isUrl(src):
                retriveImg(src, outpath)
                imagelist.append(filename)
                continue

            # Only the path component is swapped; scheme, host, query and
            # fragment of the page URL are kept, as in the original code.
            parsed[2] = src
            candidate = urlparse.urlunparse(parsed)
            if httpExists(candidate) == 1:
                retriveImg(candidate, outpath)
                imagelist.append(filename)
                continue

            # NOTE(review): every matching segment is downloaded to the same
            # outpath, so the last match wins -- confirm whether stopping at
            # the first hit was intended.
            for segment in pathParsed.split('/'):
                if not segment:
                    continue
                parsed[2] = segment + "/" + src
                candidate = urlparse.urlunparse(parsed)
                if httpExists(candidate) == 1:
                    retriveImg(candidate, outpath)
                    imagelist.append(filename)
        except Exception:
            # Best effort per image, but KeyboardInterrupt / SystemExit are
            # no longer swallowed as a bare ``except:`` would do.
            logger.logInfo("no src found in img tag  : " +
                           str(sys.exc_info()[0]))

    # presumably f7 removes duplicate names -- verify against its definition
    imagelist = f7(imagelist)
    imageProcessing(urlid, imagelist)
Пример #2
0
def getUrlImgs(url, urlid, soup):
    """Fetch the images of a page into the configured folder and process them.

    For each ``<img>`` in ``soup`` with a non-empty ``src`` ending in one of
    the accepted image extensions, the image is downloaded with
    ``retriveImg`` into the directory read from config section ``files``,
    option ``img``.  The resulting file names go through ``f7`` and are then
    given to ``imageProcessing`` along with ``urlid``.

    A ``src`` that ``urlHelpObj.isUrl`` accepts is downloaded directly.
    Otherwise the path of ``url`` is replaced by ``src``; if ``httpExists``
    does not return ``1`` for that address, ``<segment>/<src>`` is tried for
    each non-empty segment of the page path.

    Args:
        url: Address of the page the soup was built from.
        urlid: Identifier passed through to ``imageProcessing``.
        soup: Parsed document exposing ``findAll("img")``.

    Returns:
        None.  An error while handling one image is logged and the loop
        moves on to the next image.
    """
    config = Config()
    myConfiguration = config.getConfParser()
    out_folder = myConfiguration.get('files', 'img')

    # Hoisted out of the loop; membership semantics match the original list.
    image_extensions = ('.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif',
                        '.tiff')

    imagelist = []
    urlHelpObj = UrlHelper()
    parsed = list(urlparse.urlparse(url))
    pathParsed = parsed[2]

    for image in soup.findAll("img"):
        try:
            src = image['src']
            if not src:
                continue

            filename = src.split("/")[-1]
            outpath = os.path.join(out_folder, filename)
            extension = os.path.splitext(outpath)[1]
            if extension.lower() not in image_extensions:
                continue

            if urlHelpObj.isUrl(src):
                retriveImg(src, outpath)
                imagelist.append(filename)
                continue

            # Reuse scheme/host/query of the page URL, swapping only the path.
            parsed[2] = src
            target = urlparse.urlunparse(parsed)
            if httpExists(target) == 1:
                retriveImg(target, outpath)
                imagelist.append(filename)
                continue

            # NOTE(review): all matching segments overwrite the same outpath;
            # confirm whether the first hit should end the search.
            for part in pathParsed.split('/'):
                if not part:
                    continue
                parsed[2] = part + "/" + src
                target = urlparse.urlunparse(parsed)
                if httpExists(target) == 1:
                    retriveImg(target, outpath)
                    imagelist.append(filename)
        except Exception:
            # Keeps the per-image best-effort behaviour while letting
            # KeyboardInterrupt / SystemExit propagate.
            logger.logInfo("no src found in img tag  : "+str(sys.exc_info()[0]))

    # presumably f7 drops repeated names -- verify against its definition
    imagelist = f7(imagelist)
    imageProcessing(urlid, imagelist)
Пример #3
0
    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid,saveimg)
        


                            
# Module-level collaborators used by the driver loop below.
logger = AppLog()
urlproviderObj = Urlprovider()
urlHelpObj = UrlHelper()

# Driver: walk every URL record from the provider, store its header info and
# images, and record one of three status codes:
#   1 - page fetched and processed
#   2 - URL answered but no soup could be built
#   3 - httpExists() rejected the URL
try:

    urls = urlproviderObj.getURLs()

    for url in urls:
        # Read the id before the reachability check.  It used to be assigned
        # only inside the ``if`` branch, so the ``else`` branch either raised
        # NameError (first record) or updated the previous record's status.
        urlid = url['id']
        if httpExists(url['orgurl']):
            soup = getUrlHTMLsoup(url['orgurl'])
            if soup:
                headrDict = urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'],
                                                headrDict['description'])
                getUrlImgs(url['orgurl'], urlid, soup)
                urlproviderObj.updateUrlStatus(urlid, 1)
            else:
                urlproviderObj.updateUrlStatus(urlid, 2)
        else:
            urlproviderObj.updateUrlStatus(urlid, 3)

except Exception:
    # Top-level boundary: log and stop, but do not swallow
    # KeyboardInterrupt / SystemExit as a bare ``except:`` would.
    logger.logInfo("in urldatagrapper.py  : "+str(sys.exc_info()[0]))
Пример #4
0
                           str(sys.exc_info()[0]))

    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid, saveimg)


# Shared helper objects for the script body that follows.
logger = AppLog()
urlproviderObj = Urlprovider()
urlHelpObj = UrlHelper()

# Main loop: for each URL record, save title/description and images, then
# set its status to 1 (processed), 2 (no soup returned) or 3 (httpExists()
# reported the URL as unavailable).
try:

    urls = urlproviderObj.getURLs()

    for url in urls:
        # The id must be known on both branches; previously it was set only
        # when the URL was reachable, so an unreachable URL reused the id of
        # the preceding record (or hit NameError on the very first one).
        urlid = url['id']
        if httpExists(url['orgurl']):
            soup = getUrlHTMLsoup(url['orgurl'])
            if soup:
                headrDict = urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'],
                                                headrDict['description'])
                getUrlImgs(url['orgurl'], urlid, soup)
                urlproviderObj.updateUrlStatus(urlid, 1)
            else:
                urlproviderObj.updateUrlStatus(urlid, 2)
        else:
            urlproviderObj.updateUrlStatus(urlid, 3)

except Exception:
    # Catch-all at the script boundary; ``Exception`` rather than a bare
    # ``except:`` so Ctrl-C and interpreter exit still work.
    logger.logInfo("in urldatagrapper.py  : " + str(sys.exc_info()[0]))