示例#1
0
 def fetchTextData(self, url, channel):
     try:
         soup = self.fetchUrl(url)
         datalist = soup.findAll("tr", {"class": "tr3 t_one"})
         objs = []
         sortType = dateutil.y_m_d()
         print url, len(datalist)
         for item in datalist:
             ahref = item.first("a")
             if ahref != None and item.first("h3") != None:
                 try:
                     if ahref.get('href').count("read-htm") > 0:
                         continue
                     obj = {}
                     obj['fileDate'] = ''
                     name = item.first("h3").text
                     obj['name'] = name.replace("'", "")
                     print name
                     obj['url'] = ahref.get('href')
                     obj['baseurl'] = baseurl
                     obj['channel'] = channel
                     obj['updateTime'] = datetime.datetime.now()
                     #                         self.t_queue.put(TextItemContentParse(ahref.get('href')))
                     ret = self.fetchText(ahref.get('href'))
                     if ret == None:
                         print '没有文章数据', ahref.get('href')
                         continue
                     obj['sortType'] = sortType
                     objs.append(obj)
                 except Exception as e:
                     print common.format_exception(e)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#2
0
 def fetchTextData(self, url, channel):
     try:
         soup = self.fetchUrl(url)
         div = soup.first("div", {"class": "box list channel"})
         if div == None:
             print '没有数据', url
             return []
         datalist = div.findAll("li")
         objs = []
         sortType = dateutil.y_m_d()
         for item in datalist:
             ahref = item.first("a")
             if ahref != None:
                 try:
                     obj = {}
                     obj['fileDate'] = ahref.first('span').text
                     obj['name'] = ahref.text.replace(obj['fileDate'], '')
                     print name
                     obj['url'] = ahref.get('href')
                     obj['baseurl'] = baseurl
                     obj['channel'] = channel
                     obj['updateTime'] = datetime.datetime.now()
                     #                         self.t_queue.put(TextItemContentParse(ahref.get('href')))
                     ret = self.fetchText(ahref.get('href'))
                     if ret == None:
                         print '没有文章数据', ahref.get('href')
                         continue
                     obj['sortType'] = sortType
                     objs.append(obj)
                 except Exception as e:
                     print common.format_exception(e)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#3
0
 def run(self):
     dbVPN = db.DbVPN()
     ops = db_ops.DbOps(dbVPN)
     for i in range(1, maxImgPage):
         objs = self.fetchGirlChannelData(i)
         print "解析 Girl channel图片ok----channel=", self.t_obj[
             'url'], ' size=', len(objs)
         for obj in objs:
             try:
                 sortType = dateutil.y_m_d()
                 obj['sortType'] = sortType
                 ops.inertImgItems(obj)
                 print 'items :', obj['url'], obj[
                     'channel'], " piclen=", len(obj['picList'])
                 for picItem in obj['picList']:
                     item = {}
                     item['itemUrl'] = obj['url']
                     item['picUrl'] = picItem
                     item['origUrl'] = picItem
                     ops.inertImgItems_item(item)
                     print 'items_item :', obj
                 dbVPN.commit()
             except Exception as e:
                 print common.format_exception(e)
     dbVPN.commit()
     dbVPN.close()
示例#4
0
 def fetchImgItemsData(self, url, channel):
     try:
         lis = self.fetchDataHead(url)
         print url, ";itemsLen=", len(lis)
         objs = []
         sortType = dateutil.y_m_d()
         for item in lis:
             obj = {}
             obj['name'] = item.first("div",{"class":"float-left"}).text
             print obj['name']
             obj['url'] = item.get('href')
             obj['fileDate'] = item.first("div",{"class":"float-right"}).text
             obj['baseurl'] = baseurlImg
             obj['channel'] = channel
             obj['updateTime'] = datetime.datetime.now()
             pics = self.fetchImgs(item.get('href'))
             if len(pics) == 0:
                 print '没有 图片文件--', item, '---', url
                 continue
             obj['picList'] = pics
             obj['pics'] = len(pics)
             obj['sortType'] = sortType
             obj['showType'] = 3
             print 'url=', obj['url'], 'filedate=', obj['fileDate'], '  图片数量=', len(pics)
             objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#5
0
 def fetchImgItemsData(self, url, channel):
     soup = self.fetchUrl(baseurl6,url)
     datalist = soup.findAll("li",{"class":"yun1 yun-large1 border-gray"})
     objs = []
     sortType = dateutil.y_m_d()
     for item in datalist:
         ahref = item.first("a")
         if ahref!=None:
             try:
                 obj = {}
                 obj['fileDate'] = ""
                 name = ahref.first("p").text
                 obj['name'] = name
                 obj['url'] = ahref.get('href')
                 obj['baseurl'] = baseurl6
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 
                 pics = self.fetchImgs(obj['url'])
                 if len(pics) == 0:
                     print '没有 图片文件--', obj['url'], '---', url
                     continue
                 obj['picList'] = pics
                 obj['showType'] = 3
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 print name,pics[0],'  url=', obj['url'], '  图片数量=', len(pics)
                 objs.append(obj)
             except Exception as e:
                 print common.format_exception(e)
     return objs
示例#6
0
 def fetchImgItemsData(self, url, channel):
     objs = []
     try:
         lis = self.fetchDataHead(url)
         sortType = dateutil.y_m_d()
         for item in lis:
             ahref = item.first("a")
             if ahref != None:
                 obj = {}
                 name = item.first("span").text
                 obj['name'] = name
                 print name
                 aurl = ahref.get("href")
                 if aurl.count("http") == 0:
                     aurl = '/' + aurl
                 else:
                     aurl = aurl.replace(baseurl, '')
                 obj['url'] = aurl
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = item.first("b", {"class": "b1"}).text
                 pics = self.fetchImgs(baseurl + obj['url'])
                 if len(pics) == 0:
                     print '没有 图片文件--', obj['url'], '---', url
                     continue
                 obj['picList'] = pics
                 obj['showType'] = 3
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 print 'url=', obj['url'], '  图片数量=', len(pics)
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
         return objs
示例#7
0
 def fetchTextData(self, url, channel):
     try:
         soup = self.fetchUrl(url)
         datalist = soup.findAll("li",{"class":"col-xs-12 clearfix news-box"})
         objs = []
         sortType = dateutil.y_m_d()
         for item in datalist:
             ahref = item.first("a")
             if ahref!=None:
                 try:
                     obj = {}
                     obj['fileDate'] = ''
                     name = ahref.get("title")
                     obj['name'] = name
                     print name
                     obj['url'] = ahref.get('href')
                     obj['baseurl'] = baseurl
                     obj['channel'] = channel
                     obj['updateTime'] = datetime.datetime.now()
                     ret = self.fetchText(ahref.get('href'))
                     if ret==None:
                         print '没有文章数据',ahref.get('href')
                         continue
                     obj['sortType'] = sortType
                     objs.append(obj)
                 except Exception as e:   
                     print  common.format_exception(e)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#8
0
文件: text.py 项目: zeus911/myconf
 def fetchTextData(self, url, channel):
     try:
         soup = self.fetchUrl(url)
         div = soup.first("div", {"class": "text-list-html"})
         if div == None:
             print '没有数据', url
             return []
         datalist = div.findAll("li")
         objs = []
         sortType = dateutil.y_m_d()
         for item in datalist:
             ahrefs = item.findAll("a")
             for ahref in ahrefs:
                 obj = {}
                 span = ahref.first('span')
                 if span != None:
                     obj['fileDate'] = span.text
                 else:
                     obj['fileDate'] = ''
                 name = ahref.text.replace(obj['fileDate'], '')
                 obj['name'] = name
                 print name
                 obj['url'] = ahref.get('href')
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 self.t_queue.put(TextItemContentParse(ahref.get('href')))
                 obj['sortType'] = sortType
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#9
0
    def fetchImgItemsData(self, url, channel):
        soup = self.fetchUrl(baseurl10, url)
        div = soup.findAll(
            "li", {"class": "col-md-14 col-sm-16 col-xs-12 clearfix news-box"})
        objs = []
        sortType = dateutil.y_m_d()
        for item in div:
            ahref = item.first("a")
            if ahref != None:
                try:
                    obj = {}
                    obj['fileDate'] = ''
                    obj['name'] = ahref.get("title")
                    obj['url'] = ahref.get('href')
                    obj['baseurl'] = baseurl10
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()

                    pics = self.fetchImgs(obj['url'])
                    if len(pics) == 0:
                        print '没有 图片文件--', obj['url'], '---', url
                        continue
                    obj['picList'] = pics
                    obj['showType'] = 3
                    obj['pics'] = len(pics)
                    obj['sortType'] = sortType
                    print name, pics[0], '  url=', obj['url'], '  图片数量=', len(
                        pics)
                    objs.append(obj)
                except Exception as e:
                    print common.format_exception(e)
        return objs
示例#10
0
文件: img.py 项目: zeus911/myconf
 def fetchImgItemsData(self, url, channel):
     objs = []
     try:
         divs = self.fetchDataHead(url)
         sortType = dateutil.y_m_d()
         for item in divs:
             imgDiv = item.first("div", {"class": "media-image"})
             if imgDiv != None:
                 obj = {}
                 name = item.first("div", {
                     "class": "block-layer block-inner"
                 }).first("a").text
                 obj['name'] = name
                 obj['url'] = imgDiv.first("a").get("href")
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 pics = self.fetchImgs(obj['url'])
                 if len(pics) == 0:
                     print '没有 图片文件--', obj['url'], '---', url
                     continue
                 obj['picList'] = pics
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 obj['showType'] = 3
                 print 'url=', obj['url'], '  图片数量=', len(pics)
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
         return objs
示例#11
0
 def fetchImgGrilChannel(self, url):
     soup = self.fetchUrl(url)
     objs = []
     table = soup.find('div', {"class": "box movie_list"})
     if table == None:
         print '没有 channel:', url
         return None
     aList = table.findAll('a')
     for item in aList:
         obj = {}
         obj['url'] = item.get('href')
         obj['baseurl'] = baseurl
         img = item.find('img')
         if img != None:
             obj['pic'] = img.get('data-original')
         else:
             obj['pic'] = None
         obj['updateTime'] = dateutil.y_m_d()
         obj['rate'] = 1.4
         obj['showType'] = 3
         obj['channel'] = 'porn_sex'
         obj['channelType'] = 'porn_sex'
         obj['name'] = self.fetchImgGrilChannelName(item.get('href'))
         print obj
         objs.append(obj)
     return objs
示例#12
0
文件: sound.py 项目: zeus911/myconf
 def fetchFileData(self, url, channel):
     try:
         soup = self.fetchUrl(url)
         data = soup.first("div", {"class": "text-list-html"})
         objs = []
         sortType = dateutil.y_m_d()
         if data!=None:
             item = data.first("ul")
             if item!=None:
                 ahrefs = item.findAll("a")
                 for ahref in ahrefs:
                     obj = {}
                     span = ahref.first('span')
                     if span != None:
                         obj['fileDate'] = span.text
                     else:
                         obj['fileDate'] = ''
                     name = ahref.get("title")
                     obj['name'] = name
                     obj['url'] = ahref.get('href')
                     obj['baseurl'] = baseurl
                     obj['channel'] = channel
                     obj['updateTime'] = datetime.datetime.now()
                     obj['sortType'] = sortType
                     mp3 = self.fetchMp3(ahref.get('href'))
                     if mp3 == None:
                         print '没有mp3文件--', ahref, '---', url
                         continue
                     print name,mp3
                     obj['file'] = mp3
                     objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#13
0
文件: img.py 项目: zeus911/myconf
 def fetchImgItemsData(self, url, channel):
     try:
         lis = self.fetchDataHead(url)
         objs = []
         sortType = dateutil.y_m_d()
         for item in lis:
             ahrefs = item.findAll("a")
             for ahref in ahrefs:
                 obj = {}
                 span = ahref.first('span')
                 if span != None:
                     obj['fileDate'] = span.text
                 else:
                     obj['fileDate'] = ''
                 name = ahref.text.replace(obj['fileDate'], '')
                 obj['name'] = name
                 print name
                 obj['url'] = ahref.get('href')
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 pics = self.fetchImgs(ahref.get('href'))
                 if len(pics) == 0:
                     print '没有 图片文件--', ahref, '---', url
                     continue
                 obj['picList'] = pics
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 obj['showType'] = 3
                 print 'url=', obj['url'], '  图片数量=', len(pics)
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#14
0
 def fetchImgItemsData(self, url, channel):
     soup = self.fetchUrl(url)
     div = soup.first("div", {"class": "list_art"})
     if div == None:
         print '没有数据', url
         return []
     datalist = div.findAll("li")
     objs = []
     sortType = dateutil.y_m_d()
     for item in datalist:
         ahref = item.first("a")
         if ahref != None:
             try:
                 obj = {}
                 name = ahref.text
                 obj['name'] = name
                 print name
                 obj['url'] = ahref.get('href')
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 obj['fileDate'] = item.first('span').text
                 pics = self.fetchImgs(obj['url'])
                 if len(pics) == 0:
                     print '没有 图片文件--', obj['url'], '---', url
                     continue
                 obj['picList'] = pics
                 obj['showType'] = 3
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 print 'url=', obj['url'], '  图片数量=', len(pics)
                 objs.append(obj)
             except Exception as e:
                 print common.format_exception(e)
     return objs
示例#15
0
文件: img.py 项目: zeus911/myconf
 def fetchImgItemsData(self, url, channel):
     objs = []
     try:
         soup = self.fetchUrl(url)
         sortType = dateutil.y_m_d()
         div = soup.first('div', {'class': 'box list channel'})
         if div != None:
             lis = div.findAll('li')
             for item in lis:
                 ahref = item.first("a")
                 if ahref != None:
                     obj = {}
                     udate = ahref.first('span').text
                     name = ahref.text.replace(udate, '')
                     obj['name'] = name
                     obj['url'] = ahref.get("href")
                     obj['baseurl'] = baseurl
                     obj['channel'] = channel
                     obj['updateTime'] = datetime.datetime.now()
                     pics = self.fetchImgs(obj['url'])
                     if len(pics) == 0:
                         print '没有 图片文件--', obj['url'], '---', url
                         continue
                     obj['picList'] = pics
                     obj['pics'] = len(pics)
                     obj['sortType'] = sortType
                     obj['showType'] = 3
                     print 'url=', obj['url'], '  图片数量=', len(pics)
                     objs.append(obj)
             return objs
     except Exception as e:
         print common.format_exception(e)
         return objs
示例#16
0
 def fetchImgItemsData(self, url, channel):
     try:
         lis = self.fetchDataHead(url)
         print url, ";itemsLen=", len(lis)
         objs = []
         sortType = dateutil.y_m_d()
         for li in lis:
             ahref = li.first("a")
             if ahref != None:
                 obj = {}
                 obj['name'] = ahref.get("title")
                 print obj['name']
                 obj['url'] = ahref.get('href')
                 obj['fileDate'] = ahref.first("span").text
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 pics = self.fetchImgs(ahref.get('href'))
                 if len(pics) == 0:
                     print '没有 图片文件--', ahref, '---', url
                     continue
                 obj['picList'] = pics
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 obj['showType'] = 3
                 print 'url=', obj['url'], 'filedate=', obj[
                     'fileDate'], '  图片数量=', len(pics)
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#17
0
    def fetchImgItemsData(self, url, channel):
        soup = self.fetchUrl(baseurl7, url)
        datalist = soup.findAll("div", {"class": "x3 margin-top"})
        objs = []
        sortType = dateutil.y_m_d()
        for item in datalist:
            ahref = item.first("a")
            if ahref != None:
                try:
                    obj = {}
                    obj['fileDate'] = item.first("span", {
                        "class":
                        "icon-heart text-small text-gray float-right"
                    }).text
                    name = item.first("img").get("alt")
                    obj['name'] = name
                    obj['url'] = ahref.get('href')
                    obj['baseurl'] = baseurl7
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()

                    pics = self.fetchImgs(obj['url'])
                    if len(pics) == 0:
                        print '没有 图片文件--', obj['url'], '---', url
                        continue
                    obj['picList'] = pics
                    obj['showType'] = 3
                    obj['pics'] = len(pics)
                    obj['sortType'] = sortType
                    print name, pics[0], '  url=', obj['url'], '  图片数量=', len(
                        pics)
                    objs.append(obj)
                except Exception as e:
                    print common.format_exception(e)
        return objs
示例#18
0
文件: fix_img.py 项目: zeus911/myconf
def fix3():
    dbVPN = db.DbVPN()
    ops = db_ops.DbOps(dbVPN)
    sortType = dateutil.y_m_d()
    items = ops.getImgItems_itemBySortType(sortType)
    dbVPN.close()
    for obj in items:
        ext = os.path.splitext(obj['picUrl'])[1]
        out = fileOrige + str(obj['id']) + ext
        path = fileCompress + str(obj['id']) + ext
        os.system("wget -O %s %s " % (out, obj['picUrl']))
        os.system("mogrify  -resize 80%x80% " + out)
        os.system("convert  -resize 50%x50% " + out + ' ' + path)
        print 'sync imgok url=', obj['picUrl']
示例#19
0
    def run(self):
        channels = self.parseChannel()
        dbVPN = db.DbVPN()
        ops = db_ops.DbOps(dbVPN)
        sortType = dateutil.y_m_d()
        for obj in channels:
            channel = obj['url']
            url = obj['baseurl']
            ops.inertImgChannel(obj)
            dbVPN.commit()
            imgitem = {}
            imgitem['name'] = '搞笑gif动态'
            imgitem['url'] = 'hugao8.com/category/gao-gif/'
            imgitem['baseurl'] = baseurl2
            imgitem['channel'] = channel
            imgitem['updateTime'] = datetime.datetime.now()
            imgitem['fileDate'] = ''
            imgitem['showType'] = 3
            imgitem['sortType'] = sortType
            pics = []
            for i in range(1, maxImgPage):
                if i != 1:
                    url = "%s%s%s%s" % (obj['baseurl'], "page/", i, "/")
                imgs = self.fetchImgs(url)
                print len(imgs), url
                pics.extend(imgs)
                if len(imgs) == 0:
                    break
                print(i % 2)
                if i % 2 == 0:
                    imgitem['picList'] = pics
                    imgitem['pics'] = len(pics)
                    imgitem['pic'] = pics[0]
                    imgitem['url'] = '%s%s' % ('hugao8.com/category/gao-gif/',
                                               i)
                    ops.inertImgItems(imgitem)
                    dbVPN.commit()
                    print '一次提交', imgitem['url'], len(pics)
                    try:
                        for picItem in imgitem['picList']:
                            item = {}
                            item['itemUrl'] = imgitem['url']
                            item['picUrl'] = picItem
                            ops.inertImgItems_item(item)
                        dbVPN.commit()

                    except Exception as e:
                        print common.format_exception(e)
                    pics = []
        dbVPN.commit()
示例#20
0
 def run(self):
     channels = self.parseChannel()
     dbVPN = db.DbVPN()
     ops = db_ops.DbOps(dbVPN)
     sortType = dateutil.y_m_d()
     for obj in channels:
         channel = obj['url']
         url = obj['baseurl']
         ops.inertImgChannel(obj)
         dbVPN.commit()
         imgitem = {}
         imgitem['name'] = 'gif动态'
         imgitem['url'] = 'forum-47-1.html'
         imgitem['baseurl'] = baseurl5
         imgitem['channel'] = channel
         imgitem['updateTime'] = datetime.datetime.now()
         imgitem['fileDate'] = ''
         imgitem['showType'] = 3
         imgitem['sortType'] = sortType
         pics = []
         for i in range(1, maxImgPage):
             url = "%s%s%s" % (obj['baseurl'].replace("1.html",
                                                      ''), i, ".html")
             imgs = self.fetchImgs(url)
             print len(imgs), url
             pics.extend(imgs)
             if i % 5 == 0:
                 imgitem['picList'] = pics
                 imgitem['pics'] = len(pics)
                 imgitem['pic'] = pics[0]
                 imgitem['url'] = '%s%s' % ('xng666.com/a/gif/', i)
                 ops.inertImgItems(imgitem)
                 dbVPN.commit()
                 print '一次提交', imgitem['url'], len(pics)
                 try:
                     for picItem in imgitem['picList']:
                         item = {}
                         item['itemUrl'] = imgitem['url']
                         item['picUrl'] = picItem
                         ops.inertImgItems_item(item)
                     dbVPN.commit()
                 except Exception as e:
                     print common.format_exception(e)
                 pics = []
     dbVPN.commit()
示例#21
0
    def run(self):
        dbVPN = db.DbVPN()
        ops = db_ops.DbOps(dbVPN)
        ops.inertImgChannel(self.t_obj)
        dbVPN.commit()
        # 有分页
        sortType = dateutil.y_m_d()
        #         channel = self.t_obj['url']
        #         channel = urlparse(self.t_obj['baseurl']).netloc
        for name, url in img_channels.items():
            obj = {}
            obj['name'] = name
            obj['channel'] = self.t_obj['url']
            obj['updateTime'] = datetime.datetime.now()
            obj['fileDate'] = ''
            obj['baseurl'] = baseurl
            obj['showType'] = 3
            #             obj['url'] = url.replace("&", "")
            obj['url'] = urlparse(self.t_obj['baseurl']).path
            print obj['url']
            #             obj['pics'] = len(pics)
            obj['sortType'] = sortType
            pics = []
            for i in range(1, 3):
                url = url + str(i)

                alist = self.fetchDataHead(url)
                print '解析', i, "页--", len(alist)
                for item in alist:
                    pic = self.fetchImgItemData(item.get("href"))
                    if pic == None:
                        continue
                    pics.append(pic)
            obj['picList'] = pics
            obj['pics'] = len(pics)
            ops.inertImgItems(obj)
            for picItem in obj['picList']:
                item = {}
                item['itemUrl'] = obj['url']
                item['picUrl'] = picItem
                ops.inertImgItems_item(item)
            dbVPN.commit()
示例#22
0
 def fetchImgItemsData(self, url, channel):
     try:
         trs = self.fetchDataHead(url)
         print url, ";itemsLen=", len(trs)
         objs = []
         sortType = dateutil.y_m_d()
         for item in trs:
             ahrefs = item.findAll("a")
             if ahrefs == None:
                 continue
             for ahref in ahrefs:
                 match = img_channel_title.search(ahref.text)
                 if match == None:
                     continue
                 obj = {}
                 match = img_channel_date.search(ahref.text)
                 if match != None:
                     obj['fileDate'] = match.group(0)
                 else:
                     obj['fileDate'] = ''
                 name = ahref.text.replace(obj['fileDate'], '')
                 obj['name'] = name
                 obj['url'] = ahref.get('href')
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 pics = self.fetchImgs(ahref.get('href'))
                 if len(pics) == 0:
                     print '没有 图片文件--', ahref, '---', url
                     continue
                 obj['picList'] = pics
                 obj['showType'] = 3
                 obj['pics'] = len(pics)
                 obj['sortType'] = sortType
                 obj['showType'] = 3
                 print 'url=', obj['url'], 'filedate=', obj[
                     'fileDate'], '  图片数量=', len(pics)
                 objs.append(obj)
         return objs
     except Exception as e:
         print common.format_exception(e)
示例#23
0
文件: tofile.py 项目: zeus911/myconf
    def run(self):

        try:
            dbVPN = db.DbVPN()
            ops = db_ops.DbOps(dbVPN)
            sortType = dateutil.y_m_d()
            #             sortType = "2017-07-12"
            for i in range(0, 20000):
                #                 ret = ops.getTextChannelItems(self.t_item["url"], i)
                ret = ops.getTextChannelItemsById(i, sortType)
                if len(ret) == 0:
                    print '写入完毕'
                    break
                print '开始写入 channel :', self.t_item["url"],
                cloase = False
                for item in ret:
                    #                     path = filePATH + str(item['id']) + ".txt"
                    #                     if os.path.exists(path) == False:
                    #                         output = open(path, 'w')
                    #                         output.write(item['file'])
                    #                         output.close()
                    #                         print '写完文件:' + path
                    #                     path = filePATHWeb + str(item['id']) + ".txt"
                    #                     if os.path.exists(path) == False:
                    #                         output = open(path, 'w')
                    #                         output.write(html_parse.filter_tags(item['file']))
                    #                         output.close()
                    #                         print '写完文件:' + path
                    path = filePATHHtml + str(item['id']) + ".html"
                    #                     if os.path.exists(path) == False:
                    output = open(path, 'w')
                    output.write(
                        html_parse.txtToHtml(
                            html_parse.filter_tags(item['file'])))
                    output.close()
                    print '写完文件:' + path
                print '写完页', i
            print 'channel :', self.t_item["url"], '同步完成 len=', len(ret)
            dbVPN.close()
        except Exception as e:
            print common.format_exception(e)
示例#24
0
 def fetchgirlChannelItemsOne(self, item):
     obj = {}
     obj['url'] = item.get("href")
     strName = item.text.replace("[if lt IE 9 ]>",
                                 "").replace("<![endif]", "")
     obj['name'] = html_parse.filter_tags(strName)
     span = item.first('span')
     if span != None:
         obj['fileDate'] = html_parse.filter_tags(
             span.text.replace("[if lt IE 9 ]>",
                               "").replace("<![endif]", ""))
         obj['name'] = obj['name'].replace(obj['fileDate'], '')
     else:
         obj['fileDate'] = ''
     obj['channel'] = self.t_obj['url'].replace("/?m=", '')
     obj['updateTime'] = dateutil.y_m_d()
     obj['baseurl'] = baseurl
     pics = self.fetchImgs(item.get("href"))
     obj['pics'] = len(pics)
     obj['picList'] = pics
     obj['showType'] = 3
     print obj['url'], '解析完毕', obj['channel'], len(pics), obj['name']
     return obj
示例#25
0
 def fetchTextData(self, url, channel):
     try:
         soup = self.fetchUrl(baseurl+url)
         div = soup.first("div", {"class": "novelList"})
         if div == None:
             print '没有数据', url
             return []
         datalist = div.findAll("a")
         objs = []
         sortType = dateutil.y_m_d()
         for item in datalist:
             try:
                 obj = {}
                 span = item.first('div',{"class":"pull-right date    "})
                 if span != None:
                     obj['fileDate'] = span.text
                 else:
                     obj['fileDate'] = ''
                 name = item.first("div",{"class":"pull-left"}).text
                 obj['name'] = name.replace("【完】","")
                 print name
                 obj['url'] = item.get('href')
                 obj['baseurl'] = baseurl
                 obj['channel'] = channel
                 obj['updateTime'] = datetime.datetime.now()
                 ret = self.fetchText(item.get('href'))
                 if ret==None:
                     print '没有文章数据',item.get('href')
                     continue
                 obj['sortType'] = sortType
                 objs.append(obj)
             except Exception as e:   
                 print  common.format_exception(e)
         return objs
     except Exception as e:
         print common.format_exception(e)