Python get_insert_news示例，dbConnection.get_insert_news Python示例

示例#1

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def acm():
    """Crawl the ACM site front page and record articles not stored yet.

    The page is downloaded once.  Article anchors and the
    ``<span class="time">`` dates are extracted with regular expressions and
    paired by position.  Each article whose URL is not already among the rows
    returned by ``db.get_all('News')`` is handed to ``db.get_insert_news``.
    No filtering by date is applied.

    Returns:
        None.
    """
    # A single download keeps links and dates from the same page snapshot.
    page = urllib.urlopen("http://acm.hit.edu.cn").read()
    links = re.findall(r"""<h3><a\shref="http://acm.hit.edu.cn/article/
                    \d*
                    \S*
                    </a>""", page, re.M | re.X)
    stamps = re.findall(r"""<span\sclass="time">\d\d\d\d-\d\d-\d\d
                    </span>""", page, re.M | re.X)

    seen = set(row.url for row in db.get_all('News'))
    # zip() stops at the shorter list, so a missing date can no longer raise
    # IndexError half-way through the run.
    for link, stamp in zip(links, stamps):
        # The 10 characters before the 7-character '</span>' are YYYY-MM-DD.
        date = stamp[-17:-7].replace('-', '/')
        # Skip the 13-character '<h3><a href="' prefix; the fixed 34-character
        # window assumes a 4-character article id.
        url = link[13:47]
        if url in seen:
            continue
        seen.add(url)

        # Title starts 2 characters after the URL and ends before '</a>'.
        title = link[49:-4]
        db.get_insert_news('News', str(url), str(title), date)

示例#2

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def today():
    """Crawl the today.hit.edu.cn RSS feed and record today's new items.

    The feed is downloaded once.  ``<title>`` and ``<link>`` elements are
    extracted with regular expressions and paired by position.  A link is
    stored through ``db.get_insert_news`` when the date embedded in its URL
    equals the current local date and its URL is not already among the rows
    returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    feed = urllib2.urlopen("http://today.hit.edu.cn/rss.xml").read()
    titles = re.findall(r"""<title><\S*\s*\S*\s*\S*
                    </title>""", feed, re.M | re.X)
    links = re.findall(r"""<link><\S*\s*\S*\s*\S*
                    </link>""", feed, re.M | re.X)

    current_day = time.strftime('%Y/%m/%d')

    seen = set(row.url for row in db.get_all('News'))
    # Pair every link with the title at the same position.  The previous
    # code advanced the title index only after an insert, so any skipped
    # link shifted later titles onto the wrong links.
    # NOTE(review): assumes both regexes match the same feed items in order.
    for link, raw_title in zip(links, titles):
        body = link.split("[CDATA[")[1]
        # Characters 29..38 of the URL hold 'YYYY/MM-DD' (29 is the length of
        # 'http://today.hit.edu.cn/news/').
        date = body[29:39].replace('-', '/')
        if date != current_day:
            continue
        url = body.split("]]>")[0]
        if url in seen:
            continue
        seen.add(url)

        # Re-encode GBK -> UTF-8, then strip the 16-character
        # '<title><![CDATA[' prefix and the 11-character ']]></title>' suffix.
        title = raw_title.decode('gbk').encode('utf8')[16:-11]
        db.get_insert_news('News', str(url), str(title), date)

示例#3

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def hituc():
    """Crawl hituc.hit.edu.cn and record today's news items not stored yet.

    The page is downloaded once.  News anchors and ``class="news"`` date
    cells are extracted with regular expressions and paired by position.  An
    item is stored through ``db.get_insert_news`` when its date equals the
    current local date and its URL is not already among the rows returned by
    ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib2.urlopen("http://hituc.hit.edu.cn").read()
    anchors = re.findall(r"""/news/Show\S*
                    \s*target="_blank"\s*\S*\s*\S*\s*\S*\s*\S*
                   </a>""", page, re.M | re.X)
    stamps = re.findall(r"""class="news">\d*-\d*-\d*

                    &nbsp;</TD>""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')

    seen = set(row.url for row in db.get_all('News'))
    for anchor, stamp in zip(anchors, stamps):
        # Strip the 13-character 'class="news">' prefix and the 11-character
        # '&nbsp;</TD>' suffix.  The pattern allows unpadded fields such as
        # '2014-9-22', so pad each one before comparing with the zero-padded
        # strftime value; otherwise single-digit months/days never match.
        date = '/'.join(part.zfill(2) for part in stamp[13:-11].split('-'))
        if date != today_str:
            continue
        pieces = anchor.split('target')
        # Drop the closing quote and the space that precede 'target'.
        url = "http://hituc.hit.edu.cn" + pieces[0][0:-2]
        if url in seen:
            continue
        seen.add(url)

        # Skip the 23 characters up to the end of the opening tag, drop the
        # trailing '</a>', then re-encode GBK -> UTF-8.
        title = pieces[1][23:-4].decode('gbk').encode('utf8')
        db.get_insert_news('News', str(url), title, date)

示例#4

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def usp():
    """Crawl www.usp.com.cn and record today's news items not stored yet.

    Each regular-expression match holds the link, the title attribute and a
    trailing date.  An item is stored through ``db.get_insert_news`` when its
    date equals the current local date and its URL is not already among the
    rows returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib2.urlopen("http://www.usp.com.cn").read()
    entries = re.findall(r"""showXinWen_XX.asp\?MC=\S*
                    \s*\S*\s*title\s*=\s*\S*
                    \s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*\s*\S*
                    </font>""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')
    marker = "' target='_blank'  title = '"

    seen = set(row.url for row in db.get_all('News'))
    for raw in entries:
        text = raw.decode('gbk').encode('utf8')
        # The 10 characters before the 7-character '</font>' are the date.
        date = text[-17:-7].replace('-', '/')
        # Kept from the original: echoes every scraped date to stdout.
        print(date)
        if date != today_str:
            continue
        pieces = text.split(marker)
        if len(pieces) < 2:
            # The regex tolerates whitespace variations that this exact
            # marker does not; skip such entries instead of raising
            # IndexError.
            continue
        url = "http://www.usp.com.cn/" + pieces[0]
        if url in seen:
            continue
        seen.add(url)

        # Title is what precedes '><font color=', minus its last 2 characters.
        title = pieces[1].split("><font color=")[0][0:-2]
        db.get_insert_news('News', str(url), title, date)

示例#5

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def studyathit():
    """Crawl studyathit.hit.edu.cn and record today's news not stored yet.

    The page is downloaded once.  ``detail.asp`` anchors and date cells are
    extracted with regular expressions and paired by position.  An item is
    stored through ``db.get_insert_news`` when its date equals the current
    local date and its URL is not already among the rows returned by
    ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib.urlopen("http://studyathit.hit.edu.cn").read()
    # The two bytes after 'class="newstext" >' are written as \xa1\xa4 (a
    # 2-byte bullet; the title slice below skips exactly 2 bytes for it).
    # Spelling them as escapes keeps the pattern independent of the encoding
    # this source file is saved in.
    # NOTE(review): presumably the GBK middle dot -- confirm against the page.
    anchors = re.findall(r"""detail.asp\?id=
                    \d*"\s*
                    class="newstext"\s>\xa1\xa4
                    \S*
                    </a>""", page, re.M | re.X)
    stamps = re.findall(r"""&nbsp;&nbsp;\d\d\d\d-\d\d-\d\d
                    </td>""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')

    seen = set(row.url for row in db.get_all('News'))
    for anchor, stamp in zip(anchors, stamps):
        # The 10 characters before the 5-character '</td>' are YYYY-MM-DD.
        date = stamp[-15:-5].replace('-', '/')
        if date != today_str:
            continue
        # 'detail.asp?id=' is 14 characters; the 18-character window assumes
        # a 4-digit id.
        url = "http://studyathit.hit.edu.cn/" + anchor[0:18]
        if url in seen:
            continue
        seen.add(url)

        # Drop the 2-byte bullet in front and the trailing '</a>'.  The title
        # is stored in the page's own encoding (no re-encoding is done here).
        title = anchor.split(" >")[1][2:-4]
        db.get_insert_news('News', str(url), title, date)

示例#6

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def zsb():
    """Crawl zsb.hit.edu.cn and record today's news items not stored yet.

    Each regular-expression match covers a ``News_Detail_use.asp`` link, its
    title and the text that follows ``</a>``.  An item is stored through
    ``db.get_insert_news`` when the month.day found after ``</a>`` equals the
    current local month and day and its URL is not already among the rows
    returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    entries = re.findall(r"""news/user/News_Detail_use.asp
                        \S *
                        height=800,
                        \s
                        \S* #state
                        \s
                        \S* #menubar
                        \s
                        \S* #resizable
                        \s
                        \S* #scrollbars
                        \s
                        \S* #false
                        \s
                        \S* #target
                        \s
                        \S* #title
                        \s\S*\s*
                        </a>
                        \s*\S*
                        """, urllib2.urlopen("http://zsb.hit.edu.cn").read(), re.M | re.X)

    # Read the clock once so the comparison and the stored date agree.
    now = time.localtime()
    # An accepted item is dated today, so today's full date (with the real
    # year instead of a hard-coded 2014) is what gets stored.
    date = time.strftime('%Y/%m/%d', now)

    seen = set(row.url for row in db.get_all('News'))
    for entry in entries:
        # The previous code compared a 4-character slice with the always
        # 5-character '%m.%d' string, so nothing could ever match.  Parse the
        # numbers instead, which works with or without zero padding.
        # NOTE(review): assumes the text after '</a>' carries the date as
        # 'month.day' (e.g. '9.29') -- confirm against the live page.
        tail = entry.rsplit('</a>', 1)[-1]
        stamp = re.search(r'(?<!\d)(\d{1,2})\.(\d{1,2})(?!\d)', tail)
        if stamp is None:
            continue
        if (int(stamp.group(1)), int(stamp.group(2))) != (now.tm_mon, now.tm_mday):
            continue
        # Fixed 37-character window: the 29-character script path plus 8
        # characters of query string.
        url = "http://zsb.hit.edu.cn/" + entry[0:37]
        if url in seen:
            continue
        seen.add(url)

        # Text between the first '>' and the next one, minus the trailing
        # '</a', re-encoded GBK -> UTF-8.
        title = entry.split(">")[1][0:-3].decode('gbk').encode('utf8')
        db.get_insert_news('News', str(url), title, date)

示例#7

0

显示文件

文件： home.py 项目： ZhFan/HIT-News-Subscription

    def GET(self):
        """Rebuild the stored news from today's links on today.hit.edu.cn.

        Every row returned by ``db.get_all('News')`` is deleted first.  The
        front page is then scraped with a regular expression, and each link
        whose embedded date equals the current local date is inserted once
        through ``db.get_insert_news``.

        Returns:
            None.
        """
        entries = re.findall(r"""'/news/ + # date
                    \w * # 2014
                    / #/
                    \w\w #month
                    - #-
                    \w* #day
                    / #/
                    \w * #words
                    .htm'
                    \s # white space
                    title='
                    \S *
                    '
                    \s""", urllib2.urlopen("http://today.hit.edu.cn").read(), re.M | re.X)

        # Same 'YYYY/MM-DD' layout as the date embedded in the link path.
        today_str = time.strftime('%Y/%m-%d')

        for row in db.get_all('News'):
            db.get_delete('News', "id=" + str(row.id))

        seen = set()
        for entry in entries:
            # Characters 7..16 follow the 7-character "'/news/" prefix.
            # Filter on the raw match first so entries for other days are
            # never decoded.
            if entry[7:17] != today_str:
                continue
            pieces = entry.decode('gbk').encode('utf8').split("title=")
            # Strip the opening quote and the closing quote plus whitespace.
            url = "http://today.hit.edu.cn" + pieces[0][1:-2]
            if url in seen:
                continue
            seen.add(url)

            title = pieces[1][1:-2]
            # NOTE(review): no date argument is passed here -- confirm that
            # get_insert_news accepts three arguments.
            db.get_insert_news('News', str(url), str(title))

示例#8

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def news():
    """Crawl news.hit.edu.cn and record today's articles not stored yet.

    Each regular-expression match holds a quoted ``/articles/...`` path and
    the anchor text.  An article is stored through ``db.get_insert_news``
    when the date embedded in its path equals the current local date and its
    URL is not already among the rows returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    html = urllib.urlopen("http://news.hit.edu.cn").read()
    anchors = re.findall(r""""/articles/ + # date
                    \w * # 2014
                    / #/
                    \w\w #month
                    - #-
                    \w* #day
                    / #/
                    \w * #words
                    .htm"
                    \s* # white space
                    target="_blank">
                    \S*\s*\S*\s*\S*
                    </a>""", html, re.M|re.X)

    today_str = time.strftime('%Y/%m/%d', time.localtime(time.time()))

    known = [row.url for row in db.get_all('News')]
    for raw in anchors:
        text = raw.decode('gbk').encode('utf8')
        # Characters 11..20 follow the 11-character '"/articles/' prefix and
        # hold 'YYYY/MM-DD'.
        date = text[11:21].replace('-', '/')
        pieces = text.split(' target="_blank">')
        if date != today_str:
            continue

        # Strip the surrounding double quotes from the path.
        url = "http://news.hit.edu.cn" + pieces[0][1:-1]
        if url in known:
            continue
        known.append(url)

        # Anchor text without the trailing '</a>'.
        title = pieces[1][0:-4]
        db.get_insert_news('News', str(url), str(title), date)

示例#9

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def jwc():
    """Crawl jwc.hit.edu.cn and record today's notices not stored yet.

    The page is downloaded once.  ``Showgg.asp`` anchors and
    ``class="news"`` dates are extracted with regular expressions and paired
    by position.  A notice is stored through ``db.get_insert_news`` when its
    date equals the current local date and its URL is not already among the
    rows returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib.urlopen("http://jwc.hit.edu.cn").read()
    anchors = re.findall(r"""<a\shref="/news/Showgg.asp\?id=
                   \d\d\d\d"
                   \s
                   \S * #target
                   \s
                   \S*
                   </a>""", page, re.M | re.X)
    stamps = re.findall(r"""class="news">\d\d\d\d-\d*-\d* #date
                    &nbsp;""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')

    seen = set(row.url for row in db.get_all('News'))
    # NOTE(review): pairing by position assumes every date on the page
    # belongs to a matched anchor, in the same order.
    for anchor, stamp in zip(anchors, stamps):
        # Strip the 13-character 'class="news">' prefix and the 6-character
        # '&nbsp;' suffix.  Month and day may be unpadded (pattern uses \d*),
        # so pad them before comparing with the zero-padded strftime value.
        date = '/'.join(part.zfill(2) for part in stamp[13:-6].split('-'))
        if date != today_str:
            continue
        pieces = anchor.split(' target="_blank" class="news">')
        # Skip the 9-character '<a href="' prefix and the closing quote.
        url = "http://jwc.hit.edu.cn" + pieces[0][9:-1]
        if url in seen:
            continue
        seen.add(url)

        # Anchor text without '</a>', re-encoded GBK -> UTF-8.
        title = pieces[1][0:-4].decode('gbk').encode('utf8')
        db.get_insert_news('News', str(url), title, date)

示例#10

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def yzb():
    """Crawl yzb.hit.edu.cn and record today's articles not stored yet.

    The page is downloaded once.  ``class="date"`` cells (MM-DD) and article
    links are extracted with regular expressions and paired by position.  An
    article is stored through ``db.get_insert_news`` when its date, prefixed
    with the current year, equals the current local date and its URL is not
    already among the rows returned by ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib2.urlopen("http://yzb.hit.edu.cn").read()
    stamps = re.findall(r"""class="date">\d\d-\d\d</div>
                    """, page, re.M | re.X)
    links = re.findall(r"""/article/list/view/id/
                        \S*
                        \s*
                        title='
                        \S*\s*\S*
                        '>
                    """, page, re.M | re.X)

    # Use the real current date: the previous hard-coded "2014/09/19" test
    # value (and "2014/" prefix) meant only that single day could ever match.
    today_str = time.strftime('%Y/%m/%d')
    year_prefix = today_str[:5]  # 'YYYY/'

    seen = set(row.url for row in db.get_all('News'))
    for link, stamp in zip(links, stamps):
        # Strip the 13-character 'class="date">' prefix and the 6-character
        # '</div>' suffix, leaving 'MM-DD'.
        date = (year_prefix + stamp[13:-6]).replace('-', '/')
        if date != today_str:
            continue
        # '/article/list/view/id/' is 22 characters; the 25-character window
        # assumes a 3-character id.
        url = "http://yzb.hit.edu.cn" + link[0:25]
        if url in seen:
            continue
        seen.add(url)

        # Value of the title attribute without the closing "'>".
        title = link.split("title='")[1][0:-2]
        db.get_insert_news('News', str(url), title, date)

示例#11

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def ygb():
    """Crawl ygb.hit.edu.cn and record today's news items not stored yet.

    The page is downloaded once.  ``/news/show`` links and parenthesised
    dates are extracted with regular expressions and paired by position.  An
    item is stored through ``db.get_insert_news`` when its date equals the
    current local date and its URL is not already among the rows returned by
    ``db.get_all('News')``.

    Returns:
        None.
    """
    # A single download keeps links and dates from the same page snapshot.
    page = urllib.urlopen("http://ygb.hit.edu.cn").read()
    links = re.findall(r"""/news/show
                    \S*
                    \s\s*
                    \S*\s*\S*\s*\S*\s*\S*"
                    >""", page, re.M | re.X)
    stamps = re.findall(r"""
                \(\s
                \S *
                \s*\)""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')

    seen = set(row.url for row in db.get_all('News'))
    # zip() stops at the shorter list instead of raising IndexError when the
    # page yields fewer dates than links.
    for link, stamp in zip(links, stamps):
        # Drop the 2 leading and 2 trailing characters around the date
        # (presumably '( ' and ' )').
        date = stamp[2:-2].replace('-', '/')
        if date != today_str:
            continue
        pieces = link.split(" title=")
        # Drop the closing quote of the href value.
        url = "http://ygb.hit.edu.cn" + pieces[0][0:-1]
        if url in seen:
            continue
        seen.add(url)

        # Title attribute value without its opening quote and closing '">'.
        title = pieces[1][1:-2]
        db.get_insert_news('News', str(url), title, date)

示例#12

0

显示文件

文件： crawler.py 项目： ZhFan/HIT-News-Subscription

def hqjt():
    """Crawl hqjt.hit.edu.cn and record today's news items not stored yet.

    Each regular-expression match holds a ``show.asp`` link, its title and a
    trailing MM-DD date.  An item is stored through ``db.get_insert_news``
    when its date, prefixed with the current year, equals the current local
    date and its URL is not already among the rows returned by
    ``db.get_all('News')``.

    Returns:
        None.
    """
    page = urllib.urlopen("http://hqjt.hit.edu.cn/").read()
    entries = re.findall(r"""/news/show.asp\?id= #prefix
                    \d\d\d\d\s #id
                    target="_blank"> #
                    \S * #title
                    </a>
                    \s
                    <span
                    \s
                    style="color:\#aaa;font-size:10px;">
                    \d\d-\d\d
                    </span>""", page, re.M | re.X)

    today_str = time.strftime('%Y/%m/%d')
    # The page only shows MM-DD.  Prefix the current year rather than a
    # hard-coded "2014/", which could never equal today's date in any other
    # year.
    year_prefix = today_str[:5]  # 'YYYY/'

    seen = set(row.url for row in db.get_all('News'))
    for entry in entries:
        # The 5 characters before the 7-character '</span>' are MM-DD.
        date = (year_prefix + entry[-12:-7]).replace('-', '/')
        if date != today_str:
            continue
        pieces = entry.split(' target="_blank">')
        url = "http://hqjt.hit.edu.cn" + pieces[0]
        if url in seen:
            continue
        seen.add(url)
        # Kept from the original: echoes each newly found URL to stdout.
        print(url)

        # Anchor text up to '</a>', re-encoded GBK -> UTF-8.
        title = pieces[1].split("</a>")[0].decode('gbk').encode('utf8')

        db.get_insert_news('News', str(url), title, date)