예제 #1
0
"""
import sys
import httplib
import re
import fetcher

# url -- proceedings page to scrape. Defaults to the DBLP acl2011 listing and
# can be overridden by the first command-line argument.
url = "http://dblp.uni-trier.de/db/conf/acl/acl2011.html"

if len(sys.argv) > 1:
    url = sys.argv[1]

# Running counter. In the visible part of the loop it is only advanced when a
# paper is skipped because no link could be extracted for it.
index = 0

# NOTE(review): fetcher.fetch_webpage presumably returns the page body as a
# string -- confirm against the fetcher module.
contents = fetcher.fetch_webpage(url)

# href of every anchor that wraps an "Electronic Edition" icon.
paper_links = re.findall(r'<br><a href="(.*?)"><img alt="Electronic Edition"', contents)
# Bold text up to the first "." or "?". The previous class [\.|\?] also
# contained a literal "|" (alternation has no meaning inside [...]), which cut
# names short at any "|" character.
paper_names = re.findall(r'<b>(.*?)[.?]', contents)

if len(paper_names) != len(paper_links):
    # The two lists are matched by position, so a count mismatch means the
    # page was not parsed as expected.
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    print("fetch paper names & links error!")
else:
    for paper_link in paper_links:
        if paper_link.endswith(".pdf"):
            # The listing already points straight at the PDF.
            d_link = paper_link
        else:
            # Otherwise fetch the linked page and take the first href found on it.
            real_path = re.findall(r'<a href="(.*?)"', fetcher.fetch_webpage(paper_link))
            if not real_path:
                # No link on the intermediate page: count the paper and skip it.
                index = index + 1
                continue
            d_link = real_path[0]
예제 #2
0
# Optional command-line overrides: argv[1] replaces pNo, and argv[2] (parsed
# as an int) replaces size.
# NOTE(review): the default values of pNo and size are not visible in this
# fragment -- presumably assigned earlier in the script; confirm.
if (len(sys.argv) > 1):
    pNo = sys.argv[1]

if (len(sys.argv) > 2):
    size = int(sys.argv[2])

# URL template for the detail list. The literal "pNo" (inside the lastquery
# parameter) and "iNo" (the docindex parameter) are placeholders that are
# substituted below.
detail_list_url = "http://acm.lib.tsinghua.edu.cn/acm/Detail-List.nsp?&view=ACM&cid_PCODE=&cid_DOCTYPE=&cid_HASABSTRACT=&cid_HASFULLTEXT=&lastquery=(pNo):PROC_ID&sortfield=SECTION_SEQ_NO,SEQ_NO,PUBDATE&sortorder=ASCENDING,ASCENDING,ASCENDING&var_AUTHCODE=&var_PUBCODE=&var_BROWSECODE=&var_SOURCECODE=&recid=&reccode=&mailto=&docindex=iNo&var_SECTION=&numresults=25&fromrecord=&usertag="

# URL templates in which the literal "path" is a placeholder.
# file_url is not used in the visible part of this fragment.
content_url = "http://166.111.120.94/acm/ContentLoader.nsp?view=path"
file_url = "http://166.111.120.94/acm/path"

# Fill in the first "pNo" placeholder once, before the loop.
detail_list_url = detail_list_url.replace("pNo", pNo, 1)

# Request the detail page for each docindex value 0 .. size-1.
for i in range(0, size) :
    # NOTE(review): fetcher.fetch_webpage presumably returns the page body as
    # a string -- confirm against the fetcher module.
    contents = fetcher.fetch_webpage(detail_list_url.replace("iNo", str(i), 1))

    # Bold text that follows the spacer image and a <br>; presumably the paper
    # title -- confirm against the page markup.
    paper_name = re.findall('<b><img src="img/spacer.gif"><br>(.*?)</b>', contents);
    #print paper_name
    # Every quoted value that appears after `fl = ` in the page.
    paper_path = re.findall('fl = "(.*?)"', contents)
    p_len = len(paper_path)
    index = 0
    # Filter paper_path in place, keeping only entries that end in ".pdf".
    # index advances only when the current entry is kept, because a deletion
    # shifts the next entry into the same position; p_len tracks the shrinking
    # length.
    while index < p_len:
        if not paper_path[index].endswith(".pdf") :
            del paper_path[index]
            p_len = p_len - 1
        else :
            index = index + 1
    # No PDF path on this page: move on to the next docindex.
    if len(paper_path) == 0 :
        continue
    # Fetch the content-loader page for the first PDF path and capture the
    # quoted argument of each top.location.replace("...") call found in it.
    real_path = re.findall('top.location.replace\("(.*?)"', fetcher.fetch_webpage(content_url.replace("path", paper_path[0], 1)))