forked from weichaoguo/fetch_paper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
acmlib_paper_fetch.py
65 lines (55 loc) · 2.37 KB
/
acmlib_paper_fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#
# (C) Copyright 2012 Vector Guo <vectorguo@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License Version
# 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
"""fetch acmlib papers of a proceeding in batches.
"""
import sys
import httplib
import re
import fetcher
#pNo -- PROC_ID
#size -- PAPER_NUM
pNo = 0
size = 2
if (len(sys.argv) > 1):
pNo = sys.argv[1]
if (len(sys.argv) > 2):
size = int(sys.argv[2])
detail_list_url = "http://acm.lib.tsinghua.edu.cn/acm/Detail-List.nsp?&view=ACM&cid_PCODE=&cid_DOCTYPE=&cid_HASABSTRACT=&cid_HASFULLTEXT=&lastquery=(pNo):PROC_ID&sortfield=SECTION_SEQ_NO,SEQ_NO,PUBDATE&sortorder=ASCENDING,ASCENDING,ASCENDING&var_AUTHCODE=&var_PUBCODE=&var_BROWSECODE=&var_SOURCECODE=&recid=&reccode=&mailto=&docindex=iNo&var_SECTION=&numresults=25&fromrecord=&usertag="
content_url = "http://166.111.120.94/acm/ContentLoader.nsp?view=path"
file_url = "http://166.111.120.94/acm/path"
detail_list_url = detail_list_url.replace("pNo", pNo, 1)
for i in range(0, size) :
contents = fetcher.fetch_webpage(detail_list_url.replace("iNo", str(i), 1))
paper_name = re.findall('<b><img src="img/spacer.gif"><br>(.*?)</b>', contents);
#print paper_name
paper_path = re.findall('fl = "(.*?)"', contents)
p_len = len(paper_path)
index = 0
while index < p_len:
if not paper_path[index].endswith(".pdf") :
del paper_path[index]
p_len = p_len - 1
else :
index = index + 1
if len(paper_path) == 0 :
continue
real_path = re.findall('top.location.replace\("(.*?)"', fetcher.fetch_webpage(content_url.replace("path", paper_path[0], 1)))
if len(real_path) == 0 :
continue
paper_file = open(paper_name[0].replace("/", " ")+'.pdf', 'w')
paper_file.write(fetcher.fetch_webpage(file_url.replace("path", real_path[0], 1)))
paper_file.close()