# Module-level imports these methods rely on. The BeautifulSoup and slugify
# import paths are assumptions; any Python 2 implementation with the same
# interface will do.
import cookielib
import os
import random
import re
import robotparser
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup
from slugify import slugify


def collect_arxiv_data(self, authors=None, cats=None):
    """
    Collect pdf data from arXiv for the specified authors and categories.
    @param authors: the authors to search for, separated by ' OR '.
        Note: author queries are exact, e.g.
        'Michael I. Jordan OR Michael Jordan OR David Blei OR David M. Blei'
        searches for publications by the two authors under various spellings.
    @param cats: category restrictions
    """
    # TODO handle possible errors in data collection
    # build the arXiv API query from the form parameters
    qry = 'http://export.arxiv.org/api/query?search_query='
    if cats:
        cats = map(lambda x: "cat:" + x, cats)
        if len(cats) > 1:
            cats = '%28' + '+OR+'.join(cats) + '%29'
        else:
            cats = cats[0]
        qry += cats
    if authors:
        authors = authors.lower().split(' or ')
        authors = map(lambda x: '%22' + x.replace(' ', '+') + '%22', authors)
        authors = map(lambda x: "au:" + x, authors)
        authors = '+OR+'.join(authors)
        authors = '%28' + authors.replace(' ', '+') + '%29'
        if cats:
            qry += "+AND+"
        qry += authors
    qry += '&max_results=150'  # ONLINE LIMITATION, remove for standalone or set to 2000
    print qry
    req = urllib2.urlopen(qry, timeout=10)
    soup = BeautifulSoup(req.read())
    titles = soup.findAll('title')
    titles = titles[1:]  # skip the feed's own title, which echoes the query
    titles = map(lambda x: x.text, titles)
    pdf_links = soup.findAll('link', attrs={'title': 'pdf'})
    pdf_urls = map(lambda x: x['href'], pdf_links)
    print 'downloading: %s, %i' % (authors, len(pdf_urls))
    print titles
    print len(pdf_urls)
    # grab the urls in random order so the online version (with its download
    # limits) doesn't end up with all articles from one author
    ct = 0
    for urlnum in random.sample(range(len(pdf_urls)), len(pdf_urls)):
        if self._stream_to_file(urllib2.urlopen(pdf_urls[urlnum], timeout=8),
                                os.path.join(self.data_folder,
                                             slugify(titles[urlnum]) + '.pdf')):
            ct += 1
    print '\n$$$$\nAdded %i files from arXiv, total downloaded content at %0.2f Mb\n$$$$\n' % (ct, self.tot_dl)
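
# A usage sketch, not from the source: the owning collector instance and its
# attributes (data_folder, tot_dl, _stream_to_file) are assumed from the
# method body above. For authors='Michael I. Jordan OR David Blei' and
# cats=['cs.LG', 'stat.ML'], collect_arxiv_data builds and fetches the query
#   http://export.arxiv.org/api/query?search_query=%28cat:cs.LG+OR+cat:stat.ML%29+AND+%28au:%22michael+i.+jordan%22+OR+au:%22david+blei%22%29&max_results=150
# then downloads the linked pdfs in random order into self.data_folder.
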
def collect_www_data(self, url, fsize_limit=5):
    """
    Collect data from the provided url -- currently limited to pdf collection.
    @param url: the page to collect pdf links from
    @param fsize_limit: individual file size limit in Mb
    @return: -12 if collection is rejected by robots.txt
    """
    # honour robots.txt before fetching anything
    rp = robotparser.RobotFileParser()
    up = urlparse.urlparse(url)
    rp.set_url("http://" + up.hostname + "/robots.txt")
    rp.read()
    if not rp.can_fetch("*", url):
        print "Data collection disallowed by robots.txt"
        return -12
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    req = opener.open(url)
    resp = req.read()
    soup = BeautifulSoup(resp)
    # find the pdf links on the page
    data_urls = soup.findAll(name='a', attrs={'href': re.compile(r'\.pdf')})
    for data in data_urls:
        dn_url = data['href']
        if 'http' not in dn_url:
            # it's a relative url -- resolve it against the page url
            dn_url = urlparse.urljoin(url, dn_url)
        try:
            open_url = urllib2.urlopen(dn_url, timeout=8)
            cl = open_url.headers.get('Content-Length')
            if cl:  # skip files that don't report their size
                cl = float(cl) / 1000000
                if cl < fsize_limit and cl + self.tot_dl < self.max_dc:
                    # download file to dataloc
                    fname = dn_url.split('/')[-1].lower()
                    save_file = os.path.join(self.data_folder, fname)
                    print '%s, %0.2f Mb' % (fname, cl)
                    self._stream_to_file(open_url, save_file)
            open_url.close()
        except urllib2.HTTPError:
            print 'Problem accessing %s' % dn_url
            continue
    print "total downloaded: %0.2f Mb" % self.tot_dl
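
# A usage sketch, not from the source: 'collector' stands for whatever
# instance owns these methods, and the target URL is illustrative only.
#   collector.collect_www_data('http://example.edu/~author/papers.html',
#                              fsize_limit=5)
# This checks http://example.edu/robots.txt, scrapes every <a> whose href
# matches '.pdf', and streams each file under 5 Mb into collector.data_folder
# until the self.max_dc download cap would be exceeded.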