import urllib2
from random import randint, shuffle
from time import sleep

# NatureIndexParser, NatureIssueParser and NatureArticleParser are assumed to
# be defined (or imported) elsewhere in this module.


class NatureSpider:
    "Class for handling hierarchy of and fetching reference files "
    "from Nature websites "

    def __init__(self):
        self.opener = urllib2.build_opener()
        self.opener.addheaders = [("User-agent", "Mozilla/5.0")]
        self.idxparser = NatureIndexParser()
        self.issues = []
        self.articles = []
        self.download_dir = ""

    def __getResource(self, resourceName):
        "Dynamically return a resource handle based on its name"
        sleep(randint(1, 5))
        if resourceName[:4] == "http":
            f = self.opener.open(resourceName)
        else:
            f = open(resourceName)
        return f

    def __download(self, url):
        "Despite its clever name, this function actually bakes cakes."
        webFile = self.opener.open(url)
        localFile = open(self.download_dir + url.split("/")[-1], "wb")
        localFile.write(webFile.read())
        webFile.close()
        localFile.close()

    def readIndex(self, resourceName):
        "Reads a resource to parse for links to issues. This should be a "
        "filename or url. urls are assumed to start with 'http'"
        f = self.__getResource(resourceName)
        self.idxparser.parse(f.read())

    def readIssue(self, resourceName):
        "Read a resource to fetch all links to article texts, and push it "
        "on to the list of issues."
        if resourceName == "":
            return
        i = NatureIssueParser()
        f = self.__getResource(resourceName)
        i.parse(f.read())
        self.issues.append(i)

    def readArticle(self, resourceName):
        "Read a resource to fetch all links to article reference files, "
        "and push it on to a list of articles"
        a = NatureArticleParser()
        f = self.__getResource(resourceName)
        a.parse(f.read())
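        # keep only articles that yielded more than one reference link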
        if len(a.links) > 1:
            print "fetched links", " and ".join(a.links)
            self.articles.append(a)

    def indexLinks(self):
        "Return a list of links from the idxparser, randomized, so that "
        "site access is non-sequential"
        shuffle(self.idxparser.links)
        return self.idxparser.links

    def articleLinks(self):
        "Compile and return a randomized list of links to article pages "
        "from the tree of links in issues"
        a = []
        for i in self.issues:
            for l in i.links:
                a.append(l)
        shuffle(a)
        return a

    def readIssues(self):
        "Loop over links provided in the idxparser, pushing NatureIssue "
        "instances onto self.issues via readIssue"
        i = 0
        for l in self.indexLinks():
            if i > 1:  # stopgap for testing, so as not to
                break  # pull the whole site every time
            print "reading issue", l
            if l[:4] == "http":  # skip absolute links; only follow site-relative ones
                continue
            self.readIssue("http://www.nature.com" + l)
            i += 1

    def readArticles(self):
        "Parse all articles queued in self.articleLinks() by "
        "mapping self.readArticle, filling up self.articles"
        for l in self.articleLinks():
            self.readArticle("http://www.nature.com" + l)

    def fetchReferences(self):
        "Iterate over a populated articles list, and download "
        "all .ris files"
        for a in self.articles:
            for l in a.links:
                print "Downloading", l
                self.__download("http://www.nature.com" + l)
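

# Minimal driver sketch. The index resource, download directory and the order
# of calls below are illustrative assumptions, not values taken from elsewhere
# in this module.
if __name__ == "__main__":
    spider = NatureSpider()
    spider.download_dir = "ris/"  # hypothetical; directory must already exist
    # Start from a saved index page or a live URL (anything starting with
    # "http" is fetched over the network, anything else is opened as a file).
    spider.readIndex("nature_index.html")
    spider.readIssues()       # read a couple of issue pages (capped for testing)
    spider.readArticles()     # collect reference-file links from each article
    spider.fetchReferences()  # download the .ris files into download_dir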