Example #1
 def _validate(self, link):
     """Return the link if it resolves to a live page, otherwise None."""
     response = self._getresponse(link)
     if not response:
         return None
     logger.debug("Validating link: {link}. Status code: {status}".format(
         link=link, status=response.status_code))
     # Reject non-200 responses and redirects to a URL ending in "404".
     if response.status_code != 200 or response.url[-3:] == "404":
         return None
     return link
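
This and the later examples call a _getresponse helper that isn't shown. A minimal sketch, assuming it wraps requests.get and returns None on any network failure, which matches how every caller treats a falsy result:

 import requests

 def _getresponse(self, url):
     # Hypothetical helper: return the HTTP response, or None if the
     # request fails, as the examples expect.
     try:
         return requests.get(url, timeout=10)
     except requests.RequestException:
         return None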
Example #2
 def getlinks(self, query, numlinks=5, size=(100, 1500)):
     start = datetime.now()
     query = query.replace(" ", "+")
     # _LIST_URL presumably escapes {pageid} as {{pageid}} so it
     # survives this first format() call.
     url = self._LIST_URL.format(query=query, sizefrom=size[0], sizeto=size[1])
     urls = [url.format(pageid=i) for i in range(1, self._NUM_PAGES_SCRAPED + 1)]
     # Each page yields a list of results (possibly containing None);
     # flatten the pages and drop the failures into a real list, so the
     # slice below works under Python 3 as well.
     pages = parallel_map(self._scrapepage, urls)
     links = [link for page in pages for link in page if link is not None]
     stop = datetime.now()
     logger.debug("getlinks took: {time}".format(time=stop - start))
     return links[:numlinks]
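
parallel_map is used throughout but never defined in these examples. One plausible implementation, a thread-pool map that preserves input order; the worker count of 8 is an arbitrary choice for this I/O-bound scraping:

 from multiprocessing.pool import ThreadPool

 def parallel_map(func, iterable):
     # Run func over iterable concurrently; pool.map preserves order.
     pool = ThreadPool(8)
     try:
         return pool.map(func, iterable)
     finally:
         pool.close()
         pool.join()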
Example #3
 def _scrapepage(self, url):
     logger.debug("_scrapepage")
     response = self._getresponse(url)
     if not response:
         logger.error("Couldn't connect to filestube!")
         return []
     html = response.text
     doc = lxml.html.fromstring(html)
     results = doc.cssselect("div.newresult")
     links = []
     for result in results:
         try:
             a_tag = result.cssselect("a.resultsLink")[0]
             links.append(self._BASE_URL + a_tag.get('href'))
         except IndexError:
             logger.debug("No link in this result; skipping.")
             continue
     logger.debug("Number of links: {numlinks}".format(numlinks=len(links)))
     return parallel_map(self._getfilehostlinks, links)
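
The selector logic in _scrapepage can be exercised on its own. A self-contained sketch against invented markup that mirrors the div.newresult / a.resultsLink structure; note that lxml's cssselect() needs the separate cssselect package installed:

 import lxml.html

 html = '<div class="newresult"><a class="resultsLink" href="/dl/1">file</a></div>'
 doc = lxml.html.fromstring(html)
 for result in doc.cssselect("div.newresult"):
     a_tag = result.cssselect("a.resultsLink")[0]
     print(a_tag.get('href'))  # prints: /dl/1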
Example #4
 def _getfilehostlinks(self, url):
     logger.debug("_getfilehostlinks")
     response = self._getresponse(url)
     if not response:
         return None
     html = response.text
     logger.debug("Getting filehost links")
     doc = lxml.html.fromstring(html)
     try:
         links = doc.cssselect("pre#copy_paste_links")[0].text_content().strip("\"").strip().split()
         title = doc.cssselect("div.file-heading h1")[0].text_content()
     except IndexError:
         logger.debug("pre#copy_paste_links or div.file-heading h1 doesn't exist. Skipping..")
         return None
     # Drop the whole result if any of its mirror links fails validation.
     uris = parallel_map(self._validate, links)
     if None in uris:
         return None
     title = title[:-9]  # remove " download" from end of title
     return Link(title=title, uris=uris)
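
Link is constructed here but never defined in these examples. A minimal sketch, assuming a plain namedtuple; the real class may carry more fields:

 from collections import namedtuple

 # Hypothetical container: the examples only ever pass title and uris.
 Link = namedtuple("Link", ["title", "uris"])

With that in place, the call chain getlinks -> _scrapepage -> _getfilehostlinks -> _validate yields a list of Link objects, e.g. Link(title="Some File", uris=["http://host/a", "http://host/b"]).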