Example #1
 def requestAPIForURL(amount):
     # Fetch the column front page and grab the ID of the newest article,
     # which serves as the pagination cursor for the list API.
     html = crawlUtils.crawlWorker("http://finance.nbd.com.cn/", "Anon", 0)[0]
     lastArticleID = nbdCrawlMethod.GET_LA_REGEX.findall(html)[0]
     # Ceiling division: the requested amount is split into API pages of 30.
     amount = float(amount)
     i = amount / 30
     j = amount // 30
     needPages = int(i) if i == j else int(i) + 1
     result = []
     for i in range(1, 1 + needPages):
         try:
             url = "http://finance.nbd.com.cn/columns/119?last_article=%s" % lastArticleID
             APIHTML = crawlUtils.crawlWorker(url, "Anon", 0)[0]
             # Build absolute article URLs from the IDs found on this page.
             links = ["http://www.nbd.com.cn/articles/%s.html" % x
                      for x in nbdCrawlMethod.EXTRACT_LINKS_REGEX.findall(APIHTML)]
             # Update the cursor from the response so the next request
             # continues where this page ended.
             lastArticleID = nbdCrawlMethod.GET_LA_REGEX.findall(APIHTML)[0]
             result += links
         except:
             # Skip pages that fail to download or parse.
             pass
     return result
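The arithmetic at the top of each requestAPIForURL variant is just a ceiling division: it turns the requested number of articles into the number of list pages to fetch (30 items per page here, 10 or 60 in the later examples). A minimal equivalent sketch; pagesNeeded and pageSize are illustrative names that do not appear in the original code:

 import math

 def pagesNeeded(amount, pageSize=30):
     # Equivalent to the i/j comparison above: round up to whole pages.
     return math.ceil(float(amount) / pageSize)

 # pagesNeeded(90, 30) == 3, pagesNeeded(91, 30) == 4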
Example #2
 def requestAPIForURL(amount):
     amount = float(amount)
     i = amount / 30
     j = amount // 30
     needPages = int(i) if i == j else int(i) + 1
     result = []
     for i in range(1, 1 + needPages):
         APIURL = "http://www.dzzq.com.cn/list_111_%s.html" % i
         html = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw']
         links = dzzqCrawlMethod.GET_LINK_REGEX.findall(html)
         for j in links:
             if "list" not in j and j[0] != "S":
                 result.append("http://www.dzzq.com.cn/finance/%s.html" % j)
     return result
Example #3
 def requestAPIForURL(amount):
     amount = float(amount)
     i = amount / 10
     j = amount // 10
     needPages = int(i) if i == j else int(i) + 1
     result = []
     for i in range(1, 1 + needPages):
         APIURL = "http://www.nifa.org.cn/nifa/2955675/2955761/a704445f/index%s.html" % i
         html = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw']
         links = nifaCrawlMethod.GET_LINK_REGEX.findall(html)
         for j in links:
             result.append(
                 "http://www.nifa.org.cn/nifa/%s/%s/%s/index.html" %
                 (j[0], j[1], j[2]))
     return result
Example #4
 def requestAPIForURL(amount):
     amount = float(amount)
     i = amount / 60
     j = amount // 60
     needPages = int(i) if i == j else int(i) + 1
     result = []
     for i in range(2, 2 + needPages):
         APIURL = "http://www.stockstar.com/roll/finance_%s.shtml" % i
         html = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw'] \
             .replace("<html><body><p>", "") \
             .replace("</p></body></html>", "")
         links = zqzxCrawlMethod.GET_LINK_REGEX.findall(html)
         for i in links:
             if "list" not in i and i[0] != "S":
                 result.append("https://finance.stockstar.com/%s.shtml" % i)
     return result
Example #5
 def requestAPIForURL(amount):
     amount = float(amount)
     i = amount / 10
     j = amount // 10
     needPages = int(i) if i == j else int(i) + 1
     result = []
     for i in range(1, 1 + needPages):
         APIURL = "https://www.tmtpost.com/httpsserver/common/get?url=/v1/lists/home&data=offset=%s" % (
             i * 10)
         html = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw']\
             .replace("<html><body><p>", "")\
             .replace("</p></body></html>", "")
         print(html)
         jsonData = json.loads(html)
         for x in jsonData["data"]:
             if x["item_type"] == "post":
                 result.append(x["short_url"])
     return result
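Unlike the HTML list pages in the earlier examples, this endpoint returns JSON, and crawlWorker appears to hand it back wrapped in <html><body><p>…</p></body></html>, which is stripped before parsing. A minimal sketch of the parsing and filtering step, using a hypothetical payload; only the keys used above (data, item_type, short_url) are taken from the example:

 import json

 # Hypothetical wrapped response, shaped like the payload the example expects.
 raw = ('<html><body><p>{"data": [{"item_type": "post", "short_url": "https://www.tmtpost.com/0000000"},'
        ' {"item_type": "other", "short_url": ""}]}</p></body></html>')
 payload = json.loads(raw.replace("<html><body><p>", "").replace("</p></body></html>", ""))
 posts = [x["short_url"] for x in payload["data"] if x["item_type"] == "post"]
 # posts == ["https://www.tmtpost.com/0000000"]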
Example #6
 def getLastestPostID():
     html = crawlUtils.crawlWorker("http://www.chinatimes.net.cn/", "Anon", 0)['raw']
     return int(chinatimesCrawlMethod.EXTRACT_LATEST_RE.findall(html)[0])
Example #7
 def getLastestPostID():
     html = crawlUtils.crawlWorker("http://www.ctoutiao.com/", "Anon",
                                   0)['raw']
     return int(cttCrawlMethod.EXTRACT_LATEST_RE.findall(html)[0])
Example #8
 def getLastestPostID():
     html = crawlUtils.crawlWorker("http://www.ctsbw.com/article/", "Anon",
                                   0)['raw']
     return int(ctsbwCrawlMethod.EXTRACT_LATEST_RE.findall(html)[5])
Example #9
 def getLastestPostID():
     html = crawlUtils.crawlWorker("https://36kr.com/", "Anon", 0)['raw']
     return int(krCrawlMethod.EXTRACT_LATEST_RE.findall(html)[0])
Example #10
 def getLastestPostID():
     html = crawlUtils.crawlWorker("http://www.geekpark.net/", "Anon", 0)['raw']
     return int(geekparkCrawlMethod.EXTRACT_LATEST_RE.findall(html)[1])
Example #11
 def getLastestPostID():
     html = crawlUtils.crawlWorker(
         "http://www.cyzone.cn/content/index/init?tpl=index_page&page=1",
         "Anon", 0)['raw']
     return int(cyzoneCrawlMethod.EXTRACT_LATEST_RE.findall(html)[0])
Example #12
 def getLastestPostID():
     html = crawlUtils.crawlWorker("https://news.pedata.cn/", "Anon",
                                   0)['raw']
     return int(pedataCrawlMethod.EXTRACT_LATEST_RE.findall(html)[0])
Example #13
 def getLastestPostID():
     html = crawlUtils.crawlWorker("http://www.jpm.cn/", "Anon", 0)['raw']
     return int(jpmCrawlMethod.EXTRACT_LATEST_RE.findall(html)[7])
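Examples #6 through #13 all follow one pattern for getLastestPostID: fetch a site's front or listing page, run a pre-compiled site-specific regex over the raw HTML, take a fixed match index (0, 1, 5 or 7 depending on the page layout) and cast it to int. A generic sketch of that pattern; the URL and regex are placeholders, since the actual EXTRACT_LATEST_RE patterns are not shown in the examples, and requests.get stands in for crawlUtils.crawlWorker:

 import re
 import requests

 # Hypothetical pattern; the real per-site regexes differ.
 EXTRACT_LATEST_RE = re.compile(r"/articles/(\d+)\.html")

 def getLatestPostID(url):
     # Stand-in for crawlUtils.crawlWorker(url, "Anon", 0)['raw'].
     html = requests.get(url).text
     # Index 0 assumes the newest post is the first match; the examples above
     # pick index 1, 5 or 7, presumably to skip matches that are not the newest post.
     return int(EXTRACT_LATEST_RE.findall(html)[0])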