def save(source, type, categoryId, statsDate, cnt, flag):
    # Upsert one day's crawl counter into spider_stats: insert a new row for
    # (source, type, categoryId, statsDate), or update createNum/updateNum.
    conn = db.connect_torndb_crawler()
    #conn = db.connect_torndb()
    stats = conn.get(
        "select * from spider_stats where source=%s and type=%s and categoryId=%s and statsDate=%s limit 1",
        source, type, categoryId, statsDate)
    if stats is None:
        if flag == 'create':
            sql = "insert spider_stats(statsDate,categoryId,categoryName,source,sourceName,type,typeName,createNum) \
                   values(%s,%s,%s,%s,%s,%s,%s,%s)"
        else:
            sql = "insert spider_stats(statsDate,categoryId,categoryName,source,sourceName,type,typeName,updateNum) \
                   values(%s,%s,%s,%s,%s,%s,%s,%s)"
        conn.insert(sql, statsDate, categoryId, idmap[categoryId],
                    source, idmap[source], type, idmap[type], cnt)
    else:
        stats_id = stats["id"]
        if flag == 'create':
            conn.update("update spider_stats set createNum=%s where id=%s", cnt, stats_id)
        else:
            conn.update("update spider_stats set updateNum=%s where id=%s", cnt, stats_id)
    conn.close()
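# Usage sketch for save(): a hedged example, assuming it is called once at the
# end of a crawl run. The id values and count below are illustrative, not taken
# from this module (idmap must contain display names for each id used).
#
#   save(source=13020, type=601, categoryId=2010,
#        statsDate=time.strftime("%Y-%m-%d"), cnt=57, flag='create')
#
# flag='create' writes createNum; any other flag writes updateNum.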
def fetch_proxies3():
    # Reload the in-memory proxy list from the proxy_tyc table and reset the
    # rotation cursor. The while loop runs at most once (both branches break);
    # if the table is empty it falls back to fetch_proxies1().
    logger.info("fetch_proxies3")
    global proxies, proxies_num, proxies_current, proxies_priority
    proxies_priority = 0
    proxies_temp = []
    while True:
        conn = db.connect_torndb_crawler()
        sql = 'select * from proxy_tyc'
        results = conn.query(sql)
        for result in results:
            proxy = {
                "ip:port": "%s:%s" % (result["ip"], result["port"]),
                "http_type": result["type"]
            }
            logger.info(proxy)
            proxies_temp.append(proxy)
        conn.close()
        proxies = proxies_temp
        proxies_num = len(proxies)
        proxies_current = 0
        logger.info("proxies_num=%d, proxies_current=%d" % (proxies_num, proxies_current))
        if proxies_num == 0:
            #time.sleep(60)
            fetch_proxies1()
            break
        break
def insert_news(item, company_id):
    conn = db.connect_torndb_crawler()
    date = item["date"]
    table_id = get_company_table(conn, company_id)
    # Dedupe on (companyId, title); if the stored row has no date yet,
    # backfill it from the freshly crawled item. (The original passed
    # result["date"], which is always None on this path.)
    result = conn.get(
        "select id, date from news" + table_id + " where companyId=%s and title=%s limit 1",
        company_id, item["title"])
    if result is not None:
        if result['date'] is None:
            conn.update("update news" + table_id + " set date=%s where id=%s",
                        date, result["id"])
        return
    if item["domainId"] is None:
        item["domainId"] = 0
    sql = "insert news" + table_id + "(companyId, date, title, link, domainId, createTime) values(%s,%s,%s,%s,%s, now())"
    newsId = conn.insert(sql, company_id, date, item["title"], item["url"], item["domainId"])
    # News dated today is also indexed in news_latest.
    today = "%s" % time.strftime("%Y-%m-%d", time.localtime())
    if date is not None:
        if today in date:
            sql = "insert news_latest(companyId,newsId,newsTable,date,createTime) values(%s,%s,%s,%s, now())"
            conn.insert(sql, company_id, newsId, table_id, date)
    index = 1
    for c in item["contents"]:
        content = ""
        image_id = ""
        if c["type"] == "text":
            content = c["data"]
        if c["type"] == "img":
            # Download the image, shrink it to at most 640px wide, and store
            # the JPEG in GridFS; on any failure the image is simply skipped.
            imageUrl = c["data"]
            try:
                r = requests.get(imageUrl, timeout=60)
                img = Image.open(StringIO(r.content))
                (width, height) = img.size
                if width > 640:
                    ratio = 640.0 / width
                    img = img.resize((int(width * ratio), int(height * ratio)), Image.ANTIALIAS)
                output = StringIO()
                img.save(output, format='jpeg')
                image_id = imgfs.put(output.getvalue(), content_type='jpeg', filename='news.jpg')
            except:
                pass
        sql = "insert news_content" + table_id + "(newsId,content,image,rank) values(%s,%s,%s,%s)"
        conn.insert(sql, newsId, content, image_id, index)
        index += 1
    conn.close()
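# Expected shape of `item` (reconstructed from the reads above; only the keys
# come from this function, the values are illustrative):
#
#   item = {
#       "date": "2016-01-01 12:00:00",
#       "title": u"Some headline",
#       "url": "http://example.com/news/1",
#       "domainId": None,                # normalized to 0 above
#       "contents": [
#           {"type": "text", "data": u"first paragraph"},
#           {"type": "img", "data": "http://example.com/a.jpg"},
#       ],
#   }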
def get_single_proxy(proxy):
    # Pick one random proxy matching the given filters. The SQL is built by
    # string concatenation, so callers must pass trusted values only.
    conn = db.connect_torndb_crawler()
    country = ''
    isp = ''
    ping = ''
    transfer = ''
    if proxy.get('country') is not None:
        country = ' and country= "' + proxy['country'] + '"'
    #if proxy.get('isp') != None:
    #    isp = ' and isp= "' + proxy['isp'] + '"'
    isp = ' and isp!="移动"'  # always exclude China Mobile proxies
    if proxy.get('ping') is not None:
        ping = ' and pingTime < ' + str(proxy['ping'])
    if proxy.get('transferTime') is not None:
        transfer = ' and transferTime < ' + str(proxy['transferTime'])
    sql = 'select * from proxy where type="' + proxy['type'] + \
          '" and anonymity= "' + proxy['anonymity'] + '" ' + \
          isp + country + ping + transfer + ' order by rand() limit 1'
    result = conn.get(sql)
    conn.close()
    return result
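# Usage sketch (hedged: the filter values are illustrative; only 'type' and
# 'anonymity' are required, the other keys are optional as handled above):
#
#   p = get_single_proxy({"type": "http", "anonymity": "high",
#                         "country": "cn", "ping": 3, "transferTime": 5})
#   if p is not None:
#       use_proxy(p)   # hypothetical caller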
def release_proxy(proxy):
    if proxy["id"] == 0:
        return
    conn = db.connect_torndb_crawler()
    conn.execute("update proxy_tyc set status=0, createTime=now() where id=%s", proxy["id"])
    conn.close()
def get_proxy():
    # Lease the least-failing free proxy that has been idle for at least
    # 2 seconds, and mark it busy (status=1).
    conn = db.connect_torndb_crawler()
    proxy = conn.get(
        "select * from proxy_tyc where status = 0 and DATE_ADD(createTime,INTERVAL 2 SECOND) < now() order by fail limit 1"
    )
    #proxy = conn.get("select * from proxy where type like 'socks%%' order by rand() limit 1")
    #proxy = conn.get("select * from proxy where type='http' order by rand() limit 1")
    if proxy is not None:
        conn.execute("update proxy_tyc set status=1 where id=%s", proxy["id"])
        logger.info(proxy)
    conn.close()
    return proxy
def delete_last_proxy_from_db():
    global proxies, proxies_num, proxies_current
    if proxies_current - 1 >= 0:
        logger.info("delete")
        proxy_ip = proxies[proxies_current - 1]
        logger.info(proxy_ip)
        ip_port = proxy_ip["ip:port"].split(":")
        conn = db.connect_torndb_crawler()
        conn.execute(
            "delete from proxy_tyc where ip=%s and port=%s and type=%s",
            ip_port[0], int(ip_port[1]), proxy_ip["http_type"])
        conn.close()
def proxy_fail(proxy):
    # Record a failure; a proxy is evicted from the pool once its fail
    # counter exceeds 30.
    if proxy["id"] == 0:
        return
    conn = db.connect_torndb_crawler()
    result = conn.get("select * from proxy_tyc where id=%s", proxy["id"])
    if result is not None:
        fail_num = result["fail"]
        if fail_num > 30:
            conn.execute("delete from proxy_tyc where id=%s", proxy["id"])
        else:
            conn.execute("update proxy_tyc set fail=%s where id=%s", fail_num + 1, proxy["id"])
    conn.close()
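# Lifecycle sketch for the proxy_tyc helpers: a hedged example, assuming
# crawl_one is a hypothetical request function (proxy_success is defined
# further below in this module).
#
#   proxy = get_proxy()              # lease: status 0 -> 1
#   if proxy is not None:
#       try:
#           crawl_one(proxy)
#           proxy_success(proxy)     # reset the fail counter
#       except Exception:
#           proxy_fail(proxy)        # count the failure, evict after 30
#       finally:
#           release_proxy(proxy)     # status back to 0, restart cooldown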
def begin():
    global total, cnt
    NUM = 1000
    conn = db.connect_torndb()
    # Block until at least one proxy is free and past its 2-second cooldown.
    while True:
        conn2 = db.connect_torndb_crawler()
        result = conn2.get(
            "select count(*) cnt from proxy_tyc where status = 0 and DATE_ADD(createTime,INTERVAL 2 SECOND) < now()"
        )
        conn2.close()
        if result["cnt"] > 0:
            break
        time.sleep(5)
    # Page through company_alias in batches of NUM, skipping names that are
    # already crawled or that contain a '/'.
    while True:
        logger.info("cnt=%d" % cnt)
        cs = conn.query(
            "select * from company_alias where type=12010 order by id limit %s,%s",
            cnt, NUM)
        if len(cs) <= 0:
            logger.info("Finish.")
            exit()
        request_num = 0
        for c in cs:
            key = c["name"].strip().replace("(", u"(").replace(")", u")").\
                replace("?", "").replace(" ", "").replace("'", "").\
                replace(".", "").replace(";", "")
            if key.find("/") != -1:
                cnt += 1
                continue
            if collection.find_one({
                    "source": SOURCE,
                    "type": TYPE,
                    "key": key
            }) is not None:
                cnt += 1
                continue
            logger.info(key)
            first_request(key, first=True)
            request_num += 1
        if request_num == 0:
            continue
        else:
            break
    conn.close()
def proxy_verify(type):
    while True:
        try:
            conn = db.connect_torndb_crawler()
            ip_list = conn.query('select * from proxy where type= %s', type)
            if ip_list:
                for ip in ip_list:
                    ip['ip'] = str(ip['ip'])
                    ip['port'] = str(ip['port'])
                    proxy_util.verify_proxy(ip, 'DB', type, conn)
        except Exception, e:
            # str(e) avoids a TypeError when concatenating the exception.
            logger.info(' ' + type + ' ' + str(e))
            pass
        finally:
            # The source was truncated after "finally:"; closing the
            # connection each pass is an assumption consistent with the
            # rest of this module.
            conn.close()
def test_all():
    mongo = db.connect_mongo()
    proxies = list(
        mongo.raw.proxy.find(
            {"$or": [{"http_type": "Socks5"}, {"http_type": "Socks4"}]},
            sort=[("_id", pymongo.ASCENDING)]))
    #proxies = list(mongo.raw.proxy.find({"http_type": "Socks4"}, sort=[("_id", pymongo.ASCENDING)]))
    mongo.close()
    for proxy in proxies:
        socks_version = 4
        if proxy["http_type"] == "Socks5":
            socks_version = 5
        ip_port = proxy["ip:port"].split(":")
        ip = ip_port[0]
        port = int(ip_port[1])
        conn = db.connect_torndb_crawler()
        p = conn.get("select * from proxy_tyc where ip=%s and port=%s", ip, port)
        conn.close()
        if p is None:
            logger.info("Socks%s://%s:%s" % (socks_version, ip, port))
            flag = crack(socks_version, ip, port)
            if flag:
                logger.info("Good!")
                conn = db.connect_torndb_crawler()
                conn.insert(
                    "insert proxy_tyc(ip,port,type,createTime) values(%s,%s,%s,now())",
                    ip, port, proxy["http_type"])
                cnt = conn.get("select count(*) cnt from proxy_tyc")
                conn.close()
                if cnt["cnt"] >= 2:
                    break
def save_comment(company_id, artifact_id, comment):
    t = datetime.date.today()
    conn = db.connect_torndb_crawler()
    table_id = aggregator_util.get_android_table_id(conn, company_id)
    a = conn.get(
        "select * from android" + table_id + " where date=%s and artifactId=%s and type=%s",
        t, artifact_id, TYPE)
    if a is None:
        conn.insert(
            "insert android" + table_id + "(companyId,artifactId,comment,date,type) values(%s,%s,%s,%s,%s)",
            company_id, artifact_id, comment, t, TYPE)
    else:
        conn.update(
            "update android" + table_id + " set comment=%s where id =%s",
            comment, a["id"])
    conn.close()
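# Usage sketch (hedged: the ids and comment count are illustrative; TYPE is
# the module-level metric-type constant used above):
#
#   save_comment(company_id=1024, artifact_id=2048, comment=1300)
#
# One row per (artifactId, date, type): the first call today inserts, later
# calls update the same row.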
def migrate():
    conn = db.connect_torndb_crawler()
    for i in range(1, 101):
        table_name = "news%s" % i
        logger.info(table_name)
        all_news = conn.query("select * from " + table_name + " where migrate=0 order by id")
        for news in all_news:
            logger.info(news["title"])
            oldNewsId = "%s_%s" % (table_name, news["id"])
            news1 = collection.find_one({"oldNewsId": oldNewsId})
            if news1 is None:
                news1 = {
                    "companyId": news["companyId"],
                    "date": news["date"],
                    "title": news["title"],
                    "link": news["link"],
                    "confidence": news["confidence"],
                    "verify": news["verify"],
                    "active": news['active'],
                    "createTime": news["createTime"],
                    "oldNewsId": oldNewsId
                }
                content_table_name = "news_content%s" % i
                contents = conn.query(
                    "select * from " + content_table_name + " where newsId=%s order by id",
                    news["id"])
                contents1 = []
                for content in contents:
                    #logger.info(content["content"])
                    content = {
                        "rank": content["rank"],
                        "content": content["content"],
                        "image": content["image"]
                    }
                    contents1.append(content)
                news1["contents"] = contents1
                #logger.info(news1)
                collection.insert(news1)
                #break
            conn.execute("update " + table_name + " set migrate=1 where id=%s", news["id"])
            #break
    conn.close()
def get_http_session():
    # Rotate to the next proxy in the in-memory list, refreshing the list from
    # the DB when it is exhausted, and return a requests.Session wired to it.
    global proxies, proxies_num, proxies_current, proxies_priority
    flag = 0
    if proxies_priority != 0:
        conn = db.connect_torndb_crawler()
        result = conn.get("select count(*) cnt from proxy_tyc")
        conn.close()
        if result["cnt"] > 0:
            fetch_proxies3()
            flag = 1
    if flag == 0:
        if proxies_num == 0 or proxies_current >= proxies_num:
            if proxies_current == proxies_num:
                delete_last_proxy_from_db()
            fetch_proxies3()
    if proxies_current > 0:
        delete_last_proxy_from_db()
    proxy_ip = proxies[proxies_current]
    proxies_current += 1
    logger.info("Proxy IP: %s" % proxy_ip)
    http_session = requests.Session()
    if proxy_ip["http_type"] == "Socks4":
        http_type = "socks4"
    elif proxy_ip["http_type"] == "HTTP":
        http_type = "http"
    else:
        http_type = "socks5"
    http_session.proxies = {
        "http": "%s://%s" % (http_type, proxy_ip["ip:port"])
    }
    #http_session.proxies = {"http": "socks5://222.187.210.218:1080"}
    return http_session
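# Usage sketch (hedged: TYC_URL is a placeholder; each call burns the
# previously used proxy, so a failed request is retried on a fresh session):
#
#   session = get_http_session()
#   try:
#       resp = session.get(TYC_URL, timeout=30)
#   except requests.RequestException:
#       session = get_http_session()   # rotates to the next proxy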
if __name__ == "__main__":
    logger.info("Start...")
    # Free any proxies left leased by a previous run.
    conn2 = db.connect_torndb_crawler()
    conn2.execute("update proxy_tyc set status=0")
    conn2.close()
    AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
    http_client = AsyncHTTPClient(max_clients=5)
    begin()
    tornado.ioloop.IOLoop.instance().start()
def get_proxy(type, anonymity):
    # Pull a batch of proxies from the mimvp.com paid API and feed them into
    # the local pool: http/https proxies are verified first, socks proxies
    # are inserted directly.
    try:
        # Random delay to throttle calls to the API.
        sleep_time = random.randint(10, 30)
        time.sleep(sleep_time)
        conn = db.connect_torndb_crawler()
        if type == 'http':
            t = '1'
        elif type == 'https':
            t = '2'
        elif type == 'socks4':
            t = '4'
        elif type == 'socks5':
            t = '5'
        if anonymity == 'high':
            a = '3,5'
        else:
            a = '2'
        # url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860150908143212810&num=20&http_type=' + t + '&anonymous=' + a + '&result_format=json'
        url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860150908143212810&num=20' \
              '&http_type=' + t + '&anonymous=' + a + '&result_fields=1,2,3,4,5,6,7,8,9&result_format=json'
        urllib2.install_opener(None)
        s = urllib2.urlopen(url, timeout=60)
        result = s.read()
        result = json.loads(result)
        ip_list = result['result']
        # logger.info(ip_list)
        if len(ip_list) == 0:
            s.close()
            conn.close()
            return
        for proxystr in ip_list:
            ip = {}
            ip['anonymity'] = anonymity
            ip['type'] = type
            ip['transferTime'] = proxystr['transfer_time']
            ip['pingTime'] = proxystr['ping_time']
            ip['isp'] = proxystr['isp']
            countryarr = proxystr['country'].split(':')
            country = countryarr[0]
            province = ''
            if len(countryarr) > 1:
                province = countryarr[1]
            if country == u'中国':
                country = 'cn'
            ip['country'] = country
            ip['province'] = province
            iparr = proxystr['ip:port'].split(':')
            ip['ip'] = str(iparr[0])
            ip['port'] = str(iparr[1])
            # if ip['pingTime'] < 5 and ip['transferTime'] < 5:
            if type == 'http' or type == 'https':
                verify_proxy(ip, 'API', type, conn)
            else:
                insert_db(ip, type, conn)
        s.close()
        conn.close()  # close the DB connection on the success path too
    except Exception, e:
        logger.info(e)
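# Usage sketch (hedged: the anonymity mapping follows the branches above;
# run periodically per proxy flavour to keep the pool topped up):
#
#   for t in ('http', 'https', 'socks4', 'socks5'):
#       get_proxy(t, 'high')   # fetch 20 high-anonymity proxies of type t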
def get_artifact(companies):
    global cnt
    conn = db.connect_torndb()
    conn_crawler = db.connect_torndb_crawler()
    for company in companies:
        companyId = company['id']
        artifacts = conn.query("select * from artifact where companyId = %s", companyId)
        # logger.info(artifacts)
        if len(artifacts) > 10:
            company_index = conn_crawler.get("select * from company_index where companyId = %s", companyId)
            if company_index is not None:
                # logger.info(company_index)
                for artifact in artifacts:
                    # logger.info(artifact)
                    type = artifact['type']
                    artifactId = artifact['id']
                    trend_data = None
                    if type == 4010:
                        if artifact['domain'] is not None:
                            if company_index['alexa'] is not None:
                                sql = "select * from alexa" + str(company_index['alexa']) + \
                                      " where companyId = %s and artifactId = %s order by date desc limit 1"
                                alexa_data = conn_crawler.get(sql, companyId, artifactId)
                                if alexa_data is not None:
                                    if alexa_data['rankGlobal'] is not None and alexa_data['rankGlobal'] > 0:
                                        trend_data = alexa_data['rankGlobal']
                    if type == 4020:
                        pass
                    if type == 4030:
                        pass
                    if type == 4040:
                        if company_index['ios'] is not None:
                            sql = "select * from ios" + str(company_index['ios']) + \
                                  " where companyId = %s and artifactId = %s order by date desc limit 1"
                            ios_data = conn_crawler.get(sql, companyId, artifactId)
                            if ios_data is not None:
                                if ios_data['comment'] is not None:
                                    trend_data = ios_data['comment']
                    if type == 4050:
                        if company_index['android'] is not None:
                            sql = "select * from android" + str(company_index['android']) + \
                                  " where companyId = %s and artifactId = %s and type=16040 order by date desc limit 1"
                            android_data = conn_crawler.get(sql, companyId, artifactId)
                            if android_data is not None:
                                if android_data['download'] is not None:
                                    if android_data['download'] > 1000:
                                        trend_data = android_data['download']
                    logger.info(trend_data)
                    if trend_data is not None:
                        logger.info("artifactId = %s", artifactId)
                        if artifact['verify'] != 'N':
                            update_sql = "update artifact set rank =%s, active='Y', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(trend_data), int(artifactId))
                    else:
                        if type == 4020 or type == 4030:
                            update_sql = "update artifact set active='Y', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(artifactId))
                        else:
                            update_sql = "update artifact set active='N', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(artifactId))
    conn.close()
    conn_crawler.close()
    cnt += 1000
    begin()
def proxy_success(proxy):
    if proxy["id"] == 0:
        return
    conn = db.connect_torndb_crawler()
    conn.execute("update proxy_tyc set fail=%s where id=%s", 0, proxy["id"])
    conn.close()
import loghelper, config
import db
import name_helper

#logger
loghelper.init_logger("domain_2_beian", stream=True)
logger = loghelper.get_logger("domain_2_beian")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.itunes

if __name__ == '__main__':
    logger.info("Begin...")
    conn = db.connect_torndb()
    conn_crawler = db.connect_torndb_crawler()
    for i in range(1, 101):
        logger.info("ios" + str(i))
        start = 0
        #sql = "select * from ios" + str(i) + " where date>date_sub(now(),interval 30 day) order by id limit %s, 10000"
        sql = "select * from ios" + str(i) + " order by id limit %s, 10000"
        while True:
            items = list(conn_crawler.query(sql, start))
            if len(items) == 0:
                break
            for item in items:
                artifactId = item["artifactId"]
                artifact = conn.get("select * from artifact where id=%s", artifactId)
                if artifact is None:
                    logger.info("artifactId=%s not Found!", artifactId)
                    continue