Пример #1
0
def save(source, type, categoryId, statsDate, cnt, flag):
    """Upsert one spider_stats row for (source, type, categoryId, statsDate).

    flag == 'create' writes the createNum counter; any other flag writes
    updateNum. Category/source/type display names come from the module-level
    idmap lookup table.
    """
    conn = db.connect_torndb_crawler()
    #conn = db.connect_torndb()
    existing = conn.get(
        "select * from spider_stats where source=%s and type=%s and categoryId=%s and statsDate=%s limit 1",
        source, type, categoryId, statsDate)
    if existing is not None:
        # Row already there: only refresh the relevant counter column.
        if flag == 'create':
            sql = "update spider_stats set createNum=%s where id=%s"
        else:
            sql = "update spider_stats set updateNum=%s where id=%s"
        conn.update(sql, cnt, existing["id"])
    else:
        # Fresh row: the counter column written depends on the flag.
        if flag == 'create':
            sql = "insert spider_stats(statsDate,categoryId,categoryName,source,sourceName,type,typeName,createNum) \
                    values(%s,%s,%s,%s,%s,%s,%s,%s)"

        else:
            sql = "insert spider_stats(statsDate,categoryId,categoryName,source,sourceName,type,typeName,updateNum) \
                    values(%s,%s,%s,%s,%s,%s,%s,%s)"

        conn.insert(sql, statsDate, categoryId, idmap[categoryId], source,
                    idmap[source], type, idmap[type], cnt)
    conn.close()
Пример #2
0
def fetch_proxies3():
    """Reload the module-global proxy list from the proxy_tyc table.

    Resets proxies_priority to 0 (DB proxies active) and rewinds
    proxies_current. Falls back to fetch_proxies1() when the table is empty.
    """
    logger.info("fetch_proxies3")
    global proxies, proxies_num, proxies_current, proxies_priority
    proxies_priority = 0

    conn = db.connect_torndb_crawler()
    rows = conn.query('select * from proxy_tyc')
    fresh = []
    for row in rows:
        entry = {
            "ip:port": "%s:%s" % (row["ip"], row["port"]),
            "http_type": row["type"]
        }
        logger.info(entry)
        fresh.append(entry)
    conn.close()

    proxies = fresh
    proxies_num = len(proxies)
    proxies_current = 0
    logger.info("proxies_num=%d, proxies_current=%d" %
                (proxies_num, proxies_current))
    if proxies_num == 0:
        #time.sleep(60)
        fetch_proxies1()
Пример #3
0
def insert_news(item, company_id):
    """Insert a crawled news item and its contents into the sharded news tables.

    item: dict with keys "date", "title", "url", "domainId", "contents";
          each contents entry has "type" ("text" or "img") and "data".
    company_id: used to pick the table shard via get_company_table.

    Deduplicates on (companyId, title). BUGFIX: when a duplicate row had a
    NULL date, the old code backfilled it with result["date"] — which is the
    very NULL being tested — instead of the freshly crawled date.
    """
    conn = db.connect_torndb_crawler()
    date = item["date"]

    table_id = get_company_table(conn, company_id)

    result = conn.get(
        "select id, date from news" + table_id +
        " where companyId=%s and title=%s limit 1", company_id, item["title"])
    if result is not None:
        # Duplicate title: backfill the stored date from the new crawl.
        if result['date'] is None:
            conn.update("update news" + table_id + " set date= %s where id=%s",
                        date, result["id"])
        conn.close()  # BUGFIX: connection used to leak on this early return
        return

    if item["domainId"] is None:
        item["domainId"] = 0

    sql = "insert news" + table_id + "(companyId, date, title, link, domainId, createTime) values(%s,%s,%s,%s,%s, now())"
    newsId = conn.insert(sql, company_id, date, item["title"], item["url"],
                         item["domainId"])

    today = "%s" % time.strftime("%Y-%m-%d", time.localtime())

    # Only news dated today is mirrored into the news_latest lookup table.
    if date is not None and today in date:
        sql = "insert news_latest(companyId,newsId,newsTable,date,createTime) values(%s,%s,%s,%s, now())"
        conn.insert(sql, company_id, newsId, table_id, date)

    index = 1
    for c in item["contents"]:
        content = ""
        image_id = ""
        if c["type"] == "text":
            content = c["data"]
        elif c["type"] == "img":
            imageUrl = c["data"]
            try:
                r = requests.get(imageUrl, timeout=60)
                img = Image.open(StringIO(r.content))
                (width, height) = img.size
                # Downscale wide images to 640px, preserving aspect ratio.
                if width > 640:
                    ratio = 640.0 / width
                    img = img.resize((int(width * ratio), int(height * ratio)),
                                     Image.ANTIALIAS)

                output = StringIO()
                img.save(output, format='jpeg')
                image_id = imgfs.put(output.getvalue(),
                                     content_type='jpeg',
                                     filename='news.jpg')
            except Exception:
                # Best-effort: a failed download/convert leaves image_id empty
                # rather than aborting the whole article.
                pass

        sql = "insert news_content" + table_id + "(newsId,content,image,rank) values(%s,%s,%s,%s)"
        conn.insert(sql, newsId, content, image_id, index)
        index += 1

    conn.close()
Пример #4
0
def get_single_proxy(proxy):
    """Return one random proxy row matching the given filters, or None.

    proxy: dict with required keys 'type' and 'anonymity', and optional
    'country', 'ping' (max pingTime) and 'transferTime' (max transferTime).
    Proxies from the mobile carrier ("移动") are always excluded.

    SECURITY: the previous version built the WHERE clause by concatenating
    raw values into the SQL string (injection risk); every value is now
    bound as a query parameter instead.
    """
    conn = db.connect_torndb_crawler()

    sql = 'select * from proxy where type=%s and anonymity=%s and isp!=%s'
    args = [proxy['type'], proxy['anonymity'], '移动']

    if proxy.get('country') is not None:
        sql += ' and country=%s'
        args.append(proxy['country'])

    if proxy.get('ping') is not None:
        sql += ' and pingTime < %s'
        args.append(proxy['ping'])

    if proxy.get('transferTime') is not None:
        sql += ' and transferTime < %s'
        args.append(proxy['transferTime'])

    sql += ' order by rand() limit 1'
    result = conn.get(sql, *args)

    conn.close()
    return result
Пример #5
0
def release_proxy(proxy):
    """Mark a proxy_tyc row as free again (status=0) and stamp createTime."""
    proxy_id = proxy["id"]
    # id 0 marks a placeholder proxy that has no backing DB row.
    if proxy_id == 0:
        return
    conn = db.connect_torndb_crawler()
    conn.execute("update proxy_tyc set status=0, createTime=now() where id=%s",
                 proxy_id)
    conn.close()
Пример #6
0
def get_proxy():
    """Reserve the least-failing free proxy that has been idle for at least
    two seconds; mark it in-use (status=1) and return the row, or None."""
    conn = db.connect_torndb_crawler()
    row = conn.get(
        "select * from proxy_tyc where status = 0 and DATE_ADD(createTime,INTERVAL 2 SECOND) < now() order by fail limit 1"
    )
    if row is not None:
        conn.execute("update proxy_tyc set status=1 where id=%s", row["id"])
        logger.info(row)
    conn.close()
    return row
Пример #7
0
def delete_last_proxy_from_db():
    """Delete the most recently consumed in-memory proxy from proxy_tyc.

    Uses the module globals proxies / proxies_current to locate the entry
    that was handed out last; no-op if nothing has been consumed yet.
    """
    global proxies, proxies_num, proxies_current
    last = proxies_current - 1
    if last < 0:
        return
    logger.info("delete")
    entry = proxies[last]
    logger.info(entry)
    parts = entry["ip:port"].split(":")
    conn = db.connect_torndb_crawler()
    conn.execute(
        "delete from proxy_tyc where ip=%s and port=%s and type=%s",
        parts[0], int(parts[1]), entry["http_type"])
    conn.close()
Пример #8
0
def proxy_fail(proxy):
    """Record one failure for a proxy; delete the row once it exceeds 30."""
    # id 0 marks a placeholder proxy that has no backing DB row.
    if proxy["id"] == 0:
        return
    conn = db.connect_torndb_crawler()
    row = conn.get("select * from proxy_tyc where id=%s", proxy["id"])
    if row is not None:
        failures = row["fail"]
        if failures > 30:
            conn.execute("delete from proxy_tyc where id=%s", proxy["id"])
        else:
            conn.execute("update proxy_tyc set fail=%s where id=%s",
                         failures + 1, proxy["id"])
    conn.close()
Пример #9
0
def begin():
    """Page through company_alias rows (type=12010) and fire a first crawl
    request for every name not yet recorded in the mongo collection.

    Blocks until at least one free proxy exists, then processes batches of
    NUM rows starting at the module-global offset `cnt` (advanced in place).
    Calls exit() when the table is exhausted.  `total` is declared global
    but not touched in this function.
    """
    global total, cnt
    NUM = 1000
    conn = db.connect_torndb()

    # Wait until a proxy has been free (status=0) for at least 2 seconds.
    while True:
        conn2 = db.connect_torndb_crawler()
        result = conn2.get(
            "select count(*) cnt from proxy_tyc where status = 0 and DATE_ADD(createTime,INTERVAL 2 SECOND) < now()"
        )
        conn2.close()
        if result["cnt"] > 0:
            break
        time.sleep(5)

    while True:
        logger.info("cnt=%d" % cnt)

        cs = conn.query(
            "select * from company_alias where type=12010 order by id limit %s,%s",
            cnt, NUM)
        if len(cs) <= 0:
            logger.info("Finish.")
            exit()

        request_num = 0
        for c in cs:
            # Normalize the name: ASCII parens -> full-width, and strip
            # characters that would break the downstream request/query.
            key = c["name"].strip().replace("(", u"(").replace(")", u")").\
                replace("?", "").replace(" ", "").replace("'","").\
                replace(".","").replace(";","")
            if key.find("/") != -1:
                cnt += 1
                continue
            # Skip names already crawled for this SOURCE/TYPE.
            if collection.find_one({
                    "source": SOURCE,
                    "type": TYPE,
                    "key": key
            }) is not None:
                cnt += 1
                continue
            logger.info(key)
            first_request(key, first=True)
            request_num += 1

        # Whole batch skipped -> fetch the next page; otherwise stop and
        # let the issued async requests run.
        if request_num == 0:
            continue
        else:
            break

    conn.close()
Пример #10
0
def proxy_verify(type):
    """Loop forever, re-verifying every stored proxy of the given type.

    NOTE(review): this snippet is truncated — the trailing `finally:` has no
    body here, so the code as shown does not parse.  Confirm the missing
    cleanup (presumably conn.close()) against the full source file.
    """
    while True:
        try:
            conn = db.connect_torndb_crawler()
            ip_list = conn.query('select * from proxy where type= %s', type)
            if ip_list:
                for ip in ip_list:
                    # verify_proxy expects string ip/port values.
                    ip['ip'] = str(ip['ip'])
                    ip['port'] = str(ip['port'])
                    proxy_util.verify_proxy(ip, 'DB', type, conn)
        except Exception, e:
            # NOTE(review): concatenating the exception object itself raises
            # TypeError in Python 2; this likely intended str(e) — confirm.
            logger.info(' ' + type + '  ' + e)
            pass
        finally:
Пример #11
0
def test_all():
    """Try to crack every Socks4/Socks5 proxy from mongo.raw.proxy that is
    not already present in proxy_tyc; stop once the table holds >= 2 rows."""
    mongo = db.connect_mongo()
    candidates = list(
        mongo.raw.proxy.find(
            {"$or": [{
                "http_type": "Socks5"
            }, {
                "http_type": "Socks4"
            }]},
            sort=[("_id", pymongo.ASCENDING)]))
    mongo.close()
    for candidate in candidates:
        version = 5 if candidate["http_type"] == "Socks5" else 4
        parts = candidate["ip:port"].split(":")
        host = parts[0]
        port = int(parts[1])

        conn = db.connect_torndb_crawler()
        existing = conn.get("select * from proxy_tyc where ip=%s and port=%s",
                            host, port)
        conn.close()
        if existing is not None:
            continue

        logger.info("Socks%s://%s:%s" % (version, host, port))
        if not crack(version, host, port):
            continue
        logger.info("Good!")
        conn = db.connect_torndb_crawler()
        conn.insert(
            "insert proxy_tyc(ip,port,type,createTime) values(%s,%s,%s,now())",
            host, port, candidate["http_type"])
        total = conn.get("select count(*) cnt from proxy_tyc")
        conn.close()
        if total["cnt"] >= 2:
            break
Пример #12
0
def save_comment(company_id, artifact_id, comment):
    """Upsert today's comment count for an android artifact into the
    company's android shard table (rows keyed by date/artifactId/TYPE)."""
    today = datetime.date.today()
    conn = db.connect_torndb_crawler()
    table_id = aggregator_util.get_android_table_id(conn, company_id)
    existing = conn.get(
        "select * from android" + table_id +
        " where date=%s and artifactId=%s and type=%s", today, artifact_id,
        TYPE)
    if existing is not None:
        conn.update(
            "update android" + table_id + " set comment=%s where id =%s",
            comment, existing["id"])
    else:
        conn.insert(
            "insert android" + table_id +
            "(companyId,artifactId,comment,date,type) values(%s,%s,%s,%s,%s)",
            company_id, artifact_id, comment, today, TYPE)
    conn.close()
Пример #13
0
def migrate():
    """Copy un-migrated rows from news1..news100 (and their news_content
    shards) into the mongo collection, marking each source row migrate=1."""
    conn = db.connect_torndb_crawler()
    for shard in range(1, 101):
        table_name = "news%s" % shard
        logger.info(table_name)
        pending = conn.query("select * from " + table_name + " where migrate=0 order by id")

        for row in pending:
            logger.info(row["title"])
            oldNewsId = "%s_%s" % (table_name, row["id"])
            # Skip rows already copied in a previous (interrupted) run.
            if collection.find_one({"oldNewsId": oldNewsId}) is None:
                doc = {
                    "companyId": row["companyId"],
                    "date": row["date"],
                    "title": row["title"],
                    "link": row["link"],
                    "confidence": row["confidence"],
                    "verify": row["verify"],
                    "active": row["active"],
                    "createTime": row["createTime"],
                    "oldNewsId": oldNewsId
                }
                content_table = "news_content%s" % shard
                pieces = conn.query("select * from " + content_table + " where newsId=%s order by id", row["id"])
                doc["contents"] = [{
                    "rank": piece["rank"],
                    "content": piece["content"],
                    "image": piece["image"]
                } for piece in pieces]
                collection.insert(doc)

            conn.execute("update " + table_name + " set migrate=1 where id=%s", row["id"])
    conn.close()
Пример #14
0
def get_http_session():
    """Return a requests.Session routed through the next proxy from the
    module-global in-memory proxy list, refreshing the list when needed.

    Reads and advances the globals proxies / proxies_num / proxies_current /
    proxies_priority (shared with fetch_proxies3 and
    delete_last_proxy_from_db).
    """
    global proxies, proxies_num, proxies_current, proxies_priority
    flag = 0
    if proxies_priority != 0:
        # Non-DB proxies currently active: switch back to DB proxies as
        # soon as the proxy_tyc table has rows again.
        conn = db.connect_torndb_crawler()
        result = conn.get("select count(*) cnt from proxy_tyc")
        conn.close()
        if result["cnt"] > 0:
            fetch_proxies3()
            flag = 1

    if flag == 0:
        if proxies_num == 0 or proxies_current >= proxies_num:
            # List exhausted: drop the proxy just used, then reload the list.
            if proxies_current == proxies_num:
                delete_last_proxy_from_db()
            fetch_proxies3()

        if proxies_current > 0:
            delete_last_proxy_from_db()

    # NOTE(review): if the refreshed list is empty this raises IndexError —
    # confirm fetch_proxies3/fetch_proxies1 guarantee a non-empty list.
    proxy_ip = proxies[proxies_current]
    proxies_current += 1
    logger.info("Proxy IP: %s" % proxy_ip)

    http_session = requests.Session()
    # Map the stored proxy type to the scheme requests/socks expects;
    # anything unrecognized is treated as socks5.
    if proxy_ip["http_type"] == "Socks4":
        http_type = "socks4"
    elif proxy_ip["http_type"] == "HTTP":
        http_type = "http"
    else:
        http_type = "socks5"
    http_session.proxies = {
        "http": "%s://%s" % (http_type, proxy_ip["ip:port"])
    }
    #http_session.proxies={"http":"socks5://222.187.210.218:1080"}
    return http_session
Пример #15
0
                cnt += 1
                continue
            if collection.find_one({
                    "source": SOURCE,
                    "type": TYPE,
                    "key": key
            }) is not None:
                cnt += 1
                continue
            logger.info(key)
            first_request(key, first=True)
            request_num += 1

        if request_num == 0:
            continue
        else:
            break

    conn.close()


if __name__ == "__main__":
    logger.info("Start...")
    # Clear stale "in use" flags left over from a previous crashed run so
    # every proxy starts out free.
    conn2 = db.connect_torndb_crawler()
    conn2.execute("update proxy_tyc set status=0")
    conn2.close()
    # Use the curl-based client, capped at 5 concurrent requests, then kick
    # off the first batch and hand control to the IOLoop.
    AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
    http_client = AsyncHTTPClient(max_clients=5)
    begin()
    tornado.ioloop.IOLoop.instance().start()
Пример #16
0
def get_proxy(type, anonymity):
    try:
        sleep_time = random.randint(10, 30)
        time.sleep(sleep_time)

        conn = db.connect_torndb_crawler()
        if type == 'http':
            t = '1'
        elif type == 'https':
            t = '2'
        elif type == 'socks4':
            t = '4'
        elif type == 'socks5':
            t = '5'

        if anonymity == 'high':
            a = '3,5'
        else:
            a = '2'
        # url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860150908143212810&num=20&http_type='+t+'&anonymous='+a+'&result_format=json'
        url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860150908143212810&num=20' \
              '&http_type='+t+'&anonymous='+a+'&result_fields=1,2,3,4,5,6,7,8,9&result_format=json'

        urllib2.install_opener(None)
        s = urllib2.urlopen(url, timeout=60)
        result = s.read()
        result = json.loads(result)

        ip_list = result['result']
        # logger.info(ip_list)

        if len(ip_list) == 0:
            s.close()
            conn.close()
            return

        for proxystr in ip_list:
            ip = {}

            ip['anonymity'] = anonymity
            ip['type'] = type
            ip['transferTime'] = proxystr['transfer_time']
            ip['pingTime'] = proxystr['ping_time']
            ip['isp'] = proxystr['isp']

            countryarr = proxystr['country'].split(':')
            country = countryarr[0]
            province = ''
            if len(countryarr) > 1:
                province = countryarr[1]

            if country == u'中国':
                country = 'cn'

            ip['country'] = country
            ip['province'] = province

            iparr = proxystr['ip:port'].split(':')
            ip['ip'] = str(iparr[0])
            ip['port'] = str(iparr[1])

            # if ip['pingTime'] < 5 and ip['transferTime'] < 5:
            if type == 'http' or type == 'https':
                verify_proxy(ip, 'API', type, conn)
            else:
                insert_db(ip, type, conn)

        s.close()
    except Exception, e:
        logger.info(e)
Пример #17
0
def get_artifact(companies):
    """For each company with more than 10 artifacts, derive one trend value
    per artifact and write it into artifact.rank / artifact.active.

    Trend sources by artifact type: 4010 -> alexa global rank, 4040 -> ios
    comment count, 4050 -> android downloads (only when > 1000); 4020/4030
    have no trend source and stay active. Shard numbers come from the
    crawler-side company_index row. Advances the global offset `cnt` by
    1000 and tail-calls begin() for the next page.
    """
    global cnt
    conn = db.connect_torndb()
    conn_crawler = db.connect_torndb_crawler()

    for company in companies:
        companyId = company['id']
        artifacts = conn.query("select * from artifact where companyId = %s", companyId)

        # logger.info(artifacts)

        # Only companies with many artifacts are re-ranked.
        if len(artifacts) > 10:
            company_index = conn_crawler.get("select * from company_index where companyId = %s", companyId)

            if company_index is not None:
                # logger.info(company_index)
                for artifact in artifacts:
                    # logger.info(artifact)
                    type = artifact['type']
                    artifactId = artifact['id']
                    trend_data = None
                    # 4010: website artifact -> latest alexa global rank.
                    if type == 4010:
                        if artifact['domain'] is not None:
                            if company_index['alexa'] is not None:
                                sql = "select * from alexa"+str(company_index['alexa'])+" where companyId = %s  and artifactId = %s order by date desc limit 1"
                                alexa_data = conn_crawler.get(sql, companyId, artifactId)
                                if alexa_data is not None:
                                        if alexa_data['rankGlobal'] is not None and alexa_data['rankGlobal'] > 0:
                                            trend_data = alexa_data['rankGlobal']

                    if type == 4020:
                        pass

                    if type == 4030:
                        pass

                    # 4040: iOS app -> latest comment count from the ios shard.
                    if type == 4040:
                        if company_index['ios'] is not None:
                            sql = "select * from ios"+str(company_index['ios'])+" where companyId = %s  and artifactId = %s order by date desc limit 1"
                            ios_data = conn_crawler.get(sql, companyId, artifactId)
                            if ios_data is not None:
                                if ios_data['comment'] is not None:
                                    trend_data = ios_data['comment']

                    # 4050: Android app -> latest download count (type=16040),
                    # only counted when above 1000.
                    if type == 4050:
                        if company_index['android'] is not None:
                            sql = "select * from android"+ str(company_index['android'])+" where companyId = %s  and artifactId = %s  and type=16040 order by date desc limit 1"
                            android_data = conn_crawler.get(sql, companyId, artifactId)
                            if android_data is not None:
                                if android_data['download'] is not None:
                                    if android_data['download'] > 1000:
                                        trend_data = android_data['download']

                    logger.info(trend_data)
                    if trend_data is not None:
                        logger.info("artifactId = %s", artifactId)
                        # Verified-rejected artifacts ('N') keep their rank.
                        if artifact['verify'] != 'N':
                            update_sql = "update artifact set rank =%s, active='Y', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(trend_data), int(artifactId))
                    else:
                        # No trend data: 4020/4030 stay active, others are
                        # deactivated.
                        if type == 4020 or type == 4030:
                            update_sql = "update artifact set active='Y', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(artifactId))
                        else:
                            update_sql = "update artifact set active='N', modifyTime=now() where id=%s"
                            conn.update(update_sql, int(artifactId))



    conn.close()
    conn_crawler.close()
    cnt += 1000
    begin()
Пример #18
0
def proxy_success(proxy):
    """Reset the failure counter of a proxy_tyc row after a successful use."""
    proxy_id = proxy["id"]
    # id 0 marks a placeholder proxy that has no backing DB row.
    if proxy_id == 0:
        return
    conn = db.connect_torndb_crawler()
    conn.execute("update proxy_tyc set fail=%s where id=%s", 0, proxy_id)
    conn.close()
Пример #19
0
import loghelper, config
import db
import name_helper

#logger
loghelper.init_logger("domain_2_beian", stream=True)
logger = loghelper.get_logger("domain_2_beian")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.itunes

if __name__ == '__main__':
    logger.info("Begin...")
    conn = db.connect_torndb()
    conn_crawler = db.connect_torndb_crawler()
    for i in range(1,101):
        logger.info("ios" + str(i))
        start = 0
        #sql = "select * from ios" + str(i) + " where date>date_sub(now(),interval 30 day) order by id limit %s, 10000"
        sql = "select * from ios" + str(i) + " order by id limit %s, 10000"
        while True:
            items = list(conn_crawler.query(sql, start))
            if len(items) == 0:
                break
            for item in items:
                artifactId = item["artifactId"]
                artifact = conn.get("select * from artifact where id=%s", artifactId)
                if artifact is None:
                    logger.info("artifactId=%s not Found!", artifactId)
                    continue