Example #1
def update_label_id(url, label_id):
    """
    根据url更新label_id这个集合,将url所属的标签更新到label_id集合中
    :param url: 用户访问的url,根据该url获取标签
    :param label_id: 用户的标签集合
    :return: 
    """
    url = trans_str(url)
    host = utility.url_to_host(url)
    domain = utility.host_to_domain(host)
    # Merge label ids matched at the url, host and domain levels.
    # The KeyError guards cover rule tables ('url', 'host', 'domain')
    # that may be missing from label_rules altogether.
    for level, key in (('url', url), ('host', host), ('domain', domain)):
        try:
            if key in label_rules[level]:
                label_id.update(label_rules[level][key])
        except KeyError:
            pass
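A minimal usage sketch, assuming label_rules is a dict of per-level rule tables mapping strings to collections of label ids (its exact shape is not shown here), and that trans_str, utility.url_to_host and utility.host_to_domain normalize the url as their names suggest:

# Hypothetical rule table, for illustration only.
label_rules = {
    'url': {'http://example.com/page': {101}},
    'host': {'example.com': {102}},
    'domain': {'example.com': {103}},
}

label_id = set()
update_label_id('http://example.com/page', label_id)
print(label_id)  # expected: the union {101, 102, 103}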
Example #2
def insert_host_info(host_pv_file, urls_set, host="127.0.0.1", port=3306, user="******", password="", database="test"):
    print("urls len: {0}".format(len(urls_set)))
    conn = connect(host=host, port=port, user=user, passwd=password, db=database, charset='utf8')
    cursor = conn.cursor()
    with open(host_pv_file, mode='r') as fd:
        for line in fd:
            line = line.strip()
            # Each line is expected to be "host pv", separated by one space.
            host, pv = line.split(" ")
            if host not in urls_set:
                # print("host {0} not in urls set".format(host))
                continue

            host = utility.spider_url_to_dpi_url(host)
            host = utility.url_to_host(host)
            suffix = utility.get_suffix(host)
            insert_sql = "insert into dmp_site_info (domain, suffix, pv_yesterday) values (\"{0}\", \"{1}\", {2});".format(
                host, suffix, int(pv))
            # print(insert_sql)
            try:
                cursor.execute(insert_sql)
            except IntegrityError:
                # Most likely a duplicate domain row; log it and keep going.
                traceback.print_exc()

    conn.commit()
    conn.close()
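The insert statement is built with string formatting, which breaks as soon as a value contains a double quote and is open to SQL injection. A safer variant, assuming connect comes from a MySQLdb-compatible driver such as pymysql (both use the %s paramstyle), binds the values instead:

insert_sql = ("insert into dmp_site_info (domain, suffix, pv_yesterday) "
              "values (%s, %s, %s)")
try:
    cursor.execute(insert_sql, (host, suffix, int(pv)))
except IntegrityError:
    traceback.print_exc()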
Example #3
def write_mongo(host, port, db_name, collection_name, file_name):
    client = MongoClient(host, port)
    print client.server_info()
    dbs = client.database_names()
    print '\t'.join(dbs)
    db = client.get_database(db_name)
    collections = db.collection_names(include_system_collections=False)
    print '\t'.join(collections)
    collection = db.get_collection(collection_name)
    # Map the category label in the input file to its rule group id:
    # "色情" = porn, "赌博" = gambling, "小说" = fiction, "长尾" = long tail.
    group_ids = {
        "色情": seqing_group_id,
        "赌博": dubo_group_id,
        "小说": xiaoshuo_group_id,
        "长尾": changwei_group_id,
    }
    with open(file_name, "r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split("\t")
            if len(segs) < 2:
                continue

            host = utility.url_to_host(segs[1])
            domain = utility.host_to_domain(host)
            if (ValidHostnameRegex.match(domain) is not None
                    or ValidIpAddressRegex.match(domain) is not None):
                rule = host + "/*"
                group_id = group_ids.get(segs[0])
                if group_id is None:
                    # "其他" = other / uncategorized.
                    print "其他", segs[0], segs[1]
                    continue
                collection.insert_one({
                    "table": "domain_rule",
                    "group_id": group_id,
                    "domain": domain,
                    "rule": rule
                })
                print segs[0], segs[1]

    # Sanity check: read one document back from the collection.
    collection.find_one()
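client.database_names() and db.collection_names() have been deprecated since pymongo 3.7; on current driver versions the equivalent calls are:

# Equivalents on pymongo 3.7 and later:
dbs = client.list_database_names()
collections = db.list_collection_names()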
Example #4
def get_result_urls(results_file):
    # Collect the set of normalized hosts from a comma-separated results file.
    urls_set = set()
    with open(results_file, mode='r') as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(",")
            if len(segs) < 6:
                continue
            host = utility.spider_url_to_dpi_url(segs[0])
            host = utility.url_to_host(host)
            # print("add host {0}".format(host))
            urls_set.add(host)

    return urls_set
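get_result_urls appears to feed insert_host_info from Example #2: it builds the host whitelist that the pv file is filtered against. A hypothetical driver, with illustrative file names and credentials:

urls_set = get_result_urls("results.csv")  # hypothetical path
insert_host_info("host_pv.txt", urls_set,  # hypothetical path
                 user="dmp", password="secret", database="test")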
Example #5
def insert_host_content(results_file, host="127.0.0.1", port=3306, user="******", password="", database="test"):
    conn = connect(host=host, port=port, user=user, passwd=password, db=database, charset='utf8')
    cursor = conn.cursor()
    count = 0
    failed_count = 0
    ie_failed_count = 0
    pe_failed_count = 0
    de_failed_count = 0
    oe_failed_count = 0
    with open(results_file, mode="r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(",")
            if len(segs) < 6:
                continue
            host = utility.spider_url_to_dpi_url(segs[0])
            host = utility.url_to_host(host)
            title = segs[1]
            keywords = segs[2]
            description = segs[3]
            # Skip rows whose title or keywords contain no Chinese text.
            if title is None or not chinese.search(title.decode('utf8')):
                continue
            if keywords is None or not chinese.search(keywords.decode('utf8')):
                continue

            site_id = get_site_id(host, conn)
            if site_id == 0:
                print("get site id for host {0} failed".format(host))
                failed_count += 1
                continue
            insert_sql = "insert into dmp_site_content (site_id, domain, title, keywords, description) values ({0}, \"{1}\", \"{2}\", \"{3}\", \"{4}\");".format(
                site_id, host, title, keywords, description)
            try:
                cursor.execute(insert_sql)
            except IntegrityError:
                traceback.print_exc()
                ie_failed_count += 1
                continue
            except ProgrammingError:
                traceback.print_exc()
                print("Error sql: {0}".format(insert_sql))
                pe_failed_count += 1
                continue
            except DataError:
                traceback.print_exc()
                print("Error sql: {0}".format(insert_sql))
                de_failed_count += 1
                continue
            except OperationalError:
                traceback.print_exc()
                print("Error sql: {0}".format(insert_sql))
                oe_failed_count += 1
                continue
            count += 1
            # Commit in batches of roughly 100 inserts.
            if count > 100:
                conn.commit()
                count = 0

    conn.commit()
    conn.close()
    print("get site id failed number: {0}".format(failed_count))
    print("IntegrityError failed number: {0}".format(ie_failed_count))
    print("ProgrammingError failed number: {0}".format(pe_failed_count))
    print("DataError failed number: {0}".format(de_failed_count))
    print("OperationalError failed number: {0}".format(oe_failed_count))
Example #6
                middle_tokens.append(token)
        # split_host returns (middle part of the host, dotted suffix).
        return ''.join(middle_tokens), '.'.join(suffix_tokens[::-1])

if __name__ == "__main__":
    filename = sys.argv[1]
    full_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), filename)
    #print analyse.default_tfidf.stop_words
    #print full_filename
    with open(full_filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line or '\t' not in line:
                continue
            try:
                url, body = line.split('\t', 1)
                host = utility.url_to_host(url)
                middle, suffix = split_host(host)
                if suffix in suffix_dict:
                    print u'\t'.join([suffix_dict[suffix], url, suffix])
                    continue

                body = body.decode('utf-8', 'ignore')
                title = body.split('\1')[0]

                # title, keywords, description, p_list, a_list = body.split('\01', 4)
                body = body.replace('\01', ' ')
                if not chinese.search(body):
                    continue
                #tags = analyse.extract_tags(body, topK=20, withWeight=True)
                tags = analyse.extract_tags(body, topK=20, withWeight=False)
                out_tag = json.dumps(tags, ensure_ascii=False)
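The analyse.extract_tags call matches the jieba keyword-extraction API. A self-contained sketch of that call, on the assumption that analyse here is jieba.analyse:

# -*- coding: utf-8 -*-
# Assumption: `analyse` in the snippet above is jieba.analyse.
import json
from jieba import analyse

body = u"这是一段关于机器学习和自然语言处理的测试文本"
tags = analyse.extract_tags(body, topK=20, withWeight=False)
print(json.dumps(tags, ensure_ascii=False))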