Example #1
def qq_check(text, filename):
    """QQ page check: treat the sample as clean if any blacklist keyword is
    absent (in both UTF-8 and GB2312), otherwise flag it with ',p'."""

    # file = open(dir + '/' + filename, 'rb')
    try:
        # text = file.read()

        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]

        for black in black_list:

            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return tuple([True, mess[0]])

        # print message
        return tuple([False, mess[0] + ",p"])

    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)

    # finally:
    # 	file.close()
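These examples rely on module-level helpers that are not shown on this page (`re`, `requests`, `black_list`, `dir`, `check()`). The following is a minimal sketch of that assumed context; the names come from the examples, but the values and the body of `check()` are illustrative, not the original module:

# -*- coding: utf-8 -*-
import re
import requests

# Blacklisted keywords as UTF-8 byte strings; the real list is not shown on this page.
black_list = ['中奖', '充值', '客服']

# Directory holding the downloaded sample pages (value taken from the commented-out driver below).
dir = "../../subject1_sample/file"

def check(filename):
    """Assumed to return a comma-separated record such as '<id>,<label>,<url>',
    whose last field is the page URL."""
    raise NotImplementedError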
Example #2
def f_check(text, filename):
    """If the single `black_word` keyword is absent (UTF-8 and GB2312) the sample
    is clean; otherwise count all blacklist hits and flag the sample with ',d'
    when there are more than 10."""

    # file = open(dir + '/' + filename, 'rb')
    try:
        # text = file.read()

        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]
        point = 0

        if black_word not in text and black_word.decode(
                'utf-8', 'ignore').encode('gb2312') not in text:
            return tuple([True, mess[0]])

        else:
            for black in black_list:
                point = point + text.count(black) + text.count(
                    black.decode('utf-8', 'ignore').encode('gb2312'))

        if point > 10:
            return tuple([False, mess[0] + ",d"])

        else:
            return tuple([True, mess[0]])

    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
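A hedged usage sketch for the `*_check` functions, which take the raw page bytes plus the sample's filename. It assumes `dir` and the other helpers from the sketch above; the sample id is taken from the commented-out call further down the page:

filename = '00078edb0b0a989b5b141bfe5e7d72d6'   # a sample id from the commented-out driver
file = open(dir + '/' + filename, 'rb')
try:
    ok, record = f_check(file.read(), filename)
    if not ok:
        print record      # e.g. "<id>,d" for a flagged sample
finally:
    file.close()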
Example #3
def bank_check(text, filename):
    """Bank page check: unescape HTML numeric references when present, then treat
    the sample as clean if any blacklist keyword is absent or the whois record
    matches bank-related terms; otherwise flag it with ',p'."""

    # file = open(dir + '/' + filename, 'rb')
    # try:
    # text = file.read()

    if '&#34892' in text or '&#38134' in text:
        try:
            # print chardet.detect(text)
            t = HTMLParser.HTMLParser()
            text = t.unescape(text)

            message = check(filename)

            mess = re.split(',', message)
            url = mess[-1]
            # print message

            for black in black_list:
                if black not in text:
                    return tuple([True, mess[0]])

            if check_whois(["银行", "Bank", "bank", "BANK"], url):
                return tuple([True, mess[0]])
            else:
                # print message
                return tuple([False, mess[0] + ",p"])

        except UnicodeDecodeError:
            pass

    else:
        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]

        for black in black_list:
            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return tuple([True, mess[0]])

        if check_whois(["银行", "Bank", "bank", "BANK"], url):
            return tuple([True, mess[0]])
        else:
            # print message
            return tuple([False, mess[0] + ",p"])
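The numeric character references tested at the top of `bank_check` ('&#34892' and '&#38134') are the HTML escapes of 行 and 银, i.e. the page spells out "银行" (bank) in escaped form. A small sketch of the unescaping step, assuming Python 2's `HTMLParser` module as used above:

# -*- coding: utf-8 -*-
import HTMLParser

t = HTMLParser.HTMLParser()
print t.unescape(u'&#38134;&#34892;')   # prints: 银行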
Example #4
def h_check(text, filename):
    """Count blacklist keyword hits (UTF-8 and GB2312); more than 400 hits flags
    the sample with ',d', otherwise it is treated as clean."""

    # file = open(dir + '/' + filename, 'rb')
    try:
        # text = file.read()

        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]
        point = 0

        for black in black_list:

            # if black not in text and black.decode('utf-8', 'ignore').encode('gb2312') not in text:
            # 	return tuple([True, mess[0]])
            point = point + text.count(black) + text.count(
                black.decode('utf-8', 'ignore').encode('gb2312'))

        # f.write(message)

        if point > 400:
            # f.write(message)
            # print message
            # print "[INFO] the point is " + str(point)
            return tuple([False, mess[0] + ",d"])
        else:
            return tuple([True, mess[0]])

    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)

    # finally:
    # 	file.close()


# dir = "../../subject1_sample/file"

# f = open("d.txt", 'w+')

# for parent, dirnames, filenames in os.walk(dir):
# 	for filename in filenames:

# 		result = ccheck(filename)

# 		if not result[0]:
# 			# f.write(result[1])
# 			print result[1]

# f.close()
# ccheck('00078edb0b0a989b5b141bfe5e7d72d6')
Example #5
def apple_check(text, filename):
    """Apple page check: clean if any blacklist keyword is absent or the URL
    ultimately redirects to apple.com; other pages, as well as connection and
    timeout errors, are flagged with ',p'."""
    # file = open(dir + '/' + filename, 'rb')
    try:
        # text = file.read()

        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]

        for black in black_list:
            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return tuple([True, mess[0]])

        try:
            r = requests.get(url, timeout=3)
            if "apple.com" in r.url:
                # print "[INFO]not p page:" + message
                return tuple([True, mess[0]])
            else:
                # print message
                return tuple([False, mess[0] + ",p"])
        except requests.exceptions.ConnectTimeout:
            # print message
            # print "[ERROR] timeout error..."
            return tuple([False, mess[0] + ",p"])

        except requests.exceptions.ConnectionError:
            # print "[ERROR]  error..."
            # print message
            return tuple([False, mess[0] + ",p"])

        except requests.exceptions.Timeout:
            return tuple([False, mess[0] + ",p"])

        except Exception as e:
            print "[ERROR] " + str(e)
            exit(0)

    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
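`apple_check` relies on `requests.get` following redirects by default, so `r.url` is the final URL after any redirect chain; a page that never ends up on apple.com keeps the ',p' flag. A small illustration of that behaviour (the URL here is purely illustrative):

import requests

r = requests.get('http://example.com/fake-login', timeout=3)
print r.url                    # the final URL after any redirects
print "apple.com" in r.url     # False unless the chain ends on apple.com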
Example #6
def ccheck(filename):
    """Read the sample file directly and count blacklist hits (UTF-8 and GB2312);
    any hit flags the sample with ',d', otherwise it is treated as clean."""

    file = open(dir + '/' + filename, 'rb')
    try:
        text = file.read()

        message = check(filename)

        mess = re.split(',', message)
        url = mess[-1]
        point = 0

        for black in black_list:

            # if black not in text and black.decode('utf-8', 'ignore').encode('gb2312') not in text:
            # return tuple([True, mess[0]])
            point = point + text.count(black) + text.count(
                black.decode('utf-8', 'ignore').encode('gb2312'))

        # f.write(message)

        if point > 0:
            # print text

            # match = re.match( r'^\s*<script.*src\s*=\s*"(http:)?(https:)?\/\/.*<\/script>', text)
            # match = re.match('script', text)

            # if match:

            # 	if get_url(url) not in match.group():
            print message
            # 	print match.group()
            return tuple([False, mess[0] + ",d"])

        # return tuple([True, mess[0]])

        # f.write(message)

        # s = requests.Session()

        # try:
        # 	header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'}

        # 	r = s.get(url[:-1], headers=header, timeout=10)

        # 	for black in black_list:
        # 		point2 = point2 + r.content.count(black)+ r.content.count(black.decode('utf-8', 'ignore').encode('gb2312'))

        # 	if 1 > (point2+1)/point:

        # 		if '<meta http-equiv="refresh" content="0;' in r.content:
        # 			print message
        # 			print "[INFO] the point is " + str(point)
        # 			print "[INFO] the point2 is " + str(point2)
        # 			return tuple([False, mess[0]+",d"])

        # 	return tuple([True, mess[0]])

        # except requests.exceptions.ConnectTimeout:
        # 	# print message
        # 	# print "[ERROR] timeout error..."
        # 	return tuple([True, mess[0]])

        # except requests.exceptions.ConnectionError:
        # 	# print "[ERROR]  error..."
        # 	# print message
        # 	return tuple([True, mess[0]])

        # except requests.exceptions.Timeout:
        # 	return tuple([True, mess[0]])

        # except Exception as e:
        # 	print "[ERROR] "+str(e)
        # 	exit(0)

        else:
            return tuple([True, mess[0]])

    # except Exception as e:
    # 	print "[ERROR] "+str(e)
    # 	exit(0)

    finally:
        file.close()
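A cleaned-up version of the commented-out driver that appears alongside Examples #4 and #7; a sketch assuming `dir` is the sample directory and `ccheck()` is defined as above:

import os

for parent, dirnames, filenames in os.walk(dir):
    for filename in filenames:
        result = ccheck(filename)
        if not result[0]:
            # result[1] is the record for a flagged sample, e.g. "<id>,d"
            print result[1]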
Example #7
def h_check2(text, filename):
    """Count blacklist hits in the archived sample; above 10 hits, re-fetch the
    live URL and flag the sample with ',d' when the live page has fewer hits and
    contains an instant meta refresh."""

    # file = open(dir + '/' + filename, 'rb')
    # try:
    # text = file.read()

    message = check(filename)

    mess = re.split(',', message)
    url = mess[-1]
    point = 0
    point2 = 0

    for black in black_list:

        # if black not in text and black.decode('utf-8', 'ignore').encode('gb2312') not in text:
        # 	return tuple([True, mess[0]])
        point = point + text.count(black) + text.count(
            black.decode('utf-8', 'ignore').encode('gb2312'))

    # f.write(message)

    if point > 10:
        # f.write(message)

        s = requests.Session()

        try:
            header = {
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
            }

            r = s.get(url[:-1], headers=header, timeout=10)

            # print r.content
            for black in black_list:
                point2 = point2 + r.content.count(black) + r.content.count(
                    black.decode('utf-8', 'ignore').encode('gb2312'))

            # Integer division in Python 2: this branch runs only when the live
            # page has fewer blacklist hits than the archived sample (point2 + 1 < point).
            if 1 > (point2 + 1) / point:

                if '<meta http-equiv="refresh" content="0;' in r.content:
                    print message
                    print "[INFO] the point is " + str(point)
                    print "[INFO] the point2 is " + str(point2)
                    return tuple([False, mess[0] + ",d"])

            return tuple([True, mess[0]])

        except requests.exceptions.ConnectTimeout:
            # print message
            # print "[ERROR] timeout error..."
            return tuple([True, mess[0]])

        except requests.exceptions.ConnectionError:
            # print "[ERROR]  error..."
            # print message
            return tuple([True, mess[0]])

        except requests.exceptions.Timeout:
            return tuple([True, mess[0]])

        except requests.exceptions.ContentDecodingError:
            return tuple([True, mess[0]])

        # except Exception as e:
        # 	print "[ERROR] "+str(e)
        # 	exit(0)

    else:
        return tuple([True, mess[0]])

    # except Exception as e:
    # 	print "[ERROR] "+str(e)
    # 	exit(0)

    # finally:
    # 	file.close()


# dir = "../../subject1_sample/file"

# f = open("d.txt", 'w+')

# for parent, dirnames, filenames in os.walk(dir):
# 	for filename in filenames:

# 		result = ccheck(filename)

# 		if not result[0]:
# 			# f.write(result[1])
# 			print result[1]

# f.close()
# ccheck('12659b53d4554fbab7e4f1a3cc881815')