# -*- coding: utf-8 -*-
import os
import re

import HTMLParser
import requests

# `check`, `check_whois`, `black_list`, `black_word` and `dir` are assumed
# to be defined elsewhere in this module.


def qq_check(text, filename):
    # Benign as soon as any black-list word is missing from the page
    # (tested in both UTF-8 and GB2312); otherwise flag as phishing ("p").
    try:
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        for black in black_list:
            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return (True, mess[0])
        return (False, mess[0] + ",p")
    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
def f_check(text, filename):
    # Benign unless the key black word appears; if it does, count all
    # black-list hits and flag the page as defaced ("d") past a threshold.
    try:
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        point = 0
        if black_word not in text and black_word.decode(
                'utf-8', 'ignore').encode('gb2312') not in text:
            return (True, mess[0])
        else:
            for black in black_list:
                point = point + text.count(black) + text.count(
                    black.decode('utf-8', 'ignore').encode('gb2312'))
            if point > 10:
                return (False, mess[0] + ",d")
            else:
                return (True, mess[0])
    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
def bank_check(text, filename):
    # Pages mentioning bank characters get their HTML entities unescaped
    # first, then go through the same black-list and whois screening as
    # the plain branch.
    if '行' in text or '银' in text:
        try:
            t = HTMLParser.HTMLParser()
            text = t.unescape(text)
            message = check(filename)
            mess = re.split(',', message)
            url = mess[-1]
            for black in black_list:
                if black not in text:
                    return (True, mess[0])
            if check_whois(["银行", "Bank", "bank", "BANK"], url):
                return (True, mess[0])
            else:
                return (False, mess[0] + ",p")
        except UnicodeDecodeError:
            # NOTE: falls through and implicitly returns None on pages
            # that cannot be decoded.
            pass
    else:
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        for black in black_list:
            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return (True, mess[0])
        if check_whois(["银行", "Bank", "bank", "BANK"], url):
            return (True, mess[0])
        else:
            return (False, mess[0] + ",p")
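# A minimal sketch (not part of the original checkers) of the
# entity-unescaping step bank_check depends on: black-list words can be
# hidden behind HTML entities in the raw page source, so they only match
# plain substring tests after unescaping.
def _unescape_demo():
    t = HTMLParser.HTMLParser()
    # '&#38;' is the numeric entity for '&'; unescape() resolves it.
    return t.unescape(u'Bank &#38; Trust')  # -> u'Bank & Trust'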
def h_check(text, filename):
    # Count black-list hits (UTF-8 and GB2312); a very high count marks
    # the page as defaced ("d").
    try:
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        point = 0
        for black in black_list:
            point = point + text.count(black) + text.count(
                black.decode('utf-8', 'ignore').encode('gb2312'))
        if point > 400:
            return (False, mess[0] + ",d")
        else:
            return (True, mess[0])
    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
def apple_check(text, filename):
    # Benign if any black-list word is missing; otherwise fetch the URL
    # and trust pages that end up on apple.com, treating everything else
    # (including unreachable hosts) as phishing ("p").
    try:
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        for black in black_list:
            if black not in text and black.decode(
                    'utf-8', 'ignore').encode('gb2312') not in text:
                return (True, mess[0])
        try:
            r = requests.get(url, timeout=3)
            if "apple.com" in r.url:
                return (True, mess[0])
            else:
                return (False, mess[0] + ",p")
        except requests.exceptions.ConnectTimeout:
            return (False, mess[0] + ",p")
        except requests.exceptions.ConnectionError:
            return (False, mess[0] + ",p")
        except requests.exceptions.Timeout:
            return (False, mess[0] + ",p")
        except Exception as e:
            print "[ERROR] " + str(e)
            exit(0)
    except Exception as e:
        print "[ERROR] " + str(e)
        exit(0)
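# A hedged sketch (not from the original code) of the redirect test
# apple_check relies on: requests follows redirects by default, so r.url
# holds the final URL after any redirect chain, not the one requested.
def _apple_redirect_demo():
    r = requests.get("http://www.apple.com", timeout=3)
    return "apple.com" in r.url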
def ccheck(filename):
    # Unlike the other checkers, ccheck reads the stored page itself; any
    # black-list hit marks the page as defaced ("d").
    file = open(dir + '/' + filename, 'rb')
    try:
        text = file.read()
        message = check(filename)
        mess = re.split(',', message)
        url = mess[-1]
        point = 0
        for black in black_list:
            point = point + text.count(black) + text.count(
                black.decode('utf-8', 'ignore').encode('gb2312'))
        if point > 0:
            print message
            return (False, mess[0] + ",d")
        else:
            return (True, mess[0])
    finally:
        file.close()
def h_check2(text, filename):
    # If the stored copy has enough black-list hits, re-fetch the live URL
    # and compare hit counts: a sharp drop plus a meta-refresh tag suggests
    # a defaced page hiding behind a redirect.
    message = check(filename)
    mess = re.split(',', message)
    url = mess[-1]
    point = 0
    point2 = 0
    for black in black_list:
        point = point + text.count(black) + text.count(
            black.decode('utf-8', 'ignore').encode('gb2312'))
    if point > 10:
        s = requests.Session()
        try:
            header = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
            }
            r = s.get(url[:-1], headers=header, timeout=10)
            for black in black_list:
                point2 = point2 + r.content.count(black) + r.content.count(
                    black.decode('utf-8', 'ignore').encode('gb2312'))
            # NOTE: integer division in Python 2 -- this condition holds
            # exactly when point2 + 1 < point, i.e. the live page has
            # noticeably fewer hits than the stored copy.
            if 1 > (point2 + 1) / point:
                if '<meta http-equiv="refresh" content="0;' in r.content:
                    print message
                    print "[INFO] the point is " + str(point)
                    print "[INFO] the point2 is " + str(point2)
                    return (False, mess[0] + ",d")
            return (True, mess[0])
        except requests.exceptions.ConnectTimeout:
            return (True, mess[0])
        except requests.exceptions.ConnectionError:
            return (True, mess[0])
        except requests.exceptions.Timeout:
            return (True, mess[0])
        except requests.exceptions.ContentDecodingError:
            return (True, mess[0])
    else:
        return (True, mess[0])
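# A minimal driver sketch, mirroring the commented-out harness that
# originally trailed these functions; it assumes `dir` points at the
# sample directory (e.g. "../../subject1_sample/file") and that each
# filename under it is a hash that check() understands.
if __name__ == '__main__':
    for parent, dirnames, filenames in os.walk(dir):
        for filename in filenames:
            result = ccheck(filename)
            if not result[0]:
                print result[1]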