def min_entropy(inList):
    """Return the minimum Shannon entropy of the URIs in ``inList``.

    Args:
        inList: Sequence (or any other iterable) of values accepted by
            ``en.shannon_entropy``.

    Returns:
        The smallest value ``en.shannon_entropy`` produces over the items.

    Raises:
        IndexError: If ``inList`` is empty.
    """
    # Score each URI exactly once instead of re-scoring it on every update.
    entropies = [en.shannon_entropy(uri) for uri in inList]
    if not entropies:
        # Keep the exception type that indexing an empty list would raise.
        raise IndexError("min_entropy() requires a non-empty list")
    return min(entropies)
def __iter__(self):
    """Yield a secret for every word of ``self.context.data`` that passes the filters.

    A word qualifies when its length lies between
    ``self.config['word_length_min']`` and ``self.config['word_length_max']``
    (inclusive) and its Shannon entropy is at least
    ``self.config['entropy_min']``.

    Yields:
        The result of ``self.create_secret(entropy, word)`` for each
        qualifying word.
    """
    for word in self.context.data.split():
        if not (self.config['word_length_min'] <= len(word)
                <= self.config['word_length_max']):
            continue
        logger.debug(
            "found word ({}) that matched length constaints of min:{} and max:{}".
            format(word, self.config['word_length_min'],
                   self.config['word_length_max']))
        # Compute the entropy once and reuse it for both the threshold
        # check and the secret itself.
        word_entropy = shannon_entropy(word)
        if word_entropy >= self.config['entropy_min']:
            yield self.create_secret(word_entropy, word)
def max_entropy(inList):
    """Return the maximum Shannon entropy of the URIs in ``inList``.

    Args:
        inList: Sequence of values accepted by ``en.shannon_entropy``.

    Returns:
        The largest entropy found.  When ``inList`` is empty the entropy of
        ``inList`` itself is returned (the fallback for the failed
        ``inList[0]`` lookup).
    """
    try:
        maxEntropy = en.shannon_entropy(inList[0])
    except IndexError:
        # Empty sequence: fall back to scoring the container itself.
        maxEntropy = en.shannon_entropy(inList)

    for uri in inList:
        # Score each URI once rather than once for the comparison and again
        # for the assignment.
        candidate = en.shannon_entropy(uri)
        if maxEntropy < candidate:
            maxEntropy = candidate
    return maxEntropy
def decrypt_str(args, binary, len_str, key):
    """Decrypt Spora's config blobs with AES-CBC and report/save the results.

    Args:
        args: Parsed options; ``output_dir``, ``file``, ``verbose`` and
            ``print_config`` are read.
        binary: Sample contents, forwarded to ``get_bin_bytes``.
        len_str: Iterable of dicts whose ``"str"`` and ``"len"`` entries are
            forwarded to ``get_bin_bytes`` to locate each encrypted blob.
        key: AES key.  It is also written to ``AES256.key`` when an output
            directory is configured.

    Returns:
        None.  Results are printed and/or written below
        ``<output_dir>/<basename of args.file>/``.
    """
    out_dir = None
    if args.output_dir:
        out_dir = "{:}/{:}/".format(args.output_dir,
                                    os.path.basename(args.file))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # The key is raw bytes: binary mode avoids newline translation.
        with open(out_dir + "AES256.key", 'wb') as f:
            f.write(key)

    for call in len_str:
        # A fresh hash and cipher per blob; the IV is 16 zero bytes.
        h = SHA256.new()
        c = AES.new(key, AES.MODE_CBC, b'\x00' * 16)

        enc_bytes_list = get_bin_bytes(binary, call["str"], call["len"])
        # bytes(bytearray(...)) builds the byte string on Python 2 and 3 alike
        # (assumes get_bin_bytes returns ints in 0..255 -- TODO confirm).
        enc_bytes_str = bytes(bytearray(enc_bytes_list))
        dec_bytes_str = c.decrypt(enc_bytes_str)
        h.update(dec_bytes_str)

        entropy = {
            "enc": shannon_entropy(enc_bytes_str),
            "dec": shannon_entropy(dec_bytes_str)
        }

        # Hash and size of the decrypted blob.
        if args.verbose >= 1:
            print("\nFile decrypted SHA256: {:}, size: {:}".format(
                h.hexdigest(), call["len"]))

        # Entropy before and after decryption.
        if args.verbose >= 2:
            print("Entropy of {:}: before = {:}, after = {:}".format(
                h.hexdigest(), entropy["enc"], entropy["dec"]))

        # Decrypted blobs are saved under their SHA256 digest, in binary mode.
        if args.output_dir:
            with open(out_dir + h.hexdigest(), 'wb') as f:
                f.write(dec_bytes_str)

        if args.print_config:
            print("{:}".format(dec_bytes_str))
def score_domain(domain):
    """Compute a phishing-likelihood score for ``domain``.

    The higher the score, the more likely ``domain`` is a phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of ``domain``.
    """
    tld_hits = sum(1 for tld in suspicious_tld if domain.endswith(tld))
    keyword_hits = sum(1 for keyword in suspicious_keywords if keyword in domain)
    strong_hits = sum(1 for keyword in highly_suspicious if keyword in domain)

    total = 20 * tld_hits + 25 * keyword_hits + 60 * strong_hits
    # Random-looking names score higher.
    total += int(round(entropy.shannon_entropy(domain) * 50))

    # Many dashes in a non-punycode name
    # (ie. www.paypal-datacenter.com-acccount-alert.com)
    if domain.count('-') >= 4 and 'xn--' not in domain:
        total += 20

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    if domain.count('.') >= 4:
        total += 20

    return total
def compute_entropy(region_file: str, tabix_file: str):
    """Print every region of ``region_file`` with an appended entropy column.

    For each region (chromosome, start, end in the first three tab-separated
    columns) the tabix index is queried, the number of records starting at
    each position of the region is counted, the counts are concatenated into
    a digit string, and the Shannon entropy of that string is printed after
    the region's own columns.  Lines whose first field starts with ``#`` are
    treated as headers and echoed with an extra ``entropy`` column name.

    Args:
        region_file: Path of the tab-separated region file.
        tabix_file: Path of the tabix-indexed file to query.
    """
    tx = tabix.open(tabix_file)
    with open(region_file) as regions:
        for region in csv.reader(regions, delimiter="\t"):
            if str(region[0]).startswith("#"):
                # Header line: echo it with the new column name.
                print("\t".join(region), "entropy", sep="\t")
                continue

            chromosome = region[0]
            start = int(region[1])
            end = int(region[2])
            size = end - start
            serie = [0] * size

            for record in tx.query(chromosome, start, end):
                index = int(record[1]) - start
                # Records overlapping the region may start before it; a
                # negative index would silently wrap around to the end of
                # the list (or raise), so only count starts inside it.
                if 0 <= index < size:
                    serie[index] += 1

            serie_str = "".join(str(i) for i in serie)
            e = entropy.shannon_entropy(serie_str)
            print("\t".join(region), e, sep="\t")
def get_t_wave_durations_entropy(self):
    """Return the Shannon entropy of the flattened T-wave durations."""
    # A histogram-binned variant (np.histogram / np.digitize fed to
    # sps.entropy with base 2) is left disabled in the source; the raw
    # durations are scored directly.
    flat_durations = self.get_t_wave_durations().ravel()
    return shannon_entropy(flat_durations)
def get_entropy_of_file(FilePath):
    """Return the Shannon entropy of the contents of the file at ``FilePath``.

    Args:
        FilePath: Path of the file to read (opened in text mode).

    Returns:
        The value of ``entropy.shannon_entropy`` for the file's contents.
    """
    # The context manager closes the handle even if reading fails.
    with open(FilePath, 'r') as file_read:
        file_data = file_read.read()
    return entropy.shannon_entropy(file_data)
def get_rr_interval_durations_entropy(self):
    """Return the Shannon entropy of the stored ``'rr_interval'`` segments."""
    # A histogram-binned variant (np.histogram / np.digitize fed to
    # sps.entropy with base 2) is left disabled in the source; the raw
    # values are scored directly.
    rr_intervals = self.segments.get('rr_interval')
    return shannon_entropy(rr_intervals)
def score_domain(domain):
    """Score `domain`.

    The higher the score, the more probable it is that `domain` is a
    phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.
    """
    score = 0
    for t in suspicious['tlds']:
        if domain.endswith(t):
            score += 20

    # Remove initial '*.' of wildcard certificates
    if domain.startswith('*.'):
        domain = domain[2:]

    # Remove the TLD to catch an inner TLD in a subdomain
    # (ie. paypal.com.domain.com)
    try:
        res = get_tld(domain, as_object=True, fail_silently=True,
                      fix_protocol=True)
        # fail_silently=True makes get_tld return None for unparsable input.
        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Best effort: keep scoring the unstripped domain.
        pass

    # Higher entropy is kind of suspicious
    score += int(round(entropy.shannon_entropy(domain) * 50))

    # Remove lookalike characters using the list from
    # http://www.unicode.org/reports/tr39
    domain = unconfuse(domain)

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain[0] in ['com', 'net', 'org']:
        score += 10

    # Testing keywords
    for word, weight in suspicious['keywords'].items():
        if word in domain:
            score += weight

    # Testing Levenshtein distance for strong keywords (>= 70 points)
    # (ie. paypol); overly generic words (ie. mail.domain.com) are skipped.
    strong_keys = [k for (k, s) in suspicious['keywords'].items() if s >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ['email', 'mail', 'cloud']]
    for key in strong_keys:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    if 'xn--' not in domain and domain.count('-') >= 4:
        score += domain.count('-') * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    if domain.count('.') >= 3:
        score += domain.count('.') * 3

    return score
def subdomain_entropy(hostname: str) -> float:
    """Return the Shannon entropy of the part of ``hostname`` before its domain.

    Args:
        hostname: Host name to analyse.

    Returns:
        ``np.nan`` when ``get_domain`` yields a missing value, otherwise the
        entropy of ``hostname`` with the trailing ``len(domain)`` characters
        removed (any separating dot is kept, as it is not part of ``domain``).
    """
    domain = get_domain(hostname)
    if pd.isna(domain):
        return np.nan
    # An explicit cut position avoids the ``[:-0]`` trap: with an empty
    # domain, ``hostname[:-0]`` is '' instead of the whole host name.
    cut = max(len(hostname) - len(domain), 0)
    return shannon_entropy(hostname[:cut])
def get_pr_interval_durations_entropy(self):
    """Return the Shannon entropy of the flattened PR intervals."""
    # A histogram-binned variant (np.histogram / np.digitize fed to
    # sps.entropy with base 2) is left disabled in the source; the raw
    # intervals are scored directly.
    flat_intervals = self.get_pr_intervals().ravel()
    return shannon_entropy(flat_intervals)
def score_domain(domain):
    """Compute a phishing-likelihood score for ``domain``.

    Every suspicious TLD or keyword hit adds a fixed bonus and also raises
    the multiplier applied to the entropy term.  The higher the score, the
    more likely ``domain`` is a phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of ``domain``.
    """
    tld_hits = sum(1 for tld in suspicious_tld if domain.endswith(tld))
    keyword_hits = sum(1 for keyword in suspicious_keywords if keyword in domain)
    strong_hits = sum(1 for keyword in highly_suspicious if keyword in domain)

    # The multiplier starts at 1 and grows by one per indicator hit.
    multiplier = 1 + tld_hits + keyword_hits + strong_hits
    total = 20 * tld_hits + 25 * keyword_hits + 60 * strong_hits
    total += int(round(entropy.shannon_entropy(domain) * 50)) * multiplier

    # Many dashes in a non-punycode name
    # (ie. www.paypal-datacenter.com-acccount-alert.com)
    if domain.count('-') >= 4 and 'xn--' not in domain:
        total += 20

    return total
def alleged_domain(phishy):
    """Score how suspicious the domain ``phishy`` looks.

    Combines suffix matches, keyword weights, Shannon entropy, a string
    similarity check and structural hints (dashes, subdomain depth).

    Args:
        phishy (str): the domain to score.

    Returns:
        int: the accumulated score (a float if any weight in ``phrases``
        is a float).
    """
    score = 0
    for _tld in prefixes:
        if phishy.endswith(_tld):
            score += 20

    # Wildcard certificate names start with '*.'
    if phishy.startswith('*.'):
        phishy = phishy[2:]

    # Strip the real TLD so a TLD embedded in a subdomain stands out, see
    # https://arstechnica.com/information-technology/2017/06/phishing-attacks-target-mobile-browsers-with-dash-padded-urls/
    try:
        res = get_tld(phishy, as_object=True, fail_silently=True,
                      fix_protocol=True)
        # fail_silently=True makes get_tld return None for unparsable input.
        if res is not None:
            phishy = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Best effort: keep scoring the unstripped domain.
        pass

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", phishy)

    # A wildcard prefix can reappear after the TLD has been stripped.
    if phishy.startswith('*.'):
        phishy = phishy[2:]

    if words_in_domain[0] in ['com', 'net', 'org']:
        score += 10

    # Keyword weights; items() works on Python 2 and 3 (iteritems is 2-only).
    for word, val in phrases.items():
        if word in phishy:
            score += val

    # Too random?
    score += int(round(entropy.shannon_entropy(phishy) * 50.2))

    # Similarity against strong keywords (weight >= 70), skipping generic words.
    # NOTE(review): if `ratio` is a Levenshtein ratio, `== 1` only matches
    # identical strings; near-miss detection would need a threshold or an
    # edit distance of 1 -- confirm the intent.
    strong_keys = [k for (k, s) in phrases.items() if s >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ['cloud', 'mail', 'email']]
    for key in strong_keys:
        for word in candidate_words:
            if ratio(str(word), str(key)) == 1:
                score += 70

    # Markov chain confusion: not released to the public.
    # K closest neighbors and cluster analysis (similar to Levenshtein
    # ratios): not released to the public.

    # Many dashes in a non-punycode ('xn--') name
    if 'xn--' not in phishy and phishy.count('-') >= 4:
        score += phishy.count('-') * 3

    # Humans rarely pick names three or more subdomains deep
    if phishy.count('.') >= 3:
        score += phishy.count('.') * 3

    return score
def score_domain(domain):
    """Score `domain`.

    The higher the score, the more probable it is that `domain` is a
    phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.

    #https://github.com/x0rz/phishing_catcher/blob/master/catch_phishing.py
    """
    score = 0
    for t in tlds:
        if domain.endswith(t):
            score += 20

    # Remove initial '*.' of wildcard certificates
    if domain.startswith("*."):
        domain = domain[2:]

    # Remove the TLD to catch an inner TLD in a subdomain
    # (ie. paypal.com.domain.com)
    try:
        res = get_tld(domain, as_object=True, fail_silently=True,
                      fix_protocol=True)
        # fail_silently=True makes get_tld return None for unparsable input.
        if res is not None:
            domain = ".".join([res.subdomain, res.domain])
    except Exception:  # noqa: B110
        # Best effort: keep scoring the unstripped domain.  `Exception`
        # rather than a bare except so Ctrl-C / SystemExit still propagate.
        pass

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # A wildcard prefix can reappear after the TLD has been stripped.
    if domain.startswith("*."):
        domain = domain[2:]

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    # Testing keywords
    for word, weight in keywords.items():
        if word in domain:
            score += weight

    # Higher entropy is kind of suspicious
    score += int(round(entropy.shannon_entropy(domain) * 50))

    # Testing Levenshtein distance for strong keywords (>= 70 points)
    # (ie. paypol); overly generic words (ie. mail.domain.com) are skipped.
    strong_keys = [k for (k, s) in keywords.items() if s >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ["email", "mail", "cloud"]]
    for key in strong_keys:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    if domain.count(".") >= 3:
        score += domain.count(".") * 3

    return score
def execute_all_measurements(self, base64_expression_decoded):
    """Run every measurement on a decoded base64 expression.

    Returns:
        dict: ``'entropy'`` (Shannon entropy of the data), ``'strings'``
        (result of ``self.words_in_strings`` with the configured minimum
        string length) and ``'filetype'`` (result of
        ``get_file_type_from_binary``).
    """
    # The literal is evaluated top to bottom, so the measurements run in
    # the order entropy, strings, filetype.
    return {
        'entropy': shannon_entropy(base64_expression_decoded),
        'strings': self.words_in_strings(
            base64_expression_decoded,
            self._word_list,
            int(self.config[self.NAME]['string_min_length'])),
        'filetype': get_file_type_from_binary(base64_expression_decoded),
    }
def max_entropy(inList):
    """Return the maximum Shannon entropy of the URIs in ``inList``.

    The function is deliberately forgiving: when ``inList[0]`` cannot be
    read, the entropy of ``inList`` itself is used as the starting value
    (``0.0`` if that fails with ``TypeError`` too), and items that cannot
    be scored are skipped.

    Args:
        inList: Sequence of values accepted by ``en.shannon_entropy``.

    Returns:
        The largest entropy found, or the fallback starting value.
    """
    try:
        maxEntropy = en.shannon_entropy(inList[0])
    except (IndexError, TypeError, KeyError):
        try:
            maxEntropy = en.shannon_entropy(inList)
        except TypeError:
            maxEntropy = 0.0

    try:
        items = iter(inList)
    except TypeError:
        # Not iterable: without this guard the loop below would raise even
        # though every other failure mode is tolerated.
        return maxEntropy

    for uri in items:
        try:
            # Score each item once and reuse the value.
            candidate = en.shannon_entropy(uri)
            if maxEntropy < candidate:
                maxEntropy = candidate
        except (IndexError, TypeError, KeyError):
            # Unscorable item: emit a blank line (existing behaviour) and
            # carry on with the next one.
            print()
    return maxEntropy
def score_domain(config, domain, args):
    """Compute a phishing-likelihood score for ``domain``.

    Args:
        config: Mapping with a ``"tlds"`` collection of suspicious suffixes
            and a ``"keywords"`` mapping of keyword -> weight.
        domain (str): the domain to score.
        args: Forwarded to ``failed_message`` when TLD extraction raises.

    Returns:
        int: the accumulated score.
    """
    total = 20 * sum(1 for tld in config["tlds"] if domain.endswith(tld))

    # Strip the real TLD so a TLD embedded in a subdomain stands out.
    try:
        parsed = get_tld(domain, as_object=True, fail_silently=True,
                         fix_protocol=True)
        if parsed is not None:
            domain = '.'.join([parsed.subdomain, parsed.domain])
    except Exception as err:
        failed_message(args, err, domain)

    # Random-looking names score higher.
    total += int(round(entropy.shannon_entropy(domain) * 50))

    domain = unconfuse(domain)
    tokens = re.split(r"\W+", domain)

    # A leading com/net/org token hints at a fake TLD inside the name.
    if tokens[0] in ["com", "net", "org"]:
        total += 10

    for keyword, weight in config["keywords"].items():
        if keyword in domain:
            total += weight

    # Tokens exactly one edit away from a strong keyword (weight >= 70).
    strong_keywords = [k for (k, s) in config["keywords"].items() if s >= 70]
    checked_tokens = [w for w in tokens if w not in ["email", "mail", "cloud"]]
    total += 70 * sum(
        1
        for key in strong_keywords
        for token in checked_tokens
        if distance(str(token), str(key)) == 1
    )

    dashes = domain.count("-")
    if dashes >= 4 and "xn--" not in domain:
        total += dashes * 3

    dots = domain.count(".")
    if dots >= 3:
        total += dots * 3

    return total
def entropy(segments):
    """Return the Shannon entropy of ``segments``, or ``0.0`` on any error.

    Failures are reported on stderr instead of being raised.
    """
    # A histogram-binned variant (np.histogram / np.digitize fed to
    # sps.entropy with base 2) is left disabled in the source.
    try:
        result = shannon_entropy(segments)
    except Exception as e:
        print(str(e), file = sys.stderr)
        result = 0.0
    return result
def plot_scatter(legit, dga):
    """Scatter-plot domain length against Shannon entropy for two domain sets.

    Args:
        legit: Iterable of legitimate domain names.
        dga: Iterable of DGA domain names.
    """
    # Measure each name in a single pass: (length, entropy) per domain.
    legit_points = [(len(name), entropy.shannon_entropy(name)) for name in legit]
    dga_points = [(len(name), entropy.shannon_entropy(name)) for name in dga]

    legit_len = [length for length, _ in legit_points]
    legit_entropy = [value for _, value in legit_points]
    dga_len = [length for length, _ in dga_points]
    dga_entropy = [value for _, value in dga_points]

    plt.scatter(legit_len, legit_entropy, s=140, c='#aaaaff', label='Legit',
                alpha=.2)
    plt.scatter(dga_len, dga_entropy, s=40, c='r', label='DGA', alpha=.3)
    plt.legend()
    plt.xlabel('Domain Length')
    plt.ylabel('Domain Entropy')
    plt.show()
def is_chrome_dn(self, dn):
    """Return True when the first label of ``dn`` matches the checked pattern.

    The first dot-separated label must be exactly 10 characters long, have a
    Shannon entropy above 0.30, and consist solely of characters found in
    ``self.ALPHABET``.
    """
    first_label = dn.split('.')[0]
    if len(first_label) != 10:
        return False
    if not shannon_entropy(first_label) > 0.30:
        return False
    return all(letter in self.ALPHABET for letter in first_label)
def gentropy(email, compare=None):
    """Replace an email's body with its Shannon entropy.

    The ``'body'`` entry is removed from ``email``; on success an
    ``'entropy'`` entry is added and ``'date'`` is converted to its ISO
    string.  Failures are reported on stdout and the partially updated
    dict is still returned.

    Args:
        email (dict): message with at least ``'body'`` and ``'date'``.
        compare: unused.

    Returns:
        dict: the same ``email`` object, modified in place.
    """
    particle = email.pop('body')
    try:
        ent = entropy.shannon_entropy(particle)
        email['entropy'] = ent
        email['date'] = email['date'].isoformat()  # convert to str
    except Exception:
        # `Exception` rather than a bare except so Ctrl-C / SystemExit still
        # propagate; the one-element tuple keeps %-formatting correct even
        # when the body itself is a tuple.
        print("[ERROR] Failed to parse: %s " % (particle,))
    return email
def validate_ssn(ssn):
    """
    Utility function to normalize social security numbers (SSN)

    :param ssn:
        Type: String
        Default: None
        Description: The SSN to normalize

    :return:
        The nine digits of the SSN as a string without separators
        ('XXXXXXXXX') when the value passes every check.
        None when ``ssn`` is empty/None or its first digit group starts
        with '9' (900-999).

    :raises ValueError:
        1) The value does not contain exactly nine numeric digits
        2) The digits equal a known "bad_ssns" value like "123456789"
        3) The first digit group is "000" or "666", the second is "00",
           or the third is "0000"
        4) The digits have a Shannon entropy value < .16
    """
    if not ssn:
        return None

    bad_ssns = ['123456789']

    # Keep the digits only, whatever separators were supplied.
    ssn_digits = re.sub(r'[^0-9]+', '', ssn)
    if len(ssn_digits) != 9:
        raise ValueError(
            'The value passed as an SSN was not nine numeric digits in length: {}'
            .format(ssn))

    # The first group may not start with 9, so 900-999 simply fail to match.
    n_ssn = re.match(r'.*([0-8][0-9]{2}).*([0-9]{2}).*([0-9]{4}).*',
                     ssn_digits)
    if not n_ssn:
        return None

    area, group, serial = n_ssn.group(1), n_ssn.group(2), n_ssn.group(3)
    n_ssn_digits = str('{}{}{}'.format(area, group, serial))

    if (n_ssn_digits in bad_ssns or area in ['666', '000']
            or group in ['00'] or serial in ['0000']):
        # The placeholder was missing, so the offending value never showed
        # up in the message.
        raise ValueError(
            'An invalid value was supplied as an SSN: {}'.format(ssn))

    if shannon_entropy(n_ssn_digits) < .16:
        raise ValueError(
            'The value supplied as an SSN does not pass shannon entropy requirements: {}'
            .format(ssn))

    return n_ssn_digits
def score_domain(domain):
    """Compute a phishing-likelihood score for ``domain``.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the accumulated score; higher means more suspicious.
    """
    score = 0
    for tld in tlds:
        if domain.endswith(tld):
            score += 20

    # for wildcard certs, remove *.
    if domain.startswith('*.'):
        domain = domain[2:]

    # Strip the real TLD so a TLD embedded in a subdomain stands out.
    try:
        # The keyword is `fail_silently`; the misspelled `fail_silentyl`
        # made every call raise TypeError, which the bare except hid, so
        # the TLD was never stripped.
        res = get_tld(domain, as_object=True, fail_silently=True,
                      fix_protocol=True)
        # fail_silently=True makes get_tld return None for unparsable input.
        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Best effort: keep scoring the unstripped domain.
        pass

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # for wildcard certs, remove *. (it can reappear after TLD stripping)
    if domain.startswith('*.'):
        domain = domain[2:]

    if words_in_domain[0] in ['com', 'net', 'org']:
        score += 10

    for word, weight in keywords.items():
        if word in domain:
            score += weight

    # Random-looking names score higher.
    score += int(round(entropy.shannon_entropy(domain) * 50))

    # Words exactly one edit away from a strong keyword (weight >= 70).
    strong_keys = [k for (k, s) in keywords.items() if s >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ['email', 'mail', 'cloud']]
    for key in strong_keys:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    if 'xn--' not in domain and domain.count('-') >= 4:
        score += domain.count('-') * 3

    if domain.count('.') >= 3:
        score += domain.count('.') * 3

    return score
def _enumerate_encrypted_assets(self):
    """Return a list describing the APK asset files that have high entropy.

    Each entry is a dict with the file's ``name``, ``entropy``, ``size`` and
    ``type``; only files whose name contains ``"assets"`` and whose entropy
    exceeds 0.9 are listed.
    """
    high_entropy = []
    for filename, filetype in self.apk.get_files_types().items():
        if "assets" not in filename:
            continue
        content = self.apk.zip.read(filename)
        score = entropy.shannon_entropy(content)
        if score > 0.9:
            high_entropy.append({
                "name": filename,
                "entropy": score,
                "size": len(content),
                "type": filetype,
            })
    return high_entropy
def score_domain(suspicious, domain, args):
    """Compute a phishing-likelihood score for ``domain``.

    Args:
        suspicious: Mapping with a ``"tlds"`` collection of suspicious
            suffixes and a ``"keywords"`` mapping of keyword -> weight.
        domain (str): the domain to score.
        args: Forwarded to ``failed_message`` when TLD extraction raises.

    Returns:
        int: the accumulated score.
    """
    score = 0

    # 20 points per matching suspicious suffix.
    for suffix in suspicious["tlds"]:
        if not domain.endswith(suffix):
            continue
        score += 20

    # Strip the real TLD so a TLD embedded in a subdomain stands out.
    try:
        parsed = get_tld(domain, as_object=True, fail_silently=True,
                         fix_protocol=True)
        if parsed is not None:
            domain = '.'.join([parsed.subdomain, parsed.domain])
    except Exception as err:
        failed_message(args, err, domain)

    # Random-looking names score higher.
    entropy_points = int(round(entropy.shannon_entropy(domain) * 50))
    score += entropy_points

    domain = unconfuse(domain)
    tokens = re.split(r"\W+", domain)

    # A leading com/net/org token hints at a fake TLD inside the name.
    if tokens[0] in ["com", "net", "org"]:
        score += 10

    for keyword, weight in suspicious["keywords"].items():
        if keyword in domain:
            score += weight

    # Tokens exactly one edit away from a strong keyword (weight >= 70).
    for keyword, weight in suspicious["keywords"].items():
        if weight < 70:
            continue
        for token in tokens:
            if token in ["email", "mail", "cloud"]:
                continue
            if distance(str(token), str(keyword)) == 1:
                score += 70

    dash_count = domain.count("-")
    if "xn--" not in domain and dash_count >= 4:
        score += dash_count * 3

    dot_count = domain.count(".")
    if dot_count >= 3:
        score += dot_count * 3

    return score
def find_encrypted_assets(self, apk):
    """Return a list describing the APK asset files that have high entropy.

    PNG assets (``.png`` in the name and ``png`` in the detected type) are
    ignored.  Every other file whose name contains ``"assets"`` and whose
    entropy exceeds 0.9 is reported as a dict with ``name``, ``entropy``,
    ``size`` and ``type``.
    """
    findings = []
    for fname, filetype in apk.get_files_types().items():
        if "assets" not in fname:
            continue
        is_png = ".png" in fname and "png" in filetype.lower()
        if is_png:
            continue
        content = apk.zip.read(fname)
        score = entropy.shannon_entropy(content)
        if score > 0.9:
            findings.append({
                "name": fname,
                "entropy": score,
                "size": len(content),
                "type": filetype,
            })
    return findings
def score_domain(domain):
    """Compute a phishing-likelihood score for ``domain``.

    Args:
        domain (str): the domain to check.

    Returns:
        int: sum of the indicator points; higher means more suspicious.
    """
    # Collect the individual contributions, then add them up.
    points = [20 for tld in suspicious_tld if domain.endswith(tld)]
    points += [25 for keyword in suspicious_keywords if keyword in domain]
    points += [60 for keyword in highly_suspicious if keyword in domain]
    points.append(int(round(entropy.shannon_entropy(domain) * 50)))

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    if 'xn--' not in domain and domain.count('-') >= 4:
        points.append(20)

    return sum(points)
def score_domain(provided_ioc):
    """Return the phishing-likelihood score of the provided domain.

    Args:
        provided_ioc (str): the domain to score.

    Returns:
        int: the accumulated score; higher means more suspicious.
    """
    score = 0
    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    # Strip the real TLD so a TLD embedded in a subdomain stands out; when
    # parsing fails (including a None result) score the value as given.
    try:
        res = tld.get_tld(provided_ioc, as_object=True, fail_silently=True,
                          fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    # Random-looking names score higher.
    score += int(round(entropy.shannon_entropy(domain) * 50))

    domain = confusables.unconfuse(domain)
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # Wildcard certificate names start with '*.'
    if domain.startswith("*."):
        domain = domain[2:]

    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    for word, weight in suspicious["keywords"].items():
        if word in domain:
            score += weight

    # Words exactly one edit away from a strong keyword (weight >= 70).
    strong_keys = [k for k, v in suspicious["keywords"].items() if v >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ["email", "mail", "cloud"]]
    for key in strong_keys:
        for word in candidate_words:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3

    return score
def findKeyLen(data, maxKeyLen):
    """Guess the repeating-key length of ``data`` from per-column entropy.

    For every candidate length in ``range(1, maxKeyLen)`` the data is split
    into that many interleaved columns and the columns' Shannon entropies
    are averaged; each average is printed.

    Args:
        data: Sliceable ciphertext.
        maxKeyLen: Exclusive upper bound of the key lengths to try.

    Returns:
        int: the smallest key length whose average entropy is within 0.1 of
        the lowest average found.

    Raises:
        IndexError: If no key length is tried (``maxKeyLen`` <= 1).
    """
    table = {}
    for keylen in range(1, maxKeyLen):
        entsum = 0
        for i in range(keylen):
            # Every keylen-th byte starting at offset i.
            entsum += entropy.shannon_entropy(data[i::keylen])
        averageent = entsum / keylen
        table[keylen] = averageent
        # Single-argument parenthesised form works on Python 2 and 3.
        print("keylen: %02d, average entropy: %f" % (keylen, averageent))

    # Visit key lengths from lowest to highest average entropy; `lowest`
    # therefore settles on the minimum at the first entry.
    probablekeys = {}
    lowest = 1000
    for kl in sorted(table, key=table.__getitem__):
        if table[kl] < lowest:
            lowest = table[kl]
        if table[kl] - lowest < 0.1:
            probablekeys[kl] = table[kl]

    return sorted(probablekeys)[0]
def keywords(data):
    """Summarise per-tweet text statistics of ``data.text``.

    For every tweet the Shannon entropy, the number of whitespace-separated
    words and the character length are computed.

    Args:
        data: Object whose ``text`` attribute is an iterable of strings
            (e.g. a DataFrame with a ``text`` column).

    Returns:
        pandas.DataFrame: one row per statistic (``tweet_entropy``,
        ``no_of_words``, ``tweet_length``) with columns ``sum``, ``median``,
        ``mean`` and ``std``, rounded to 3 decimals.
    """
    # Iterate over the values themselves: ``data.text[i]`` is a label lookup
    # and breaks when the index is not exactly 0..n-1 (e.g. after filtering).
    texts = list(data.text)

    d6 = pd.DataFrame()
    # tweet entropy
    d6['tweet_entropy'] = pd.Series(
        [entropy.shannon_entropy(text) for text in texts])
    # number of words in tweet
    d6['no_of_words'] = pd.Series([len(text.split()) for text in texts])
    # number of characters in tweet
    d6['tweet_length'] = pd.Series([len(text) for text in texts])

    d6x = pd.DataFrame({
        'sum': d6.sum().astype('int'),
        'median': d6.median(),
        'mean': d6.mean(),
        'std': d6.std()
    })
    return d6x.round(decimals=3)
def process(self):
    """Collect per-section metadata of the PE file.

    Returns:
        ``""`` when the PE library object is unavailable, otherwise a list
        with one dict per section holding its name, raw/virtual sizes,
        characteristics, a write+execute flag (as the string ``"True"`` or
        ``"False"``), SHA1/SHA256/MD5 and fuzzy hashes of the section data,
        and the entropy scaled by 8.
    """
    pelib = self._getLibrary(PEFileModule().getName())
    if pelib is None:
        return ""

    ret = []
    for section in pelib.sections:
        dic_sec = {}
        dic_sec["name"] = repr(section.Name)
        # Plain int() replaces the int(hex(x), 16) round-trip.
        dic_sec["size_raw_data"] = int(section.SizeOfRawData)
        dic_sec["virtual_size"] = int(section.Misc_VirtualSize)
        dic_sec["characteristics"] = hex(section.Characteristics)

        # The flags are looked up in the instance __dict__ so a missing
        # attribute counts as False instead of raising.
        flags = section.__dict__
        if (flags.get('IMAGE_SCN_MEM_WRITE', False)
                and flags.get('IMAGE_SCN_MEM_EXECUTE', False)):
            dic_sec["write_executable"] = "True"
        else:
            dic_sec["write_executable"] = "False"

        data = section.get_data()
        dic_sec["sha1"] = SHA1(data)
        dic_sec["sha2"] = SHA256(data)
        dic_sec["md5"] = MD5(data)
        dic_sec["fuzzy_hash"] = getSsdeep(data)
        # presumably shannon_entropy is normalised to 0..1, so * 8 yields
        # bits per byte -- TODO confirm
        dic_sec["entropy"] = entropy.shannon_entropy(data) * 8
        ret.append(dic_sec)

    return ret
def get_highentropy_files(self, ent_threshold=0.7):
    """Yield files with high entropy (encrypted, compressed).

    Files whose MIME type starts with one of the ignored prefixes (flash,
    fonts, PDF, images) are skipped without being opened.  For the others,
    up to the first 1 MiB is read and scored.

    Args:
        ent_threshold: Minimum entropy for a file to be reported.

    Yields:
        tuple: ``(file, entropy)`` for each entry of ``self.filelist`` whose
        entropy is at least ``ent_threshold``.
    """
    import entropy

    ignored_mimetypes = (
        "application/x-shockwave-flash",
        "application/x-font-",
        "application/pdf",
        "image/",
    )

    for file in self.filelist:
        # Filter first so ignored files are never read from disk.
        if file["mime"].startswith(ignored_mimetypes):
            continue

        # Binary mode: the contents are arbitrary bytes, which text mode
        # would try to decode.
        with open(file["filename"], "rb") as f:
            buff = f.read(1024 * 1024)

        ent = entropy.shannon_entropy(buff)
        if ent >= ent_threshold:
            yield (file, ent)
def test_gibson_assembly_class():
    """
    Data-integrity checks for the GibsonAssembler class: every node must
    carry all four primers, and nodes, edges, sequences and the primer
    mapping must agree in size.
    """
    parts = [seq_generator(500) for _ in range(3)]
    g = GibsonAssembler(parts)

    # Discard degenerate inputs: duplicate parts or low-entropy sequences.
    assume(len(set(parts)) > 1)
    for part in parts:
        assume(shannon_entropy(part) > 0.24)

    expected_primers = ['fw_gibson', 're_gibson', '3p_sequencing',
                        '5p_sequencing']

    for _node, attrs in g.nodes(data=True):
        present = set(attrs.keys()).intersection(expected_primers)
        assert len(present) == 4

    node_count = len(g.nodes())
    assert node_count == len(g.edges())
    assert len(g.nodes()) == len(g.sequences)

    p = g.primers()
    assert len(p) == len(g.nodes())
    for _part, primers in p.items():
        assert len(primers) == 4
"""
Creates the most gibberish possible by compounding word pieces and sorting
the results based on highest entropy.

Inspired by moonbase alpha youtube videos.
"""
import re
import entropy
import subprocess
import random
from collections import defaultdict

somedict = defaultdict(list)

# Read the word list and close the file right away.
with open('words.txt', 'r') as wordfile:
    words = wordfile.readlines()

# Only words containing "rur" followed by a lowercase letter are used.
wordlist = [word.strip() for word in words if re.search('rur[a-z]', word)]

for i in range(0, 10000):
    # Ten random word pieces glued together form one candidate.
    e_words = ''.join(random.sample(wordlist, 10))
    cmd = ['/usr/bin/espeak', '--stdout', e_words]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # communicate() drains stdout and waits for espeak, so finished child
    # processes do not pile up over the 10000 iterations.
    audio, _ = proc.communicate()
    somedict[e_words] = entropy.shannon_entropy(audio)
    # Single-argument parenthesised form works on Python 2 and 3.
    print("{} {}".format(somedict[e_words] * 100, e_words))
# Capture and parse commandline arguments: script name, CSV path and the
# minimum domain length
script, filename, minlength = argv

# The minimum length must parse as a positive integer
try:
    val = int(minlength)
except ValueError:
    sys.exit("ERROR: Minimum domain length needs to be a positive integer")
if val < 1:
    sys.exit("ERROR: Minimum domain length needs to be a positive integer")

# Read in the comma-delimited file; the first row is the header
df = pd.read_csv(filename, delimiter=",", header=0)

# Extract the fourth column (all rows) into a list
dflist = df.iloc[:, 3].tolist()

for item in dflist:
    domain = ""
    try:
        # The last two dot-separated labels are the domain and its
        # extension; a value without a dot cannot be unpacked
        domain, ext = str(item).split(".")[-2:]
        ent = entropy.shannon_entropy(domain)
    except ValueError:
        ent = "0.00"

    # Domains shorter than the minimum length are reported as 0.00
    shown = ent if len(domain) >= val else "0.00"
    print(shown, ",", item, sep="")
def assert_entropy(self, data, expected):
    """Check the Shannon entropy of ``data`` against ``expected``.

    The comparison is delegated to ``assert_almost_equal`` with ``places=3``.
    """
    actual = shannon_entropy(data)
    assert_almost_equal(actual, expected, places=3)
# Time a compress/decompress round trip of the word list built earlier in
# the script (`lst` is defined before this point).
starttime = time.time()
fivebit.decompress(fivebit.compress("".join(lst)))
print("Execution time: " + str(time.time() - starttime))

# Random gobbledegook words: 50000 strings from teststr(), each followed by
# a space.  (presumably teststr(length, low, high) draws `length` characters
# with codes in low..high -- confirm against its definition)
print("\n\n\nGenerating 50000 gobbledegook random words..")
lst = []
for i in range(50000):
    lst.append(teststr(random.randrange(1,15),1,255) + " ")
wordlist = "".join(lst)
print("Shannon entropy: " + str(entropy.shannon_entropy(wordlist)))

# Compression with the dictionary enabled.  Note that the entropy of the
# compressed output is computed inside the timed window.
print("Testing compression time with dictionary enabled")
starttime = time.time()
d = fivebit.compress(wordlist,True)
print("Shannon entropy: " + str(entropy.shannon_entropy(d)))
print("Execution time: " + str(time.time() - starttime))

# Compression with the dictionary disabled.
print("Testing compression time with dictionary disabled")
starttime = time.time()
nd = fivebit.compress(wordlist,False)
print("Execution time: " + str(time.time() - starttime))

# Decompression of the dictionary-compressed data.
print("Testing decompression time..")
starttime = time.time()
dec = fivebit.decompress(d)
print("Execution time: " + str(time.time() - starttime))

# Size comparison of the three representations.
print("Uncompressed length: " + str(len(wordlist)) + " Dict compressed length: " + str(len(d)) + " Nodict compressed length: " + str(len(nd)) )
# page is not new and has changed print("Updating page {} to {}".format(page_number, decrypted_map[page_number], hash.hexdigest())) insert_page(output_file, page_number, data) decrypted_map[page_number] = hash_digest else: # page is new print("Page {} found {}".format(page_number, hash.hexdigest())) insert_page(output_file, page_number, data) decrypted_map[page_number] = hash_digest while True: data = input_file.read(4096) if len(data) != 4096: break data_entropy = entropy.shannon_entropy(data) if data_entropy <= ENTROPY_THRESHOLD: # page is below entropy threshold so it is most likely decrypted print("Page {} is not encrypted ({})".format(page_number, data_entropy)) upsert_page(page_number, data) elif page_number in entropy_map and data_entropy < entropy_map[page_number]: # page entropy value has decreased, THIS IS QUESTIONABLE but should be better anyway print("Entropy for page {} decreased from {} to {}".format(page_number, entropy_map[page_number], data_entropy)) upsert_page(page_number, data) elif page_number not in decrypted_map: # if the page is not decrypted yet update it anyway, to avoid false negatives insert_page(output_file, page_number, data) entropy_map[page_number] = data_entropy
import entropy

# Read the whole file as bytes; the context manager closes the handle.
with open("text.crypto", "rb") as input_file:
    data = input_file.read()

print("Shannon entropy is ", entropy.shannon_entropy(data))
def process(self):
    """Return the Shannon entropy of the sample's binary, multiplied by 8."""
    raw = self.sample.getBinary()
    return entropy.shannon_entropy(raw) * 8
#!/usr/bin/python
# Prints quick statistics about a ciphertext file: entropy, most common
# byte values, length, and how it divides into 8- and 16-byte blocks.
import collections
import os
import sys

import entropy

if len(sys.argv) < 2:
    sys.exit("Usage: cryptanalysis.py encrypted.raw")
if not os.path.isfile(sys.argv[1]):
    sys.exit("File not found")

with open(sys.argv[1], "rb") as f:
    cipher = f.read()

# Single-argument parenthesised prints work on Python 2 and 3.
print('''
[+]----------[ Cryptanalysis by t3h XRUST ]----------------------------------[+]
| + Common Structures:
| * Fixed-length data
| * Variable-length data with separator chars
| * Variable-length data with length fields
| + Common Mistakes:
| * Home-grown encryption
| * Insecure cipher mode (ECB, CBC, OFB, ...)
| * Poor key selection / Insufficient key length / Key reuse
| * Insecure random number generator
|
''')

print("+ Entropy: %s" % entropy.shannon_entropy(cipher))

freq = collections.Counter(cipher)
print("+ Common Characters: %s" % (freq.most_common(5),))

length = len(cipher)
print("+ Ciphertext Length: %d bytes" % length)
# Floor division keeps the whole-block counts integral on Python 3 as well.
print("|--- 8 byte blocks: %d (remainder: %d bytes)" % (length // 8, length % 8))
print("|--- 16 byte blocks: %d (remainder: %d bytes)" % (length // 16, length % 16))
print("|\n+" + "-" * 40 + "+++")
import entropy

# Print the entropy of every full 4096-byte page of the file; a trailing
# partial page is ignored.  The context manager closes the handle.
with open("../python-mem/text.crypto", "rb") as input_file:
    count = 1
    while True:
        data = input_file.read(4096)
        if len(data) != 4096:
            break
        print("Page {}:{}".format(count, entropy.shannon_entropy(data)))
        count += 1
def count_entropy(self, password):
    """Print the Shannon entropy of ``password``."""
    value = entropy.shannon_entropy(password)
    print("Shannon Entropy count: {}".format(value))