def test_DomainToIPs(self):
    # Check the domain -> IP-list mapping that GenerateDomain2IP derives
    # from the output of GenerateWL on the "unittest4C.log" fixture.
    records, domain_index, _ip_index = dataprun.GenerateWL(["unittest4C.log"])
    resolved = dataprun.GenerateDomain2IP(records, domain_index)

    expected = {
        "google.com": ["0.0.0.0"],
        "tmall.com": ["0.0.0.2"],
        "youtube.com": ["0.0.0.0"],
        "baidu.com": ["0.0.0.2"],
    }

    # Every domain that was produced must be expected and must map to
    # exactly the expected list of addresses.
    for name in resolved:
        self.assertIn(name, expected)
        self.assertEqual(resolved[name], expected[name])

    # Matching sizes rule out an expected domain missing from the result.
    self.assertEqual(len(expected), len(resolved))
def test_MultipleAnswers(self):
    # Check the domain and IP collections that GenerateWL returns for the
    # "unittest4MC.log" fixture against the full expected membership.
    expected_domains = ["google.com", "tmall.com", "youtube.com", "baidu.com"]
    expected_ips = [
        "0.0.0.0",
        "0.0.0.2",
        "1.0.0.0",
        "1.0.0.2",
        "192.168.75.0",
        "192.168.75.2",
        "192.168.75.1",
        "192.168.75.3",
    ]

    _records, domain_index, ip_index = dataprun.GenerateWL(["unittest4MC.log"])

    # Nothing unexpected may appear in either collection ...
    for name in domain_index:
        self.assertIn(name, expected_domains)
    for address in ip_index:
        self.assertIn(address, expected_ips)

    # ... and matching sizes mean nothing expected is missing.
    self.assertEqual(len(domain_index.keys()), len(expected_domains))
    self.assertEqual(len(ip_index.keys()), len(expected_ips))
def main():
    '''
    domain2IP_matrix.py creates the domain to ip csr matrix for hindom project

    Usage: python3 domain2IP_matrix.py --dns_files /data/dns/2021-03-29_dns.05:00:00-06:00:00.log ...
    Requires: dataprun.py

    Exits through ``parser.error`` (usage message, exit status 2) when
    dataprun returns no result for the given files.
    '''
    # Process command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--dns_files',
                        type=str,
                        required=True,
                        nargs='+',
                        help='Expects log file from /data/dns directory')
    FLAGS = parser.parse_args()

    # GenerateWL can hand back None instead of its usual 3-tuple when it
    # rejects the input; unpacking that directly would die with an opaque
    # "cannot unpack non-iterable NoneType" TypeError, so report it clearly.
    generated = dataprun.GenerateWL(FLAGS.dns_files)
    if generated is None:
        parser.error(
            'dataprun.GenerateWL returned no data for --dns_files '
            f'{FLAGS.dns_files}; check that the files exist and are valid '
            'DNS logs')
    RL, domain2index, ip2index = generated

    domain2ip = dataprun.GenerateDomain2IP(RL, domain2index)

    # Create sparse matrix of domain to IP relations
    getDomainResolveIpCSR(domain2ip, domain2index, ip2index)
def test_InvalidInput(self):
    # For the "unittest4I.log" fixture GenerateWL is expected to return
    # None rather than its usual tuple.
    result = dataprun.GenerateWL(["unittest4I.log"])
    self.assertIsNone(result)
def test_PrunEffect(self):
    # With ka=0 and kd=0 every domain is expected to be pruned away,
    # while two IP entries are still expected to remain.
    _records, domain_index, ip_index = dataprun.GenerateWL(
        ["unittest4C.log"], ka=0, kd=0)

    for mapping, expected_size in ((domain_index, 0), (ip_index, 2)):
        self.assertEqual(len(mapping.keys()), expected_size)
def drdMatrix(filename):
    """
    Build the domain-registrar-domain matrix for the given DNS log file(s).

    - This is the module to be called from 'hin.py'. To do so,
        domainRegistrardomainMatrix = drdMatrix(filename)
      you can pass in a single file or list of files
    - returns CSR sparse matrix for two domains having same registrar

    Steps: obtain the whitelisted domains from dataprun, keep only those
    whose last dot-separated label is a TLD listed below, look up each
    remaining domain's registrar with whoisLookup(), and hand the result
    to csrMatrix(). Progress and the total run time are printed to stdout.
    """
    # Marking start of run time
    start_time = time.time()

    # The whois package only supports these Top Level Domains (TLD).
    # A frozenset gives constant-time membership tests in the filter below.
    # NOTE(review): entries such as 'ac_uk', 'co_jp', 'in_', 'is_is' and
    # 'ru_rf' contain an underscore, so they presumably never equal the last
    # label of a real domain name (e.g. '.in' domains are dropped) -- confirm
    # against the whois package's TLD naming before changing.
    known_tld = frozenset((
        'com', 'uk', 'ac_uk', 'ar', 'at', 'pl', 'be', 'biz', 'br', 'ca',
        'cc', 'cl', 'club', 'cn', 'co', 'jp', 'co_jp', 'cz', 'de', 'store',
        'download', 'edu', 'education', 'eu', 'fi', 'fr', 'id', 'in_',
        'info', 'io', 'ir', 'is_is', 'it', 'kr', 'kz', 'lt', 'ru', 'lv',
        'me', 'mobi', 'mx', 'name', 'net', 'ninja', 'se', 'nu', 'nyc', 'nz',
        'online', 'org', 'pharmacy', 'press', 'pw', 'rest', 'ru_rf',
        'security', 'sh', 'site', 'space', 'tech', 'tel', 'theatre',
        'tickets', 'tv', 'us', 'uz', 'video', 'website', 'wiki', 'xyz',
    ))

    # Calling dataprun package for whitelisted domain names and
    # corresponding indexes.
    # NOTE(review): GenerateWL appears to return None for invalid input, in
    # which case this unpacking raises TypeError -- confirm whether callers
    # need a clearer error here.
    print("Calling Dataprun package for whitelisted domain names......\n")
    RL, DD, IPD = dataprun.GenerateWL(filename)

    # Filtering the whitelisted domain names provided by dataprun package
    # with known TLDs supported by the whois package.
    domainName2IndexDictionary = {
        domainName: domainIndex
        for domainName, domainIndex in DD.items()
        if domainName.split(".")[-1] in known_tld
    }

    # calling 'whoisLookup()' to find registrars for the domain names.
    print(
        "\nStarting whois lookup for finding registrar of each domain name...\n"
    )
    domainNameIndex2RegistrarDictionary, count_FailedLookups = whoisLookup(
        domainName2IndexDictionary)

    # The domain-name indexes are the keys of the lookup result; listing the
    # mapping yields them in the same order the original item loop did.
    domainNameIndexList = list(domainNameIndex2RegistrarDictionary)

    print(
        f"\nNumber of unsuccessful registrar lookups = {count_FailedLookups}\n"
    )

    # calling 'csrMatrix()' to generate the domain-registrar-domain matrix
    print("Generating domain_registrar_domain matrix....\n")
    _csrmatrix = csrMatrix(domainNameIndexList,
                           domainNameIndex2RegistrarDictionary)
    print("CSR Sparse matrix generation is complete.\n")

    # Marking end of run time
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nTotal time to run : {total_time} seconds\n")

    return _csrmatrix