def main():
    """Build and write the Pig job for the domains listed in the input file.

    Command-line arguments are parsed by ``parseArgs`` into the domain file
    name, the days to search, the query type names and codes, and the output
    path.  Every non-blank line of the domain file is normalised (stripped,
    lower-cased, one trailing dot removed) and kept only if
    ``regdom.get_registered_domain`` returns a registered domain for it.
    The surviving domains drive the file selection, the regex and the query
    string that are finally handed to ``write_pig``.
    """
    domainFileName, days, queryTypeNames, queryTypeCodes, outputPath = parseArgs(sys.argv[1:])

    domainList = []
    regDomainList = []

    # Read domains from the input file.  The context manager guarantees the
    # handle is closed even if a helper raises part-way through.
    with open(domainFileName, "r") as domainFileDesc:
        for line in domainFileDesc:
            domain = line.strip().lower()
            if not domain:
                # Blank lines used to crash on domain[-1]; just ignore them.
                continue
            if domain.endswith("."):
                domain = domain[:-1]

            regdomain = regdom.get_registered_domain(domain)
            if regdomain is not None:
                domainList.append(domain)
                regDomainList.append(regdomain)
            else:
                # Parenthesised single-argument form prints the same text
                # under both Python 2 and Python 3.
                print('Domain %s does not have valid registered domain, skipping.' % domain)

    filesToSearch = findFiles(domainList, queryTypeNames, days)
    inputPath = makeInputString(filesToSearch, days)
    print(inputPath)

    regexString = getRegex(regDomainList)
    queryString = makeQueryString(queryTypeCodes)
    write_pig(inputPath, outputPath, queryString, regexString)
def main():
    """Build and write the Pig job for the domains listed in the input file.

    ``parseArgs`` supplies the domain file name, the date range, the query
    types and their codes, and the output path.  Every non-blank line of the
    domain file is normalised (stripped, lower-cased, one trailing dot
    removed) and kept only if ``regdom.get_registered_domain`` returns a
    registered domain for it.  The resulting file list, regex and query
    string are then handed to ``write_pig``.
    """
    domainFileName, dateRange, queryTypes, queryTypeCodes, outputpath = parseArgs(sys.argv[1:])

    domainList = []
    regDomainList = []

    # Read domains from the input file.  The context manager guarantees the
    # handle is closed even if a helper raises part-way through.
    with open(domainFileName, "r") as domainFileDesc:
        for line in domainFileDesc:
            domain = line.strip().lower()
            if not domain:
                # Blank lines used to crash on domain[-1]; just ignore them.
                continue
            if domain.endswith("."):
                domain = domain[:-1]

            regdomain = regdom.get_registered_domain(domain)
            if regdomain is not None:
                domainList.append(domain)
                regDomainList.append(regdomain)
            else:
                # Parenthesised single-argument form prints the same text
                # under both Python 2 and Python 3.
                print("Domain %s does not have valid registered domain, skipping." % domain)

    filesToSearch = findFiles(domainList, queryTypes, dateRange)
    fileinputstring = makeInputString(filesToSearch)
    regexstring = get_regex(regDomainList)
    querystring = makequerystring(queryTypeCodes)
    write_pig(fileinputstring, outputpath, querystring, regexstring)
def findFiles(domainList, queryTypes, days):
    """Return the distinct bucket file names covering the given domains.

    For each domain the registered domain is looked up, its labels are
    reversed (e.g. ``example.com`` -> ``com.example``) and hashed with
    ``getJavahash``.  The first reversed label, upper-cased, is the TLD group;
    anything other than COM, NET, ORG or ARPA is grouped as ``OTHR``.  For
    every query type the bucket is chosen by ``getBucketNumber`` from
    ``bucketDistribution["<qtype>_<TLD>"]`` and the name
    ``"<qtype>_<TLD>_<bucket>"`` is emitted.

    Args:
        domainList: domains for which ``regdom.get_registered_domain``
            returns a registered domain.
        queryTypes: query type names used as the prefix of each file name.
        days: accepted for interface compatibility; not used here.

    Returns:
        List of file names in first-seen order, without duplicates.
    """
    fileList = []
    # Companion set so duplicate detection is O(1) instead of scanning the
    # output list for every candidate; output order is unchanged.
    seen = set()

    for domain in domainList:
        regDomain = regdom.get_registered_domain(domain)
        revRegDomain = ".".join(reversed(regDomain.split(".")))

        tld = revRegDomain.split(".")[0].upper()
        if tld not in ("COM", "NET", "ORG", "ARPA"):
            tld = "OTHR"

        domainHashCode = getJavahash(revRegDomain)

        for qtype in queryTypes:
            qtype_tld = qtype + "_" + tld
            percent_dist_for_group = bucketDistribution[qtype_tld]
            bucketnumber = getBucketNumber(percent_dist_for_group, domainHashCode)
            afile = qtype_tld + "_" + bucketnumber

            # Add the file only if no earlier domain/query type produced it.
            if afile not in seen:
                seen.add(afile)
                fileList.append(afile)

    return fileList
def findFiles(domainList, queryTypes, dateRange):
    """Return the distinct per-day bucket file names covering the domains.

    For each domain the registered domain is looked up (one trailing dot is
    removed), its labels are reversed (e.g. ``example.com`` ->
    ``com.example``) and hashed with ``getJavahash``.  The first reversed
    label, upper-cased, is the TLD group; anything other than COM, NET or
    ARPA is grouped as ``OTHR``.  For every query type the bucket is chosen by
    ``getBucketNumber`` from ``bucket_distribution["<qtype>_<TLD>"]``, and
    each bucket is combined with every entry of ``dateRange`` as
    ``"<day>_<qtype>_<TLD>_<bucket>"``.

    Args:
        domainList: domains for which ``regdom.get_registered_domain``
            returns a registered domain.
        queryTypes: query type names used in each file name.
        dateRange: day strings prefixed to each file name; iterated once per
            domain.

    Returns:
        List of file names in first-seen order, without duplicates.
    """
    result = []
    # Companion set so duplicate detection is O(1) instead of scanning the
    # output list for every candidate; output order is unchanged.
    seen = set()

    for domain in domainList:
        regDomain = regdom.get_registered_domain(domain)
        # endswith() avoids the IndexError that [-1] raises on an empty string.
        if regDomain.endswith("."):
            regDomain = regDomain[:-1]
        revRegDomain = ".".join(reversed(regDomain.split(".")))

        tld = revRegDomain.split(".")[0].upper()
        if tld not in ("COM", "NET", "ARPA"):
            tld = "OTHR"

        domainHashCode = getJavahash(revRegDomain)

        qtype_tld_list = []
        for qtype in queryTypes:
            qtype_tld = qtype + "_" + tld
            percent_dist_for_group = bucket_distribution[qtype_tld]
            bucketnumber = getBucketNumber(percent_dist_for_group, domainHashCode)
            qtype_tld_list.append(qtype_tld + "_" + bucketnumber)

        # Days vary slowest, buckets fastest -- same order as before.
        for aDay in dateRange:
            for bucketName in qtype_tld_list:
                aResult = aDay + "_" + bucketName
                if aResult not in seen:
                    seen.add(aResult)
                    result.append(aResult)

    return result
def main():
    """Build and write the Pig job for the domains listed in the input file.

    ``parseArgs`` supplies the domain file name, a year-month to day mapping,
    the query types and the output path.  Every non-blank line of the domain
    file is normalised (stripped, lower-cased, one trailing dot removed) and
    its registered domain, as returned by ``regdom.get_registered_domain``,
    is collected.  The input string, query string and regex are then handed
    to ``write_pig``.
    """
    domainFileName, yearmonthToDayMap, queryTypes, outputpath = parseArgs(sys.argv[1:])

    # Only the registered domains are needed downstream in this version.
    regDomainList = []

    # Read domains from the input file.  The context manager guarantees the
    # handle is closed even if a helper raises part-way through.
    with open(domainFileName, "r") as domainFileDesc:
        for line in domainFileDesc:
            domain = line.strip().lower()
            if not domain:
                # Blank lines used to crash on domain[-1]; just ignore them.
                continue
            if domain.endswith("."):
                domain = domain[:-1]

            regdomain = regdom.get_registered_domain(domain)
            if regdomain is not None:
                regDomainList.append(regdomain)
            else:
                # Parenthesised single-argument form prints the same text
                # under both Python 2 and Python 3.
                print('Domain %s does not have valid registered domain, skipping.' % domain)

    inputstring = makeInputString(yearmonthToDayMap)
    querystring = makequerystring(queryTypes)
    regexstring = get_regex(regDomainList)
    write_pig(inputstring, outputpath, querystring, regexstring)