Example #1
0
def ProcessRawData(inputdir, outputdir, ver):
    """Extract RF features for every instance found under *inputdir*.

    Walks inputdir/<domain>/<instance>, matches each instance against the
    parsed stream info, merges cell- and time-based feature dicts and
    writes the result to <outputdir>/rf/<ver>.

    NOTE(review): reads a module-level ``logfile`` — confirm it is bound
    before this function is called.
    """
    rfoutputdir = os.path.join(outputdir, "rf/%s" % (ver))
    StreamList = StreamProcessing(logfile)
    for domain in os.listdir(inputdir):
        domainpath = os.path.join(inputdir, domain)
        for instance in os.listdir(domainpath):
            try:
                instancepath = os.path.join(domainpath, instance)
                # domain is rebound here to the name parsed from the file
                cnt, domain, timestamp = parseFileName(instancepath)
                key = (domain, cnt)
                if key not in StreamList:
                    WriteLog("Not found: %s in streamInfo" % (instancepath))
                    continue
                stream = StreamList[key]
                rf_datalist = ReadCSV(instancepath, stream)
                merged = MergeDict(
                    CellFeature.FeatureRetrieve(rf_datalist, stream, domain),
                    TimeFeature.FeatureRetrieve(rf_datalist, stream, domain))
                merged['srcDir'] = instancepath.split("Traffic/")[-1]
                writeFeature(rfoutputdir, domain, merged)
            except Exception as e:
                # best-effort: log the failure and move to the next instance
                print("Error string: ", str(e))
                WriteLog(str(e))
Example #2
0
def ParsePcap(inputdir, outputdir, ver):
    """Parse every crawl under *inputdir* and write per-domain data lists.

    Layout: inputdir/<CrawlDate>/...; each crawl tarball is unpacked via
    getTarFile(), every instance is matched against the crawl's
    streamInfo log, and the unpacked directory is removed afterwards.

    Parameters:
        inputdir:  root traffic directory (e.g. "Traffic/")
        outputdir: destination passed to writeFeature()
        ver:       crawler version string, forwarded to getTarFile()
    """
    for CrawlDate in os.listdir(inputdir):  # inputdir: Traffic/
        if "DS_Store" in CrawlDate or CrawlDate == '':
            continue
        # BUGFIX: initialize per iteration. Previously, if getTarFile()
        # raised on the first crawl the finally clause hit a NameError,
        # and on later crawls it removed the *previous* crawl's dir.
        domainDir = None
        try:
            CrawlDir = os.path.join(inputdir, CrawlDate)  # CrawlDir: 20200422/
            print("processing Dir: %s..." % (CrawlDir))
            domainDir = getTarFile(CrawlDir, ver)  # domainDir: 20200422/traces/
            logfile = os.path.join(CrawlDir, cm.StreamInfo)  # XXX/logs/streamInfo.txt
            if not os.path.exists(logfile):
                logfile = os.path.join(CrawlDir, 'streamInfo.txt')
            StreamList = StreamProcessing(logfile)
            for domain in os.listdir(domainDir):
                if ".DS_Store" in domain or domain == '':
                    continue
                domainpath = os.path.join(domainDir, domain)
                print("parsing domain: ", domainpath)
                for instance in os.listdir(domainpath):
                    try:
                        instancepath = os.path.join(domainpath, instance)
                        # domain rebound to the name parsed from the file
                        cnt, domain, timestamp = parseFileName(instancepath)
                        if (domain, cnt) in StreamList:
                            datalist = ReadCSV(instancepath,
                                               StreamList[(domain, cnt)])
                            writeFeature(outputdir, domain, datalist)
                        else:
                            print("Not found: %s in streamInfo" % (instancepath))
                    except Exception as e:
                        # best-effort: report and continue with next instance
                        print("Error string: ", str(e))
                        print(str(e))
            # (removed unused local `inputfilepath` — it was never read)
        except Exception as e:
            print(str(e))
        finally:
            if domainDir is not None:  # only remove if the untar succeeded
                RemoveUncompressDir(domainDir)  # domainDir: 20200422/<untardir>
Example #3
0
def testrun():
    """Run the full feature pipeline on the module-level ``testfile``.

    NOTE(review): reads the globals ``testfile``, ``logfile`` and
    ``outputdir`` — confirm they are bound before calling.
    """
    relpath = testfile.split("Traffic/")[-1]
    cnt, domain, timestamp = parseFileName(testfile)
    streams = StreamProcessing(logfile)
    stream = streams[(domain, cnt)]
    datalist = ReadCSV(testfile, stream)
    print("len datalist = ", len(datalist))
    cells = CellFeature.FeatureRetrieve(datalist, stream, relpath)
    times = TimeFeature.FeatureRetrieve(datalist, stream, relpath)
    writeFeature(outputdir, domain, MergeDict(cells, times))
def ProcessRawData(inputdir, outputdir, ver):
    """Extract both RF and DF features from every crawl under *inputdir*.

    For each crawl-date directory the tarball is unpacked, every
    instance is matched against the crawl's streamInfo log, and features
    are written to <outputdir>/rf/<browser>/ and <outputdir>/df/<browser>/,
    where <browser> is the last path component of *inputdir*. The
    unpacked directory is removed again afterwards.

    Parameters:
        inputdir:  root traffic directory (e.g. "Traffic/chrome")
        outputdir: root output directory
        ver:       crawler version string, forwarded to getTarFile()
    """
    browserversion = inputdir.strip("/").split("/")[-1]
    rfoutputdir = os.path.join(outputdir, "rf/%s" % (browserversion))
    dfoutputdir = os.path.join(outputdir, "df/%s" % (browserversion))
    for CrawlDate in os.listdir(inputdir):  # inputdir: Traffic/
        if "DS_Store" in CrawlDate or CrawlDate == '':
            continue
        # BUGFIX: initialize per iteration. Previously, if getTarFile()
        # raised on the first crawl the finally clause hit a NameError,
        # and on later crawls it removed the *previous* crawl's dir.
        domainDir = None
        try:
            CrawlDir = os.path.join(inputdir, CrawlDate)  # CrawlDir: 20200422/
            print("processing Dir: %s..." % (CrawlDir))
            domainDir = getTarFile(CrawlDir, ver)  # domainDir: 20200422/traces/
            logfile = os.path.join(CrawlDir, cm.StreamInfo)  # XXX/logs/streamInfo.txt
            if not os.path.exists(logfile):
                logfile = os.path.join(CrawlDir, 'streamInfo.txt')
            StreamList = StreamProcessing(logfile)
            for domain in os.listdir(domainDir):
                if ".DS_Store" in domain or domain == '':
                    continue
                domainpath = os.path.join(domainDir, domain)
                print("parsing domain: ", domainpath)
                for instance in os.listdir(domainpath):
                    try:
                        instancepath = os.path.join(domainpath, instance)
                        inputfilepath = instancepath.split("Traffic/")[-1]
                        # domain rebound to the name parsed from the file
                        cnt, domain, timestamp = parseFileName(instancepath)
                        if (domain, cnt) not in StreamList:
                            WriteLog("Not found: %s in streamInfo" %
                                     (instancepath))
                            continue
                        stream = StreamList[(domain, cnt)]
                        # RF features: merged cell- and time-based dicts
                        rf_datalist = ReadCSV(instancepath, stream)
                        cellsDict = CellFeature.FeatureRetrieve(
                            rf_datalist, stream, inputfilepath)
                        TimeDict = TimeFeature.FeatureRetrieve(
                            rf_datalist, stream, inputfilepath)
                        AllDict = MergeDict(cellsDict, TimeDict)
                        AllDict['srcDir'] = inputfilepath
                        writeFeature(rfoutputdir, domain, AllDict)
                        # DF features use their own CSV reader/writer
                        df_datalist = DFFeatureExtract.ReadCSV(
                            instancepath, stream)
                        DFFeatureExtract.writeFeature(
                            dfoutputdir, domain, df_datalist)
                    except Exception as e:
                        # best-effort: log and continue with next instance
                        print("Error string: ", str(e))
                        WriteLog(str(e))
        except Exception as e:
            print(str(e))
            WriteLog(str(e))
        finally:
            if domainDir is not None:  # only remove if the untar succeeded
                RemoveUncompressDir(domainDir)  # domainDir: 20200422/<untardir>
Example #5
0
def main():
    """CLI entry point: run a single-file test or process a traffic dir.

    Exactly one of --test / --inputdir must be given; returns 0 on
    argument errors (preserving the original contract). In batch mode,
    crawls listed in the module-level ``problemList`` are skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputdir',
                        '-i',
                        type=str,
                        default=None,
                        help='input traces dir, EX: traces')
    parser.add_argument('--test',
                        '-t',
                        help='test pcap file',
                        action='store_true')
    parser.add_argument("--outputdir",
                        '-o',
                        type=str,
                        required=True,
                        help='outputdir')
    parser.add_argument("--version",
                        '-v',
                        type=str,
                        default="new",
                        help="new Version Crawler or old one")
    args = parser.parse_args()
    ISTEST = args.test
    inputdir = args.inputdir
    outputdir = args.outputdir
    ver = args.version
    # idiom fix: `== False` / `== None` replaced with boolean / `is` tests
    if not ISTEST and inputdir is None:
        print("At least testfile or inputdir argument should be specified")
        return 0
    if ISTEST and inputdir is not None:
        print("only one of the argument: testfile, inputdir could have value")
        return 0
    if ISTEST:
        testrun()
        return
    for CrawlDate in os.listdir(inputdir):  # inputdir: Traffic/
        if (CrawlDate in problemList or "DS_Store" in CrawlDate
                or CrawlDate == ''):
            continue
        # BUGFIX: initialize per iteration. Previously, if getTarFile()
        # raised on the first crawl the finally clause hit a NameError,
        # and on later crawls it removed the *previous* crawl's dir.
        domainDir = None
        try:
            CrawlDir = os.path.join(inputdir, CrawlDate)  # CrawlDir: 20200422/
            print("processing Dir: %s..." % (CrawlDir))
            domainDir = getTarFile(CrawlDir, ver)  # domainDir: 20200422/traces/
            logfile = os.path.join(CrawlDir, cm.StreamInfo)  # XXX/logs/streamInfo.txt
            if not os.path.exists(logfile):
                logfile = os.path.join(CrawlDir, 'streamInfo.txt')
            StreamList = StreamProcessing(logfile)
            for domain in os.listdir(domainDir):
                if ".DS_Store" in domain or domain == '':
                    continue
                domainpath = os.path.join(domainDir, domain)
                print("parsing domain: ", domainpath)
                for instance in os.listdir(domainpath):
                    try:
                        instancepath = os.path.join(domainpath, instance)
                        inputfilepath = instancepath.split("Traffic/")[-1]
                        # domain rebound to the name parsed from the file
                        cnt, domain, timestamp = parseFileName(instancepath)
                        if (domain, cnt) not in StreamList:
                            WriteLog("Not found: %s in streamInfo" %
                                     (instancepath))
                            continue
                        stream = StreamList[(domain, cnt)]
                        datalist = ReadCSV(instancepath, stream)
                        cellsDict = CellFeature.FeatureRetrieve(
                            datalist, stream, inputfilepath)
                        TimeDict = TimeFeature.FeatureRetrieve(
                            datalist, stream, inputfilepath)
                        AllDict = MergeDict(cellsDict, TimeDict)
                        AllDict['srcDir'] = inputfilepath
                        writeFeature(outputdir, domain, AllDict)
                    except Exception as e:
                        # best-effort: log and continue with next instance
                        print("Error string: ", str(e))
                        WriteLog(str(e))
        except Exception as e:
            print(str(e))
            WriteLog(str(e))
        finally:
            if domainDir is not None:  # only remove if the untar succeeded
                RemoveUncompressDir(domainDir)  # domainDir: 20200422/<untardir>