def send_debug_printer(pid, syscall_object):
    """Log the escaped payload a traced send() socketcall tried to transmit."""
    ecx = cint.peek_register(pid, cint.ECX)
    send_params = extract_socketcall_parameters(pid, ecx, 4)
    buf_addr = send_params[1]
    buf_len = send_params[2]
    payload = cint.copy_address_range(pid, buf_addr, buf_addr + buf_len)
    # 'string-escape' is a Python 2 codec that renders non-printable bytes readably
    logging.debug('This call tried to send: %s', payload.encode('string-escape'))
def generateDict(self):
    """Build self.dictionary from the corpus file and prune rare tokens.

    Tokens occurring in fewer than 5 documents are removed, then token ids
    are compacted to close the gaps left by filtering.
    """
    token_stream = (line.lower().split('|') for line in open(self.corpusFile))
    self.dictionary = corpora.Dictionary(token_stream)
    rare_tokens = []
    for tokenid, docfreq in self.dictionary.dfs.iteritems():
        if docfreq < 5:
            rare_tokens.append(tokenid)
    logging.debug('=====The number of tokens to be removed is %d =====' % len(rare_tokens))
    self.dictionary.filter_tokens(rare_tokens)
    logging.debug('=====Total %d tokens=====' % len(self.dictionary.dfs))
    self.dictionary.compactify()
def shutdown_debug_printer(pid, syscall_object):
    """Log the fd and symbolic 'how' command of a traced shutdown() subcall.

    Reads the socketcall parameter block pointed to by ECX and logs the
    socket file descriptor plus the decoded shutdown command.
    """
    p = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, p, 2)
    fd = params[0]
    cmd = params[1]
    logging.debug('This call tried to shutdown: %d', fd)
    # Use the already-extracted cmd instead of re-reading params[1]
    logging.debug('Command: %d: %s', cmd, SHUTDOWN_INT_TO_CMD[cmd])
def dataCleaning():
    """Run the data-cleaning pipeline over the training sample.

    Steps: dump top-clicked ads, collect their ad ids, build and dump the
    ad->users map, derive and dump the user->ads map, collect the surviving
    user ids, and finally dump the raw features for those users.
    """
    logging.info('===Data Cleaning Processing===')
    input_file = DATA_TRAINING_SAMPLE
    adClickCntList = generateTopAdsUsersByClick(input_file)
    dumpList2File(adClickCntList, TMP_DATA_DIR_PATH + 'topAdClickCnt.dict')
    adSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'topAdClickCnt.dict'):
        cnt, adid = line.strip().split()
        adSet.add(adid)
    logging.debug(len(adSet))
    ad2Users = generateAd2UsersGivenAdSet(input_file, adSet)
    dumpDict2File(ad2Users, TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict')
    userDict = generateUser2AdGivenAd2User(TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict', adViewThreshold=10)
    dumpDict2File(userDict, TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict')
    userSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict'):
        user, ads = line.strip().split('\x01')
        userSet.add(user)
    # BUG FIX: the user count was previously logged *before* the loop
    # populated userSet, so it always logged 0; log it after filling the set
    # (mirroring how adSet is logged above).
    logging.debug(len(userSet))
    dumpUserRawFeatureGivenUserSet(input_file, userSet, TMP_DATA_DIR_PATH + 'userRawFeature.dict')
def listen_entry_handler(syscall_id, syscall_object, pid):
    """Replay-side entry handler for the listen() socket subcall.

    Validates the socket fd (argument 0) against the trace, then replays the
    call as an unconditional success.
    """
    logging.debug('Entering listen entry handler')
    p = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, p, 1)
    # validate_integer_argument compares the traced fd with params[0];
    # the previously duplicated fd_from_trace local was unused and removed.
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    logging.debug('Replaying this system call')
    subcall_return_success_handler(syscall_id, syscall_object, pid)
def query_Filter(topAdSet, fn_rawData):
    """Collect the QueryIDs of training rows whose AdID belongs to topAdSet."""
    logging.debug('generate Query Filter')
    queryIdSet = set()
    for raw_line in file(fn_rawData):
        parsed = dataParser.parseTrainData(raw_line)
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = parsed
        # Only rows advertising one of the top ads contribute queries
        if AdID in topAdSet:
            queryIdSet.add(QueryID)
    return queryIdSet
def dumpCTRDistributionPlot(fn_ad2userCTR, output_dir='/Users/zhanglixin/research/kdd_cup/advertisingLab/data/tmp_data/plot_out/'):
    """Render one CTR-distribution line plot per ad record and save it as PNG."""
    for record in file(fn_ad2userCTR):
        # A fresh plot tool per ad so each PNG contains a single line
        tool = MiniPlotTool(baseConfig)
        adid, ctr_str = record.strip().split('\01')
        logging.debug(adid)
        ctr_values = [float(v) for v in ctr_str.split('\t')]
        tool.addline({'X': range(len(ctr_values)), 'Y': ctr_values})
        tool.plot()
        tool.save(output_dir + adid + '.png')
def query_Filter(topAdSet, fn_rawData):
    """Return the set of QueryIDs observed on rows whose ad is in topAdSet.

    NOTE(review): this definition duplicates an earlier query_Filter in this
    file; consider consolidating.
    """
    logging.debug('generate Query Filter')
    matching_queries = set()
    for line in file(fn_rawData):
        fields = dataParser.parseTrainData(line)
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        if AdID not in topAdSet:
            continue
        matching_queries.add(QueryID)
    return matching_queries
def run(lda, num_topics=200, raw_corpus='corpus', fn_bow='corpus.svmlight', fn_out_topic='LDA_corpus.svmlight'):
    """Build the dictionary and serialize the corpus to SVMlight BoW format.

    The LDA-training stage below the early return is currently disabled.
    """
    lda.generateDict()
    logging.debug('=====start generateDict=====')
    corpora.SvmLightCorpus.serialize(fn_bow, lda.__iter__(raw_corpus))
    # NOTE(review): this unconditional return makes everything below
    # unreachable -- presumably a debugging short-circuit; confirm whether
    # the LDA stage should run.
    return
    bow_corpus = corpora.SvmLightCorpus(fn_bow)
    logging.debug('=====Topic Processing=====')
    lda_model = models.ldamodel.LdaModel(bow_corpus, id2word=lda.dictionary, num_topics=num_topics)
    corpus_lda = lda_model[bow_corpus]
    corpora.SvmLightCorpus.serialize(fn_out_topic, corpus_lda)
def generateDict(self):
    """Construct self.dictionary from corpusFile, dropping tokens that occur
    in fewer than 5 documents, then compact the id space."""
    self.dictionary = corpora.Dictionary(
        doc.lower().split('|') for doc in open(self.corpusFile))
    min_docfreq = 5
    doomed = [tid for tid, df in self.dictionary.dfs.iteritems() if df < min_docfreq]
    logging.debug('=====The number of tokens to be removed is %d =====' % len(doomed))
    self.dictionary.filter_tokens(doomed)
    logging.debug('=====Total %d tokens=====' % len(self.dictionary.dfs))
    self.dictionary.compactify()
def accept_exit_handler(syscall_id, syscall_object, pid):
    """Exit handler for accept(): verify the returned fd against the trace,
    record the os<->trace fd mapping, and rewrite EAX with the trace fd."""
    logging.debug('Entering accept exit handler')
    fd_from_execution = cint.peek_register(pid, cint.EAX)
    fd_from_trace = int(syscall_object.ret[0])
    expected = offset_file_descriptor(fd_from_trace)
    if expected != fd_from_execution:
        raise ReplayDeltaError('File descriptor from execution ({}) '
                               'differs from file descriptor from '
                               'trace ({})'.format(fd_from_execution,
                                                   fd_from_trace))
    # Only successful accepts produce a real fd worth mapping
    if fd_from_execution >= 0:
        add_os_fd_mapping(fd_from_execution, fd_from_trace)
    cint.poke_register(pid, cint.EAX, fd_from_trace)
def dumpCTRDistributionPlot(fn_ad2userCTR, output_dir='/Users/zhanglixin/research/kdd_cup/advertisingLab/data/tmp_data/plot_out/'):
    """For each (adid, CTR list) record, plot the CTR series and write
    <output_dir><adid>.png. Duplicates an earlier definition in this file."""
    for entry in file(fn_ad2userCTR):
        plotter = MiniPlotTool(baseConfig)
        adid, joined = entry.strip().split('\01')
        logging.debug(adid)
        series = map(float, joined.split('\t'))
        plotter.addline({'X': range(len(series)), 'Y': series})
        plotter.plot()
        plotter.save(output_dir + adid + '.png')
def socketcall_debug_printer(pid, orig_eax, syscall_object):
    """Dispatch to the per-subcall debug printer for a socketcall.

    The subcall id is read from EBX; unknown subcalls log a warning and
    re-raise KeyError for the caller.
    """
    subcall_debug_printers = {1: socket_debug_printer,
                              9: send_debug_printer,
                              13: shutdown_debug_printer}
    subcall_id = cint.peek_register(pid, cint.EBX)
    logging.debug('Got subcall {} {}'.format(subcall_id,
                                             SOCKET_SUBCALLS[subcall_id]))
    # Look the printer up first so a KeyError raised *inside* a printer is
    # not misreported as "this subcall has no debug printer".
    try:
        printer = subcall_debug_printers[subcall_id]
    except KeyError as e:
        logging.warning(
            'This subcall ({}) has no debug printer'.format(subcall_id))
        raise e
    printer(pid, syscall_object)
def run(lda, num_topics=200, raw_corpus='corpus', fn_bow='corpus.svmlight', fn_out_topic='LDA_corpus.svmlight'):
    """Generate the dictionary and write the bag-of-words corpus in SVMlight
    format; the subsequent LDA modelling stage is currently short-circuited.
    """
    lda.generateDict()
    logging.debug('=====start generateDict=====')
    corpora.SvmLightCorpus.serialize(fn_bow, lda.__iter__(raw_corpus))
    # NOTE(review): everything after this return never executes -- looks
    # like a temporary bail-out left in place; verify before re-enabling.
    return
    bow_corpus = corpora.SvmLightCorpus(fn_bow)
    logging.debug('=====Topic Processing=====')
    lda_model = models.ldamodel.LdaModel(bow_corpus, id2word=lda.dictionary, num_topics=num_topics)
    corpus_lda = lda_model[bow_corpus]
    corpora.SvmLightCorpus.serialize(fn_out_topic, corpus_lda)
def shutdown_subcall_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    return value: 0 (success) or -1 (error)
    errno

    Entry handler for the shutdown() socketcall subcall: validates the fd
    against the trace, suppresses the real syscall, and injects the traced
    return value.
    """
    logging.debug('Entering shutdown entry handler')
    # Pull out the info we can check
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 2)
    # fd recorded in the trace; validate_integer_argument performs the
    # actual comparison against params[0]
    fd_from_trace = syscall_object.args[0].value
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # TODO: We need to check the 'how' parameter here
    # Check to make sure everything is the same
    # Decide if we want to replay this system call
    # Skip the real syscall and fake the traced return value / errno
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
def connect_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    2: The length of the sockaddr structure pointed to by 1
    Sets:
    return value: file descriptor of the new socket -1 (error)
    errno

    Not Implemented:
    * Determine what is not implemented
    """
    logging.debug('Entering connect entry handler')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    # Validate the fd (arg 0) and sockaddr length (arg 2) against the trace;
    # the previously assigned trace_fd local was unused and has been removed.
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    validate_integer_argument(pid, syscall_object, 2, 2, params=params)
    # Suppress the real syscall and inject the traced return value / errno
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
def generateTransferAdQueryTokenPair(topAdSet, fn_rawData, fn_rawQuery, fn_out, query_filter): logging.debug('Loading Query Map') #query_filter = set(line.strip() for line in file(TMP_DATA_DIR_PATH+'queryID.set')) query_map = dict([(line.strip().split('\t')) for line in file(fn_rawQuery) if line.strip().split('\t')[0] in query_filter]) Ad_QueryToken_map = {} token_map = {} logging.debug('Generating Ad_QueryToken_map') for line in file(fn_rawData): Click, Impression, Display_url, AdID, AdvertiserID, Depth, \ Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(line) if AdID not in topAdSet: continue if AdID not in Ad_QueryToken_map: Ad_QueryToken_map[AdID] = {} tokens = query_map[QueryID].split('|') for token in tokens: if token not in token_map: token_map[token] = 0 token_map[token] += 1 if token not in Ad_QueryToken_map[AdID]: Ad_QueryToken_map[AdID][token] = 0 Ad_QueryToken_map[AdID][token] += 1 logging.debug('Dumping Transfer info to file') writer = file(fn_out, 'w') for Ad in Ad_QueryToken_map: for token in Ad_QueryToken_map[Ad]: #print token, Ad_QueryToken_map[Ad][token],token_map[token] writer.write('%s\t%s\t%f\n' % (Ad, token, Ad_QueryToken_map[Ad][token] * 1.0 / token_map[token])) writer.close()
def generateTransferAdQueryTokenPair(topAdSet, fn_rawData, fn_rawQuery, fn_out, query_filter) : logging.debug('Loading Query Map') #query_filter = set(line.strip() for line in file(TMP_DATA_DIR_PATH+'queryID.set')) query_map = dict([(line.strip().split('\t')) for line in file(fn_rawQuery) if line.strip().split('\t')[0] in query_filter]) Ad_QueryToken_map = {} token_map = {} logging.debug('Generating Ad_QueryToken_map') for line in file(fn_rawData) : Click, Impression, Display_url, AdID, AdvertiserID, Depth, \ Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(line) if AdID not in topAdSet : continue if AdID not in Ad_QueryToken_map : Ad_QueryToken_map[AdID] = {} tokens = query_map[QueryID].split('|') for token in tokens : if token not in token_map : token_map[token] = 0 token_map[token] += 1 if token not in Ad_QueryToken_map[AdID] : Ad_QueryToken_map[AdID][token] = 0 Ad_QueryToken_map[AdID][token] += 1 logging.debug('Dumping Transfer info to file') writer = file(fn_out, 'w') for Ad in Ad_QueryToken_map : for token in Ad_QueryToken_map[Ad]: #print token, Ad_QueryToken_map[Ad][token],token_map[token] writer.write('%s\t%s\t%f\n' % (Ad, token, Ad_QueryToken_map[Ad][token] * 1.0 / token_map[token])) writer.close()
def dataCleaning():
    """Data-cleaning pipeline (duplicate of the earlier dataCleaning).

    Dumps top-clicked ads, builds ad->users and user->ads maps, and dumps
    raw features for the surviving users.
    """
    logging.info('===Data Cleaning Processing===')
    input_file = DATA_TRAINING_SAMPLE
    adClickCntList = generateTopAdsUsersByClick(input_file)
    dumpList2File(adClickCntList, TMP_DATA_DIR_PATH + 'topAdClickCnt.dict')
    adSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'topAdClickCnt.dict'):
        cnt, adid = line.strip().split()
        adSet.add(adid)
    logging.debug(len(adSet))
    ad2Users = generateAd2UsersGivenAdSet(input_file, adSet)
    dumpDict2File(ad2Users, TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict')
    userDict = generateUser2AdGivenAd2User(TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict', adViewThreshold=10)
    dumpDict2File(userDict, TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict')
    userSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict'):
        user, ads = line.strip().split('\x01')
        userSet.add(user)
    # BUG FIX: len(userSet) was logged before the loop filled the set (always
    # 0); moved the log after population to match the adSet logging above.
    logging.debug(len(userSet))
    dumpUserRawFeatureGivenUserSet(input_file, userSet, TMP_DATA_DIR_PATH + 'userRawFeature.dict')
def setsockopt_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    optval: out parameter
    return value: 0 (success) or -1 (error)
    errno

    Not Implemented:
    More checking
    """
    logging.debug('Entering setsockopt handler')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 5)
    # We don't check param[3] because it is an address of an empty buffer
    # We don't check param[4] because it is an address of an empty length
    # Removed unused locals fd_from_trace and optval_addr: the fd comparison
    # is done by validate_integer_argument, and params[3] is never read.
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Suppress the real syscall and inject the traced return value / errno
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
def getsockopt_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    Sets:
    optval: The value being retrieved
    optval_len: The length of the value being retrieved
    return value: 0 (success) or 1 (failure)
    errno

    Not Implemented:
    * Use the address validator to check addresses
    """
    logging.debug('Entering getsockopt handler')
    # Pull out what we can compare
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 5)
    optval_addr = params[3]
    optval_len_addr = params[4]
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # This if is sufficient for now for the implemented options
    # NOTE(review): the constants 1 / 4 presumably mean level SOL_SOCKET and
    # a 4-byte option value -- confirm against the tracer's encoding
    if params[1] != 1 or params[2] != 4:
        raise NotImplementedError('Unimplemented getsockopt level or optname')
    optval_len = int(syscall_object.args[4].value.strip('[]'))
    if optval_len != 4:
        raise NotImplementedError('getsockopt() not implemented for '
                                  'optval sizes other than 4')
    optval = int(syscall_object.args[3].value.strip('[]'))
    logging.debug('Optval: %s', optval)
    logging.debug('Optval Length: %s', optval_len)
    logging.debug('Optval addr: %x', optval_addr & 0xffffffff)
    # Fixed typo ('Lenght') and switched %d -> %x so both addresses log in hex
    logging.debug('Optval Length addr: %x', optval_len_addr & 0xffffffff)
    # Suppress the real syscall, write the traced optval/optlen into the
    # replayed process, and inject the traced return value / errno
    noop_current_syscall(pid)
    cint.populate_int(pid, optval_addr, optval)
    cint.populate_int(pid, optval_len_addr, 4)
    apply_return_conditions(pid, syscall_object)
def expandFeatureId2Tokens(aggregateUserfile, expandId2TokensResultFile, query_set, desc_set, title_set):
    """Expand per-user query/title/description id lists into token strings.

    Loads id -> token-string maps for descriptions, queries and titles
    (restricted to the provided id sets to bound memory), then rewrites each
    aggregated user record as four 0x01-separated fields: userID, query
    tokens, title tokens, description tokens.
    """
    logging.info('=========start expandFeatureId2Tokens processing=========')
    description_map = dict([(line.strip().split('\t')) for line in file(DATA_DESCRIPTION) if line.split('\t', 1)[0] in desc_set])
    logging.debug('Read %s Done.' % DATA_DESCRIPTION)
    query_map = dict([(line.strip().split('\t')) for line in file(DATA_QUERY) if line.split('\t', 1)[0] in query_set])
    logging.debug('Read %s Done.' % DATA_QUERY)
    title_map = dict([(line.strip().split('\t')) for line in file(DATA_TITLE) if line.split('\t', 1)[0] in title_set])
    logging.debug('Read %s Done.' % DATA_TITLE)
    #profile_map = dict([(line.strip().split('\t', 1)) for line in file(DATA_PROFILE) if line.split('\t')])
    # Output record: userID, query tokens, title tokens, desc tokens
    dump_format = '%s\x01%s\x01%s\x01%s\n'
    expandId2TokensResult = file(expandId2TokensResultFile, 'w')
    logging.debug('start joining tokens')
    for line in file(aggregateUserfile):
        userID, tmp_str = line.strip().split('\x01')
        # The three id lists are 0x02-separated; ids within each list are
        # tab-separated (empty entries are skipped below)
        queryIDlist, titleIDlist, descIDList = tmp_str.split('\x02')
        queryExpandTokensStr = '|'.join([query_map[queryId] for queryId in queryIDlist.split('\t') if queryId != ''])
        titleExpandTokensStr = '|'.join([title_map[titleId] for titleId in titleIDlist.split('\t') if titleId != ''])
        descExpandTokensStr = '|'.join([description_map[descId] for descId in descIDList.split('\t') if descId != ''])
        expandId2TokensResult.write(
            dump_format % \
            (userID, queryExpandTokensStr, titleExpandTokensStr, descExpandTokensStr))
    expandId2TokensResult.close()
def expandFeatureId2Tokens(aggregateUserfile, expandId2TokensResultFile, query_set, desc_set, title_set):
    """Replace query/title/description ids in each user record with their
    token strings, joined by '|'. Duplicates an earlier definition in this
    file.
    """
    logging.info('=========start expandFeatureId2Tokens processing=========')
    # The three lookup maps are filtered by the given id sets to keep memory
    # bounded when the raw files are large
    description_map = dict([(line.strip().split('\t')) for line in file(DATA_DESCRIPTION) if line.split('\t', 1)[0] in desc_set])
    logging.debug('Read %s Done.' % DATA_DESCRIPTION)
    query_map = dict([(line.strip().split('\t')) for line in file(DATA_QUERY) if line.split('\t', 1)[0] in query_set])
    logging.debug('Read %s Done.' % DATA_QUERY)
    title_map = dict([(line.strip().split('\t')) for line in file(DATA_TITLE) if line.split('\t', 1)[0] in title_set])
    logging.debug('Read %s Done.' % DATA_TITLE)
    #profile_map = dict([(line.strip().split('\t', 1)) for line in file(DATA_PROFILE) if line.split('\t')])
    dump_format = '%s\x01%s\x01%s\x01%s\n'
    expandId2TokensResult = file(expandId2TokensResultFile, 'w')
    logging.debug('start joining tokens')
    for line in file(aggregateUserfile):
        userID, tmp_str = line.strip().split('\x01')
        queryIDlist, titleIDlist, descIDList = tmp_str.split('\x02')
        # Empty id entries (from trailing/duplicate tabs) are skipped
        queryExpandTokensStr = '|'.join([query_map[queryId] for queryId in queryIDlist.split('\t') if queryId != ''])
        titleExpandTokensStr = '|'.join([title_map[titleId] for titleId in titleIDlist.split('\t') if titleId != ''])
        descExpandTokensStr = '|'.join([description_map[descId] for descId in descIDList.split('\t') if descId != ''])
        expandId2TokensResult.write(
            dump_format % \
            (userID, queryExpandTokensStr, titleExpandTokensStr, descExpandTokensStr))
    expandId2TokensResult.close()
def socket_debug_printer(pid, syscall_object):
    """Log the symbolic domain, type, and protocol of a traced socket() subcall."""
    param_ptr = cint.peek_register(pid, cint.ECX)
    socket_params = extract_socketcall_parameters(pid, param_ptr, 3)
    domain = socket_params[0]
    sock_type = socket_params[1]
    protocol = socket_params[2]
    logging.debug('Domain: %s', ADDRFAM_INT_TO_FAM[domain])
    logging.debug('Type: %s', SOCKTYPE_INT_TO_TYPE[sock_type])
    logging.debug('Protocol: %s', PROTOFAM_INT_TO_FAM[protocol])
import __init__
import sys
sys.path.append('../')
from util import logging


class dataParser:
    # Static helpers for parsing KDD-Cup style training rows.

    @staticmethod
    def parseTrainData(line):
        """Split a tab-separated training row into 12 typed fields.

        Returns a tuple (Click, Impression, Display_url, AdID, AdvertiserID,
        Depth, Position, QueryID, KeywordID, TitleID, DescriptionID, UserID)
        with Click/Impression/Depth/Position converted to int, or None when
        the row does not have exactly 12 fields.
        """
        fields = line.strip().split('\t')
        if len(fields) != 12:
            return None
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        return int(Click), int(Impression), Display_url,\
            AdID, AdvertiserID, int(Depth), \
            int(Position), QueryID, KeywordID,\
            TitleID, DescriptionID, UserID


if __name__ == '__main__':
    # Smoke-test the parser against one known-good sample row
    example_row = '0\t1\t4298118681424644510\t7686695\t385\t3\t3\t1601\t5521\t7709\t576\t490234'
    logging.debug(dataParser.parseTrainData(example_row))
def socket_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The domain of the socket
    Sets:
    return value: file descriptor of the new socket -1 (error)
    (added as replay file descriptor)
    errno

    Not Implemented:
    * Determine what is not implemented
    """
    logging.debug('Entering socket subcall entry handler')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    # Only PF_INET and PF_LOCAL socket calls are handled
    execution_is_PF_INET = (params[0] == cint.PF_INET)
    trace_is_PF_INET = (str(syscall_object.args[0]) == '[\'PF_INET\']')
    execution_is_PF_LOCAL = (params[0] == 1)  # define PF_LOCAL 1
    trace_is_PF_LOCAL = (str(syscall_object.args[0]) == '[\'PF_LOCAL\']')
    logging.debug('Execution is PF_INET: %s', execution_is_PF_INET)
    logging.debug('Trace is PF_INET: %s', trace_is_PF_INET)
    # Fixed typo in debug message: 'Exeuction' -> 'Execution'
    logging.debug('Execution is PF_LOCAL: %s', execution_is_PF_LOCAL)
    logging.debug('Trace is PF_LOCAL: %s', trace_is_PF_LOCAL)
    if execution_is_PF_INET != trace_is_PF_INET:
        raise ReplayDeltaError(
            'Encountered socket subcall with mismatch between '
            'execution protocol family and trace protocol family')
    if execution_is_PF_LOCAL != trace_is_PF_LOCAL:
        raise ReplayDeltaError(
            'Encountered socket subcall with mismatch between '
            'execution protocol family and trace protocol family')
    # Decide if we want to deal with this socket call or not
    if trace_is_PF_INET or \
       execution_is_PF_INET or \
       trace_is_PF_LOCAL or \
       execution_is_PF_LOCAL:
        noop_current_syscall(pid)
        fd = int(syscall_object.ret[0])
        logging.debug('File Descriptor from trace: %s', fd)
        apply_return_conditions(pid, syscall_object)
    else:
        logging.info('Ignoring non-PF_INET call to socket')
def accept_subcall_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    return value: The file descriptor -1 (error)
    errno

    Not Implemented:
    * Implement a function to check null terminated strings to clean up
    this mess of checking
    """
    logging.debug('Checking if line from trace is interrupted accept')
    if syscall_object.ret[0] == '?':
        raise NotImplementedError('Interrupted accept()s not implemented')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    sockaddr_addr = params[1]
    sockaddr_len_addr = params[2]
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Decide if this is a system call we want to replay
    noop_current_syscall(pid)
    # Populate the peer sockaddr only for successful calls with a non-NULL
    # buffer recorded in the trace
    if syscall_object.ret[0] != -1 and syscall_object.args[1].value != 'NULL':
        sockfields = syscall_object.args[1].value
        family = sockfields[0].value
        port = int(sockfields[1].value)
        ip = sockfields[2].value
        sockaddr_length = int(syscall_object.args[2].value.strip('[]'))
        logging.debug('Family: %s', family)
        logging.debug('Port: %s', port)
        logging.debug('IP: %s', ip)
        logging.debug('sockaddr Length: %s', sockaddr_length)
        logging.debug('sockaddr addr: %x', sockaddr_addr & 0xffffffff)
        logging.debug('sockaddr length addr: %x',
                      sockaddr_len_addr & 0xffffffff)
        logging.debug('pid: %s', pid)
        cint.populate_af_inet_sockaddr(pid, sockaddr_addr, port, ip,
                                       sockaddr_len_addr, sockaddr_length)
    # Removed dead code: an unused fd_from_trace local and a trailing
    # 'ret = syscall_object.ret[0]' that was never read;
    # apply_return_conditions injects the traced return value itself.
    apply_return_conditions(pid, syscall_object)
def svm_rank_learn(features, output_model, args=''):
    """Shell out to svm_rank_learn with optional extra CLI arguments."""
    logging.info(('=='*10 + '%s' + '=='*10) % ('START SVM LEARNING'))
    cmd_text = '%s %s %s %s' % (
        SVM_RANK.svm_rank_learn_command, args, features, output_model)
    logging.debug(cmd_text)
    os.system(cmd_text)
def joinResult4SVMRanking(fn_trainFeature, fn_ad2userStatus, fn_out_SVMRanking, fn_userRawExpandTokens, fn_userid4SVMRanking, fn_ad2UsersGivenAdSet):
    '''
    fn_trainFeature=TMP_DATA_DIR_PATH+'LDA_corpus.svmlight'
    fn_ad2userStatus=TMP_DATA_DIR_PATH+'ad2userStatus.dict'
    fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'

    Joins per-user LDA features with per-(ad, user) status labels into
    SVM-rank training rows '<status> qid:<ad index> <features>', writes the
    matching userids line-for-line to fn_userid4SVMRanking, and dumps the
    adid -> qid index map to adid2Idx.dict.
    '''
    logging.info('=====joinResult4SVMRanking Start=====')
    userFeature = {}  # userid -> svmlight feature string
    userlist = []
    #fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'
    for line in file(fn_userRawExpandTokens):
        userid, query, title, desc = line.strip().split('\x01')
        userlist.append(userid)
    # Feature-file lines are read positionally against userlist -- assumes
    # both files were produced in the same order; TODO confirm upstream
    trainFeature = file(fn_trainFeature)
    for userid in userlist:
        fields = trainFeature.readline().strip().split(' ', 1)
        if len(fields) != 2:
            continue
        tmp, feature_str = fields
        # Skip users with 5 or fewer feature entries
        if len(feature_str.split()) <= 5:
            continue
        userFeature[userid] = feature_str
    logging.debug('=====load raw training Feature Done.=====')
    logging.debug('=====loading status map.=====')
    statusMap = {}  # (adid, userid) -> label from genStatus(click, impression)
    for line in file(fn_ad2userStatus):
        adid, userid, click, impression = line.strip().split('\t')
        click = int(click)
        impression = int(impression)
        status = genStatus(click, impression)
        statusMap[(adid, userid)] = status
    logging.debug('=====join final data start=====')
    output = file(fn_out_SVMRanking, 'w')
    format = '%d qid:%d %s\n'
    adid2Idx = {}  # adid -> 1-based qid used in the SVM-rank output
    #line number of userid4SVMRanking equals to output4SVMRanking's
    #fn_userid4SVMRanking = TMP_DATA_DIR_PATH+'userid4SVMRanking.dat'
    userid_output = file(fn_userid4SVMRanking, 'w')
    #fn_ad2UsersGivenAdSet = TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict'
    idx = 1
    for line in file(fn_ad2UsersGivenAdSet):
        adid, user_str = line.strip().split('\x01')
        if adid not in adid2Idx:
            adid2Idx[adid] = idx
            idx += 1
        userids = user_str.split('\t')
        for userid in userids:
            # Emit only users that have both a feature row and a status label
            if userid not in userFeature or (adid, userid) not in statusMap:
                continue
            userid_output.write('%s\n' % userid)
            output.write(format % (statusMap[(adid, userid)], adid2Idx[adid], userFeature[userid]))
    output.close()
    userid_output.close()
    dumpDict2File(adid2Idx, TMP_DATA_DIR_PATH + 'adid2Idx.dict')
import __init__
import sys
sys.path.append('../')
from util import logging


class dataParser:
    """Static helpers for parsing KDD-Cup training rows.

    Duplicates an earlier dataParser definition in this file.
    """

    @staticmethod
    def parseTrainData(line):
        """Split a tab-separated training row into typed fields.

        Returns None when the row does not have exactly 12 fields; otherwise
        a 12-tuple with Click, Impression, Depth and Position as ints.
        """
        fields = line.strip().split('\t')
        if len(fields) != 12:
            return None
        (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
         Position, QueryID, KeywordID, TitleID, DescriptionID, UserID) = fields
        return (int(Click), int(Impression), Display_url,
                AdID, AdvertiserID, int(Depth),
                int(Position), QueryID, KeywordID,
                TitleID, DescriptionID, UserID)


if __name__ == '__main__':
    example_row = '0\t1\t4298118681424644510\t7686695\t385\t3\t3\t1601\t5521\t7709\t576\t490234'
    logging.debug(dataParser.parseTrainData(example_row))
def getsockname_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    Sets:
    addr: a struct sockaddr populated with the requested information
    addrlen: length of the sockaddr struct being populated
    return value: 0 (success) or -1 (failure)
    errno

    Not Implemented:
    * Use address validator to check the addresses
    """
    logging.debug('Entering getsockname handler')
    # Pull out the info that we can check
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    # We don't compare params[1] because it is the address of an empty buffer
    # We don't compare params[2] because it is the address of an out parameter
    # Get values from trace for comparison
    fd_from_trace = syscall_object.args[0].value
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Decide if this is a file descriptor we want to deal with
    # Skip the real syscall; the traced result is injected below
    noop_current_syscall(pid)
    if syscall_object.ret[0] != -1:
        logging.debug('Got successful getsockname call')
        addr = params[1]
        length_addr = params[2]
        length = int(syscall_object.args[2].value.strip('[]'))
        logging.debug('Addr: %d', addr & 0xffffffff)
        logging.debug('Length addr: %d', length_addr & 0xffffffff)
        logging.debug('Length: %d', length)
        # The traced sockaddr argument is parsed as (family, port, ip) fields
        sockfields = syscall_object.args[1].value
        family = sockfields[0].value
        port = int(sockfields[1].value)
        ip = sockfields[2].value
        logging.debug('Family: %s', family)
        logging.debug('Port: %d', port)
        logging.debug('Ip: %s', ip)
        if family != 'AF_INET':
            raise NotImplementedError('getsockname only supports '
                                      'AF_INET')
        # Write the traced sockaddr into the replayed process's memory
        cint.populate_af_inet_sockaddr(pid, addr, port, ip,
                                       length_addr, length)
    else:
        logging.debug('Got unsuccessful getsockname call')
    apply_return_conditions(pid, syscall_object)
def joinResult4SVMRanking(fn_trainFeature, fn_ad2userStatus, fn_out_SVMRanking, fn_userRawExpandTokens, fn_userid4SVMRanking, fn_ad2UsersGivenAdSet):
    '''
    fn_trainFeature=TMP_DATA_DIR_PATH+'LDA_corpus.svmlight'
    fn_ad2userStatus=TMP_DATA_DIR_PATH+'ad2userStatus.dict'
    fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'

    Builds SVM-rank rows '<status> qid:<ad index> <features>' by joining
    per-user features with per-(ad, user) status labels. Duplicates an
    earlier definition in this file.
    '''
    logging.info('=====joinResult4SVMRanking Start=====')
    userFeature = {}  # userid -> feature string kept for training
    userlist = []
    #fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'
    for line in file(fn_userRawExpandTokens):
        userid, query, title, desc = line.strip().split('\x01')
        userlist.append(userid)
    # Positional join: the Nth feature line is assumed to belong to the Nth
    # userid -- presumably guaranteed by the producing pipeline; verify
    trainFeature = file(fn_trainFeature)
    for userid in userlist:
        fields = trainFeature.readline().strip().split(' ', 1)
        if len(fields) != 2:
            continue
        tmp, feature_str = fields
        # Filter out users with too few (<= 5) feature entries
        if len(feature_str.split()) <= 5:
            continue
        userFeature[userid] = feature_str
    logging.debug('=====load raw training Feature Done.=====')
    logging.debug('=====loading status map.=====')
    statusMap = {}  # (adid, userid) -> genStatus(click, impression)
    for line in file(fn_ad2userStatus):
        adid, userid, click, impression = line.strip().split('\t')
        click = int(click)
        impression = int(impression)
        status = genStatus(click, impression)
        statusMap[(adid, userid)] = status
    logging.debug('=====join final data start=====')
    output = file(fn_out_SVMRanking, 'w')
    format = '%d qid:%d %s\n'
    adid2Idx = {}  # adid -> 1-based qid, assigned in file order
    #line number of userid4SVMRanking equals to output4SVMRanking's
    #fn_userid4SVMRanking = TMP_DATA_DIR_PATH+'userid4SVMRanking.dat'
    userid_output = file(fn_userid4SVMRanking, 'w')
    #fn_ad2UsersGivenAdSet = TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict'
    idx = 1
    for line in file(fn_ad2UsersGivenAdSet):
        adid, user_str = line.strip().split('\x01')
        if adid not in adid2Idx:
            adid2Idx[adid] = idx
            idx += 1
        userids = user_str.split('\t')
        for userid in userids:
            if userid not in userFeature or (adid, userid) not in statusMap:
                continue
            userid_output.write('%s\n' % userid)
            output.write(format % (statusMap[(adid, userid)], adid2Idx[adid], userFeature[userid]))
    output.close()
    userid_output.close()
    dumpDict2File(adid2Idx, TMP_DATA_DIR_PATH+'adid2Idx.dict')
def svm_rank_classify(features, model, predictions):
    """Shell out to svm_rank_classify over the given features and model,
    writing scores to the predictions file."""
    logging.info(('=='*10 + '%s' + '=='*10) % ('START SVM CLASSIFING'))
    cmd_text = '%s %s %s %s' % (
        SVM_RANK.svm_rank_classify_command, features, model, predictions)
    logging.debug(cmd_text)
    os.system(cmd_text)
def getpeername_entry_handler(syscall_id, syscall_object, pid): logging.debug('Entering getpeername handler') # Pull out the info that we can check ecx = cint.peek_register(pid, cint.ECX) params = extract_socketcall_parameters(pid, ecx, 3) fd = params[0] # We don't compare params[1] because it is the address of an empty buffer # We don't compare params[2] because it is the address of an out parameter # Get values from trace for comparison fd_from_trace = syscall_object.args[0].value # Check to make sure everything is the same if fd != int(fd_from_trace): raise ReplayDeltaError( 'File descriptor from execution ({}) ' 'does not match file descriptor from trace ({})'.format( fd, fd_from_trace)) # Decide if this is a file descriptor we want to deal with noop_current_syscall(pid) if syscall_object.ret[0] != -1: logging.debug('Got successful getpeername call') addr = params[1] length_addr = params[2] length = int(syscall_object.args[2].value.strip('[]')) logging.debug('Addr: %d', addr) logging.debug('Length addr: %d', length_addr) logging.debug('Length: %d', length) sockfields = syscall_object.args[1].value family = sockfields[0].value port = int(sockfields[1].value) ip = sockfields[2].value logging.debug('Family: %s', family) logging.debug('Port: %d', port) logging.debug('Ip: %s', ip) if family != 'AF_INET': raise NotImplementedError('getpeername only ' 'supports AF_INET') cint.populate_af_inet_sockaddr(pid, addr, port, ip, length_addr, length) else: logging.debug('Got unsuccessful getpeername call') apply_return_conditions(pid, syscall_object)