Пример #1
0
    def get_surround_text(self, index):
        '''
        Return the text surrounding a node's url in the response body it
        was found in.

        input: node's index in the "treeRelation" list

        output: a (front, back, start) tuple.
            front, back: the characters immediately before / after the url.
                Both always have the same length: self.SURROUND when that
                many characters are available on each side, otherwise the
                largest width that fits on both sides.
            start:
                0 means the url is a root (front and back are " ").
                -1 means the url is not found: the entry has no response
                   body text (front and back are "").
                Otherwise, "start" is the url start position in the
                response body text.
        '''
        item_position = self.indexList[index]
        if item_position < 0:
            # The url is a root. Answer before loading the .har file, since
            # nothing from it is needed for this result.
            # NOTE(review): a root recorded for entry 0 would be stored as
            # -0 == 0 and would not take this branch -- confirm how
            # indexList is built.
            return (" ", " ", 0)

        data = up.readJason(self.filename)
        content = data['log']['entries'][item_position]['response']['content']
        # "in" instead of dict.has_key(): has_key() was removed in Python 3.
        if 'text' not in content:
            return ("", "", -1)

        text = content['text']
        start = self.positionInText[index][0]
        end = self.positionInText[index][1]

        # Exactly `start` characters precede the url. The previous limit of
        # start + 1 produced a slice beginning at index -1, which wraps
        # around to the end of the text and yields an empty/wrong front.
        # The back limit (len(text) - end - 1) is kept as it was, so the
        # very last character of the text is never part of `back`.
        # NOTE(review): confirm whether `end` is exclusive; if so the back
        # limit could be len(text) - end.
        width = max(0, min(self.SURROUND, start, len(text) - end - 1))
        front = text[start - width:start]
        back = text[end:end + width]

        return (front, back, start)
Пример #2
0
 def get_surround_text(self, index):
     '''
     Return the text surrounding a node's url in the response body it
     was found in.

     input: node's index in the "treeRelation" list

     output: a (front, back, start) tuple.
         front, back: the characters immediately before / after the url.
             Both always have the same length: self.SURROUND when that
             many characters are available on each side, otherwise the
             largest width that fits on both sides.
         start:
             0 means the url is a root (front and back are " ").
             -1 means the url is not found: the entry has no response
                body text (front and back are "").
             Otherwise, "start" is the url start position in the
             response body text.
     '''
     item_position = self.indexList[index]
     if item_position < 0:
         # The url is a root. Answer before loading the .har file, since
         # nothing from it is needed for this result.
         # NOTE(review): a root recorded for entry 0 would be stored as
         # -0 == 0 and would not take this branch -- confirm how
         # indexList is built.
         return (" ", " ", 0)

     data = up.readJason(self.filename)
     content = data['log']['entries'][item_position]['response']['content']
     # "in" instead of dict.has_key(): has_key() was removed in Python 3.
     if 'text' not in content:
         return ("", "", -1)

     text = content['text']
     start = self.positionInText[index][0]
     end = self.positionInText[index][1]

     # Exactly `start` characters precede the url. The previous limit of
     # start + 1 produced a slice beginning at index -1, which wraps
     # around to the end of the text and yields an empty/wrong front.
     # The back limit (len(text) - end - 1) is kept as it was, so the
     # very last character of the text is never part of `back`.
     # NOTE(review): confirm whether `end` is exclusive; if so the back
     # limit could be len(text) - end.
     width = max(0, min(self.SURROUND, start, len(text) - end - 1))
     front = text[start - width:start]
     back = text[end:end + width]

     return (front, back, start)
Пример #3
0
def get_Tree(PATH, dumpPATH, stop):
    '''
    Build a Tree from the request/response entries recorded in a .har file.

    input:
        PATH: .har file path. The .har file records the traffic.
        dumpPATH: A .txt file. This file records a matrix whose format is
            "treeplotVec; url; timestamp". Here it is only handed on to the
            Tree (key 'dumpPath'); nothing is written to it.
        stop: A Stop_url(stopURL.py) object. Entries whose request url makes
            stop.is_stopURL() true are printed and skipped.

    output:
        a Tree object, constructed from a dict of parallel lists (one
        element per node) plus the keys 'filename' and 'dumpPath'.

    For a node created from entry i (currentItem = data['log']['entries'][i]):
        treeContent -> up.drop_variation( currentItem['request']['url'] )
        original_treeContent -> currentItem['request']['url']
        indexList -> i (or -i if the node is root of a tree)
        wait_interval -> currentItem['timings']['wait'] (ms)
        mimeType -> currentItem['response']['content']['mimeType']
    Nodes created from a url found in a response text carry placeholders
    (wait_interval -1, size -2, empty mimeType/timestamp) until an entry
    requesting that url is matched by judge_if_existing().

    NOTE(review): a root created from entry 0 gets indexList value -0 == 0,
    which cannot be told apart from "found in the text of entry 0" by sign
    alone; positionInText == (-2, -2) still marks it as a root.
    '''
    entries = up.readJason(PATH)['log']['entries']

    treeRelation = []  # matlab-treeplot()-like vector: 0 for a root, else 1-based position of the parent node
    treeContent = []  # simplified url corresponding to "treeRelation"
    original_treeContent = []  # original url corresponding to "treeRelation"
    indexList = []  # index of the entry in whose content the url was found, -i for a root node
    wait_interval = []  # request-response interval of the corresponding page
    mimeType = []  # mimeType of the corresponding page
    treeTimestamp = []  # result of up.get_fiddle_timestamp(), or (u'', -1, u'') placeholder
    positionInText = []  # (begin, end) position of the url in the text, (-2, -2) for a root node
    size = []  # response content size, -2 placeholder for urls only seen in a text

    for i, currentItem in enumerate(entries):
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation(ori_requestURL)
        if stop.is_stopURL(ori_requestURL):
            # Single-argument print(...) gives the same output on Python 2
            # and also parses on Python 3.
            print("StopURL: %s" % (ori_requestURL,))
            continue

        content = currentItem['response']['content']

        # process this request-response pair
        # process request part
        ifInTree, location = judge_if_existing(treeContent, wait_interval,
                                               requestURL)
        if ifInTree:  # the requested url has already been pushed in the tree
            root = location + 1  # 1-based parent index for urls in the response text
            # section below is used to debug: the matched node is expected
            # to still carry the negative wait_interval placeholder
            if wait_interval[location] >= 0:
                # The Python 2 print statement writes no separating space
                # after an item ending in '\t', hence no space before %s.
                print('PATH 0f file:\t%s' % (PATH,))
                print('entity index:\t%s' % (i,))
                print('requested URL:\t%s' % (ori_requestURL,))
                print('URL of the existing node:\t%s' % (treeContent[location],))
                print('node location in array:\t%s' % (location,))
                print('root of this node:\t%s' % (treeRelation[location],))
                print('value of the existing node:\t%s' % (wait_interval[location],))
                # Pauses for the developer. NOTE(review): on Python 2,
                # input() evaluates whatever is typed; kept as in the
                # original debug aid.
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            # fill in the placeholders of the existing node
            wait_interval[location] = currentItem['timings']['wait']
            mimeType[location] = content['mimeType']
            treeTimestamp[location] = up.get_fiddle_timestamp(
                currentItem['startedDateTime'])
            size[location] = content['size']
            treeContent[location] = requestURL
            original_treeContent[location] = ori_requestURL
        else:
            # unknown url: it becomes the root of a new tree
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append(currentItem['timings']['wait'])
            mimeType.append(content['mimeType'])
            treeTimestamp.append(
                up.get_fiddle_timestamp(currentItem['startedDateTime']))
            indexList.append(-i)
            positionInText.append((-2, -2))
            size.append(content['size'])
            root = len(treeContent)  # 1-based index of the node just added

        # process response part
        # "in" instead of dict.has_key(): has_key() was removed in Python 3.
        if 'text' in content:
            for url, start_pos, end_pos, _count in up.get_urlSet_from_text(
                    content['text']):
                # every url found in the text becomes a child of `root`
                treeRelation.append(root)
                treeContent.append(up.drop_variation(url))  #[Q2]
                original_treeContent.append(url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'', -1, u''))
                positionInText.append((start_pos, end_pos))
                size.append(-2)
        else:
            # no body text recorded for this entry: report index and mimeType
            print(i)
            print(content['mimeType'])

    tree_info_mat = {
        'treeRelation': copy.deepcopy(treeRelation),
        'treeContent': copy.deepcopy(treeContent),
        'indexList': copy.deepcopy(indexList),
        'original_treeContent': copy.deepcopy(original_treeContent),
        'wait_interval': copy.deepcopy(wait_interval),
        'mimeType': copy.deepcopy(mimeType),
        'filename': PATH,
        'treeTimestamp': copy.deepcopy(treeTimestamp),
        'dumpPath': dumpPATH,
        'positionInText': copy.deepcopy(positionInText),
        'size': copy.deepcopy(size),
    }

    return Tree(tree_info_mat)
Пример #4
0
def get_Tree(PATH, dumpPATH, stop):
    '''
    Build a Tree from the request/response entries recorded in a .har file.

    input:
        PATH: .har file path. The .har file records the traffic.
        dumpPATH: A .txt file. This file records a matrix whose format is
            "treeplotVec; url; timestamp". Here it is only handed on to the
            Tree (key 'dumpPath'); nothing is written to it.
        stop: A Stop_url(stopURL.py) object. Entries whose request url makes
            stop.is_stopURL() true are printed and skipped.

    output:
        a Tree object, constructed from a dict of parallel lists (one
        element per node) plus the keys 'filename' and 'dumpPath'.

    For a node created from entry i (currentItem = data['log']['entries'][i]):
        treeContent -> up.drop_variation( currentItem['request']['url'] )
        original_treeContent -> currentItem['request']['url']
        indexList -> i (or -i if the node is root of a tree)
        wait_interval -> currentItem['timings']['wait'] (ms)
        mimeType -> currentItem['response']['content']['mimeType']
    Nodes created from a url found in a response text carry placeholders
    (wait_interval -1, size -2, empty mimeType/timestamp) until an entry
    requesting that url is matched by judge_if_existing().

    NOTE(review): a root created from entry 0 gets indexList value -0 == 0,
    which cannot be told apart from "found in the text of entry 0" by sign
    alone; positionInText == (-2, -2) still marks it as a root.
    '''
    entries = up.readJason(PATH)['log']['entries']

    treeRelation = []  # matlab-treeplot()-like vector: 0 for a root, else 1-based position of the parent node
    treeContent = []  # simplified url corresponding to "treeRelation"
    original_treeContent = []  # original url corresponding to "treeRelation"
    indexList = []  # index of the entry in whose content the url was found, -i for a root node
    wait_interval = []  # request-response interval of the corresponding page
    mimeType = []  # mimeType of the corresponding page
    treeTimestamp = []  # result of up.get_fiddle_timestamp(), or (u'', -1, u'') placeholder
    positionInText = []  # (begin, end) position of the url in the text, (-2, -2) for a root node
    size = []  # response content size, -2 placeholder for urls only seen in a text

    for i, currentItem in enumerate(entries):
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation(ori_requestURL)
        if stop.is_stopURL(ori_requestURL):
            # Single-argument print(...) gives the same output on Python 2
            # and also parses on Python 3.
            print("StopURL: %s" % (ori_requestURL,))
            continue

        content = currentItem['response']['content']

        # process this request-response pair
        # process request part
        ifInTree, location = judge_if_existing(treeContent, wait_interval,
                                               requestURL)
        if ifInTree:  # the requested url has already been pushed in the tree
            root = location + 1  # 1-based parent index for urls in the response text
            # section below is used to debug: the matched node is expected
            # to still carry the negative wait_interval placeholder
            if wait_interval[location] >= 0:
                # The Python 2 print statement writes no separating space
                # after an item ending in '\t', hence no space before %s.
                print('PATH 0f file:\t%s' % (PATH,))
                print('entity index:\t%s' % (i,))
                print('requested URL:\t%s' % (ori_requestURL,))
                print('URL of the existing node:\t%s' % (treeContent[location],))
                print('node location in array:\t%s' % (location,))
                print('root of this node:\t%s' % (treeRelation[location],))
                print('value of the existing node:\t%s' % (wait_interval[location],))
                # Pauses for the developer. NOTE(review): on Python 2,
                # input() evaluates whatever is typed; kept as in the
                # original debug aid.
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            # fill in the placeholders of the existing node
            wait_interval[location] = currentItem['timings']['wait']
            mimeType[location] = content['mimeType']
            treeTimestamp[location] = up.get_fiddle_timestamp(
                currentItem['startedDateTime'])
            size[location] = content['size']
            treeContent[location] = requestURL
            original_treeContent[location] = ori_requestURL
        else:
            # unknown url: it becomes the root of a new tree
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append(currentItem['timings']['wait'])
            mimeType.append(content['mimeType'])
            treeTimestamp.append(
                up.get_fiddle_timestamp(currentItem['startedDateTime']))
            indexList.append(-i)
            positionInText.append((-2, -2))
            size.append(content['size'])
            root = len(treeContent)  # 1-based index of the node just added

        # process response part
        # "in" instead of dict.has_key(): has_key() was removed in Python 3.
        if 'text' in content:
            for url, start_pos, end_pos, _count in up.get_urlSet_from_text(
                    content['text']):
                # every url found in the text becomes a child of `root`
                treeRelation.append(root)
                treeContent.append(up.drop_variation(url))  #[Q2]
                original_treeContent.append(url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'', -1, u''))
                positionInText.append((start_pos, end_pos))
                size.append(-2)
        else:
            # no body text recorded for this entry: report index and mimeType
            print(i)
            print(content['mimeType'])

    tree_info_mat = {
        'treeRelation': copy.deepcopy(treeRelation),
        'treeContent': copy.deepcopy(treeContent),
        'indexList': copy.deepcopy(indexList),
        'original_treeContent': copy.deepcopy(original_treeContent),
        'wait_interval': copy.deepcopy(wait_interval),
        'mimeType': copy.deepcopy(mimeType),
        'filename': PATH,
        'treeTimestamp': copy.deepcopy(treeTimestamp),
        'dumpPath': dumpPATH,
        'positionInText': copy.deepcopy(positionInText),
        'size': copy.deepcopy(size),
    }

    return Tree(tree_info_mat)
Пример #5
0
 def read_json(self, path):
     '''
     Load the file at "path" with up.readJason() and return its result.

     input: path of the file to read
     output: whatever up.readJason(path) returns
     '''
     loaded = up.readJason(path)
     return loaded
Пример #6
0
 def read_json(self, path):
     '''
     Load the file at "path" with up.readJason() and return its result.

     input: path of the file to read
     output: whatever up.readJason(path) returns
     '''
     loaded = up.readJason(path)
     return loaded