예제 #1
0
def test_overlap():
    """Check the ``last`` offset of each block returned by ``remove_overlap``.

    Six blocks are built from the ``overlap`` fixtures and passed to
    ``duplcheck.remove_overlap``.

    Expected outcome:
        blocks [0], [2], [3], [4] -> ``last`` is cut back to a value below
                                     the block's ``end``
        blocks [1], [5]           -> ``last`` is left unchanged
    """
    def load_chatdata(filename):
        # Element [1] of the parser result is what each block stores as
        # its chat data.
        fixture = "tests/testdata/extract_duplcheck/overlap/" + filename
        return parser.parse(json.loads(_open_file(fixture)))[1]

    # (first, last, end, fixture file, extra keyword arguments)
    layout = (
        (0, 12771, 9890, "dp0-0.json", {}),
        (9890, 15800, 20244, "dp0-1.json", {}),
        (20244, 45146, 32476, "dp0-2.json", {}),
        (32476, 50520, 41380, "dp0-3.json", {}),
        (41380, 62875, 52568, "dp0-4.json", {}),
        # Only the final block is flagged as the last one.
        (52568, 62875, 54000, "dp0-5.json", {"is_last": True}),
    )
    blocks = tuple(
        Block(first=first,
              last=last,
              end=end,
              chat_data=load_chatdata(name),
              **extra)
        for first, last, end, name, extra in layout)

    result = duplcheck.remove_overlap(blocks)

    expected_last = (
        # dp0-0.json has an item whose offset is 9890
        # (== blocks[0].end == blocks[1].first), but ``last`` must be
        # aligned to the closest smaller value: 9779.
        9779,
        15800,
        32196,
        41116,
        52384,
        # The last block must always be added to the result.
        62875,
    )
    for index, expected in enumerate(expected_last):
        assert result[index].last == expected
예제 #2
0
def test_duplicate_tail():
    """Check which blocks survive ``duplcheck.remove_duplicate_tail``.

    Expected processing of the six input blocks:
        [0]        -> appended
        [0] , [1]  -> [1] discarded
        [1] , [2]  -> [2] appended
        [2] , [3]  -> [3] discarded
        [3] , [4]  -> [4] appended
        [4] , [5]  -> [5] discarded

    Result: [0] , [2] , [4]

    NOTE(review): the fixtures are read from the ``head`` directory even
    though this test targets the tail check -- confirm this is intended.
    """
    def load_chatdata(filename):
        # Element [1] of the parser result is what each block stores as
        # its chat data.
        fixture = "tests/testdata/extract_duplcheck/head/" + filename
        return parser.parse(json.loads(_open_file(fixture)))[1]

    # (first, last, fixture file); offsets inside the chat data are ignored.
    layout = (
        (0, 2500, "dp0-0.json"),
        (1500, 2500, "dp0-1.json"),
        (10000, 45146, "dp0-2.json"),
        (20244, 45146, "dp0-3.json"),
        (20244, 62875, "dp0-4.json"),
        (52568, 62875, "dp0-5.json"),
    )
    blocks = tuple(
        Block(first=first, last=last, chat_data=load_chatdata(name))
        for first, last, name in layout)

    result = duplcheck.remove_duplicate_tail(blocks)
    _dump(result)

    assert len(result) == 3
    # result[n] must carry the range of the n-th surviving input block.
    for position, source_index in enumerate((0, 2, 4)):
        survivor = result[position]
        origin = blocks[source_index]
        assert survivor.first == origin.first
        assert survivor.last == origin.last
예제 #3
0
def test_split_1():
    """Case ``patch.first <= parent_block.last``: the patch is discarded.

    While the worker is awaiting at run() -> asyncdl._fetch(), fetching of
    the parent block keeps going and ``parent_block.last`` moves past
    ``patch.first``.  The fetched patch is then thrown away entirely and
    the worker looks for another block to process.

    ~~~~~~ before ~~~~~~

     @parent_block
                          patch.first
      first                 |    last                  end
       |####################|#####|---------------------|
                            ^
     @child_block
     first = last = 0                                  end (=parent_end)
       |                                                |

     @fetched patch
                            |-- patch --|

                            |
                            |
                            V

    ~~~~~~ after ~~~~~~

     @parent_block
     first                       last                  end
       |###########################|--------------------|

     @child_block
                            .............. ->  discard all data
    """
    parent_block = Block(first=0,
                         last=33000,
                         end=60000,
                         continuation='parent',
                         during_split=True)
    child_block = Block(first=0,
                        last=0,
                        end=60000,
                        continuation='mean',
                        during_split=True)
    # 32500 <= 33000: the patch starts inside the range the parent
    # has already fetched.
    fetched_patch = Patch(chats=load_chatdata('pt0-5.json'),
                          first=32500,
                          last=34000,
                          continuation='patch')

    split(parent_block, child_block, fetched_patch)

    # The parent's range is not changed.
    assert (parent_block.last, parent_block.end) == (33000, 60000)
    # The child gets no continuation.
    assert child_block.continuation is None
    assert parent_block.during_split is False
    # The child keeps during_split == True (original note: "exclude
    # during_split sequence").
    assert child_block.during_split is True
예제 #4
0
def test_split_0():
    """Normal case: the patch lies beyond what the parent already fetched.

    ~~~~~~ before ~~~~~~

     @parent_block  (# = already fetched)
     first    last                                     end
       |########----------------------------------------|

     @child_block
     first = last = 0                                  end (=parent_end)
       |                                                |

     @fetched patch
                            |-- patch --|

                            |
                            |
                            V

    ~~~~~~ after ~~~~~~

     @parent_block
     first    last         end (after split)
       |########------------|

     @child_block
                          first       last            end
                            |###########---------------|

     @fetched patch
                            |-- patch --|
    """
    parent_block = Block(first=0,
                         last=4000,
                         end=60000,
                         continuation='parent',
                         during_split=True)
    child_block = Block(first=0,
                        last=0,
                        end=60000,
                        continuation='mean',
                        during_split=True)
    fetched_patch = Patch(chats=load_chatdata('pt0-5.json'),
                          first=32500,
                          last=34000,
                          continuation='patch')

    split(parent_block, child_block, fetched_patch)

    # The child takes over the patch's continuation.
    assert child_block.continuation == 'patch'
    # The parent now ends exactly where the child begins ...
    assert parent_block.end == child_block.first
    # ... and the offsets are strictly ordered as in the "after" diagram.
    assert (parent_block.last < child_block.first
            < child_block.last < child_block.end)
    # Both blocks have left the split state.
    assert parent_block.during_split is False
    assert child_block.during_split is False
예제 #5
0
def test_split_2():
    """Case ``child_block.end < patch.last``: the patch overshoots the child.

    The last offset of the patch exceeds ``child_block.end``.  In this case
    the overlapping data of the patch is removed and the child receives no
    continuation.

    ~~~~~~ before ~~~~~~

     @parent_block  (# = already fetched)
     first    last                           end (before split)
       |########------------------------------|

     @child_block
     first = last = 0                        end (=parent_end)
       |                                      |

     @fetched patch
                            |-------- patch --------|

                            |
                            |
                            V

    ~~~~~~ after ~~~~~~

     @parent_block
     first    last         end (after split)
       |########------------|

     @child_block                                  old patch.end
                          first            last=end |
                            |#################|......   cut extra data.
                                                    ^
     continuation : None (extract complete)

     @fetched patch
                            |-------- patch --------|

    NOTE(review): the diagram marks ``last=end`` for the child, while the
    assertions below require ``child.last < child.end`` -- confirm which
    one reflects the intended contract.
    """
    parent = Block(first=0,
                   last=4000,
                   end=33500,
                   continuation='parent',
                   during_split=True)
    child = Block(first=0,
                  last=0,
                  end=33500,
                  continuation='mean',
                  during_split=True)
    # patch.last (34000) lies beyond child.end (33500).
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500,
                  last=34000,
                  continuation='patch')

    split(parent, child, patch)

    # The child gets no continuation.  This was asserted twice in the
    # original; once is enough.
    assert child.continuation is None
    # The parent is cut at the start of the child.
    assert parent.last < child.first
    assert parent.end == child.first
    # The child's range is populated and stays inside its end.
    assert child.first < child.last
    assert child.last < child.end
    # Both blocks have left the split state.
    assert parent.during_split is False
    assert child.during_split is False