Exemplo n.º 1
0
def query_expand(docs, unit_selector, query):
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)
            
    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    previous_entropy_sent = 0
    previous_entropy_unit = 0
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        if entropy_sent >= previous_entropy_sent or entropy_unit >= previous_entropy_unit:
            break
        previous_entropy_sent = entropy_sent
        previous_entropy_unit = entropy_unit
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        #if iter == 2: break
        if dist < 0.0001:
            sys.stderr.write('----------------------------')
            break
        
    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
Exemplo n.º 2
0
def query_expand(docs, unit_selector, query):
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)
            
    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    prev_unit_entropy = 0
    prev_sent_entropy = 0
    prev_unit_values = {}
    prev_sent_values = {}
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolate with original sent weights
        sent_prior = 0.1
        for sent in sent_values:
            new_value = (sent_prior * original_sent_values[sent]) + ( (1-sent_prior) * sent_values[sent])
            #sent_values[sent] = new_value

        #prob_util.Counter(unit_values).displaySorted(N=100)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        
        if iter == 2: break
        
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy): 
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values
        
        if dist < 0.0001: break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
Exemplo n.º 3
0
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue

            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue

            sents.append(sent)
            
    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()
        
        ## get unit values from doc values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        if iter == 2: break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
Exemplo n.º 4
0
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue

            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue

            sents.append(sent)
            
    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()
        
        ## get unit values from doc values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        if iter == 2: break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)