Example No. 1
def concept_compare(mapper, gold_mapper):
    """
    compare mapper's concepts to the gold concepts
    """
    ## get concepts for the gold mapper (mapper should already be done)
    gold_mapper.map_concepts()
    gold_mapper.choose_sents()
    gold_mapper.format_output()

    for update_index in [0]:
        print 'update [%d]' %update_index

        gold_sorted_keys = prob_util.Counter(gold_mapper.concept_weight_sets[update_index]).sortedKeys()
        for concept in gold_sorted_keys:
            gold_weight = gold_mapper.concept_weight_sets[update_index][concept]
        try: heuristic_weight = mapper.concept_weight_sets[update_index][concept]
        except (KeyError, IndexError): heuristic_weight = 0
        print 'my[%1.2f] gold[%1.2f]  [%s]' %(heuristic_weight, gold_weight, ' '.join(concept))
            
        heur_sorted_keys = prob_util.Counter(mapper.concept_weight_sets[update_index]).sortedKeys()
        for concept in heur_sorted_keys:
            if concept in gold_sorted_keys: continue
            heuristic_weight = mapper.concept_weight_sets[update_index][concept]
            print 'my[%1.2f] gold[%1.2f]  [%s]' %(heuristic_weight, 0, ' '.join(concept))
        print '----------------------------'
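
A minimal usage sketch for the comparison above, assuming a loaded `problem` object and the `concept_mapper` module used in the later examples; the candidate mapper is assumed to run through the same pipeline before being compared against a gold (cheating) mapper. Names such as `problem` are placeholders.

## usage sketch (hypothetical `problem`; mapper classes appear in Examples No. 3 and No. 9)
mapper = concept_mapper.HeuristicMapper(problem, 'n2')
mapper.map_concepts()
mapper.choose_sents()
mapper.format_output()

gold_mapper = concept_mapper.CheatingMapper(problem, 'n2')
concept_compare(mapper, gold_mapper)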
Example No. 2
def get_full_concepts(docs, query):
    """
    """
    ## get sentence set
    sents = []
    used_sents = set()
    for doc in docs:
        for sent in doc.sentences:
            ## ignore duplicate sentences
            sent_stemmed_str = ' '.join(sent.stemmed)
            if sent_stemmed_str in used_sents: continue
            used_sents.add(sent_stemmed_str)
            sents.append(sent)

    ngrams = prob_util.Counter()
    for i in range(len(sents) - 1):
        for j in range(i + 1, len(sents)):
            matches = get_overlaps(sents[i], sents[j])
            for match in matches:
                if text.text_processor.is_just_stopwords(match): continue
                ngrams[match] += 1
                #ngrams[match] += (10 ** (len(match)-1)) / 1000.0

    for ngram, count in ngrams.items():
        if count <= 1: ngrams.pop(ngram)
        else: ngrams[ngram] = (1.0 / 10000) * count * (10**len(ngram) - 1)

    ngrams.displaySorted(N=40)
    return ngrams
Example No. 3
def make_query(problem):
    """
    """
    mapper = concept_mapper.HeuristicMapper(problem, 'n1')
    mapper.map_concepts()
    concepts = prob_util.Counter(mapper.concepts).sortedKeys()
    concepts = [c[0] for c in concepts]
    return concepts
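
For illustration, a hedged sketch of calling make_query over a task's problems (a `task` object with a `problems` list appears in Example No. 9):

## usage sketch: print the top-ranked unigram query terms per problem
for problem in task.problems:
    query_terms = make_query(problem)
    print problem.id, ' '.join(query_terms[:10])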
Example No. 4
    def map_concepts(self):
        """
        """
        #do_train = True
        do_train = False

        ## get features in boostexter format
        lines, concepts, concept_freq = setup_features(self.problem,
                                                       self.unit_selector,
                                                       train=do_train)

        ## write to file
        filename = '../train/%s.data' % self.problem.id
        fh = open(filename, 'w')
        fh.write('\n'.join(lines) + '\n')
        fh.close()

        if do_train:
            return

        ## classify
        model_stem = '../train/all'
        cmd = '%s -S %s -C < %s' % (BOOSTING_LEARNER, model_stem, filename)
        results = os.popen(cmd).readlines()
        concept_weights = prob_util.Counter()
        all_concept_weights = {}
        for i in range(len(results)):
            score = float(results[i].split()[-1])
            concept_weights[concepts[i]] += score
            if not concepts[i] in all_concept_weights:
                all_concept_weights[concepts[i]] = []
            all_concept_weights[concepts[i]].append(score)

        #concept_weights.displaySorted(N=1000)

        ## pruning
        final_concept_weights = {}
        count = 0
        for key in concept_weights.sortedKeys()[:300]:
            count += 1
            value = concept_weights[key]
            if value <= 0: break

            ## final weight: mean classifier score scaled by document frequency
            mean_value = sum(all_concept_weights[key]) / len(
                all_concept_weights[key])
            final_concept_weights[key] = mean_value * concept_freq[key]
            if count <= 10: print key, mean_value * concept_freq[key]

        print 'concepts used: %d' % count

        self.concept_sets = [final_concept_weights]
Example No. 5
def map_iterative_docs(docs, unit_selector, query):

    ## initialize uniform doc priors
    doc_values = prob_util.Counter()
    for doc in docs:
        doc_values[doc.docid] = 1
    doc_values = doc_values.makeProbDist()

    ## get units in each doc
    doc_units = {}
    used_sents = set()
    for doc in docs:
        doc_units[doc.docid] = prob_util.Counter()
        for sent in doc.sentences:

            if query:
                sim = sent.sim_basic(query)
            else:
                sim = 1
            if sim <= 0: continue

            units = unit_selector(sent.stemmed)
            for unit in units:
                if text.text_processor.is_just_stopwords(unit): continue

                doc_units[doc.docid][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_doc_values = doc_values.copy()

        ## get unit values from doc values
        unit_values = prob_util.Counter()
        for doc in doc_units:
            for unit in doc_units[doc]:
                unit_values[unit] += doc_values[doc]
        unit_values = unit_values.makeProbDist()

        ## get doc values from unit values
        doc_values = prob_util.Counter()
        for doc in doc_units:
            for unit in doc_units[doc]:
                doc_values[doc] += unit_values[unit] / len(doc_units[doc])
                #print '%d, %s %1.4f %d' %(iter, unit, unit_values[unit], len(doc_units[doc]))
        doc_values = doc_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(doc_values).displaySorted(N=10)

        ## check for convergence (note: currently cut off after a single iteration by the break below)
        if iter == 1: break
        dist = prob_util.euclidianDistance(prev_doc_values, doc_values)
        print 'dist [%1.6f]' % dist
        if dist < 0.0001: break

    #sys.exit()

    return prob_util.Counter(unit_values), prob_util.Counter(doc_values)
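
A hedged sketch of driving the iterative document/unit weighting above. The bigram unit selector is an assumption built from util.get_ngrams (seen in Example No. 6); the real project may supply its own selector, and `problem` is a placeholder.

## usage sketch: bigram units over a problem's new documents (selector is an assumption)
unit_selector = lambda stemmed: util.get_ngrams(stemmed, 2, bounds=False, as_string=False)
unit_values, doc_values = map_iterative_docs(problem.new_docs, unit_selector, problem.query)
unit_values.displaySorted(N=10)
doc_values.displaySorted(N=5)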
Example No. 6
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    ## write the vocab mapping (concept -> integer index)
    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, index in vocab:
        fh.write('%s %d\n' %(concept, index))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
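
A hedged invocation of prep_docs; the paths are hypothetical, and the per-document lines printed to stdout would normally be redirected into a file alongside the written vocab:

## usage sketch (hypothetical paths); run with stdout redirected, e.g. python prep_docs.py > ../train/docs.dat
prep_docs('../data/sents/', '../train/')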
Example No. 7
    def map_concepts(self):
        """
        """

        ## get document statistics
        concept_sets = []
        sent_count = 0
        used_sents = set()

        for doc_set in [self.problem.new_docs]:
            concept_set = prob_util.Counter()
            concept_set, doc_values = query_expand(doc_set, self.unit_selector, self.problem.query)
            concept_sets.append(concept_set)

        ## apply a few transformations
        max_concepts = 60
        max_concept_sum = 0.5
        self.concept_sets = []
        for update_index in range(len(concept_sets)):
        
            final_concept_set = {}
            num_used_concepts = 0
            concept_sum = 0

            for concept in concept_sets[update_index].sortedKeys():
                score = concept_sets[update_index][concept]

                ## don't include more than max_concepts
                if num_used_concepts >= max_concepts: break
                if concept_sum >= max_concept_sum: 
                    print 'concepts used: %d' %num_used_concepts
                    break

                remove = False
                
                ## downweight concepts appearing in previous sets
                #for prev_index in range(update_index):
                #    if concept in concept_sets[prev_index]: score = 0.5*score

                ## add to final concept set
                if not remove:
                    final_concept_set[concept] = score
                    num_used_concepts += 1
                    concept_sum += score
                    #print count, concept

            self.concept_sets.append(final_concept_set)
Example No. 8
def make_concepts_exp(id, path, sents, query):
    """
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >> sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' % (
            iter, se, ce)
        ## note: breaking after the first pass disables the re-estimation steps below
        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
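
A hedged sketch of calling make_concepts_exp for one problem; `problem`, the sentence list, and the output path are placeholders, and the sentences are assumed to carry the attributes used above (tok2, length, order):

## usage sketch (hypothetical names and path)
sents = problem.get_new_sentences()
make_concepts_exp(problem.id, '../ilp/', sents, problem.query.original)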
Example No. 9
def run_standard(options, max_sents=10000):

    ## create output directory
    try:
        os.popen('rm -rf %s' % options.output)
    except:
        pass
    try:
        os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if '-A' not in problem.id: continue
            sys.stderr.write(
                "%s %d\n" %
                (problem.id,
                 sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(
                problem,
                concept_weights,
                length=task.length_limit,
                sentences=mapper.relevant_sents,
                longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$',
                            variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write(
                "%s %d\n" %
                (problem.id,
                 sum([len(doc.sentences) for doc in problem.new_docs])))

            # compute idf values
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([
                    sentence.original for sentence in doc.sentences
                    if sentence.used
                ])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    sentence.rel_score = sentence.sim_cosine(
                        centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff
            sentences.sort(key=lambda s: s.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" %
                                 (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" %
                                         (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" %
                                         (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" %
                                       (sentence.index, peer.index)] = [
                                           sentence, peer
                                       ]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(
                length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[
                                    peer] == 0 or peer == variable or type(
                                        program.binary[peer]) != type(
                                            sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(
                                    program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (
                                program.binary[variable].index,
                                program.binary[peer].index)
                            if quadratic not in program.output or program.output[
                                    quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (
                                    variable, quadratic)

                    else:
                        score -= program.binary[variable][0].sim_cosine(
                            program.binary[variable][1], word_idf)
                        if program.output[
                                "s%d" %
                                program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][0].index)
                        if program.output[
                                "s%d" %
                                program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)
            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()

    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if '-A' not in problem.id: continue

            if options.ir:
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(
                    set([d.id for d in problem.ir_docs
                         ]).intersection(set([d.id
                                              for d in problem.new_docs])))
                print '%s overlap: %d' % (problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' %
                             (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(
                    problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences -
                        len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]

            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' %
                             (word_count, curr_time))
Example No. 10
def setup_features(problem, unit_selector, train=True):

    ## for training, get gold concepts
    gold_concepts = prob_util.Counter()
    if train:
        for annotator in problem.annotators:
            annotator_concepts = {}
            for sent in problem.training[annotator]:
                sentence = text.Sentence(sent)
                units = unit_selector(sentence.stemmed)
                for unit in units:
                    if unit not in annotator_concepts: annotator_concepts[unit] = 0
                    annotator_concepts[unit] += 1
            for concept in annotator_concepts:
                gold_concepts[concept] += 1

    ## get all sentences and unit frequencies
    sents = []
    doc_freq = prob_util.Counter()
    sent_freq = prob_util.Counter()
    raw_freq = prob_util.Counter()
    for doc in problem.new_docs:
        #if doc.doctype != 'NEWS STORY': continue

        doc_counts = prob_util.Counter()
        for sent in doc.sentences:
            sent_counts = prob_util.Counter()
            sents.append(sent)
            for unit in unit_selector(sent.stemmed):
                doc_counts[unit] += 1
                sent_counts[unit] += 1

            for unit in sent_counts:
                sent_freq[unit] += 1

        for unit in doc_counts:
            doc_freq[unit] += 1
            raw_freq[unit] += doc_counts[unit]

    ## get features for each concept unit
    lines = []
    concepts = []

    title = text.Sentence(problem.title)
    narr = text.Sentence(problem.narr)

    for sent in sents:
            
        ## sentence features
        sentence_sim = sent.sim_basic(problem.query)
        sentence_order = sent.order
        sentence_source = sent.source
        sentence_length = sent.length
            
        units = unit_selector(sent.stemmed)
        for unit in units:
                
            ## concept features
            stopword_ratio = 1 - (1.0*len(text.text_processor.remove_stopwords(unit)) / len(unit))
            doc_ratio = 1.0 * doc_freq[unit] / len(problem.new_docs)
            sent_ratio = 1.0 * sent_freq[unit] / len(sents)
            ngram = ' '.join(unit)

            sunit = text.Sentence(ngram)
            title_sim = sunit.sim_basic(title)
            narr_sim = sunit.sim_basic(narr)

            ## output format (boostexter)
            line = '%s, %1.2f, %1.2f, %1.2f, ' %(ngram, doc_ratio, sent_ratio, stopword_ratio)
            line += '%1.2f, %d, %s, %d, ' %(sentence_sim, sentence_order, sentence_source, sentence_length)
            line += '%1.2f, %1.2f, ' %(title_sim, narr_sim)
            if train: line += '%s' %int(gold_concepts[unit]>0)
            else: line += '0'
            line += '.'
            
            if stopword_ratio == 1: continue

            lines.append(line)
            concepts.append(unit)
            for rep in range(int(gold_concepts[unit]-1)):
                if train:
                    lines.append(line)
                    concepts.append(unit)
    return lines, concepts, doc_freq
Example No. 11
def query_expand(docs, unit_selector, query):
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)
            
    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        ## fall back to a uniform weight when query similarity is unavailable
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    prev_unit_entropy = 0
    prev_sent_entropy = 0
    prev_unit_values = {}
    prev_sent_values = {}
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolate with original sent weights
        ## (currently a no-op: the assignment below is commented out)
        sent_prior = 0.1
        for sent in sent_values:
            new_value = (sent_prior * original_sent_values[sent]) + ( (1-sent_prior) * sent_values[sent])
            #sent_values[sent] = new_value

        #prob_util.Counter(unit_values).displaySorted(N=100)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        
        ## note: forced cut-off after two iterations, before the entropy-based stopping test
        if iter == 2: break
        
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy): 
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values
        
        if dist < 0.0001: break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
Example No. 12
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue

            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue

            sents.append(sent)
            
    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()
        
        ## get unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f]  unit entropy [%1.4f]  sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        ## note: forced cut-off after two iterations, before the distance-based stopping test
        if iter == 2: break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
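
The sentence-level variant can be driven like the document-level version in Example No. 5; a hedged sketch with the same assumed bigram selector and placeholder `problem`:

## usage sketch: rank units and sentences for one problem (same assumptions as Example No. 5)
unit_selector = lambda stemmed: util.get_ngrams(stemmed, 2, bounds=False, as_string=False)
unit_values, sent_values = map_iterative_sents(problem.new_docs, unit_selector, problem.query)
sent_values.displaySorted(N=3)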
Example No. 13
    def map_concepts(self):
        """
        """
        min_count = 3
        use_log_weights = False

        ## get document statistics
        concept_sets = []
        sent_count = 0
        used_sents = set()

        for doc_set in [self.problem.new_docs]:
            concept_set = prob_util.Counter()
            for doc in doc_set:
                #if doc.doctype != 'NEWS STORY': continue
                doc_concepts = {}
                for sent in doc.sentences:
                    
                    sent_count += 1

                    ## ignore short sentences
                    if sent.length < self.min_sent_length: continue

                    ## ignore duplicate sentences
                    sent_stemmed_str = ' '.join(sent.stemmed)
                    if sent_stemmed_str in used_sents: continue
                    used_sents.add(sent_stemmed_str)

                    ## don't consider sentences with no query overlap
                    if self.problem.query:
                        sim = sent.sim_basic(self.problem.query)
                    else: sim = 1
                    if sim <= 0: continue

                    ## TODO: using sent.stemmed -- could make this more general
                    units = self.unit_selector(sent.stemmed)

                    for unit in units:
                        if not unit in doc_concepts: doc_concepts[unit] = 0
                        doc_concepts[unit] += 1         # simple count

                use_doc_freq = len(doc_set) > min_count

                for concept, count in doc_concepts.items():
                    if not concept in concept_set: concept_set[concept] = 0
                    if use_doc_freq: concept_set[concept] += 1      # doc frequency
                    else: concept_set[concept] += count             # raw frequency

            concept_sets.append(concept_set)

        ## apply a few transformations
        self.concept_sets = []
        for update_index in range(len(concept_sets)):
        
            final_concept_set = {}
            num_used_concepts = 0

            for concept in concept_sets[update_index].sortedKeys():
                count = concept_sets[update_index][concept]

                remove = False
                
                ## remove low frequency concepts
                if count < min_count: remove = True

                ## remove stop word concepts (word ngrams only!)
                if self.unit_name[0] in ['n', 's']:
                    if text.text_processor.is_just_stopwords(concept): remove = True

                ## use log weights
                if use_log_weights: score = math.log(count, 2)
                else: score = count

                ## add to final concept set
                if not remove:
                    final_concept_set[concept] = score
                    num_used_concepts += 1

            self.concept_sets.append(final_concept_set)
Example No. 14
def get_values(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try:
            sent_values[sent.original] = sent.sim_basic(query)
        except:
            ## fall back to a uniform weight when query similarity is unavailable
            sent_values[sent.original] = 1
    #sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent and co-occurrences of units
    sent_units = {}
    co_units = prob_util.CondCounter()
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1
            for co_unit in units:
                if unit == co_unit: continue
                co_units[unit][co_unit] += 1

    ## get new unit values from sent values
    unit_values = prob_util.Counter()
    for sent in sent_units:
        for unit in sent_units[sent]:
            #unit_values[unit] += sent_values[sent]
            unit_values[unit] += 1

    ## greedy procedure for removing co-occurrence values
    curr_unit_values = unit_values.copy()
    new_unit_values = prob_util.Counter()
    while len(curr_unit_values) > 0:  ## guard against running out of units
        best_unit = curr_unit_values.sortedKeys()[0]
        new_unit_values[best_unit] = curr_unit_values[best_unit]
        print best_unit, new_unit_values[best_unit]
        curr_unit_values.pop(best_unit)
        for unit in curr_unit_values:
            new_val = curr_unit_values[unit] - co_units[best_unit][unit]
            if new_val > 1: curr_unit_values[unit] = new_val
        if len(curr_unit_values) == 0: break
        if max(curr_unit_values.values()) < 2: break
        if len(new_unit_values) >= 65: break

    unit_values = new_unit_values
    print '--------------', len(unit_values)

    return unit_values, sent_values

    ## note: the code below is unreachable (kept from an earlier iterative version)
    ## get sent values from unit values
    sent_values = prob_util.Counter()
    for sent in sent_units:
        for unit in sent_units[sent]:
            sent_values[sent] += unit_values[unit]  #/ len(sent_units[sent])
    sent_values = sent_values.makeProbDist()