Пример #1
0
def get_sentence_window(entity_map, sentence, windows):
    """
    Use the whole sentence as the context.

    Every tracked entity mention (or its mapped canonical form) is
    stripped from the sentence, and the remaining words are accumulated
    into the per-entity window model under the canonical name.
    """
    for surface in entity_map:
        if sentence.find(surface) == -1:
            continue
        context = sentence
        # Remove every known entity mention from the context sentence.
        for other in entity_map:
            if entity_map[other]:
                context = context.replace(entity_map[other], "")
            elif context.find(other) != -1:
                context = context.replace(other, "")
        # Accumulate under the canonical form when the map provides one.
        key = entity_map[surface] if entity_map[surface] else surface
        model = Sentence(context, remove_stopwords=True).stemmed_model
        if key in windows:
            windows[key] += model
        else:
            windows[key] = model
Пример #2
0
def get_text_window(entity_map, sentence, windows, window_size):
    """
    Use a sized text window around each entity mention as the context.

    Parameters
    ----------
    entity_map : dict
        Maps entity surface forms to an optional canonical form (falsy
        when there is none).
    sentence : str
        Sentence to scan for entity mentions.
    windows : dict
        Accumulator mapping entity -> stemmed word model; updated in place.
    window_size : int
        Number of tokens kept on each side of a mention.
    """
    for surface in entity_map:
        if sentence.find(surface) == -1:
            continue

        temp_sentence = sentence

        # Strip every unrelated entity mention so it does not pollute the
        # context window.  Entities overlapping the current surface form
        # are kept, otherwise the mention itself could be destroyed.
        for other in entity_map:
            if other.find(surface) != -1 or surface.find(other) != -1:
                continue
            if entity_map[other]:
                temp_sentence = temp_sentence.replace(entity_map[other], "")
            elif temp_sentence.find(other) != -1:
                temp_sentence = temp_sentence.replace(other, "")

        # Accumulate under the canonical name when one exists.
        w = entity_map[surface] if entity_map[surface] else surface

        w_size = w.count(" ") + 1

        temp_sentence = re.sub(" +", " ", temp_sentence)
        # Trailing space ensures the final token is space-terminated.
        temp_sentence += ' '
        spaces = [m.start() for m in re.finditer(' ', temp_sentence)]

        # BUG FIX: the entity is literal text, not a regex pattern --
        # escape it so metacharacters (e.g. the dot in "La. 421") do not
        # change what gets matched.
        for m in re.finditer(re.escape(w), temp_sentence):
            start = m.start() - 1
            if start in spaces:
                # Mention preceded by a space: window on both sides.
                w_start = max(0, spaces.index(start) - window_size)
                w_end = min(
                    len(spaces) - 1,
                    spaces.index(start) + window_size + w_size)
                window_string = (
                    temp_sentence[spaces[w_start]:m.start() - 1] + " " +
                    temp_sentence[m.end() + 1:spaces[w_end]])
            else:
                # Mention at the sentence start: right-hand window only.
                w_end = min(len(spaces) - 1, window_size + w_size - 1)
                try:
                    window_string = temp_sentence[m.end() + 1:spaces[w_end]]
                except IndexError:
                    # BUG FIX: parenthesized single-argument prints are
                    # valid in both Python 2 and 3 (the bare py2 print
                    # statement was py2-only); also fixes the "sentece"
                    # typo in the debug message.
                    print("sentence is %s" % sentence)
                    print("temp sentence is %s" % temp_sentence)
                    print("m_end and w_end: %d %d" % (m.end(), w_end))
                    sys.exit(-1)
            if w not in windows:
                windows[w] = Sentence(window_string,
                                      remove_stopwords=True).stemmed_model
            else:
                windows[w] += Sentence(window_string,
                                       remove_stopwords=True).stemmed_model
Пример #3
0
def get_text_window(words, document, windows, window_size):
    """
    Use a sized token window around each occurrence of a word as the
    context.

    Parameters
    ----------
    words : iterable of str
        Word / phrase strings to locate in the document.
    document : str
        Text to scan.
    windows : dict
        Accumulator mapping word -> list of stemmed window strings;
        updated in place (every word gets an entry, possibly empty).
    window_size : int
        Number of tokens kept on each side of an occurrence.
    """
    spaces = [m.start() for m in re.finditer(' ', document)]
    for w in words:
        w_size = w.count(" ") + 1
        if w not in windows:
            windows[w] = []
        # BUG FIX: the word is literal text, not a regex pattern --
        # escape it so metacharacters (".", "(", ...) match literally.
        for m in re.finditer(re.escape(w), document):
            start = m.start() - 1
            if start in spaces:
                # Occurrence preceded by a space: window on both sides.
                w_start = max(0, spaces.index(start) - window_size)
                w_end = min(
                    len(spaces) - 1,
                    spaces.index(start) + window_size + w_size)
                window_string = (
                    document[spaces[w_start]:m.start() - 1] + " " +
                    document[m.end() + 1:spaces[w_end]])
            else:
                # Occurrence at the document start: right-hand window only.
                w_end = min(len(spaces) - 1, window_size + w_size - 1)
                window_string = document[m.end() + 1:spaces[w_end]]
            windows[w].append(
                Sentence(window_string, remove_stopwords=True).stemmed_text)
Пример #4
0
    def compute_measure(tweet_text, previous_text):
        """
        Score the similarity of two texts using the configured
        ``measure_method`` (closure variable): cosine similarity of raw
        term models, set difference of raw term models, or the
        edit-distance ratio of the raw strings.
        """
        if measure_method == "edit-sim":
            return ratio(tweet_text, previous_text)
        if measure_method in ("cosine-sim", "set-sim"):
            first_model = Sentence(tweet_text).raw_model
            second_model = Sentence(previous_text).raw_model
            if measure_method == "cosine-sim":
                return first_model.cosine_sim(second_model)
            return compute_term_diff(first_model, second_model)
        raise NotImplementedError("Measure %s is not implemented"
                                  % (measure_method))
Пример #5
0
def get_sentence_window(words, sentence, windows):
    """
    Use the whole sentence (with every tracked word stripped out) as the
    context for each word that appears in it.
    """
    for target in words:
        if sentence.find(target) == -1:
            continue
        context = sentence
        # Strip every tracked word from the context sentence.
        for other in words:
            if context.find(other) != -1:
                context = context.replace(other, "")
        model = Sentence(context, remove_stopwords=True).stemmed_model
        if target in windows:
            windows[target] += model
        else:
            windows[target] = model
Пример #6
0
def get_all_words(example_result_tuples):
    """
    Build one normalized, stemmed word model over the sentences of all
    result tuples.
    """
    accumulated = Model(True, need_stem=True)
    for record in example_result_tuples:
        sentence_model = Sentence(record['sentence'],
                                  remove_stopwords=True).stemmed_model
        accumulated += sentence_model
    accumulated.normalize()
    return accumulated
def get_word_features(judged_data_file, normalize):
    """
    Load judged data from a JSON file and attach raw word-count features.

    Only entries whose clauses/verbs were detected (non-empty
    ``result_tuples``) are kept; each kept entry has its ``sentence``
    field replaced by a ``word_features`` term-count mapping.

    Parameters
    ----------
    judged_data_file : str
        Path to the JSON file of judged entries.
    normalize : bool
        When true, apply Dirichlet smoothing to each sentence model.

    Returns
    -------
    list of dict
        Kept entries augmented with a ``word_features`` field.

    Raises
    ------
    RuntimeError
        If an entry lacks the ``result_tuples`` field.
    """
    # BUG FIX: close the input file deterministically instead of leaking
    # the handle from json.load(open(...)).
    with open(judged_data_file) as data_file:
        judged_data = json.load(data_file)

    feature_data = []
    for single_data in judged_data:
        if "result_tuples" not in single_data:
            raise RuntimeError("data does not have result_tuple field")
        # Only use the sentences that clauses/verbs can be detected from.
        if single_data["result_tuples"]:
            sentence = single_data["sentence"]
            sentence_model = Sentence(re.sub("\n", " ", sentence),
                                      remove_stopwords=False).raw_model
            if normalize:
                sentence_model.to_dirichlet()

            single_data.pop("sentence", None)
            single_data["word_features"] = sentence_model.model
            feature_data.append(single_data)

    return feature_data
Пример #8
0
def get_all_words(result_tuples):
    """
    Build a Dirichlet-smoothed raw word model over all tuple sentences.
    """
    combined = Model(False, need_stem=False)
    for entry in result_tuples:
        combined += Sentence(entry['sentence'],
                             remove_stopwords=False).raw_model
    combined.to_dirichlet()
    return combined
Пример #9
0
def get_all_words(tuple_results):
    """
    Aggregate normalized, stemmed word weights across all identifiers.

    A stemmed model is built and normalized per identifier; the per-word
    weights are then summed across identifiers into one dict mapping
    word -> total weight.
    """
    aggregated = {}
    for identifier in tuple_results:
        per_id_model = Model(True, need_stem=True)
        for entry in tuple_results[identifier]:
            per_id_model += Sentence(entry['sentence'],
                                     remove_stopwords=True).stemmed_model
        per_id_model.normalize()
        for term in per_id_model.model:
            aggregated[term] = aggregated.get(term, 0) + per_id_model.model[term]
    return aggregated
Пример #10
0
def get_sentence_window(entity_map, sentence, windows):
    """
    Use the whole sentence (newlines flattened to spaces) as the context
    for every entity it mentions, accumulating stemmed word models under
    the canonical entity name.
    """
    for surface in entity_map:
        if sentence.find(surface) == -1:
            continue
        # Prefer the canonical form when the map provides one.
        key = entity_map[surface] if entity_map[surface] else surface
        if key not in windows:
            windows[key] = Model(True, need_stem=True)
        flattened = re.sub("\n", " ", sentence)
        windows[key] += Sentence(flattened,
                                 remove_stopwords=True).stemmed_model