import re
import sys
import json

# Sentence and Model are project-local text-model classes, and `ratio` is an
# edit-distance ratio (assumed to come from python-Levenshtein); the exact
# import paths below are assumptions:
# from text_models import Sentence, Model
# from Levenshtein import ratio


def get_sentence_window(entity_map, sentence, windows):
    """Use the whole sentence as the context."""
    for w in entity_map:
        if sentence.find(w) != -1:
            temp_sentence = sentence
            # Strip every entity mention (canonical form when one exists,
            # surface form otherwise) so the window holds only context words.
            for t in entity_map:
                if entity_map[t]:
                    temp_sentence = temp_sentence.replace(entity_map[t], "")
                elif temp_sentence.find(t) != -1:
                    temp_sentence = temp_sentence.replace(t, "")
            # Index the window under the canonical entity name when one exists.
            if entity_map[w]:
                w = entity_map[w]
            if w not in windows:
                windows[w] = Sentence(temp_sentence,
                                      remove_stopwords=True).stemmed_model
            else:
                windows[w] += Sentence(temp_sentence,
                                       remove_stopwords=True).stemmed_model
def get_text_window(entity_map, sentence, windows, window_size):
    """Use a fixed-size text window around each entity as the context."""
    for w in entity_map:
        if sentence.find(w) != -1:
            temp_sentence = sentence
            # Remove all *other* entity mentions; skip entities that overlap
            # with the current one.
            for t in entity_map:
                if t.find(w) != -1 or w.find(t) != -1:
                    continue
                if entity_map[t]:
                    temp_sentence = temp_sentence.replace(entity_map[t], "")
                elif temp_sentence.find(t) != -1:
                    temp_sentence = temp_sentence.replace(t, "")
            if entity_map[w]:
                w = entity_map[w]
            w_size = w.count(" ") + 1
            temp_sentence = re.sub(" +", " ", temp_sentence)
            temp_sentence += ' '  # ensure the last token is followed by a space
            spaces = [m.start() for m in re.finditer(' ', temp_sentence)]
            # Escape w so regex metacharacters (e.g. '.') are taken literally.
            for m in re.finditer(re.escape(w), temp_sentence):
                start = m.start() - 1
                if start in spaces:
                    # Entity is mid-sentence: window_size tokens on each side.
                    w_start = max(0, spaces.index(start) - window_size)
                    w_end = min(len(spaces) - 1,
                                spaces.index(start) + window_size + w_size)
                    window_string = (temp_sentence[spaces[w_start]:m.start() - 1]
                                     + " " + temp_sentence[m.end() + 1:spaces[w_end]])
                else:
                    # Entity starts the sentence: only a right-hand window exists.
                    w_end = min(len(spaces) - 1, window_size + w_size - 1)
                    try:
                        window_string = temp_sentence[m.end() + 1:spaces[w_end]]
                    except IndexError:
                        print("sentence is %s" % sentence)
                        print("temp sentence is %s" % temp_sentence)
                        print("m_end and w_end: %d %d" % (m.end(), w_end))
                        sys.exit(-1)
                if w not in windows:
                    windows[w] = Sentence(window_string,
                                          remove_stopwords=True).stemmed_model
                else:
                    windows[w] += Sentence(window_string,
                                           remove_stopwords=True).stemmed_model
def get_text_window(words, document, windows, window_size):
    """Use a fixed-size token window as the context."""
    spaces = [m.start() for m in re.finditer(' ', document)]
    for w in words:
        w_size = w.count(" ") + 1
        if w not in windows:
            windows[w] = []
        # Escape w so regex metacharacters in the term are taken literally.
        for m in re.finditer(re.escape(w), document):
            start = m.start() - 1
            if start in spaces:
                # Term is mid-document: window_size tokens on each side.
                w_start = max(0, spaces.index(start) - window_size)
                w_end = min(len(spaces) - 1,
                            spaces.index(start) + window_size + w_size)
                window_string = (document[spaces[w_start]:m.start() - 1]
                                 + " " + document[m.end() + 1:spaces[w_end]])
            else:
                # Term starts the document: only a right-hand window exists.
                w_end = min(len(spaces) - 1, window_size + w_size - 1)
                window_string = document[m.end() + 1:spaces[w_end]]
            windows[w].append(
                Sentence(window_string, remove_stopwords=True).stemmed_text)
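# The two get_text_window variants above share the same space-offset windowing
# idea. Below is a minimal, self-contained sketch of that idea using plain
# token lists instead of the project-specific Sentence/Model classes; the
# demo_window name and its return type are illustrative assumptions, not part
# of the original code.
def demo_window(term, text, window_size):
    """Collect up to window_size context tokens on each side of every
    occurrence of term in text."""
    tokens = text.split()
    term_tokens = term.split()
    contexts = []
    for i in range(len(tokens) - len(term_tokens) + 1):
        if tokens[i:i + len(term_tokens)] == term_tokens:
            left = tokens[max(0, i - window_size):i]
            right = tokens[i + len(term_tokens):i + len(term_tokens) + window_size]
            contexts.append(" ".join(left + right))
    return contexts

# Example: demo_window("New York", "I flew to New York late last night", 2)
# returns ['flew to late last'].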
# measure_method is assumed to be a module-level configuration string.
def compute_measure(tweet_text, previous_text):
    if measure_method == "cosine-sim":
        t1_model = Sentence(tweet_text).raw_model
        t2_model = Sentence(previous_text).raw_model
        sim = t1_model.cosine_sim(t2_model)
    elif measure_method == "set-sim":
        t1_model = Sentence(tweet_text).raw_model
        t2_model = Sentence(previous_text).raw_model
        sim = compute_term_diff(t1_model, t2_model)
    elif measure_method == "edit-sim":
        sim = ratio(tweet_text, previous_text)
    else:
        raise NotImplementedError("Measure %s is not implemented" % measure_method)
    return sim
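# For reference, a plain-dict version of the cosine similarity that the
# "cosine-sim" branch above presumably delegates to via raw_model.cosine_sim
# (the Model internals are project-specific, so this is a sketch under that
# assumption, not the actual implementation).
import math

def cosine_sim_counts(m1, m2):
    """Cosine similarity between two {term: count} dictionaries."""
    dot = sum(count * m2.get(term, 0) for term, count in m1.items())
    norm1 = math.sqrt(sum(c * c for c in m1.values()))
    norm2 = math.sqrt(sum(c * c for c in m2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

# Example: cosine_sim_counts({"storm": 2, "hits": 1}, {"storm": 1, "town": 1})
# returns 2 / (sqrt(5) * sqrt(2)) ~= 0.632.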
def get_sentence_window(words, sentence, windows):
    """Use the whole sentence as the context."""
    for w in words:
        if sentence.find(w) != -1:
            temp_sentence = sentence
            # Strip every tracked word so the window holds only context terms.
            for t in words:
                if temp_sentence.find(t) != -1:
                    temp_sentence = temp_sentence.replace(t, "")
            if w not in windows:
                windows[w] = Sentence(temp_sentence,
                                      remove_stopwords=True).stemmed_model
            else:
                windows[w] += Sentence(temp_sentence,
                                       remove_stopwords=True).stemmed_model
def get_all_words(example_result_tuples):
    word_model = Model(True, need_stem=True)
    for single_tuple in example_result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=True).stemmed_model
    word_model.normalize()
    return word_model
def get_word_features(judged_data_file, normalize):
    judged_data = json.load(open(judged_data_file))
    feature_data = []
    for single_data in judged_data:
        if "result_tuples" not in single_data:
            raise RuntimeError("data does not have a result_tuples field")
        # Only use sentences in which clauses/verbs could be detected.
        if single_data["result_tuples"]:
            sentence = single_data["sentence"]
            sentence_model = Sentence(re.sub("\n", " ", sentence),
                                      remove_stopwords=False).raw_model
            if normalize:
                sentence_model.to_dirichlet()
            single_data.pop("sentence", None)
            single_data["word_features"] = sentence_model.model
            feature_data.append(single_data)
    return feature_data
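# An illustrative (hypothetical) entry from judged_data_file as consumed by
# get_word_features; only the "sentence" and "result_tuples" keys are taken
# from the code above, and the values here are made up for illustration:
#
# {
#     "sentence": "The governor declared a state of emergency.",
#     "result_tuples": [["governor", "declared", "state of emergency"]],
#     ...any other judgment fields are carried through unchanged...
# }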
def get_all_words(result_tuples):
    word_model = Model(False, need_stem=False)
    for single_tuple in result_tuples:
        word_model += Sentence(single_tuple['sentence'],
                               remove_stopwords=False).raw_model
    word_model.to_dirichlet()
    return word_model
def get_all_words(tuple_results):
    words = {}
    for identifier in tuple_results:
        word_model = Model(True, need_stem=True)
        for single_tuple in tuple_results[identifier]:
            word_model += Sentence(single_tuple['sentence'],
                                   remove_stopwords=True).stemmed_model
        word_model.normalize()
        # Aggregate each identifier's normalized weights into a global tally.
        for word in word_model.model:
            if word not in words:
                words[word] = 0
            words[word] += word_model.model[word]
    return words
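# A plain-dict sketch of the normalize-then-aggregate pattern above, without
# the project's Model class (the aggregate_normalized name and interface are
# assumptions of this illustration):
def aggregate_normalized(counts_per_group):
    """Sum per-group relative term frequencies so that large groups do not
    dominate the global tally."""
    totals = {}
    for counts in counts_per_group:
        total = float(sum(counts.values()))
        if not total:
            continue
        for word, count in counts.items():
            totals[word] = totals.get(word, 0.0) + count / total
    return totals

# Example: aggregate_normalized([{"flood": 3, "road": 1}, {"flood": 1}])
# returns {'flood': 1.75, 'road': 0.25}.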
def get_sentence_window(entity_map, sentence, windows):
    """Use the whole sentence as the context."""
    for w in entity_map:
        if sentence.find(w) != -1:
            # Index the window under the canonical entity name when one exists.
            if entity_map[w]:
                w = entity_map[w]
            if w not in windows:
                windows[w] = Model(True, need_stem=True)
            windows[w] += Sentence(re.sub("\n", " ", sentence),
                                   remove_stopwords=True).stemmed_model