def get_all_words(example_result_tuples):
    """Merge the stemmed models of all example sentences into one model.

    Every item of ``example_result_tuples`` is indexed with ``'sentence'``.
    That value is wrapped in a ``Sentence`` built with
    ``remove_stopwords=True`` and its ``stemmed_model`` is added to a
    running ``Model``.

    Args:
        example_result_tuples: Iterable of mappings with a ``'sentence'`` key.

    Returns:
        The accumulated ``Model`` after a single call to ``normalize()``.
    """
    combined = Model(True, need_stem=True)
    for entry in example_result_tuples:
        sentence = Sentence(entry['sentence'], remove_stopwords=True)
        combined += sentence.stemmed_model
    # Normalize once, after every sentence has been folded in.
    combined.normalize()
    return combined
def get_all_words(example_result_tuples):
    """Build one normalized model from the sentences of the given tuples.

    For each element, the ``'sentence'`` entry is turned into a ``Sentence``
    (constructed with ``remove_stopwords=True``) whose ``stemmed_model`` is
    added to an accumulator ``Model`` created with ``need_stem=True``.

    Args:
        example_result_tuples: Iterable of mappings providing ``'sentence'``.

    Returns:
        The accumulator ``Model``, normalized after the loop finishes.
    """
    accumulator = Model(True, need_stem=True)
    for item in example_result_tuples:
        stemmed = Sentence(item['sentence'], remove_stopwords=True).stemmed_model
        accumulator += stemmed
    accumulator.normalize()
    return accumulator
def get_single_model(candidate_dir):
    """Load per-file models from the JSON files directly inside a directory.

    Every non-directory entry at the top level of ``candidate_dir`` is parsed
    as a JSON object.  Each accepted key of that object is mapped to a
    ``Model`` built from the key's value (passed as ``text_dict``) and then
    normalized.

    Args:
        candidate_dir: Directory whose top-level files are JSON documents.

    Returns:
        dict: ``{file_name: {key: normalized Model}}``.  A file that yields
        no accepted keys maps to an empty dict.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``candidate_dir``
            (for example, when the directory does not exist).
    """
    candidate_models = {}
    # The built-in next() works on Python 2 and 3; generator.next() is 2-only.
    file_names = next(os.walk(candidate_dir))[2]
    for a_file in file_names:
        candidate_models[a_file] = {}
        # Context manager guarantees the handle is closed, even if the
        # JSON parse raises.
        with open(os.path.join(candidate_dir, a_file)) as handle:
            temp_model = json.load(handle)
        for w in temp_model:
            # NOTE(review): this tests the key against the outer dict, which
            # is keyed by file name, so a key equal to a file name seen so far
            # is skipped.  Behavior kept as-is; confirm whether
            # candidate_models[a_file] was intended.
            if w not in candidate_models:
                temp = Model(True, text_dict=temp_model[w],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                candidate_models[a_file][w] = temp
    return candidate_models
def get_all_words(tuple_results):
    """Sum per-identifier normalized word weights into one dictionary.

    For every identifier in ``tuple_results`` a fresh ``Model`` is filled
    with the ``stemmed_model`` of each entry's ``'sentence'`` (wrapped in a
    ``Sentence`` built with ``remove_stopwords=True``) and normalized.  The
    values found in that model's ``model`` mapping are then added to the
    running totals.

    Args:
        tuple_results: Mapping of identifier -> iterable of mappings that
            each provide a ``'sentence'`` key.

    Returns:
        dict: word -> sum of its per-identifier normalized values.
    """
    totals = {}
    for key in tuple_results:
        per_key_model = Model(True, need_stem=True)
        for entry in tuple_results[key]:
            sentence = Sentence(entry['sentence'], remove_stopwords=True)
            per_key_model += sentence.stemmed_model
        per_key_model.normalize()
        weights = per_key_model.model
        for term in weights:
            # Unseen terms start from 0 before the weight is added.
            totals[term] = totals.get(term, 0) + weights[term]
    return totals
def get_verbs(tuple_result):
    """Sum per-identifier normalized verb weights into one dictionary.

    For every identifier a fresh ``Model`` is updated with each entry's
    ``'verb'`` value, skipping verbs contained in ``NO_NEED``.  The model is
    normalized and the values of its ``model`` mapping are added to the
    running totals.

    Args:
        tuple_result: Mapping of identifier -> iterable of mappings that
            each provide a ``'verb'`` key.

    Returns:
        dict: verb -> sum of its per-identifier normalized values.
    """
    totals = {}
    for key in tuple_result:
        model_for_key = Model(True)
        for entry in tuple_result[key]:
            candidate = entry['verb']
            if candidate in NO_NEED:
                continue
            model_for_key.update(text_list=[candidate])
        model_for_key.normalize()
        weights = model_for_key.model
        for term in weights:
            # Unseen verbs start from 0 before the weight is added.
            totals[term] = totals.get(term, 0) + weights[term]
    return totals
def get_model_for_entities(source_dir):
    """Build one accumulated model per entity type from JSON files.

    Every non-directory entry at the top level of ``source_dir`` is parsed
    as JSON of the form ``{entity_type: {entity: text_dict}}``.  For each
    entity a ``Model`` is built from its ``text_dict``, normalized, and added
    to the model stored for its entity type.

    Args:
        source_dir: Directory whose top-level files are JSON documents.

    Returns:
        dict: entity_type -> accumulated ``Model``.  The accumulated model is
        not normalized again after the per-entity models are added.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``source_dir``
            (for example, when the directory does not exist).
    """
    models = {}
    # The built-in next() works on Python 2 and 3; generator.next() is 2-only.
    for instance in next(os.walk(source_dir))[2]:
        # Context manager guarantees the handle is closed, even if the
        # JSON parse raises.
        with open(os.path.join(source_dir, instance)) as handle:
            data = json.load(handle)
        for entity_type in data:
            if entity_type not in models:
                # Seed the type with an empty, normalized model so that
                # per-entity models can be added to it below.
                temp = Model(True, need_stem=True)
                temp.normalize()
                models[entity_type] = temp
            for entity in data[entity_type]:
                temp = Model(True, text_dict=data[entity_type][entity],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models
def get_models(results, index_dir):
    """Build a normalized term mapping for every (qid, day) pair.

    For each day of each query id, the text of every listed id is fetched
    with ``get_text(index_dir, tid)``; non-empty texts are fed into a fresh
    ``Model``.  After normalization the model's ``model`` attribute is
    stored under ``[qid][day]``.

    Args:
        results: Nested mapping ``qid -> day -> iterable of ids``.
        index_dir: Passed through unchanged to ``get_text``.

    Returns:
        dict: ``{qid: {day: normalized model mapping}}``.
    """
    models = {}
    for qid in results:
        per_day = models.setdefault(qid, {})
        for day in results[qid]:
            day_model = Model(True, need_stem=True)
            for tid in results[qid][day]:
                text = get_text(index_dir, tid)
                if not text:
                    # Nothing retrievable for this id; skip it.
                    continue
                day_model.update(text_string=text)
            day_model.normalize()
            per_day[day] = day_model.model
    return models
def get_single_model(candidate_dir):
    """Read each JSON file in a directory into a dict of normalized models.

    The top-level non-directory entries of ``candidate_dir`` are each parsed
    as a JSON object.  For every accepted key, the key's value is passed as
    ``text_dict`` to a new ``Model``, which is normalized and stored under
    ``[file_name][key]``.

    Args:
        candidate_dir: Directory whose top-level files are JSON documents.

    Returns:
        dict: ``{file_name: {key: normalized Model}}``.  A file that yields
        no accepted keys maps to an empty dict.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``candidate_dir``
            (for example, when the directory does not exist).
    """
    candidate_models = {}
    # The built-in next() works on Python 2 and 3; generator.next() is 2-only.
    files = next(os.walk(candidate_dir))[2]
    for a_file in files:
        candidate_models[a_file] = {}
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(candidate_dir, a_file)) as handle:
            temp_model = json.load(handle)
        for w in temp_model:
            # NOTE(review): the membership test is against the outer dict,
            # which is keyed by file name, so a key that equals a file name
            # seen so far is skipped.  Behavior kept as-is; confirm whether
            # candidate_models[a_file] was intended.
            if w not in candidate_models:
                temp = Model(True, text_dict=temp_model[w],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                candidate_models[a_file][w] = temp
    return candidate_models
def get_model_for_entities(source_dir):
    """Accumulate entity models per entity type from a directory of JSON.

    Each top-level non-directory entry of ``source_dir`` is parsed as JSON
    shaped like ``{entity_type: {entity: text_dict}}``.  A ``Model`` is built
    from every entity's ``text_dict``, normalized, and added to the model
    kept for that entity type.

    Args:
        source_dir: Directory whose top-level files are JSON documents.

    Returns:
        dict: entity_type -> accumulated ``Model``.  The accumulated model is
        not normalized again after the per-entity models are added.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``source_dir``
            (for example, when the directory does not exist).
    """
    models = {}
    # The built-in next() works on Python 2 and 3; generator.next() is 2-only.
    for instance in next(os.walk(source_dir))[2]:
        # Close the file deterministically instead of leaking the handle.
        with open(os.path.join(source_dir, instance)) as handle:
            data = json.load(handle)
        for entity_type in data:
            if entity_type not in models:
                # First sighting of this type: start from an empty,
                # normalized model that later models are added to.
                temp = Model(True, need_stem=True)
                temp.normalize()
                models[entity_type] = temp
            for entity in data[entity_type]:
                temp = Model(True, text_dict=data[entity_type][entity],
                             need_stem=True, input_stemmed=True)
                temp.normalize()
                models[entity_type] += temp
    return models
def get_all_verbs(example_result_tuples):
    """Build one normalized model from the verbs of the given tuples.

    Each tuple's ``'verb'`` is used as-is when its ``'verb_label'`` is
    ``'VB'``; otherwise it is first passed through
    ``WordNetLemmatizer().lemmatize(word, 'v')``.  The result is converted
    with ``str`` and fed to the model.

    Args:
        example_result_tuples: Iterable of mappings providing ``'verb'`` and
            ``'verb_label'``.

    Returns:
        The accumulated ``Model`` after a single call to ``normalize()``.

    Raises:
        SystemExit: With status 1 if updating the model raises ``TypeError``;
            the offending word, its type and the tuple are printed first.
    """
    verb_model = Model(True, need_stem=True)
    # Created on first use and then reused, instead of building a new
    # lemmatizer for every tuple.
    lemmatizer = None
    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            if lemmatizer is None:
                lemmatizer = WordNetLemmatizer()
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("Wrong Word!")
            print(word)
            print(type(word))
            print(single_tuple)
            # Non-zero status: this is a failure, not a clean shutdown.
            sys.exit(1)
    verb_model.normalize()
    return verb_model
def get_all_verbs(example_result_tuples):
    """Collect the verbs of all tuples into a single normalized model.

    A tuple whose ``'verb_label'`` is not ``'VB'`` has its ``'verb'`` passed
    through ``WordNetLemmatizer().lemmatize(word, 'v')`` first.  Every verb
    is converted with ``str`` before it is added to the model.

    Args:
        example_result_tuples: Iterable of mappings providing ``'verb'`` and
            ``'verb_label'``.

    Returns:
        The accumulated ``Model`` after a single call to ``normalize()``.

    Raises:
        SystemExit: With status 1 if updating the model raises ``TypeError``;
            the offending word, its type and the tuple are printed first.
    """
    verb_model = Model(True, need_stem=True)
    # Built lazily and shared across iterations rather than per tuple.
    lemmatizer = None
    for single_tuple in example_result_tuples:
        word = single_tuple['verb']
        if single_tuple['verb_label'] != 'VB':
            if lemmatizer is None:
                lemmatizer = WordNetLemmatizer()
            word = lemmatizer.lemmatize(word, 'v')
        try:
            verb_model.update(text_list=[str(word)])
        except TypeError:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("Wrong Word!")
            print(word)
            print(type(word))
            print(single_tuple)
            # Report the failure to the caller's shell with a non-zero code.
            sys.exit(1)
    verb_model.normalize()
    return verb_model
def get_sub_features(model, size):
    """Return the top-weighted terms of a nested model collection.

    All models found at ``model[instance][w]`` are added into one ``Model``,
    which is normalized.  Its terms are sorted by value, largest first, and
    the first ``size`` of them are returned.

    Args:
        model: Nested mapping ``instance -> w -> Model``.
        size: Number of terms to keep.  If the running count never equals
            ``size`` (e.g. ``size`` is larger than the number of terms, or
            not positive), every term is kept.

    Returns:
        dict: term -> normalized value for the selected terms.
    """
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()

    ranked = sorted(data.model.items(), key=lambda pair: pair[1], reverse=True)
    features = {}
    # enumerate(..., 1) replaces the hand-maintained counter.
    for count, (term, value) in enumerate(ranked, 1):
        features[term] = value
        if count == size:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("break when i is %s" % (count,))
            break
    print("get %d features" % (len(features)))
    return features
def get_sub_features(model, size):
    """Select the highest-valued terms across all nested models.

    Every model stored at ``model[instance][w]`` is added to a single
    ``Model``; after normalization its terms are ordered by descending value
    and collected until ``size`` terms have been taken.

    Args:
        model: Nested mapping ``instance -> w -> Model``.
        size: Number of terms to keep.  If the running count never equals
            ``size`` (e.g. ``size`` is larger than the number of terms, or
            not positive), every term is kept.

    Returns:
        dict: term -> normalized value for the selected terms.
    """
    data = Model(True, need_stem=True, input_stemmed=True)
    for instance in model:
        for w in model[instance]:
            data += model[instance][w]
    data.normalize()

    by_value = sorted(data.model.items(), key=lambda kv: kv[1], reverse=True)
    features = {}
    # enumerate(..., 1) replaces the hand-maintained counter.
    for taken, (term, value) in enumerate(by_value, 1):
        features[term] = value
        if taken == size:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("break when i is %s" % (taken,))
            break
    print("get %d features" % (len(features)))
    return features