def get_pmi_model(wc_dir, noun):
    """Load the PMI model for *noun* from *wc_dir*.

    Returns the loaded features.PMINounModel, or None when no model file
    exists for the noun in that directory.
    """
    pmi_model = features.PMINounModel()
    filename = pmi_model.lemma_to_filename(noun, wc_dir)
    # os.path.exists replaces the os.stat/except-OSError existence probe.
    if not os.path.exists(filename):
        return None
    pmi_model._load_from_pmi_file(filename)
    return pmi_model
def generate_cv_old_recursive(cv_entry, cv_entries_dict, input_dir, output_dir):
    """Build (or load from disk) the concept-vector PMI model for *cv_entry*.

    If a model file for the concept already exists under *output_dir* it is
    loaded and returned.  Otherwise the model is assembled from:
      - the PMI models of the entry's words (cv_entry.wv),
      - intersections of word pairs (cv_entry.wvi),
      - weighted parent concept models (cv_entry.p), computed via generate_cv
        (NOTE: the recursion deliberately goes through generate_cv, not this
        "_old_" variant),
    merged with union_max, saved to *output_dir*, and returned.
    """
    concept = cv_entry.concept
    # Deduplicated: get_pmi_model performs the same exists-then-load probe
    # this function previously inlined.
    cached = get_pmi_model(output_dir, concept)
    if cached is not None:
        return cached

    # Not on disk yet: gather the component models.
    wv_models = []
    for word in cv_entry.wv:
        word_model = get_pmi_model(input_dir, word)
        if word_model:
            wv_models.append(word_model)
    for word1, word2 in cv_entry.wvi:
        model1 = get_pmi_model(input_dir, word1)
        model2 = get_pmi_model(input_dir, word2)
        if model1 and model2:
            wv_models.append(model1.intersection(model2))
    for parent, weight in cv_entry.p:
        parent_model = generate_cv(cv_entries_dict[parent], cv_entries_dict,
                                   input_dir, output_dir)
        parent_model.scale(weight)
        wv_models.append(parent_model)

    # Merge everything into one model for the concept and persist it.
    concept_pmi_model = features.PMINounModel()
    concept_pmi_model.noun = concept
    for model in wv_models:
        concept_pmi_model.union_max(model)
    concept_pmi_model.save_to_file(output_dir)
    return concept_pmi_model
def get_pmi_model_OLD1(wc_dir, noun):
    """Cached variant of get_pmi_model.

    Returns the memoized model for *noun* when present in the module-level
    cached_pmi_models dict; otherwise loads it from *wc_dir*, caches it, and
    returns it.  Returns None when no model file exists.
    """
    # dict.has_key() is deprecated (removed in Python 3); `in` is equivalent
    # and works under both Python 2 and 3.
    if noun in cached_pmi_models:
        return cached_pmi_models[noun]
    pmi_model = features.PMINounModel()
    filename = pmi_model.lemma_to_filename(noun, wc_dir)
    try:
        os.stat(filename)
    except OSError:
        return None
    pmi_model._load_from_pmi_file(filename)
    cached_pmi_models[noun] = pmi_model
    return pmi_model
def get_feature_set(source_dir): print 'Loading models...' pmi_models = [] realcount = 0 feature_set = set() for n, pmi_filename in enumerate(glob.glob('%s/*.pmi.bz2' % source_dir)): print 'Loading:', n, pmi_filename pmi_model = features.PMINounModel() pmi_model._load_from_pmi_file(pmi_filename) for section in pmi_model.sections: for feature in pmi_model.__dict__[section]: if (section, feature) not in feature_set: feature_set.add((section, feature)) return feature_set
def generate_cv_entries(cv_entries_dict, input_dir, output_dir):
    """Ensure a concept-vector PMI model file exists in *output_dir* for
    every concept in *cv_entries_dict*, generating any that are missing
    via generate_cv.
    """
    nm = features.PMINounModel()
    for concept in cv_entries_dict:
        filename = nm.lemma_to_filename(concept, output_dir)
        # os.path.exists replaces the os.stat/except-OSError probe plus
        # the concept_exists_already flag.
        if not os.path.exists(filename):
            generate_cv(cv_entries_dict[concept], cv_entries_dict,
                        input_dir, output_dir)
def make_db(source_dir, c, feature_map_toidx, concept_map_toidx): for n, pmi_filename in enumerate(glob.glob('%s/*.pmi.bz2' % source_dir)): print 'Loading:', n, pmi_filename pmi_model = features.PMINounModel() pmi_model._load_from_pmi_file(pmi_filename) if pmi_model.high_fcount < 40: continue concept = pmi_model.noun for section in pmi_model.sections: for feature in pmi_model.__dict__[section]: pmi = pmi_model.__dict__[section][feature] feature_str = featuremap.feature_to_str(feature) feature_idx = feature_map_toidx[(section, feature_str)] concept_idx = concept_map_toidx[concept] c.execute('insert into words values (?, ?, ?)', (concept_idx, feature_idx, pmi))
def load_concept_model(concept):
    """Return the PMINounModel for *concept* from the module-level base_dir,
    or None when the model file cannot be read (IOError).
    """
    try:
        return features.PMINounModel(concept, base_dir)
    except IOError:
        return None
def generalize_and_save_pmi(source_dir, dest_dir):
    """Load each *.pmi model under *source_dir*, generalize it in place with
    generalize_pmi_nounmodel, and write the result to *dest_dir*.
    """
    for path in glob.glob('%s/*.pmi' % source_dir):
        model = features.PMINounModel()
        model._load_from_pmi_file(path)
        generalize_pmi_nounmodel(model)
        model.save_to_file(dest_dir)