def concept_features_for_sentence(sentence, chunk_inds):
    """
    concept_features_for_sentence()

    Build one feature dictionary per labeled chunk of a sentence, combining
    word-level features with whichever optional feature modules are enabled.

    @param sentence.   A sentence in list of chunk format
    @param chunk_inds. A list of indices for non-None-labeled chunks
    @return A list of feature dictionaries, parallel to chunk_inds
    """
    global dependency_parser

    # Base word-level feature set for each target chunk
    features_list = [feat_word.concept_features_for_chunk(sentence, ind)
                     for ind in chunk_inds]

    # Parse the sentence once (expensive) if a dependency parser is available;
    # the result is shared by every chunk below.
    dependencies = None
    if dependency_parser is not None:
        dependencies = dependency_parser.get_collapsed_dependencies(sentence)

    # Allow for particular features to be enabled
    for feature in enabled_concept_features:

        # Features: UMLS features
        if (feature == "UMLS") and enabled['UMLS']:
            umls_features = feat_umls.concept_features_for_chunks(sentence,
                                                                  chunk_inds)
            for i in range(len(chunk_inds)):
                features_list[i].update(umls_features[i])

        # Features: grammatical relations from the dependency parse
        if (feature == "grammar_features" and enabled["PY4J"]):
            print("getting grammar features")
            # Hoisted out of the loop: the parse either exists for the whole
            # sentence or not at all.
            if dependencies is not None:
                for i, target_index in enumerate(chunk_inds):
                    features_list[i].update(
                        dependency_parser.get_related_tokens(target_index,
                                                             sentence,
                                                             dependencies))

    # Features: word2vec cluster id of each chunk.
    # Uses the module-level 'enabled' dict (read once at import time),
    # consistent with the UMLS/PY4J checks above, instead of re-reading
    # the config for every sentence.
    if enabled["WORD2VEC"]:
        print("getting vectors...")
        for i, chunk_index in enumerate(chunk_inds):
            chunk = sentence[chunk_index]
            cluster = predict_sequence_cluster(chunk)
            features_list[i].update({("cluster", cluster): 1})

    return features_list
import cPickle as pickle import sys import os import atexit features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if features_dir not in sys.path: sys.path.append(features_dir) # find where umls tables are located from read_config import enabled_modules enabled = enabled_modules() umls_tables = enabled['UMLS'] features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if features_dir not in sys.path: sys.path.append(features_dir) from utilities import load_pickled_obj class UmlsCache: # static class variables filename = None cache = None def __init__(self): try:
__date__ = 'Jan. 27, 2014' # What modules are available from utilities import load_pos_tagger from read_config import enabled_modules import word_features as feat_word ################################################ # Build a few expensive one-time objects # what to build requires knowing what tools are enabled enabled = enabled_modules() # Import feature modules feat_genia=None if enabled['GENIA']: from genia_dir.genia_features import GeniaFeatures # Only create UMLS cache if module is available if enabled['UMLS']: from umls_dir import interface_umls from umls_dir import interpret_umls import umls_dir.umls_features as feat_umls
word = line[0] vector = map(float, line[1:-1]) assert len(vector) == vector_size word_vecs[word] = vector print '\n\tword2vec embeddings complete' return word_vecs if embeddings is None: # Load word vectors vectors_bin = read_config.enabled_modules()["WORD2VEC"] pretrained = load_bin(vectors_bin, bin_mode=True) # be able to handle OOV by giving them 0 vectors embeddings = defaultdict(lambda:np.zeros(len(pretrained.values()[0]))) embeddings = defaultdict(lambda:np.array([.0000000000000000000000000001]*len(pretrained.values()[0]))) embeddings.update(pretrained) print "\tsuccessfully loaded word2vec embeddings\n" def cosine_similarity(x, y): return (np.inner(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))) def get_word_from_vec(vector):