import numpy as np
import scipy as sp
from scipy.sparse import diags, lil_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import normalize

# NOTE: Drugbank is a project-local helper; this import path is assumed.
from drugbank import Drugbank


class PICO_vectorizer:
    """Builds sentence-level feature matrices: hashed n-gram counts plus
    structural features and (optionally) document-position features."""

    def __init__(self):
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()
        # These are set dynamically in training, but are fixed here to match
        # the final feature names in the trained model. If the model is
        # retrained, these may have to change.
        self.dict_vectorizer.feature_names_ = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6']
        self.dict_vectorizer.vocabulary_ = {
            k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}
        self.drugbank = Drugbank()

    def token_contains_number(self, token):
        return any(char.isdigit() for char in token)

    def is_number(self, num):
        try:
            float(num)
            return True
        except ValueError:
            return False

    def transform(self, doc_text, extra_features=None, idf=None):
        # First, the hashing vectorizer calculates integer token counts.
        # (Note that this uses a signed hash; negative indices are stored
        # as a flipped (negated) value in the positive index. This works
        # fine so long as the model files use the same rule, to balance
        # out the negatives.)
        sentences = [sent.text for sent in doc_text.sents]
        X_text = self.vectorizer.transform(sentences)
        X_rowsums = diags(X_text.sum(axis=1).A1, 0)  # currently unused
        if idf is not None:
            X_text = (X_text * idf) + X_text
        X_numeric = self.extract_numeric_features(doc_text, len(sentences))
        X_text.eliminate_zeros()
        if extra_features:
            X_extra_features = self.dict_vectorizer.transform(extra_features)
            # now combine feature sets
            feature_matrix = sp.sparse.hstack(
                (normalize(X_text), X_numeric, X_extra_features)).tocsr()
        else:
            # now combine feature sets
            feature_matrix = sp.sparse.hstack(
                (normalize(X_text), X_numeric)).tocsr()
        return feature_matrix

    def extract_numeric_features(self, doc_text, n, normalize_matrix=False):
        # Number of numeric features (fixed for now; may wish to revisit).
        m = 12
        X_numeric = lil_matrix((n, m))
        for sentence_index, sentence in enumerate(doc_text.sents):
            X_numeric[sentence_index, :] = self.extract_structural_features(sentence)
        X_numeric = X_numeric.tocsc()
        if normalize_matrix:
            # column-normalize
            X_numeric = normalize(X_numeric, axis=0)
        return X_numeric

    def extract_structural_features(self, sentence):
        fv = np.zeros(12)
        sent_text = sentence.text

        # fv[0:4]: bucketed count of newlines within the sentence
        num_new_lines = sent_text.count("\n")
        if num_new_lines <= 1:
            fv[0] = 1
        elif num_new_lines < 20:
            fv[1] = 1
        elif num_new_lines < 40:
            fv[2] = 1
        else:
            fv[3] = 1

        # fv[4:7]: bucketed fraction of non-empty lines that are short
        # (<= 10 characters)
        line_lens = [len(line) for line in sent_text.split("\n")
                     if not line.strip() == ""]
        if line_lens:
            num_short_lines = len([len_ for len_ in line_lens if len_ <= 10])
            frac_short_lines = float(num_short_lines) / float(len(line_lens))
        else:
            num_short_lines, frac_short_lines = 0, 0
        if frac_short_lines < .1:
            fv[4] = 1
        elif frac_short_lines <= .25:
            fv[5] = 1
        else:
            fv[6] = 1

        # fv[7:10]: bucketed fraction of tokens that contain a digit
        tokens = [w.text for w in sentence]
        num_numbers = sum(self.token_contains_number(t) for t in tokens)
        if num_numbers > 0:
            num_frac = num_numbers / float(len(tokens))
            if num_frac < .2:
                fv[7] = 1
            elif num_frac < .4:
                fv[8] = 1
            else:  # >= .4
                fv[9] = 1

        # fv[10]: indicator for short average token length
        if len(tokens):
            average_token_len = np.mean([len(t) for t in tokens])
            fv[10] = 1 if average_token_len < 5 else 0

        # fv[11]: does the sentence mention a DrugBank drug?
        fv[11] = self.drugbank.contains_drug(sent_text)
        return fv
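

# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the trained pipeline above). It assumes
# a spaCy model is installed to supply the `doc_text` object with `.sents`,
# that Drugbank() can be constructed as in __init__, and that manually
# setting feature_names_/vocabulary_ (as above) is sufficient for the
# installed scikit-learn's DictVectorizer.transform. All names below are
# illustrative only.
if __name__ == "__main__":
    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model name
    doc = nlp("Patients received 10 mg of aspirin daily.\n"
              "Outcomes were assessed at 12 weeks of follow-up.")

    vectorizer = PICO_vectorizer()
    # One dict per sentence, mirroring the position-quintile feature names
    # fixed in __init__; every sentence is (arbitrarily) put in quintile 0.
    extra = [{'DocumentPositionQuintile0': 1} for _ in doc.sents]

    X = vectorizer.transform(doc, extra_features=extra)
    # Expected width: 2**20 hashed n-gram columns (HashingVectorizer default)
    # + 12 structural columns + 7 document-position columns.
    print(X.shape)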