def __init__(self):
        """Create the text vectorizer, the fixed-vocabulary dict vectorizer and the Drugbank helper."""
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))

        # Normally a DictVectorizer learns its columns during fitting.
        # Here they are pinned by hand so the columns line up with the
        # feature names of the already-trained model; retraining the
        # model may require updating this list.
        quintile_names = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6',
        ]
        dict_vec = DictVectorizer()
        dict_vec.feature_names_ = quintile_names
        # Map each feature name to its column index (its position in the list).
        dict_vec.vocabulary_ = dict(zip(quintile_names, range(len(quintile_names))))
        self.dict_vectorizer = dict_vec

        self.drugbank = Drugbank()
class PICO_vectorizer:
    """Build a per-sentence sparse feature matrix for a parsed document.

    Each sentence row is the horizontal concatenation of:

    * hashed n-gram counts of the sentence text (row-normalized),
    * a fixed number of structural indicator features, and
    * optionally, caller-supplied dict features run through a
      ``DictVectorizer`` whose vocabulary is pinned in ``__init__``.

    The document object passed to the methods must expose ``.sents``; each
    sentence must expose ``.text`` and be iterable over tokens that expose
    ``.text`` (presumably a spaCy ``Doc`` -- confirm against the caller).
    """

    # Width of the vector returned by extract_structural_features; shared
    # with extract_numeric_features so the two cannot drift apart.
    _N_STRUCTURAL_FEATURES = 12

    def __init__(self):
        """Create the vectorizers and the Drugbank lookup helper."""
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()

        # These are set dynamically in training
        # but fixed here to match the end feature names
        # in the trained model. If the model is retrained then
        # these may have to change
        self.dict_vectorizer.feature_names_ = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6']
        # Column index of each feature name is its position in the list.
        self.dict_vectorizer.vocabulary_ = {
            name: index
            for index, name in enumerate(self.dict_vectorizer.feature_names_)
        }

        self.drugbank = Drugbank()

    def token_contains_number(self, token):
        """Return True if any character of ``token`` is a digit."""
        return any(char.isdigit() for char in token)

    def is_number(self, num):
        """Return True if ``float(num)`` succeeds, otherwise False.

        Inputs that ``float`` rejects with a ``TypeError`` (e.g. ``None``)
        are reported as not-a-number instead of propagating the exception.
        """
        try:
            float(num)
        except (TypeError, ValueError):
            return False
        return True

    def transform(self, doc_text, extra_features=None, idf=None):
        """Vectorize every sentence of ``doc_text`` into one CSR matrix.

        Args:
            doc_text: parsed document exposing ``.sents``.
            extra_features: optional input for ``self.dict_vectorizer``
                (one entry per sentence is assumed -- the row counts must
                match for the horizontal stack to succeed). Skipped when
                falsy.
            idf: optional matrix the hashed counts are multiplied by; the
                weighted counts are added on top of the raw counts. When
                ``None`` the raw counts are used unchanged.

        Returns:
            A CSR matrix with one row per sentence: normalized text
            features, then structural features, then (if given) the extra
            features.
        """
        # first hashing vectorizer calculates integer token counts
        # (note that this uses a signed hash; negative indices are
        # stored as a flipped (negated) value in the positive
        # index. This works fine so long as the model files use the
        # same rule (to balance out the negatives).
        sentences = [sent.text for sent in doc_text.sents]
        X_text = self.vectorizer.transform(sentences)

        if idf is not None:
            X_text = (X_text * idf) + X_text
        # Drop explicitly stored zeros so they do not occupy sparse storage.
        X_text.eliminate_zeros()

        # Must be computed regardless of ``idf``: it is always part of the
        # stacked result below.
        X_numeric = self.extract_numeric_features(doc_text, len(sentences))

        # now combine feature sets.
        feature_blocks = [normalize(X_text), X_numeric]
        if extra_features:
            feature_blocks.append(self.dict_vectorizer.transform(extra_features))

        return sp.sparse.hstack(feature_blocks).tocsr()

    def extract_numeric_features(self, doc_text, n, normalize_matrix=False):
        """Stack the structural feature vector of every sentence.

        Args:
            doc_text: parsed document exposing ``.sents``.
            n: number of rows to allocate; must be at least the number of
                sentences in ``doc_text``.
            normalize_matrix: when True, pass the result through
                ``normalize(..., axis=0)`` (column-wise) before returning.

        Returns:
            A CSC matrix with ``n`` rows and one column per structural
            feature.
        """
        # number of numeric features (this is fixed
        # for now; may wish to revisit this)
        m = self._N_STRUCTURAL_FEATURES

        # LIL format is used while filling because rows are assigned one
        # at a time; converted to CSC once complete.
        X_numeric = lil_matrix((n, m))
        for sentence_index, sentence in enumerate(doc_text.sents):
            X_numeric[sentence_index, :] = self.extract_structural_features(sentence)

        X_numeric = X_numeric.tocsc()
        if normalize_matrix:
            # column-normalize
            X_numeric = normalize(X_numeric, axis=0)
        return X_numeric

    def extract_structural_features(self, sentence):
        """Return the indicator vector describing one sentence's layout.

        Slot layout of the returned array:

        * 0-3: newline count bucket (<=1, <20, <40, otherwise); exactly
          one is set.
        * 4-6: fraction of non-blank lines that are at most 10 characters
          long (<0.1, <=0.25, otherwise); exactly one is set.
        * 7-9: fraction of tokens containing a digit (<0.2, <0.4,
          otherwise); all zero when no token contains a digit.
        * 10: 1 when the mean token length is below 5 (0 for a sentence
          with no tokens).
        * 11: result of ``self.drugbank.contains_drug`` on the sentence
          text.
        """
        fv = np.zeros(self._N_STRUCTURAL_FEATURES)

        sent_text = sentence.text

        num_new_lines = sent_text.count("\n")
        if num_new_lines <= 1:
            fv[0] = 1
        elif num_new_lines < 20:
            fv[1] = 1
        elif num_new_lines < 40:
            fv[2] = 1
        else:
            fv[3] = 1

        # Lengths of the non-blank lines only.
        line_lens = [len(line) for line in sent_text.split("\n") if line.strip()]

        if line_lens:
            # fraction of lines that are at most 10 characters long
            num_short_lines = sum(1 for len_ in line_lens if len_ <= 10)
            frac_short_lines = float(num_short_lines) / float(len(line_lens))
        else:
            frac_short_lines = 0

        if frac_short_lines < .1:
            fv[4] = 1
        elif frac_short_lines <= .25:
            fv[5] = 1
        else:
            fv[6] = 1

        tokens = [w.text for w in sentence]
        num_numbers = sum(self.token_contains_number(t) for t in tokens)

        # num_numbers > 0 implies tokens is non-empty, so the division
        # below cannot be by zero.
        if num_numbers > 0:
            num_frac = num_numbers / float(len(tokens))
            if num_frac < .2:
                fv[7] = 1
            elif num_frac < .4:
                fv[8] = 1
            else:
                # >= .4!
                fv[9] = 1

        if tokens:
            average_token_len = np.mean([len(t) for t in tokens])
            fv[10] = 1 if average_token_len < 5 else 0

        fv[11] = self.drugbank.contains_drug(sent_text)
        return fv