Example No. 1
from sklearn.feature_extraction.text import CountVectorizer


def get_vocab(clean_tweets):
    """
    Like TfidfVectorizer, CountVectorizer can drop common English stop words
    and keep only the most frequent terms in the corpus (here the top 1000).
    Returns the vocabulary as a list of (word, index) pairs sorted by index.
    """
    cv = CountVectorizer(max_features=1000,
                         stop_words='english',
                         lowercase=True,
                         analyzer='word')
    # _count_vocab builds a (vocabulary, doc-term matrix) pair but does not set
    # cv.vocabulary_; the fit_transform call below rebuilds the vocabulary
    cv._count_vocab(clean_tweets, False)
    cv.fit_transform(clean_tweets)
    return sorted(cv.vocabulary_.items(), key=lambda x: x[1])
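A minimal usage sketch for the function above, assuming clean_tweets is a list of already-preprocessed tweet strings (the sample data is illustrative):

clean_tweets = [
    "great day at the beach",
    "terrible traffic on the way home",
    "great coffee great company",
]

vocab = get_vocab(clean_tweets)
# list of (word, column_index) pairs ordered by index,
# e.g. [('beach', 0), ('coffee', 1), ('company', 2), ...]
print(vocab)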
Example No. 2
import numbers
import os
import pickle
import string

import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


class SimpleAL:

    def __init__(self):
        self.text=None
        self.clf = LogisticRegression(C=100, penalty='l1')
        self.next_idx=None
        self.predictions=None
        self.vectorizer=None
        self.binary=True
        self.max_df=0.8
        self.min_df=0.01
        self.max_features=30000
        self.keys=None
        self.labels_unique=None

    def init_data(self,corpus,labels,clean=False):
        #corpus is a list of text
        #labels is a numpy array where 0 is unlabeled, 1 is positive, -1 is negative
        self.text = corpus
        self.my_labels=labels
        self.labels_unique=np.unique(labels)
        #remove 0 since it denotes unlabeled
        self.labels_unique=self.labels_unique[self.labels_unique!=0]
        if clean:
            self.featurize_text(corpus)
        else:
            clean_text = [line for line in self.clean_text(corpus)]
            self.featurize_text(clean_text)

    def featurize_text(self,clean_text,vectorizer_preload=False):
        if vectorizer_preload:
            #vectorizer already loaded, don't fit
            self.X = self.vectorizer.transform(clean_text)
            return 
        else:
            self.vectorizer = CountVectorizer(stop_words="english",
                                              ngram_range=(1, 1),
                                              analyzer="word",
                                              binary=True,
                                              max_features=self.max_features)
            # use the private _count_vocab so the unlimited doc-term matrix can
            # be kept and re-limited later without re-tokenizing the corpus
            vocabulary, X = self.vectorizer._count_vocab(clean_text, False)
            if self.binary:
                X.data.fill(1)
            self.X_full = self.vectorizer._sort_features(X, vocabulary)

        self.vocabulary_ = vocabulary
        self.limit_features()

    def limit_features(self):
        max_df = self.max_df
        min_df = self.min_df
        max_features = self.max_features

        n_doc = self.X_full.shape[0]
        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)
        if max_doc_count < min_doc_count:
            raise ValueError("max_df corresponds to < documents than min_df")
        self.X, stop_words_ = self.vectorizer._limit_features(
            self.X_full, self.vocabulary_, max_doc_count, min_doc_count, max_features)

    def change_cutoffs(self,max_df=None,min_df=None):
        flag=False
        if max_df is not None:
            if float(max_df) != float(self.max_df):
                self.max_df=max_df
                flag=True
        if min_df is not None:
            if float(min_df) != float(self.min_df):
                self.min_df = min_df
                flag=True
        if flag:
            self.limit_features()

    def margin_sampling(self,array):
        # Margin sampling: 
        # Scheffer, T., Decomain, C., & Wrobel, S. (2001). Active hidden markov models for information extraction. 
        #    In Advances in Intelligent Data Analysis (pp. 309-318). Springer Berlin Heidelberg.
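        # e.g. for predict_proba rows [[0.55, 0.45], [0.90, 0.10]] the margins
        # between the two most likely classes are 0.10 and 0.80, so index 0
        # (the most ambiguous prediction) is returned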
        desc = -np.sort(-array, axis=1)  # sort each row of predictions in descending order
        return np.argmin(desc[:, 0] - desc[:, 1])  # smallest gap between the two most likely classes

    def find_highest(self,array):
        idx = array.argmax()
        return idx

    def run_classifier(self,init=False):
        self.next_idx=None
        indices = np.where(self.my_labels!=0)[0]
        self.test_indices = np.where(self.my_labels==0)[0]
        tmp_X=self.X[indices]
        tmp_labels = self.my_labels[indices]
        tmp_Xtest=self.X[self.test_indices]
        self.clf.fit(tmp_X,tmp_labels)
        self.predictions=self.clf.predict_proba(tmp_Xtest)

        
    def set_next(self,uncertainty=True):
        if self.predictions is not None:
            if self.next_idx is None:
                if uncertainty:
                    next_idx=self.margin_sampling(self.predictions)
                else:
                    # get the most confident prediction for whichever class is
                    # currently underrepresented among the labeled examples
                    labeled = self.my_labels[self.my_labels != 0]
                    counts = np.array([(labeled == lbl).sum() for lbl in self.clf.classes_])
                    lblindx = np.argmin(counts)
                    next_idx = self.find_highest(self.predictions[:, lblindx])
                #now that we have selected this for labeling, remove it
                self.predictions = np.delete(self.predictions, (next_idx), axis=0)
                self.next_idx=self.test_indices[next_idx]
                self.test_indices=np.delete(self.test_indices, (next_idx), axis=0)
                return True
            else:
                return False
        else:
            return False
                                    
        
    def load_classifier(self,filename,path=''):
        f=os.path.join(path,filename)
        if not os.path.isfile(f):
            print 'File does not exist'
            return
        self.clf=pickle.load(open(f,'rb'))

    def load_vectorizer(self,filename,path=''):
        f=os.path.join(path,filename)
        if not os.path.isfile(f):
            print 'File does not exist'
            return
        self.vectorizer=pickle.load(open(f,'rb'))

    def save_classifier(self,filename,path='',overwrite=False):
        f=os.path.join(path,filename)
        if not overwrite:
            if os.path.isfile(f):
                print 'File already exists'
                return
        pickle.dump(self.clf,open(f,'wb'))


    def save_vectorizer(self,filename,path='',overwrite=False):
        f=os.path.join(path,filename)
        if not overwrite:
            if os.path.isfile(f):
                print 'File already exists'
                return
        pickle.dump(self.vectorizer,open(f,'wb'))
            
        
    def set_label(self,value):
        if value not in self.labels_unique:
            print 'invalid label'
            return
        if self.next_idx is not None:
            self.my_labels[self.next_idx]=value
            self.next_idx=None
        else:
            print 'no index selected for labeling. Call set_next() first'

    def load_marketwired(self,dirs=['marketwired_data_rerun2','marketwired_data_rerun1','marketwired_data_symbolwise']):
        for path in dirs:
            if os.path.exists(path):
                for fname in os.listdir(path):
                    pathname=os.path.join(path,fname)
                    with open(pathname,'r') as f:
                        contents=f.read()
                    yield pathname,contents
            else:
                print 'directory '+path+' does not exist. Data not loaded'

    def clean_text(self,corpus,html=False):
        #unwanted chars:
        #to be replaced with ''
        charsNOWHITE = '"#()\'*+/<=>@[\\]^_`{|}~'
        #to be replaced with ' '
        charsWHITE =',.&!+:;?\n'
        remove_nowhite_map = dict((ord(char), None) for char in charsNOWHITE)
        remove_white_map = dict((ord(char), u' ') for char in charsWHITE)
        #
        strp=frozenset(string.printable)
        tab1=' '*len(charsWHITE)
        transtable=string.maketrans(charsWHITE,tab1)
        if html:
            for line in corpus:
                #remove non ascii characters
                line = filter(lambda x: x in strp, line)
                parsed_html = BeautifulSoup(line, 'html.parser')
                texts = parsed_html.findAll(text=True)
                line=" ".join(texts).lower()

                #or replace non ascii characters
                #html = ''.join([x.lower() if (x in string.printable) else 'X' for x in line])
                #line = re.sub(r'X[X\s]*',' X ',line)

                #remove certain characters using fast c code translate function
                #line=line.translate(remove_nowhite_map)
                #replace certain characters with whitespace
                #line=line.translate(remove_white_map)

                #If text is string and not unicode, use this:
                line=line.translate(None,charsNOWHITE)
                line=line.translate(transtable)

                #remove excessive whitespaces
                #line = " ".join(line.split())

                yield line
        else:
            for line in corpus:
                #remove non ascii characters
                line = filter(lambda x: x in strp, line)

                #remove certain characters using fast c code translate function
                #line=line.translate(remove_nowhite_map)
                #line=line.translate(remove_white_map)
                line=line.translate(None,charsNOWHITE)
                line=line.translate(transtable)

                yield line.lower()
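A minimal sketch of how the class above might drive an active-learning loop, assuming texts is a list of raw documents and labels is a numpy array following the 0/1/-1 convention from init_data; load_texts_somehow and the loop length are hypothetical placeholders:

import numpy as np

texts = load_texts_somehow()        # hypothetical helper returning a list of raw documents
labels = np.zeros(len(texts))       # 0 = unlabeled
labels[0], labels[1] = 1, -1        # seed with one positive and one negative example

al = SimpleAL()
al.init_data(texts, labels)         # cleans and featurizes the corpus

for _ in range(10):                 # annotate ten more examples
    al.run_classifier()             # fit on the labeled rows, predict the rest
    if not al.set_next():           # pick the most uncertain unlabeled example
        break
    print al.text[al.next_idx]      # show it to the annotator
    al.set_label(1)                 # record the annotator's answer (1 or -1)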