Example #1
import numpy as np
import pandas as pd


def load_Boilerplate():
    """Load the raw boilerplate text column from the train/test TSV files."""
    print("loading data..")
    traindata_raw = list(np.array(pd.read_table('../data/train.tsv'))[:, 2])
    testdata_raw = list(np.array(pd.read_table('../data/test.tsv'))[:, 2])

    if False:  # disabled: optionally decode the JSON boilerplate with tr_json
        traindata_raw = tr_json(traindata_raw)
        testdata_raw = tr_json(testdata_raw)

    # the label sits in the last column of the training file
    y = np.array(pd.read_table('../data/train.tsv'))[:, -1]

    if False:  # disabled: optional lemmatization and stop-word removal
        print("pre-processing data")
        traindata = []
        testdata = []
        for observation in traindata_raw:
            traindata.append(preprocess_pipeline(observation, "english",
                "WordNetLemmatizer", True, True, False))
        for observation in testdata_raw:
            testdata.append(preprocess_pipeline(observation, "english",
                "WordNetLemmatizer", True, True, False))
    else:
        traindata, testdata = traindata_raw, testdata_raw

    # concatenate so one vectorizer can be fitted on the full corpus;
    # lentrain lets the caller split the transformed matrix back afterwards
    X_all = traindata + testdata
    lentrain = len(traindata)
    return X_all, y, lentrain
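
A minimal usage sketch (assuming the `../data/*.tsv` files are in place; the variable names below are illustrative, not from the original code): `lentrain` is what lets the caller split the combined list back into its train and test halves.

# Illustrative usage only: split the combined corpus back with lentrain.
X_all, y, lentrain = load_Boilerplate()
train_text = X_all[:lentrain]   # boilerplate for the labelled rows
test_text = X_all[lentrain:]    # boilerplate for the unlabelled rows
print(len(train_text), len(test_text), y.shape)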
Example #2
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


def lsa(BP, lentrain, n_components=16, preproc=True,
        fit_area='test', min_df=3):
    """
    Latent semantic analysis: TF-IDF vectorization followed by truncated SVD.
    """
    if preproc:
        print("pre-processing data")
        traindata = []
        for observation in BP:
            traindata.append(preprocess_pipeline(observation, "english",
                "WordNetLemmatizer", True, True, False))
        BP = traindata

    print("fitting TfidfVectorizer")
    tfv = TfidfVectorizer(min_df=min_df, max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=True, smooth_idf=True,
        sublinear_tf=True, norm='l2')
    # fit the vocabulary on the test rows, the train rows, or the whole corpus
    if fit_area == 'test':
        tfv.fit(BP[lentrain:])
    elif fit_area == 'train':
        tfv.fit(BP[:lentrain])
    else:
        tfv.fit(BP)
    print("transforming data")
    BP = tfv.transform(BP)
    print("BP(post):", BP.shape)

    # reduce the sparse TF-IDF matrix to n_components dense LSA features
    print("use svd")
    svd = TruncatedSVD(n_components=n_components, random_state=1)
    BP = svd.fit_transform(BP)

    return BP
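
A hedged sketch combining the two examples above: `lsa` returns a dense `(n_samples, n_components)` array, so the train/test split is again done with `lentrain`. Passing `preproc=False` sidesteps the external `preprocess_pipeline` helper, which is not shown on this page.

# Illustrative usage only.
X_all, y, lentrain = load_Boilerplate()
BP = lsa(X_all, lentrain, n_components=16, preproc=False, fit_area='all')
X_train, X_test = BP[:lentrain], BP[lentrain:]
print(X_train.shape, X_test.shape)  # (n_train, 16), (n_test, 16)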
Example #3
    def load(self, preproc=-1, update=False):
        """Load cached data, or rebuild it from the TSV files when updating."""
        fname, update = self.get_fname(update, suff='bp')
        global gBP
        if update:
            print("loading data..")
            train_df = pd.read_table('%s/train.tsv' % self.datapath,
                                     na_values='?')
            test_df = pd.read_table('%s/test.tsv' % self.datapath,
                                    na_values='?')
            # the training file has one extra column: the label
            assert train_df.shape[1] == test_df.shape[1] + 1
            print(list(train_df.columns))
            y = np.array(train_df['label'].values, dtype=np.int64)
            # stack train (minus the label column) on top of test
            X_all_df = pd.concat((train_df.iloc[:, :-1], test_df))
            print("train:", train_df.shape, "test:", test_df.shape,
                  "X_all_df:", X_all_df.shape)
            assert X_all_df.shape == (train_df.shape[0] + test_df.shape[0],
                                      test_df.shape[1])

            if preproc > 0:
                print("pre-processing data and updating X_all_df:",
                      X_all_df.shape)
                for i in range(X_all_df.shape[0]):
                    # column 2 holds the boilerplate as a JSON string
                    d = json.loads(X_all_df.iloc[i, 2])
                    for k in ['title', 'body']:
                        if k in d and d[k]:
                            d[k] = preprocess_pipeline(
                                d[k],
                                lang="english",
                                stemmer_type=supported_stemmers[self.stemmer],
                                return_as_str=True,
                                do_remove_stopwords=True,
                                do_clean_html=False)
                    X_all_df.iloc[i, 2] = json.dumps(d)
            else:
                preproc = 0

            gBP.fit(X_all_df, y,
                    n_components=self.n_components,
                    min_df=self.min_df)

            dat = (X_all_df, y, preproc, gBP)
            print("save data X_all_df", X_all_df.shape, "y", y.shape)
            joblib.dump(dat, fname)
        else:
            print("load data..", end=' ')
            (X_all_df, y, preproc1, gBP) = joblib.load(fname)
            print("=> X_all_df:", X_all_df.shape, "y:", y.shape,
                  "preproc:", preproc1)
            # rebuild if the cached pre-processing mode does not match
            if preproc >= 0 and preproc1 != preproc:
                return self.load(preproc=preproc, update=True)
        return X_all_df, y
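
A sketch of the caching round trip this method implements, assuming a surrounding loader class (the name `BPLoader` and its constructor are illustrative, not from the original code):

# Illustrative usage only; BPLoader stands in for the real owning class.
loader = BPLoader(datapath='../data')
X_all_df, y = loader.load(preproc=1, update=True)  # build and joblib.dump
X_all_df, y = loader.load(preproc=1)               # served from the cache file
X_all_df, y = loader.load(preproc=0)               # mismatch forces a rebuild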
Example #4
    def load_transform(self, update=False):
        """Load the data, fit BP on it, transform it, and return the result."""
        params = (self.n_components, self.min_df, self.preproc, self.use_svd,
                  self.tfidf, self.stemmer, self.fit_area, self.extra)
        if update:
            ex = json.loads(self.extra)
            do_remove_stopwords = ex.get('do_remove_stopwords', True)
            logging.info("updating data%s", params)
            train_df = pd.read_table('%s/train.tsv' % self.datapath,
                                     na_values='?')
            test_df = pd.read_table('%s/test.tsv' % self.datapath,
                                    na_values='?')
            # the training file has one extra column: the label
            assert train_df.shape[1] == test_df.shape[1] + 1
            y = np.array(train_df['label'].values, dtype=np.int64)
            # stack train (minus the label column) on top of test
            X_all_df = pd.concat((train_df.iloc[:, :-1], test_df))
            logging.debug("train:%s test:%s X_all_df:%s", train_df.shape,
                          test_df.shape, X_all_df.shape)
            assert X_all_df.shape == (train_df.shape[0] + test_df.shape[0],
                                      test_df.shape[1])

            if self.preproc > 0:
                logging.info("pre-processing data and updating X_all_df:%s",
                             X_all_df.shape)
                for i in range(X_all_df.shape[0]):
                    # column 2 holds the boilerplate as a JSON string
                    d = json.loads(X_all_df.iloc[i, 2])
                    for k in ['title', 'body']:
                        if k in d and d[k]:
                            d[k] = preprocess_pipeline(
                                d[k],
                                lang="english",
                                stemmer_type=supported_stemmers[self.stemmer],
                                return_as_str=True,
                                do_remove_stopwords=do_remove_stopwords,
                                do_clean_html=False)
                    X_all_df.iloc[i, 2] = json.dumps(d)

            BPobj1 = BPobj()
            BPobj1.fit(X_all_df, y,
                       n_components=self.n_components, min_df=self.min_df,
                       use_svd=self.use_svd, tfidf=self.tfidf,
                       fit_area=self.fit_area, extra=self.extra)

            BP = BPobj1.transform(X_all_df)
            dat = (X_all_df, y, BP, params)
            logging.info("save data X_all_df:%s y:%s BP:%s",
                         X_all_df.shape, y.shape, BP.shape)
            self.cache_dump(dat, params)
        else:
            logging.debug("load data%s", params)
            data = self.cache_load(params)
            if data is None:
                logging.info("Data with params not cached: updating..")
                return self.load_transform(update=True)
            (X_all_df, y, BP, params) = data
        return (X_all_df, y, BP, params)
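
Here the `params` tuple acts as the cache key, so changing any hyperparameter transparently triggers a rebuild. The `cache_dump`/`cache_load` helpers are not shown on this page; below is a minimal sketch of what they might look like, built on joblib and a hashed key (an assumption, not the original implementation).

import hashlib
import os

import joblib


def cache_dump(dat, params, cachedir='cache'):
    # hash the parameter tuple into a stable file name (illustrative scheme)
    os.makedirs(cachedir, exist_ok=True)
    key = hashlib.md5(repr(params).encode('utf-8')).hexdigest()
    joblib.dump(dat, os.path.join(cachedir, key + '.pkl'))


def cache_load(params, cachedir='cache'):
    # return the cached object for these params, or None on a cache miss
    key = hashlib.md5(repr(params).encode('utf-8')).hexdigest()
    path = os.path.join(cachedir, key + '.pkl')
    return joblib.load(path) if os.path.exists(path) else None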