예제 #1
0
파일: classify.py 프로젝트: juix/scripts
 def __init__(self):
     self.db = ExtendedDB(commitOnClose=False)
     self.X,self.y,self.X_eval,self.ids_eval,self.columns = MatrixFactory().loadFromFile()
     #clf=ensemble.RandomForestClassifier(n_estimators=66)
     #clf=svm.SVC(kernel="linear",gamma=0.01, C=250, class_weight='balanced')
     clf=naive_bayes.MultinomialNB(alpha=0.01)
     self.clf = clf
예제 #2
0
파일: classify.py 프로젝트: juix/scripts
class Main(object):
    signifikanzNiveau = 0.7

    def __init__(self):
        self.db = ExtendedDB(commitOnClose=False)
        self.X,self.y,self.X_eval,self.ids_eval,self.columns = MatrixFactory().loadFromFile()
        #clf=ensemble.RandomForestClassifier(n_estimators=66)
        #clf=svm.SVC(kernel="linear",gamma=0.01, C=250, class_weight='balanced')
        clf=naive_bayes.MultinomialNB(alpha=0.01)
        self.clf = clf
        
    def classify(self):
        print("classifying")
        self.clf.fit(self.X,self.y)
        y_pred = self.clf.predict(self.X_eval)
        self.db.query("DELETE FROM predictions")
        for i, cls in zip(self.ids_eval, y_pred):
            self.db.save("predictions",id=i,cls=int(cls))
        self.db.commit()
        print("done.")
        
    def crossvali(self):
        print("crossvalidation")
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                    self.X, self.y, test_size=0.1, random_state=8198987, stratify=self.y)
        self.clf.fit(X_train,y_train)
        y_pred = self.clf.predict(X_test)
        print(metrics.confusion_matrix(y_test, y_pred))
        print(metrics.classification_report(y_test, y_pred, target_names=sorted(set([str(x) for x in (y_test.tolist()+y_pred.tolist())]))))

    def chi2(self):
                df = len(self.columns) # freiheitsgrad
                #krit = chi2.ppf(0.95, df)
                krit = chi2.ppf(self.signifikanzNiveau, df)
                print("Kritischer Wert chi² = %0.3f"%krit)        

                limit = 50
                skb = feature_selection.SelectKBest(feature_selection.chi2, k=limit)
                skb.fit(self.X, self.y)
                a=[(idx_,score) for idx_,score in enumerate(skb.scores_) if not np.isnan(score)]
                a=sorted(a,key=lambda x:x[1], reverse=True)
                most=[(self.columns[idx_],score) for idx_,score in a[:limit]]
                msg = u"\n\t".join([u"%s (%0.3f)"%(name,score) for name,score in most])
                sys.stderr.write((u"%s\n"%msg).encode("utf-8",errors="strict"))
예제 #3
0
파일: makeMatrix.py 프로젝트: juix/scripts
 def __init__(self):
     self.db = ExtendedDB(commitOnClose=False)
예제 #4
0
파일: makeMatrix.py 프로젝트: juix/scripts
class Main(object):
    nominalFeatures=["uploaderlink"]
    stringFeatures=["uploaderlink"]
    normalise=True
    encodeNominalFeatures=True
    KBEST = 1500
    max_features_also_favourited = 10e3

    def __init__(self):
        self.db = ExtendedDB(commitOnClose=False)
        
    def __call__(self, normalise="auto"):
        """
        Create normalised feature matrix out of db.
        Handles nominal and string features.
        """
        print("creating matrix")
        if normalise=="auto": normalise=self.normalise
        
        self.db.query("""
            DROP TABLE IF EXISTS temp_features
        """)
        self.db.query("""
            CREATE TEMP TABLE temp_also_favourited AS (
                SELECT a.id, a.id_also
                FROM also_favourited AS a
                INNER JOIN
                (
                    SELECT id_also, count(id_also)
                    FROM also_favourited
                    GROUP BY id_also 
                    --HAVING count(id_also) > 2
                    ORDER BY count(id_also) DESC
                    LIMIT %s
                ) AS b
                ON a.id_also = b.id_also
            )
        """,(self.max_features_also_favourited,))
        self.db.query("""
            CREATE TABLE temp_features AS(
                SELECT 
                    m.id, 
                    m.uploaderlink,
                    m.views,
                    m.favourited,
                    m.time,
                    COALESCE(y.count,0) AS comments,
                    (
                        views
                        *(  1-
                            EXTRACT(epoch FROM AGE(m.time))
                            /(SELECT max(EXTRACT(epoch FROM AGE(medium.time)))+1 FROM medium)
                        )
                        --/(SELECT MAX(views) FROM medium)
                    ) AS views_per_time,
                    (
                        favourited
                        *(  1-
                            EXTRACT(epoch FROM AGE(m.time))
                            /(SELECT max(EXTRACT(epoch FROM AGE(medium.time)))+1 FROM medium)
                        )
                        --/(SELECT MAX(favourited) FROM medium)
                    ) AS favourited_per_time,
                    (
                        COALESCE(y.count,0)
                        *(  1-
                            EXTRACT(epoch FROM AGE(m.time))
                            /(SELECT max(EXTRACT(epoch FROM AGE(medium.time)))+1 FROM medium)
                        )
                        --/(SELECT MAX(favourited) FROM medium)
                    ) AS comments_per_time --,
                    --CASE WHEN (m.id in (SELECT id FROM ids_likes)) THEN 'l'
                    --     WHEN (m.id in (SELECT id FROM ids_dislikes)) THEN 'd'
                    --     ELSE '?'
                    --END AS rating
                FROM medium AS m
                LEFT JOIN (
                    SELECT id, count(id)
                    FROM comments
                    GROUP BY id
                ) AS y
                ON m.id = y.id
                --INNER JOIN (
                --    SELECT * FROM ids_likes
                --    UNION ALL
                --    SELECT * FROM ids_dislikes
                --) AS z
                --ON m.id = z.id
            )
        """)
        self.db.commit()
        
        # table to pandas
        alchemy = sqlalchemy.create_engine('postgresql:///motherless')
        data = pd.read_sql_table("temp_features",alchemy,index_col='id',parse_dates=None)
        
        # filter
        #data = data[:10]
        
        ids = data.index.tolist()

        # stringfeatures to int
        mle = MultiColumnLabelEncoder(columns = self.stringFeatures)
        data = mle.fit_transform(data)
        del mle
        
        # datetime to int
        data["time"] = [d.toordinal() for d in data["time"]]
        
        # X = np.array(data)
        columns = data.columns
        X = data.as_matrix().astype(DTYPEINT)
        del data
        print("\t#features = %d"%X.shape[1])
        
        # nominal features to binary
        if self.encodeNominalFeatures:
                sys.stdout.write("\tNominal features to binary...")
                enc = preprocessing.OneHotEncoder(
                    categorical_features=np.in1d(
                        columns,list(self.nominalFeatures)), 
                    sparse=False,
                    dtype=DTYPEINT)
                X = enc.fit_transform(X).astype(DTYPEINT)
                #columns 
                nominalOrdered = [x for x in columns if x in self.nominalFeatures]
                notNominalOrdered = [x for x in columns if x not in self.nominalFeatures]
                encodedColumns = [col for x,col in enumerate(nominalOrdered) for _ in range(enc.n_values_[x]) ]
                columns = np.concatenate((encodedColumns, notNominalOrdered))
                print("\t#features = %d"%X.shape[1])
                del enc

        # list-features to binary
        for table,select in [("tags","link"),("groups","link"),
                ("temp_also_favourited","id_also")
            ]:
            sys.stdout.write("\t%s... "%table)
            sys.stdout.flush()
            X2, columns2 = self.getListFeatures(table,select,ids)
            sys.stdout.write("\t#features: +%d -> %d..."%(X2.shape[1], X.shape[1]+X2.shape[1]))
            sys.stdout.flush()
            # unite(data,X2)
            #data = data.join(X2,rsuffix=table)
            columns = np.concatenate((columns,columns2))
            X = np.column_stack((X,X2))
            sys.stdout.write("done (%s)\n"%X.dtype)
            sys.stdout.flush()
                
        # set vars
        sys.stdout.write("\tsetting vars\n")
        likes = self.getRatedIds("likes")
        dislikes = self.getRatedIds("dislikes")
        ids_condition = [(i, (i in likes or i in dislikes)) for i in ids]
        ids_train = [i for i,condition in ids_condition if condition == True]
        ids_eval = [i for i,condition in ids_condition if condition == False]
        mask_rated = np.array([condition for _, condition in ids_condition],dtype=np.bool)
        y_train = np.array([int(i in likes) for i in ids_train],dtype=np.uint8)

        # feature selection
        chosen_cols, skb = self.fvTrimmed(X[mask_rated,:], y_train)
        #columns = [columns[x] for x in chosen_cols]
        columns = columns[skb.get_support()]
        X = skb.transform(X)
        del skb

        if normalise:
                X = preprocessing.normalize(X, norm='l2', axis=0).astype(DTYPEFLOAT)
        else:
            X = X.astype(DTYPEFLOAT)
        
        # split labelled and unlabelled samples
        X_train = X[mask_rated,:]
        X_eval = X[np.logical_not(mask_rated),:]
        
        return (X_train, y_train, X_eval, ids_eval, columns)

    def fvTrimmed(self, X, y):
        """ select k best features """
        sys.stdout.write("Feature selection... ")
        sys.stdout.flush()
        skb = feature_selection.SelectKBest(feature_selection.chi2, k=self.KBEST)
        skb.fit(X, y)
        #X = skb.transform(X)
        #columns:
        a=[(idx_,score) for idx_,score in enumerate(skb.scores_) if not np.isnan(score)]
        a=sorted(a,key=lambda x:x[1], reverse=True)
        cols = [idx_ for idx_,score in a[:self.KBEST]]
        print("%d Features"%len(cols))
        #most=[(self.columns[idx_],score) for idx_,score in a[:self.KBEST]]
        return cols, skb
        #args2 = (cols, X, y)+tuple([skb.transform(m) for m in args])
        #return args2
                
    def writeToFile(self, *args, **kwargs):
        """ write all info from __call__ to ./matrix.npz """
        X,y,X_eval,ids,cols = self(*args, **kwargs)
        print("writing file")
        np.savez("matrix.npz",X=X,y=y,X_eval=X_eval,ids=ids,cols=cols)

    def loadFromFile(self):
        """ load matrix from file """
        sys.stdout.write("loading file ")
        npz = np.load("matrix.npz")
        sys.stdout.write("(%s)\n"%npz["X"].dtype)
        return npz["X"], npz["y"], npz["X_eval"], npz["ids"], npz["cols"]
    
    def getRatedIds(self, category="likes"):
        """ return id list of all @category media """
        self.db.query("SELECT id FROM ids_%s"%category)
        return [r["id"] for r in self.db.cursor.fetchall()]
            
    def getListFeatures(self, table, select, ids):
        """
        Create dict with tags and groups for each medium. Transform to binary matrix and return.
        """
        mlb = preprocessing.MultiLabelBinarizer(sparse_output=False)
        multiLabels = []
        for i in ids:
            self.db.query("SELECT "+select+" FROM "+table+" WHERE id=%s",(i,))
            multiLabels.append(set([r[0] for r in self.db.cursor.fetchall()]))
        X2 = mlb.fit_transform(multiLabels).astype(np.uint8)
        return X2, ["%s_%s"%(table,x) for x in mlb.classes_]