def user_format(userPath,userSavePath):
    """Label per-user count features with gender and dump them as svmlight.

    (Original docstring, translated: "add a gender label to the
    follower | following | post counts".)

    Every non-blank line of ``userPath`` is split on commas.  Column 1 is the
    gender (``'女'`` is encoded as 0, anything else as 1) and every column
    from index 4 onwards is parsed as a float feature.

    Parameters
    ----------
    userPath : str
        Comma-separated user file to read.
    userSavePath : str
        Destination handed to ``dump_svmlight_file`` (written with
        ``zero_based=False``).
    """
    print('读取用户列表')
    features = []
    gender = []
    # ``with`` releases the handle even when a row fails to parse.
    with codecs.open(userPath) as userf:
        for line in userf:
            fields = line.strip().split(',')
            if fields == ['']:
                # A blank line carries no user; skipping it keeps the
                # feature matrix rectangular.
                continue
            gender.append(0 if fields[1] == '女' else 1)
            features.append([float(tk) for tk in fields[4:]])

    data = np.array(features)
    genderlabel = np.array(gender)
    # Sanity output: number of feature columns, then number of labels.
    print(len(features[0]) if features else 0)
    print(len(genderlabel))
    dump_svmlight_file(data, genderlabel,userSavePath,zero_based=False)
def gender_label_format(userPath,contentPath,DataPath,WritePath):
    """Attach a gender label to each image feature row and dump as svmlight.

    Three comma-separated files are joined:

    * ``DataPath``    -- column 1 is the image name, columns 2+ are floats.
    * ``contentPath`` -- column 0 is a user id, column 1 an image name.
    * ``userPath``    -- column 0 is a user id, column 1 the gender
      (``'女'`` is encoded as 0, anything else as 1).

    An image whose name has no content row, or whose content row points to an
    unknown user, is reported and its feature row is dropped so that the
    feature matrix and the label vector stay the same length.

    Parameters
    ----------
    userPath, contentPath, DataPath : str
        Input files as described above.
    WritePath : str
        Destination handed to ``dump_svmlight_file`` (``zero_based=False``).
    """
    print('导入数据')
    imagename = []
    rows = []
    with codecs.open(DataPath) as img:
        for line in img:
            datatemp = line.strip().split(',')
            if datatemp == ['']:
                continue
            imagename.append(datatemp[1])
            rows.append([float(tk) for tk in datatemp[2:]])
    data = np.array(rows)

    print('导入用户信息')
    user_gender = {}
    with codecs.open(userPath) as userf:
        for line in userf:
            usertemp = line.strip().split(',')
            if len(usertemp) > 1:
                # setdefault keeps the first occurrence, like a first-match scan.
                user_gender.setdefault(usertemp[0], usertemp[1])

    print('导入图片信息')
    image_owner = {}
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contenttemp = line.strip().split(',')
            if len(contenttemp) > 1:
                image_owner.setdefault(contenttemp[1], contenttemp[0])

    print('填入标签')
    label = []
    keep = []  # indices of feature rows that received a label
    for i, name in enumerate(imagename):
        print(name)
        gender = user_gender.get(image_owner.get(name))
        if gender is None:
            print(i)
            print(name+"没有对应的标签")
            continue
        print(gender)
        label.append(0 if gender == '女' else 1)
        keep.append(i)

    # np.delete returns a new array, so calling it and ignoring the result
    # removes nothing; select the labelled rows in a single step instead.
    data = data[np.asarray(keep, dtype=int)]
    label = np.array(label)
    print(label)
    print(len(data))
    print(len(label))
    dump_svmlight_file(data, label,WritePath,zero_based=False)
Пример #3
0
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.  When omitted every line gets label 1.
    sample_weight : {array-like}, shape (n_samples,), optional
                    sample weight vector relative to X.  When omitted every
                    line gets weight 1.

    Returns
    -------
    out : list of str, length n_samples
          One ``'<label> <weight> |<namespace> <features>'`` line per sample.
    """

    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    n_rows, n_cols = x.shape

    # check for invalid characters if array has string values
    if x.dtype.char == 'S':
        # Work on a copy so the caller's array is not modified.
        x = x.copy()
        # n_rows / n_cols are ints; they have to be wrapped in range() to be
        # iterated.
        for row in range(n_rows):
            for col in range(n_cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format; a dummy all-zero label is written so
    # that each line starts with '0 ' and can be split off below
    s = StringIO.StringIO()
    try:
        dump_svmlight_file(x, np.zeros(n_rows), s)
        # the dump ends with a newline, so the last split element is empty
        lines = s.getvalue().split('\n')[:-1]
    finally:
        s.close()

    # parse entries to construct VW format
    out = []
    for idx, line in enumerate(lines):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        features = line.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features))

    return out
Пример #4
0
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.  When omitted every line gets label 1.
    sample_weight : {array-like}, shape (n_samples,), optional
                    sample weight vector relative to X.  When omitted every
                    line gets weight 1.

    Returns
    -------
    out : list of str, length n_samples
          One ``'<label> <weight> |<namespace> <features>'`` line per sample.
    """

    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    n_rows, n_cols = x.shape

    # check for invalid characters if array has string values
    if x.dtype.char == 'S':
        # Work on a copy so the caller's array is not modified.
        x = x.copy()
        # The shape entries are plain ints, so range() is required to loop
        # over the row / column positions.
        for row in range(n_rows):
            for col in range(n_cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format; the all-zero dummy label makes every
    # line start with '0 ', which is stripped again below
    s = StringIO.StringIO()
    try:
        dump_svmlight_file(x, np.zeros(n_rows), s)
        # drop the empty element produced by the trailing newline
        lines = s.getvalue().split('\n')[:-1]
    finally:
        s.close()

    # parse entries to construct VW format
    out = []
    for idx, line in enumerate(lines):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        features = line.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features))

    return out
Пример #5
0
def write_to_file(X,K):
  """Write the matrix ``K`` and the feature matrix ``X`` under data/derived.

  Both files go to ``<olfaction_prediction_path>/data/derived``; the
  directory (including missing parents) is created when absent.

  Parameters
  ----------
  X : matrix with a ``shape`` attribute
      Written as ``nspdk_r3_d4_unaug.svm`` in svmlight/libSVM format with an
      all-zero dummy label per row.
  K : array-like
      Written as text with ``np.savetxt`` to
      ``nspdk_r3_d4_unaug_gramian.mtx.gz``; numpy gzips the output because of
      the ``.gz`` suffix.
  """
  # Public import location; the ``sklearn.datasets.svmlight_format`` module
  # path is private and no longer importable in newer scikit-learn releases.
  from sklearn.datasets import dump_svmlight_file
  dd_path = os.path.join(olfaction_prediction_path, 'data', 'derived')
  if not os.path.isdir(dd_path):
    # makedirs also creates a missing intermediate ``data`` directory,
    # where a plain mkdir would raise.
    os.makedirs(dd_path)
  np.savetxt(os.path.join(dd_path, 'nspdk_r3_d4_unaug_gramian.mtx.gz'), K)
  # Write features in standard libSVM format:
  dump_svmlight_file(X, np.zeros(X.shape[0]),
                     os.path.join(dd_path, 'nspdk_r3_d4_unaug.svm'))
Пример #6
0
def write_to_file(X, K):
    """Write the matrix ``K`` and the feature matrix ``X`` under data/derived.

    Both files go to ``<olfaction_prediction_path>/data/derived``; the
    directory (including missing parents) is created when absent.

    Parameters
    ----------
    X : matrix with a ``shape`` attribute
        Written as ``nspdk_r3_d4_unaug.svm`` in svmlight/libSVM format with
        an all-zero dummy label per row.
    K : array-like
        Written as text with ``np.savetxt`` to
        ``nspdk_r3_d4_unaug_gramian.mtx.gz``; numpy gzips the output because
        of the ``.gz`` suffix.
    """
    # Public import location; the ``sklearn.datasets.svmlight_format`` module
    # path is private and no longer importable in newer scikit-learn releases.
    from sklearn.datasets import dump_svmlight_file
    dd_path = os.path.join(olfaction_prediction_path, 'data', 'derived')
    if not os.path.isdir(dd_path):
        # makedirs also creates a missing intermediate ``data`` directory,
        # where a plain mkdir would raise.
        os.makedirs(dd_path)
    np.savetxt(os.path.join(dd_path, 'nspdk_r3_d4_unaug_gramian.mtx.gz'), K)
    # Write features in standard libSVM format:
    dump_svmlight_file(X, np.zeros(X.shape[0]),
                       os.path.join(dd_path, 'nspdk_r3_d4_unaug.svm'))
def term_label_format(imagePath,termPath,termWritePath,contentPath=None):
    """Attach a terminal (device) label to each image row and dump as svmlight.

    Three comma-separated files are joined:

    * ``imagePath``   -- column 1 is the image name, columns 2+ are floats.
    * ``contentPath`` -- column 1 is an image name, column 6 a term key.
    * ``termPath``    -- column 0 is a term key, column 1 its integer label.

    An image without a content row, or whose term key is unknown, is
    reported and its feature row is dropped so that features and labels stay
    aligned.

    Parameters
    ----------
    imagePath, termPath : str
        Input files as described above.
    termWritePath : str
        Destination handed to ``dump_svmlight_file`` (``zero_based=False``).
    contentPath : str, optional
        Content file.  The function used to read an undeclared module-level
        ``contentPath``; that global is still used when this argument is
        omitted.

    Raises
    ------
    NameError
        If ``contentPath`` is neither passed nor defined at module level.
    """
    if contentPath is None:
        # Backward compatibility with callers that rely on the global.
        contentPath = globals().get('contentPath')
        if contentPath is None:
            raise NameError("contentPath is not defined; pass it as the "
                            "fourth argument")

    imagename = []
    rows = []
    with codecs.open(imagePath) as img:
        for line in img:
            datatemp = line.strip().split(',')
            if datatemp == ['']:
                continue
            imagename.append(datatemp[1])
            rows.append([float(tk) for tk in datatemp[2:]])
    data = np.array(rows)

    term_label = {}
    with codecs.open(termPath) as comf:
        for line in comf:
            termlisttemp = line.strip().split(",")
            if len(termlisttemp) > 1:
                # setdefault keeps the first occurrence, like a first-match scan.
                term_label.setdefault(termlisttemp[0], termlisttemp[1])

    image_term = {}
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contenttemp = line.strip().split(',')
            if len(contenttemp) > 1:
                # Only the first content row of an image is consulted; a row
                # too short to carry column 6 yields no label.
                term_key = contenttemp[6] if len(contenttemp) > 6 else None
                image_term.setdefault(contenttemp[1], term_key)

    print('填入标签')
    termlabel = []
    keep = []  # indices of feature rows that received a label
    for i, name in enumerate(imagename):
        term_key = image_term.get(name)
        if term_key is None or term_key not in term_label:
            print(i)
            print(name+"没有对应的标签")
            continue
        termlabel.append(int(term_label[term_key]))
        print('%s %s %s %s' % (i, name, term_key, term_label[term_key]))
        keep.append(i)

    # ``del data[i]`` is not supported on ndarrays; select the labelled rows
    # in one step instead, which also avoids index shifting.
    data = data[np.asarray(keep, dtype=int)]
    termlabel = np.array(termlabel)
    print(termlabel)
    print(len(data))
    print(len(termlabel))
    dump_svmlight_file(data, termlabel,termWritePath,zero_based=False)
def cutUselessFeatures(Path,delline,writePath):
    """Remove low-discrimination feature columns from an svmlight file.

    Parameters
    ----------
    Path : str
        svmlight file to load.
    delline : iterable of int
        Zero-based column indices, relative to the loaded matrix, to remove.
    writePath : str
        Destination handed to ``dump_svmlight_file`` (``zero_based=False``).

    Notes
    -----
    All columns are removed in a single ``np.delete`` call.  Deleting them
    one at a time shifts every later column to the left, so each subsequent
    index would address a different column than the caller named (unless the
    indices happened to be supplied in descending order).
    """
    from sklearn.datasets import load_svmlight_file
    data, target = load_svmlight_file(Path)
    columns = list(delline)
    for i in columns:
        print(i)
    # An explicit int dtype keeps an empty index list valid for np.delete.
    deldata = np.delete(data.toarray(), np.asarray(columns, dtype=int), 1)

    dump_svmlight_file(deldata, target,writePath,zero_based=False)
Пример #9
0
    def transform_features(self):
        """Fit and apply the enabled field transforms, then dump the train split.

        Each entry of ``self.feat_head`` is read as
        ``(field, generator, transform, is_enable)``.  For every enabled
        entry the field is generated on demand (``generator(field)``) when
        ``self.stumble_data.get_features()`` does not list it yet, and the
        ``(field, transform)`` pair is handed to a ``DataFrameMapper``.

        The mapper is fitted on the first ``len_train`` rows of
        ``self.stumble_data.all_pd`` and applied to that slice and to the
        remaining rows.  The transformed source columns are then deleted from
        ``all_pd`` and the training matrix is written, together with the
        ``'label'`` column, to ``output_train_libsvm_file`` in svmlight
        format.  The test matrix is returned but not written to disk.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_test)``.

        Raises
        ------
        ValueError
            If no entry of ``self.feat_head`` is enabled.  There is nothing
            to transform in that case; this situation previously ended in an
            ``UnboundLocalError`` because the result variables were read
            before being assigned.
        """
        totransform = []
        for item in self.feat_head:
            field, func_name, transform, is_enable = (
                item[0], item[1], item[2], item[3])
            if not is_enable:
                continue
            if field not in self.stumble_data.get_features():
                print('field not in feature..generating:' + field)
                func_name(field)
            totransform.append((field, transform))

        if not totransform:
            raise ValueError(
                'transform_features: no enabled entries in feat_head')

        stumble = self.stumble_data
        len_train = stumble.len_train

        mapper = DataFrameMapper(totransform)
        mapper.fit(stumble.all_pd[:len_train])
        X_train = mapper.transform(stumble.all_pd[:len_train])
        X_test = mapper.transform(stumble.all_pd[len_train:])

        # The raw columns are dropped only after both splits were transformed.
        for item in self.feat_head:
            field = item[0]
            if item[3] and field in stumble.get_features():
                del stumble.all_pd[field]

        # No debugger breakpoint here: the method has to run unattended.
        y_train = stumble.all_pd[:len_train]['label']
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file)

        return X_train, y_train, X_test
Пример #10
0
   def transform_features(self):
       """Fit and apply the enabled field transforms, then dump the train split.

       Each entry of ``self.feat_head`` is read as
       ``(field, generator, transform, is_enable)``.  For every enabled
       entry the field is generated on demand (``generator(field)``) when
       ``self.stumble_data.get_features()`` does not list it yet, and the
       ``(field, transform)`` pair is handed to a ``DataFrameMapper``.

       The mapper is fitted on the first ``len_train`` rows of
       ``self.stumble_data.all_pd`` and applied to that slice and to the
       remaining rows.  The transformed source columns are then deleted from
       ``all_pd`` and the training matrix is written, together with the
       ``'label'`` column, to ``output_train_libsvm_file`` in svmlight
       format.  The test matrix is returned but not written to disk.

       Returns
       -------
       tuple
           ``(X_train, y_train, X_test)``.

       Raises
       ------
       ValueError
           If no entry of ``self.feat_head`` is enabled.  There is nothing
           to transform in that case; this situation previously ended in an
           ``UnboundLocalError`` because the result variables were read
           before being assigned.
       """
       totransform = []
       for item in self.feat_head:
           field, func_name, transform, is_enable = (
               item[0], item[1], item[2], item[3])
           if not is_enable:
               continue
           if field not in self.stumble_data.get_features():
               print('field not in feature..generating:' +  field)
               func_name(field)
           totransform.append((field, transform))

       if not totransform:
           raise ValueError(
               'transform_features: no enabled entries in feat_head')

       stumble = self.stumble_data
       len_train = stumble.len_train

       mapper = DataFrameMapper(totransform)
       mapper.fit(stumble.all_pd[:len_train])
       X_train = mapper.transform(stumble.all_pd[:len_train])
       X_test = mapper.transform(stumble.all_pd[len_train:])

       # The raw columns are dropped only after both splits were transformed.
       for item in self.feat_head:
           field = item[0]
           if item[3] and field in stumble.get_features():
               del stumble.all_pd[field]

       # No debugger breakpoint here: the method has to run unattended.
       y_train = stumble.all_pd[:len_train]['label']
       dump_svmlight_file(X_train, y_train, output_train_libsvm_file )

       return X_train, y_train, X_test
Пример #11
0
def sina_content_format(contentPath,userPath,contentSavePath):
    """Build gender-labelled feature rows from image posts and dump as svmlight.

    (Original docstring, translated: "label the image microblog posts".)

    Each non-blank line of ``contentPath`` yields five integer features:
    columns 2, 3 and 4, the part before the first ``':'`` of the second
    space-separated token of column 5, and column 7.  The label comes from
    ``userPath`` (column 0 user id, column 1 gender; ``'女'`` is encoded as
    0, anything else as 1) matched on column 0 of the content row.

    A post whose user id is not present in ``userPath`` is skipped entirely,
    so the feature matrix and the label vector always have the same length.

    Parameters
    ----------
    contentPath, userPath : str
        Comma-separated input files as described above.
    contentSavePath : str
        Destination handed to ``dump_svmlight_file`` (``zero_based=False``).
    """
    with codecs.open(contentPath) as contentf:
        print('读取发图微博')
        contentlist = []
        for line in contentf:
            contenttemp = line.strip().split(',')
            if contenttemp != ['']:
                contentlist.append(contenttemp)

    print('读取用户列表')
    user_gender = {}
    with codecs.open(userPath) as userf:
        for line in userf:
            usertemp = line.strip().split(',')
            if len(usertemp) > 1:
                # setdefault keeps the first occurrence, like a first-match scan.
                user_gender.setdefault(usertemp[0], usertemp[1])

    print('填入数据和标签')
    data = []
    genderlabel = []
    for row in contentlist:
        temp = [int(tk) for tk in row[2:5]]
        time_part = row[5].split(" ")[1]
        temp.append(int(time_part.split(":")[0]))
        temp.append(int(row[7]))

        gender = user_gender.get(row[0])
        if gender is None:
            # No label available: keeping the features would leave data one
            # row longer than genderlabel.
            continue
        data.append(temp)
        genderlabel.append(0 if gender == '女' else 1)

    data = np.array(data)
    genderlabel = np.array(genderlabel)

    print(len(data))
    print(len(genderlabel))
    dump_svmlight_file(data, genderlabel,contentSavePath,zero_based=False)
Пример #12
0
    # category
    # Fit a DictVectorizer on the category rows and encode them in one pass.
    vec = DictVectorizer()
    print 'Transforming to dict.'
    X_2_cat_feat = vec.fit_transform(category_rows)

    from scipy.sparse import hstack

    # Concatenate the three feature groups column-wise, in this order:
    # X_2_cat_feat, X_1_norm_feat, X_0_text_feat_bus_name.
    Y_temp = hstack((X_2_cat_feat, X_1_norm_feat))
    Y_temp_2 = hstack((Y_temp, X_0_text_feat_bus_name))
    # Convert to CSR before the row slicing done below.
    Y = Y_temp_2.tocsr()

    # Write the feature names next to the training file ('.grp' suffix).
    # NOTE(review): the exact file layout is defined by dump_group_names,
    # which is not visible here.
    dump_group_names(
        vec.get_feature_names(),
        feature_name_bus_name,
        'bus_name',
        float_header_list,
        output_train_libsvm_file + '.grp',
        y_shape,
    )

    # The first len_train rows form the training split ...
    print 'Dumping train in SVMLight.'
    dump_svmlight_file(Y[0:len_train], rating_rows[0:len_train],
                       output_train_libsvm_file)

    # ... and every remaining row forms the test split.
    print 'Dumping test in SVMLight.'
    dump_svmlight_file(Y[len_train:], rating_rows[len_train:],
                       output_test_libsvm_file)

    print 'done... Dumping in SVMLight.'
Пример #13
0
    # category
    # Fit a DictVectorizer on the category rows and encode them in one pass.
    vec = DictVectorizer()
    print 'Transforming to dict.'
    X_2_cat_feat = vec.fit_transform(category_rows)
    
    from scipy.sparse import hstack

    # Append each optional feature group column-wise only when it is present.
    # NOTE(review): len() is used as the emptiness test for X_1_norm_feat;
    # confirm that its type supports len().
    if(len(X_1_norm_feat) > 0 ):
        Y_temp = hstack((X_2_cat_feat,X_1_norm_feat))
    else:
        Y_temp = X_2_cat_feat
    # The text feature block is appended only when text_rows is non-empty.
    if ( len(text_rows) > 0 ):
        Y_temp_2 = hstack((Y_temp,X_0_text_feat_bus_name))
    else: 
        Y_temp_2 = Y_temp
    # Convert to CSR before the row slicing done below.
    Y = Y_temp_2.tocsr()
    

    # Group-name dump is currently disabled:
    #dump_group_names(vec.get_feature_names(), feature_name_bus_name, 'bus_name',float_header_list,  output_train_libsvm_file + '.grp', y_shape, )

    
    # The first len_train rows form the training split ...
    print 'Dumping train in SVMLight.'
    dump_svmlight_file(Y[0:len_train], rating_rows[0:len_train], output_train_libsvm_file )

    # ... and every remaining row forms the test split.
    print 'Dumping test in SVMLight.'
    dump_svmlight_file(Y[len_train:], rating_rows[len_train:], output_test_libsvm_file )
    
    print 'done... Dumping in SVMLight.'


def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.  When omitted every line gets label 1.
    sample_weight : {array-like}, shape (n_samples,), optional
                    sample weight vector relative to X.  When omitted every
                    line gets weight 1.

    Returns
    -------
    out : list of str, length n_samples
          One ``'<label> <weight> |<namespace> <features>'`` line per sample.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> from vowpalwabbit.sklearn_vw import tovw
    >>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog')
    >>> y = pd.Series([-1, 1, -1, -1], name='label')
    >>> hv = HashingVectorizer()
    >>> hashed = hv.fit_transform(X)
    >>> tovw(x=hashed, y=y)
    """

    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    n_rows, n_cols = x.shape

    # check for invalid characters if array has string values
    if x.dtype.char == 'S':
        # Work on a copy so the caller's array is not modified.
        x = x.copy()
        # The shape entries are plain ints, so range() is required to loop
        # over the row / column positions.
        for row in range(n_rows):
            for col in range(n_cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format; the all-zero dummy label makes every
    # line start with '0 ', which is stripped again below
    with io.BytesIO() as s:
        dump_svmlight_file(x, np.zeros(n_rows), s)
        # drop the empty element produced by the trailing newline
        lines = s.getvalue().decode('ascii').split('\n')[:-1]

    # parse entries to construct VW format
    out = []
    for idx, line in enumerate(lines):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        features = line.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features))

    return out
Пример #15
0
def feature_format(sinadataPath,userPath,contentPath,sinaGenderPath):
    """Attach a gender label to each image feature row and dump as svmlight.

    Three comma-separated files are joined:

    * ``sinadataPath`` -- column 1 is the image name, columns 2+ are floats.
    * ``contentPath``  -- column 0 is a user id, column 1 an image name.
    * ``userPath``     -- column 0 is a user id, column 1 the gender
      (``'女'`` is encoded as 0, anything else as 1).

    An image whose name has no content row, or whose content row points to an
    unknown user, is reported and its feature row is dropped so that the
    feature matrix and the label vector stay the same length.

    Note (translated from the original comments): mind the encoding of the
    input files; Chinese text is only recognised when the files are UTF-8.

    Parameters
    ----------
    sinadataPath, userPath, contentPath : str
        Input files as described above.
    sinaGenderPath : str
        Destination handed to ``dump_svmlight_file`` (``zero_based=False``).
    """
    # Read the feature rows; imagename[i] names the image of rows[i].
    imagename = []
    rows = []
    with codecs.open(sinadataPath) as f:
        for line in f:
            datatemp = line.strip().split(',')
            if datatemp == ['']:
                continue
            imagename.append(datatemp[1])
            rows.append([float(tk) for tk in datatemp[2:]])
    data = np.array(rows)

    print('读取发图微博')
    image_owner = {}
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contenttemp = line.strip().split(',')
            if len(contenttemp) > 1:
                # setdefault keeps the first occurrence, like a first-match scan.
                image_owner.setdefault(contenttemp[1], contenttemp[0])

    print('读取用户列表')
    user_gender = {}
    with codecs.open(userPath) as userf:
        for line in userf:
            usertemp = line.strip().split(',')
            if len(usertemp) > 1:
                user_gender.setdefault(usertemp[0], usertemp[1])

    print('填入标签')
    genderlabel = []
    keep = []  # indices of feature rows that received a label
    for i, name in enumerate(imagename):
        print(name)
        gender = user_gender.get(image_owner.get(name))
        if gender is None:
            print(i)
            print(name+"没有对应的标签")
            continue
        genderlabel.append(0 if gender == '女' else 1)
        keep.append(i)

    # np.delete returns a new array, so calling it and ignoring the result
    # removes nothing; select the labelled rows in a single step instead.
    data = data[np.asarray(keep, dtype=int)]
    genderlabel = np.array(genderlabel)

    print(genderlabel)
    print(data.shape[0])
    print(genderlabel.shape[0])
    print("'构建libsvm数据")
    dump_svmlight_file(data, genderlabel,sinaGenderPath,zero_based=False)
Пример #16
0
def wechat_fomat(dataPath,labelPath,writeGenderPath,writeLocPath):
    """Format WeChat image features with gender and location labels.

    Both inputs are space-separated:

    * ``dataPath``  -- column 0 is the image name, columns 1+ are floats.
    * ``labelPath`` -- column 3 is the image name, column 1 the gender
      (``'女'`` is encoded as 0, anything else as 1) and column 5 the
      location code (``'2'`` is encoded as 0, anything else as 1).

    Each image takes its labels from the first matching label row.  An image
    without a label row is reported and its feature row is dropped, so the
    feature matrix and both label vectors have the same length.

    Note (translated from the original comments): mind the encoding of the
    input files; Chinese text is only recognised when the files are UTF-8.

    Parameters
    ----------
    dataPath, labelPath : str
        Input files as described above.
    writeGenderPath, writeLocPath : str
        Destinations of the gender-labelled and location-labelled svmlight
        files (both written with ``zero_based=False``).
    """
    # Read the feature rows; imagename[i] names the image of rows[i].
    imagename = []
    rows = []
    with codecs.open(dataPath) as f:
        for line in f:
            tokens = line.strip().split(' ')
            if tokens == ['']:
                continue
            imagename.append(tokens[0])
            rows.append([float(tk) for tk in tokens[1:]])
    imagename = np.array(imagename)
    data = np.array(rows)
    print(imagename)

    # Read the labels, indexed by image name.
    image_labels = {}
    with codecs.open(labelPath) as labelf:
        for line in labelf:
            tokens = line.strip().split(' ')
            if len(tokens) > 5:
                # First row wins: accepting every duplicate would append
                # several labels for a single feature row.
                image_labels.setdefault(tokens[3], (tokens[1], tokens[5]))

    # Fill in the labels.
    genderlabel = []
    loclabel = []
    keep = []  # indices of feature rows that received labels
    for i, name in enumerate(imagename):
        found = image_labels.get(name)
        if found is None:
            print(i)
            print(name+"没有对应的标签")
            continue
        gender, loc = found
        genderlabel.append(0 if gender == '女' else 1)
        loclabel.append(0 if loc == '2' else 1)
        keep.append(i)

    # np.delete returns a new array, so calling it and ignoring the result
    # removes nothing; select the labelled rows in a single step instead.
    data = data[np.asarray(keep, dtype=int)]
    genderlabel = np.array(genderlabel)
    loclabel = np.array(loclabel)

    # The three counts below must agree; a difference means that labels and
    # data do not match.
    print(data.shape[0])
    print(genderlabel.shape[0])
    print(loclabel.shape[0])
    # Write the libsvm-format data to disk.
    dump_svmlight_file(data, genderlabel,writeGenderPath,zero_based=False)
    dump_svmlight_file(data, loclabel,writeLocPath,zero_based=False)
    print("Wechat format End!")