Example #1
def livecollect(num):
    # Bloom filter of ids that were already crawled or already known to fail.
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('user.txt').read()
    error = temp_stupid('erroruser.txt').read()
    for i in havedone:
        try:
            ii = json.loads(i)['id']
        except Exception:
            # Skip lines that are not valid JSON records with an 'id' field.
            continue
        bloom.add(ii)
    for i in error:
        bloom.add(i)
    temp = temp_stupid('sample.txt')
    see = temp.read()
    sc = tempflow('user.txt', 'a')
    # Keep only the candidate ids that have not been seen yet.
    seed = []
    for i in see:
        if i not in bloom:
            seed.append(i)
    for i in seed[:num]:
        #tempp=temp_stupid(i+'txt')
        # Strip a UTF-8 BOM if the id came from the start of a file.
        if i.startswith(u'\ufeff'):
            i = i.encode('utf8')[3:].decode('utf8')
        try:
            userdict = tiny_people(getRequest(), i)
            sc.writein([json.dumps(userdict, ensure_ascii=False)])
            print(i)
        except ErrorInJson as result:
            # Remember ids whose response could not be parsed so they are skipped next run.
            temp_stupid('erroruser.txt').save([i])
        except BaseException as result:
            # Report the failure by mail, retrying the notification up to 10 times.
            for e in range(10):
                try:
                    send(traceback.format_exc() + '\n' + str(result) + '\n in ' + datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S'), __name__ + ' throw ' + result.__class__.__name__, '*****@*****.**')
                    break
                except BaseException as res:
                    print(res)
                    continue
            # Wait until the connection is usable again before moving on.
            while not CanConnect(cre):
                continue

            
        # try:
        #     livelist = worm_userV4(i,'lives',['id'])
        # except BaseException as result:
        #     while True:
        #         try:
        #             send(traceback.format_exc() + '\n' + str(result) + '\n in ' + datetime.datetime.now().strftime(
        #                 '%Y-%m-%d %H:%M:%S'), __name__ + ' throw '+result.__class__.__name__, '*****@*****.**')
        #             break
        #         except:
        #             continue
        #     continue
        # tempp.update(livelist)
    sc.end()
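
livecollect() dedups candidate ids against a ScalableBloomFilter before requesting anything. A minimal self-contained sketch of that pattern, assuming the pybloom_live package and using made-up ids:

# Minimal sketch of the dedup pattern, assuming pybloom_live provides ScalableBloomFilter.
from pybloom_live import ScalableBloomFilter

seen = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.001)
for done_id in ['user_a', 'user_b']:   # ids already crawled (hypothetical)
    seen.add(done_id)
for failed_id in ['user_x']:           # ids that failed last run (hypothetical)
    seen.add(failed_id)

candidates = ['user_a', 'user_c', 'user_x', 'user_d']
todo = [uid for uid in candidates if uid not in seen]
print(todo)  # ['user_c', 'user_d'] -- false positives are possible, false negatives are not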
Example #2
def getdir(path, fil=None, pre='.txt', to='dir.txt', display=False):
    filePath = path
    lis = os.listdir(filePath)
    if pre:
        lis = [x.replace(pre, '') for x in lis]
    if fil:
        lis = [x for x in lis if x not in fil]
    temp_stupid(to).update(lis)
    if display:
        print(lis)
    return lis
Example #3
def generatecorpus():
    from functiontool.getrelation import getrelation
    from functiontool.templib import temp_stupid
    from functiontool.dirkit import getdir
    sample = getdir('G:/zhihuData/gooduser', pre='.txt')
    from functiontool.kit import tranverse
    k = []

    def a(i):
        # One space-separated line of relation ids per sampled user.
        k.append(' '.join(getrelation(i, path='G:/zhihuData')))

    tranverse(sample, a)
    temp_stupid('corpus_all.txt').update(k)
Example #4
def load_corpus(filename, usertxt, exper=None):
    '''
    filename: name of the corpus file; see corpus.txt for the expected format
    exper: int, limits the number of users (records) that are returned.
        It is named exper because, after writing a program, you usually want to
        check that it runs at all; there is no need to load everything for that,
        and the later processing is slow. exper=100 returns only the first 100 users.
    usertxt: file name; this file stores the user_id corresponding to each line of filename
    return    (sentence_d, all_d, vali, user)
        sentence_d: a list whose elements are one user's sequence of purchased live ids,
            with the most recently bought live removed (it is held out for validation/testing)
        all_d: unlike sentence_d, nothing is removed; it is a list of
            (liveid, frequency_count) pairs over the whole corpus, sorted in descending order
        vali: the held-out most recent live of each user, one element per user
        user: this list stores the user ids
    Note: sentence_d, vali and user have the same length and share indices,
        so user[i]'s remaining purchase history is sentence_d[i] and that user's
        most recently bought live is vali[i]; all_d is a single frequency table
        for the entire corpus rather than a per-user list.
    '''
    from functiontool.templib import temp_stupid
    stop = temp_stupid('G:/zhihuX/old/needuser.txt').read()
    f = open(filename, 'r')
    sentence_d = []
    all_d = []
    vali = []
    user = []
    lines = f.readlines()
    f.close()
    if usertxt:
        users = temp_stupid(usertxt).read()
    if exper:
        lines = lines[:exper]
        if usertxt:
            users = users[:exper]
    ii = 0
    for sentence in lines:
        sen = sentence.split()
        num = len(sen)
        #if True:
        if num > 1 and users[ii] not in stop:
            # The first token on each line is the most recently bought live:
            # hold it out for validation and keep the rest as the history.
            vali.append(sen[0])
            sentence_d.append(sen[1:])
            all_d.extend(sen)
            user.append(users[ii])
        ii += 1

    all_d = collections.Counter(all_d).most_common()
    return sentence_d, all_d, vali, user
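
A tiny worked illustration of the return layout described above, with made-up corpus lines; it mirrors the loop in load_corpus() without the file and stop-list handling:

# Made-up data: one space-separated line of live ids per user, newest purchase first.
import collections

lines = ['live9 live3 live1', 'live9 live5']
users = ['user_a', 'user_b']

sentence_d, all_d, vali, user = [], [], [], []
for uid, line in zip(users, lines):
    sen = line.split()
    if len(sen) > 1:
        vali.append(sen[0])          # newest purchase, held out for validation
        sentence_d.append(sen[1:])   # remaining purchase history
        all_d.extend(sen)
        user.append(uid)
all_d = collections.Counter(all_d).most_common()

# user[0] == 'user_a', vali[0] == 'live9', sentence_d[0] == ['live3', 'live1'],
# all_d == [('live9', 2), ('live3', 1), ('live1', 1), ('live5', 1)]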
Example #5
def forlive(num):
    # Bloom filter of live ids that were already crawled or already known to fail.
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('live.txt').read()
    error = temp_stupid('errorlive.txt').read()
    for i in havedone:
        try:
            ii = json.loads(i)['id']
        except Exception:
            # Skip lines that are not valid JSON records with an 'id' field.
            continue
        bloom.add(ii)
    for i in error:
        bloom.add(i)
    temp = temp_stupid('need.txt')
    see = temp.read()
    sc = tempflow('live.txt', 'a')
    seed = []
    for i in see:
        if i not in bloom:
            seed.append(i)
    for i in seed[:num]:
        #tempp=temp_stupid(i+'txt')
        if i.startswith(u'\ufeff'):
            i = i.encode('utf8')[3:].decode('utf8')
        try:
            userdict = tiny_live(i, getRequest())
            sc.writein([userdict])
            print(i)
        except ErrorInJson as result:
            temp_stupid('errorlive.txt').save([i])
        except BaseException as result:
            for e in range(10):
                try:
                    send(
                        traceback.format_exc() + '\n' + str(result) +
                        '\n in ' +
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        __name__ + ' throw ' + result.__class__.__name__,
                        '*****@*****.**')
                    break
                except BaseException as res:
                    print(res)
                    continue
            # Wait until the live API is reachable again before moving on.
            while not CanConnect(create_live(i), XHR_HEADER_API):
                continue
    sc.end()
Example #6
 def create_from_file(self, filename):
     '''
     Builds a Bloom filter from the contents of a file; like the other helpers, it is not persisted.
     '''
     what = ScalableBloomFilter(100000000, 0.001)
     t = temp_stupid(filename)
     for i in t.read():
         what.add(i)
     return what
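
A short usage sketch, assuming create_from_file() is a method of some helper object (the enclosing class is not shown) and that the file holds one id per line:

# Hypothetical usage: the filter is rebuilt from the file on every run, since it is never saved.
bf = helper.create_from_file('erroruser.txt')   # `helper` stands in for the enclosing object
if 'some_user_id' not in bf:
    print('not seen before (up to the configured error rate)')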
Example #7
def getdir(path, fil=None, pre='.txt', to='dir.txt', display=False):
    '''
    Returns all file names under a directory.
    path: the directory
    fil: a list; anything it contains is filtered out, much like .gitignore in git
    pre: extension to strip from the file names; the trailing '.txt' is removed by default
    to: write the result to this file as well
    display: whether to print the resulting list while running
    '''
    filePath = path
    lis = os.listdir(filePath)
    if pre:
        lis = [x.replace(pre, '') for x in lis]
    if fil:
        lis = [x for x in lis if x not in fil]
    temp_stupid(to).update(lis)
    if display:
        print(lis)
    return lis
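
A hedged usage sketch of getdir(); the exclusion list is made up, and temp_stupid(to).update() is assumed to write the list to the target file one entry per line:

# Hypothetical call: list crawled user files, drop the '.txt' suffix,
# skip two known-bad ids, and also write the result to dir.txt.
names = getdir('G:/zhihuData/gooduser',
               fil=['broken_user_1', 'broken_user_2'],   # made-up ids to exclude
               pre='.txt',
               to='dir.txt',
               display=False)
print(len(names), 'user files found')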
Example #8
def load_corpus_test(numm=40000):
    '''
    Loads the test-set corpus.
    Adapted from load_corpus().
    Since it was only meant for temporary use, some things that should have been
    parameters are hard-coded inside.
    Returns the same structure as load_corpus().
    '''
    from functiontool.templib import temp_stupid
    # Everything in stop belongs to the training set.
    stop = temp_stupid('G:/zhihuX/old/needuser.txt').read() + temp_stupid(
        './sample/dir.txt').read()

    filename = 'corpus_all.txt'
    usertxt = 'dir.txt'
    f = open(filename, 'r')
    sentence_d = []
    all_d = []
    vali = []
    user = []
    lines = f.readlines()
    f.close()
    users = temp_stupid(usertxt).read()
    ii = 0
    pp = 0
    for sentence in lines:
        if pp > numm:
            break
        sen = sentence.split()
        num = len(sen)
        #if True:
        if num > 1 and users[ii] not in stop:
            # Same layout as load_corpus(): hold out the first (most recent) live,
            # keep the rest as the user's history.
            vali.append(sen[0])
            sentence_d.append(sen[1:])
            all_d.extend(sen)
            user.append(users[ii])
            pp += 1
        ii += 1

    all_d = collections.Counter(all_d).most_common()
    return sentence_d, all_d, vali, user
Example #9
def generatecorpus():
    from functiontool.getrelation import getrelation
    from functiontool.templib import temp_stupid
    sample = temp_stupid('./sample/dir.txt').read()
    from functiontool.kit import tranverse
    k = set()

    def a(i):
        # Accumulate the union of every sampled user's relations.
        p = getrelation(i)
        for x in p:
            k.add(x)

    tranverse(sample, a)
    return k
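
generatecorpus() relies on tranverse from functiontool.kit, which is not shown in these examples; a minimal stand-in, assuming it simply applies the callback to every element (any progress reporting or retry logic it really has is omitted):

# Assumption: tranverse(seq, func) behaves roughly like this simplified stand-in.
def tranverse_sketch(seq, func):
    for item in seq:
        func(item)

relations = set()
tranverse_sketch(['user_a', 'user_b'],                       # made-up sample ids
                 lambda uid: relations.update(['live1']))    # stand-in for getrelation(uid)
print(relations)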
Example #10
import os
import gensim
# import Doc2Vec
from gensim.models import Doc2Vec
from load_data import *
from functiontool.baseinfo import getlive
from functiontool.templib import temp_stupid
from functiontool.textpro import *
from functiontool.getrelation import *
import random

documents = []
_, words, validatewords, user = load_corpus('corpus_all.txt',
                                            usertxt='dir.txt')
_, _, vocabulary = index_item(words)
stop = temp_stupid("G:/stopwords-master/stopwords.txt").read()
stop = [x.strip() for x in stop]
print(stop[:30])
# fil=lambda x:False if x in stop else True
# For each user, gather the tokenized descriptions of their related lives into one document.
for j in user:
    text = []
    for i in getrelation(j, path='G:/zhihuData')[1:]:
        x = None
        try:
            x = getlive(i).description
            # print(i)
            x = text2list(gettext(x), stop)
            # print(x[:10])
            text.extend(x)
            if not x:
                continue
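
Example #10 builds a token list per user for Doc2Vec; a minimal sketch, assuming the gensim 4.x API and illustrative hyperparameters, of how such per-user token lists are typically wrapped in TaggedDocument and trained:

# Hedged sketch: feeding per-user token lists to gensim's Doc2Vec (gensim 4.x API).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

per_user_tokens = {'user_a': ['machine', 'learning', 'live'],
                   'user_b': ['guitar', 'lesson', 'live']}   # made-up data
documents = [TaggedDocument(words=tokens, tags=[uid])
             for uid, tokens in per_user_tokens.items()]

model = Doc2Vec(vector_size=100, min_count=1, epochs=20)     # illustrative values
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

print(model.dv['user_a'].shape)   # learned vector for that user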
Example #11
# se=list(pd.read_sql('user',engine)['id'])
have = list(getdir('G:/zhihuData/gooduser', pre='.txt'))
# filfunc=lambda x: True if x in se else False
# target=list(filter(filfunc,have))

from functiontool.easyRelation import *
from functiontool.getrelation import getrelation
# new('user','int')
# ea=easyRelation('user')
p = list(enumerate(have, 1))
k = []


def a(i):
    #ea.insert({'id':i[1],'token':i[0],'isprocess':len(getrelation(i[1],get='live',path='G:/zhihuData'))})
    # i is (int_token, user_id); emit one CSV row "user_id,int_token,live_count".
    num = len(getrelation(i[1], get='live', path='G:/zhihuData'))
    k.append(','.join([i[1], str(i[0]), str(num)]))


from functiontool.kit import tranverse
tranverse(p, a)
# ea.end()
from functiontool.templib import temp_stupid
temp_stupid('usermap.csv').update(k)
record.login(
    来源='all data under zhihuData',
    输出='usermap.csv',
    内容='Map each user id to an int for the algorithm; the isprocess field also keeps the number of lives involved, for importing into sqlite',
    设计工具='temolib,tranverse(in kit)')
record.wri('the file mapping user ids to int values is ready')
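
The rows written to usermap.csv have the form user_id,int_token,live_count; a small hedged sketch of reading them back into lookup dicts, assuming temp_stupid(...).update() wrote one row per line:

# Hypothetical reader for usermap.csv rows of the form "user_id,int_token,live_count".
id_to_token, token_to_id = {}, {}
with open('usermap.csv', encoding='utf8') as f:
    for line in f:
        user_id, token, live_count = line.strip().split(',')
        id_to_token[user_id] = int(token)
        token_to_id[int(token)] = user_id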