def livecollect(num):
    """Crawl user info for up to `num` unseen users listed in sample.txt.

    Users already stored in user.txt or recorded in erroruser.txt are
    skipped via a scalable Bloom filter.  Each crawled user is appended
    to user.txt as a JSON line; users whose payload cannot be parsed are
    logged to erroruser.txt.  On any other error an alert mail is sent
    (retried up to 10 times) and the function blocks until the
    connection recovers.
    """
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('user.txt').read()
    error = temp_stupid('erroruser.txt').read()
    # Seed the filter with the ids of already-crawled users ...
    for i in havedone:
        try:
            ii = json.loads(i)['id']
        except (ValueError, KeyError, TypeError):
            # malformed line in user.txt -- skip it
            continue
        bloom.add(ii)
    # ... and with the users that previously failed.
    for i in error:
        bloom.add(i)
    see = temp_stupid('sample.txt').read()
    sc = tempflow('user.txt', 'a')
    seed = [i for i in see if i not in bloom]
    for i in seed[:num]:
        # strip a UTF-8 BOM possibly left over from the file read
        if i.startswith(u'\ufeff'):
            i = i.encode('utf8')[3:].decode('utf8')
        try:
            userdict = tiny_people(getRequest(), i)
            sc.writein([json.dumps(userdict, ensure_ascii=False)])
            print(i)
        except ErrorInJson:
            # remember the bad id so it is skipped on the next run
            temp_stupid('erroruser.txt').save([i])
        except BaseException as result:
            # best-effort alert mail, retried up to 10 times
            for e in range(10):
                try:
                    send(traceback.format_exc() + '\n' + str(result) + '\n in '
                         + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                         __name__ + ' throw ' + result.__class__.__name__,
                         '*****@*****.**')
                    break
                except BaseException as res:
                    print(res)
                    continue
            # Block until the network is usable again.
            # NOTE(review): `cre` is not defined in this function --
            # presumably a module-level request/credential object; confirm
            # (forlive() builds its argument with create_live(i) instead).
            while True:
                if not CanConnect(cre):
                    continue
                else:
                    break
    sc.end()
def getdir(path, fil=None, pre='.txt', to='dir.txt', display=False):
    """List file names under `path`, optionally stripping an extension
    and filtering, then persist the result via temp_stupid.

    path:    directory to scan
    fil:     optional container of names to exclude from the result
    pre:     file-name suffix to strip (default '.txt'); falsy disables
    to:      file the resulting list is written to (default 'dir.txt')
    display: when True, print the final list
    Returns the list of (stripped, filtered) names.
    """
    lis = os.listdir(path)
    if pre:
        # Strip the suffix only at the END of the name; the previous
        # str.replace() also mangled names containing `pre` mid-string
        # (e.g. 'a.txt.bak' -> 'a.bak').
        lis = [x[:-len(pre)] if x.endswith(pre) else x for x in lis]
    if fil:
        lis = [x for x in lis if x not in fil]
    temp_stupid(to).update(lis)
    if display:
        print(lis)
    return lis
def generatecorpus():
    """Build corpus_all.txt: one space-joined relation sequence per user
    found under G:/zhihuData/gooduser."""
    from functiontool.getrelation import getrelation
    from functiontool.templib import temp_stupid
    from functiontool.dirkit import getdir
    from functiontool.kit import tranverse

    user_ids = getdir('G:/zhihuData/gooduser', pre='.txt')
    corpus_lines = []

    def collect(uid):
        # one corpus line == all relations of a single user
        corpus_lines.append(' '.join(getrelation(uid, path='G:/zhihuData')))

    tranverse(user_ids, collect)
    temp_stupid('corpus_all.txt').update(corpus_lines)
def load_corpus(filename, usertxt, exper=None):
    """Load a purchase corpus (see corpus.txt for the line format).

    filename: corpus file; each line is one user's purchased live-id
              sequence, newest purchase first.
    usertxt:  file holding the user_id for each corpus line (required).
    exper:    optional int cap on the number of lines/users loaded --
              handy for quick smoke tests, since loading everything is
              slow; e.g. exper=100 returns only 100 users.

    Returns (sentence_d, all_d, vali, user) -- four parallel lists, so
    index i refers to the same user in all four:
      sentence_d[i]: user i's purchased live ids minus the newest one
                     (the newest is held out for validation/testing)
      all_d:         (live_id, count) pairs over ALL purchases seen,
                     sorted by descending frequency
      vali[i]:       the held-out newest live of user i
      user[i]:       the user id
    Users listed in G:/zhihuX/old/needuser.txt or with fewer than two
    purchases are skipped.
    """
    from functiontool.templib import temp_stupid
    if not usertxt:
        # the filter below needs a user id per line; fail fast with a
        # clear message instead of a NameError deep inside the loop
        raise ValueError('usertxt is required')
    stop = temp_stupid('G:/zhihuX/old/needuser.txt').read()
    with open(filename, 'r') as f:
        lines = f.readlines()
    users = temp_stupid(usertxt).read()
    if exper:
        lines = lines[:exper]
        users = users[:exper]
    sentence_d = []
    all_d = []
    vali = []
    user = []
    ii = 0
    for sentence in lines:
        sen = sentence.split()
        # keep only users with more than one purchase that are not in
        # the stop list
        if len(sen) > 1 and users[ii] not in stop:
            vali.append(sen[0])
            sentence_d.append(sen[1:])
            all_d.extend(sen)
            user.append(users[ii])
        ii += 1
    all_d = collections.Counter(all_d).most_common()
    return sentence_d, all_d, vali, user
def forlive(num):
    """Crawl live info for up to `num` unseen ids listed in need.txt.

    Ids already stored in live.txt or recorded in errorlive.txt are
    skipped via a scalable Bloom filter.  Results are appended to
    live.txt; ids whose payload cannot be parsed are logged to
    errorlive.txt.  On any other error an alert mail is sent (retried
    up to 10 times) and the function blocks until the connection
    recovers.
    """
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('live.txt').read()
    error = temp_stupid('errorlive.txt').read()
    # Seed the filter with the ids of already-crawled lives ...
    for i in havedone:
        try:
            ii = json.loads(i)['id']
        except (ValueError, KeyError, TypeError):
            # malformed line in live.txt -- skip it
            continue
        bloom.add(ii)
    # ... and with the ids that previously failed.
    for i in error:
        bloom.add(i)
    see = temp_stupid('need.txt').read()
    sc = tempflow('live.txt', 'a')
    seed = [i for i in see if i not in bloom]
    for i in seed[:num]:
        # strip a UTF-8 BOM possibly left over from the file read
        if i.startswith(u'\ufeff'):
            i = i.encode('utf8')[3:].decode('utf8')
        try:
            userdict = tiny_live(i, getRequest())
            sc.writein([userdict])
            print(i)
        except ErrorInJson:
            # remember the bad id so it is skipped on the next run
            temp_stupid('errorlive.txt').save([i])
        except BaseException as result:
            # best-effort alert mail, retried up to 10 times
            for e in range(10):
                try:
                    send(traceback.format_exc() + '\n' + str(result) + '\n in '
                         + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                         __name__ + ' throw ' + result.__class__.__name__,
                         '*****@*****.**')
                    break
                except BaseException as res:
                    print(res)
                    continue
            # block until the API is reachable again
            while True:
                if not CanConnect(create_live(i), XHR_HEADER_API):
                    continue
                else:
                    break
    sc.end()
def create_from_file(self, filename):
    """Build and return a ScalableBloomFilter seeded with every line of
    `filename`.  Like the other constructors here, the filter is not
    persisted anywhere."""
    bloom = ScalableBloomFilter(100000000, 0.001)
    for line in temp_stupid(filename).read():
        bloom.add(line)
    return bloom
def getdir(path, fil=None, pre='.txt', to='dir.txt', display=False):
    """Return all file names under a directory.

    path:    directory to scan
    fil:     a list of names to filter out, similar to a .gitignore
    pre:     file-name extension to strip (default '.txt'); falsy disables
    to:      file the resulting list is written to
    display: whether to print the result while running
    """
    lis = os.listdir(path)
    if pre:
        # Strip the extension only at the END of the name; the previous
        # str.replace() also mangled names containing `pre` mid-string
        # (e.g. 'a.txt.bak' -> 'a.bak').
        lis = [x[:-len(pre)] if x.endswith(pre) else x for x in lis]
    if fil:
        lis = [x for x in lis if x not in fil]
    temp_stupid(to).update(lis)
    if display:
        print(lis)
    return lis
def load_corpus_test(numm=40000):
    """Load the test-set corpus (adapted from load_corpus()).

    Written for ad-hoc use, so file names that ought to be parameters
    are hard-coded.  Returns the same 4-tuple as load_corpus():
    (sentence_d, all_d, vali, user).

    numm: maximum number of users/records to return.
    """
    from functiontool.templib import temp_stupid
    # everything in `stop` belongs to the training set and is excluded
    stop = temp_stupid('G:/zhihuX/old/needuser.txt').read() + temp_stupid(
        './sample/dir.txt').read()
    filename = 'corpus_all.txt'
    usertxt = 'dir.txt'
    with open(filename, 'r') as f:
        lines = f.readlines()
    users = temp_stupid(usertxt).read()
    sentence_d = []
    all_d = []
    vali = []
    user = []
    ii = 0
    pp = 0
    for sentence in lines:
        # cap at numm records; the original `pp > numm` comparison let
        # one extra record through (off-by-one)
        if pp >= numm:
            break
        sen = sentence.split()
        # keep only users with more than one purchase that are not in
        # the training stop list
        if len(sen) > 1 and users[ii] not in stop:
            vali.append(sen[0])
            sentence_d.append(sen[1:])
            all_d.extend(sen)
            user.append(users[ii])
            pp += 1
        ii += 1
    all_d = collections.Counter(all_d).most_common()
    return sentence_d, all_d, vali, user
def generatecorpus():
    """Collect the union of relations for every user listed in
    ./sample/dir.txt and return it as a set."""
    from functiontool.getrelation import getrelation
    from functiontool.templib import temp_stupid
    from functiontool.kit import tranverse

    relations = set()

    def gather(uid):
        # fold this user's relations into the shared set
        relations.update(getrelation(uid))

    tranverse(temp_stupid('./sample/dir.txt').read(), gather)
    return relations
import os import gensim # 引入doc2vec from gensim.models import Doc2Vec from load_data import * from functiontool.baseinfo import getlive from functiontool.templib import temp_stupid from functiontool.textpro import * from functiontool.getrelation import * import random documents = [] _, words, validatewords, user = load_corpus('corpus_all.txt', usertxt='dir.txt') _, _, vocabulary = index_item(words) stop = temp_stupid("G:/stopwords-master/stopwords.txt").read() stop = [x.strip() for x in stop] print(stop[:30]) # fil=lambda x:False if x in stop else True for j in user: text = [] for i in getrelation(j, path='G:/zhihuData')[1:]: x = None try: x = getlive(i).description # print(i) x = text2list(gettext(x), stop) # print(x[:10]) text.extend(x) if not x: continue
# se=list(pd.read_sql('user',engine)['id']) have = list(getdir('G:/zhihuData/gooduser', pre='.txt')) # filfunc=lambda x: True if x in se else False # target=list(filter(filfunc,have)) from functiontool.easyRelation import * from functiontool.getrelation import getrelation # new('user','int') # ea=easyRelation('user') p = list(enumerate(have, 1)) k = [] def a(i): #ea.insert({'id':i[1],'token':i[0],'isprocess':len(getrelation(i[1],get='live',path='G:/zhihuData'))}) num = len(getrelation(i[1], get='live', path='G:/zhihuData')) k.append(','.join([i[1], str(i[0]), str(num)])) from functiontool.kit import tranverse tranverse(p, a) # ea.end() from functiontool.templib import temp_stupid temp_stupid('usermap.csv').update(k) record.login( 来源='zhihuData中的所有数据', 输出='usermap.csv', 内容='将user的id映射为int值,以便算法使用,同时在isprocess字段保存了涉及的live数量,是为了导入sqlite', 设计工具='temolib,tranverse(in kit)') record.wri('将user的id映射为int值的文件准备')