def getPackageToScore(mask=None, inOrOut=True):
    """Load the package -> score table from Qpackage.QPACKAGE_SCORE_TXT.

    Every record is split on ':'; field 0 becomes the key and field 1 is
    converted with float() and stored as the value.

    mask    -- optional container tested with ``in``; None keeps every record.
    inOrOut -- only used when a mask is given: truthy keeps the keys found in
               the mask, falsy keeps the keys NOT found in the mask.

    Returns a dict {field 0: float(field 1)}.
    """
    packToScore = {}
    f = SepFile(':')
    f.open(Qpackage.QPACKAGE_SCORE_TXT, mode='txt', flag='r')
    try:
        for line in f:
            # Identity test instead of `== None`, so a mask type that
            # overloads equality cannot change the outcome.
            if mask is None or (line[0] in mask) == bool(inOrOut):
                packToScore[line[0]] = float(line[1])
    finally:
        # Release the reader even when a malformed record raises.
        f.close()
    return packToScore
def procInterval(self, week):
    """Sum package open counts per user over the selected sub-directories.

    Walks self.qUserPath and reads every file whose full path matches
    <self.qUserPath>/<one entry of week>/<anything>.gz. Records are split on
    '|': field 0 is the user id, field 1 the package name, field 2 the open
    count. Only packages present in self.qPackageToId are counted.

    Original author's note: the quser data can only be selected in units of
    weeks, not in units of records.

    week -- iterable of strings; they are joined with '|' into a regex
            alternation, so each entry is interpreted as a regex fragment.

    Returns {userId: {packageName: summed open count}}. Every user id that
    was read gets an entry, even if none of its packages was counted.
    """
    userToOpenPackage = {}
    # The base path is a literal directory, so escape it; only the week
    # alternation and the file-name tail are meant to be regex syntax.
    match_dir = re.compile(os.path.join(
        re.escape(self.qUserPath), '(' + '|'.join(week) + ')', r'.*\.gz'))
    for (dirpath, _, files) in os.walk(self.qUserPath):
        for gzfile in files:
            gzfile_dir = os.path.join(dirpath, gzfile)
            if not match_dir.search(gzfile_dir):
                continue
            f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
            try:
                for line in f:
                    # The original repeated the same accumulation in both
                    # the "new user" and "known user" branches.
                    opened = userToOpenPackage.setdefault(line[0], {})
                    if line[1] in self.qPackageToId:
                        opened[line[1]] = opened.get(line[1], 0) + int(line[2])
            finally:
                f.close()
    return userToOpenPackage
def getIdToQuser(mask=None):
    """Load the reverse user table from Quser.QUSER_ID_TXT.

    Records are split on '|'; field 1 becomes the key and field 0 the value.

    mask -- optional container tested with ``in`` against field 1; None keeps
            every record.

    Returns a dict {field 1: field 0}.
    """
    idToQuser = {}
    f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or line[1] in mask:
                idToQuser[line[1]] = line[0]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return idToQuser


# Disabled ad-hoc check kept from the original file:
# if __name__ == '__main__':
#     user = set()
#     f = SepFile(',').open('/root/Downloads/look-alike/data/payQualityUsers/payQualityUsers.txt', 'txt', 'r')
#     for line in f:
#         username = line[0]
#         if len(line[0]) == 0 and len(line[1]) != 0:
#             username = line[1]
#         user.add(username)
#     f.close()
#     print len(user)
#     print 'return'
def procInterval(self, weekInterval):
    """Yield per-user package open counts in batches bounded by user count.

    Walks self.candPath and reads every file whose full path matches
    <self.candPath>/<one entry of weekInterval>/<anything>.gz. Records are
    split on '|'; field 0 keys the outer dict, field 1 keys the inner dict
    and int(field 2) is accumulated. Only inner keys present in
    self.qPackageToId are stored.

    Original author's note: differs from the Quser variant because the 7 days
    do not need to be added up.

    A batch is yielded when a not-yet-seen field-0 value arrives while the
    current batch already holds self.max_record of them; that value then
    starts the next batch. The last (possibly empty) batch is yielded after
    all files were read. A field-0 value whose records straddle a batch
    boundary therefore appears in more than one batch.

    weekInterval -- iterable of strings joined with '|' into a regex
                    alternation (each entry is treated as a regex fragment).
    """
    userToOpenPackage = {}
    num_record = 0
    # Escape the literal base directory; only the alternation and the
    # file-name tail are intended as regex syntax.
    match_dir = re.compile(os.path.join(
        re.escape(self.candPath),
        '(' + '|'.join(weekInterval) + ')', r'.*\.gz'))
    for (dirpath, _, files) in os.walk(self.candPath):
        for gzfile in files:
            gzfile_dir = os.path.join(dirpath, gzfile)
            if not match_dir.search(gzfile_dir):
                continue
            f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
            try:
                for line in f:
                    if line[0] in userToOpenPackage:
                        if line[1] in self.qPackageToId:
                            opened = userToOpenPackage[line[0]]
                            opened[line[1]] = (opened.get(line[1], 0)
                                               + int(line[2]))
                        continue
                    if num_record >= self.max_record:
                        # Batch is full: hand it out and start a new one.
                        yield userToOpenPackage
                        userToOpenPackage = {}
                        num_record = 0
                    userToOpenPackage[line[0]] = {}
                    if line[1] in self.qPackageToId:
                        userToOpenPackage[line[0]][line[1]] = int(line[2])
                    num_record += 1
            finally:
                # Also runs when the consumer abandons the generator early.
                f.close()
    yield userToOpenPackage
def writeQpackageToId():
    """Number the records of the score file and write them as an id table.

    Reads Qpackage.QPACKAGE_SCORE_TXT (split on ':') and writes one line
    '<field 0>|<running index>' per record to Qpackage.QPACKAGE_ID_TXT,
    with the index starting at 0.
    """
    scores = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
    out = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
    for position, record in enumerate(scores):
        out.writeLine(record[0] + '|' + str(position))
    out.close()
    scores.close()
def getQuserToId(mask=None):
    """Load the user table from Quser.QUSER_ID_TXT.

    Records are split on '|'; field 0 becomes the key and field 1 the value.

    mask -- optional container tested with ``in`` against field 0; None keeps
            every record.

    Returns a dict {field 0: field 1}; values stay strings as read.
    """
    qUserToId = {}
    f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or line[0] in mask:
                qUserToId[line[0]] = line[1]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return qUserToId
def getQuserOpenPackage(basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay',
                        beginDay='2016-01-24',
                        interval_='30',
                        isForward='0',
                        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qPackageToId',
                        isDownload=True):
    """Total the open counts per package and publish a package id table.

    Steps, in order:
      1. Optionally copy one folder per day (days come from getDaysGen) from
         basePath into the mission folder.
      2. Read every '.gz' file under the mission folder ('|'-separated) and
         add int(field 2) to the total of field 1, skipping records whose
         field 0 is one of the placeholder imei values below.
      3. Write '<package>|<total>' lines, largest total first, to
         qUserOpenPackage.txt in the mission folder.
      4. Write '<package>|<index>' lines (dict iteration order) to
         Qpackage.QPACKAGE_ID_TXT and copy that file to
         <s3DictBasePath>/qPackageToId.txt.

    Returns the {package: total} dict.
    """
    ignored_users = frozenset((
        'imei=333333333333333',
        'imei=123456789abcdef',
        'imei=111111111111111',
        'imei=012345678912345',
        'imei=000000000000000',
        'imei=00000000000000',
    ))

    conf = MissionConf().setAppName('getQuserOpenPackage')
    context = MissionContext(conf=conf)
    _, appPath = context.getFolder()

    if isDownload:
        for day in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath, day), appPath + os.sep + day,
                          recursived=True)

    openPackage = {}
    for dirpath, _, names in os.walk(appPath):
        print(dirpath)
        for name in names:
            if os.path.splitext(name)[1] != '.gz':
                continue
            reader = SepFile('|')
            reader.open(dirpath + os.sep + name, mode='gzip', flag='rb')
            for record in reader:
                if record[0] in ignored_users:
                    continue
                package = record[1]
                if package in openPackage:
                    openPackage[package] += int(record[2])
                else:
                    openPackage[package] = int(record[2])
            reader.close()

    print('sorting')
    # A stable descending sort keeps tied packages in dict iteration order.
    ranking = sorted(openPackage.items(), key=lambda pair: pair[1],
                     reverse=True)
    print('sorted')

    ranked_out = LineFile()
    ranked_out.open(os.path.join(appPath, 'qUserOpenPackage.txt'),
                    mode='txt', flag='w')
    for package, total in ranking:
        ranked_out.writeLine(package + '|' + str(total))
    ranked_out.close()

    # Original author's note: qUserOpenPackageToOpenTimes could be written to
    # this location, Qpackage.QPACKAGE_ID_TXT.
    id_out = LineFile().open(Qpackage.QPACKAGE_ID_TXT, mode='txt', flag='w')
    for package_id, package in enumerate(openPackage):
        id_out.writeLine(package + '|' + str(package_id))
    id_out.close()

    BashUtil.s3Cp(Qpackage.QPACKAGE_ID_TXT,
                  dst=os.path.join(s3DictBasePath, 'qPackageToId.txt'),
                  recursived=False)
    return openPackage
def getIdToPackage(mask=None):
    """Load the reverse package table from Qpackage.QPACKAGE_ID_TXT.

    Records are split on '|'; field 1 becomes the key and field 0 the value.

    mask -- optional container tested with ``in`` against field 1; None keeps
            every record.

    Returns a dict {field 1: field 0}.
    """
    idToQpackage = {}
    f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or line[1] in mask:
                idToQpackage[line[1]] = line[0]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return idToQpackage
def getIdToCandidate(mask=None):
    """Load the reverse candidate table from Candidate.CANDIDATES_ID_TXT.

    Prints a progress line, then reads '|'-separated records; field 1 becomes
    the key and field 0 the value.

    mask -- optional container tested with ``in`` against field 1; None keeps
            every record.

    Returns a dict {field 1: field 0}.
    """
    print('Info: Loading idToCandidate')
    idToCand = {}
    f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or line[1] in mask:
                idToCand[line[1]] = line[0]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return idToCand
def getQpackageToId(mask=None):
    """Load the package table from Qpackage.QPACKAGE_ID_TXT.

    Prints a progress line, then reads '|'-separated records; field 0 becomes
    the key and field 1 the value.

    mask -- optional container tested with ``in`` against field 0; None keeps
            every record.

    Returns a dict {field 0: field 1}; values stay strings as read.
    """
    print('Info: loading QpackageToId')
    qPackageToId = {}
    f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or line[0] in mask:
                qPackageToId[line[0]] = line[1]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return qPackageToId
def getCandidateToId(mask=None):
    """Load the candidate table, either whole or restricted to a mask.

    Without a mask the complete dict is unpickled from
    Candidate.CANDIDATES_ID_PICKLE. With a mask the text table
    Candidate.CANDIDATES_ID_TXT is read instead ('|'-separated) and only
    records whose field 0 is in the mask are kept, as {field 0: field 1}.

    mask -- optional container tested with ``in`` against field 0.

    Returns the resulting dict.
    """
    print('Info: Loading canditateToId')
    if mask is None:
        # NOTE(review): unpickling executes whatever the file encodes; this is
        # only safe while the pickle is produced by this project itself.
        # The handle is now closed instead of being left to the collector.
        with open(Candidate.CANDIDATES_ID_PICKLE, 'rb') as pickled:
            return pickle.load(pickled)

    candToId = {}
    f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if line[0] in mask:
                candToId[line[0]] = line[1]
    finally:
        f.close()
    return candToId
def getQpackageToOpenTimes(appPath):
    """Collect {field 0: int(field 1)} from every '.gz' file under appPath.

    The original carries the one-word note "idf" on this function. Each file
    name met during the walk is printed, whether or not it is read. Files are
    '|'-separated; when a key occurs more than once the record read last
    wins.
    """
    totals = {}
    for dirpath, _, names in os.walk(appPath):
        for name in names:
            print(name)
            if os.path.splitext(name)[1] != '.gz':
                continue
            source = SepFile('|')
            source.open(os.path.join(dirpath, name), mode='gzip', flag='rb')
            for record in source:
                totals[record[0]] = int(record[1])
            source.close()
    return totals
def getIdToQuser(mask=None, inOrOut=True):
    """Load the reverse user table from Quser.QUSER_ID_TXT.

    Records are split on '|'; field 1 becomes the key and field 0 the value.

    mask    -- optional container tested with ``in`` against field 1; None
               keeps every record.
    inOrOut -- only used when a mask is given: truthy keeps the records whose
               field 1 is in the mask, falsy keeps those whose field 1 is not.

    Returns a dict {field 1: field 0}.
    """
    idToQuser = {}
    f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            if mask is None or (line[1] in mask) == bool(inOrOut):
                idToQuser[line[1]] = line[0]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return idToQuser
def getIdToPackage(mask=None, inOrOut=True):
    """Load the reverse package table from Qpackage.QPACKAGE_ID_TXT.

    Records are split on '|'; field 1 becomes the key and field 0 the value.

    mask    -- optional container tested with ``in`` against field 0 (the
               stored value, not the key); None keeps every record.
    inOrOut -- only used when a mask is given: truthy keeps the records whose
               field 0 is in the mask, falsy keeps those whose field 0 is not.

    Returns a dict {field 1: field 0}.
    """
    idToQpackage = {}
    f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
    try:
        for line in f:
            # NOTE(review): the mask is matched against field 0 although the
            # dict is keyed by field 1; kept as written - confirm that callers
            # really pass a mask of field-0 values.
            if mask is None or (line[0] in mask) == bool(inOrOut):
                idToQpackage[line[1]] = line[0]
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()
    return idToQpackage
def getUserTotalNumber(s3Path, isDownload=True):
    """Count the records of all '.gz' files in the mission folder.

    When isDownload is truthy, s3Path is first copied recursively into the
    mission folder. Every '.gz' file below that folder is then read
    ('|'-separated) and its records are counted.

    Returns the total number of records read.
    """
    conf = MissionConf().setAppName('userTotalNumber')
    context = MissionContext(conf=conf)
    _, workdir = context.getFolder()
    if isDownload:
        BashUtil.s3Cp(s3Path, workdir, recursived=True)

    total = 0
    for dirpath, _, names in os.walk(workdir):
        for name in names:
            if os.path.splitext(name)[1] != '.gz':
                continue
            source = SepFile('|')
            source.open(os.path.join(dirpath, name), mode='gzip', flag='rb')
            total += sum(1 for _record in source)
            source.close()
    return total
def writeQuserToId():
    """Write the distinct user names of the total list as an id table.

    Reads Quser.TOTAL_QUSER_TXT (split on ','). A record's name is taken from
    field 0, or from field 1 when field 0 is empty and field 1 is not, and is
    prefixed with a fixed string. The distinct names are written to
    Quser.QUSER_ID_TXT as '<name>|<index>' in set iteration order.
    """
    names = set()
    source = SepFile(',').open(Quser.TOTAL_QUSER_TXT, 'txt', 'r')
    for record in source:
        use_second = len(record[0]) == 0 and len(record[1]) != 0
        raw = record[1] if use_second else record[0]
        names.add('******' + raw)
    source.close()

    out = LineFile().open(Quser.QUSER_ID_TXT, 'txt', 'w')
    for position, name in enumerate(list(names)):
        out.writeLine(name + '|' + str(position))
    out.close()
def writeQpackageToId(mask=None, inOrOut=True):
    """Number the selected records of the score file into an id table.

    Reads Qpackage.QPACKAGE_SCORE_TXT (split on ':') and writes one line
    '<field 0>|<index>' per selected record to Qpackage.QPACKAGE_ID_TXT.
    The index starts at 0 and only advances for records that are written.

    mask    -- optional container tested with ``in`` against field 0; None
               selects every record.
    inOrOut -- only used when a mask is given: truthy selects the records
               whose field 0 is in the mask, falsy those whose field 0 is not.
    """
    f = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
    try:
        writer = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
        try:
            idx = 0
            for line in f:
                if mask is None or (line[0] in mask) == bool(inOrOut):
                    writer.writeLine(line[0] + '|' + str(idx))
                    idx += 1
        finally:
            # Close the output even if a record raises part-way through.
            writer.close()
    finally:
        f.close()
def writeCandidateToId():
    """Build the candidate id table from the configured part files.

    Collects the distinct field-0 values of every file listed in
    Candidate.PART_FILE_NAME ('|'-separated, opened in gzip mode), numbers
    them in set iteration order and stores the result twice:
      * Candidate.CANDIDATES_ID_TXT    -- lines '<candidate>|<index>'
      * Candidate.CANDIDATES_ID_PICKLE -- pickled {candidate: str(index)}
    """
    print('Info: Writing canditateToId')
    candidate = set()
    for pf in Candidate.PART_FILE_NAME:
        print('Info: processing ' + pf)
        f = SepFile('|').open(pf, 'gzip', 'rb')
        try:
            for line in f:
                candidate.add(line[0])
        finally:
            f.close()
    candidate = list(candidate)

    writer = LineFile()
    writer.open(Candidate.CANDIDATES_ID_TXT, 'txt', 'w')
    candToId = {}
    try:
        for i, name in enumerate(candidate):
            candToId[name] = str(i)
            writer.writeLine(name + '|' + str(i))
    finally:
        writer.close()

    # Drop the list before pickling, as the original did, to lower peak memory.
    del candidate
    gc.collect()
    # The third positional argument of pickle.dump is the protocol; True
    # equals 1. The handle is closed explicitly so the pickle is completely
    # on disk when this function returns.
    with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
        pickle.dump(candToId, out, True)
def writeCandidateToId():
    """Build the candidate id table from the files under Candidate.BASE_PATH.

    Collects the distinct field-0 values of every file whose full path
    matches <Candidate.BASE_PATH>/<anything>.gz ('|'-separated), numbers them
    in set iteration order and stores the result twice:
      * Candidate.CANDIDATES_ID_TXT    -- lines '<candidate>|<index>'
      * Candidate.CANDIDATES_ID_PICKLE -- pickled {candidate: str(index)}
    """
    candidates = set()
    # The base path is a literal directory, so escape it before it becomes
    # part of the regex.
    match_dir = re.compile(
        os.path.join(re.escape(Candidate.BASE_PATH), r'.*\.gz'))
    for (dirpath, _, files) in os.walk(Candidate.BASE_PATH):
        for gzfile in files:
            gzfile_dir = os.path.join(dirpath, gzfile)
            if not match_dir.search(gzfile_dir):
                continue
            # NOTE(review): flag is 'r' here while the other gzip readers in
            # this file pass 'rb'; kept as written - confirm against SepFile.
            f = SepFile('|').open(gzfile_dir, mode='gzip', flag='r')
            try:
                for line in f:
                    candidates.add(line[0])
            finally:
                f.close()
    candidates = list(candidates)

    writer = LineFile().open(Candidate.CANDIDATES_ID_TXT, mode='txt', flag='w')
    candToId = {}
    try:
        for i, name in enumerate(candidates):
            candToId[name] = str(i)
            writer.writeLine(name + '|' + str(i))
    finally:
        writer.close()

    # Drop the list before pickling, as the original did, to lower peak memory.
    del candidates
    gc.collect()
    # The third positional argument of pickle.dump is the protocol; True
    # equals 1. The handle is closed explicitly so the pickle is completely
    # on disk when this function returns.
    with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
        pickle.dump(candToId, out, True)
def procInterval(self, weekInterval):
    """Yield per-user package open counts in batches bounded by user count.

    Walks self.candPath and reads every file whose full path matches
    <self.candPath>/<one entry of weekInterval>/<anything>.gz. Records are
    split on '|'; field 0 keys the outer dict, field 1 keys the inner dict
    and int(field 2) is accumulated. Only inner keys present in
    self.qPackageToId are stored.

    Original author's note: differs from the Quser variant because the 7 days
    do not need to be added up.

    A batch is yielded when a not-yet-seen field-0 value arrives while the
    current batch already holds self.max_record of them; that value then
    starts the next batch. The last (possibly empty) batch is yielded after
    all files were read. A field-0 value whose records straddle a batch
    boundary therefore appears in more than one batch.

    weekInterval -- iterable of strings joined with '|' into a regex
                    alternation (each entry is treated as a regex fragment).
    """
    userToOpenPackage = {}
    num_record = 0
    # Escape the literal base directory; only the alternation and the
    # file-name tail are intended as regex syntax.
    match_dir = re.compile(os.path.join(
        re.escape(self.candPath),
        '(' + '|'.join(weekInterval) + ')', r'.*\.gz'))
    for (dirpath, _, files) in os.walk(self.candPath):
        for gzfile in files:
            gzfile_dir = os.path.join(dirpath, gzfile)
            if not match_dir.search(gzfile_dir):
                continue
            f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
            try:
                for line in f:
                    if line[0] in userToOpenPackage:
                        if line[1] in self.qPackageToId:
                            opened = userToOpenPackage[line[0]]
                            opened[line[1]] = (opened.get(line[1], 0)
                                               + int(line[2]))
                        continue
                    if num_record >= self.max_record:
                        # Batch is full: hand it out and start a new one.
                        yield userToOpenPackage
                        userToOpenPackage = {}
                        num_record = 0
                    userToOpenPackage[line[0]] = {}
                    if line[1] in self.qPackageToId:
                        userToOpenPackage[line[0]][line[1]] = int(line[2])
                    num_record += 1
            finally:
                # Also runs when the consumer abandons the generator early.
                f.close()
    yield userToOpenPackage
def procInterval(self, week):
    """Sum package open counts per user over the selected sub-directories.

    Walks self.qUserPath and reads every file whose full path matches
    <self.qUserPath>/<one entry of week>/<anything>.gz. Records are split on
    '|': field 0 is the user id, field 1 the package name, field 2 the open
    count. Only packages present in self.qPackageToId are counted.

    Original author's note: the quser data can only be selected in units of
    weeks, not in units of records.

    week -- iterable of strings; they are joined with '|' into a regex
            alternation, so each entry is interpreted as a regex fragment.

    Returns {userId: {packageName: summed open count}}. Every user id that
    was read gets an entry, even if none of its packages was counted.
    """
    userToOpenPackage = {}
    # The base path is a literal directory, so escape it; only the week
    # alternation and the file-name tail are meant to be regex syntax.
    match_dir = re.compile(os.path.join(
        re.escape(self.qUserPath), '(' + '|'.join(week) + ')', r'.*\.gz'))
    for (dirpath, _, files) in os.walk(self.qUserPath):
        for gzfile in files:
            gzfile_dir = os.path.join(dirpath, gzfile)
            if not match_dir.search(gzfile_dir):
                continue
            f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
            try:
                for line in f:
                    # The original repeated the same accumulation in both
                    # the "new user" and "known user" branches.
                    opened = userToOpenPackage.setdefault(line[0], {})
                    if line[1] in self.qPackageToId:
                        opened[line[1]] = opened.get(line[1], 0) + int(line[2])
            finally:
                f.close()
    return userToOpenPackage
def writeQuserToId(mask=None, inOrOut=True):
    """Write the selected distinct user names of the total list as an id table.

    Reads Quser.TOTAL_QUSER_TXT (split on ','). A record's name is taken from
    field 0, or from field 1 when field 0 is empty and field 1 is not, and is
    prefixed with a fixed string. The selected distinct names are written to
    Quser.QUSER_ID_TXT as '<name>|<index>' in set iteration order.

    mask    -- optional container tested with ``in`` against the prefixed
               name; None selects every name.
    inOrOut -- only used when a mask is given: truthy selects the names found
               in the mask, falsy the names not found in it.
    """
    qUser = set()
    f = SepFile(',').open(Quser.TOTAL_QUSER_TXT, 'txt', 'r')
    try:
        for line in f:
            username = '******' + line[0]
            if len(line[0]) == 0 and len(line[1]) != 0:
                username = '******' + line[1]
            if mask is None or (username in mask) == bool(inOrOut):
                qUser.add(username)
    finally:
        # Close the reader even if a short record raises IndexError.
        f.close()

    f = LineFile().open(Quser.QUSER_ID_TXT, 'txt', 'w')
    try:
        for i, username in enumerate(qUser):
            f.writeLine(username + '|' + str(i))
    finally:
        f.close()
msc = MissionContext(conf=mconf) [_, appPath] = msc.getFolder() basePath = 's3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay' for theDay in getDaysGen('2016-01-24', 30, 0): BashUtil.s3Cp(basePath+os.sep+theDay, appPath+os.sep+theDay, recursived=True) openPackage = {} mask = {'imei=333333333333333':1, 'imei=123456789abcdef':1, 'imei=111111111111111':1, 'imei=012345678912345':1, 'imei=000000000000000':1, 'imei=00000000000000':1} #mask = {} for (filename, dirs, files) in os.walk(appPath): print filename for gzfile in files: [name, ext] = os.path.splitext(gzfile) if ext == '.gz': f = SepFile('|') f.open(filename+os.sep+gzfile, mode='gzip', flag='rb') for line in f: if line[0] not in mask: if line[1] not in openPackage: openPackage[line[1]] = int(line[2]) else: openPackage[line[1]] += int(line[2]) f.close() openTimes = [] print 'sorting' packs = openPackage.keys() for key in packs: openTimes.append(openPackage[key]) index = sorted(range(len(openTimes)), key=lambda k: openTimes[k], reverse=True) print 'sorted'
openPackage = {} mask = { 'imei=333333333333333': 1, 'imei=123456789abcdef': 1, 'imei=111111111111111': 1, 'imei=012345678912345': 1, 'imei=000000000000000': 1, 'imei=00000000000000': 1 } #mask = {} for (filename, dirs, files) in os.walk(appPath): print filename for gzfile in files: [name, ext] = os.path.splitext(gzfile) if ext == '.gz': f = SepFile('|') f.open(filename + os.sep + gzfile, mode='gzip', flag='rb') for line in f: if line[0] not in mask: if line[1] not in openPackage: openPackage[line[1]] = int(line[2]) else: openPackage[line[1]] += int(line[2]) f.close() openTimes = [] print 'sorting' packs = openPackage.keys() for key in packs: openTimes.append(openPackage[key]) index = sorted(range(len(openTimes)), key=lambda k: openTimes[k],
def getQuserOpenPackage(
        basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay',
        beginDay='2016-01-24',
        interval_='30',
        isForward='0',
        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qUserOpenPackage',
        isDownload=True):
    """Total the open counts per package and upload the ranked list.

    Steps, in order:
      1. Optionally copy one folder per day (days come from getDaysGen) from
         basePath into the mission folder.
      2. Read every '.gz' file under the mission folder ('|'-separated) and
         add int(field 2) to the total of field 1, skipping records whose
         field 0 is one of the placeholder imei values below.
      3. Write '<package>|<total>' lines, largest total first, to
         qUserOpenPackage.txt in the mission folder and copy that file to
         <s3DictBasePath>/qUserOpenPackage.txt.

    Returns the {package: total} dict.
    """
    ignored_users = frozenset((
        'imei=333333333333333',
        'imei=123456789abcdef',
        'imei=111111111111111',
        'imei=012345678912345',
        'imei=000000000000000',
        'imei=00000000000000',
    ))

    conf = MissionConf().setAppName('getQuserOpenPackage')
    context = MissionContext(conf=conf)
    _, appPath = context.getFolder()

    if isDownload:
        for day in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath, day), appPath + os.sep + day,
                          recursived=True)

    openPackage = {}
    for dirpath, _, names in os.walk(appPath):
        print(dirpath)
        for name in names:
            if os.path.splitext(name)[1] != '.gz':
                continue
            reader = SepFile('|')
            reader.open(dirpath + os.sep + name, mode='gzip', flag='rb')
            for record in reader:
                if record[0] in ignored_users:
                    continue
                package = record[1]
                if package in openPackage:
                    openPackage[package] += int(record[2])
                else:
                    openPackage[package] = int(record[2])
            reader.close()

    print('sorting')
    # A stable descending sort keeps tied packages in dict iteration order.
    ranking = sorted(openPackage.items(), key=lambda pair: pair[1],
                     reverse=True)
    print('sorted')

    ranked_path = os.path.join(appPath, 'qUserOpenPackage.txt')
    ranked_out = LineFile()
    ranked_out.open(ranked_path, mode='txt', flag='w')
    for package, total in ranking:
        ranked_out.writeLine(package + '|' + str(total))
    ranked_out.close()

    BashUtil.s3Cp(ranked_path,
                  dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'),
                  recursived=False)
    return openPackage
#
# from com.um.ykang.mission.MissionConf import MissionConf
# from com.um.ykang.mission.MissionContext import MissionContext
#
# mconf = MissionConf().setAppName('test')
# (msc, app) = MissionContext(conf=mconf).getFolder()
# msc.getSample('s3://datamining.ym/dmuser/ykang/results/test2/part-00000.gz', 20).getEmrFile()
from com.um.ykang.data.format.File import SepFile
import os

# writer = open('/root/test.txt', 'w')
# writer.writelines('123')
# writer.close()

# Copy fields 2 and 3 of every tab-separated input record into a two-column,
# tab-separated label file; field 3 is stripped of surrounding whitespace.
f = SepFile('\t').open(
    '/home/bk25103378/dataExchangeYiGuan/overlap_full_label.txt', 'txt', 'r')
try:
    with open('/home/bk25103378/dataExchangeYiGuan/label.txt', 'w') as writer:
        for line in f:
            # '\n' instead of os.linesep: a file opened in text mode already
            # translates the newline to the platform convention.
            writer.write(line[2] + '\t' + line[3].strip() + '\n')
finally:
    # The original never closed the reader.
    f.close()