def main(datadir):
    """Collect per-domain user/record statistics for a cross-domain dataset.

    ``datadir`` is the dataset root (originally passed as ``sys.argv[1]``).
    Results are cached in ``<datadir>/static_res.pk``; on later calls the
    cached pickle is loaded and returned directly.

    Returns a dict with keys "Domain", "User" and "Record".
    """
    cache = os.path.join(datadir, "static_res.pk")
    if os.path.exists(cache):
        return pkload(cache)

    # Domain names are recovered from the "reviews_<domain>_5.json" file
    # names produced by the preprocessing step.
    pattern = re.compile("reviews_(.*?)_5.json")
    jsondir = os.path.join(datadir, "preprocess", "transform")
    domains = pattern.findall("\n".join(os.listdir(jsondir)))
    # Fail with a clear message instead of a bare unpack ValueError when the
    # directory does not contain exactly one source and one target file.
    if len(domains) != 2:
        raise ValueError("expected exactly 2 domain json files in %s, got %r"
                         % (jsondir, domains))
    src, tgt = domains
    cold = os.path.join(datadir, "preprocess", "cold")

    def getUser(domain, overuser=None):
        """Load the pickled cold-start user list for *domain*.

        For domain == "overlap" the raw pickled user list is returned.
        Otherwise *overuser* must be supplied and the return value is the
        triple (user_set, records_by_domain_users, records_by_overlap_users)
        counted over the domain's json file.
        """
        # glob already returns a list; index it directly.
        domainfile = glob.glob(os.path.join(cold, "*%s*.pk" % domain))[0]
        domainuser = pkload(domainfile)
        if domain == "overlap":
            # Overlap users need no per-record counting, and no json lookup.
            return domainuser
        domainjson = glob.glob(os.path.join(jsondir, "*%s*.json" % domain))[0]
        domainuser = set(domainuser)
        overset = set(overuser)
        domainUC = 0   # records written by this domain's cold users
        domainOUC = 0  # records written by overlap users
        for record in readJson(domainjson):
            rid = record["reviewerID"]
            if rid in domainuser:
                domainUC += 1
            if rid in overset:
                domainOUC += 1
        return domainuser, domainUC, domainOUC

    overuser = getUser("overlap")
    srcuser, srcUC, srcOUC = getUser(src, overuser)
    tgtuser, tgtUC, tgtOUC = getUser(tgt, overuser)
    # Same output as the old Python-2-only `print datadir, "done"`, but this
    # form is valid (and identical) under both Python 2 and Python 3.
    print(datadir + " done")
    static_res = {
        "Domain": [src, tgt],
        "User": {
            "overlap": len(overuser),
            src: len(srcuser),
            tgt: len(tgtuser)
        },
        "Record": {
            "srcUC": srcUC,
            "srcOUC": srcOUC,
            "tgtUC": tgtUC,
            "tgtOUC": tgtOUC
        }
    }
    pkdump(static_res, cache)
    return static_res
def getUser(domain, overuser=None):
    """Load the pickled cold-start user list for *domain*.

    NOTE(review): relies on module-level ``cold`` and ``jsondir`` path
    variables that are not defined in this block -- presumably set elsewhere
    in the file; confirm before reuse.

    For domain == "overlap" the raw pickled user list is returned as-is.
    Otherwise *overuser* must be supplied (passing None raises TypeError at
    ``set(overuser)``) and the return value is the triple
    (user_set, records_by_domain_users, records_by_overlap_users)
    counted over the domain's json file.
    """
    # glob already returns a list; no need to wrap it in list() again.
    domainfile = glob.glob(os.path.join(cold, "*%s*.pk" % domain))[0]
    domainuser = pkload(domainfile)
    if domain == "overlap":
        # Overlap users need no per-record counting; return early and skip
        # the (previously wasted) json-directory glob.
        return domainuser
    domainjson = glob.glob(os.path.join(jsondir, "*%s*.json" % domain))[0]
    domainuser = set(domainuser)
    overset = set(overuser)
    domainUC = 0   # records written by this domain's cold users
    domainOUC = 0  # records written by overlap users
    for record in readJson(domainjson):
        # Look the reviewer id up once per record instead of twice.
        rid = record["reviewerID"]
        if rid in domainuser:
            domainUC += 1
        if rid in overset:
            domainOUC += 1
    return domainuser, domainUC, domainOUC
def sentitrain(dir, domain, filter_size, filter_num, embd_size, epoches):
    """Train a SentiRec model for *domain* and dump its output vectors.

    Parameters
    ----------
    dir : str
        Data root containing ``transform/``, ``vocab/`` and receiving
        ``sentiModel/`` and ``sentiRecOutput/``.
    domain : str
        Domain name used to locate the dataset and to namespace logs/models.
    filter_size : str
        Comma-separated CNN filter sizes, e.g. ``"2,3,4"``.
    filter_num, embd_size, epoches : int
        Filter count, embedding size and number of training epochs.

    Side effects: writes TensorBoard summaries under ``log/sentitrain/``,
    checkpoints the best-RMSE model, then restores it and pickles a
    ``{(reviewerID, asin): output_vector}`` dict to
    ``<dir>/sentiRecOutput/<domain>.pk``.
    """
    runConfig = config.configs["DEBUG"](
        "sentitrain_%s_%s_%s" % (domain, filter_size, str(embd_size)))
    runConfig.setConsoleLogLevel("DEBUG")
    logger = runConfig.getLogger()
    gpuConfig = runConfig.getGPUConfig()
    session = tf.Session(config=gpuConfig)

    transPath = os.path.join(dir, "transform")
    data = []
    logger.info(transPath + "/*%s*" % domain)
    for d in glob.glob(transPath + "/*%s*" % domain):
        data.append(Dataset.SentiRecDataset(d))
    if not data:
        logger.error("The data of %s is not in %s", domain, transPath)
        # Was a bare `raise Exception`; keep the type but give it a message.
        raise Exception("The data of %s is not in %s" % (domain, transPath))
    # Only the first matching dataset is used.
    data = data[0]

    vocabPath = os.path.join(dir, "vocab")
    vocab = pkload(os.path.join(vocabPath, "allDomain.pk"))
    vocab_size = len(vocab) + 1  # +1 reserves an index (presumably padding)
    filter_size = [int(i) for i in filter_size.split(",")]
    sentc_len = data.getSentLen()
    sentirec = SentiRec(sentc_len, vocab_size, embd_size, filter_size,
                        filter_num)
    sentirec.initSess(session)

    train_writer = tf.summary.FileWriter('log/sentitrain/%s/train' % domain,
                                         session.graph)
    test_writer = tf.summary.FileWriter('log/sentitrain/%s/test' % domain,
                                        session.graph)

    minMae = 20   # sentinel: any real epoch MAE/RMSE will be smaller
    minRmse = 20
    minEpoch = epoches
    batchSize = 1000
    saver = tf.train.Saver(max_to_keep=1)

    for epoch in range(epoches):
        logger.info("Epoch %d" % epoch)

        @recordTime
        def senticEpoch():
            """Run one training epoch; return (mean_mae, mean_rmse)."""
            mae, rmse = 0, 0
            i = 0
            for batchData in data.getTrainBatch(
                    batchSize, itemgetter("reviewText", "overall")):
                sentcBatch = [d[0] for d in batchData]
                ratingBatch = [d[1] for d in batchData]
                batch = {"sentc_ipt": sentcBatch, "rating": ratingBatch}
                # Per-batch loss is returned but never consumed here.
                _, m, r = sentirec.trainBatch(session, batch)
                mae += m
                rmse += r
                i += 1
            # NOTE(review): ZeroDivisionError if the dataset yields no
            # training batches -- confirm the dataset is never empty here.
            logger.info("minMae is %f, epoch mae is %f" % (minMae, mae / i))
            logger.info("minRmse is %f, epoch rmse is %f"
                        % (minRmse, rmse / i))
            # `batch` still holds the last training batch.
            summary = sentirec.getSummary(session, batch)
            train_writer.add_summary(summary, epoch)
            if epoch % 50 == 0:
                # NOTE(review): testEpoch is a module-level counter not
                # defined in this block -- confirm it is initialised
                # elsewhere in the file.
                global testEpoch
                for testBatch in data.getTestBatch(
                        batchSize, itemgetter("reviewText", "overall")):
                    testSB = [d[0] for d in testBatch]
                    testRB = [d[1] for d in testBatch]
                    batch = {"sentc_ipt": testSB, "rating": testRB}
                    testSummary = sentirec.getSummary(session, batch)
                    test_writer.add_summary(testSummary, testEpoch)
                    testEpoch += 1
            # The original had a second, unreachable return of
            # min((minMae, mae/i)), min((minRmse, rmse/i)) here; removed.
            return mae / i, rmse / i

        mae, rmse = senticEpoch()
        if mae < minMae:
            minMae = mae
        if rmse < minRmse:
            # New best RMSE: remember the epoch and checkpoint the model.
            minRmse = rmse
            minEpoch = epoch
            modelSaveDir = os.path.join(dir, "sentiModel/%s/" % domain)
            if not os.path.exists(modelSaveDir):
                os.makedirs(modelSaveDir)
            saver.save(session,
                       os.path.join(modelSaveDir, "%s-model" % domain),
                       global_step=epoch)

    # Restore the best-RMSE checkpoint before exporting output vectors.
    # NOTE(review): modelSaveDir is only bound once RMSE improves at least
    # once; with the sentinel init of 20 that should happen on epoch 0.
    loader = tf.train.import_meta_graph(
        os.path.join(modelSaveDir, "%s-model-%d.meta" % (domain, minEpoch)))
    loader.restore(session, tf.train.latest_checkpoint(modelSaveDir))

    sentiOutput = {}
    for batchData in data._getBatch(
            data.index, batchSize,
            itemgetter("reviewText", "reviewerID", "asin")):
        sentcBatch = [d[0] for d in batchData]
        reviewerIDAsin = [(d[1], d[2]) for d in batchData]
        outputVec = sentirec.outputVector(session, sentcBatch)
        sentiOutput.update(dict(zip(reviewerIDAsin, outputVec)))
    outputPath = os.path.join(dir, "sentiRecOutput", domain + ".pk")
    pkdump(sentiOutput, outputPath)