Exemplo n.º 1
0
def main(datadir):
    """Compute per-domain user/record overlap statistics, caching the result.

    If <datadir>/static_res.pk already exists it is loaded and returned
    unchanged; otherwise the statistics are computed, pickled there, and
    returned.

    Args:
        datadir: root directory containing "preprocess/transform" (the
            reviews_*_5.json files) and "preprocess/cold" (user pickles).

    Returns:
        dict with keys "Domain" (source/target names), "User" (user counts)
        and "Record" (record counts per user group).

    Raises:
        ValueError: if the transform dir does not hold exactly two
            reviews_*_5.json files (one source, one target domain).
    """
    cache_path = os.path.join(datadir, "static_res.pk")
    if os.path.exists(cache_path):
        return pkload(cache_path)

    pattern = re.compile(r"reviews_(.*?)_5.json")
    jsondir = os.path.join(datadir, "preprocess", "transform")
    # Exactly two domain files are expected; fail loudly instead of an
    # opaque unpacking error.
    domains = pattern.findall("\n".join(os.listdir(jsondir)))
    if len(domains) != 2:
        raise ValueError("expected 2 reviews_*_5.json files in %s, found %d"
                         % (jsondir, len(domains)))
    src, tgt = domains
    cold = os.path.join(datadir, "preprocess", "cold")

    def getUser(domain, overuser=None):
        """Load users for *domain*; for non-overlap domains also count
        records authored by a domain user / an overlap user."""
        # First matching pickle in the cold dir holds this domain's users.
        domainfile = glob.glob(os.path.join(cold, "*%s*.pk" % domain))[0]
        domainuser = pkload(domainfile)

        # The overlap "domain" needs no record counting (and no json glob).
        if domain == "overlap":
            return domainuser

        jsonfiles = glob.glob(os.path.join(jsondir, "*%s*.json" % domain))

        domainuser = set(domainuser)
        # Guard: the original crashed with TypeError when overuser was None.
        overset = set(overuser) if overuser is not None else set()
        domainUC = 0   # records written by a user of this domain
        domainOUC = 0  # records written by an overlap user
        for record in readJson(jsonfiles[0]):
            if record["reviewerID"] in domainuser:
                domainUC += 1
            if record["reviewerID"] in overset:
                domainOUC += 1

        return domainuser, domainUC, domainOUC

    overuser = getUser("overlap")
    srcuser, srcUC, srcOUC = getUser(src, overuser)
    tgtuser, tgtUC, tgtOUC = getUser(tgt, overuser)

    # Parenthesized single-argument form is valid in both Python 2 and 3
    # (the original "print x, y" statement was Python-2-only).
    print("%s done" % datadir)
    static_res = {
        "Domain": [src, tgt],
        "User": {
            "overlap": len(overuser),
            src: len(srcuser),
            tgt: len(tgtuser)
        },
        "Record": {
            "srcUC": srcUC,
            "srcOUC": srcOUC,
            "tgtUC": tgtUC,
            "tgtOUC": tgtOUC
        }
    }

    pkdump(static_res, cache_path)
    return static_res
Exemplo n.º 2
0
    def getUser(domain, overuser=None):
        """Load the cached user collection for *domain* from the cold dir.

        For ``domain == "overlap"`` the pickled user collection is returned
        as-is.  For any other domain, returns a tuple
        ``(user_set, domainUC, domainOUC)`` where ``domainUC`` counts json
        records authored by a user of this domain and ``domainOUC`` counts
        records authored by an overlap user (taken from *overuser*).
        """
        # First matching pickle in the cold dir holds this domain's users.
        domainfile = glob.glob(os.path.join(cold, "*%s*.pk" % domain))[0]
        domainuser = pkload(domainfile)

        # The overlap "domain" needs no record counting, so return before
        # globbing the json dir (the original globbed it unconditionally).
        if domain == "overlap":
            return domainuser

        jsonfiles = glob.glob(os.path.join(jsondir, "*%s*.json" % domain))

        domainuser = set(domainuser)
        # Guard: the original crashed with TypeError when overuser was None
        # (its own declared default), making the default unusable.
        overset = set(overuser) if overuser is not None else set()
        domainUC = 0   # records written by a user of this domain
        domainOUC = 0  # records written by an overlap user
        for record in readJson(jsonfiles[0]):
            if record["reviewerID"] in domainuser:
                domainUC += 1
            if record["reviewerID"] in overset:
                domainOUC += 1

        return domainuser, domainUC, domainOUC
Exemplo n.º 3
0
def sentitrain(dir, domain, filter_size, filter_num, embd_size, epoches):
    """Train a SentiRec model for one *domain* and dump its review vectors.

    Args:
        dir: working directory holding "transform" (datasets) and
            "vocab/allDomain.pk"; checkpoints and outputs are written
            beneath it.
        domain: domain name used to locate the dataset files.
        filter_size: comma-separated conv filter sizes, e.g. "3,4,5".
        filter_num: number of filters per size.
        embd_size: word-embedding dimensionality.
        epoches: number of training epochs.

    Side effects: writes TF summaries to log/sentitrain/<domain>/,
    checkpoints to <dir>/sentiModel/<domain>/ and the final
    {(reviewerID, asin): vector} mapping to <dir>/sentiRecOutput/<domain>.pk.
    """
    runConfig = config.configs["DEBUG"]("sentitrain_%s_%s_%s" %
                                        (domain, filter_size, str(embd_size)))
    runConfig.setConsoleLogLevel("DEBUG")
    logger = runConfig.getLogger()
    gpuConfig = runConfig.getGPUConfig()
    session = tf.Session(config=gpuConfig)

    # Locate the transformed dataset for this domain (first match is used).
    transPath = os.path.join(dir, "transform")
    data = []
    logger.info(transPath + "/*%s*" % domain)
    for d in glob.glob(transPath + "/*%s*" % domain):
        data.append(Dataset.SentiRecDataset(d))

    if data == []:
        logger.error("The data of %s is not in %s", domain, transPath)
        # Original raised a bare Exception; keep the type, add a message.
        raise Exception("The data of %s is not in %s" % (domain, transPath))

    data = data[0]
    vocabPath = os.path.join(dir, "vocab")
    vocab = pkload(os.path.join(vocabPath, "allDomain.pk"))
    # +1 presumably reserves an extra id (padding/OOV) -- TODO confirm.
    vocab_size = len(vocab) + 1

    filter_size = [int(i) for i in filter_size.split(",")]

    sentc_len = data.getSentLen()
    sentirec = SentiRec(sentc_len, vocab_size, embd_size, filter_size,
                        filter_num)
    sentirec.initSess(session)

    train_writer = tf.summary.FileWriter('log/sentitrain/%s/train' % domain,
                                         session.graph)
    test_writer = tf.summary.FileWriter('log/sentitrain/%s/test' % domain,
                                        session.graph)
    # Best metrics so far; 20 is a sentinel well above any plausible
    # MAE/RMSE on star ratings.
    minMae = 20
    minRmse = 20
    minEpoch = epoches

    batchSize = 1000

    saver = tf.train.Saver(max_to_keep=1)

    # Hoisted out of the loop so it is always bound when restoring below
    # (the original only assigned it inside the improvement branch).
    modelSaveDir = os.path.join(dir, "sentiModel/%s/" % domain)

    for epoch in range(epoches):
        logger.info("Epoch %d" % epoch)

        @recordTime
        def senticEpoch():
            """Run one training epoch; return (mean MAE, mean RMSE)."""
            loss, mae, rmse = 0, 0, 0
            i = 0
            batch = None
            for batchData in data.getTrainBatch(
                    batchSize, itemgetter("reviewText", "overall")):
                sentcBatch = [d[0] for d in batchData]
                ratingBatch = [d[1] for d in batchData]
                batch = {"sentc_ipt": sentcBatch, "rating": ratingBatch}
                l, m, r = sentirec.trainBatch(session, batch)
                loss += l
                mae += m
                rmse += r
                i += 1
            if i == 0:
                # Avoid ZeroDivisionError / summarising an unbound batch.
                raise Exception("getTrainBatch yielded no batches")
            logger.info("minMae is %f, epoch mae is %f" % (minMae, mae / i))
            logger.info("minRmse is %f, epoch rmse is %f" %
                        (minRmse, rmse / i))
            # Summarise the last training batch of the epoch.
            summary = sentirec.getSummary(session, batch)
            train_writer.add_summary(summary, epoch)
            if epoch % 50 == 0:
                # Module-level counter shared across calls; create it on
                # first use so the first test pass doesn't NameError when
                # no module-level testEpoch exists.
                global testEpoch
                if "testEpoch" not in globals():
                    testEpoch = 0
                for testBatch in data.getTestBatch(
                        batchSize, itemgetter("reviewText", "overall")):
                    testSB = [d[0] for d in testBatch]
                    testRB = [d[1] for d in testBatch]
                    batch = {"sentc_ipt": testSB, "rating": testRB}
                    testSummary = sentirec.getSummary(session, batch)
                    test_writer.add_summary(testSummary, testEpoch)
                    testEpoch += 1
            # NOTE: an unreachable duplicate "return min(...)" that followed
            # this return in the original was removed as dead code.
            return mae / i, rmse / i

        mae, rmse = senticEpoch()
        if mae < minMae:
            minMae = mae
        if rmse < minRmse:
            # Checkpoint whenever epoch RMSE improves on the best so far.
            minRmse = rmse
            minEpoch = epoch
            if not os.path.exists(modelSaveDir):
                os.makedirs(modelSaveDir)

            saver.save(session,
                       os.path.join(modelSaveDir, "%s-model" % domain),
                       global_step=epoch)

    # Restore the best checkpoint, then export a sentiment vector for every
    # record in the dataset.
    loader = tf.train.import_meta_graph(
        os.path.join(modelSaveDir, "%s-model-%d.meta" % (domain, minEpoch)))
    loader.restore(session, tf.train.latest_checkpoint(modelSaveDir))

    sentiOutput = {}
    for batchData in data._getBatch(
            data.index, batchSize,
            itemgetter("reviewText", "reviewerID", "asin")):
        sentcBatch = [d[0] for d in batchData]
        reviewerIDAsin = [(d[1], d[2]) for d in batchData]
        outputVec = sentirec.outputVector(session, sentcBatch)
        sentiOutput.update(dict(zip(reviewerIDAsin, outputVec)))

    outputPath = os.path.join(dir, "sentiRecOutput", domain + ".pk")
    pkdump(sentiOutput, outputPath)