Example #1
def loadJudgments(judgFile):
    currJudgments = []
    existingKws = set()
    lastQid = 0
    try:
        currJudgments = [judg for judg in judgmentsFromFile(judgFile)]
        existingKws = set([judg.keywords for judg in currJudgments])
        judgDict = judgmentsByQid(currJudgments)
        judgProfile = []
        for qid, judglist in judgDict.items():
            judgProfile.append((judglist[0], len(judglist)))
        judgProfile.sort(key=lambda j: j[1], reverse=True)
        for prof in judgProfile:
            print("%s has %s judgments" % (prof[0].keywords, prof[1]))

        lastQid = currJudgments[-1].qid
    except FileNotFoundError:
        pass

    return (currJudgments, existingKws, lastQid)
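A minimal usage sketch for loadJudgments, assuming the judgments helpers (judgmentsFromFile, judgmentsByQid) are importable as in the other examples; the filename is the sample file used elsewhere on this page:

# Resume a rating session: reuse existing judgments if the file exists,
# otherwise start with an empty list and qid 0.
currJudgments, existingKws, lastQid = loadJudgments('sample_judgments.txt')
print("Loaded %d judgments; next qid is %d" % (len(currJudgments), lastQid + 1))

Example #2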
    def train(self):
        # Load features into Elasticsearch
        initDefaultStore()
        loadFeatures(ES_FEATURE_SET_NAME)
        # Parse judgments
        label_file = self.find_label_file()
        print(label_file, file=sys.stderr)
        movieJudgments = judgmentsByQid(
            judgmentsFromFile(filename=label_file))
        # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
        # output as "sample_judgments_wfeatures.txt"
        logFeatures(self.__es, judgmentsByQid=movieJudgments)
        buildFeaturesJudgmentsFile(
            movieJudgments, filename='sample_judgments_wfeatures.txt')
        # Train each ranklib model type
        # for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        modelType = int(ES_MODEL_TYPE)
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, Coordinate Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
        # 9, Linear Regression
        print("*** Training %s " % modelType)
        self.trainModel(judgmentsWithFeaturesFile='sample_judgments_wfeatures.txt',
                        modelOutput='model.txt', whichModel=modelType)
        self.saveModel(scriptName=ES_MODEL_NAME,
                       featureSet='movie_features', modelFname='model.txt')

        with open('/opt/services/flaskapp/src/training_log.txt') as flog:
            log_lines = flog.readlines()

        print(label_file)
        return ('Model trained and deployed to Elasticsearch: \n'
                + ''.join(log_lines[-5:-3])
                + '\nNow test the model')
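trainModel and saveModel are not shown in this example. A minimal sketch of what trainModel might look like if it shells out to the RankLib jar, with -ranker matching the model-type comments above (the jar path and the NDCG@10 metric are assumptions):

import subprocess

def trainModel(judgmentsWithFeaturesFile, modelOutput, whichModel=6):
    # Hypothetical helper: run RankLib from the command line.
    # -ranker selects the algorithm (0=MART, 6=LambdaMART, 8=Random Forests, ...).
    cmd = ['java', '-jar', 'RankLib.jar',      # jar location is an assumption
           '-ranker', str(whichModel),
           '-train', judgmentsWithFeaturesFile,
           '-save', modelOutput,
           '-metric2t', 'NDCG@10']             # training metric is an assumption
    subprocess.run(cmd, check=True)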
Example #3
            parsedJson = formatFeature(ftrId, keywords)
            if 'query' not in parsedJson:
                raise ValueError(
                    "%s.json.jinja should be an ES query with root of {\"query..."
                    % ftrId)
            thisBase['query']['bool']['should'] = parsedJson['query']
            yield thisBase
            ftrId += 1
    except IOError:
        pass


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from elasticsearch import Elasticsearch
    from judgments import judgmentsFromFile, judgmentsByQid
    esUrl = "http://localhost:9200"
    es = Elasticsearch(esUrl)
    judgements = judgmentsByQid(
        judgmentsFromFile(filename='sample_judgements.txt'))
    kwDocFeatures(es, index='tmdb', searchType='movie', judgements=judgements)
    for qid, judgmentList in judgements.items():
        for judgment in judgmentList:
            print(judgment.toRanklibFormat())
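formatFeature is referenced above but not shown. A rough sketch of the idea, assuming each N.json.jinja file holds an Elasticsearch query template with a {{keywords}} placeholder (both the helper body and the sample template are assumptions, not code from the original repository):

import json
from jinja2 import Template

def formatFeature(ftrId, keywords):
    # Render N.json.jinja with the search keywords and parse the result
    # back into a dict so it can be spliced into the logging query.
    with open('%s.json.jinja' % ftrId) as f:
        rendered = Template(f.read()).render(keywords=keywords)
    return json.loads(rendered)

# A 1.json.jinja template might look like (illustrative only):
# {"query": {"match": {"title": "{{keywords}}"}}}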
Example #4
    from judgments import judgmentsFromFile, judgmentsByQid, duplicateJudgmentsByWeight

    config = configparser.ConfigParser()
    config.read('settings.cfg')
    esUrl = config['DEFAULT']['ESHost']
    if len(argv) > 1:
        esUrl = argv[1]

    es = Elasticsearch(esUrl, timeout=1000)

    # Load features into Elasticsearch
    initDefaultStore(esUrl)
    loadFeatures(esUrl)
    # Parse judgments
    movieJudgments = judgmentsByQid(
        judgmentsFromFile(filename=HUMAN_JUDGMENTS))
    movieJudgments = duplicateJudgmentsByWeight(movieJudgments)
    trainJudgments, testJudgments = partitionJudgments(movieJudgments,
                                                       testProportion=0.0)

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)

    buildFeaturesJudgmentsFile(trainJudgments, filename=TRAIN_JUDGMENTS)
    buildFeaturesJudgmentsFile(testJudgments, filename=TEST_JUDGMENTS)

    # Train each ranklib model type
    for modelType in [8, 9, 6]:
        # 0, MART
        # 1, RankNet
Example #5
        draw = random()
        if draw <= testProportion:
            testJudgments[qid] = judgment
        else:
            trainJudgments[qid] = judgment

    return (trainJudgments, testJudgments)


if __name__ == "__main__":
    from elasticsearch import Elasticsearch
    from judgments import judgmentsFromFile, judgmentsByQid, duplicateJudgmentsByWeight
    esUrl = "http://ec2-54-234-184-186.compute-1.amazonaws.com:9616/supersecretsquirrel/"
    es = Elasticsearch(esUrl, timeout=1000)
    # Parse judgments
    judgments = judgmentsByQid(judgmentsFromFile(filename='osc_judgments.txt'))
    judgments = duplicateJudgmentsByWeight(judgments)
    trainJudgments, testJudgments = partitionJudgments(judgments,
                                                       testProportion=0.00)
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "osc_judgments_wfeatures.txt"
    kwDocFeatures(es, index='o19s', searchType='post', judgements=judgments)
    numFeatures = len(judgments[1][0].features)
    print("Training on %s features" % numFeatures)
    buildFeaturesJudgmentsFile(trainJudgments,
                               filename='osc_judgments_wfeatures_train.txt')
    buildFeaturesJudgmentsFile(testJudgments,
                               filename='osc_judgments_wfeatures_test.txt')
    # Train each ranklib model type
    for modelType in [0, 6, 9]:
        # 0, MART
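The drivers on this page call partitionJudgments with testProportion=0.0, which keeps every query in the training set. A sketch of an actual train/test split, reusing the judgments variable and output filenames from this example (the 0.2 proportion is an arbitrary choice):

# Hold out roughly 20% of the queries for evaluation.
trainJudgments, testJudgments = partitionJudgments(judgments, testProportion=0.2)
print("Train queries: %d, test queries: %d" % (len(trainJudgments), len(testJudgments)))
buildFeaturesJudgmentsFile(trainJudgments, filename='osc_judgments_wfeatures_train.txt')
buildFeaturesJudgmentsFile(testJudgments, filename='osc_judgments_wfeatures_test.txt')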
Example #6
        print(resp.status_code)
        if (resp.status_code >= 300):
            print(resp.text)


if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    es = Elasticsearch(timeout=1000)
    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures()
    # Parse judgments
    movieJudgments = judgmentsByQid(
        judgmentsFromFile(filename='search_sample_judgments.txt'))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)
    buildFeaturesJudgmentsFile(
        movieJudgments, filename='search_sample_judgments_wfeatures.txt')
    # Train each ranklib model type
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, Coordinate Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
Example #7
    import configparser
    from elasticsearch import Elasticsearch
    from judgments import judgmentsFromFile, judgmentsByQid, duplicateJudgmentsByWeight

    config = configparser.ConfigParser()
    config.read('settings.cfg')
    esUrl = config['DEFAULT']['ESHost']

    es = Elasticsearch(esUrl, timeout=1000)


    # Load features into Elasticsearch
    initDefaultStore(esUrl)
    loadFeatures(esUrl)
    # Parse judgments
    movieJudgments = judgmentsByQid(judgmentsFromFile(filename=HUMAN_JUDGMENTS))
    movieJudgments = duplicateJudgmentsByWeight(movieJudgments)
    trainJudgments, testJudgments = partitionJudgments(movieJudgments, testProportion=0.0)

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)

    buildFeaturesJudgmentsFile(trainJudgments, filename=TRAIN_JUDGMENTS)
    buildFeaturesJudgmentsFile(testJudgments, filename=TEST_JUDGMENTS)

    # Train each ranklib model type
    for modelType in [8, 9, 6]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
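Example #8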
            docId = doc['_id']
            features = doc['fields']['_ltrlog'][0]['main']
            featuresPerDoc[docId] = featureDictToList(features)

        # Append features from ES back to ranklib judgment list
        for judgment in judgments:
            try:
                # If KeyError, then we have a judgment but no movie in index
                features = featuresPerDoc[judgment.docId]
                judgment.features = features
            except KeyError:
                print("Missing movie %s" % judgment.docId)


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from judgments import judgmentsFromFile, judgmentsByQid
    from elasticsearch import Elasticsearch
    es = Elasticsearch()
    judgments = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
    logFeatures(es, judgments)
    buildFeaturesJudgmentsFile(judgments, "sample_judgments_wfeatures.txt")
Example #9
                ratings.append(judgment)

    return ratings


if __name__ == "__main__":
    """ Usage python rateShit.py esURL ratingsFileName """
    from sys import argv

    judgFile = argv[2]

    currJudgments = []
    existingKws = set()
    lastQid = 0
    try:
        currJudgments = [judg for judg in judgmentsFromFile(judgFile)]
        existingKws = set([judg.keywords for judg in currJudgments])
        lastQid = currJudgments[-1].qid
    except FileNotFoundError:
        pass

    keywords = "-"
    qid = lastQid + 1
    while len(keywords) > 0:
        keywords = input("Enter the Keywords ('GTFO' to exit) ")

        if keywords == "GTFO":
            break

        if keywords in existingKws:
            print("Sorry, we already have ratings for %s. Try again" %
            print(resp.text)


if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    es = Elasticsearch(timeout=1000)
    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures()
    # Parse judgments
    movieJudgments = judgmentsByQid(judgmentsFromFile(filename='sample_judgments.txt'))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)
    buildFeaturesJudgmentsFile(movieJudgments, filename='sample_judgments_wfeatures.txt')
    # Train each ranklib model type
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, Coordinate Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
        # 9, Linear Regression
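Example #11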
        featuresPerDoc = {}
        for doc in res['hits']['hits']:
            docId = doc['_id']
            features = doc['fields']['_ltrlog'][0]['main']
            featuresPerDoc[docId] = featureDictToList(features)

        # Append features from ES back to ranklib judgment list
        for judgment in judgments:
            try:
                features = featuresPerDoc[judgment.docId] # If KeyError, then we have a judgment but no movie in index
                judgment.features = features
            except KeyError:
                print("Missing movie %s" % judgment.docId)


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from judgments import judgmentsFromFile, judgmentsByQid
    from elasticsearch import Elasticsearch
    es = Elasticsearch()
    judgments = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
    logFeatures(es, judgments)
    buildFeaturesJudgmentsFile(judgments, "sample_judgments_wfeatures.txt")

Example #12
if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    judgment_filename = 'rolling500_judgments.txt'
    # judgment_filename = 'implicit_judgements.txt'
    judgment_features_filename = 'rolling500_judgments_wfeatures.txt'
    featureset_name = 'rolling_features_1'

    es = Elasticsearch(timeout=1000)
    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures(featureset_name)
    # Parse judgments
    rollingJudgments = judgmentsByQid(
        judgmentsFromFile(filename=judgment_filename))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=rollingJudgments)
    buildFeaturesJudgmentsFile(rollingJudgments,
                               filename=judgment_features_filename)
    # Train each ranklib model type
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, Coordinate Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
Example #13
        print("REBUILDING TRAINING DATA for %s (%s/%s)" %
              (judgments[0].keywords, idx, len(judgmentsByQid)))
        # Append features from ES back to ranklib judgment list
        for judgment in judgments:
            try:
                # If KeyError, then we have a judgment but no movie in index
                features = featuresPerDoc[judgment.docId]
                judgment.features = features
            except KeyError:
                print("Missing movie %s" % judgment.docId)
        idx += 1


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from judgments import judgmentsFromFile, judgmentsByQid

    solrColl = SolrColl('http://localhost:8983/solr/tmdb/')
    judgments = judgmentsByQid(judgmentsFromFile('movie_judgments.txt'))
    logFeatures(solrColl, judgments)
    buildFeaturesJudgmentsFile(judgments, "sample_judgments_wfeatures.txt")