def train(feature_set_name: str,
          model_name: str,
          queries_file: str,
          judgments_file: str,
          index_name: str,
          features_file: str,
          model_output: str,
          protected_feature_name="1",
          gamma=1,
          number_of_iterations=3000,
          learning_rate=0.001,
          lambdaa=0.001,
          init_var=0.01,
          standardize=False,
          log=None):
    """
    Train and upload a model with the specified parameters.
    """
    es = elastic_connection(timeout=1000)
    collect_train_data(es, queries_file, judgments_file, feature_set_name,
                       index_name, features_file)
    train_model(features_file, model_output, protected_feature_name, gamma,
                number_of_iterations, learning_rate, lambdaa, init_var,
                standardize, log)

    save_model(model_name, feature_set_name, model_output)
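
# A minimal usage sketch of train() above. All file names, the index name and
# the feature-set name below are hypothetical placeholders, not values taken
# from the original project.
if __name__ == "__main__":
    train(feature_set_name="deltr_features",
          model_name="deltr_model",
          queries_file="queries.csv",
          judgments_file="judgments.csv",
          index_name="documents",
          features_file="features.txt",
          model_output="model.txt")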
Example #2
def train():
    from judgments import judgments_from_file, judgments_by_qid

    es = elastic_connection(timeout=1000)
    # Load features into Elasticsearch
    init_default_store()
    load_features(FEATURE_SET_NAME)
    # Parse the judgments file into a dict keyed by query id
    movieJudgments = judgments_by_qid(
        judgments_from_file(filename=JUDGMENTS_FILE))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)
    # Train each ranklib model type
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, Coordinate Ascent
        # 5, LambdaRank
        # 6, LambdaMART
        # 7, ListNet
        # 8, Random Forests
        # 9, Linear Regression
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        save_model(script_name="gsearch_model_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
def index(index_name, document_dir):
    """
    Index the data
    :param index_name:          Name of the created index
    :param document_dir:        Path to the directory containing the JSON documents to be uploaded. Each JSON must have an "id" field.
    :return:
    """
    es = elastic_connection(timeout=30)
    reindex(es,
            document_list=create_document_list(document_dir),
            index=index_name)
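
# Hypothetical usage of index(); the index name and data directory are
# placeholder values for illustration only.
if __name__ == "__main__":
    index(index_name="documents", document_dir="./data/json_docs")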
def search(index_name, query, model, verbose):
    """
    Performs a search request on Elasticsearch using LTR and a specified (DELTR) model
    :param index_name:      The index to search on
    :param query:           The query to search by
    :param model:           The model to search with
    :param verbose:         Whether the output should also include the logged feature values
    :return:
    """
    es = elastic_connection(timeout=1000)
    results = es.search(index=index_name, body=ltr_query(query, model))
    for result in results['hits']['hits']:
        message = result['_source']['id']
        if verbose:
            features = result['fields']['_ltrlog'][0]['log_entry']
            message += ' ' + ' '.join(
                ['{0}:{1}'.format(ll['name'], ll['value']) for ll in features])
        Logger.logger.info(message)
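
# Hypothetical usage of search(); the index name, query and model name are
# placeholders and assume a DELTR model has already been trained and uploaded.
if __name__ == "__main__":
    search(index_name="documents", query="data scientist", model="deltr_model", verbose=True)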
Example #5
def training_pipeline():
    from utils import elastic_connection
    es = elastic_connection()
    file_judgments = parse_data_and_get_judgement()
    print(file_judgments)
    init_default_store()
    load_features(FEATURE_SET_NAME)
    log_features(es, judgments_dict=file_judgments, search_index=INDEX_NAME)
    build_features_judgments_file(file_judgments,
                                  filename=JUDGMENTS_FILE_FEATURES)

    # for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    for modelType in [6, 7, 9]:
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        save_model(script_name="test_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
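
# Hypothetical entry point for running the pipeline end to end.
if __name__ == "__main__":
    training_pipeline()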
Example #6
        settings['mappings'] = mapping_settings  # apply any explicit mapping settings

    es.indices.delete(index, ignore=[400, 404])
    es.indices.create(index, body=settings)

    elasticsearch.helpers.bulk(es, bulk_docs(movie_dict, index))


def bulk_docs(movie_dict, index):
    Logger.logger.info("Indexing %s documents" % len(movie_dict))
    for movie_id, movie in movie_dict.items():
        # Drop empty release_date values so the date field does not fail to parse
        if 'release_date' in movie and movie['release_date'] == "":
            del movie['release_date']
        enrich(movie)
        add_cmd = {"_index": index,  # bulk action metadata for this movie
                   "_type": "movie",
                   "_id": movie_id,
                   "_source": movie}

        yield add_cmd
        if 'title' in movie:
            Logger.logger.info("%s added to %s" % (movie['title'], index))


if __name__ == "__main__":
    es = elastic_connection(timeout=30)
    with open('tmdb.json', encoding='utf-8') as f:
        tmdb_movie_dict = json.load(f)
    reindex(es, movie_dict=tmdb_movie_dict)
Example #7
        print("I am here")

        resp = requests.post(full_path,
                             data=json.dumps(model_payload),
                             headers=head,
                             auth=ES_AUTH,
                             verify=False)
        Logger.logger.info(resp.status_code)
        if resp.status_code >= 300:
            Logger.logger.error(resp.text)


if __name__ == "__main__":
    from judgments import judgments_from_file, judgments_by_qid

    es = elastic_connection(timeout=1000)
    # Load features into Elasticsearch
    init_default_store()
    load_features(FEATURE_SET_NAME)
    # Parse the judgments file into a dict keyed by query id
    movieJudgments = judgments_by_qid(
        judgments_from_file(filename=JUDGMENTS_FILE))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)
    # Train each ranklib model type
    # for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    for modelType in [6]:
        # 6, LambdaMART
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        save_model(script_name="gsearch_model_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
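Example #8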
        # Add feature back to each judgment
        features_per_doc = {}
        for doc in res['hits']['hits']:
            docId = doc['_id']
            features = doc['fields']['_ltrlog'][0]['main']
            features_per_doc[docId] = feature_dict_to_list(features)

        # Append features from ES back to ranklib judgment list
        for judgment in judgments:
            try:
                # A KeyError here means we have a judgment but no matching document in the index
                features = features_per_doc[judgment.docId]
                judgment.features = features
            except KeyError:
                Logger.logger.info("Missing id %s" % judgment.docId)


def build_features_judgments_file(judgments_with_features, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgments_with_features.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.to_ranklib_format() + "\n")
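
# For reference, to_ranklib_format() is expected to emit one RankLib/SVMrank-style
# line per judgment, roughly "<grade> qid:<query id> 1:<feature 1> 2:<feature 2> ... # <docId>";
# the exact layout depends on the Judgment class, which is not shown in this snippet.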


if __name__ == "__main__":
    es_connection = elastic_connection()
    judgmentsByQid = judgments_by_qid(judgments_from_file(JUDGMENTS_FILE))
    log_features(es_connection, judgmentsByQid, INDEX_NAME)
    build_features_judgments_file(judgmentsByQid, JUDGMENTS_FILE_FEATURES)
Example #9
def generate_gsearch_judgements():
    es_connection = elastic_connection()
    data = _get_json()
    _write_file(es_connection, data)
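Example #10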
from utils import elastic_connection
import pandas as pd
from utils import expert_feedback_index_name

es = elastic_connection()


def get_data():
    res = es.search(index=expert_feedback_index_name, size=20)
    data_df = []

    for doc in res['hits']['hits']:
        row = []
        source = doc['_source']
        row.append(source["doc_id"])
        row.append(source["search_query"])
        row.append(source["grade"])
        data_df.append(row)

    dataframe = pd.DataFrame(data_df, columns=['doc_id', 'search_query', 'grade'])
    return dataframe
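
# Hypothetical usage: pull the graded expert feedback into a DataFrame for inspection.
if __name__ == "__main__":
    feedback_df = get_data()
    print(feedback_df.head())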
Example #11
if __name__ == "__main__":
    """This script set the default roles and users to run the LTR ranklib-demo"""
    if len(sys.argv) == 2:
        password = getpass.getpass()
    elif len(sys.argv) == 3:
        password = sys.argv[2]
    else:
        Logger.logger.info(
            """prepare_xpack.py [elasticsearch.user] [elasticsearch.password]"""
        )
        sys.exit(-1)

    username = sys.argv[1]

    es = elastic_connection(http_auth=(username, password))
    xpack = XPackClient(es)

    Logger.logger.info("Configure ltr_admin role:")
    res = xpack.security.put_role(
        'ltr_admin', {
            "cluster": ["all"],
            "indices": [{
                "names": [".ltrstore*"],
                "privileges": ["all"]
            }]
        })
    Logger.logger.info(res)

    Logger.logger.info("Configure tmdb role:")
    res = xpack.security.put_role(