def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Augment a judgment file with typo'd copies of each query.

    For up to `rounds` passes over every qid, generate a typo variant of the
    query keywords via butterfingers(). Each previously-unseen typo gets a
    fresh qid and a full copy of the original qid's judgments; the combined
    list (originals + typo copies) is written to judgmentOutFile.

    NOTE(review): relies on module-level project helpers
    judgments_from_file, judgments_by_qid, butterfingers, Judgment and
    judgments_to_file.

    :param judgmentInFile: path of the judgment file to read
    :param judgmentOutFile: path the augmented judgment file is written to
    :param rounds: number of typo-generation passes over all queries
    """
    with open(judgmentInFile) as f:
        currJudgments = list(judgments_from_file(f))
    # New typo queries get qids appended after the highest existing qid;
    # assumes the input file is ordered by qid — TODO confirm.
    lastQid = currJudgments[-1].qid
    judgDict = judgments_by_qid(currJudgments)

    existingTypos = set()  # dedupe: never emit the same typo'd query twice
    for _ in range(rounds):
        for qid, judglist in judgDict.items():
            keywords = judglist[0].keywords
            keywordsWTypo = butterfingers(keywords)
            # Keep only genuine, not-yet-seen typos
            if keywordsWTypo != keywords and keywordsWTypo not in existingTypos:
                lastQid += 1
                newQid = lastQid
                print("%s => %s" % (keywords, keywordsWTypo))
                # Clone every judgment of this query under the new qid,
                # keeping the original grades and doc ids
                for judg in judglist:
                    typoJudg = Judgment(grade=judg.grade,
                                        qid=newQid,
                                        keywords=keywordsWTypo,
                                        doc_id=judg.doc_id)
                    currJudgments.append(typoJudg)
                existingTypos.add(keywordsWTypo)

    with open(judgmentOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=currJudgments)
def train():
    """Run the full LTR training pipeline against Elasticsearch.

    Steps: load the feature set into the ES LTR feature store, read the
    judgment file, log feature values for every judged (query, doc) pair,
    write the judgments-with-features file, then train and upload one
    RankLib model per ranker type.
    """
    # Local import mirrors the module's existing style for judgment helpers.
    from judgments import judgments_from_file, judgments_by_qid

    # Long timeout: feature logging / training queries can be slow.
    es = elastic_connection(timeout=1000)
    # Load features into Elasticsearch's LTR feature store
    init_default_store()
    load_features(FEATURE_SET_NAME)
    # Parse the judgments file into {qid: [Judgment, ...]}
    movieJudgments = judgments_by_qid(
        judgments_from_file(filename=JUDGMENTS_FILE))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to
    # generate a training set, output as "sample_judgments_wfeatures.txt"
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)

    # Train each RankLib model type
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, coord Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
        # 9, Linear Regression
        # NOTE(review): type 5 is trained but missing from the legend above —
        # verify it maps to a valid RankLib ranker before relying on it.
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        # Each trained model is uploaded under its own script name;
        # 'model.txt' is reused as a scratch file between iterations.
        save_model(script_name="gsearch_model_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
auth=ES_AUTH, verify=False) Logger.logger.info(resp.status_code) if resp.status_code >= 300: Logger.logger.error(resp.text) if __name__ == "__main__": from judgments import judgments_from_file, judgments_by_qid es = elastic_connection(timeout=1000) # Load features into Elasticsearch init_default_store() load_features(FEATURE_SET_NAME) # Parse a judgments movieJudgments = judgments_by_qid( judgments_from_file(filename=JUDGMENTS_FILE)) # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set # output as "sample_judgments_wfeatures.txt" log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME) build_features_judgments_file(movieJudgments, filename=JUDGMENTS_FILE_FEATURES) # Train each ranklib model type #for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: for modelType in [6]: # 0, MART # 1, RankNet # 2, RankBoost # 3, AdaRank # 4, coord Ascent # 6, LambdaMART # 7, ListNET
# Add feature back to each judgment features_per_doc = {} for doc in res['hits']['hits']: docId = doc['_id'] features = doc['fields']['_ltrlog'][0]['main'] features_per_doc[docId] = feature_dict_to_list(features) # Append features from ES back to ranklib judgment list for judgment in judgments: try: features = features_per_doc[ judgment.docId] # If KeyError, then we have a judgment but no file in index judgment.features = features except Exception as e: print(e) Logger.logger.info("Missing id %s" % judgment.docId) def build_features_judgments_file(judgments_with_features, filename): with open(filename, 'w') as judgmentFile: for qid, judgmentList in judgments_with_features.items(): for judgment in judgmentList: judgmentFile.write(judgment.to_ranklib_format() + "\n") if __name__ == "__main__": es_connection = elastic_connection() judgmentsByQid = judgments_by_qid(judgments_from_file(JUDGMENTS_FILE)) log_features(es_connection, judgmentsByQid, INDEX_NAME) build_features_judgments_file(judgmentsByQid, JUDGMENTS_FILE_FEATURES)