def __init__(self, docs_dictionary, main_dictionary, avdl, stem_suffix, ip, city_dictionary):
    self.main_dictionary = main_dictionary
    self.__ranker = Ranker(docs_dictionary, main_dictionary, avdl, len(docs_dictionary), stem_suffix, ip)
    self.__list_of_cities = []
    self.__city_dictionary = city_dictionary
def search_query(self):
    # given a query, search for relevant documents and rank them
    query_terms = self.__string_to_terms(self.query)
    # if there is a query description, treat it as part of the query
    query_desc_terms = self.__string_to_terms(self.query_desc) if self.query_desc is not None else []
    relevant_documents = self.__retrieve_relevant_documents(query_terms + query_desc_terms)
    ranker = Ranker(relevant_documents.values(), query_terms, query_desc_terms)
    return ranker.calculate_ranked_documents()
def start(self):
    # DatabaseQueries.createTables()
    self.modelStore = ModelStore()  # "database" of models
    self.userAnalyzer = UserAnalyzer()  # classify user type: anonymous, registered new, or registered old
    self.trainingCenter = TrainingCenter(self.modelStore)
    self.ranker = Ranker()  # just rank the recommended items
    # on start, train the models once so the home page immediately has recommendations
    self.trainingCenter.trainModel()  # NOTE: models must be trained once for the welcome page
    self.recEngine = RecEngine(self.userAnalyzer, self.modelStore, DatabaseQueries.getNumRatingsPerUser())
def __init__(self, corpus_path, posting_path, terms_dict, cities_dict, docs_dict,
             avg_doc_length, with_stemming, with_semantics):
    self.terms_dict = terms_dict
    self.cities_dict = cities_dict
    self.docs_dict = docs_dict
    self.parser = Parse(corpus_path)  # corpus path for stop words
    self.parser.to_stem = with_stemming
    self.posting_path = posting_path
    self.ranker = Ranker(avg_doc_length)
    self.model = None
    self.with_semantics = with_semantics
    self.with_stemming = with_stemming
def start(self):
    # each object here simulates the API calls through the network;
    # passing an object A to the constructor of B means A will communicate with B
    self.db.startEngine()
    self.ranker = Ranker(self.numberToServe, self.db)
    self.user_analyzer = UserAnalyzer()
    self.model_store = ModelStore()
    self.online_learner = OnlineLearner(self.db, self.model_store)
    self.offline_learner = OfflineLearner(self.db, self.model_store)
    self.increment()
    self.rec_engine = RecEngine(
        self.user_analyzer, self.model_store,
        self.db.connTable[DatabaseInterface.USER_ACTIVITY_KEY])
def start(self):
    self.db.startEngine()
    self.ranker = Ranker(self.numberToServe, self.db)
    self.userAnalyzer = UserAnalyzer()
    self.modelStore = ModelStore()
    self.offlineLearner = OfflineLearner(self.db, self.modelStore)
    self.onlineLearner = OnlineLearner(self.db, self.modelStore)
    # train here so that immediately after start() we can give recommendations
    self.offlineLearner.trainModel()
    self.recEngine = RecEngine(
        self.userAnalyzer, self.modelStore,
        self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
def start(self):
    # each object here simulates the API calls through the network;
    # passing an object A to the constructor of B means A will communicate with B
    self.db.startEngine()
    self.ranker = Ranker(self.numberToServe, self.db)
    self.userAnalyzer = UserAnalyzer()
    self.modelStore = ModelStore()
    self.offlineLearner = OfflineLearner(self.db, self.modelStore)
    self.onlineLearner = OnlineLearner(self.db, self.modelStore)
    # when we start the webserver, let the offline learner train the models,
    # so that after start() we can begin giving recommendations
    self.offlineLearner.trainModel()
    self.recEngine = RecEngine(
        self.userAnalyzer, self.modelStore,
        self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
def generateTestFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    #------------------------------------------------
    # Load pickle for label
    picklefile = DIR['DATA'] + 'test-labels-pickle'
    global test_labels
    with open(picklefile, 'rb') as pfile:
        test_labels = pickle.load(pfile)
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    test_sents, sent_indices = getRankedSent(doc, fcode)
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(test_sents, sent_indices, sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = test_data[key]['reallbl']
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        test_data[key]['depparse'] = getTree(tree)
        test_data[key]['features'] = feature_string
        writeToFile(featurefile, feature_string + '\n', 'a')
def get_pos_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    #sentences, o = doc.all_sentences()
    #ranker = Ranker(sentences, tfidf=False)
    #-----------------------------------------
    # Instead of the above, now sentences will be clubbed into sections and
    # passed to the ranker, which is to be returned
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    sent, offset = doc.section_sentences('abstract')
    sent_idx = range(offset, offset + len(sent))
    samples = '\n'.join(sent)
    writeToFile(outfile, samples, 'w')
    #return ranker, sent_idx
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Positive---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
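# Minimal, self-contained sketch of the "club sentences into sections" pattern used in
# several snippets above, with a plain dict standing in for doc.document. The
# {section: {index: sentence}} structure is an assumption inferred from the loops;
# the real Document class is not shown here.
document = {
    'abstract': {0: 'We propose a ranking method. ', 1: 'It scores sections by TF-IDF. '},
    'introduction': {2: 'Prior work ranks single sentences. ', 3: 'We rank whole sections instead. '},
}
sections = []
for sec, block in document.items():
    sentences = ''
    for key in sorted(block.keys()):  # keep the sentences in their original order
        sentences += str(block[key])
    sections.append(sentences)
print(sections)  # one concatenated string per section, ready for Ranker(sections)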
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        # each object here simulates the API calls through the network;
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        # when we start the webserver, let the offline learner train the models,
        # so that after start() we can begin giving recommendations
        self.offlineLearner.trainModel()
        self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                   self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        # taking the action from users
        self.onlineLearner.trainModel(action)
        # analyze the action type, and save the registered user's action
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.log.info("Recording action %s" % action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the IDs of the recommended items
        assert (isinstance(request, Request))
        # provide recommendations to the user
        self.log.info("responding to request: %s" % request)
        recommendations = self.recEngine.provideRecommendation(request)
        recsReranked = self.ranker.rerank(recommendations)
        return recsReranked  # a list of item ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        recsReranked = self.provideRecommendation(request)
        # for the purpose of testing, we sort the index and output item names;
        # output is ordered by the id value
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        # each object here simulates the API calls through the network;
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.user_analyzer = UserAnalyzer()
        self.model_store = ModelStore()
        self.online_learner = OnlineLearner(self.db, self.model_store)
        self.offline_learner = OfflineLearner(self.db, self.model_store)
        self.increment()
        self.rec_engine = RecEngine(
            self.user_analyzer, self.model_store,
            self.db.connTable[DatabaseInterface.USER_ACTIVITY_KEY])

    def getAction(self, action):
        assert (isinstance(action, Action))
        # analyze user type
        user_type = self.user_analyzer.analyzeAction(action)
        self.online_learner.trainModel(action)
        if user_type == "registered":
            self.log.info("Recording action %s", action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the IDs of the recommended items
        assert (isinstance(request, Request))
        recommendations = self.rec_engine.provideRecommendation(request)
        item_ids = self.ranker.rerank(recommendations)
        return item_ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        item_ids = self.provideRecommendation(request)
        return self.getFromInventory(item_ids).sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.model_store.cleanOnlineModel()
        self.offline_learner.trainModel()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    # configMap is in main
    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)
        # extract() needs a key; not sure why it isn't a direct string

    # initialize everything
    def start(self):
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        # train here so that immediately after we start, we can give recommendations
        self.offlineLearner.trainModel()
        self.recEngine = RecEngine(
            self.userAnalyzer, self.modelStore,
            self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        self.onlineLearner.trainModel(action)
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.db.putAction(action)

    def provideRec(self, request):
        assert (isinstance(request, Request))
        rec = self.recEngine.provideRec(request)
        recReRanked = self.ranker.rerank(rec)
        return recReRanked

    def renderRec(self, request):
        assert (isinstance(request, Request))
        recReRanked = self.provideRec(request)
        return self.db.extract(
            DatabaseInterface.INVENTORY_KEY).loc[recReRanked].sort_index()

    def increment(self):
        # offline, online, recengine (find the new most popular one)
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml',
                        infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
def start(self):
    # each object here simulates the API calls through the network;
    # passing an object A to the constructor of B means A will communicate with B
    self.db.startEngine()
    self.ranker = Ranker(self.numberToServe, self.db)
    self.userAnalyzer = UserAnalyzer()
    self.modelStore = ModelStore()
    self.offlineLearner = OfflineLearner(self.db, self.modelStore)
    self.onlineLearner = OnlineLearner(self.db, self.modelStore)
    # when we start the webserver, we should let the offline learner train the models,
    # so that after start() we can begin giving recommendations
    self.offlineLearner.trainModel()
    self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                               self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
def main(self, args=None):
    """Main function for training the bot."""
    print('Welcome to DeepRank!')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # Initialization, hyperparameters
    self.args = self.parseArgs(args)
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()
    self.loadHyperParams()

    # Load the training and evaluation data
    self.textData = TextData(self.args)
    self.evalData = RankTextData(self.args)

    # Build the model
    graph = tf.Graph()
    with tf.device(self.getDevice()):
        with graph.as_default():
            with tf.name_scope('training'):
                self.model_train = Ranker(self.args, is_training=True)
            tf.get_variable_scope().reuse_variables()
            with tf.name_scope('validation'):
                self.model_valid = Ranker(self.args, is_training=False)
            with tf.name_scope('evaluation'):
                self.model_test = Ranker(self.args, is_training=False)
            self.ckpt_model_saver = tf.train.Saver(name='checkpoint_model_saver')
            self.best_model_saver = tf.train.Saver(name='best_model_saver')

            # Running session
            # allow_soft_placement=True: if a GPU is requested but not available,
            # allow running on another device.
            self.sess = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False))
            print('Initialize variables...')
            self.sess.run(tf.global_variables_initializer())

            # Define savers/summaries
            graph_info = self.sess.graph
            self.train_writer = tf.summary.FileWriter(
                os.path.join(self.modelDir, 'train/'), graph_info)
            self.valid_writer = tf.summary.FileWriter(
                os.path.join(self.modelDir, 'valid/'), graph_info)

            """
            # Initialize the bot's word embeddings with pretrained word2vec vectors
            if self.args.initEmbeddings:
                self.loadEmbedding(self.sess)
            """

            # Start training
            self.mainTrain(self.sess)
def get_neg_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 5
    x = -1
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_idx.append(idx)
            samples += doc[idx].sentence.encode('utf-8') + '\n'
            num -= 1
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
def __init__(self, projectRoot):
    Ranker.__init__(self, projectRoot)
def __init__(self, depth=1):
    self.depth = depth
    self.ranker = Ranker()
    self.buffer = []
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '+1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '-1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
class Rankbot:
    """
    Retrieval-based chatbot
    """

    def __init__(self):
        self.args = None
        self.textData = None
        self.model = None
        self.modelDir = ''
        self.globStep = 0
        self.ckpt_model_saver = None
        self.best_model_saver = None
        self.best_model = []
        self.best_valid_loss = [float('inf'), float('inf'), float('inf')]
        self.sess = None
        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.BEST_MODEL_NAME_BASE = 'best_model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'

    @staticmethod
    def parseArgs(args):
        """
        Parse the hyperparameters.
        Args:
            args (list<str>): List of arguments.
        """
        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--keepAll', action='store_true',
                                help='if True, keep all intermediate checkpoints')
        globalArgs.add_argument('--modelTag', type=str, default=None,
                                help='tag for the model, to identify, distinguish and manage different models later')
        globalArgs.add_argument('--rootDir', type=str, default=None,
                                help='root directory for saving the model and the data')
        globalArgs.add_argument('--device', type=str, default=None,
                                help='\'gpu\' or \'cpu\', the device to run the computation on')
        globalArgs.add_argument('--seed', type=int, default=None,
                                help='random seed, for reproducible experiments')

        # Data options
        datasetArgs = parser.add_argument_group('Data processing options')
        datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(),
                                 default=TextData.corpusChoices()[0],
                                 help='corpus to use.')
        datasetArgs.add_argument('--datasetTag', type=str, default='',
                                 help='tag for the dataset, for versioning the data; e.g. one data file '
                                      'built with 20000 words and another with 40000 words')
        datasetArgs.add_argument('--maxLength', type=int, default=10,
                                 help='maximum length of the input/question and output/answer sentences, '
                                      'i.e. the maximum length of the RNN')
        datasetArgs.add_argument('--filterVocab', type=int, default=1,
                                 help='drop words with frequency <= filterVocab; set to 0 to keep all words')
        datasetArgs.add_argument('--skipLines', action='store_true',
                                 help='if True, only lines [2*i, 2*i+1] of the dialogue are used as data samples; '
                                      'otherwise [2*i, 2*i+1] can also be samples, so every line of the dialogue '
                                      'except the first and the last appears in two samples')
        datasetArgs.add_argument('--vocabularySize', type=int, default=20000,
                                 help='upper bound on the vocabulary size (0 means no limit)')
        datasetArgs.add_argument('--train_frac', type=float, default=0.8,
                                 help='percentage of training samples')
        datasetArgs.add_argument('--valid_frac', type=float, default=0.1,
                                 help='percentage of validation samples')

        # Network architecture options
        nnArgs = parser.add_argument_group('Network architecture options')
        nnArgs.add_argument('--hiddenSize', type=int, default=256,
                            help='state dimension of each RNN cell')
        nnArgs.add_argument('--numLayers', type=int, default=2,
                            help='number of RNN cell layers per timestep')
        nnArgs.add_argument('--initEmbeddings', action='store_true',
                            help='if True, initialize the word embeddings with public word2vec vectors')
        nnArgs.add_argument('--embeddingSize', type=int, default=256,
                            help='word embedding dimension')
        nnArgs.add_argument('--embeddingSource', type=str,
                            default="GoogleNews-vectors-negative300.bin",
                            help='word2vec file used to initialize the embeddings')

        # Training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs', type=int, default=25,
                                  help='number of epochs to train')
        trainingArgs.add_argument('--saveEvery', type=int, default=5000,
                                  help='number of mini-batches between checkpoints')
        trainingArgs.add_argument('--batchSize', type=int, default=32,
                                  help='number of samples per mini-batch')
        trainingArgs.add_argument('--learningRate', type=float, default=0.002,
                                  help='Learning rate')
        trainingArgs.add_argument('--dropout', type=float, default=0.9,
                                  help='Dropout rate (here the keep probability after dropout, keep_prob)')

        return parser.parse_args(args)

    def main(self, args=None):
        """Main function for training the bot."""
        print('Welcome to DeepRank!')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # Initialization, hyperparameters
        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()
        self.loadHyperParams()

        # Load the training and evaluation data
        self.textData = TextData(self.args)
        self.evalData = RankTextData(self.args)

        # Build the model
        graph = tf.Graph()
        with tf.device(self.getDevice()):
            with graph.as_default():
                with tf.name_scope('training'):
                    self.model_train = Ranker(self.args, is_training=True)
                tf.get_variable_scope().reuse_variables()
                with tf.name_scope('validation'):
                    self.model_valid = Ranker(self.args, is_training=False)
                with tf.name_scope('evaluation'):
                    self.model_test = Ranker(self.args, is_training=False)
                self.ckpt_model_saver = tf.train.Saver(name='checkpoint_model_saver')
                self.best_model_saver = tf.train.Saver(name='best_model_saver')

                # Running session
                # allow_soft_placement=True: if a GPU is requested but not available,
                # allow running on another device.
                self.sess = tf.Session(config=tf.ConfigProto(
                    allow_soft_placement=True,
                    log_device_placement=False))
                print('Initialize variables...')
                self.sess.run(tf.global_variables_initializer())

                # Define savers/summaries
                graph_info = self.sess.graph
                self.train_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'train/'), graph_info)
                self.valid_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'valid/'), graph_info)

                """
                # Initialize the bot's word embeddings with pretrained word2vec vectors
                if self.args.initEmbeddings:
                    self.loadEmbedding(self.sess)
                """

                # Start training
                self.mainTrain(self.sess)

    def mainTrain(self, sess):
        """
        Train the model.
        Args:
            sess: the current tf session
        """
        print('Start training the model (press Ctrl+C to save and exit training)...')
        try:
            batches_valid = self.evalData.getValidBatches()
            batches_test = self.evalData.getTestBatches()
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(
                    e + 1, self.args.numEpochs, self.args.learningRate))
                batches = self.textData.getBatches()
                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model_train.step(nextBatch)
                    assert len(ops) == 3  # (training, loss)
                    _, loss, train_summaries = sess.run(ops, feedDict)
                    self.globStep += 1

                    # Log the training state (loss on the training data)
                    if self.globStep % 100 == 0:
                        tqdm.write("----- Step %d -- CE Loss %.2f" % (self.globStep, loss))

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self.train_writer.add_summary(train_summaries, self.globStep)
                        self.train_writer.flush()

                        # validation pass
                        print('Evaluating on validation data ...')
                        self.valid_losses = [0, 0, 0]
                        for nextEvalBatch in tqdm(batches_valid, desc="Validation"):
                            ops, feedDict = self.model_valid.step(nextEvalBatch)
                            assert len(ops) == 2
                            loss, eval_summaries = sess.run(ops, feedDict)
                            for i in range(3):
                                self.valid_losses[i] += loss[i]
                        self.valid_writer.add_summary(eval_summaries, self.globStep)
                        self.valid_writer.flush()
                        for i in range(3):
                            self.valid_losses[i] = self.valid_losses[i] / len(batches_valid)
                        print('validation, Recall_20@(1,3,5) = %s' % self.valid_losses)
                        time.sleep(5)
                        if (len(self.best_model) == 0) or (self.valid_losses[0] > self.best_valid_loss[0]):
                            print('best_model updated, with best accuracy :%s' % self.valid_losses)
                            self.best_valid_loss = self.valid_losses[:]
                            self._saveBestSession(sess)
                        self._saveCkptSession(sess)
                toc = datetime.datetime.now()
                print("Epoch %d finished in %s seconds" % (e, toc - tic))

            # After training, run once over the test data
            self.best_model_saver.restore(sess, self.best_model)
            self.test_losses = [0, 0, 0]
            for nextTestBatch in tqdm(batches_test, desc="FinalTest"):
                ops, feedDict = self.model_test.step(nextTestBatch)
                assert len(ops) == 2
                loss, _ = sess.run(ops, feedDict)
                for i in range(3):
                    self.test_losses[i] += loss[i] / len(batches_test)
            print('Final testing, Recall_20@(1,3,5) = %s' % self.test_losses)
        except (KeyboardInterrupt, SystemExit):
            # The user pressed Ctrl+C during the run to stop training
            print('Interruption detected, exiting the program...')

        self._saveCkptSession(sess)  # Ultimate saving before complete exit

    def _saveCkptSession(self, sess):
        """
        Save the model parameters.
        Args:
            sess: the current tf session
        """
        tqdm.write('Saving checkpoint (don\'t stop the run)...')
        tqdm.write('validation, Recall_20@(1,3,5) = ' + repr(self.valid_losses))
        self.saveHyperParams()

        # Filename for the checkpoint
        model_name = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:
            model_name += '-' + str(self.globStep)
        model_name = model_name + self.MODEL_EXT
        self.ckpt_model_saver.save(sess, model_name)
        tqdm.write('Checkpoint saved.')

    def _saveBestSession(self, sess):
        """
        Save the model parameters.
        Args:
            sess: the current tf session
        """
        tqdm.write('Saving new best model (don\'t stop the run)...')
        self.saveHyperParams()

        # Filename for the best model
        model_name = os.path.join(self.modelDir, self.BEST_MODEL_NAME_BASE)
        model_name = model_name + self.MODEL_EXT
        self.best_model = self.best_model_saver.save(sess, model_name)
        tqdm.write('Best Model saved.')

    def loadHyperParams(self):
        """
        Load the hyperparameters associated with the current model.
        """
        # Current model location (model path)
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            print("modelTag=%s" % self.args.modelTag)
            self.modelDir += '-' + self.args.modelTag
        print("modelDir=%s" % self.modelDir)

        # If a config file exists, use some of the hyperparameters stored in it
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if os.path.exists(configName):
            config = configparser.ConfigParser()
            config.read(configName)

            # Restore the hyperparameters
            self.globStep = config['General'].getint('globStep')
            self.args.corpus = config['General'].get('corpus')
            self.args.datasetTag = config['Dataset'].get('datasetTag')
            self.args.maxLength = config['Dataset'].getint('maxLength')
            self.args.filterVocab = config['Dataset'].getint('filterVocab')
            self.args.skipLines = config['Dataset'].getboolean('skipLines')
            self.args.vocabularySize = config['Dataset'].getint('vocabularySize')
            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.initEmbeddings = config['Network'].getboolean('initEmbeddings')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.embeddingSource = config['Network'].get('embeddingSource')

    def saveHyperParams(self):
        """
        Save the model's hyperparameters, for easier model management.
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['globStep'] = str(self.globStep)
        config['General']['corpus'] = str(self.args.corpus)

        config['Dataset'] = {}
        config['Dataset']['datasetTag'] = str(self.args.datasetTag)
        config['Dataset']['maxLength'] = str(self.args.maxLength)
        config['Dataset']['filterVocab'] = str(self.args.filterVocab)
        config['Dataset']['skipLines'] = str(self.args.skipLines)
        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['embeddingSource'] = str(self.args.embeddingSource)

        # Keep the training hyperparameters as well, purely for model management.
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)
        config['Training (won\'t be restored)']['dropout'] = str(self.args.dropout)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def getDevice(self):
        """
        Choose the device according to the input hyperparameters.
        Return:
            str: name of the device to run the program on.
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu0':
            return '/gpu:0'
        elif self.args.device == 'gpu1':
            return '/gpu:1'
        elif self.args.device is None:
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'
                  .format(self.args.device))
            return None
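# The saveHyperParams/loadHyperParams pair above persists settings with the standard-library
# configparser. A minimal, self-contained sketch of that round trip; the file name and the
# option values below are illustrative assumptions, not the ones Rankbot actually writes.
import configparser
import os

config = configparser.ConfigParser()
config['General'] = {'globStep': '0', 'corpus': 'cornell'}      # illustrative values
config['Network'] = {'hiddenSize': '256', 'numLayers': '2'}

path = 'params_demo.ini'                                        # hypothetical file name
with open(path, 'w') as configFile:
    config.write(configFile)

restored = configparser.ConfigParser()
restored.read(path)
print(restored['Network'].getint('hiddenSize'))                 # -> 256, typed accessor as in loadHyperParams
os.remove(path)                                                 # clean up the demo file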
class FlowControl(object):
    logging.basicConfig(level=logging.INFO)  # output information for logging

    def __init__(self, configMap):
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    # instantiate all the classes that will be used, and start by training the offline models
    def start(self):
        # DatabaseQueries.createTables()
        self.modelStore = ModelStore()  # "database" of models
        self.userAnalyzer = UserAnalyzer()  # classify user type: anonymous, registered new, or registered old
        self.trainingCenter = TrainingCenter(self.modelStore)
        self.ranker = Ranker()  # just rank the recommended items
        # on start, train the models once so the home page immediately has recommendations
        self.trainingCenter.trainModel()  # NOTE: models must be trained once for the welcome page
        self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                   DatabaseQueries.getNumRatingsPerUser())

    # Use models - output recommendation results directly to the user
    def renderRecommendation(self, userId=None, numberToServe=None, itemId=None,
                             ratingScore=None, classical=None, userPreference=None):
        self.log.info("responding to request: %s" % userId)
        recommendations = self.recEngine.provideRecommendation(
            userId, itemId, ratingScore, classical, userPreference)  # returns a dict
        rankings = self.ranker.rank(recommendations, userId, numberToServe)  # a list of item ids
        # output is the detailed content of each item, not just the item id, sorted (ranked) by the id value
        # print("results from recEngine:", recommendations)
        # print(rankings)
        df_inventory = DatabaseQueries.getInventory()
        df_inventory.index = df_inventory.index + 1
        itemsRecommended = []
        itemsImageURL = []
        # for i in rankings:
        #     itemsRecommended.append(df_inventory[df_inventory['itemId'] == i].itemName.item())
        #     itemsImageURL.append(df_inventory[df_inventory['itemId'] == i].itemImageURL.item())
        # print(itemsRecommended)
        # print(itemsImageURL)
        for i in rankings:
            itemName = df_inventory[df_inventory['itemId'] == i].itemName.item()
            itemsRecommended.append(itemName)
            if os.path.exists("./static/images/moviePosters/" + itemName + ".jpg"):
                url = "./static/images/moviePosters/" + itemName + ".jpg"
            else:
                url = df_inventory[df_inventory['itemId'] == i].itemImageURL.item()
            itemsImageURL.append(url)
        return itemsRecommended, itemsImageURL

    # Set up and update models - increment the system - update offline models and clear the online model at the end of the day
    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.trainingCenter.trainModel()
        self.recEngine.resetCache()  # reset most popular
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        # each object here simulates the API calls through the network;
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        # when we start the webserver, let the offline learner train the models,
        # so that after start() we can begin giving recommendations
        self.offlineLearner.trainModel()
        self.recEngine = RecEngine(
            self.userAnalyzer, self.modelStore,
            self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        # taking the action from users
        self.onlineLearner.trainModel(action)
        # analyze the action type, and save the registered user's action
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.log.info("Recording action %s" % action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the IDs of the recommended items
        assert (isinstance(request, Request))
        # provide recommendations to the user
        self.log.info("responding to request: %s" % request)
        recommendations = self.recEngine.provideRecommendation(request)
        recsReranked = self.ranker.rerank(recommendations)
        return recsReranked  # a list of item ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        recsReranked = self.provideRecommendation(request)
        # for the purpose of testing, we sort the index and output item names;
        # output is ordered by the id value
        return self.db.extract(
            DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20
    num = 10
    x = 0
    summary = []
    sent_idx = [0]
    sum_len = 0
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(
            getDepParse(client_socket, doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # The sent_idx needs to be converted to reflect the corresponding
        # section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
            if sum_len > 130:
                break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
#!/usr/bin/env python

# -------
# imports
# -------
import sys
import os

APP_PATH = os.path.dirname(os.path.realpath(__file__))
sys.path.append(APP_PATH + '/app')
sys.path.append(APP_PATH + '/app/league')
sys.path.append(APP_PATH + '/app/player')

from Ranker import Ranker
from LeagueFactory import LeagueFactory
from PlayerFactory import PlayerFactory
from PlayerEditor import PlayerEditor

# ----
# main
# ----
leagueFactory = LeagueFactory()
league = leagueFactory.getLeague(sys.stdin, sys.stdout)

# playerEditor = PlayerEditor(sys.stdin, sys.stdout, league)
# playerEditor.run()

ranker = Ranker(league, sys.stdin, sys.stdout)
ranker.run()
def rank(option, keywords):
    ranker = Ranker(keywords)
    all = db.getAll()
    docs_scores = []
    otime = time.time()
    if option != 'mix':
        for instance in all:
            score = dict()
            wordlist = instance['words']
            score['url'] = instance['url']
            # TODO: change to switch option
            if option == 'cos':
                score[option] = ranker.cosineSimilarity(wordlist, instance[option])
            elif option == 'jac':
                score[option] = ranker.jaccardSimilarity(wordlist, instance[option])
            elif option == 'vae':
                score[option] = ranker.variationalAutoEncoder(wordlist, instance[option])
            elif option == 'pr':
                score[option] = ranker.pagerankSimilarity(wordlist, instance[option], instance['total'])
            else:
                break
            docs_scores.append(score)
    else:
        coss = []
        # jacs = []
        # vaes = []
        prs = []
        for instance in all:
            wordlist = instance['words']
            for opt in ['cos', 'pr']:
                score = dict()
                score['url'] = instance['url']
                if opt == 'cos':
                    score['score'] = ranker.cosineSimilarity(wordlist, instance[opt])
                    coss.append(score)
                # elif opt == 'jac':
                #     score['score'] = ranker.jaccardSimilarity(wordlist, instance[opt])
                #     jacs.append(score)
                # elif opt == 'vae':
                #     score['score'] = ranker.variationalAutoEncoder(wordlist, instance[opt])
                #     vaes.append(score)
                elif opt == 'pr':
                    score['score'] = ranker.pagerankSimilarity(wordlist, instance[opt], instance['total'])
                    prs.append(score)
                else:
                    break
        sorted_cos = sorted(coss, key=lambda i: i['score'], reverse=True)
        sorted_jac = []
        sorted_vae = []
        # sorted_jac = sorted(jacs, key=lambda i: i['score'], reverse=True)
        # sorted_vae = sorted(vaes, key=lambda i: i['score'], reverse=True)
        sorted_pr = sorted(prs, key=lambda i: i['score'], reverse=True)
        # normalize
        for instance in sorted_cos:
            instance['score'] = instance['score'] / sorted_cos[0]['score']
        # for instance in sorted_jac:
        #     instance['score'] = instance['score'] / sorted_jac[0]['score']
        # for instance in sorted_vae:
        #     instance['score'] = instance['score'] / sorted_vae[0]['score']
        # mix scores
        for instance in sorted_pr:
            score = dict()
            score['url'] = instance['url']
            prscore = instance['score'] / sorted_pr[0]['score']
            score[option] = ranker.mixSimilarity(instance['url'], prscore,
                                                 sorted_cos, sorted_jac, sorted_vae)
            docs_scores.append(score)
    sorted_scores = sorted(docs_scores, key=lambda i: i[option], reverse=True)
    return (sorted_scores, time.time() - otime)
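# The 'mix' branch above normalizes each ranked list by its top score and then combines the
# scores per URL. A self-contained sketch of that normalize-then-mix idea on plain lists; the
# equal-weight average is an assumed stand-in for mixSimilarity, whose actual weighting is not
# shown, and the sample scores are made up for illustration.
sorted_cos = [{'url': 'a', 'score': 4.0}, {'url': 'b', 'score': 2.0}]   # hypothetical cosine ranking
sorted_pr = [{'url': 'b', 'score': 0.8}, {'url': 'a', 'score': 0.2}]    # hypothetical pagerank ranking

# Normalize each list by its top score, as the 'mix' branch does.
for ranked in (sorted_cos, sorted_pr):
    top = ranked[0]['score']
    for entry in ranked:
        entry['score'] = entry['score'] / top

# Mix: equal-weight average per URL (assumed weighting).
cos_by_url = {e['url']: e['score'] for e in sorted_cos}
mixed = [{'url': e['url'], 'mix': (e['score'] + cos_by_url.get(e['url'], 0.0)) / 2}
         for e in sorted_pr]
print(sorted(mixed, key=lambda i: i['mix'], reverse=True))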
        print '-p to toggle preloading'
        print '-k <number> to set the top K number'
        print '-d <path> to set the database path'
        print '-b <path> to set the bookkeeping path'
        sys.exit()
    elif opt == '-p':
        PRELOAD = True
    elif opt == '-k':
        K = int(arg)
    elif opt == '-d':
        PATH = arg
    elif opt == '-b':
        BK = arg

p = Preloader(PATH)
r = Ranker(K, PATH)
if PRELOAD:
    p.doc_frequencies()
    p.weights()

while True:
    query = raw_input(':')
    # print type(r)
    # print 'Start Rank Query'
    # start = time.clock()
    links, size, ssize, dur = r.query(query)
    true_size = size if size >= K else ssize
    # print 'AVG GET WEIGHT TIME: ', math.fsum([i - j for i, j in zip(r.ends, r.starts)]) / len(r.starts)
    # print 'End Rank Query'
    res = list()
class Searcher:
    def __init__(self, docs_dictionary, main_dictionary, avdl, stem_suffix, ip, city_dictionary):
        self.main_dictionary = main_dictionary
        self.__ranker = Ranker(docs_dictionary, main_dictionary, avdl, len(docs_dictionary), stem_suffix, ip)
        self.__list_of_cities = []
        self.__city_dictionary = city_dictionary

    # will be triggered from the controller
    def search(self, query_dict, addons_dict=None):
        '''
        search for docs that include the terms in the query;
        the final rank computed in the Ranker class is the final result
        :param query_dict: {term : { query : tf } }
        :param addons_dict: {term : { query : tf } } - optional
        '''
        query_dict = self.adjust_terms(query_dict)
        if addons_dict is not None:  # title + description
            addons_dict = self.adjust_terms(addons_dict)
            all_terms = list(set(list(query_dict.keys()) + list(addons_dict.keys())))
            #all_terms = self.merge_all_terms_to_one_list(query_dict, addons_dict)
            self.__ranker.fill_mini_posting_file(sorted(all_terms, key=lambda v: v.upper()))
            if self.__list_of_cities is not None:
                self.remove_not_relevant_docs()
            ranked_titles = self.__ranker.rank(query_dict)
            ranked_addons = self.__ranker.rank(addons_dict)
            # saves the result in final_result and keeps the top 50
            self.__ranker.calculate_final_rank(ranked_titles, ranked_addons)
        else:
            self.__ranker.fill_mini_posting_file(sorted(query_dict.keys(), key=lambda v: v.upper()))
            if self.__list_of_cities is not None:
                self.remove_not_relevant_docs()
            ranked_docs = self.__ranker.rank(query_dict)
            self.__ranker.final_result = ranked_docs
            self.__ranker.final_result["999"] = self.__ranker.get_top_docs("999")

    def get_final_result(self):
        return self.__ranker.final_result

    def set_cities_filter_list(self, list):
        self.__list_of_cities = list

    def adjust_terms(self, query_dict):
        '''
        :param query_dict: {term : { query : tf } }
        :return:
        '''
        result = {}
        for term in query_dict:
            if term not in self.main_dictionary:
                value = query_dict[term]
                if term.lower() in self.main_dictionary:
                    if term.lower() not in result:
                        result[term.lower()] = value
                    else:  # exists in result -> merge
                        result[term.lower()] = self.mergi_mergi(result[term.lower()], value)
                elif term.upper() in self.main_dictionary:
                    if term.upper() not in result:
                        result[term.upper()] = value
                    else:  # exists in result -> merge
                        result[term.upper()] = self.mergi_mergi(result[term.upper()], value)
                else:  # does not exist in the main dictionary at all
                    #print (term + " not exists in main dic at all")
                    if term not in result:
                        result[term] = query_dict[term]
                    else:
                        result[term] = self.mergi_mergi(result[term], query_dict[term])
            else:
                if term not in result:
                    result[term] = query_dict[term]
                else:
                    result[term] = self.mergi_mergi(result[term], query_dict[term])
        return result

    def mergi_mergi(self, dic1, dic2):
        '''
        merge two dictionaries that look like this: { query : tf in query }
        :param dic1: first
        :param dic2: second
        :return: merged dictionary
        '''
        for q in dic1:
            if q in dic2:
                dic2[q] += dic1[q]
            else:
                dic2[q] = dic1[q]
        return dic2

    def merge_all_terms_to_one_list(self, query_dict, addons_dict):
        '''
        creates a list of merged terms
        :param query_dict: {term : { query : tf } }
        :param addons_dict: {term : { query : tf } }
        :return: list of terms
        '''
        result = []
        for term in query_dict:
            if term not in result:
                result.append(term)
        for term in addons_dict:
            if term not in result:
                result.append(term)
        return result

    def remove_not_relevant_docs(self):
        city_docs = []  # will contain a list of all possible docs
        for city_name in self.__list_of_cities:
            city_docs = list(
                set(list(self.__city_dictionary[city_name].dic_doc_index.keys()) + city_docs))
        # now city_docs is a dictionary, for faster lookups
        city_docs = {key: None for key in city_docs}
        self.__ranker.city_docs = city_docs
class Searcher:
    def __init__(self, corpus_path, posting_path, terms_dict, cities_dict, docs_dict,
                 avg_doc_length, with_stemming, with_semantics):
        self.terms_dict = terms_dict
        self.cities_dict = cities_dict
        self.docs_dict = docs_dict
        self.parser = Parse(corpus_path)  # corpus path for stop words
        self.parser.to_stem = with_stemming
        self.posting_path = posting_path
        self.ranker = Ranker(avg_doc_length)
        self.model = None
        self.with_semantics = with_semantics
        self.with_stemming = with_stemming

    """
    Description: This method fetches the posting list of every term in the query.
    Args:
        param1: query_terms
        param2: cities
    Return: parsed query and a words dictionary with the posting lists of all terms in the query
    """
    def get_terms_from_post(self, query_terms, cities):
        if self.with_stemming:
            path = self.posting_path + '\sFinalPost' + '\Final_Post'
        else:
            path = self.posting_path + '\FinalPost' + '\Final_Post'
        word_dict = {}
        updated_query_terms = {}
        for term in query_terms:
            if term not in self.terms_dict:
                term_lower = term.lower()
                term_upper = term.upper()
                if term_lower in self.terms_dict:
                    tmp = query_terms[term]
                    term = term_lower
                    updated_query_terms[term] = tmp
                elif term_upper in self.terms_dict:
                    tmp = query_terms[term]
                    term = term_upper
                    updated_query_terms[term] = tmp
                else:
                    continue
            else:
                updated_query_terms[term] = query_terms[term]
            line = self.terms_dict[term][0] + 1
            term_index = linecache.getline(path, line)
            term_index = term_index.split('|')[1].split('#')
            i = 0
            if len(cities) > 0:
                cities_docs = set()
                for city in cities:
                    if self.cities_dict[city][2] is not None:
                        cities_docs.update(self.cities_dict[city][2])
                while i < len(term_index) - 1:
                    term_doc_info = string_to_dict(term_index[i])
                    for doc_id in term_doc_info:
                        doc = self.docs_dict[doc_id]
                        if doc.origin_city not in cities and doc_id not in cities_docs:
                            continue
                        if term not in word_dict:
                            word_dict[term] = {}
                        word_dict[term][doc_id] = term_doc_info[doc_id]
                    i += 1
            else:
                while i < len(term_index) - 1:
                    term_doc_info = string_to_dict(term_index[i])
                    for doc_id in term_doc_info:
                        if term not in word_dict:
                            word_dict[term] = {}
                        word_dict[term][doc_id] = term_doc_info[doc_id]
                    i += 1
        return updated_query_terms, word_dict

    """
    Description: This method runs the search for a query: it fetches the posting lists and calls
    the ranking function to rank all the docs retrieved from the posting lists, filtered by the
    cities list.
    Args:
        param1: query
        param2: cities
    Return: list of the 50 most relevant ranked docs
    """
    def search(self, query, cities):
        query_terms = {}
        if self.with_semantics:
            if self.with_stemming:
                self.parser.set_stemming_bool(False)
                stem_query = self.parser.main_parser(text=query, doc=None)
                self.parser.set_stemming_bool(True)
                for word in stem_query:
                    word = word.lower()
                    if not word.isalpha():
                        continue
                    try:
                        synonyms = self.model.wv.most_similar(positive=word)
                    except:
                        continue
                    for i in range(0, 3):
                        stem_word = str(self.parser.pystemmer.stemWord((synonyms[i][0]).encode("ascii")))
                        query_terms[stem_word] = 1
                for stem in stem_query:
                    if stem.lower() in query_terms or stem.upper() in query_terms:
                        continue
                    query_terms[stem] = stem_query[stem][0]
            else:
                query = self.parser.main_parser(text=query, doc=None)
                for word in query:
                    try:
                        synonyms = self.model.wv.most_similar(positive=word)
                    except:
                        continue
                    for i in range(0, 3):
                        query_terms[(synonyms[i][0]).encode("ascii")] = 1
                    query_terms[word] = query[word][0]
        else:
            query = self.parser.main_parser(text=query, doc=None)
            for word in query:
                query_terms[word] = query[word][0]
        query_terms, words_terms = self.get_terms_from_post(query_terms, cities)
        result = self.ranker.rank_doc(query_terms, words_terms, self.docs_dict, 1)
        return result