Example #1
 def __init__(self, docs_dictionary, main_dictionary, avdl, stem_suffix, ip,
              city_dictionary):
     self.main_dictionary = main_dictionary
     self.__ranker = Ranker(docs_dictionary, main_dictionary, avdl,
                            len(docs_dictionary), stem_suffix, ip)
     self.__list_of_cities = []
     self.__city_dictionary = city_dictionary
Example #2
 def search_query(self):
     # given a query, search for relevant documents and rank them
     query_terms = self.__string_to_terms(self.query)
     query_desc_terms = self.__string_to_terms(
         self.query_desc) if self.query_desc is not None else []
     # in case there's a query description, treat it as part of the query
     relevant_documents = self.__retrieve_relevant_documents(
         query_terms + query_desc_terms)
     ranker = Ranker(relevant_documents.values(), query_terms,
                     query_desc_terms)
     return ranker.calculate_ranked_documents()
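The call site above only shows the Ranker interface used here (an iterable of relevant documents, the query terms, the description terms, and calculate_ranked_documents()). A purely illustrative stand-in with that interface, using a placeholder term-overlap score rather than this project's actual formula, might look like:

class Ranker:
    """Illustrative stand-in; the real scoring is defined elsewhere in the project."""

    def __init__(self, documents, query_terms, query_desc_terms):
        self.documents = list(documents)
        self.terms = set(query_terms) | set(query_desc_terms)

    def calculate_ranked_documents(self):
        # placeholder score: how many of the query terms appear in each document
        scored = [(sum(1 for t in self.terms if t in doc), doc)
                  for doc in self.documents]
        return [doc for score, doc in
                sorted(scored, key=lambda pair: pair[0], reverse=True)]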
Example #3
 def start(self):
     # DatabaseQueries.createTables()
     self.modelStore = ModelStore()  # "database" of models
     self.userAnalyzer = UserAnalyzer(
     )  # classify user type: anonymous? registered new? or registered old?
     self.trainingCenter = TrainingCenter(self.modelStore)
     self.ranker = Ranker()  # just rank the recommended items
     # on start, first train the models so recommendations are immediately available on the home page
     self.trainingCenter.trainModel(
     )  # NOTE: the models must be trained once first for the welcome page
     self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                DatabaseQueries.getNumRatingsPerUser())
Example #4
 def __init__(self, corpus_path, posting_path, terms_dict, cities_dict, docs_dict, avg_doc_length, with_stemming,
              with_semantics):
     self.terms_dict = terms_dict
     self.cities_dict = cities_dict
     self.docs_dict = docs_dict
     self.parser = Parse(corpus_path)  ## corpus path for stop words
     self.parser.to_stem = with_stemming
     self.posting_path = posting_path
     self.ranker = Ranker(avg_doc_length)
     self.model = None
     self.with_semantics = with_semantics
     self.with_stemming = with_stemming
Example #5
 def start(self):
     # each object here simulates API calls over the network
     # passing an object A to the constructor of B means A will communicate with B
     self.db.startEngine()
     self.ranker = Ranker(self.numberToServe, self.db)
     self.user_analyzer = UserAnalyzer()
     self.model_store = ModelStore()
     self.online_learner = OnlineLearner(self.db, self.model_store)
     self.offline_learner = OfflineLearner(self.db, self.model_store)
     self.increment()
     self.rec_engine = RecEngine(
         self.user_analyzer, self.model_store,
         self.db.connTable[DatabaseInterface.USER_ACTIVITY_KEY])
Example #6
 def start(self):
     self.db.startEngine()
     self.ranker = Ranker(self.numberToServe, self.db)
     self.userAnalyzer = UserAnalyzer()
     self.modelStore = ModelStore()
     self.offlineLearner = OfflineLearner(self.db, self.modelStore)
     self.onlineLearner = OnlineLearner(self.db, self.modelStore)
     #so that immediately after we start, we can start to give recommendations
     self.offlineLearner.trainModel()
     #had to extract it here
     self.recEngine = RecEngine(
         self.userAnalyzer, self.modelStore,
         self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
Example #7
 def start(self):
     # each object here simulates API calls over the network
     # passing an object A to the constructor of B means A will communicate with B
     self.db.startEngine()
     self.ranker = Ranker(self.numberToServe, self.db)
     self.userAnalyzer = UserAnalyzer()
     self.modelStore = ModelStore()
     self.offlineLearner = OfflineLearner(self.db, self.modelStore)
     self.onlineLearner = OnlineLearner(self.db, self.modelStore)
     self.offlineLearner.trainModel()
     # when we start the webserver, let the offline learner train the models,
     # so that we can give recommendations immediately after start()
     self.recEngine = RecEngine(
         self.userAnalyzer, self.modelStore,
         self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
Example #8
def generateTestFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    #------------------------------------------------
    # Load pickle for label
    picklefile = DIR['DATA'] + 'test-labels-pickle'
    global test_labels
    with open(picklefile, 'rb') as pfile:
        test_labels = pickle.load(pfile)
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    test_sents, sent_indices = getRankedSent(doc, fcode)
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(test_sents, sent_indices,
                                           sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = test_data[key]['reallbl']
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        test_data[key]['depparse'] = getTree(tree)
        test_data[key]['features'] = feature_string
        writeToFile(featurefile, feature_string + '\n', 'a')
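The section-clubbing loop above also recurs in several of the later examples; it could be pulled out into a small helper. This is a hypothetical refactor, not code from the original project:

def build_sections(doc):
    """Concatenate each section's sentences into one string per section."""
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += str(block[key])
        sections.append(sentences)
    return sections

With it, the setup above reduces to sec_ranker = Ranker(build_sections(doc)).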
Example #9
def get_pos_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    #sentences, o = doc.all_sentences()
    #ranker = Ranker(sentences, tfidf=False)
    #-----------------------------------------
    # Instead of the above, now sentences will be clubbed into sections and
    # passed to the ranker, which is to be returned
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    sent, offset = doc.section_sentences('abstract')
    sent_idx = range(offset, offset + len(sent))
    samples = '\n'.join(sent)
    writeToFile(outfile, samples, 'w')
    #return ranker, sent_idx
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Positive---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
Example #10
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        # each object here simulates API calls over the network
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        self.offlineLearner.trainModel()
        # when we start the webserver, the offline learner should train the models,
        # so that we can give recommendations immediately after start()
        self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                   self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        # taking the action from users
        self.onlineLearner.trainModel(action)
        # analyze action type, and save the registered user's action
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.log.info("Recording action %s" % action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the ID's for the recommended items
        assert (isinstance(request, Request))
        # provide recommendations to user
        self.log.info("responding to request: %s" % request)
        recommendations = self.recEngine.provideRecommendation(request)
        recsReranked = self.ranker.rerank(recommendations)
        return recsReranked  # a list of item ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        recsReranked = self.provideRecommendation(request)
        # for testing purposes, we sort the index and output the item names
        # output is ordered by the id value
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
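A minimal driver for the WebServer class above, assuming only the two configMap keys read in __init__; the values, and the commented-out Action/Request calls, are illustrative since those constructors are not shown here:

configMap = {'data_dir': 'data/', 'numberToServe': 10}
server = WebServer(configMap)
server.start()
# server.getAction(Action(...))                      # record a user action
# items = server.renderRecommendation(Request(...))  # serve recommendations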
Example #11
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
Example #12
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    # numberToServe: the number of items finally served to the users
    def start(self):
        # each object here simulates API calls over the network
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.user_analyzer = UserAnalyzer()
        self.model_store = ModelStore()
        self.online_learner = OnlineLearner(self.db, self.model_store)
        self.offline_learner = OfflineLearner(self.db, self.model_store)
        self.increment()
        self.rec_engine = RecEngine(
            self.user_analyzer, self.model_store,
            self.db.connTable[DatabaseInterface.USER_ACTIVITY_KEY])

    def getAction(self, action):
        assert (isinstance(action, Action))
        #analyze user type
        user_type = self.user_analyzer.analyzeAction(action)
        self.online_learner.trainModel(action)
        if user_type == "registered":
            self.log.info("Recording action %s", action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the ID's for the recommended items
        assert (isinstance(request, Request))
        recommendations = self.rec_engine.provideRecommendation(request)
        item_ids = self.ranker.rerank(recommendations)
        return item_ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        item_ids = self.provideRecommendation(request)
        return self.getFromInventory(item_ids).sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.model_store.cleanOnlineModel()
        self.offline_learner.trainModel()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
Example #13
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    #configMap is in main
    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)
        # need to use a key, idk why, why not a direct string?

    #initialize everything
    def start(self):
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        #so that immediately after we start, we can start to give recommendations
        self.offlineLearner.trainModel()
        #had to extract it here
        self.recEngine = RecEngine(
            self.userAnalyzer, self.modelStore,
            self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        self.onlineLearner.trainModel(action)
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.db.putAction(action)

    def provideRec(self, request):
        assert (isinstance(request, Request))
        rec = self.recEngine.provideRec(request)
        recReRanked = self.ranker.rerank(rec)
        return recReRanked

    def renderRec(self, request):
        assert (isinstance(request, Request))
        recReRanked = self.provideRec(request)
        return self.db.extract(
            DatabaseInterface.INVENTORY_KEY).loc[recReRanked].sort_index()

    def increment(self):
        #offline, online, recengine(find the new most popular one)
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
Example #14
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml', infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
Example #15
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
            classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
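The closing word-budget loop above could be isolated into a small helper; a hypothetical refactor that keeps the 130-word cutoff used in the example:

from operator import itemgetter

def truncate_summary(classified, max_words=130):
    """Accumulate sentences in ascending score order until the word budget is exceeded."""
    summary, total = [], 0
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        total += len(sent.split(' '))
        if total > max_words:
            break
    return summary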
Example #16
 def start(self):
     # each object here simulates API calls over the network
     # passing an object A to the constructor of B means A will communicate with B
     self.db.startEngine()
     self.ranker = Ranker(self.numberToServe, self.db)
     self.userAnalyzer = UserAnalyzer()
     self.modelStore = ModelStore()
     self.offlineLearner = OfflineLearner(self.db, self.modelStore)
     self.onlineLearner = OnlineLearner(self.db, self.modelStore)
     self.offlineLearner.trainModel()
     # when we start the webserver, the offline learner should train the models,
     # so that we can give recommendations immediately after start()
     self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
Example #17
    def main(self, args=None):
        """Main routine for training the bot
        """
        print('Welcome to DeepRank!')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # Initialization, hyperparameters
        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()
        self.loadHyperParams()

        # Load the training and test data
        self.textData = TextData(self.args)
        self.evalData = RankTextData(self.args)

        # Build the model
        graph = tf.Graph()
        with tf.device(self.getDevice()):
            with graph.as_default():
                with tf.name_scope('training'):
                    self.model_train = Ranker(self.args, is_training=True)

                tf.get_variable_scope().reuse_variables()
                with tf.name_scope('validation'):
                    self.model_valid = Ranker(self.args, is_training=False)

                with tf.name_scope('evaluation'):
                    self.model_test = Ranker(self.args, is_training=False)
                    self.ckpt_model_saver = tf.train.Saver(
                        name='checkpoint_model_saver')
                    self.best_model_saver = tf.train.Saver(
                        name='best_model_saver')

                # Running session
                # allow_soft_placement = True: if a GPU is requested but none is available, fall back to another device.
                self.sess = tf.Session(config=tf.ConfigProto(
                    allow_soft_placement=True, log_device_placement=False))
                print('Initialize variables...')
                self.sess.run(tf.global_variables_initializer())

                # Define savers/summaries
                graph_info = self.sess.graph
                self.train_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'train/'), graph_info)
                self.valid_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'valid/'), graph_info)
                """
                # Initialize the bot model's word embeddings with pretrained word2vec vectors
                if self.args.initEmbeddings:
                    self.loadEmbedding(self.sess)
                """

                # Start training
                self.mainTrain(self.sess)
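The three Ranker instances above end up sharing one set of weights because tf.get_variable_scope().reuse_variables() is called before the validation and evaluation models are built. A minimal sketch of that TF1 graph-mode pattern, with a toy model standing in for Ranker (the shapes and names are illustrative only):

import tensorflow as tf

def toy_model():
    # tf.get_variable takes part in variable sharing, unlike tf.Variable
    w = tf.get_variable('w', shape=[4, 2])
    x = tf.placeholder(tf.float32, [None, 4], name='x')
    return tf.matmul(x, w)

graph = tf.Graph()
with graph.as_default():
    with tf.name_scope('training'):
        train_out = toy_model()            # creates variable 'w'
    tf.get_variable_scope().reuse_variables()
    with tf.name_scope('validation'):
        valid_out = toy_model()            # reuses the same 'w'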
Example #18
def get_neg_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 5
    x = -1
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_idx.append(idx)
            samples += doc[idx].sentence.encode('utf-8') + '\n'
            num -= 1
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
Example #19
 def __init__(self, projectRoot):
     Ranker.__init__(self, projectRoot)
Example #20
 def __init__(self, depth=1):
     self.depth = depth
     self.ranker = Ranker()
     self.buffer = []
Example #21
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '+1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '-1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
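The negative-example selection above (walking the TextRank scores from the bottom and keeping only valid sentences) could be expressed as a helper; a hypothetical refactor reusing the same validSentence check:

def bottom_ranked_sentences(doc, neg_ranker, all_offset, count=5):
    """Collect `count` valid sentences starting from the lowest TextRank scores."""
    sents, indices, x = [], [], -1
    while count > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        indices.append(idx)
        sents.append(doc[idx].sentence.encode('utf-8'))
        count -= 1
    return sents, indices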
Example #22
class Rankbot:
    """
    Retrieval-based chatbot
    """
    def __init__(self):

        self.args = None
        self.textData = None
        self.model = None

        self.modelDir = ''
        self.globStep = 0
        self.ckpt_model_saver = None
        self.best_model_saver = None
        self.best_model = []
        self.best_valid_loss = [float('inf'), float('inf'), float('inf')]

        self.sess = None

        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.BEST_MODEL_NAME_BASE = 'best_model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'

    @staticmethod
    def parseArgs(args):
        """
        Parse hyperparameters
        Args:
            args (list<str>): List of arguments.
        """

        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--keepAll',
                                action='store_true',
                                help='If True, keep all intermediate checkpoints')
        globalArgs.add_argument('--modelTag',
                                type=str,
                                default=None,
                                help='Tag for the model, to identify, distinguish and manage different models later')
        globalArgs.add_argument('--rootDir',
                                type=str,
                                default=None,
                                help='Root directory for saving models and data')
        globalArgs.add_argument('--device',
                                type=str,
                                default=None,
                                help='\'gpu\' or \'cpu\', the device used for computation')
        globalArgs.add_argument('--seed',
                                type=int,
                                default=None,
                                help='Random seed, for reproducible experiments')

        # Dataset-related options
        datasetArgs = parser.add_argument_group('Dataset hyperparameters')
        datasetArgs.add_argument('--corpus',
                                 choices=TextData.corpusChoices(),
                                 default=TextData.corpusChoices()[0],
                                 help='Corpus to use.')
        datasetArgs.add_argument('--datasetTag',
                                 type=str,
                                 default='',
                                 help='Tag for the dataset, for dataset versioning, e.g. '
                                 'one data file built with a 20000-word vocabulary and another with 40000 words')
        datasetArgs.add_argument('--maxLength',
                                 type=int,
                                 default=10,
                                 help='Maximum length of input/question and output/answer sentences, i.e. the maximum RNN length')
        datasetArgs.add_argument(
            '--filterVocab',
            type=int,
            default=1,
            help='Drop words with frequency <= filterVocab. Set filterVocab to 0 to keep all words')
        datasetArgs.add_argument('--skipLines',
                                 action='store_true',
                                 help='If True, only use lines [2*i, 2*i+1] of a conversation as samples; '
                                 'otherwise overlapping line pairs are also used, so every line of the '
                                 'conversation except the first and last appears in two samples')
        datasetArgs.add_argument('--vocabularySize',
                                 type=int,
                                 default=20000,
                                 help='Upper limit on vocabulary size (0 means no limit)')
        datasetArgs.add_argument('--train_frac',
                                 type=float,
                                 default=0.8,
                                 help='percentage of training samples')
        datasetArgs.add_argument('--valid_frac',
                                 type=float,
                                 default=0.1,
                                 help='percentage of validation samples')

        # Model architecture options
        nnArgs = parser.add_argument_group('Architecture-related model hyperparameters')
        nnArgs.add_argument('--hiddenSize',
                            type=int,
                            default=256,
                            help='State dimension of each RNN cell')
        nnArgs.add_argument('--numLayers',
                            type=int,
                            default=2,
                            help='Number of stacked RNN cell layers per time step')
        nnArgs.add_argument('--initEmbeddings',
                            action='store_true',
                            help='If True, initialize word embeddings with public word2vec vectors')
        nnArgs.add_argument('--embeddingSize',
                            type=int,
                            default=256,
                            help='Word embedding dimension')
        nnArgs.add_argument('--embeddingSource',
                            type=str,
                            default="GoogleNews-vectors-negative300.bin",
                            help='word2vec file used to initialize the word embeddings')

        # Model training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs',
                                  type=int,
                                  default=25,
                                  help='Number of training epochs')
        trainingArgs.add_argument('--saveEvery',
                                  type=int,
                                  default=5000,
                                  help='Number of mini-batches between checkpoints')
        trainingArgs.add_argument('--batchSize',
                                  type=int,
                                  default=32,
                                  help='Number of samples per mini-batch')
        trainingArgs.add_argument('--learningRate',
                                  type=float,
                                  default=0.002,
                                  help='Learning rate')
        trainingArgs.add_argument(
            '--dropout',
            type=float,
            default=0.9,
            help='Dropout rate (here the keep probability after dropout, keep_prob)')

        return parser.parse_args(args)

    def main(self, args=None):
        """Main routine for training the bot
        """
        print('Welcome to DeepRank!')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # Initialization, hyperparameters
        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()
        self.loadHyperParams()

        # Load the training and test data
        self.textData = TextData(self.args)
        self.evalData = RankTextData(self.args)

        # Build the model
        graph = tf.Graph()
        with tf.device(self.getDevice()):
            with graph.as_default():
                with tf.name_scope('training'):
                    self.model_train = Ranker(self.args, is_training=True)

                tf.get_variable_scope().reuse_variables()
                with tf.name_scope('validation'):
                    self.model_valid = Ranker(self.args, is_training=False)

                with tf.name_scope('evaluation'):
                    self.model_test = Ranker(self.args, is_training=False)
                    self.ckpt_model_saver = tf.train.Saver(
                        name='checkpoint_model_saver')
                    self.best_model_saver = tf.train.Saver(
                        name='best_model_saver')

                # Running session
                # allow_soft_placement = True: if a GPU is requested but none is available, fall back to another device.
                self.sess = tf.Session(config=tf.ConfigProto(
                    allow_soft_placement=True, log_device_placement=False))
                print('Initialize variables...')
                self.sess.run(tf.global_variables_initializer())

                # Define savers/summaries
                graph_info = self.sess.graph
                self.train_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'train/'), graph_info)
                self.valid_writer = tf.summary.FileWriter(
                    os.path.join(self.modelDir, 'valid/'), graph_info)
                """
                # Initialize the bot model's word embeddings with pretrained word2vec vectors
                if self.args.initEmbeddings:
                    self.loadEmbedding(self.sess)
                """

                # Start training
                self.mainTrain(self.sess)

    def mainTrain(self, sess):
        """ Train the model
        Args:
            sess: the current tf session
        """

        print('Start training the model (press Ctrl+C to save and exit training)...')

        try:
            batches_valid = self.evalData.getValidBatches()
            batches_test = self.evalData.getTestBatches()
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(
                    e + 1, self.args.numEpochs, self.args.learningRate))

                batches = self.textData.getBatches()

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model_train.step(nextBatch)
                    assert len(ops) == 3  # (train op, loss, summaries)
                    _, loss, train_summaries = sess.run(ops, feedDict)
                    self.globStep += 1

                    # Log the training status (loss on the training data)
                    if self.globStep % 100 == 0:
                        tqdm.write("----- Step %d -- CE Loss %.2f" %
                                   (self.globStep, loss))

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self.train_writer.add_summary(train_summaries,
                                                      self.globStep)
                        self.train_writer.flush()

                        # validation pass
                        print('Evaluating on validation data ...')
                        self.valid_losses = [0, 0, 0]
                        for nextEvalBatch in tqdm(batches_valid,
                                                  desc="Validation"):
                            ops, feedDict = self.model_valid.step(
                                nextEvalBatch)
                            assert len(ops) == 2
                            loss, eval_summaries = sess.run(ops, feedDict)
                            for i in range(3):
                                self.valid_losses[i] += loss[i]

                        self.valid_writer.add_summary(eval_summaries,
                                                      self.globStep)
                        self.valid_writer.flush()

                        for i in range(3):
                            self.valid_losses[i] = self.valid_losses[i] / len(
                                batches_valid)

                        print('validation, Recall_20@(1,3,5) = %s' %
                              self.valid_losses)
                        time.sleep(5)
                        if (len(self.best_model)
                                == 0) or (self.valid_losses[0] >
                                          self.best_valid_loss[0]):
                            print(
                                'best_model updated, with best accuracy :%s' %
                                self.valid_losses)
                            self.best_valid_loss = self.valid_losses[:]
                            self._saveBestSession(sess)

                        self._saveCkptSession(sess)

                toc = datetime.datetime.now()
                print("Epoch %d finished in %s seconds" % (e, toc - tic))

            # After training finishes, run once over the test data
            self.best_model_saver.restore(sess, self.best_model)
            self.test_losses = [0, 0, 0]
            for nextTestBatch in tqdm(batches_test, desc="FinalTest"):
                ops, feedDict = self.model_test.step(nextTestBatch)
                assert len(ops) == 2
                loss, _ = sess.run(ops, feedDict)
                for i in range(3):
                    self.test_losses[i] += loss[i] / len(batches_test)
            print('Final testing, Recall_20@(1,3,5) = %s' % self.test_losses)

        except (KeyboardInterrupt, SystemExit):
            # The user pressed Ctrl+C during the run to stop training
            print('Interruption detected, exiting the program...')

        self._saveCkptSession(sess)  # Ultimate saving before complete exit

    def _saveCkptSession(self, sess):
        """ Save the model parameters

        Args:
            sess: the current tf session
        """
        tqdm.write('Saving checkpoint (don\'t stop the run)...')
        tqdm.write('validation, Recall_20@(1,3,5) = ' +
                   repr(self.valid_losses))
        self.saveHyperParams()

        # Filename used to save the checkpoint
        model_name = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:
            model_name += '-' + str(self.globStep)
        model_name = model_name + self.MODEL_EXT

        self.ckpt_model_saver.save(sess, model_name)
        tqdm.write('Checkpoint saved.')

    def _saveBestSession(self, sess):
        """ Save the model parameters

        Args:
            sess: the current tf session
        """
        tqdm.write('Saving new best model (don\'t stop the run)...')
        self.saveHyperParams()

        # Filename used to save the best model
        model_name = os.path.join(self.modelDir, self.BEST_MODEL_NAME_BASE)
        model_name = model_name + self.MODEL_EXT

        self.best_model = self.best_model_saver.save(sess, model_name)
        tqdm.write('Best Model saved.')

    def loadHyperParams(self):
        """ Load the hyperparameters associated with the current model
        """
        # Current model location (model path)
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            print("modelTag=%s" % self.args.modelTag)
            self.modelDir += '-' + self.args.modelTag
        print("modelDir=%s" % self.modelDir)

        # If a config file exists, restore some hyperparameters from it
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if os.path.exists(configName):
            config = configparser.ConfigParser()
            config.read(configName)

            # Restore hyperparameters
            self.globStep = config['General'].getint('globStep')
            self.args.corpus = config['General'].get('corpus')

            self.args.datasetTag = config['Dataset'].get('datasetTag')
            self.args.maxLength = config['Dataset'].getint('maxLength')
            self.args.filterVocab = config['Dataset'].getint('filterVocab')
            self.args.skipLines = config['Dataset'].getboolean('skipLines')
            self.args.vocabularySize = config['Dataset'].getint(
                'vocabularySize')

            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.initEmbeddings = config['Network'].getboolean(
                'initEmbeddings')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.embeddingSource = config['Network'].get(
                'embeddingSource')

    def saveHyperParams(self):
        """ Save the model hyperparameters, for easier model management.
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['globStep'] = str(self.globStep)
        config['General']['corpus'] = str(self.args.corpus)

        config['Dataset'] = {}
        config['Dataset']['datasetTag'] = str(self.args.datasetTag)
        config['Dataset']['maxLength'] = str(self.args.maxLength)
        config['Dataset']['filterVocab'] = str(self.args.filterVocab)
        config['Dataset']['skipLines'] = str(self.args.skipLines)
        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['embeddingSource'] = str(self.args.embeddingSource)

        # Keep the training hyperparameters, for model management only.
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(
            self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(
            self.args.batchSize)
        config['Training (won\'t be restored)']['dropout'] = str(
            self.args.dropout)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME),
                  'w') as configFile:
            config.write(configFile)

    def getDevice(self):
        """ Choose the device according to the input hyperparameters.
        Return:
            str: name of the device on which to run the program.
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu0':
            return '/gpu:0'
        elif self.args.device == 'gpu1':
            return '/gpu:1'
        elif self.args.device is None:
            return None
        else:
            print(
                'Warning: Error in the device name: {}, use the default device'
                .format(self.args.device))
            return None
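A hypothetical entry point for the Rankbot class above, mirroring how main() hands its argument list to parseArgs; the flag values are illustrative:

if __name__ == '__main__':
    bot = Rankbot()
    bot.main(['--rootDir', './run', '--numEpochs', '10', '--batchSize', '64'])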
Example #23
class FlowControl(object):
    logging.basicConfig(level=logging.INFO)  # Output information for log use

    def __init__(self, configMap):
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    # instantiate all the classes that will be used, and start by training the offline models
    def start(self):
        # DatabaseQueries.createTables()
        self.modelStore = ModelStore()  # "database" of models
        self.userAnalyzer = UserAnalyzer(
        )  # classify user type: anonymous? registered new? or registered old?
        self.trainingCenter = TrainingCenter(self.modelStore)
        self.ranker = Ranker()  # just rank the recommended items
        # on start, first train the models so recommendations are immediately available on the home page
        self.trainingCenter.trainModel(
        )  # NOTE: the models must be trained once first for the welcome page
        self.recEngine = RecEngine(self.userAnalyzer, self.modelStore,
                                   DatabaseQueries.getNumRatingsPerUser())

    # Use models - output recommendation results directly to the user
    def renderRecommendation(self,
                             userId=None,
                             numberToServe=None,
                             itemId=None,
                             ratingScore=None,
                             classical=None,
                             userPreference=None):
        self.log.info("responding to request: %s" % userId)
        recommendations = self.recEngine.provideRecommendation(
            userId, itemId, ratingScore, classical,
            userPreference)  # returns a dict
        rankings = self.ranker.rank(recommendations, userId,
                                    numberToServe)  # a list of item ids
        # output is the detailed item content, not just the item id, sorted (ranked) by id value
        # print("results from recEngine:", recommendations)
        # print(rankings)
        df_inventory = DatabaseQueries.getInventory()
        df_inventory.index = df_inventory.index + 1
        itemsRecommended = []
        itemsImageURL = []
        # for i in rankings:
        #     itemsRecommended.append(df_inventory[ df_inventory['itemId'] == i].itemName.item())
        #     itemsImageURL.append(df_inventory[df_inventory['itemId']== i].itemImageURL.item())
        # print(itemsRecommended)
        # print(itemsImageURL)
        for i in rankings:
            itemName = df_inventory[df_inventory['itemId'] ==
                                    i].itemName.item()
            itemsRecommended.append(itemName)

            if os.path.exists("./static/images/moviePosters/" + itemName +
                              ".jpg"):
                url = "./static/images/moviePosters/" + itemName + ".jpg"
            else:
                url = df_inventory[df_inventory['itemId'] ==
                                   i].itemImageURL.item()
            itemsImageURL.append(url)

        return itemsRecommended, itemsImageURL

    # Set up and update models - increment the system - update offline models and clear the online model at the end of the day
    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.trainingCenter.trainModel()
        self.recEngine.resetCache()  # reset most popular
Example #24
class WebServer(object):
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        self.db = DatabaseInterface(configMap['data_dir'])
        # numberToServe: the number of items finally served to the users
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        # each object here simulates API calls over the network
        # passing an object A to the constructor of B means A will communicate with B
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        self.offlineLearner.trainModel()
        # when we start the webserver, let the offline learner train the models,
        # so that we can give recommendations immediately after start()
        self.recEngine = RecEngine(
            self.userAnalyzer, self.modelStore,
            self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))

    def getAction(self, action):
        assert (isinstance(action, Action))
        # taking the action from users
        self.onlineLearner.trainModel(action)
        # analyze action type, and save the registered user's action
        actionType = self.userAnalyzer.analyzeAction(action)
        if actionType == "registered":
            self.log.info("Recording action %s" % action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        # return the ID's for the recommended items
        assert (isinstance(request, Request))
        # provide recommendations to user
        self.log.info("responding to request: %s" % request)
        recommendations = self.recEngine.provideRecommendation(request)
        recsReranked = self.ranker.rerank(recommendations)
        return recsReranked  # a list of item ids

    def renderRecommendation(self, request):
        assert (isinstance(request, Request))
        recsReranked = self.provideRecommendation(request)
        # for testing purposes, we sort the index and output the item names
        # output is ordered by the id value
        return self.db.extract(
            DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()

    def increment(self):
        self.log.info("incrementing the system, update the models")
        # increment the whole system by one day, trigger offline training
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
Example #25
 def __init__(self, projectRoot):
     Ranker.__init__(self, projectRoot)
Example #26
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20
    num = 10
    x = 0
    summary = []
    sent_idx = [0]
    sum_len = 0
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(
            getDepParse(client_socket, doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # The sent_idx needs to be converted to reflect the corresponding
        # section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
        if sum_len > 130:
            break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
Example #27
#!/usr/bin/env python

# -------
# imports
# -------

import sys
import os

APP_PATH = os.path.dirname(os.path.realpath(__file__))
sys.path.append(APP_PATH + '/app')
sys.path.append(APP_PATH + '/app/league')
sys.path.append(APP_PATH + '/app/player')

from Ranker import Ranker
from LeagueFactory import LeagueFactory
from PlayerFactory import PlayerFactory
from PlayerEditor import PlayerEditor

# ----
# main
# ----

leagueFactory = LeagueFactory()
league = leagueFactory.getLeague(sys.stdin, sys.stdout)

# playerEditor = PlayerEditor(sys.stdin, sys.stdout, league)
# playerEditor.run()

ranker = Ranker(league, sys.stdin, sys.stdout)
ranker.run()
Example #28
def rank(option, keywords):
    ranker = Ranker(keywords)

    all = db.getAll()

    docs_scores = []

    otime = time.time()

    if option != 'mix':
        for instance in all:
            score = dict()

            wordlist = instance['words']

            score['url'] = instance['url']

            # TODO: change to switch option
            if option == 'cos':
                score[option] = ranker.cosineSimilarity(
                    wordlist, instance[option])
            elif option == 'jac':
                score[option] = ranker.jaccardSimilarity(
                    wordlist, instance[option])
            elif option == 'vae':
                score[option] = ranker.variationalAutoEncoder(
                    wordlist, instance[option])
            elif option == 'pr':
                score[option] = ranker.pagerankSimilarity(
                    wordlist, instance[option], instance['total'])
            else:
                break

            docs_scores.append(score)
    else:
        coss = []
        # jacs = []
        # vaes = []
        prs = []

        for instance in all:
            wordlist = instance['words']
            for opt in ['cos', 'pr']:
                score = dict()
                score['url'] = instance['url']
                if opt == 'cos':
                    score['score'] = ranker.cosineSimilarity(
                        wordlist, instance[opt])
                    coss.append(score)
                # elif opt == 'jac':
                #     score['score'] = ranker.jaccardSimilarity(wordlist, instance[opt])
                #     jacs.append(score)
                # elif opt == 'vae':
                #     score['score'] = ranker.variationalAutoEncoder(wordlist, instance[opt])
                #     vaes.append(score)
                elif opt == 'pr':
                    score['score'] = ranker.pagerankSimilarity(
                        wordlist, instance[opt], instance['total'])
                    prs.append(score)
                else:
                    break

        sorted_cos = sorted(coss, key=lambda i: i['score'], reverse=True)
        sorted_jac = []
        sorted_vae = []
        # sorted_jac = sorted(jacs, key=lambda i:i['score'], reverse=True)
        # sorted_vae = sorted(vaes, key=lambda i:i['score'], reverse=True)
        sorted_pr = sorted(prs, key=lambda i: i['score'], reverse=True)

        # normalize
        for instance in sorted_cos:
            instance['score'] = instance['score'] / sorted_cos[0]['score']
        # for instance in sorted_jac:
        #     instance['score'] = instance['score'] / sorted_jac[0]['score']
        # for instance in sorted_vae:
        #     instance['score'] = instance['score'] / sorted_vae[0]['score']

        # mix scores
        for instance in sorted_pr:
            score = dict()
            score['url'] = instance['url']
            prscore = instance['score'] / sorted_pr[0]['score']
            score[option] = ranker.mixSimilarity(instance['url'], prscore,
                                                 sorted_cos, sorted_jac,
                                                 sorted_vae)
            docs_scores.append(score)

    sorted_scores = sorted(docs_scores, key=lambda i: i[option], reverse=True)

    return (sorted_scores, time.time() - otime)
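
# A minimal usage sketch (not part of the original module): rank every stored
# page against a keyword list by cosine similarity. It assumes the module-level
# `db` handle and the Ranker import used above are already set up.
if __name__ == '__main__':
    top_docs, elapsed = rank('cos', ['information', 'retrieval'])
    for entry in top_docs[:5]:
        print('%s\t%.4f' % (entry['url'], entry['cos']))
    print('ranked in %.3f seconds' % elapsed)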
Example #29
0
        print '-p to toggle preloading'
        print '-k <number> to set the top K number'
        print '-d <path> to set the database path'
        print '-b <path> to set the bookkeeping path'
        sys.exit()
    elif opt == '-p':
        PRELOAD = True
    elif opt == '-k':
        K = int(arg)
    elif opt == '-d':
        PATH = arg
    elif opt == '-b':
        BK = arg

p = Preloader(PATH)
r = Ranker(K, PATH)
if PRELOAD:
    p.doc_frequencies()
    p.weights()
while True:
    query = raw_input(':')
    # print type(r)
    # print 'Start Rank Query'
    # start = time.clock()
    links, size, ssize, dur = r.query(query)
    true_size = size if size >= K else ssize

    # print 'AVG GET WEIGHT TIME: ', math.fsum([i - j for i, j in zip(r.ends, r.starts)]) / len(r.starts)

    # print 'End Rank Query'
    res = list()
Example #30
0
class Searcher:
    def __init__(self, docs_dictionary, main_dictionary, avdl, stem_suffix, ip,
                 city_dictionary):
        self.main_dictionary = main_dictionary
        self.__ranker = Ranker(docs_dictionary, main_dictionary, avdl,
                               len(docs_dictionary), stem_suffix, ip)
        self.__list_of_cities = []
        self.__city_dictionary = city_dictionary

    # will be triggered from the controller
    def search(self, query_dict, addons_dict=None):
        '''
        Searches for documents that include the query terms; the final rank is
        computed in the Ranker class.
        :param query_dict: {term : { query : tf } }
        :param addons_dict: {term : { query : tf } } - optional (title + description)
        '''

        query_dict = self.adjust_terms(query_dict)
        if addons_dict is not None:  #title + description
            addons_dict = self.adjust_terms(addons_dict)
            all_terms = list(
                set(list(query_dict.keys()) + list(addons_dict.keys())))
            #all_terms = self.merge_all_terms_to_one_list(query_dict, addons_dict)
            self.__ranker.fill_mini_posting_file(
                sorted(all_terms, key=lambda v: v.upper()))
            if self.__list_of_cities is not None:
                self.remove_not_relevant_docs()
            ranked_titles = self.__ranker.rank(query_dict)
            ranked_addons = self.__ranker.rank(addons_dict)
            self.__ranker.calculate_final_rank(
                ranked_titles, ranked_addons
            )  # saves the result in final_result and keeps the top 50 docs

        else:
            self.__ranker.fill_mini_posting_file(
                sorted(query_dict.keys(), key=lambda v: v.upper()))
            if self.__list_of_cities is not None:
                self.remove_not_relevant_docs()
            ranked_docs = self.__ranker.rank(query_dict)
            self.__ranker.final_result = ranked_docs
            self.__ranker.final_result["999"] = self.__ranker.get_top_docs(
                "999")

    def get_final_result(self):
        return self.__ranker.final_result

    def set_cities_filter_list(self, cities_list):
        self.__list_of_cities = cities_list

    def adjust_terms(self, query_dict):
        '''
        Adjusts each query term to the casing stored in the main dictionary
        (lower or upper case), merging entries that collapse to the same term.
        :param query_dict: {term : { query : tf } }
        :return: adjusted {term : { query : tf } } dictionary
        '''
        result = {}
        for term in query_dict:
            if term not in self.main_dictionary:
                value = query_dict[term]
                if term.lower() in self.main_dictionary:
                    if term.lower() not in result:
                        result[term.lower()] = value
                    else:  # exists in result -> merge
                        result[term.lower()] = self.mergi_mergi(
                            result[term.lower()], value)
                elif term.upper() in self.main_dictionary:
                    if term.upper() not in result:
                        result[term.upper()] = value
                    else:  #exists in result -> merge
                        result[term.upper()] = self.mergi_mergi(
                            result[term.upper()], value)
                else:
                    #print (term + " not exists in main dic at all")
                    if term not in result:
                        result[term] = query_dict[term]
                    else:
                        result[term] = self.mergi_mergi(
                            result[term], query_dict[term])
                    # the term does not exist in the main dictionary at all
            else:
                if term not in result:
                    result[term] = query_dict[term]
                else:
                    result[term] = self.mergi_mergi(result[term],
                                                    query_dict[term])
        return result

    def mergi_mergi(self, dic1, dic2):
        '''
        Merges two dictionaries of the form { query : tf in query }.
        Note: dic2 is updated in place and returned.
        :param dic1: first dictionary
        :param dic2: second dictionary
        :return: merged dictionary
        '''
        for q in dic1:
            if q in dic2:
                dic2[q] += dic1[q]
            else:
                dic2[q] = dic1[q]
        return dic2

    def merge_all_terms_to_one_list(self, query_dict, addons_dict):
        '''
        creates a list of merged terms
        :param query_dict: {term : { query : tf } }
        :param addons_dict: {term : { query : tf } }
        :return: list of terms
        '''
        result = []
        for term in query_dict:
            if term not in result:
                result.append(term)
        for term in addons_dict:
            if term not in result:
                result.append(term)
        return result

    def remove_not_relevant_docs(self):
        city_docs = []  # will contain list of all possible docs
        for city_name in self.__list_of_cities:
            city_docs = list(
                set(
                    list(self.__city_dictionary[city_name].dic_doc_index.keys(
                    )) + city_docs))
        city_docs = {key: None
                     for key in city_docs}  # city_docs is now a dict for faster lookups
        self.__ranker.city_docs = city_docs
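
# A small usage sketch (not from the original project) showing how mergi_mergi
# combines two {query_id: tf} dictionaries: overlapping query ids have their
# term frequencies summed, and the second dictionary is mutated and returned.
# Passing None for self is safe here because the method never uses it.
merged = Searcher.mergi_mergi(None, {"351": 2, "352": 1}, {"351": 3})
# merged == {"351": 5, "352": 1}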
Example #31
0
class Searcher:
    def __init__(self, corpus_path, posting_path, terms_dict, cities_dict, docs_dict, avg_doc_length, with_stemming,
                 with_semantics):
        self.terms_dict = terms_dict
        self.cities_dict = cities_dict
        self.docs_dict = docs_dict
        self.parser = Parse(corpus_path)  ## corpus path for stop words
        self.parser.to_stem = with_stemming
        self.posting_path = posting_path
        self.ranker = Ranker(avg_doc_length)
        self.model = None
        self.with_semantics = with_semantics
        self.with_stemming = with_stemming

    """
       Description :
           This method brings the posting list of all term in the query
       Args:
           param1: query_terms
           param2: cities

        Return:
            parsed query and words dictionary with all the posting lists of all terms in query
    """
    def get_terms_from_post(self, query_terms, cities):
        if self.with_stemming:
            path = self.posting_path + '\\sFinalPost' + '\\Final_Post'  # backslashes escaped explicitly
        else:
            path = self.posting_path + '\\FinalPost' + '\\Final_Post'

        word_dict = {}
        updated_query_terms = {}
        for term in query_terms:
            if term not in self.terms_dict:
                term_lower = term.lower()
                term_upper = term.upper()
                if term_lower in self.terms_dict:
                    tmp = query_terms[term]
                    term = term_lower
                    updated_query_terms[term] = tmp
                elif term_upper in self.terms_dict:
                    tmp = query_terms[term]
                    term = term_upper
                    updated_query_terms[term] = tmp
                else:
                    continue
            else:
                updated_query_terms[term] = query_terms[term]
            line = self.terms_dict[term][0] + 1
            term_index = linecache.getline(path, line)
            term_index = term_index.split('|')[1].split('#')
            i = 0
            if len(cities) > 0:
                cities_docs = set()
                for city in cities:
                    if self.cities_dict[city][2] is not None:
                        cities_docs.update(self.cities_dict[city][2])
                while i < len(term_index) - 1:
                    term_doc_info = string_to_dict(term_index[i])
                    for doc_id in term_doc_info:
                        doc = self.docs_dict[doc_id]
                        if doc.origin_city not in cities and doc_id not in cities_docs:
                            continue
                        if term not in word_dict:
                            word_dict[term] = {}
                        word_dict[term][doc_id] = term_doc_info[doc_id]
                    i += 1
            else:
                while i < len(term_index) - 1:
                    term_doc_info = string_to_dict(term_index[i])
                    for doc_id in term_doc_info:
                        if term not in word_dict:
                            word_dict[term] = {}
                        word_dict[term][doc_id] = term_doc_info[doc_id]
                    i += 1
        return updated_query_terms, word_dict

    """
       Description :
           This method make the search of query brings the posting list and call the ranking function,
           for ranking all the retrieved docs in the posting lists filtered by the cities list
       Args:
           param1: query
           param2: cities

        Return:
            list of the 50 most relevant ranking docs
    """
    def search(self, query, cities):
        query_terms = {}
        if self.with_semantics:
            if self.with_stemming:
                self.parser.set_stemming_bool(False)
                stem_query = self.parser.main_parser(text=query, doc=None)
                self.parser.set_stemming_bool(True)
                for word in stem_query:
                    word = word.lower()
                    if not word.isalpha():
                        continue
                    try:
                        synonyms = self.model.wv.most_similar(positive=word)
                    except Exception:  # word is not in the model vocabulary
                        continue
                    for i in range(0, 3):
                        stem_word = str(self.parser.pystemmer.stemWord((synonyms[i][0]).encode("ascii")))
                        query_terms[stem_word] = 1
                    for stem in stem_query:
                        if stem.lower() in query_terms or stem.upper() in query_terms:
                            continue
                        query_terms[stem] = stem_query[stem][0]
            else:
                query = self.parser.main_parser(text=query, doc=None)
                for word in query:
                    try:
                        synonyms = self.model.wv.most_similar(positive=word)
                    except Exception:  # word is not in the model vocabulary
                        continue
                    for i in range(0, 3):
                        query_terms[(synonyms[i][0]).encode("ascii")] = 1
                    query_terms[word] = query[word][0]
        else:
            query = self.parser.main_parser(text=query, doc=None)
            for word in query:
                query_terms[word] = query[word][0]
        query_terms, words_terms = self.get_terms_from_post(query_terms, cities)
        result = self.ranker.rank_doc(query_terms, words_terms, self.docs_dict, 1)
        return result
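
# string_to_dict is imported from a helper module in the original project and
# is not shown in this snippet; get_terms_from_post above expects it to turn
# each '#'-separated chunk of a posting line into a {doc_id: doc_info} mapping.
# A minimal sketch under an assumed "doc_id:tf" chunk format (the real on-disk
# format is not visible here):
def string_to_dict(chunk):
    doc_id, tf = chunk.split(':', 1)
    return {doc_id: int(tf)}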