    def generate_natural_language_prediction_scores(self,
                                                    test_questions,
                                                    prediction_file_location,
                                                    collection_id,
                                                    num_rows=10):
        """
        Generates runtime requests using the data from the input test file, submits them to the ranker associated with
            the input ranker id and writes returned predictions to the specified output path.  The predictions are in
            the same sequence as the feature vectors in the test file. However, since RaaS only returns top 10 ranked
            documents the remaining document scores are defaulted to -1 (with confidence 0)

        :param LabelledQueryStream test_questions: a csv containing data to use for the requests (specifically
            only care about the question_text)
        :param str prediction_file_location: valid path for the prediction file to be created (over writes existing)
        :param str collection_id: the collection id at which the queries will be pointed to in the cluster
        :param int or None num_rows: the number of predictions to write to the prediction file. Defaults to 10
        """
        self.logger.info(
            "Sending runtime requests from <<%s>> to collection: <<%s>> (predictions will be written to: <<%s>>)"
            % (test_questions, collection_id, prediction_file_location))

        temp_file = get_temp_file(prediction_file_location)
        stats = defaultdict(float)
        with smart_file_open(temp_file, 'w') as prediction_outfile:
            writer = csv.writer(prediction_outfile, delimiter=' ')
            for query in test_questions:
                stats['num_questions'] += 1
                self.logger.debug("Generate predictions for query <<%s>>" %
                                  query.get_qid())
                predictions = self._get_runtime_predictions(
                    stats['num_questions'],
                    query_text=query.get_qid(),
                    collection_id=collection_id,
                    num_results_to_return=num_rows)
                if predictions:
                    stats['num_results_returned'] += len(predictions)
                    self._write_results_to_file(predictions, writer)
                else:
                    stats['num_queries_which_doesnt_have_any_results'] += 1
                if self.logger.isEnabledFor(
                        logging.DEBUG) or stats['num_questions'] % 500 == 0:
                    self.logger.info('Generated predictions for %d queries' %
                                     stats['num_questions'])

            if stats['num_questions'] < 1:
                raise ValueError("No test instances found in the file")
        move(temp_file, prediction_file_location)

        self.logger.info(
            "Completed getting runtime predictions for %d questions" %
            stats['num_questions'])
        return stats
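
The docstring's padding behaviour is worth making concrete. Below is a minimal, self-contained sketch of what "remaining document scores default to -1 (with confidence 0)" means for the rows that get written; the pad_predictions helper and the (score, confidence) row layout are illustrative assumptions, not the class's actual _write_results_to_file implementation.

import csv

def pad_predictions(predictions, num_rows=10):
    # Keep at most num_rows returned (score, confidence) pairs...
    padded = list(predictions[:num_rows])
    # ...and fill any remaining slots with the default score -1 / confidence 0
    while len(padded) < num_rows:
        padded.append((-1, 0.0))
    return padded

with open('predictions.txt', 'w', newline='') as prediction_outfile:
    writer = csv.writer(prediction_outfile, delimiter=' ')
    for score, confidence in pad_predictions([(2.7, 0.81), (1.3, 0.12)]):
        writer.writerow([score, confidence])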
Example #2
    def _drop_answer_id_col_from_feature_file(self, train_file_location):
        file_without_aid = insert_modifier_in_filename(train_file_location, 'no_aid')
        if path.isfile(file_without_aid):
            self.logger.info('Found a previously generated version of the training file without answer id column, '
                             're-using it: %s' % file_without_aid)
        else:
            self.logger.info('Generating a version of the feature file without answer id (which is what ranker'
                             ' training expects)')
            temp_file = get_temp_file(file_without_aid)
            with smart_file_open(temp_file, 'w') as outfile:
                writer = csv.writer(outfile)
                with smart_file_open(train_file_location) as infile:
                    reader = csv.reader(infile)
                    for row in reader:
                        # Keep the qid (column 0) and everything after the answer id (column 1)
                        writer.writerow(row[:1] + row[2:])
            move(temp_file, file_without_aid)
            self.logger.info('Done generating file: %s' % file_without_aid)
        return file_without_aid
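
The slicing in the loop above is the entire transformation, so a one-line demonstration may help; the sample row values here are made up.

# qid, answer_id, feature_1, feature_2, label
row = ['q1', 'ans42', '0.3', '0.7', '1']
# row[:1] keeps the qid, row[2:] keeps the features and label; the answer id is dropped
print(row[:1] + row[2:])  # ['q1', '0.3', '0.7', '1']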
Example #3
    def _generate_sparse_format_file(self, feature_file):
        sparse_file = insert_modifier_in_filename(feature_file, 'sparse_format')
        if path.isfile(sparse_file):
            self.logger.info("Re-using previously generated sparse format file: %s" % sparse_file)
        else:
            self.logger.info('Generating a sparse version of the feature file (zeros replaced with empty columns '
                             'which the ranker knows how to deal with)')
            temp_file = get_temp_file(sparse_file)
            with smart_file_open(temp_file, 'w') as outfile:
                writer = csv.writer(outfile)
                with smart_file_open(feature_file) as infile:
                    reader = csv.reader(infile)
                    for row in reader:
                        # Blank out zero-valued feature columns (the ranker treats an
                        # empty column as a zero); leave the qid and label columns as-is
                        writer.writerow(row[:1] + ['' if value in ('0', '0.0') else value
                                                   for value in row[1:-1]] + row[-1:])
            move(temp_file, sparse_file)
            self.logger.info('Done generating file: %s' % sparse_file)

        return self._get_file_size(sparse_file), sparse_file
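
A quick sketch of the sparse conversion described in the log message, assuming zero-valued feature columns become empty strings while the qid and label columns are left untouched (the exact column layout is an assumption here); the values are illustrative only.

# qid, feature_1, feature_2, feature_3, label
row = ['q1', '0', '0.7', '0.0', '1']
sparse = row[:1] + ['' if value in ('0', '0.0') else value for value in row[1:-1]] + row[-1:]
print(sparse)  # ['q1', '', '0.7', '', '1']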
Example #4
    def generate_ranker_predictions(self, ranker_id, test_file_location, prediction_file_location,
                                    file_has_answer_ids=True):
        """
        Generates runtime requests using the data from the input test file, submits them to the ranker associated with
            the input ranker id and writes returned predictions to the specified output path.  The predictions are in
            the same sequence as the feature vectors in the test file. However, since RaaS only returns top 10 ranked
            documents the remaining document scores are defaulted to -1 (with confidence 0)

        :param str ranker_id: id for the associated ranker in bluemix
        :param str test_file_location: a csv containing data to use for the requests (question_id, feature_1,
            feature_2,..., label)
        :param str prediction_file_location: valid path for the prediction file to be created (over writes existing)
        :param bool file_has_answer_ids: a flag to indicate whether or not the file has an answer id column (if not,
            one will be mocked)
        """
        self.logger.info(
            "Sending runtime requests from <<%s>> to ranker id: <<%s>> (predictions will be written to: <<%s>>)" % (
                test_file_location, ranker_id, prediction_file_location))

        with smart_file_open(test_file_location) as test_file:
            temp_file = get_temp_file(prediction_file_location)
            stats = defaultdict(float)
            with smart_file_open(temp_file, 'w') as prediction_outfile:
                query_stream = RankerFeatureFileStream(fh=test_file, file_has_answer_ids=file_has_answer_ids)

                for query in query_stream:
                    stats['num_questions'] += 1
                    self.logger.debug("Generate predictions for qid <<%s>>" % query.get_qid())
                    ranked_candidate_answers = self._call_runtime(ranker_id, query, query_stream.feature_names)
                    num_answers_written = self._write_ranker_preds_to_prediction_file(query.get_qid(),
                                                                                      ranked_candidate_answers,
                                                                                      prediction_outfile)
                    if num_answers_written != query.get_answer_count():
                        raise ValueError(
                            "Error getting ranked answers for qid %s.  Expected %d answers, but only got %d: %s" %
                            (query.get_qid(), query.get_answer_count(), num_answers_written,
                             ranked_candidate_answers))
                    sleep(0.001)

                if stats['num_questions'] < 1:
                    raise ValueError("No test instances found in the file")
            move(temp_file, prediction_file_location)

            self.logger.info("Completed getting runtime predictions for %d questions" % stats['num_questions'])