Example No. 1
    def build_file_train_model_produce_output(
            self, feature_names, n_gram_length, sentiment_processor, spotter,
            golden_saliency_by_entid_by_docid, dexter_dataset,
            wikipedia_dataset):
        feature_filename = (FileLocations.get_dropbox_intermediate_path() +
                            'sentiment_simple_ngram_' + str(n_gram_length) +
                            '.txt')
        document_to_feature_converter = SimpleSentiment(
            sentiment_processor, n_gram_length=n_gram_length)

        model_filename = (FileLocations.get_dropbox_intermediate_path() +
                          'simple_sentiment_model_ngram_' +
                          str(n_gram_length) + '.pickle')

        tosent_converter = SimpleGBRT(model_filename)
        test_docid_set = set(Const.TESTSET_DOCID_LIST)
        train_docid_set = set(Const.TRAINSET_DOCID_LIST)
        salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
            spotter, golden_saliency_by_entid_by_docid, feature_filename,
            document_to_feature_converter, tosent_converter, test_docid_set,
            train_docid_set)
        # The 'if not os.path.isfile(model_filename)' guard has been removed,
        # so the model is rebuilt on every run.
        self.train_model(feature_filename, feature_names, dexter_dataset,
                         wikipedia_dataset, model_filename)

        trc = TrecReferenceCreator()
        prefix = str(n_gram_length) + '_n_gram_x_temp'
        trc.create_results_file(salience_by_entity_by_doc_id, prefix)
        report, ndcg, trec_by_id = trc.get_report(
            FileLocations.get_dropbox_intermediate_path() +
            'trec_ground_truth.txt', prefix)
        trc.logger.info('\nTrec Eval Results:\n%s', report)

        return salience_by_entity_by_doc_id, ndcg, trec_by_id
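
The pickled model written by this method can be loaded back with the standard
pickle module. A minimal sketch, assuming a model for the chosen n-gram length
already exists on disk; the FileLocations import path is a guess, since the
listing only shows the import of TrecReferenceCreator:

import pickle

from sellibrary.locations import FileLocations  # import path is an assumption

n_gram_length = 3  # must match the length the model was trained with
model_filename = (FileLocations.get_dropbox_intermediate_path() +
                  'simple_sentiment_model_ngram_' + str(n_gram_length) +
                  '.pickle')
with open(model_filename, 'rb') as handle:
    model = pickle.load(handle)
print(type(model))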
Example No. 2
    def get_ndcg_and_trec_eval(self, feature_filename, model_filename,
                               feature_names, docid_set, wikipediaDataset,
                               dexterDataset, per_document_ndcg):
        self.logger.info('loading model %s', model_filename)

        with open(model_filename, 'rb') as handle:
            model = pickle.load(handle)

        salience_by_entity_by_doc_id = self.get_salience_by_entity_by_doc_id(
            feature_filename, model, docid_set, feature_names, dexterDataset,
            wikipediaDataset, filter_for_interesting=False)

        trc = TrecReferenceCreator()
        prefix = 'model_runner_x_temp'
        trc.create_results_file(salience_by_entity_by_doc_id, prefix)
        overall_report, overall_ndcg, overall_trec_val_by_name = \
            trc.get_report(FileLocations.get_dropbox_intermediate_path() +
                           'trec_ground_truth.txt', prefix)

        ndcg_by_docid = {}
        trec_val_by_name_by_docid = {}
        if per_document_ndcg:
            skipped = []
            for docid in docid_set:
                salience_by_entity_by_doc_id_b = {}
                if docid in salience_by_entity_by_doc_id:
                    salience_by_entity_by_doc_id_b[docid] = salience_by_entity_by_doc_id[docid]
                    trc = TrecReferenceCreator()
                    prefix = 'model_runner_x_temp'
                    trc.create_results_file(salience_by_entity_by_doc_id_b, prefix)
                    report, ndcg, trec_val_by_name = trc.get_report(
                        FileLocations.get_dropbox_intermediate_path() +
                        'trec_ground_truth.txt', prefix)
                    trc.logger.info('\nTrec Eval Results:\n%s', report)
                    ndcg_by_docid[docid] = ndcg
                    trec_val_by_name_by_docid[docid] = trec_val_by_name
                else:
                    self.logger.warning('No data for docid %d, skipping', docid)
                    skipped.append(docid)
            self.logger.info('per-document ndcg: %s', ndcg_by_docid)
            self.logger.info('skipped in per-document ndcg: %s', skipped)

        trc.logger.info('\n_____________________________________\nTrec Eval Results Overall:\n%s', overall_report)

        return overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid
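
When per_document_ndcg is True the method also returns an NDCG value for every
document it could score. A small self-contained sketch of how that dictionary
might be summarised; the sample values below are invented for illustration:

# Invented values; the real dictionary is the second value returned by
# get_ndcg_and_trec_eval(..., per_document_ndcg=True).
ndcg_by_docid = {101: 0.62, 102: 0.48, 103: 0.91}

mean_ndcg = sum(ndcg_by_docid.values()) / len(ndcg_by_docid)
worst_first = sorted(ndcg_by_docid.items(), key=lambda kv: kv[1])
print('mean per-document NDCG: %.3f' % mean_ndcg)
print('lowest-scoring documents:', worst_first[:2])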
Example No. 3
            None,
            train_docid_set,
            wikipediaDataset,
            filter_for_interesting=filter_for_interesting)
        builder = SalienceBasedOnTFModelBuilder()
        builder.train_model(output_filename,
                            document_to_feature_converter.tf_feature_names,
                            datasetDexter, wikipediaDataset, model_filename)

    tosent_converter = SimpleGBRT(model_filename)
    salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
        spotter,
        golden_saliency_by_entid_by_docid,
        output_filename,
        document_to_feature_converter,
        tosent_converter,
        report_docid_set,
        wikipediaDataset,
        filter_for_interesting=filter_for_interesting,
        json_doc_list=document_list)

    if use_dexter_dataset:
        trc = TrecReferenceCreator()
        lines_written = trc.create_results_file(salience_by_entity_by_doc_id,
                                                'x_temp')
        if lines_written > 0:
            report, ndcg, p_at = trc.get_report(
                FileLocations.get_dropbox_intermediate_path() +
                'trec_ground_truth.txt', 'x_temp')
            trc.logger.info(' Trec Eval Results:\n %s', report)
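
get_report is only called above when create_results_file reports that it wrote
at least one line. A sketch of the same guard in isolation: the shape of the
salience dictionary (docid to entity id to score) is inferred from the other
examples, and the FileLocations import path is an assumption:

from sellibrary.trec.trec_util import TrecReferenceCreator
from sellibrary.locations import FileLocations  # import path is an assumption

# Invented scores: docid -> {entity_id -> estimated salience}.
salience_by_entity_by_doc_id = {1: {1234: 0.8, 5678: 0.1}}

trc = TrecReferenceCreator()
lines_written = trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
if lines_written > 0:  # skip trec_eval when no result lines were written
    report, ndcg, p_at = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() +
        'trec_ground_truth.txt', 'x_temp')
    print(report)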
Example No. 4
    def main(self, from_, to_, measurement, pipeline_portion):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset()

        # process the data
        count = 0

        slcs = SpotlightCachingSpotter()
        light_features_to_zero = []
        lfe = SELLightFeatureExtractor(light_features_to_zero)
        gbrt = None  # GBRT('fred')
        ndcg = NDCG()

        min_candidates_to_pass_through = 3
        binary_classifier_threshold = 0.5
        spotter_confidence = 0.5
        corpus_name = 'dexter_fset_02_'
        break_early = False

        file_prefix = (corpus_name + '_' + str(from_) + '_to_' + str(to_) +
                       '_')
        salience_by_entity_by_doc_id = {}
        time_by_docid = {}

        light_feature_filename = (FileLocations.get_temp_path() +
                                  file_prefix + 'light_output_partial.txt')

        with open(light_feature_filename, "a") as file:
            file.write(
                '\ndocId, entity_id, golden_salience, estimated_salience, '
                '[light_features]')

        for document in document_list:
            data = json.loads(document)
            docid = data['docId']

            if (count in range(from_, (to_ + 1)) and measurement == 'LINE') or \
                    (docid in range(from_, (to_ + 1)) and measurement == 'DOCID'):
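                # Selected either by position in the dataset file ('LINE') or
                # by the document's Dexter docId ('DOCID').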
                self.logger.info('_______________________________________')
                self.logger.info('Starting processing of docid = %d  line=%d ',
                                 docid, count)
                start_time = time.time()
                saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                    data)
                body = self.extract_body(data)
                title = data['title']

                pipeline = Pipeline002(slcs, lfe, gbrt, ndcg,
                                       light_feature_filename)

                calculated_saliency_by_entity_id, golden_salience_by_entity_id, discount_sum, model_dcgs = \
                    pipeline.process_document(
                        docid,
                        body, title,
                        file_prefix, break_early=break_early,
                        golden_salience_by_entity_id=saliency_by_ent_id_golden,
                        min_candidates_to_pass_through=min_candidates_to_pass_through,
                        binary_classifier_threshold=binary_classifier_threshold,
                        spotter_confidence=spotter_confidence)

                salience_by_entity_by_doc_id[docid] = \
                    calculated_saliency_by_entity_id
                self.logger.info('count = %d, docId = %d ', count, docid)
                self.logger.info('calculated_saliency_by_entity_id = %s ',
                                 str(calculated_saliency_by_entity_id))
                self.logger.info('discount_sum = %s ', str(discount_sum))
                self.logger.info('model_dcgs = %s ', str(model_dcgs))

                diff = time.time() - start_time

                time_by_docid[docid] = diff
                self.logger.info('Times taken %s', time_by_docid)
                self.logger.info('Time taken for docid=%d, time=%f', docid,
                                 diff)

            count += 1
        self.logger.info('Times taken by docid: %s', time_by_docid)

        trc = TrecReferenceCreator()
        trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
        report, ndcg, p_at = trc.get_report(
            FileLocations.get_dropbox_intermediate_path() +
            'trec_ground_truth.txt', 'x_temp')
        self.logger.info(' Trec Eval Results:\n %s', report)
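
A hypothetical way to invoke this method: the class that owns main() is not
shown in the listing, so the DocumentProcessor name below is an assumption
made purely for illustration, as is the pipeline_portion value.

if __name__ == "__main__":
    runner = DocumentProcessor()  # assumed owner of main(); not in the listing
    # Process the documents whose docId lies in [1, 50]; pass
    # measurement='LINE' instead to select by line number in the dataset file.
    runner.main(1, 50, 'DOCID', 1.0)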
Example No. 5
from sellibrary.trec.trec_util import TrecReferenceCreator

if __name__ == "__main__":
    df = TrecReferenceCreator()
    df.create_reference_file(True)