Example #1
    def train_model_using_dexter_dataset(self, sentiment_processor, spotter,
                                         afinn_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []
        for n_gram_length in range(2, 10):
            for json_doc in dexter_json_doc_list:
                data = json.loads(json_doc)
                # pprint.pprint(data)
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(
                        title, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(
                        body, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
        self.logger.info('processing list of n-grams')
        sentiment_processor.cal_term_weight_on_full_corpus(afinn_filename,
                                                           ngram_list,
                                                           debug_mode=1)
        self.logger.info('processing complete')
    def dexter_dataset_sentiment(self, sentiment_processor, spotter,
                                 output_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []

        sent_by_entity_id_by_docid = {}

        file_contents = ''
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            body = self.extract_body(data)
            title = data['title']
            docid = data['docId']

            sent_by_entity_id_by_docid[docid] = {}
            for n_gram_length in range(2, 10):
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(
                        title, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][e.entity_id] += sent

                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(
                        body, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][e.entity_id] += sent
            # log the accumulated sentiment per entity and buffer the output
            for entity_id in sent_by_entity_id_by_docid[docid].keys():
                sent = sent_by_entity_id_by_docid[docid][entity_id]

                s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
                self.logger.info(s)
                file_contents = file_contents + s + '\n'

        file = open(output_filename, "w")
        file.write(file_contents)
        file.close()

        self.logger.info('processing complete')
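
A minimal sketch of how these two methods might be driven; the owner class name and file paths below are assumptions (they do not appear on this page), and the spotter is left abstract because this example calls get_entity_candidates(text, 0.5) with a confidence threshold. Project imports are omitted, as in the other snippets here.

    # hypothetical driver: SentimentModelBuilder stands in for whatever class
    # actually owns train_model_using_dexter_dataset / dexter_dataset_sentiment
    builder = SentimentModelBuilder()
    sentiment_processor = SentimentProcessor()
    spotter = ...  # any spotter exposing get_entity_candidates(text, min_confidence)

    # learn term weights over the Dexter corpus
    # ('AFINN-111.txt' is an assumed AFINN lexicon path)
    builder.train_model_using_dexter_dataset(sentiment_processor, spotter,
                                             'AFINN-111.txt')
    # then score every spotted entity, writing one
    # 'docid entity_id 0 0 [ sentiment ]' line per entity
    builder.dexter_dataset_sentiment(sentiment_processor, spotter,
                                     '/tmp/dexter_sentiment.txt')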
Example #3
    def get_only_golden_rows(self, X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset):

        dexter_json_doc_list = dexterDataset.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        golden_saliency_by_entid_by_docid = dexterDataset.get_golden_saliency_by_entid_by_docid(dexter_json_doc_list, wikipediaDataset)

        rows_in_golden = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            docid = docid_array[i]
            entity_id = entity_id_array[i]
            if docid in golden_saliency_by_entid_by_docid:
                if entity_id in golden_saliency_by_entid_by_docid[docid]:
                    rows_in_golden[i] = 1

        X_filtered = X[rows_in_golden == 1]
        y_filtered = y[rows_in_golden == 1]
        docid_array_filtered = docid_array[rows_in_golden == 1]
        entity_id_array_filtered = entity_id_array[rows_in_golden == 1]

        return X_filtered, y_filtered, docid_array_filtered, entity_id_array_filtered
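
Mirroring the commented-out call in Example #5 below, a hedged usage sketch: FilterGolden as the owning class comes from that comment, and X, y, docid_array, entity_id_array are the parallel numpy arrays returned by load_feature_matrix.

    # keep only rows whose (docid, entity_id) pair has a golden saliency judgement
    fg = FilterGolden()
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    X_g, y_g, docid_g, entity_id_g = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array, dexter_dataset, wikipedia_dataset)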
Example #4
    def create_reference_file(self, zero_less_than_2):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset(
            path=FileLocations.get_dropbox_dexter_path())

        results = ''
        # process the data
        result_count = 0
        doc_count = 0

        for document in document_list:
            data = json.loads(document)
            saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                data)
            docid = data['docId']

            sorted_list = self.get_ordered_list_from_dictionary(
                saliency_by_ent_id_golden)

            for item in sorted_list:
                entity_id = item[0]
                salience = item[1]
                if zero_less_than_2:
                    if salience < 2.0:
                        salience = 0.0
                results = results + str(docid) + ' 0 ' + str(
                    entity_id) + ' ' + str(salience) + '\n'
                result_count += 1

            doc_count += 1
            self.logger.info('Documents Processed %d Entities Processed %d ',
                             doc_count, result_count)

        fn = FileLocations.get_dropbox_intermediate_path() + "trec_ground_truth.txt"
        self.logger.info('writing to %s ', fn)
        file = open(fn, "w")
        file.write(results)
        file.close()
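
A hedged invocation sketch; the owner class name here is an assumption, everything else follows from the method above (it writes trec_ground_truth.txt to the Dropbox intermediate path, zeroing golden saliency scores below 2.0 when asked to).

    # hypothetical owner class; only create_reference_file is shown in this example
    builder = TrecGroundTruthBuilder()
    builder.create_reference_file(zero_less_than_2=True)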
Example #5
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    print(y1.shape)
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    # fg = FilterGolden()
    # X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(X1, y1, docid_array1, entity_id_array1, dexter_dataset,
    #                                                     wikipedia_dataset)

    document_list = dexter_dataset.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)

    print(y1.shape)

    # Load File B
    X2, y2, docid_array2, entity_id_array2 = load_feature_matrix(
        feature_filename=filename_B,
        feature_names=file_B_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_B_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')
Example #6
            for entity_id in all_heavy_features_by_entity_id.keys():
                output = '{0},{1},{2},{3},{4}\n'.format(
                    str(optional_docid), str(entity_id), str('?'), str('?'),
                    str(all_heavy_features_by_entity_id[entity_id]))
                file.write(output)
            file.close()

        return features_by_entity_id


if __name__ == "__main__":

    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    sfe = SelFeatureExtractor(spotter,
                              binary_classifier_threshold=0.5,
                              min_candidates_to_pass_through=5,
                              binary_classifier=None,
                              light_feature_filename=None,
                              heavy_feature_filename=None,
                              num_light_features=23,
                              break_early=False)
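
The extractor built above is consumed elsewhere through its get_features call (see Example #10); a continuation sketch under that assumption, with docid 2 borrowed from Example #9, which pairs it with this same body text.

    # spot entities in the hard-coded title/body, then extract SEL features per entity
    title_entities = spotter.get_entity_candidates(title, 2)
    body_entities = spotter.get_entity_candidates(body, 2)
    features_by_entity_id = sfe.get_features(body, body_entities, title,
                                             title_entities)
    print(features_by_entity_id)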
Example #7
    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False
    aws_util = AWSUtil()
    smb = SelModelBuilder()


    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = smb.get_dexter_datset()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)

    golden_saliency_by_entid_by_docid = dd.get_golden_saliency_by_entid_by_docid(document_list, wikipediaDataset)


    output_filename = FileLocations.get_dropbox_intermediate_path() + 'sel_all_features_golden_spotter.docid.' + str(min_docid) + '-' + str(max_docid) + '.txt'
    heavy_feature_filename = FileLocations.get_temp_path() + 'sel_heavy_features_golden_spotter.docid.' + str(min_docid) + '-' + str(max_docid) + '.txt'
    light_feature_filename = FileLocations.get_temp_path() + 'sel_light_features_golden_spotter.docid.' + str(min_docid) + '-' + str(max_docid) + '.txt'

    document_to_feature_converter = SelFeatureExtractor(spotter,
                                                        binary_classifier_threshold=0.5,
                                                        min_candidates_to_pass_through=5000,
                                                        binary_classifier=None,
                                                        light_feature_filename=light_feature_filename,
                                                        heavy_feature_filename=heavy_feature_filename,
                                                        num_light_features=23,
                                                        break_early=break_early)

    sel_feat_to_sent = None # SelFeatToSent(FileLocations.get_dropbox_intermediate_path() + 'sel_GradientBoostingRegressor.pickle')
Example #8
    def build_output_using_dexter_dataset(self, spotter,
                                          golden_saliency_by_entid_by_docid,
                                          output_filename, docid_set,
                                          use_rand_values):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        salience_by_entity_by_doc_id = {}
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            if docid_set is None or docid in docid_set:

                salience_by_entity_by_doc_id[docid] = {}
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, docid)
                body_entities = spotter.get_entity_candidates(body, docid)

                features_by_entity_id = {}

                for e in title_entities:
                    golden = 0  # default when no golden judgement exists
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]
                for e in body_entities:
                    golden = 0  # default when no golden judgement exists
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]

                for entity_id in features_by_entity_id.keys():
                    golden = 0
                    if docid in golden_saliency_by_entid_by_docid:
                        if entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                entity_id]

                    line = str(docid) + ',' + str(entity_id) + ',' + str(
                        golden) + ',0,' + str(features_by_entity_id[entity_id])

                    if file is not None:
                        file.write(line)
                        file.write('\n')

                    sentiment = features_by_entity_id[entity_id][0]
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
                    self.logger.debug('sent %f', sentiment)

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id
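
A sketch of how this method might be called with the setup from Example #7; treating it as a method of SelModelBuilder (smb) is an assumption, and the output filename and docid set below are illustrative only.

    # write the golden (or random-baseline) value per entity for a few documents
    salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
        spotter,
        golden_saliency_by_entid_by_docid,
        FileLocations.get_temp_path() + 'golden_baseline.txt',  # assumed path
        docid_set={1, 2, 3},     # illustrative; None would process every document
        use_rand_values=False)   # False -> golden saliency, True -> random baseline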
Example #9

    # ___________Entry Point To Class________________________________________________

    def get_feature_list_by_ent(self, body, title, spotter, very_light=False, docid=-1):
        entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
            self.get_entity_saliency_list(body, title, spotter, very_light, docid)
        return features_by_ent_id, name_by_entity_id

    # ___________________________________________________________

if __name__ == "__main__":

    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(),'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    light_feature_calculator = SELLightFeatureCalculator()

    combiner = SELLightFeatureCombiner(light_feature_calculator)
    features_by_ent_id, name_by_entity_id = combiner.get_feature_list_by_ent(
        body, title, spotter, very_light=False, docid=2)

    logger = logging.getLogger(__name__)
    logger.info(features_by_ent_id)
    logger.info(name_by_entity_id)
Example #10
    def build_output_using_dexter_dataset(self, spotter,
                                          golden_saliency_by_entid_by_docid,
                                          output_filename,
                                          document_to_feature_converter,
                                          tosent_converter, test_docid_set,
                                          train_docid_set):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        line_num = 0
        salience_by_entity_by_doc_id = {}
        for json_doc in dexter_json_doc_list:
            line_num += 1
            if line_num % 100 == 0:
                self.logger.info('Processed %d lines.', line_num)
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            # if docid in test_docid_set or docid in train_docid_set:

            salience_by_entity_by_doc_id[docid] = {}
            body = self.extract_body(data)
            title = data['title']
            title_entities = spotter.get_entity_candidates(title, docid)
            body_entities = spotter.get_entity_candidates(body, docid)
            # self.logger.info('Location:A')
            features_by_entity_id = document_to_feature_converter.get_features(
                body, body_entities, title, title_entities)
            # self.logger.info('Location:B.1')
            data_matrix = None
            for entity_id in features_by_entity_id.keys():
                if data_matrix is None:
                    data_matrix = np.array(
                        features_by_entity_id[entity_id]).reshape(1, -1)
                else:
                    row = np.array(features_by_entity_id[entity_id]).reshape(
                        1, -1)
                    data_matrix = np.concatenate((data_matrix, row), axis=0)
            # self.logger.info('Location:B.2')
            sentiment_array = tosent_converter.get_salient_from_numpy_matrix(
                data_matrix)
            # self.logger.info('Location:B.3')
            i = 0
            for entity_id in features_by_entity_id.keys():
                sentiment = sentiment_array[i]
                i += 1
                golden = 0
                if docid in golden_saliency_by_entid_by_docid:
                    if entity_id in golden_saliency_by_entid_by_docid[docid]:
                        golden = golden_saliency_by_entid_by_docid[docid][
                            entity_id]
                line = str(docid) + ',' + str(entity_id) + ',' + str(
                    golden) + ',0,' + str(features_by_entity_id[entity_id])
                if file is not None:
                    file.write(line)
                    file.write('\n')

                if docid in test_docid_set:
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
            # self.logger.info('Location:C')

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id