Example #1
    def _calc_a_file(self, files_to_compare, debug):
        assert(isinstance(files_to_compare, FilesToCompare))

        # Reading test answers.
        test_opins = OpinionCollection.from_file(
            files_to_compare.test_filepath,
            self.synonyms_filepath,
            stemmer=self.stemmer)

        # Reading etalon answers.
        etalon_opins = OpinionCollection.from_file(
            files_to_compare.etalon_filepath,
            self.synonyms_filepath,
            stemmer=self.stemmer)

        if debug:
            print "{} <-> {}, {}".format(
                    files_to_compare.test_filepath,
                    files_to_compare.etalon_filepath,
                    files_to_compare.index)

        # Comparing test and etalon results.
        results = self._check(etalon_opins, test_opins)

        # Save the comparison results to a file.
        # TODO. remove path declaration from here.
        comparison_file = "{}/art{}.comp.txt".format(
                self.user_answers, str(files_to_compare.index))

        if debug:
            print "Save comparison file: {}".format(comparison_file)

        results.to_csv(comparison_file)

        return self._calcPrecisionAndRecall(results)
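
A minimal call sketch for the method above; the evaluator instance, its construction, and the list of FilesToCompare pairs are assumptions, not part of the source:

    # Hypothetical driver: evaluate every (test, etalon) file pair.
    for pair in files_to_compare_list:  # FilesToCompare instances (assumed)
        result = evaluator._calc_a_file(pair, debug=True)
        print("art{}: {}".format(pair.index, result))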
Example #2
    def to_opinion_collections(self, news_indices, synonyms):
        # O(N^2)
        assert (isinstance(news_indices, list))
        assert (isinstance(synonyms, SynonymsCollection))

        result = []
        for news_ID in news_indices:
            result_opinions = OpinionCollection(None, synonyms)

            for r in self.relations:
                assert (isinstance(r, ExtractedRelation))

                if r.text_position.news_ID != news_ID:
                    continue

                if r.label == NeutralLabel():  # ignore neutral labels
                    continue

                o = r.create_opinion()
                if not result_opinions.has_opinion_by_synonyms(o):
                    result_opinions.add_opinion(o)

            result.append((news_ID, result_opinions))

        return result
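
A hedged usage sketch; `relations_collection` (an instance of the class above) and `synonyms` are assumed to already exist:

    # Convert labeled relations into one opinion collection per news index.
    for news_ID, opinions in relations_collection.to_opinion_collections([1, 2, 3], synonyms):
        print("news {}: {} opinions".format(news_ID, len(opinions)))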
Example #3
    def predict(self, dest_data_type=DataType.Test):

        self.relation_collections[dest_data_type].reset_labels()

        for index, relation_groups in enumerate(
                self.relation_collections[dest_data_type].
                iter_by_linked_relations_groups(self.Settings.BatchSize)):

            batch = Batch(relation_groups, self.Settings.GroupSize)
            feed_dict = self.create_feed_dict(batch, dest_data_type)

            result = self.sess.run([self.network.Labels], feed_dict=feed_dict)
            uint_labels = result[0]

            for group_index, group in enumerate(batch.iter_groups):
                for relation in group:
                    assert (isinstance(relation, ExtractedRelation))
                    self.relation_collections[dest_data_type].apply_label(
                        label=Label.from_uint(int(uint_labels[group_index])),
                        relation_id=relation.relation_id)

        for news_ID in self.io.get_data_indices(dest_data_type):
            collection = OpinionCollection(None, self.synonyms,
                                           self.Settings.Stemmer)
            self.relation_collections[dest_data_type].fill_opinion_collection(
                collection,
                news_ID,
                lambda labels: labels[0],
                debug_check_collection=False)

            collection.save(
                self.io.get_opinion_output_filepath(
                    news_ID, self.io.get_model_root(dest_data_type)))

        return self._evaluate(dest_data_type, self.Settings.Stemmer)
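
A minimal call sketch; the `model` variable holding an instance of the class above is an assumption:

    # Label the Test split, write per-news opinion collections, and evaluate.
    eval_result = model.predict(dest_data_type=DataType.Test)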
Example #4
def read_opinions(filepath,
                  synonyms,
                  custom_opin_ends_iter=None,
                  read_sentiment=True,
                  skip_non_added=True):
    assert (isinstance(synonyms, SynonymsCollection))
    assert (callable(custom_opin_ends_iter) or custom_opin_ends_iter is None)
    assert (isinstance(read_sentiment, bool))
    assert (isinstance(skip_non_added, bool))

    opinions = OpinionCollection(opinions=[], synonyms=synonyms)

    it = __iter_opinion_end_values(filepath, read_sentiment) if custom_opin_ends_iter is None \
        else custom_opin_ends_iter(read_sentiment)

    for left_value, right_value, sentiment in tqdm(it, "Reading opinions:"):

        o = Opinion(value_left=left_value,
                    value_right=right_value,
                    sentiment=Label.from_int(sentiment))

        add_result = opinions.try_add_opinion(o)

        if add_result is False:
            msg = "Warning: opinion '{}->{}' was skipped!".format(
                o.value_left, o.value_right)
            if not skip_non_added:
                raise Exception(msg)
            print(msg)

    return opinions
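
A short usage sketch under stated assumptions: the file paths are placeholders, `stemmer` is assumed to exist, and the SynonymsCollection loader follows the signature seen in Example #11:

    synonyms = SynonymsCollection.from_file("synonyms.txt", stemmer=stemmer)
    opinions = read_opinions(filepath="art1.opin.txt",
                             synonyms=synonyms,
                             read_sentiment=True,
                             skip_non_added=True)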
Example #5
    def _process_into_collections(self, indices, data_type):
        """
        Processing all parameters into collections.
        returns:
            NewsWordsCollection and RelationCollection
        """
        def find_feature_vector_for_opinion(opinion_vector_collections, opinion):
            assert(isinstance(opinion_vector_collections, list))

            for collection in opinion_vector_collections:
                assert(isinstance(collection, OpinionVectorCollection))
                if not collection.has_opinion(opinion):
                    continue
                return collection.find_by_opinion(opinion)

            return None

        assert(isinstance(indices, list))

        erc = ExtractedRelationsCollection()
        ntc = NewsTermsCollection()
        for news_index in indices:
            assert(isinstance(news_index, int))

            entity_filepath = self.io.get_entity_filepath(news_index)
            news_filepath = self.io.get_news_filepath(news_index)
            opin_filepath = self.io.get_opinion_input_filepath(news_index)
            neutral_filepath = self.io.get_neutral_filepath(news_index, data_type)

            news = News.from_file(news_filepath,
                                  EntityCollection.from_file(entity_filepath, self.Settings.Stemmer),
                                  stemmer=self.Settings.Stemmer)

            opinions_collections = [OpinionCollection.from_file(neutral_filepath,
                                                                self.io.get_synonyms_collection_filepath(),
                                                                self.Settings.Stemmer)]
            if data_type == DataType.Train:
                opinions_collections.append(OpinionCollection.from_file(opin_filepath,
                                                                        self.io.get_synonyms_collection_filepath(),
                                                                        self.Settings.Stemmer))

            news_terms = NewsTerms.create_from_news(news_index, news, keep_tokens=self.Settings.KeepTokens)

            for relations, opinion in self._extract_relations(opinions_collections, news, news_terms):

                feature_vector = find_feature_vector_for_opinion(self.get_opinion_vector_collection(news_index, data_type),
                                                                 opinion)

                erc.add_news_relations(relations,
                                       opinion,
                                       news_terms,
                                       news_index,
                                       feature_vector)
            ntc.add_news_terms(news_terms)

        return ntc, erc
Example #6
    def __read_collection(self, io, data_type, settings):
        assert(isinstance(io, RuSentRelNetworkIO))
        assert(isinstance(data_type, unicode))
        assert(isinstance(settings, CommonModelSettings))

        erc = ExtractedRelationsCollection()
        ntc = NewsTermsCollection()
        entities_list = []
        missed_relations_total = 0
        for news_index in io.get_data_indices(data_type):
            assert(isinstance(news_index, int))

            entity_filepath = io.get_entity_filepath(news_index)
            news_filepath = io.get_news_filepath(news_index)
            opin_filepath = io.get_etalon_doc_opins_filepath(news_index)
            neutral_filepath = io.get_neutral_filepath(news_index, data_type)

            entities = EntityCollection.from_file(entity_filepath, settings.Stemmer, self.__synonyms)

            news = News.from_file(news_filepath, entities)

            opinions_collections = [OpinionCollection.from_file(neutral_filepath, self.__synonyms)]
            if data_type == DataType.Train:
                opinions_collections.append(OpinionCollection.from_file(opin_filepath, self.__synonyms))

            news_terms = NewsTerms.create_from_news(news_index, news, keep_tokens=settings.KeepTokens)
            news_terms_helper = NewsTermsHelper(news_terms)

            if DebugKeys.NewsTermsStatisticShow:
                news_terms_helper.debug_statistics()
            if DebugKeys.NewsTermsShow:
                news_terms_helper.debug_show_terms()

            for relations, opinion, opinions in self.__extract_relations(opinions_collections, news, news_terms):
                reversed = ContextModelInitHelper.__find_or_create_reversed_opinion(opinion, opinions_collections)
                missed = erc.add_news_relations(relations=relations,
                                                label=self.__labels_helper.create_label_from_opinions(forward=opinion, backward=reversed),
                                                news_terms=news_terms,
                                                news_index=news_index,
                                                check_relation_is_correct=lambda r: Sample.check_ability_to_create_sample(
                                                    window_size=settings.TermsPerContext,
                                                    relation=r))
                missed_relations_total += missed

            ntc.add_news_terms(news_terms)
            entities_list.append(entities)

        return ntc, erc, entities_list, missed_relations_total
Example #7
    def __clone_with_different_label(self, opinions, label):
        assert(isinstance(opinions, OpinionCollection))
        assert(isinstance(label, Label))

        ro = OpinionCollection(opinions=[],
                               synonyms=self.Synonyms)

        for o in opinions:
            assert(isinstance(o, Opinion))
            no = Opinion(value_left=o.value_left,
                         value_right=o.value_right,
                         sentiment=label)

            ro.add_opinion(no)

        return ro
Example #8
    def calc_a_file(self, files_to_compare, debug):
        assert (isinstance(files_to_compare, FilesToCompare))

        # Reading test answers.
        test_opins = OpinionCollection.from_file(
            filepath=files_to_compare.TestFilepath, synonyms=self.__synonyms)

        # Reading etalon answers.
        etalon_opins = OpinionCollection.from_file(
            filepath=files_to_compare.EtalonFilepath, synonyms=self.__synonyms)

        if debug:
            print "{} <-> {}, {}".format(files_to_compare.TestFilepath,
                                         files_to_compare.EtalonFilepath,
                                         files_to_compare.index)

        return test_opins, etalon_opins
Example #9
    def __save_etalon(self, relation_collection_helper):
        assert (isinstance(relation_collection_helper,
                           ExtractedRelationsCollectionHelper))

        relation_collection_helper.save_into_opinion_collections(
            create_opinion_collection=lambda: OpinionCollection(
                opinions=None, synonyms=self.ReadOnlySynonymsCollection),
            create_filepath_by_news_id=lambda news_id:
                self.IO.get_etalon_doc_opins_filepath(news_id),
            label_calculation_mode=LabelCalculationMode.FIRST_APPEARED)
Example #10
    def __extract_sentence_opinion_refs(text_objects_collection, title_opinions, synonyms):
        assert(isinstance(text_objects_collection, TextObjectsCollection))

        opinion_list = []
        opinion_refs = []
        added_opinions = OpinionCollection(opinions=None, synonyms=synonyms)

        TextProcessor.__setup_tags(text_objects_collection=text_objects_collection,
                                   synonyms=synonyms)

        for l_obj in text_objects_collection:
            for r_obj in text_objects_collection:

                if l_obj.CollectionInd == r_obj.CollectionInd:
                    continue

                opinion = Opinion(value_left=l_obj.get_value(),
                                  value_right=r_obj.get_value(),
                                  sentiment=NeutralLabel())

                title_already_has_opinion = title_opinions.has_synonymous_opinion(opinion)
                is_already_added = added_opinions.has_synonymous_opinion(opinion)

                is_appropriate = title_already_has_opinion and not is_already_added

                if not is_appropriate:
                    continue

                opinion = title_opinions.get_synonymous_opinion(opinion)
                o = RefOpinion(left_index=l_obj.CollectionInd,
                               right_index=r_obj.CollectionInd,
                               sentiment=opinion.sentiment)
                opinion_refs.append(o)

                opinion_list.append(opinion)

                add_result = added_opinions.try_add_opinion(opinion)
                assert(add_result)

        return opinion_refs, opinion_list
Example #11
def create_test_opinions(test_collections, labels, synonyms_filepath, stemmer):
    assert (isinstance(test_collections, list))
    assert (isinstance(labels, np.ndarray))
    assert (isinstance(stemmer, Stemmer))

    label_index = 0
    opinion_collection_list = []
    synonyms = SynonymsCollection.from_file(synonyms_filepath, stemmer=stemmer)

    for c in test_collections:
        opinions = OpinionCollection(None, synonyms, stemmer)
        for opinion_vector in c:
            l = Label.from_int(int(labels[label_index]))
            opinion_vector.set_label(l)
            o = opinions.create_opinion(opinion_vector.value_left,
                                        opinion_vector.value_right,
                                        opinion_vector.label)

            if not opinions.has_opinion_by_synonyms(o) and not isinstance(
                    l, NeutralLabel):
                opinions.add_opinion(o)
            elif not isinstance(l, NeutralLabel):
                print "Failed for o={}".format(o.to_unicode().encode('utf-8'))

            label_index += 1
        opinion_collection_list.append(opinions)
    return opinion_collection_list
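
A hedged usage sketch; `vector_collections` (iterables of opinion vectors) and `stemmer` are assumptions:

    labels = np.array([1, 2, 0])  # one predicted class per opinion vector
    collections = create_test_opinions(test_collections=vector_collections,
                                       labels=labels,
                                       synonyms_filepath="synonyms.txt",
                                       stemmer=stemmer)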
Example #12
    def _process_into_collections(self, indices, entity_indices,
                                  word_embedding, window_size_in_words,
                                  is_train_collection):
        assert (isinstance(indices, list))
        assert (isinstance(word_embedding, Embedding))
        assert (isinstance(is_train_collection, bool))

        rc = ExtractedRelationsCollection()
        nwc = NewsWordsCollection(entity_indices, word_embedding)
        for n in indices:
            assert (type(n) == int)

            entity_filepath = self.io.get_entity_filepath(n)
            news_filepath = self.io.get_news_filepath(n)
            opin_filepath = self.io.get_opinion_input_filepath(n)
            neutral_filepath = self.io.get_neutral_filepath(
                n, is_train_collection)

            news = News.from_file(news_filepath,
                                  EntityCollection.from_file(entity_filepath))

            opinions_collections = [
                OpinionCollection.from_file(neutral_filepath,
                                            self.synonyms_filepath)
            ]
            if is_train_collection:
                opinions_collections.append(
                    OpinionCollection.from_file(opin_filepath,
                                                self.synonyms_filepath))

            news_words = NewsWords(n, news)
            news_descriptor = self.create_news_descriptor(
                n, news, news_words, opinions_collections, is_train_collection)

            rc.add_news_relations(news_descriptor, self.synonyms,
                                  window_size_in_words, is_train_collection)
            nwc.add_news(news_words)

        return nwc, rc
Example #13
    def get_method_statistic(files_to_compare_list, synonyms_filepath, stemmer):
        """
            Calculate statistic based on result files
            files_to_compare_list: list
                list of FilesToCompare objects
            synonyms_filepath: str
            stemmer: Stemmer
        """
        assert(isinstance(stemmer, Stemmer))

        columns = ["t_all", "t_pos", "t_neg", "e_all", "e_pos", "e_neg"]

        df = pd.DataFrame(columns=columns)
        for files_to_compare in files_to_compare_list:

            assert(isinstance(files_to_compare, FilesToCompare))
            test_opins = OpinionCollection.from_file(
                    files_to_compare.test_filepath, synonyms_filepath, stemmer=stemmer)
            etalon_opins = OpinionCollection.from_file(
                    files_to_compare.etalon_filepath, synonyms_filepath, stemmer=stemmer)

            df.loc[files_to_compare.index] = [
                    MethodStatistic.founded_opins(test_opins, etalon_opins),
                    MethodStatistic.founded_opins(test_opins, etalon_opins, PositiveLabel()),
                    MethodStatistic.founded_opins(test_opins, etalon_opins, NegativeLabel()),
                    len(etalon_opins),
                    len(list(etalon_opins.iter_sentiment(PositiveLabel()))),
                    len(list(etalon_opins.iter_sentiment(NegativeLabel())))]

        df.loc['sum'] = [float(df[c].sum()) for c in columns]

        # Single-step .loc indexing avoids silent chained-assignment failures.
        df.loc['found'] = None
        df.loc['found', 't_all'] = float(df.loc['sum', 't_all']) / df.loc['sum', 'e_all']
        df.loc['found', 't_pos'] = float(df.loc['sum', 't_pos']) / df.loc['sum', 'e_pos']
        df.loc['found', 't_neg'] = float(df.loc['sum', 't_neg']) / df.loc['sum', 'e_neg']

        return df
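
A minimal usage sketch (the list of FilesToCompare pairs is assumed). The final 'found' row divides the 'sum' totals, giving the fraction of etalon opinions recovered overall and per sentiment class:

    df = MethodStatistic.get_method_statistic(
        files_to_compare_list=pairs,
        synonyms_filepath="synonyms.txt",
        stemmer=stemmer)
    print(df)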
Example #14
    def process_news_content(self, news_info, title_opinions, synonyms):
        """ news_id: assumes a unique name/key
        Perform sentences parsing, excluding news title
        """
        assert(isinstance(news_info, NewsInfo))

        text_opinions = OpinionCollection(opinions=None, synonyms=synonyms)
        cds = []

        for index in range(news_info.sentences_count()):

            _, parsed_sentence, s_objects, s_frames = self._process_sentence_core(news_info, s_ind=index)

            s_opinion_refs, s_opinions_list = self.__extract_sentence_opinion_refs(
                text_objects_collection=s_objects,
                title_opinions=title_opinions,
                synonyms=synonyms)

            if len(s_opinion_refs) == 0:
                continue

            for opinion in s_opinions_list:
                if not text_opinions.has_synonymous_opinion(opinion):
                    add_result = text_opinions.try_add_opinion(opinion)
                    assert(add_result)

            cd = ContextDescriptor(
                sentence_index=index,
                parsed_text=parsed_sentence,
                opinion_refs=s_opinion_refs,
                objects_collection=s_objects,
                text_frames=s_frames,
                frames=self.Settings.Frames)

            cds.append(cd)

        return cds, text_opinions
Example #15
    def predict_core(self,
                     dest_data_type,
                     rc_labeling_callback):
        assert(isinstance(dest_data_type, unicode))
        assert(callable(rc_labeling_callback))

        rc = self.get_relations_collection(dest_data_type)
        rch = self.get_relations_collection_helper(dest_data_type)

        assert(isinstance(rc, ExtractedRelationsCollection))
        assert(isinstance(rch, ExtractedRelationsCollectionHelper))

        rc.reset_labels()
        assert(rc.check_all_relations_without_labels())

        predict_log = rc_labeling_callback(rc, dest_data_type)

        assert(rc.check_all_relations_has_labels())

        rch.debug_labels_statistic()

        rch.save_into_opinion_collections(
            create_opinion_collection=lambda: OpinionCollection(opinions=None,
                                                                synonyms=self.ReadOnlySynonymsCollection),
            create_filepath_by_news_id=lambda news_id: self.IO.get_model_doc_opins_filepath(doc_id=news_id,
                                                                                            data_type=dest_data_type),
            label_calculation_mode=self.Settings.RelationLabelCalculationMode)

        eval_result = self.get_eval_helper().evaluate_model(data_type=dest_data_type,
                                                            io=self.IO,
                                                            indices=rch.iter_unique_news_ids(),
                                                            synonyms=self.ReadOnlySynonymsCollection)

        rc.reset_labels()

        return eval_result, predict_log
Example #16
def opinions_between_entities(E, diff, news, synonyms, sentiment_opins=None):
    """ Relations that had the same difference
    """
    def try_add_opinion(o, added, neutral_opins):
        assert (isinstance(o, Opinion))
        assert (isinstance(neutral_opins, OpinionCollection))

        # Filter if there is a sentiment relation
        if sentiment_opins is not None:
            if sentiment_opins.has_opinion_by_synonyms(o):
                return

        if neutral_opins.has_opinion_by_synonyms(o):
            return

        added.add(o.create_value_id())
        neutral_opins.add_opinion(o)

    def is_ignored(entity):
        # TODO. Move ignored entities into core.
        return env.stemmer.lemmatize_to_str(entity.value) in IGNORED_ENTITIES

    def get_entity_synonyms(entity):
        return synonyms.get_synonyms_list(entity.value), \
               synonyms.get_synonym_group_index(entity.value)

    added = set()
    c = OpinionCollection(opinions=None, synonyms=synonyms)

    for i in range(E.shape[0]):
        for j in range(E.shape[1]):

            if E[i][j] != diff:
                continue

            e1 = news.entities.get_entity_by_index(i)
            e2 = news.entities.get_entity_by_index(j)

            if is_ignored(e1) or is_ignored(e2):
                continue

            if not synonyms.has_synonym(e1.value):
                synonyms.add_synonym(e1.value)

            if not synonyms.has_synonym(e2.value):
                synonyms.add_synonym(e2.value)

            sl1, g1 = get_entity_synonyms(e1)
            sl2, g2 = get_entity_synonyms(e2)

            r_left = sl1[0]
            r_right = sl2[0]

            # Skip pairs whose entities belong to the same synonym group.
            if g1 == g2:
                print("Entities '{}' and '{}' are a part of the same synonym group".format(
                    r_left.encode('utf-8'), r_right.encode('utf-8')))
                continue

            try_add_opinion(Opinion(r_left, r_right, NeutralLabel()), added, c)
            try_add_opinion(Opinion(r_right, r_left, NeutralLabel()), added, c)

    return c
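
A hedged usage sketch; `E` is assumed to be a matrix of pairwise entity difference values aligned with `news.entities` indices:

    # Collect neutral opinions for entity pairs whose difference equals 0,
    # then persist them, as done with save() in Example #17.
    neutral = opinions_between_entities(E, diff=0, news=news, synonyms=synonyms)
    neutral.save("art1.neut.txt")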
Example #17
#
# Train
#
root = io_utils.train_root()
# Assumption: `synonyms` is used by make_neutrals below but was never defined
# in this snippet; load it once from the shared synonyms file.
synonyms = SynonymsCollection.from_file(io_utils.get_synonyms_filepath())
for n in io_utils.train_indices():
    entity_filepath = root + "art{}.ann".format(n)
    news_filepath = root + "art{}.txt".format(n)
    opin_filepath = root + "art{}.opin.txt".format(n)
    neutral_filepath = root + "art{}.neut.txt".format(n)

    print neutral_filepath

    entities = EntityCollection.from_file(entity_filepath)
    news = News.from_file(news_filepath, entities)
    opinions = OpinionCollection.from_file(opin_filepath,
                                           io_utils.get_synonyms_filepath())

    neutral_opins = make_neutrals(news, synonyms, opinions)
    neutral_opins.save(neutral_filepath)

#
# Test
#
root = io_utils.test_root()
for n in io_utils.test_indices():
    entity_filepath = path.join(root, "art{}.ann".format(n))
    news_filepath = path.join(root, "art{}.txt".format(n))
    neutral_filepath = path.join(root, "art{}.neut.txt".format(n))

    print neutral_filepath
Example #18
    def predict(self, dest_data_type=DataType.Test):

        def calculate_label(relation_labels):
            assert(isinstance(relation_labels, list))

            label = None
            if self.Settings.RelationLabelCalculationMode == LabelCalculationMode.FIRST_APPEARED:
                label = relation_labels[0]
            if self.Settings.RelationLabelCalculationMode == LabelCalculationMode.AVERAGE:
                label = Label.from_int(np.sign(sum([l.to_int() for l in relation_labels])))

            if DebugKeys.PredictLabel:
                print [l.to_int() for l in relation_labels]
                print "Result: {}".format(label.to_int())

            return label

        assert(isinstance(dest_data_type, unicode))

        self._relations_collections[dest_data_type].reset_labels()
        prediction_collection = RelationPredictionResultCollection(len(self._relations_collections[dest_data_type]))

        for bags_group in self.bags_collection[dest_data_type].iter_by_groups(self.Settings.BagsPerMinibatch):

            minibatch = MiniBatch(bags_group)
            feed_dict = self.create_feed_dict(minibatch, data_type=dest_data_type)

            log_names, log_params = self.network.Log
            result = self.sess.run([self.network.Labels, self.network.Output] + log_params, feed_dict=feed_dict)
            uint_labels = result[0]
            output = result[1]

            if DebugKeys.PredictBatchDisplayLog:
                self._display_log(log_names, result[2:])

            # apply labels
            sample_indices_count = 0
            for sample_index, sample in enumerate(minibatch.iter_by_samples()):
                label = Label.from_uint(int(uint_labels[sample_index]))
                self._relations_collections[dest_data_type].apply_label(label, sample.RelationID)
                prediction_collection.add(sample.RelationID, RelationPredictionResult(output[sample_index]))
                sample_indices_count += 1

            assert(sample_indices_count == len(uint_labels))

        assert(self._relations_collections[dest_data_type].debug_check_all_relations_has_labels())

        self._relations_collections[dest_data_type].debug_labels_statistic(dest_data_type)

        # Compose Result
        self._relations_collections[dest_data_type].save(
            self.io.get_relations_filepath(data_type=dest_data_type,
                                           epoch=self._last_fit_epoch_index))

        prediction_collection.save(
            self.io.get_relations_prediction_filepath(data_type=dest_data_type,
                                                      epoch=self._last_fit_epoch_index))

        for news_ID in self.io.get_data_indices(dest_data_type):
            collection = OpinionCollection(None, self.synonyms, self.Settings.Stemmer)
            self._relations_collections[dest_data_type].fill_opinion_collection(collection, news_ID, calculate_label)

            collection.save(self.io.get_opinion_output_filepath(news_ID, self.io.get_model_root(dest_data_type)))

        return self._evaluate(dest_data_type, self.Settings.Stemmer)
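
The AVERAGE branch of calculate_label above reduces a group of labels to the sign of their integer sum. A standalone sketch of that reduction, with plain ints standing in for Label objects:

    import numpy as np

    def average_label(int_labels):
        # e.g. [1, -1, 1] -> sign(1) = 1: the positive votes win
        return int(np.sign(sum(int_labels)))

    print(average_label([1, -1, 1]))   # 1
    print(average_label([-1, -1, 0]))  # -1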
Example #19
    def _extract_opinions_from_title(self, title_terms, title_objects, title_frames, synonyms):
        assert(isinstance(title_terms, list))
        assert(isinstance(title_objects, TextObjectsCollection))
        assert(isinstance(title_frames, TextFrameVariantsCollection))
        assert(isinstance(synonyms, SynonymsCollection))

        opinion_refs = []
        title_opinions = OpinionCollection(opinions=None, synonyms=synonyms)

        TextProcessor.__setup_tags(text_objects_collection=title_objects,
                                   synonyms=synonyms)

        for l_obj in title_objects:
            for r_obj in title_objects:

                l_bound = l_obj.get_bound()
                r_bound = r_obj.get_bound()

                # Keep only ordered pairs: the left object must precede the right one.
                if l_bound.TermIndex >= r_bound.TermIndex:
                    continue

                i = l_obj.CollectionInd
                j = r_obj.CollectionInd

                if not self.__check_auth_correctness(i=i, j=j, objects=title_objects):
                    continue

                label = self.decide_label_of_pair_in_title_optional(
                    i=i, j=j,
                    title_objects=title_objects,
                    title_frames=title_frames)

                if label is None:
                    # Considered by pair-base processor
                    continue

                opinion = Opinion(value_left=l_obj.get_value(),
                                  value_right=r_obj.get_value(),
                                  sentiment=label)

                self.__debug_opinions_created += 1

                if self.__check_obj_preposition_in_title:
                    if self.__reject_by_russian_prepositions(l_obj=l_obj, r_obj=r_obj, title_terms=title_terms):
                        self.__debug_opinions_rejected_by_preps += 1
                        continue

                if not self.__guarantee_synonyms_presence(synonyms=synonyms, obj_value=opinion.value_left):
                    self.__debug_opinions_with_missed_synonyms += 1
                    continue

                if not self.__guarantee_synonyms_presence(synonyms=synonyms, obj_value=opinion.value_right):
                    self.__debug_opinions_with_missed_synonyms += 1
                    continue

                lg_ind = synonyms.get_synonym_group_index(opinion.value_left)
                rg_ind = synonyms.get_synonym_group_index(opinion.value_right)

                if lg_ind == rg_ind:
                    self.__debug_opinions_looped += 1
                    continue

                if not title_opinions.has_synonymous_opinion(opinion):
                    # OK, adding
                    self.__debug_opinions_total_extracted_from_titles += 1
                    add_result = title_opinions.try_add_opinion(opinion)
                    assert(add_result)
                else:
                    self.__debug_opinions_title_synonymous_existed += 1

                opinion_ref = RefOpinion(left_index=i, right_index=j, sentiment=opinion.sentiment)
                opinion_refs.append(opinion_ref)

        return opinion_refs, title_opinions
Example #20
# Train collection
#
root = io_utils.train_root()
for n in io_utils.train_indices():
    entity_filepath = root + "art{}.ann".format(n)
    opin_filepath = root + "art{}.opin.txt".format(n)
    neutral_filepath = root + "art{}.neut.txt".format(n)
    news_filepath = root + "art{}.txt".format(n)
    vector_output = root + "art{}.vectors.txt".format(n)

    print vector_output

    entities = EntityCollection.from_file(entity_filepath)
    news = News.from_file(news_filepath, entities)

    sentiment_opins = OpinionCollection.from_file(
        opin_filepath, io_utils.get_synonyms_filepath())
    neutral_opins = OpinionCollection.from_file(
        neutral_filepath, io_utils.get_synonyms_filepath())

    # filter_neutral(neutral_opins)

    vectors = vectorize_opinions(
        news, entities, [sentiment_opins, neutral_opins])

    vectors.save(vector_output)

#
# Test collection
#
root = io_utils.test_root()
for n in io_utils.test_indices():