Example #1
def parse_txt(
    parser: SpacyBISTParser,
    txt_path: Union[str, PathLike],
    out_dir: Optional[Union[str, PathLike]] = None,
    show_tok: bool = True,
    show_doc: bool = True,
):
    """Parse raw documents in the form of lines in a text file.

    Args:
        parser (SpacyBISTParser): Parser to annotate each document with.
        txt_path (str or PathLike): Path to a text file with one raw document per line.
        out_dir (str or PathLike, optional): If specified, the output is also written to this directory.
        show_tok (bool, optional): Whether to include token text in the output.
        show_doc (bool, optional): Whether to include document text in the output.

    Yields:
        CoreNLPDoc: the annotated document.
    """
    with open(txt_path, encoding="utf-8") as f:
        if out_dir:
            print("Writing parsed documents to {}".format(out_dir))
        for i, doc_text in enumerate(
                tqdm(f, total=line_count(txt_path), file=sys.stdout)):
            parsed_doc = parser.parse(doc_text.rstrip("\n"), show_tok,
                                      show_doc)

            if out_dir:
                out_path = Path(out_dir) / (str(i + 1) + ".json")
                with open(out_path, "w", encoding="utf-8") as doc_file:
                    doc_file.write(parsed_doc.pretty_json())
            yield parsed_doc
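
A minimal way to drive parse_txt, sketched under the assumption of a corpus file with one raw document per line (the file and directory names are placeholders):

from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

parser = SpacyBISTParser()
# "docs.txt" is a hypothetical corpus; parsed JSON files land in "parsed/"
for parsed_doc in parse_txt(parser, "docs.txt", out_dir="parsed"):
    print(parsed_doc.doc_text)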
Example #2
    def __init__(
        self,
        parse: bool = True,
        rerank_model: Optional[PathLike] = None,
        asp_thresh: int = 3,
        op_thresh: int = 2,
        max_iter: int = 3,
    ):
        self.acquire_lexicon = AcquireTerms(asp_thresh, op_thresh, max_iter)
        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            self.parser = SpacyBISTParser()
        else:
            self.parser = None

        if not rerank_model:
            print("using pre-trained reranking model")
            rerank_model = _download_pretrained_rerank_model(
                RERANK_MODEL_DEFAULT_PATH)

        download_unzip(*EMBEDDING_URL,
                       EMBEDDING_PATH,
                       license_msg="GloVe word embeddings.")
        self.rerank = RerankTerms(vector_cache=True,
                                  rerank_model=rerank_model,
                                  emb_model_path=EMBEDDING_PATH)
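
This constructor appears to belong to nlp_architect's ABSA training pipeline; a hedged construction sketch follows (the class name TrainSentiment and its module path are assumptions, since neither is shown in this excerpt):

from nlp_architect.models.absa.train.train import TrainSentiment  # assumed path

train = TrainSentiment(parse=True, asp_thresh=3, op_thresh=2, max_iter=3)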
Example #3
    def __init__(self,
                 aspect_lex: PathLike,
                 opinion_lex: Union[PathLike, dict],
                 parse: bool = True):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (opinion_lex if isinstance(opinion_lex, dict)
                            else load_opinion_lex(opinion_lex))
        self.aspect_lex = _load_aspect_lexicon(aspect_lex)
        self.intensifier_lex = _read_lexicon_from_csv('IntensifiersLex.csv')
        self.negation_lex = _read_lexicon_from_csv('NegationSentLex.csv')

        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser
            self.parser = SpacyBISTParser()
        else:
            self.parser = None
Example #4
    def __init__(
        self,
        aspect_lex: Union[str, PathLike],
        opinion_lex: Union[str, PathLike, dict],
        parse: bool = True,
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (opinion_lex if isinstance(opinion_lex, dict) else
                            load_opinion_lex(Path(opinion_lex)))
        self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
        self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
        self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")

        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            self.parser = SpacyBISTParser(spacy_model="en")
        else:
            self.parser = None
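
Because opinion_lex also accepts a pre-loaded dict, the lexicon can be loaded once and shared across instances, which is exactly what Example #5 below does. A hedged sketch (the lexicon file names and the load_opinion_lex import path are assumptions):

from nlp_architect.models.absa.utils import load_opinion_lex  # assumed path

opinions = load_opinion_lex("opinions.csv")  # hypothetical lexicon file
inference_a = SentimentInference("aspects_a.csv", opinions, parse=False)
inference_b = SentimentInference("aspects_b.csv", opinions, parse=False)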
Example #5
    def run(
        self,
        aspect_lex: Optional[PathLike] = None,
        opinion_lex: Optional[PathLike] = None,
        data: Optional[PathLike] = None,
        parsed_data: Optional[PathLike] = None,
        inference_results: Optional[PathLike] = None,
    ) -> Optional[pd.DataFrame]:

        opinions = load_opinion_lex(opinion_lex)
        if not opinions:
            raise ValueError("Empty opinion lexicon!")
        aspects = pd.read_csv(aspect_lex, header=None, encoding="utf-8")[0]
        if aspects.empty:
            raise ValueError("Empty aspect lexicon!")
        if inference_results:
            with open(inference_results, encoding="utf-8") as f:
                results = json.loads(f.read(),
                                     object_hook=SentimentDoc.decoder)
        elif data or parsed_data:
            inference = SentimentInference(aspect_lex, opinions, parse=False)
            parse = None
            if not parsed_data:  # source data is raw text, need to parse
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                parse = SpacyBISTParser().parse

            results = {}
            print("Running inference on data files... (Iterating data files)")
            data_source = parsed_data if parsed_data else data
            for file, doc in self._iterate_docs(data_source):
                parsed_doc = (parse(doc) if parse else json.loads(
                    doc, object_hook=CoreNLPDoc.decoder))
                sentiment_doc = inference.run(parsed_doc=parsed_doc)
                if sentiment_doc:
                    results[file] = sentiment_doc
            with open(SENTIMENT_OUT / "inference_results.json",
                      "w",
                      encoding="utf-8") as f:
                json.dump(results,
                          f,
                          cls=SentimentDocEncoder,
                          indent=4,
                          sort_keys=True)
        else:
            print(
                "No input given. Please supply one of: "
                "data directory, parsed data directory, or inference results.")
            return None

        print("\nComputing statistics...")
        stats = self._compute_stats(results, aspects, opinions)
        print("Done.")
        return stats
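
A hedged invocation sketch (the enclosing class is not named in this excerpt, so the instance name solution and all file names below are placeholders):

solution = ...  # an instance of the enclosing class
stats = solution.run(aspect_lex="aspects.csv",
                     opinion_lex="opinions.csv",
                     data="reviews/")
if stats is not None:
    print(stats)  # aspect/opinion statistics as a pandas DataFrame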
Example #6
class Fixtures:
    default_parser = SpacyBISTParser()

    ptb_pos_tags = {
        "CC",
        "CD",
        "DT",
        "EX",
        "FW",
        "IN",
        "JJ",
        "JJR",
        "JJS",
        "LS",
        "MD",
        "NN",
        "NNS",
        "NNP",
        "NNPS",
        "PDT",
        "POS",
        "PRP",
        "PRP$",
        "RB",
        "RBR",
        "RBS",
        "RP",
        "SYM",
        "TO",
        "UH",
        "VB",
        "VBD",
        "VBG",
        "VBN",
        "VBP",
        "VBZ",
        "WDT",
        "WP",
        "WP$",
        "WRB",
    }

    token_label_types = {
        "start": int,
        "len": int,
        "pos": str,
        "ner": str,
        "lemma": str,
        "gov": int,
        "rel": str,
    }
Example #7
    def __init__(
        self,
        aspect_lex: Union[str, PathLike],
        opinion_lex: Union[str, PathLike, dict],
        parse: bool = True,
        parser="spacy",
        spacy_model="en_core_web_sm",
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (
            opinion_lex if isinstance(opinion_lex, dict) else load_opinion_lex(Path(opinion_lex))
        )
        self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
        self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
        self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
        self.parser_name = parser

        if parse:
            if parser == "bist":
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                self.parser = SpacyBISTParser(spacy_model=spacy_model)
            elif parser == "spacy":
                from nlp_architect.utils.text import SpacyInstance

                disable = [
                    "merge_noun_chunks",
                    "ner",
                    "entity_linker",
                    "textcat",
                    "entity_ruler",
                    "sentencizer",
                    "merge_entities",
                ]
                self.parser = SpacyInstance(
                    model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
                )
            else:
                # fail fast on an unknown parser name instead of leaving self.parser unset
                raise ValueError("Unknown parser: {}".format(parser))
        else:
            self.parser = None
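
The parser argument keeps the heavier BIST dependency optional: the "spacy" path never imports the BIST pipeline. A hedged sketch of both modes (the lexicon file names and the SentimentInference import path are assumptions):

from nlp_architect.models.absa.inference.inference import SentimentInference  # assumed path

spacy_inference = SentimentInference("aspects.csv", "opinions.csv", parser="spacy")
bist_inference = SentimentInference("aspects.csv", "opinions.csv", parser="bist")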
Example #8
class BistParserApi(AbstractApi):
    """
    Bist Parser API
    """
    def __init__(self):
        self.model = None

    def load_model(self):
        """
        Load SpacyBISTParser model
        """
        self.model = SpacyBISTParser()

    def inference(self, doc):
        """
        Parse according to SpacyBISTParser's model

        Args:
            doc (str): raw document text

        Returns:
            CoreNLPDoc: the parsed document wrapped in a CoreNLPDoc object
        """
        return self.model.parse(doc)
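
A minimal call sequence (BistParserApi's import path is not shown in this excerpt, so the import is omitted):

api = BistParserApi()
api.load_model()  # instantiates SpacyBISTParser
parsed = api.inference("The quick brown fox jumps over the lazy dog.")
print(parsed.sentences)  # CoreNLPDoc exposes the parsed sentences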
Example #9
class SentimentInference(object):
    """Main class for sentiment inference execution.

    Attributes:
        opinion_lex: Opinion lexicon as outputted by TrainSentiment module.
        aspect_lex: Aspect lexicon as outputted by TrainSentiment module.
        intensifier_lex (dict): Pre-defined intensifier lexicon.
        negation_lex (dict): Pre-defined negation lexicon.
    """
    def __init__(self,
                 aspect_lex: PathLike,
                 opinion_lex: Union[PathLike, dict],
                 parse: bool = True):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (opinion_lex if isinstance(opinion_lex, dict)
                            else load_opinion_lex(opinion_lex))
        self.aspect_lex = _load_aspect_lexicon(aspect_lex)
        self.intensifier_lex = _read_lexicon_from_csv('IntensifiersLex.csv')
        self.negation_lex = _read_lexicon_from_csv('NegationSentLex.csv')

        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser
            self.parser = SpacyBISTParser()
        else:
            self.parser = None

    def run(self,
            doc: Optional[str] = None,
            parsed_doc: Optional[CoreNLPDoc] = None) -> Optional[SentimentDoc]:
        """Run SentimentInference on a single document.

        Returns:
            The sentiment-annotated document with the detected events per sentence,
            or None if no sentiment events were detected.
        """
        if not parsed_doc:
            if not self.parser:
                raise RuntimeError(
                    "Parser not initialized (pass parse=True at init)")
            parsed_doc = self.parser.parse(doc)

        sentiment_doc = None
        for sentence in parsed_doc.sentences:
            events = []
            scores = []
            for aspect_row in self.aspect_lex:
                _, asp_events = self._extract_event(aspect_row, sentence)
                for asp_event in asp_events:
                    events.append(asp_event)
                    scores += [
                        term.score for term in asp_event
                        if term.type == TermType.ASPECT
                    ]

            if events:
                if not sentiment_doc:
                    sentiment_doc = SentimentDoc(parsed_doc.doc_text)
                sentiment_doc.sentences.append(
                    SentimentSentence(
                        sentence[0]['start'],
                        sentence[-1]['start'] + sentence[-1]['len'] - 1,
                        events))
        return sentiment_doc

    def _extract_intensifier_terms(self, toks, sentiment_index, polarity,
                                   sentence):
        """Extract intensifier events from sentence."""
        count = 0
        terms = []
        for intens_i, intens in [(i, x) for i, x in enumerate(toks)
                                 if x in self.intensifier_lex]:
            if math.fabs(sentiment_index - intens_i) == 1:
                score = self.intensifier_lex[intens].score
                terms.append(
                    Term(intens, TermType.INTENSIFIER, polarity, score,
                         sentence[intens_i]['start'],
                         sentence[intens_i]['len']))
                count += abs(score + float(INTENSIFIER_FACTOR))
        return (count if count != 0 else 1), terms

    def _extract_neg_terms(self, toks: list, op_i: int,
                           sentence: list) -> tuple:
        """Extract negation terms from sentence.

        Args:
            toks: Sentence text broken down to tokens (words).
            op_i: Index of opinion term in sentence.
            sentence: parsed sentence

        Returns:
            List of negation terms and their aggregated sign (positive or negative).
        """
        sign = 1
        terms = []
        gov_op_i = sentence[op_i]['gov']
        dep_op_indices = [
            sentence.index(x) for x in sentence if x['gov'] == op_i
        ]
        for neg_i, negation in [(i, x) for i, x in enumerate(toks)
                                if x in self.negation_lex]:
            position = self.negation_lex[negation].position
            dist = op_i - neg_i
            before = position == 'before' and (dist == 1
                                               or neg_i in dep_op_indices)
            after = position == 'after' and (dist == -1 or neg_i == gov_op_i)
            both = position == 'both' and dist in (1, -1)
            if before or after or both:
                terms.append(
                    Term(negation, TermType.NEGATION, Polarity.NEG,
                         self.negation_lex[negation].score,
                         sentence[toks.index(negation)]['start'],
                         sentence[toks.index(negation)]['len']))
                sign *= self.negation_lex[negation].score
        return terms, sign

    def _extract_event(self, aspect_row: LexiconElement,
                       parsed_sentence: list) -> tuple:
        """Extract opinion and aspect terms from sentence."""
        event = []
        sent_aspect_pair = None
        real_aspect_indices = _consolidate_aspects(aspect_row.term,
                                                   parsed_sentence)
        aspect_key = aspect_row.term[0]
        for aspect_index_range in real_aspect_indices:
            for word_index in aspect_index_range:
                sent_aspect_pair, event = \
                    self._detect_opinion_aspect_events(word_index, parsed_sentence, aspect_key,
                                                       aspect_index_range)
                if sent_aspect_pair:
                    break
        return sent_aspect_pair, event

    @staticmethod
    def _modify_for_multiple_word(cur_tkn, parsed_sentence, index_range):
        """Modify multiple-word aspect tkn length and start index.

        Args:
            index_range: The index range of the multi-word aspect.
        Returns:
            The modified aspect token.
        """
        if len(index_range) >= 2:
            cur_tkn["start"] = parsed_sentence[index_range[0]]["start"]
            cur_tkn["len"] = len(parsed_sentence[index_range[0]]["text"])
            for i in index_range[1:]:
                cur_tkn["len"] = int(cur_tkn["len"]) + len(
                    parsed_sentence[i]["text"]) + 1
        return cur_tkn

    def _detect_opinion_aspect_events(self, aspect_index, parsed_sent,
                                      aspect_key, index_range):
        """Extract opinion-aspect events from sentence.

        Args:
            aspect_index: index of aspect in sentence.
            parsed_sent: current sentence parse tree.
            aspect_key: main aspect term serves as key in aspect dict.
            index_range: The index range of the multi-word aspect.

        Returns:
            List of aspect sentiment pair, and list of events extracted.
        """
        all_pairs, events = [], []
        sentence_text_list = [x["text"] for x in parsed_sent]
        sentence_text = ' '.join(sentence_text_list)
        cur_aspect = parsed_sent[aspect_index]["text"]
        for tok in parsed_sent:
            aspect_op_pair = []
            terms = []
            pos = tok['pos']
            gov_i = tok['gov']
            gov = parsed_sent[gov_i]
            gov_text = gov['text']
            tok_text = tok['text']

            # is cur_tkn an aspect and gov an opinion?
            if tok_text.lower() == cur_aspect.lower() and parsed_sent.index(tok) == aspect_index \
                    and gov_text.lower() in self.opinion_lex and \
                    gov['pos'] not in VERB_POS:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(tok, parsed_sent,
                                                    index_range), gov))
            # is gov an aspect and cur_tkn an opinion?
            if gov_text.lower() == cur_aspect.lower() and gov_i == aspect_index \
                    and tok_text.lower() in self.opinion_lex and pos not in VERB_POS:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(gov, parsed_sent,
                                                    index_range), tok))
            # if aspect_tok found
            for aspect, opinion in aspect_op_pair:
                op_tok_i = parsed_sent.index(opinion)
                score = self.opinion_lex[opinion['text'].lower()].score
                neg_terms, sign = self._extract_neg_terms(
                    sentence_text_list, op_tok_i, parsed_sent)
                polarity = Polarity.POS if score * sign > 0 else Polarity.NEG
                intensifier_score, intensifier_terms = self._extract_intensifier_terms(
                    sentence_text_list, op_tok_i, polarity, parsed_sent)
                over_all_score = score * sign * intensifier_score
                terms.append(
                    Term(aspect_key, TermType.ASPECT, polarity, over_all_score,
                         aspect['start'], aspect['len']))
                terms.append(
                    Term(opinion['text'], TermType.OPINION, polarity,
                         over_all_score, opinion['start'], opinion['len']))
                if len(neg_terms) > 0:
                    terms = terms + neg_terms
                if len(intensifier_terms) > 0:
                    terms = terms + intensifier_terms
                all_pairs.append([
                    aspect_key, opinion['text'], over_all_score, polarity,
                    sentence_text
                ])
                events.append(terms)
        return all_pairs, events
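
End to end, a hedged single-document run (construction mirrors the __init__ above; the lexicon file names are placeholders):

inference = SentimentInference("aspects.csv", "opinions.csv", parse=True)
sent_doc = inference.run(doc="The battery life is amazing but the screen is dim.")
if sent_doc is not None:  # run returns None when no events are detected
    print(sent_doc.sentences)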
Example #10
import datetime
import logging

import spacy

logging.basicConfig(level=logging.INFO)  # configure logging before neuralcoref emits its INFO messages
import neuralcoref
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

nlp = spacy.load('en_core_web_sm')
# Note: replace_pipe expects a pipeline component (a callable that takes and
# returns a Doc); SpacyBISTParser is not registered as one, so this line may
# require a thin wrapper around SpacyBISTParser.parse to work.
nlp.replace_pipe("parser", SpacyBISTParser())
neuralcoref.add_to_pipe(nlp)

sentence = (
    "However in recent times attempts at systematizing this relationship is "
    "imposed by the convergence brought about by technological change or data "
    "revolution which has enabled use of better observation devices that can "
    "be in citizen’s hands."
)

print('processing', datetime.datetime.utcnow())
doc = nlp(" ".join([sentence] * 3))  # the same sentence repeated three times
print('done', datetime.datetime.utcnow())
print(doc.to_json())
print(doc._.coref_clusters)