Example #1
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: number of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token
        token_senses = dict([
            s.rsplit(util.SCORESEP, 1) if util.SCORESEP in s else (s, -1.0)
            for s in sense[token].split(util.DELIM) if s
        ])

        # Check for sense annotations and if any of the senses occur in the sentiment lexicon
        if token_senses and any(
                lexicon.lookup(s, (None, None))[1] for s in token_senses):
            sent_sum = 0.0
            labels_set = set()
            for s in token_senses:
                p = float(token_senses[s])
                if p < 0:
                    p = 1.0 / len(token_senses)
                sent_label, sent_score = lexicon.lookup(s, (None, None))
                if sent_label is not None:
                    labels_set.add(sent_label)
                    # Calculate weighted mean value
                    sent_sum += float(sent_score) * p
            result_scores[token] = str(round(sent_sum, max_decimals))
            # If there are multiple labels, derive label from polarity_score
            if len(labels_set) > 1:
                result_labels[token] = SENTIMENT_LABLES.get(round(sent_sum))
            else:
                result_labels[token] = SENTIMENT_LABLES.get(
                    int(list(labels_set)[0]))

        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
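As an illustration of the weighted mean described in the docstring, here is a minimal stand-alone sketch with a toy lexicon; the SALDO IDs and the ":" / "|" separators are stand-ins for the real util.SCORESEP and util.DELIM constants.

# Minimal sketch of the weighted-mean step, with hypothetical lexicon entries
toy_lexicon = {"glad..1": ("1", "0.75"), "ledsen..1": ("-1", "-0.5")}

def weighted_sentiment(sense_string, lexicon=toy_lexicon, max_decimals=6):
    """Return the probability-weighted mean sentiment score for one token."""
    senses = dict(s.rsplit(":", 1) if ":" in s else (s, -1.0)
                  for s in sense_string.split("|") if s)
    total = 0.0
    for sense_id, prob in senses.items():
        p = float(prob)
        if p < 0:  # no probability annotated: fall back to a uniform weight
            p = 1.0 / len(senses)
        label, score = lexicon.get(sense_id, (None, None))
        if label is not None:
            total += float(score) * p
    return round(total, max_decimals)

print(weighted_sentiment("glad..1:0.8|ledsen..1:0.2"))  # 0.75 * 0.8 - 0.5 * 0.2 = 0.5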
Example #2
def annotate(
        sense: Annotation = Annotation("<token>:saldo.sense"),
        out_scores: Output = Output("<token>:sensaldo.sentiment_score",
                                    description="SenSALDO sentiment score"),
        out_labels: Output = Output("<token>:sensaldo.sentiment_label",
                                    description="SenSALDO sentiment label"),
        model: Model = Model("[sensaldo.model]"),
        lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = sense.read()
    result_scores = []
    result_labels = []

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in token.split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores.append(score)
            result_labels.append(SENTIMENT_LABLES.get(int(score)))
        else:
            result_scores.append(None)
            result_labels.append(None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)
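Both variants map an integer score to a label through SENTIMENT_LABLES, which is defined elsewhere in the module. Below is a plausible mapping (an assumption, not shown in these examples) together with a stand-alone sketch of the "pick the most probable sense" step used above.

SENTIMENT_LABLES = {-1: "negative", 0: "neutral", 1: "positive"}  # assumed mapping

def best_sense(sense_string, scoresep=":", delim="|"):
    """Return the highest-ranked sense from one token's sense string."""
    senses = [tuple(s.rsplit(scoresep, 1)) if scoresep in s else (s, -1.0)
              for s in sense_string.split(delim) if s]
    senses.sort(key=lambda x: float(x[1]), reverse=True)
    return senses[0][0] if senses else None

print(best_sense("glad..1:0.3|ledsen..1:0.7"))  # ledsen..1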
Example #3
def sentiment(sense,
              out_scores,
              out_labels,
              model,
              max_decimals=6,
              lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.
    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - max_decimals: number of decimals the result is rounded to.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = util.read_annotation(sense)
    result_scores = {}
    result_labels = {}

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in sense[token].split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores[token] = score
            result_labels[token] = SENTIMENT_LABLES.get(int(score))
        else:
            result_scores[token] = None
            result_labels[token] = None

    util.write_annotation(out_scores, result_scores)
    util.write_annotation(out_labels, result_labels)
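All of these variants depend on util.PickledLexicon, which is not part of the examples. Judging from how it is called (lookup() with a default value, plus the .lexicon attribute used further below), a minimal stand-in could look like the sketch below; it only describes the assumed interface, not the actual Sparv implementation.

import pickle

class PickledLexiconSketch:
    """Load a pickled dict and expose a dict-like lookup() with a default."""

    def __init__(self, path):
        with open(path, "rb") as f:
            self.lexicon = pickle.load(f)

    def lookup(self, key, default=None):
        return self.lexicon.get(key, default)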
Example #4
def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX,
                   scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).

    - out: resulting annotation.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bring() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN, paste the saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            saldo_ids = None
            out_annotation[token_index] = affix
            continue

        if wsd and util.SCORESEP in token_sense:
            ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    # Keep every sense that shares the top probability
                    saldo_ids.append(saldo_tuples[0])
                    del saldo_tuples[0]

                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)
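The disambiguation branch above keeps the most likely SALDO ID and retains any IDs tied at that top probability. The same rule as a compact stand-alone function:

def most_likely_ids(ranked):
    """ranked: (saldo_id, probability-string) pairs, best first."""
    if not ranked:
        return None
    top_prob = ranked[0][1]
    return [sid for sid, prob in ranked if prob == top_prob]

print(most_likely_ids([("hund..1", "0.5"), ("hund..2", "0.5"), ("hund..3", "0.3")]))
# ['hund..1', 'hund..2']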
def create_freq_pickle(corpus,
                       annotation,
                       filename,
                       model,
                       class_set=None,
                       score_separator=util.SCORESEP):
    """Build pickle with relative frequency for a given annotation in one or
       more reference corpora."""

    lexicon = util.PickledLexicon(model)
    # Create a set of all possible classes
    if class_set:
        all_classes = set(cc for c in lexicon.lexicon.values()
                          for cc in c[class_set])
    else:
        all_classes = set(cc for c in lexicon.lexicon.values() for cc in c)
    lexicon_size = len(all_classes)
    smoothing = 0.1

    corpus_stats = defaultdict(int)
    corpus_size = 0

    if isinstance(corpus, str):
        corpus = corpus.split()

    for c in corpus:
        # Get corpus size
        process = subprocess.Popen(
            [CWB_DESCRIBE_EXECUTABLE, "-r", CORPUS_REGISTRY, c],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        reply, error = process.communicate()
        reply = reply.decode()

        if error:
            error = error.decode()
            util.log.error(error)
            sys.exit(1)

        for line in reply.splitlines():
            if line.startswith("size (tokens)"):
                _, size = line.split(":")
                corpus_size += int(size.strip())

        # Get frequency of annotation
        util.log.info("Getting frequencies from %s", c)
        process = subprocess.Popen(
            [CWB_SCAN_EXECUTABLE, "-r", CORPUS_REGISTRY, c] + [annotation],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        reply, error = process.communicate()
        reply = reply.decode()
        if error:
            error = error.decode()
            if "Error:" in error:  # We always get something back on stderror from cwb-scan-corpus, so we must check if it really is an error
                if "Error: can't open attribute" in error:
                    util.log.error("Annotation '%s' not found", annotation)
                    sys.exit(1)

        for line in reply.splitlines():
            if not line.strip():
                continue
            freq, classes = line.split("\t")
            for cl in classes.split("|"):
                if cl:
                    freq = int(freq)
                    if score_separator:
                        cl, score = cl.rsplit(score_separator, 1)
                        score = float(score)
                        if score <= 0:
                            continue
                        freq = freq * score
                    corpus_stats[cl.replace("_", " ")] += freq

    rel_freq = defaultdict(float)

    for cl in all_classes:
        cl = cl.replace("_", " ")
        rel_freq[cl] = (corpus_stats[cl] +
                        smoothing) / (corpus_size + smoothing * lexicon_size)

    util.lexicon_to_pickle(rel_freq, filename)
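The relative frequencies written to the pickle use additive smoothing: rel_freq(class) = (count(class) + smoothing) / (corpus_size + smoothing * lexicon_size). A tiny worked example with made-up numbers:

smoothing = 0.1
corpus_size = 1_000_000   # tokens in the reference corpora (hypothetical)
lexicon_size = 2000       # number of possible classes (hypothetical)
count = 150               # weighted frequency of one class (hypothetical)

rel_freq = (count + smoothing) / (corpus_size + smoothing * lexicon_size)
print(rel_freq)  # ~0.00015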
def annotate_doc(out,
                 in_token_annotation,
                 text_children,
                 saldoids=None,
                 cutoff=10,
                 types=False,
                 delimiter=util.DELIM,
                 affix=util.AFFIX,
                 freq_model=None,
                 decimals=3):
    """
    Annotate documents with lexical classes.
    - out: resulting annotation file
    - in_token_annotation: existing annotation with lexical classes on token level.
    - text_children: existing annotation for text-IDs and their word children.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
              The result will contain all words with the top x frequencies.
              Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    types = util.strtobool(types)
    text_children = util.read_annotation(text_children)
    classes = util.read_annotation(in_token_annotation)
    sense = util.read_annotation(saldoids) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model)

    out_doc = {}

    for textid, words in text_children.items():
        seen_types = set()
        class_freqs = defaultdict(int)
        words = words.split()

        for tokid in words:
            # Count only sense types
            if types:
                senses = str(
                    sorted([
                        s.split(util.SCORESEP)[0] for s in sense[tokid].strip(
                            util.AFFIX).split(util.DELIM)
                    ]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[tokid].strip(util.AFFIX).split(
                util.DELIM) if classes[tokid] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    util.log.error("Class '%s' is missing" % ref_freq)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(),
                               key=lambda x: x[1],
                               reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [
            util.SCORESEP.join([word, str(round(freq, decimals))])
            for word, freq in ordered_words
        ]
        out_doc[textid] = util.cwbset(ordered_words, delimiter,
                                      affix) if ordered_words else affix

    util.write_annotation(out, out_doc)
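The cutoff applied above keeps the classes with the highest scores up to the cutoff, but never splits ties: any class whose score equals the score at the cutoff position is also kept. A stand-alone sketch of that rule:

def apply_cutoff(ordered, cutoff):
    """ordered: (class, score) pairs sorted by score, highest first."""
    if len(ordered) <= cutoff:
        return ordered
    cutoff_score = ordered[cutoff - 1][1]
    return [item for item in ordered if item[1] >= cutoff_score]

print(apply_cutoff([("a", 9), ("b", 5), ("c", 5), ("d", 2)], cutoff=2))
# [('a', 9), ('b', 5), ('c', 5)]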
def annotate_words(out,
                   model,
                   saldoids,
                   pos,
                   annotate,
                   pos_limit,
                   class_set=None,
                   disambiguate=True,
                   connect_ids=False,
                   delimiter=util.DELIM,
                   affix=util.AFFIX,
                   scoresep=util.SCORESEP,
                   lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).
    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bb() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN, paste the saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    if pos_limit.lower() == "none":
        pos_limit = None

    result_dict = {}
    sense = util.read_annotation(saldoids)
    token_pos = util.read_annotation(pos)

    for tokid in sense:

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, tokid, pos_limit):
            saldo_ids = None
            result_dict[tokid] = affix
            continue

        if util.SCORESEP in sense[tokid]:  # WSD
            ranked_saldo = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0],
                             i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    # Keep every sense that shares the top probability
                    saldo_ids.append(saldo_tuples[0])
                    del saldo_tuples[0]

                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        result_dict[tokid] = util.cwbset(result, delimiter,
                                         affix) if result else affix
    util.write_annotation(out, result_dict)
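pos_ok() is called in both annotate_words variants but is not defined in these examples. Judging from its call sites, it checks whether the token's part of speech is within pos_limit (or whether no limit is set); a hypothetical stand-in:

def pos_ok(token_pos, tokid, pos_limit):
    """Return True if this token's part of speech is allowed."""
    if not pos_limit:
        return True  # no restriction: annotate every part of speech
    return token_pos[tokid] in pos_limit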
Example #8
def start(socket_path,
          processes=1,
          verbose='false',
          saldo_model=None,
          compound_model=None,
          stats_model=None,
          dalin_model=None,
          swedberg_model=None,
          blingbring_model=None,
          malt_jar=None,
          malt_model=None,
          malt_encoding=util.UTF8,
          sentiment_model=None,
          swefn_model=None,
          swener=False,
          swener_encoding=util.UTF8):
    """
    Starts a catapult on a socket file, using a number of processes.

    If verbose is false, all stdout and stderr programs produce is
    piped to /dev/null, otherwise it is sent to the client. The
    computation is done by the catapult processes, however.
    Regardless of what verbose is, client errors should be reported
    both in the catapult and to the client.

    The saldo model and compound model can be pre-loaded and shared in
    memory between processes.

    Start processes using catalaunch.
    """

    if os.path.exists(socket_path):
        log.error('socket %s already exists', socket_path)
        exit(1)

    verbose = verbose.lower() == 'true'

    log.info('Verbose: %s', verbose)

    # If processes cannot be parsed as an integer, use the number of processors
    try:
        processes = int(processes)
    except ValueError:
        processes = cpu_count()

    # Start the socket
    server_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    server_socket.bind(socket_path)
    server_socket.listen(processes)

    # The dictionary of functions with saved lexica, indexed by module name strings
    annotators = {}

    # Load Saldo and older lexicons
    lexicons = [m for m in [saldo_model, dalin_model, swedberg_model] if m]
    if lexicons:
        lexicon_dict = {}
        for lexicon in lexicons:
            # Use the file name without its ".pickle" extension as the key
            name = os.path.splitext(os.path.basename(lexicon))[0]
            lexicon_dict[name] = saldo.SaldoLexicon(lexicon)
        annotators['sparv.saldo'] = set_last_argument(lexicon_dict)(
            saldo.annotate)

    if stats_model and compound_model:
        annotators['sparv.compound'] = set_last_argument(
            compound.SaldoCompLexicon(compound_model),
            compound.StatsLexicon(stats_model))(compound.annotate)

    elif compound_model:
        annotators['sparv.compound_simple'] = set_last_argument(
            compound_simple.SaldoLexicon(compound_model))(
                compound_simple.annotate)

    # if blingbring_model:
    #     annotators['sparv.lexical_classes'] = set_last_argument(
    #         util.PickledLexicon(blingbring_model))(lexical_classes.annotate_bb_words)

    # if swefn_model:
    #     annotators['sparv.lexical_classes'] = set_last_argument(
    #         util.PickledLexicon(swefn_model))(lexical_classes.annotate_swefn_words)

    if sentiment_model:
        annotators['sparv.sentiment'] = set_last_argument(
            util.PickledLexicon(sentiment_model))(sentiment.sentiment)

    # if models_1700s:
    #     models = models_1700s.split()
    #     lexicons = [saldo.SaldoLexicon(lex) for lex in models]
    #     annotators[('sparv.fsv', '--annotate_fallback')] = set_last_argument(lexicons)(fsv.annotate_fallback)
    #     annotators[('sparv.fsv', '--annotate_full')] = set_last_argument(lexicons)(fsv.annotate_full)

    if verbose:
        log.info('Loaded annotators: %s', list(annotators.keys()))

    if malt_jar and malt_model:
        malt_args = dict(maltjar=malt_jar,
                         model=malt_model,
                         encoding=malt_encoding,
                         send_empty_sentence=True)
    else:
        malt_args = None

    if swener:
        swener_args = dict(stdin="", encoding=swener_encoding, verbose=True)
    else:
        swener_args = None

    # Start processes-1 workers
    workers = [
        Process(target=worker,
                args=[server_socket, verbose, annotators, malt_args, swener_args])
        for i in range(processes - 1)
    ]

    for p in workers:
        p.start()

    # Additionally, let this thread be worker 0
    worker(server_socket, verbose, annotators, malt_args, swener_args)
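set_last_argument() binds the pre-loaded lexicons as the trailing argument(s) of each annotator, which is why the docstrings above insist that the lexicon argument must be last. It is defined elsewhere in the catapult; a plausible sketch of such a decorator, offered only as an assumption:

def set_last_argument(*fixed):
    """Return a decorator that appends *fixed to every call's positional arguments."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            return func(*(args + fixed), **kwargs)
        return wrapper
    return decorator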
Example #9
def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotation, token: Annotation,
                  saldoids, cutoff, types, delimiter, affix, freq_model, decimals):
    """
    Annotate text chunks with lexical classes.

    - out: resulting annotation file
    - lexical_classes_token: existing annotation with lexical classes on token level.
    - text, token: existing annotations for the text-IDs and the tokens.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
              The result will contain all words with the top x frequencies.
              Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    text_children, _orphans = text.get_children(token, preserve_parent_annotation_order=True)
    classes = list(lexical_classes_token.read())
    sense = list(saldoids.read()) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model.path)

    out_annotation = text.create_empty_attribute()

    for text_index, words in enumerate(text_children):
        seen_types = set()
        class_freqs = defaultdict(int)

        for token_index in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0] for s in sense[token_index].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) if classes[token_index] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    log.error("Class '%s' is missing" % ref_freq)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    out.write(out_annotation)
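util.cwbset() is used throughout these examples to render a set of values in CWB's set-attribute format. Assuming "|" as both delimiter and affix (the usual CWB convention), a minimal sketch of that behaviour:

def cwbset_sketch(values, delimiter="|", affix="|"):
    """Join values into a CWB-style set string, e.g. |a|b|."""
    return affix + delimiter.join(values) + affix if values else affix

print(cwbset_sketch(["existence:2.5", "motion:1.2"]))  # |existence:2.5|motion:1.2|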