from __future__ import absolute_import, print_function

from collections import defaultdict, deque
from copy import copy
from gflags import FLAGS, DEFINE_bool, DuplicateFlagError
import logging
from nltk.util import flatten
from os.path import splitext
import sys

from causeway.because_data import CausalityStandoffReader, CausationInstance
from nlpypline.data.io import DirectoryReader, InstancesDocumentWriter

try:
    DEFINE_bool(
        'separate_new_conn', True,
        'Whether a separate "NEW-CONN" transition should be generated'
        ' at the start of each new relation')
    DEFINE_bool(
        'separate_shift', False,
        'Whether a separate "SHIFT" transition should be generated when'
        ' a relation is completed')
except DuplicateFlagError as e:
    logging.warn(e)


class CausalityOracleTransitionWriter(InstancesDocumentWriter):
    def __init__(self, filepath=None):
        super(CausalityOracleTransitionWriter, self).__init__(filepath)
        self._byte_offset_in_doc = None

    def write_all_instances(self, document, instances_getter=None):
Example no. 2
from collections import defaultdict
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging
import re
import time

from causeway import PossibleCausation, PairwiseAndNonIAAEvaluator
from nlpypline.pipeline import Stage
from nlpypline.pipeline.models import Model
from nlpypline.util import Enum

try:
    DEFINE_bool('regex_include_pos', True,
                'Whether to include POS tags in the strings matched by regex')

except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class RegexConnectiveModel(Model):
    def __init__(self, *args, **kwargs):
        super(RegexConnectiveModel, self).__init__(*args, **kwargs)
        self.regexes = []

    def _train_model(self, sentences):
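        # Compile each pattern string produced by _extract_patterns once up
        # front, keeping its associated match groups alongside the compiled
        # regex for use at test time.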
        self.regexes = [
            (re.compile(pattern), matching_groups)
            for pattern, matching_groups in self._extract_patterns(sentences)
        ]

    def test(self, sentences):
Example no. 3
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
from itertools import chain  # , izip_longest
import logging
import numpy as np
import pycrfsuite
import time
from types import MethodType

from nlpypline.pipeline.models import Model, MultiplyFeaturizedModel
from nlpypline.pipeline.featurization import DictOnlyFeaturizer, Featurizer

try:
    DEFINE_bool('pycrfsuite_verbose', False,
                'Verbose logging output from python-crfsuite trainer')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class StructuredModel(Model):
    '''
    In a structured model, every instance is divided up into "parts." Those
    parts are treated as the thing to be scored by the model. Thus, this class
    overrides the default train and test methods to extract parts first, and
    then call the normal test/train on the parts rather than the instances.
    (Thus, it's often a good idea for the parts to store pointers to the
    original instances for use in featurization, as the feature extractors won't
    get a copy of the original instance on the side.)

    A StructuredModel also has a StructuredDecoder, which is used to decode the
    scored parts into a coherent labeling for the instance.
    '''
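
# A minimal sketch (not from the original source) of how a StructuredModel
# subclass might wire these pieces together. It relies only on the hooks
# visible in the examples on this page (_make_parts, _score_parts,
# _train_structured, and a decoder passed to the constructor); the part
# representation and the sentence.tokens attribute are illustrative
# assumptions.
class ToyPairModel(StructuredModel):
    def __init__(self, decoder):
        # The decoder (a StructuredDecoder) turns per-part scores back into a
        # coherent labeling for the whole instance.
        super(ToyPairModel, self).__init__(decoder)

    def _make_parts(self, sentence, is_train):
        # One part per ordered token pair; each part keeps a pointer back to
        # its sentence so feature extractors can still see the full context.
        return [(sentence, t1, t2)
                for t1 in sentence.tokens for t2 in sentence.tokens
                if t1 is not t2]

    def _train_structured(self, instances, parts_by_instance):
        pass  # e.g., fit a classifier over the featurized parts

    def _score_parts(self, instance, instance_parts):
        return [0.0] * len(instance_parts)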
Example no. 4
from gflags import FLAGS, DEFINE_bool, DEFINE_string, DuplicateFlagError
import io
import logging
import os
import re

from nlpypline.data import StanfordParsedSentence, SentencesDocument
from nlpypline.util import recursively_list_files
from nlpypline.util.streams import (read_stream_until, peek_and_revert_unless,
                                    CharacterTrackingStreamWrapper)

try:
    DEFINE_string('reader_codec', 'utf-8',
                  'The encoding to assume for data files')
    DEFINE_bool(
        'reader_gold_parses', False,
        'Whether to read .parse.gold files instead of .parse files for'
        ' sentence parses')
    DEFINE_bool(
        'gold_parses_fallback', False,
        'If reader_gold_parses is True, falls back to automated parse'
        ' files instead of failing if gold parses are not found')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class DocumentStream(object):
    def __init__(self, filepath=None):
        self._file_stream = None
        if filepath:
            self.open(filepath)
Example no. 5
            'in_parse_tree',
            'pattern',
            'pattern+conn_parse_path',
            'conn_rel_pos',
            'is_alnum'
        ],
        'Features for the argument-labeling CRF')
    DEFINE_integer(
        'arg_label_max_dep_path_len', 4,
        "Maximum number of dependency path steps to allow before"
        " just making the value 'LONG-RANGE'")
    DEFINE_enum('arg_label_training_alg', 'lbfgs',
                ['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'],
                'Algorithm for training argument labeling CRF')
    DEFINE_bool(
        'arg_label_save_crf_info', False,
        'Whether to read in and save an accessible version of the CRF'
        ' model parameters in the model (useful for debugging)')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class ArgumentLabelerModel(CRFModel):
    CAUSE_LABEL = 'Cause'
    EFFECT_LABEL = 'Effect'
    NONE_LABEL = 'None'

    def __init__(self, training_algorithm, training_params, *args, **kwargs):
        super(ArgumentLabelerModel,
              self).__init__(selected_features=FLAGS.arg_label_features,
                             training_algorithm=training_algorithm,
                             training_params=training_params,
Example no. 6
from gflags import (DEFINE_bool, DEFINE_integer, DEFINE_list, FLAGS,
                    DuplicateFlagError)
import logging
import sys

from causeway.because_data import CausalityStandoffReader
from causeway.because_data.iaa import CausalityMetrics, print_indented
from nlpypline.data.io import DirectoryReader

try:
    DEFINE_list(
        'iaa_file_regexes', r".*\.ann$",
        "Regexes to match filenames against for IAA (non-matching files will"
        " not be compared).")
    DEFINE_integer(
        'iaa_max_sentence', sys.maxint,
        'Maximum number of sentences to analyze when computing IAA.')
    DEFINE_bool(
        'iaa_include_partial', False,
        'Include a comparison that counts partial overlap of spans as a'
        ' match.')
    DEFINE_bool('iaa_recurse', False,
                'Whether to recurse into the data directories')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


def compare_instance_lists(gold, predicted, indent=0):
    printing_some_metrics = FLAGS.iaa_log_confusion or FLAGS.iaa_log_stats

    if FLAGS.iaa_include_partial:
        partial_possibilities = [True, False]
    else:
        partial_possibilities = [False]
Example no. 7
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging

from causeway import PairwiseAndNonIAAEvaluator
from nlpypline.pipeline import Stage
from nlpypline.pipeline.models.structured import (StructuredModel,
                                                  StructuredDecoder)

try:
    DEFINE_bool('combiner_print_test_instances', False,
                'Whether to print differing IAA results during evaluation')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class BaselineCombinerModel(StructuredModel):
    def __init__(self, baseline_causations_attr_name):
        super(BaselineCombinerModel, self).__init__(BaselineDecoder())
        self.baseline_causations_attr_name = baseline_causations_attr_name

    def _train_structured(self, instances, parts_by_instance):
        pass

    def _make_parts(self, sentence, is_train):
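        # No parts are needed for training; at test time, the parts are simply
        # the causation instances a previous baseline stage stored on the
        # sentence under the configured attribute name.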
        if is_train:
            return []
        else:
            return getattr(sentence, self.baseline_causations_attr_name)

    def _score_parts(self, instance, instance_parts):
        pass
Example no. 8
import cPickle
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import itertools
import logging
import numpy as np
from scipy.sparse import lil_matrix, vstack
from sklearn.base import BaseEstimator

from nlpypline.pipeline.featurization import (FeatureExtractor, Featurizer,
                                              FeaturizationError)
from nlpypline.util import NameDictionary, listify
# from nlpypline.util.metrics import diff_binary_vectors

try:
    DEFINE_bool(
        'rebalance_stochastically', False,
        'Rebalance classes by stochastically choosing samples to replicate')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class Model(object):
    def __init__(self, *args, **kwargs):
        if args or kwargs:
            logging.debug("Extra model arguments: args=%s, kwargs=%s", args,
                          kwargs)

    def train(self, instances):
        self.reset()  # Reset state in case we've been previously trained.
        self._train_model(instances)
        self._post_model_train()
Example no. 9
from gflags import DEFINE_bool, DuplicateFlagError
import re
from scipy.sparse import lil_matrix, csr_matrix, csgraph

from nlpypline.util import Enum, merge_dicts, listify, nwise
from nlpypline.util.nltk import (collins_find_heads, nltk_tree_to_graph,
                                 is_parent_of_leaf)
from nlpypline.util.scipy import bfs_shortest_path_costs
from nlpypline.util.streams import (
    CharacterTrackingStreamWrapper, eat_whitespace, is_at_eof,
    peek_and_revert_unless, read_stream_until)


try:
    DEFINE_bool('use_constituency_parse', False,
                'Whether to build constituency parse trees from the provided'
                ' constituency parse string when constructing'
                ' StanfordParsedSentences. Setting to false makes reading in'
                ' data more efficient.')
except DuplicateFlagError:
    pass


class Document(object):
    # TODO: there are probably a lot of other things we should offer here.
    # Starting with the ability to recover the text of the document...
    def __init__(self, filename):
        self.filename = filename

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.filename)
Example no. 10
from __future__ import absolute_import
import copy
from gflags import DEFINE_bool, FLAGS, DuplicateFlagError
import logging
import numpy as np
from nltk.metrics import confusionmatrix
from nlpypline.util.scipy import add_rows_and_cols_to_matrix
from nlpypline.util import floats_same_or_nearly_equal

try:
    DEFINE_bool(
        'metrics_log_raw_counts', False,
        "Log raw counts (TP, FP, etc.) for evaluation or IAA metrics.")
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


def safe_divide(dividend, divisor):
    if divisor != 0:
        return float(dividend) / divisor
    elif dividend == 0:
        return 0.0
    else:
        return np.nan


def f1(precision, recall):
    return safe_divide(2 * precision * recall, precision + recall)
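
# Illustrative behaviour of the helpers above (not part of the original source):
#   safe_divide(3, 4)  -> 0.75
#   safe_divide(0, 0)  -> 0.0     (0/0 is treated as zero, not an error)
#   safe_divide(3, 0)  -> np.nan  (nonzero dividend over a zero divisor)
#   f1(0.5, 0.5)       -> 0.5     (harmonic mean of precision and recall)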


class ClassificationMetrics(object):
Example no. 11
from gflags import (DEFINE_bool, DEFINE_enum, DEFINE_float, DEFINE_integer,
                    DEFINE_string, DuplicateFlagError)

from nlpypline.data.io import DirectoryReader
from nlpypline.pipeline import Pipeline, SimpleStage
from nlpypline.pipeline.models import ClassBalancingClassifierWrapper
from nlpypline.util import print_indented


try:
    DEFINE_enum('classifier_model', 'logistic',
                ['tree', 'knn', 'logistic', 'svm', 'forest', 'nb'],
                'What type of machine learning model to use as the underlying'
                ' causality filter classifier')
    DEFINE_float(
        'rebalance_ratio', 1.0,
        'The maximum ratio by which to rebalance classes for training')
    DEFINE_bool('eval_with_cv', False,
                'Evaluate with cross-validation. Overrides --evaluate flag, and'
                ' causes both train and test to be combined.')
    DEFINE_bool('debug', False,
                'Whether to print debug-level logging.')
    DEFINE_integer('seed', None, 'Seed for the numpy RNG.')
    DEFINE_enum('pipeline_type', 'tregex',
                ['tregex', 'regex', 'baseline', 'tregex+baseline',
                 'regex+baseline', 'tregex_mostfreq', 'regex_mostfreq',
                 'tregex_cache'],
                'Which causality pipeline to run')
    DEFINE_bool('filter_overlapping', True,
                'Whether to filter smaller connectives that overlap with larger'
                ' ones')
    DEFINE_bool('save_models', False,
                "Whether to save pipeline models post-train (if not doing CV).")
    DEFINE_string('models_dir', None,
Example no. 12
from gflags import DEFINE_bool, DEFINE_string, DuplicateFlagError
import operator
import os
from os import path
import subprocess
from subprocess import PIPE
import tempfile

from causeway.because_data import CausationInstance
from causeway.because_data.iaa import CausalityMetrics
from nlpypline.data import StanfordParsedSentence
from nlpypline.pipeline import Stage, Evaluator
from nlpypline.pipeline.models import Model
from nlpypline.util import listify, print_indented, Enum, make_getter, make_setter

try:
    DEFINE_bool("iaa_calculate_partial", False,
                "Whether to compute metrics for partial overlap")
    DEFINE_string('stanford_ner_path',
                  '/home/jesse/Documents/Work/Research/stanford-corenlp-full-2015-04-20',
                  'Path to Stanford NER directory')
    DEFINE_string('stanford_ner_jar', 'stanford-corenlp-3.5.2.jar',
                  'Name of JAR file containing Stanford NER')
    DEFINE_string(
        'stanford_ner_model_name', 'english.all.3class.distsim.crf.ser.gz',
        'Name of model file for Stanford NER')
    DEFINE_bool('print_patterns', False,
                'Whether to print all connective patterns')
    DEFINE_bool('patterns_print_test_instances', False,
                'Whether to print differing IAA results during evaluation of'
                ' pattern matching stage')
    DEFINE_bool('args_print_test_instances', False,
                'Whether to print differing IAA results during evaluation of'
Example no. 13
from copy import copy, deepcopy
from gflags import FLAGS, DuplicateFlagError, DEFINE_bool
import logging
from nltk.tree import ImmutableParentedTree
import numpy as np
import os
from scipy.sparse.lil import lil_matrix

from nlpypline.data import Annotation, Token, StanfordParsedSentence
from nlpypline.data.io import (DocumentReader, StanfordParsedSentenceReader,
                               InstancesDocumentWriter)
from nlpypline.util import listify, Enum, make_getter, make_setter, Object
from textwrap import TextWrapper

try:
    DEFINE_bool('reader_binarize_degrees', True,
                'Whether to turn all degrees into "Facilitate" and "Inhibit"')
    DEFINE_bool(
        'reader_ignore_overlapping', False,
        'Whether, when reading causality data, instances with an'
        ' accompanying overlapping relation should be ignored')
except DuplicateFlagError as e:
    logging.warn('Ignoring flag redefinitions; assuming module reload')


class CausewaySentence(StanfordParsedSentence):
    def __init__(self, *args, **kwargs):
        super(CausewaySentence, self).__init__(*args, **kwargs)
        self.causation_instances = []
        self.overlapping_rel_instances = []

    def add_causation_instance(self, *args, **kwargs):