Example #1
def parse(text):
    """
    Primary function to run syntaxnet and PredPatt over input sentences.
    """
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)

    conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0]

    #PredPatt options. Modify as needed.
    resolve_relcl = True  # relative clauses
    resolve_appos = True  # appositional modifiers
    resolve_amod = True   # adjectival modifiers
    resolve_conj = True   # conjunction
    resolve_poss = True   # possessives
    ud = dep_v2.VERSION   # the version of UD
    opts = PredPattOpts(resolve_relcl=resolve_relcl,
                        resolve_appos=resolve_appos,
                        resolve_amod=resolve_amod,
                        resolve_conj=resolve_conj,
                        resolve_poss=resolve_poss,
                        ud=ud)
    ppatt = PredPatt(conll_pp, opts=opts)
    predicate_deps, arg_deps = get_ud_fragments(ppatt)

    # NOTE:
    # This returns the pretty-print formatted string from PredPatt. This is done
    # largely as a placeholder for JSON compatibility within the REST API.
    return {'predpatt': {'predicate_deps': predicate_deps,
                         'arg_deps': arg_deps},
            'conll': conll_parsed,
            'original': text}
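
The load_conllu/PredPatt pattern used above also works in isolation; a minimal sketch, assuming conll_str already holds the CoNLL-U text for a single sentence (annotate_text and parse_to_conll from this example are not required):

from predpatt import PredPatt, PredPattOpts, load_conllu

def predicates_from_conll(conll_str):
    # take the first (and only) parse yielded by load_conllu
    ud_parse = [parse for _, parse in load_conllu(conll_str)][0]
    ppatt = PredPatt(ud_parse, opts=PredPattOpts(resolve_relcl=True))
    # each instance is an extracted predicate together with its arguments
    return [(pred.phrase(), [arg.phrase() for arg in pred.arguments])
            for pred in ppatt.instances]
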
Example #2
def extract_predpatt_text(row, eid_num: int):
    '''
    Given a row from a pandas dataframe of TB data
    and an eid_num (1 or 2),
    return the PredPatt predicate text for that event
    (copula fillers are added to the text).
    '''
    tokenid = getattr(row, f'eid{eid_num}_token_id')
    conllu_string = getattr(row, f'eid{eid_num}_sent_conllu')
    parsed_tb = [PredPatt(ud_parse, opts=options) for sent_id, ud_parse in load_conllu(conllu_string)]
    pred_objects = parsed_tb[0].instances
    
    curr_text = getattr(row, f'eid{eid_num}_text')
    
    pred_match = False
    #print(f"{(row['docid'], row['eventInstanceID'], row['relatedToEventInstance'])}")
    if pred_objects:
        for pred in pred_objects:
            if int(pred.root.position) == int(tokenid):
                pred_match = True
                pred_object = pred
                break
        
        if pred_match:
            pred_text, _, _, _ = predicate_info(pred_object)
            return pred_text
        else:
            return curr_text

    else:
        return curr_text
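
The token-matching loop above can be factored out on its own; a sketch, assuming ppatt is an already-built PredPatt object and token_id is the root position to look up (predicate_info is project-specific and not used here):

def find_predicate_at(ppatt, token_id):
    # pred.root.position is the index of the predicate's root token
    for pred in ppatt.instances:
        if int(pred.root.position) == int(token_id):
            return pred
    return None
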
Example #3
def test(data):
    from predpatt import PredPatt, load_conllu

    def fail(g, t):
        # a test fails if the gold and predicted predicate lists differ
        if len(g) != len(t):
            return True
        return any(i not in t for i in g)

    no_color = lambda x, _: x
    count, failed = 0, 0
    ret = ""
    for sent_id, ud_parse in load_conllu(data):
        count += 1
        pp = PredPatt(ud_parse)
        sent = ' '.join(t.text for t in pp.tokens)
        linearized_pp = linearize(pp)
        gold_preds = [
            predicate.format(C=no_color, track_rule=False)
            for predicate in pp.instances if likely_to_be_pred(predicate)
        ]
        test_preds = pprint_preds(
            construct_pred_from_flat(linearized_pp.split()))
        if fail(gold_preds, test_preds):
            failed += 1
            ret += (
                "Sent: %s\nLinearized PredPatt:\n\t%s\nGold:\n%s\nYours:\n%s\n\n"
                % (sent, linearized_pp, "\n".join(gold_preds),
                   "\n".join(test_preds)))
    print(ret)
    print("You have test %d instances, and %d failed the test." %
          (count, failed))
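
A possible driver for the test harness above (the .conllu path is illustrative, and linearize, pprint_preds, construct_pred_from_flat, and likely_to_be_pred are assumed to be defined in the surrounding module):

if __name__ == '__main__':
    with open('en-ud-dev.conllu') as infile:
        test(infile.read())
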
Example #4
    def from_conll(cls,
                   corpus: Union[str, TextIO],
                   name: str = 'ewt',
                   options: Optional[PredPattOpts] = None) -> 'PredPattCorpus':
        """Load a CoNLL dependency corpus and apply predpatt

        Parameters
        ----------
        corpus
            (path to) a .conllu file
        name
            the name of the corpus; used in constructing treeids
        options
            options for predpatt extraction
        """

        options = DEFAULT_PREDPATT_OPTIONS if options is None else options

        corp_is_str = isinstance(corpus, str)

        if corp_is_str and splitext(basename(corpus))[1] == '.conllu':
            with open(corpus) as infile:
                data = infile.read()

        elif corp_is_str:
            data = corpus

        else:
            data = corpus.read()

        # load the CoNLL dependency parses as graphs
        ud_corp = {name+'-'+str(i+1): [line.split()
                                       for line in block.split('\n')
                                       if len(line) > 0
                                       if line[0] != '#']
                   for i, block in enumerate(data.split('\n\n'))}
        ud_corp = CoNLLDependencyTreeCorpus(ud_corp)

        # extract the predpatt for those dependency parses
        try:
            predpatt = {name+'-'+sid.split('_')[1]: PredPatt(ud_parse,
                                                             opts=options)
                        for sid, ud_parse in load_conllu(data)}

        except ValueError:
            errmsg = 'PredPatt was unable to parse the CoNLL you provided.' +\
                     ' This is likely due to using a version of UD that is' +\
                     ' incompatible with PredPatt. Use of version 1.2 is' +\
                     ' suggested.'

            raise ValueError(errmsg)
            
        return cls({n: (pp, ud_corp[n])
                    for n, pp in predpatt.items()})
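
A hypothetical call site for the classmethod above; per the isinstance checks it accepts a path to a .conllu file, a raw CoNLL-U string, or an open file handle (the path and corpus name here are illustrative):

# keys of the resulting corpus are treeids of the form '<name>-<sentence number>'
corpus = PredPattCorpus.from_conll('en-ud-train.conllu', name='ewt')
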
Example #5
def setup_graph():
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')

    pp = PredPatt(next(load_conllu(rawtree))[1],
                  opts=PredPattOpts(resolve_relcl=True,
                                    borrow_arg_for_relcl=True,
                                    resolve_conj=False,
                                    cut=True))

    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')

    return pp, graph
Example #6
def generate_predicates(
    abstract_text: str,
    pred_patt_opts=None
) -> Iterable[Tuple[str, str, str]]:
  "Requires that pred_util:nlp and pred_util:stopwords be initialized"
  nlp = dpg.get("pred_util:nlp")
  parser = Spacy2ConllParser(nlp=nlp)
  stopwords = dpg.get("pred_util:stopwords")

  doc = nlp(abstract_text)
  for sent in doc.sents:
    # if the sentence is very long
    if len(sent) >= 20:
      word_count = defaultdict(int)
      for tok in sent:
        word_count[str(tok)] += 1
      # if one word dominates the long sentence
      if max(word_count.values()) >= len(sent)*0.2:
        continue  # we likely generated the same word over-and-over
    conllu = "".join(list(parser.parse(input_str=str(sent))))
    for _, pred_patt_parse in load_conllu(conllu):
      predicates = PredPatt(
        pred_patt_parse,
        opts=pred_patt_opts
      ).instances
      for predicate in predicates:
        # We only care about 2-entity predicates
        if len(predicate.arguments) == 2:
          a_ents, b_ents = [
              # Get the set of entities
              filter(
                # Not in the stopword list
                lambda x: x not in stopwords,
                [str(e).strip() for e in nlp(args.phrase()).ents]
              )
              # For each argument
              for args in predicate.arguments
          ]
          # Slight cleaning needed to better match the predicate phrase.
          # Note that PredPatt predicates use ?a and ?b placeholders.
          predicate_stmt = (
              re.match(
                r".*\?a(.*)\?b.*", # get text between placeholders
                predicate.phrase()
              )
              .group(1) # get the group matched between the placeholders
              .strip()
          )
          if len(predicate_stmt) > 0:
            # iterate over every (a, b) entity pair from the two arguments
            for a, b in product(a_ents, b_ents):
              if a != b:
                yield (a, predicate_stmt, b)
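
A hypothetical driver for the generator above; as the docstring notes, pred_util:nlp and pred_util:stopwords must be initialized in dpg first, and abstract_text here is an assumed input string:

for subj, relation, obj in generate_predicates(abstract_text):
    print(subj, "--[" + relation + "]-->", obj)
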
Example #7
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''
        Extract PredPatt objects from CONLLU files
    '''

    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in os.listdir(path):
        if file.endswith('.conllu'):
            with open(path + file, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file + " " + sent_id] = \
                                                PredPatt(ud_parse, opts=options)

    return patt
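
The returned dict is keyed by '<conllu filename> <sent_id>'; a hedged usage sketch (the corpus path is illustrative):

patt = extract_predpatt('../../data/corpora/ud/UD_English-EWT-r1.2/')
for key, ppatt in patt.items():
    print(key, len(ppatt.instances))
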
Example #8
    def extract(self, sentence: str) -> Optional[List[Dict[str, Any]]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][
                0
            ]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    "predicate": predicate.tokens,
                    "arguments": [x.tokens for x in predicate.arguments],
                }
                result.append(structure)

            return result
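
Each returned structure holds raw PredPatt token objects; rendering them as text might look like the sketch below (extractor and sentence are assumed, and tokens are joined via their .text attribute as in example #3):

structures = extractor.extract(sentence)
if structures is not None:
    for s in structures:
        pred_text = ' '.join(t.text for t in s['predicate'])
        arg_texts = [' '.join(t.text for t in arg) for arg in s['arguments']]
        print(pred_text, '|', arg_texts)
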
Example #9
def main():
    # Data Locations
    parser = argparse.ArgumentParser(
        description='Recast UDS-Time duration to NLI format.')
    parser.add_argument('--udstime',
                        type=str,
                        default='time_eng_ud_v1.2_2015_10_30.tsv',
                        help='UDS-Time tsv dataset file location.')

    parser.add_argument(
        '--split',
        type=str,
        default='',
        help='If specified (train, dev, test), only that split is recasted')

    parser.add_argument('--out_train',
                        type=str,
                        default='train/',
                        help='recasted train data folder location')

    parser.add_argument('--out_dev',
                        type=str,
                        default='dev/',
                        help='recasted dev data folder location')

    parser.add_argument('--out_test',
                        type=str,
                        default='test/',
                        help='recasted test data folder location')

    args = parser.parse_args()

    # ### Import UDS Time
    uds_time = pd.read_csv(args.udstime, sep="\t")
    ewt = doc_utils.Corpus(uds_time=uds_time)
    df = ewt.process_data

    #######################################################
    ## Add features to UDS-time dataframe
    #######################################################

    df['Pred1.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=1), axis=1)
    df['Pred2.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=2), axis=1)

    ## Extract Predicate Full Text
    predicate_dict = {}
    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        data_name = fname.split(".")[0].split("-")[-1]

        #print(f"Start processing: {data_name}")
        with open(ud_data_path) as infile:
            data = infile.read()
            parsed = [(PredPatt(ud_parse, opts=options), sent_id)
                      for sent_id, ud_parse in load_conllu(data)]

        for pred_object, sentid in parsed:
            sentnum = sentid.split("_")[-1]
            sentenceid = fname + " " + sentnum
            for predicate_object in pred_object.instances:
                #print(f"sentenceid: {sentenceid}, pred: {predicate_object}")
                pred_text, _, pred_root_token, _ = predicate_info(
                    predicate_object)
                predicate_dict[sentenceid + "_" +
                               str(pred_root_token)] = pred_text
                #print(f"error at sentid :{sentenceid}")

        print(f"Finished creating predicate dictionary for : {data_name}\n")

    df['Pred1.Text.Full'] = df['Event1.ID'].map(lambda x: predicate_dict[x])
    df['Pred2.Text.Full'] = df['Event2.ID'].map(lambda x: predicate_dict[x])

    #######################################################
    ## Recast Data
    #######################################################

    pairid = -1  # count total pair ids
    # Count event-pairs skipped due to ambiguous text for highlighting predicate.
    skipcount = 0

    if args.split:
        splits = [args.split]
    else:
        splits = ['train', 'dev', 'test']

    for split in splits:
        data = []
        metadata = []

        curr_df = df[df['Split'] == split]
        print(f"Creating NLI instances for Data split: {split}")
        event_pair_ids = list(curr_df.groupby(['Event.Pair.ID']).groups.keys())

        pbar = tqdm(total=len(event_pair_ids))

        for idx, event_pair_id in enumerate(event_pair_ids):
            ## Predicate 1

            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=1,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            ## Predicate 2
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=2,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata

            # if pairid%10000==0:
            # 	print(f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}")
            pbar.update(1)

        out_folder = {
            'train': args.out_train,
            'dev': args.out_dev,
            'test': args.out_test
        }

        print(
            f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}"
        )

        with open(out_folder[split] + "recast_temporal-duration_data.json",
                  'w') as out_data:
            json.dump(data, out_data, indent=4)

        with open(out_folder[split] + "recast_temporal-duration_metadata.json",
                  'w') as out_metadata:
            json.dump(metadata, out_metadata, indent=4)

    print(f"Total pair-ids: {pairid}")
    print(f'Total events skipped: {skipcount}')
Example #10
]
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)  # Resolve relative clause

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id,
                             PredPatt(ud_parse, opts=options))
                            for sent_id, ud_parse in load_conllu(data)]

c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
ign = {'train': 0, 'devte': 0}
prons_incl = [
    "you", "they", "yourself", "themselves", "them", "themself", "theirself",
    "theirselves"
Example #11
            for line in f.readlines():
                feats = line.split('\t')
                features[feats[0]] = [feats[1].split(), feats[2].split()]

        # Load the predpatt objects for creating features
        files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
                 '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
                 '/Downloads/UD_English-r1.2/en-ud-test.conllu']

        options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
        patt = {}

        for file in files:
            path = home + file
            with open(path, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file[27:] + " " + sent_id] = PredPatt(ud_parse, opts=options)

        data['Structure'] = data['Sentence.ID'].map(lambda x: (patt[x], features[x]))

        # Split the datasets into train, dev, test
        data_test = data[data['Split'] == 'test'].reset_index(drop=True)
        data_dev = data[data['Split'] == 'dev'].reset_index(drop=True)
        data = data[data['Split'] == 'train'].reset_index(drop=True)

#         Ridit scoring annotations and confidence ratings
#         for attr in attributes:
#             resp = attr_map[attr]
#             resp_conf = attr_conf[attr]
#             data[resp_conf + ".norm"] = data.groupby('Annotator.ID')[resp_conf].transform(ridit)
#             data_dev[resp_conf + ".norm"] = data_dev.groupby('Annotator.ID')[resp_conf].transform(ridit)
Example #12
        print(
            "Processing {} of Genre: {}, Progress {}/{} ({} %), Num Skipped: {}"
            .format(currbook, currgenre, num_processed, num_books,
                    num_processed / (num_books * 1.0), num_skipped))

        decomp_lines_json = [json.loads(x) for x in decomp_lines]

        book_conll_files = [
            fi for fi in os.listdir(conlldir)
            if parse_conll_filename(fi)[1][0] == currbook
        ]

        for conllfi in book_conll_files:  #For each chunk in the book
            conllfile = os.path.join(conlldir, conllfi)
            genre, book, doc_id = parse_conll_filename(conllfile)
            conll_iter = load_conllu(conllfile)
            decomp_lines_json_chunk = [
                x for x in decomp_lines_json if x['doc-id'] == doc_id
            ]  #get the lines associated with this chunk
            line_idx = 0  #Where we are in the decomp json file

            valid_instance = True
            for sent_id, parse in conll_iter:
                sent_id = int(sent_id.split('_')[1])

                if line_idx >= len(decomp_lines_json_chunk):
                    break

                # check if there is a matching decomp extraction for this conll line
                if decomp_lines_json_chunk[line_idx]['sent-id'] == sent_id:
                    json_line = decomp_lines_json_chunk[line_idx]

Example #13
id = 1
files = ['/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

# Resolve relative clause
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id, PredPatt(ud_parse, opts=options)) for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for sent_id, ud_parse in load_conllu(data)]
# random.shuffle(parsed['train'])
c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
copp = {'train': 0, 'devte': 0}
auxverb = {'train': 0, 'devte': 0}
ign = {'train': 0, 'devte': 0}
adj = {'train': 0, 'devte': 0}

for write_file in ['pred_train_data.csv', 'pred_devte_data.csv']:
Example #14
def hand_engineering(prot, batch_size, data, data_dev):
    '''
        Hand engineered feature extraction. Supports the following - UD,
        Verbnet classids, Wordnet supersenses, concreteness ratings, LCS
        eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x:
                                                      (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    with open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') as f:
        concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses(lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame) for sent, token, lemma in
        zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(
        ), data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x