Пример #1
0
def pmb2fol(pmb_dir, pd, sig=None, drawDRS=False):
    '''Turn the clausal form (CLF) of one PMB document into a first-order logic formula.

    The CLF of document ``pd`` is read from ``pmb_dir``, checked with
    ``check_clf`` against ``sig``, rebuilt into a DRS and converted to FOL.
    When ``drawDRS`` is true the DRS is drawn before the conversion.

    Returns the FOL formula, or ``None`` when the CLF is empty, the check
    raises ``RuntimeError``, no DRS could be built, or the formula still
    contains free variables.
    '''
    debug("PMB document {}".format(pd))
    clauses = read_clf(pmb_dir, pd)
    if not clauses:
        return None

    # Validate the clauses and read them as a set of connected boxes
    try:
        box_dict, sub_rel, dir_subs, disc_rels, _op_types, _ops_fine = check_clf(clauses, sig)
        # Boxes are only printed when the root logger is at DEBUG or lower
        if logging.root.level <= logging.DEBUG:
            for box in box_dict.values():
                pr_box(box)
        debug("sub_rel: {}".format(pr_2rel(sub_rel)))
        debug("dir_sub: {}".format(pr_2rel(dir_subs)))
        debug("Disc rel: {}".format(disc_rels))
    except RuntimeError as err:
        warning("{} has error: {}".format(pd, err))
        return None

    # Rebuild the DRS from the boxes
    drs = boxes2drs(box_dict, sub_rel, disc_rels)
    if not drs:
        return None
    if drawDRS:
        drs.draw()

    # Translate the DRS into a FOL formula; only closed formulas are returned
    fol = drs.fol()
    debug("FOL formula for {}:\n\t{}".format(pd, fol))
    if not fol.free():
        return fol
    warning("The FOL formula of {} has occurrences of free variables: {}".format(pd, fol.free()))
    return None
Пример #2
0
def is_well_formed_drs(drs, signature):
    '''Tell whether a DRS passes the clausal-form check.

    Returns ``True`` when ``check_clf`` accepts ``drs`` for ``signature``
    and ``False`` when it raises ``RuntimeError``.
    '''
    try:
        check_clf(drs, signature, v=0)
    except RuntimeError:
        return False
    return True
Пример #3
0
def extensive_format_check(drs, pp_info):
    '''Do a more extensive semantic format check (referee) and optionally repair the DRS.

    Args:
        drs: the DRS as a sequence of clauses (each clause is converted to a
            tuple before it is handed to ``check_clf``).
        pp_info: post-processing state; this function reads ``signature``,
            ``fix``, ``fix_disc``, ``no_referee``, ``baseline`` and
            ``cur_idx`` and appends ``cur_idx`` to lists in ``pp_dict``.

    Returns:
        * ``drs`` unchanged when it is valid;
        * a repaired DRS when a repair was requested and succeeded
          (``pp_dict["sub loop"]`` / ``pp_dict["boxes disconnected"]`` is updated);
        * ``drs`` unchanged when it is invalid but ``pp_info.no_referee`` is set;
        * otherwise the default DRS (``pp_dict["dummies-ref"]`` is updated).
    '''
    try:
        check_clf([tuple(c) for c in drs], pp_info.signature, v=0)
        return drs
    except RuntimeError as err:
        # DRS invalid: keep the message, it tells us which repair to attempt
        err_message = str(err)

    fixed_drs = False
    err_cat = None
    if pp_info.fix and 'Subordinate relation has a loop' in err_message:
        # Try to fix subordinate loops by merging/removing the offending box
        err_cat = "sub loop"
        box_num = err_message.split('||')[1].split('>')[0].strip()
        fixed_drs = solve_loops([tuple(c) for c in drs], box_num, pp_info)
    elif pp_info.fix_disc and "Boxes are not connected" in err_message:
        err_cat = "boxes disconnected"
        # Raw string: '\{' is not a valid escape in a normal string literal
        boxes = re.findall(r'\{(.*?)\}', err_message)
        # Both box sets are needed; without them fall through to the dummy path
        if len(boxes) >= 2:
            fixed_drs = solve_non_connected(drs, boxes[0].replace(',', '').split(),
                                            boxes[1].replace(',', '').split(), pp_info.signature)

    # Only get here if DRS was invalid - if we don't have a fixed one, return dummy
    if fixed_drs:
        pp_info.pp_dict[err_cat].append(pp_info.cur_idx)
        return fixed_drs
    if pp_info.no_referee:
        # Don't want to do referee dummies, return initial DRS
        return drs
    pp_info.pp_dict["dummies-ref"].append(pp_info.cur_idx)
    return default_drs(pp_info.baseline, list_output=True)
Пример #4
0
def solve_loops(clf, box, pp_info):
    '''Recursively repair a DRS whose check failed with a subordinate-loop error.

    First every other box is merged with the offending ``box``; the first
    merge that passes ``check_clf`` is returned.  If no merge helps, ``box``
    is removed, the REF clauses are re-checked, and the result is validated
    again.  When that check reports another loop, the function recurses on
    the box named in the new error message.

    Returns the repaired list of clauses, or ``False`` when ``clf`` is empty,
    when removing the box changed nothing, or when a different error shows up.
    '''
    # Nothing left to repair
    if not clf:
        return False

    # Attempt 1: merge the offending box with each other box in turn
    for other in get_first_arg_boxes(clf):
        if other == box:
            continue
        merged = merge_boxes(clf, [other, box])
        try:
            check_clf(merged, pp_info.signature, v=0)
        except RuntimeError:
            continue
        # No error means the merged DRS is valid
        return merged

    # Attempt 2: drop the offending box, then insert/remove REFs as needed
    pruned = remove_by_first_arg_box(clf, box)
    pruned = [tuple(clause) for clause in check_ref_clauses(pruned, pp_info, do_print=False)]

    try:
        check_clf(pruned, pp_info.signature, v=0)
    except RuntimeError as err:
        message = str(err)
    else:
        return pruned

    # Any error other than another loop means this approach failed
    if 'Subordinate relation has a loop' not in message:
        return False
    next_box = message.split('||')[1].split('>')[0].strip()
    # If nothing changed, stop here to avoid infinite recursion
    if pruned == clf:
        return False
    return solve_loops(pruned, next_box, pp_info)
Пример #5
0
def solve_non_connected(drs, boxes1, boxes2, signature):
    '''Try to connect two unconnected sets of boxes by moving a REF clause.

    For every ``REF`` clause whose box is in one of the two sets, the clause
    is moved (via ``change_box_in_drs``) to each box of the other set in
    turn, and the result is validated with ``check_clf``.

    Args:
        drs: the DRS as a sequence of clauses.
        boxes1, boxes2: the two collections of box names that are not connected.
        signature: signature passed on to ``check_clf``.

    Returns:
        The first candidate DRS that passes the check, or ``False`` when no
        candidate is valid.
    '''
    for idx, clause in enumerate(drs):
        if clause[1] != "REF":
            continue
        # A REF introduced in one set is tried in the boxes of the other set
        if clause[0] in boxes1:
            target_boxes = boxes2
        elif clause[0] in boxes2:
            target_boxes = boxes1
        else:
            continue
        for box in target_boxes:
            fixed_drs = change_box_in_drs(drs, idx, box)
            try:
                check_clf([tuple(c) for c in fixed_drs], signature, v=0)
            except RuntimeError:
                # Still invalid, try the next candidate box
                continue
            return fixed_drs
    # If we get here nothing worked
    return False
Пример #6
0
def remove_ill_formed_drss(drss, signature_file):
    '''Filter a collection of DRSs down to the ones that pass ``check_clf``.

    Returns a pair ``(kept, dropped_idxs)``: the valid DRSs in their original
    form and order, and the positions in ``drss`` of the rejected ones.
    '''
    # The signature is needed for the check
    signature = get_signature(signature_file)

    kept, dropped_idxs = [], []
    for position, drs in enumerate(drss):
        # Remove comments and split into clauses
        clauses = drs_string_to_list(drs)
        try:
            check_clf([tuple(clause) for clause in clauses], signature, v=0)
        except RuntimeError:
            # Invalid DRS: remember its index only
            dropped_idxs.append(position)
        else:
            kept.append(drs)
    return kept, dropped_idxs
Пример #7
0
def clf2graph(clf, alignment, signature=None, pars=None):
    '''Convert a CLF and alignments into a DRG graph.

    Args:
        clf: the clausal form; must have as many clauses as ``check_clf``
            returns clause types.
        alignment: alignment information passed to ``clause_alignment``.
        signature: optional signature passed to ``clfref.check_clf``.
        pars: dict of parameters.  It is forwarded to ``box2graph`` and
            ``pars['bm']`` is read here, so the key ``'bm'`` must be present.
            Defaults to a fresh empty dict per call.

    Returns:
        A triple ``(nodes, edges, top_box_ids)``: the de-duplicated nodes
        (with their ``'type'`` entry removed), the cleaned edges, and the
        node ids of the top boxes.
    '''
    # A new dict per call: a literal {} default would be shared between calls
    if pars is None:
        pars = {}
    # parse clf and check on correctness
    (box_dict, top_boxes, disc_rels, presupp_rels, cl_types, arg_typing) =\
        clfref.check_clf(clf, signature)
    assert len(clf) == len(cl_types), '#clauses == #clause_types'
    # map clauses to alignments
    cl2al = clause_alignment(clf, cl_types, alignment)
    # convert constants to nodes and get a mapping from terms to IDs
    nodes, nid = process_vars_constants(arg_typing)
    next_id = len(nid)
    # edges are collected across all boxes and relations
    edges = []
    # convert boxes into graph components
    for _, box in sorted(box_dict.items()):
        next_id = box2graph(box,
                            nid,
                            nodes,
                            edges,
                            next_id,
                            arg_typing,
                            cl2al,
                            pars=pars)
    # add discourse relations
    for (r, b1, b2) in sorted(disc_rels):
        add_edges(edges, [(nid[b1], nid[b2], r)], [cl2al[(r, b1, b2)]])
    # add presupposition relations
    for (b1, b2) in sorted(presupp_rels):
        add_edges(edges, [(nid[b1], nid[b2], 'PRESUPPOSITION')],
                  [cl2al[(b1, b2)]])
    # remove duplicate nodes but keep the order
    ord_set_nodes = sanity_check_nodes(nodes)
    if len(ord_set_nodes) != len(nodes):
        debug("After cleaning {} nodes remains {}".format(
            len(nodes), len(ord_set_nodes)))
    edges = clean_set(edges)
    connectivity_check(ord_set_nodes, edges)
    remove_recoverable_edges(ord_set_nodes, edges, pars['bm'])
    # remove type feature from nodes, not needed anymore
    for nd in ord_set_nodes:
        del nd['type']
    debug("edges ({}); nodes ({})".format(len(edges), len(ord_set_nodes)))
    return ord_set_nodes, edges, [nid[b] for b in top_boxes]
Пример #8
0
def extensive_format_check(drss_fixed, sig_file):
    '''Do a more extensive semantic format check (referee) on a list of DRSs.

    Each DRS in ``drss_fixed`` is a list of clause strings.  A DRS that
    makes ``check_clf`` raise ``RuntimeError`` is replaced by the dummy DRS
    and its error message is counted.

    Args:
        drss_fixed: list of DRSs, each a list of whitespace-separated clause strings.
        sig_file: signature file passed to ``get_signature``.

    Returns:
        A triple ``(drss_final, print_str, error_total)``: the DRSs with
        invalid ones replaced, a summary string with the error frequencies,
        and the number of invalid DRSs.
    '''
    drss_final = []
    signature = get_signature(sig_file)
    error_counter = Counter()
    for clf in drss_fixed:
        try:
            check_clf([tuple(c.split()) for c in clf], signature, v=1)
        except RuntimeError as err:  # DRS invalid, replace by dummy
            # Exceptions are not subscriptable on Python 3; go through .args
            err_key = str(err.args[0]) if err.args else str(err)
            error_counter.update([err_key])
            drss_final.append([" ".join(x) for x in dummy_drs()])
        else:
            drss_final.append(clf)
    error_total = sum(error_counter.values())
    drs_total = len(drss_fixed)
    # Avoid dividing by zero when there were no DRSs at all
    error_pct = error_total * 100 / float(drs_total) if drs_total else 0.0
    print_str = "#wrong = {} ({:.2f}%)".format(error_total, error_pct)
    for (err, c) in error_counter.most_common():
        print_str += str(c) + ' ' + err + ' '
    return drss_final, print_str, error_total
Пример #9
0
def _add_checked_drs(cur_clauses, cur_orig, signature, ill_type,
                     clause_list, original_clauses):
    '''Validate one finished DRS and append it, or its replacement, to both output lists.'''
    try:
        check_clf([tuple(c) for c in cur_clauses], signature, v=False)
    except Exception as e:
        if ill_type == 'error':
            raise ValueError(e)
        elif ill_type == 'dummy':
            print(
                'WARNING: DRS {0} is ill-formed and replaced by a dummy DRS'
                .format(len(clause_list) + 1))
            clause_list.append(dummy_drs())
            original_clauses.append([" ".join(x) for x in dummy_drs()])
        elif ill_type == 'spar':
            print(
                'WARNING: DRS {0} is ill-formed and replaced by the SPAR DRS'
                .format(len(clause_list) + 1))
            clause_list.append(spar_drs())
            original_clauses.append([" ".join(x) for x in spar_drs()])
        elif ill_type == 'score':
            print(
                'WARNING: DRS {0} is ill-formed, but try to give a score anyway - might still error later'
                .format(len(clause_list) + 1))
            clause_list.append(cur_clauses)
            original_clauses.append(cur_orig)
        # NOTE(review): any other ill_type drops the ill-formed DRS silently
    else:
        clause_list.append(cur_clauses)
        original_clauses.append(cur_orig)


def _normalise_clause(clause, inv_boxes):
    '''Rewrite one clause in place: invert -Of roles and order quoted constants last.'''
    # Only four-item clauses are rewritten; checking the length first also
    # keeps shorter (malformed) clauses from raising IndexError
    if len(clause) != 4:
        return
    if is_role(clause[1]) and clause[1].endswith('Of') and len(clause[1]) > 2:
        # Switch the arguments and remove the -Of
        clause[2], clause[3] = clause[3], clause[2]
        clause[1] = clause[1][:-2]
    elif (clause[1] in inv_boxes and between_quotes(clause[2])
          and not between_quotes(clause[3])):
        # b1 NEQ x1 x2 is equal to b1 NEQ x2 x1
        # If one of the two arguments is between quotes, rewrite them in such a way
        # that it can always match
        # For example rewrite b1 NEQ "speaker" x1 to b1 NEQ x1 "speaker"
        # If there are two variables or two items between quotes, do nothing
        clause[2], clause[3] = clause[3], clause[2]


def get_clauses(file_name, signature, ill_type):
    '''Read a file of DRSs in clause format and return them as lists of clauses.

    DRSs are separated by blank lines and lines starting with ``%`` are
    comments.  Every DRS, including a last one that is not followed by a
    newline, is validated with ``check_clf``.

    Args:
        file_name: path of the file to read.
        signature: signature passed to ``check_clf`` and ``DRS``.
        ill_type: what to do with an ill-formed DRS: ``'error'`` raises
            ``ValueError``, ``'dummy'`` / ``'spar'`` substitute the dummy or
            SPAR DRS, ``'score'`` keeps the DRS as it is.

    Returns:
        A pair ``(clauses, originals)``: per DRS the tokenised clauses and
        the original lines.  Redundant REF clauses are removed with
        ``remove_refs`` unless the module-level ``args.include_ref`` is set.

    Raises:
        ValueError: for an ill-formed DRS when ``ill_type == 'error'``.
    '''
    clause_list, original_clauses, cur_orig, cur_clauses = [], [], [], []

    with open(file_name, 'r') as in_f:
        for line in in_f.read().split('\n'):
            if line.strip().startswith('%'):
                continue  # skip comments
            if not line.strip():
                # Blank line: the current DRS is finished. Ignore repeated blank lines
                if cur_clauses:
                    _add_checked_drs(cur_clauses, cur_orig, signature, ill_type,
                                     clause_list, original_clauses)
                cur_clauses = []
                cur_orig = []
            else:
                # Drop a trailing comment, then tokenise the clause
                cur_clauses.append(line.split(' %', 1)[0].strip().split())
                cur_orig.append(line)

    if cur_clauses:
        # No newline at the end: the last DRS gets the same validation as the rest
        _add_checked_drs(cur_clauses, cur_orig, signature, ill_type,
                         clause_list, original_clauses)

    # Invert -of relations and reorder inv_boxes if they contain a constant between quotes
    inv_boxes = DRS(signature).inv_boxes
    for drs in clause_list:
        for clause in drs:
            _normalise_clause(clause, inv_boxes)

    # If we want to include REF clauses we are done now
    if args.include_ref:
        return clause_list, original_clauses
    # Otherwise remove redundant REF clauses
    final_clauses, final_original = remove_refs(clause_list, original_clauses)
    return final_clauses, final_original
Пример #10
0
 signature = get_signature(args.sig_file, v=args.v)
 # define counters
 trg_err_counter = Counter()
 src_err_counter = Counter()
 # contrast CLFs
 sen_ids = []
 for sid in trg_clf_dict:
     # read raw and CLFs
     (raw, trg_clf) = trg_clf_dict[sid]
     #pr_clf(trg_clf, pr=True, inline=False)
     (src_raw, src_clf) = src_clf_dict[sid]
     #print raw, src_raw
     #assert raw == src_raw or src_raw is None
     # check validity of Gold CLF. If it is invalid, report and go to next CLF
     try:
         check_clf(trg_clf, signature, v=args.v)
     except RuntimeError as e:
         trg_err_counter.update([e[0]])
         print '!nvGold [{}] "{}":\tThe gold CLF is invalid'.format(
             sid, raw)
         continue
     # check validity of Source CLF
     try:
         check_clf(src_clf, signature, v=args.v)
         src_invalid = ''
     except RuntimeError as e:
         src_err_counter.update([e[0]])
         #print '!nvSyst [{}] "{}":\tThe system produced CLF is invalid'.format(sid, raw)
         src_invalid = '!!!Invalid CLF '
     # detect which filter to apply
     dnf = dnf_ops if dnf_ops else dnf_tks
Пример #11
0
 info("{} mrps read".format(len(mrps)))
 # converting mrps into clfs one-by-one
 error_counter = Counter()
 drg_count = 0
 clfs_info_list, meta_list, invalids = [], [], []
 for mrp in mrps:
     if mrp['framework'] != 'drg' \
         or args.ids and mrp['id'] not in args.ids:
         continue
     meta_list.append((mrp['id'], mrp['input']))
     drg_count += 1
     try:
         clf = mrp2clf(mrp, fix=['edge_lab'])  # some graphs need this
         # if signature is
         if args.validate:
             clfref.check_clf(clf, sig)
         clfs_info_list.append(clf)
     except:
         if args.throw_error: raise
         err_message = repr(sys.exc_info()[1])
         if not args.quiet: error("{}: {}".format(mrp['id'], err_message))
         error_counter.update([re.sub('\d+', 'NUM', err_message)])
         invalids.append(mrp['id'])
         clfs_info_list.append({
             'b REF x': ('b', 'REF', 'x'),
             'b nevermatching "n.01" x': ('b', 'LEX', 'x')
         })
 write_clfs(clfs_info_list, meta_list, filename=args.clf)
 if error_counter and not args.quiet:
     print("Frequencies of erros")
     for err, c in error_counter.most_common():