def get_binary_for_markedup(left, right, result, markedup=None, flexible=False):
    """Return the markedup rules for the binary production left right -> result.

    BINARIES is scanned for the first entry whose fields 0, 1 and 2 match
    ``left``, ``right`` and ``result`` under ``category.compare``.  For a
    matching entry:

    * its rule list (field 4) is returned when non-empty;
    * otherwise, when ``markedup`` is given, a default is built as
      ``['(S 0 1)'] + markedup[result][1:]``;
    * otherwise an empty list is returned.

    When nothing matches exactly and ``flexible`` is true, the first entry
    whose result field alone matches is resolved the same way, and failing
    that the markedup default is returned if ``markedup`` is given.

    Returns None when no rule could be produced.  ``markedup`` is indexed
    by ``result``; presumably a dict keyed by category string, in which
    case a missing key raises KeyError -- confirm against the caller.
    """

    def resolve(rules):
        # Explicit rules win; otherwise fall back on markedup, then on [].
        if len(rules) > 0:
            return rules
        if markedup is not None:
            return ['(S 0 1)'] + markedup[result][1:]
        return []

    # Pass 1: all three categories must match.
    for binary in BINARIES:
        if (category.compare(left, binary[0]) and
                category.compare(right, binary[1]) and
                category.compare(result, binary[2])):
            return resolve(binary[4])

    if flexible:
        # Pass 2: accept any entry that yields the same result category.
        for binary in BINARIES:
            if category.compare(result, binary[2]):
                return resolve(binary[4])
        # NOTE(review): the original indentation was lost; this fallback is
        # reconstructed as part of the flexible path.  If it originally sat
        # at function level, dedent it by one level.
        if markedup is not None:
            return ['(S 0 1)'] + markedup[result][1:]
    return None
def get_unary(start_cat, end_cat, markedup=None):
    """Return the rules for the unary production start_cat -> end_cat.

    UNARIES is scanned in order.  For each entry, field 0 is the start
    category, field 1 the marked-up end category (braces are removed with
    ``category.strip_braces`` before comparing) and field 4 the rule list.
    For the first entry whose start and end both match under
    ``category.compare``:

    * the entry's rules are returned when non-empty;
    * otherwise, with no ``markedup``, an empty list is returned;
    * otherwise ``markedup[end][1:]`` is returned, trying the end category
      first as-is and then with square brackets stripped.  If neither key
      is present the scan continues with the next entry.

    Returns None when no entry produced a result.
    """
    # Note: PP_qus - for questions only, ignored for now
    for unary in UNARIES:
        start = unary[0]
        end = category.strip_braces(unary[1])
        rules = unary[4]

        if not (category.compare(start_cat, start) and
                category.compare(end_cat, end)):
            continue

        if len(rules) > 0:
            return rules
        if markedup is None:
            return []

        if end in markedup:
            return markedup[end][1:]
        end_no_brac = category.strip_square_brackets(end)
        if end_no_brac in markedup:
            return markedup[end_no_brac][1:]
        # Matched, but markedup has no entry for it: keep looking.
    return None
def __init__(self, text='', pos=0):
    """Build a CCG tree node by parsing ``text`` starting at offset ``pos``.

    Two bracketed formats are handled:

    * text containing ``'<L'``: each node has a ``<...>`` label whose
      whitespace-separated fields are read here.  Field 1 is the category
      (with any ``[X]`` removed).  A four-field label is an internal node
      whose field 2 selects the head child; a six-field label is a leaf
      with the POS tag in field 3 and the word in field 4.
    * anything else ("fowler input"): the label is the first token after
      the opening bracket, ``{`` / ``}`` in it are turned into ``(`` / ``)``
      to form the category, and a leaf's word is the text between the last
      space and the closing bracket.

    Children are parsed recursively as ``CCG_Tree(text, offset)``.  With
    empty ``text`` every attribute keeps its default value.
    """
    Tree.__init__(self, text)
    self.label = ''
    self.category = None
    self.orig_category = None
    self.pos = None
    self.word = None
    self.head = None
    self.rule = None
    if text == '':
        return

    def adopt_conj_category():
        # For conjunction rules, take the category from the right child
        # (marked with '[conj]') unless this node already carries the
        # marker or already matches the right child.
        if 'conj' not in self.rule:
            return
        if self.category.endswith('[conj]'):
            return
        right_cat = self.subtrees[1].category
        if category.compare(self.category, right_cat):
            return
        if right_cat.endswith('[conj]'):
            self.category = right_cat
        else:
            self.category = right_cat + '[conj]'

    if '<L' in text:
        depth = 0
        for idx in xrange(pos + 1, len(text)):
            ch = text[idx]
            # update the depth (note that brackets in categories only muck
            # things up for the category that is the root of this subtree)
            if ch == '(':
                depth += 1
                if self.label != '' and depth == 1:
                    self.subtrees.append(CCG_Tree(text, idx))
            elif ch == ')':
                depth -= 1

            # we've reached the end of the category that is the root of
            # this subtree
            if ch == '>' and self.label == '':
                self.label = text[pos + 2:idx]

            # we've reached the end of the scope for this bracket
            if depth < 0:
                break

        parts = self.label.split()
        self.category = ''.join(parts[1].split('[X]'))
        self.orig_category = self.category
        # Fix a sentence with two broken categories in CCGBank (0595.15)
        if self.category[-1] in '\\/':
            self.category = self.category + 'NP'

        self.rule = rule.determine_combinator(self.subtrees, self.category)
        adopt_conj_category()

        field_count = len(parts)
        if field_count == 4:
            # Internal node: head defaults to the first child; field 2 of
            # '1' picks the second child of a binary node instead.
            if len(self.subtrees) > 0:
                self.head = self.subtrees[0]
            if parts[2] == '1' and len(self.subtrees) == 2:
                self.head = self.subtrees[1]
        elif field_count == 6:
            self.pos = parts[3]
            self.word = parts[4]
    else:
        # Handle fowler input
        self.label = text[pos:].split()[0][1:]
        self.category = ')'.join('('.join(self.label.split('{')).split('}'))
        self.orig_category = self.category

        depth = 0
        for idx in xrange(pos + 1, len(text)):
            if depth < 0:
                break
            ch = text[idx]
            # update the depth
            if ch == '(':
                depth += 1
                if depth == 1:
                    self.subtrees.append(CCG_Tree(text, idx))
            elif ch == ')':
                depth -= 1
                if len(self.subtrees) == 0:
                    # A closing bracket with no children seen: this is a
                    # leaf, so the word runs from the nearest preceding
                    # space up to this bracket.
                    space_at = idx
                    for back in xrange(idx, 0, -1):
                        if text[back] == ' ':
                            space_at = back
                            break
                    self.word = text[space_at + 1:idx]
                    break

        self.rule = rule.determine_combinator(self.subtrees, self.category)
        adopt_conj_category()

        if self.word is not None:
            # No POS tags in this format: punctuation gets a fixed tag,
            # everything else is "UNK".
            punctuation_pos = {
                '.': '.',
                ',': ',',
                '...': ':',
                '?': '.',
                '!': '.',
            }
            self.pos = punctuation_pos.get(self.word, "UNK")
def __init__(self, text="", pos=0):
    """Parse the bracketed node that starts at ``text[pos]`` into this tree.

    If ``text`` contains ``"<L"`` the node label is the ``<...>`` group
    after the opening bracket: its second field is the category, a label
    of four fields marks an internal node (third field ``"1"`` makes the
    second of two children the head, otherwise the first child is the
    head), and a label of six fields marks a leaf whose fourth and fifth
    fields are the POS tag and the word.

    Otherwise the text is treated as "fowler input": the first token is
    the label, braces in it become parentheses to give the category, and
    for a leaf the word is whatever lies between the last space and the
    closing bracket; its POS tag is a punctuation tag or ``"UNK"``.

    In both cases child nodes are built with ``CCG_Tree(text, offset)``,
    ``self.rule`` comes from ``rule.determine_combinator`` and, for
    conjunction rules, the category may be replaced by the right child's
    category carrying a ``[conj]`` marker.  Empty ``text`` returns early
    with default attributes.
    """
    Tree.__init__(self, text)
    self.label = ""
    self.category = None
    self.orig_category = None
    self.pos = None
    self.word = None
    self.head = None
    self.rule = None
    if text == "":
        return

    nesting = 0
    if "<L" in text:
        for offset in xrange(pos + 1, len(text)):
            symbol = text[offset]
            # update the depth (note that brackets in categories only muck
            # things up for the category that is the root of this subtree)
            if symbol == "(":
                nesting += 1
                if self.label != "" and nesting == 1:
                    self.subtrees.append(CCG_Tree(text, offset))
            elif symbol == ")":
                nesting -= 1
            # we've reached the end of the category that is the root of
            # this subtree
            if symbol == ">" and self.label == "":
                self.label = text[pos + 2 : offset]
            # we've reached the end of the scope for this bracket
            if nesting < 0:
                break

        fields = self.label.split()
        self.category = "".join(fields[1].split("[X]"))
        self.orig_category = self.category
        # Fix a sentence with two broken categories in CCGBank (0595.15)
        if self.category[-1] in "\\/":
            self.category = self.category + "NP"

        if len(fields) == 4:
            if len(self.subtrees) > 0:
                self.head = self.subtrees[0]
            if fields[2] == "1" and len(self.subtrees) == 2:
                self.head = self.subtrees[1]
        elif len(fields) == 6:
            self.pos = fields[3]
            self.word = fields[4]
    else:
        # Handle fowler input
        self.label = text[pos:].split()[0][1:]
        self.category = ")".join("(".join(self.label.split("{")).split("}"))
        self.orig_category = self.category

        for offset in xrange(pos + 1, len(text)):
            if nesting < 0:
                break
            symbol = text[offset]
            # update the depth
            if symbol == "(":
                nesting += 1
                if nesting == 1:
                    self.subtrees.append(CCG_Tree(text, offset))
            elif symbol == ")":
                nesting -= 1
                if len(self.subtrees) == 0:
                    # Leaf: walk back to the nearest space to find where
                    # the word begins.
                    word_start = offset
                    for probe in xrange(offset, 0, -1):
                        if text[probe] == " ":
                            word_start = probe
                            break
                    self.word = text[word_start + 1 : offset]
                    break

        if self.word is not None:
            # This format has no POS tags; only punctuation is recognised.
            if self.word in (".", "?", "!"):
                self.pos = "."
            elif self.word == ",":
                self.pos = ","
            elif self.word == "...":
                self.pos = ":"
            else:
                self.pos = "UNK"

    # Shared tail for both formats.  It only reads the children and this
    # node's category, so it is independent of the head/pos/word set above.
    self.rule = rule.determine_combinator(self.subtrees, self.category)
    if "conj" in self.rule:
        already_marked = self.category.endswith("[conj]")
        if not already_marked and not category.compare(
            self.category, self.subtrees[1].category
        ):
            sibling = self.subtrees[1].category
            if sibling.endswith("[conj]"):
                self.category = sibling
            else:
                self.category = sibling + "[conj]"
def determine_combinator(source, result):
    """Name the rule that derives category ``result`` from the ``source`` nodes.

    ``source`` is the list of child nodes; each is read through its
    ``category`` attribute and, for two children, its ``rule`` attribute.

    Returns one of:

    * ``'lex'`` for no children;
    * ``'unary'`` or ``'type'`` for one child, depending on whether
      ``get_unary`` knows the production;
    * for two children: ``'binary'`` (known to ``get_binary``),
      ``'conj1'`` / ``'conj2'`` (coordination), ``'fa.f'`` / ``'fa.b'``
      (application), ``'fc.f'`` / ``'fc.b'`` (composition), ``'cc.b'``
      (crossed composition), ``'bs.f'`` / ``'bs.b'`` (substitution) or
      ``'misc'`` when nothing else fits.

    NOTE(review): the original indentation was lost.  As reconstructed,
    more than two children fall off the end and return None.
    """
    ### print len(source)
    ### print ' '.join(source), result
    if len(source) == 0:
        return 'lex'
    if len(source) == 1:
        if get_unary(source[0].category, result) is not None:
            return 'unary'
        return 'type'
    if len(source) == 2:
        left = source[0].category
        right = source[1].category
        result_parts = category.divide(result)
        left_parts = category.divide(left)
        right_parts = category.divide(right)

        # The composition tests are needed both inside the coordination
        # handling and in the general case, so they are wrapped as lazy
        # predicates and evaluated only where the checks are made.
        def forward_composition():
            # X/Z = X/Y + Y/Z
            return (category.compare(left_parts[2], right_parts[0]) and
                    category.compare(left_parts[0], result_parts[0]) and
                    category.compare(right_parts[2], result_parts[2]) and
                    left_parts[1] == right_parts[1] == result_parts[1] == '/')

        def backward_composition():
            # X\Z = Y\Z + X\Y
            return (category.compare(left_parts[2], result_parts[2]) and
                    category.compare(left_parts[0], right_parts[2]) and
                    category.compare(right_parts[0], result_parts[0]) and
                    left_parts[1] == right_parts[1] == result_parts[1] == '\\')

        def crossed_composition():
            # X/Z = Y/Z + X\Y
            # For example:
            # (S\NP)/(S\NP) = (S\NP)/(S\NP) + (S\NP)\(S\NP)
            return (category.compare(left_parts[2], result_parts[2]) and
                    category.compare(left_parts[0], right_parts[2]) and
                    category.compare(right_parts[0], result_parts[0]) and
                    left_parts[1] == result_parts[1] == '/' and
                    right_parts[1] == '\\')

        if get_binary(left, right, result) is not None:
            return 'binary'

        # Coordination
        # X = X CONJ X
        if left == 'conj' or (result.endswith('[conj]') and
                              '[conj]' not in right):
            if right == 'conj\\conj':
                return 'fa.b'
            return 'conj1'
        elif 'conj' in source[1].rule or '[conj]' in right:
            if category.compare(left, right):
                return 'conj2'
            if category.compare(left_parts[2], right) and left_parts[1] == '/':
                return 'fa.f'
            if (category.compare(right_parts[0], left) and
                    right_parts[1] is not None):
                # 'and' binds tighter than 'or'; the grouping is explicit.
                if 'conj2' in source[1].rule or (
                        '[conj]' in right and
                        category.compare(right_parts[2], left)):
                    return 'fa.b'
                else:
                    return 'conj1'
            if category.compare(right_parts[2], left):
                return 'fa.b'
            if crossed_composition():
                return 'cc.b'
            if forward_composition():
                return 'fc.f'
            if backward_composition():
                return 'fc.b'
            if category.compare(result, left):
                if '[conj]' in result:
                    return 'conj2'
                raw_right = right
                if '[conj]' in right:
                    # drop the trailing '[conj]' (6 characters)
                    raw_right = right[:-6]
                # NOTE(review): the 'else' is attached to the nearest 'if'
                # (indentation was lost), so both outcomes give 'conj2'.
                if category.compare(result, raw_right):
                    return 'conj2'
                else:
                    return 'conj2'
        elif 'conj1' in source[0].rule or '[conj]' in left:
            return 'conj2'
        # consider conj3, to handle , separated lists

        # Function application
        # X = X/Y + Y
        if (left_parts[1] == '/' and
                category.compare(left_parts[2], right) and
                category.compare(left_parts[0], result)):
            return 'fa.f'
        # X = Y + X\Y
        if (right_parts[1] == '\\' and
                category.compare(right_parts[2], left) and
                category.compare(right_parts[0], result)):
            return 'fa.b'

        # Function composition
        if forward_composition():
            return 'fc.f'
        if backward_composition():
            return 'fc.b'

        # Crossed composition
        if crossed_composition():
            return 'cc.b'
        # Z\X = Z/Y + Y\X
        # ((S\NP)/S)/(S\NP) = ((S\NP)/S)/(S\NP) + (S\NP)\(S\NP)

        # Backward crossed substitution
        # X/Z = B/Z + (X\B)/Z
        if (left_parts[1] == right_parts[1] == result_parts[1] == '/' and
                category.compare(left_parts[2], result_parts[2]) and
                category.compare(right_parts[2], result_parts[2])):
            sub_parts = category.divide(right_parts[0])
            if (category.compare(sub_parts[0], result_parts[0]) and
                    category.compare(sub_parts[2], left_parts[0]) and
                    sub_parts[1] != left_parts[1]):
                return 'bs.f'
        # X\Z = (X/B)\Z + B\Z
        if (left_parts[1] == right_parts[1] == result_parts[1] == '\\' and
                category.compare(left_parts[2], result_parts[2]) and
                category.compare(right_parts[2], result_parts[2])):
            sub_parts = category.divide(left_parts[0])
            if (sub_parts[0] == result_parts[0] and
                    sub_parts[2] == right_parts[0] and
                    sub_parts[1] != right_parts[1]):
                return 'bs.b'
        # There are restrictions on what B can be, but since this is a parse, and
        # all other options have been exhausted, this must be what is going on

        # Uncomment to see what is misc:
        ### if left == result and '/' not in right and '\\' not in right:
        ###     pass
        ### elif right == result and '/' not in left and '\\' not in left:
        ###     pass
        ### elif '[conj]' in left or '[conj]' in right or '[conj]' in result:
        ###     pass
        ### else:
        ###     print 'misc rule:', left, right, result
        ###     print '   ', left_parts
        ###     print '   ', right_parts
        ###     print '   ', result_parts

        if result_parts[0] == right and result_parts[1] is not None:
            return 'conj1'
        return 'misc'
def determine_combinator(source, result):
    """Classify how the children in ``source`` combine to give ``result``.

    The answer is a short rule name.  With no children it is ``'lex'``.
    With one child it is ``'unary'`` when ``get_unary`` recognises the
    production and ``'type'`` otherwise.  With two children the checks run
    in this order and the first hit wins: a production known to
    ``get_binary`` (``'binary'``), coordination (``'conj1'``, ``'conj2'``,
    or an application/composition name when the coordinated right child
    fits one), forward/backward application (``'fa.f'``, ``'fa.b'``),
    forward/backward composition (``'fc.f'``, ``'fc.b'``), crossed
    composition (``'cc.b'``), substitution (``'bs.f'``, ``'bs.b'``), a
    last-chance ``'conj1'``, and finally ``'misc'``.

    Each child is read via ``.category``; with two children ``.rule`` of
    each child is consulted as well.

    NOTE(review): nesting was reconstructed from a whitespace-mangled
    original.  As written, more than two children yield None implicitly.
    """
    ### print len(source)
    ### print ' '.join(source), result
    arity = len(source)
    if arity == 0:
        return 'lex'
    if arity == 1:
        known_unary = get_unary(source[0].category, result) is not None
        return 'unary' if known_unary else 'type'
    if arity == 2:
        left = source[0].category
        right = source[1].category
        result_parts = category.divide(result)
        left_parts = category.divide(left)
        right_parts = category.divide(right)
        same = category.compare

        # Lazy predicates for the tests that appear both in the
        # coordination branch and in the general case.
        # X/Z = X/Y + Y/Z
        is_fc_f = lambda: (
            same(left_parts[2], right_parts[0])
            and same(left_parts[0], result_parts[0])
            and same(right_parts[2], result_parts[2])
            and left_parts[1] == right_parts[1] == result_parts[1] == '/')
        # X\Z = Y\Z + X\Y
        is_fc_b = lambda: (
            same(left_parts[2], result_parts[2])
            and same(left_parts[0], right_parts[2])
            and same(right_parts[0], result_parts[0])
            and left_parts[1] == right_parts[1] == result_parts[1] == '\\')
        # X/Z = Y/Z + X\Y
        # For example:
        # (S\NP)/(S\NP) = (S\NP)/(S\NP) + (S\NP)\(S\NP)
        is_cc_b = lambda: (
            same(left_parts[2], result_parts[2])
            and same(left_parts[0], right_parts[2])
            and same(right_parts[0], result_parts[0])
            and left_parts[1] == result_parts[1] == '/'
            and right_parts[1] == '\\')

        if get_binary(left, right, result) is not None:
            return 'binary'

        # Coordination
        # X = X CONJ X
        if left == 'conj' or (
                result.endswith('[conj]') and '[conj]' not in right):
            return 'fa.b' if right == 'conj\\conj' else 'conj1'
        elif 'conj' in source[1].rule or '[conj]' in right:
            if same(left, right):
                return 'conj2'
            if same(left_parts[2], right) and left_parts[1] == '/':
                return 'fa.f'
            if same(right_parts[0], left) and right_parts[1] is not None:
                # Grouping made explicit: A or (B and C).
                applies_backward = 'conj2' in source[1].rule or (
                    '[conj]' in right and same(right_parts[2], left))
                if applies_backward:
                    return 'fa.b'
                return 'conj1'
            if same(right_parts[2], left):
                return 'fa.b'
            if is_cc_b():
                return 'cc.b'
            if is_fc_f():
                return 'fc.f'
            if is_fc_b():
                return 'fc.b'
            if same(result, left):
                if '[conj]' in result:
                    return 'conj2'
                # Strip a trailing '[conj]' (6 characters) before comparing.
                raw_right = right[:-6] if '[conj]' in right else right
                # NOTE(review): indentation was lost; the 'else' is read as
                # belonging to this 'if', so either outcome gives 'conj2'.
                if same(result, raw_right):
                    return 'conj2'
                else:
                    return 'conj2'
        elif 'conj1' in source[0].rule or '[conj]' in left:
            return 'conj2'
        # consider conj3, to handle , separated lists

        # Function application
        # X = X/Y + Y
        if (left_parts[1] == '/'
                and same(left_parts[2], right)
                and same(left_parts[0], result)):
            return 'fa.f'
        # X = Y + X\Y
        if (right_parts[1] == '\\'
                and same(right_parts[2], left)
                and same(right_parts[0], result)):
            return 'fa.b'

        # Function composition
        if is_fc_f():
            return 'fc.f'
        if is_fc_b():
            return 'fc.b'

        # Crossed composition
        if is_cc_b():
            return 'cc.b'
        # Z\X = Z/Y + Y\X
        # ((S\NP)/S)/(S\NP) = ((S\NP)/S)/(S\NP) + (S\NP)\(S\NP)

        # Backward crossed substitution
        # X/Z = B/Z + (X\B)/Z
        if (left_parts[1] == right_parts[1] == result_parts[1] == '/'
                and same(left_parts[2], result_parts[2])
                and same(right_parts[2], result_parts[2])):
            sub_parts = category.divide(right_parts[0])
            if (same(sub_parts[0], result_parts[0])
                    and same(sub_parts[2], left_parts[0])
                    and sub_parts[1] != left_parts[1]):
                return 'bs.f'
        # X\Z = (X/B)\Z + B\Z
        if (left_parts[1] == right_parts[1] == result_parts[1] == '\\'
                and same(left_parts[2], result_parts[2])
                and same(right_parts[2], result_parts[2])):
            sub_parts = category.divide(left_parts[0])
            # Plain equality here (not category.compare), as in the original.
            if (sub_parts[0] == result_parts[0]
                    and sub_parts[2] == right_parts[0]
                    and sub_parts[1] != right_parts[1]):
                return 'bs.b'
        # There are restrictions on what B can be, but since this is a parse, and
        # all other options have been exhausted, this must be what is going on

        # Uncomment to see what is misc:
        ### if left == result and '/' not in right and '\\' not in right:
        ###     pass
        ### elif right == result and '/' not in left and '\\' not in left:
        ###     pass
        ### elif '[conj]' in left or '[conj]' in right or '[conj]' in result:
        ###     pass
        ### else:
        ###     print 'misc rule:', left, right, result
        ###     print '   ', left_parts
        ###     print '   ', right_parts
        ###     print '   ', result_parts

        if result_parts[0] == right and result_parts[1] is not None:
            return 'conj1'
        return 'misc'