예제 #1
0
파일: tree.py 프로젝트: steve3p0/LING511
    def embed_n_np(self, t: nltk.Tree):
        """Wrap the first noun of every adjacent ``NN NN`` pair in its own ``NP``.

        The tree is edited in place and nothing is returned.  For each direct
        child of ``t`` that is labelled ``NN`` and whose right sibling is also
        labelled ``NN``, the child is replaced by ``(NP (NN <first child>))``.
        All children are then processed recursively.

        The sibling test uses ``right_sibling()``, so only nodes that provide
        it (parented trees) are ever rewritten; other nodes are just walked.

        :param t: node to process; objects without ``label()`` (leaves) are
            ignored.
        """
        try:
            t.label()
        except AttributeError:
            # Leaves are plain tokens without a label: nothing to rewrite.
            return

        # Walk a snapshot of the children: replacing entries of ``t`` while
        # iterating over ``t`` itself would disturb the iteration.
        for position, child in enumerate(list(t)):
            try:
                if not (child.label() == child.right_sibling().label() == "NN"):
                    continue
                noun = nltk.ParentedTree("NN", [child[0]])
                noun_phrase = nltk.ParentedTree("NP", [noun])
            except (AttributeError, IndexError, ValueError):
                # Best effort, as before: skip children that are leaves, have
                # no right sibling (``right_sibling()`` returned None), lack
                # sibling links altogether, or cannot be re-wrapped.
                continue

            # Replace by index rather than remove()/insert(): remove() matches
            # by equality and could pick an identical-looking earlier sibling.
            # ``t`` is deliberately NOT rebound to a converted copy here, so
            # later rewrites and the recursion below act on the caller's tree.
            t[position] = noun_phrase

        for child in t:
            self.embed_n_np(child)
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write subtree-alignment training examples to ``/dev/shm/subaFeatEx.<base>``.

    For sentence pair ``i`` a ``Bead2`` is built from the two trees, the word
    alignment matrix and the gold subtree-alignment list.  One line is then
    written per candidate: first every ``bead.otherSuba`` entry labelled
    ``False``, then every ``bead.goldSuba`` entry labelled ``True``.  Each
    line has the form ``ID<sentID>--<suba>\\t<label>\\t<tab-joined features>``
    where ``sentID`` starts at ``base`` and grows by one per sentence pair.

    Progress (the index of every 1000th sentence) is printed to stderr.

    :param srcSnt: source sentences (whitespace-tokenised strings).
    :param tgtSnt: target sentences (whitespace-tokenised strings).
    :param srcTree: source trees, parallel to ``srcSnt``.
    :param tgtTree: target trees, parallel to ``srcSnt``.
    :param wa: one-line word alignments, parallel to ``srcSnt``.
    :param gsuba: one-line gold subtree alignments, parallel to ``srcSnt``.
    :param base: ID of the first sentence; also the output file suffix.
    """

    def writeExamples(out, bead, subaList, label, sentID):
        # One output line per candidate subtree alignment.
        for suba in subaList:
            feats = features(bead, suba)
            exampleID = str(sentID) + '--' + suba.__str__()
            out.write('ID' + exampleID + '\t' + str(label) + '\t' +
                      '\t'.join(feats) + '\n')

    # The context manager closes (and flushes) the file even when building a
    # bead or extracting features raises part-way through.
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:
        sentID = base
        for i in xrange(len(srcSnt)):
            if i % 1000 == 0:
                print >> sys.stderr, i,
            bead = Bead2(nltk.ParentedTree(srcTree[i]), nltk.ParentedTree(tgtTree[i]),
                         oneline2waMatrix(wa[i], len(srcSnt[i].split()), len(tgtSnt[i].split())),
                         oneline2subaList(gsuba[i]))

            writeExamples(f, bead, bead.otherSuba, False, sentID)  # negative examples
            writeExamples(f, bead, bead.goldSuba, True, sentID)  # positive examples
            sentID += 1
예제 #3
0
    def test_get_head_s(self):
        parse = """(S
  (S
    (NP
      (NP
        (DT The)
        (ADJP (RBS most) (JJ important))
        (JJ Taiwanese)
        (JJ musical)
        (NN master))
      (PP (IN of) (NP (DT the) (JJ last) (JJ half) (NN century)))))
  (, ,)
  (NP (PRP he))
  (VP
    (VBD was)
    (NP
      (NP (DT a) (JJ beloved) (NN teacher))
      (PP (IN to) (NP (JJ many)))))
  (. .))"""

        self.assertEqual(nltk.ParentedTree("VBD", ["was"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))

        parse_2 = "(S (`` `) (NP (NNP Bus) (NNP Stop) (POS ')))"

        self.assertEqual(nltk.ParentedTree("NNP", ["Stop"]),
                         self.head_finder.get_head(
                             nltk.ParentedTree.fromstring(parse_2)))
예제 #4
0
    def test_get_difficult_heads(self):
        parse = """(NP
  (S
    (VP
      (VP
        (VBG recalling)
        (NP (DT the) (JJ Korean) (NN delegation))
        (PP
          (IN to)
          (NP
            (DT the)
            (NNP Korean)
            (NML (NNP Military) (NNP Armistice))
            (NNP Commission))))
      (CC and)
      (VP
        (VBG setting)
        (PRT (RP up))
        (NP
          (NP
            (DT the)
            (NNP Panmunjom)
            (NNP Representative)
            (NNP Office))
          (PP
            (IN of)
            (NP
              (NP (DT the) (NNP Korean) (NNPS People) (POS 's))
              (NNP Army))))
        (PP (IN as) (NP (DT the) (JJ negotiatory) (NN organization))))))
  (, ,)
  (ADVP (FW etc)))"""

        parse2 = """(NP
  (QP (NNS Tens) (IN of) (NNS thousands))
  (PP (IN of) (NP (NNS people))))"""

        parse3 = """(NP
  (PRP he)
  (PRN
    (-LRB- -LRB-)
    (NP
      (NP (DT the) (NN one))
      (SBAR
        (WHNP (WP who))
        (S (VP (VBD tricked) (NP (DT these) (NNS people))))))
    (-RRB- -RRB-)))"""

        parse4 = """(UCP
  (NP (NN %um))
  (CC and)
  (S (NP (PRP you)) (VP (MD can) (VP (ADVP (RB also))))))"""

        self.assertEqual(nltk.ParentedTree("FW", ["etc"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
        self.assertEqual(nltk.ParentedTree("NNS", ["Tens"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse2)))
        self.assertEqual(nltk.ParentedTree("-LRB-", ["-LRB-"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse3)))
        self.assertEqual(nltk.ParentedTree("MD", ["can"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse4)))
예제 #5
0
def loadData(srcTrList, tgtTrList, waList, alignFunc, ruleExFlag,
             wordRulesFlag, minMemFlag, procID, verbose, extensiveRulesFlag,
             fractionalCountFlag, phraseRulesFlag, s2t):
    """Build a ``SntFrame`` per sentence pair and collect or dump its rules.

    Each entry of ``waList`` is a whitespace-separated list of ``a-b`` links
    whose sides are comma-separated integers; it is parsed into a list of
    ``([src ints], [tgt ints])`` pairs and handed to ``SntFrame`` together
    with the two trees and the remaining flags (passed through unchanged).

    Sentence pairs in which either tree has no leaves are skipped: nothing is
    written in min-memory mode, ``None`` is appended to the result otherwise,
    and they contribute no glue-rule labels.

    With ``minMemFlag`` set, rules are streamed to
    ``/dev/shm/hacept/rule.<procID>`` and ``ruleInv.<procID>`` (and, when
    ``s2t`` is set, glue rules to ``glueRule.<procID>``) instead of being
    kept in memory.  The files are closed before returning.

    :returns: ``(frames, basicGlueRuleTopLabels, basicGlueRuleLabels)`` where
        ``frames`` is ``[None]`` in min-memory mode and otherwise a list with
        one ``SntFrame`` (or ``None``) per sentence pair.  The two label sets
        are only filled when ``s2t`` is set.
    """
    basicGlueRuleTopLabels, basicGlueRuleLabels = set(), set()
    result = []
    f1 = f2 = gf1 = None

    try:
        if minMemFlag:
            # Several processes (one per procID) may get here at once, so
            # create the directory optimistically instead of check-then-create.
            try:
                os.mkdir('/dev/shm/hacept')
            except OSError:
                if not os.path.isdir('/dev/shm/hacept'):
                    raise
            f1 = codecs.open('/dev/shm/hacept/rule.' + str(procID), 'w', 'utf-8')
            f2 = codecs.open('/dev/shm/hacept/ruleInv.' + str(procID), 'w',
                             'utf-8')
            gf1 = codecs.open('/dev/shm/hacept/glueRule.' + str(procID), 'w',
                              'utf-8')

        for i in xrange(len(waList)):
            srcTr = nltk.ParentedTree(srcTrList[i])
            tgtTr = nltk.ParentedTree(tgtTrList[i])
            links = [item.split('-') for item in waList[i].split()]
            # Distinct comprehension names: in Python 2 they leak into the
            # function scope and would otherwise overwrite the loop index.
            wa = [([int(s) for s in link[0].split(',')],
                   [int(t) for t in link[1].split(',')]) for link in links]

            if len(srcTr.leaves()) == 0 or len(tgtTr.leaves()) == 0:
                if not minMemFlag:
                    result.append(None)
                # No frame was built for this pair, so there are no labels to
                # collect (previously a stale or undefined frame was read).
                continue

            tmpSntFrame = SntFrame(srcTr, tgtTr, wa, alignFunc, ruleExFlag,
                                   wordRulesFlag, extensiveRulesFlag,
                                   fractionalCountFlag, phraseRulesFlag,
                                   s2t, verbose)
            if minMemFlag:
                for rule in tmpSntFrame.ruleList:
                    f1.write(rule[0])
                    f2.write(rule[1])
                if s2t:
                    for rule in tmpSntFrame.glueRuleList:
                        gf1.write(rule[0])
            else:
                result.append(tmpSntFrame)

            if s2t:
                basicGlueRuleTopLabels.update(tmpSntFrame.basicGlueRuleTopLabels)
                basicGlueRuleLabels.update(tmpSntFrame.basicGlueRuleLabels)
    finally:
        # Flush and release whatever was opened, even on error.
        for handle in (f1, f2, gf1):
            if handle is not None:
                handle.close()

    if minMemFlag:
        return [None], basicGlueRuleTopLabels, basicGlueRuleLabels
    return result, basicGlueRuleTopLabels, basicGlueRuleLabels
예제 #6
0
 def test_get_head_sq(self):
     """The head of the ``SQ`` example is expected to be ``(VBP are)``."""
     expected = nltk.ParentedTree("VBP", ["are"])
     question = nltk.ParentedTree.fromstring(
         "(SQ (VBP are) (NP (PRP they)) (NP (DT all) (NNS liars)))")
     self.assertEqual(expected, self.head_finder.get_head(question))
예제 #7
0
    def test_get_head_sbarq(self):
        parse = """(SBARQ
  (WHADVP (WRB Where))
  (SQ (MD Should) (NP (NNP Chinese) (NNP Music)) (VP (VB Go)))
  (. ?))"""

        self.assertEqual(nltk.ParentedTree("MD", ["Should"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
예제 #8
0
    def test_get_head_pp(self):
        parse = """(PP
  (IN of)
  (NP
    (NP (NNS thousands))
    (PP (IN of) (NP (JJ non-profit) (NNS institutions)))))"""

        self.assertEqual(nltk.ParentedTree("IN", ["of"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
예제 #9
0
def isLegalTree(line, i):
    """Terminate the program if ``line`` cannot be parsed as a tree.

    ``line`` is fed to both ``nltk.Tree`` and ``nltk.ParentedTree``.  If
    either raises ``ValueError``, the line number ``i`` and the offending
    line are printed to stderr and the process exits with status 1.
    Nothing is returned when the line is accepted.

    :param line: bracketed tree string to validate.
    :param i: identifier of the line, used only in the error message.
    """
    try:
        # Constructed only for validation; the trees themselves are not needed.
        nltk.Tree(line)
        nltk.ParentedTree(line)
    except ValueError:
        print >> sys.stderr, "illegal tree!!! #" + str(i)
        print >> sys.stderr, line
        # sys.exit rather than the interactive-shell helper ``exit``, which is
        # only present when the site module has been loaded.
        sys.exit(1)
예제 #10
0
    def test_get_head_sbar(self):
        parse = """(SBAR
  (WHNP (WP who))
  (S
    (VP
      (VBD had)
      (VP
        (VBN had)
        (NP (NP (JJ enough)) (PP (IN of) (NP (NN schooling))))))))"""

        self.assertEqual(nltk.ParentedTree("WP", ["who"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
예제 #11
0
    def test_head_rule_cc(self):
        """For two coordinated ``NP`` children the expected head is ``(CC and)``."""
        coordination = """(NP
        (NP
            (NNS ruin))
        (CC and)
        (NP
            (NNS terror)))
        """

        expected = nltk.ParentedTree("CC", ["and"])
        found = self.head_finder.get_head(
            nltk.ParentedTree.fromstring(coordination))
        self.assertEqual(expected, found)
예제 #12
0
 def test_get_head_np(self):
     """Check the expected head of several ``NP`` trees, one case per row."""
     # (expected tag, expected word, parse) - checked in this order.
     cases = [
         ("NNS", "police", "(NP (JJ Local) (NNS police))"),
         ("NN", "shop", "(NP (JJ Local) (NN shop))"),
         ("NNP", "NBC", "(NP (NNP NBC) (POS 's))"),
         ("NN", "wedding", "(NP (NP (NP (PRP$ his) (NN brother) (POS 's)) (NN wedding)) (PP (IN in) (NP (NNP Khan) (NNPS Younes))))"),
         ("NNP", "Taiwan", "(NP (NNP Taiwan) (POS 's))"),
         ("NN", "port", "(NP (NP (NP (NNP Yemen) (POS 's)) (NN port)) (PP (IN of) (NP (NNP Aden))))"),
     ]
     for tag, word, bracketing in cases:
         self.assertEqual(
             nltk.ParentedTree(tag, [word]),
             self.head_finder.get_head(
                 nltk.ParentedTree.fromstring(bracketing)))
예제 #13
0
def flatten2one(tr):
    newLine = '(' + tr.node + ' '
    for subt in tr.subtrees():
        if subt.height() == 2:
            if isinstance(subt.node, str) and isinstance(subt[0], str):
                newLine += '(' + subt.node + ' ' + subt[0] + ') '
            else:
                print subt
                exit(1)
    newLine += ')'
    newTr = nltk.ParentedTree(newLine)
    #print 'newTr is: ', newTr
    return newTr
예제 #14
0
    def test_get_head_frag(self):
        parse = """(FRAG
  (PP (IN On) (NP (DT the) (NN internet) (NN type)))
  (NP (NNP Iraq))
  (: :)
  (NP (NNP Beyond) (NNP Abu) (NNP Ghraib))
  (: :)
  (NP
    (NP (NN Detention) (CC and) (NN torture))
    (PP (IN in) (NP (NNP Iraq))))
  (. .))"""

        self.assertEqual(nltk.ParentedTree(".", ["."]), self.head_finder.get_head(
            nltk.ParentedTree.fromstring(
            parse)))
예제 #15
0
def grow_branches(starting_node, from_list):
    """Recursively build the top-down subtree rooted at ``starting_node``.

    An ``SST_Token`` is a leaf and is returned unchanged.  For any other
    node, its ``children`` are ordered by the smallest value each reports
    from ``get_leaf_indices()``, grown recursively, and wrapped in an
    ``nltk.ParentedTree`` labelled with the node object itself.

    ``from_list`` is only forwarded to the recursive calls.
    """
    # Tokens are leaves: no further growth from there.
    if type(starting_node) is SST_Token:
        return starting_node

    def first_leaf(node):
        return min(node.get_leaf_indices())

    # Order the sub-branches left to right by their first contained leaf.
    ordered = sorted(starting_node.children, key=first_leaf)
    return nltk.ParentedTree(
        starting_node,
        [grow_branches(kid, from_list) for kid in ordered])
예제 #16
0
파일: tree.py 프로젝트: steve3p0/LING511
    def convert_adv_deg(self, t: nltk.Tree):
        """Relabel a leading degree adverb ("too" / "very") as ``Deg``.

        The tree is edited in place and nothing is returned.  In every
        ``ADJP`` or ``ADVP`` whose first child is ``(RB too)`` or
        ``(RB very)`` and whose second child is labelled ``RB`` or ``JJ``,
        the first child is replaced by ``(Deg <word>)``.  All children are
        then processed recursively.

        The check also calls ``right_sibling()`` on the adverb, so only nodes
        that provide it (parented trees) are ever rewritten.

        :param t: node to process; objects without ``label()`` (leaves) are
            ignored.
        """
        try:
            t.label()
        except AttributeError:
            # Leaves are plain tokens without a label: nothing to rewrite.
            return

        if t.label() in ["ADJP", "ADVP"]:
            try:
                adv = t[0]
                is_degree_adverb = (
                    adv.label() == "RB"
                    and t[1].label() in ["RB", "JJ"]
                    and adv[0] in ["too", "very"]
                    and adv.right_sibling().label() in ["RB", "JJ"]
                )
            except (AttributeError, IndexError):
                # Best effort, as before: phrases with fewer than two
                # children, leaf children, or nodes without sibling links
                # are simply left untouched.
                is_degree_adverb = False

            if is_degree_adverb:
                # Replace in place.  ``t`` is deliberately NOT rebound to a
                # converted copy, so the recursion below keeps acting on the
                # caller's tree.
                t[0] = nltk.ParentedTree("Deg", [adv[0]])

        for child in t:
            self.convert_adv_deg(child)
예제 #17
0
def flatten(line, level):
    """Flatten subtrees of height ``level`` in the tree parsed from ``line``.

    Trees of height 3 or less are returned unchanged.  Otherwise ``level``
    is capped at the tree's height and every subtree of exactly that height
    is replaced by ``flatten2one`` of itself; if the matching subtree has no
    parent it becomes the whole result and the scan stops.

    :returns: the resulting tree's ``pprint()`` output with all whitespace
        runs collapsed to single spaces.
    """
    tr = nltk.ParentedTree(line.strip())
    height = tr.height()

    if height > 3:
        # A level above the tree's height means "flatten as much as possible".
        level = min(level, height)

        for node in tr.subtrees():
            if node.height() != level:
                continue
            parent = node.parent()
            if parent:
                parent[node.parent_index()] = flatten2one(node)
            else:
                tr = flatten2one(node)
                break

    return ' '.join(tr.pprint().split())
예제 #18
0
 def test_get_head_ucp(self):
     """The head of the ``UCP`` example is expected to be ``(NN trade)``."""
     expected = nltk.ParentedTree("NN", ["trade"])
     phrase = nltk.ParentedTree.fromstring("(UCP (JJ economic) (CC and) (NN trade))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #19
0
 def test_get_head_vp(self):
     """The head of a single-verb ``VP`` is expected to be that verb."""
     expected = nltk.ParentedTree("VB", ["shoot"])
     phrase = nltk.ParentedTree.fromstring("(VP (VB shoot))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #20
0
 def test_get_head_x(self):
     """The head of the ``X`` example is expected to be ``(: --)``."""
     expected = nltk.ParentedTree(":", ["--"])
     phrase = nltk.ParentedTree.fromstring("(X (NNS Men) (CC or) (: --))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #21
0
 def test_get_head_intj(self):
     """The head of the ``INTJ`` example is expected to be ``(UH oh)``."""
     expected = nltk.ParentedTree("UH", ["oh"])
     phrase = nltk.ParentedTree.fromstring("(INTJ (UH oh) (PRP$ my) (NNP god))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #22
0
 def test_get_head_adjp(self):
     """The head of the coordinated ``ADJP`` is expected to be ``(JJ twelfth)``."""
     expected = nltk.ParentedTree("JJ", ["twelfth"])
     phrase = nltk.ParentedTree.fromstring("(ADJP (JJ twelfth) (CC and) (JJ thirteenth))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #23
0
 def test_get_head_whnp(self):
     """The head of a single-word ``WHNP`` is expected to be that word."""
     expected = nltk.ParentedTree("WP", ["who"])
     phrase = nltk.ParentedTree.fromstring("(WHNP (WP who))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #24
0
 def test_get_head_advp(self):
     """The head of a single-word ``ADVP`` is expected to be that word."""
     expected = nltk.ParentedTree("RB", ["here"])
     phrase = nltk.ParentedTree.fromstring("(ADVP (RB here))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #25
0
 def test_get_head_qp(self):
     """The head of the ``QP`` example is expected to be ``(CD forty)``."""
     expected = nltk.ParentedTree("CD", ["forty"])
     phrase = nltk.ParentedTree.fromstring("(QP (CD forty) (HYPH -) (CD five))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #26
0
 def test_get_head_whadvp(self):
     """The head of a single-word ``WHADVP`` is expected to be that word."""
     expected = nltk.ParentedTree("WRB", ["how"])
     phrase = nltk.ParentedTree.fromstring("(WHADVP (WRB how))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))
예제 #27
0
    def loadData(cls, filename):
        """
        Return a list of Bead instances.
        Load trees, word alignments, and subtree alignments from a file, and create Bead instances.

        The cleaned file content is split on blank lines into blocks; the
        last element of that split is not processed.  Within a block:

        * ``SOURCE`` / ``TARGET`` lines carry a tree starting at column 8.
          A source starting with ``ERROR``, or a tree without leaves, makes
          the whole block be skipped.
        * ``<mapping>`` ... ``</mapping>`` lines hold two comma-separated
          fields; each yields a tuple ``(x1, y1, x2, y2)`` where the first
          item of each field is decremented by one and the last is kept.
        * ``<alignment>`` ... ``</alignment>`` lines hold two comma-separated
          index lists; items equal to -1 are ignored, the others are
          decremented by one and every source/target combination is set to 1
          in the word alignment matrix.
        * ``</bead>`` ends the block.

        :type filename: str
        :param filename: a file with trees, word alignments, and subtree alignments for multiple sentence pairs.

        """
        import codecs
        # The context manager closes the file even if reading it fails.
        with codecs.open(filename, 'r', 'utf-8') as f:
            rawText = f.read()
        blocks = util.cleanData(rawText).split('\n\n')

        beadList = []

        srcTree = None
        tgtTree = None
        wordAlignment = None
        subtreeAlignment = []

        for block in blocks[:-1]:
            block = block.split('\n')
            i = 0
            errFlag = False
            while i < len(block):
                line = block[i]
                if line.startswith('SOURCE'):
                    if line[8:].startswith('ERROR'):
                        errFlag = True
                        break
                    srcTree = nltk.ParentedTree(line[8:])
                    if srcTree.leaves() == []:
                        errFlag = True
                        break
                    i += 1
                    continue

                elif line.startswith('TARGET'):
                    tgtTree = nltk.ParentedTree(line[8:])
                    if tgtTree.leaves() == []:
                        errFlag = True
                        break
                    # One row per source leaf, one column per target leaf.
                    wordAlignment = [[
                        0 for j in xrange(len(tgtTree.leaves()))
                    ] for k in xrange(len(srcTree.leaves()))]
                    i += 1
                    continue

                elif line.startswith('<mapping>'):
                    i += 1
                    line = block[i]
                    while not line.startswith('</mapping>'):
                        fields = line.split()
                        srcSpan = fields[0].split(',')
                        x1 = int(srcSpan[0]) - 1
                        x2 = int(srcSpan[-1])

                        tgtSpan = fields[1].split(',')
                        y1 = int(tgtSpan[0]) - 1
                        y2 = int(tgtSpan[-1])

                        subtreeAlignment.append((x1, y1, x2, y2))
                        i += 1
                        line = block[i]

                    i += 1
                    continue

                elif line.startswith('<alignment>'):
                    i += 1
                    line = block[i]
                    while not line.startswith('</alignment>'):
                        fields = line.split()
                        srcIndexes = [
                            int(item) - 1
                            for item in fields[0].split(',')
                            if int(item) != -1
                        ]
                        tgtIndexes = [
                            int(item) - 1
                            for item in fields[1].split(',')
                            if int(item) != -1
                        ]
                        for srcIndex in srcIndexes:
                            for tgtIndex in tgtIndexes:
                                wordAlignment[srcIndex][tgtIndex] = 1
                        i += 1
                        line = block[i]

                    i += 1
                    continue

                elif line.startswith('</bead>'):
                    break

                i += 1

            if not errFlag:
                beadList.append(
                    cls(srcTree, tgtTree, wordAlignment, subtreeAlignment,
                        False, False, False, False, False))
            # Reset the per-block state, whether or not a bead was created.
            srcTree, tgtTree, wordAlignment, subtreeAlignment = None, None, None, []

        return beadList
예제 #28
0
 def test_get_head_nml(self):
     """The head of the two-noun ``NML`` is expected to be ``(NN curtain)``."""
     expected = nltk.ParentedTree("NN", ["curtain"])
     phrase = nltk.ParentedTree.fromstring("(NML (NN air) (NN curtain))")
     self.assertEqual(expected, self.head_finder.get_head(phrase))