Example #1
def main():
    sents = 0
    words_tot = 0
    yngve_tot = 0
    frazier_tot = 0
    nodes_tot = 0
    for line in sys.stdin:
        if line.strip() == "":
            continue
        t = Tree.parse(line)
        words = calc_words(t)
        words_tot += words
        sents += 1
        yngve = calc_yngve(t, 0)
        yngve_avg = float(yngve)/words
        yngve_tot += yngve_avg
        nodes = calc_nodes(t)
        nodes_avg = float(nodes)/words
        nodes_tot += nodes_avg
        frazier = calc_frazier(t, 0, "")
        frazier_avg = float(frazier)/words
        frazier_tot += frazier_avg
        # print "Sentence=%d\twords=%d\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words, yngve_avg, frazier_avg, nodes_avg)
    yngve_avg = float(yngve_tot)/sents
    frazier_avg = float(frazier_tot)/sents
    nodes_avg = float(nodes_tot)/sents
    words_avg = float(words_tot)/sents
    print "Total\tsents=%d\twords=%f\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words_avg, yngve_avg, frazier_avg, nodes_avg)
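A hedged driver sketch for main() above: it reads one bracketed parse per line from stdin and skips blank lines, so a quick smoke test can substitute an in-memory stream (the sample sentence is made up; calc_words, calc_yngve, calc_frazier and calc_nodes are assumed to be defined elsewhere in the same module).

import sys
import StringIO

# One Penn-style parse per line, exactly what the loop in main() expects.
sys.stdin = StringIO.StringIO("(S (NP (DT The) (NN cat)) (VP (VBD sat)) (. .))\n")
main()  # prints the Total line for this single sentence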
Example #2
def load_ace_file(textfile, fmt):
    print('  - %s' % os.path.split(textfile)[1])
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()
    
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s,e,typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
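A minimal usage sketch for the generator above (the .sgm path is hypothetical; os, re, ET and nltk are assumed to be imported at module level, as the function already requires). In 'binary' mode named entities come back as subtrees labeled 'NE'; in 'multiclass' mode the label is the ACE entity type.

for tree in load_ace_file('ace_data/APW20001001.2021.0521.sgm', fmt='binary'):
    # NLTK-2-era Tree API: the node label is read via .node
    ne_spans = [' '.join(st.leaves()) for st in tree.subtrees() if st.node == 'NE']
    print ne_spans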
Example #3
    def test_current_production(self):
        inputs_ = [("""
                (S
                    (sentence
                        (type_1_sentence_coord_1
                        (type_1_sentence_coord_2
                            (type_2_sentence
                            (THERE There)
                            (AUX is)
                            (Noun_Phrase
                                (det (DET an))
                                (Noun_w_support
                                (Adj_phrase
                                    (Adj_core (JJ small))
                                    (AND and)
                                    (Adj_phrase (Adj_core (JJ red))))
                                (Noun_Count (NN apple)))))))
                        (PERIOD .)))
                """, Production(Nonterminal("S"), [Nonterminal("sentence")]))]

        for i, (input_, expect_) in enumerate(inputs_):
            tree = Tree.parse(input_)
            production = current_production(tree)

            self.assertEqual(expect_, production)
Example #4
 def parse_trees(self, flatten=False):
   trees = []
   for sentence in self.result['sentences']:
     ptree = Tree.parse(sentence['parsetree'])
     if flatten:
       ptree = flatten_deeptree(ptree)
     trees.append(ptree)
   return trees
Example #5
 def parse_trees(self, flatten=False):
     trees = []
     for sentence in self.result['sentences']:
         ptree = Tree.parse(sentence['parsetree'])
         if flatten:
             ptree = flatten_deeptree(ptree)
         trees.append(ptree)
     return trees
Example #6
 def loadHeadTrees(self,filename):
     """load trees with head annotated with ps2ds"""
     trees = []
     inf = codecs.open(filename,'r','utf-8')
     for s in inf.readlines():
         head_tree = Tree.parse(s)
         head_tree = Tree('TOP',[head_tree]) # coordinate with original tree structure
         trees.append(head_tree)
     return trees
Example #7
def get_semantics_from_parse_tree(parse_tree_string):
    """Take a string representing the parse tree as input, and print the
    semantic parse. The result list consists of a list of tuples, with each
    tuple containing the VerbNet frame and its associated tree."""
    parse_tree = Tree.parse(parse_tree_string)
    # parse_tree.draw()

    split_clause_dict = split_clauses(parse_tree)

    for key, (clause, conjunction) in split_clause_dict.items():
        activized_clause = activize_clause(clause)
        split_clause_dict[key] = (activized_clause, conjunction)

    result_list = []

    for position, (clause, conjunction) in split_clause_dict.items():
        split_tree_dict = split_conjunctions(clause)

        if conjunction != "":
            result_list.append(conjunction)

        for split, (split_tree, conjunction) in split_tree_dict.items():
            if conjunction != "":
                result_list.append(conjunction)

            for tree in split_tree:
                tree = existential_there_insertion(tree)
                tree = invert_clause(tree)
                tree = wh_movement(tree)

                tree.draw()

                # Regex for finding verbs
                verb_finder = re.compile(r"(?<=VB[ DGNPZ]) *\w*(?=\))")

                # Get the lemma of the verb for searching verbnet
                verbs = (word.strip().lower() for word in verb_finder.findall(str(tree)))

                for verb in verbs:

                    lemmatized_verb = lemmatizer.lemmatize(verb, "v")
                    vfo_list = create_VerbFrameObjects(lemmatized_verb)

                    match_list = []

                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)

                        if match:
                            match_list.append(match)

                    best_match = pick_best_match(match_list)
                    if best_match is not None:
                        result_list.append((best_match, tree))

    return result_list
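A hedged usage sketch: the bracketed sentence is made up, and the module-level helpers the function relies on (split_clauses, lemmatizer, create_VerbFrameObjects, pick_best_match, ...) are assumed to be importable alongside it. Note that result_list mixes (match, tree) tuples with bare conjunction strings, and that tree.draw() inside the function opens a window per clause.

parse_string = "(S (NP (DT The) (NN dog)) (VP (VBD chased) (NP (DT the) (NN cat))) (. .))"
for item in get_semantics_from_parse_tree(parse_string):
    if isinstance(item, tuple):
        best_match, clause_tree = item   # VerbNet frame match and its clause tree
        print best_match
    else:
        print 'conjunction:', item       # conjunctions are appended as plain strings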
Example #8
 def _parse_trees_output(output_):
     res = []
     cur_lines = []
     for line in output_.splitlines(False):
         if line == '':
             res.append(Tree.parse('\n'.join(cur_lines)))
             cur_lines = []
         else:
             cur_lines.append(line)
     return res
Example #9
 def _parse_trees_output(output_):
     res = []
     cur_lines = []
     for line in output_.splitlines(False):
         if line == '':
             res.append(Tree.parse('\n'.join(cur_lines)))
             cur_lines = []
         else:
             cur_lines.append(line)
     return res
Example #10
    def _parse(self, t):
        try:
            return Tree.parse(self._normalize(t))

        except ValueError, e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = Tree.parse(self._normalize(t+')'*n))
                        sys.stderr.write("  Recovered by adding %d close "
                                         "paren(s)\n" % n)
                        return v
                    except ValueError: pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            #sys.stderr.write(' '.join(t.split())+'\n')
            return Tree('S', self._tag(t))
Example #11
def load_parse_doc(parse_path):
    parse_path = os.path.abspath(parse_path)
    parses = []
    with open(parse_path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line == '':
                continue
            parse = Tree.parse(line)
            parses.append(parse)
    return parses
Example #12
def build_tagged_sents(files):
    """
    Build the corpus of tagged sentences from the files of the sequoia corpus.
    """
    sents = []
    for fname in files:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            t = Tree.parse(line)
            sents.append(t.pos())
        fin.close()
    return sents
Example #13
def build_tagged_sents(files):
    """
    Build the corpus of tagged sentences from the files of the sequoia corpus.
    """
    sents = []
    for fname in files:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            t = Tree.parse(line)
            sents.append(t.pos())
        fin.close()
    return sents
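Hypothetical usage of the function above (the filename and the French test sentence are invented): the [(token, tag), ...] sentences produced by Tree.pos() can feed an NLTK tagger trainer directly.

import nltk

tagged_sents = build_tagged_sents(["sequoia-corpus.parse"])
tagger = nltk.UnigramTagger(tagged_sents)
print tagger.tag(u"Le chat dort .".split())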
Example #14
 def _load_sent_token(self):
     print "Loading sentences and tokens..."
     sent_elmts = self.c_root.findall(CTAKES_PREFIX + 'textspan.Sentence')
     t_counter = 0
     for sent_elmt in sent_elmts:
         sent_begin = int(sent_elmt.get('begin'))
         sent_end = int(sent_elmt.get('end'))
         sent_num = int(sent_elmt.get('sentenceNumber'))
         cursor = sent_begin
         sent_span = []
         token_offset = 0
         while cursor < sent_end:
             buf = self._find_token_elmt_with_attrib_of_val('begin', cursor)
             if len(buf) == 0:
                 cursor = cursor + 1
                 continue
             elif len(buf) > 1:
                 print 'More than one token appears to begin at ' + str(cursor) + \
                     '\nLoading ctakes xml file terminated'
                 return
             else:
                 token_elmt = buf[0]
                 t = Token(self.ds_id + '_t_' + str(t_counter))
                 t.type = token_elmt.tag.split('.')[-1][:-5]
                 # skipping 'newline' token when counting up tid
                 t_num = int(token_elmt.get('tokenNumber')) - sent_num
                 if t_num != t_counter:
                     print 'CAUTION: t_num does not equal t_counter'
                 t.offset = token_offset
                 t.begin = int(token_elmt.get('begin'))
                 t.end = int(token_elmt.get('end'))
                 t.pos = token_elmt.get('partOfSpeech')
                 t.n_form = token_elmt.get('normalizedForm')
                 #t.c_form = token_elmt.get('canonicalForm')
                 #t.cap = int(token_elmt.get('capitalization'))
                 #t.num_p = int(token_elmt.get('numPosition'))
                 self.tokens.append(t)
             sent_span.append(t)
             cursor = t.end + 1
             token_offset = token_offset + 1
             t_counter += 1
             
         s = Sentence(self.ds_id + '_s_' + str(sent_num))
         s.span = sent_span
         s.num = sent_num
         #s.begin = sent_begin
         #s.end = sent_end
         s.parse = Tree.parse(self.p_fp.next())
         for t in s.span:
             t.sent = s
         self.sents.append(s)  
     return
Example #15
    def __init__(self, json_file):
        data = json.load(json_file)
        for k, v in data.iteritems():
            self.__setattr__(k, v)
        self.__raw_data = data # for future reference

        #print data
        self.spantree = SpanTree.parse(self.goldparse)
        self.spantree.convert()
        self.goldparse = Tree.parse(self.goldparse)

        self.text = data['text'].split()
        self.treebank_sentence = data['treebank_sentence'].split()
Example #16
    def findAmbiguities(self, line):
        result = self.parse(line)

        #if 'coref' in result:
        #    return 1

        trees = []
        retval = 0
        for i in range(len(result['sentences'])):
            tree = Tree.parse(result['sentences'][i]['parsetree'])
            trees.append(tree)
            # Since tree[0] is an S
            for subtree in tree:
                retval = max(retval, self.exploreSubTree(subtree))
        return retval
Example #17
 def findAmbiguities(self,line):    
     result = self.parse(line) 
     
     #if 'coref' in result:
     #    return 1
 
     trees = []
     retval = 0
     for i in range(len(result['sentences'])):
         tree = Tree.parse(result['sentences'][i]['parsetree'])
         trees.append(tree)
          # Since tree[0] is an S
         for subtree in tree:
             retval = max(retval, self.exploreSubTree(subtree))
     return retval
Example #18
    def read(klass, path=KNOWLEDGE_PATH):
       
        if not path:
            raise Exception("Specify a path to the verbframes.json as $WIMKB")

        with open(path, 'rb') as kbfile:
            data = json.load(kbfile, encoding="utf8")

            kwargs = {}
            for frame in data['frames']:
                for mapping in frame['mappings']:
                    # Update mapping with frame object
                    mapping['frame']   = frame['frame']

                    # Convert string reprs of Trees
                    mapping['verbmap'] = Tree.parse(mapping['verbmap'])

                    if 'parse' in mapping:
                        mapping['parse']   = Tree.parse(mapping['parse']) 

                # Convert kwargs
                kwargs[frame['frame']] = frame['mappings']

        return klass(**kwargs)
Example #19
    def read(klass, path=KNOWLEDGE_PATH):

        if not path:
            raise Exception("Specify a path to the verbframes.json as $WIMKB")

        with open(path, 'rb') as kbfile:
            data = json.load(kbfile, encoding="utf8")

            kwargs = {}
            for frame in data['frames']:
                for mapping in frame['mappings']:
                    # Update mapping with frame object
                    mapping['frame'] = frame['frame']

                    # Convert string reprs of Trees
                    mapping['verbmap'] = Tree.parse(mapping['verbmap'])

                    if 'parse' in mapping:
                        mapping['parse'] = Tree.parse(mapping['parse'])

                # Convert kwargs
                kwargs[frame['frame']] = frame['mappings']

        return klass(**kwargs)
Example #20
def tag_ptree(ptree, coreflist):
    """Tags given parse tree with coreferences

    Args:
        ptree: string, parenthesized string representation of parse tree
        coreflist: list of tuples, [('1', {'text': 'dog', 'ref': None})]

    Returns:
        string, tagged parse tree

    >>> ptree = '(S NP( (NN He)) VP( (V ran)))'
    >>> coreflist = [('1', {'text': 'He', 'ref': None})]
    >>> tag_ptree(ptree, coreflist)
    '(S NP( COREF_TAG_1( (NN He))) VP( (V ran)))'

    """
    pattern = r"""(?P<lp>\(?\s*)       # left parenthesis
                  (?P<tg>[a-zA-Z$]+)?  # POS tag
                  (?P<data>\s*%s)      # subtree of tag
                  (?P<rp>(?:\s*\))*)   # right parenthesis
               """
    for cid, coref in coreflist[::-1]:
        words = ''.join(word_tokenize(coref['text']))

        nltktree = Tree.parse(ptree)
        nltktree.reverse()  # perform search right to left
        data = None
        for subtree in nltktree.subtrees():  # preorder traversal
            if ''.join(subtree.leaves()) == words:  # equal ignoring whitespace
                data = subtree.pprint()
                break

        # If found via the subtree search of the parse tree
        if data:
            ptree = ptree.replace(data, '( COREF_TAG_%s%s)' % (cid, data))
        else:  # Try finding via regex matching instead
            dpattern = r'\s*'.join([r'\(\s*[a-zA-Z$]+\s+%s\s*\)' % word
                                    for word in word_tokenize(coref['text'])])
            found = re.findall(pattern % dpattern, ptree, re.X)
            if found:
                repl = '%s%s ( COREF_TAG_%s%s) %s' % (found[0][0],
                                                      found[0][1],
                                                      cid,
                                                      found[0][2],
                                                      found[0][3])
                ptree = re.sub(pattern % dpattern, repl, ptree, 1, re.X)

    return ptree
Example #21
 def parseQuestion(self, text):
     question = Question()
     print "RECEIVED DATA IS\n" + text
     wordList = nltk.word_tokenize(text)
     i = 0
     tokens = list()
     for word in wordList:
         print "WORD: "+str(word)
         if str(word).strip() not in ("", ".", "?", "!", ","):
             tokens.append(word)
         i+=1
     print tokens
     question.setTokens(tokens)
     result = self.parse(text)
     tree = Tree.parse(result['sentences'][0]['parsetree'])
     print TreeUtils.findPocs(tree)
Example #22
def create_trees_nltk(filename):    
    f = open(filename, "r")

    response = f.readlines(); f.close()
    valid_tree_texts = []   
    tree_text = '' 
    for line in response:
        line = line.strip()
        if(line == ""):
            valid_tree_texts.append(tree_text)
            tree_text = ""            
        else:
            tree_text += line+" "        
    trees = [Tree.parse(line) for line in valid_tree_texts]
    
    for i in range(len(trees)):
        trees[i].chomsky_normal_form() 
    
    return trees
Example #23
def create_trees_nltk(filename):
    f = open(filename, "r")

    response = f.readlines()
    f.close()
    valid_tree_texts = []
    tree_text = ''
    for line in response:
        line = line.strip()
        if (line == ""):
            valid_tree_texts.append(tree_text)
            tree_text = ""
        else:
            tree_text += line + " "
    trees = [Tree.parse(line) for line in valid_tree_texts]

    for i in range(len(trees)):
        trees[i].chomsky_normal_form()

    return trees
Example #24
def test_nltk_trees(parsed_text):
    ''' Example of parsed_text, stanford parser output :
    
        (ROOT
  (S
    (ADVP (RB However))
    (NP
      (NP (DT the) (NNS talks))
      (, ,)
      (VP (VBN hosted)
        (PP (IN by)
          (NP (NNP Douglas) (NNP Hurd))))
      (, ,))
    (VP (VBD ended)
      (PP (IN in)
        (NP (NN stalemate))))
    (. .)))
    
    '''
    nltree = Tree.parse(parsed_text)
    nltree.chomsky_normal_form()
    nltree.draw()
Example #25
def test_nltk_trees(parsed_text):
    
    ''' Example of parsed_text, stanford parser output :
    
        (ROOT
  (S
    (ADVP (RB However))
    (NP
      (NP (DT the) (NNS talks))
      (, ,)
      (VP (VBN hosted)
        (PP (IN by)
          (NP (NNP Douglas) (NNP Hurd))))
      (, ,))
    (VP (VBD ended)
      (PP (IN in)
        (NP (NN stalemate))))
    (. .)))
    
    ''' 
    nltree = Tree.parse(parsed_text)
    nltree.chomsky_normal_form()
    nltree.draw()
Example #26
def _process_parse(parse, coreflist):
    """Tags parse tree with corefs and returns the tree, lexicon, dependencies
    and raw text as tuple

    Args:
        parse: list of stanford corenlp parsed sentences
        coreflist: list of coreferences from tagged xml

    Returns:
        tuple, (ptree, lexicon, dependencies, rawtext) if parse contains a
            sentence, else returns None

    """
    sentence = parse.get('sentences')
    if sentence:
        ptree = Tree.parse(tag_ptree(sentence[0]['parsetree'], coreflist))
        words = [(w[0], w[1]) for w in sentence[0]['words']]
        depends = [(d[0], d[1], d[2]) for d in sentence[0]['dependencies']]
        text = sentence[0]['text']

        return ptree, words, depends, text
    else:
        return None
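A hedged sketch of the input _process_parse expects. The field names mirror the corenlp-style dictionaries used in the other examples here, and the coreflist format matches tag_ptree's docstring; the sentence itself is made up, and tag_ptree (shown earlier) plus Tree, word_tokenize and re are assumed to be in scope.

parse = {'sentences': [{
    'parsetree': '(S (NP (PRP He)) (VP (VBD ran)) (. .))',
    'words': [('He', {}), ('ran', {}), ('.', {})],
    'dependencies': [('nsubj', 'ran', 'He')],
    'text': 'He ran.',
}]}
coreflist = [('1', {'text': 'He', 'ref': None})]

ptree, words, depends, text = _process_parse(parse, coreflist)
print ptree  # the NP over "He" is now wrapped in a COREF_TAG_1 node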
Example #27
def read_trees(filename, treelist, check=True):
    buffer = []
    for line in open(filename):
        if not line.strip():
            continue
        if line.startswith("(") and buffer:
            tree = ' '.join(buffer)
            tree = re.sub('\s+', ' ', tree)
            treelist.append(tree)
            buffer = []
        buffer.append(line.rstrip())
    if buffer:
        tree = ' '.join(buffer)
        tree = re.sub('\s+', ' ', tree)
        treelist.append(tree)

    if check:
        for idx, tree in enumerate(treelist):
            try:
                t = Tree.parse(tree)
                s = "  ".join(t.leaves())
            except ValueError:
                assert False, "f: %s, i: %s, t: %s" %(filename, idx, tree)
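Hypothetical usage of read_trees above (the filenames are made up): each call appends the whitespace-normalized bracketings from one file to a shared list, and check=True re-parses every tree as a sanity check.

treelist = []
for fname in ('train.parse', 'dev.parse'):
    read_trees(fname, treelist, check=True)
print len(treelist), 'trees loaded'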
Example #28
    def test_nltk_trees(self):
        parsed_text =  """ (S
    (NP (PRP He))
    (VP (VBZ reckons)
      (SBAR
        (S
          (NP (DT the) (JJ current) (NN account) (NN deficit))
          (VP (MD will)
            (VP (VB narrow)
              (PP (TO to)
                (NP
                  (QP (RB only) (# #) (CD 1.8) (CD billion))))
              (PP (IN in)
                (NP (NNP September))))))))
    (. .)) """ 
#        parsed_text = """(S
#    (S
#      (NP
#        (NP (JJS Most))
#        (PP (IN of)
#          (NP (DT the) (NN commodity) (NN traffic))))
#      (VP (VBD was)
#        (ADJP (RP off))))
#    (, ,)
#    (NP (DT the) (NN company))
#    (VP (VBD said))
#    (. .)) """ 
#        """(S
#    (NP (DT The) (NN cat))
#    (VP (VBD sat)
#      (PP (IN on)
#        (NP (DT a) (NN mat))))
#    (. .))"""
        nltree = Tree.parse(parsed_text)
        nltree.chomsky_normal_form()
        nltree.draw()
Example #29
 def test_nltk_trees(self):
     parsed_text = """ (S
 (NP (PRP He))
 (VP (VBZ reckons)
   (SBAR
     (S
       (NP (DT the) (JJ current) (NN account) (NN deficit))
       (VP (MD will)
         (VP (VB narrow)
           (PP (TO to)
             (NP
               (QP (RB only) (# #) (CD 1.8) (CD billion))))
           (PP (IN in)
             (NP (NNP September))))))))
 (. .)) """
     #        parsed_text = """(S
     #    (S
     #      (NP
     #        (NP (JJS Most))
     #        (PP (IN of)
     #          (NP (DT the) (NN commodity) (NN traffic))))
     #      (VP (VBD was)
     #        (ADJP (RP off))))
     #    (, ,)
     #    (NP (DT the) (NN company))
     #    (VP (VBD said))
     #    (. .)) """
     #        """(S
     #    (NP (DT The) (NN cat))
     #    (VP (VBD sat)
     #      (PP (IN on)
     #        (NP (DT a) (NN mat))))
     #    (. .))"""
     nltree = Tree.parse(parsed_text)
     nltree.chomsky_normal_form()
     nltree.draw()
Example #30
def main():
  for line in sys.stdin:
    t = Tree.parse(line)
    t.draw()
Example #31
# sys.exit()

mode = 0
parse = ""

first_tree = True
for line in sys.stdin:
    # print 'mode:', mode
    line = line[:-1]  # remove newline

    if line == "Leaves:":
        assert mode == -2

        if mode == -2:
            t = Tree.parse(parse)
            assert t
            if not first_tree:
                print ""
            first_tree = False
            print t.pprint()
            parse = ""
        mode = 0
        continue

    if line.startswith("Tree:"):
        mode -= 1
    elif line == "-----":
        mode -= 1
    else:
        assert abs(mode) < 3
Example #32
def demo():
    import random
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')

    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
                          fill='cyan')

    treetok = Tree.parse('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0,9999)
    def color2(treeseg):
        treeseg.node()['fill'] = '#%06d' % random.randint(0,9999)
        treeseg.node().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2]+10, 10)

    tree3 = Tree.parse('''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3]+10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)

    tree4 = Tree.parse('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #33
File: tree.py Project: gijs/nltk
def demo():
    import random

    def fill(cw):
        cw["fill"] = "#%06d" % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse(
        """
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
    )

    tc = TreeWidget(
        cf.canvas(),
        t,
        draggable=1,
        node_font=("helvetica", -14, "bold"),
        leaf_font=("helvetica", -12, "italic"),
        roof_fill="white",
        roof_color="black",
        leaf_color="green4",
        node_color="blue2",
    )
    cf.add_widget(tc, 10, 10)

    def boxit(canvas, text):
        big = ("helvetica", -16, "bold")
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")

    treetok = Tree.parse("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node["color"] = "#%04d00" % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.node()["fill"] = "#%06d" % random.randint(0, 9999)
        treeseg.node().child()["color"] = "white"

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    tree3 = Tree.parse(
        """
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
    )
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2)
    tc3["draggable"] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        if treewidget["orientation"] == "horizontal":
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
            treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
            treewidget["orientation"] = "vertical"
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
            treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
            treewidget["orientation"] = "horizontal"

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    tree4 = Tree.parse("(S (NP this tree) (VP (V is) (Adj horizontal)))")
    tc4 = TreeWidget(
        cf.canvas(),
        tree4,
        draggable=1,
        line_color="brown2",
        roof_color="brown2",
        node_font=("helvetica", -12, "bold"),
        node_color="brown4",
        orientation="horizontal",
    )
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #34
fout = open('tree.tex', 'w')
print >> fout, r'''\documentclass[tikz]{standalone}
\usepackage{CJKutf8}
\usepackage{color}
\usepackage{tikz}
\usepackage{tikz-qtree}
\thispagestyle{empty}
\begin{document}
\begin{CJK}{UTF8}{gbsn}

\begin{tikzpicture}'''
f = open(parse_file)
for i, s in enumerate(f):
    if i == line_num:
        s = s.replace('$', '\$')
        tree = Tree.parse(s)
        if flag == '0':
            h = tree.height()
            print >> fout, '''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format(
                h * 28)
            for pos in tree.treepositions('leaves'):
                tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}'
            idx = 0
            for line in tree.pprint_latex_qtree().split('\n'):
                if ';' in line:
                    line = line.replace('{',
                                        '\\node(n{}) {{'.format(idx)).replace(
                                            '}', '};').replace('%', '\%')
                    idx += 1
                print >> fout, line
            for i in range(idx):
Example #35
    TreeView(*trees).mainloop()
    return

##//////////////////////////////////////////////////////
##  Demo Code
##//////////////////////////////////////////////////////

import random
if __name__ == '__main__':
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)
    
    cf = CanvasFrame(width=550, height=450, closeenough=2)

    tree = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))
    ''', leafparser = lambda t: Token(TEXT=t))
                
    tc = TreeWidget(cf.canvas(), tree, draggable=1, 
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)
    
    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
Example #36
fout = open('tree.tex','w')
print >>fout,r'''\documentclass[tikz]{standalone}
\usepackage{CJKutf8}
\usepackage{color}
\usepackage{tikz}
\usepackage{tikz-qtree}
\thispagestyle{empty}
\begin{document}
\begin{CJK}{UTF8}{gbsn}

\begin{tikzpicture}'''
f = open(parse_file)
for i,s in enumerate(f):
    if i == line_num:
        s = s.replace('$','\$')
        tree = Tree.parse(s)
        if flag == '0':
            h = tree.height()
            print >>fout,'''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format(h*28)
            for pos in tree.treepositions('leaves'):
                tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}'
            idx = 0
            for line in tree.pprint_latex_qtree().split('\n'):
                if ';' in line:
                    line = line.replace('{','\\node(n{}) {{'.format(idx)).replace('}','};').replace('%','\%')
                    idx += 1
                print >>fout,line
            for i in range(idx):
                print >>fout,'\draw (n{} |- 0,{}pt) node {{{}}};'.format(i,-h*28-10,i)
        else:
            print >>fout,r'\begin{scope}'
Example #37
#!/usr/bin/python

from nltk.tree import Tree
import sys

# A program to display parse trees (in Penn treebank format) with NLTK
#
#  To install NLTK on ubuntu: sudo apt-get install python-nltk

for line in sys.stdin:
    t = Tree.parse(line)
    t.draw()
Example #38
    parser = argparse.ArgumentParser()
    parser.add_argument('ptb', action='store', help="ptb.json file")
    parser.add_argument('json', action='store', help="json input file")
    parser.add_argument('jsonout', action='store', help="json output file")
    parser.add_argument('-verbose', action='store_true')
    arguments = parser.parse_args(sys.argv[1:])

    treebank = json.load(open(arguments.ptb))

    docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', arguments.json).groups()
    #print treebank.keys()
    #print docId
    #int(docId)
    sentNr = int(sentNr)
    data = json.load(open(arguments.json))

    if arguments.verbose:
        from nltk.tree import Tree
        sys.stderr.write("text:\n%s\n" % data['text'])
        sys.stderr.write("%s\n" %(treebank[docId][sentNr]))

        t = Tree.parse(treebank[docId][sentNr])
        sys.stderr.write("%s\n" %(" ".join(t.leaves())))

    assert docId in treebank
    #print treebank[docId]
    assert int(sentNr) < len(treebank[docId])

    data['ptbparse'] = treebank[docId][sentNr]
    json.dump(data, open(arguments.jsonout, 'w'), indent=2, sort_keys=True)
Example #39
#!/usr/bin/env python

# check if parse from .onf is equal to parse obtained from penn treebank

import sys
from collections import defaultdict
from itertools import imap, izip
import json
import re
from nltk.tree import Tree

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('json', action='store', help="json input file")
    arguments = parser.parse_args(sys.argv[1:])

    data = json.load(open(arguments.json))

    ptb = Tree.parse(data['ptbparse'])
    onf = Tree.parse(data['goldparse'])

    equal = ptb[0].pprint() == onf[0].pprint()
    if not equal:
        print "0 parses from ptb and .onf differ in %s" % arguments.json
    if equal:
        print "1 parses from ptb and .onf do NOT differ in %s" % arguments.json
        #print ptb[0].pprint()
        #print onf[0].pprint()
Example #40
are brackets annotated. Export the content as a regular annotated corpus
for POS-tagger training.
"""

import sys, codecs
from nltk.tree import Tree

def treeSentenceToTuples(sent):
	"""
	:param sent: a Tree representing a sentence
	:type sent: nltk.tree.Tree
	"""
	return [u"%s/%s"%(t,p) for t,p in sent.pos() if not t in ["-LRB-", "-RRB-"]]

if __name__ == "__main__":
	if len(sys.argv) < 3:
		print "Usage:\n\t%s <destination> <corpus>" % sys.argv[0]
		sys.exit(-1)
	dest = sys.argv[1]
	fout = codecs.open(dest, "w", "utf-8")
	for fname in sys.argv[2:]:
		fin = codecs.open(fname, "r", "utf-8")
		for line in fin:
			t = Tree.parse(line)
			tokens = treeSentenceToTuples(t)
			fout.write(u" ".join(tokens))
			fout.write("\n")
		fin.close()
	fout.close()
	
Example #41
import json
# from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
import jsonrpclib
from pprint import pprint


class StanfordNLP:
    def __init__(self, port_number=8080):
        self.server = jsonrpclib.Server("http://localhost:%d" % port_number)

    def parse(self, text):
        return json.loads(self.server.parse(text))

nlp = StanfordNLP()
result = nlp.parse("Hello world!  It is so beautiful.")
pprint(result)

from nltk.tree import Tree
tree = Tree.parse(result['sentences'][0]['parsetree'])
pprint(tree)
Example #42
File: tree.py Project: sp00/nltk
def demo():
    import random
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')

    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
                          fill='cyan')

    treetok = Tree.parse('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0,9999)
    def color2(treeseg):
        treeseg.node()['fill'] = '#%06d' % random.randint(0,9999)
        treeseg.node().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2]+10, 10)

    tree3 = Tree.parse('''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3]+10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)

    tree4 = Tree.parse('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
Example #43
def process_file(json_filename, nb):
    docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', json_filename).groups()
    sentNr = int(sentNr)
    data = json.load(open(json_filename))
    data['nom'] = []

    # index adjustments for consistency with ontonotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree) # wrap traces

    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree) # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)

    ptbstring_tok = add_spaces(ptbstring, onfstring)

    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(), ignore_braces=True)

    #print ptbstring
    #print ptbstring_tok
    #print onfstring
    #print tokenize_offsets
    #print trace_offsets

    pt = SpanTree.parse(data['ptbparse'])

    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']

        # TODO: arguments that are chains or concatenations of multiple nodes

        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id is not None and depth is not None:
                treepos = pt.get_treepos(leaf_id, depth)
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print 'looking for trace', trace_id
                    tracepos = pt.find_trace(trace_id)
                    if tracepos is not None:
                        print 'trace %s found! Here:', tracepos
                        print pt[tracepos].pprint()
                        treepos = tracepos
                    else:
                        break # could not follow trace

                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                #print start, end,

                # adjust for different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])

                # adjust for inserted traces in ontonotes
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
                #print '->', start, end

            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end+1])
            new_args.append( [role, pos, start, end, phrase] )

        nb_data['args'] = new_args
        data['nom'].append(nb_data)

        #print nb_data
    json.dump(data, open(json_filename, 'w'), indent=2, sort_keys=True)
Example #44
import json
# from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
import jsonrpclib
from pprint import pprint


class StanfordNLP:
    def __init__(self, port_number=8080):
        self.server = jsonrpclib.Server("http://localhost:%d" % port_number)

    def parse(self, text):
        return json.loads(self.server.parse(text))


nlp = StanfordNLP()
result = nlp.parse("Hello world!  It is so beautiful.")
pprint(result)

from nltk.tree import Tree
tree = Tree.parse(result['sentences'][0]['parsetree'])
pprint(tree)