# NOTE(review): this is the body of a loop whose header is not visible here;
# presumably it runs once per `tree` -- confirm against the full file.
# `tree` looks like an NLTK-style tree (it offers leaves() and subtrees()) -- TODO confirm.
# Concatenate the tree's leaves into one string; it is the lookup key used below.
word=''.join(tree.leaves())

  if word in Vec:  # only words that are keys of Vec (our vocabulary/corpus) are processed; all others are ignored

    # Value stored for this word in Vec (presumably its collapsed tag -- verify).
    tag_collapsed=Vec[word]

    # Tag2Word is the reverse index: each Vec value maps to the set of words carrying it.
    if tag_collapsed in Tag2Word:
      Tag2Word[tag_collapsed].add(word)
    else:
      Tag2Word[tag_collapsed]={word}


    # Keep every tree whose word was found in Vec.
    Forest.append(tree)

    # Assign an integer code to every distinct subtree string, in order of first appearance.
    for subtree in tree.subtrees():

      string=''.join(subtree.leaves())

      # First time this string is seen: record it in both directions
      # (string -> code and code -> string), then advance the counter.
      if not string in Str2Code:
        Str2Code[string]=type_count
        Code2Str[type_count]=string
        type_count +=1

        

    

# Status report: the running code counter next to the number of entries in Vec.
print('\nCurrent type count is:', type_count)
print('while current Vec size is:', len(Vec))
# Temporary debugging output: type_count is bumped once per new Str2Code entry, so the
# two numbers should agree if type_count started at 0 with Str2Code empty -- TODO confirm.
print('tmp test', type_count,len(Str2Code))
Пример #2
0
  string=''.join(tree.leaves())

  #If the word has occurred in the corpus...
  
  if string in Vec:

    Forest.append(tree)

    

    tag_set=Vec[string]
    
    S.append(tree)

    for s in tree.subtrees():
      Symbols2.add(''.join(s.leaves()))

    while S:

      current_tree=S.pop()

      string=''.join(current_tree.leaves())

      Symbols.add(string)


      # propagate the tagset to current tree, note: one useless update for the root node
      if string in Vec:
        Vec[string].update(tag_set)
      else: