Пример #1
0
    def convert_file(self, input_file, output_file):
        """
        Write the plain text of a word-layer markup file to output_file.

        input_file  ... iterable of lines containing <para>, <w>, <token> and
                        <no_space_after> elements
        output_file ... object with a write() method that receives the text

        Each <token> value is appended to the pending word followed by one
        space; a <no_space_after> value of "1" drops the last character of the
        pending word again. The pending word is written when its closing </w>
        is met. A <para> line seen after a </para> line writes an empty line.
        """
        pending_word = ""
        para_closed = False   # a </para> was seen and no <para> since then
        inside_word = False   # a <w ...> was seen and its </w> not yet
        for line in input_file:
            if "</para>" in line:
                para_closed = True
            elif "<para>" in line and para_closed:
                output_file.write('\n\n')
                para_closed = False

            elif "<w " in line:
                inside_word = True
            elif "</w>" in line and inside_word:
                output_file.write(pending_word)
                pending_word = ""
                inside_word = False

            elif "<token>" in line:
                pending_word += get_interstring(line, '>', '<') + " "
            elif "<no_space_after>" in line:
                if get_interstring(line, '>', '<') == "1":
                    # undo the separator appended after the token
                    pending_word = pending_word[:-1]
Пример #2
0
    def process_coref( self, first_line, actual_info, perspron): # -> Coreference_record
        """
        reads coreferent ID and creates a new coreference record

        first_line  ... line that opened the coreference element; it decides
                        how many following lines are read from self.pdt_t_input
        actual_info ... pair ( actual_ID, actual_type ) of the referring node
        perspron    ... passed unchanged to Coreference_record
        returns a Coreference_record, or None when either of the IDs is None
        """
        ( actual_ID, actual_type ) = actual_info
        coref_ID = ""
        if "<coref_gram" in first_line:
            # the ID is on the line right after the opening tag
            line = self.pdt_t_input.readline()
            coref_ID = get_interstring( line, '>', '<')
        elif "<coref_text" in first_line:
            self.pdt_t_input.readline() # <LM>
            line = self.pdt_t_input.readline()
            coref_ID = get_interstring( line, '>', '<')
        # any other first_line (e.g. <coref_special - segments or exophorae,
        # which don't refer to any word in the file) reads nothing and leaves
        # coref_ID as the empty string

        coref_type = self.ID_type( coref_ID) # string "word", "dropped", "sentence", "other"

        if coref_type == "dropped":
            # replace a reference to a dropped pronoun with a reference to
            # its non-dropped supernode
            coref_ID = self.get_dropped( coref_ID)

        if actual_ID is None or coref_ID is None:
            return None
        return Coreference_record( actual_type == "dropped", actual_ID, coref_type == "dropped", coref_ID, perspron)
Пример #3
0
 def read_infos( self, first_line): # -> (string, Node_type)
     """
     Extract the node id from the first line of a node and classify it.

     Returns a pair: the id string (the substring of first_line between its
     first two double quotes) and the node type that self.get_node_type
     reports for that id.
     """
     node_id = get_interstring( first_line, '"', '"')
     return ( node_id, self.get_node_type( node_id))
Пример #4
0
 def get_paragraph_ID( self, id_string): # -> int
     """
     Return the paragraph number encoded in the id of a node.

     id_string ... id of a sentence, e.g. t-lnd94103-052-p1s11
     Only the part after the last '-' is examined; the number is taken from
     between 'p' and 's' in that part.
     """
     tail = id_string.rpartition( '-')[2]
     return int( get_interstring( tail, 'p', 's'))
Пример #5
0
 def next_pdt_word(self):
     """
     Read self.pdt_w_input forward to the next word and return it.

     Returns a pair (id, token): the id comes from the '<w id' line, the
     token from the line that follows it. Returns ("", "") when a line
     containing '</doc>' or the end of the input is reached first.
     Every '<para' line passed on the way increments self.para_ID and
     resets self.sent_ID to 1.
     """
     while True:
         current = self.pdt_w_input.readline()
         # readline() gives "" only at the end of the input
         if current == "" or "</doc>" in current:
             return ("", "")
         if "<para" in current:
             self.para_ID += 1
             self.sent_ID = 1
         elif "<w id" in current:
             word_ID = get_interstring(current, '"', '"')
             token = get_interstring(self.pdt_w_input.readline(), '>', '<')
             return (word_ID, token)
Пример #6
0
    def process_coref( self, first_line, actual_info): # -> Coreference_record
        """
        reads coreferent ID and creates a new coreference record

        first_line  ... line that opened the coreference element; it decides
                        how many following lines are read from self.pdt_t_input
        actual_info ... pair ( actual_ID, actual_type ): id string and
                        Node_type (word or dropped) of the referring node
        returns a Coreference_record, or None when either of the IDs is None
        """
        ( actual_ID, actual_type ) = actual_info
        coref_ID = ""
        if "<coref_gram" in first_line:
            # the ID is on the line right after the opening tag
            line = self.pdt_t_input.readline()
            coref_ID = get_interstring( line, '>', '<')
        elif "<coref_text" in first_line:
            self.pdt_t_input.readline() # <LM>
            line = self.pdt_t_input.readline()
            coref_ID = get_interstring( line, '>', '<')
        # any other first_line (e.g. <coref_special - segments or exophorae,
        # which don't refer to any word in the file) reads nothing and leaves
        # coref_ID as the empty string

        coref_type = self.get_node_type( coref_ID)

        if coref_type == Node_type.Dropped:
            # replace a reference to a dropped pronoun with a reference to
            # its non-dropped supernode
            coref_ID = self.get_dropped( coref_ID)

        if actual_ID is None or coref_ID is None:
            return None
        # creating a new coreference record
        return Coreference_record( actual_type == Node_type.Dropped, actual_ID, coref_type == Node_type.Dropped, coref_ID)
Пример #7
0
 def paragraph_ID( self, string):
     """
     Return, as an int, the number found between 'p' and 's' in the part of
     string that follows its last '-'.
     """
     tail = string.rpartition( '-')[2]
     return int( get_interstring( tail, 'p', 's'))
Пример #8
0
 def read_infos( self, string):
     """
     Return a pair ( id, type ): the id is what get_interstring extracts from
     string using '"' as both delimiters, the type is what self.ID_type
     reports for that id.
     """
     node_id = get_interstring( string, '"', '"')
     return ( node_id, self.ID_type( node_id))