Python tokenize_string_without_punctuations示例，utils.tokenize_string_without_punctuations Python示例

示例#1

0

显示文件

文件： identify_interaction.py 项目： rogersjeffreyl/SET_Project

    def identify_interaction(self,path_to_test_file):
        """Label every sentence of an XML file using the scored bigrams.

        The file is parsed with ``parse`` (imported outside this view;
        presumably ``xml.dom.minidom.parse`` -- confirm).  For each
        ``sentence`` element the ``text`` attribute is tokenized with
        ``utils.tokenize_string_without_punctuations`` and its bigrams are
        looked up, in order, in ``self.bigrams_based_on_score``.

        Args:
            path_to_test_file: path of the XML file to classify.

        Returns:
            A dict mapping each sentence ``id`` attribute to the value stored
            in ``self.bigrams_based_on_score`` for the first bigram that is
            present there, or to the string ``"false"`` when no bigram of the
            sentence is present.

        Raises:
            KeyError: if a ``sentence`` element lacks a ``text`` or ``id``
                attribute.
        """
        # The document tree is completely built by parse(), so the handle is
        # only needed for that one call; ``with`` also closes it when parsing
        # fails.
        with open(path_to_test_file,'rb') as document_data:
            xml_data=parse(document_data)

        result={}
        for sentence in xml_data.getElementsByTagName("sentence"):
            sentence_attrs=dict(sentence.attributes.items())
            sentence_text=sentence_attrs["text"]
            sentence_id=sentence_attrs["id"]
            words=utils.tokenize_string_without_punctuations(sentence_text)

            for bigram in utils.generate_bigrams(words):
                # Direct membership test on the mapping: no intermediate key
                # list has to be built and scanned for every bigram.
                if bigram in self.bigrams_based_on_score:
                    result[sentence_id]=self.bigrams_based_on_score[bigram]
                    # Single parenthesised argument: prints the same text
                    # under both the print statement and the print function.
                    print("bigram as a result of which classification is done:"+bigram)
                    break
            else:
                # Loop finished without a break: no known bigram in sentence.
                result[sentence_id]="false"
        return result

示例#2

0

显示文件

文件： get_test_data.py 项目： rogersjeffreyl/SET_Project

   def parse_ddi_corpus(self,file):
        """Read one corpus XML file and record its entities and pair labels.

        The file is parsed with ``parse`` (imported outside this view;
        presumably ``xml.dom.minidom.parse`` -- confirm).  For every
        ``sentence`` element this method:

        * increments ``self.total_sentence_count``;
        * stores each ``entity`` in ``self.entity_collection`` under its
          ``id`` as ``{"text": ..., "type": ...}`` (both lower-cased);
        * stores each ``pair`` label in ``self.pairs_collection`` under the
          pair ``id``: ``"true"`` becomes ``"1"``, ``"false"`` or a missing
          ``ddi`` attribute becomes ``"0"``, any other value is kept as is;
        * records the label in ``self.interaction_collection``, a two-level
          dict keyed by the lower-cased entity texts; a label already stored
          for a pair of texts is never overwritten.

        Args:
            file: path of the XML file to read.

        Raises:
            KeyError: if an ``entity`` lacks ``id``/``text``/``type``, if a
                ``pair`` lacks ``id``/``e1``/``e2``, or if a pair refers to an
                entity id that is not in ``self.entity_collection``.
        """
        # The document tree is completely built by parse(), so the handle is
        # only needed for that one call; ``with`` also closes it when parsing
        # fails.
        with open(file,'r') as document_data:
            xml_data=parse(document_data)

        label_codes={"true":"1","false":"0"}

        for sentence in xml_data.getElementsByTagName("sentence"):
            self.total_sentence_count+=1

            for entity in sentence.getElementsByTagName("entity"):
                entity_attrs=dict(entity.attributes.items())
                entity_id=entity_attrs["id"]
                self.entity_collection[entity_id]={
                    "text":entity_attrs["text"].lower(),
                    "type":entity_attrs["type"].lower(),
                }

            for pair in sentence.getElementsByTagName("pair"):
                pair_attrs=dict(pair.attributes.items())

                # A missing attribute counts as "no interaction"; values other
                # than "true"/"false" pass through unchanged.
                raw_label=pair_attrs.get("ddi","0")
                ddi=label_codes.get(raw_label,raw_label)

                self.pairs_collection[pair_attrs["id"]]=ddi

                entity_1=self.entity_collection[pair_attrs["e1"]]["text"]
                entity_2=self.entity_collection[pair_attrs["e2"]]["text"]

                # File the label under whichever of the two texts already has
                # an entry (first entity preferred); setdefault keeps a label
                # that was stored earlier for the same two texts.
                if entity_1 in self.interaction_collection:
                    self.interaction_collection[entity_1].setdefault(entity_2,ddi)
                elif entity_2 in self.interaction_collection:
                    self.interaction_collection[entity_2].setdefault(entity_1,ddi)
                else:
                    self.interaction_collection[entity_2]={entity_1:ddi}

示例#3

0

显示文件

文件： ddi_parser_corpus.py 项目： rogersjeffreyl/SET_Project

    def parse_ddi_corpus(self,file):
        """Read one corpus XML file and update the per-label n-gram statistics.

        The file is parsed with ``parse`` (imported outside this view;
        presumably ``xml.dom.minidom.parse`` -- confirm).  For every
        ``sentence`` element this method:

        * increments ``self.total_sentence_count``;
        * tokenizes the ``text`` attribute with
          ``utils.tokenize_string_without_punctuations``;
        * stores each ``entity`` in ``self.entity_collection`` under its
          ``id`` as ``{"text": ..., "type": ...}`` (both lower-cased);
        * removes known entity texts from the token list;
        * for each ``pair`` with ``ddi="true"``, records the pair ``type``
          (``"null"`` when absent) in ``self.interaction_collection``, a
          two-level dict keyed by the lower-cased entity texts;
        * labels the sentence in ``self.sentence_interaction_information`` as
          ``"true"`` when it has at least one ``ddi="true"`` pair, otherwise
          ``"false"``, and feeds the remaining tokens to the matching
          unigram/bigram/trigram counters.  Sentences with no ``true`` or
          ``false`` pair are labelled ``"false"`` but not counted.

        A ``pair`` without a ``ddi`` attribute is ignored instead of raising.

        Args:
            file: path of the XML file to read.

        Raises:
            KeyError: if a ``sentence`` lacks ``text``/``id``, an ``entity``
                lacks ``id``/``text``/``type``, or a ``ddi="true"`` pair lacks
                ``e1``/``e2`` or refers to an unknown entity id.
        """
        # The document tree is completely built by parse(), so the handle is
        # only needed for that one call; ``with`` also closes it when parsing
        # fails.
        with open(file,'r') as document_data:
            xml_data=parse(document_data)

        for sentence in xml_data.getElementsByTagName("sentence"):
            # Per-sentence flags, reset for every sentence.
            has_positive_pair=False
            has_negative_pair=False
            self.total_sentence_count+=1

            sentence_attrs=dict(sentence.attributes.items())
            sentence_text=sentence_attrs["text"]
            sentence_id=sentence_attrs["id"]
            words=utils.tokenize_string_without_punctuations(sentence_text)

            for entity in sentence.getElementsByTagName("entity"):
                entity_attrs=dict(entity.attributes.items())
                entity_id=entity_attrs["id"]
                self.entity_collection[entity_id]={
                    "text":entity_attrs["text"].lower(),
                    "type":entity_attrs["type"].lower(),
                }

            # Removing drugs.
            # NOTE(review): this walks every entity stored on self so far,
            # not only the entities of the current sentence, and
            # list.remove() drops just one matching token per stored entity.
            # Left unchanged because the n-gram counts depend on it --
            # confirm that this is the intended filtering.
            for entity_info in self.entity_collection.values():
                drug_text=entity_info["text"]
                if drug_text in words:
                    words.remove(drug_text)

            for pair in sentence.getElementsByTagName("pair"):
                pair_attrs=dict(pair.attributes.items())
                interaction_type=pair_attrs.get("type","null")
                # .get(): an unlabelled pair is neither positive nor negative.
                ddi=pair_attrs.get("ddi")

                if ddi=="true":
                    has_positive_pair=True

                    entity_1=self.entity_collection[pair_attrs["e1"]]["text"]
                    entity_2=self.entity_collection[pair_attrs["e2"]]["text"]

                    # File the type under whichever of the two texts already
                    # has an entry (first entity preferred); setdefault keeps
                    # a type that was stored earlier for the same two texts.
                    if entity_1 in self.interaction_collection:
                        self.interaction_collection[entity_1].setdefault(entity_2,interaction_type)
                    elif entity_2 in self.interaction_collection:
                        self.interaction_collection[entity_2].setdefault(entity_1,interaction_type)
                    else:
                        self.interaction_collection[entity_2]={entity_1:interaction_type}
                elif ddi=="false":
                    # Only consulted below when no positive pair was found, so
                    # no extra guard is needed here.
                    has_negative_pair=True

            if has_positive_pair:
                self.sentence_interaction_information[sentence_id]="true"
                self.positive_sentence_count+=1
                self.populate_unigram_hashes(words,self.interaction_true_word_count)
                self.update_bigram_hash(words,self.interaction_true_bigram_count)
                self.update_trigram_hash(words,self.interaction_true_trigram_count)
            else:
                if has_negative_pair:
                    self.negative_sentence_count+=1
                    self.populate_unigram_hashes(words,self.interaction_false_word_count)
                    self.update_bigram_hash(words,self.interaction_false_bigram_count)
                    self.update_trigram_hash(words,self.interaction_false_trigram_count)
                self.sentence_interaction_information[sentence_id]="false"