Python tokenize 예제들, matching.string_functions.tokenize Python 예제들

예제 #1

0

파일 보기

파일: gen_geo_features.py 프로젝트: dssg/machine_learning_legislation

    def operate(self, instance):
        """
        Attach this generator's geographic feature group to ``instance``.

        The group is stored at ``instance.feature_groups[self.name]`` and holds
        four features (has_state, has_county, has_city, ends_with_state), each
        computed by the matching ``geo_inferred_text_*`` helper from the
        instance's "entity_inferred_name" attribute.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        s = instance.attributes["entity_inferred_name"]
        tokens = tokenize(normalize_no_lower(s))

        def add(feature_name, value):
            # Every feature is keyed by the same name it carries internally.
            group[feature_name] = Feature(feature_name, value)

        # NOTE(review): the "FEAUTURE" spelling is part of the runtime feature
        # names; it is kept byte-for-byte since consumers presumably rely on it.
        add('GEO_FEAUTURE_geo_inferred_text_has_state',
            geo_inferred_text_has_state(tokens, self.full, self.full_upper,
                                        self.abbr))
        add('GEO_FEAUTURE_geo_inferred_text_has_county',
            geo_inferred_text_has_county(s))
        add('GEO_FEAUTURE_geo_inferred_text_has_city',
            geo_inferred_text_has_city(s, self.cities, self.cities_upper))
        add('GEO_FEAUTURE_geo_inferred_text_ends_with_state',
            geo_inferred_text_ends_with_state(tokens, self.abbr, self.full))

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)

예제 #2

0

파일 보기

파일: label_new_documents.py 프로젝트: dssg/machine_learning_legislation

 def get_state(self, row):
     """
     Return the state for the first token of ``row`` that matches.

     Each token is checked, in order, against the capitalized state names,
     the state names and the state abbreviations. The first two are mapped
     through their corresponding ``*_dict``; an abbreviation is returned
     unchanged. Returns None when no token matches.
     """
     for token in string_functions.tokenize(row):
         if token in self.capitalized_state_names:
             return self.capitalized_state_names_dict[token]
         elif token in self.state_names:
             return self.state_names_dict[token]
         elif token in self.state_abbreviations:
             return token
     return None

예제 #3

0

파일 보기

파일: label_new_documents.py 프로젝트: dssg/machine_learning_legislation

 def get_state(self, row):
     """
     Scan the tokens of ``row`` and return the first state they identify.

     A token found in ``capitalized_state_names`` or ``state_names`` is
     translated via the matching ``*_dict``; a token found in
     ``state_abbreviations`` is returned as-is. None means no token matched.
     """
     found = None
     for candidate in string_functions.tokenize(row):
         if candidate in self.capitalized_state_names:
             found = self.capitalized_state_names_dict[candidate]
             return found
         if candidate in self.state_names:
             found = self.state_names_dict[candidate]
             return found
         if candidate in self.state_abbreviations:
             found = candidate
             return found
     return found

예제 #4

0

파일 보기

파일: label_new_documents.py 프로젝트: dssg/machine_learning_legislation

    def label_row(self, row, column_indices, table_offset, congress, chamber,
                  document_type, number, sponsor_indices):
        """
        Score one table row and store it as a candidate earmark.

        An instance is built from ``row``, scored with
        ``self.model.decision_function`` and inserted into
        ``candidate_earmarks``; one row per matched sponsor is then inserted
        into ``sponsors`` and the transaction is committed.

        Sponsors are searched in the "|"-separated cells of the instance's
        "entity_text" at the positions given by ``sponsor_indices``: every
        token present in ``self.sponsor_coder.sponsors[congress]`` is kept.
        A failure while processing one cell is printed/logged and skipped.
        """
        instance = self.get_instance_from_row(row, column_indices)
        X, _, _ = pipe.instances_to_matrix(
            [instance], feature_space=self.feature_space, dense=False)
        scores = self.model.decision_function(X)
        fields = [
            'congress', 'chamber', 'document_type', 'number', 'row',
            'row_offset', 'row_length', 'score', 'state', 'sponsors'
        ]
        # One placeholder per column, so the two lists cannot drift apart.
        placeholders = ", ".join(["%s"] * len(fields))
        cmd = ("insert into candidate_earmarks (" + ", ".join(fields) +
               ") values (" + placeholders + ") returning id")
        attributes = instance.attributes
        entity_text = attributes['entity_text']
        state = self.geo_coder.get_state(entity_text)
        cur = self.conn.cursor()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if sponsor_indices:
            print(sponsor_indices)

        # Split once instead of once per sponsor index.
        cells = entity_text.split("|") if sponsor_indices else []
        sponsors = []
        for index in sponsor_indices:
            try:
                sponsors_in_cell = string_functions.tokenize(
                    string_functions.normalize_no_lower(cells[index]))
                for sic in sponsors_in_cell:
                    if sic in self.sponsor_coder.sponsors[congress]:
                        sponsors.append(sic)
            except Exception:
                # Best effort: report the offending cell and keep going.
                print("Index: %d" % index)
                print(len(cells))
                print(entity_text)
                logging.exception("SCREW UP")

        # The joined sponsor string is truncated to 1024 characters.
        sponsors_string = "|".join(sponsors)[:1024]

        cur.execute(cmd, (congress, chamber, document_type, number,
                          entity_text, row.offset + table_offset,
                          row.length, scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]

        for sponsor in sponsors:
            cur.execute(
                'insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)',
                (curr_id, sponsor))

        self.conn.commit()

예제 #5

0

파일 보기

파일: entity_text_bag_feature_generator.py 프로젝트: dssg/machine_learning_legislation

    def operate(self, instance):
        """
        Add one feature per allowed token of the instance's inferred name.

        The "entity_inferred_name" attribute is normalized and tokenized;
        every token not in ``self.forbidden`` yields a feature named
        ``self.feature_prefix + token.lower()`` with value 1, stored in
        ``instance.feature_groups[self.name]``.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        tokens = tokenize(normalize_no_lower(instance.attributes["entity_inferred_name"]))

        for token in tokens:
            if token in self.forbidden:
                continue
            feature_name = self.feature_prefix + token.lower()
            group[feature_name] = Feature(feature_name, 1)

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)

예제 #6

0

파일 보기

파일: gen_geo_features.py 프로젝트: dssg/machine_learning_legislation

    def operate(self, instance):
        """
        Build the geographic feature group for ``instance``.

        Four features (has_state, has_county, has_city, ends_with_state) are
        computed by the matching ``geo_inferred_text_*`` helpers from the
        instance's "entity_inferred_name" attribute and stored in
        ``instance.feature_groups[self.name]``.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        s = instance.attributes["entity_inferred_name"]
        tokens = tokenize(normalize_no_lower(s))

        def store(feature_name, value):
            # The dict key and the Feature's own name are always identical.
            group[feature_name] = Feature(feature_name, value)

        # NOTE(review): the "FEAUTURE" spelling is part of the runtime feature
        # names; it is kept byte-for-byte since consumers presumably rely on it.
        store('GEO_FEAUTURE_geo_inferred_text_has_state',
              geo_inferred_text_has_state(tokens, self.full, self.full_upper, self.abbr))
        store('GEO_FEAUTURE_geo_inferred_text_has_county',
              geo_inferred_text_has_county(s))
        store('GEO_FEAUTURE_geo_inferred_text_has_city',
              geo_inferred_text_has_city(s, self.cities, self.cities_upper))
        store('GEO_FEAUTURE_geo_inferred_text_ends_with_state',
              geo_inferred_text_ends_with_state(tokens, self.abbr, self.full))

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)

예제 #7

0

파일 보기

    def operate(self, instance):
        """
        Emit a presence feature for each allowed token of the inferred name.

        Tokens come from normalizing and tokenizing the instance's
        "entity_inferred_name" attribute. Each token not in
        ``self.forbidden`` becomes a feature named
        ``self.feature_prefix + token.lower()`` with value 1 inside
        ``instance.feature_groups[self.name]``.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        tokens = tokenize(
            normalize_no_lower(instance.attributes["entity_inferred_name"]))

        for token in tokens:
            if token in self.forbidden:
                continue
            feature_name = self.feature_prefix + token.lower()
            group[feature_name] = Feature(feature_name, 1)

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)

예제 #8

0

파일 보기

파일: simple_entity_text_feature_generator.py 프로젝트: dssg/machine_learning_legislation

    def operate(self, instance):
        """
        Compute the simple text features of the instance's inferred name.

        For every entry of ``feature_functions`` (indexed as name suffix and
        callable), the callable is invoked with the raw
        "entity_inferred_name" string, its length as a float, its
        normalized tokens and the token count as a float. The result is
        stored in ``instance.feature_groups[self.name]`` under
        ``self.feature_prefix + suffix``.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        s = instance.attributes["entity_inferred_name"]
        num_chars = float(len(s))
        tokens = tokenize(normalize(s))
        num_tokens = float(len(tokens))

        for entry in feature_functions:
            # Indexing (not unpacking) tolerates entries with extra elements.
            suffix, compute = entry[0], entry[1]
            feature_name = self.feature_prefix + suffix
            group[feature_name] = Feature(
                feature_name, compute(s, num_chars, tokens, num_tokens))

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)

예제 #9

0

파일 보기

파일: label_new_documents.py 프로젝트: dssg/machine_learning_legislation

    def label_row(self, row, column_indices, table_offset, congress, chamber, document_type, number, sponsor_indices):
        """
        Classify a table row and persist it as a candidate earmark.

        The row is turned into an instance, scored with
        ``self.model.decision_function`` and inserted into
        ``candidate_earmarks``. Each matched sponsor is additionally inserted
        into ``sponsors`` and the transaction is committed at the end.

        Sponsors are taken from the "|"-separated cells of the instance's
        "entity_text" at the positions in ``sponsor_indices``; a token is
        kept when it is in ``self.sponsor_coder.sponsors[congress]``. Errors
        raised while handling a single cell are printed/logged and skipped.
        """
        instance = self.get_instance_from_row(row, column_indices)
        X, _, _ = pipe.instances_to_matrix([instance], feature_space=self.feature_space, dense=False)
        scores = self.model.decision_function(X)
        fields = ['congress', 'chamber', 'document_type', 'number', 'row', 'row_offset', 'row_length', 'score', 'state', 'sponsors']
        # One placeholder per column, so the two lists cannot drift apart.
        placeholders = ", ".join(["%s"] * len(fields))
        cmd = "insert into candidate_earmarks (" + ", ".join(fields) + ") values (" + placeholders + ") returning id"
        attributes = instance.attributes
        entity_text = attributes['entity_text']
        state = self.geo_coder.get_state(entity_text)
        cur = self.conn.cursor()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if sponsor_indices:
            print(sponsor_indices)

        # Split once instead of once per sponsor index.
        cells = entity_text.split("|") if sponsor_indices else []
        sponsors = []
        for index in sponsor_indices:
            try:
                sponsors_in_cell = string_functions.tokenize(string_functions.normalize_no_lower(cells[index]))
                for sic in sponsors_in_cell:
                    if sic in self.sponsor_coder.sponsors[congress]:
                        sponsors.append(sic)
            except Exception:
                # Best effort: report the offending cell and keep going.
                print("Index: %d" % index)
                print(len(cells))
                print(entity_text)
                logging.exception("SCREW UP")

        # The joined sponsor string is truncated to 1024 characters.
        sponsors_string = "|".join(sponsors)[:1024]

        cur.execute(cmd, (congress, chamber, document_type, number, entity_text, row.offset + table_offset, row.length, scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]

        for sponsor in sponsors:
            cur.execute('insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)', (curr_id, sponsor))

        self.conn.commit()

예제 #10

0

파일 보기

파일: simple_entity_text_feature_generator.py 프로젝트: dssg/machine_learning_legislation

    def operate(self, instance):
        """
        Fill the instance's feature group with simple text statistics.

        Each entry of ``feature_functions`` supplies a name suffix (index 0)
        and a callable (index 1). The callable receives the raw
        "entity_inferred_name" string, its character count as a float, its
        normalized tokens and the token count as a float; its result is
        stored in ``instance.feature_groups[self.name]`` under
        ``self.feature_prefix + suffix``.

        If the group already exists and ``self.force`` is falsy, the instance
        is left untouched; otherwise the group is reset and rebuilt.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if not self.force and self.name in instance.feature_groups:
            return
        instance.feature_groups[self.name] = {}
        group = instance.feature_groups[self.name]

        s = instance.attributes["entity_inferred_name"]
        num_chars = float(len(s))
        tokens = tokenize(normalize(s))
        num_tokens = float(len(tokens))

        for entry in feature_functions:
            # Indexing (not unpacking) tolerates entries with extra elements.
            suffix, compute = entry[0], entry[1]
            feature_name = self.feature_prefix + suffix
            group[feature_name] = Feature(feature_name, compute(s, num_chars, tokens, num_tokens))

        # Deferred %-args: the message is only formatted if DEBUG is enabled.
        logging.debug(
            "Feature count %d for entity id: %d after %s",
            instance.feature_count(), instance.attributes["id"], self.name)