Example #1
 def _check_entry_product(product, product_type):
     if product_type == 'specialities':
         return Speciality(**product)
     elif product_type == 'substances':
         return Substance(**product)
     elif product_type == 'associations':
         return Association(**product)
     else:
         raise ValueError("unknown product type: %r" % product_type)
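
The same dispatch can also be written table-driven; a minimal sketch, assuming the three model classes are importable and all accept the product dict as keyword arguments:

PRODUCT_TYPES = {
    'specialities': Speciality,
    'substances': Substance,
    'associations': Association,
}

def _check_entry_product(product, product_type):
    #look up the model class for this product type, or fail loudly
    try:
        cls = PRODUCT_TYPES[product_type]
    except KeyError:
        raise ValueError("unknown product type: %r" % product_type)
    return cls(**product)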
Example #2
 def regles_asso(self):
     liste_regles = []
     #if the itemset has size 1, return an empty list
     if len(self) != 1:
         for item in self:
             antecedent = Itemset(self - Itemset([item]))
             consequent = Itemset([item])
             asso = Association(antecedent, consequent)
             liste_regles.append(asso)
     return liste_regles
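
For intuition: an itemset {A, B, C} yields the rules {B, C} -> {A}, {A, C} -> {B}, and {A, B} -> {C}. A self-contained sketch with stand-in Itemset and Association classes (the real ones are not shown in this example):

class Itemset(frozenset):
    """Stand-in: a hashable set of items."""

class Association(object):
    """Stand-in rule: antecedent => consequent."""
    def __init__(self, antecedent, consequent):
        self.antecedent = antecedent
        self.consequent = consequent
    def __repr__(self):
        return "%s => %s" % (sorted(self.antecedent), sorted(self.consequent))

itemset = Itemset(["A", "B", "C"])
for item in itemset:
    print(Association(Itemset(itemset - Itemset([item])), Itemset([item])))
# prints, in arbitrary set order:
# ['B', 'C'] => ['A']
# ['A', 'C'] => ['B']
# ['A', 'B'] => ['C']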
Example #3
class Journal(object):
    """
    Main Moments module for collecting Moment entries in one place

    *2011.06.26 13:50:20
    not sure that it even makes sense to base this on a standard list
    should use a list internally
    but we really don't use any of the methods for a list to interact
    with a Journal object

    so we could have
    self._entries
    self._tags
    self._dates

    to store everything internally
    and then use custom methods for interacting with the Journal
    these methods should behave the same
    whether the Journal is a local, native instance
    or a remote one.

    i.e.
    using journal object attributes directly in code is discouraged
    to ensure that local and remote journal objects work identically

    """
    def __init__(self, path=None, items=[], title=None, debug=False):

        self._entries = []

        #keys are compact date string
        self._dates = Association()

        #keys are tag name
        self._tags = Association()

        # renamed self.name to self.path
        # then use name as a general name for the journal, if displayed
        # *2009.11.07 10:28:44
        # using title instead of name, then if anything else is still
        # using name the old way, it will flag a more recognizable error
        self.title = title

        self.debug = debug

        #*2011.06.26 13:45:20
        #really no such thing as a default path (self.path)
        # used for default file path:
        # convert to string just in case a Path object is sent
        #self.path = str(path)

        #if we want to store it to a path, then should specify that
        #in a to_file / save call
        #otherwise should keep track of all paths loaded
        #so that we can reload them later.
        self.loaded = []
        #or ...
        #could index based on original source too
        #this would allow them to be re-saved to their original source
        self._sources = Association()

        if path:
            self.load(path)

        if items:
            self.update_entries(items)

    #*2011.06.26 13:53:03
    #should there be a to_file
    #and also a to_original_files (or something like that) ?
    #could loop through all of self.loaded and store changes to those
    #entries affected in those sources

    #aka to_file
    def save(self, filename=None, order='original', include_path=False):
        """
        >>> from entry import Entry
        >>> j = Journal("sample_log.txt")
        >>> e = Entry("test entry")
        >>> j.update(e)
        >>> j.save("sample_log2.txt")
        >>> k = Journal()
        >>> k.load("sample_log2.txt")
        2
        >>> len(k.entries())
        2
        """
        if filename:
            self.path = str(filename)

        if hasattr(self, "path") and self.path:
            l = Log(self.path)
        else:
            print "No name to save Journal to"
            exit()

        #l.from_journal(self, holder, entry)
        l.from_entries(self.sort(order=order), include_path=include_path)
        l.to_file()
        l.close()

    def save_originals(self):
        """
        loop through all self.sources or self.loaded
        and save the corresponding entries back
        (only if there is an actual change???)

        might want to return any entries that don't have a destination
        or would it be better to return an error?
        or not save if entries don't have a destination
        """
        pass

    def save_instance(self, instance_file):
        """
        save the currently loaded sources to an instance file
        """
        pass

    def load_instance(self, instance_name, instance_file):
        """
        load the first entry tagged instance_name from the instance file
        """
        pass

    #aka open, etc
    #formerly: from_file, add_log_to_journal, add_file
    def load(self, log_name, add_tags=[]):
        """
        adds a log file to the journal object currently in memory

        this can be called multiple times with different filenames
        to merge those files/entries into the journal

        >>> from journal import *
        >>> j = Journal()
        >>> j.load("sample_log.txt")
        1
        >>> len(j.entries())
        1

        returns the number of entries found in the file
        (0 if it was not a journal/Log file)
        """
        found_entries = 0

        if str(log_name) not in self.loaded:
            self.loaded.append(str(log_name))
        #TODO:
        #should also handle adding entry to self._sources??
        #or will that happen in update_entries

        #would it be better for sources and loaded to be associated
        #on an update?
        #that way if entries are added outside of a load
        #the sources would still get updated.

        l = Log()
        l.from_file(str(log_name))

        entries = l.to_entries(add_tags)
        #print "%s entries loaded from file" % len(entries)
        #print "%s entries in self before merging in entries" % len(self)
        self.update_many(entries)
        #print "%s entries in self after merging in entries" % len(self)

        #if l.has_entries:
        found_entries = len(entries)

        l.close()

        return found_entries

    def reload(self):
        """
        create a new instance of a journal
        based on the paths we have previously loaded
        (self.loaded)

        load everything that was previously loaded
        then swap out the contents of the old journal for the new one
        """
        #use this to load new _entries, etc
        new_j = Journal()
        for item in self.loaded:
            new_j.load(item)
            #temp_j = load_journal(item)
            #new_j.from_entries(temp_j.entries())
            #del temp_j
        old_entries = self._entries
        old_tags = self._tags
        old_dates = self._dates
        old_sources = self._sources

        self._entries = new_j._entries
        self._tags = new_j._tags
        self._dates = new_j._dates
        self._sources = new_j._sources

        del old_entries
        del old_tags
        del old_dates
        del old_sources

    def _add(self, entry, position=None):
        """
        this is the base case for adding an entry
        blindly adds the entry object to the journal's list of entries
        no checks are performed

        will add multiple copies of the same entry to the journal
        use update to avoid duplicates
        """
        if position is None:
            #cannot assume insert here...
            #insert(0, entry) reverses the list order on log read
            self._entries.append(entry)
        else:
            self._entries.insert(position, entry)

        if hasattr(entry, "created") and entry.created:
            entry_time = entry.created.compact()
            self._dates.associate(entry, entry_time)
        else:
            self._dates.associate(entry, None)

        for t in entry.tags:
            self._tags.associate(entry, t)

    #TODO:
    #integrate source
    def update(self, entry, position=None, source=None):
        """
        checks if an entry already exists in the journal
        if other entries with that timestamp are similar,
        see if they can be merged easily (i.e. only tags differ)

        otherwise just add it as a separate entry
        no longer attempting to choose which one to keep here
        since journal can hold multiple entries with the same timestamp

        can merge later as needed using dedicated script for that purpose
        """
        if not hasattr(entry, "created") or not entry.created:
            if entry not in self._entries:
                self._add(entry, position)
                if self.debug:
                    print "Entry has no time associated, and no other entry found. added"

        else:
            #this makes entry_time available in the event the entry already
            #is in the journal:
            #print entry.created
            entry_time = entry.created.compact()
            if entry not in self._entries:

                if not self._dates.has_key(entry_time):
                    self._add(entry, position)
                    if self.debug:
                        print "No other entry found with time: %s. added" % entry_time

                else:
                    #it must have *something* in that time slot
                    #check for duplicates
                    if self.debug:
                        print "Other entries found with time: %s. checking all.." % entry_time

                    options = self._dates[entry_time]
                    found_match = False
                    for existing in options:

                        if existing.is_equal(entry, debug=self.debug):
                            #print "DUPE, but tags and data are same... skipping"
                            found_match = True
                            if self.debug: print "Equal entry found. Skipping"

                        #only want to merge if we have data
                        #otherwise blank entries can end up grouped together
                        elif entry.data and (existing.data == entry.data):
                            #tags must differ... those are easy to merge:
                            print "from: %s, %s" % (existing.path,
                                                    existing.created)
                            print "and: %s, %s" % (entry.path, entry.created)
                            print "only TAGS differ"
                            print "original: %s" % existing.tags
                            print "new: %s" % entry.tags
                            existing.tags.union(entry.tags)
                            print "merged: %s" % existing.tags
                            found_match = True

                        else:
                            #this one didn't match
                            #but we won't add the entry until we've checked them all
                            pass

                    if not found_match:
                        #2009.12.04 16:03:15
                        #this information doesn't help much anymore:
                        #print "MULTIPLE ENTRIES EXISTS AT: %s" % (entry_time)
                        #print "but none matched this one.  Adding now"
                        self._add(entry, position)

                        if self.debug:
                            print "No equivalent entries found. adding"
            else:
                if self.debug:
                    print "Entry (%s) already exists in journal" % entry_time

    #aka create, new
    def make(self, data, tags=[], created=None, source='', position=0):
        """
        helper for making a new entry right in a journal object
        this way should not need to import moments.entry.Entry elsewhere
        """
        if not created:
            created = datetime.now()
        entry = Moment(data, tags, created, path=source)
        #print "Journal.make.position: %s" % position
        self.update(entry, position=position)
        return entry

    #AKA DELETE
    def remove(self, entry):
        """
        remove associations from self._dates and self._tags
        then remove the entry from the journal.
        """
        #text_time = str(entry.created)
        #text_time = e.created.strftime(time_format)

        self._tags.remove(entry)
        self._dates.remove(entry)

        #remove from the list of entries
        self._entries.remove(entry)

    #*2011.07.09 10:32:42
    #is this ever used?
    #seems dangerous to remove everything at a given timestamp
    #more likely to add as a separate one
    #or remove explicitly and then add/update/make
    ## def replace(self, entry):
    ##     """
    ##     remove all entries from the journal with the same timestamp as entry
    ##     then add the new entry to the journal

    ##     i.e.
    ##     accepts a new entry
    ##     and uses it to find and then remove the original one(s)
    ##     add the new one to the journal
    ##     thereby replacing the original(s)
    ##     """
    ##     entry_time = entry.created.compact()

    ##     if self._dates.has_key(entry_time):
    ##         options = self._dates[entry_time]
    ##     else:
    ##         options = []
    ##     for existing in options:
    ##         self.remove(existing)

    ##     self._add(entry)

    #aka from_entries
    #aka add_entries
    #aka update_entries
    def update_many(self, entries, source=None):
        """
        loop over a list of entries to add/update each one to the journal
        """
        for e in entries:
            self.update(e, source=source)

    #aka remove_entries
    def remove_many(self, entries):
        """
        take a list of entry objects,
        remove each one
        """
        for e in entries:
            self.remove(e)

    #Following are all different ways to READ
    #they are also closely related to the hidden properties:
    #_tags, _dates, _entries

    #*2011.07.05 21:14:47
    #thinking that it makes sense to have two separate calls
    #could combine tag and tags (etc)
    #by returning the plural version (dict) when no tag specified
    #but the function name is unclear in that case

    def tag(self, tag_key=None):
        """
        lookup tag_key in self._tags

        should only return a list of entries associated with that tag
        not a dict
        with the tag name
        server can do that
        but server needs to be a little different
        """
        #print self._tags.keys()
        if tag_key and self._tags.has_key(tag_key):
            #print self._tags[tag_key]
            ## moments = []
            ## for m in self._tags[tag_key]:
            ##     #instead of rendering a string:
            ##     #moments.append(m.render())
            ##     #supply a dictionary of the moment item
            ##     moments.append(m.as_dict())
            #return { tag_key:self._tags[tag_key] }
            return self._tags[tag_key]
        ## elif tag_key:
        ##     #must not have any content associated with this tag
        ##     return { tag_key:[] }
        else:
            #could also return self.tags()
            #return self.tags()
            #return { 'tags': self._tags.keys() }
            #return { tag_key:[] }
            return []

    def tags(self, tags=[]):
        """
        return a dictionary with:
        all tags as keys, and
        number of entries for each tag as values

        *2011.07.10 10:38:07
        also
        could use mindstream.entries_tagged
        to accept a list of tags
        and combine all of those entries into a single list
        and return that
        """
        if tags:
            #*2011.11.09 11:42:38
            #if there is only one tag
            #should we just call self.tag()???

            if not isinstance(tags, list):
                tags = [tags]
            found_entries = Journal()
            for t in tags:
                if self._tags.has_key(t):
                    #print len(self._tags[t])
                    found_entries.update_many(self._tags[t])

            found_entries = found_entries.sort("reverse-chronological")
            #return found_entries._entries
            return found_entries

        else:
            tdict = {}
            for tag in self._tags.keys():
                tdict[tag] = len(self._tags[tag])
            return tdict

    def date(self, date_key=None):
        """
        lookup date_key in self._dates
        date_key should be compact stamp
        """
        if date_key:
            if isinstance(date_key, Timestamp):
                ts = date_key
            else:
                ts = Timestamp(compact=date_key)

            #print ts, type(ts)
            #print ts.accuracy
            if ts.accuracy and ts.accuracy != "second":
                rr = Timerange(ts)
                #get the timerange
                tr = rr.default()
                #print tr
                #print tr.start.datetime
                #print tr.end.datetime
                entries = self.range(tr.start, tr.end)

                #return {ts.compact():entries}
                return entries

            elif self._dates.has_key(ts.compact()):
                entries = self._dates[ts.compact()]
                #print "LEN ENTRIES: %s" % len(entries)
                #print entries
                #return { ts.compact():entries }
                return entries
            else:
                #return { ts.compact():[] }
                return []
        else:
            #could also return self.dates()
            #return self.dates()
            #return { date_key:[] }
            return []

    def dates(self):
        """
        return a dictionary with:
        all dates as keys, and
        number of entries for each date as values
        """
        ddict = {}
        for key in self._dates.keys():
            #print "KEY:", key

            #key might be blank here (i.e. no timestamp)
            if key:
                ts = Timestamp(compact=key)
                #print ts
                ddict[ts.compact()] = len(self._dates[key])
            else:
                ddict[key] = len(self._dates[key])
        return ddict

    #aka item???
    def entry(self, index=None):
        """
        return the item at index point in list
        is this already defined on a list object? should be consistent
        """
        if index is not None and index < len(self._entries):
            return self._entries[index]
        else:
            return None

    def entries(self):
        """
        return a list of entries
        making a function call rather than an attribute
        to make consistent between local and remote calls
        """
        return self._entries

    def related(self, key):
        """
        look for tags
        if no matching tags
        see if it is a date string  (get range, find entries there)

        either way return tags that are related
        (maybe as a dictionary {'tag':number_of_items} ...
         same as self._tags)
        """
        #make sure we have it, otherwise nothing relates
        if not self._tags.has_key(key):
            return []

        entries = self._tags[key]
        related = []
        for e in entries:
            #todo:
            #could also generate a cloud
            #ranking most common related higher
            for t in e.tags:
                if t not in related:
                    related.append(t)

        return related

    def search(self, look_for, data=False, limit=0):
        """
        scan tags for tags matching (searching) look_for
        if data is True, look in entry.data too
        """
        tags = self._tags.keys()
        found = []

        # in this case, we'll return the whole entry
        if data:
            for e in self._entries:
                if re.search(look_for, e.data):
                    found.append(e)
                else:
                    for t in e.tags:
                        if re.search(look_for, t):
                            #add the entry once, even if several tags match
                            found.append(e)
                            break

        # in this case we'll only return matching tags
        else:
            results = []
            for t in tags:
                if re.search(look_for, t):
                    results.append(t)

            ## #now look for the results that start with "look_for"
            ## matches = []
            ## for r in results:
            ##     if re.match(look_for, r):
            ##         matches.append(r)
            ##         results.remove(r)

            # sort tags by the number of entries they have
            priority = []
            for tag in results:
                priority.append((len(self._tags[tag]), tag))
            priority.sort()
            priority.reverse()
            #print "Priority: %s" % priority

            for p in priority:
                found.append(p[1])

        if limit:
            found = found[:int(limit)]

        return found

    def sort(self, order='original'):
        """
        Sorts the items in our Journal's ._entries list
        
        returns a list of the entries in their rearranged order

        can specify order:

        'original'
        to keep the original order that the entries were added to the journal

        'reverse'
        to reverse the current order of the entries

        'chronological' or 'oldest to newest'
        oldest entries first in the list

        'reverse-chronological'  or 'newest to oldest'
        
        if not all entries are wanted, see self.range()
        """
        #print order
        if order == "original":
            return self._entries

        elif order == "reverse":
            self._entries.reverse()
            return self._entries

        else:
            entry_times = self._dates.keys()

            if order == "reverse-chronological" or order == 'newest to oldest':
                entry_times.sort()
                entry_times.reverse()
            elif order == "chronological" or order == 'oldest to newest':
                if self.debug: print "oldest to newest"
                entry_times.sort()
                if self.debug: print entry_times
            else:
                raise ValueError, "Unknown sort option supplied: %s" % order

            entries = []
            for et in entry_times:
                elist = self._dates[et]
                for entry in elist:
                    entries.append(entry)

            assert len(entries) == len(self._entries)
            del self._entries
            self._entries = entries

            return entries

    #aka limit, timerange, mindstream.time_range
    def range(self, start=None, end=None):
        """
        if no start *and* end specified
        return the time range for the entries in the currently loaded journal

        if only start
        return the entries in range for the accuracy of the start (e.g. 1 day)

        if start and end
        return all entries in the journal that fall in that range

        should accept a string, a datetime object, or a Timestamp object
        """

        if start is None and end is None:
            dates = self._dates.keys()
            dates.sort()
            start = dates[0]
            end = dates[-1]
            #might have entries with no timestamp first:
            if start is None:
                start = dates[1]
            if self.debug: print start, end
            return Timerange(start=start, end=end)

        else:
            start = Timestamp(start)
            if end:
                end = Timestamp(end)
            else:
                relative = Timerange(start)
                end = relative.end

            times = self._dates.keys()
            times.sort()

            matches = []
            for t in times:
                #not sure why we're using just time here
                #seems like we would want to use the date too?
                #pytime = Timestamp(t).time

                #sometimes t is None... those don't fit in a range.
                if t:
                    pytime = Timestamp(t).datetime
                    if (pytime >= start.datetime) and (pytime <= end.datetime):
                        matches.extend(self._dates[t])
            return matches

    def clear(self):
        """
        clear mind
        start fresh

        in practice it's probably easier to just create a new journal
        but reload might need this
        """
        del self._entries
        self._entries = []
        del self._tags
        self._tags = Association()
        del self._dates
        self._dates = Association()

        self.loaded = []
        self._sources = Association()
        #todo
        #way to see how much memory is consumed by current process?
        #should show before and after if so

        return True

    def associate_data(self):
        """
        add a new property to the Journal: datas
        at one point this was generated automatically,
        but that slowed things down

        this allows it to be generated when it is needed
        which so far is only when the node.directory object
        is looking for a default image
        """
        self._datas = Association()
        for entry in self._entries:
            self._datas.associate(entry, entry.data)

    def associate_files(self):
        """
        add a new property to the Journal: files
        similar to associate_data

        but checks each entry's data for path information
        if path is found
        just take the filename portion
        and associate the entry with that portion

        otherwise associate each line of the entry's data (as is)
        """
        self._files = Association()
        for entry in self._entries:
            lines = entry.data.splitlines()
            for line in lines:
                if re.search('/', line):
                    name = os.path.basename(line)
                    self._files.associate(entry, name)
                elif line.strip():
                    self._files.associate(entry, line.strip())
                else:
                    #must be a blank line
                    pass
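
Taken together, a typical round-trip with the Journal class above looks like the following sketch (Python 2, to match the code; it assumes a sample_log.txt moments file is available and that the class is importable from journal):

from journal import Journal

j = Journal()
j.load("sample_log.txt")                  #merge a log file into the journal
j.make("a new thought", tags=["test"])    #create a Moment entry in place
print j.tags()                            #dict of tag name -> entry count
for entry in j.sort("reverse-chronological"):
    print entry.created
j.save("sample_log2.txt")                 #write all entries back out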
Example #4
def autofis_onecv(file_zip, file_train, file_test, parameters):
    # General parameters
    t_norm = parameters[3]
    max_size_of_premise = parameters[5]
    association_method = parameters[11]
    aggregation_method = parameters[12]

    # Gathering parameters
    # Formulation parameters:
    par_area, par_over, par_pcd = toolfis.get_formulation_parameters(parameters)

    # 1. Lecture & Fuzzification
    out1 = toolfis.lecture_fuz_one_cv(file_zip, file_train, file_test, parameters)
    ux_train, cbin_train = out1[0]
    ux_test, cbin_test = out1[1]
    num_premises_by_attribute, premises_by_attribute, ref_attributes, premises_contain_negation = out1[2]
    freq_classes = out1[3]

    report = []  # To save our results

    try:
        # 3. Formulation
        f2 = Formulation(ux_train, cbin_train, ref_attributes, premises_by_attribute,
                         num_premises_by_attribute, premises_contain_negation)
        # Inputs given by user
        arbol = f2.gen_ARB(max_size_of_premise, t_norm, par_area, par_over, par_pcd)

        status = [0 if not i[0] else 1 for i in arbol]
        sum_status = sum(status)
        if sum_status != len(arbol):
            if sum_status == 0:
                raise ValueError("Error in Formulation Module. Any premise survived. "
                                 "Sorry, you can not continue in the next stage."
                                 "\nTry to change the configuration")
            else:
                arb = [i for i in arbol if i[0]]
                #keep only the depths whose premises survived
                arbol, arb = arb, arbol

        number_classes = cbin_train.shape[1]

        report.append("\nFormulation:\n-----------------")
        report.append("Elementos acorde a la profundidad " + str(len(arbol)) + " del arbol")
        for i in range(len(arbol)):
            report.append('Depth ' + str(i + 1) + ': ' + str(arbol[i][1].shape))
            # print 'Depth ' + str(i + 1) + ': ' + str(arbol[i][1].shape)

        # 4. Association: ex-Division
        f3 = Association(arbol, cbin_train)
        premises_ux_by_class = f3.division(association_method)

        status = [0 if not i[0] else 1 for i in premises_ux_by_class]
        if sum(status) != number_classes:
            raise ValueError("Error in Division Module. Some classes did not get premises. "
                             "Sorry, you can not continue in the next stage."
                             "\nTry to change the configuration")

        # 5. Aggregation:
        f4 = Aggregation(premises_ux_by_class, cbin_train)
        output_aggregation = f4.aggregation(aggregation_method)

        premises_weights_names = output_aggregation[0]
        estimation_classes = output_aggregation[1]

        status = [0 if not i[0] else 1 for i in premises_weights_names]
        if sum(status) != number_classes:
            raise ValueError("Error in Aggregation Module. Some classes did not get premises. "
                             "Sorry, you can not continue in the next stage."
                             "\nTry to change the configuration")

        final_premises_classes = []
        report.append("\n\nPremises:\n=========")
        for i in range(len(premises_weights_names)):
            report.append("Premises of Class " + str(i) + ": " + str(premises_weights_names[i][0]))
            final_premises_classes.append(premises_weights_names[i][0])
            report.append("weights_" + str(i) + ": " + str(premises_weights_names[i][1].T))

        # 6. Decision:
        f5 = Decisions(estimation_classes, freq_classes)
        train_bin_prediction = f5.dec_max_pert()

        # 7. Evaluation
        f6 = Evaluation(premises_weights_names, final_premises_classes, freq_classes)
        metrics_train = f6.eval_train(cbin_train, train_bin_prediction)
        metrics_test = f6.eval_test(cbin_test, ux_test, t_norm)

        report.append("\nEvaluation Training:\n---------------------------")
        report.append("Accuracy on train dataset: " + str(metrics_train[0]))
        report.append("AUC in train dataset: " + str(metrics_train[1]))
        report.append("Recall: " + str(metrics_train[3]))
        report.append('Confusion matrix:\n' + str(metrics_train[2]))

        report.append("\nEvaluation Testing:\n---------------------------")
        report.append("Accuracy on test dataset: " + str(metrics_test[0]))
        report.append("AUC in test dataset: " + str(metrics_test[1]))
        report.append("Recall: " + str(metrics_test[3]))
        report.append("Confusion matrix:\n" + str(metrics_test[2]))

        # Metrics to eval: accuracy_test, auc_test,
        #                  [num_regras, total_rule_length, tamano_medio_das_regras]]
        metricas = [1, [metrics_train[0], metrics_test[0], metrics_train[1], metrics_test[1], metrics_test[4]]]

    except ValueError as e:
        print e
        report = e  # .append("\n" + str(e))
        metricas = [0, "The process did not finish; it stopped at some stage"]

    return report, metricas
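
The same survival check recurs three times in the function above; a small helper would make the intent explicit (a sketch, not part of the original code):

def all_survived(results):
    """True when every result tuple has a truthy first element, i.e.
    every depth/class produced at least one surviving premise."""
    return all(bool(r[0]) for r in results)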
Example #5
File: data.py Project: wuhenq/Test
    def process_nom_features(self):
        """
        处理一批标称属性,获取所有可能的取值及其对应个数(包括缺失值的个数)
        :param feature_list: 属性列表
        :return: 一个字典,key为所有可能取值,value为取值对应的个数
        """
        out_path = self.result_path
        association = Association()
        filename = self.dataset_path

        columns = []

        dataload = pd.read_csv(filename)
        dataload['price'] = pd.cut(
            dataload['price'],
            [0, 8, 12, 16, 20, 24, 28, 32, 36, 60, 100, 3300])
        dataload['points'] = pd.cut(dataload['points'], 20)

        dataload = dataload[[
            'country', 'points', 'price', 'province', 'region_1', 'variety'
        ]]
        for feature_name in dataload.keys():
            print("Dealing with feature: {}".format(feature_name))
            columns.append(list(dataload[feature_name]))

        rows = list(zip(*columns))

        dataset = []
        feature_names = list(dataload.keys())
        for data_line in rows:
            data_set = []
            for i, value in enumerate(data_line):
                if value == value:  #NaN != NaN, so this skips missing values
                    data_set.append((feature_names[i], value))
            if data_set:
                dataset.append(data_set)

        freq_set, support_data = association.apriori(dataset)
        support_data_out = sorted(support_data.items(),
                                  key=lambda d: d[1],
                                  reverse=True)
        #print(support_data_out)

        big_rules_list = association.generate_rules(freq_set, support_data)
        big_rules_list = sorted(big_rules_list,
                                key=lambda x: x[3],
                                reverse=True)
        big_rules_list = sorted(big_rules_list,
                                key=lambda x: x[4],
                                reverse=True)
        #print(big_rules_list)

        with open('freq_set.json', 'w', encoding='utf-8') as freq_set_file:
            for (key, value) in support_data_out:
                result_dict = {'set': list(key), 'sup': value}
                json_str = json.dumps(result_dict, cls=MyEncoder)
                freq_set_file.write(json_str + '\n')

        with open('rules.json', 'w', encoding='utf-8') as rules_file:
            for result in big_rules_list:
                X_set, Y_set, sup, conf, lift, cosine = result
                result_dict = {
                    'X_set': list(X_set),
                    'Y_set': list(Y_set),
                    'sup': sup,
                    'conf': conf,
                    'lift': lift,
                    'cosine': cosine,
                }
                json_str = json.dumps(result_dict,
                                      cls=MyEncoder,
                                      ensure_ascii=False)
                rules_file.write(json_str + '\n')
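
The pd.cut calls above turn the numeric price and points columns into categorical intervals so they can act as nominal items for Apriori. A minimal illustration of the idiom:

import pandas as pd

s = pd.Series([5, 10, 25, 90])
print(pd.cut(s, [0, 8, 12, 16, 20, 24, 28, 32, 36, 60, 100, 3300]))
# 5 -> (0, 8], 10 -> (8, 12], 25 -> (24, 28], 90 -> (60, 100]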