Example #1
import logging
from typing import Dict

import pandas as pd
from google.cloud import storage


def run_models(policies: pd.DataFrame, claims: pd.DataFrame,
               testing: pd.DataFrame, config: Dict[str, str]) -> None:
    """
    Run the complete model-building process for each of SLM, GLM and MLM
    on the loaded datasets.
    """
    logging.info('Build features.')
    policies = build_features(policies)
    testing = build_features(testing)
    grouped = claims.groupby('pol_id')

    # Currently compute both averages and totals.
    averages, totals = get_claim_amounts(policies, grouped)
    counts = get_claim_counts(policies, grouped)

    res_slm = evaluate_slm(policies, counts, averages, testing)
    res_glm = evaluate_glm(policies, counts, averages, testing)
    res_mlm = evaluate_mlm(policies, counts, averages, testing)

    client = storage.Client(project=config['project'])
    _store_pickle_file(res_slm, 'slm', client, config['bucket'])
    _store_pickle_file(res_glm, 'glm', client, config['bucket'])
    _store_pickle_file(res_mlm, 'mlm', client, config['bucket'])

    res_slm.to_csv('slm.csv', encoding='utf-8')
    res_glm.to_csv('glm.csv', encoding='utf-8')
    res_mlm.to_csv('mlm.csv', encoding='utf-8')
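For reference, `_store_pickle_file` is not defined in this excerpt; a minimal sketch of such a helper, assuming the standard `google-cloud-storage` API (the object path below is illustrative):

import pickle


def _store_pickle_file(result, name, client, bucket_name):
    """Sketch: pickle a result object and upload it to a GCS bucket."""
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(name + '.pkl')  # illustrative object path
    blob.upload_from_string(pickle.dumps(result))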
Example #2
    def buildFeature(self, oldData=None):
        """
        Build the feature matrix from the text corpus.

        @type  oldData:  list
        @param oldData:  the list of documents
        """

        if oldData is None:
            oldData = self.docs

        # use a smaller vocabulary for small corpora
        if len(oldData) < 5000:
            self.Nwords = 1000
        out = ft.build_features(oldData,
                                keyWords=None,
                                max_words=self.Nwords,
                                Stem=True,
                                Bigram=True,
                                Tfidf=self.Tfidf,
                                stopwords=True,
                                Preprocess=True)
        # save model for feature extraction on new documents
        self.featureObj = {
            'vectorizer': out['vectorizer'],
            'tfMtx': out['tfMtx'],
            'terms': out['terms']
        }
        # save the features
        self.features = out['TDM']
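A hypothetical usage sketch; the owning class is not shown in this excerpt, so the constructor name and arguments below are illustrative assumptions:

# Hypothetical: 'TextModel' and its constructor are assumptions,
# not part of the original source.
model = TextModel(docs=documents, Tfidf=True)
model.buildFeature()                   # fit on the stored corpus
print(model.features.shape)            # term-document matrix
print(len(model.featureObj['terms']))  # fitted vocabulary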
Example #3
    def showUnseenFeatures(self, newData):
        """
        Show the new word features extracted from newData that are not in
        the existing text corpus.

        @type  newData:  list
        @param newData:  a list of new documents

        @returns: a tuple (terms_D, tf_D) sorted in descending order of
                  term frequency, where terms_D is a list of terms and
                  tf_D is a list of term frequencies
        """

        # vectorize the new data: only extract TF feature
        out = ft.build_features(newData,
                                keyWords=None,
                                max_words=self.Nwords,
                                Stem=True,
                                Bigram=True,
                                Tfidf=False,
                                stopwords=True,
                                Preprocess=True)
        tfMtx = out['tfMtx']
        terms = out['terms']

        # find the terms in newData but not in the old corpus
        oldTerms = set(self.featureObj['terms'])

        # indices corresponding to the new terms (set lookup for speed)
        ind = [i for i in range(len(terms)) if terms[i] not in oldTerms]
        termsNew = [terms[i] for i in ind]
        tfMtxNew = tfMtx[:, ind]

        # return features ranked by TF over all samples
        return self.showFeatures(tfMtxNew, termsNew)
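`showFeatures` is not shown in this excerpt; a minimal sketch consistent with the documented return format, assuming `tfMtx` is a scipy sparse matrix:

import numpy as np


def showFeatures(self, tfMtx, terms):
    # Sketch (assumption): sum term frequency over all documents,
    # then rank terms by descending total frequency.
    tf = np.asarray(tfMtx.sum(axis=0)).ravel()
    order = np.argsort(tf)[::-1]
    terms_D = [terms[i] for i in order]
    tf_D = [tf[i] for i in order]
    return terms_D, tf_D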
Example #4
def extract_data(in_file):
    """Extract a set of log files into a set of log events"""
    events = []
    no_timestamp_lines = []
    no_timestamps = 0
    print(in_file)
    with open(in_file) as f:
        for log_line in f:
            try:
                # trying to extract the timestamp
                # the presence of a timestamp delimits a new log event
                timestamp, log_line = timestamp_extract(log_line)
                events.append(parse_event(timestamp, log_line, f))
            except ValueError:
                # no timestamp: count the line and keep it for inspection
                no_timestamps += 1
                no_timestamp_lines.append(log_line)
    #print("Found {0} lines with no timestamp".format(no_timestamps))
    unknown_events = filter(lambda x: isinstance(x, UnknownEvent), events)
    #print("Found {0} unknown log events".format(len(list(unknown_events))))
    subms = events_to_submissions(events)
    problems = set()
    features = build_features(subms)

    problems_feature = next(f for f in features
                            if isinstance(f, ProblemsAttemptedCumulative))
    # drop rows where fewer than two problems were attempted, collecting
    # indices and deleting from the end so earlier indices stay valid
    indices_to_delete = [i for i in range(len(features[0].values))
                         if problems_feature.values[i] < 2]
    indices_to_delete.reverse()
    for i in indices_to_delete:
        for feature in features:
            del feature.values[i]
            if isinstance(feature, CumulativeStatisticsFeatureBase):
                del feature.max_values[i]
                del feature.min_values[i]
                del feature.mean_values[i]
                del feature.stdev_values[i]
    abandon_state = classify_problems(subms)
    return LogFileData(in_file, features, abandon_state)
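`timestamp_extract` is referenced but not defined in this excerpt; a hypothetical sketch consistent with how `extract_data` uses it (the timestamp format is an assumption):

from datetime import datetime


def timestamp_extract(log_line):
    # Sketch: split off a leading timestamp token. datetime.strptime
    # raises ValueError when the token is not a timestamp, which
    # extract_data treats as a line without one.
    stamp, _, rest = log_line.partition(' ')
    return datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'), rest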
Example #5
import random

########################################################################
# load data

# movie review corpus
from load_data import load_moviereview_data
dt = load_moviereview_data()
docs = dt['docs']
label = dt['label']

# use the first 5000 documents
docs = docs[:5000]
label = label[:5000]

#########################################################################
# build features with the text corpus

out = build_features(docs, keyWords=None, max_words=5000, Stem=True,
                     Bigram=True, Tfidf=True, stopwords=True, Preprocess=True)
features = out['TDM']
terms = out['terms']


###########################################################################
# split into training and testing
Nsamp = len(label)
Ntrain = 2000

# shuffle a shared index so features and labels stay aligned
random.seed(123)
ind = list(range(Nsamp))
random.shuffle(ind)
features = features[ind]
label = [label[i] for i in ind]

X_train = features[:Ntrain]
X_test = features[Ntrain:]
y_train = label[:Ntrain]
y_test = label[Ntrain:]
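An illustrative continuation, not part of the original script: fitting a simple classifier on the resulting split, assuming scikit-learn is available:

from sklearn.linear_model import LogisticRegression

# Sketch: train on the TF-IDF features and report held-out accuracy.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))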
Example #6
import settings
from features import build_features, FeatureFunction
from templates import *

a_templates = [WordLengthTemplate]
b_templates = [TagSequenceTemplate]

(a_features, b_features) = build_features(a_templates, b_templates)

print("a_features: ", a_features)
print("b_features: ", b_features)

feature_function = FeatureFunction(a_features, b_features)

# evaluate every feature on a toy word/tag sequence
for i in range(feature_function.cardinality()):
    print(feature_function.evaluate(i, ["hello"], ["yooo"], 0))