import logging
from typing import Dict

import pandas as pd
from google.cloud import storage


def run_models(policies: pd.DataFrame, claims: pd.DataFrame,
               testing: pd.DataFrame, config: Dict[str, str]) -> None:
    """
    With the loaded datasets, run the complete model-building process
    for each of SLM, GLM and MLM.
    """
    logging.info('Build features.')
    policies = build_features(policies)
    testing = build_features(testing)

    grouped = claims.groupby('pol_id')
    # Currently compute both averages and totals.
    averages, totals = get_claim_amounts(policies, grouped)
    counts = get_claim_counts(policies, grouped)

    res_slm = evaluate_slm(policies, counts, averages, testing)
    res_glm = evaluate_glm(policies, counts, averages, testing)
    res_mlm = evaluate_mlm(policies, counts, averages, testing)

    client = storage.Client(project=config['project'])
    _store_pickle_file(res_slm, 'slm', client, config['bucket'])
    _store_pickle_file(res_glm, 'glm', client, config['bucket'])
    _store_pickle_file(res_mlm, 'mlm', client, config['bucket'])

    res_slm.to_csv('slm.csv', encoding='utf-8')
    res_glm.to_csv('glm.csv', encoding='utf-8')
    res_mlm.to_csv('mlm.csv', encoding='utf-8')
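# _store_pickle_file is referenced above but not shown. A minimal sketch,
# assuming it pickles each result frame and uploads it to the configured
# Cloud Storage bucket; the object name '<model>.pkl' is an assumption.
import pickle


def _store_pickle_file(results, name, client, bucket_name):
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(name + '.pkl')
    # upload_from_string accepts bytes, so we can serialize in memory
    blob.upload_from_string(pickle.dumps(results))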
def buildFeature(self, oldData=None):
    """
    build the feature matrix from the text corpus
    @type oldData: list object
    @param oldData: the list of documents
    """
    if oldData is None:
        oldData = self.docs
    # for small corpora, cap the vocabulary size
    if len(oldData) < 5000:
        self.Nwords = 1000
    out = ft.build_features(oldData, keyWords=None, max_words=self.Nwords,
                            Stem=True, Bigram=True, Tfidf=self.Tfidf,
                            stopwords=True, Preprocess=True)
    # save the model for feature extraction on new documents
    self.featureObj = {
        'vectorizer': out['vectorizer'],
        'tfMtx': out['tfMtx'],
        'terms': out['terms']
    }
    # save the features
    self.features = out['TDM']
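# A hedged sketch of how the saved featureObj could be reused to vectorize
# unseen documents, assuming the stored vectorizer exposes the
# scikit-learn-style transform() interface (an assumption, not shown above).
def vectorizeNewDocs(featureObj, newDocs):
    # transform() maps new documents onto the vocabulary learned from the
    # original corpus; terms outside that vocabulary are ignored
    return featureObj['vectorizer'].transform(newDocs)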
def showUnseenFeatures(self, newData):
    """
    show the new word features extracted from newData but not present
    in the existing text corpus
    @type newData: list object
    @param newData: a list of new documents
    @returns: a tuple (terms_D, tf_D) sorted in descending order of
        term frequency; terms_D is a list of terms and tf_D is a list
        of term frequencies
    """
    # vectorize the new data: only extract TF features
    out = ft.build_features(newData, keyWords=None, max_words=self.Nwords,
                            Stem=True, Bigram=True, Tfidf=False,
                            stopwords=True, Preprocess=True)
    tfMtx = out['tfMtx']
    terms = out['terms']
    # find the terms in newData but not in oldData
    oldTerms = self.featureObj['terms']
    # indices corresponding to the new terms
    ind = [i for i in range(len(terms)) if terms[i] not in oldTerms]
    termsNew = [terms[i] for i in ind]
    tfMtxNew = tfMtx[:, ind]
    # return features ranked by TF over all samples
    return self.showFeatures(tfMtxNew, termsNew)
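# showFeatures is not shown above; a minimal sketch of the ranking it is
# assumed to perform, summing term frequencies over all samples and
# sorting in descending order. tfMtx is assumed to be a scipy sparse
# matrix as produced by ft.build_features.
import numpy as np


def rankByTF(tfMtx, terms):
    tf = np.asarray(tfMtx.sum(axis=0)).ravel()
    order = np.argsort(tf)[::-1]
    return [terms[i] for i in order], [tf[i] for i in order]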
def extract_data(in_file):
    """Extract a set of log files into a set of log events"""
    events = []
    no_timestamp_lines = []
    no_timestamps = 0
    print(in_file)
    with open(in_file) as f:
        for log_line in f:
            try:
                # try to extract the timestamp; the presence of a
                # timestamp delimits a new log event
                timestamp, log_line = timestamp_extract(log_line)
                events.append(parse_event(timestamp, log_line, f))
            except ValueError:
                # we don't have a timestamp! oh no!
                no_timestamps += 1
                no_timestamp_lines.append(log_line)
    #print("Found {0} lines with no timestamp".format(no_timestamps))
    unknown_events = filter(lambda x: isinstance(x, UnknownEvent), events)
    #print("Found {0} unknown log events".format(len(list(unknown_events))))

    subms = events_to_submissions(events)
    problems = set()
    features = build_features(subms)

    # drop samples where fewer than two problems were attempted
    problems_feature = next(f for f in features
                            if isinstance(f, ProblemsAttemptedCumulative))
    indices_to_delete = []
    for i in range(len(features[0].values)):
        if problems_feature.values[i] < 2:
            indices_to_delete.append(i)
    # delete from the end so earlier indices stay valid
    indices_to_delete.reverse()
    for i in indices_to_delete:
        for feature in features:
            del feature.values[i]
            if isinstance(feature, CumulativeStatisticsFeatureBase):
                del feature.max_values[i]
                del feature.min_values[i]
                del feature.mean_values[i]
                del feature.stdev_values[i]

    abandon_state = classify_problems(subms)
    return LogFileData(in_file, features, abandon_state)
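# timestamp_extract is referenced above but not shown. A hypothetical
# sketch of its contract: return (timestamp, rest_of_line) and raise
# ValueError when the line has no leading timestamp, which extract_data
# uses to detect continuation lines. The timestamp format is an assumption.
from datetime import datetime


def timestamp_extract_sketch(log_line):
    head, _, rest = log_line.partition(' ')
    # strptime raises ValueError if head is not a timestamp
    timestamp = datetime.strptime(head, '%Y-%m-%dT%H:%M:%S')
    return timestamp, rest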
########################################################################
# load data: movie reviews
from load_data import load_moviereview_data

dt = load_moviereview_data()
docs = dt['docs']
label = dt['label']
docs = docs[1:5000]
label = label[1:5000]

########################################################################
# build features with the text corpus
out = build_features(docs, keyWords=None, max_words=5000, Stem=True,
                     Bigram=True, Tfidf=True, stopwords=True,
                     Preprocess=True)
features = out['TDM']
terms = out['terms']

########################################################################
# split into training and testing
import random

Nsamp = len(label)
Ntrain = 2000
random.seed(123)
# shuffle features and labels together so they stay aligned
# (assumes the TDM supports row indexing by a list of indices)
ind = list(range(Nsamp))
random.shuffle(ind)
features = features[ind]
label = [label[i] for i in ind]

X_train = features[:Ntrain]
X_test = features[Ntrain:]
y_train = label[:Ntrain]
y_test = label[Ntrain:]
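########################################################################
# a possible next step (not in the original script): fit a simple
# classifier on the split, assuming scikit-learn is installed and the
# TDM is a matrix it accepts
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))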
import settings
from features import build_features, FeatureFunction
from templates import *

a_templates = [WordLengthTemplate]
b_templates = [TagSequenceTemplate]

(a_features, b_features) = build_features(a_templates, b_templates)
print("a_features: ", a_features)
print("b_features: ", b_features)

feature_function = FeatureFunction(a_features, b_features)
for i in range(feature_function.cardinality()):
    print(feature_function.evaluate(i, ["hello"], ["yooo"], 0))
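# The FeatureFunction interface used above is not shown. A hypothetical
# sketch of what it might look like: cardinality() is the total number of
# instantiated features and evaluate(i, words, tags, pos) fires feature i
# on a (words, tags) pair at a position. Everything here is an assumption.
class FeatureFunctionSketch(object):
    def __init__(self, a_features, b_features):
        self.features = list(a_features) + list(b_features)

    def cardinality(self):
        return len(self.features)

    def evaluate(self, i, words, tags, pos):
        # each feature is assumed to be callable on the observation,
        # the tag sequence and a position
        return self.features[i](words, tags, pos)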