def test_wordnik_patterns_match():
    """Check that every rule in patterns.yaml matches its example sentence.

    The fixture maps rule name -> sentence that the rule is expected to fire on.
    """
    from serapis.features import match_wordnik_rules

    with open("serapis/tests/data/patterns.yaml") as f:
        # safe_load instead of yaml.load: load() without an explicit Loader is
        # deprecated since PyYAML 5.1 and can construct arbitrary Python
        # objects; the fixture is a plain mapping, so safe_load suffices.
        test_cases = yaml.safe_load(f)
    for rule, sentence in test_cases.items():
        assert rule in match_wordnik_rules(sentence), \
            "Rule {} does not match '{}'".format(rule, sentence)
def detect(message):
    """Classify every sentence of every URL in *message* as FRD or not.

    *message* must carry a list of URL objects under 'urls', each of which is
    given a 'sentences' list by batch_tag_sentences(). Each sentence dict is
    annotated in place with the model creation date, matched wordnik patterns,
    the binary FRD prediction, and its likelihood. The annotated message is
    then forwarded via write_message('save', ...).
    """
    batch_tag_sentences(message)

    # Unpack the packaged model once, up front.
    packaged = PackagedPipeline().get()
    model_created_at = packaged.metadata['created_at']
    features = packaged._feature_union
    classifier = packaged._pipeline
    # Column of the positive ('1') class in predict_proba output.
    positive_col = np.flatnonzero(classifier.classes_ == 1)[0]

    for url_object in message['urls']:
        readability_score(url_object)
        for sentence in url_object['sentences']:
            cleaned = sentence['s_clean']
            # Keep only the tag part of each "token/TAG" pair.
            pos = ' '.join(tok.split('/', 1)[-1]
                           for tok in sentence['pos_tags'].split())
            X = features.transform({
                's_clean': [sentence['s_clean']],
                'pos': [pos]
            })

            # Metadata, then model outputs.
            sentence['model_creation_date'] = model_created_at
            sentence['patterns'] = match_wordnik_rules(cleaned)
            sentence['frd'] = classifier.predict(X)[0]
            # P(classification as FRD), rounded for compact serialization.
            sentence['frd_likelihood'] = round(
                classifier.predict_proba(X)[0][positive_col], 4)

    return write_message('save', message)
def test_wordnik_patterns_match():
    """Each rule in the patterns.yaml fixture must match its paired sentence."""
    from serapis.features import match_wordnik_rules

    with open("serapis/tests/data/patterns.yaml") as f:
        # yaml.load without a Loader argument is deprecated (PyYAML >= 5.1)
        # and unsafe; the fixture is plain YAML, so use safe_load.
        test_cases = yaml.safe_load(f)
    for rule, sentence in test_cases.items():
        assert rule in match_wordnik_rules(
            sentence), "Rule {} does not match '{}'".format(rule, sentence)
def test_wordnik_patterns_perc():
    """The wordnik rules must cover more than 20% of the FRD fixture set."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    matches = 0.0
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    for term, sentence in test_cases:
        s_clean, _ = clean_sentence(sentence, term)
        if match_wordnik_rules(s_clean):
            matches += 1
    coverage = matches / len(test_cases)
    assert coverage > min_coverage, \
        "Only matched {:.2f}% of data set".format(
            100 * matches / len(test_cases))
def test_wordnik_patterns_perc():
    """Require the wordnik rules to fire on >20% of the CSV fixture rows."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    # Float accumulator so the coverage ratio below is true division.
    matches = 0.0
    for term, raw_sentence in test_cases:
        cleaned, _ = clean_sentence(raw_sentence, term)
        matches += bool(match_wordnik_rules(cleaned))
    assert matches / len(test_cases) > min_coverage, (
        "Only matched {:.2f}% of data set".format(
            100 * matches / len(test_cases)
        )
    )
def detect(message):
    """Takes a message that must contain a list of URL objects, each having
    at least a doc property. This will split the doc of each URL into
    sentences, and determine whether each sentence is an FRD or not.

    Each sentence dict is mutated in place: 'model_creation_date',
    'patterns', 'frd', and 'frd_likelihood' keys are added. The annotated
    message is then handed to write_message('save', ...), whose return
    value is returned.
    """
    # Splits each URL's doc into sentences and POS-tags them, populating
    # url_object['sentences'] — presumably with 's_clean' and 'pos_tags'
    # keys, as read below; verify against batch_tag_sentences.
    batch_tag_sentences(message)
    # Load Models
    model_pipeline = PackagedPipeline().get()
    created_at = model_pipeline.metadata['created_at']
    # NOTE(review): reaching into private attributes of the packaged
    # pipeline; relies on PackagedPipeline internals staying stable.
    feature_union = model_pipeline._feature_union
    model = model_pipeline._pipeline
    class_idx = np.where(
        model.classes_ == 1)[0][0]  # index of '1' pred in .predict_proba
    for url_object in message['urls']:
        readability_score(url_object)
        for sentence in url_object['sentences']:
            sentence_clean = sentence['s_clean']
            # Drop the token and keep only the tag from each "token/TAG"
            # pair (a token with no '/' is kept whole).
            pos = ' '.join([
                i[i.find('/') + 1:] for i in sentence['pos_tags'].split()
            ])  # just pos tags
            # Single-sentence batch: the feature union expects parallel
            # lists under 's_clean' and 'pos'.
            sentence_feature_union = feature_union.transform({
                's_clean': [sentence['s_clean']],
                'pos': [pos]
            })
            # metadata
            sentence['model_creation_date'] = created_at
            # predictions from model
            sentence['patterns'] = match_wordnik_rules(sentence_clean)
            sentence['frd'] = model.predict(sentence_feature_union)[0]
            # Rounded to 4 decimals for compact storage downstream.
            sentence['frd_likelihood'] = round(
                model.predict_proba(sentence_feature_union)[0][class_idx],
                4)  # P(Classification as FRD)
    return write_message('save', message)