def handler(event, context):
    """Lambda entry point: dispatches based on the shape of `event`.

    Two event sources are handled:
      * Direct API calls carrying a 'term' (and optionally a 'sentence'
        plus 'url') -- routed to tasks.detect or tasks.search.
      * S3 notifications carrying 'Records' -- routed to run_task or
        add_words depending on the object key.

    Args:
        event: dict -- Lambda event payload.
        context: Lambda context object (unused).

    Returns:
        Whatever the dispatched task returns, or an error dict for an
        invalid search term. Falls through (returns None) for unknown
        S3 keys.
    """
    if "term" in event:  # API call
        if not qualify_term(event['term']):
            return {'error': 'Invalid search term'}
        message = {
            "word": event['term'],
            'hashslug': hashslug(event['term'])
        }
        if "sentence" in event:  # Detect
            s_clean, variants = clean_sentence(event['sentence'], event['term'])
            message['crawl_date'] = now()
            message['urls'] = [{
                "url": event.get('url'),
                "source": get_source_from_url(event.get('url')),
                "sentences": [{
                    "s": event['sentence'],
                    "s_clean": s_clean,
                }],
                "variants": list(variants)
            }]
            return tasks.detect(message)
        else:  # Search
            return tasks.search(message)
    elif "Records" in event:  # This comes from S3
        for record in event['Records']:
            bucket = record['s3']['bucket']['name']
            key = record['s3']['object']['key']
            # Minimal URL-decode: the only escaped character expected in
            # these keys is ':' (that's my URLDecode).
            key = key.replace("%3A", ":")
            if key.count(":") == 2:
                return run_task(bucket, key)
            elif key.endswith(".wordlist"):
                return add_words(bucket, key)
            else:
                # print() call form works under both Python 2 and 3,
                # unlike the former print statement.
                print("Don't know what to do with '{}'".format(key))
def extract_sentences(self, page_text):
    """Finds all sentences that contain the term or a spelling variant.

    Sets self.sentences and self.variants.

    Args:
        page_text: str

    Returns:
        str -- cleaned page text (qualifying sentences joined by spaces).
    """
    doc = []
    # Track cleaned sentences already collected so membership checks are
    # O(1) instead of rebuilding [s['s_clean'] for s in self.sentences]
    # on every candidate sentence (was O(n^2) overall). Kept in lockstep
    # with appends below, so behavior is unchanged.
    seen_clean = {s['s_clean'] for s in self.sentences}
    for paragraph in page_text.split('\n\n'):
        if not is_english(paragraph):
            continue
        for sentence in paragraph_to_sentences(paragraph, self.term):
            if not qualify_sentence(sentence):
                continue
            doc.append(sentence)
            s_clean, variants = clean_sentence(sentence, self.term)
            if variants and s_clean not in seen_clean:
                seen_clean.add(s_clean)
                self.variants.update(variants)
                self.sentences.append({
                    's': sentence,
                    's_clean': s_clean
                })
    return " ".join(doc)
def test_wordnik_patterns_perc():
    """Wordnik rules must match at least min_coverage of the fixture set."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    hits = 0.0  # float so the coverage ratio is a true division
    for term, sentence in test_cases:
        cleaned, _ = clean_sentence(sentence, term)
        if match_wordnik_rules(cleaned):
            hits += 1
    coverage = hits / len(test_cases)
    assert coverage > min_coverage, \
        "Only matched {:.2f}% of data set".format(100 * coverage)
def test_wordnik_patterns_perc():
    """Asserts that match_wordnik_rules fires on >20% of fixture sentences."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        rows = list(csv.reader(f))
    # clean_sentence returns (s_clean, variants); only s_clean matters here.
    matches = float(sum(
        1 for term, sentence in rows
        if match_wordnik_rules(clean_sentence(sentence, term)[0])
    ))
    assert matches / len(rows) > min_coverage, (
        "Only matched {:.2f}% of data set".format(100 * matches / len(rows)))
def extract_sentences(self, page_text):
    """Collects qualifying sentences that contain the term or a spelling
    variant, updating self.sentences and self.variants as it goes.

    Args:
        page_text: str

    Returns:
        str -- the qualifying sentences joined into one cleaned text.
    """
    kept = []
    for paragraph in page_text.split('\n\n'):
        if not is_english(paragraph):
            continue
        for candidate in paragraph_to_sentences(paragraph, self.term):
            if not qualify_sentence(candidate):
                continue
            kept.append(candidate)
            cleaned, found_variants = clean_sentence(candidate, self.term)
            # Recompute the seen list each time so entries appended during
            # this call are also deduplicated.
            already_seen = cleaned in [s['s_clean'] for s in self.sentences]
            if found_variants and not already_seen:
                self.variants.update(found_variants)
                self.sentences.append({'s': candidate, 's_clean': cleaned})
    return " ".join(kept)