예제 #1
0
def handler(event, context):
    """Dispatch an incoming event to the matching task.

    Two event shapes are recognised:

    * API call -- the event has a ``"term"`` key. The term is validated with
      ``qualify_term``; if an additional ``"sentence"`` key is present the
      event is forwarded to ``tasks.detect``, otherwise to ``tasks.search``.
    * S3 notification -- the event has a ``"Records"`` key. Each record's
      object key is URL-decoded and routed to ``run_task`` (keys containing
      exactly two ``:``) or ``add_words`` (keys ending in ``.wordlist``).

    Args:
        event: Mapping describing the event (see shapes above).
        context: Not used by this function.

    Returns:
        ``{'error': 'Invalid search term'}`` for a rejected term; otherwise
        whatever the dispatched task returns. ``None`` when the event matches
        neither shape or no record key is recognised.
    """
    if "term" in event:  # API call
        if not qualify_term(event['term']):
            return {'error': 'Invalid search term'}
        message = {
            "word": event['term'],
            'hashslug': hashslug(event['term'])
        }
        if "sentence" in event:  # Detect
            s_clean, variants = clean_sentence(event['sentence'], event['term'])
            message['crawl_date'] = now()
            message['urls'] = [{
                "url": event.get('url'),
                "source": get_source_from_url(event.get('url')),
                "sentences": [{
                    "s": event['sentence'],
                    "s_clean": s_clean,
                }],
                "variants": list(variants)
            }]
            return tasks.detect(message)
        else:  # Search
            return tasks.search(message)

    elif "Records" in event:  # This comes from S3
        # Imported locally so the block works on both Python 2 and Python 3
        # without touching the file's import section.
        try:
            from urllib.parse import unquote_plus  # Python 3
        except ImportError:
            from urllib import unquote_plus  # Python 2

        for record in event['Records']:
            bucket = record['s3']['bucket']['name']
            # S3 notifications URL-encode the object key (space -> '+',
            # ':' -> '%3A', ...); decode all escapes, not just '%3A'.
            key = unquote_plus(record['s3']['object']['key'])
            # NOTE: the first record with a recognised key ends the loop;
            # any later records in the same event are not processed.
            if key.count(":") == 2:
                return run_task(bucket, key)
            elif key.endswith(".wordlist"):
                return add_words(bucket, key)
            else:
                # Single-argument call form is valid on Python 2 and 3.
                print("Don't know what to do with '{}'".format(key))
예제 #2
0
파일: extract.py 프로젝트: wordnik/serapis
    def structured(self):
        """Return this object's fields collected into a plain dict.

        The dict always carries term, url, source (derived from the url via
        ``get_source_from_url``), doc, features, variants, sentences, author
        and title. The raw html is added only when ``config.save_html`` is
        truthy.
        """
        payload = dict(
            term=self.term,
            url=self.url,
            source=get_source_from_url(self.url),
            doc=self.text,
            features=self.features,
            # Converted to a list because sets are not JSON serializable.
            variants=list(self.variants),
            sentences=self.sentences,
            author=self.author,
            title=self.title,
        )

        if config.save_html:
            payload["html"] = self.html
        return payload
예제 #3
0
    def structured(self):
        """Build a dict snapshot of this object.

        Keys: term, url, source, doc, features, variants, sentences, author,
        title -- plus html when ``config.save_html`` is truthy. ``source`` is
        whatever ``get_source_from_url`` returns for ``self.url``.
        """
        record = {}
        record["term"] = self.term
        record["url"] = self.url
        record["source"] = get_source_from_url(self.url)
        record["doc"] = self.text
        record["features"] = self.features
        # list() because sets are not JSON serializable.
        record["variants"] = list(self.variants)
        record["sentences"] = self.sentences
        record["author"] = self.author
        record["title"] = self.title

        # Raw html is optional and controlled by configuration.
        if not config.save_html:
            return record
        record["html"] = self.html
        return record