Example #1
class ExtractionListAPI(Resource):
    # curl http://localhost:5000/extractions
    # curl "http://localhost:5000/extractions?page_id=<page_id>"
    def get(self):  # returns all Extractions, or the one Extraction matching page_id
        args = parser.parse_args()
        page_id = args['page_id']
        if not page_id:  # no filter given: return all Extractions
            return [x._to_json(x.id) for x in Extraction.get_all_extractions()]
        else:  # return the Extraction for the requested page_id
            extraction = Extraction.get_extraction_by_page_id(page_id)
            if extraction:
                return extraction._to_json(extraction.id)
            return {}, 400  # bad request: no Extraction for this page_id

    # curl http://localhost:5000/extractions -d "page_id=<page_id>" -X POST -v
    def post(self):  # creates a new Extraction for an existing page
        args = parser.parse_args()
        page_id = args['page_id']
        if not page_id:
            return {}, 400  # bad request: page_id is required
        extraction = Extraction.add_extraction(page_id)
        if extraction:
            return extraction._to_json(extraction.id)
        return {}, 400  # bad request: could not create the Extraction
    
manager.add_document(Extraction)
manager.add_viewdef(Extraction.all_extractions)
manager.add_viewdef(Extraction.extraction_by_page_id)
manager.sync(app)

api.add_resource(ExtractionAPI, '/extractions/<string:_id>')
api.add_resource(ExtractionListAPI, '/extractions')
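
Both handlers above depend on names defined elsewhere in the module: a Flask-RESTful Api, a shared reqparse parser that knows about page_id, and a manager that behaves like Flask-CouchDB's CouchDBManager. A minimal sketch of that scaffolding, treating everything beyond the calls visible above as an assumption:

from flask import Flask
from flask_restful import Api, Resource, reqparse

app = Flask(__name__)
api = Api(app)

# Shared parser: both get() and post() read 'page_id' from it. Accepting
# the argument from either the query string or the form body matches the
# two curl examples above.
parser = reqparse.RequestParser()
parser.add_argument('page_id', type=str, location=['args', 'form'])

With that in place, manager.sync(app) appears to push the registered document classes and view definitions to CouchDB before the resources start serving requests.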
Example #2
    @staticmethod
    def get_by_domain(protocol, domain):
        r = RobotsTxt.robtxt_by_domian(key=[protocol, domain])
        if len(r) > 0:
            # We have already crawled this domain and stored its robots.txt
            # info, so there is no need to request it again.
            for row in r:
                doc = RobotsTxt.load(row.value)
                if doc.is_valid():
                    return doc
            # Every cached copy has expired; fall through and refresh the
            # last document loaded.
        else:
            # First visit to this domain: create a fresh record, fetch the
            # robots.txt info, and store it for current and future reference.
            doc = RobotsTxt(protocol=protocol, domain=domain)
        doc.update()
        return doc
        
manager.add_document(Page)
manager.add_viewdef(Page.all_pages)
manager.add_viewdef(Page.page_by_url)
manager.add_document(RobotsTxt)
manager.add_viewdef(RobotsTxt.robtxt_by_domian)
manager.sync(app)

api.add_resource(PageAPI, '/pages/<string:_id>')
api.add_resource(PageListAPI, '/pages')
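
From a caller's point of view, get_by_domain hides whether the robots.txt record came from the cache or from a fresh fetch, so a crawler can simply ask for it before touching a domain. A short usage sketch; the is_allowed check is hypothetical, since only get_by_domain, is_valid and update appear above:

# Returns a cached document when a valid one exists for this
# protocol/domain pair, otherwise fetches and stores a fresh one.
robots = RobotsTxt.get_by_domain('http', 'www.bbc.com')

# Hypothetical permission check before crawling a path; the real
# RobotsTxt class may expose a different method for this.
if robots.is_allowed('/news'):
    pass  # safe to fetch the path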

########### Helper functions ###########
def unescape(text):
    """Remove HTML or XML character references and entities from a text
    string, while keeping &amp;, &gt; and &lt; intact so markup stays
    escaped in the source.
    """
    def fixup(m):