Exemplo n.º 1
0
    def identify_bio_elements(self, firm, urls):
        """Run the firm's element classifier over each URL and collect the results.

        The training set for ``firm`` is looked up first; its ``id`` names both
        the spider's database file (``<scrape_dir>/<id>.db``) and the classifier
        model file (``<MODEL_DIR>/model/<id>_element.tgm``).

        Args:
            firm: Object exposing a ``domain`` attribute and accepted by
                ``FirmTrainingSet.get_for_firm``.
            urls: Iterable of URLs; each one is fetched with ``spider.get``.

        Returns:
            A list with one plain ``dict`` per URL, in input order. Each dict
            holds whatever ``model.extract`` produced plus a ``'url'`` key
            set to the URL it came from.

        Side effects:
            Progress messages and an indented JSON dump of every result are
            written to ``self.stdout``.
        """
        fts = FirmTrainingSet.get_for_firm(firm)
        spider_db_path = os.path.join(self.scrape_dir, str(fts.id) + ".db")
        spider = Spider(firm.domain, spider_db_path, workers=4, retry_attempts=2)

        self.stdout.write("Identifying bio elements...\n")

        model_path = os.path.join(settings.MODEL_DIR, 'model', str(fts.id) + "_element.tgm")
        model = ElementClassifier(model_path)
        model.load()

        self.stdout.write('Retrieving element features...\n')

        elements_out = []
        for url in urls:
            page = spider.get(url)
            extracted = model.extract(page, format="html")
            extracted['url'] = url

            # Copy into a plain dict before dumping/returning.
            # NOTE(review): presumably extract() returns a dict-like object
            # that json cannot serialize directly -- confirm.
            record = dict(extracted)

            # Write through self.stdout (not a bare ``print`` statement) so the
            # dump follows the same stream as the other messages and the code
            # parses on Python 3. The explicit newline matches what ``print``
            # used to emit.
            self.stdout.write(json.dumps(record, indent=4) + "\n")
            elements_out.append(record)

        self.stdout.write('Done.\n')

        return elements_out