Example #1
    def parse(self, response):
        # Needs: from bs4 import BeautifulSoup; import json.
        # datetime_process is a project helper that normalises date strings.
        soup = BeautifulSoup(response, 'lxml')
        dd = {self.lang: {}}

        for i, event_ in enumerate(soup.select("td.event")):
            dd[self.lang][str(i)] = {}

            # Rewrite the English URL path to the requested language.
            link = event_.select("a.eventtitle")[0].get("href").replace(
                "/en/", "/%s/" % self.lang)

            dd[self.lang][str(i)]['link'] = 'http://parter.ua{}'.format(link)

            for k, v in self.fields.items():
                if k == 'date':
                    # Normalise the raw date string before storing it.
                    data_ = event_.select(v)[0].get_text()
                    dd[self.lang][str(i)][k] = datetime_process(data_)
                else:
                    dd[self.lang][str(i)][k] = event_.select(v)[0].get_text()

        with open(self.out_path, 'w', encoding='utf-8') as file:
            json.dump(dd, file, ensure_ascii=False)
        return self.out_path
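
The loop over self.fields above assumes a mapping from output field names to
CSS selectors. A minimal sketch of that shape (the 'date' and 'place'
selectors are illustrative placeholders, not taken from the source):

    # Hypothetical field mapping: output field name -> CSS selector
    # applied to each td.event element.
    fields = {
        'title': 'a.eventtitle',
        'date': 'span.date',    # placeholder selector
        'place': 'span.place',  # placeholder selector
    }
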
Example #2
    def parse_posts(self, results):
        # Needs: from bs4 import BeautifulSoup; import json; import re.
        for result in results:
            soup = BeautifulSoup(result.text, 'lxml')

            for i, event_ in enumerate(soup.select("div.col-sm-4")):
                link = event_.select("div.b-title > a")[0].get('href')
                # Two-letter language code between slashes, e.g. 'en'
                # from '/en/some-event'.
                cur_language = re.search(r'(?<=/)[a-z]{2}(?=/)', link).group()

                self.dd[cur_language][i]['link'] = link

                for key, val in self.fields.items():
                    try:
                        self.dd[cur_language][i][key] = (
                            event_.select(val)[0].get_text().strip())
                    except (IndexError, KeyError):
                        # Selector matched nothing, or the language
                        # bucket is missing; skip this field.
                        pass

                    # Once the last field (highest key alphabetically)
                    # has been handled, normalise the date values.
                    if key == max(self.fields):
                        self.dd[cur_language][i] = datetime_process(
                            dict(self.dd[cur_language][i]))

        with open(self.out_path, 'w', encoding='utf-8') as file:
            json.dump(self.dd, file)

        if self.dd:
            # Import the scraped data into the database.
            import_ = ImportDb(self.out_path)
            import_.import_to_db()
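
The assignments to self.dd[cur_language][i] assume both nesting levels
already exist. A minimal sketch of a compatible structure, assuming a
two-level autovivifying dict keyed by language code and then event index
(the initializer itself is not shown in the source):

    from collections import defaultdict

    # Each missing language key yields a dict of per-event dicts, so
    # dd[lang][i][field] can be assigned without prior setup.
    dd = defaultdict(lambda: defaultdict(dict))
    dd['en'][0]['link'] = '/en/some-event'
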
Example #3
    def parse_caribbean(self):
        # Needs: import json, re, socket; import grequests;
        # from gevent import monkey; from importlib import reload;
        # from bs4 import BeautifulSoup.
        monkey.patch_socket()
        # Fetch all URLs concurrently, at most three at a time.
        self.results = grequests.map((grequests.get(u) for u in self.urls),
                                     exception_handler=exception,
                                     size=3)
        # grequests has done its job; restore the unpatched socket module.
        reload(socket)

        for result in self.results:
            soup = BeautifulSoup(result.text, 'lxml')

            for i, event_ in enumerate(soup.select(self.soup_select)):
                link = event_.select(self.link_select)[0].get('href')
                cur_language = re.search(self.lang_regex, link).group()

                self.d[cur_language][i]['link'] = link

                for key, val in self.fields.items():
                    try:
                        self.d[cur_language][i][key] = (
                            event_.select(val)[0].get_text().strip())
                    except (IndexError, KeyError):
                        # Selector matched nothing, or the language
                        # bucket is missing; skip this field.
                        pass

                    # Once the last field has been handled, normalise
                    # the date values.
                    if key == max(self.fields):
                        self.d[cur_language][i] = datetime_process(
                            dict(self.d[cur_language][i]))

        with open(self.out_path, 'w', encoding='utf-8') as file:
            json.dump(self.d, file, ensure_ascii=False)
        return self.out_path
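
grequests.map above drives a generator of prepared requests through a gevent
pool. A standalone sketch of the same fetch pattern, with placeholder URLs
and a placeholder exception handler (neither appears in the source):

    import grequests

    def exception(request, exc):
        # Called by grequests.map for any request that fails.
        print('request to %s failed: %s' % (request.url, exc))

    urls = ['http://example.com/en/', 'http://example.com/ru/']
    results = grequests.map((grequests.get(u) for u in urls),
                            exception_handler=exception,
                            size=3)  # at most three concurrent requests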