Exemplo n.º 1
0
def import_orgs():
    """Load organization rows from parquet files into the session.

    Every path returned by ``get_parquet_file_paths()`` that starts with
    ``'profile_organization'`` is read into a pandas DataFrame. For each
    row a new ``Organization`` is created with a freshly generated UUID4
    hex id, populated from the row's columns, and added to ``session``.
    Null ``starts_at`` / ``ends_at`` values are stored as ``None``.

    The session is committed once per file, after all of that file's rows
    have been added.
    """
    org_paths = (
        path for path in get_parquet_file_paths()
        if path.startswith('profile_organization')
    )
    for org_path in org_paths:
        frame = pq.read_table(org_path).to_pandas()
        for label, record in frame.iterrows():
            print(f"org {label}")

            org = Organization()
            org.id = uuid.uuid4().hex
            org.created = record['created']
            org.updated = record['updated']
            org.profile_id = record['profile_id']

            # Pandas null markers (NaN / NaT) are replaced by a plain None.
            starts_at = record['starts_at']
            org.starts_at = None if pd.isnull(starts_at) else starts_at
            ends_at = record['ends_at']
            org.ends_at = None if pd.isnull(ends_at) else ends_at

            org.name = record['name']
            org.title = record['title']
            org.description = record['description']
            session.add(org)

        # One commit per parquet file.
        session.commit()
    def spider_close(self):
        """
        Finish the data collection process and save the collected data.

        For every domain key of ``self.organization_processing_by_domain``:

        * the domain's entry in ``self.collected_data_by_domain`` gets its
          ``resources_array`` serialized to UTF-8 encoded JSON, stored in
          ``resources``, and is saved;
        * a new ``Organization`` named after the domain is created and saved;
        * both objects are attached to the domain's processing record, which
          is then saved as well.
        """
        logging.info("Spider closed")
        logging.info("Saving collected data")

        processing_by_domain = self.organization_processing_by_domain
        for domain in processing_by_domain.keys():
            logging.info("Saving for " + domain)

            data = self.collected_data_by_domain[domain]
            resources_json = json.dumps(data.resources_array)
            data.resources = resources_json.encode("utf-8")
            data.save()

            org = Organization()
            org.name = domain
            org.save()

            # Link the freshly saved objects to this domain's record.
            record = processing_by_domain[domain]
            record.organization = org
            record.collected_data = data
            record.save()