Exemplo n.º 1
0
    def add_edge(self, src_sid, dst_sid, label):
        """Add an edge between two SIDs to the current session.

        Both SIDs are translated to ``EdgeLookup`` ids scoped to
        ``self.ad_id``. When a SID has no lookup row, one is created with
        ``'unknown'`` as its third constructor argument and committed
        immediately so that its id can be read back. The ``Edge`` itself is
        only added to the session; this method does not commit it.

        Args:
            src_sid: SID of the edge's source object.
            dst_sid: SID of the edge's destination object.
            label: Label passed through to the ``Edge`` constructor.
        """
        self.total_edges += 1

        def resolve(sid):
            # Look up the id of the EdgeLookup row for this SID in this AD.
            row = self.session.query(EdgeLookup.id).filter_by(
                oid=sid).filter(EdgeLookup.ad_id == self.ad_id).first()
            if row is not None:
                return row[0]

            # This should not happen: the SID was expected to be registered
            # already. Create a placeholder row so the edge can still be
            # stored, and commit + refresh to obtain its generated id.
            placeholder = EdgeLookup(self.ad_id, sid, 'unknown')
            self.session.add(placeholder)
            self.session.commit()
            self.session.refresh(placeholder)
            return placeholder.id

        # Source is resolved before destination, as any placeholder commit
        # happens inside the resolver.
        source_id = resolve(src_sid)
        target_id = resolve(dst_sid)

        self.session.add(Edge(self.ad_id, source_id, target_id, label))
Exemplo n.º 2
0
	def add_edge(self, src_sid, dst_sid, label, with_boost = False):
		"""Add an edge between two SIDs to the session, committing in batches.

		Both SIDs are translated with ``self.get_id_for_sid`` (source first,
		then destination), forwarding ``with_boost`` unchanged. The session
		is committed each time the running edge counter reaches a multiple
		of 10000; otherwise the edge is left pending.

		Args:
			src_sid: SID of the edge's source object.
			dst_sid: SID of the edge's destination object.
			label: Label passed through to the ``Edge`` constructor.
			with_boost: Forwarded as-is to ``self.get_id_for_sid``.
		"""
		self.total_edges += 1

		# The generator is consumed left to right, so the source SID is
		# resolved before the destination SID.
		source_id, target_id = (
			self.get_id_for_sid(sid, with_boost = with_boost)
			for sid in (src_sid, dst_sid)
		)

		self.session.add(
			Edge(self.ad_id, self.graph_id, source_id, target_id, label))

		# Flush to the database once every 10000 edges.
		if not self.total_edges % 10000:
			self.session.commit()
Exemplo n.º 3
0
    async def calc_sds_mp(self):
        """Calculate security-descriptor (SD) edges and upload them to the DB.

        The work happens in two phases:

        1. All ``JackDawSD`` rows belonging to ``self.ad_id`` are streamed in
           windows of ``self.buffer_size`` and passed, batch by batch, to
           ``self.calc_sds_batch`` together with a temporary file.
        2. The temporary file is rewound and read line by line. Each line is
           split on ``,`` into ``src_id, dst_id, label`` plus one ignored
           field, turned into an ``Edge`` and added to the session, which is
           committed every ``self.buffer_size * 100`` lines and once more at
           the end.

        Progress is reported through ``self.progress_queue`` (when it is not
        ``None``) and through tqdm bars (when ``self.show_progress`` is True).

        Returns:
            tuple: ``(True, None)`` on success, ``(False, exc)`` when an
            exception occurred. Exceptions are logged and never propagated,
            except for a failure of the very first ``self.log_msg`` call,
            which happens outside the guarded section.
        """

        def new_msg(progress_type, msg_type):
            # Every progress message carries the same identifying fields.
            msg = GathererProgress()
            msg.type = progress_type
            msg.msg_type = msg_type
            msg.adid = self.ad_id
            msg.domain_name = self.domain_name
            return msg

        def progress_msg(progress_type, total, finished, processed, step_size):
            # Build a PROGRESS message; speed is derived from the time
            # elapsed since the previous progress report and is left unset
            # when no measurable time has passed.
            now = datetime.datetime.utcnow()
            td = (now - self.progress_last_updated).total_seconds()
            self.progress_last_updated = now
            msg = new_msg(progress_type, MSGTYPE.PROGRESS)
            msg.total = total
            msg.total_finished = finished
            if td > 0:
                msg.speed = str(processed // td)
            msg.step_size = step_size
            return msg

        await self.log_msg('Calculating SD edges')
        logger.debug('starting calc_sds_mp')
        testfile = None
        try:
            total = self.session.query(func.count(
                JackDawSD.id)).filter(JackDawSD.ad_id == self.ad_id).scalar()
            logger.debug('calc_sds_mp total SDs %s', total)
            q = self.session.query(JackDawSD).filter_by(ad_id=self.ad_id)

            if self.progress_queue is not None:
                await self.progress_queue.put(
                    new_msg(GathererProgressType.SDCALC, MSGTYPE.STARTED))

            sdcalc_pbar = None
            if self.show_progress is True:
                sdcalc_pbar = tqdm(desc='Writing SD edges to file',
                                   total=total)

            # Scratch file shared by both phases; closed in the outer
            # ``finally`` so it is released on success and on error alike.
            testfile = tempfile.TemporaryFile('w+', newline='')
            buffer = []
            if self.mp_pool is None:
                self.mp_pool = mp.Pool()

            logger.debug('calc_sds_mp starting calc')
            tf = 0
            last_stat_cnt = 0
            try:
                for adsd in windowed_query(q, JackDawSD.id, self.buffer_size):
                    tf += 1
                    # Rebuild the object from its dict form.
                    # NOTE(review): presumably this detaches it from the
                    # session before it is handed to the batch calculation;
                    # confirm.
                    adsd = JackDawSD.from_dict(adsd.to_dict())
                    buffer.append(adsd)
                    if len(buffer) == self.buffer_size:
                        self.calc_sds_batch(buffer, testfile)
                        buffer = []

                        if sdcalc_pbar is not None:
                            sdcalc_pbar.update(self.buffer_size)

                    if self.progress_queue is not None and tf % self.progress_step_size == 0:
                        last_stat_cnt += self.progress_step_size
                        await self.progress_queue.put(progress_msg(
                            GathererProgressType.SDCALC, total, tf,
                            self.progress_step_size,
                            self.progress_step_size))
                        # Yield to the event loop between reports.
                        await asyncio.sleep(0)

                # Flush the final, partially filled buffer.
                if len(buffer) > 0:
                    self.calc_sds_batch(buffer, testfile)
                    if sdcalc_pbar is not None:
                        # Account for the tail batch so the bar reaches
                        # the total.
                        sdcalc_pbar.update(len(buffer))
                    if self.progress_queue is not None:
                        await self.progress_queue.put(progress_msg(
                            GathererProgressType.SDCALC, total, tf,
                            len(buffer), tf - last_stat_cnt))
                        await asyncio.sleep(0)

                    buffer = []

                if self.progress_queue is not None:
                    await self.progress_queue.put(
                        new_msg(GathererProgressType.SDCALC,
                                MSGTYPE.FINISHED))

                if self.show_progress is True and sdcalc_pbar is not None:
                    sdcalc_pbar.refresh()
                    sdcalc_pbar.disable = True

            except Exception:
                logger.exception('SD calc exception!')
                # Bare raise keeps the original traceback; the outer handler
                # converts the exception into the (False, exc) result.
                raise
            finally:
                # Only close the pool when this object owns it.
                if self.foreign_pool is False:
                    self.mp_pool.close()

            if self.progress_queue is not None:
                await self.progress_queue.put(
                    new_msg(GathererProgressType.SDCALCUPLOAD,
                            MSGTYPE.STARTED))

            logger.debug('Writing SD edge file contents to DB')
            await self.log_msg('Writing SD edge file contents to DB')
            sdcalcupload_pbar = None
            if self.show_progress is True:
                # NOTE(review): self.sd_edges_written is assumed to hold the
                # number of lines written to the scratch file (it is not
                # updated in this method); confirm against calc_sds_batch.
                sdcalcupload_pbar = tqdm(
                    desc='Writing SD edge file contents to DB',
                    total=self.sd_edges_written)

            testfile.seek(0, 0)
            last_stat_cnt = 0
            i = 0

            for line in testfile:
                i += 1
                line = line.strip()
                # Four comma-separated fields per line; the last is unused.
                src_id, dst_id, label, _ = line.split(',')
                edge = Edge(self.ad_id, self.graph_id, src_id, dst_id, label)
                self.session.add(edge)
                if i % (self.buffer_size * 100) == 0:
                    self.session.commit()

                if sdcalcupload_pbar is not None:
                    sdcalcupload_pbar.update()

                if self.progress_queue is not None and i % self.progress_step_size == 0:
                    last_stat_cnt += self.progress_step_size
                    await self.progress_queue.put(progress_msg(
                        GathererProgressType.SDCALCUPLOAD,
                        self.sd_edges_written, i,
                        self.progress_step_size, self.progress_step_size))
                    await asyncio.sleep(0)

            self.session.commit()

            if self.progress_queue is not None:
                # Report the remainder since the last in-loop report, using
                # the same total as the in-loop messages.
                await self.progress_queue.put(progress_msg(
                    GathererProgressType.SDCALCUPLOAD,
                    self.sd_edges_written, i,
                    i - last_stat_cnt, i - last_stat_cnt))
                await asyncio.sleep(0)

            if self.progress_queue is not None:
                await self.progress_queue.put(
                    new_msg(GathererProgressType.SDCALCUPLOAD,
                            MSGTYPE.FINISHED))

            if self.show_progress is True and sdcalcupload_pbar is not None:
                sdcalcupload_pbar.refresh()
                sdcalcupload_pbar.disable = True

            return True, None
        except Exception as e:
            logger.exception('sdcalc!')
            return False, e
        finally:
            # Release the scratch file whether or not the run succeeded.
            if testfile is not None:
                testfile.close()
Exemplo n.º 4
0
    async def store_file_data(self):
        """Upload the records stored in the token file to the DB as edges.

        ``self.token_file`` is closed, then ``self.token_file_path`` is
        reopened as a gzip file and read line by line. Each line is parsed
        with ``JackDawTokenGroup.from_json``; its ``sid`` and ``member_sid``
        are translated with ``self.sid_to_id_lookup`` and stored as an
        ``Edge`` labelled ``'member'``. The session is committed every 10000
        edges and once more at the end.

        Progress is reported through ``self.progress_queue`` (when it is not
        ``None``) and a tqdm bar (when ``self.show_progress`` is True). The
        token file is removed on exit, whether or not the upload succeeded.

        Returns:
            tuple: ``(True, None)`` on success, ``(None, exc)`` when an
            exception occurred. Exceptions are logged and, when a progress
            queue is set, also reported there as an ERROR message.
        """

        def new_msg(msg_type):
            # Every message of this stage carries the same identifying fields.
            msg = GathererProgress()
            msg.type = GathererProgressType.MEMBERSUPLOAD
            msg.msg_type = msg_type
            msg.adid = self.ad_id
            msg.domain_name = self.domain_name
            return msg

        def progress_msg(finished, processed, step_size):
            # Build a PROGRESS message; speed is derived from the time
            # elapsed since the previous progress report and is left unset
            # when no measurable time has passed.
            now = datetime.datetime.utcnow()
            td = (now - self.progress_last_updated).total_seconds()
            self.progress_last_updated = now
            msg = new_msg(MSGTYPE.PROGRESS)
            msg.total = self.member_finish_ctr
            msg.total_finished = finished
            if td > 0:
                msg.speed = str(processed // td)
            msg.step_size = step_size
            return msg

        try:
            if self.progress_queue is not None:
                await self.progress_queue.put(new_msg(MSGTYPE.STARTED))

            if self.show_progress is True:
                self.upload_pbar = tqdm(desc='Uploading memberships to DB',
                                        total=self.member_finish_ctr)

            # Close the write handle before the file is reopened for reading.
            self.token_file.close()
            cnt = 0
            last_stat_cnt = 0
            with gzip.GzipFile(self.token_file_path, 'r') as f:
                for line in f:
                    sd = JackDawTokenGroup.from_json(line.strip())
                    src_id = self.sid_to_id_lookup(sd.sid, sd.ad_id,
                                                   sd.object_type)
                    dst_id = self.sid_to_id_lookup(sd.member_sid, sd.ad_id,
                                                   sd.object_type)

                    edge = Edge(sd.ad_id, self.graph_id, src_id, dst_id,
                                'member')

                    self.session.add(edge)
                    # Yield to the event loop after every record.
                    await asyncio.sleep(0)
                    cnt += 1
                    if cnt % 10000 == 0:
                        self.session.commit()

                    if self.show_progress is True:
                        self.upload_pbar.update()

                    if self.progress_queue is not None and cnt % self.progress_step_size == 0:
                        last_stat_cnt += self.progress_step_size
                        await self.progress_queue.put(progress_msg(
                            cnt, self.progress_step_size,
                            self.progress_step_size))

            if self.progress_queue is not None:
                # Report the remainder since the last in-loop report.
                remainder = self.member_finish_ctr - last_stat_cnt
                await self.progress_queue.put(
                    progress_msg(cnt, remainder, remainder))

            self.session.commit()
            if self.progress_queue is not None:
                await self.progress_queue.put(new_msg(MSGTYPE.FINISHED))

            return True, None

        except Exception as e:
            logger.exception(
                'Error while uploading memberships from file to DB')
            if self.progress_queue is not None:
                msg = new_msg(MSGTYPE.ERROR)
                msg.error = e
                await self.progress_queue.put(msg)

            return None, e
        finally:
            if self.token_file_path is not None:
                try:
                    os.remove(self.token_file_path)
                except OSError as err:
                    # Cleanup is best-effort: a file that is missing or
                    # cannot be removed must not mask the upload result.
                    # Only OSError is swallowed so that cancellation and
                    # interpreter-exit signals are not suppressed.
                    logger.debug('Could not remove token file %s: %s',
                                 self.token_file_path, err)