Example #1
def saver_queue(q2, number_of_workers):
    engine = create_engine(URL(**CONF['database']['connect_args']),
                           pool_size=1,
                           pool_recycle=CONF['database']['pool_recycle'],
                           client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    workers_status = [1] * number_of_workers  # 1 = worker still running, 0 = worker finished
    while True:
        pid, status, uusers = q2.get()
        if status == 'STOP':
            logger.info(
                'Saver process: STOP sign of worker process %s received from q2',
                pid)
            workers_status[pid] = 0
            if sum(workers_status) == 0:
                logger.warning('All STOP signs received from q2.')
                logger.warning('Data saving task done!')
                break
        else:
            logger.info('Saver process: size of uusers is %s', len(uusers))
            stmt_do_nothing = insert(TwitterUserUnion).values(
                uusers).on_conflict_do_nothing(index_elements=['raw_id'])
            session.execute(stmt_do_nothing)
            session.commit()
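
The saver deduplicates with PostgreSQL's INSERT ... ON CONFLICT DO NOTHING through SQLAlchemy's dialect-specific insert() construct. Below is a minimal, self-contained sketch of that pattern; the users table, its columns and the connection string are illustrative assumptions standing in for the TwitterUserUnion model used above, not part of the original project.

from sqlalchemy import create_engine, MetaData, Table, Column, BigInteger, String
from sqlalchemy.dialects.postgresql import insert

# Hypothetical table standing in for TwitterUserUnion; raw_id is the unique key.
metadata = MetaData()
users = Table(
    'users', metadata,
    Column('raw_id', BigInteger, primary_key=True),
    Column('screen_name', String),
)

engine = create_engine('postgresql://localhost/example')  # assumed DSN
metadata.create_all(engine)

rows = [
    {'raw_id': 1, 'screen_name': 'alice'},
    {'raw_id': 2, 'screen_name': 'bob'},
]

# Rows whose raw_id already exists are silently skipped instead of raising
# IntegrityError, which is what lets the saver re-insert the same users safely.
stmt = insert(users).values(rows).on_conflict_do_nothing(index_elements=['raw_id'])
with engine.begin() as conn:
    conn.execute(stmt)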
Example #2
def run(cls, args):
    try:
        # print(args)
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        sys.exit(e)
    session = Session()
    # make sure the Lucene VM is initialized and attached to this thread
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    if args['--index'] is True:
        configure_logging(
            'lucene.index', console_level=args['--console-log-level'])
        # get or create the MetaInfo row that tracks the last indexed group_id
        mgid = get_or_create_m(
            session,
            MetaInfo,
            data=dict(
                name='article_group_id_lucene_index',
                value='0',
                value_type='int',
                description='article.group_id used for lucene index'),
            fb_uk='name')
        if args['--mode'] == 'create':
            mgid.set_value(0)
            session.commit()
        logger.debug('Indexing started.. Getting articles..')
        q = """
        SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
            a.canonical_url,
            a.title, a.meta, a.content,
            coalesce(a.date_published, a.date_captured) AS pd,
            s.domain, s.site_type
        FROM article AS a
            JOIN site AS s ON s.id=a.site_id
        WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
            AND a.group_id>:gid
        ORDER BY group_id, pd ASC
        """
        articles_iter = session.execute(
            sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
        cls.index(session, args['--mode'], articles_iter, mgid)
    elif args['--search'] is True:
        configure_logging(
            'lucene.search', console_level=args['--console-log-level'])
        cls.search(args['--query'], args['--top'])
    else:
        print("Unrecognized command!")
        sys.exit(2)
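
run() assumes a docopt-style args dict validated by a class-level args_schema built with the schema package. A rough sketch of what that validation step might look like follows; the option names and coercions are illustrative assumptions, not the project's real schema.

from schema import Schema, Use, Or, Optional, SchemaError

# Hypothetical schema: coerce numeric options, pass other docopt keys through.
args_schema = Schema({
    Optional('--top'): Or(None, Use(int)),
    Optional('--console-log-level'): Or(None, str),
    Optional(str): object,
})

try:
    args = args_schema.validate({'--top': '20',
                                 '--index': True,
                                 '--console-log-level': 'debug'})
except SchemaError as e:
    raise SystemExit(e)

print(args['--top'] + 5)  # '--top' has been coerced to int: prints 25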
Example #3
def workers_queue(pid, q1, q2):
    """Receiving parameters from q1, then computing and finally put results
       into q2
    """
    engine = create_engine(URL(**CONF['database']['connect_args']),
                           pool_size=1,
                           pool_recycle=CONF['database']['pool_recycle'],
                           client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    parser = BulkParser(platform_id=1, save_none_url_tweet=True)

    while True:
        try:
            data = q1.get(timeout=1)
        except Empty:
            logger.info('Worker process %s: queue has been empty for 1 second', pid)
            q2.put((pid, 'STOP', None))
            break
        if data == 'STOP':
            logger.info('Worker process %s: STOP sign received from q1!', pid)
            q1.put('STOP')
            q2.put((pid, 'STOP', None))
            break
        else:
            logger.info('Worker process %s: data=%s received', pid, data)
        w_open_left, w_close_right = data
        jds = dict()  # tweet id -> raw tweet JSON
        g_urls_map = dict()  # url string -> url table id
        query = """
            SELECT tw.id, tw.json_data, u.id, u.raw
            FROM tweet AS tw
            LEFT JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
            LEFT JOIN url AS u ON u.id=atu.url_id
            WHERE tw.id>:l AND tw.id<=:r
            """
        for tw_id, jd, url_id, url in engine.execute(
                text(query).bindparams(l=w_open_left, r=w_close_right)):
            jds[tw_id] = jd
            if url_id is not None:
                g_urls_map[url] = url_id
        g_uusers_set = set()
        g_edges_set = set()
        for tw_id, jd in jds.items():
            parser.parse_existed_one(tw_id,
                                     jd,
                                     session,
                                     g_urls_map=g_urls_map,
                                     g_uusers_set=g_uusers_set,
                                     g_edges_set=g_edges_set)
        edges = [
            dict(tweet_raw_id=t0,
                 from_raw_id=t1,
                 to_raw_id=t2,
                 url_id=t3,
                 is_quoted_url=t4,
                 is_mention=t5,
                 tweet_type=t6) for t0, t1, t2, t3, t4, t5, t6 in g_edges_set
            if t3 != -1
        ]
        uusers = [dict(raw_id=t1, screen_name=t2) for t1, t2 in g_uusers_set]
        # session.bulk_insert_mappings(TwitterNetworkEdge, edges)
        stmt_do_nothing = insert(TwitterNetworkEdge).values(
            edges).on_conflict_do_nothing(index_elements=[
                'tweet_raw_id', 'from_raw_id', 'to_raw_id', 'url_id',
                'is_quoted_url', 'is_mention', 'tweet_type'
            ])
        session.execute(stmt_do_nothing)
        session.commit()
        q2.put((pid, 'RUN', uusers))
        logger.info('Worker process %s: tweets from %s to %s done', pid,
                    w_open_left + 1, w_close_right)
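
Example #1 (the saver) and Example #3 (the workers) are the two halves of a producer/consumer pipeline over two multiprocessing queues: workers pull (left, right] id windows from q1 and push parsed users to q2, and the saver drains q2 until every worker has reported STOP. A rough sketch of how they could be wired together, assuming illustrative id windows and worker count rather than values from the original project:

from multiprocessing import Process, Queue

# workers_queue and saver_queue are the functions from Examples #3 and #1.
if __name__ == '__main__':
    number_of_workers = 4           # assumed worker count
    q1 = Queue()                    # (left, right] tweet-id windows for workers
    q2 = Queue()                    # parsed user rows flowing back to the saver

    # Hypothetical windows; the real values would come from the tweet table.
    for window in [(0, 10000), (10000, 20000), (20000, 30000)]:
        q1.put(window)
    q1.put('STOP')                  # each worker re-puts this so siblings see it too

    workers = [Process(target=workers_queue, args=(pid, q1, q2))
               for pid in range(number_of_workers)]
    saver = Process(target=saver_queue, args=(q2, number_of_workers))

    for p in workers + [saver]:
        p.start()
    for p in workers + [saver]:
        p.join()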