Пример #1
0
def dedup_tags():
    """Remove duplicate rows from the ``tag`` table.

    Two rows count as duplicates when they agree on status_id, tag,
    category and regex; only the row with the largest id survives.
    """
    query = '''DELETE FROM tag USING tag t WHERE
        tag.status_id = t.status_id AND
        tag.tag = t.tag AND
        tag.category = t.category AND
        tag.regex = t.regex AND
        tag.id < t.id'''
    engine.query(query)
Пример #2
0
def dedup_tags():
    # Self-join dedup on "tag": within each group of rows sharing
    # (status_id, tag, category, regex) every row whose id is smaller
    # than some other group member's id is deleted, keeping the max-id row.
    engine.query(
        '''DELETE FROM tag USING tag t WHERE
        tag.status_id = t.status_id AND
        tag.tag = t.tag AND
        tag.category = t.category AND
        tag.regex = t.regex AND
        tag.id < t.id''')
Пример #3
0
def dump_hashtag(tag):
    data = []

    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = q.join(hashtags_tbl, status_tbl.c.id == hashtags_tbl.c.status_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(hashtags_tbl.c.text.ilike(tag))
    q = q.order_by(hashtags_tbl.c.status_id.asc())
    
    statuses = []
    for row in engine.query(q):
        data.append(row)
        #data.append(json.loads(row['raw_json']))
    #for json_file in os.listdir('dumps'):
    #    print json_file, len(statuses), len(data)
    #    #min_id = int(json_file.split('.', 1)[0].split('_', 1)[-1])
    #    fh = open('dumps/%s' % json_file, 'rb')
    #    ss = json.load(fh)
    #    for s in ss:
    #        if s.get('id') in statuses:
    #            data.append(s)
    
    log.info("Saving file...")
    fh = open('dump_%s.json' % tag, 'wb')
    print len(data)
    json.dump(data, fh, cls=JSONEncoder)
    fh.close()
    return True
Пример #4
0
def classify_tweets(rules):
    """Tag recent German-language statuses against the rule regexes.

    Pages through statuses (lang 'de', newer than 28 days, id >= the
    smallest saved per-regex offset) in PAGE_SIZE chunks.  Each page runs
    in its own transaction; after every page the per-regex offsets are
    advanced to the highest status id seen so a rerun can resume.
    Finishes by removing duplicate tags.
    """
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    # Only the columns the matcher needs, not the full rows.
    fields = [status_tbl.c.id, status_tbl.c.text, user_tbl.c.id, user_tbl.c.name, user_tbl.c.screen_name]
    q = sql.select(fields, from_obj=q, use_labels=True)
    dt = datetime.utcnow() - timedelta(days=28)
    # Resume point: the minimum offset across all regexes, so no regex
    # misses statuses it has not yet processed.
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values()),
                         status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())

    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            # NOTE(review): breaks with the transaction opened by
            # engine.begin() neither committed nor rolled back — confirm
            # the engine wrapper tolerates a dangling empty transaction.
            break
        # Persist progress for every regex before committing the page.
        for regex in regexen:
            offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
        engine.commit()
    dedup_tags()
Пример #5
0
def classify_tweets():
    """Scan German statuses since the lowest saved offset and tag matches.

    Each (field, compiled-regex) rule is tested against the lower-cased
    field value of every status; hits are inserted into the tag table.
    The whole scan runs inside a single transaction; per-regex offsets are
    advanced to the highest status id seen.  Ends with tag deduplication.
    """
    rules, regexen = get_rules()
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    engine.begin()
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values())))
    q = q.order_by(status_tbl.c.id.desc())
    for i, status in enumerate(engine.query(q)):
        max_id = max(max_id, status.get('status_id'))
        for (field, rule), data in rules.items():
            # Skip statuses this regex has already processed.
            # NOTE(review): when no offset exists, .get() returns None and
            # Python 2 evaluates None > int as False (not skipped) — this
            # comparison would raise under Python 3.
            if offsets.get(data.get('regex')) > status.get('status_id'):
                continue
            m = rule.search(unicode(status.get(field)).lower())
            #print [field,data.get('regex'), m]
            if m is not None:
                #print [field, data.get('regex'), m]
                data['status_id'] = status['status_id']
                tag_table.insert(data)
        if i % 1000 == 0:
            print 'Processed: ', i
    for regex in regexen:
        offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
    engine.commit()
    dedup_tags()
Пример #6
0
def classify_tweets(rules):
    """Classify statuses entirely in SQL, one transaction per rule.

    For every rule the INSERT..SELECT matches the rule's regex (~*,
    case-insensitive) against status text, user name and screen name of
    German statuses from the last 28 days, starting past the rule's saved
    offset.  The offset row is then rebuilt from the newest tagged status.
    """
    regexen = [d.get('regex') for (a, d) in rules.items()]
    # NOTE(review): offsets is never used here — the SQL reads tag_offset
    # directly; confirm get_offsets() has no needed side effect.
    offsets = get_offsets(regexen)
    # NOTE(review): other variants pass regexen, not rules — confirm
    # delete_old_tags accepts the full rules mapping.
    delete_old_tags(rules)

    # Tag every not-yet-seen matching status for the bound :regex.
    q = text("""
        INSERT INTO tag (category, tag, status_id, classified_at, regex) 
        SELECT :category, :tag, s.id, NOW(), :regex
            FROM status s
            LEFT JOIN tag_offset tgo ON tgo.regex = :regex
            LEFT JOIN "user" u ON s.user_id = u.id
            WHERE
                (s.id > tgo.status_id OR tgo.status_id IS NULL) AND
                (s.text ~* :regex
                 OR u.name ~* :regex
                 OR u.screen_name ~* :regex)
                AND s.lang = 'de'
                AND s.created_at > NOW() - INTERVAL '28 days'
        """)

    # Record the highest tagged status id as the regex's new offset.
    offsets_q = text("""
        INSERT INTO tag_offset (regex, status_id)
            SELECT :regex, t.status_id
                FROM tag t
                WHERE t.regex = :regex 
                ORDER BY t.status_id DESC
                LIMIT 1
        """)

    for rule in rules.values():
        print rule
        engine.begin()
        engine.query(q, **rule)
        # Delete + re-insert replaces the offset row atomically within
        # this transaction.
        offset_table.delete(regex=rule['regex'])
        engine.query(offsets_q, regex=rule['regex'])
        engine.commit()

    dedup_tags()
Пример #7
0
def classify_tweets():
    rules = get_rules()
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    #engine.begin()
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(user_tbl.c.lang == 'de')
    q = q.order_by(status_tbl.c.id.desc())
    for i, status in enumerate(engine.query(q)):
        for (field, rule), (category, tag) in rules.items():
            m = rule.search(unicode(status.get(field)).lower())
            if m is not None:
                tag_status(status, category, tag)
        if i % 1000 == 0:
            print 'Processed: ', i
        #engine.commit()
    dedup_tags()
Пример #8
0
def classify_tweets(rules):
    """Tag recent German-language statuses against the rule regexes.

    Pages through statuses (lang 'de', newer than 28 days, id >= the
    smallest saved per-regex offset) in PAGE_SIZE chunks.  Each page is
    its own transaction; offsets advance to the highest status id seen so
    a rerun can resume.  Finishes by deduplicating tags.
    """
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    # Select only the columns the matcher needs.
    fields = [
        status_tbl.c.id, status_tbl.c.text, user_tbl.c.id, user_tbl.c.name,
        user_tbl.c.screen_name
    ]
    q = sql.select(fields, from_obj=q, use_labels=True)
    dt = datetime.utcnow() - timedelta(days=28)
    # Resume from the minimum saved offset so no regex misses statuses.
    q = q.where(
        sql.and_(status_tbl.c.lang == 'de',
                 status_tbl.c.id >= min(offsets.values()),
                 status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())

    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            # NOTE(review): exits with the transaction from engine.begin()
            # still open — confirm the engine wrapper tolerates this.
            break
        # Persist per-regex progress before committing the page.
        for regex in regexen:
            offset_table.upsert({
                'regex': regex,
                'status_id': max_id
            }, ['regex'])
        engine.commit()
    dedup_tags()
Пример #9
0
def geocode_locations():
    """Geocode every distinct user location not yet in ``locations``."""
    query = """SELECT DISTINCT TRIM(LOWER(u.location)) AS loc FROM "user" u
        LEFT OUTER JOIN locations lx ON lx.location = TRIM(LOWER(u.location))
        WHERE u.location IS NOT NULL AND lx.location IS NULL;"""
    # Materialize the result set first so inserts performed by
    # geocode_location() cannot disturb an open cursor.
    rows = list(engine.query(query))
    for row in rows:
        geocode_location(row)
Пример #10
0
def geocode_locations():
    # Collect normalized (trimmed, lower-cased) user locations that have
    # no row in the locations table yet, then geocode each one.  The list()
    # snapshot keeps the query cursor separate from any writes done by
    # geocode_location().
    pending = list(engine.query(
        """SELECT DISTINCT TRIM(LOWER(u.location)) AS loc FROM "user" u
        LEFT OUTER JOIN locations lx ON lx.location = TRIM(LOWER(u.location))
        WHERE u.location IS NOT NULL AND lx.location IS NULL;"""))
    for entry in pending:
        geocode_location(entry)