def classify_tweets(rules):
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    # join statuses to their authors, selecting only the fields the
    # matching rules actually inspect
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    fields = [status_tbl.c.id, status_tbl.c.text, user_tbl.c.id,
              user_tbl.c.name, user_tbl.c.screen_name]
    q = sql.select(fields, from_obj=q, use_labels=True)
    # only German-language tweets from the last four weeks, starting at
    # the smallest per-regex offset
    dt = datetime.utcnow() - timedelta(days=28)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values()),
                         status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())
    # page through the result set to keep memory use bounded; one
    # transaction spans the whole run
    engine.begin()
    offset = 0
    while True:
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            break
    # remember the highest status id seen for each regex
    for regex in regexen:
        offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
    engine.commit()
    dedup_tags()

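# `handle_status` is not defined in this file. A plausible sketch, factored
# out of the inline matching loop in the older classify_tweets() below; the
# actual helper may differ:
def handle_status(status, rules, offsets):
    for (field, rule), data in rules.items():
        # skip statuses this regex has already been applied to
        if offsets.get(data.get('regex'), 0) > status.get('status_id'):
            continue
        m = rule.search(unicode(status.get(field)).lower())
        if m is not None:
            data['status_id'] = status['status_id']
            tag_table.insert(data)
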
def classify_tweets():
    rules, regexen = get_rules()
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    engine.begin()
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values())))
    q = q.order_by(status_tbl.c.id.desc())
    for i, status in enumerate(engine.query(q)):
        max_id = max(max_id, status.get('status_id'))
        for (field, rule), data in rules.items():
            # skip statuses this regex has already been applied to
            if offsets.get(data.get('regex'), 0) > status.get('status_id'):
                continue
            m = rule.search(unicode(status.get(field)).lower())
            if m is not None:
                data['status_id'] = status['status_id']
                tag_table.insert(data)
        if i % 1000 == 0:
            print 'Processed: ', i
    # remember the highest status id seen for each regex
    for regex in regexen:
        offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
    engine.commit()
    dedup_tags()

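# `get_offsets` is assumed to map each regex to the highest status id it has
# already been applied to, defaulting to 0 for regexes that have never run.
# A minimal sketch against the offset_table used throughout this file:
def get_offsets(regexen):
    offsets = {}
    for regex in regexen:
        row = offset_table.find_one(regex=regex)
        offsets[regex] = row['status_id'] if row else 0
    return offsets
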
def delete_old_tags(rules):
    engine.begin()
    regexen = [d.get('regex') for (a, d) in rules.items()]
    # drop tags produced by rules that no longer exist
    for row in tag_table.distinct('regex'):
        if row.get('regex') not in regexen:
            tag_table.delete(regex=row.get('regex'))
    engine.commit()

def dump_batches():
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    engine.begin()
    for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
        if min_id is None:
            min_id = row['id']
        data.append(json.loads(row['json']))
        raw_table.delete(id=row['id'])
    log.info("Saving file...")
    # name the dump after the smallest id it contains
    with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
        json.dump(data, fh)
    engine.commit()
    return True

def dump_batches():
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    engine.begin()
    for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
        if min_id is None:
            min_id = row['id']
        # keep the raw JSON strings untouched, one tweet per line
        data.append(row['json'])
        raw_table.delete(id=row['id'])
    log.info("Saving file...")
    with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
        fh.write('\n'.join(data).encode('utf-8'))
    engine.commit()
    return True

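# dump_batches() returns True only after a full batch was written, so it can
# be driven in a simple loop until fewer than BATCH_SIZE rows remain; a
# usage sketch:
while dump_batches():
    pass
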
def classify_tweets(rules):
    delete_old_tags(rules)
    # let Postgres do the matching: insert a tag row for every status whose
    # text or author name matches the rule's regex, starting past the
    # stored offset for that regex
    q = text("""
        INSERT INTO tag (category, tag, status_id, classified_at, regex)
            SELECT :category, :tag, s.id, NOW(), :regex
            FROM status s
            LEFT JOIN tag_offset tgo ON tgo.regex = :regex
            LEFT JOIN "user" u ON s.user_id = u.id
            WHERE (s.id > tgo.status_id OR tgo.status_id IS NULL)
              AND (s.text ~* :regex OR u.name ~* :regex
                   OR u.screen_name ~* :regex)
              AND s.lang = 'de'
              AND s.created_at > NOW() - INTERVAL '28 days'
    """)
    # record the highest tagged status id as the new offset for the regex
    offsets_q = text("""
        INSERT INTO tag_offset (regex, status_id)
            SELECT :regex, t.status_id
            FROM tag t
            WHERE t.regex = :regex
            ORDER BY t.status_id DESC
            LIMIT 1
    """)
    for rule in rules.values():
        print rule
        engine.begin()
        engine.query(q, **rule)
        offset_table.delete(regex=rule['regex'])
        engine.query(offsets_q, regex=rule['regex'])
        engine.commit()
    dedup_tags()

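# The SQL version binds :category, :tag and :regex for each rule, so every
# value in `rules` must carry exactly those keys. An illustrative, made-up
# rule (elsewhere in this code the dict keys are (field, compiled regex)
# pairs, but only the values matter here):
import re

rules = {
    ('text', re.compile('cdu')): {
        'category': 'party', 'tag': 'cdu', 'regex': 'cdu'},
}
classify_tweets(rules)
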
def delete_old_tags(regexen):
    engine.begin()
    # drop tags produced by regexes that are no longer part of any rule
    for row in tag_table.distinct('regex'):
        if row.get('regex') not in regexen:
            tag_table.delete(regex=row.get('regex'))
    engine.commit()

def parse(filename, file_content):
    # neutralize the default namespace so plain XPath lookups work
    xmldata = file_content.replace('xmlns="', 'xmlns_="')
    root = etree.fromstring(xmldata)
    form = root.find('.//FORM_SECTION')
    form.getparent().remove(form)
    ext = Extractor(root)
    cpvs = [{'code': e.get('CODE'), 'text': e.text}
            for e in root.findall('.//NOTICE_DATA/ORIGINAL_CPV')]
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/ORIGINAL_CPV')
    refs = [e.text for e in
            root.findall('.//NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')]
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')
    data = {
        'technical_reception_id': ext.text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'technical_comments': ext.text('./TECHNICAL_SECTION/COMMENTS'),
        'technical_deletion_date': ext.text('./TECHNICAL_SECTION/DELETION_DATE'),
        'technical_form_lang': ext.text('./TECHNICAL_SECTION/FORM_LG_LIST'),
        'reception_id': ext.text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'oj_collection': ext.text('.//REF_OJS/COLL_OJ'),
        'oj_number': ext.text('.//REF_OJS/NO_OJ'),
        'oj_date': ext.text('.//REF_OJS/DATE_PUB'),
        'doc_no': ext.text('.//NOTICE_DATA/NO_DOC_OJS'),
        'doc_url': ext.text('.//NOTICE_DATA//URI_DOC[@LG="EN"]')
            or ext.text('.//NOTICE_DATA//URI_DOC'),
        'info_url': ext.text('.//NOTICE_DATA/IA_URL_GENERAL'),
        'etendering_url': ext.text('.//NOTICE_DATA/IA_URL_ETENDERING'),
        'orig_language': ext.text('.//NOTICE_DATA/LG_ORIG'),
        'orig_nuts': ext.text('.//NOTICE_DATA/ORIGINAL_NUTS'),
        'orig_nuts_code': ext.attr('.//NOTICE_DATA/ORIGINAL_NUTS', 'CODE'),
        'iso_country': ext.attr('.//NOTICE_DATA/ISO_COUNTRY', 'VALUE'),
        'original_cpv': cpvs,
        'references': refs,
        'dispatch_date': ext.text('.//CODIF_DATA/DS_DATE_DISPATCH'),
        'request_document_date': ext.text('.//CODIF_DATA/DD_DATE_REQUEST_DOCUMENT'),
        'submission_date': ext.text('.//CODIF_DATA/DT_DATE_FOR_SUBMISSION'),
        'heading': ext.text('.//CODIF_DATA/HEADING'),
        'directive': ext.attr('.//CODIF_DATA/DIRECTIVE', 'VALUE'),
        'authority_type_code': ext.attr('.//CODIF_DATA/AA_AUTHORITY_TYPE', 'CODE'),
        'authority_type': ext.text('.//CODIF_DATA/AA_AUTHORITY_TYPE'),
        'document_type_code': ext.attr('.//CODIF_DATA/TD_DOCUMENT_TYPE', 'CODE'),
        'document_type': ext.text('.//CODIF_DATA/TD_DOCUMENT_TYPE'),
        'contract_nature_code': ext.attr('.//CODIF_DATA/NC_CONTRACT_NATURE', 'CODE'),
        'contract_nature': ext.text('.//CODIF_DATA/NC_CONTRACT_NATURE'),
        'procedure_code': ext.attr('.//CODIF_DATA/PR_PROC', 'CODE'),
        'procedure': ext.text('.//CODIF_DATA/PR_PROC'),
        'regulation_code': ext.attr('.//CODIF_DATA/RP_REGULATION', 'CODE'),
        'regulation': ext.text('.//CODIF_DATA/RP_REGULATION'),
        'bid_type_code': ext.attr('.//CODIF_DATA/TY_TYPE_BID', 'CODE'),
        'bid_type': ext.text('.//CODIF_DATA/TY_TYPE_BID'),
        'award_criteria_code': ext.attr('.//CODIF_DATA/AC_AWARD_CRIT', 'CODE'),
        'award_criteria': ext.text('.//CODIF_DATA/AC_AWARD_CRIT'),
        'main_activities_code': ext.attr('.//CODIF_DATA/MA_MAIN_ACTIVITIES', 'CODE'),
        'main_activities': ext.text('.//CODIF_DATA/MA_MAIN_ACTIVITIES'),
        'title_text': ext.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TEXT'),
        'title_town': ext.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TOWN'),
        'title_country': ext.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_CY'),
        'authority_name': ext.text('./TRANSLATION_SECTION/ML_AA_NAMES/AA_NAME')
    }
    ext.ignore('./LINKS_SECTION/FORMS_LABELS_LINK')
    ext.ignore('./LINKS_SECTION/OFFICIAL_FORMS_LINK')
    ext.ignore('./LINKS_SECTION/ORIGINAL_NUTS_LINK')
    ext.ignore('./LINKS_SECTION/ORIGINAL_CPV_LINK')
    ext.ignore('./LINKS_SECTION/XML_SCHEMA_DEFINITION_LINK')
    # TODO: Figure out if we need any of this, even with the forms.
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/SINGLE_VALUE/VALUE')
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST')
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/RANGE_VALUE/VALUE')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/TOWN')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/POSTAL_CODE')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/PHONE')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ORGANISATION/OFFICIALNAME')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/FAX')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/COUNTRY')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/CONTACT_POINT')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ATTENTION')
    ext.ignore('./TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ADDRESS')
    ext.audit()
    form_ = select_form(form, data['orig_language'])
    contracts = []
    if form_.tag.startswith('CONTRACT_AWARD_'):
        from forms.contract_award import parse_form
        contracts = parse_form(form_)
    # save to DB: replace any previous import of this document
    doc_no = data['doc_no']
    engine.begin()
    cpvs_table.delete(doc_no=doc_no)
    references_table.delete(doc_no=doc_no)
    contracts_table.delete(doc_no=doc_no)
    documents_table.delete(doc_no=doc_no)
    for cpv in data.pop('original_cpv'):
        cpv['doc_no'] = doc_no
        cpvs_table.insert(cpv)
    for ref in data.pop('references'):
        references_table.insert({'doc_no': doc_no, 'ref': ref})
    for contract in contracts:
        contract['doc_no'] = doc_no
        contracts_table.insert(contract)
    documents_table.insert(data)
    engine.commit()

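# `Extractor` is defined elsewhere. Judging from its use above, it wraps an
# lxml element, records which paths have been consumed via text()/attr()/
# ignore(), and audit() then flags anything left over. A rough sketch under
# that assumption; the real class may differ:
class Extractor(object):

    def __init__(self, root):
        self.root = root
        self.seen = set()

    def _mark(self, path):
        for el in self.root.findall(path):
            self.seen.add(el)

    def text(self, path):
        self._mark(path)
        el = self.root.find(path)
        return el.text if el is not None else None

    def attr(self, path, name):
        self._mark(path)
        el = self.root.find(path)
        return el.get(name) if el is not None else None

    def ignore(self, path):
        self._mark(path)

    def audit(self):
        # report elements that were neither extracted nor ignored
        for el in self.root.iter():
            if el not in self.seen and el.text and el.text.strip():
                print 'Unhandled element:', el.tag
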