def output_items_random(cf,path,query,random_no):
    """Write a random sample of the items matching ``query`` to a CSV file.

    ``query_items`` is called with a size limit of ``10 * random_no``; up to
    ``random_no`` ids are then sampled from those results, the matching items
    are loaded with ``general_storage.get_items_by_ids`` and written one row
    per item to ``<path>/<query>.csv``. No file is written when the query
    reports zero hits.

    Args:
        cf: Configuration object passed through to the query/storage helpers.
        path: Output directory; created if it does not already exist.
        query: Query string; also used as the base name of the CSV file.
        random_no: Maximum number of items to sample.
    """
    os.makedirs(path, exist_ok=True)
    total, items = query_items(cf, query, random_no * 10)
    if total <= 0:
        return

    # Bound the sample by the number of ids actually available (not by the
    # number of raw hits), and hand random.sample a real sequence.
    candidate_ids = list(es_outputs_to_ids(items))
    sample_size = min(random_no, len(candidate_ids))
    selected_items_ids = random.sample(candidate_ids, sample_size)

    # Join with the directory created above so the file always lands inside
    # it, whether or not `path` ends with a separator.
    csv_path = os.path.join(path, query + ".csv")
    with open(csv_path, 'w') as csvfile:
        for item in general_storage.get_items_by_ids(cf, selected_items_ids):
            original = item['original_data']
            # Tolerate tweets whose payload has no (or a null) 'user' entry.
            user = original.get('user') or {}
            utils.write_to_csv(csvfile, [
                item['post_id'],
                item['object_id'],
                original['created_at'],
                user.get('name'),
                item['message'],
            ])
def process_sqs_rerun(cf,queue_name,process,batch_size=100):
    """Read one message batch from an SQS queue and re-run ``process`` on it.

    The ids found in the message are resolved to stored items with
    ``general_storage.get_items_by_ids`` and handed to ``process_rerun``.
    The SQS message is deleted only when a receipt handler was returned and
    ``process_rerun`` reported a positive counter; otherwise it is left on
    the queue.

    Args:
        cf: Configuration object passed through to the storage/processing helpers.
        queue_name: Name of the SQS queue to read from.
        process: Processing callable forwarded to ``process_rerun``.
        batch_size: Batch size forwarded to ``process_rerun``.

    Returns:
        The ``(counter, error)`` pair from ``process_rerun``, or ``(0, 0)``
        when no message was read.
    """
    queue_url = sqs.get_url_by_name(queue_name)
    message, handler = sqs.read_message(queue_url)

    # Truthiness covers both an empty batch and a None result.
    if not message:
        print("No message was found")
        return 0, 0

    print("Processing sqs items")
    print(len(message))
    items = general_storage.get_items_by_ids(cf, [entry['id'] for entry in message])
    print(len(items))

    counter, error = process_rerun(cf, items, process, batch_size)
    print(counter, error)

    # Keep the message on the queue when nothing was processed.
    if handler and counter > 0:
        sqs.delete_message(queue_url, handler)
    return counter, error
if __name__ == "__main__":
    # Command-line entry point: copy items selected by a query from DynamoDB
    # into MySQL.
    #   --type own     : the configured account's own posts (optionally
    #                    narrowed by --query) plus all of their comments.
    #   --type general : every item matched by --query.
    parser = argparse.ArgumentParser(description='Normalizer for twitter between DynamoDB and mysql')
    parser.add_argument('config', type=str, help='an config file for normalizer')
    parser.add_argument('--query', type=str, default=None, help='query to get data for normalizer')
    parser.add_argument('--type', type=str, default="own", help='general or own. general:get everything using query; own:get own post and all replies')
    args = parser.parse_args()

    # "general" mode has nothing to select without a query; fail before
    # importing the config module.
    if args.type == "general" and not args.query:
        parser.error("--query is required when --type is general")

    # The positional argument is the name of an importable module that
    # exposes a Config class.
    config = __import__(args.config)
    cf = config.Config()

    if args.type == "own":
        query_str = args.query
        if query_str:
            query_str = query_str + " AND user_id:%s AND object_type:post" % (cf.twitter_user_id)
        else:
            query_str = "user_id:%s AND object_type:post" % (cf.twitter_user_id)
        total, posts = query.query_items(cf, query_str)
        if total > 0:
            for post_id in [x["id"] for x in posts]:
                post_with_comments = general_storage.get_item_and_comments(cf, post_id)
                insert_dynamodb_item_into_mysql(cf, post_with_comments["item"])
                for comment in post_with_comments["comments"]:
                    insert_dynamodb_item_into_mysql(cf, comment)
    elif args.type == "general":
        # Run the query first: the original code read `items` without ever
        # assigning it, so this branch always failed with NameError.
        total, items = query.query_items(cf, args.query)
        if total > 0:
            db_items = general_storage.get_items_by_ids(cf, query.es_outputs_to_ids(items))
            for db_item in db_items:
                insert_dynamodb_item_into_mysql(cf, db_item)