def main():
    """
    Utility to cache messages from all queues from the --hostname provided
    with 'cache: true' option set in embers.conf
    --hostname : Cache all active queues on this host
    --log_file : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str,
                            default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'

    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []
    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
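
# Example invocation (script name is hypothetical; flags are the ones defined above):
#   python cache_all.py --hostname data-node-01 --log_file cache.log --log_level INFO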
def main():
    """
    Utility to query warnings stored in Elasticsearch
    --log_file : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg = arg_parser.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    print(query(max_results=30))
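
# Example invocation (script name is hypothetical): prints the 30 most recent
# warnings, as returned by query(max_results=30) above.
#   python warning_query.py --log_file warnings.log --log_level INFO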
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str,
                    help='Path to the input file. Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str,
                    help='Path to the output file. Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str,
                    help='Path to the phrase file if the "-f" flag is specified; '
                         'otherwise the input string itself is treated as the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False,
                    help='If given, the searchPhrase argument is interpreted as a path to a file')

    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)  # was logs.init(args): passed the module instead of the parsed arguments

    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)

    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)

    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])

    phraseSearch(reader, writer)

    # close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
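
# Example invocations (script name is hypothetical; flags as defined above):
#   echo "some text to scan" | python phrase_search.py "target phrase"
#   python phrase_search.py -f config/phrases.txt -i input.txt -o matches.txt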
def main():
    """
    Utility to set up a mapping for an EMBERS queue in Elasticsearch
    -q | --queue : Queue name to set up the mapping for. Settings are read from embers.conf
    --log_file : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch')
    arg = arg_parser.parse_args()

    assert arg.queue, '--queue must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    add_type(index_name=general.get_index_name(), type_name=arg.queue)
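
# Example invocation (script name and queue name are hypothetical):
#   python index_setup.py -q embers-warnings --log_file index_setup.log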
def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
             priority_policy=PRIORITY_POLICY, debug=False):
    """Load country/admin1/city gazetteer data and build lookup structures."""
    self.priority_policy = priority_policy
    self.debug = debug
    self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
        self.__class__.__name__, __version__,
        hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
        hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
        hashlib.md5(" ".join(self.priority_policy)).hexdigest())

    if self.debug:
        try:
            logs.init()
        except IOError:
            logs.init(logfile=self.__class__.__name__.lower())
        self.log = logs.getLogger("{0}-{1}".format(
            self.__class__.__name__, __version__.replace('.', '_')))

    # 1. load country and admin1 level geo data
    f = get_co_admin_data(co_admin_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)

    # NOTE: known conflicts between codes of countries and other admins
    #   co  Colombia     ('Colombia', 'C\xc3\xb3rdoba')
    #   cl  Chile        ('Colombia', 'Caldas')
    #   ar  Argentina    ('Colombia', 'Arauca')
    #   sv  El Salvador  ('El Salvador', 'San Vicente')

    # prep lookup dictionaries (key -> value)
    # countries
    self.co_code = {}
    self.co_names = {}
    self.co_aliases = {}
    self.co_capital_cities = {}
    # admin1
    self.admin_code = {}
    self.admin_name = {}

    # assumes countries appear first when reading data from lac_co_admin
    # TODO BAD!
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        lat = float_or_none(r['latitude'])
        lon = float_or_none(r['longitude'])
        code = object_or_none(r['iso_3166_code'])
        rid = int_or_none(r["id"])
        if r['type'] == 'country':
            if code:
                self.co_code[code] = r['name']
                self.co_names[nstr(r['name'])] = (rid, lat, lon, code, r['name'])
                self.co_capital_cities[nstr(r['capital_city'])] = \
                    (r['capital_city'], r['name'])
                aliases = r['alt_names'].split(',')
                self.co_aliases.update({nstr(alias.strip()): r['name']
                                        for alias in aliases})
            else:
                if self.debug:
                    self.log.error("Bad data country {0} Code {1}".format(
                        r['name'], code))
        elif r['type'] == 'admin':
            admin, co = r['full_name'].split(',')
            admin, co = admin.strip(), co.strip()
            if code:
                if code not in self.admin_code:
                    self.admin_code[code] = []
                self.admin_code[code].append((co, admin))
            co1, a = nstr(co), nstr(admin)
            if a not in self.admin_name:
                self.admin_name[a] = {}
            if co1 not in self.admin_name[a]:
                self.admin_name[a][co1] = (rid, lat, lon, code, admin, co)
    f.close()
    # 2. load (world-gazetteer) city level geo data
    f = get_wg_data(wg_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)

    self.ci_aliases = {}
    # main data store for geocoding
    self.data = []
    counter = 0
    ci_set = set()
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        # get alias names for cities
        ci_names = [a.strip() for a in r['alt_names'].split(',')
                    if len(a.strip()) > 0]
        ci_names.extend([a.strip() for a in r['orig_names'].split(',')
                         if len(a.strip()) > 0])
        for ci in ci_names:
            k = (nstr(ci), nstr(r['country']))
            a1 = nstr(r['admin1'])
            if k not in self.ci_aliases:
                self.ci_aliases[k] = {a1: set([r['name']])}
            elif a1 not in self.ci_aliases[k]:
                self.ci_aliases[k][a1] = set([r['name']])
            else:
                # cases where different cities for the same
                # admin-country pair have the same alias
                self.ci_aliases[k][a1].add(r['name'])
            # add city name aliases into ci_set
            ci_set.add(nstr(ci))
        # store only canonical city names
        self.data.append((counter,
                          (r['name'], r['country'], r['admin1'],
                           object_or_none(r['admin2']),
                           object_or_none(r['admin3']),
                           int_or_none(r['pop']),
                           float_or_none(r['latitude']) / 100,
                           float_or_none(r['longitude']) / 100,
                           int(r['id']), int(r['padded']))))
        counter += 1

    self.coordinates = {}
    # cases where admin1 and city share the same name:
    # extended feature/hack #1 to resolve city when
    # only country and admin1 are specified
    self.same_ci_a1_name = {}
    for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
        nn, nc, na1 = nstr(n), nstr(c), nstr(a1)
        self.coordinates[(lat, lon)] = i
        if nn == na1 and pad == 0:
            self.same_ci_a1_name[(nc, na1)] = n
        ci_set.add(nn)

    # store (lat, lon)
    self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                          if i is not None and j is not None])

    # build regular expression dicts
    co_set = set(self.co_names.keys())
    # add country name aliases into co_set
    co_set.update(self.co_aliases.keys())
    self.co_reg = ManyRE(co_set)
    self.ci_reg = ManyRE(ci_set)

    # add admin1 name aliases into admin1_set
    admin1_set = set(self.admin_name.keys())
    # build regular expression stores for co-admin1-ci
    self.admin1_reg = ManyRE(admin1_set)

    # add stopwords to prevent any 2-letter word in common usage
    # from being misinterpreted as a country or admin code
    two_letter_stop_words = set(
        ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
         'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
         'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
         'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
         'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])
    self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                               if sw not in two_letter_stop_words])
    self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
    self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                    if sw not in two_letter_stop_words])

    self.bguess = {}
    for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad) in self.data:
        ci, co, a = nstr(city), nstr(country), nstr(admin1)
        # value is a list of admin1's that correspond to the ci-co key;
        # ci-co makes the dictionary flatter. We choose not to use
        # co-admin1-ci as the key, to allow more flexibility for lookups.
        if ci in self.bguess:
            if co in self.bguess[ci]:
                if a in self.bguess[ci][co]:
                    # store original wg-records (marked with pad == 0)
                    # at the head of the queue
                    if pad == 0:
                        self.bguess[ci][co][a].appendleft(i)
                    else:
                        self.bguess[ci][co][a].append(i)
                else:
                    self.bguess[ci][co][a] = deque([i])
            else:
                self.bguess[ci][co] = {a: deque([i])}
        else:
            self.bguess[ci] = {co: {a: deque([i])}}
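
# Usage sketch (assumes this __init__ belongs to the embers.geocode.Geo class
# that the stream scripts below import; the data-file defaults ship with the package):
#   geo = Geo(debug=False)
#   print(geo.__version__)  # class name, version, and md5 digests of the data files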
#!/usr/bin/env python

import sys
import json

from etool import args, logs, queue
from embers.geocode import Geo, GEO_REGION, PRIORITY_POLICY as LA_POLICY
from embers.geocode_mena import GeoMena, PRIORITY_POLICY as MENA_POLICY
from embers.utils import normalize_str

__processor__ = 'geo_code_stream.py'
log = logs.getLogger('%s.log' % (__processor__))

LOC_HEADERS = (u"geocode_version", u"city", u"country", u"admin1", u"admin2",
               u"admin3", u"pop", u"latitude", u"longitude", u"id", u"pad",
               u"source")


def decode(s, encoding='utf-8'):
    try:
        return s.decode(encoding=encoding)
    except Exception:
        return s


def get_geoInfo(tweet, geo):
    geotuple = [decode(geo.__version__)] + [decode(l) for l in geo.geo_normalize(tweet)]
    return dict(zip(LOC_HEADERS, geotuple))


def isempty(s):
    """return True if string is empty"""
    # assumed implementation: treat None and whitespace-only strings as empty
    return s is None or len(s.strip()) == 0
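
# Usage sketch (the tweet dict shape is an assumption; Geo and LA_POLICY are
# imported above, and geo_normalize() is the method get_geoInfo() relies on):
#   geo = Geo(priority_policy=LA_POLICY)
#   info = get_geoInfo({'text': u'Marcha en Bogot\xe1'}, geo)
#   print(info.get(u'country'), info.get(u'city'))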
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue : Read from <queue> and write the messages to Elasticsearch.
                   Settings are read from embers.conf
    --log_file : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue',
                            help='Queue name to index into Elasticsearch')
    arg_parser.add_argument('-s', '--s3fromq', action='store_true',
                            help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    # arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument('-l', '--tmpcopy', default='/home/embers/data/tmpcopy',
                            help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c', '--chunk', type=int, default=100,
                            help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i', '--clustername',
                            help='Clustername to determine index name')
    arg_parser.add_argument('-w', '--withbase', action="store_true",
                            help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate', help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate', help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    # assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (arg.queue or arg.prefix), \
        'Either --queue (with optional --s3fromq/--typename) or --prefix must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    index_name = general.get_index_name(arg.clustername)
    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(prefix=prefix,
                                                      includeS3=True,
                                                      withBasename=arg.withbase)
            if not type_name:
                log.error("Could not get type from prefix %s" % prefix)
                return 1
            log.warning("type_name=%s from prefix=%s" % (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # create the mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key,
                                  aws_secret_access_key=arg.aws_secret)
        # connect to S3, get bucket pointer for arg.bucket
        bucket = conn_s3.get_bucket(arg.bucket)

        attach_to_s3(index_name, s3prefix=prefix, bucket=bucket,
                     type_name=type_name, tmpcopy=arg.tmpcopy,
                     chunk_size=arg.chunk, startdate=arg.startdate,
                     enddate=arg.enddate)
    else:
        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # create the mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name, queue_name=arg.queue, type_name=type_name)
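
# Example invocations (script name, queue name, and prefix are hypothetical):
#   python es_ingest.py -q my-queue                # index messages arriving on a queue
#   python es_ingest.py -q my-queue -s             # ingest from the S3 prefix derived from the queue
#   python es_ingest.py -p incoming/feed/ -t myfeed --startdate 2015-01-02 --enddate 2015-01-09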
#!/usr/bin/env python

import os
import sys
import codecs
import json

from etool import args, logs, queue, iqueue
from geo2.country import GeoCountry

__processor__ = 'geo_code_stream.py'
log = logs.getLogger("geo_code_stream")  # '%s.log' % (__processor__)


def annotate(msg, geoc, filter_region=None):
    """
    Annotate message with geocountry info
    Params:
    msg - dict object
    geoc - GeoCountry object
    filter_region - region to be filtered
    """
    content = geoc.annotate(msg)
    content_region = content.get("embersGeoCodeCountry", {}).get("region", None)
    if content_region is None:
        return None
    if filter_region is not None and filter_region != content_region:
        return None
    return content
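
# Minimal usage sketch (the message shape and region value are assumptions;
# GeoCountry is imported above from geo2.country):
#   geoc = GeoCountry()
#   annotated = annotate({'text': 'Protest reported in Cairo'}, geoc, filter_region='MENA')
#   if annotated is not None:
#       print(annotated['embersGeoCodeCountry'])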
#!/usr/bin/python
# -*- coding: utf-8 -*-

__author__ = "Wei Wang"
__email__ = "*****@*****.**"

from etool import logs, queue, args
import json
from datetime import datetime

__processor__ = 'listen_warning'
log = logs.getLogger(__processor__)

SENT_WARNINGS = []


def check_ifexist(warning):
    eventDate = warning["eventDate"]
    eventType = warning["eventType"]
    population = warning["population"]
    if [eventDate, eventType, population] in SENT_WARNINGS:
        return True
    else:
        SENT_WARNINGS.append([eventDate, eventType, population])
        return False


def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()
__author__ = 'mogren'
"""
General Elasticsearch queries for Twitter feeds
"""

import sys
import general as es
import re
from etool import logs

log = logs.getLogger(__name__)

twitter_date_field = 'date'
twitter_text_field = 'text'


def get_tweets(keywords=None, geo_box=None, start_time=None, end_time=None,
               max_results=10):
    """
    Retrieves all tweets containing the keywords provided in the provided
    time frame. If no parameters are provided, this function will return the
    10 most recent tweets in the index.

    If end_time is not provided and start_time is, end_time will be the
    present time.

    :param keywords: str, list of strings, or dict - {'<field_name>': '<value>'}
                     or {'<field_name>': [<list_of_values>]}
    :param geo_box: dict {'lat': {'min': <value>, 'max': <value>},
                          'lng': {'min': <value>, 'max': <value>}}
    :param start_time: ISO formatted date string
    :param end_time: ISO formatted date string
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

__author__ = 'Michael Shuffett'
__email__ = '*****@*****.**'

# from datetime import datetime
# from datetime import timedelta
import urllib2
import json
from etool import args, logs, queue, message
import os

log = logs.getLogger('wikipedia_recent_changes')


class API(object):
    """Wikipedia API class"""

    def __init__(self, localization="es"):
        self._url = "http://%s.wikipedia.org/w/api.php" % localization
        self._max_ids = 50

    def get_recent_changes(self, namespace=0):
        """Queries Wikipedia site for the latest changes.

        :param integer namespace: the namespace to restrict to, defaults to 0.
        :returns: a list of changes sorted in reverse chronological order.
        :rtype: list
        """
        url = "%s?action=query&list=recentchanges&format=json&rclimit=max&rcnamespace=%d" % (self._url, namespace)
def testLog():
    __processor__ = 'TestLog'
    log = logs.getLogger(__processor__)
    logs.init()
    log.info("Error: %s" % "I'm Here")
__author__ = 'mogren'
"""
General caching service
"""

from etool import conf
from os import environ
import sys
from etool import logs
import multiprocessing
from etool.cache.elastic.cache import cache_queue

log = logs.getLogger(__name__)


def main():
    """
    Utility to cache messages from all queues from the --hostname provided
    with 'cache: true' option set in embers.conf
    --hostname : Cache all active queues on this host
    --log_file : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str,
                            default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#

from etool import args, logs, queue
import os
import codecs
import time
import json

log = logs.getLogger('test_publisher')

"""
test_publisher.py

Arguments (required):
    --pub        the feed to publish to
    --json_file  the JSON file to read messages from

Arguments (optional):
    --ssh_key    the private key to use to tunnel to EMBERS
    --tunnel     the host to tunnel to

test_publisher.py will:
    - Continuously read from a file
    - Publish each JSON message to the specified queue
    - Once it reaches EOF, start again
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#

from etool import args, logs, queue, message
import os
import codecs
import json
import subprocess
import socket
from time import sleep

log = logs.getLogger('psl_harness')

"""
psl_harness.py

Arguments (required):
    --sub         the feed to subscribe to
    --pub         the feed to publish to
    --local_port  local port to forward and receive messages

Arguments (optional):
    --ssh_key     the private key to use to tunnel to EMBERS
    --tunnel      the host to tunnel to

psl_harness.py will:
    - Continuously read from a queue
import sqlite3 as lite
from Util import common
import json
from datetime import datetime
import hashlib
from etool import logs
import sys
import argparse

# import historical raw data into the database
con = None
cur = None
__processor__ = "ImportArchivedNews"
log = logs.getLogger(__processor__)


def init():
    global con
    global cur
    con = common.getDBConnection()
    cur = con.cursor()
    logs.init()


def insert_news(article):
    try:
        global con
        global cur
        sql = "insert into t_daily_news(embers_id,title,author,post_time,post_date,content,stock_index,source,update_time,url) values (?,?,?,?,?,?,?,?,?,?)"
        embersId = article["embersId"]
        title = article["title"]
        author = article["author"]