Example #1
def process_constituents_related_objects(CURSOR):
    def get_indices():
        indices = {
            "constituent_id_index": columns.index("ConstituentID"),
            "type_id_index": columns.index("ConstituentTypeID"),
            "object_id_index": columns.index("ObjectID"),
            "object_title_index": columns.index("Title"),
            "object_number_index": columns.index("ObjectNumber"),
            "classification_id_index": columns.index("ClassificationID"),
            "object_date_index": columns.index("ObjectDate"),
            "thumb_path_index": columns.index("ThumbPathName"),
            "thumb_file_index": columns.index("ThumbFileName"),
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related objects
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)

        if "relateditems" not in constituent:
            constituent["relateditems"] = {}
        classification_key = int(row[indices["classification_id_index"]])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices["object_id_index"]])
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])

        date = "" if row[indices["object_date_index"]].lower() == "null" else row[indices["object_date_index"]]
        object_title = row[indices["object_title_index"]]
        object_number = row[indices["object_number_index"]]
        if classification == "diarypages" and object_title.lower() == "null":
            idx = object_number.find("_")
            object_title = object_number[idx + 1 :]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in constituent["relateditems"]:
            constituent["relateditems"][classification] = []
        constituent["relateditems"][classification].append(
            {
                "id": object_id,
                "title": object_title,
                "displaytext": object_title,
                "classificationid": classification_key,
                "number": object_number,
                "date": date,
                "thumbnail": thumbnail_url,
            }
        )
        # keep the related items sorted
        constituent["relateditems"][classification].sort(key=operator.itemgetter("displaytext"))

        return (constituent, current_id)

    print "Starting Constituents Related Objects..."
    if CURSOR:
        sql_command = constituents_sql.RELATED_OBJECTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
            # save last object to elasticsearch
        save(constituent)
    else:
        with open("../data/constituents_objects_related.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
                # save last object to elasticsearch
            save(constituent)
    print "Finished Constituents Related Objects..."
Example #2
File: objects.py Project: rsinghal/giza
def process_object_related_unpublished(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'unpublished_id_index' : columns.index('UnpublishedID'),
			'unpublished_title_index' : columns.index('UnpublishedTitle'),
			'classification_id_index' : columns.index('ClassificationID'),
			'object_date_index' : columns.index('ObjectDate'),
			'object_number_index' : columns.index('ObjectNumber'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				print "%s could not be found!" % id
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		unpublished_id = row[indices['unpublished_id_index']]
		unpublished_title = row[indices['unpublished_title_index']]
		number = row[indices['object_number_index']]
		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		if 'unpubdocs' not in object['relateditems']:
			object['relateditems']['unpubdocs'] = []
		object['relateditems']['unpubdocs'].append({
			'id' : unpublished_id,
			'text' : unpublished_title,
			'displaytext' : unpublished_title,
			'date' : date,
			'number' : number,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		object['relateditems']['unpubdocs'].sort(key=operator.itemgetter('displaytext'))

		return(object, current_id)

	print "Starting Objects Related Unpublished..."
	if CURSOR:
		sql_command = objects_sql.RELATED_UNPUBLISHED
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_unpublished_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects Related Unpublished..."
Example #3
def process_constituents_related_objects(CURSOR):
    def get_indices():
        indices = {
            'constituent_id_index': columns.index('ConstituentID'),
            'type_id_index': columns.index('ConstituentTypeID'),
            'object_id_index': columns.index('ObjectID'),
            'object_title_index': columns.index('Title'),
            'object_number_index': columns.index('ObjectNumber'),
            'classification_id_index': columns.index('ClassificationID'),
            'object_date_index': columns.index('ObjectDate'),
            'thumb_path_index': columns.index('ThumbPathName'),
            'thumb_file_index': columns.index('ThumbFileName')
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices['constituent_id_index']]
        type_key = int(row[indices['type_id_index']])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related objects
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(
                    constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)

        if 'relateditems' not in constituent:
            constituent['relateditems'] = {}
        classification_key = int(row[indices['classification_id_index']])
        classification = CLASSIFICATIONS.get(classification_key)
        object_id = int(row[indices['object_id_index']])
        thumbnail_url = get_media_url(row[indices['thumb_path_index']],
                                      row[indices['thumb_file_index']])

        date = "" if row[indices['object_date_index']].lower(
        ) == "null" else row[indices['object_date_index']]
        object_title = row[indices['object_title_index']]
        object_number = row[indices['object_number_index']]
        if classification == "diarypages" and object_title.lower() == "null":
            idx = object_number.find('_')
            object_title = object_number[idx + 1:]
        if object_title.lower() == "null":
            object_title = "[No Title]"

        if classification not in constituent['relateditems']:
            constituent['relateditems'][classification] = []
        constituent['relateditems'][classification].append({
            'id': object_id,
            'title': object_title,
            'displaytext': object_title,
            'classificationid': classification_key,
            'number': object_number,
            'date': date,
            'thumbnail': thumbnail_url
        })
        # keep the related items sorted
        constituent['relateditems'][classification].sort(
            key=operator.itemgetter('displaytext'))

        return (constituent, current_id)

    print "Starting Constituents Related Objects..."
    if CURSOR:
        sql_command = constituents_sql.RELATED_OBJECTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent,
             current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(constituent)
    else:
        with open('../data/constituents_objects_related.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            constituent = {}
            current_id = '-1'
            for row in rows:
                (constituent,
                 current_id) = process_constituent_row(constituent, current_id)
            # save last object to elasticsearch
            save(constituent)
    print "Finished Constituents Related Objects..."
Example #4
File: objects.py Project: rsinghal/giza
def process_object_altnums(CURSOR):
	def get_indices():
		indices = {
			'object_id_index' : columns.index('ObjectID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'altnum_index' : columns.index('AltNum'),
			'description_index' : columns.index('Description')
		}
		return indices

	def process_object_row(object, current_id):
		object_id = row[indices['object_id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# will likely have multiple rows
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				print "%s could not be found!" % object_id
				return (object, current_id)

		if 'altnums' not in object:
			object['altnums'] = []
		altnum = row[indices['altnum_index']]
		prefix_idx = altnum.find('_')
		without_prefix = altnum[prefix_idx+1:]
		description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else ""
		object['altnums'].append({"altnum" : altnum, "description" : description, 'without_prefix': without_prefix})
		object['allnumbers'].extend((altnum, without_prefix))
		return (object, current_id)

	print "Starting Objects AltNums..."
	if CURSOR:
		sql_command = objects_sql.ALTNUMS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_altnums.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)
	print "Finished Objects AltNums..."
Example #5
File: objects.py Project: rsinghal/giza
def process_object_related_sites(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'site_id_index' : columns.index('SiteID'),
			'site_name_index' : columns.index('SiteName'),
			'site_number_index' : columns.index('SiteNumber'),
			'classification_id_index' : columns.index('ClassificationID'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				print "%s could not be found!" % id
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		site_id = row[indices['site_id_index']]
		site_name = row[indices['site_name_index']]
		site_number = row[indices['site_number_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		site_dict = {}
		site_dict['id'] = site_id
		site_dict['sitename'] = site_name
		site_dict['sitenumber'] = site_number
		site_dict['displaytext'] = site_number
		site_dict['thumbnail'] = thumbnail_url

		if 'sites' not in object['relateditems']:
			object['relateditems']['sites'] = []
		object['relateditems']['sites'].append(site_dict)
		# keep the related items sorted
		object['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

		# for unpubdocs, add sites for "Mentioned on this page"
		if classification == "unpubdocs":
			if 'mentioned' not in object:
				object['mentioned'] = {}
			if 'sites' not in object['mentioned']:
				object['mentioned']['sites'] = []
			object['mentioned']['sites'].append(site_dict)

		return(object, current_id)

	print "Starting Objects Related Sites..."
	if CURSOR:
		sql_command = objects_sql.RELATED_SITES
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_sites_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects Related Sites..."
Example #6
def process_constituents_related_media(CURSOR):
    def get_indices():
        indices = {
            "constituent_id_index": columns.index("ID"),
            "type_id_index": columns.index("ConstituentTypeID"),
            "media_master_id_index": columns.index("MediaMasterID"),
            "primary_display_index": columns.index("PrimaryDisplay"),
            "media_type_id_index": columns.index("MediaTypeID"),
            "description_index": columns.index("Description"),
            "caption_index": columns.index("PublicCaption"),
            "thumb_path_index": columns.index("ThumbPathName"),
            "thumb_file_index": columns.index("ThumbFileName"),
            "main_path_index": columns.index("MainPathName"),
            "main_file_index": columns.index("MainFileName"),
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related photos
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        media_type_key = int(row[indices["media_type_id_index"]])
        media_type = MEDIATYPES.get(media_type_key)
        media_master_id = row[indices["media_master_id_index"]]
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])
        main_url = get_media_url(row[indices["main_path_index"]], row[indices["main_file_index"]])
        display_text = row[indices["caption_index"]]

        # this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
        # so, make sure we are only using videos that are mp4s
        if media_type_key == 3:
            if not row[indices["main_file_index"]].endswith("mp4"):
                return (constituent, current_id)

        if media_type not in constituent["relateditems"]:
            constituent["relateditems"][media_type] = []
            # add primary photo as a top level item as well
        if row[indices["primary_display_index"]] == "1":
            constituent["primarydisplay"] = {"thumbnail": thumbnail_url, "main": main_url, "displaytext": display_text}
        constituent["relateditems"][media_type].append(
            {
                "id": media_master_id,
                "displaytext": display_text,
                "primarydisplay": True if row[indices["primary_display_index"]] == "1" else False,
                "thumbnail": thumbnail_url,
                "main": main_url,
            }
        )
        return (constituent, current_id)

    print "Starting Constituents Related Media..."
    if CURSOR:
        sql_command = constituents_sql.RELATED_MEDIA
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
            # save last object to elasticsearch
        save(constituent)
    else:
        with open("../data/constituents_media_related.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
                # save last object to elasticsearch
            save(constituent)

    print "Finished Constituents Related Media..."
Example #7
File: objects.py Project: rsinghal/giza
def process_object_geocodes(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'geo_code_id_index' : columns.index('GeoCodeID'),
			'geo_code_index' : columns.index('GeoCode'),
			'region_index' : columns.index('Region'),
			'city_index' : columns.index('City'),
			'classification_id_index' : columns.index('ClassificationID')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				print "%s could not be found!" % id
				return(object, current_id)

		geocode_dict = {}
		geocode_dict['id'] = row[indices['geo_code_id_index']]
		geocode_dict['geocode'] = row[indices['geo_code_index']]
		geocode_dict['region'] = row[indices['region_index']]
		geocode_dict['city'] = row[indices['city_index']]
		object['geocode'] = geocode_dict

		return(object, current_id)

	print "Starting Objects Geocodes..."
	if CURSOR:
		sql_command = objects_sql.GEOCODES
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_geocodes.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects Geocodes..."
Example #8
File: sites.py Project: rsinghal/giza
def process_site_related_media(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'media_master_id_index' : columns.index('MediaMasterID'),
			'primary_display_index' : columns.index('PrimaryDisplay'),
			'media_type_id_index' : columns.index('MediaTypeID'),
			'description_index' : columns.index('Description'),
			'caption_index' : columns.index('PublicCaption'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName'),
			'main_path_index' : columns.index('MainPathName'),
			'main_file_index' : columns.index('MainFileName')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related photos
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		media_type_key = int(row[indices['media_type_id_index']])
		media_type = MEDIATYPES.get(media_type_key)
		media_master_id = row[indices['media_master_id_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])
		display_text = row[indices['caption_index']]

		# this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
		# so, make sure we are only using videos that are mp4s
		if media_type_key == 3:
			if not row[indices['main_file_index']].endswith('mp4'):
				return(site, current_id)

		if media_type not in site['relateditems']:
			site['relateditems'][media_type] = []
		# add primary photo as a top level item as well
		if row[indices['primary_display_index']] == '1':
			site['primarydisplay'] = {
			'thumbnail' : thumbnail_url,
			'main' : main_url,
			'displaytext' : display_text
			}
		site['relateditems'][media_type].append({
			'id' : media_master_id,
			'displaytext' : display_text,
			'primarydisplay' : True if row[indices['primary_display_index']] == '1' else False,
			'thumbnail' : thumbnail_url,
			'main' : main_url
			})
		return(site, current_id)

	print "Starting Sites Related Media..."
	if CURSOR:
		sql_command = sites_sql.RELATED_MEDIA
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_media_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)

	print "Finished Sites Related Media..."
Example #9
File: sites.py Project: rsinghal/giza
def process_site_dates(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'event_type_index' : columns.index('EventType'),
			'date_text_index' : columns.index('DateText')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]

		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return (site, current_id)

		if 'sitedates' not in site:
			site['sitedates'] = []

		event_type = row[indices['event_type_index']]
		date_text = row[indices['date_text_index']]

		site['sitedates'].append({
			'type' : event_type,
			'date' : date_text
		})
		site['datevalues'].append(date_text)
		return (site, current_id)

	print "Starting Sites Dates..."
	if CURSOR:
		sql_command = sites_sql.SITEDATES
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_dates.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)
	print "Finished Sites Dates..."
Example #10
File: sites.py Project: rsinghal/giza
def process_site_related_constituents(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'role_index' : columns.index('Role'),
			'constituent_id_index' : columns.index('ConstituentID'),
			'constituent_type_id_index' : columns.index('ConstituentTypeID'),
			'display_name_index' : columns.index('DisplayName'),
			'display_date_index' : columns.index('DisplayDate'),
			'remarks_index' : columns.index('Remarks'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related constituents
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this site
		if role not in site['roles']:
			# make sure Tomb Owner is first
			if role == "Tomb Owner":
				site['roles'].insert(0, role)
			else:
				site['roles'].append(role)

		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		constituent_dict['role'] = role
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)

		# add to array of people for easier searching
		if (constituent_type_key in [1,3]):
			site['people'].append(display_name)

		if constituent_type not in site['relateditems']:
			site['relateditems'][constituent_type] = []
		site['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		site['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		if role == 'Tomb Owner':
			site['tombowner'] = "Yes"
		return(site, current_id)

	print "Starting Sites Related Constituents..."
	if CURSOR:
		sql_command = sites_sql.RELATED_CONSTITUENTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_constituents_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)

	print "Finished Sites Related Constituents..."
Example #11
File: sites.py Project: rsinghal/giza
def process_site_related_published(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'reference_id_index' : columns.index('ReferenceID'),
			'title_index' : columns.index('Title'),
			'boiler_text_index' : columns.index('BoilerText'),
			'date_index' : columns.index('DisplayDate'),
			'path_index' : columns.index('MainPathName'),
			'file_index' : columns.index('MainFileName'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related published
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return(site, current_id)
		if 'relateditems' not in site:
			site['relateditems'] = {}

		reference_id = row[indices['reference_id_index']]
		title = row[indices['title_index']]
		boiler_text = row[indices['boiler_text_index']]
		date = row[indices['date_index']]
		main_url = get_media_url(row[indices['path_index']], row[indices['file_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		if "pubdocs" not in site['relateditems']:
			site['relateditems']["pubdocs"] = []
		site['relateditems']["pubdocs"].append({
			'id' : reference_id,
			'boilertext' : boiler_text,
			'displaytext' : title,
			'date' : date,
			'url' : main_url,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		site['relateditems']['pubdocs'].sort(key=operator.itemgetter('displaytext'))
		return(site, current_id)

	print "Starting Sites Related Published..."
	if CURSOR:
		sql_command = sites_sql.RELATED_PUBLISHED
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_published_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)

	print "Finished Sites Related Published..."
Example #12
File: sites.py Project: rsinghal/giza
def process_site_related_objects(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'object_id_index' : columns.index('ObjectID'),
			'object_title_index' : columns.index('Title'),
			'object_number_index' : columns.index('ObjectNumber'),
			'object_date_index' : columns.index('ObjectDate'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue
		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return (site, current_id)

		if 'relateditems' not in site:
			site['relateditems'] = {}
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)
		object_id = int(row[indices['object_id_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		object_title = row[indices['object_title_index']]
		object_number = row[indices['object_number_index']]
		if classification == "diarypages" and object_title.lower() == "null":
			idx = object_number.find('_')
			object_title = object_number[idx+1:]
		if object_title.lower() == "null":
			object_title = "[No Title]"

		if classification not in site['relateditems']:
			site['relateditems'][classification] = []
		site['relateditems'][classification].append({
			'id' : object_id,
			'title' : object_title,
			'displaytext' : object_title,
			'classificationid' : classification_key,
			'number' : object_number,
			'date' : date,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		site['relateditems'][classification].sort(key=operator.itemgetter('displaytext'))
		return (site, current_id)

	print "Starting Sites Related Objects..."
	if CURSOR:
		sql_command = sites_sql.RELATED_OBJECTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_objects_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)
	print "Finished Sites Related Objects..."
Example #13
File: sites.py Project: rsinghal/giza
def process_site_altnums(CURSOR):
	def get_indices():
		indices = {
			'site_id_index' : columns.index('SiteID'),
			'altnum_index' : columns.index('AltNum'),
			'description_index' : columns.index('Description')
		}
		return indices

	def process_site_row(site, current_id):
		site_id = row[indices['site_id_index']]
		#if site_id not in SAMPLE_SITES:
		#	continue

		if site_id != current_id:
			# will likely have multiple rows for one site because of many related objects
			# only get a new site if we have a new site id, but first save old site to elasticsearch
			save(site)
			current_id = site_id
			site = {}
			if elasticsearch_connection.item_exists(site_id, 'sites'):
				site = elasticsearch_connection.get_item(site_id, 'sites')
			else:
				print "%s could not be found!" % site_id
				return (site, current_id)

		if 'altnums' not in site:
			site['altnums'] = []
		if 'altnum_types' not in site:
			site['altnum_types'] = []

		altnum = row[indices['altnum_index']]
		description = row[indices['description_index']] if row[indices['description_index']] != "NULL" else ""
		if description not in site['altnum_types']:
			site['altnum_types'].append(description)

		site['altnums'].append({"altnum" : altnum, "description" : description})
		return (site, current_id)

	print "Starting Sites AltNums..."
	if CURSOR:
		sql_command = sites_sql.ALTNUMS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)
	else:
		with open('../data/sites_altnums.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)
	print "Finished Sites AltNums..."
Example #14
File: sites.py Project: rsinghal/giza
def process_sites(CURSOR):
	def get_indices():
		site_id_index = columns.index('ID')
		return site_id_index

	def process_site_row(site, current_id):
		site_id = row[site_id_index]
		#if site_id not in SAMPLE_SITES:
		#	continue
		# could have multiple rows for one site because of multiple SiteDates or other pieces of information
		# only create a new site if we have a new site id, but first save old site to elasticsearch
		if site_id != current_id:
			save(site)
			current_id = site_id
			site = {}

		# loop through each row
		for index, value in enumerate(columns):
			key = value.lower()
			row_value = row[index]

			# cleanup row data
			if row_value.isdigit():
				row_value = int(row_value)
			elif row_value == "NULL":
				row_value = None
			else:
				row_value = row_value.replace(',,','')

			if 'sitetype' in key:
				if not row_value:
					continue
				# group sitetype fields into an object
				if 'sitetype' not in site:
					site['sitetype'] = {}
				site['sitetype'][key] = row_value
			else:
				# no special processing - just add it to the JSON
				site[key] = row_value
		display_text = site['number']
		site['displaytext'] = display_text
		site['tombowner'] = "No"
		site['roles'] = []
		site['people'] = []
		site['datevalues'] = []
		return (site, current_id)

	print "Starting Sites..."
	if CURSOR:
		sql_command = sites_sql.SITES
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		site_id_index = get_indices()

		site = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(site, current_id) = process_site_row(site, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(site)

	else:
		with open('../data/sites.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			site_id_index = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			site = {}
			current_id = '-1'
			for row in rows:
				(site, current_id) = process_site_row(site, current_id)
			# save last object to elasticsearch
			save(site)

	print "Finished Sites..."
Example #15
def process_constituents_related_sites(CURSOR):
    def get_indices():
        indices = {
            "constituent_id_index": columns.index("ConstituentID"),
            "type_id_index": columns.index("ConstituentTypeID"),
            "site_id_index": columns.index("SiteID"),
            "site_name_index": columns.index("SiteName"),
            "site_number_index": columns.index("SiteNumber"),
            "thumb_path_index": columns.index("ThumbPathName"),
            "thumb_file_index": columns.index("ThumbFileName"),
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related constituents
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        site_id = row[indices["site_id_index"]]
        site_name = row[indices["site_name_index"]]
        site_number = row[indices["site_number_index"]]
        thumbnail_url = get_media_url(row[indices["thumb_path_index"]], row[indices["thumb_file_index"]])

        site_dict = {}
        site_dict["id"] = site_id
        site_dict["sitename"] = site_name
        site_dict["sitenumber"] = site_number
        site_dict["displaytext"] = "%s, %s" % (site_name, site_number)
        site_dict["thumbnail"] = thumbnail_url

        if "sites" not in constituent["relateditems"]:
            constituent["relateditems"]["sites"] = []
        constituent["relateditems"]["sites"].append(site_dict)
        # keep the related items sorted
        constituent["relateditems"]["sites"].sort(key=operator.itemgetter("displaytext"))

        return (constituent, current_id)

    print "Starting Constituents Related Sites..."
    if CURSOR:
        sql_command = constituents_sql.RELATED_SITES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
            # save last object to elasticsearch
        save(constituent)
    else:
        with open("../data/constituents_sites_related.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
                # save last object to elasticsearch
            save(constituent)

    print "Finished Constituents Related Sites..."
Example #16
def process_pubs(CURSOR):
	def get_indices():
		pub_id_index = columns.index('ID')
		return pub_id_index

	def process_pub_row(pub, current_id):
		pub_id = row[pub_id_index]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}

		# loop through columns in row
		for index, value in enumerate(columns):
			key = value.lower()
			row_value = row[index]

			# cleanup row data
			if row_value.isdigit():
				row_value = int(row_value)
			elif row_value == "NULL":
				row_value = None
			else:
				row_value = row_value.replace(',,','')

			pub[key] = row_value

		display_text = pub['title']
		pub['displaytext'] = display_text
		pub['roles'] = []
		pub["authors"] = []
		return (pub, current_id)

	print "Starting Pub Docs..."
	if CURSOR:
		sql_command = published_sql.PUBLISHED
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		pub_id_index = get_indices()

		pub = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(pub, current_id) = process_pub_row(pub, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(pub)

	else:
		with open('../data/pubdocs.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			pub_id_index = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			pub = {}
			current_id = '-1'
			for row in rows:
				(pub, current_id) = process_pub_row(pub, current_id)
			# save last object to elasticsearch
			save(pub)

	print "Finished Pub Docs..."
Example #17
def process_constituents_related_published(CURSOR):
    def get_indices():
        indices = {
            "constituent_id_index": columns.index("ConstituentID"),
            "type_id_index": columns.index("ConstituentTypeID"),
            "reference_id_index": columns.index("ReferenceID"),
            "title_index": columns.index("Title"),
            "boiler_text_index": columns.index("BoilerText"),
            "date_index": columns.index("DisplayDate"),
            "path_index": columns.index("MainPathName"),
            "file_index": columns.index("MainFileName"),
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many related published
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)
        if "relateditems" not in constituent:
            constituent["relateditems"] = {}

        reference_id = row[indices["reference_id_index"]]
        title = row[indices["title_index"]]
        boiler_text = row[indices["boiler_text_index"]]
        date = row[indices["date_index"]]
        main_url = get_media_url(row[indices["path_index"]], row[indices["file_index"]])

        if "pubdocs" not in constituent["relateditems"]:
            constituent["relateditems"]["pubdocs"] = []
        constituent["relateditems"]["pubdocs"].append(
            {"id": reference_id, "boilertext": boiler_text, "displaytext": title, "date": date, "url": main_url}
        )
        # keep the related items sorted
        constituent["relateditems"]["pubdocs"].sort(key=operator.itemgetter("displaytext"))
        return (constituent, current_id)

    print "Starting Constituents Related Published..."
    if CURSOR:
        sql_command = constituents_sql.RELATED_PUBLISHED
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
            # save last object to elasticsearch
        save(constituent)
    else:
        with open("../data/constituents_published_related.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
                # save last object to elasticsearch
            save(constituent)

    print "Finished Constituents Related Published..."
Example #18
def process_pub_related_objects(CURSOR):
	def get_indices():
		indices = {
			'pub_id_index' : columns.index('ReferenceID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'object_id_index' : columns.index('ObjectID'),
			'object_title_index' : columns.index('Title'),
			'object_number_index' : columns.index('ObjectNumber'),
			'object_date_index' : columns.index('ObjectDate'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_pub_row(pub, current_id):
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				print "%s could not be found!" % pub_id
				return (pub, current_id)

		if 'relateditems' not in pub:
			pub['relateditems'] = {}
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)
		object_id = int(row[indices['object_id_index']])
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		date = "" if row[indices['object_date_index']].lower() == "null" else row[indices['object_date_index']]
		object_title = row[indices['object_title_index']]
		object_number = row[indices['object_number_index']]
		if classification == "diarypages" and object_title.lower() == "null":
			idx = object_number.find('_')
			object_title = object_number[idx+1:]
		if object_title.lower() == "null":
			object_title = "[No Title]"

		if classification not in pub['relateditems']:
			pub['relateditems'][classification] = []
		pub['relateditems'][classification].append({
			'id' : object_id,
			'title' : object_title,
			'displaytext' : object_title,
			'classificationid' : classification_key,
			'number' : object_number,
			'date' : date,
			'thumbnail' : thumbnail_url})
		# keep the related items sorted
		pub['relateditems'][classification].sort(key=operator.itemgetter('displaytext'))

		return (pub, current_id)

	print "Starting Pub Docs Related Objects..."
	if CURSOR:
		sql_command = published_sql.RELATED_OBJECTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		pub = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(pub, current_id) = process_pub_row(pub, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(pub)
	else:
		with open('../data/pubdocs_objects_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			pub = {}
			current_id = '-1'
			for row in rows:
				(pub, current_id) = process_pub_row(pub, current_id)
			# save last object to elasticsearch
			save(pub)
	print "Finished Pub Docs Related Objects..."
Example #19
def process_constituents_altnames(CURSOR):
    def get_indices():
        indices = {
            "constituent_id_index": columns.index("ConstituentID"),
            "type_id_index": columns.index("ConstituentTypeID"),
            "altname_index": columns.index("DisplayName"),
            "name_type_index": columns.index("NameType"),
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        if constituent_id != current_id:
            # will likely have multiple rows for one constituent because of many alternate names
            # only get a new constituent if we have a new constituent id, but first save old constituent to elasticsearch
            save(constituent)
            current_id = constituent_id
            constituent = {}
            if elasticsearch_connection.item_exists(constituent_id, type):
                constituent = elasticsearch_connection.get_item(constituent_id, type)
            else:
                print "%s could not be found!" % constituent_id
                return (constituent, current_id)

        if "altnames" not in constituent:
            constituent["altnames"] = []
        altname = row[indices["altname_index"]]
        name_type = row[indices["name_type_index"]]
        constituent["altnames"].append({"name": altname, "type": name_type})
        return (constituent, current_id)

    print "Starting Constituents AltNames..."
    if CURSOR:
        sql_command = constituents_sql.ALT_NAMES
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(constituent)
    else:
        with open("../data/constituents_altnames.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
            # save last object to elasticsearch
            save(constituent)
    print "Finished Constituents AltNames..."
Example #20
def process_pub_related_constituents(CURSOR):
	def get_indices():
		indices = {
			'pub_id_index' : columns.index('ReferenceID'),
			'role_index' : columns.index('Role'),
			'constituent_id_index' : columns.index('ConstituentID'),
			'constituent_type_id_index' : columns.index('ConstituentTypeID'),
			'display_name_index' : columns.index('DisplayName'),
			'alpha_sort_index' : columns.index('AlphaSort'),
			'display_date_index' : columns.index('DisplayDate'),
			'remarks_index' : columns.index('Remarks'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_pub_row(pub, current_id):
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			# will likely have multiple rows for one pub because of many related constituents
			# only get a new pub if we have a new pub id, but first save old pub to elasticsearch
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				print "%s could not be found!" % pub_id
				return(pub, current_id)
		if 'relateditems' not in pub:
			pub['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		alpha_sort = row[indices['alpha_sort_index']]

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this pub
		if role not in pub['roles']:
			pub['roles'].append(role)

		if role == "Author":
			pub["authors"].append(alpha_sort)

		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		constituent_dict['role'] = role
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in pub['relateditems']:
			pub['relateditems'][constituent_type] = []
		pub['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		pub['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		return(pub, current_id)

	print "Starting Pub Docs Related Constituents..."
	if CURSOR:
		sql_command = published_sql.RELATED_CONSTITUENTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		pub = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(pub, current_id) = process_pub_row(pub, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(pub)
	else:
		with open('../data/pubdocs_constituents_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			pub = {}
			current_id = '-1'
			for row in rows:
				(pub, current_id) = process_pub_row(pub, current_id)
			# save last object to elasticsearch
			save(pub)

	print "Finished Pub Docs Related Constituents..."
Example #21
File: objects.py  Project: rsinghal/giza
def process_objects(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'object_number_index' : columns.index('Number')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)
		object_number = row[indices['object_number_index']]

		#if id not in SAMPLE_OBJECTS:
		#	return (object, current_id)

		# I don't think there are any duplicate rows for objects, but keep it here since it doesn't break anything
		if id != current_id:
			save(object)
			current_id = id
			object = {}

		object['classification'] = classification
		# loop through each row
		for index, value in enumerate(columns):
			key = value.lower()
			row_value = row[index]

			# cleanup row data
			if row_value.isdigit():
				row_value = int(row_value)
			elif row_value == "NULL":
				row_value = None
			else:
				row_value = row_value.replace(',,','')

			if 'title' in key:
				object_title = row_value
				if classification == "diarypages" and object_title is None:
					idx = object_number.find('_')
					object_title = object_number[idx+1:]
					object[key] = object_title
				else:
					object[key] = row_value
			else:
				# remove whitespace at end of line if a string
				object[key] = row_value.rstrip() if type(row_value) is str else row_value
		# Add some extra fields not in the TMS data
		object['displaytext'] = object['title']
		prefix_idx = object_number.find('_')
		object['allnumbers'] = [object_number, object_number[prefix_idx+1:]]
		object['roles'] = []
		object['hasphoto'] = "No"
		return (object, current_id)

	print "Starting Objects..."
	if CURSOR:
		sql_command = objects_sql.OBJECTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)

	else:
		with open('../data/objects.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects..."
Example #22
def process_pub_related_media(CURSOR):
	def get_indices():
		indices = {
			'pub_id_index' : columns.index('ReferenceID'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName'),
			'main_path_index' : columns.index('MainPathName'),
			'main_file_index' : columns.index('MainFileName')
		}
		return indices

	def process_pub_row(pub, current_id):
		pub_id = row[indices['pub_id_index']]

		if pub_id != current_id:
			save(pub)
			current_id = pub_id
			pub = {}
			if elasticsearch_connection.item_exists(pub_id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(pub_id, 'pubdocs')
			else:
				print "%s could not be found!" % pub_id
				return(pub, current_id)

		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])

		pub['primarydisplay'] = {
			'thumbnail' : thumbnail_url,
			'main' : thumbnail_url
		}
		pub['pdf'] = main_url

		return(pub, current_id)

	print "Starting Pub Docs Related Media..."
	if CURSOR:
		sql_command = published_sql.RELATED_MEDIA
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		pub = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(pub, current_id) = process_pub_row(pub, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(pub)
	else:
		with open('../data/pubdocs_media_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			pub = {}
			current_id = '-1'
			for row in rows:
				(pub, current_id) = process_pub_row(pub, current_id)
			# save last object to elasticsearch
			save(pub)

	print "Finished Pub Docs Related Media..."
Example #23
File: objects.py  Project: rsinghal/giza
def process_object_flexfields(CURSOR):
	def get_indices():
		indices = {
			'object_id_index' : columns.index('ObjectID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'group_name_index' : columns.index('GroupName'),
			'field_name_index' : columns.index('UserFieldName'),
			'field_value_index' : columns.index('UserFieldValue')
		}
		return indices

	def process_object_row(object, current_id):
		object_id = row[indices['object_id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if object_id != current_id:
			# will likely have multiple rows
			save(object)
			current_id = object_id
			object = {}
			if elasticsearch_connection.item_exists(object_id, classification):
				object = elasticsearch_connection.get_item(object_id, classification)
			else:
				print "%s could not be found!" % object_id
				return (object, current_id)

		if 'flexfields' not in object:
			object['flexfields'] = {}

		groupname = row[indices['group_name_index']]
		if groupname not in object['flexfields']:
			object['flexfields'][groupname] = []

		fieldname = row[indices['field_name_index']]
		fieldvalue = row[indices['field_value_index']]
		object['flexfields'][groupname].append({fieldname : fieldvalue})
		return (object, current_id)

	print "Starting Objects Flex Fields..."
	if CURSOR:
		sql_command = objects_sql.FLEXFIELDS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_flexfields.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)
	print "Finished Objects Flex Fields..."
Example #24
def process_pub_related_sites(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ReferenceID'),
			'site_id_index' : columns.index('SiteID'),
			'site_name_index' : columns.index('SiteName'),
			'site_number_index' : columns.index('SiteNumber'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_pub_row(pub, current_id):
		id = row[indices['id_index']]

		if id != current_id:
			save(pub)
			current_id = id
			pub = {}
			if elasticsearch_connection.item_exists(id, 'pubdocs'):
				pub = elasticsearch_connection.get_item(id, 'pubdocs')
			else:
				print "%s could not be found!" % id
				return(pub, current_id)
		if 'relateditems' not in pub:
			pub['relateditems'] = {}

		site_id = row[indices['site_id_index']]
		site_name = row[indices['site_name_index']]
		site_number = row[indices['site_number_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		site_dict = {}
		site_dict['id'] = site_id
		site_dict['sitename'] = site_name
		site_dict['sitenumber'] = site_number
		site_dict['displaytext'] = site_number
		site_dict['thumbnail'] = thumbnail_url

		if 'sites' not in pub['relateditems']:
			pub['relateditems']['sites'] = []
		pub['relateditems']['sites'].append(site_dict)
		# keep the related items sorted
		pub['relateditems']['sites'].sort(key=operator.itemgetter('displaytext'))

		return(pub, current_id)

	print "Starting Pub Docs Related Sites..."
	if CURSOR:
		sql_command = published_sql.RELATED_SITES
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		pub = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(pub, current_id) = process_pub_row(pub, current_id)
			cursor_row = CURSOR.fetchone()
		# save last pub to elasticsearch
		save(pub)
	else:
		with open('../data/pubdocs_sites_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			pub = {}
			current_id = '-1'
			for row in rows:
				(pub, current_id) = process_pub_row(pub, current_id)
			# save last pub to elasticsearch
			save(pub)

	print "Finished Pub Docs Related Sites..."
Example #25
File: objects.py  Project: rsinghal/giza
def process_object_related_constituents(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'role_index' : columns.index('Role'),
			'role_id_index' : columns.index('RoleID'),
			'constituent_id_index' : columns.index('ConstituentID'),
			'constituent_type_id_index' : columns.index('ConstituentTypeID'),
			'display_name_index' : columns.index('DisplayName'),
			'display_date_index' : columns.index('DisplayDate'),
			'classification_id_index' : columns.index('ClassificationID'),
			'remarks_index' : columns.index('Remarks'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related constituents
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				print "%s could not be found!" % id
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		constituent_id = row[indices['constituent_id_index']]
		display_name = row[indices['display_name_index']]
		description = row[indices['remarks_index']] if row[indices['remarks_index']] != "NULL" else ""
		display_date = ""
		if row[indices['display_date_index']] != "NULL":
			display_date = row[indices['display_date_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])

		constituent_dict = {}
		role = row[indices['role_index']]
		# update the set of roles for this object
		if role not in object['roles']:
			object['roles'].append(role)

		constituent_dict['role'] = row[indices['role_index']]
		constituent_dict['roleid'] = row[indices['role_id_index']]
		constituent_dict['id'] = constituent_id
		constituent_dict['displayname'] = display_name
		constituent_dict['displaydate'] = display_date
		constituent_dict['displaytext'] = display_name
		constituent_dict['description'] = description
		constituent_dict['thumbnail'] = thumbnail_url

		constituent_type_key = int(row[indices['constituent_type_id_index']])
		constituent_type = CONSTITUENTTYPES.get(constituent_type_key)
		if constituent_type not in object['relateditems']:
			object['relateditems'][constituent_type] = []
		object['relateditems'][constituent_type].append(constituent_dict)
		# keep the related items sorted
		object['relateditems'][constituent_type].sort(key=operator.itemgetter('displaytext'))

		return(object, current_id)

	print "Starting Objects Related Constituents..."
	if CURSOR:
		sql_command = objects_sql.RELATED_CONSTITUENTS
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_constituents_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects Related Constituents..."
Example #26
def process_constituents(CURSOR):
    def get_indices():
        indices = {"constituent_id_index": columns.index("ID"), "type_id_index": columns.index("ConstituentTypeID")}
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices["constituent_id_index"]]
        type_key = int(row[indices["type_id_index"]])
        type = CONSTITUENTTYPES.get(type_key)

        # could have multiple rows
        if constituent_id != current_id:
            save(constituent)
            current_id = constituent_id
            constituent = {}

        constituent["type"] = type
        # loop through each row
        for index, value in enumerate(columns):
            key = value.lower()
            row_value = row[index]

            # cleanup row data
            if row_value.isdigit():
                row_value = int(row_value)
            elif row_value == "NULL":
                row_value = None
            else:
                row_value = row_value.replace(",,", "")

            if key in ["begindate", "enddate"]:
                if row_value == 0:
                    row_value = None

            constituent[key] = row_value

        display_text = constituent["displayname"]
        constituent["displaytext"] = display_text
        return (constituent, current_id)

    print "Starting Constituents..."
    if CURSOR:
        sql_command = constituents_sql.CONSTITUENTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = "-1"
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(constituent)

    else:
        with open("../data/constituents.csv", "rb") as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace("\r\n", "")
            columns = headers.split(",")
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=",", quotechar='"')
            constituent = {}
            current_id = "-1"
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
            # save last object to elasticsearch
            save(constituent)

    print "Finished Constituents..."
Example #27
File: objects.py  Project: rsinghal/giza
def process_object_related_media(CURSOR):
	def get_indices():
		indices = {
			'id_index' : columns.index('ID'),
			'media_master_id_index' : columns.index('MediaMasterID'),
			'classification_id_index' : columns.index('ClassificationID'),
			'primary_display_index' : columns.index('PrimaryDisplay'),
			'media_type_id_index' : columns.index('MediaTypeID'),
			'description_index' : columns.index('Description'),
			'caption_index' : columns.index('PublicCaption'),
			'thumb_path_index' : columns.index('ThumbPathName'),
			'thumb_file_index' : columns.index('ThumbFileName'),
			'main_path_index' : columns.index('MainPathName'),
			'main_file_index' : columns.index('MainFileName')
		}
		return indices

	def process_object_row(object, current_id):
		id = row[indices['id_index']]
		classification_key = int(row[indices['classification_id_index']])
		classification = CLASSIFICATIONS.get(classification_key)

		if id != current_id:
			# may have multiple rows for one object because of many related media items
			save(object)
			current_id = id
			object = {}
			if elasticsearch_connection.item_exists(id, classification):
				object = elasticsearch_connection.get_item(id, classification)
			else:
				print "%s could not be found!" % id
				return(object, current_id)
		if 'relateditems' not in object:
			object['relateditems'] = {}

		media_type_key = int(row[indices['media_type_id_index']])
		media_type = MEDIATYPES.get(media_type_key)
		media_master_id = row[indices['media_master_id_index']]
		thumbnail_url = get_media_url(row[indices['thumb_path_index']], row[indices['thumb_file_index']])
		main_url = get_media_url(row[indices['main_path_index']], row[indices['main_file_index']])
		display_text = row[indices['caption_index']]

		# this is a bit of a hack because the MediaFormats for videos (in the TMS database) does not correctly identify the type of video
		# so, make sure we are only using videos that are mp4s
		if media_type_key == 3:
			if not row[indices['main_file_index']].endswith('mp4'):
				return(object, current_id)

		if media_type not in object['relateditems']:
			object['relateditems'][media_type] = []

		if media_type == 'photos':
			object['hasphoto'] = "Yes"
		# add primary photo as a top level item as well
		if row[indices['primary_display_index']] == '1':
			object['primarydisplay'] = {
				'thumbnail' : thumbnail_url,
				'main' : main_url,
				'displaytext' : display_text
			}
		if not (classification == '3dmodels' and media_type == '3dmodels'):
			object['relateditems'][media_type].append({
				'id' : media_master_id,
				'displaytext' : display_text,
				'primarydisplay' : True if row[indices['primary_display_index']] == '1' else False,
				'thumbnail' : thumbnail_url,
				'main' : main_url
				})
		return(object, current_id)

	print "Starting Objects Related Media..."
	if CURSOR:
		sql_command = objects_sql.RELATED_MEDIA
		CURSOR.execute(sql_command)
		columns = [column[0] for column in CURSOR.description]
		indices = get_indices()

		object = {}
		current_id = '-1'
		cursor_row = CURSOR.fetchone()
		while cursor_row is not None:
			row = process_cursor_row(cursor_row)
			(object, current_id) = process_object_row(object, current_id)
			cursor_row = CURSOR.fetchone()
		# save last object to elasticsearch
		save(object)
	else:
		with open('../data/objects_media_related.csv', 'rb') as csvfile:
			# Get the query headers to use as keys in the JSON
			headers = next(csvfile)
			if headers.startswith(codecs.BOM_UTF8):
				headers = headers[3:]
			headers = headers.replace('\r\n','')
			columns = headers.split(',')
			indices = get_indices()

			rows = csv.reader(csvfile, delimiter=',', quotechar='"')
			object = {}
			current_id = '-1'
			for row in rows:
				(object, current_id) = process_object_row(object, current_id)
			# save last object to elasticsearch
			save(object)

	print "Finished Objects Related Media..."
Example #28
def process_constituents(CURSOR):
    def get_indices():
        indices = {
            'constituent_id_index': columns.index('ID'),
            'type_id_index': columns.index('ConstituentTypeID')
        }
        return indices

    def process_constituent_row(constituent, current_id):
        constituent_id = row[indices['constituent_id_index']]
        type_key = int(row[indices['type_id_index']])
        type = CONSTITUENTTYPES.get(type_key)

        # could have multiple rows
        if constituent_id != current_id:
            save(constituent)
            current_id = constituent_id
            constituent = {}

        constituent['type'] = type
        # loop through each row
        for index, value in enumerate(columns):
            key = value.lower()
            row_value = row[index]

            # cleanup row data
            if row_value.isdigit():
                row_value = int(row_value)
            elif row_value == "NULL":
                row_value = None
            else:
                row_value = row_value.replace(',,', '')

            if key in ['begindate', 'enddate']:
                if row_value == 0:
                    row_value = None

            constituent[key] = row_value

        display_text = constituent['displayname']
        constituent['displaytext'] = display_text
        return (constituent, current_id)

    print "Starting Constituents..."
    if CURSOR:
        sql_command = constituents_sql.CONSTITUENTS
        CURSOR.execute(sql_command)
        columns = [column[0] for column in CURSOR.description]
        indices = get_indices()

        constituent = {}
        current_id = '-1'
        cursor_row = CURSOR.fetchone()
        while cursor_row is not None:
            row = process_cursor_row(cursor_row)
            (constituent, current_id) = process_constituent_row(constituent, current_id)
            cursor_row = CURSOR.fetchone()
        # save last object to elasticsearch
        save(constituent)

    else:
        with open('../data/constituents.csv', 'rb') as csvfile:
            # Get the query headers to use as keys in the JSON
            headers = next(csvfile)
            if headers.startswith(codecs.BOM_UTF8):
                headers = headers[3:]
            headers = headers.replace('\r\n', '')
            columns = headers.split(',')
            indices = get_indices()

            rows = csv.reader(csvfile, delimiter=',', quotechar='"')
            constituent = {}
            current_id = '-1'
            for row in rows:
                (constituent, current_id) = process_constituent_row(constituent, current_id)
            # save last object to elasticsearch
            save(constituent)

    print "Finished Constituents..."