def findParents(TermID, parentHits, secondaryHits, checked):
    """Walk up the thesaurus hierarchy from TermID, collecting first-level parents.

    Recursively follows ParentTermID links in `thesaurusNew2015` until a term
    listed in the module-level `primary_parents_list` (string IDs) is reached.

    Args:
        TermID: term to start from (int or str; membership tests use str()).
        parentHits: accumulator of first-level parent IDs found so far.
        secondaryHits: accumulator of terms whose direct parent was a first-level hit.
        checked: list of already-visited terms (duplicate/cycle guard).

    Returns:
        The (parentHits, secondaryHits) pair; both lists are also mutated in place.
    """
    if TermID not in checked:
        checked.append(TermID)
        if str(TermID) in primary_parents_list:
            # TermID itself is a first-level parent.
            # FIX: compare stringified values on both sides — the original
            # checked `str(TermID) not in parentHits` against a list holding
            # raw (int) IDs, so the duplicate guard never matched.
            if str(TermID) not in {str(p) for p in parentHits}:
                parentHits.append(TermID)
        else:
            query = "SELECT DISTINCT `ParentTermID`, `ParentTermLabel` FROM `thesaurusNew2015` WHERE TermID = %s"
            cursor = db.get_cursor()
            cursor.execute(query, [TermID])
            parent_labels = {}  # ParentTermID (str) -> ParentTermLabel; was named `set`, shadowing the builtin
            while True:
                row = cursor.fetchone()
                if not row:
                    break
                parent_labels[str(row["ParentTermID"])] = row["ParentTermLabel"]
            # commit() after a SELECT is a no-op; kept for parity with the rest of the file.
            db.get_connection().commit()
            cursor.close()
            for ParentTermID in parent_labels:
                if str(ParentTermID) in primary_parents_list:
                    if str(ParentTermID) not in {str(p) for p in parentHits}:
                        parentHits.append(ParentTermID)
                    if str(TermID) not in {str(s) for s in secondaryHits}:
                        secondaryHits.append(TermID)
                else:
                    # Not a first-level parent yet; keep climbing.
                    parentHits, secondaryHits = findParents(ParentTermID, parentHits, secondaryHits, checked)
    return parentHits, secondaryHits
def getBaseProbabilities(level, database):
    """Load per-segmentile base probabilities for one thesaurus level.

    Reads every row of `<database><Level>LevelBaseProbabilities2015` and builds
    a nested dict of probabilities keyed by parent column and segmentile (1-100).

    Args:
        level: 'first' or 'second' (branching below is case-insensitive).
        database: table-name prefix; when truthy, `level` is capitalized so the
            concatenation forms a camelCase table name
            (e.g. 'x' + 'First' -> 'xFirstLevelBaseProbabilities2015').

    Returns:
        (base_probabilities, newtablename) where newtablename is the companion
        '<tablename>_extended' table name.
    """
    base_probabilities = {}
    if database:
        level = level[0].upper() + level[1:]
    tablename = database + level + 'LevelBaseProbabilities2015'
    query = 'SELECT * FROM {tablename}'.format(tablename=tablename)
    cursor = db.get_cursor()
    cursor.execute(query)
    count = 0
    while True:
        row = cursor.fetchone()
        if not row:
            break
        if level.lower() == 'first':
            # First level: one row per segmentile; every column is a parent.
            # Rows beyond the 100th segmentile are ignored.
            if count <= 100:
                for row_val in row:
                    if row_val not in base_probabilities:
                        base_probabilities[row_val] = {}
                    # (dead `= {}` pre-assignment from the original removed —
                    # it was overwritten on the very next statement.)
                    base_probabilities[row_val][count + 1] = row[row_val]
        if level.lower() == 'second':
            # Second level: one row per parent; columns L1..L100 hold the
            # per-segmentile probabilities.
            parent = "L" + str(row["ParentID"])
            if parent not in base_probabilities:
                base_probabilities[parent] = {}
            for index in range(1, 101):
                base_probabilities[parent][index] = row["L" + str(index)]
        count += 1
    newtablename = '{tablename}_extended'.format(tablename=tablename)
    return base_probabilities, newtablename
def insertToNewTable(baseProbabilities, parents, tablename):
    """Insert one row per parent into `tablename` (ParentID, ParentLabel, L1..L100).

    Args:
        baseProbabilities: {"L<ParentID>": {1..100: probability, ...}} mapping;
            a "ParentLabel" entry is added per parent when a label is known.
        parents: {ParentID (str): ParentLabel} lookup.
        tablename: destination table (created by createNewTable).
    """
    columnList = ['ParentID', 'ParentLabel']
    for num in range(1, 101):
        columnList.append('L' + str(num))
    columnsString = ', '.join(columnList)
    # FIX: bind values via driver placeholders instead of splicing them into
    # the SQL text (the original built the VALUES clause by string formatting).
    placeholders = ', '.join(['%s'] * len(columnList))
    query = 'INSERT INTO {tablename} ({columns}) VALUES ({values})'.format(
        tablename=tablename, columns=columnsString, values=placeholders)
    cursor = db.get_cursor()
    for parent in baseProbabilities:
        # Keys look like "L<ParentID>"; strip the leading "L" for lookups.
        if parent[1:] in parents:
            baseProbabilities[parent]["ParentLabel"] = parents[parent[1:]]
        values = baseProbabilities[parent]
        # Keep the legacy label normalization (ASCII-fold, drop apostrophes) so
        # rows written here match rows written by the original code.
        label = re.sub("'", "", values["ParentLabel"].encode('ascii', 'ignore'))
        params = [parent[1:], label]
        for num in range(1, 101):
            params.append(values[num])
        print(query)
        cursor.execute(query, params)
    cursor.close()
def createNewTable(level, database, tablename):
    """Create `tablename` if it does not exist.

    Schema: ParentID INT, ParentLabel VARCHAR(200), and L1..L100 FLOAT.
    `level` and `database` are accepted for signature compatibility but are
    not used by this function.
    """
    column_defs = ['ParentID INT', 'ParentLabel VARCHAR(200)']
    column_defs.extend('L' + str(n) + ' FLOAT' for n in range(1, 101))
    ddl = 'CREATE TABLE IF NOT EXISTS {tablename}'.format(tablename=tablename)
    ddl = ddl + ' ( ' + ', '.join(column_defs) + ' ) '
    cursor = db.get_cursor()
    cursor.execute(ddl)
    cursor.close()
def getParents(level):
    """Return {TermID (str): TermLabel} for every row of `<level>LevelParents2015`.

    Args:
        level: table-name prefix, e.g. 'first' or 'second'.
    """
    parents = {}
    tablename = level + 'LevelParents2015'
    query = 'SELECT * from {tablename}'.format(tablename=tablename)
    cursor = db.get_cursor()
    cursor.execute(query)
    while True:
        row = cursor.fetchone()
        if not row:
            break
        parents[str(row["TermID"])] = row["TermLabel"]
    # FIX: close the cursor — it was leaked here, while every sibling helper
    # in this file closes its cursor after use.
    cursor.close()
    return parents
from __future__ import division
import common.db as db
import json
import csv
from math import floor, ceil

# This is for only ShortFormID = 2, Even Testimonies and Pruned with SaturationScore >= 0.20
# And now, further pruned with TotSegment >= 40.
query1 = """ SELECT * FROM testimoniesSaturationPruned2015 """
cursor1 = db.get_cursor()
cursor1.execute(query1)

# IntCode -> testimony metadata. NormativityScore starts at 0 and is
# presumably filled in later in this script — not visible in this chunk.
testimony_details_pruned = {}
i = 0  # running count of testimonies loaded
while True:
    row = cursor1.fetchone()
    if not row:
        break
    else:
        testimony_details_pruned[row["IntCode"]] = {
            "IntCode": row["IntCode"],
            "IntervieweeName": row["IntervieweeName"],
            "TotTaggedSegments": row["TotTaggedSegments"],
            "TotSegments": row["TotSegments"],
            "SaturationPercent": row["SaturationPercent"],
            "NormativityScore": 0
        }
        i = i + 1
from __future__ import division
import common.db as db
import json
import xmltodict
from math import floor, ceil

# Extracting the First Level Parents:
# Important Note!: primary node has label 32582 NOT -1 in thesaurusNew2015
query = """ SELECT DISTINCT `TermID` , `TermLabel` FROM `thesaurusNew2015` WHERE `ParentTermID` = 32582 """
cursor = db.get_cursor()
cursor.execute(query)

primary_parents = {}       # TermID -> TermLabel (first-level parents)
primary_parents_list = []  # the same IDs as strings, for membership tests elsewhere
while True:
    row = cursor.fetchone()
    if not row:
        break
    if row:  # always true after the break above; kept as in the original
        primary_parents[row["TermID"]] = row["TermLabel"]
        primary_parents_list.append(str(row["TermID"]))
cursor.close()
print(len(primary_parents))
import common.db as db cursor = db.get_cursor() print "Resetting table ..." cursor.execute("""DROP TABLE IF EXISTS firstLevelBaseProbabilities""") segmentile_columns = ['L{}'.format(x) for x in range(1, 101)] cursor.execute( """CREATE TABLE firstLevelBaseProbabilities (TermID INT, KeywordLabel VARCHAR(100), ShortFormID INT, {} FLOAT)""" .format(' FLOAT, '.join(segmentile_columns))) print "Table reset" print "Loading data ..." query = """ SELECT totalKeywordsBySegment.ShortFormID, totalKeywordsBySegment.segmentile, FirstLevelParentID, keywordCountsBySegment.keywordCount, keywordCountsBySegment.keywordCount / totalKeywordsBySegment.totalKeywords AS probability FROM ( -- Total keywords by segment and ShortFormID SELECT t.ShortFormID, ROUND(SegmentNumber / totalSegments.total * 100) AS segmentile, COUNT(*) AS totalKeywords FROM segmentKeywordsFirstParents2015Filled AS sk JOIN ( SELECT sk.IntCode, MAX(SegmentNumber) AS total FROM segmentKeywordsFirstParents2015Filled AS sk GROUP BY sk.IntCode ) AS totalSegments ON totalSegments.IntCode = sk.IntCode JOIN testimonies2015_include AS t ON t.IntCode = sk.IntCode GROUP BY t.ShortFormID, segmentile
import common.db as db import csv import json from __future__ import division from math import floor, ceil # Getting all the unique parents first. And then creating the parents table # First, processing only the first level parents. query1 = """ SELECT TermID FROM `secondLevelParents2015` """ cursor1 = db.get_cursor() cursor1.execute(query1) parentsTable = {} while True: row=cursor1.fetchone() if not row: break else: print row["TermID"] i = 1 parentsTable[row["TermID"]] = {} for i in range(1,101): parentsTable[row["TermID"]][i] = 0 cursor1.close() # All the parent values have now been initialized. query1 = """ SELECT * FROM testimoniesSaturationPruned2015
import common.db as db conn = db.get_connection() cursor = db.get_cursor() cursor.execute('''DROP TABLE IF EXISTS firstLevelBaseProbabilitiesUnrolled''') cursor.execute( '''CREATE TABLE firstLevelBaseProbabilitiesUnrolled (TermID INT, ShortFormID INT, Percentile INT, Probability FLOAT)''' ) cursor.execute('''DROP TABLE IF EXISTS secondLevelBaseProbabilitiesUnrolled''') cursor.execute( '''CREATE TABLE secondLevelBaseProbabilitiesUnrolled (TermID INT, ShortFormID INT, Percentile INT, Probability FLOAT)''' ) read_cursor = db.get_cursor() write_cursor = db.get_cursor() read_cursor.execute('''SELECT * FROM firstLevelBaseProbabilities ''') while True: row = read_cursor.fetchone() if not row: break term_ID = row['TermID'] short_form_ID = row['ShortFormID'] for segmentile in range(1, 101): key = 'L' + str(segmentile) write_cursor.execute( '''INSERT INTO firstLevelBaseProbabilitiesUnrolled (TermID, ShortFormID, Percentile, Probability) VALUES (%s, %s, %s, %s) ''', [ term_ID, short_form_ID,
# This is to calculate the FIRST LEVEL parents based uniqueness scores for all
# of the testimonies fed from the chosen table and write them to file.
# FIX: `from __future__ import division` must be the first statement in the
# module — the original placed it after the other imports, which is a
# SyntaxError in Python.
from __future__ import division
import common.db as db
import csv
import json
from math import floor, ceil

# Read first level parent base probabilities.
query1 = """ SELECT * FROM firstLevelBaseProbabilities2015 """
cursor1 = db.get_cursor()
cursor1.execute(query1)

# "L<ParentID>" -> {segmentile (1-100) -> probability}
base_probabilities = {}
while True:
    row = cursor1.fetchone()
    if not row:
        break
    parent = "L" + str(row["ParentID"])
    if parent not in base_probabilities:
        base_probabilities[parent] = {}
    for index in range(1, 101):
        # (dead `= {}` pre-assignment from the original removed; it was
        # overwritten immediately.)
        base_probabilities[parent][index] = row["L" + str(index)]
cursor1.close()
from __future__ import division
import common.db as db
import json
import csv
from math import floor, ceil

# This is for only ShortFormID = 2, Even Testimonies and Pruned with SaturationScore >= 0.20
# And now, further pruned with TotSegment >= 40.
query1 = """ SELECT * FROM testimoniesSaturationPruned2015 """
cursor1 = db.get_cursor()
cursor1.execute(query1)

# IntCode -> testimony metadata. NormativityScore starts at 0 and is
# presumably computed later in this script — not visible in this chunk.
testimony_details_pruned = {}
i = 0  # running count of testimonies loaded
while True:
    row = cursor1.fetchone()
    if not row:
        break
    else:
        testimony_details_pruned[row["IntCode"]] = {
            "IntCode": row["IntCode"],
            "IntervieweeName": row["IntervieweeName"],
            "TotTaggedSegments": row["TotTaggedSegments"],
            "TotSegments": row["TotSegments"],
            "SaturationPercent": row["SaturationPercent"],
            "NormativityScore": 0
        }
        i = i + 1
# Opening all files in order: from os import walk allfiles = [] mypath = args.thesaurus for (dirpath, dirnames, filenames) in walk(mypath): allfiles.extend(filenames) break insert_query3 = """ INSERT INTO thesaurus2015 (TermID, TermLabel, ParentTermID, ParentTermLabel, ThesaurusID, ThesaurusType, KeywordCount, TypeID, IsParent, KWDefinition) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ count = 1 cursor = db.get_cursor() for file in allfiles: if True: try: filename = args.thesaurus + file myfile = open(filename, 'r') data = myfile.read() udata = unicode(data, 'utf-8') encoded_udata = udata.encode('utf-8-sig') decoded_udata = encoded_udata.decode('utf-8-sig') ordered_dict = xmltodict.parse(decoded_udata) pydict = dict(ordered_dict) items = pydict["USCShoahFoundationInstituteData"]["ThesaurusData"][ "Items"] l = len(items["Item"])
import common.db as db

conn = db.get_connection()
cursor = db.get_cursor()

# Rebuild both uniqueness-score tables from scratch (IntCode -> Score).
print 'Clearing database ...'
cursor.execute('''DROP TABLE IF EXISTS uniqueness_scores_first_level ''')
cursor.execute('''DROP TABLE IF EXISTS uniqueness_scores_second_level ''')
cursor.execute(
    '''CREATE TABLE uniqueness_scores_first_level (IntCode INT, Score FLOAT)'''
)
cursor.execute(
    '''CREATE TABLE uniqueness_scores_second_level (IntCode INT, Score FLOAT)'''
)

# IntCode -> highest SegmentNumber seen for that testimony; used later to
# turn segment numbers into percentiles (not visible in this chunk).
max_segments = {}
print 'Finding maximum segments ...'
cursor.execute(
    '''SELECT skf.IntCode, MAX(skf.SegmentNumber) AS max_seg FROM segmentKeywordsFirstParents2015Filled skf JOIN testimonies2015_include AS ti ON ti.IntCode = skf.IntCode GROUP BY IntCode ''')
while True:
    row = cursor.fetchone()
    if not row:
        break
    max_segments[row['IntCode']] = row['max_seg']

print 'Running first level normativities ...'
cursor.execute('''SELECT * FROM firstLevelBaseProbabilities ''')
# Populated from the rows of the query above, presumably in code past this
# chunk's end.
first_level_base_probabilities = {}
from os import walk allfiles = [] mypath = args.thesaurus for (dirpath, dirnames, filenames) in walk(mypath): allfiles.extend(filenames) break insert_query3 = """ INSERT INTO thesaurus2015 (TermID, TermLabel, ParentTermID, ParentTermLabel, ThesaurusID, ThesaurusType, KeywordCount, TypeID, IsParent, KWDefinition) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ count = 1 cursor = db.get_cursor() for file in allfiles: if True: try: filename = args.thesaurus + file myfile = open(filename, 'r') data = myfile.read() udata = unicode(data, 'utf-8') encoded_udata = udata.encode('utf-8-sig') decoded_udata = encoded_udata.decode('utf-8-sig') ordered_dict = xmltodict.parse(decoded_udata) pydict = dict(ordered_dict) items = pydict["USCShoahFoundationInstituteData"]["ThesaurusData"]["Items"] l = len(items["Item"]) for no in range(0,l):
# This code:
# First, all of the testimonies and their details are collected from the EVEN segments testimonies table.
# Second, the saturation scores and the unique_tags are obtained from the segment-keywords table.
# Third,
from __future__ import division
import common.db as db
import json

query = """ SELECT * from evenSegmentTestimonies2015 """
cursor = db.get_cursor()
cursor.execute(query)

# IntCodes of all even-segment testimonies.
evenTestimoniesList = []
while True:
    row = cursor.fetchone()
    if not row:
        break
    if row:
        evenTestimoniesList.append(row["IntCode"])
cursor.close()

# FIX: IntCode values come back from the driver as ints; ', '.join() on ints
# raises TypeError. str() each element (a no-op if they are already strings).
refinedEvenTestimoniesList = ', '.join(str(a) for a in evenTestimoniesList)

# Per-testimony totals (TotSegments, ShortFormID) restricted to the even list.
query = """ SELECT ES.IntCode AS IntCode, MAX( ES.SegmentNumber ) AS TotSegments, CS.ShortFormID AS ShortFormID FROM segmentKeywordsTable2015 AS ES, testimonies2015 AS CS WHERE ES.IntCode = CS.IntCode AND CS.ShortFormID = 2 AND ES.IntCode in ({0}) GROUP BY ES.IntCode """
# This code is to generate a new thesaurus (TWO PASS Code): # - Match the thesaurus keywords and the segment keywords by the keyword text. # - and reassign keyword ids! import common.db as db import re query = """ SELECT * FROM thesaurus2015 """ # Pass One: To get new unique IDs to each of teh terms matched by text. cursor = db.get_cursor() cursor.execute(query) newThesaurus = {} backMap = {} wordCount = 1 while True: row = cursor.fetchone() if not row: break if row: splice = row["TermLabel"].strip() if splice not in newThesaurus: newThesaurus[splice] = { "TermLabel": row["TermLabel"].strip(), "ThesaurusID": row["ThesaurusID"], "ThesaurusType": row["ThesaurusType"], "KeywordCount": row["KeywordCount"],
from common import db # find testimonies with > 40% saturation # # find all testimonies table_name = 'segmentKeywordsSecondParents2015filled' print "Clearing table ..." create_table_cursor = db.get_cursor() create_table_cursor.execute( '''DROP TABLE IF EXISTS {table_name}'''.format(**{ 'table_name': table_name, })) create_table_cursor.execute('''CREATE TABLE {table_name} ( SegmentNumber INT, SegmentID INT, TermID INT, TermLabel VARCHAR(255), SecondLevelParentID INT, ParentLabel VARCHAR(255), IntCode INT ) '''.format(**{ 'table_name': table_name, })) print 'Table cleared' get_testimony_segments = """ SELECT SegmentNumber, SegmentID, TermID, TermLabel, SecondLevelParentID, ParentLabel, IntCode