Example #1
import get_data as gd


def get_full_transcripts():
    """Group the rows by "Identifier" and join each group's transcripts in "Title" order."""
    list_of_dicts = gd.get_data_list_of_dicts()

    # Collect every row under its identifier.
    final = {}
    for entry in list_of_dicts:
        identifier = entry["Identifier"]
        if identifier not in final:
            final[identifier] = [entry]
        else:
            final[identifier].append(entry)

    # Sort each identifier's rows by title so the pieces concatenate in order.
    for i in final:
        final[i] = sorted(final[i], key=lambda k: k["Title"])

    # Join the transcript fragments into one string per identifier.
    transcripts = {}
    for identifier, rows in final.items():
        transcripts[identifier] = "".join(item["Transcript"] for item in rows)

    return transcripts
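
A quick way to exercise the function above, as a sketch only: it assumes get_data is importable and that each row dict carries the "Identifier", "Title", and "Transcript" keys used in the code, which is inferred from these examples rather than documented anywhere.

transcripts = get_full_transcripts()
for identifier in sorted(transcripts):
    # Each value is that identifier's transcripts concatenated in "Title" order.
    print("%s: %d characters" % (identifier, len(transcripts[identifier])))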
Example #2
import re
from random import randint

import get_data as gd


def makeTrainingData(wordList):
    """Write bag-of-words rows labelled with the author's age bracket to two CSV files."""
    list_of_dicts = gd.get_data_list_of_dicts()
    probability = 3     # a row joins the training subset when randint(0, 10) > 3
    full = []           # every row
    partial = []        # random subset, with at least one row per age bracket
    added = {
        "0-10": False,
        "10-20": False,
        "20-30": False,
        "30-40": False,
        "40-50": False,
        "50-60": False,
        "60-70": False,
        "70-80": False,
        "80-90": False,
        "unknown": False
    }

    for entry in list_of_dicts:
        tempDict = {}

        # Age brackets look like "0-10" ... "80-90" (4 to 8 characters); anything else is unknown.
        decade = "unknown"
        if 3 < len(entry["Age of Author"]) < 9:
            decade = entry["Age of Author"]

        tempDict["class"] = decade

        # Strip markup and punctuation, then split the transcript into words.
        t = entry["Transcript"]
        t = t.replace("<br>", " ")
        t = t.replace("COMMA", " ")
        t = re.sub(r"\W", " ", t)
        t = t.split(" ")

        # Count the words of interest; lower-case first so capitalised forms are counted too.
        for word in t:
            word = word.lower()
            if word != "" and word in wordList:
                tempDict[word] = tempDict.get(word, 0) + 1

        full.append(tempDict)

        # Randomly sample rows into the training subset, guaranteeing every bracket appears at least once.
        temp = randint(0, 10)
        if temp > probability or not added[decade]:
            added[decade] = True
            partial.append(tempDict)

    filename1 = "trainingData.csv"
    filename2 = "fullData.csv"
    wordList.append("class")
    headers = wordList
    gd.write_data_dicts(filename1, headers, partial)
    gd.write_data_dicts(filename2, headers, full)
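
makeTrainingData expects the caller to supply the vocabulary whose counts become the CSV columns. A minimal, purely illustrative driver might look like this; the word list below is invented for the sketch and is not the project's actual feature set.

vocabulary = ["the", "friend", "school", "because", "remember"]
makeTrainingData(vocabulary)
# trainingData.csv now holds the sampled rows and fullData.csv every row,
# with one count column per vocabulary word plus the "class" label.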
Example #3
import numpy as np

import get_data as gd


def normalize_data():
    """Return a [standard deviation, mean] pair for every column."""
    list_of_dicts = gd.get_data_list_of_dicts()
    headers = gd.get_headers()

    final = [[] for _ in headers]

    for x in range(len(headers)):
        # convert_colFloat (defined elsewhere in this project) turns the string column into floats.
        col = gd.get_data_slice(headers[x], list_of_dicts)
        colFloat = convert_colFloat(col)
        final[x].append(np.std(colFloat))
        final[x].append(np.mean(colFloat))

    return final
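
normalize_data only gathers the statistics; applying them is left to the caller. One way the [std, mean] pairs might be used to z-score a column, as a sketch assuming the same convert_colFloat helper is available:

stats = normalize_data()
headers = gd.get_headers()
first_col = convert_colFloat(gd.get_data_slice(headers[0], gd.get_data_list_of_dicts()))
std, mean = stats[0]
scaled = [(value - mean) / std for value in first_col]  # z-score of the first column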
Example #4
import get_data as gd


def remove_dup_cols():
    """Find pairs of (nearly) perfectly correlated columns that could be dropped as duplicates."""
    list_of_dicts = gd.get_data_list_of_dicts()
    headers = gd.get_headers()

    cols_2remove = []
    # The last 15 columns are excluded from the duplicate check.
    for x in range(len(headers) - 15):
        col1 = replace_missing(gd.get_data_slice(headers[x], list_of_dicts))
        for y in range(len(headers) - 15):
            col2 = replace_missing(gd.get_data_slice(headers[y], list_of_dicts))
            # replace_missing, find_r and exist_same_pair are project helpers defined elsewhere;
            # find_r appears to return the correlation coefficient as its first element.
            r = find_r(col1, col2)[0]
            pairIn = exist_same_pair(cols_2remove, x, y)
            if 0.99 <= r <= 1.002 and x != y and not pairIn:
                cols_2remove.append([y, x, r, headers[x], headers[y]])

    return cols_2remove
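
exist_same_pair is not shown in these excerpts. A stand-in consistent with how it is called above might look like the following; it is inferred from the call site, not the project's actual helper.

def exist_same_pair(pairs, x, y):
    # True if the column pair (x, y) was already recorded, in either order.
    return any(p[0] in (x, y) and p[1] in (x, y) for p in pairs)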
Example #5
import get_data as gd


def make_points():
    """Turn each data row into a normalized Point (Point and get_index are project helpers)."""
    rows = gd.get_data_list_of_dicts()
    rows = rows[1:]  # skip the first row

    points = []
    for row in rows:
        # Pre-size the coordinate list, then drop each column's value into its mapped slot.
        temp = [0] * len(row)
        for entry in row.keys():
            temp[get_index(entry)] = float(row[entry])
        temp_point = Point(temp)
        temp_point.normalize()
        points.append(temp_point)
    return points
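
Point and get_index are also not part of these excerpts. A minimal stand-in that matches how make_points uses them could look like this; the details (header-order indexing, unit-length normalization) are assumptions, not the project's actual implementation.

import math

HEADER_ORDER = []  # would normally be filled from gd.get_headers()

def get_index(header):
    # Map a column name to its coordinate position.
    return HEADER_ORDER.index(header)

class Point(object):
    def __init__(self, coords):
        self.coords = list(coords)

    def normalize(self):
        # Scale the coordinates to unit length (leave the zero vector alone).
        length = math.sqrt(sum(c * c for c in self.coords))
        if length:
            self.coords = [c / length for c in self.coords]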
Example #6
import get_data as gd
import re
import operator

list_of_dicts = gd.get_data_list_of_dicts()

decades_list = [{
    "decade": "1800",
    "words": {},
    "letters": 0,
    "wordCount": 0
}, {
    "decade": "1810",
    "words": {},
    "letters": 0,
    "wordCount": 0
}, {
    "decade": "1820",
    "words": {},
    "letters": 0,
    "wordCount": 0
}, {
    "decade": "1830",
    "words": {},
    "letters": 0,
    "wordCount": 0
}, {
    "decade": "1840",
    "words": {},
    "letters": 0,
    "wordCount": 0
                final = final
            else:
                final = final + i
        print("entry: " + final)
        new_headers.append(final)

    return new_headers

def change_to_list_of_lists(list_of_dicts):
    """Rebuild the rows as plain lists of values, ordered by the header list."""
    headers = gd.get_headers()
    final = []

    for x in range(len(list_of_dicts)):
        final.append([])

    # Walk the data column by column, appending each value to its row.
    for entry in headers:
        col = gd.get_data_slice(entry, list_of_dicts)
        for y in range(len(list_of_dicts)):
            final[y].append(col[y])

    return final

filename = "cleaned_headers.csv"
list_of_dicts = gd.get_data_list_of_dicts()
new_headers = cleaned_headers()
rows_list_of_lists = change_to_list_of_lists(list_of_dicts)

gd.write_data(filename, new_headers, rows_list_of_lists)