Exemplo n.º 1
0
def makeSubjectCsv(subjects_sorted):
    """Write per-year subject values for 1800-1911 to ``subject_year.csv``.

    One row is created per year with every known subject column set to 0.
    The values found in ``subjects_sorted`` are then copied into the row of
    the matching year, and the rows are handed to ``gd.write_data_dicts``.

    Args:
        subjects_sorted: iterable of dicts, each with a ``"subject"`` key and
            a ``"yearDate"`` dict keyed by year. The values of ``"yearDate"``
            are stored unchanged in the output rows.

    Years equal to the empty string are skipped silently. Years that cannot
    be converted to ``int`` or that fall outside 1800-1911 are printed and
    skipped.
    """
    subjects = ["Travel", "Education", "Love", "Health", "Family",
                "Religion", "Political", "Lifestyle", "Unidentified"]

    final = []
    yearLookup = {}  # year -> index of that year's row in ``final``
    for index, year in enumerate(range(1800, 1912)):
        row = dict.fromkeys(subjects, 0)
        row["year"] = year
        final.append(row)
        yearLookup[year] = index

    for entry in subjects_sorted:
        mySub = entry["subject"]
        for year, value in entry["yearDate"].items():
            if year == '':
                continue
            try:
                index = yearLookup[int(year)]
            except (KeyError, TypeError, ValueError):
                # Unparseable or out-of-range year: report it and move on.
                print(year)
                continue
            final[index][mySub] = value

    headers = ["year"] + subjects
    filename = "subject_year.csv"
    gd.write_data_dicts(filename, headers, final)
Exemplo n.º 2
0
def makeTrainingData(wordList):
    """Write word-count rows to ``trainingData.csv`` and ``fullData.csv``.

    Every entry returned by ``gd.get_data_list_of_dicts()`` becomes one row.
    The row holds a ``"class"`` label plus a count for each transcript word
    that appears in ``wordList``. The label is the entry's "Age of Author"
    value when that value is 4 to 8 characters long, otherwise "unkown".

    All rows are written to ``fullData.csv``. ``trainingData.csv`` receives a
    random subset that always contains the first row seen for each class.

    Args:
        wordList: list of words to count. It is not modified; the header
            list passed to the writer is a copy with "class" appended.
    """
    list_of_dicts = gd.get_data_list_of_dicts()
    probability = 3
    full = []
    partial = []
    wanted = set(wordList)  # built once for O(1) membership tests
    seen_classes = set()    # labels that already have a row in ``partial``

    for entry in list_of_dicts:
        age = entry["Age of Author"]
        decade = age if 3 < len(age) < 9 else "unkown"
        tempDict = {"class": decade}

        t = entry["Transcript"]
        t = t.replace("<br>", " ")
        t = t.replace("COMMA", " ")
        t = re.sub(r'\W', ' ', t)

        for word in t.split(" "):
            # Membership is tested on the original casing; only the stored
            # key is lower-cased.
            if word == "" or word not in wanted:
                continue
            word = word.lower()
            if word == "class":
                # The "class" column is reserved for the label; counting a
                # transcript word under that key would overwrite it.
                continue
            tempDict[word] = tempDict.get(word, 0) + 1

        full.append(tempDict)

        # Drawn for every entry so the sampling does not depend on the class.
        temp = randint(0, 10)
        if temp > probability or decade not in seen_classes:
            seen_classes.add(decade)
            partial.append(tempDict)

    filename1 = "trainingData.csv"
    filename2 = "fullData.csv"
    headers = wordList + ["class"]
    gd.write_data_dicts(filename1, headers, partial)
    gd.write_data_dicts(filename2, headers, full)
Exemplo n.º 3
0
def makeTrainingData(wordList):
    """Write word-count rows to ``trainingData.csv`` and ``fullData.csv``.

    Every entry returned by ``gd.get_data_list_of_dicts()`` becomes one row.
    The row holds a ``"class"`` label plus a count for each transcript word
    that appears in ``wordList``. The label is the entry's "Age of Author"
    value when that value is 4 to 8 characters long, otherwise "unkown".

    All rows are written to ``fullData.csv``. ``trainingData.csv`` receives a
    random subset that always contains the first row seen for each class.

    Args:
        wordList: list of words to count. It is not modified; the header
            list passed to the writer is a copy with "class" appended.
    """
    list_of_dicts = gd.get_data_list_of_dicts()
    probability = 3
    full = []
    partial = []
    wanted = set(wordList)  # built once for O(1) membership tests
    seen_classes = set()    # labels that already have a row in ``partial``

    for entry in list_of_dicts:
        age = entry["Age of Author"]
        decade = age if 3 < len(age) < 9 else "unkown"
        tempDict = {"class": decade}

        t = entry["Transcript"]
        t = t.replace("<br>", " ")
        t = t.replace("COMMA", " ")
        t = re.sub(r'\W', ' ', t)

        for word in t.split(" "):
            # Membership is tested on the original casing; only the stored
            # key is lower-cased.
            if word == "" or word not in wanted:
                continue
            word = word.lower()
            if word == "class":
                # The "class" column is reserved for the label; counting a
                # transcript word under that key would overwrite it.
                continue
            tempDict[word] = tempDict.get(word, 0) + 1

        full.append(tempDict)

        # Drawn for every entry so the sampling does not depend on the class.
        temp = randint(0, 10)
        if temp > probability or decade not in seen_classes:
            seen_classes.add(decade)
            partial.append(tempDict)

    filename1 = "trainingData.csv"
    filename2 = "fullData.csv"
    headers = wordList + ["class"]
    gd.write_data_dicts(filename1, headers, partial)
    gd.write_data_dicts(filename2, headers, full)
Exemplo n.º 4
0
import get_data as gd

# Load the column names and the rows of the source data set.
headers_clean = gd.get_headers()
list_of_dicts = gd.get_data_list_of_dicts()

# Keep a header only when none of its comma-separated parts is " Error"
# or the empty string.
new_headers = []
for h in headers_clean:
    parts = h.split(",")
    if " Error" in parts or "" in parts:
        continue
    new_headers.append(",".join(parts))

# Project every row onto the surviving headers.
final = [
    {header: entry[header] for header in new_headers}
    for entry in list_of_dicts
]

filename = "noError.csv"
headers = new_headers
gd.write_data_dicts(filename, headers, final)
Exemplo n.º 5
0
# Expand each letter into one row per recipient. Letters with an empty
# creator or recipient, or with the creator "Unknown", are left out.
for entry in list_of_dicts:
    if entry["Creator"] in ("", "Unknown") or entry["Recipient"] == "":
        continue

    for i in entry["Recipient"].split(";"):
        if i == "":
            continue

        # "COMMA" separates the parts of a name; the first two parts are
        # swapped (presumably "Last COMMA First" -> "First Last" -- confirm
        # against the data).
        parts = i.split("COMMA")
        if len(parts) == 2:
            person = parts[1] + " " + parts[0]
        elif len(parts) == 3:
            person = parts[1] + " " + parts[0] + ": " + parts[2]
        else:
            # Any other shape is kept as text so the column always holds a
            # string rather than a list.
            person = ", ".join(parts)

        my_dict = entry.copy()
        my_dict["Recipient"] = person
        temp.append(my_dict)

headers_wanted = ["Title","Date","Creator","Identifier","Recipient", "Gender of Author","Age of Author","Identified People","Unidentified People","Subject","Geographic Subjects","Place Of Origin","Destination","Notes","Language","Transcript","Reference URL","CONTENTdm number","CONTENTdm file name"]

# Drop every column that is not listed in headers_wanted.
final = []
for entry in temp:
    final.append({key: value for key, value in entry.items()
                  if key in headers_wanted})

headers = headers_wanted
file_name = "Recipient_and_Creator_cleaned2.csv"
gd.write_data_dicts(file_name, headers, final)
Exemplo n.º 6
0
print "lifestyle:\t" + str(len(lifestyle_bin))+"\t\t"+str(min_age[7])+"\t\t"+str(max_age[7])
print "---------------------------------------------"
print "Total:\t\t" + str(total)
print "---------------------------------------------"
print "unidentified:	 " + str(len(unidentified))
print "max_age: " + str(max_age)
print "min_age: " + str(min_age)
print
print "Distribution:"
print "\t\t0-10\t10-20\t20-30\t30-40\t40-50\t50-60\t60-70\t70-80\t80-90\t90-100\tunkown"
age_strings = []
for entry in age_count:
    temp = ""
    for item in entry:
	temp += str(item) + "\t"
    age_strings.append(temp)
print "travel:\t\t"+age_strings[0] 
print "education:\t"+age_strings[1]
print "love:\t\t"+age_strings[2]
print "health:\t\t" +age_strings[3]
print "family:\t\t" +age_strings[4]
print "religion:\t" +age_strings[5]
print "political:\t"+age_strings[6]
print "lifestyle:\t"+age_strings[7]
"""
"""
filename = "organized.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, has_subject)
"""
Exemplo n.º 7
0
            is_int = 0
            try:
                is_int = int(raw_list[item][0])
            except:
                is_int = -1

            if is_int >= 0:
                final_dicts[counter // 3]["date"] = raw_list[item]
                counter += 1
                item += 1
            else:
                final_dicts[counter // 3]["date"] = "unknown"
                counter += 1

    return final_dicts


# Collect the rows of pages 1 through 16 into one list and write them out.
list_of_dicts = []

for x in range(1, 17):
    print(x)
    temp = get_page_data(x)
    if temp == list_of_dicts:
        # A page equal to everything collected so far is reported and
        # not appended.
        print("what")
    else:
        # Extend in place instead of rebuilding the whole list per page.
        list_of_dicts.extend(temp)

filename = "letters_list.csv"
headers = ["date", "recipient", "creator"]
gd.write_data_dicts(filename, headers, list_of_dicts)
Exemplo n.º 8
0
print "Total:\t\t" + str(total)
print "---------------------------------------------"
print "unidentified:	 " + str(len(unidentified))
print "max_age: " + str(max_age)
print "min_age: " + str(min_age)
print
print "Distribution:"
print "\t\t0-10\t10-20\t20-30\t30-40\t40-50\t50-60\t60-70\t70-80\t80-90\t90-100\tunkown"
age_strings = []
for entry in age_count:
    temp = ""
    for item in entry:
	temp += str(item) + "\t"
    age_strings.append(temp)
print "travel:\t\t"+age_strings[0] 
print "education:\t"+age_strings[1]
print "love:\t\t"+age_strings[2]
print "health:\t\t" +age_strings[3]
print "family:\t\t" +age_strings[4]
print "religion:\t" +age_strings[5]
print "political:\t"+age_strings[6]
print "lifestyle:\t"+age_strings[7]
"""

"""
filename = "organized.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, has_subject)
"""

Exemplo n.º 9
0
# Map each column code to its readable name, taken from headers of the form
# "<code> - <name>". Names containing "Error" are not mapped.
for h2 in headers_income:
    h2 = h2.split(" - ")
    if len(h2) < 2:
        # No " - " separator: report the header and move on.
        print(h2)
        continue
    code = h2[0]
    name = h2[1]
    if "Error" not in name:
        codes[code] = name

# Rebuild every row under the readable column names.
for entry in full:
    temp = {}
    for h1 in full_headers:
        if h1 in codes:
            column = codes[h1]
        elif h1.endswith("e"):
            # Unmapped headers ending in "e" are dropped.
            # NOTE(review): the reason for the "e" suffix rule is not
            # visible here -- confirm against the data.
            continue
        else:
            column = h1

        temp[column] = entry[h1]
        if column not in final_headers:
            final_headers.append(column)

    full_clean.append(temp)

filename = "partialNameClean2.csv"
headers = final_headers

gd.write_data_dicts(filename, headers, full_clean)
Exemplo n.º 10
0
    if len(Poo) == 1:
        Poo = Poo[0]
    else:
        Poo = Poo[0] + ", " + Poo[1]

    Dest = item["Destination"].replace(" ", "")
    Dest = Dest.replace("(", "COMMA")
    Dest = Dest.replace(")", "COMMA")
    Dest = Dest.split("COMMA")
    Dest2 = []
    for word in Dest:
        if not word == "":
            Dest2.append(word)
    Dest = Dest2[:]
    if len(Dest) == 1:
        Dest = Dest[0]
    else:
        Dest = Dest[0] + ", " + Dest[1]

    if Poo in places and Dest in places:
        my_json = json.dumps(item)
        has_full.append({
            "Poo": places.index(Poo),
            "Dest": places.index(Dest),
            "Letter": my_json
        })

# Write the rows collected in has_full, using its three keys as columns.
headers = ["Poo", "Dest", "Letter"]
filename = "letterTravels.csv"
gd.write_data_dicts(filename, headers, has_full)
# Map each column code to its readable name, taken from headers of the form
# "<code> - <name>". Names containing "Error" are not mapped.
for h2 in headers_income:
    h2 = h2.split(" - ")
    if len(h2) < 2:
        # No " - " separator: report the header and move on.
        print(h2)
        continue
    code = h2[0]
    name = h2[1]
    if "Error" not in name:
        codes[code] = name

# Rebuild every row under the readable column names.
for entry in full:
    temp = {}
    for h1 in full_headers:
        if h1 in codes:
            column = codes[h1]
        elif h1.endswith("e"):
            # Unmapped headers ending in "e" are dropped.
            # NOTE(review): the reason for the "e" suffix rule is not
            # visible here -- confirm against the data.
            continue
        else:
            column = h1

        temp[column] = entry[h1]
        if column not in final_headers:
            final_headers.append(column)

    full_clean.append(temp)

filename = "partialNameClean2.csv"
headers = final_headers

gd.write_data_dicts(filename, headers, full_clean)
Exemplo n.º 12
0
		is_int = 0
		try:
		    is_int = int(raw_list[item][0])
		except:
		    is_int = -1
		
		if is_int >= 0:
		    final_dicts[counter//3]["date"] = raw_list[item]
		    counter += 1
		    item += 1
		else:
		    final_dicts[counter//3]["date"] = "unknown"
		    counter +=1
		

	return final_dicts

# Collect the rows of pages 1 through 16 into one list and write them out.
list_of_dicts = []

for x in range(1, 17):
    print(x)
    temp = get_page_data(x)
    if temp == list_of_dicts:
        # A page equal to everything collected so far is reported and
        # not appended.
        print("what")
    else:
        # Extend in place instead of rebuilding the whole list per page.
        list_of_dicts.extend(temp)

filename = "letters_list.csv"
headers = ["date", "recipient", "creator"]
gd.write_data_dicts(filename, headers, list_of_dicts)
Exemplo n.º 13
0
    for word in Poo:
	if not word == "":
	    Poo2.append(word)
    Poo = Poo2[:]
    if len(Poo) == 1:
	Poo = Poo[0]
    else:
	Poo = Poo[0] +", "+Poo[1]
	
    Dest = item["Destination"].replace(" ","")
    Dest = Dest.replace("(","COMMA")
    Dest = Dest.replace(")","COMMA")
    Dest = Dest.split("COMMA")
    Dest2 = []
    for word in Dest:
	if not word == "":
	    Dest2.append(word)
    Dest = Dest2[:]
    if len(Dest) == 1:
	Dest = Dest[0]
    else:
	Dest = Dest[0] +", "+Dest[1]

    if Poo in places and Dest in places:
	my_json = json.dumps(item)
	has_full.append({"Poo":places.index(Poo),"Dest":places.index(Dest), "Letter":my_json})

# Write the rows collected in has_full, using its three keys as columns.
headers = ["Poo", "Dest", "Letter"]
filename = "letterTravels.csv"
gd.write_data_dicts(filename, headers, has_full)
Exemplo n.º 14
0
"2ndBeach, Newport",
"Mossgiel, R.I.",
"Mossgiel"
]
print(len(not_right))

# Remove the " ?" marker from every rejected name before comparing it with
# entry["Name"].
not_right_clean = [name.replace(" ?", "") for name in not_right]
rejected = set(not_right_clean)  # built once for O(1) membership tests

# Split the entries into those to keep and the names that were rejected.
right = []
left_out = []
for entry in list_of_dicts:
    if entry["Name"] in rejected:
        left_out.append(entry["Name"])
    else:
        right.append(entry)
counter = len(left_out)

print(counter)

# Report rejected names that matched no entry at all.
matched = set(left_out)
for entry in not_right_clean:
    if entry not in matched:
        print(entry)

filename = "partialCurrectLocations.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, right)