def findUtimateTexts(filename_save, num):
    """Build `num` random author pools covering at least half of all posts.

    For each pool 1..num, authors are drawn at random without replacement
    until the picked authors account for at least half of the total post
    count; the sorted pool is saved as UltimateAuthors<N>.json and those
    authors' texts are extracted into filename_save + <N> + ".json".

    NOTE(review): "Utimate" looks like a typo for "Ultimate" -- the name is
    kept unchanged for caller compatibility.
    """
    worker = JSON.workOnJSON()
    # Renamed from `list` -- do not shadow the builtin.
    entries = worker.read_JSON_file(constants.location + "dataSave.json")
    totalPosts = 0
    authorDict = {}
    # Entries are (author, metadata) pairs; tally per-author post counts.
    for author, meta in entries:
        totalPosts += meta["textNumber"]
        authorDict[author] = meta["textNumber"]
    # We now know the total number of posts.
    for i in range(0, num):
        refNumber = i + 1
        # Fresh candidate list each round so every pool draws from all
        # authors; list() keeps this working on both Python 2 and 3.
        candidates = list(authorDict.keys())
        pickedAuthors = []
        pickedNumber = 0
        ran.seed()
        while 1:
            randInt = ran.randint(0, len(candidates) - 1)
            author = candidates.pop(randInt)
            pickedNumber += authorDict[author]
            pickedAuthors.append(author)
            # Stop once the pool covers at least half of all posts
            # (integer division under Python 2, as in the original).
            if pickedNumber >= (totalPosts / 2):
                break
        pickedAuthors.sort()
        authorFile = constants.authors + "UltimateAuthors" + str(refNumber) + ".json"
        worker.save_JSON_file(authorFile, pickedAuthors)
        extractRandomAuthorTexts(constants.location + "newData.json",
                                 authorFile,
                                 filename_save + str(refNumber) + ".json")
def shortBogusCorpora(num):
    """Create `num` single-author corpora, each padded with two bogus posts.

    For each index 1..num a distinct random author is drawn from Many.json;
    all of that author's posts plus two fixed "Bogus" posts form the corpus,
    and one randomly chosen real post from the author is saved as the
    matching test text (picked BEFORE the bogus posts are appended).
    """
    worker = JSON.workOnJSON()
    # Renamed from `list` -- do not shadow the builtin.
    posts = worker.read_JSON_file(constants.corpora + "Many.json")
    # Collect the distinct author ids.
    authorDict = {}
    for entry in posts:
        authorDict[entry["user_id"]] = 1
    # list() keeps this working whether keys() is a list (py2) or view (py3).
    authorList = list(authorDict.keys())
    ran.seed()
    for i in range(0, num):
        index = i + 1
        # Draw a random author and drop it so later rounds pick new ones.
        ranInt = ran.randint(0, len(authorList) - 1)
        author = authorList.pop(ranInt)
        # All posts written by the chosen author.
        finalList = [entry for entry in posts if entry["user_id"] == author]
        # Pick the text that is to be used for the tests.
        ranInt = ran.randint(0, len(finalList) - 1)
        randomEntry = finalList[ranInt]
        finalList.append({"user_id": "Bogus", "text": "hello", "post_id": "B1"})
        finalList.append({"user_id": "Bogus", "text": "Why, hello again!", "post_id": "B2"})
        worker.save_JSON_file(constants.corpora + "shortBogusCorpora" + str(index) + ".json", finalList)
        worker.save_JSON_file(constants.tests + "ShortBogusText" + str(index) + ".json", [randomEntry])
def makeTable(filename, foldername, givenNum):
    """Produce LaTeX tables for result files <filename>1..<filename>N.

    NOTE(review): a later `makeTable` definition with a different signature
    appears further down this file and shadows this one at import time --
    confirm which definition is intended to survive.
    """
    getcontext().prec = 2
    worker = JSON.workOnJSON()
    for index in range(1, givenNum + 1):
        # Each result file holds (id, per-author data, name, run number);
        # `testId` renamed from `id` to avoid shadowing the builtin.
        (testId, authorData, name, num) = worker.read_JSON_file(
            constants.resultDir + filename + str(index) + ".json")
        placeToSave = constants.folderLocation + "report/tabeller/" + foldername + "/" + filename
        produceTable(testId, authorData, placeToSave, num)
def singlePostCorpora():
    """Build a corpus containing exactly one random post per author.

    Groups every post in newData.json by user_id, picks one post at random
    from each author, and saves the result as singlePostCorpora.json.
    """
    worker = JSON.workOnJSON()
    # Renamed from `list` -- do not shadow the builtin.
    posts = worker.read_JSON_file(constants.location + "newData.json")
    authorDict = {}
    for entry in posts:
        author = entry["user_id"]
        value = {"text": entry["text"], "post_id": entry["post_id"]}
        # `in` replaces the Python-2-only dict.has_key().
        if author in authorDict:
            authorDict[author].append(value)
        else:
            authorDict[author] = [value]
    finalTexts = []
    for author in authorDict.keys():
        textList = authorDict[author]
        ran.seed()
        index = ran.randint(0, len(textList) - 1)
        entry = textList[index]
        finalTexts.append({"user_id": author,
                           "text": entry["text"],
                           "post_id": entry["post_id"]})
    worker.save_JSON_file(constants.corpora + "singlePostCorpora.json", finalTexts)
def runTest(compareDict, filename, name, num): # files to work on tempName = name.rpartition("/")[-1] # we load the comparisons if runTimeTest: startTime = time.time() print startTime (ngramLists, tg_dict) = makeNgram(filename) worker = JSON.workOnJSON() if runTimeTest: ngramTime = time.time() - startTime file = open(constants.results + "ngramTime.dat", "a") file.write(str(corpNumber)+ "\t" + str(ngramTime) + "\n") file.close() # the list of posts we want to compare to the corpus if runTimeTest: startTime = time.time() (id, authorData) = compareAuthors(ngramLists, compareDict, tg_dict) if runTimeTest: compareTime = time.time() - startTime file = open(constants.results + "workTime.dat", "a") file.write(str(corpNumber)+ "\t" + str(compareTime) + "\n") file.close() worker.save_JSON_file(constants.resultDir + tempName + str(num) + ".json", (id, authorData, name, num))
def produceXtable():
    """Render the timing cross-table, ngram timing tables and GNUplot data.

    Reads workTime.json (a dict indexed [corpusSize][testSize]) and writes a
    LaTeX cross table; then writes ngramTime.json as two half-width tables;
    finally emits GNUplot data files for both data sets.
    """
    getcontext().prec = 4
    worker = JSON.workOnJSON()
    timeData = worker.read_JSON_file(constants.resultDir + "workTime.json")
    stringResult = StringIO()
    numElements = len(timeData)
    # `rowEnd` renamed from `next` -- do not shadow the builtin.
    rowEnd = "\\\\ \n"
    line = "\\hline \n"
    stringResult.write("\\begin{center}\n")
    stringResult.write("\\begin{tabular}{|c|" + "c|" * numElements + "}\n")
    stringResult.write(line)
    # Row/column keys are the sizes 100, 200, ..., 1200 (as strings).
    # Loop vars renamed from `time`/`time1`/`time2`, which shadowed the
    # `time` module used elsewhere in this file.
    keys = [str(i * 100) for i in range(1, 13)]
    for colKey in keys:
        stringResult.write(" & " + str(colKey))
    stringResult.write(rowEnd)
    stringResult.write(line)
    for rowKey in keys:
        stringResult.write(str(rowKey))
        for colKey in keys:
            # Unary + rounds the Decimal to the current context precision.
            result = +Decimal(str(timeData[colKey][rowKey]))
            stringResult.write(" & " + str(result))
        stringResult.write(rowEnd)
        stringResult.write(line)
    stringResult.write("\\end{tabular}\n")
    stringResult.write("\\end{center}")
    with open(constants.tableSave + "crossSave.tex", "w") as crossFile:
        crossFile.write(stringResult.getvalue())
    # GNUplot data: the "1200" column only.
    plotData = {}
    for key in keys:
        plotData[key] = timeData[key]["1200"]
    makeGNUplot("ultimateGNUPlot", plotData, keys)
    # The ngram timing list is printed as two half-width tables.
    stringResult = StringIO()
    ngramData = worker.read_JSON_file(constants.resultDir + "ngramTime.json")
    splitPoint = 6
    stringResult = printPartList(ngramData, keys[:splitPoint], stringResult)
    stringResult.write("\n \n")
    stringResult = printPartList(ngramData, keys[splitPoint:], stringResult)
    with open(constants.tableSave + "ngramTime.tex", "w") as ngramFile:
        ngramFile.write(stringResult.getvalue())
    # Make dat table.
    makeGNUplot("ngramGNUPlot", ngramData, keys)
def fix(null, dir, files):
    """Directory-walk callback: normalize counts to averages in place.

    For every JSON file name in `files` under `dir`, divide each inner
    per-author count by constants.testTimes and save the file back.

    NOTE(review): the (null, dir, files) signature matches a Python 2
    os.path.walk callback; parameter names are kept even though `dir`
    shadows a builtin, since callers may rely on them.
    """
    worker = JSON.workOnJSON()
    # `fname`/`data` renamed from `file`/`dict` -- do not shadow builtins.
    for fname in files:
        path = dir + fname
        data = worker.read_JSON_file(path)
        for oAuthor in data.keys():
            entry = data[oAuthor]
            for iAuthor in entry.keys():
                entry[iAuthor] = float(entry[iAuthor]) / float(constants.testTimes)
        worker.save_JSON_file(path, data)
def doUltimateTable():
    """Build the "ultimate" report tables for UltimateTest results 1..3."""
    worker = JSON.workOnJSON()
    filename = "UltimateTest"
    folderName = "UltimateTest"
    corpora = "newData"
    givenNum = 3
    # The original also assigned placeToSave = folderName + filename here;
    # that value was always overwritten in the loop, so the dead assignment
    # is removed.
    for index in range(1, givenNum + 1):
        # `testId` renamed from `id` to avoid shadowing the builtin.
        (testId, authorData, name, num) = worker.read_JSON_file(
            constants.resultDir + filename + str(index) + ".json")
        placeToSave = (constants.folderLocation + "report/tabeller/" +
                       folderName + "/" + filename + str(index))
        (authorAttri, averageFMeasure, authorList, overall) = makeTableData(
            testId, authorData, placeToSave, num)
        produceUltimateTables(authorAttri, averageFMeasure, authorList,
                              testId, authorData, placeToSave, num, overall,
                              corpora)
def makeTable(filename, foldername, corpora, givenNum):
    """Produce report tables for result files <filename>1..<filename>N.

    NOTE(review): this redefines the earlier `makeTable` (different
    signature) higher up in the file, making that one unreachable -- confirm
    the duplication is intended.
    """
    getcontext().prec = 3
    worker = JSON.workOnJSON()
    for index in range(1, givenNum + 1):
        # `testId` renamed from `id` to avoid shadowing the builtin.
        (testId, authorData, name, num) = worker.read_JSON_file(
            constants.resultDir + filename + str(index) + ".json")
        placeToSave = constants.folderLocation + "report/tabeller/" + foldername + "/" + filename
        (authorAttri, averageFMeasure, authorList, overall) = makeTableData(
            testId, authorData, placeToSave, num)
        # ShortBogusText runs use one corpus per index; everything else
        # shares a single corpus name.
        if filename.count("ShortBogusText"):
            finalCorporaName = corpora + str(index)
        else:
            finalCorporaName = corpora
        produceTable(authorAttri, averageFMeasure, authorList, authorList,
                     testId, authorData, placeToSave, num, overall,
                     finalCorporaName)
def extractRandomAuthorTexts(filename, author_filename, filename_save):
    """Save every post in `filename` whose author appears in `author_filename`.

    The author file holds a JSON list of user ids; matching posts are
    written to constants.tests + filename_save.
    """
    worker = JSON.workOnJSON()
    authorList = worker.read_JSON_file(author_filename)
    posts = worker.read_JSON_file(filename)
    # Set membership instead of list.count(): O(1) per post instead of an
    # O(n) scan of the author list for every post.
    wantedAuthors = set(authorList)
    authorPosts = [entry for entry in posts if entry["user_id"] in wantedAuthors]
    worker.save_JSON_file(constants.tests + filename_save, authorPosts)
def makeRandomTestTables(filename, corpora):
    """Build report tables for a random-baseline test result file.

    Reads constants.randomTest/<filename>.json, computes the table data and
    renders the "ultimate" table variants; the `ultimate` flag is set when
    the filename contains "Ultimate".
    """
    folderName = "RandomTest"
    worker = JSON.workOnJSON()
    placeToSave = folderName + filename  # immediately overwritten below; kept as-is
    authorData = worker.read_JSON_file(constants.randomTest + filename + ".json")
    placeToSave = constants.folderLocation + "report/tabeller/" + folderName + "/" + filename
    # Note: the unpacked `authorList` is not used below; authorData.keys()
    # is passed instead -- presumably equivalent here, but confirm.
    (authorAttri, averageFMeasure, authorList, overall) = makeTableData({}, authorData, placeToSave, 1)
    ultimate = None
    if filename.count("Ultimate"):
        ultimate = 1
    # NOTE(review): `id` below is the Python BUILTIN -- no local `id` is ever
    # assigned in this function. It looks like a leftover from the sibling
    # functions that unpack (id, authorData, ...) from a result file; confirm
    # what produceUltimateTables actually expects here (perhaps {}).
    produceUltimateTables(authorAttri, averageFMeasure, authorData.keys(), id, authorData, placeToSave, -1, overall, corpora, ultimate)
def makeTimeTest():
    """Create timing-test corpora of increasing size from newData.json.

    Saves timeTest100, timeTest200, ..., timeTest1300 and a final
    timeTest1329, each holding a prefix of the post list. The duplicated
    save logic in the original is factored into one helper.

    NOTE(review): the slice posts[0 : i - 1] puts i-1 posts in the file
    named timeTest<i> -- looks like an off-by-one, but it is kept to
    preserve the existing output files; confirm intent.
    """
    worker = JSON.workOnJSON()
    fromDirectory = constants.corpora + "newData.json"
    posts = worker.read_JSON_file(fromDirectory)

    def _savePrefix(count):
        # One timing corpus: the first count-1 posts (see NOTE above).
        saveFile = constants.corpora + "timeTest" + str(count) + ".json"
        worker.save_JSON_file(saveFile, posts[0 : count - 1])

    i = 100
    while i < 1400:
        _savePrefix(i)
        i += 100
    # Final corpus covering (almost) the whole data set.
    _savePrefix(1329)
def extractRandomText(filename, filename_save, num):
    """Draw `num` random posts (without replacement) from a corpus and save
    each one to its own numbered test file under constants.tests."""
    worker = JSON.workOnJSON()
    sourcePath = constants.corpora + filename + ".json"
    savePrefix = constants.tests + filename_save
    remaining = worker.read_JSON_file(sourcePath)
    ran.seed()
    chosen = []
    while len(chosen) < num:
        # Remove the drawn post so it cannot be picked twice.
        drawIndex = ran.randint(0, len(remaining) - 1)
        chosen.append(remaining.pop(drawIndex))
    # One file per drawn post, numbered from 1.
    for position, entry in enumerate(chosen):
        worker.save_JSON_file(savePrefix + str(position + 1) + ".json", [entry])
def getAuthorWrittenData(num, corpora):
    """Return (authors with exactly `num` posts, per-author post counts).

    `corpora` is the basename of a JSON corpus under constants.corpora.
    Returns a tuple (listOfOne, writtenDict) where writtenDict maps
    user_id -> post count and listOfOne lists the ids whose count == num.
    """
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.corpora + corpora + ".json")
    writtenDict = {}
    for entry in posts:
        authorName = entry["user_id"]
        # dict.get() replaces the Python-2-only has_key() branch.
        writtenDict[authorName] = writtenDict.get(authorName, 0) + 1
    listOfOne = [authorName for authorName in writtenDict.keys()
                 if writtenDict[authorName] == num]
    return (listOfOne, writtenDict)
def randomTest(tests, corpora, save_file, times): print tests worker = JSON.workOnJSON() corpora = worker.read_JSON_file(constants.corpora + corpora + ".json") tests = worker.read_JSON_file(constants.tests + tests + ".json") tempDict = {} permTestList = [] permCorporaList = [] for entry in tests: permTestList.append(entry["user_id"]) for entry in corpora: permCorporaList.append(entry["user_id"]) resultDict= {} corpora = copy.deepcopy(permCorporaList) tests = copy.deepcopy(permTestList) numAuthor = len(tests) authorsDone = 1 for realAuthor in tests: for i in range(0, times): ran.seed() if i > 0 and i % 1000 == 0: print (float(i * authorsDone) / float(times * numAuthor)) * 100 , "percent done" ranInt = ran.randint(0, len(corpora) - 1) author = corpora[ranInt] if not resultDict.has_key(realAuthor) : resultDict[realAuthor] = {author: 1} elif not resultDict[realAuthor].has_key(author): resultDict[realAuthor][author] =1 else: resultDict[realAuthor][author] += 1 authorsDone += 1 for oAuthor in resultDict.keys(): entry = resultDict[oAuthor] for iAuthor in entry.keys(): entry[iAuthor] = float(entry[iAuthor]) / float(constants.testTimes) worker.save_JSON_file(constants.randomTest + save_file + ".json", resultDict)
def AuthorTest(num, filename_test, corpora_name, foldername, filename_save): print "Test:", filename_test folder = constants.tableSave + foldername + "/" if (corpora_name != "newData" or corpora_name != "testData"): corpora_name = constants.corpora + corpora_name else: corpora_name = constants.location + corpora_name worker = JSON.workOnJSON() if num == 0: authorText = worker.read_JSON_file(constants.tests + filename_test + ".json") value = runTest(authorText, corpora_name + ".json", folder + filename_save, 0) else: for i in range(0, num): index = i + 1 authorText = worker.read_JSON_file(constants.tests + filename_test + str(index) + ".json") corpora_final_name = corpora_name if filename_save.count("shortBogusText"): corpora_final_name = corpora_name + str(index) value = runTest(authorText, corpora_final_name + ".json", folder + filename_save, index)
def makeNgram(filename):
    """Build per-author n-gram models from a corpus file.

    Returns (newAuthorDict, tg_dict): newAuthorDict maps user_id to that
    author's list of texts; tg_dict maps user_id to a trained ngram.ngram
    model whose `corp` is the concatenation of those texts.

    The original built several locals that were never used or returned
    (authorWrittenDict, authorNameDirec, num, newAuthorDictTemp,
    authorTexts); they are removed.
    """
    worker = JSON.workOnJSON()
    # Renamed from `dict` -- do not shadow the builtin.
    posts = worker.read_JSON_file(filename)
    authorDict = {}
    tg_dict = {}
    # Group the posts by author.
    for entry in posts:
        author = entry["user_id"]
        value = {"user_id": author, "text": entry["text"]}
        # `in` replaces the Python-2-only has_key().
        if author in authorDict:
            authorDict[author].append(value)
        else:
            authorDict[author] = [value]
    newAuthorDict = {}
    for authorName in authorDict.keys():
        listOfEntries = [entry["text"] for entry in authorDict[authorName]]
        newAuthorDict[authorName] = listOfEntries
        # Train one n-gram model per author on the concatenated texts.
        tg = ngram.ngram(listOfEntries)
        tg.corp = ''.join(listOfEntries)
        tg.newRemember()
        tg_dict[authorName] = tg
    return (newAuthorDict, tg_dict)
def getAuthorWithOverXPosts(data_file, metadata_file, number):
    """Save all texts written by authors with at least `number` posts.

    metadata_file holds (author, {"textNumber": ...}) pairs; data_file holds
    the posts. Matching posts are saved to authorsWithOver<number>.json.

    NOTE(review): despite "Over" in the name, the test is >= number.
    """
    postFiles = "authorsWithOver"
    worker = JSON.workOnJSON()
    file_data = worker.read_JSON_file(data_file)
    file_metadata = worker.read_JSON_file(metadata_file)
    # Authors who have written at least the needed number of texts.
    prolificAuthors = set()
    for authorName, meta in file_metadata:
        if meta["textNumber"] >= number:
            prolificAuthors.add(authorName)
    # Collect all texts by those authors; set membership is O(1) versus the
    # original list.count() scan per post.
    texts = [entry for entry in file_data if entry["user_id"] in prolificAuthors]
    worker.save_JSON_file(postFiles + str(number) + ".json", texts)
def chooseAuthorsWithNumber(filename_save, number_of_posts, num):
    """Build a corpus from authors whose post count lies in a given range,
    then extract `num` random per-author test files.

    number_of_posts is an inclusive (min, max) pair. All texts by qualifying
    authors are saved as one corpus; then up to `num` distinct random
    authors are drawn, each author's full text list saved as a test file.

    BUGFIX(review): the original referenced two undefined names and raised
    NameError at runtime: `authorKeys` (taken to mean `authorList`, the list
    being drawn from) and `name` in the test filename (taken to mean
    `filename_save`). Confirm the intended test filename; the save is also
    moved inside the emptiness guard so an exhausted list no longer crashes
    on authorDict[None].
    """
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.location + "newData.json")
    authorDict = {}
    for entry in posts:
        author = entry["user_id"]
        value = {"text": entry["text"], "user_id": author, "post_id": entry["post_id"]}
        # `in` replaces the Python-2-only has_key().
        if author in authorDict:
            (number, texts) = authorDict[author]
            texts.append(value)
            authorDict[author] = (number + 1, texts)
        else:
            authorDict[author] = (1, [value])
    authorList = []
    textList = []
    for author in authorDict.keys():
        number = authorDict[author][0]
        if number_of_posts[0] <= number <= number_of_posts[1]:
            authorList.append(author)
            textList.extend(authorDict[author][1])
    worker.save_JSON_file(constants.corpora + filename_save + ".json", textList)
    for i in range(0, num):
        index = i + 1
        ran.seed()
        # Stop drawing once every qualifying author has been used.
        if len(authorList) != 0:
            ranIndex = ran.randint(0, len(authorList) - 1)
            author = authorList.pop(ranIndex)
            worker.save_JSON_file(
                constants.tests + "Author" + filename_save + "Post" + str(index) + ".json",
                authorDict[author][1])
def produceStatisticalData(filename, filename_save):
    """Compute per-author text statistics and render them as a LaTeX report.

    For each author: text count, min/max/total/average text length. Saves
    the stats dict as JSON (filename_save) and writes a paginated LaTeX
    table to constants.tableSave + "reportFile.tex", ending with overall
    totals and the percentage of single-post authors.

    NOTE(review): a second `produceStatisticalData` definition appears later
    in this file and shadows this one at import time -- confirm which is
    intended to survive.
    """
    worker = JSON.workOnJSON()
    result = worker.read_JSON_file(filename)
    authorData = {}
    # Accumulate per-author length statistics over all posts.
    for entry in result:
        authorName = entry["user_id"]
        if not authorData.has_key(authorName):
            textLength = len(entry["text"])
            authorData[authorName] = {"textNumber": 1, "min": textLength, "max": textLength, "totalLength": textLength}
        else:
            authorEntry = authorData[authorName]
            authorData[authorName]["textNumber"] += 1
            textLength = len(entry["text"])
            authorEntry["min"] = min(authorEntry["min"], textLength)
            authorEntry["max"] = max(authorEntry["max"], textLength)
            authorEntry["totalLength"] += textLength
    # NOTE(review): context precision 2 combined with round(..., 3) looks
    # contradictory -- the Decimal division is rounded to 2 significant
    # digits first; confirm the intended precision.
    getcontext().prec = 2
    for authorName in authorData.keys():
        authorEntry = authorData[authorName]
        authorEntry["average"] = round(Decimal(authorEntry["totalLength"]) / Decimal(authorEntry["textNumber"]), 3)
    # Overall totals across all authors.
    length = 0
    numberTexts = 0
    minNumber = 1000000
    maxNumber = -1
    for key in authorData.keys():
        entry = authorData[key]
        length += entry["totalLength"]
        numberTexts += entry["textNumber"]
        minNumber = min(minNumber, entry["totalLength"])
        maxNumber = max(maxNumber, entry["totalLength"])
    # sortKeys is a project helper; presumably returns the author names in
    # display order -- confirm.
    keys = sortKeys(authorData.keys())
    worker.save_JSON_file(filename_save, authorData)
    header = "\\begin{tabular}{cccccc}\n Name & Number of Texts & Min Length& Max Length & Average Length\\\\\n"
    stringWriter = StringIO()
    stringWriter.write(header)
    count = 0
    # Start a new LaTeX page every 35 rows.
    endCount = 35
    numberOnePost = 0
    for name in keys:
        entry = authorData[name]
        number = str(entry["textNumber"])
        if number == "1":
            numberOnePost += 1
        # Author names truncated to 15 characters for the table.
        stringWriter.write(str(name[0:15]) + " & " + number + " & " + str(entry["min"]) + " & " + str(entry["max"]) + " & " + str(entry["average"]) + "\\\\\n")
        if count == endCount:
            stringWriter.write("\\end{tabular}\n")
            stringWriter.write("\\newpage\n")
            stringWriter.write(header)
            count = 0
        count += 1
    # Summary block: totals row plus the single-post-author percentage.
    stringWriter.write("& & & & & \\\\ \n")
    stringWriter.write("Number of Authors & Number of Texts & Total Min & Total Max & Total Average \\\\ \n")
    stringWriter.write(str(len(authorData)) + " & " + str(numberTexts) + " & " + str(minNumber) + " & " + str(maxNumber) + " & " + str(round(Decimal(length) / Decimal(numberTexts), 3)) + "\\\\ \n")
    # Percentage string truncated (not rounded) to 5 characters.
    oneAuthor = str(float(numberOnePost) / float(len(authorData)) * 100)
    stringWriter.write("\\multicolumn{5}{c}{Percentage of authors who have only written 1 post: " + oneAuthor[:5] + " \\%}")
    stringWriter.write("\\end{tabular}\n")
    FILE_TO_SAVE = open(constants.tableSave + "reportFile.tex","w")
    FILE_TO_SAVE.write(stringWriter.getvalue())
    FILE_TO_SAVE.close()
def produceStatisticalData(filename, filename_save):
    """Compute per-author text statistics, print overall totals, and render
    a LaTeX report sorted by the stats dict.

    NOTE(review): this redefines the earlier `produceStatisticalData` in
    this file; being later, this definition wins at import time -- confirm
    the duplication is intended. It also writes the same reportFile.tex.
    """
    worker = JSON.workOnJSON()
    result = worker.read_JSON_file(filename)
    authorData = {}
    # Accumulate per-author length statistics over all posts.
    for entry in result:
        authorName = entry["user_id"]
        if not authorData.has_key(authorName):
            textLength = len(entry["text"])
            authorData[authorName] = {"textNumber": 1, "min": textLength, "max": textLength, "totalLength": textLength}
        else:
            authorEntry = authorData[authorName]
            authorData[authorName]["textNumber"] += 1
            textLength = len(entry["text"])
            authorEntry["min"] = min(authorEntry["min"], textLength)
            authorEntry["max"] = max(authorEntry["max"], textLength)
            authorEntry["totalLength"] += textLength
    # NOTE(review): context precision 2 combined with round(..., 3) looks
    # contradictory; confirm the intended precision.
    getcontext().prec = 2
    for authorName in authorData.keys():
        authorEntry = authorData[authorName]
        authorEntry["average"] = round(Decimal(authorEntry["totalLength"]) / Decimal(authorEntry["textNumber"]), 3)
    # Overall totals across all authors.
    length = 0
    numberTexts = 0
    minNumber = 1000000
    maxNumber = -1
    for key in authorData.keys():
        entry = authorData[key]
        length += entry["totalLength"]
        numberTexts += entry["textNumber"]
        minNumber = min(minNumber, entry["totalLength"])
        maxNumber = max(maxNumber, entry["totalLength"])
    print "Number of authors:", len(authorData)
    print "Length:", length
    print "Number of texts:", numberTexts
    print "Average:", str(round(Decimal(length) / Decimal(numberTexts), 3))
    # authorData becomes a list of (name, stats) pairs sorted by the stats
    # dict; Python 2 iteritems(). NOTE(review): sorting dicts with
    # itemgetter(1) compares whole stats dicts -- confirm this ordering is
    # what was wanted (perhaps itemgetter on "textNumber" was intended).
    authorData = sorted(authorData.iteritems(), key=itemgetter(1))
    worker.save_JSON_file(filename_save, authorData)
    FILE_TO_SAVE = open(constants.tableSave + "reportFile.tex","w")
    # FILE_TO_SAVE.write("\\documentclass[letter, 12pt, english]{article}\n")
    # FILE_TO_SAVE.write("\\begin{document}\n")
    FILE_TO_SAVE.write("\\begin{tabular}{cccccc}\n")
    FILE_TO_SAVE.write("Name & Number of Texts & Min & Max & Average\\\\\n")
    count = 0
    # Start a new LaTeX page every 35 rows.
    endCount = 35
    for entry in authorData:
        name = entry[0]
        entry = entry[1]
        number = str(entry["textNumber"])
        # Disabled styling of the count column (kept from the original):
        # if (number > 1 and number < 10):
        #     number = "\\emph{" + number + "}
        # elif (number >= 10 and number < 100):
        #     number = "\\texttt{" + number + "}"
        # elif number >= 100:
        #     number = "\\texttt{\\emph{" + number + "}}"
        FILE_TO_SAVE.write(str(name[0:15]) + " & " + number + " & " + str(entry["min"]) + " & " + str(entry["max"]) + " & " + str(entry["average"]) + "\\\\\n")
        if count == endCount:
            FILE_TO_SAVE.write("\\end{tabular}\n")
            FILE_TO_SAVE.write("\\newpage\n")
            FILE_TO_SAVE.write("\\begin{tabular}{cccccc}\n")
            FILE_TO_SAVE.write("Name & Number of Texts & Min & Max & Average\\\\\n")
            count = 0
        count += 1
    # Summary block with the overall totals.
    FILE_TO_SAVE.write("& & & & & \\\\ \n")
    FILE_TO_SAVE.write("Number of Authors & Number of Texts & Total Min & Total Max & Total Average \\\\ \n")
    FILE_TO_SAVE.write(str(len(authorData)) + " & " + str(numberTexts) + " & " + str(minNumber) + " & " + str(maxNumber) + " & " + str(round(Decimal(length) / Decimal(numberTexts), 3)) + "\\\\ \n")
    FILE_TO_SAVE.write("\\end{tabular}\n")
    # FILE_TO_SAVE.write("\\end{document}\n")
    FILE_TO_SAVE.close()