def readIn(filename, fileVersion, verbose=True):
    """Load a dataset that was already converted to "MY FORMAT".

    filename    -- path handed straight to readMyFormat.
    fileVersion -- accepted so existing callers keep working, but not used:
                   the call that forwarded it is disabled.
    verbose     -- when true, announce which file is being read; the value
                   is also forwarded to readMyFormat.

    Returns whatever readMyFormat returns. Per the original note the data
    is already sorted, so no sorting happens here.
    """
    if verbose:
        print ("Reading information of file already transformed into MY FORMAT! Filename: ", filename)

    # NOTE(review): `verbose` is passed as readMyFormat's second positional
    # argument. A previous version of this call was
    # readMyFormat(filename, fileVersion, verbose) -- confirm that the
    # current readMyFormat signature really takes verbosity in 2nd position.
    return readMyFormat(filename, verbose)
def createFV(filename, label, minNumberOfQueries, maxNumberOfQueries):
    """Build the per-user dictionary for one dataset file.

    The file is read with readMyFormat (using the module-level
    ``formatVersion``), run through preProcessData (using the module-level
    ``removeStopWords``), restricted by keepUsersInsideLimiteOfQueires to the
    given query-count limits, and finally turned into a dictionary by
    createDictOfUsers.

    filename           -- dataset file to read.
    label              -- forwarded unchanged to createDictOfUsers.
    minNumberOfQueries -- lower limit forwarded to the user filter.
    maxNumberOfQueries -- upper limit forwarded to the user filter.

    Returns the dictionary produced by createDictOfUsers; its size is
    printed before returning.
    """
    # A single parenthesised argument parses both as a Python 2 print
    # statement and as a Python 3 print() call. The format string reproduces
    # the separating spaces the old comma-separated print statement emitted.
    print("min =  %s  max =  %s" % (minNumberOfQueries, maxNumberOfQueries))

    data = readMyFormat(filename, formatVersion)
    # Original note: "Sort the data by user and date" -- presumably done
    # inside preProcessData; verify against that helper.
    data = preProcessData(data, removeStopWords)
    data = keepUsersInsideLimiteOfQueires(data, minNumberOfQueries, maxNumberOfQueries)

    userDict = createDictOfUsers(data, label)
    print(len(userDict))
    return userDict
# Root directory under which the dataset files used below are stored.
PATH_TO_DATASETS = "/data/palotti/logAnalysisDataSets/"
# Absolute paths of two AOL files (presumably the health / non-health query
# subsets, judging by the file names -- confirm).
AOLH_DATASET = "/data/palotti/logAnalysisDataSets/aolData/AOL-user-ct-collection/healthq.fixed.gz"
AOLNH_DATASET = "/data/palotti/logAnalysisDataSets/aolData/AOL-user-ct-collection/nhealthq.fixed.gz"

# Forwarded to calculateStatistics as its second argument.
usingScoop = False

# Per-dataset switches. Only useGM and useHON are consulted in the visible
# part of the driver; the others are presumably used further down.
useHON = True
useGM = True
useTRIP = True
useAOLH = True
useAOLNH = False
useLAY = False
useEXP = False

# NOTE(review): this guard compares against "__main__A", which is not the
# name a script gets when executed, so the block below does not run. It
# looks like a deliberately disabled single-file test run -- confirm before
# removing it.
if __name__ == "__main__A":
    datasets = []
    data = readMyFormat(PATH_TO_DATASETS + "trip/trip1.gz")
    datasets.append([data, "TEST"])
    calculateStatistics(datasets, usingScoop)

# Real entry point: collect [data, label] pairs for every enabled dataset.
if __name__ == "__main__":

    datasets = []

    # GoldMiner dataset, labelled "GM".
    if useGM:
        gm = readMyFormat(PATH_TO_DATASETS + "gm/gm.gz")
        datasets.append([gm, "GM"])

    # HON dataset (the body of this branch is outside the visible excerpt).
    if useHON:
def calculateDCohen(values, idx1, idx2):
    """Apply DCohen to two entries of *values*.

    values -- indexable collection whose items expose ``mean`` and ``std``.
    idx1   -- index of the first entry.
    idx2   -- index of the second entry.

    Returns DCohen(mean1, mean2, std1, std2) for the selected entries.
    """
    first, second = values[idx1], values[idx2]
    return DCohen(first.mean, second.mean, first.std, second.std)


# Driver: four input files are expected on the command line. Files 1 and 2
# are concatenated into the first group, files 3 and 4 into the second.
# An earlier, now disabled, variant looped over every file in sys.argv[1:],
# analysed each one separately and printed the sorted counting table as
# "%f,%d" lines.
values = []

data = readMyFormat(sys.argv[1], "v5")
data += readMyFormat(sys.argv[2], "v5")

data2 = readMyFormat(sys.argv[3], "v5")
data2 += readMyFormat(sys.argv[4], "v5")

# Keep the module-level name `file` bound to the last path that was read,
# exactly as the step-by-step version of this script left it.
file = sys.argv[4]

# countingCombo is rebound by the second call, so only the counts of the
# second group remain available afterwards.
npCombo, countingCombo = analyseData(data)
npCombo2, countingCombo = analyseData(data2)
values.extend([npCombo, npCombo2])
Implementation decisions:
    1) If the combo value for the CHV entry is -1, I decided to substitute it with the mean combo value of all entries.
    2) When no CHV entry is found in the query, I assume that the combo value for that entry is the mean combo value (around 0.28)
"""

# SCOOP is only imported when this switch is on.
usingScoop = True
if usingScoop:
    from scoop import futures

# Command-line arguments, in order: CHV file, dataset file, output file name.
chvfile = sys.argv[1]
v4datasetFile = sys.argv[2]
outfilename = sys.argv[3]

popularNames = []
# NOTE(review): the variable is named "v4" but the file is read with format
# version "v5" -- confirm which one is intended.
data = readMyFormat(v4datasetFile, "v5")
queries = []

from collections import defaultdict
# Counter whose missing keys start at 0.
popCounter = defaultdict(int)

class CHV(object):
    """Plain record holding the attributes of one CHV entry.

    The constructor only stores its arguments; note that ``combo`` is kept
    under the attribute name ``comboScore``.
    """
    def __init__(self, text, isCHV, isUMLS, misspelled, combo):
        self.text = text
        self.isCHV = isCHV
        self.isUMLS = isUMLS
        self.misspelled = misspelled
        self.comboScore = combo

# Tokenize the keywords of every record that was read (the rest of the loop
# body, if any, is outside the visible excerpt).
for member in data:
    query = tokenize(member.keywords)