def case_level():
    # Aggregate paragraph-level sentiment and similarity scores into one
    # case-level score per case file.
    sentiment_dir = '/home/' + username + '/VADER_DATA_STORE/'
    similarity_dir = '/home/' + username + '/SIMILARITY_DATA_STORE/'
    if demo_local:
        sentiment_dir = '../VADER_DATA_STORE'
        similarity_dir = '../SIMILARITY_DATA_STORE'
    outDir = '../Aggregate'
    list_similarity_dir = util.getDirectoryList(similarity_dir)
    outDirectory = outDir + '/CaseLevel'
    for directory in list_similarity_dir:
        if not directory.endswith('zip'):
            util.createDirectory(outDirectory)
            util.createDirectory(outDirectory + "/" + directory)
            files = util.getFilesListFromDir(directory, False)
            for file in files:
                sentiment_list = util.getDataFromPickle(
                    file, sentiment_dir + "/" + directory + '/')
                similarity_list = util.getDataFromPickle(
                    file, similarity_dir + "/" + directory + '/')
                for similarity in similarity_list:
                    similarity[:] = [
                        util.normalize_similarity(score)
                        for score in similarity
                    ]
                if len(similarity_list) == len(sentiment_list):
                    ss = np.dot(sentiment_list, similarity_list)
                    if len(sentiment_list) == 0:
                        util.writeToPickle(0, outDirectory, directory, file)
                    else:
                        util.writeToPickle(ss / len(sentiment_list),
                                           outDirectory, directory, file)
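
# Note: util.normalize_similarity lives in the shared util module and is not
# shown here. The stand-in below is an illustrative sketch only, assuming the
# helper rescales a raw cosine similarity from [-1, 1] into [0, 1] before the
# sentiment weighting above; the project's actual implementation may differ.
def _normalize_similarity_sketch(score):
    """Hypothetical stand-in: clamp to [-1, 1], then rescale to [0, 1]."""
    clamped = min(max(score, -1.0), 1.0)
    return (clamped + 1.0) / 2.0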
def main():
    if not demo:
        root_Directory = 'data/clean_Mar_20'
        if demo_local:
            global outDir
            outDir = "SNLP_sentiment"
            root_Directory = '../../Data/clean_Mar_20'
        list_of_dirs = util.getDirectoryList(root_Directory)
        for directory in list_of_dirs:
            if not directory.endswith('zip'):
                year = int(directory)
                if year >= 1964:
                    print(directory)
                    util.createDirectory(outDir)
                    util.createDirectory(outDir + "/" + directory)
                    files = util.getFilesListFromDir(directory)
                    for file_name in files:
                        para_list = util.getParaListFromFile(
                            file_name, directory)
                        avgParaSentimentList, paraSentimentList = getParaSentimentList(
                            para_list)
                        util.writeToPickle(paraSentimentList, outDir,
                                           directory, file_name, avg=False)
                        util.writeToPickle(avgParaSentimentList, outDir,
                                           directory, file_name, avg=True)
    else:
        # Demo mode: score a small set of hard-coded paragraphs instead of
        # walking the corpus.
        para_list = [
            "DRUMMOND, C. J. The schooner American was at Oswego in the fall of 1872, and took in a cargo of coal for Chicago, leaving Oswego on the tenth of November. A general bill of lading was given, and a high price charged for the transportation of the coal from Oswego to Chicago, being $2.75 per ton. The schooner met with adverse winds and did not arrive at Port Huron until November 29th. The weather, according to the testimony of the witnesses, was very inclement that fall, and the captain concluded that the safest course was to strip the vessel and lay up at Port Huron. The schooner accordingly remained there with her cargo during the winter, and the coal was not delivered in Chicago or received by the consignees until May 8, 1873, when the spring freight was paid by the consignees on the coal, being much less than that charged in the bill of lading. After the coal had been thus delivered by the schooner to the consignees, a libel was filed claiming the amount of freight stated in the bill of lading, the consignees having refused to pay any more than the spring price of freight. The case went to proof before the district court, where the libel was dismissed; but a cross-libel having been filed claiming that the captain of the American was negligent in wintering at Port Huron, and that the vessel should have come on in the fall of 1872, the district court gave a decree on the cross-libel for damages against the libelants in consequence of the supposed negligence of the captain. From these decrees the libelants have appealed to this court, and the question is whether the decrees of the district court are right.",
            "Several cities, New York City in particular for this paper, have a 311 24-hour hot line and online service, which allows anyone, residents and tourists, to report a non-emergency problem. Reported 311 problems are passed along to government services, who address and solve the problem. The records of 311 calls are publicly open and updated daily.",
            "Analysis of 311 calls can clearly be of great use for a wide variety of purposes, ranging from a rich understanding of the status of a city to the effectiveness of the government services in addressing such calls. Ideally, the analysis can also support a prediction of future 311 calls, which would enable the assignment of service resources by the city government.",
            "We have been extensively analyzing 311 calls in NYC. In this paper, we profile the data set and highlight a few interesting facts. We provide statistics along complaint types, geolocation, and temporal patterns and show the diversity of the big 311 data along those dimensions. We then discuss the prediction problem of number of calls, where we experiment with different sets of semantic features. We show that the prediction error for different complaint types can significantly vary if some features are not considered."
        ]
        avgParaSentimentList, paraSentimentList = getParaSentimentList(
            para_list)
        print(avgParaSentimentList)
        print(paraSentimentList)
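
# Assumption: getParaSentimentList is defined elsewhere in the project. Based on
# the VADER_DATA_STORE naming above, it is taken to score each paragraph with
# VADER's compound polarity and also return the per-document average. The sketch
# below is illustrative only and may differ from the real helper.
def _getParaSentimentList_sketch(para_list):
    # requires the NLTK vader_lexicon resource to be downloaded
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    # one compound score in [-1, 1] per paragraph
    scores = [analyzer.polarity_scores(para)['compound'] for para in para_list]
    avg = sum(scores) / len(scores) if scores else 0.0
    return avg, scores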
def get_relative_path_of_cases():
    root_dir = 'data/clean_Mar_20'
    if demo_local:
        outDir = '../Aggregate/CaseLevel/'
    years = utl.getDirectoryList(outDir)
    # if demo_local:
    #     years = ['1964']
    paths = {}
    same_dict = {}
    case_set = None  # only built when duplicate checking is enabled
    for year in years:
        if not year.endswith('zip') and year >= '1964':
            cases_files = utl.getFilesListFromDir(year, False)
            for case in cases_files:
                # key a case by its file name (without the "-maj.p" suffix)
                # plus its year
                key = case.replace('-maj.p', '')
                key += str(year)
                paths[key] = outDir + year + '/' + case
                if check_same_val_for_multiple_key:
                    if key not in same_dict:
                        same_dict[key] = []
                    same_dict[key].append(outDir + year + '/' + case)
    if check_same_val_for_multiple_key:
        case_set = create_repeat_set(same_dict)
        # find_diff_files(same_dict)
    return paths, case_set
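
# Hypothetical usage of get_relative_path_of_cases (not part of the original
# module): each key is the case file name without its "-maj.p" suffix followed
# by the year, mapping to the pickle path under ../Aggregate/CaseLevel/.
def _print_case_paths_sketch(limit=3):
    paths, repeated_cases = get_relative_path_of_cases()
    for key, path in list(paths.items())[:limit]:
        print(key, '->', path)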
def train_phraser(max_phrase_length=3,
                  stemmer=None,
                  vocab=None,
                  min_doc_freq=None,
                  min_gmean=None):
    # take documents and get POS-gram dictionary
    numdocs = 0
    docfreqs = Counter()
    termfreqs = Counter()
    root_Directory = 'data/clean_Mar_20'
    list_of_dirs = utils.getDirectoryList(root_Directory)
    for directory in list_of_dirs:
        if not directory.endswith('zip'):
            print(directory)
            utils.createDirectory("similarities")
            utils.createDirectory("similarities/" + directory)
            files = utils.getFilesListFromDir(directory)
            for file_name in files:
                para_list = utils.getParaListFromFile(file_name, directory)
                for para in para_list:
                    numdocs += 1
                    docgrams = set()
                    # split into sentences
                    sentences = sent_tokenize(para)
                    for sentence in sentences:
                        # split into words and get POS tags
                        words = sentence.split()
                        tagwords = tagsentence(words, stemmer, vocab)
                        for n in range(1, max_phrase_length + 1):
                            rawgrams = ngrams(tagwords, n)
                            for rawgram in rawgrams:
                                # skip grams that have words not in vocab
                                if None in rawgram:
                                    continue
                                gramtags = ''.join([x[1][0] for x in rawgram])
                                if gramtags in tagpatterns:
                                    # if tag sequence is allowed, add to counter
                                    gram = '_'.join([x[0] for x in rawgram])
                                    termfreqs[gram] += 1
                                    docgrams.add(gram)
                    docfreqs.update(docgrams)
    # filter vocabulary based on document frequency and make gram ids
    gram2id = {}
    id2gram = {}
    if min_doc_freq is None:
        min_doc_freq = round(numdocs / 200) + 1
    i = 0
    for (phrase, v) in docfreqs.most_common():
        if v < min_doc_freq:
            break
        if min_gmean is not None:
            # check geometric mean association for multi-word phrases
            n = phrase.count('_') + 1  # phrase length, not the doc frequency
            if n >= 2:
                gscore = gmean(phrase, termfreqs)
                if gscore[n] < min_gmean[n]:
                    continue
        gram2id[phrase] = i
        id2gram[i] = phrase
        i += 1
    return gram2id, id2gram
            gram_word = '_'.join(gram)
            if gram_word in gram2id:
                # replace the matched phrase with its id and skip the words
                # it covers
                new_s.append(gram2id[gram_word])
                skip = n - 1
                break
    return new_s


'''
documents = ['This is a test document sentence. This is the second sentence.',
             'This is a second test document.',
             'Beyond a reasonable doubt.']
'''

root_Directory = 'data/clean_Mar_20'
list_of_dirs = utils.getDirectoryList(root_Directory)

# Training the phraser
phrase2id, id2phrase = train_phraser()

# Phrase vector of the thermometers
thermometer_vector = [
    phrase_similarity.PhraseVector(thermometer)
    for thermometer in thermometers
]

# Getting data from the phraser
for directory in list_of_dirs:
    if not directory.endswith('zip'):
        print(directory)
        utils.createDirectory("similarities")
        utils.createDirectory("similarities/" + directory)
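
# Assumption: tagsentence (used in the n-gram loop of train_phraser above) is
# defined elsewhere in the project. Based on how its output is consumed there,
# it is taken to POS-tag a tokenized sentence with nltk, optionally stem each
# token, and replace out-of-vocabulary tokens with None so any gram containing
# them can be skipped. The sketch below is illustrative only.
def _tagsentence_sketch(words, stemmer=None, vocab=None):
    # requires the NLTK averaged_perceptron_tagger resource
    from nltk import pos_tag
    tagged = []
    for word, tag in pos_tag(words):
        token = stemmer.stem(word) if stemmer is not None else word.lower()
        if vocab is not None and token not in vocab:
            tagged.append(None)  # mark OOV tokens so callers can drop the gram
        else:
            tagged.append((token, tag))
    return tagged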