Example #1
def case_level():
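    # Combine each paragraph's sentiment score with its normalized similarity
    # scores (dot product averaged over paragraphs) and pickle one case-level
    # result per case file under outDir/CaseLevel.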
    sentiment_dir = '/home/' + username + '/VADER_DATA_STORE/'
    similarity_dir = '/home/' + username + '/SIMILARITY_DATA_STORE/'
    if (demo_local):
        sentiment_dir = '../VADER_DATA_STORE'
        similarity_dir = '../SIMILARITY_DATA_STORE'
        outDir = '../Aggregate'
    list_similarity_dir = util.getDirectoryList(similarity_dir)
    outDirectory = outDir + '/CaseLevel'
    for directory in list_similarity_dir:
        if not directory.endswith('zip'):
            util.createDirectory(outDirectory)
            util.createDirectory(outDirectory + "/" + directory)
            files = util.getFilesListFromDir(directory, False)
            for file in files:
                sentiment_list = util.getDataFromPickle(
                    file, sentiment_dir + "/" + directory + '/')
                similarity_list = util.getDataFromPickle(
                    file, similarity_dir + "/" + directory + '/')
                for similarity in similarity_list:
                    similarity[:] = [
                        util.normalize_similarity(score)
                        for score in similarity
                    ]
                if len(similarity_list) == len(sentiment_list):
                    ss = np.dot(sentiment_list, similarity_list)
                    if len(sentiment_list) == 0:
                        util.writeToPickle(0, outDirectory, directory, file)
                    else:
                        util.writeToPickle(ss / len(sentiment_list),
                                           outDirectory, directory, file)
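
A minimal sketch of the aggregation performed by case_level(), with made-up numbers (all names and values below are illustrative, not taken from the project):

import numpy as np

# One sentiment score per paragraph, and per paragraph a list of
# normalized similarity scores (one per reference phrase).
sentiments = [0.4, -0.1, 0.7]
similarities = [[0.2, 0.9], [0.5, 0.1], [0.8, 0.3]]

# Same aggregation as above: dot product, then average over paragraphs.
case_score = np.dot(sentiments, similarities) / len(sentiments)
print(case_score)  # one aggregate value per similarity dimension
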
Example #2
def main():
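    # For every year directory from 1964 onward, score each case file's
    # paragraphs and pickle both the per-paragraph sentiment list and its
    # average; in demo mode, score a few hard-coded sample paragraphs instead.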
    if (not demo):
        root_Directory = 'data/clean_Mar_20'
        if (demo_local):
            global outDir
            outDir = "SNLP_sentiment"
            root_Directory = '../../Data/clean_Mar_20'
        list_of_dirs = util.getDirectoryList(root_Directory)
        for directory in list_of_dirs:
            if not directory.endswith('zip'):
                year = int(directory)
                if year >= 1964:
                    print(directory)
                    util.createDirectory(outDir)
                    util.createDirectory(outDir + "/" + directory)

                    files = util.getFilesListFromDir(directory)
                    for file_name in files:
                        para_list = util.getParaListFromFile(
                            file_name, directory)
                        avgParaSentimentList, paraSentimentList = getParaSentimentList(
                            para_list)
                        util.writeToPickle(paraSentimentList,
                                           outDir,
                                           directory,
                                           file_name,
                                           avg=False)
                        util.writeToPickle(avgParaSentimentList,
                                           outDir,
                                           directory,
                                           file_name,
                                           avg=True)
    else:
        para_list = [
            "DRUMMOND, C. J. The schooner American was at Oswego in the fall of 1872, and took in a cargo of coal for Chicago, leaving Oswego on the tenth of November. A general bill of lading was given, and a high price charged for the transportation of the coal from Oswego to Chicago, being $2.75 per ton. The schooner met with adverse winds and did not arrive at Port Huron until November 29th. The weather, according to the testimony of the witnesses, was very inclement that fall, and the captain concluded that the safest course was to strip the vessel and lay up at Port Huron. The schooner accordingly remained there with her cargo during the winter, and the coal was not delivered in Chicago or received by the consignees until May 8, 1873, when the spring freight was paid by the consignees on the coal, being much less than that charged in the bill of lading. After the coal had been thus delivered by the schooner to the consignees, a libel was filed claiming the amount of freight stated in the bill of lading, the consignees having refused to pay any more than the spring price of freight. The case went to proof before the district court, where the libel was dismissed; but a cross-libel having been filed claiming that the captain of the American was negligent in wintering at Port Hur on, and that the vessel should have come on in the fall of 1872, the district court gave a decree on the cross-libel for damages against the libelants in consequence of the supposed negligence of the captain. From t hese decrees the libelants have appealed to this court, and the question is whether the decrees of the district court are right.",
            "Several cities, New York City in particular for this paper, have a 311 24-hour hot line and online service, which allows anyone, residents and tourists, to report a non-emergency problem. Reported 311 problems are passed along to government services, who address and solve the problem. The records of 311 calls are publicly open and updated daily.",
            "Analysis of 311 calls can clearly be of great use for a wide variety of purposes, ranging from a rich understanding of the status of a city to the effectiveness of the government services in addressing such calls. Ideally, the analysis can also support a prediction of future 311 calls, which would enable the assignment of service resources by the city government.",
            "We have been extensively analyzing 311 calls in NYC. In this paper, we profile the data set and highlight a few interesting facts. We provide statistics along complaint types, geolocation, and temporal patterns and show the diversity of the big 311 data along those dimensions. We then discuss the prediction problem of number of calls, where we experiment with different sets of semantic features. We show that the prediction error for different complaint types can significantly vary if some features are not considered."
        ]
        avgParaSentimentList, paraSentimentList = getParaSentimentList(
            para_list)
        print(avgParaSentimentList)
        print(paraSentimentList)
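
getParaSentimentList is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming NLTK's VADER SentimentIntensityAnalyzer (the VADER_DATA_STORE directory name suggests VADER is used); the body and return order below are assumptions, not the project's code:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

def getParaSentimentList(para_list):
    # Assumed behavior: score each paragraph with VADER and also return
    # the mean compound score over all paragraphs.
    analyzer = SentimentIntensityAnalyzer()
    scores = [analyzer.polarity_scores(p)['compound'] for p in para_list]
    avg = sum(scores) / len(scores) if scores else 0.0
    return avg, scores
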
def get_relative_path_of_cases():
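    # Build a dict mapping a case-name + year key to the path of that case's
    # aggregated case-level pickle, optionally tracking keys that resolve to
    # more than one file.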
    root_dir = 'data/clean_Mar_20'
    if demo_local:
        outDir = '../Aggregate/CaseLevel/'
    years = utl.getDirectoryList(outDir)
    #if demo_local:
    #   years = ['1964']
    paths = {}
    same_dict = {}
    case_set = None  # only populated when duplicate-key checking is enabled
    for year in years:
        if not year.endswith('zip') and year >= '1964':
            cases_files = utl.getFilesListFromDir(year, False)
            for case in cases_files:
                key = case.replace('-maj.p', '')
                key += str(year)
                paths[key] = outDir + year + '/' + case
                if check_same_val_for_multiple_key:
                    if key not in same_dict:
                        same_dict[key] = []
                    same_dict[key].append(outDir + year + '/' + case)
    if check_same_val_for_multiple_key:
        case_set = create_repeat_set(same_dict)
        #find_diff_files(same_dict)
    return paths, case_set
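
create_repeat_set is not defined in this excerpt; a plausible sketch of a helper that collects keys pointing to more than one file (purely an assumption):

def create_repeat_set(same_dict):
    # Assumed behavior: keep only the keys that map to multiple case files.
    return {key for key, file_paths in same_dict.items() if len(file_paths) > 1}
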
def train_phraser(max_phrase_length=3,
                  stemmer=None,
                  vocab=None,
                  min_doc_freq=None,
                  min_gmean=None):
    # take documents and get POS-gram dictionary

    numdocs = 0
    docfreqs = Counter()
    termfreqs = Counter()

    root_Directory = 'data/clean_Mar_20'
    list_of_dirs = utils.getDirectoryList(root_Directory)
    for directory in list_of_dirs:
        if not directory.endswith('zip'):
            print(directory)
            utils.createDirectory("similarities")
            utils.createDirectory("similarities/" + directory)

            files = utils.getFilesListFromDir(directory)
            for file_name in files:
                para_list = utils.getParaListFromFile(file_name, directory)
                for para in para_list:

                    numdocs += 1

                    docgrams = set()
                    # split into sentences
                    sentences = sent_tokenize(para)
                    for sentence in sentences:
                        # split into words and get POS tags
                        words = sentence.split()
                        tagwords = tagsentence(words, stemmer, vocab)
                        for n in range(1, max_phrase_length + 1):
                            rawgrams = ngrams(tagwords, n)
                            for rawgram in rawgrams:
                                # skip grams that have words not in vocab
                                if None in rawgram:
                                    continue
                                gramtags = ''.join([x[1][0] for x in rawgram])
                                if gramtags in tagpatterns:
                                    # if tag sequence is allowed, add to counter
                                    gram = '_'.join([x[0] for x in rawgram])
                                    termfreqs[gram] += 1
                                    docgrams.add(gram)
                    docfreqs.update(docgrams)

    # filter vocabulary based on document frequency and make gram ids
    gram2id = {}
    id2gram = {}

    if min_doc_freq is None:
        min_doc_freq = round(numdocs / 200) + 1

    i = 0
    for (phrase, v) in docfreqs.most_common():
        if v < min_doc_freq:
            break
        if min_gmean is not None:
            # check geometric mean association
            n = phrase.count('_') + 1
            if n >= 2:
                gscore = gmean(phrase, termfreqs)
                if gscore[n] < min_gmean[n]:
                    continue
        gram2id[phrase] = i
        id2gram[i] = phrase
        i += 1

    return gram2id, id2gram
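
A typical call, with an illustrative document-frequency threshold (stemmer and vocab are left at their defaults):

# Keep only POS-filtered n-grams that occur in at least 5 paragraphs.
gram2id, id2gram = train_phraser(max_phrase_length=3, min_doc_freq=5)
print(len(gram2id), 'phrases kept')
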
            gram_word = '_'.join(gram)
            if gram_word in gram2id:
                new_s.append(gram2id[gram_word])
                skip = n - 1
                break
    return new_s
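
The lines above are the tail of a helper that replaces the longest matching n-gram at each position of a token list with its phrase id; a self-contained sketch of such a loop (the function name and exact control flow are assumptions, not the project's code):

def phrase_sentence(tokens, gram2id, max_phrase_length=3):
    # Assumed behavior: prefer the longest n-gram found in gram2id, emit its
    # id, and skip the tokens it covered.
    new_s = []
    skip = 0
    for i in range(len(tokens)):
        if skip:
            skip -= 1
            continue
        for n in range(max_phrase_length, 0, -1):
            gram_word = '_'.join(tokens[i:i + n])
            if gram_word in gram2id:
                new_s.append(gram2id[gram_word])
                skip = n - 1
                break
    return new_s
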


'''
documents = ['This is a test document sentence. This is the second sentence.',
         'This is a second test document.',
         'Beyond a reasonable doubt.']
'''

root_Directory = 'data/clean_Mar_20'
list_of_dirs = utils.getDirectoryList(root_Directory)

# Training the phraser
phrase2id, id2phrase = train_phraser()

# Phrase vectors for the thermometers
thermometer_vector = [
    phrase_similarity.PhraseVector(thermometer) for thermometer in thermometers
]

# Getting data from the phraser
for directory in list_of_dirs:
    if not directory.endswith('zip'):
        print(directory)
        utils.createDirectory("similarities")
        utils.createDirectory("similarities/" + directory)