Example No. 1
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts as a sparse matrix
    #temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    #X = np.array(temp, dtype='float32')
    X = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).tocsr()
    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    ids = fh.read_json(os.path.join(input_dir, input_prefix + '.ids.json'))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    #row_selector = np.array(X.sum(axis=1) > 0, dtype=bool)
    row_sums = np.array(X.sum(axis=1)).reshape((n_items,))
    row_selector = np.array(row_sums > 0, dtype=bool)

    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]
    ids = [doc_id for i, doc_id in enumerate(ids) if row_selector[i]]

    return X, vocab, row_selector, ids
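A minimal, self-contained sketch of the empty-document filtering step above, using a made-up toy matrix in place of the counts loaded via fh.load_sparse:

import numpy as np
from scipy import sparse

X = sparse.csr_matrix(np.array([[1, 0, 2],
                                [0, 0, 0],
                                [0, 3, 0]], dtype='float32'))
row_sums = np.asarray(X.sum(axis=1)).reshape(-1)
row_selector = row_sums > 0    # boolean mask of non-empty documents
X = X[row_selector, :]         # drops the all-zero middle row
print(X.shape)                 # (2, 3)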
Example No. 2
    def load_from_files(self, debug=False, debug_index=None):
        vocab = vocabulary_with_counts.VocabWithCounts(self.get_prefix(), add_oov=True,
                                                       read_from_filename=self.get_vocab_filename())
        index = fh.read_json(self.get_index_filename())
        feature_counts = fh.unpickle_data(self.get_feature_filename())
        oov_counts = fh.read_json(self.get_oov_count_filename())

        # TESTING
        if debug:
            if debug_index is None:
                item_index = random.randint(0, len(index) - 1)  # randint is inclusive at both ends
            else:
                item_index = debug_index
            item = index[item_index]
            counts = feature_counts[item_index, :]

            print(item)
            print(counts.indices)
            print(counts.data)
            print(vocab.get_tokens(counts.indices))
            print(oov_counts[item_index])

        self.feature_counts = feature_counts
        self.index = index
        self.vocab = vocab
        self.oov_counts = oov_counts
Example No. 3
def load_data(input_dir, input_prefix, log_file, vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    lists_of_indices = fh.read_json(
        os.path.join(input_dir, input_prefix + '.indices.json'))
    index_arrays = [np.array(l, dtype='int32') for l in lists_of_indices]
    n_items, vocab_size = X.shape
    print(n_items, len(index_arrays))
    assert vocab_size == len(vocab)
    assert n_items == len(index_arrays)
    log(log_file,
        "Loaded %d documents with %d features" % (n_items, vocab_size))

    label_file = os.path.join(input_dir, input_prefix + '.labels.npz')
    if os.path.exists(label_file):
        print("Loading labels")
        temp = fh.load_sparse(label_file).todense()
        labels = np.array(temp, dtype='float32')
    else:
        print("Label file not found")
        labels = np.zeros([n_items, 1], dtype='float32')
    assert len(labels) == n_items

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, index_arrays, labels
Example No. 4
 def read_from_file(self, filename):
     json_obj = fh.read_json(filename)
     self.index2token = json_obj['index2token']
     self.counts = Counter(json_obj['counts'])
     self.doc_counts = Counter(json_obj['doc_counts'])
     self.token2index = dict(
         zip(self.index2token, range(len(self.index2token))))
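A tiny standalone sketch of the index2token-to-token2index inversion used in this method (the toy vocabulary is made up):

index2token = ['the', 'cat', 'sat']
token2index = dict(zip(index2token, range(len(index2token))))
print(token2index['cat'])   # 1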
Example No. 5
    def extract_features(self, source, write_to_file=True, vocab_source=None):
        print "Extracting ngram tokens"

        # read in a dict of {document_key: text}
        data = fh.read_json(source)
        all_items = list(data.keys())  # materialize for Python 3, where keys() is a view

        tokens = self.extract_tokens_from_text(data)

        if vocab_source is None:
            vocab = self.make_vocabulary(tokens, all_items)
            vocab.prune(self.min_df)
            self.vocab = vocab
        else:
            vocab = self.load_vocabulary(vocab_source)
            self.vocab = vocab

        feature_counts, index = self.extract_feature_counts(all_items, tokens, vocab)

        if write_to_file:
            vocab.write_to_file(self.get_vocab_filename())
            fh.write_to_json(index, self.get_index_filename(), sort_keys=False)
            fh.pickle_data(feature_counts, self.get_feature_filename())

        self.feature_counts = feature_counts
        self.index = index
        self.column_names = np.array(self.vocab.index2token)
        self.do_transformations()
Example No. 6
def main(call=None):

    # handle command line
    parser = argparse.ArgumentParser()
    parser.add_argument("model_path", help="path for model directory")
    parser.add_argument("-n",
                        dest="n_words",
                        type=int,
                        default=30,
                        help="number of words to show in each topic")
    options = parser.parse_args(call)
    model_path = options.model_path
    n_words = options.n_words

    ## load Beta
    beta = np.load(os.path.join(model_path, 'beta.npz'))['beta']

    ## load vocab
    vocab = fh.read_json(os.path.join(model_path, 'vocab.json'))

    # get and print topics
    topics = get_top_n_topic_words(beta, vocab, n_words)
    for topic in topics:
        topicstring = ' '.join(topic)
        print(topicstring)
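get_top_n_topic_words is project code; a rough sketch of the underlying idea, assuming beta has shape (n_topics, vocab_size), with made-up values:

import numpy as np

beta = np.array([[0.1, 0.7, 0.2],
                 [0.5, 0.2, 0.3]])
vocab = ['apple', 'banana', 'cherry']
n_words = 2
for row in beta:
    top = np.argsort(row)[::-1][:n_words]   # indices of the largest weights
    print(' '.join(vocab[i] for i in top))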
Example No. 7
    def load_from_files(self, vocab_source=None):
        self.vocab = self.load_vocabulary(vocab_source=vocab_source)

        index = fh.read_json(self.get_index_filename())
        feature_counts = fh.unpickle_data(self.get_feature_filename())

        self.feature_counts = feature_counts
        self.index = index
        self.column_names = np.array(self.vocab.index2token)
        self.do_transformations()
Example No. 8
def load_and_compute_npmi(topics_file,
                          ref_vocab_file,
                          ref_counts_file,
                          n_vals,
                          cols_to_skip=0,
                          output_file=None):
    print("Loading reference counts")
    ref_vocab = fh.read_json(ref_vocab_file)
    ref_counts = fh.load_sparse(ref_counts_file).tocsc()
    compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip,
                 output_file)
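The reference counts are converted to CSC above, presumably because the NPMI computation slices out one column per vocabulary term; a toy illustration of why that format helps:

import numpy as np
from scipy import sparse

counts = sparse.csc_matrix(np.array([[1, 0], [2, 3]]))
col = counts[:, 1]                # column slicing is efficient in CSC format
print(col.toarray().ravel())      # [0 3]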
Example No. 9
def main():
    usage = "%prog msa_db.csv data_dir output_file.jsonlist"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    msa_db = args[0]
    data_dir = args[1]
    output_filename = args[2]

    articles = []

    exclude = [
        'murderpedia.org', 'www.gunviolencearchive.org', 'www.fbi.gov',
        'en.wikipedia.org', 'www.history.com', 'web.archive.org'
    ]

    df = pd.read_csv(msa_db, header=0)
    index = df.index
    for i in index:
        row = df.loc[i]
        caseid = row['CaseID']
        title = row['Title']
        names = row['Shooter Name'].split()
        #subdirs = glob.glob(os.path.join(data_dir, '*_*'))
        subdir = os.path.join(data_dir, str(caseid) + '_' + '_'.join(names))
        if not os.path.exists(subdir):
            files = glob.glob(
                os.path.join(data_dir,
                             str(caseid) + '_*', '*.json'))
        else:
            files = glob.glob(os.path.join(subdir, '*.json'))
        print(subdir, len(files))
        for f in files:
            data = fh.read_json(f)
            text = data['text']
            url = data['url']
            parts = url.split('/')
            domain = parts[2]
            if len(text) > 200:
                if domain not in exclude:
                    articles.append({
                        'id': str(i),
                        'caseid': str(caseid),
                        'event_name': title,
                        'text': text
                    })

    fh.write_jsonlist(articles, output_filename, sort_keys=False)
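The domain filter above takes parts[2] of each URL; a hedged alternative sketch using urllib.parse, which also copes with URLs that lack a path (the URL here is made up):

from urllib.parse import urlparse

url = 'https://www.example.com/news/story.html'
print(url.split('/')[2])      # www.example.com (the approach used above)
print(urlparse(url).netloc)   # www.example.com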
Example No. 10
def load_background_freq(input_dir, input_prefix, vocab):
    word_freq_file = os.path.join(input_dir, input_prefix + '.word_freq.json')
    if os.path.exists(word_freq_file):
        print("Loading background frequencies")
        log_word_freq = np.log(np.array(fh.read_json(word_freq_file)))
        order = np.argsort(log_word_freq)
        for i in range(10):
            print('%d %s %0.3f' % (i, vocab[order[-i - 1]],
                                   np.exp(log_word_freq[order[-i - 1]])))
    else:
        print("*** Background word frequency file not found! ***")
        log_word_freq = None
    return log_word_freq
Example No. 11
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):

    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)

    #assert n_files == n_rows

    coref_input = []

    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)

        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])

        if valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)

            # get the text and convert to tokens
            sentences, lemmas, pos_tags, speakers, dependencies, target_mentions, age_pos_tags = process_parse(parse, names, age)
            pos_tags_all.update(age_pos_tags)

            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "lemmas": lemmas,
                                "speakers": speakers,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]
                                })

            print(i, names, age, len(target_mentions))

        fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
Example No. 12
def main():
    print("Reading model 1")
    beta1   = np.load(os.path.join(model_path1, 'beta.npz'))['beta']
    vocab1  = fh.read_json(os.path.join(model_path1, 'vocab.json'))
    topics1 = get_top_n_topic_words(beta1, vocab1, n_words)

    print("Reading model 2")
    beta2 = np.load(os.path.join(model_path2, 'beta.npz'))['beta']
    vocab2  = fh.read_json(os.path.join(model_path2, 'vocab.json'))
    topics2 = get_top_n_topic_words(beta2, vocab2, n_words)

    
    print("Matching topics")
    topic_match_tuples, topic_match_scores = get_topic_matched_pairs(beta1, beta2)


    for pair, score in zip(topic_match_tuples, topic_match_scores):
        print(str(score) + "\t" + str(pair))
        topicnum1    = pair[0]
        topicnum2    = pair[1]
        topicstring1 = ' '.join(topics1[topicnum1])
        topicstring2 = ' '.join(topics2[topicnum2])
        print(topicstring1)
        print(topicstring2)
Example No. 13
def load_data(input_dir: str,
              input_prefix: str,
              vocab_size=None,
              vocab=None,
              col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" %
                  int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]

    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))
    num = list(vocab[i] for i in order[:200])
    return X, vocab, col_sel, num
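A self-contained toy sketch of the vocabulary-filtering step (keep the vocab_size most frequent columns); the counts are made up:

import numpy as np

counts = np.array([[1, 0, 5],
                   [2, 0, 1]])
col_sums = counts.sum(axis=0)                    # [3, 0, 6]
k = 2
col_sel = np.zeros(len(col_sums), dtype=bool)
col_sel[np.argsort(col_sums)[::-1][:k]] = True   # keep the k largest columns
print(counts[:, col_sel])                        # columns 0 and 2 remain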
Example No. 14
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts and convert to a dense matrix
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    row_selector = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]

    return X, vocab, row_selector
Example No. 15
    def extract_features(self, source, write_to_file=True):
        print "Extracting ngram tokens:"

        # read in a dict of {document_key: text}
        data = fh.read_json(source)
        all_items = list(data.keys())  # materialize so the index can be written to JSON

        tokens = self.extract_tokens_from_file(data, self.get_n())

        vocab = self.make_vocabulary(tokens, all_items)

        feature_counts, oov_counts = self.extract_feature_counts(all_items, tokens, vocab)

        if write_to_file:
            vocab.write_to_file(self.get_vocab_filename())
            fh.write_to_json(all_items, self.get_index_filename(), sort_keys=False)
            fh.pickle_data(feature_counts, self.get_feature_filename())
            fh.write_to_json(oov_counts, self.get_oov_count_filename(), sort_keys=False)

        self.feature_counts = feature_counts
        self.index = all_items
        self.vocab = vocab
Example No. 16
def load_and_process_data(infile, vocab_size, parser, strip_html=False, vocab=None, label_list=None, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    mallet_stopwords = None
    if use_mallet_stopwords:
        print("Using MALLET stopwords")
        mallet_stopwords = fh.read_text('mallet_stopwords.txt')
        mallet_stopwords = {s.strip() for s in mallet_stopwords}

    print("Reading data files")
    item_dict = fh.read_json(infile)
    n_items = len(item_dict)

    parsed = []
    labels = []

    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    keys = list(item_dict.keys())
    keys.sort()
    for i, k in enumerate(keys):
        item = item_dict[k]
        if i % 1000 == 0 and i > 0:
            print(i)

        text = item['text']
        label = item['label']
        labels.append(label)

        if strip_html:
            # remove each pair of angle brackets and everything within them
            text = re.sub('<[^>]+>', '', text)

        parse = parser(text)
        # remove white space from tokens
        if lemmatize:
            words = [re.sub(r'\s', '', token.lemma_) for token in parse]
        else:
            words = [re.sub(r'\s', '', token.orth_) for token in parse]
        # convert to lower case and drop empty strings
        words = [word.lower() for word in words if len(word) >= min_length]
        # remove stop words
        if use_mallet_stopwords:
            words = [word for word in words if word not in mallet_stopwords]
        # remove tokens that don't contain letters or numbers
        if only_alpha:
            words = [word for word in words if re.match('^[a-zA-Z]*$', word) is not None]
        if not keep_nonalphanum:
            words = [word for word in words if re.match('[a-zA-Z0-9]', word) is not None]
        # convert numbers to a number symbol
        if replace_num:
            words = ['<NUM>' if re.match('[0-9]', word) is not None else word for word in words]
        # store the parsed documents
        parsed.append(words)
        # keep track of the number of documents with each word
        word_counts.update(words)
        doc_counts.update(set(words))

    print("Size of full vocabulary=%d" % len(word_counts))

    if vocab is None:
        most_common = doc_counts.most_common(n=vocab_size)
        words, counts = zip(*most_common)
        print("Most common words:")
        for w in range(20):
            print(words[w], doc_counts[words[w]], word_counts[words[w]])
        vocab = list(words)
        vocab.sort()
        total_words = np.sum(list(word_counts.values()))
        word_freqs = np.array([word_counts[v] for v in vocab]) / float(total_words)
    else:
        word_freqs = None

    vocab_index = dict(zip(vocab, range(vocab_size)))

    if label_list is None:
        label_list = list(set(labels))
        label_list.sort()

    n_labels = len(label_list)
    label_index = dict(zip(label_list, range(n_labels)))

    X = np.zeros([n_items, vocab_size], dtype=int)
    y = []

    dat_strings = []
    svm_strings = []
    mallet_strings = []

    lists_of_indices = []  # an alternative representation of each document as a list of indices

    print("First document:")
    print(' '.join(parsed[0]))

    counter = Counter()
    print("Converting to count representations")
    count = 0
    total_tokens = 0
    for i, words in enumerate(parsed):
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]
        counter.clear()
        counter.update(indices)
        # only include non-empty documents
        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))
            values = list(counter.values())
            if log_transform:
                # apply the log transform from Salakhutdinov and Hinton
                values = np.array(np.round(np.log(1 + np.array(values, dtype='float'))), dtype=int)
            X[np.ones(len(counter.keys()), dtype=int) * count, list(counter.keys())] += values
            total_tokens += len(word_subset)
            y_vector = np.zeros(n_labels)
            y_vector[label_index[labels[i]]] = 1
            y.append(y_vector)
            #y.append(label_index[labels[i]])
            # save the list of indices
            lists_of_indices.append(indices)
            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(int(k)) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)
            svm_string = 'target '
            svm_string += ' '.join([vocab[int(k)] + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            svm_strings.append(svm_string)
            #text_map[count] = words
            count += 1

    print("Found %d non-empty documents" % count)
    print("Total tokens = %d" % total_tokens)

    # drop the items that don't have any words in the vocabulary
    X = np.array(X[:count, :], dtype=int)

    temp = np.array(y)
    y = np.array(temp[:count], dtype=int)
    sparse_y = sparse.csr_matrix(y)

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)

    vocab_for_sage = np.zeros((vocab_size,), dtype=object)  # np.object is removed in recent NumPy
    vocab_for_sage[:] = vocab

    tr_aspect = np.ones([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    sage_output = {'tr_data': sparse_X_sage, 'tr_aspect': tr_aspect, 'widx': widx, 'vocab': vocab_for_sage}

    return sparse_X, vocab, lists_of_indices, sparse_y, label_list, word_freqs, dat_strings[:count], mallet_strings[:count], sage_output, svm_strings[:count]
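A toy, self-contained sketch of the count-vector construction in the loop above (vocabulary and document are made up):

import numpy as np
from collections import Counter

vocab = ['cat', 'dog', 'fish']
vocab_index = {w: i for i, w in enumerate(vocab)}
doc = ['dog', 'cat', 'dog', 'bird']   # 'bird' is out of vocabulary
counter = Counter(vocab_index[w] for w in doc if w in vocab_index)
x = np.zeros(len(vocab), dtype=int)
x[list(counter.keys())] = list(counter.values())
print(x)                              # [1 2 0]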
Example No. 17
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None, vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]

    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            if 'NA' in label_names:
                na_label_index = list(label_names).index('NA')
            else:
                na_label_index = len(label_names) + 1
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)

    else:
        labels = None
        label_names = None
        label_type = None
        na_label_index = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'

        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, na_label_index, label_type, covariates, covariate_names, covariates_type, col_sel
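A minimal sketch of the label-type test used above: one-hot rows of a binary matrix are treated as categorical, any other all-binary matrix as bernoulli, everything else as real-valued (toy labels):

import numpy as np

labels = np.array([[1, 0], [0, 1], [1, 0]])
is_binary = np.sum(labels == 0) + np.sum(labels == 1) == labels.size
if (np.sum(labels, axis=1) == 1).all() and is_binary:
    label_type = 'categorical'
elif is_binary:
    label_type = 'bernoulli'
else:
    label_type = 'real'
print(label_type)   # categorical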
Example No. 18
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):

    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)

    #assert n_files == n_rows

    coref_input = []

    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    #for i in range(n_files):
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)

        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        event_name = 'msa-' + re.sub(r'\s', '-', df.loc[i, 'title'])

        msa_index = int(df.loc[i, 'df_index'])

        if msa_index == 272:
            # Kalamazoo duplicate
            print("Skipping", i, event_name)
        elif msa_index == 276:
            # Belfair duplicate
            print("Skipping", i, event_name)
        elif msa_index == 293:
            # Sherman, Texas duplicate
            print("Skipping", i, event_name)
        elif msa_index == 280:
            # Chelsea, MA duplicate
            print("Skipping", i, event_name)
        elif msa_index == 283:
            # Kansas City duplicate
            print("Skipping", i, event_name)
        elif msa_index == 331:
            # Cape Coral
            print("Skipping", i, event_name)

        elif valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)

            # get the text and convert to tokens
            sentences, sentences_tagged, target_mentions, pos_tags, dependencies = process_parse(parse, names, age, event_name)

            sentences_pruned = []
            for sent in sentences_tagged:
                tokens = [token for token in sent if token != '__DROP__']
                sentences_pruned.append(' '.join(tokens))
            text_pruned = ' '.join(sentences_pruned)

            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "text_tagged": text_pruned,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]
                                })

            print(i, names, age, len(target_mentions))

        fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
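A tiny sketch of the pruning step that removes the __DROP__ placeholders before re-joining each sentence (toy input):

sentences_tagged = [['The', '__DROP__', 'suspect'], ['fled', '__DROP__']]
sentences_pruned = [' '.join(t for t in sent if t != '__DROP__')
                    for sent in sentences_tagged]
print(' '.join(sentences_pruned))   # The suspect fled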
Example No. 19
 def read_from_file(self, filename):
     self.index2token = fh.read_json(filename)
     self.token2index = dict(
         zip(self.index2token, range(len(self.index2token))))
Example No. 20
def get_results_data(
        basedir,
        pattern,
        ignore_cols_with_same_vals=True,
        coherence_reference_dir="/fs/clip-political/scholar/congress_votes_dwnom"
):
    """
    Get the results data in folders matching `pattern` in `basedir`
    """
    dirs = [(p.name, p) for p in Path(basedir).glob(pattern) if p.is_dir()]

    ref_vocab = fh.read_json(Path(coherence_reference_dir, "train.vocab.json"))
    ref_counts = fh.load_sparse(Path(coherence_reference_dir,
                                     "test.npz")).tocsc()

    experiments = pd.DataFrame()
    column_names = []
    for run_name, run_dir in tqdm.tqdm(dirs):

        model_path = Path(run_dir, 'torch_model.pt')
        try:
            checkpoint = torch.load(model_path, map_location='cpu')
        except FileNotFoundError:
            continue

        npmi_internal = None
        try:
            topics = fh.read_text(Path(run_dir, "topic.txt"))
        except FileNotFoundError:
            print(
                f"topic.txt not found for {run_name}. Will not calculate npmi"
            )
            pass
        else:
            npmi_internal = compute_npmi_at_n(
                topics=topics,
                ref_vocab=ref_vocab,
                ref_counts=ref_counts,
                n=10,  # could change?
                silent=True,
            )

        model_time = (datetime.fromtimestamp(
            model_path.stat().st_mtime).strftime('%Y-%m-%d %H:%M'))
        run_data = {
            'run_name':
            run_name,
            'git_hash':
            checkpoint['git_hash'],
            'date':
            model_time,

            # hyperparameters
            **checkpoint['options'].__dict__,  # works if we switch to argparse as well

            # results
            'saved_at_epoch':
            checkpoint['epoch'],
            'accuracy_train':
            read_result_from_file(Path(run_dir, 'accuracy.train.txt')),
            'accuracy_dev':
            read_result_from_file(Path(run_dir, 'accuracy.dev.txt')),
            'accuracy_dev_from_chkpt':
            checkpoint['dev_metrics']['accuracy'],
            'accuracy_test':
            read_result_from_file(Path(run_dir, 'accuracy.test.txt')),
            'perplexity_dev':
            read_result_from_file(Path(run_dir, 'perplexity.dev.txt')),
            'perplexity_test':
            read_result_from_file(Path(run_dir, 'perplexity.test.txt')),
            'maw':
            read_result_from_file(Path(run_dir, 'maw.txt'))
        }

        # keep longest set of cols for data ordering (python>=3.6 keeps dict key order)
        if len(run_data.keys()) > len(column_names):
            column_names = list(run_data.keys())

        experiments = experiments.append(run_data, ignore_index=True)

    # reorder columns
    experiments = experiments[column_names]
    if ignore_cols_with_same_vals:
        # remove any columns where the values have not been altered run-to-run
        # see https://stackoverflow.com/a/39658662/5712749
        nunique_vals = experiments.apply(pd.Series.nunique)
        cols_to_drop = nunique_vals[nunique_vals <= 1].index
        experiments = experiments.drop(cols_to_drop, axis=1)

    return experiments.sort_values(by=['date'])
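DataFrame.append, used above to accumulate runs, was removed in pandas 2.x; a hedged sketch of the usual replacement is to collect the row dicts and build the frame once (results here are made up):

import pandas as pd

rows = [{'run_name': 'run_a', 'accuracy_dev': 0.71},
        {'run_name': 'run_b', 'accuracy_dev': 0.68}]
experiments = pd.DataFrame(rows)
print(experiments.sort_values(by='accuracy_dev', ascending=False))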
Example No. 21
 def read_from_file(self, filename):
     self.index2token = fh.read_json(filename)
     self.token2index = dict(zip(self.index2token, range(len(self.index2token))))
Example No. 22
def load_data(input_dir,
              input_prefix,
              label_file_name=None,
              covar_file_names=None,
              vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(
            input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1)
                == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1)
                                 == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)

    else:
        labels = None
        label_names = None
        label_type = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(
                input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1)
                == 1).all() and (np.sum(covariates == 0) +
                                 np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'

        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, label_type, covariates, covariate_names, covariates_type
    run_parser.add_argument("--npmi-words", type=int, default=10)
    run_parser.add_argument("--min-acceptable-npmi", type=float, default=0.)
    run_parser.add_argument(
        "--ext-counts-fpath",
    )
    run_parser.add_argument(
        "--ext-vocab-fpath",
    )
    run_args, additional_args = run_parser.parse_known_args()

    outdir_parser = argparse.ArgumentParser()
    outdir_parser.add_argument("-o")
    outdir_args, _ = outdir_parser.parse_known_args(additional_args)

    nyt_counts = fh.load_sparse(run_args.ext_counts_fpath)
    nyt_vocab = fh.read_json(run_args.ext_vocab_fpath)
    
    np.random.seed(run_args.global_seed)
    run_seeds = iter([
        121958, 671155, 131932, 365838, 259178, 921881, 616685, 919314, 130398,
        5591, 11235, 2020, 19, 8000, 1001, 12345,
    ])
    
    # copy over code
    Path(outdir_args.o).mkdir(parents=True, exist_ok=True)
    shutil.copy("run_scholar.py", Path(outdir_args.o, "run_scholar.py"))
    shutil.copy("scholar.py", Path(outdir_args.o, "scholar.py"))

    if Path(outdir_args.o, "dev_metrics.csv").exists():
        old_path = Path(outdir_args.o, "dev_metrics.csv")
        ctime = datetime.fromtimestamp(old_path.stat().st_ctime).strftime("%Y-%m-%d")
Example No. 24
 def read_from_file(self, filename):
     json_obj = fh.read_json(filename)
     self.index2token = json_obj['index2token']
     self.counts = Counter(json_obj['counts'])
     self.doc_counts = Counter(json_obj['doc_counts'])
     self.token2index = dict(zip(self.index2token, range(len(self.index2token))))
Example No. 25
def main():
    usage = "%prog model_file.npz vocab_file.json"
    parser = OptionParser(usage=usage)
    parser.add_option('--sparsity_thresh',
                      dest='sparsity_thresh',
                      default=1e-3,
                      help='Sparsity threshold: default=%default')
    parser.add_option('--interactions',
                      action="store_true",
                      dest="interactions",
                      default=False,
                      help='Print interaction topics: default=%default')
    parser.add_option(
        '--n_pos',
        dest='n_pos',
        default=7,
        help='Number of positive terms to display: default=%default')
    parser.add_option(
        '--n_neg',
        dest='n_neg',
        default=4,
        help='Number of negative terms to display: default=%default')
    parser.add_option(
        '--max_classes',
        dest='max_classes',
        default=None,
        help='Maximum number of classes to display: default=%default')

    (options, args) = parser.parse_args()

    model_file = args[0]
    vocab_file = args[1]

    params = np.load(model_file)
    vocab = fh.read_json(vocab_file)
    n_pos = int(options.n_pos)
    n_neg = int(options.n_neg)
    max_classes = options.max_classes

    sparsity_threshold = float(options.sparsity_thresh)  # optparse returns strings for non-default values
    interactions = options.interactions

    dv = params['d_v']
    n_topics = params['d_t']
    n_classes = params['n_classes']
    if max_classes is not None:
        n_classes = int(max_classes)

    if n_topics > 1:
        print("\nTopics:")
        weights = np.array(params['W_decoder'])
        mean_sparsity = 0.0
        for j in range(n_topics):
            order = list(np.argsort(weights[:, j]).tolist())
            order.reverse()
            highest = ' '.join([vocab[i] for i in order[:n_pos]])
            lowest = ' '.join([vocab[i] for i in order[-n_neg:]])
            min_w = weights[:, j].min()
            max_w = weights[:, j].max()
            mean_w = weights[:, j].mean()
            sparsity = np.array(np.abs(weights[:, j]) < sparsity_threshold,
                                dtype=float).sum() / float(dv)
            mean_sparsity += sparsity
            print("%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" %
                  (j, highest, lowest, min_w, mean_w, max_w, sparsity))
        sparsity = np.array(np.abs(weights) < sparsity_threshold,
                            dtype=float).sum() / float(dv * n_topics)
        print("Topic sparsity = %0.3f" % sparsity)

    if n_classes > 1:
        print("\nClasses:")
        weights = np.array(params['W_decoder_label'])
        mean_sparsity = 0.0
        for j in range(n_classes):
            order = list(np.argsort(weights[:, j]).tolist())
            order.reverse()
            highest = ' '.join([vocab[i] for i in order[:n_pos]])
            lowest = ' '.join([vocab[i] for i in order[-n_neg:]])
            min_w = weights[:, j].min()
            max_w = weights[:, j].max()
            mean_w = weights[:, j].mean()
            sparsity = np.array(np.abs(weights[:, j]) < sparsity_threshold,
                                dtype=float).sum() / float(dv)
            mean_sparsity += sparsity
            print("%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" %
                  (j, highest, lowest, min_w, mean_w, max_w, sparsity))
        sparsity = np.array(np.abs(weights) < sparsity_threshold,
                            dtype=float).sum() / float(dv * n_classes)
        print("Covariate sparsity = %0.3f" % sparsity)

    if params['use_interactions']:
        print("\nInteractions:")
        interaction_weights = np.array(params['W_decoder_inter'])
        if interactions:
            mean_sparsity = 0.0
            for j in range(n_topics):
                for k in range(n_classes):
                    index = k + j * n_classes
                    weights_sum = interaction_weights[:, index]
                    order = list(np.argsort(weights_sum).tolist())
                    order.reverse()
                    highest = ' '.join([vocab[i] for i in order[:n_pos]])
                    lowest = ' '.join([vocab[i] for i in order[-n_neg:]])
                    min_w = weights_sum.min()
                    max_w = weights_sum.max()
                    mean_w = weights_sum.mean()
                    sparsity = np.array(
                        np.abs(weights_sum) < sparsity_threshold,
                        dtype=float).sum() / float(dv)
                    mean_sparsity += sparsity
                    print("%d/%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" %
                          (j, k, highest, lowest, min_w, mean_w, max_w,
                           sparsity))

        sparsity = np.array(np.abs(interaction_weights) < sparsity_threshold,
                            dtype=float).sum() / float(
                                dv * n_topics * n_classes)
        print("Interaction sparsity = %0.3f" % sparsity)

        print("\nWith interactions (but no labels):")
        topic_weights = np.array(params['W_decoder'])
        interaction_weights = np.array(params['W_decoder_inter'])
        if interactions:
            mean_sparsity = 0.0
            for j in range(n_topics):
                print(j)
                for k in range(n_classes):
                    index = k + j * n_classes
                    weights_sum = topic_weights[:,
                                                j] + interaction_weights[:,
                                                                         index]
                    order = list(np.argsort(weights_sum).tolist())
                    order.reverse()
                    highest = ' '.join([vocab[i] for i in order[:n_pos]])
                    lowest = ' '.join([vocab[i] for i in order[-n_neg:]])
                    min_w = weights_sum.min()
                    max_w = weights_sum.max()
                    mean_w = weights_sum.mean()
                    sparsity = np.array(
                        np.abs(weights_sum) < sparsity_threshold,
                        dtype=float).sum() / float(dv)
                    mean_sparsity += sparsity
                    print("%d/%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" %
                          (j, k, highest, lowest, min_w, mean_w, max_w,
                           sparsity))
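A self-contained sketch of the sparsity measure printed above: the fraction of weights whose absolute value falls below the threshold (toy weights):

import numpy as np

weights = np.array([0.0001, -0.5, 0.0, 2.0])
sparsity_thresh = 1e-3
sparsity = np.array(np.abs(weights) < sparsity_thresh, dtype=float).sum() / float(len(weights))
print(sparsity)   # 0.5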
Example No. 26
def main():
    usage = "%prog msa_db.csv articles.csv parsed_dir output_file.csv"
    parser = OptionParser(usage=usage)
    parser.add_option('--prefix',
                      dest='parse_prefix',
                      default='all',
                      help='Prefix of parsed files: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()
    msa_csv = args[0]
    articles_csv = args[1]
    parsed_dir = args[2]
    outfile = args[3]
    parse_prefix = options.parse_prefix

    if os.path.exists(outfile):
        raise FileExistsError("outfile already exists!")

    msa_df = pd.read_csv(msa_csv, header=0)
    print(msa_df.shape)

    df = pd.read_csv(articles_csv, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)

    assert n_files == n_rows

    msa_df['n_total_articles'] = 0
    msa_df['n_valid_articles'] = 0
    msa_df['n_terrorism_mentions'] = 0
    msa_df['n_unnegated_terrorism_mentions'] = 0
    msa_df['n_mental_mentions'] = 0
    msa_df['n_islam_mentions'] = 0
    msa_df['n_immigrant_mentions'] = 0

    for i in msa_df.index:
        date = pd.to_datetime(msa_df.loc[i, 'Date'])
        msa_df.loc[i, 'date'] = date
        msa_df.loc[i, 'year'] = date.year

    #msa_df = msa_df[msa_df.year >= 1990]

    for i in range(n_files):
        if i % 100 == 0 and i > 0:
            print(i)

        msa_id = df.loc[i, 'df_index']
        caseid = df.loc[i, 'caseid']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        age_string = str(age) + '-year-old'
        city = str(df.loc[i, 'city'])
        title = df.loc[i, 'title']

        if msa_id == 272:
            # Kalamazoo duplicate
            print("Skipping", i, title)
        elif msa_id == 276:
            # Belfair duplicate
            print("Skipping", i, title)
        elif msa_id == 293:
            # Sherman, Texas duplicate
            print("Skipping", i, title)
        elif msa_id == 280:
            # Chelsea, MA duplicate
            print("Skipping", i, title)
        elif msa_id == 283:
            # Kansas City duplicate
            print("Skipping", i, title)
        elif msa_id == 331:
            # Cape Coral
            print("Skipping", i, title)
        else:
            age_found = False
            name_found = False
            city_found = False

            filename = os.path.join(parsed_dir,
                                    parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)

            sentences = parse['sentences']
            for sentence in sentences:
                tokens = [token['word'] for token in sentence['tokens']]
                lower_tokens = [token.lower() for token in tokens]
                sentence_text = ' '.join(tokens)
                if age_string in lower_tokens:
                    age_found = True
                if city in sentence_text:
                    city_found = True
                for name in names:
                    if name in tokens:
                        name_found = True

            msa_df.loc[msa_id, 'n_total_articles'] += 1
            if age_found or city_found or name_found:
                msa_df.loc[msa_id, 'n_valid_articles'] += 1

                terrorism_mention = False
                unnegated_terrorism_mention = False
                mental_mention = False
                islam_mention = False
                immigrant_mention = False
                for sentence in sentences:
                    tokens = [
                        token['word'].lower() for token in sentence['tokens']
                    ]
                    sentence_text = ' '.join(tokens)
                    if 'terrorism' in tokens or 'terrorist' in tokens:
                        terrorism_mention = True
                        if 'not' in tokens or re.match(r'no\s*\S* evidence',
                                                       sentence_text):
                            print(sentence_text)
                        else:
                            unnegated_terrorism_mention = True
                    if 'mental' in tokens:
                        mental_mention = True
                    if 'islam' in tokens or 'islamic' in tokens or 'muslim' in tokens or 'muslims' in tokens:
                        islam_mention = True
                    if 'immigrant' in tokens or 'migrant' in tokens or 'naturalized' in tokens or 'immigrated' in tokens:
                        immigrant_mention = True

                if terrorism_mention:
                    msa_df.loc[msa_id, 'n_terrorism_mentions'] += 1
                if unnegated_terrorism_mention:
                    msa_df.loc[msa_id, 'n_unnegated_terrorism_mentions'] += 1
                if mental_mention:
                    msa_df.loc[msa_id, 'n_mental_mentions'] += 1
                if islam_mention:
                    msa_df.loc[msa_id, 'n_islam_mentions'] += 1
                if immigrant_mention:
                    msa_df.loc[msa_id, 'n_immigrant_mentions'] += 1

    msa_df.to_csv(outfile)
    print(msa_df.n_valid_articles.sum())
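A minimal sketch of the negation check applied to terrorism mentions above; note that re.match only anchors at the start of the string, so the pattern fires on sentences beginning with "no ... evidence" (toy sentence):

import re

sentence_text = 'no concrete evidence of terrorism was found'
tokens = sentence_text.split()
negated = 'not' in tokens or re.match(r'no\s*\S* evidence', sentence_text) is not None
print(negated)   # True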