Example #1
# This snippet assumes `import data_utilities as du`, `import utilities as util`,
# and `from pickle import dump` at module level.
def load_data(args):
    global word_dict, word_embed
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')  # BBC_news
    # docs += du.load_sent('../datasets/BBC_news.txt')
    word_dict = util.build_dict(docs)
    # inv_dict = util.build_inv_dict(word_dict)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    print('word_dict:', word_dict)
    # persist the dictionary for later runs
    with open('../datasets/word_dict', 'wb') as fid:
        dump(word_dict, fid)
    doc = ' '.join(docs)
    return doc
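The snippet above pickles word_dict to '../datasets/word_dict'. A minimal sketch of reading it back, assuming the file was written with plain pickle, as the dump call suggests:

import pickle

# Reload the dictionary written by load_data above.
with open('../datasets/word_dict', 'rb') as fid:
    word_dict = pickle.load(fid)

print(len(word_dict), 'entries in word_dict')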
Example #2
def test1(args):
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')
    logging.info('docs: {}'.format(len(docs)))
    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(docs)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape

    logging.info('docs: {}'.format(word_embed.shape))  # e.g. (119, 100) when "#Words: 117 -> 117"
    print(word_dict)
    doc = ' '.join(docs[0])
    # with open('bbcnews.txt') as f:
    #     docs = f.read()
    # sp.build_graph(doc)
    # sp.hash_vertex and words2word are helpers from the surrounding project
    vertice_map = sp.hash_vertex(doc)
    for vertice in vertice_map:
        print(words2word(vertice[0], word_embed, word_dict))
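words2embedding appears to return a matrix of shape (vocab_size, embed_size) whose rows line up with the ids in word_dict. Under that assumption (the exact behavior of words2word and sp.hash_vertex is project-specific), a single word vector can be looked up as below; word_vector and the unk_id default are illustrative, not part of the original code:

def word_vector(w, word_embed, word_dict, unk_id=1):
    # fall back to the reserved UNK row for out-of-vocabulary words
    return word_embed[word_dict.get(w.lower(), unk_id)]

vec = word_vector('news', word_embed, word_dict)
print(vec.shape)  # e.g. (100,) for 100-dimensional embeddings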
Example #3
def build_dict(docs, max_words=5000000, dict_file=None):
    """
    :param docs: a doc is a list of sentences
    :return: dictionary mapping words to integer ids (0 and 1 are reserved)
    """
    word_count = Counter()  # requires `from collections import Counter`
    # char_count = Counter()
    for doc in docs:
        logging.info('processing %s' % (doc))
        # each entry is handed to du.load_sent and re-split into sentences
        docs = du.load_sent(doc)
        # sents = sum(sents, [])
        # text = nlp(' '.join(sents))
        # for sent in text.sents:
        for doc in docs:
            '''
            print(len(sent))
            raise Exception
            '''
            for sent in doc:
                for w in sent.split(' '):
                    # w = w.lemma_
                    # w = w.text
                    if w.isdigit():
                        # normalize every digit-only token to one placeholder
                        w = "num@#!123"
                    word_count[w.lower()] += 1

    ls = word_count.most_common(max_words)
    # chars = char_count.most_common(80)
    logging.info('#Words: %d -> %d' % (len(word_count), len(ls)))
    print('#Words: %d -> %d' % (len(word_count), len(ls)))
    # leave 0 to padding
    # leave 1 to UNK
    word_dict = {w[0]: index + 2 for (index, w) in enumerate(ls)}

    if dict_file:
        with open(dict_file, 'wb') as fid:
            pickle.dump(word_dict, fid)

    return word_dict
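build_dict reserves id 0 for padding and id 1 for unknown words, starts real words at 2, and folds every digit-only token into the "num@#!123" placeholder. A short sketch of converting a tokenized sentence with that scheme; sent_to_ids is a made-up helper name, not from the original code:

def sent_to_ids(sent, word_dict, unk_id=1):
    ids = []
    for w in sent.split(' '):
        if w.isdigit():
            w = "num@#!123"   # same digit placeholder used in build_dict
        ids.append(word_dict.get(w.lower(), unk_id))
    return ids

# e.g. with a dictionary returned by build_dict:
# sent_to_ids('The BBC reported 42 cases', word_dict)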
Example #4
from event_chain import *
import data_utilities as du
import utilities as util
from pprint import pprint

import time
####### test 3 compare ######
docs = du.load_sent('../datasets/bbcsample1.txt')
word_dict = util.build_dict(docs)

print('models already downloaded')
# load the pretrained AllenNLP semantic role labeling model from a local archive
srl_predictor = Predictor.from_path(
    '../pretrained/srl-model-2018.05.25.tar.gz')


def test3():
    global word_dict, srl_predictor

    # docs = du.load_sent('../datasets/bbcsample1.txt')

    # wd = util.build_dict(docs)
    # pprint(wd)

    # print(docs)
    print('using ecb **********')
    start = time.time()

    ecb = EventChainBuilder(word_dict)

    for i, sent in enumerate(docs):
        print('processing sentence', i)
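The archive passed to Predictor.from_path is AllenNLP's 2018 pretrained SRL model. A hedged sketch of querying it for one sentence; the exact keys in the returned dictionary can vary across AllenNLP versions:

# Illustrative call to the loaded SRL predictor.
result = srl_predictor.predict(sentence='The BBC reported the story on Monday.')
for verb in result['verbs']:
    # each entry describes one predicate and its BIO-tagged arguments
    print(verb['verb'], '->', verb['description'])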
Example #5
    parser = argparse.ArgumentParser()
    parser.add_argument('--word_dict', type=str, default=None)
    parser.add_argument('--doc', type=str, default=None)
    parser.add_argument('--tdir', type=str, default=None)
    parser.add_argument('--fdir', type=str, default=None)
    parser.add_argument('--save_dir', type=str, default=None)
    parser.add_argument('--debug', action='store_true')

    args = parser.parse_args()
    logging.warning(args)

    docs = []
    # input can come from a single document (--doc), a directory of
    # true_*.txt files (--tdir), or fixed train/test/dev splits (--fdir)
    if args.doc:
        logging.warning('using docs')
        filename = args.doc
        docs = du.load_sent(filename)
    if args.tdir:
        logging.warning('using t dir')
        dir_ = args.tdir
        filenames = glob(os.path.join(dir_, 'true_*.txt'))
        for filename in filenames:
            docs += du.load_sent(filename)

    if args.fdir:
        logging.warning('using f dir')
        dir_ = args.fdir
        # filenames = glob(os.path.join(dir_, '.txt'))
        filenames = ['train.txt', 'test.txt', 'dev.txt']
        for filename in filenames:
            docs += du.load_sent(os.path.join(dir_, filename))
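The parser above supports three input modes: a single document (--doc), a directory of true_*.txt files (--tdir), or a directory holding fixed train/test/dev splits (--fdir). A small sketch of exercising them with explicit argument lists; the paths are placeholders:

args = parser.parse_args(['--doc', '../datasets/bbcnews.txt'])
args = parser.parse_args(['--tdir', '../datasets/true_docs', '--debug'])
args = parser.parse_args(['--fdir', '../datasets/splits'])
print(args.fdir, args.debug)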
Example #6
def test7():
    import os, pickle, resource  # resource is used below for memory reporting
    basedir = '/homes/du113/scratch/cnn-political-data'
    filename = 'CNN_2018_8_16.txt'
    # filename = 'test_coref2.txt'
    filename = os.path.join(basedir, filename)

    word_dict = 'cnn_dict.pkl'
    word_dict = os.path.join(basedir, word_dict)

    docs = du.load_sent(filename)

    docs = docs[:1]  # only keep the first document for this test
    '''
    with open(args.doc) as fid:
        docs = fid.readlines() 
    '''
    # print(docs)
    logging.warning('loaded %d documents' % len(docs))

    assert word_dict
    with open(word_dict, 'rb') as fid:
        wd = pickle.load(fid)  # word -> id dictionary pickled earlier

    reverse_dict = {v: k for k, v in wd.items()}

    def convert(ls):
        # map a list of word ids back to a space-separated string
        return ' '.join([reverse_dict[i] for i in ls])

    # root = args.save_dir
    filename = filename.split('/')[-1].split('.')[-2][:5]  # basename without extension, first 5 chars
    # save_path = os.path.join(basedir, "test7_2.pkl")

    for i, doc in enumerate(docs):
        logging.warning('*************************')
        logging.warning('processing %i th document' % i)
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  # peak RSS; kilobytes on Linux
        logging.warning("Memory usage is: {0} MB".format(mem / 1000))

        ecb = EventChainBuilder(wd, doc, False, False)
        # gc.collect()

        ecb.debug = True
        '''
        logging.warning('making path lists')
        ecb.make_path_list()

        # pprint(ecb.vert_ls)
        for i, v in enumerate(ecb.vert_ls):
            print(i, v)
        if not ecb.debug:
            """
            ecb.store_verts(os.path.join(root, \
                    '%d_vert_%s.pkl' % (i, filename)))
            """
            ecb.store_verts(save_path)
        '''
        ecb.load_verts(os.path.join(basedir, \
                '%d_vert_%s.pkl' % (i, filename)))

        logging.warning('*************************')
        logging.warning('building coreference clusters')
        '''
        ecb.get_coref()
        logging.warning('found %d clusters' % len(ecb.corefs))
        print(ecb.corefs)
        if not ecb.debug:
            ecb.store_corefs(os.path.join(root, \
                    '%d_coref_%s.pkl' % (i, filename)))
        '''
        ecb.load_corefs(os.path.join(basedir, \
                '%d_coref_%s.pkl' % (i, filename)))

        logging.warning('building event chains')
        ecb.make_event_chains()
        '''
        if not ecb.debug:
            ecb.store_params(os.path.join(root, \
                    '%d_params_%s.pkl' % (i, filename)))
        '''

        from colors import Color
        for i, chain in enumerate(ecb.event_chains):
            print(Color.BOLD + ('ARG%d' % i) + Color.END)
            # print('ARG%d' % i)
            for k, v in chain.items():
                print('key:', k, convert(k), '\nchains:', v, '\n******')
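Note that convert raises KeyError for ids missing from reverse_dict, in particular the reserved padding and UNK ids 0 and 1 from the dictionary scheme used above. A defensive variant; the '<pad>' and '<unk>' names are illustrative, not from the original code:

def convert_safe(ls, reverse_dict):
    # ids 0 and 1 are reserved for padding and UNK in the dictionary above
    fallback = {0: '<pad>', 1: '<unk>'}
    return ' '.join(reverse_dict.get(i, fallback.get(i, '<unk>')) for i in ls)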