def load_data(args):
    """Load the BBC corpus, build the vocabulary and embeddings,
    and pickle the word dictionary."""
    global word_dict, word_embed
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')  # BBC_news
    # docs += du.load_sent('../datasets/BBC_news.txt')
    word_dict = util.build_dict(docs)
    # inv_dict = util.build_inv_dict(word_dict)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    print('word_dict:', word_dict)
    # `dump` is pickle.dump (assumed imported at module level,
    # e.g. `from pickle import dump`)
    with open('../datasets/word_dict', 'wb') as fid:
        dump(word_dict, fid)
    # note: if load_sent returns a list of documents (each a list of
    # sentences), this would need to be ' '.join(docs[0]) as in test1
    doc = ' '.join(docs)
    return doc
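# Hedged round-trip sketch: reloading the dictionary pickled by load_data
# (assumes load_data has already written '../datasets/word_dict'):
#
#     import pickle
#     with open('../datasets/word_dict', 'rb') as fid:
#         word_dict = pickle.load(fid)
#     inv_dict = {v: k for k, v in word_dict.items()}  # id -> word, as in test7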
def test1(args):
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')
    logging.info('docs: {}'.format(len(docs)))
    logging.info("building dictionary...")
    # build_dict returns only the word dictionary
    # (char counting is disabled in build_dict)
    word_dict = util.build_dict(docs)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    logging.info('docs: {}'.format(word_embed.shape))  # (119, 100)  # Words: 117 -> 117
    print(word_dict)
    doc = ' '.join(docs[0])
    # with open('bbcnews.txt') as f:
    #     docs = f.read()
    # sp.build_graph(doc)
    vertice_map = sp.hash_vertex(doc)
    for vertice in vertice_map:
        print(words2word(vertice[0], word_embed, word_dict))
def build_dict(docs, max_words=5000000, dict_file=None):
    """
    :param docs: a list of corpus file paths; each file is loaded with
        du.load_sent, and a loaded doc is a list of sentences
    :return: dictionary mapping words to integer ids
        (ids start at 2; 0 is reserved for padding, 1 for UNK)
    """
    word_count = Counter()
    # char_count = Counter()
    for path in docs:
        logging.info('processing %s' % path)
        loaded = du.load_sent(path)
        # sents = sum(sents, [])
        # text = nlp(' '.join(sents))
        # for sent in text.sents:
        for doc in loaded:
            for sent in doc:
                for w in sent.split(' '):
                    # w = w.lemma_
                    # w = w.text
                    if w.isdigit():
                        w = "num@#!123"  # collapse all numbers into one token
                    word_count[w.lower()] += 1
    ls = word_count.most_common(max_words)
    # chars = char_count.most_common(80)
    logging.info('#Words: %d -> %d' % (len(word_count), len(ls)))
    print('#Words: %d -> %d' % (len(word_count), len(ls)))
    # leave 0 to padding
    # leave 1 to UNK
    word_dict = {w[0]: index + 2 for (index, w) in enumerate(ls)}
    if dict_file:
        with open(dict_file, 'wb') as fid:
            pickle.dump(word_dict, fid)
    return word_dict  # callers (load_data, test1) use the returned dict
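# Usage sketch for build_dict (an assumption from the loop above: the
# argument is a list of corpus file paths, not pre-loaded documents):
#
#     wd = build_dict(['../datasets/bbcnews.txt'],
#                     dict_file='../datasets/word_dict')
#     assert min(wd.values()) == 2  # ids 0 and 1 are reserved (padding / UNK)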
from event_chain import *
import data_utilities as du
import utilities as util
from pprint import pprint
import time

####### test 3 compare ######
docs = du.load_sent('../datasets/bbcsample1.txt')
word_dict = util.build_dict(docs)

print('models already downloaded')
# Predictor is AllenNLP's predictor class, presumably pulled in via
# event_chain's star import
srl_predictor = Predictor.from_path(
    '../pretrained/srl-model-2018.05.25.tar.gz')


def test3():
    global word_dict, srl_predictor
    # docs = du.load_sent('../datasets/bbcsample1.txt')
    # wd = util.build_dict(docs)
    # pprint(wd)
    # print(docs)
    print('using ecb **********')
    start = time.time()
    ecb = EventChainBuilder(word_dict)
    for i, sent in enumerate(docs):
        print('processing sentence', i)
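# Example invocations for the command-line entry point below (the script
# name is illustrative, not from the source):
#
#     python run.py --doc ../datasets/bbcnews.txt --save_dir ../output
#     python run.py --tdir ../datasets/true --fdir ../datasets/splits --debug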
parser = argparse.ArgumentParser()
parser.add_argument('--word_dict', type=str, default=None)
parser.add_argument('--doc', type=str, default=None)
parser.add_argument('--tdir', type=str, default=None)
parser.add_argument('--fdir', type=str, default=None)
parser.add_argument('--save_dir', type=str, default=None)
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
logging.warning(args)

docs = []
if args.doc:
    logging.warning('using docs')
    filename = args.doc
    docs = du.load_sent(filename)
if args.tdir:
    logging.warning('using t dir')
    dir_ = args.tdir
    filenames = glob(os.path.join(dir_, 'true_*.txt'))
    for filename in filenames:
        docs += du.load_sent(filename)
if args.fdir:
    logging.warning('using f dir')
    dir_ = args.fdir
    # filenames = glob(os.path.join(dir_, '.txt'))
    filenames = ['train.txt', 'test.txt', 'dev.txt']
    for filename in filenames:
        docs += du.load_sent(os.path.join(dir_, filename))
'''
def test7():
    import os, pickle
    basedir = '/homes/du113/scratch/cnn-political-data'
    filename = 'CNN_2018_8_16.txt'
    # filename = 'test_coref2.txt'
    filename = os.path.join(basedir, filename)

    word_dict = 'cnn_dict.pkl'
    word_dict = os.path.join(basedir, word_dict)

    docs = du.load_sent(filename)
    docs = docs[:1]  # only process the first document
    '''
    with open(args.doc) as fid:
        docs = fid.readlines()
    '''
    # print(docs)
    logging.warning('loaded %d documents' % len(docs))

    assert word_dict
    with open(word_dict, 'rb') as fid:
        wd = pickle.load(fid)  # pickle is imported locally above
    # map ids back to words so chains can be printed in plain text
    reverse_dict = {v: k for k, v in wd.items()}

    def convert(ls):
        return ' '.join([reverse_dict[i] for i in ls])

    # root = args.save_dir
    # keep the first 5 characters of the basename, e.g. 'CNN_2'
    filename = filename.split('/')[-1].split('.')[-2][:5]
    # save_path = os.path.join(basedir, "test7_2.pkl")

    for i, doc in enumerate(docs):
        logging.warning('*************************')
        logging.warning('processing %i th document' % i)
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.warning("Memory usage is: {0} MB".format(mem / 1000))
        ecb = EventChainBuilder(wd, doc, False, False)
        # gc.collect()
        ecb.debug = True
        '''
        logging.warning('making path lists')
        ecb.make_path_list()
        # pprint(ecb.vert_ls)
        for i, v in enumerate(ecb.vert_ls):
            print(i, v)
        if not ecb.debug:
            """
            ecb.store_verts(os.path.join(root,
                '%d_vert_%s.pkl' % (i, filename)))
            """
            ecb.store_verts(save_path)
        '''
        # load vertices precomputed by an earlier (store) run
        ecb.load_verts(os.path.join(basedir,
                                    '%d_vert_%s.pkl' % (i, filename)))

        logging.warning('*************************')
        logging.warning('building coreference clusters')
        '''
        ecb.get_coref()
        logging.warning('found %d clusters' % len(ecb.corefs))
        print(ecb.corefs)
        if not ecb.debug:
            ecb.store_corefs(os.path.join(root,
                '%d_coref_%s.pkl' % (i, filename)))
        '''
        ecb.load_corefs(os.path.join(basedir,
                                     '%d_coref_%s.pkl' % (i, filename)))

        logging.warning('building event chains')
        ecb.make_event_chains()
        '''
        if not ecb.debug:
            ecb.store_params(os.path.join(root,
                '%d_params_%s.pkl' % (i, filename)))
        '''
        from colors import Color
        # use j for the inner index so the outer document index i is not shadowed
        for j, chain in enumerate(ecb.event_chains):
            print(Color.BOLD + ('ARG%d' % j) + Color.END)
            # print('ARG%d' % j)
            for k, v in chain.items():
                print('key:', k, convert(k), '\nchains:', v, '\n******')
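# Minimal sketch of the id-to-word decoding that convert() performs above,
# assuming a word_dict of the shape produced by build_dict:
#
#     wd = {'the': 2, 'cat': 3}
#     rev = {v: k for k, v in wd.items()}
#     print(' '.join(rev[i] for i in (2, 3)))  # -> 'the cat'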