def print_cost_estimates(commons_path, corpora_path):
  from corpora import Corpora

  train = Corpora(corpora_path, commons_path, gold=True)
  actions = Actions()
  for document in train:
    for action in document.gold:
      actions.add(action)
  train.rewind()

  cascades = [cascade_class(actions) for cascade_class in
              [FlatCascade, ShiftCascade, ShiftMarkCascade,
               ShiftPropbankEvokeCascade]]
  costs = [0] * len(cascades)
  counts = [[0] * cascade.size() for cascade in cascades]
  for document in train:
    gold = document.gold
    for index, cascade in enumerate(cascades):
      cascade_gold_sequence = cascade.translate(gold)
      delegate = 0
      cost = 0
      for cascade_gold in cascade_gold_sequence:
        cost += cascade.delegates[delegate].size()
        counts[index][delegate] += 1
        if cascade_gold.is_cascade():
          delegate = cascade_gold.delegate
        else:
          delegate = 0
      costs[index] += cost

  for c, cost, cascade in zip(counts, costs, cascades):
    print "\n", cascade.__class__.__name__, "cost =", cost, "\n", \
        "Delegate invocations:", c, "\n", cascade
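A minimal driver for the function above might look like the sketch below; both paths are placeholders for a SLING commons store and a gold-annotated training recordio, not names from the original source.

# Hypothetical invocation; point the paths at a real commons store and corpus.
if __name__ == "__main__":
  print_cost_estimates("local/commons", "local/train.rec")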
def initialize_corpora(data_path, data_prefix, dict_path, iterator, **kwargs):
    # Gather all Corpora options; the iterator choice and any extra keyword
    # arguments are forwarded along with the path and prefix.
    corpora_params = dict(data_path=data_path, prefix=data_prefix,
                          iterator=iterator, **kwargs)
    if os.path.exists(dict_path):
        corpora = Corpora(dictionary=dict_path, **corpora_params)
    else:
        corpora = Corpora(**corpora_params).build()
        corpora.dictionary.save_as_text(dict_path)
    if len(corpora) == 0:
        raise ValueError(
            f'Did not find any documents from path: {data_path} for given prefix {data_prefix}'
        )
    return corpora
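A sketch of calling this helper; the paths, prefix, and the extra stopwords keyword are illustrative assumptions, not values from the original source.

# Hypothetical call; extra keyword arguments are forwarded to Corpora.
corpora = initialize_corpora(data_path='data/parsed', data_prefix='CVPR',
                             dict_path='data/cvpr_DICT.txt', iterator='bow',
                             stopwords=None)
print(f'Loaded {len(corpora)} documents')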
def dev_accuracy(commons_path, dev_path, tmp_folder, caspar):
  dev = Corpora(dev_path, caspar.spec.commons)
  print "Annotating dev documents", now(), mem()

  test_path = os.path.join(tmp_folder, "dev.annotated.rec")
  writer = sling.RecordWriter(test_path)
  count = 0
  start_time = time.time()
  cascade = caspar.spec.cascade
  dev_total = [0] * cascade.size()
  dev_disallowed = [0] * cascade.size()
  for document in dev:
    state, disallowed, total, trace = \
        caspar.forward(document, train=False, debug=True)
    state.write()
    trace.write()
    writer.write(str(count), state.encoded())
    count += 1
    if count % 100 == 0:
      print " Annotated", count, "documents", now(), mem()
    for i, c in enumerate(disallowed):
      dev_total[i] += total[i]
      dev_disallowed[i] += c
  writer.close()
  end_time = time.time()

  print "Annotated", count, "documents in", "%.1f" % (end_time - start_time), \
      "seconds", now(), mem()
  print "Disallowed/Total leaf actions for", cascade.__class__.__name__
  for i, c in enumerate(dev_disallowed):
    print "Delegate", i, "disallowed", c, "out of", dev_total[i]

  return utils.frame_evaluation(gold_corpus_path=dev_path, \
                                test_corpus_path=test_path, \
                                commons_path=commons_path)
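For context, this evaluator is bound with functools.partial and later called with just the model, as the train() functions below do; the paths here are placeholders and "caspar" is assumed to be an already-initialized Caspar model.

from functools import partial

# Bind everything but the model; the trainer then calls evaluator(caspar).
evaluator = partial(dev_accuracy, "local/commons", "local/dev.rec", "local/tmp")
metrics = evaluator(caspar)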
def train(args):
  check_present(
      args, ["train_corpus", "output_folder", "dev_corpus", "train_shuffle_seed"])

  train_corpus_path = args.train_corpus
  if args.train_shuffle_seed > 0:
    reader = sling.RecordReader(args.train_corpus)
    items = [(key, value) for key, value in reader]
    reader.close()
    r = random.Random(args.train_shuffle_seed)
    r.shuffle(items)
    train_corpus_path = os.path.join(args.output_folder, "train_shuffled.rec")
    writer = sling.RecordWriter(train_corpus_path)
    for key, value in items:
      writer.write(key, value)
    writer.close()
    print("Wrote shuffled train corpus to %s using seed %d" % \
          (train_corpus_path, args.train_shuffle_seed))

  # Setting an explicit seed for the sake of determinism.
  torch.manual_seed(1)

  # Make commons store if needed.
  if args.commons == '' or not os.path.exists(args.commons):
    if args.commons == '':
      fname = os.path.join(args.output_folder, "commons")
      print("Will create a commons store at", fname)
      args.commons = fname
    else:
      print("No commons found at", args.commons, ", creating it...")
    _, symbols = commons_builder.build(
        [train_corpus_path, args.dev_corpus], args.commons)
    print("Commons created at", args.commons, "with", len(symbols), \
          "symbols besides the usual ones.")

  # Make the training spec.
  spec = Spec()
  spec.build(args.commons, train_corpus_path)

  # Initialize the model with the spec and any word embeddings.
  caspar = Caspar(spec)
  embeddings_file = args.word_embeddings
  if embeddings_file == '':
    embeddings_file = None
  caspar.initialize(embeddings_file)

  tmp_folder = os.path.join(args.output_folder, "tmp")
  if not os.path.exists(tmp_folder):
    os.makedirs(tmp_folder)
  # dev_accuracy takes the commons path as its first argument.
  evaluator = partial(dev_accuracy, args.commons, args.dev_corpus, tmp_folder)

  output_file_prefix = os.path.join(args.output_folder, "caspar")
  hyperparams = Hyperparams(args)
  print("Using hyperparameters:", hyperparams)

  trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
  train = Corpora(train_corpus_path, spec.commons, gold=True)
  trainer.train(train)
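A minimal sketch of the argument object this train() expects, built with argparse.Namespace; every value below is a placeholder, and any hyperparameter fields read by Hyperparams(args) are omitted here and would be needed in practice.

import argparse

args = argparse.Namespace(
    train_corpus="local/train.rec",
    dev_corpus="local/dev.rec",
    output_folder="local/out",
    train_shuffle_seed=42,   # > 0 triggers the shuffled copy above
    commons="",              # empty string: build a commons store
    word_embeddings="")      # empty string: no pretrained embeddings
train(args)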
def update_query(self):
    results = [r["Description"] + r['Title'] for r in self.results]
    try:
        corpora = Corpora(self.query, results, self.selectedIDs)
        print corpora
    except:
        print "NLTK corpora not installed; query expansion requires the NLTK corpora"
        return

    print "Augmenting query..."
    # Filter: choose words that are not in the query already.
    candidates = [(w, s) for w, s in corpora.getUpdatedQuery() \
                  if w not in set(self.query.split())]
    (w1, s1), (w2, s2) = candidates[0], candidates[1]
    newQueryWords = [w1, w2] if s1 == s2 else [w1]

    # Build the new query.
    self.query = " ".join([self.query] + newQueryWords)
    print "Restarting search with query: ", self.query
    self.start()
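The selection rule at the end is easy to misread: the runner-up word is appended only when its score exactly ties the top word; otherwise only the top word is taken. A standalone sketch of just that rule:

def pick_expansion_words(candidates):
    # candidates: (word, score) pairs, best first.
    (w1, s1), (w2, s2) = candidates[0], candidates[1]
    return [w1, w2] if s1 == s2 else [w1]

assert pick_expansion_words([("beer", 2.0), ("tax", 2.0)]) == ["beer", "tax"]
assert pick_expansion_words([("beer", 3.0), ("tax", 2.0)]) == ["beer"]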
def train(args):
  check_present(args, ["train_corpus", "output_folder", "dev_corpus"])

  # Setting an explicit seed for the sake of determinism.
  torch.manual_seed(1)

  # Make commons store if needed.
  if args.commons == '' or not os.path.exists(args.commons):
    if args.commons == '':
      fname = os.path.join(args.output_folder, "commons")
      print "Will create a commons store at", fname
      args.commons = fname
    else:
      print "No commons found at", args.commons, ", creating it..."
    _, symbols = commons_builder.build(
        [args.train_corpus, args.dev_corpus], args.commons)
    print "Commons created at", args.commons, "with", len(symbols), \
        "symbols besides the usual ones."

  # Make the training spec.
  spec = Spec()
  spec.build(args.commons, args.train_corpus)

  # Initialize the model with the spec and any word embeddings.
  caspar = Caspar(spec)
  embeddings_file = args.word_embeddings
  if embeddings_file == '':
    embeddings_file = None
  caspar.initialize(embeddings_file)

  tmp_folder = os.path.join(args.output_folder, "tmp")
  if not os.path.exists(tmp_folder):
    os.makedirs(tmp_folder)
  evaluator = partial(dev_accuracy, args.commons, args.dev_corpus, tmp_folder)

  output_file_prefix = os.path.join(args.output_folder, "caspar")
  hyperparams = Hyperparams(args)
  print "Using hyperparameters:", hyperparams

  trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
  train = Corpora(args.train_corpus, spec.commons, gold=True)
  trainer.train(train)
def run(args):
  check_present(args, ["input", "parser", "output"])
  assert os.path.exists(args.input), args.input
  assert os.path.exists(args.parser), args.parser

  # Read parser flow.
  flow = Flow()
  flow.load(args.parser)

  # Initialize the spec from the flow.
  spec = Spec()
  spec.from_flow(flow)

  # Initialize the model from the flow.
  caspar = Caspar(spec)
  caspar.from_flow(flow)

  corpus = Corpora(args.input, caspar.spec.commons)
  writer = sling.RecordWriter(args.output)
  count = 0
  for document in corpus:
    state, _, _, trace = caspar.forward(document, train=False, debug=args.trace)
    state.write()
    if trace:
      trace.write()
    writer.write(str(count), state.encoded())
    count += 1
    if count % 100 == 0:
      print "Annotated", count, "documents", now(), mem()
  writer.close()
  print "Annotated", count, "documents", now(), mem()
  print "Wrote annotated documents to", args.output

  if args.evaluate:
    f = tempfile.NamedTemporaryFile(delete=False)
    fname = f.name
    caspar.spec.commons.save(fname, binary=True)
    f.close()
    eval_result = frame_evaluation(gold_corpus_path=args.input, \
                                   test_corpus_path=args.output,
                                   commons=caspar.spec.commons)
    os.unlink(fname)
    return eval_result
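A hypothetical driver for run(); all paths and flags below are placeholders for a parser flow file and input/output recordio corpora.

import argparse

run(argparse.Namespace(input="local/dev.rec", parser="local/caspar.flow",
                       output="local/dev.annotated.rec", trace=False,
                       evaluate=True))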
def build(self, commons_path, corpora_path):
  # Prepare lexical dictionaries.
  self.words = Lexicon(self.words_normalize_digits)
  self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

  # Initialize training corpus.
  corpora = Corpora(corpora_path, commons_path)

  # Collect word and affix lexicons.
  for document in corpora:
    for token in document.tokens:
      word = token.word
      self.words.add(word)
      for s in self.get_suffixes(word):
        assert type(s) is str
        self.suffix.add(s)
  print "Words:", self.words.size(), "items in lexicon, including OOV"
  print "Suffix:", self.suffix.size(), "items in lexicon"

  # Load the commons store, but do not freeze it yet. We will add the action
  # table and cascade specification to it.
  self.commons_path = commons_path
  self.commons = sling.Store()
  self.commons.load(commons_path)
  schema = sling.DocumentSchema(self.commons)

  # Prepare action table and cascade.
  self._build_action_table(corpora)
  self.cascade = cascade.ShiftMarkCascade(self.actions)
  print self.cascade

  # Save cascade specification in commons.
  _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

  # Freeze the commons store.
  self.commons.freeze()

  # Add feature specs.
  self._specify_features()
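For reference, this builder is invoked the same way the train() functions above call it; the paths are placeholders.

spec = Spec()
spec.build("local/commons", "local/train.rec")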
def beer_tax():
    return Corpora(TAX_SYSTEM_IN_US)
def getty():
    return Corpora(GETTYSBURG)
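These two helpers read like pytest fixtures over text constants. Assuming they are registered with @pytest.fixture, a test could consume one by name; that Corpora supports len() here is also an assumption.

def test_getty_not_empty(getty):
    # "getty" is injected by pytest if the helper above is a fixture.
    assert len(getty) > 0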
def main(model, alpha, gamma, kappa, n_topics, data_path, data_prefix,
         result_path, dictionary_path, stopwords, vectors_path, batch_size,
         iterations, passes, n_words, shuffle):
    if not os.path.exists(result_path):
        raise OSError(f'Provided path {result_path} does not exist.')

    corpora_params = dict(data_path=data_path, prefix=data_prefix,
                          iterator='bow', stopwords=stopwords)
    if os.path.exists(dictionary_path):
        corpora = Corpora(dictionary=dictionary_path, **corpora_params)
    else:
        corpora = Corpora(**corpora_params).build()
        corpora.dictionary.save_as_text(dictionary_path)
    if len(corpora) == 0:
        raise ValueError(
            f'Did not find any documents from path: {data_path} for given prefix {data_prefix}'
        )

    MAP = dict(lda=(LDAWrapper,
                    dict(n_topics=n_topics, alpha=alpha, iterations=iterations,
                         passes=passes, batch_size=batch_size,
                         id2word=corpora.dictionary)),
               shdp=(SHDPWrapper,
                     dict(n_topics=n_topics, alpha=alpha, gamma=gamma,
                          passes=passes, batch_size=batch_size,
                          batch_shuffle=shuffle,
                          vector_map=load_vectors(vectors_path,
                                                  dictionary=corpora.dictionary),
                          num_docs=len(corpora))))
    model_class, params = MAP[model]
    topic_model = model_class(**params)

    # Gather the data up front since the BOW representation is lightweight.
    data = [doc for doc, _ in corpora]
    for i, seq in enumerate(data):
        if len(seq) < 1:
            raise AssertionError(f'Empty seq at index {i}')
    if shuffle:
        np.random.shuffle(data)
    topic_model.fit(data)

    model_name = str(topic_model) + f'_{data_prefix}'
    years = corpora.years
    if years:
        model_name += f'{years[0]}-{years[-1]}'
    if shuffle:
        model_name += '_shuffled'

    path_dir = os.path.join(result_path, model_name)
    if not os.path.exists(path_dir):
        os.mkdir(path_dir)
    if hasattr(topic_model, 'save'):
        topic_model.save(os.path.join(path_dir, model_name))

    topic_df = document_topics(topic_model, corpora)
    dictionary = None
    if model == 'shdp':
        dictionary = corpora.dictionary
    words_df = model_words(topic_model, n=n_words, dictionary=dictionary)

    topics_path = os.path.join(path_dir, 'topics.csv')
    words_path = os.path.join(path_dir, 'words.csv')
    topic_df.to_csv(topics_path, index=False)
    words_df.to_csv(words_path, index=False)
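An illustrative invocation of main(); every path and hyperparameter value below is a placeholder, not a value from the original source.

main(model='lda', alpha=0.1, gamma=1.0, kappa=0.7, n_topics=50,
     data_path='data/parsed', data_prefix='CVPR', result_path='results',
     dictionary_path='data/cvpr_DICT.txt', stopwords=None, vectors_path='',
     batch_size=256, iterations=100, passes=1, n_words=20, shuffle=False)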
import numpy as np
import matplotlib.pyplot as plt
from discord.ext import commands

# Last one sitting universe.
# TODO: Add channel ID HERE
channels = [0000000]
VC = ""

bot = commands.Bot(command_prefix=">")

from corpora import Corpora
Corpora(bot)

"""
------------------------------ Members ------------------------------
"""


@bot.command("docs")
async def hello(ctx):
    with open("help.txt") as afile:
        docs = afile.read()
    await ctx.send(docs)


"""
import os
import sys

import mxnet as mx
from matplotlib import style
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from bert_embedding import BertEmbedding

sys.path.append('..')
style.use('ggplot')

from corpora import Corpora

data_path = 'M:/Projects/KeyTopicDetection/parsed'
dict_path = '../../data/cvpr_13-18_DICT.txt'

if os.path.exists(dict_path):
    corpora = Corpora(data_path=data_path, prefix='CVPR', iterator='bow',
                      dictionary=dict_path)
else:
    corpora = Corpora(data_path=data_path, prefix='CVPR', iterator='bow',
                      word_up_limit=0.75, word_low_limit=20).build()
    corpora.dictionary.save_as_text(dict_path)

ctx = mx.gpu(0)
bert = BertEmbedding(ctx=ctx)


def visualize_clusters(tw, data):
    db = DBSCAN(eps=0.5, min_samples=50).fit(data)