def faqScraper(index_file, output_file, start_from):
    faq_dict = corpus.load(start_from)
    f = corpus.load(index_file)
    print("Building Index")
    # Only scrape movie IDs that are not already in the FAQ dictionary.
    s = sorted(faq_dict.keys())
    x = [i[0] for i in f if i[0] not in s]
    t = len(x)
    print(str(t) + " IDs to scrape")
    if t > 0:
        print("Starting Scrape")
        for i in x:
            faq_dict.update(corpus.format(faqSplitter(i), i))
            t -= 1
            print(str(t) + ' To go')
        print('Saving')
        corpus.save(output_file, faq_dict)
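# A minimal usage sketch, not part of the original script: the output file name
# below is a hypothetical placeholder, and `corpus` / `faqSplitter` are assumed
# to be defined in the surrounding module.
# faqScraper("data/index_0.json", "data/faqs_0.json", "data/faqs_0.json")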
def train(self, filename, validation, epochs=20, batch_size=64, verbose=0,
          optimizer=RMSprop(lr=.01), use_ext_embeddings=True, **kwargs):
    X, Y = corpus.extract(corpus.load(filename))
    D_X, D_Y = corpus.extract(corpus.load(validation))
    # Build token (x) and tag (y) vocabularies with start/unknown symbols.
    self.x_list = ["__START__"] + list({w for x in X for w in x}) + ["__UNK__"]
    self.y_list = ["__START__"] + list({c for y in Y for c in y}) + ["__UNK__"]
    self.x_codes = {x: idx for idx, x in enumerate(self.x_list)}
    self.y_codes = {y: idx for idx, y in enumerate(self.y_list)}
    self.reverse_x_codes = {i: x for i, x in enumerate(self.x_list)}
    self.reverse_y_codes = {i: y for i, y in enumerate(self.y_list)}
    Xcodes, Ycodes = self._encode(X, Y)
    D_Xcodes, D_Ycodes = self._encode(D_X, D_Y)
    # Pad every sequence to the length of the longest training sentence.
    L = list(map(len, Ycodes))
    self.mL = max(L)
    D_Xcodes = pad_sequences(D_Xcodes, maxlen=self.mL)
    D_Ycodes = pad_sequences(D_Ycodes, maxlen=self.mL)
    Xcodes = pad_sequences(Xcodes, maxlen=self.mL)
    Ycodes = pad_sequences(Ycodes, maxlen=self.mL)
    self.x_size = len(self.x_codes)
    self.y_size = len(self.y_codes)
    ipt = Input(shape=(self.mL,))
    e = Embedding(self.x_size, self.embedding_size, mask_zero=True)(ipt)
    if use_ext_embeddings:
        # Replace the randomly initialised embedding layer with one seeded from
        # pre-trained vectors where available.
        embedding_matrix = numpy.zeros((self.x_size, self.embedding_size))
        for word, i in self.x_codes.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        e = Embedding(self.x_size, self.embedding_size,
                      weights=[embedding_matrix], mask_zero=True,
                      trainable=True)(ipt)
    h = Bidirectional(LSTM(self.memory_size, return_sequences=True))(e)
    o = TimeDistributed(
        Dense(self.y_size, bias_regularizer=l1_l2(0.), activation='softmax'))(h)
    earlystop = EarlyStopping(monitor='val_acc', patience=0)
    self.model = Model(ipt, o)
    if verbose:
        self.model.summary()
    self.model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                       metrics=['accuracy'])
    self.model.fit(Xcodes, Ycodes, epochs=epochs, verbose=verbose,
                   batch_size=batch_size, shuffle=True,
                   validation_data=(D_Xcodes, D_Ycodes), callbacks=[earlystop])
    return self
def test(self, filename):
    X_test, Y_test = corpus.extract(corpus.load(filename))
    Xcodes_test, Ycodes_test = self._encode(X_test, Y_test)
    Xcodes_test = pad_sequences(Xcodes_test, maxlen=self.mL)
    Ycodes_test = pad_sequences(Ycodes_test, maxlen=self.mL)
    return self.model.evaluate(Xcodes_test, Ycodes_test, batch_size=64)
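# A usage sketch of the train/test API above (the Sequoia split file names are
# the ones used elsewhere in this repo; NNTagger and the corpus helper are
# assumed to be in scope):
# tagger = NNTagger()
# tagger.train("sequoia-corpus.np_conll.train", "sequoia-corpus.np_conll.dev",
#              epochs=20, verbose=1)
# print(tagger.test("sequoia-corpus.np_conll.test"))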
if __name__ == "__main__":
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    template = '{:-^50}'
    REGEX = r"[mMdD][rs]s?\. ?[\w,]+|[\w]+'?[\w,]+|[\.!\?:]"
    END_CHARS = '.?!'
    corpus = corpus.Corpus(REGEX, END_CHARS)

    INPUT_FILE_NAME = sys.argv[1]
    log = 'Reading from {}'.format(INPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()
    corpus.load(INPUT_FILE_NAME)
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))

    OUTPUT_FILE_NAME = sys.argv[2]
    SENTENCES_COUNT = int(sys.argv[3])
    log = 'Writing to {}'.format(OUTPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()
    text_generator = TextGenerator(corpus)
    with open(OUTPUT_FILE_NAME, 'w') as f:
        f.writelines(text_generator.gen_text_line(SENTENCES_COUNT, 1, 10))
    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))
def generalFaq(s_num, k_num):
    # Scan k_num consecutive IMDb IDs starting after s_num and record how many
    # FAQ entries each movie has.
    cnt = s_num
    x = []
    for i in range(1, k_num):
        cnt += 1
        movie_id = str(cnt).zfill(7)
        try:
            faq = len(ia.get_movie_faqs(movie_id)['data']['faqs'])
            if faq > 0:
                x.append((movie_id, faq))
        except (RuntimeError, TypeError, NameError, KeyError, IOError,
                AssertionError, IMDbDataAccessError):
            pass
    return x


print("loading index")
k = corpus.load("data/index_0.json")
s_num = int(k[-1][0])
print("starting scrape loop")
for i in range(1, 600):
    z = generalFaq(s_num, 1000)
    k.extend(z)
    f = "data/index_0.json"
    corpus.save(f, k)
    check.indexFaqCounter(k)
    s_num += 1000
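# `ia`, `IMDbDataAccessError`, `corpus` and `check` above are assumed to be set
# up elsewhere in this repo; with IMDbPY that setup is roughly the following
# (exact import paths can vary by IMDbPY version):
# from imdb import IMDb
# from imdb._exceptions import IMDbDataAccessError
# ia = IMDb()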
def message(i, x, y, ref, cur):
    # The opening of this debug helper is an assumption (only its tail appears
    # in the original): it formats the reference arcs against the arcs produced
    # by the oracle, together with the input tokens.
    return ("reference:\n" + str(list(sorted(ref))) +
            "\n\ncurrent:\n" + str(list(sorted(cur))) +
            "\n\nfor input:\n" + "\n".join(map("\t".join, zip(i, x, y))))


print("Direct test")
train_conll = "sequoia-corpus.np_conll.train"
test_conll = "sequoia-corpus.np_conll.test"
dev_conll = "sequoia-corpus.np_conll.dev"

nnt = NNTagger()
#
# nnt.train("sequoia-corpus.np_conll.train", verbose=1)
# nnt.save()
nnt = NNTagger.load()
# nnt.model.summary()

I, X, Y = corpus.extract(corpus.load(dev_conll),
                         columns=("index", "token", "head"))
dataset = []
for i, x, y in zip(I, X, Y):
    arcs = set(zip(map(int, y), map(int, i)))
    derivation, last_config = DependencyParser.oracle(arcs)
    if list(sorted(arcs)) != list(sorted(last_config.arcs)):
        # print(str(last_config) + "\n\n\n" + message(i, x, y, arcs, last_config.arcs))
        continue
    dataset.append((["__ROOT__"] + x, derivation))
# print(len(X), len(dataset))

"""
Xtest = corpus.extract_features_for_depency(test_conll)
XtestIO = list(map(io.StringIO, Xtest))
XtestD = list(map(DependencyTree.read_tree, XtestIO))
"""
def indexFaqCounter(k):
    cnt = 0
    for i in k:
        cnt += i[1]
    current_id = k[-1][0]
    print("Latest Movie ID: " + str(current_id))
    highest_id = 6799992
    percent = 100 * float(current_id) / float(highest_id)
    print("\n Indexed")
    print("________")
    print("Percentage: {:.2f}%".format(percent))
    print("Total faqs found: " + str(cnt))
    print("Total Films indexed: " + str(len(k)))


def scrapeFaqChecker(k):
    cnt = 0
    for key, value in k.items():
        try:
            cnt += len(value)
        except TypeError:
            pass
    print("Total faqs scraped: " + str(cnt))
    print("Total films scraped: " + str(len(k)))


index_file = corpus.load("data/index_0.json")
indexFaqCounter(index_file)
import corpus
import RNN
import numpy as np
import CBOWNS as CBOW
import SkipGram

# vocabulary_size = 8000
# X_train, Y_train, vocab = corpus.parseRNN("../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual", 1000, vocabulary_size)
# model = RNN.RNN(len(vocab) + 1)
# # X_train, Y_train, vocab = corpus.parseRNNFile("number_series_1213.txt", 3)
#
# RNN.train_with_sgd(model, X_train, Y_train)
# model.write_to_file("RNNembeddings.txt", vocab)

# word_to_index, contexts, index_to_word, index_count = corpus.NewContextParse("../../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled", True)
word_to_index, contexts, index_to_word, word_count = corpus.load(500000)

foo = CBOW.CBOW(100, 3, word_to_index, contexts, index_to_word, word_count)
CBOW.train_with_sgd(foo, 0.5, 1., 0.01)
foo.writeToFile("CBOW_Wout.txt", "CBOW_Win.txt", "CBOW_labels.txt")

# sg = SkipGram.SkipGram(100, "../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled")
# SkipGram.train_with_sgd(sg, 0.2, 0.1)
# sg.writeToFile('SkipEmbeddings.txt')

# corpus.parseTimon("../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled", 100000)