Example #1
def faqScraper(index_file, output_file, start_from):

    # Resume from an earlier run: FAQs scraped so far plus the full ID index.
    faq_dict = corpus.load(start_from)
    index = corpus.load(index_file)

    print("Building Index")
    scraped_ids = set(faq_dict.keys())
    # IDs that appear in the index but have not been scraped yet.
    to_scrape = [entry[0] for entry in index if entry[0] not in scraped_ids]

    t = len(to_scrape)
    print(str(t) + " IDs to scrape")

    if t > 0:
        print("Starting Scrape")
        for movie_id in to_scrape:
            faq_dict.update(corpus.format(faqSplitter(movie_id), movie_id))
            t -= 1
            print(str(t) + ' to go')

        print('Saving')
        corpus.save(output_file, faq_dict)
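Example #1 (and the related scraper snippets in Examples #5 and #7) only shows the call sites of corpus.load and corpus.save. For orientation, a minimal JSON-backed pair consistent with those calls might look like the sketch below; the other examples come from different projects whose corpus modules behave differently, and the bodies here are assumptions, not the real implementation.

import json

def load(path):
    # ASSUMPTION: data is stored as plain JSON (a dict or list in these examples).
    with open(path) as fh:
        return json.load(fh)

def save(path, data):
    # ASSUMPTION: mirror of load(); the real corpus module may store data differently.
    with open(path, 'w') as fh:
        json.dump(data, fh)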
Example #2
    def train(self, filename, validation, epochs=20, batch_size=64, verbose=0, optimizer=RMSprop(lr=.01), use_ext_embeddings=True, **kwargs):
        X, Y = corpus.extract(corpus.load(filename))
        D_X, D_Y = corpus.extract(corpus.load(validation))
        self.x_list = ["__START__"] + \
            list({w for x in X for w in x}) + ["__UNK__"]
        self.y_list = ["__START__"] + \
            list({c for y in Y for c in y}) + ["__UNK__"]
        self.x_codes = {x: idx for idx, x in enumerate(self.x_list)}
        self.y_codes = {y: idx for idx, y in enumerate(self.y_list)}
        self.reverse_x_codes = {i: x for i, x in enumerate(self.x_list)}
        self.reverse_y_codes = {i: y for i, y in enumerate(self.y_list)}

        Xcodes, Ycodes = self._encode(X, Y)
        D_Xcodes, D_Ycodes = self._encode(D_X, D_Y)
        L = list(map(len, Ycodes))
        self.mL = max(L)
        D_Xcodes = pad_sequences(D_Xcodes, maxlen=self.mL)
        D_Ycodes = pad_sequences(D_Ycodes, maxlen=self.mL)
        Xcodes = pad_sequences(Xcodes, maxlen=self.mL)
        Ycodes = pad_sequences(Ycodes, maxlen=self.mL)
        self.x_size = len(self.x_codes)
        self.y_size = len(self.y_codes)
        ipt = Input(shape=(self.mL,))
        e = Embedding(self.x_size, self.embedding_size, mask_zero=True)(ipt)
        # Optionally replace the randomly initialized embeddings with pre-trained vectors.
        if use_ext_embeddings:
            embedding_matrix = numpy.zeros((self.x_size, self.embedding_size))
            for word, i in self.x_codes.items():
                embedding_vector = self.embeddings_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
            e = Embedding(self.x_size, self.embedding_size, weights=[
                embedding_matrix], mask_zero=True, trainable=True)(ipt)
        h = Bidirectional(LSTM(self.memory_size, return_sequences=True))(e)
        o = TimeDistributed(
            Dense(self.y_size, bias_regularizer=l1_l2(0.), activation='softmax'))(h)
        earlystop = EarlyStopping(monitor='val_acc', patience=0)
        self.model = Model(ipt, o)
        if verbose:
            self.model.summary()
        self.model.compile(
            optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.fit(Xcodes, Ycodes, epochs=epochs,
                       verbose=verbose, batch_size=batch_size, shuffle=True, validation_data=(D_Xcodes, D_Ycodes), callbacks=[earlystop])
        return self
Example #3
    def test(self, filename):
        # Evaluate the trained model on a held-out file, padded to the training length.
        X_test, Y_test = corpus.extract(corpus.load(filename))
        Xcodes_test, Ycodes_test = self._encode(X_test, Y_test)
        Xcodes_test = pad_sequences(Xcodes_test, maxlen=self.mL)
        Ycodes_test = pad_sequences(Ycodes_test, maxlen=self.mL)
        return self.model.evaluate(Xcodes_test, Ycodes_test, batch_size=64)
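Examples #2 and #3 look like two methods of the same tagger class, most likely the NNTagger instantiated in Example #6. The driver below is a hedged end-to-end sketch under that assumption, reusing the CoNLL file names from Example #6 rather than paths confirmed for this class.

# ASSUMPTION: train() and test() belong to NNTagger and can be chained as shown.
tagger = NNTagger()
tagger.train("sequoia-corpus.np_conll.train",   # training data (via corpus.load/extract)
             "sequoia-corpus.np_conll.dev",     # validation split used by EarlyStopping
             epochs=20, verbose=1)
loss, acc = tagger.test("sequoia-corpus.np_conll.test")  # returns model.evaluate(...) output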
Example #4

if __name__ == "__main__":
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    template = '{:-^50}'

    REGEX = r"[mMdD][rs]s?\. ?[\w,]+|[\w]+'?[\w,]+|[\.!\?:]"
    END_CHARS = '.?!'
    # From here on, the module name `corpus` is rebound to a Corpus instance.
    corpus = corpus.Corpus(REGEX, END_CHARS)

    INPUT_FILE_NAME = sys.argv[1]
    log = 'Reading from {}'.format(INPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()

    corpus.load(INPUT_FILE_NAME)

    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))

    OUTPUT_FILE_NAME = sys.argv[2]
    SENTENCES_COUNT = int(sys.argv[3])
    log = 'Writing to {}'.format(OUTPUT_FILE_NAME)
    logging.info(template.format(log))
    start_time = time.time()

    text_generator = TextGenerator(corpus)
    with open(OUTPUT_FILE_NAME, 'w') as f:
        f.writelines(text_generator.gen_text_line(SENTENCES_COUNT, 1, 10))

    log = 'Time: {} s'.format(time.time() - start_time)
    logging.info(template.format(log))
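Example #4 reads its parameters from sys.argv: the input corpus, the output file, and how many sentences to generate. A hypothetical invocation (the script name is a placeholder, since the listing does not show which file this __main__ block lives in):

python generate.py corpus.txt generated.txt 100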
Example #5

def generalFaq(s_num, k_num):
    # Walk k_num - 1 consecutive IMDb IDs starting after s_num and record
    # (movie_id, faq_count) for every title that has at least one FAQ entry.
    cnt = s_num
    x = []
    for _ in range(1, k_num):
        cnt += 1
        movie_id = str(cnt).zfill(7)
        try:
            faq = len(ia.get_movie_faqs(movie_id)['data']['faqs'])
            if faq > 0:
                x.append((movie_id, faq))
        except (RuntimeError, TypeError, NameError, KeyError, IOError,
                AssertionError, IMDbDataAccessError):
            # Titles without FAQ data (or transient access errors) are skipped.
            pass
    return x


print "loading index"
k = corpus.load("data/index_0.json")
s_num = int(k[-1][0])

print "starting scrape loop"
for i in range(1, 600):
    z = generalFaq(s_num, 1000)
    k.extend(z)
    f = "data/index_0.json"
    corpus.save(f, k)
    check.indexFaqCounter(k)
    s_num += 1000
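Example #5 uses several names that are defined elsewhere in the project: the corpus and check helper modules and an ia accessor whose get_movie_faqs() is called inside generalFaq. A hedged sketch of the setup it appears to assume (ia is presumably an IMDbPY accessor; the exact import paths may differ between IMDbPY versions):

# ASSUMPTION: project-local helpers plus IMDbPY; none of this is shown in the listing.
import corpus                                  # load()/save() used above
import check                                   # provides indexFaqCounter (see Example #7)
from imdb import IMDb, IMDbDataAccessError     # import path is an assumption

ia = IMDb()                                    # accessor whose get_movie_faqs() is called above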
Example #6
            sorted(ref))) + "\n\ncurrent:\n" + str(list(
                sorted(cur))) + "\n\nfor input:\n" + "\n".join(
                    map("\t".join, zip(i, x, y)))

    print("Direct test")
    train_conll = "sequoia-corpus.np_conll.train"
    test_conll = "sequoia-corpus.np_conll.test"
    dev_conll = "sequoia-corpus.np_conll.dev"

    nnt = NNTagger()
    # # nnt.train("sequoia-corpus.np_conll.train", verbose=1)
    # nnt.save()
    nnt = NNTagger.load()
    # nnt.model.summary()

    I, X, Y = corpus.extract(corpus.load(dev_conll),
                             columns=("index", "token", "head"))
    dataset = []
    for i, x, y in zip(I, X, Y):
        arcs = set(zip(map(int, y), map(int, i)))
        derivation, last_config = DependencyParser.oracle(arcs)
        if list(sorted(arcs)) != list(sorted(last_config.arcs)):
            #print(str(last_config) +"\n\n\n" + message(i,x,y, arcs ,last_config.arcs))
            continue
        dataset.append((["__ROOT__"] + x, derivation))
    #print(len(X), len(dataset))
    """
	Xtest = corpus.extract_features_for_depency(test_conll)
	XtestIO = list(map(io.StringIO, Xtest))
	XtestD = list(map(DependencyTree.read_tree, XtestIO))
	"""
Example #7

def indexFaqCounter(k):
    # k is a list of (movie_id, faq_count) pairs built by the index scraper.
    cnt = 0
    for i in k:
        cnt += i[1]
    current_id = k[-1][0]
    print("Latest Movie ID: " + str(current_id))
    highest_id = 6799992
    percent = 100 * float(current_id) / float(highest_id)
    print("\n Indexed")
    print("________")
    print("Percentage: " + "{0:.2f}".format(percent) + "%")
    print("Total faqs found: " + str(cnt))
    print("Total Films indexed: " + str(len(k)))


def scrapeFaqChecker(k):
    # k is the scraped FAQ dict (movie ID -> list of FAQ entries).
    cnt = 0
    for key, value in k.items():
        try:
            cnt += len(value)
        except TypeError:
            # Entries without a FAQ list are ignored.
            pass

    print("Total faqs scraped: " + str(cnt))
    print("Total films scraped: " + str(len(k)))


index_file = corpus.load("data/index_0.json")
indexFaqCounter(index_file)
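Only indexFaqCounter is exercised at the bottom of Example #7. scrapeFaqChecker expects the FAQ dict produced by faqScraper in Example #1; a hedged usage sketch, with a hypothetical path since the real output file is never shown:

faq_dict = corpus.load("data/faqs_0.json")     # hypothetical path for faqScraper's output_file
scrapeFaqChecker(faq_dict)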
Example #8
File: main.py  Project: Mespith/NLP
import corpus
import RNN
import numpy as np
import CBOWNS as CBOW
import SkipGram

# vocabulary_size = 8000
# X_train, Y_train, vocab = corpus.parseRNN("../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual", 1000, vocabulary_size)
# model = RNN.RNN(len(vocab) + 1)
# #X_train, Y_train, vocab = corpus.parseRNNFile("number_series_1213.txt", 3)
#
# RNN.train_with_sgd(model, X_train, Y_train)
# model.write_to_file("RNNembeddings.txt", vocab)

# word_to_index, contexts, index_to_word, index_count = corpus.NewContextParse("../../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled", True)
# Load the preprocessed corpus (the argument appears to cap its size), train a
# CBOW model (CBOWNS = CBOW with negative sampling), and write the embeddings out.
word_to_index, contexts, index_to_word, word_count = corpus.load(500000)
foo = CBOW.CBOW(100, 3, word_to_index, contexts, index_to_word, word_count)
CBOW.train_with_sgd(foo, 0.5, 1., 0.01)
foo.writeToFile("CBOW_Wout.txt", "CBOW_Win.txt", "CBOW_labels.txt")

# sg = SkipGram.SkipGram(100, "../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled")
# SkipGram.train_with_sgd(sg, 0.2, 0.1)
# sg.writeToFile('SkipEmbeddings.txt')

# corpus.parseTimon("../Words/1-billion-word-language-modeling-benchmark-master/scripts/training-monolingual/news.2007.en.shuffled", 100000)