Example #1
    def __init__(self, vocab, data_path, history_len, batch_size=1, overlap=False, progress=False, fixed_length=True,
                 target_vector=False, _just_test=False, shuffle=True, max_words=999, min_words=0):
        """
        Generate data for training with RNN
        :type vocab: nlpy.lm.Vocab
        :type data_path: str
        :param history_len: if this value is -1, then one chunk is a sentence
        :type history_len: int
        :type target_vector: bool
        """
        self._vocab = vocab
        self._target_vector = target_vector
        self._just_test = _just_test
        self.history_len = history_len
        self.batch_size = batch_size
        self.minibatch_mode = not (batch_size == 1)
        self.fixed_length = fixed_length
        self.progress = progress
        self.overlap = overlap
        self.shuffle = shuffle

        self.sentences = []

        # Treat each sentence as a chunk, wrapped with sentence-boundary indices
        for line in LineIterator(data_path):
            sequence = [vocab.sent_index]
            # Approximate word count via spaces; skip sentences outside [min_words, max_words]
            wc = line.count(" ") + 1
            if wc < min_words or wc > max_words:
                continue
            for w in line.split(" "):
                sequence.append(vocab.index(w))
            sequence.append(vocab.sent_index)
            self.sentences.append(sequence)
        logging.info("%d sentences loaded from %s" % (len(self.sentences), data_path))
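A minimal usage sketch of the constructor above. The class name RNNDataGenerator and the file name train.txt are assumptions (the snippet only shows __init__); Vocab comes from the nlpy.lm type named in the docstring, and a load method like the one in Example #5 below is assumed for building it.

# Hypothetical usage sketch: class name and file paths are assumptions;
# only the keyword arguments come from the signature above.
from nlpy.lm import Vocab

vocab = Vocab()
vocab.load("train.txt")            # build the vocabulary first (see Example #5)
generator = RNNDataGenerator(      # assumed name for the class defining this __init__
    vocab, "train.txt",
    history_len=-1,                # -1: treat each whole sentence as one chunk
    batch_size=1,
    shuffle=True)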
Example #2
    def _load_source(self):
        tokenizer = NLTKEnglishTokenizer()
        counter = Counter()
        # Count lowercased token frequencies over the source file
        for l in LineIterator(self.source):
            counter.update(map(str.lower, tokenizer.tokenize(l)))

        self._freqmap = dict(counter.items())
        # Reference frequency: twice the mean token frequency
        self._maxfreq = sum(self._freqmap.values()) * 2 / len(self._freqmap)
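As a quick check on that last line with made-up numbers: if the counter held {"the": 100, "cat": 20, "sat": 30}, the sum is 150 over 3 entries, so _maxfreq = 150 * 2 / 3 = 100, i.e. twice the mean frequency (integer division, since the surrounding code is Python 2).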
Example #3
 def _build_data(self, path):
     data = []
     for l in LineIterator(path):
         l = l.lower()
         # Convert the lowercased line into vocabulary indices; skip lines that convert to nothing
         word_data = self.vocab.convert(l)
         if len(word_data) == 0:
             continue
         data.append([word_data])
     return data
Example #4
 def _load_frequency(self):
     self._maxfreq = 3000
     self._freqmap = {}
     # Each line is "frequency<TAB>word"; keep only entries at or below the cap
     for line in LineIterator(_FREQ_DATA_PATH):
         freq, word = line.split("\t")
         freq = int(freq)
         if freq > self._maxfreq:
             continue
         self._freqmap[word] = freq
Example #5
 def load(self, path, fixed_size=-1):
     logging.info("load data from %s" % path)
     if fixed_size > 0:
         self._load_fixed_size(path, fixed_size)
         return
     for line in LineIterator(path):
         words = line.split(" ")
         for w in words:
             self.add(w)
     logging.info("vocab size: %d" % self.size)
Example #6
 def _load_fixed_size(self, path, fixed_size):
     from collections import Counter
     logging.info("fixed size: %d" % fixed_size)
     counter = Counter()
     for line in LineIterator(path):
         words = line.split(" ")
         counter.update(words)
     # Keep only the fixed_size most frequent words
     for w, _ in counter.most_common(fixed_size):
         self.add(w)
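For reference, Counter.most_common(n) returns the n highest-count (word, count) pairs, which is what limits the vocabulary to fixed_size entries. A standalone standard-library check, unrelated to nlpy:

from collections import Counter
c = Counter("a a b a b c".split())
print c.most_common(2)   # [('a', 3), ('b', 2)]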
Example #7
File: recase.py  Project: Satssuki/nlpy
 def __init__(self):
     """
     Initialize recase map.
     """
     self._recase_map = {}
     # Scan the frequency file; the first cased form seen for each lowercased word wins
     for line in LineIterator(_FREQ_DATA_PATH):
         _, word = line.split("\t")
         low_word = word.lower()
         if low_word not in self._recase_map:
             self._recase_map[low_word] = word
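A small illustration of what the resulting map is used for; the entries are made up, since the real contents depend on _FREQ_DATA_PATH:

# Hypothetical contents mirroring the shape of self._recase_map
recase_map = {"london": "London", "nlp": "NLP"}
print recase_map.get("london", "london")   # -> London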
Example #8
 def _build_data(self, path):
     data = []
     for l in LineIterator(path):
         l = l.lower()
         # Keep only characters that appear in the known character set
         chars = filter(lambda x: x in self.chat_set, l)
         if not chars:
             continue
         # Map characters to ids, reserving 0 as the boundary marker added on both ends
         char_ids = [self.chat_set.index(c) + 1 for c in chars]
         char_ids = [0] + char_ids + [0]
         # One-hot encode each id as a vector of length input_size
         word_data = [
             np.eye(1, M=self.input_size, k=c)[0] for c in char_ids
         ]
         data.append([word_data])
     return data
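The np.eye(1, M=..., k=c)[0] expression above takes the single row of a 1 x M matrix whose 1 sits in column c, i.e. a one-hot vector of length M. A standalone check using only numpy, nothing nlpy-specific:

import numpy as np
print np.eye(1, M=5, k=2)[0]   # [ 0.  0.  1.  0.  0.]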
Example #9
    def serve(param):
        from nlpy.util import external_resource
        from nlpy.util import LineIterator
        import urllib2
        global semantic_searcher
        # Lazily build the searcher on the first request and keep it as a module-level global
        if "semantic_searcher" not in globals():
            print "Loading searcher ..."
            data = LineIterator(
                external_resource("general/elementary_questions.txt"))
            semantic_searcher = SemanticSearcher()
            semantic_searcher.load_data(data)

        # Results the client has already seen, passed URL-encoded and " ||| "-separated
        caches = set()
        if "caches" in param:
            caches = set(urllib2.unquote(param["caches"]).split(" ||| "))
        print caches
        output = ""
        # Return the first search result the client has not already seen
        for _, result in semantic_searcher.searchMany(
                param['input'].decode('utf-8')):
            if result not in caches:
                output = result
                break
        return {"output": output}
Example #10
 def _load_ranking(self):
     self._rank_list = []
     for l in LineIterator(_MASSIVE_WORD_LIST):
         self._rank_list.append(l)
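All ten examples drive LineIterator(path) the same way: iterate over a text file and get back one line per step, apparently with the trailing newline already stripped (the bare split(" ") / split("\t") calls would otherwise leave "\n" inside words). A rough stand-in with that behaviour, written here as a sketch and not the actual nlpy.util implementation:

class LineIterator(object):
    """Sketch only: yield each line of a text file, newline stripped."""
    def __init__(self, path):
        self._path = path

    def __iter__(self):
        with open(self._path) as f:
            for line in f:
                yield line.rstrip("\n")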