Example #1
    def process(self, data):
        """new_dialog: [(a, 1/0), (a,1/0)], new_meta: (a, b, topic), new_utt: [[a,b,c)"""
        """ 1 is own utt and 0 is other's utt"""
        new_dialog = []
        new_meta = []
        new_utts = []
        bod_utt = ["<s>", "<d>", "</s>"]
        all_lenes = []

        for l in data:
            if self.config.use_merge:
                if self.config.merge_type == 'init':
                    lower_utts = [
                        (caller, ["<tag>", "<s>"] +
                         nltk.WordPunctTokenizer().tokenize(utt.lower()) +
                         ["</s>"], feat) for caller, utt, feat in l["utts"]
                    ]
                elif self.config.merge_type == 'last':
                    lower_utts = [
                        (caller, ["<s>"] +
                         nltk.WordPunctTokenizer().tokenize(utt.lower()) +
                         ["</s>", '<tag>'], feat)
                        for caller, utt, feat in l["utts"]
                    ]
            else:
                lower_utts = [
                    (caller, ["<s>"] +
                     nltk.WordPunctTokenizer().tokenize(utt.lower()) +
                     ["</s>"], feat) for caller, utt, feat in l["utts"]
                ]
            all_lenes.extend([len(u) for c, u, f in lower_utts])

            a_age = float(l["A"]["age"]) / 100.0
            b_age = float(l["B"]["age"]) / 100.0
            a_edu = float(l["A"]["education"]) / 3.0
            b_edu = float(l["B"]["education"]) / 3.0
            vec_a_meta = [a_age, a_edu
                          ] + ([0, 1] if l["A"]["sex"] == "FEMALE" else [1, 0])
            vec_b_meta = [b_age, b_edu
                          ] + ([0, 1] if l["B"]["sex"] == "FEMALE" else [1, 0])

            # for the joint model we model both sides of the speakers together: if the caller is A the label is 0, otherwise 1
            meta = (vec_a_meta, vec_b_meta, l["topic"])
            dialog = [(bod_utt, 0, None)
                      ] + [(utt, int(caller == "B"), feat)
                           for caller, utt, feat in lower_utts]

            new_utts.extend([bod_utt] +
                            [utt for caller, utt, feat in lower_utts])
            new_dialog.append(dialog)
            new_meta.append(meta)

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lenes), float(np.mean(all_lenes))))
        return new_dialog, new_meta, new_utts
Example #2
File: conv.py Project: LeyliG/ds4se
 def frombatch_pipeline(self, batch):
     #TODO
     """@batch: a TensorFlow Dataset Batch"""
     pre_process = [
         self.clean_punctuation(doc.decode("utf-8")) for doc in batch
     ]
     logging.info('frombatch_pipeline: clean punctuation')
     pre_process = [self.split_camel_case_token(doc) for doc in pre_process]
     logging.info('frombatch_pipeline: camel case')
     pre_process = [doc.lower() for doc in pre_process]
     logging.info('frombatch_pipeline: lower case')
     pre_process = [doc.strip() for doc in pre_process
                    ]  # Leading whitespace is removed
     logging.info('frombatch_pipeline: white space removed')
     pre_process_tokens = [
         nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process
     ]
     logging.info('frombatch_pipeline: WordPunctTokenizer')
     filtered_tokens = [self.stop_words(doc)
                        for doc in pre_process_tokens]  #Stop Words
     logging.info('frombatch_pipeline: Stop words')
     filtered_tokens = [self.stemmer(doc)
                        for doc in filtered_tokens]  #Filtering Stemmings
     logging.info('frombatch_pipeline: Stemmings')
     filtered_tokens = [self.remove_terms(doc)
                        for doc in filtered_tokens]  #Filtering remove-terms
     logging.info('frombatch_pipeline: Removed special terms')
     #pre_process = [ ' '.join(doc) for doc in filtered_tokens]
     logging.info('frombatch_pipeline [END]')
     return filtered_tokens
Example #3
File: conv.py Project: LeyliG/ds4se
 def fromdocs_pipeline(self, docs):
     #TODO
     """@tokenized_file: a list of tokens that represents a document/code"""
     pre_process = [self.clean_punctuation(doc) for doc in docs]
     logging.info('fromtokens_pipeline: clean punctuation')
     pre_process = [self.split_camel_case_token(doc) for doc in pre_process]
     logging.info('fromtokens_pipeline: camel case')
     pre_process = [doc.lower() for doc in pre_process]
     logging.info('fromtokens_pipeline: lower case')
     pre_process = [doc.strip() for doc in pre_process
                    ]  # Leading whitespace is removed
     logging.info('fromtokens_pipeline: white space removed')
     pre_process_tokens = [
         nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process
     ]
     logging.info('fromtokens_pipeline: WordPunctTokenizer')
     filtered_tokens = [self.stop_words(doc)
                        for doc in pre_process_tokens]  #Stop Words
     logging.info('fromtokens_pipeline: Stop words')
     filtered_tokens = [self.stemmer(doc)
                        for doc in filtered_tokens]  #Filtering Stemmings
     logging.info('fromtokens_pipeline: Stemmings')
     filtered_tokens = [self.remove_terms(doc)
                        for doc in filtered_tokens]  #Filtering remove-terms
     logging.info('fromtokens_pipeline: Removed special terms')
     pre_process = [' '.join(doc) for doc in filtered_tokens]
     logging.info('fromtokens_pipeline END')
     return pre_process
Example #4
def isBulling(text):
    # tokenize words
    wpt = nltk.WordPunctTokenizer()
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    matrix = []
    words = wpt.tokenize(text)
    # remove punctuations
    clean_words = [
        word.lower() for word in words if word not in set(string.punctuation)
    ]
    # remove stop words
    english_stops = nltk.corpus.stopwords.words('english')
    characters_to_remove = [
        "''", '``', "rt", "https", "’", "“", "”", "\u200b", "--", "n't", "'s",
        "...", "//t.c"
    ]
    clean_words = [word for word in clean_words if word not in english_stops]
    clean_words = [
        word for word in clean_words if word not in set(characters_to_remove)
    ]
    # Lemmatise words
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words]
    s = ''
    for y in lemma_list:
        s = s + ' ' + y
    s = re.sub('[^A-Za-z0-9" "]+', '', s)
    s = s.lstrip()
    matrix.append(s)
    re_tv_matrix = tv.transform(matrix)
    re_tv_matrix = re_tv_matrix.toarray()
    OUTPUT = svm.predict(re_tv_matrix)
    SEVERITY = svm.predict_proba(re_tv_matrix)
    # print(re_tv_matrix)
    return SEVERITY[0][0]
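Note: isBulling() above relies on two module-level objects, tv and svm, that the snippet never defines. Below is a minimal, hypothetical sketch of how they might be set up; the corpus, labels and model parameters are placeholders, not the original ones.

# Hypothetical setup for the tv / svm globals used by isBulling().
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

train_texts = ["you are so stupid", "have a nice day"]   # toy corpus
train_labels = [1, 0]                                    # toy labels; the real label scheme is not shown

tv = TfidfVectorizer()
train_matrix = tv.fit_transform(train_texts).toarray()

svm = SVC(probability=True)   # probability=True enables predict_proba
svm.fit(train_matrix, train_labels)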
Example #5
def BOW(sentence):
    WPT = nltk.WordPunctTokenizer()
    #nltk.download('stopwords')
    #nltk.download('punkt')
    #nltk.download('wordnet')

    lemmatizer = WordNetLemmatizer()
    stop_word_list = nltk.corpus.stopwords.words('english')
    '''Remove numbers and special characters in sentence'''
    sentence = re.sub(" \d+", " ", sentence)  #digits
    sentence = re.sub(r'[0-9]+', "", sentence)  #digits
    sentence = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", sentence)
    pattern = r"[{}]".format("-_)(,;:$%#.")  #special characters
    sentence = re.sub(pattern, "", sentence)
    sentence = re.sub(r"[\']", "", sentence)
    sentence = re.sub(r"[/']", "", sentence)
    sentence = re.sub(r'\b[a-zA-Z]\b', '',
                      sentence)  #remove single letter words
    sentence = re.sub('\s+', ' ', sentence).strip()  #remove double spaces
    '''Lowercase'''
    sentence = sentence.lower()
    sentence = sentence.strip()
    '''Tokenize'''
    tokens = WPT.tokenize(sentence)
    filtered_tokens = [
        token for token in tokens if token not in stop_word_list
    ]
    '''Lemmatize'''
    k = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    sentence = ' '.join(k)
    return sentence
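A usage sketch for BOW(); it assumes nltk, re and WordNetLemmatizer are imported at module level and that the NLTK stopwords and wordnet corpora are available.

# Hypothetical usage of BOW(); the result is a lowercased, stopword-free,
# lemmatized string with digits and the listed special characters stripped.
import re
import nltk
from nltk.stem import WordNetLemmatizer

print(BOW("The 3 cats were running across 42 gardens, quickly!"))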
Example #6
    def __init__(self):
        self.tokenizer = nltk.WordPunctTokenizer()  # nltk.RegexpTokenizer("[\w]", flags=re.UNICODE)
        self.stopwords = self.getStopWordList('./frame/stop-words-english4.txt')
        #https://gist.github.com/alexbowe/
        self.sentence_re = r'''(?x)
              ([A-Z])(\.[A-Z])+\.?
            | \w+(-\w+)*
            | \$?\d+(\.\d+)?%?
            | \.\.\.
            | [][.,;"'?():-_`]
        '''
        # Grammar from this paper http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/PAPERS/pdf/PAPERS065.pdf
        self.grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # nouns and adjectives, terminated with a noun

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}
        """
        self.chunker = nltk.RegexpParser(self.grammar)
        self.toks = ""
        self.postoks = ""
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.stemmer = nltk.stem.porter.PorterStemmer()
        self.tree = ""
Example #7
def filter_text_with_marks(text):
    text = text.replace('"', "'").replace('\\', '')
    # this is risky, but it fixes a particularly annoying case:
    # text = text.replace('**', '*')
    puncts = re.escape(string.punctuation)
    text = re.sub(r'([%s])(\*)' % puncts, r'\g<1> \g<2>', text)
    text = re.sub(r'(\*)([%s])' % puncts, r'\g<1> \g<2>', text)
    tokenizer = nltk.WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    new_text = []
    new_marks = []
    b = False
    for i in range(len(tokens)):
        tk_i = tokens[i]
        if tk_i != '*':
            if '*' in tk_i:
                print(tk_i, tokens)
            new_text.append(tk_i)
        if tk_i == '*' and b is False:
            b = True
        elif tk_i == '*' and b is True:
            b = False
        if tk_i == '*' and b is True:
            new_marks.append(len(new_text))
    new_text = ' '.join(new_text).replace('*', '')
    return new_text, new_marks
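A usage sketch for filter_text_with_marks(); the asterisk pairs are assumed to delimit marked spans, which the function strips while recording where each span starts.

# Hypothetical usage: '*...*' delimits a marked span; the returned indices
# point at the token where each marked span begins.
import re
import string
import nltk

clean, marks = filter_text_with_marks("She said *thank you* and left.")
print(clean)   # "She said thank you and left ."
print(marks)   # [2]  -> the span starts at token index 2 ("thank")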
Example #8
    def process(self, data):
        """ TODO """
        new_dialog = []
        new_meta = []
        new_utts = []
        bod_utt = ["<s>", "<d>",
                   "</s>"]  # TODO what do we do about topic for this?
        all_lenes = []

        for l in data:
            lower_utts = [
                (["<s>"] + nltk.WordPunctTokenizer().tokenize(utt.lower()) +
                 ["</s>"]) for utt in l
            ]  # for utt in l['utts']]
            all_lenes.extend([len(u) for u in lower_utts])

            # dialog = [(bod_utt, 0)] + [(utt, int(ind==len(lower_utts)-2)) for ind, utt in enumerate(lower_utts)]
            dialog = [(utt, int(ind == len(lower_utts) - 2))
                      for ind, utt in enumerate(lower_utts)]

            new_utts.extend([bod_utt] + lower_utts)
            new_dialog.append(dialog)

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lenes), float(np.mean(all_lenes))))
        return new_dialog, new_utts
Example #9
def nltk_hacking(text_file=OUTPUT_FILE):
    with open(text_file) as fh:
        f = fh.read()
    f = clean(f)
    f = unescape(f)
    text = nltk.WordPunctTokenizer().tokenize(f)
    txt = nltk.Text(text)
    return txt
Example #10
    def transform(self, X, y=None):
        wpt = nltk.WordPunctTokenizer()
        tokenized_corpus = [
            wpt.tokenize(document) for document in X['normalize_review_text']
        ]

        # Set values for various parameters
        feature_size = 100  # Word vector dimensionality
        window_context = 5  # Context window size
        min_word_count = 2  # Minimum word count
        sample = 1e-3  # Downsample setting for frequent words

        self.word2vec = Word2Vec(tokenized_corpus,
                                 size=feature_size,
                                 window=window_context,
                                 min_count=min_word_count,
                                 sample=sample,
                                 iter=10)

        vocabulary = set(self.word2vec.wv.index2word)
        features = [
            self._col_transform(tokenized_sentence,
                                self.word2vec,
                                vocabulary,
                                num_features=100)
            for tokenized_sentence in tokenized_corpus
        ]
        return np.array(features)
Example #11
def clean_data(train_data):
    with open("stopwords.txt", 'r') as f:
        stopwords = f.read().split("\n")
    w_punc = nltk.WordPunctTokenizer()
    stopwords.append('br')
    stopwords = [re.sub(r'[^\w\s\d]', '', sw.lower()) for sw in stopwords]
    vocab_dict = []
    vocab_set = set()
    for idx, review in enumerate(train_data):
        review = review.decode('utf-8')
        review = review.replace('\n', ' ')
        review = re.sub(" \d+", " ", review)
        pattern = r"[{}]".format("-?!,.;:/<>'\(\)\"\"")
        review = re.sub(pattern, " ", review)
        review = review.lower()
        review = review.strip()
        tokens = w_punc.tokenize(review)
        filtered_tokens = [token for token in tokens if token not in stopwords]
        review = ' '.join(filtered_tokens)
        train_data[idx] = review
        word_list = review.split(" ")
        word_dict = {}
        for w in word_list:
            vocab_set.add(w)
            word_dict[w] = word_list.count(w)  # count whole-word occurrences, not substrings

        vocab_dict.append(word_dict)
    return train_data, vocab_dict, vocab_set
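A self-contained driver for clean_data(); the stopword file and byte-string reviews below are toy stand-ins for the real inputs the snippet appears to expect (it assumes re and nltk are imported at module level).

# Hypothetical driver for clean_data(); toy stopword file and reviews only.
import re
import nltk

with open("stopwords.txt", "w") as f:
    f.write("the\nand\na\n")

reviews = [b"The movie was great!<br />A must see.", b"Plot and acting were weak."]
cleaned, vocab_dicts, vocab = clean_data(reviews)
print(cleaned[0])          # roughly: "movie was great must see"
print(sorted(vocab)[:5])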
Example #12
def main():
    _file = input(
        "Enter the file name for the words/synonyms you want to find: ")
    read = open(_file, 'r')
    real_file = read.read()
    #print (real_file)
    count = 0
    _word = input("Enter the word that you want to search or find: ")
    new_corp = nltk.WordPunctTokenizer().tokenize(real_file)
    #_pos = nltk.pos_tag(nltk.word_tokenize(_word))
    #print (_pos)
    #thing = nltk.wsd.lesk (lang_,_word)
    #print (thing)
    #print(new_corp)

    for x in new_corp:
        #print(x)
        for _syns in wordnet.synsets(_word):
            for i in _syns.lemmas():
                print(i.name())
                if x == i.name() or x == _word:
                    if i.name() == _word:
                        break
                    else:
                        if x == _word:
                            count += 1
                            continue
                        print("Your corpus has listed", count, "of the word",
                              _word, "and it's synonyms")

Example #13
    def normalization_data(self):
        import nltk
        nltk.download('stopwords')
        import re
        import numpy as np

        WPT = nltk.WordPunctTokenizer()
        stop_word_list = nltk.corpus.stopwords.words('turkish')
        from TurkishStemmer import TurkishStemmer
        stemmer = TurkishStemmer()

        yorumlar = []
        for i in range(0, len(self.df)):

            yorum = re.sub(
                "[^AaBbCcÇçDdEeFfGgĞğHhİiIıJjKkLlMmNnOoÖöPpRrSsŞşTtUuÜüVvYyZz']",
                ' ', self.df['text'][i])  # keep only Turkish letters and apostrophes
            yorum = re.sub("[']", '', yorum)  # drop apostrophes
            yorum = yorum.lower()
            yorum = yorum.strip()
            yorum = yorum.split()

            yorum = [
                stemmer.stem(word) for word in yorum
                if word not in stop_word_list
            ]
            yorum = ' '.join(yorum)

            yorumlar.append(yorum)
        # print(yorumlar)
        return yorumlar
Example #14
    def normalize(cls, text):
        # eliminate non-word characters
        text = re.sub("[^\w\s]","",text)

        # replace unicode character in "Decor"
        pattern = re.compile(u'\uFFFD', re.UNICODE)
        text = re.sub(pattern, "e", text)

        # Exceptions - handling some phrases separately
        #TODO: do this more elegantly
        # Exclude "Home Theater", put it together with electronics
        text = text.replace("Home Theater", "Electronics")
        # Exclude "Cloud Player for Home"
        text = text.replace("Cloud Player for Home","Cloud Player")
        text = text.replace("Home & Portable Audio", "Portable Audio")
        #print text

        stopset = set(stopwords.words('english'))  # ["and", "the", "&", "for", "of", "on", "as", "to", "in"]
        stemmer = nltk.PorterStemmer()
        tokens = nltk.WordPunctTokenizer().tokenize(text)
        clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 0]
        final = [stemmer.stem(word) for word in clean]

        # Fill tokens and stems dictionaries
        for token in tokens:
            cls.upper[token.lower()] = token

        for index in range(len(final)):
            if final[index] not in cls.stems:
                cls.stems[final[index]] = cls.upper[clean[index]]

        return final
Example #15
 def __init__(self, data: pd.DataFrame, feature_column: str, target_column: str, cls2idx, max_len=128):
     self.tokenizer = nltk.WordPunctTokenizer()
     self.data = data
     self.feature_column = feature_column
     self.target_column = target_column
     self.max_len = max_len
     self.cls2idx = cls2idx
Example #16
    def preprocess_text(self, document):
        document = re.sub(r'\W', ' ', str(document))

        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        document = re.sub(r'\s+', ' ', document, flags=re.I)

        document = re.sub(r'^b\s+', '', document)

        document = document.lower()

        tokens = document.split()
        tokens = [self.stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        word_punctuation_tokenizer = nltk.WordPunctTokenizer()
        word_tokenized_corpus = word_punctuation_tokenizer.tokenize(
            preprocessed_text)

        return word_tokenized_corpus
Example #17
File: yelp.py Project: onenoc/quati
 def create_fields_tuples():
     tokenizer = nltk.WordPunctTokenizer()
     fields_tuples = [
         ('words', WordsField(tokenize=tokenizer.tokenize)),
         ('target', TagsField())
     ]
     return fields_tuples
Example #18
def text_preprocess(data):
    tokenizer = nltk.WordPunctTokenizer()
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    comp = re.compile('[^A-Z^a-z ]')

    for i in range(len(data)):
        data_item = data[i]

        if data_item is None:
            x = 1

        # Clean by a regular expression
        data_item = comp.sub('', data_item)

        # Split into words
        data_item = tokenizer.tokenize(str(data_item).lower())

        # Remove stop words
        data_item = [word for word in data_item if word not in stop_words]

        # Stemming
        for w_idx in range(len(data_item)):
            data_item[w_idx] = porter_stemmer.stem(data_item[w_idx])

        data[i] = ' '.join(data_item)

    return data
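A usage sketch for text_preprocess(); it assumes re, nltk, PorterStemmer and the NLTK stopwords corpus are available at module level, as the snippet implies.

# Hypothetical usage of text_preprocess(); the function cleans, lowercases,
# removes stop words and stems each document, modifying the list in place.
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

docs = ["Running faster than ever!", "The 2 cats were sleeping..."]
print(text_preprocess(docs))   # e.g. ['run faster ever', 'cat sleep']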
Example #19
def convert_kb(data_dir, data_type, vocab_dict, pad_id=0, unk_id=2):
    kb_text_file_path = os.path.join(data_dir, data_type + "_kb_text.txt")
    kb_out_text_file_path = os.path.join(data_dir, data_type + "_kb_text.pkl")
    kb_out_len_file_path = os.path.join(data_dir,
                                        data_type + "_kb_text_len.pkl")
    kb_out_file_path = os.path.join(data_dir, data_type + "_kb_text_both.pkl")
    wpt = nltk.WordPunctTokenizer()

    kb_text_file = open(kb_text_file_path, 'r')
    kb_utterance = []
    kb_seq_len = []
    for utterance in kb_text_file:
        utterance = utterance.strip()
        utterance_words = wpt.tokenize(utterance)
        utterance_word_ids = []
        for word in utterance_words:
            if word in vocab_dict:
                word_id = vocab_dict[word]
            elif word == '':
                word_id = pad_id  #Corner case
            else:
                word_id = unk_id
            utterance_word_ids.append(word_id)
        length_utterance, utterance_word_ids = pad_or_clip_utterance(
            utterance_word_ids)
        kb_utterance.append(utterance_word_ids)
        kb_seq_len.append(length_utterance)
    kb_text_file.close()
    # length_utterance, utterance_word_ids = pad_or_clip_utterance(utterance_word_ids)
    kb_corpora = [kb_seq_len, kb_utterance]

    with open(kb_out_file_path, 'wb') as f:
        pkl.dump(kb_corpora, f, protocol=pkl.HIGHEST_PROTOCOL)
Example #20
def remove_stop_words(doc):
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words('english')
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens).lower()
    return doc
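A quick check of remove_stop_words(), assuming nltk is imported and the stopwords corpus is downloaded. Note that the text is only lowercased after the filtering, so capitalized stop words such as "The" slip through.

# Hypothetical usage; lowercasing happens after stop-word removal,
# so the leading capitalized "The" is kept.
import nltk

print(remove_stop_words("The quick brown fox jumps over the lazy dog"))
# "the quick brown fox jumps lazy dog"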
Example #21
    def _normalize_document(self, sentence):
        wlp = nltk.WordPunctTokenizer()

        stopword_list = stopwords.words('english')
        stopword_list.remove('not')
        stopword_list.remove('no')

        word_list = set(words.words())  # set for fast membership checks

        lemmatizer = nltk.WordNetLemmatizer()

        # lower case and remove special characters\whitespaces
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence, flags=re.I | re.A)
        sentence = sentence.strip().lower()

        # tokenizing
        sentence = wlp.tokenize(sentence)

        # lemmatizing
        sentence = [lemmatizer.lemmatize(x) for x in sentence]

        #  removing stop words
        sentence = [x for x in sentence if x.lower() not in stopword_list and x.lower() in word_list]

        return ' '.join(sentence)
Example #22
    def process(self, data):
        new_dialog = []
        new_meta = []
        new_utts = []
        all_lens = []
        new_persona = []
        new_persona_word = []

        for l in data:
            lower_utts = [
                (caller, ["<s>"] +
                 nltk.WordPunctTokenizer().tokenize(utt.lower()) + ["</s>"],
                 feat) for caller, utt, feat in l["utts"]
            ]
            all_lens.extend([len(u) for c, u, f in lower_utts])
            vec_a_meta = [0, 0] + [0, 0]
            vec_b_meta = [0, 0] + [0, 0]
            meta = (vec_a_meta, vec_b_meta, 'NULL')
            dialog = [(utt, int(caller == "A"), feat)
                      for caller, utt, feat in lower_utts]
            new_utts.extend([utt for caller, utt, feat in lower_utts])
            new_dialog.append(dialog)
            new_meta.append(meta)
            new_persona.append([(p.split(' ')) for p in l['persona']])
            new_persona_word.append(l['persona_word'])

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lens), float(np.mean(all_lens))))
        return new_dialog, new_meta, new_utts, new_persona, new_persona_word
Example #23
	def __init__(self, context_size, max_images, user_start_id, user_end_id,
				 sys_start_id, sys_end_id, unk_id, pad_id, start_id, end_id, 
				 cutoff=-1):
		logging.basicConfig(level=logging.INFO)
		self.logger = logging.getLogger('prepare_data_for_hred')
		self.context_size = context_size
		self.max_images = max_images
		self.unk_id = unk_id
		self.pad_id = pad_id
		self.user_start_id = user_start_id
		self.user_end_id = user_end_id
		self.sys_start_id = sys_start_id
		self.sys_end_id = sys_end_id
		self.start_id = start_id
		self.end_id = end_id		
		self.user_start_sym = '<u>'
		self.user_end_sym = '</u>'
		self.sys_start_sym = '<s>'
		self.sys_end_sym = '</s>'
		self.pad_symbol = '<pad>'
		self.unk_symbol = '<unk>'
		self.start_sym = '<sos>'
		self.end_sym = '<eos>'	
		self.cutoff = cutoff # Vocab frequency cutoff
		self.dir_path = None
		self.data_type = None
		self.vocab_file = None
		self.vocab_dict = None
		self.word_counter = None
		self.context_text_file = None
		self.context_image_file = None
		self.user_state_file = None
		self.target_text_file = None
		self.wpt = nltk.WordPunctTokenizer()
Example #24
    def process(self, data):
        """new_dialog: [(a, 1/0), (a,1/0)], new_utt: [[a,b,c)"""
        """ 1 is own utt and 0 is other's utt"""
        new_dialog = []
        new_utts = []
        bod_utt = ["<s>", "<d>", "</s>"]  # indicator of a start of a dialog
        all_lenes = []

        for l in data:
            lower_utts = [["<s>"] +
                          nltk.WordPunctTokenizer().tokenize(utt.lower()) +
                          ["</s>"] for utt in l.split('__eou__')[:-1]]
            all_lenes.extend([len(u) for u in lower_utts])

            dialog = [(bod_utt, 0)]
            floor = 1
            for utt in lower_utts:
                floor = floor + 1
                dialog = dialog + [(utt, int(floor % 2 == 0))]
            new_utts.extend([bod_utt] + [utt for utt in lower_utts])
            new_dialog.append(dialog)

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lenes), float(np.mean(all_lenes))))
        return new_dialog, new_utts
Example #25
    def prepare_mul_ref(self, dir='./data/test_multi_ref.json'):
        import json
        import nltk

        with open(dir) as f:
            mul_raws = json.load(f)

        self.mul_ref_outs = []
        self.mul_ref_das = []
        for raw in mul_raws:
            raw_outs = raw['responses']
            raw_das = raw['resp_dialog_acts']

            out_tokens = [["<s>"] + nltk.WordPunctTokenizer().tokenize(raw_out.lower()) + ["</s>"] for raw_out in raw_outs]
            out_ids = [[self.rev_vocab.get(t, self.rev_vocab["<unk>"]) for t in line] for line in out_tokens]
            tmp_outs = []
            for out_id in out_ids:
                if len(out_id) >= 40:
                    tmp_outs.append(np.array(out_id[0:39] + [out_id[-1]]))
                else:
                    tmp_outs.append(np.array(out_id))
            tmp_das = [self.rev_da_vocab[raw_da] for raw_da in raw_das]
            
            assert(len(tmp_das) == len(tmp_outs))
            self.mul_ref_outs.append(tmp_outs)
            self.mul_ref_das.append(tmp_das)
Example #26
def spell_correction(document, vocab):  # with suggestions
    with open(vocab, 'rb') as f:
        vocab = pickle.load(f)

    with open(document, 'rb') as f:
        rawtext = pickle.load(f)

    tokens = nltk.WordPunctTokenizer().tokenize(rawtext)
    tokens = [x for x in tokens if x]
    #print (tokens)
    #print(vocab)

    wrongwords = [word for word in tokens if word not in vocab]
    #print (wrongwords)
    error_location = [i for i, tok in enumerate(tokens) if tok in wrongwords]
    #print(wrongwords)

    suggestions = {}
    mindistances_sugg = {}

    bestmatch = {}
    for word in wrongwords:
        mindistances = []
        mindistances_word = []
        for v in vocab:

            if edit_distance(word, v) <= 4:
                mindistances.append(v)
                mindistances_word.append(edit_distance(word, v))

        suggestions[word] = mindistances
        try:
            mindistances_sugg[word] = min(mindistances_word)
        except ValueError:  # no candidate within edit distance 4
            pass
    #print(mindistances_word)

    for word in wrongwords:
        dist = mindistances_sugg[word]
        key = [x for x in suggestions[word] if edit_distance(word, x) == dist]
        bestmatch[word] = key

        #print(bestmatch)

    big_table = list(zip(wrongwords, error_location, bestmatch.values()))

    df = pd.DataFrame(big_table)
    df.columns = ["wrongspellings", "location", "correction"]
    #print (error_location)
    #print(big_table)

    for word in wrongwords:
        print(word + "....?", "did you mean .........", suggestions[word])
        print(
            "best match ......", bestmatch[word],
            "please do humanity a favor and go to school boy ....................."
        )

    return df
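A hypothetical driver for spell_correction(); it first pickles a toy vocabulary and document, since the function expects pickle files on disk, and it assumes pandas (as pd), nltk and nltk.metrics.edit_distance are imported at module level as the snippet implies.

# Hypothetical driver: pickle a toy vocabulary and document, then run the checker.
import pickle

with open("vocab.pkl", "wb") as f:
    pickle.dump(["hello", "world", "spelling", "correction"], f)
with open("doc.pkl", "wb") as f:
    pickle.dump("helo wrld speling corection", f)

df = spell_correction("doc.pkl", "vocab.pkl")
print(df)   # wrongspellings / location / correction table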
Example #27
 def __init__(self, word2vec, data, labels, train=True, max_length=32):
     self.data = data
     self.labels = labels
     self.tokenizer = nltk.WordPunctTokenizer()
     self.max_length = max_length
     self.vocab = set(
         self.tokenizer.tokenize(" ".join(d for d in self.data)))
     self.word2idx = {word: idx + 1 for idx, word in enumerate(self.vocab)}
Example #28
 def __init__(self, list_content: list):
     self.model = KeyedVectors.load_word2vec_format(
         './nlp/GoogleNews-vectors-negative300.bin', binary=True)
     self.stop_words = set(stopwords.words('english'))
     self.list_content = list_content
     self.wpt = nltk.WordPunctTokenizer()
     self.word2weight = None
     self.corpus = []
Example #29
File: snli.py Project: onenoc/quati
 def create_fields_tuples():
     # note that words and words_hyp share the same field, therefore when we
     # call words_field.build_vocab() we are creating a shared vocab.
     # Hence, words.vocab == words_hyp.vocab
     tokenizer = nltk.WordPunctTokenizer()
     words_field = WordsField(tokenize=tokenizer.tokenize)
     fields_tuples = [('words', words_field), ('words_hyp', words_field),
                      ('target', TagsField())]
     return fields_tuples
Example #30
 def create_fields_tuples():
     # if you choose tokenizer='spacy', please install the en package:
     # python3 -m spacy download en
     tokenizer = nltk.WordPunctTokenizer()
     # tokenizer = nltk.TreebankWordTokenizer()
     fields_tuples = [('words',
                       fields.WordsField(tokenize=tokenizer.tokenize)),
                      ('target', fields.TagsField())]
     return fields_tuples
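Finally, a stand-alone illustration of the tokenizer every example above relies on: nltk.WordPunctTokenizer splits text into alternating runs of alphanumeric and non-alphanumeric characters (regex \w+|[^\w\s]+), so contractions and punctuation come apart.

# Minimal demonstration of WordPunctTokenizer on its own.
import nltk

tok = nltk.WordPunctTokenizer()
print(tok.tokenize("Can't stop, won't stop!"))
# ['Can', "'", 't', 'stop', ',', 'won', "'", 't', 'stop', '!']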