def __preprocess(self, sentence):
    """Lowercase a sentence, strip hyperlinks, and reduce it to a
    whitespace-joined string of cleaned tokens."""
    sentence = self.__to_lower(sentence)
    text_wo_link = NLPUtils.remove_hyperlinks(sentence)
    tokens = []
    try:
        tokens = NLPUtils.word_tokenization(text_wo_link)
        tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
        tokens = NLPUtils.stopword_elimination(tokens)
        tokens = NLPUtils.nonalpha_removal(tokens)
    except AssertionError:
        # Leave tokens as whatever was produced before the failure.
        print("Phrase '{}' cannot be preprocessed".format(sentence))
    return " ".join(tokens)
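# A standalone sketch of the same cleaning pipeline, using NLTK as a
# hypothetical stand-in for the project's NLPUtils helpers (assumes the
# "punkt" and "stopwords" NLTK data packages are installed); it is meant
# only to illustrate the stages __preprocess chains together.
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_sketch(sentence):
    """Illustrative mirror of __preprocess: lowercase, drop links,
    tokenize, strip punctuation, remove stopwords and non-alpha tokens."""
    sentence = re.sub(r"https?://\S+", "", sentence.lower())  # remove hyperlinks
    tokens = [t.strip(string.punctuation) for t in word_tokenize(sentence)]
    stop = set(stopwords.words("english"))
    tokens = [t for t in tokens if t and t not in stop]       # drop stopwords/empties
    tokens = [t for t in tokens if t.isalpha()]               # keep alphabetic tokens
    return " ".join(tokens)

# e.g. preprocess_sketch("Check https://example.com for MORE info!")
# -> "check info"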
def process_raw_dataset(file_path, out_file):
    """Read the raw CSV dataset at file_path, keep only rows whose
    description is detected as English, preprocess the description
    sentence by sentence, and write the cleaned rows to out_file."""
    number_of_apps = 0
    with open(file_path) as stream:
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            reader = csv.reader(stream)
            header = next(reader)
            writer.writerow(header)
            start_time = time.time()
            for row in reader:
                # Report progress every 100 rows.
                if number_of_apps % 100 == 0:
                    elapsed_time = time.time() - start_time
                    print("Number of apps processed is {}".format(number_of_apps))
                    print("Elapsed time up to now is {}".format(elapsed_time))
                number_of_apps += 1
                text = row[1]
                try:
                    sentences = []
                    if langdetect.detect(text) == u'en':
                        for sentence in NLPUtils.sentence_tokenization(text):
                            sentence = NLPUtils.remove_hyperlinks(sentence)
                            sentence = sentence.lower()
                            if sentence:
                                tokens = NLPUtils.word_tokenization(sentence)
                                tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
                                tokens = NLPUtils.stopword_elimination(tokens)
                                tokens = NLPUtils.nonalpha_removal(tokens)
                                if tokens:
                                    sentence = " ".join(tokens).rstrip()
                                    if sentence != "":
                                        sentences.append(sentence)
                    if sentences:
                        # Columns: cleaned app name, "%%"-joined sentences,
                        # "%%"-joined categories, label.
                        writer.writerow([NLPUtils.punctuation_removal(row[0]),
                                         "%%".join(sentences),
                                         "%%".join(row[2].split(",")),
                                         row[3]])
                except Exception:
                    # Skip rows whose language cannot be detected or whose
                    # text fails preprocessing.
                    pass
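# A minimal usage sketch, assuming this module imports csv, time, and
# langdetect at the top and that NLPUtils is the project's own helper
# class. The input CSV is assumed to hold rows of (app name, description,
# categories, label); the file names below are hypothetical placeholders.
if __name__ == "__main__":
    process_raw_dataset("raw_apps.csv", "preprocessed_apps.csv")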