Example #1
0
 def __preprocess(self, sentence):
     """Lower-case *sentence*, strip hyperlinks, and return its cleaned
     tokens joined by single spaces.

     If any NLPUtils step raises AssertionError, the phrase is reported
     on stdout and whatever tokens survived up to that point are joined
     and returned instead (possibly an empty string).
     """
     lowered = self.__to_lower(sentence)
     without_links = NLPUtils.remove_hyperlinks(lowered)
     words = []
     try:
         words = NLPUtils.word_tokenization(without_links)
         words = [NLPUtils.punctuation_removal(word) for word in words]
         # Apply the remaining list-level filters in order.
         for transform in (NLPUtils.stopword_elimination,
                           NLPUtils.nonalpha_removal):
             words = transform(words)
     except AssertionError:
         print("Phrase '{}' cannot be preprocessed".format(sentence))
     return " ".join(words)
Example #2
0
def process_raw_dataset(file_path, out_file):
    """Filter a raw app-data CSV down to cleaned English-language rows.

    Each input row is assumed to hold (identifier, free text, a
    comma-separated field, one extra field) -- TODO confirm column
    semantics against the raw dataset.  For every row whose text
    ``langdetect`` classifies as English, the text is sentence-tokenized
    and cleaned (see ``_clean_sentences``); rows with at least one
    surviving sentence are written out as:

        [punctuation-stripped row[0], "%%"-joined sentences,
         "%%"-joined row[2] items, row[3]]

    Rows whose detection or preprocessing raises are skipped, with a
    diagnostic printed (the original silently swallowed all errors).

    :param file_path: path of the raw input CSV (first row is a header)
    :param out_file: path of the cleaned output CSV
    """
    number_of_apps = 0
    with open(file_path) as stream:
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            reader = csv.reader(stream)
            # Copy the header row through unchanged.
            writer.writerow(next(reader))
            start_time = time.time()
            for row in reader:
                # Progress report every 100 rows (fires at 0 as well).
                if number_of_apps % 100 == 0:
                    elapsed_time = time.time() - start_time
                    print("Number of apps processed is {}".format(number_of_apps))
                    print("Elapsed time up to now is {}".format(elapsed_time))
                number_of_apps += 1
                try:
                    if langdetect.detect(row[1]) != u'en':
                        continue
                    sentences = _clean_sentences(row[1])
                    if sentences:
                        writer.writerow([NLPUtils.punctuation_removal(row[0]),
                                         "%%".join(sentences),
                                         "%%".join(row[2].split(",")),
                                         row[3]])
                except Exception as exc:
                    # Best-effort: a row that cannot be detected or
                    # preprocessed is skipped, but no longer silently.
                    print("Skipping row {}: {}".format(number_of_apps, exc))


def _clean_sentences(text):
    """Split *text* into cleaned sentences and return them as a list.

    Each sentence is stripped of hyperlinks, lower-cased, tokenized,
    and run through punctuation/stopword/non-alpha removal; sentences
    with no surviving tokens are dropped.
    """
    sentences = []
    for sentence in NLPUtils.sentence_tokenization(text):
        sentence = NLPUtils.remove_hyperlinks(sentence).lower()
        if not sentence:
            continue
        tokens = NLPUtils.word_tokenization(sentence)
        tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
        tokens = NLPUtils.stopword_elimination(tokens)
        tokens = NLPUtils.nonalpha_removal(tokens)
        if tokens:
            cleaned = " ".join(tokens).rstrip()
            if cleaned:
                sentences.append(cleaned)
    return sentences