def __preprocess(self, sentence):
    """Lowercase a sentence, strip hyperlinks, and reduce it to a
    whitespace-joined string of cleaned tokens."""
    sentence = self.__to_lower(sentence)
    text_wo_link = NLPUtils.remove_hyperlinks(sentence)
    tokens = []
    try:
        tokens = NLPUtils.word_tokenization(text_wo_link)
        tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
        tokens = NLPUtils.stopword_elimination(tokens)
        tokens = NLPUtils.nonalpha_removal(tokens)
    except AssertionError:
        # Leave tokens as whatever was produced before the failure.
        print("Phrase '{}' cannot be preprocessed".format(sentence))
    return " ".join(tokens)
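# A standalone sketch of the same cleaning pipeline, using NLTK as a
# hypothetical stand-in for the project's NLPUtils helpers (assumes the
# "punkt" and "stopwords" NLTK data packages are installed); it is meant
# only to illustrate the stages __preprocess chains together.
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_sketch(sentence):
    """Illustrative mirror of __preprocess: lowercase, drop links,
    tokenize, strip punctuation, remove stopwords and non-alpha tokens."""
    sentence = re.sub(r"https?://\S+", "", sentence.lower())  # remove hyperlinks
    tokens = [t.strip(string.punctuation) for t in word_tokenize(sentence)]
    stop = set(stopwords.words("english"))
    tokens = [t for t in tokens if t and t not in stop]       # drop stopwords/empties
    tokens = [t for t in tokens if t.isalpha()]               # keep alphabetic tokens
    return " ".join(tokens)

# e.g. preprocess_sketch("Check https://example.com for MORE info!")
# -> "check info"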
def process_raw_dataset(file_path, out_file):
    """Read the raw CSV dataset at file_path, keep only rows whose
    description is detected as English, preprocess the description
    sentence by sentence, and write the cleaned rows to out_file."""
    number_of_apps = 0
    with open(file_path) as stream:
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            reader = csv.reader(stream)
            header = next(reader)
            writer.writerow(header)
            start_time = time.time()
            for row in reader:
                # Report progress every 100 rows.
                if number_of_apps % 100 == 0:
                    elapsed_time = time.time() - start_time
                    print("Number of apps processed is {}".format(number_of_apps))
                    print("Elapsed time up to now is {}".format(elapsed_time))
                number_of_apps += 1
                text = row[1]
                try:
                    sentences = []
                    if langdetect.detect(text) == u'en':
                        for sentence in NLPUtils.sentence_tokenization(text):
                            sentence = NLPUtils.remove_hyperlinks(sentence)
                            sentence = sentence.lower()
                            if sentence:
                                tokens = NLPUtils.word_tokenization(sentence)
                                tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
                                tokens = NLPUtils.stopword_elimination(tokens)
                                tokens = NLPUtils.nonalpha_removal(tokens)
                                if tokens:
                                    sentence = " ".join(tokens).rstrip()
                                    if sentence != "":
                                        sentences.append(sentence)
                    if sentences:
                        # Columns: cleaned app name, "%%"-joined sentences,
                        # "%%"-joined categories, label.
                        writer.writerow([NLPUtils.punctuation_removal(row[0]),
                                         "%%".join(sentences),
                                         "%%".join(row[2].split(",")),
                                         row[3]])
                except Exception:
                    # Skip rows whose language cannot be detected or whose
                    # text fails preprocessing.
                    pass
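# A minimal usage sketch, assuming this module imports csv, time, and
# langdetect at the top and that NLPUtils is the project's own helper
# class. The input CSV is assumed to hold rows of (app name, description,
# categories, label); the file names below are hypothetical placeholders.
if __name__ == "__main__":
    process_raw_dataset("raw_apps.csv", "preprocessed_apps.csv")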