import re

from gensim.parsing import preprocessing as preprocess
from gensim.parsing.porter import PorterStemmer

# assumed: the original module used a Porter stemmer; gensim's implementation
# is used here
stemmer = PorterStemmer()


def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases
    everything, strips extra spaces, punctuation and non-alphanumeric
    symbols. Also performs stemming.

    input:
        text: a string
    returns:
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)  # get rid of short words
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # strip http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # strip non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text
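# A minimal usage sketch for the function above, not part of the original
# snippet. It assumes gensim is installed; the sample tweet is invented.
if __name__ == '__main__':
    sample = "Check out https://example.com !!! 42 cats are GREAT"
    # the URL, the numbers and the stopwords are stripped; the remaining
    # words are lowercased and stemmed
    print(preprocessing(sample))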
from gensim.parsing import preprocessing as proc
from gensim.utils import deaccent, to_unicode

# non_plain_re: precompiled module-level regex matching urls, @usernames,
# #tags, emojis and numbers (defined elsewhere in the original module)


def remove_non_plain(document):
    """
    Replaces urls, @usernames, #tags, emojis and numbers with a ' ' (space).
    Also removes accents and punctuation, then removes redundant whitespace
    and lowercases all characters.

    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
import re

import contractions
from gensim.parsing import preprocessing


def clean_text(text):
    """
    Cleans the text in the only argument in various steps.

    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string
    """
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_non_alphanum(text))
    return text
import csv

from gensim.parsing.preprocessing import (remove_stopwords, strip_non_alphanum,
                                          strip_numeric, strip_short)


def import_data(file, row_content, x):
    content = []
    # use a context manager so the file handle is closed (the original left
    # it open)
    with open(file, 'r') as content_1:
        csv_reader = csv.reader(content_1)
        for row in csv_reader:
            row_new = remove_stopwords(row[row_content])
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            content.append(row_new)
    # every document read from this file gets the same label x
    label = [x] * len(content)
    return content, label
def get_text(self):
    """
    Get the pre-processed text extracted from the Notebook '#TEXT' section

    Returns
    -------
    text : str
        Extracted text
    """
    lines = self.read().splitlines()
    raw_text = ' '.join(
        [line for line in lines[1:lines.index('#FIGURES')]])
    text = strip_non_alphanum(strip_punctuation(raw_text.lower()))
    return text
import re

from gensim.parsing.preprocessing import (remove_stopwords,
                                          strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_numeric,
                                          strip_punctuation, strip_short)


def clean_text(x: str) -> str:
    """
    :param x: raw string
    :return x: cleaned string
    """
    x = x.lower()
    x = re.sub('ssense|exclusive', '', x)
    x = strip_non_alphanum(x)
    x = strip_numeric(x)
    x = strip_short(x, minsize=2)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    return x
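# A brief, hypothetical example for clean_text above. The sample title is
# invented; 'ssense' and 'exclusive' are store-specific noise that the regex
# removes.
if __name__ == '__main__':
    title = "SSENSE Exclusive Black Leather Jacket, Size 52"
    print(clean_text(title))  # -> roughly "black leather jacket size"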
from gensim.parsing.preprocessing import preprocess_string, strip_non_alphanum


def setname(namevalue):
    global name
    name = namevalue.lower()
    print(preprocess_string(name))  # debug output of the preprocessed tokens
    try:
        preprocessed = [
            word for word in preprocess_string(name)
            if word not in ('people', 'call', 'friend', 'hey', 'hi', 'hei',
                            'cafe', 'buddi')
        ][0]
        name = [
            word for word in strip_non_alphanum(name.lower()).split()
            if preprocessed in word
        ][0]
    except IndexError:
        # no candidate token survived the filter (the original used a bare
        # except); fall back to the first whitespace-separated word
        name = name.split()[0]
    name = name[0].upper() + name[1:]
def get_text(self):
    """
    Get the pre-processed text extracted from the Remark document

    Returns
    -------
    text : str
        Extracted text
    """
    with open(self.path + self.name, mode='r') as file:
        # the first line (header) is read and discarded
        first_line, raw_text = file.readline(), file.read()
    text = strip_non_alphanum(strip_punctuation(raw_text.lower()))
    return text
def word_tokenize(text):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(words)
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        return text.encode('utf-8')
    except UnicodeDecodeError:
        return ''
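# Quick check of word_tokenize above (input invented). Note that on success
# it returns UTF-8 *bytes*, so callers may need to decode('utf-8') the result.
if __name__ == '__main__':
    print(word_tokenize("Hello, World!! 123"))  # -> roughly b'hello world '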
import pandas as pd
from gensim.parsing.preprocessing import (remove_stopwords, strip_non_alphanum,
                                          strip_punctuation, strip_short)

# cables_from_source, fname and lemmatization are defined elsewhere in the
# original module


def telegrams():
    # DataFrame.append was removed in pandas 2.0, so collect rows in a list
    # and build the frame once at the end
    rows = []
    for j, cable in enumerate(cables_from_source(fname)):
        print("Generating telegram {}".format(j), end='\r')
        content = cable.content
        content = content[content.find("1. "):len(content) - 1].lower()
        content = strip_short(content, minsize=3)
        content = strip_punctuation(content)
        content = strip_non_alphanum(content)
        content = remove_stopwords(content)
        content = lemmatization(content, ['NOUN'])
        rows.append({'lista': content, 'index': j})
    return pd.DataFrame(rows, columns=['index', 'lista'])
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum)
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.qparser import OrGroup, QueryParser
from whoosh.query import FuzzyTerm


def query_indexer(query_string, directory, topN=30):
    '''
    query_string - sentence used to perform the search.
    directory - location of the indexer to be used.
    topN - number of documents returned by the query. The default is 30.
    '''
    ix = open_dir(directory)
    query_string = strip_non_alphanum(query_string)
    query_string = strip_multiple_whitespaces(query_string)
    with ix.searcher(weighting=scoring.BM25F) as searcher:
        # with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("question", ix.schema, termclass=FuzzyTerm,
                            group=OrGroup).parse(query_string)
        try:
            options = []
            options_answers = []
            options_docnumbers = []
            results = searcher.search(query, limit=topN, terms=True)
            loop_range = topN if topN <= len(results) else len(results)
            for i in range(loop_range):
                # this needs to be adapted in order to work with the Whoosh
                # Chatbot; uncomment the next line in order to work with the
                # normal Chatbot
                options_answers.append(results[i]['response'])
                options.append(results[i]['question'])
                options_docnumbers.append(results[i].docnum)
            return options, options_answers, options_docnumbers
            # return the element with the highest similarity score from the
            # indexer:
            # return results[0]['response']
        except IndexError:
            return None
from gensim.parsing.preprocessing import (split_alphanum,
                                          strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_short,
                                          strip_tags)


def gensim_clean_string(textIn,
                        _strip_tags=True,
                        _split_alphanumeric=True,
                        _strip_nonalphanumeric=True,
                        _strip_muliple_whitespace=True,
                        _strip_short=True,
                        _short_charcount_min=3,
                        _strip_punctuation=False,
                        _convert_to_lower=False):
    # NOTE: the _strip_punctuation flag is accepted but never applied in the
    # original code
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(textIn)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _convert_to_lower:
        cleaner = cleaner.lower()
    return cleaner
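# Hypothetical call illustrating the flags of gensim_clean_string above
# (sample markup invented): keep short words, but lowercase the result.
if __name__ == '__main__':
    cleaned = gensim_clean_string("<b>Model X12 beats   baseline!</b>",
                                  _strip_short=False,
                                  _convert_to_lower=True)
    print(cleaned)  # tags stripped, "X12" split to "X 12", then lowercased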
import re
import urllib.parse

from gensim.parsing.preprocessing import (split_alphanum, strip_non_alphanum,
                                          strip_short)
from pyvi import ViTokenizer

# strip_number_alphabets and stopwordsVN_ENG are project-specific helpers
# defined elsewhere in the original codebase


def text_preprocess(bodyItem):
    # bodyItem: string (of one mail) => return: list of words (of one mail)
    # Remove http, https URLs
    bodyItem = re.sub(r'^https?:\/\/.*[\r\n]*', '', bodyItem,
                      flags=re.MULTILINE)
    bodyItem = re.sub(r'^http?:\/\/.*[\r\n]*', '', bodyItem,
                      flags=re.MULTILINE)
    bodyItem = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        " ", bodyItem)
    # Decode some bodyItems which are not decoded (quoted-printable style)
    bodyItem = bodyItem.replace("=", "%")
    bodyItem = urllib.parse.unquote(bodyItem)
    # Remove words which mix numbers and alphabets
    bodyItem = strip_number_alphabets(bodyItem)
    # Remove meaningless symbols and convert to lower case
    bodyItem = strip_non_alphanum(bodyItem).lower().strip()
    bodyItem = split_alphanum(bodyItem)
    # Join multi-word Vietnamese tokens. Ex: hội thảo -> hội_thảo
    bodyItem = ViTokenizer.tokenize(bodyItem)
    # Remove one-letter words
    bodyItem = strip_short(bodyItem, minsize=2)
    # Remove stopwords
    words = [word for word in bodyItem.split()
             if word not in stopwordsVN_ENG.getStopwordsVN_ENG()]
    return words
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum,
                                          strip_punctuation, strip_short)
from nltk.tokenize import sent_tokenize

# replace_numbers is a project-specific helper that spells digits out as words


def data_preprocessing(para):
    """
    This function takes in a paragraph and returns a list of pre-processed
    sentences.

    Args : {
        para: raw paragraph
    }
    returns : {
        list of individual sentences in the paragraph
    }
    """
    # Splitting the paragraph into sentences
    sentences = sent_tokenize(para)
    processed_sentences = []
    for sent in sentences:
        # lowercase
        temp_text = sent.lower()
        # Converting symbols
        # temp_text = " ".join(symbol_conversion(sent))
        # Removing the non-alphabetic symbols (the original passed `sent`
        # here, discarding the lowercasing above; `temp_text` is intended)
        temp_text = strip_non_alphanum(temp_text)
        # Removing multiple white spaces
        temp_text = strip_multiple_whitespaces(temp_text)
        # Removing punctuations
        temp_text = strip_punctuation(temp_text)
        # Converting digits to alphabets
        temp_text = " ".join(replace_numbers(temp_text))
        # Remove stopword
        # temp_text = remove_stopwords(temp_text)
        # Remove short 1 letter values
        temp_text = strip_short(temp_text, minsize=2)
        # Lemmatization
        # doc = nlp(temp_text)
        # temp_text = " ".join([token.lemma_ for token in doc])
        if len(temp_text) > 1:
            processed_sentences.append(temp_text.lower())
    return processed_sentences
import os
import re

from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum,
                                          strip_punctuation)
from sklearn.model_selection import train_test_split


def read_corpus(name, max_len=20, test_size=5000):
    filepath = 'yle-corpus/data/'
    with open(os.path.join(filepath, name), 'r') as f:
        # remove label and url from text
        text = f.read()
        text = re.sub(r'__label__\S*\s', '', text)
        text = re.sub(r'\S?http\S+', '', text)
        text = strip_multiple_whitespaces(text)
        text = strip_non_alphanum(text)
        text = strip_punctuation(text)
        text = text.lower()
        text = text.split()
        # dcm = [w for w in text if len(w) < max_len + 4 and len(w) > max_len]
        text = [w for w in text if len(w) <= max_len]
        # ml = max([len(w) for w in text])
        train, test = train_test_split(text, test_size=test_size,
                                       shuffle=False)
        return train, test
from io import StringIO

from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from tqdm import tqdm

# Exceptions is a project-specific error-reporting helper


def get_text(self):
    """
    Get the pre-processed text extracted from the PDF document

    Returns
    -------
    text : str
        Extracted text (None if the extraction fails)
    """
    output_string = StringIO()
    try:
        with open(self.path + self.name, 'rb') as file:
            document = PDFDocument(PDFParser(file))
            resource_manager = PDFResourceManager()
            device = TextConverter(resource_manager, output_string,
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)
            print(f'>>> Reading document "{self.name}"')
            for page in tqdm(list(PDFPage.create_pages(document)), ncols=80):
                interpreter.process_page(page)
    except UnicodeError:
        e = Exceptions(
            state='warning',
            message=f'The pdf file "{self.name}" cannot be read')
        e.throw()
        return
    text = output_string.getvalue()
    text = strip_non_alphanum(strip_punctuation(text.lower()))
    return text
import pandas as pd
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum)


def read_clean_inference_data(path_=None, prefixes_to_clean=None,
                              app_mode=False, df=None):
    """
    Reads (or receives) a dataframe and normalises every column whose name
    matches one of the given prefixes.

    :type path_: string
    :param path_: (optional) path to the file to read; ignored in app mode

    :type prefixes_to_clean: list
    :param prefixes_to_clean: list of column prefixes, i.e. just pass
        ["book", "author"] even though the actual columns are
        ["book1", "book2", "author1", "author2"].

    :type app_mode: bool
    :param app_mode: (optional) if True then path_ is not used and df should
        be passed

    :type df: pd.DataFrame
    :param df: pandas dataframe to clean using the prefixes mentioned

    :rtype: pd.DataFrame
    """
    if not app_mode:
        df = pd.read_csv(path_)
    cols = list()
    for i in prefixes_to_clean:
        cols.extend([j for j in df.columns if i in j])
    for i in cols:
        df[i] = df[i].apply(lambda x: strip_multiple_whitespaces(
            strip_non_alphanum(x.lower().strip())))
    return df
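# Hypothetical app-mode call for the function above; the column names and
# values are invented for illustration.
if __name__ == '__main__':
    demo = pd.DataFrame({'book1': ['  The  HOBBIT! '],
                         'author1': ['J.R.R. Tolkien']})
    print(read_clean_inference_data(prefixes_to_clean=['book', 'author'],
                                    app_mode=True, df=demo))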
def word_tokenize_and_remove_stop_words(text, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        words = text.split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word2, words))
        return text.encode('utf-8')
    except UnicodeDecodeError:
        return ''
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
import os

from gensim.parsing import preprocessing


def PPL_preprocess(d_type, yelp_round):
    if d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None
    command = 'java -jar Split_PPL.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    if d_type == 'dev':
        input_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp.tmp' % (yelp_round)
    else:
        input_file = 'PPL_test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp.tmp' % (yelp_round)
    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')
    for s in fin:
        user_id = s.strip('\n').split()
        if len(user_id) <= 1:
            print "there is no word or only user_id in this line!"
            continue
        fo.write(user_id[0] + ' ')
        s = ' '.join(user_id[1:])
        try:
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            fo.write('\n')
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            # skip Penn-Treebank bracket tokens (-LRB-, -RRB-, -LCB-, -RCB-)
            if ss in ("RRB", "LRB", "LCB", "RCB"):
                continue
            s = s + ss.lower() + ' '
            actual_word_cnt = actual_word_cnt + 1
        if actual_word_cnt > 0:
            fo.write(s[:-1])
        fo.write('\n')
    fin.close()
    fo.close()
    os.system('rm %s' % (input_file))

    # select one sentence of suitable length for each user, relaxing the
    # length window step by step until every user is covered
    dic = {}
    lower_bound = 8
    upper_bound = 10
    if d_type == 'dev':
        input_file = './PPL_dev_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_dev_rd%d.txt' % (yelp_round)
    else:
        input_file = './PPL_test_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_test_rd%d.txt' % (yelp_round)
    fo = open(output_file, "wb")
    user_count = 0
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    with open(user_file, "rb") as fin:
        for line in fin:
            user_id = line.strip('\n')
            if user_id not in dic:
                dic[user_id] = user_count
                user_count = user_count + 1
    total = user_count
    print "total %d user" % (total)
    recorder = [0 for i in range(total)]

    def select_pass(delta):
        # one pass over the input, accepting lines whose token count falls
        # within the (widened) window; delta=None accepts any length
        with open(input_file, "rb") as fin:
            for line in fin:
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    continue
                if recorder[dic[array_line[0]]] != 0:
                    continue
                if delta is not None:
                    if not (lower_bound + 1 - delta <= len(array_line)
                            <= upper_bound + 1 + delta):
                        continue
                fo.write(line.strip('\n'))
                fo.write('\n')
                recorder[dic[array_line[0]]] = 1

    # the original unrolled these passes by hand for delta = 0, 1, 2, 3 and
    # then a final unrestricted pass
    for delta in [0, 1, 2, 3, None]:
        select_pass(delta)
        if all(recorder):
            break
    if not all(recorder):
        print "ERROR"
    fo.close()
    os.system('rm %s' % (input_file))
def test_strip_non_alphanum(self):
    self.assertEqual(strip_non_alphanum("toto nf-kappa titi"),
                     "toto nf kappa titi")
    ON abstracts.paperid=englishfields.paperid;
"""
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
second_cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(query)
for row in tqdm(cur):
    cur_paper_id = row.get('paperid')
    # we can now access the columns: row is a psycopg2.extras.RealDictRow
    # (inherited from dict)
    # print(row.keys()): dict_keys(['paperid', 'papertitle', 'abstract'])
    # IMPORTANT: EXPERIMENTAL: get the contexts from the papers which cite
    # the current paper
    contexts_query = """
        SELECT paperreferenceid,
               string_agg(citationcontext, ' ||--|| ') AS contexts
        FROM papercitationcontexts
        WHERE paperreferenceid=%s
        GROUP BY paperreferenceid;
    """
    second_cur.execute(contexts_query, (cur_paper_id, ))
    second_results = second_cur.fetchone()
    if not second_results:
        # second_results returned None: this paper has not been cited in any
        # citation context
        continue
    contexts = second_results['contexts']
    # contexts = contexts.split(' ||--|| ')
    contexts = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_non_alphanum(contexts))
    # print(contexts)
    combined_text = '{} {} {} {}\n'.format(cur_paper_id, row['papertitle'],
                                           row['abstract'], contexts)
    file.write(combined_text)
file.close()
stoplist = set(stopwords.iloc[:, 0].unique())
if stop_cities:
    cities = [
        'espoo', 'helsinki', 'turku', 'tampere', 'jyväskylä', 'kuopio',
        'oulu', 'espoon', 'helsingin', 'turun', 'tampereen', 'jyväskylän',
        'kuopion', 'oulun', 'kouvola', 'kouvolan', 'vaasa', 'vaasan',
        'lahti', 'lahden', 'kauhava', 'kauhavan', 'salo', 'salon',
        'turussa', 'helsingissä', 'espoossa', 'joensuun', 'kotkan',
        'keravan', 'hämeenlinnan', 'joensuun', 'mikkelin', 'vantaan',
        'vihdin'
    ]
    for city in cities:
        stoplist.add(city)
# the original comprehension also checked `word != []`, which is always true
# for a string token and has been dropped
texts = [[
    word for word in strip_short(
        strip_multiple_whitespaces(
            strip_numeric(strip_non_alphanum(document.lower()))),
        minsize=3).split() if word not in stoplist
] for document in documents]
if len(df) != len(texts):
    print("wrong lengths for df and texts!")
if len(texts) != len(documents):
    print("wrong lengths for texts and documents!")
# # # remove words that appear only once
# frequency = defaultdict(int)
# for text in texts:
#     for token in text:
#         frequency[token] += 1
#
def testStripNonAlphanum(self):
    self.assertEqual(strip_non_alphanum("toto nf-kappa titi"),
                     "toto nf kappa titi")
# corpus
# (os, pandas, TaggedDocument and word_tokenize are imported earlier in the
# original script)
file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)
corpus = df['tweet']
df = []

from gensim.parsing.preprocessing import (remove_stopwords, strip_punctuation,
                                          strip_non_alphanum, strip_numeric,
                                          strip_multiple_whitespaces, stem)

for msg in corpus:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)
    df.append(string)
corpus = df

# out = pd.DataFrame(data=corpus)
# out.to_csv('chatOut.csv', index_label=False)

tagged_data = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(corpus)
]

max_epochs = 50
vec_size = 50
alpha = 0.025
    family()
else:
    work()

print(
    '\n\nHi! A debt of gratitude is in order for coming here. I am a chatbot. '
    'Individuals say that I am a kind and receptive bot.')
name = input('If its not too much trouble let me know your name.\n')
try:
    preprocessed = [
        word for word in preprocess_string(name)
        if word not in ('people', 'call', 'friend')
    ][0]
    name = [
        word for word in strip_non_alphanum(name.lower()).split()
        if preprocessed in word
    ][0]
except IndexError:
    # no candidate token survived the filter (the original used a bare except)
    name = name.split()[0]
name = name[0].upper() + name[1:]
print("Hi " + name + "! My name's SYVBot. How about we begin with our session.")
reply = input("Hows life?\n")
if predict(reply) >= 0.55:
    reply = input('That is great. Are you as a rule this glad, or are there '
                  'a few stresses that you need to talk about?\n')
    if predict(reply) >= 0.7:
        reply = input('You appear to be extremely content. Wanna sign off?\n')
        if predict(reply) >= 0.7:
            print('Ok, bye ' + name + '!')
from gensim.parsing.preprocessing import strip_non_alphanum


def getline(filepath):
    # yields one list of alphanumeric tokens per line (the original named
    # the read handle `fout`)
    with open(filepath, 'r') as fin:
        for line in fin:
            yield strip_non_alphanum(line).split()
def get_processed_stems(self):
    return prep.stem(
        prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()
def clear_unknown_letter(text):
    text = strip_non_alphanum(text)
    text = word_tokenize(text)
    return process_lower(text)
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def SVM_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using SVM: remove
    # punctuation, tags, multiple spaces and stop words, and convert all
    # words into lower case
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './SVM_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './SVM_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './SVM_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    # split the stop word list: words containing an apostrophe are removed
    # before punctuation stripping, the rest afterwards
    stop_word1 = set()
    stop_word2 = set()
    with open('english_stop.txt', "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1.add(s)
            else:
                stop_word2.add(s)

    fin = open(input_file, "rb")
    fo = open(output_file, "wb")
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag != 1:
            continue
        user_flag = 0
        if start != 1:
            fo.write('\n')
        else:
            start = 0
        user_id = s.strip('\n').split()
        if len(user_id) < 2:
            print "there is no user_id & star rating following the start_mark!"
        fo.write(user_id[0] + ' ' + user_id[1] + ' ')
        if len(user_id) <= 2:
            continue
        s = ' '.join(user_id[2:]) + ' '
        try:
            # first stop-word pass (keeps contractions intact)
            s_array = s.encode('utf8').split()
            s = ''
            for ss in s_array:
                ss = ss.lower()
                if ss not in stop_word1:
                    s = s + ss + ' '
            s = s.strip('\n')
            if len(s) == 0:
                continue
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            # second stop-word pass on the cleaned tokens
            s_array = s.encode('utf8').split()
            if len(s_array) == 0:
                continue
            s = ''
            for ss in s_array:
                if ss not in stop_word2:
                    s = s + ss + ' '
            if len(s) == 0:
                continue
            if s[-1] != ' ':
                s = s + ' '
            fo.write(s)
        except UnicodeDecodeError:
            continue
    fin.close()
    fo.close()
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def Train_preprocess(yelp_round):
    input_file = 'train_rd%d.tmp' % (yelp_round)
    output_file = './swe_train_rd%d.txt' % (yelp_round)
    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag != 1:
            continue
        user_flag = 0
        if start != 1:
            fo.write('\n')
        else:
            start = 0
        user_id = s.strip('\n').split()
        if len(user_id) < 1:
            print "there is no user_id following the start_mark!"
        fo.write(user_id[0] + ' ')
        if len(user_id) <= 1:
            continue
        s = ' '.join(user_id[1:]) + ' '
        try:
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            s = s + ss.lower() + ' '
            actual_word_cnt = actual_word_cnt + 1
        if actual_word_cnt > 0:
            fo.write(s[:-1])
    fin.close()
    fo.close()

    # get user_file and train_file (the original tested the nonexistent path
    # './get_user_train_file', which forced a recompile on every run; the
    # produced binary is checked instead)
    if not os.path.isfile('./get_user_file_w2v_train'):
        command = ('gcc get_user_file_w2v_train.c -o get_user_file_w2v_train '
                   '-lm -pthread -O3 -march=native -Wall -funroll-loops '
                   '-Wno-unused-result')
        print command
        os.system(command)
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    w2v_train = './w2v_train_rd%d.txt' % (yelp_round)
    command = './get_user_file_w2v_train -input %s -user %s -word %s' % (
        output_file, user_file, w2v_train)
    print command
    os.system(command)
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def NN_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using a deep neural network
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.tmp' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None
    command = 'java -jar Split_NN.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    # remove stop words
    if d_type == 'train':
        input_file = './NN_train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = './NN_dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.txt' % (yelp_round)
    else:
        input_file = './NN_test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.txt' % (yelp_round)

    # split the stop word list as in SVM_preprocess: words containing an
    # apostrophe are removed before punctuation stripping, the rest afterwards
    stop_word1 = set()
    stop_word2 = set()
    with open('english_stop.txt', "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1.add(s)
            else:
                stop_word2.add(s)

    fin = open(input_file, 'rb')
    tar_file = open(output_file, 'w+')
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            # the line right after the begin mark holds user_id and star
            # rating
            user_flag = 0
            if start != 1:
                tar_file.write('\n')
            else:
                start = 0
            user_star = s.strip('\n').split()
            if len(user_star) < 2:
                print "there is no user_id & star rating following the start_mark!"
                print len(user_star)
                for i in range(len(user_star)):
                    print user_star[i]
            tar_file.write(user_star[0] + '\t\t')
            tar_file.write(user_star[1] + '\t\t')
            continue
        # every other line is one sentence of the review, written out with a
        # trailing '#' separator
        try:
            s_array = s.encode('utf8').split()
            s = ''
            for ss in s_array:
                ss = ss.lower()
                if ss not in stop_word1:
                    s = s + ss + ' '
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
            s = ''
            actual_word_cnt = 0
            for ss in s_array:
                # skip Penn-Treebank bracket tokens (-LCB-, -LRB-, -RCB-,
                # -RRB-)
                if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":
                    continue
                if ss not in stop_word2:
                    s = s + ss + ' '
                    actual_word_cnt = actual_word_cnt + 1
            if actual_word_cnt > 0:
                tar_file.write(s[:-1])
                tar_file.write('#')
        except UnicodeDecodeError:
            continue
    fin.close()
    tar_file.close()
    os.system('rm %s' % (input_file))
def get_processed_text(self):
    return prep.remove_stopwords(prep.strip_non_alphanum(self.text))