def readFromDir(osList):
    """ This reads the scraped raw data """
    textList = []
    for i in range(len(osList)):
        filesList = []
        textArray = []
        for (dirpath, dirnames, filenames) in os.walk(osList[i]):
            filesList.extend(filenames)
        os.chdir(osList[i])
        for _ in range(len(filesList)):
            with open('{}'.format(filesList[_]), 'r', encoding='utf-8') as file:
                text_str = file.read()
                textArray.append(text_str.lower())
        text_arr = ','.join(textArray)
        text_arr = strip_punctuation(text_arr)
        text_arr = strip_numeric(text_arr)
        text_arr = strip_non_alphanum(text_arr)
        textList.append(text_arr)
        os.chdir('..')
    return textList
def custom_preprocess(sentence):
    # Custom preprocess function for the test documents;
    # this can also be applied to a pandas DataFrame series.
    sentence = sentence.lower()
    no_stopwords = remove_stopwords(sentence)
    tokens = tokenize(no_stopwords)
    no_punctuation = strip_punctuation(no_stopwords)
    unwanted = remove_unwanted(no_punctuation)
    return unwanted
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            # Concatenate as str; appending bytes from .encode('utf-8') would raise a TypeError in Python 3.
            text_total = text_total + text + '#'
        except UnicodeDecodeError:
            pass
    return text_total
def make_array_vectorize(text):
    texts = []
    texts.append(extract_sentences_from_paragraph(text))
    nested_list_len = lambda x: sum(len(sublist) for sublist in x)
    source_text_vectors = np.zeros((nested_list_len(texts), 3500))
    vec_idx = 0
    if type(texts[0]) == list:
        for i in range(len(texts)):
            sentences = texts[i]
            # Get text vector
            for s in sentences:
                sentence_vector = np.array([])
                # s = remove_stopwords(strip_punctuation(strip_non_alphanum(str(s).lower())))
                s = remove_stopwords(strip_punctuation(strip_non_alphanum(str(s))))
                s = clean_str(s)
                for w in word_tokenize(s):
                    w = lemmatizer.lemmatize(w)
                    if w not in model_ft:
                        continue
                    if len(sentence_vector) < MAX_SENTENCE_LEN * EMBEDDING_SIZE:
                        sentence_vector = np.append(sentence_vector, model_ft[w])
                    else:
                        break
                # Pad short sentences with zero vectors up to the fixed length
                while len(sentence_vector) < MAX_SENTENCE_LEN * EMBEDDING_SIZE:
                    sentence_vector = np.append(sentence_vector, np.zeros(EMBEDDING_SIZE))
                source_text_vectors[vec_idx] = sentence_vector
                vec_idx += 1
    return source_text_vectors
def clean_text(self, text_tag, processes=["urls", "punctuation", "numeric", "lower"]):
    text = self.texts[text_tag]
    # print(text)
    if "urls" in processes:
        text = [re.sub(r"(?:\@|https?\://)\S+", "", str(x)) for x in text]
        text = [re.sub(r' +', ' ', str(x)) for x in text]
    if "stopwords" in processes:
        text = [remove_stopwords(x) for x in text]
    if "punctuation" in processes:
        text = [strip_punctuation(x) for x in text]
    if "numeric" in processes:
        text = [strip_numeric(x) for x in text]
    text = [x.replace('"', "") for x in text]
    text = [x.replace('©', "") for x in text]
    text = [x.replace('\n', " ") for x in text]
    text = [x.replace('\r', ".") for x in text]
    text = [x.replace('QT', " ") for x in text]
    text = [x.replace('RT', " ") for x in text]
    text = [x.replace('#', " ") for x in text]
    text = [strip_multiple_whitespaces(x) for x in text]
    text = [x.strip() for x in text]
    if "lower" in processes:
        text = [x.lower() for x in text]
    # clean_text = [nltk.sent_tokenize(x) for x in clean_text]
    self.texts[text_tag] = text
def getLemmatizedText(name, content, language):
    language = language[:2]
    language = language.lower()
    outText = ""
    if language:
        if language == "is":
            outText = getLemmatizedTextIS(name, content)
            print("IS")
        else:
            outText = lemmatizerMultilanguage.getLemmatizedText(language, name + " " + content)
            print(language.upper())
    else:
        text = name + " " + content
        outText = text.lower().replace('.', '.')
        print("ERROR: No language for Lemmatizing text")
    cleaned = re.sub(' +', ' ', outText)
    cleaned = cleaned.replace('\n', '')
    cleaned = cleaned.replace('\r', '')
    cleaned = remove_stopwords(cleaned)
    cleaned = strip_tags(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_numeric(cleaned)
    cleaned = strip_short(cleaned, 1)
    cleaned = strip_multiple_whitespaces(cleaned)
    cleaned = cleaned.lower()
    print("Lemmatized CLEAN: " + cleaned)
    return cleaned
def process_review_raw_data(self):
    print("Review data pre-processing start...")
    _reviews = []
    with open(config.path2datasets + self.dataset_name, 'r') as f:
        for line in f.readlines():
            review_json = json.loads(line)
            _business_id = review_json['business_id']
            _review_id = review_json['review_id']
            _stars = review_json['stars']
            _text = review_json['text']
            # remove punctuation and stopwords, then lowercase
            _text = strip_punctuation(_text)
            _text = remove_stopwords(_text)
            _text = _text.lower()
            _reviews.append({
                'review_id': _review_id,
                'business_id': _business_id,
                'stars': _stars,
                'text': _text
            })
    _reviews = pd.DataFrame(_reviews)
    _reviews.to_csv(config.path2data + self.dataset_name + "." + config.path2reviews)
    _reviews = None
    print("Review data pre-processing DONE")
def noPuncNoNumb(corpora):
    # Compose the two strips; `strip_punctuation(x) and strip_numeric(x)` would
    # only keep the result of strip_numeric and leave punctuation in place.
    List_No_punct_numb = [[[strip_numeric(strip_punctuation(stringa)) for stringa in group]
                           for group in corpus] for corpus in corpora]
    # print("\nList_No_punct_numb:")
    # print(List_No_punct_numb)
    return List_No_punct_numb
def _normalize_target(s):
    s = s.lower()
    for k, v in contractions.items():
        s = s.replace(k, v)  # str.replace returns a new string; assign it back
    return strip_multiple_whitespaces(strip_punctuation(strip_tags(s))).split()
def preprocess_for_lda(tweet, pos_tag=True):
    """
    Processes a tweet for entry into an LDA topic model.

    Removes hashtags and unnecessary characters, filters out stopwords,
    tokenizes the tweet into individual words, and lemmatizes the words.
    """
    tweet = preprocess_tweet_text(tweet)
    # Handle contractions
    tweet = decontract(tweet)
    # Remove punctuation
    tweet = strip_punctuation(tweet)
    # Remove multiple spaces
    tweet = strip_multiple_whitespaces(tweet)
    # Tokenize, lowercase everything, remove emojis
    tokens = simple_preprocess(tweet, max_len=30)
    # Lemmatize tokens
    if pos_tag:
        # This uses POS tags and is slower but more accurate
        words = lemmatize_sentence(tokens)
    else:
        words = [lemmatizer.lemmatize(word) for word in tokens]
    # Remove stopwords
    words = [word for word in words if word not in FILTER_WORDS]
    return words
def search_solr_parse_json(query, collection, search_field):
    """ Searches the arxiv_cs_metadata collection on arxiv_identifier (search_field)
    using the resp. arxiv id as the query, parses the JSON result and returns the
    normalized title and published year of the matching record.
    ARGUMENTS: query, string: each arxiv id
               collection: the Solr collection name (=arxiv_cs_metadata)
               search_field: the Solr field which is queried (=arxiv_identifier)
    RETURNS: (title, published_year) of the first matching record, or
             (None, None) if no record is found."""
    solr_url = 'http://localhost:8983/solr/' + collection + '/select'
    url_params = {'q': query, 'rows': 1, 'df': search_field}
    solr_response = requests.get(solr_url, params=url_params)
    if solr_response.ok:
        data = solr_response.json()
        # Only one result, so index 0.
        docs = data['response']['docs']
        if docs == []:
            print(docs, query)
            return None, None
        doc = docs[0]
        title = doc.get('title').replace('\n', ' ')
        # Normalize the title
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(title.lower()))
        published_year = doc.get('published_date')[:4]
        return title, published_year
    else:
        print("Invalid response returned from Solr")
        sys.exit(11)
def get_fig_captions(self):
    """ Get the figures captions of the Notebook document

    Returns
    -------
    captions : list of str
        Figures captions
    """
    captions = []
    cap = ''
    for line in self.get_figs_paragraph().splitlines():
        if line.startswith('-'):
            if cap is not None:
                captions.append(cap)
            cap = ''
        else:
            cap = cap + ' ' + line
    captions.append(cap)
    captions = [strip_non_alphanum(strip_punctuation(cap.lower()))
                if not cap == '' else None for cap in captions]
    return captions
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases everything,
    strips extra spaces, punctuation, non-alphanumeric symbols. Also performs stemming.

    input: text: a string
    returns: the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)   # remove stop words
    text = preprocess.strip_short(text)        # get rid of short words
    text = preprocess.strip_numeric(text)      # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text
def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possibly be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    # text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    # text = preprocessing.stem_text(text)
    return text
def customized_strip(s):
    """ Static function that strips a given text of most unwanted characters.

    Args:
        s (string): text we want to strip

    Returns:
        string: stripped text
    """
    # strip the quote characters
    s = s.replace('"', '')
    s = s.replace("'", '')
    s = s.replace('“', '')
    s = s.replace('”', '')
    s = re.sub(r'https?:\/\/[^\s]+', ' ', s)  # strip urls
    s = re.sub(r'[\d]+', ' ', s)  # strip numbers
    # strip whitespace
    s = s.replace("\r", ' ').replace("\xa0", ' ')
    s = re.sub(r'[\s]+', ' ', s)
    s = strip_punctuation(s)
    s = s.lower()
    return s
def tokenize(text):
    return [
        token
        for token in gensim.utils.simple_preprocess(
            gpp.strip_non_alphanum(
                gpp.strip_punctuation(
                    gpp.strip_multiple_whitespaces(gensim.utils.deaccent(text)))))
        if token not in gpp.STOPWORDS
    ]
def preprocess_text(corpus=[]):
    print("Preprocessing Corpus from list data structure")
    for i, val in enumerate(corpus):  # iterate through list
        corpus[i] = corpus[i].strip('\n')
        corpus[i] = strip_punctuation(corpus[i])
        corpus[i] = strip_non_alphanum(corpus[i])
        corpus[i] = strip_numeric(corpus[i])
    return corpus
def index_metadata(self, table):
    """
    Index the metadata (descriptors and subjects) of the documents.
    Since they are important for the document, they are indexed both
    word by word and as a whole descriptor/subject.

    Parameters:
        table : string
            name of the table

    Returns:
        None
    """
    documents = self.get_documents('postgres', 'dbpass', 'eurlex_environment_only', table)
    for i, document in enumerate(documents):
        celex_number = document.get('document_celex_num')
        descriptor_name = document.get('descriptor_name', None)
        subject_name = document.get('subject_name', None)
        document_id = self.doc2id[celex_number]
        if descriptor_name is not None:
            for word in strip_punctuation(descriptor_name).lower().split():
                if word not in self.stopwords:
                    self.index[word].add(document_id)
            # We add the whole descriptor as well
            self.index[descriptor_name].add(document_id)
        if subject_name is not None:
            for word in strip_punctuation(subject_name).lower().split():
                if word not in self.stopwords:
                    self.index[word].add(document_id)
            # We add the whole subject as well
            self.index[subject_name].add(document_id)
        if i % 10000 == 0:
            print(f"""
            Currently finished {i} documents.
            The size of index is {len(self.index)}
            """)
def process_data(text_array):
    sents = text_array
    for i, sentence in enumerate(sents):
        sents[i] = strip_punctuation(sentence)
        # d = ' '.join(word_tokenize(text_array))
        sents[i] = remove_stopwords(sents[i])
        sents[i] = sents[i].lower()
    return sents
def new_processor(token):
    text = unidecode(token)  # avoid shadowing the built-in str
    text = strip_punctuation(text)
    tokens = sp(text)
    # PorterStemmer.stem is an instance method, so call it on an instance;
    # stem the surface form of each token (assuming sp is a spaCy pipeline).
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(tok.text) for tok in tokens]
    # text = " ".join(tokens)
    # text = strip_multiple_whitespaces(text)
    # text = text.strip(' ')
    return tokens
def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    return text
def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        # doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)  # assign the result, otherwise the tags are kept
        yield gensim.utils.tokenize(doc, lower=True)
def _normalize(s):
    s = s.lower()
    for k, v in contractions.items():
        s = s.replace(k, v)  # str.replace returns a new string; assign it back
    return strip_multiple_whitespaces(
        strip_non_alphanum(
            strip_numeric(remove_stopwords(strip_punctuation(
                strip_tags(s)))))).split()
def tokenize(self, text):
    """
    Remove punctuation and lowercase text, then generate tokens of our chat file.
    """
    return [
        token
        for token in simple_preprocess(strip_punctuation(text.strip()))
        if token not in self.STOPWORDS
    ]
def preprocess_text(corpus, field_name='Comment'):
    print("Preprocessing Corpus from pandas data frame")
    for index, row in corpus.iterrows():  # iterate through rows in dataframe
        line = row[field_name].strip('\n')  # use the configured column rather than hard-coding 'Comment'
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(line)
        line = strip_short(line)
        # add cleaned text line back into the dataframe
        corpus.at[index, field_name] = line  # set value at row/column in corpus dataframe
    return corpus
def main(): """ Main function""" sconn = db_connect() scur = sconn.cursor() create_acl_mag_table(sconn) reject = open('AdditionalOutputs/no_acl_mag_mapping.txt', 'w') with open('Metadata/acl-metadata.txt', 'r', encoding='ISO-8859-1') as aclfile: content = aclfile.read() #'id = {D10-1001}\nauthor = {Rush, Alexander M.; Sontag, David; Collins, Michael John; Jaakkola, Tommi} #\ntitle = {On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing}\n #venue = {EMNLP}\nyear = {2010}\n\nid = {D10-1002}\nauthor = {Huang, Zhongqiang; Harp' lines = content.split('\n\n') for line in lines: parts = line.split('\n') # 'id = {D10-1002}\nauthor = {Huang, Zhongqiang; Harper, Mary P.; Petrov, Slav}\ntitle = {Self- # Training with Products of Latent Variable Grammars}\nvenue = {EMNLP}\nyear = {2010}' acl_id = parts[0][parts[0].find('{') + 1:parts[0].find('}')] title = parts[2][parts[2].find('{') + 1:parts[2].find('}')] print(parts[4]) publishedyear = int(parts[4][parts[4].find('{') + 1:parts[4].find('}')]) title = preprocessing.strip_multiple_whitespaces( preprocessing.strip_punctuation(title.lower())).strip() query1 = 'select paperid from papers where papertitle=%s and publishedyear=%s;' cur.execute(query1, (title, publishedyear)) paperid = cur.fetchone() if paperid: paperid = paperid['paperid'] query2 = "select paperid from papers where papertitle=%s;" if not paperid: # Try the query without the year cur.execute(query2, (title, )) resultset = cur.fetchone() if not resultset: # Skip this reference, not found in MAG reject.write('{}\n'.format(acl_id)) continue paperid = resultset['paperid'] insert_into_acl_mag(sconn, scur, acl_id, paperid, publishedyear) try: sconn.commit() except: print("Something went wrong while committing, attempting to rollback!") sconn.rollback() scur.execute("select count(*) from acl_mag") print("No. of records in db=", scur.fetchall()) sconn.close() reject.close()
def dataprocessing(x):
    x = rmvhtmltags(x)
    x = remove_urls(x)
    x = x.lower()
    x = rmvspclcharacter(x)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    x = lemmatize_words(x)
    x = ' '.join([re.sub(r'\d+', '', i) for i in word_tokenize(x)])
    return x
def export(type_data='train'):
    print("Extracting data...")
    if type_data.lower() == 'train':
        filename = 'training.1600000.processed.noemoticon.csv'
    elif type_data.lower() == 'test':
        filename = 'testdata.manual.2009.06.14.csv'
    data_file = codecs.open('Sentiment140/' + filename, encoding='ISO-8859-1')
    data = []
    for tweet in data_file.read().split('\n')[:-1]:
        data.append(
            [string for string in tweet.split('"') if string not in ['', ',']])
    data_file.close()
    labels = [(float(tweet[0]) / 4.0) for tweet in data]
    tweets = [tweet[-1] for tweet in data]
    print("Preprocessing data...")
    for i, tweet in enumerate(tweets):
        new_tweet = ' '.join([word for word in tweet.split(' ')
                              if len(word) > 0 and word[0] not in ['@', '#']
                              and 'http' not in word]).strip()
        pro_tweet = [
            word[:-3] if word[-3:] == 'xxx' else word
            for word in preprocess_string(new_tweet.replace('not', 'notxxx'))
        ]
        # pro_tweet = preprocess_string(new_tweet)
        if len(pro_tweet) < 2:
            tweets[i] = strip_punctuation(stem_text(new_tweet.lower())).strip().split()
        else:
            tweets[i] = pro_tweet
        sys.stdout.write("\r%d tweet(s) pre-processed out of %d\r" % (i + 1, len(tweets)))
        sys.stdout.flush()
    print("\nCleaning data...")
    backup_tweets = np.array(tweets)
    backup_labels = np.array(labels)
    tweets = []
    labels = []
    for i, tweet in enumerate(backup_tweets):
        if len(tweet) >= 2:
            tweets.append(tweet)
            labels.append(backup_labels[i])
    del backup_tweets
    del backup_labels
    # Shuffle the dataset
    data = list(zip(tweets, labels))
    np.random.shuffle(data)
    tweets, labels = list(zip(*data))
    return (tweets, labels)
def process_string(string, stemming=True, remove_stopwords=True):
    string = string.lower()
    abbreviations = re.findall(r'(?:[a-z]\.)+', string)
    for abbr in abbreviations:
        string = string.replace(abbr, abbr.replace('.', ''))
    string = pproc.strip_punctuation(string)
    if remove_stopwords:
        string = pproc.remove_stopwords(string)
    if stemming:
        string = pproc.stem_text(string)
    string = string.strip()
    return string
def string_processor(token):
    # text = str(token)
    text = unidecode(token)  # avoid shadowing the built-in str
    # text = strip_custom(text)
    text = remove_stopwords(text)
    text = strip_punctuation(text)
    text = strip_non_alphanum(text)  # will remove punctuation
    tokens = sp(text)
    tokens = [token.lemma_ for token in tokens]  # lemma_ replaces "i" with "-PRON-" (spaCy source-code quirk)
    tokens = [porter_stemmer.stem(token) for token in tokens]
    text = " ".join(tokens)
    text = strip_multiple_whitespaces(text)
    text = text.strip(' ')
    return text
def get_text_sentences(filepath, sbd_model):
    tokens_by_sentence = []
    with codecs.open(filepath, encoding='utf8') as f:
        raw_text = f.read()
    # raw_text = raw_text.lower()
    raw_text = strip_multiple_whitespaces(raw_text)
    sentences = splitta.sbd.sbd_text(sbd_model, raw_text, do_tok=False)
    for s in sentences:
        new_s = strip_punctuation(s)
        tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
    # print raw_text
    # for filt in self.preprocess:
    #     raw_text = filt(raw_text)
    # text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
    return sentences, tokens_by_sentence
def save_word_dict(text):
    proc_text = []
    sentences = text
    sentences = tokenize.sent_tokenize(sentences)
    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)
        proc_sentence = word_tokenize(sentence_without_stops.lower())
        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)
    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
def main():
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
    logging.info("Loading sentences dict...")
    sentences_dict_file = "../../experiments/ptr/sunlight_full_train.sentences_dict.p"
    with open(sentences_dict_file) as f:
        sentences_dict = cPickle.load(f)
    logging.info("Loading files")
    ct = 0
    tokens_by_sentence_dict = {}
    for fname, sentence_list in sentences_dict.iteritems():
        ct += 1
        tokens_by_sentence = []
        for s in sentence_list:
            new_s = strip_punctuation(s)
            tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
        tokens_by_sentence_dict[fname] = tokens_by_sentence
        if ct % 10000 == 0:
            logging.info("Writing tokens by sentence %s" % ct)
            with open("../../experiments/ptr/sunlight_full_train.tokens_by_sentence.%s.p" % str(ct), "w") as f:
                cPickle.dump(tokens_by_sentence_dict, f)
            tokens_by_sentence_dict = {}
    sys.exit()
    # data_path = '../../data/fcc/sunlight_full_partitions/'
    # create corpus
    output_path = "../../experiments/ptr/"
    filename = "sunlight_full_train"
    # logger.info("Saving files")
    with open(os.path.join(output_path, filename + ".sentences_dict.p"), "w") as f:
        cPickle.dump(sentences_dict, f)
    with open(os.path.join(output_path, filename + ".tokens_by_sentence_dict.p"), "w") as f:
        cPickle.dump(tokens_by_sentence_dict, f)
def clean_string(string):
    # Empty strings
    if not string or string == 'N':
        return None
    string = deaccent(string).lower()
    # Remove quote text
    string = re.sub(re_reply_to, '', string)
    string = re.sub(re_quote_line, '', string)
    string = re.sub(re_youtube_link, ' YOUTUBELINK ', string)
    string = re.sub(re_link, ' WEBLINK ', string)
    string = re.sub(re_pol_board, ' pol ', string)
    string = re.sub(re_b_board, ' RANDOMBOARD ', string)
    string = re.sub(re_chan_board, ' CHANBOARD ', string)
    string = strip_punctuation(string)
    # Punctuation to remove completely
    # string = re.sub(re_punc_to_none, '', string)
    # Substitute in this order
    # string = re.sub(re_ellipsis, ' <ELLIPSIS> ', string)
    # string = re.sub(re_echoes, ' <ECHOES> ', string)
    # string = re.sub(re_pol_board, ' <POLBOARD> ', string)
    # string = re.sub(re_numbers, ' <NUMBER> ', string)
    # string = re.sub(re_period, ' <PERIOD> ', string)
    # string = re.sub(re_question, ' <QUESTION> ', string)
    # Replace all other punc to spaces and remove whitespace in between
    # string = re.sub(re_punc_to_space, ' ', string)
    string = ' '.join([word for word in [w.strip() for w in string.split()]])
    return string if string else None