def preprocess(data, stem_data, remove_stopwords):
    processed = []
    stemmer = PorterStemmer()
    for file in data:
        # lowercasing all text
        file = str(file).lower()
        # removing non-alpha characters
        file = re.sub('[^a-zA-Z]', ' ', file)
        # tokenizing articles
        tokenized = word_tokenize(file)
        # removing stop words from tokens
        stop_removed_tokens = []
        if remove_stopwords:
            for word in tokenized:
                if word not in stop_words:
                    stop_removed_tokens.append(word)
        else:
            stop_removed_tokens = tokenized
        if stem_data:
            stemmed = []
            for token in stop_removed_tokens:
                stemmed.append(stemmer.stem(token))
            processed.append(stemmed)
        else:
            processed.append(stop_removed_tokens)
    return processed
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
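# Hedged usage sketch (added illustration, not from the original project): shows
# roughly what the stem_text helper above returns. Assumes gensim is installed.
from gensim.parsing.porter import PorterStemmer

_stemmer = PorterStemmer()
_sample = "Searching large Collections of Documents"
print(' '.join(_stemmer.stem(w) for w in _sample.split()))
# expected output, roughly: "search larg collect of document"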
def get_top_labels(country_scores):
    """Output: Dictionary --> key = country, value = list of top labels"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents(
        [str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    country_scores_pmi = country_scores.sort_values(by="pmi", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    top_labels_pmi = [[] for x in range(country_scores_pmi['num_countries'][0])]
    used_stems = set()
    used_stems_pmi = set()
    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if len(top_labels[row.country]) < 40:
                top_labels[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(),
                    row.tfidf, row.pmi, row.country
                ])
                used_stems.add(row.stem)
    for row in country_scores_pmi.itertuples():
        if row.stem not in used_stems_pmi:
            if len(top_labels_pmi[row.country]) < 40:
                top_labels_pmi[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(),
                    row.tfidf, row.pmi, row.country
                ])
                used_stems_pmi.add(row.stem)
    return top_labels, top_labels_pmi
def assign_country_label_ids(country_scores, label_score, num_candidates, use_label_candidates):
    """Output: Dictionary --> key = country, value = label"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()
    if use_label_candidates is True:
        # print('USING SOFT LABELING')
        final_labels = defaultdict(set)
        final_ids = defaultdict(set)
        for row in country_scores.itertuples():
            if len(final_labels[row.country]) <= num_candidates and row.stem not in used_stems and row.stem not in BLACK_LIST:
                # store a tuple so the entry is hashable and can live in the set
                final_labels[row.country].add((row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi))
                final_ids[row.country].add(int(row.label_id))
                used_stems.add(row.stem)
    else:
        final_labels = {}
        final_ids = {}
        for row in country_scores.itertuples():
            if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
                final_ids[row.country] = row.label_id
                used_stems.add(row.stem)
    return final_labels, final_ids
def process(text):
    p = PorterStemmer()
    doc_nor = text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    stem = re.findall(r'[\w]+', doc_stem)
    return doc_stem.split()
def preprocess(text):
    # convert text to lower case
    text = text.lower()
    # remove surrounding whitespace (str.strip() returns a new string, so assign it)
    text = text.strip()
    # remove digits
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # text = ' '.join(s for s in text.split() if not any(c.isdigit() for c in s))
    # print(text)
    # remove stopwords
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    # strip punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # strip multiple whitespaces that might occur after we remove stopwords
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    p = PorterStemmer()
    text = ' '.join(p.stem(word) for word in text.split())
    # print(text)
    return text
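# Hedged alternative sketch (added illustration): recent gensim versions also expose
# preprocess_string, which chains comparable filters (lowercasing, punctuation and
# number stripping, stopword removal, stemming) in a single call.
from gensim.parsing.preprocessing import preprocess_string

print(preprocess_string("Hello World! We searched 42 large collections of documents."))
# returns a list of lowercased, stemmed tokens with stopwords and digits removed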
def dataToXYListRead(fileName):
    with open(fileName) as file:
        porter_stemmer = PorterStemmer()
        lineCount = 0
        wordSentenceDbLi = []
        while True:
            line = file.readlines(1)
            if not line:
                break
            # if lineCount == 20:
            #     break
            jsonLine = json.loads(line[0])
            # noStopWords = remove_stopwords(jsonLine['text'])
            # stemWords = porter_stemmer.stem(noStopWords)
            # stem the text word by word
            stemWords = porter_stemmer.stem_sentence(jsonLine['text'])
            tokenWords = simple_preprocess(stemWords, deacc=True)
            # print(tokenWords)
            wordSentenceDbLi.append(tokenWords)
            lineCount += 1
    # yelpDic = corpora.Dictionary(wordSentenceDbLi)
    # yelpDic.save('yelpDictionary.dict')
    # print(yelpDic.token2id)
    # print(yelpDic[8])
    return wordSentenceDbLi
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    return ' '.join(
        p.stem(word) for word in text.lower().split())  # lowercasing required by the stemmer
def document_preprocess(text):
    p = PorterStemmer()
    first = text.encode('ascii', 'ignore').decode('utf-8').lower()
    second = preprocessing.remove_stopwords(first)
    third = preprocessing.strip_punctuation(second)
    fourth = preprocessing.strip_short(preprocessing.strip_numeric(third))
    # stem_sentence stems each whitespace-separated word of the string
    fifth = p.stem_sentence(fourth)
    return fifth
def open_spider(self, spider):
    # Create an empty model
    w2v = gensim.models.Word2Vec([['seo']], min_count=1)
    self.name = '/tmp/Word2Vec' + str(time.time())
    # Save it
    w2v.save(self.name)
    self.p = PorterStemmer()
    self.stop_words = set(stopwords.words('french'))
def token_stem(text):
    tokens = simple_preprocess(text, deacc=True)
    porter_stemmer = PorterStemmer()
    stem_tokens = [porter_stemmer.stem(word) for word in tokens]
    return stem_tokens
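# Hedged usage sketch (added illustration): roughly what token_stem above produces
# for a short string, assuming gensim is installed; the exact stems come from the
# Porter algorithm.
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess

_stemmer = PorterStemmer()
_tokens = simple_preprocess("Cafés serving tasty pastries", deacc=True)
print([_stemmer.stem(w) for w in _tokens])
# e.g. ['cafe', 'serv', 'tasti', 'pastri']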
def preprocess_data(train_data, test_data):
    custom_stopwords = set(ENGLISH_STOP_WORDS)
    custom_stopwords.update(["say", "says", "said", "saying", "just", "year", "man", "men", "woman",
                             "women", "guy", "guys", "run", "running", "ran", "run", "do", "don't", "does", "doesn't",
                             "doing", "did", "didn't", "use", "used", "continue", "number", "great", "big", "good", "bad",
                             "better", "worse", "best", "worst", "actually", "fact", "way", "tell", "told", "include", "including",
                             "want", "wanting", "will", "won't", "give", "given", "month", "day", "place", "area", "look",
                             "looked", "far", "near", "get", "getting", "got", "know", "knows", "knew", "long", "week", "have",
                             "has", "haven't", "hasn't", "having", "had", "hadn't", "not", "think", "thinking", "Monday",
                             "Tuesday", "Wednesday", "Thursday", "Saturday", "Sunday", "high", "low", "thing", "there", "they're",
                             "It", "I've", "I'd", "He's", "She's", "They've", "I'm", "You're", "your", "their", "his", "hers",
                             "mine", "today", "yesterday", "it", "ve", "going", "go", "went", "lot", "don", "saw", "seen", "come", "came"])
    titled_train_data = add_titles(train_data['Content'], train_data['Title'])
    if test_data is not None:
        titled_test_data = add_titles(test_data['Content'], test_data['Title'])
    # Removing stopwords:
    new_train_data = []
    for doc in titled_train_data:
        doc_wordlist = doc.split()
        new_doc_wordlist = [word for word in doc_wordlist if word not in custom_stopwords]
        new_doc = ' '.join(new_doc_wordlist)
        new_train_data.append(new_doc)
    if test_data is not None:
        new_test_data = []
        for doc in titled_test_data:
            doc_wordlist = doc.split()
            new_doc_wordlist = [word for word in doc_wordlist if word not in custom_stopwords]
            new_doc = ' '.join(new_doc_wordlist)
            new_test_data.append(new_doc)
    p = PorterStemmer()
    train_docs = p.stem_documents(new_train_data)
    if test_data is not None:
        test_docs = p.stem_documents(new_test_data)
    print("my_method: Stemmed data.")
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_docs)
    if test_data is not None:
        Test = vectorizer.transform(test_docs)
    else:
        Test = None
    print("my_method: Vectorized data")
    svd_model = TruncatedSVD(n_components=200)  # random_state=13
    X = svd_model.fit_transform(X)
    if test_data is not None:
        Test = svd_model.transform(Test)
    print("SVD'd data")
    return X, Test
def find_documents(self, term, stemming=False):
    stemmer = PorterStemmer()
    if stemming:
        term = stemmer.stem(term)
    term_id = self.get_id_for_term(term)
    if term_id < 0:
        return set()
    docs = self.get_related_documents(term_id)
    return set(docs)
def cleanText(self, textToClean):
    textLower = str(textToClean).lower()
    englishText = "".join(
        [char for char in textLower if char in string.printable])
    textNoPunc = "".join(
        [char for char in englishText if char not in string.punctuation])
    textStop = remove_stopwords(textNoPunc)
    porter = PorterStemmer()
    # stem each whitespace-separated word of the cleaned text
    textStemmed = porter.stem_sentence(textStop)
    return textStemmed.split()
def __iter__(self):
    p = PorterStemmer()
    for entry in scandir("./dblpfiledir"):
        with open(entry.path, "r", encoding="utf-8") as f:
            jsoncontent = json.load(f)
            doc = jsoncontent["abstract"]
            if len(doc) > 0:
                doc = remove_stopwords(doc)
                doc = p.stem_sentence(doc)
                words = simple_preprocess(doc, deacc=True)
                yield TaggedDocument(words=words, tags=[jsoncontent['index']])
def preprocess_documents(documents):
    # preprocess each doc
    documents = [preprocess_doc(doc) for doc in documents]
    # stem the documents
    stemmer = PorterStemmer()
    documents = stemmer.stem_documents(documents)
    # split all the documents into lists of tokens
    documents = [doc.split() for doc in documents]
    return documents
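# Hedged usage sketch (added illustration): stem_documents, used by the helper above,
# stems every whitespace-separated word of every string in a list. Assumes gensim is
# installed; the exact stems are shown only approximately.
from gensim.parsing.porter import PorterStemmer

print(PorterStemmer().stem_documents(["the cats are running", "searching documents quickly"]))
# roughly: ['the cat ar run', 'search document quickli']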
def __iter__(self):
    p = PorterStemmer()
    for index, row in self.train_data.iterrows():
        name = row['ScriptLink']
        with open('./movie_scripts/' + name) as file:
            # print("Im here")
            script = file.readlines()
            script = "".join(script)
            script = remove_stopwords(script)
            script = p.stem_sentence(script)
            words = simple_preprocess(script)
            yield TaggedDocument(words=words, tags=[index])
def load_data(tweets_tsv, tweets_postag):
    """
    Return tweet id, user id, tweet label, raw tweets, tokenized tweets, tweets in PoS,
    PoS-tagged tweets and stemmed tweets in a pandas DataFrame.

    :param tweets_tsv: <SID><tab><UID><tab><CLASS><tab><TWITTER_MESSAGE>
    :param tweets_postag: ark-TweetNLP `./runTagger.sh --output-format conll --input-format txt --input-field 4`
    :rtype: pandas.DataFrame
    """
    o = open(tweets_tsv, 'r', encoding='utf-8').readlines()
    p = open(tweets_postag).read()
    raw = p.split('\n\n')
    raw_pos_data = [line.split('\n') for line in raw]
    pos_data = []
    for tweet in raw_pos_data:
        pos_data.append([tuple(word_pos.split('\t')) for word_pos in tweet])
    stemmer = PorterStemmer()
    data = {}
    for idx, line in enumerate(o):
        tweet_id, user_id, adr, text = line.split('\t')
        data[tweet_id] = {}
        data[tweet_id]['user_id'] = user_id
        data[tweet_id]['adr'] = adr
        data[tweet_id]['raw_text'] = text
        data[tweet_id]['stem_text'] = [stemmer.stem(w_pos[0]) for w_pos in pos_data[idx]]
        data[tweet_id]['tok_text'] = [w_pos[0] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_token'] = [w_pos[1] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_text'] = ['#'.join(list(w_pos)) for w_pos in pos_data[idx]]
    df = pd.DataFrame.from_dict(data, orient='index')
    df.adr = df.adr.astype('int')
    df.user_id = df.user_id.astype('int')
    logger.info("Loaded dataframe from {0} and {1}".format(tweets_tsv, tweets_postag))
    logger.info("Dataframe information:\n")
    df.info()
    return df
def main():
    ############################## Setup Code #####################################
    global document_index
    path = "./myroot"
    file3 = open("cmptext.txt", "w+")
    number_of_documents = recursive_read(path, file3)
    file3.close()
    print('All files read')
    file3 = open("cmptext.txt", "r")
    preprocess(file3, number_of_documents)
    file3.close()
    print('All files processed')
    print('Word2Vec begins')
    model = get_word2vec(number_of_documents)  # includes trigrams
    model.save('vocab.txt')
    print('Word2Vec done')
    vocabulary = model.wv.vocab.keys()
    inverted_index = get_inverted_index(vocabulary)
    # copy the keys so empty entries can be deleted while iterating
    for item in list(inverted_index.keys()):
        if not inverted_index[item]:
            del inverted_index[item]
    with open("inverted-index.txt", "wb") as fp:
        pickle.dump(inverted_index, fp)
    fp.close()
    get_tfidf_vectors(inverted_index, number_of_documents)
    get_norms()
    doc_num = 0
    file1 = open("cmptext.txt", "r")
    stemmer = PorterStemmer()
    for document in file1:
        spreprocessed = []
        doc_num += 1
        for line in document.split('. '):
            temp1 = []
            temp2 = []
            temp1 = gensim.utils.simple_preprocess(line, max_len=20)
            for word in temp1:
                if word not in stop_words:
                    temp2.append(word)
            spreprocessed.append(stemmer.stem_documents(temp2))
        with open("spreprocessed" + str(doc_num) + ".txt", "wb") as fp:  # binary mode for pickle
            pickle.dump(spreprocessed, fp)
        fp.close()
        del spreprocessed[:]
    file1.close()
    with open("document-index.txt", "wb") as fp:
        pickle.dump(document_index, fp)
    fp.close()
class Word2VecPipeline(object):

    def open_spider(self, spider):
        # Create an empty model
        w2v = gensim.models.Word2Vec([['seo']], min_count=1)
        self.name = '/tmp/Word2Vec' + str(time.time())
        # Save it
        w2v.save(self.name)
        self.p = PorterStemmer()
        self.stop_words = set(stopwords.words('french'))

    def process_item(self, item, spider):
        if 'title' in item:
            # This time, we don't update the item; instead we build the model.
            document = item.get('title') + ' ' + item.get('body')
            words = [
                word_tokenize(self.p.stem_sentence(s))
                for s in sent_tokenize(document)
            ]
            # Load current model
            w2v = gensim.models.Word2Vec.load(self.name)
            # Train our model
            w2v.build_vocab(words, update=True)
            w2v.train(words, total_examples=w2v.corpus_count, epochs=w2v.iter)
            # Save it for the next item
            w2v.save(self.name)
        return item
def __init__(self,
             docs: List[str],
             index_path: str,
             root: str = "lyrics/") -> None:
    """Initialize Indexer by assigning attributes and opening the index file.

    Args:
        docs: List of document filenames.
        index_path: Path to index file.
        root: Directory where the song lyrics are stored.
    """
    self.root = root
    self.docs = docs
    self.stemmer = PorterStemmer()
    self.get_word_count()
    self.index = shelve.open(index_path)
def __init__(self):
    self.morph = {'ru': MorphAnalyzer(), 'en': PorterStemmer()}
    self.other_significance = 10
    self.stopwords = dict()
    with open('../thirdparty/stop_ru.json', 'r', encoding='utf-8') as f:
        self.stopwords['ru'] = json.load(f)
    with open('../thirdparty/stop_en.json', 'r', encoding='utf-8') as f:
        self.stopwords['en'] = json.load(f)
def assign_country_label_ids(country_scores, label_score):
    """Output: Dictionary --> key = country, value = label"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()
    final_labels = {}
    final_ids = {}
    for row in country_scores.itertuples():
        if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
            final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
            final_ids[row.country] = row.label_id
            used_stems.add(row.stem)
    return final_labels, final_ids
def get_top_labels(country_scores, label_score, num_candidates=5):
    """Output: Dictionary --> key = country, value = list of top labels"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    num_labels_per_country = defaultdict(int)
    top_labels = []
    used_stems = set()
    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if num_labels_per_country[row.country] < num_candidates:
                top_labels.append([row.country, row.label_id,
                                   row.label.lower().replace('_', ' ').strip(),
                                   row.tfidf, row.pmi])
                used_stems.add(row.stem)
                num_labels_per_country[row.country] += 1
    return top_labels
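# Hedged toy illustration (added; not data from the original project): how the 'stem'
# column used by the labeling helpers above is derived, and why near-duplicate labels
# collapse to one stem. Assumes pandas and gensim are installed.
import pandas as pd
from gensim.parsing.porter import PorterStemmer

country_scores = pd.DataFrame({'label': ['Mountains', 'mountain', 'Rivers'],
                               'tfidf': [0.9, 0.8, 0.7]})
country_scores['stem'] = PorterStemmer().stem_documents(
    [str(word) for word in country_scores['label']])
print(country_scores)
# 'Mountains' and 'mountain' share the stem 'mountain', so only the higher-scored
# label survives the used_stems check in the functions above.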
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "wb")
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        preprocess_list2 = []
        for word in preprocess_list1:
            if word not in stop_words:
                preprocess_list2.append(word)
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word.encode('utf-8')))
            fp2.write(' ')
        fp2.write('\n')
    fp1.close()
    fp2.close()
def spimi_invert(
    files: List[str],
    stemmer: PorterStemmer,
    blocks_dir: str,
    memory_available: int,
) -> List[str]:
    """SPIMI-Invert procedure.

    Collect terms, docIDs and term frequencies into a block (a dictionary of
    dictionaries) that fits in the available memory, write each block's
    dictionary to disk, and start a new dictionary for the next block.

    Args:
        files: List of filepaths.
        stemmer: Gensim Porter stemmer.
        blocks_dir: Directory where blocks are saved.
        memory_available: Available memory in bytes.

    Returns:
        List of filenames of saved blocks.
    """
    memory_used = 0
    outputed_blocks = []
    block_index = 0
    dictionary = {}
    for docId, token in token_stream(files):
        memory_used += sys.getsizeof(token)
        term = stemmer.stem(token)
        if term not in dictionary.keys():
            dictionary[term] = {}
        if docId not in dictionary[term].keys():
            dictionary[term][docId] = 0
        dictionary[term][docId] += 1  # save term freq. in document
        if memory_used > memory_available:
            # Sort terms and write to disk
            with shelve.open(blocks_dir + "block" + str(block_index)) as f:
                for k in sorted(dictionary.keys()):
                    f[k] = dictionary[k]
            outputed_blocks.append("block" + str(block_index))
            block_index += 1
            memory_used = 0
            dictionary = {}
    # Save last block
    if dictionary:
        with shelve.open(blocks_dir + "block" + str(block_index)) as f:
            for k in sorted(dictionary.keys()):
                f[k] = dictionary[k]
        outputed_blocks.append("block" + str(block_index))
    return outputed_blocks
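# Hedged driver sketch (added illustration): `token_stream` here is a hypothetical
# stand-in for the project's tokenizer, defined only to show how spimi_invert above
# could be called; the real implementation reads and tokenizes the given files.
import tempfile
from gensim.parsing.porter import PorterStemmer

def token_stream(files):
    # hypothetical: yield (docId, token) pairs for each file
    for doc_id, name in enumerate(files):
        for token in ["an", "example", "stream", "of", "tokens"]:
            yield doc_id, token

blocks = spimi_invert(["doc_a.txt", "doc_b.txt"], PorterStemmer(),
                      tempfile.mkdtemp() + "/", memory_available=1024)
print(blocks)  # e.g. ['block0'] for this tiny stream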
def get_task_topic_dist(ldah, task):
    # clean, stem and tokenize the prompt/task string
    task = clean_text(task)
    task = PorterStemmer().stem_sentence(task)
    tokens = word_tokenize(task)
    tokens = [word for word in tokens if word not in stop_words]
    # compute topic distribution and sort
    dist = get_topic_dist(ldah, tokens)
    dist.sort()
    return dist
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually',
        r'\usepackage{amsbsy', r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    resultlist = []
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                # keep the stemmed form of the token
                tokens.append(p.stem(item))
        yield model.infer_vector(tokens)
def __init__(self):
    # info
    self.cluster_info = dict()
    self.article_info = dict()

    if config_meta['word_tokenizer'] == 'bert':
        self.word_tokenize = config.bert_tokenizer.tokenize
    elif config_meta['word_tokenizer'] == 'nltk':
        self.word_tokenize = nltk.tokenize.word_tokenize
    else:
        raise ValueError('Invalid word_tokenizer: {}'.format(config_meta['word_tokenizer']))

    self.sent_tokenize = nltk.tokenize.sent_tokenize
    self.porter_stemmer = PorterStemmer()
    if config_meta['texttiling']:
        self.para_tokenize = TextTilingTokenizer()

    # base pat
    BASE_PAT = r'(?<=<{0}> )[\s\S]*?(?= </{0}>)'
    BASE_PAT_WITH_NEW_LINE = r'(?<=<{0}>\n)[\s\S]*?(?=\n</{0}>)'
    BASE_PAT_WITH_RIGHT_NEW_LINE = r'(?<=<{0}>)[\s\S]*?(?=\n</{0}>)'

    # query pat
    self.id_pat = re.compile(BASE_PAT.format('num'))
    self.title_pat = re.compile(BASE_PAT.format('title'))
    self.narr_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('narr'))

    # article pat
    self.text_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TEXT'))
    self.graphic_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('GRAPHIC'))
    self.type_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TYPE'))
    self.para_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('P'))

    self.proc_params_for_questions = {
        'rm_dialog': False,
        'rm_stop': False,
        'stem': True,
    }
def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    # text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.lower().split())  # lowercasing required by the stemmer