def preprocess(text):
    # convert text to lower case
    text = text.lower()
    # remove leading/trailing whitespace
    text = text.strip()
    # remove digits
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # remove stopwords
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    # strip punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # strip multiple whitespace that might occur after we remove stopwords
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    # Porter-stem every remaining token
    p = PorterStemmer()
    text = ' '.join(p.stem(word) for word in text.split())
    return text
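# Usage sketch for the pipeline above; the sample sentence is made up and the
# exact output depends on the gensim version (strip_punctuation2 is an older
# alias of strip_punctuation).
import gensim.parsing.preprocessing
from gensim.parsing.porter import PorterStemmer

print(preprocess("The 3 quick brown foxes were jumping over 2 lazy dogs!"))
# expected to print something like: quick brown fox jump lazi dog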
def dataToXYListRead(fileName):
    with open(fileName) as file:
        porter_stemmer = PorterStemmer()
        lineCount = 0
        wordSentenceDbLi = []
        while True:
            # read one line (one JSON review) at a time
            line = file.readlines(1)
            if not line:
                break
            jsonLine = json.loads(line[0])
            # stem each word of the review text
            stemWords = porter_stemmer.stem_sentence(jsonLine['text'])
            tokenWords = simple_preprocess(stemWords, deacc=True)
            wordSentenceDbLi.append(tokenWords)
            lineCount += 1
        return wordSentenceDbLi
def preprocess(data, stem_data, remove_stopwords):
    processed = []
    stemmer = PorterStemmer()
    for file in data:
        # lowercase all text
        file = str(file).lower()
        # remove non-alpha characters
        file = re.sub('[^a-zA-Z]', ' ', file)
        # tokenize articles
        tokenized = word_tokenize(file)
        # remove stop words from tokens
        if remove_stopwords:
            stop_removed_tokens = [word for word in tokenized if word not in stop_words]
        else:
            stop_removed_tokens = tokenized
        # optionally stem the remaining tokens
        if stem_data:
            processed.append([stemmer.stem(token) for token in stop_removed_tokens])
        else:
            processed.append(stop_removed_tokens)
    return processed
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def token_stem(text):
    tokens = simple_preprocess(text, deacc=True)
    porter_stemmer = PorterStemmer()
    stem_tokens = [porter_stemmer.stem(word) for word in tokens]
    return stem_tokens
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    # lowercasing required by the stemmer
    return ' '.join(p.stem(word) for word in text.lower().split())
def document_preprocess(text):
    p = PorterStemmer()
    first = text.encode('ascii', 'ignore').decode('utf-8').lower()
    second = preprocessing.remove_stopwords(first)
    third = preprocessing.strip_punctuation(second)
    fourth = preprocessing.strip_short(preprocessing.strip_numeric(third))
    # stem every word of the cleaned text
    fifth = p.stem_sentence(fourth)
    return fifth
def find_documents(self, term, stemming=False):
    stemmer = PorterStemmer()
    if stemming:
        term = stemmer.stem(term)
    term_id = self.get_id_for_term(term)
    if term_id < 0:
        return set()
    docs = self.get_related_documents(term_id)
    return set(docs)
def cleanText(self, textToClean):
    textLower = str(textToClean).lower()
    # keep only printable (ASCII) characters
    englishText = "".join([char for char in textLower if char in string.printable])
    # drop punctuation
    textNoPunc = "".join([char for char in englishText if char not in string.punctuation])
    textStop = remove_stopwords(textNoPunc)
    porter = PorterStemmer()
    # stem every remaining word, then return the list of tokens
    textStemmed = porter.stem_sentence(textStop)
    return textStemmed.split()
def spimi_invert(
    files: List[str],
    stemmer: PorterStemmer,
    blocks_dir: str,
    memory_available: int,
) -> List[str]:
    """SPIMI-Invert procedure.

    Collect terms, docIDs and term frequencies into a block (dictionary of
    dictionaries) that fits in available memory, write each block's
    dictionary to disk, and start a new dictionary for the next block.

    Args:
        files: List of filepaths.
        stemmer: Gensim porter stemmer.
        blocks_dir: Directory where blocks are saved.
        memory_available: Available memory in bytes.

    Returns:
        List of filenames of saved blocks.
    """
    memory_used = 0
    outputed_blocks = []
    block_index = 0
    dictionary = {}
    for docId, token in token_stream(files):
        memory_used += sys.getsizeof(token)
        term = stemmer.stem(token)
        if term not in dictionary:
            dictionary[term] = {}
        if docId not in dictionary[term]:
            dictionary[term][docId] = 0
        dictionary[term][docId] += 1  # save term frequency in document
        if memory_used > memory_available:
            # Sort terms and write the block to disk
            with shelve.open(blocks_dir + "block" + str(block_index)) as f:
                for k in sorted(dictionary.keys()):
                    f[k] = dictionary[k]
            outputed_blocks.append("block" + str(block_index))
            block_index += 1
            memory_used = 0
            dictionary = {}
    # Save the last (partial) block
    if dictionary:
        with shelve.open(blocks_dir + "block" + str(block_index)) as f:
            for k in sorted(dictionary.keys()):
                f[k] = dictionary[k]
        outputed_blocks.append("block" + str(block_index))
    return outputed_blocks
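# Minimal sketch of calling spimi_invert; the file paths and the memory budget
# are made up, and token_stream is assumed to be the project's helper that
# yields (docId, token) pairs for the given files.
from gensim.parsing.porter import PorterStemmer

files = ["lyrics/song_001.txt", "lyrics/song_002.txt"]
blocks = spimi_invert(files, PorterStemmer(), blocks_dir="blocks/", memory_available=1_000_000)
print(blocks)  # e.g. ['block0', 'block1'], ready to be merged into a single index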
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually',
        r'\usepackage{amsbsy', r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                # keep the stemmed form of the token
                tokens.append(p.stem(item))
        yield model.infer_vector(tokens)
def build_name_index(docs: List[str], stemmer: PorterStemmer) -> None:
    """Build index from list of song names.

    Args:
        docs: List of filenames.
        stemmer: Gensim porter stemmer.
    """
    index_names = defaultdict(dict)
    for docId, doc in enumerate(docs):
        for token in pretty_doc(doc).split():
            term = stemmer.stem(token)
            index_names[term][docId] = 1
    with shelve.open("index_names") as index:
        index.update(index_names)
def load_data(tweets_tsv, tweets_postag):
    """
    Return tweet id, user id, tweet label, raw tweet, tokenized tweet,
    PoS tokens, PoS-tagged tweet and stemmed tweet in a pandas DataFrame.

    :param tweets_tsv: <SID><tab><UID><tab><CLASS><tab><TWITTER_MESSAGE>
    :param tweets_postag: ark-TweetNLP `./runTagger.sh --output-format conll --input-format txt --input-field 4`
    :rtype: pandas.DataFrame
    """
    o = open(tweets_tsv, 'r', encoding='utf-8').readlines()
    p = open(tweets_postag).read()
    raw = p.split('\n\n')
    raw_pos_data = [line.split('\n') for line in raw]
    pos_data = []
    for tweet in raw_pos_data:
        pos_data.append([tuple(word_pos.split('\t')) for word_pos in tweet])
    stemmer = PorterStemmer()
    data = {}
    for idx, line in enumerate(o):
        tweet_id, user_id, adr, text = line.split('\t')
        data[tweet_id] = {}
        data[tweet_id]['user_id'] = user_id
        data[tweet_id]['adr'] = adr
        data[tweet_id]['raw_text'] = text
        data[tweet_id]['stem_text'] = [stemmer.stem(w_pos[0]) for w_pos in pos_data[idx]]
        data[tweet_id]['tok_text'] = [w_pos[0] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_token'] = [w_pos[1] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_text'] = ['#'.join(list(w_pos)) for w_pos in pos_data[idx]]
    df = pd.DataFrame.from_dict(data, orient='index')
    df.adr = df.adr.astype('int')
    df.user_id = df.user_id.astype('int')
    logger.info("Loaded dataframe from {0} and {1}".format(tweets_tsv, tweets_postag))
    logger.info("Dataframe information:\n")
    df.info()
    return df
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "wb")
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        # drop stopwords before stemming
        preprocess_list2 = [word for word in preprocess_list1 if word not in stop_words]
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        # both files are opened in binary mode, so encode before writing
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word).encode('utf-8'))
            fp2.write(b' ')
        fp2.write(b'\n')
    fp1.close()
    fp2.close()
def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    # text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def __init__(self, path):
    """Load the downloaded corpus.

    Parameters
    ----------
    path : string
        Path to the extracted zip file. If 'summaries-gold' is in a folder
        called 'opinosis', then the path parameter would be 'opinosis',
        either relative to your current working directory or absolute.
    """
    path = os.path.join(path, "summaries-gold")
    dictionary = Dictionary()
    corpus = []
    stemmer = PorterStemmer()
    for directory, b, filenames in os.walk(path):
        # each subdirectory of path is one collection of reviews for a specific product;
        # now get the corpus/documents
        for filename in filenames:
            filepath = directory + os.sep + filename
            # read the document, preprocess it and add it to dictionary and corpus
            with open(filepath) as file:
                doc = file.read()
                preprocessed_doc = [
                    stemmer.stem(token)
                    for token in re.findall(r'\w+', doc.lower())
                    if token not in STOPWORDS
                ]
                dictionary.add_documents([preprocessed_doc])
                corpus += [dictionary.doc2bow(preprocessed_doc)]
    # return the results the same way the other corpus-generating functions do
    self.corpus = corpus
    self.id2word = dictionary
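# Hedged usage sketch: assuming the __init__ above belongs to a corpus class
# (called OpinosisCorpus here, which is an assumption), the loaded corpus and
# dictionary can feed a topic model directly.
from gensim.models import LdaModel

opinosis = OpinosisCorpus("opinosis")  # path to the extracted archive
lda = LdaModel(corpus=opinosis.corpus, id2word=opinosis.id2word, num_topics=10)
print(lda.show_topics(num_topics=3))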
from gensim.parsing.porter import PorterStemmer

sentence = ["This", "sentence", "was", "transformed", "using", "Porter", "Stemmer"]
porterStemmer = PorterStemmer()
print(" ".join([porterStemmer.stem(word) for word in sentence]))
def clean(self, stopfile=None, startindex=0, stem=False):
    # need a filename
    if self.__raw_text is None:
        raise FileNotFoundError('No raw text file provided')
    print('Cleaning raw text data...', end='', flush=True)
    # rename the class-level cleaned text filename to the one created here
    cleaned_text = '{}_cleaned.txt'.format(self.__raw_text[:-4])
    # initialize stopwords; default is nltk stopwords
    if stopfile is None:
        stops = set(stopwords.words('english'))
    elif os.path.isfile(stopfile):
        with codecs.open(stopfile, 'r', encoding='utf-8', errors='ignore') as f:
            stops = set([word.strip().lower() for word in f.readlines()])
    else:
        raise Exception('Stopfile not found')
    # this regex object will remove all punctuation
    regex = re.compile(r'[^a-zA-Z0-9\s]|[\_\^\`\[\]\\]', re.IGNORECASE)
    stemmer = PorterStemmer()
    # clean the review file
    with codecs.open(self.__raw_text, 'r', encoding='utf-8', errors='ignore') as f:
        with open(cleaned_text, 'w') as cleaned:
            t1 = time.time()
            # go through every line in the file
            for line in f:
                # remove non-alphanumeric symbols
                line = regex.sub(' ', line)
                # split into tokens and ignore stopwords
                if stem:
                    tokens = [
                        stemmer.stem(word.lower().strip())
                        for word in line.split(' ')
                        if word not in stops
                    ]
                else:
                    tokens = [
                        word.lower().strip()
                        for word in line.split(' ')
                        if word not in stops
                    ]
                # remove empty elements from the list
                tokens = [word for word in tokens if word != '']
                # ignore elements before the start index
                tokens = tokens[startindex:]
                # write cleaned data to file
                if len(tokens) > 0:
                    cleaned.write('{}\n'.format(' '.join(tokens)))
            t2 = time.time()
    # update the text file name
    self.__raw_text = cleaned_text
    print('done')
    # display how long all of this took
    print('Raw text cleaned in {} minutes.'.format(int((t2 - t1) / 60)))
def postprocess_words(words):
    p = PorterStemmer()
    for i in range(len(words)):
        words[i] = p.stem(words[i])
    return words
class nlp_engine:
    def __init__(self):
        # self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # self.model = BertModel.from_pretrained("bert-base-uncased")
        self.use_coref = False
        self.vectorizer = data.vectorize('fast')
        self.stemmer = PorterStemmer()

    def make_multiple_choice(self, word, sentence, ai=False):
        if len(word.split(' ')) == 1:
            if word in sentence.split(' '):
                most = self.vectorizer.most_similar(word.lower(), topn=20)
                choices = [x[0].lower() for x in most]
                tmp = list()
                stems = list()
                tmp.append(word.lower())
                stems.append(self.stemmer.stem(word.lower()))
                # keep only distractors whose stem has not been used yet
                for x in choices:
                    stem = self.stemmer.stem(x.replace('.', ''))
                    if stem not in stems:
                        stems.append(stem)
                        tmp.append(x)
                return {
                    "type": 'mc',
                    "question": sentence.replace(word, '______'),
                    "answer": tmp[:4]
                }
            else:
                return None
        else:
            return None

    def fill_in_blank(self, word, sentence):
        if len(word.split(' ')) == 1:
            if word in sentence.split(' '):
                return {
                    "type": 'fb',
                    "question": sentence.replace(word, '______'),
                    "answer": word
                }
            else:
                return None
        else:
            return None

    def __call__(self, context):
        context_doc = nlp(context)
        ents = context_doc.ents
        sentences_doc = [x.text for x in context_doc.sents]
        self.use_coref = USE_COREF and context_doc._.has_coref
        # cumulative sentence lengths, used to map entity offsets to sentences
        sentence_lengths = [len(sentences_doc[0])]
        for i in range(1, len(sentences_doc)):
            sentence_lengths.append(sentence_lengths[i - 1] + len(sentences_doc[i]))
        ner_spans = list()
        for ent in ents:  # use NER spans as answer candidates
            for i in range(len(sentence_lengths)):
                if ent.start_char < sentence_lengths[i]:
                    ner_spans.append((ent.text, sentences_doc[i]))
        nn_spans = list()
        if self.use_coref:
            for token in context_doc:
                if (token.pos_ == 'PROPN' or token.pos_ == 'NOUN') and token._.in_coref:
                    for cluster in token._.coref_clusters:
                        nn_spans.append((token.text, cluster.main.text))
        # group sentences by answer candidate, skipping duplicates
        sa_pairs = dict()
        for a, s in ner_spans + nn_spans:
            if a not in sa_pairs:
                sa_pairs[a] = [s]
            elif s not in sa_pairs[a]:
                sa_pairs[a].append(s)
        qa_pairs = list()
        mc_pairs = list()
        for w, sents in sa_pairs.items():
            for s in sents:
                o = self.fill_in_blank(w, s)
                if o is not None:
                    qa_pairs.append(o)
                o = self.make_multiple_choice(w, s, False)
                if o is not None:
                    qa_pairs.append(o)
                    mc_pairs.append(o)
        print(len(qa_pairs))
        print(mc_pairs)
        return qa_pairs
class Indexer:
    """Class that implements querying and printing results.

    Attributes:
        root: Directory where song lyrics are stored.
        docs: List of document filenames, used to map docIDs.
        word_count: Length of each document.
        stemmer: Gensim porter stemmer.
        index: Index file descriptor.
    """

    def __init__(self, docs: List[str], index_path: str, root: str = "lyrics/") -> None:
        """Initialize Indexer by assigning attributes and opening the index file.

        Args:
            docs: List of document filenames.
            index_path: Path to index file.
            root: Directory where song lyrics are stored.
        """
        self.root = root
        self.docs = docs
        self.stemmer = PorterStemmer()
        self.get_word_count()
        self.index = shelve.open(index_path)

    def get_word_count(self) -> None:
        """Get length of each document."""
        self.word_count = []
        for _, doc in enumerate(self.docs):
            with open(self.root + doc, "r") as f:
                self.word_count.append(sum(len(line.split()) for line in f))

    def tfidf(self, posting: Dict[int, int]) -> List[Posting]:
        """Calculate tf-idf for documents in a posting list.

        Args:
            posting: Posting list with term frequencies of some term.

        Returns:
            List of (docID, tf-idf score), sorted by docID.
        """
        return [(k, v / self.word_count[k] * log2(len(self.docs) / len(posting)))
                for k, v in sorted(posting.items())]

    def query_boolean(self, tokens: List[str]) -> List[Posting]:
        """Recursively parse a boolean query in DNF.

        Args:
            tokens: List of tokens.

        Returns:
            List of (docID, tf-idf score) of query hits.
        """
        try:
            split_idx = tokens.index("OR")
            return or_postings(
                self.query_boolean(tokens[:split_idx]),
                self.query_boolean(tokens[split_idx + 1:]),
            )
        except ValueError:
            pass
        try:
            split_idx = tokens.index("AND")
            return and_postings(
                self.query_boolean(tokens[:split_idx]),
                self.query_boolean(tokens[split_idx + 1:]),
            )
        except ValueError:
            pass
        try:
            split_idx = tokens.index("NOT")
            return not_postings(self.query_boolean(tokens[split_idx + 1:]),
                                len(self.docs))
        except ValueError:
            pass
        term = self.stemmer.stem(tokens[0])
        try:
            posting = self.tfidf(self.index[term])
        except KeyError:
            return []
        return posting

    def render_file(self, tokens: List[str], filename: str, offset: int = 20) -> None:
        """Print song name and a text snippet.

        Args:
            tokens: List of query tokens.
            filename: Song filename.
            offset: How much to extend the text snippet, in symbols.
        """
        # Print band and song name
        print("\033[4m{}\033[0m:".format(pretty_doc(filename)))
        # Try to find each query term in the song text
        with open(self.root + filename) as f:
            text = "".join(f.readlines())
            lowered_text = text.lower()
            for token in tokens:
                try:
                    w = self.stemmer.stem(token)
                    w_match = re.search(r"\b{}\w*\b".format(w), lowered_text)
                    l_match = re.search(r"\b{}.*?\n".format(w), lowered_text)
                    if w_match.start() > offset:
                        print("...", end="")
                    start = max(0, w_match.start() - offset)
                    print("{}\033[1m{}\033[0m{}".format(
                        text[start:w_match.start()],
                        text[w_match.start():w_match.end()],
                        text[w_match.end():l_match.end() - 1],
                    ))
                except AttributeError:
                    print("-")

    def render(self, tokens: List[str], hits: List[Posting], count: int) -> None:
        """Print the results of a query.

        Args:
            tokens: List of query tokens.
            hits: Query results as a list of (docID, tf-idf score).
            count: How many hits to print.
        """
        if not hits:
            print("Nothing found")
            return
        tokens = [t for t in tokens if t not in ["AND", "OR", "NOT"]]
        print("{} hits found.\n".format(len(hits)))
        for docId, v in hits[:count]:
            print("[relevance = {:.3f}]".format(v))
            self.render_file(tokens, self.docs[docId])
            print()

    def query(self, query: str, count: int = 10) -> None:
        """Query the index and print results, sorted by tf-idf.

        Args:
            query: Query string.
            count: How many hits to print.
        """
        tokens = query.split()
        hits = self.query_boolean(tokens)
        hits = sorted(hits, key=lambda item: item[1], reverse=True)
        self.render(tokens, hits, count)

    def close(self) -> None:
        """Close the index file."""
        self.index.close()
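# Minimal usage sketch for Indexer: the index file, lyrics directory and query
# string are made up, and or_postings/and_postings/not_postings/pretty_doc are
# assumed to be the project's helpers used above.
import os

docs = sorted(os.listdir("lyrics/"))
indexer = Indexer(docs, index_path="index", root="lyrics/")
indexer.query("love AND NOT war", count=5)  # boolean query in DNF, ranked by tf-idf
indexer.close()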
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    # lowercasing required by the stemmer
    return ' '.join(p.stem(word) for word in text.lower().split())
def stem_text(self, text):
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
def normalize(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]