def removeURLsCleanStem(self):          # preprocessor
    ''' Remove URLs and punct, lower case everything,
        Convert '-/-' to 'mut_mut',
        Keep tokens that start w/ letter or _ and are 2 or more chars.
        Stem,
        Replace \n with spaces
    '''
    # This is currently the only preprocessor that uses a stemmer.
    # Would be clearer to import and instantiate one stemmer above,
    # BUT that requires nltk (via anaconda) to be installed on each
    # server we use. This is currently not installed on our linux servers.
    # By importing here, we can use RefSample in situations where we don't
    # call this preprocessor, and it will work on our current server setup.
    global stemmer
    if not stemmer:
        import nltk.stem.snowball as nltk
        stemmer = nltk.EnglishStemmer()
    #------
    def _removeURLsCleanStem(text):
        output = ''
        for s in urls_re.split(text):       # split and remove URLs
            s = featureTransform.transformText(s).lower()
            for m in token_re.finditer(s):
                output += " " + stemmer.stem(m.group())
        return output
    #------
    self.setTitle(_removeURLsCleanStem(self.getTitle()))
    self.setAbstract(_removeURLsCleanStem(self.getAbstract()))
    self.setExtractedText(_removeURLsCleanStem(self.getExtractedText()))
    return self
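# The module-level helpers used above (urls_re, token_re, and
# featureTransform.transformText) are not shown in this excerpt. Judging by
# the docstring and the near-identical SampleRecord.removeURLsCleanStem
# later in this collection, they presumably look something like this sketch:
import re

urls_re = re.compile(r'\bhttps?://\S*', re.IGNORECASE)  # match URLs
token_re = re.compile(r'\b([a-z_]\w+)\b')  # letter/underscore start, 2+ chars

def transformText(s):  # assumed stand-in for featureTransform.transformText
    return s.replace('-/-', ' mut_mut ')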
def stem(token):
    token_stem = token.decode('utf-8', 'ignore')
    try:
        # note: stemming includes lower casing
        token_stem = snowball.EnglishStemmer().stem(token_stem.lower())
    except:
        pass
    return token_stem
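# A new EnglishStemmer is constructed on every call above. A minimal sketch
# (assumption: callers can share one instance, since NLTK's snowball stemmers
# keep no per-call state) that reuses a module-level stemmer instead:
from nltk.stem import snowball

_shared_stemmer = snowball.EnglishStemmer()

def stem_shared(token):
    try:
        # snowball's stem() also lower-cases its input
        return _shared_stemmer.stem(token)
    except Exception:
        return token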
def get_stemmer(stemmer_type):
    if stemmer_type == 'lancaster':
        stemmer = LancasterStemmer()
    elif stemmer_type == 'porter':
        stemmer = PorterStemmer()
    else:
        stemmer = snowball.EnglishStemmer()
    return stemmer
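# Hedged usage example for get_stemmer(); per NLTK's documentation, the
# snowball ('english') stemmer is a refinement of porter, so they can
# disagree on the same token:
from nltk.stem import LancasterStemmer, PorterStemmer, snowball

for kind in ('lancaster', 'porter', 'snowball'):
    print(kind, get_stemmer(kind).stem('generously'))
# porter prints 'gener'; snowball prints 'generous'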
def __init__(self, language='english'):
    self.documents = None
    self.training_queries = None
    self.validation_queries = None
    self.test_queries = None
    self.language = language
    self.stemmer = snowball.EnglishStemmer()
    self.stop_words = set(stopwords.words('english'))
def stemmed_count(words):
    stemmed_to_variant_counter = collections.defaultdict(collections.Counter)
    stemmer = snowball.EnglishStemmer()
    for word in words:
        normalized_word = stemmer.stem(word).lower()
        stemmed_to_variant_counter[normalized_word][word] += 1
    return flatten_dict_of_counts(stemmed_to_variant_counter)
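# Quick sanity check of the grouping step above (flatten_dict_of_counts is a
# helper from the original module and is not shown here):
from nltk.stem import snowball

_st = snowball.EnglishStemmer()
assert _st.stem('running') == _st.stem('runs') == 'run'  # variants collapse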
def __init__(self, k=5, language='english'):
    """
    documents: pandas.DataFrame, [docid] => text string
    k: number of folds for the queries
    """
    self.documents = None
    self.k = k
    self.language = language
    self.stemmer = snowball.EnglishStemmer()
    self.stop_words = set(stopwords.words('english'))
def word_count():
    """
    Get the word count of each word in the corpus (to be used for
    calculating vocabulary richness). Note that this uses the results of
    the SENNA tagger for POS, not the Stanford Tagger.
    """
    count = {}  # map words to counts
    # for stemming words so that different forms aren't counted separately
    stemmer = snowball.EnglishStemmer()
    stoplist = set(
        """( ) : , . { } [ ] ; . ' " ! ? @ # $ % * \ + what at "n't" if for
        a an on of the and to from in by or either neither so where there
        those these this that it which who whose but be is have should
        would it about into 've he she them i i. w. hi him her my me you
        your their our we with 't 's then than when have not""".split())
    dn = '/Volumes/Seagate Slim/litlab/tagged/'
    books = [fn for fn in os.listdir(dn) if fn.endswith('_tagged.txt')]
    for book in books:
        print book
        with open(dn + book) as f:
            sent_count = 0
            # skip the first three sentences (which is just meta-data)
            while sent_count < 3:
                line = f.readline().strip()
                if not line:
                    sent_count = sent_count + 1
            # each token is on a separate line
            for line in f:
                if line.strip():
                    token, pos = line.split()[:2]
                    parenth_replacements = {'(': '-LRB-', ')': '-RRB-'}
                    token = parenth_replacements.get(token, token)
                    # handle edge case that you forgot to handle earlier
                    if token.startswith('grey-'):
                        token = 'gray-' + token[5:]
                    # for colors or things that share a name with a color
                    is_color = (token.startswith("__COLOR__")
                                and token.endswith("__COLOR__"))
                    tokens = (token[9:-9] if is_color else token).split("_")
                    for t in tokens:
                        # lowercase and stem words for normalization
                        t = stemmer.stem(t.lower())
                        # exclude proper nouns, common words, and punctuation
                        if t not in stoplist:
                            if t not in count:
                                count[t] = 1
                            else:
                                count[t] = count[t] + 1
    pickle.dump(count, open('word_count', 'w'))
def word_to_id(word, index):
    """Returns the id of the well-formatted version of 'word'.
    If 'word' is not in 'index', puts it in, and returns the new id."""
    word = word.decode('utf-8').lower()
    word = word.strip("\"")
    word = word.split("\'s")[0]
    esb = snowball.EnglishStemmer()
    word = esb.stem(word)
    try:
        return index[word]
    except KeyError:
        id_ = len(index)
        index[word] = id_
        return id_
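# Hedged usage example (Python 2, given the .decode call above):
#   index = {}
#   word_to_id('Dogs', index)    # -> 0; index is now {'dog': 0}
#   word_to_id('"dogs"', index)  # -> 0 again: quotes stripped, same stem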
def __init__(self):
    self.patt = re.compile(
        r'(?u)\w+://[\w\./#]+|&\w+;|\s+-\s+|\s+:\s+|\s+|[^\w\'-]+')
    self.stopwords = []
    self.stemmer = snowball.EnglishStemmer()
    if 'stopwords.txt' in os.listdir():
        try:
            with open('./stopwords.txt', encoding='utf-8') as f:
                self.stopwords = eval(f.read())
        except Exception:
            self.stopwords = stopwords.words('english')
    else:
        self.stopwords = stopwords.words('english')
def get_all_words():
    if 'all_words' not in datavars and os.path.exists(FILENAME):
        with open(FILENAME, 'r') as datafile:
            datavars.update(json.load(datafile))
    if 'all_words' not in datavars:  # even after loading file
        stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
        all_words = sorted(
            set(stemmer.stem(w) for w in corpus.words()
                if w.isalnum() and len(w) > 3)
            - set(stopwords.words()))
        datavars['all_words'] = all_words
        with open(FILENAME, 'w') as datafile:
            json.dump(datavars, datafile, indent=4)
    return datavars['all_words']
def calc_vocab_score(tokens):
    """
    Return a float that represents the richness of the vocabulary in the
    sentence. The more unique the vocabulary, the higher the score (all
    scores are positive). The score is calculated as follows: for every
    word w in the sentence,

        score = \sum{ log( ||C|| / w_c ) } / num_used,

    where w_c is the frequency of w in the corpus, ||C|| is the total
    corpus size, and num_used is the number of tokens found in the corpus.
    This favours the use of rare words over penalization of the use of
    common words.
    """
    # for stemming words so that different forms aren't counted separately
    stemmer = snowball.EnglishStemmer()
    score = 0
    num_used = 0  # number of tokens actually used in the calculation
    for token in tokens:
        token = stemmer.stem(token.lower())
        # this check automatically excludes stopwords and pronouns
        if token in corpus_count:
            score = (score + math.log(corpus_count['_TOTAL_'])
                     - math.log(corpus_count[token]))
            num_used = num_used + 1
    return score / max(float(num_used), 1)
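# Worked example of the formula above, assuming a (hypothetical)
# corpus_count of {'_TOTAL_': 1000, 'run': 10, 'cat': 100}: tokens stemming
# to ['run', 'cat'] score (log(1000/10) + log(1000/100)) / 2
# = (log 100 + log 10) / 2 ~= (4.61 + 2.30) / 2 ~= 3.45.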
def preprocess_text(tokens):
    wnl = nltk.WordNetLemmatizer()
    st = snowball.EnglishStemmer()
    # remove stopwords, punctuation, non-alphabetic characters;
    # stem and lemmatize
    sentences = []
    for i in range(len(tokens)):
        words = [w for w in tokens[i]
                 if w.lower() not in stopwords.words('english')
                 and w not in string.punctuation]
        words = [w for w in words if w.isalpha()]
        words = [st.stem(w) for w in words]
        # need to exclude 'pos' from lemmatization
        for j in range(len(words)):
            if words[j] != 'pos':
                words[j] = wnl.lemmatize(words[j])
        sentences.append(words)
    return sentences
from nltk import pos_tag
from nltk.stem import snowball, WordNetLemmatizer, PorterStemmer, SnowballStemmer

stemmer = snowball.EnglishStemmer()

import re
import sys

from myModule import extraPrograms
from myModule.objects import bidict
import numpy as np
from nltk import wordnet
from nltk.tokenize import word_tokenize
from chemtok import ChemTokeniser

wordnet = wordnet.wordnet

nltk_pos = [
    'LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP',
    'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.',
    ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR',
    'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC',
    'CD', 'POS', '#'
]
pos2idx = bidict({pos: i for (i, pos) in enumerate(nltk_pos)})


class chemtok:
    def __init__(self, kwargs={}):
        self.kwargs = kwargs

    def __call__(self, sents):
        return [ChemTokeniser(sent, **self.kwargs).getTokenStringList()
                for sent in sents]
def process_words(text, language=None, stem=True, to_ascii=True,
                  character_level=False):
    if language is None:
        translator = Translator()
        if isinstance(text, list):
            language = translator.detect(text)[0].lang
    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)
    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD', subtext)
                .encode('ascii', 'ignore').decode("ascii")
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = (unicodedata.normalize('NFKD', text)
                    .encode('ascii', 'ignore').decode("ascii"))
        text = text.lower()
    # pick a language-specific stemmer; tokenisation is the same either way
    if language == "ro":
        stemmer = snowball.RomanianStemmer()
    elif language == "it":
        stemmer = snowball.ItalianStemmer()
    elif language == "en":
        stemmer = snowball.EnglishStemmer()
    if isinstance(text, list):
        words = [nltk.word_tokenize(subtext) for subtext in text]
    else:
        words = nltk.word_tokenize(text)
    procced_text = []
    stopw = []
    if language in stopwords:
        stopw = stopwords[language]
    if isinstance(text, list):
        for sent in words:
            sentence = []
            for word in sent:
                word = keep_only_letters(word)
                if word not in stopw:
                    if character_level:
                        sentence += list(word)
                    elif stem:
                        sentence.append(stemmer.stem(word))
                    else:
                        sentence.append(word)
            procced_text.append(sentence)
    else:
        # the single-string path filters stopwords but does not stem
        for word in words:
            word = keep_only_letters(word)
            if word not in stopw:
                if character_level:
                    procced_text += list(word)
                else:
                    procced_text.append(word)
    return procced_text
def process_for_named_entity(text, language, to_ascii=True, stem=False,
                             shorten=False):
    if language == "ro":
        if isinstance(text, list):
            text = [replace_diactitics(subtext) for subtext in text]
        else:
            text = replace_diactitics(text)
    if isinstance(text, list):
        if to_ascii:
            text = [
                unicodedata.normalize('NFKD', subtext)
                .encode('ascii', 'ignore').decode("ascii")
                for subtext in text
            ]
        text = [subtext.lower() for subtext in text]
    else:
        if to_ascii:
            text = (unicodedata.normalize('NFKD', text)
                    .encode('ascii', 'ignore').decode("ascii"))
        text = text.lower()
    # pick a language-specific stemmer; tokenisation is the same either way
    if language == "ro":
        stemmer = snowball.RomanianStemmer()
    elif language == "it":
        stemmer = snowball.ItalianStemmer()
    elif language == "en":
        stemmer = snowball.EnglishStemmer()
    if isinstance(text, list):
        words = [nltk.word_tokenize(subtext) for subtext in text]
    else:
        words = nltk.word_tokenize(text)
    procced_text = []
    if isinstance(text, list):
        for sent in words:
            sentence = []
            for word in sent:
                word = re.sub("[^a-z0-9]", "", word)
                if word != '':
                    sentence.append(stemmer.stem(word) if stem else word)
            procced_text.append(sentence)
    else:
        for word in words:
            word = re.sub("[^a-z0-9]", "", word)
            if word != '':
                if stem:
                    word = stemmer.stem(word)
                procced_text.append(word)
    if isinstance(text, list):
        for i in range(len(procced_text)):
            company_name = procced_text[i]
            if len(company_name) > 0 and company_name[0] != 'null':
                # legal-suffix stripping below is disabled ('if False')
                if False and company_name[-1] in ['srl', 'ltd', 'spa',
                                                  'ltda', 'sl', 'snc']:
                    contracted = ' '.join(company_name[:-1])
                    if (not check_if_text_in_language(company_name[:-1])
                            and len(contracted) > 6
                            and not is_number(contracted)
                            and contracted not in [
                                'data', 'aprile', 'group', 'azienda',
                                'profilo', 'alumino', 'stato', 'roma',
                                'service', 'area', 'estate', 'date 4',
                                'work', 'altre', 'italia', 'stage',
                                'ottobre 2008', 'strada', '16 luglio',
                                'espresso', 'export', 'prime', 'sala',
                                'panelli']):
                        del company_name[-1]
                    if shorten:
                        while len(contracted) > 23:
                            if len(contracted) - len(company_name[0]) < 15:
                                break
                            del company_name[0]
                            contracted = ' '.join(procced_text)
            procced_text[i] = ' '.join(company_name)
    else:
        if len(procced_text) > 0 and procced_text[0] != 'null':
            # legal-suffix stripping below is disabled ('if False')
            if False and procced_text[-1] in ['srl', 'ltd', 'spa', 'ltda',
                                              'sl', 'snc']:
                contracted = ' '.join(procced_text[:-1])
                if (not check_if_text_in_language(procced_text[:-1])
                        and len(contracted) > 6
                        and not is_number(contracted)
                        and contracted not in [
                            'data', 'aprile', 'group', 'azienda', 'profilo',
                            'allumino', 'stato', 'roma', 'service', 'area',
                            'estate', 'metalmeccanica', 'date 4', 'work',
                            'castel', 'altre', 'italia', 'controlo qualita',
                            'stage', 'ottobre 2008', 'atena', 'strada',
                            '16 luglio', 'industriale', 'espresso',
                            'export', 'prime', 'sala', 'panelli']):
                    del procced_text[-1]
                if shorten:
                    while len(contracted) > 23:
                        if len(contracted) - len(procced_text[0]) < 15:
                            break
                        del procced_text[0]
                        contracted = ' '.join(procced_text)
        procced_text = ' '.join(procced_text)
    return procced_text
def tokenizer(string):
    stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
    regex = re.compile(r'\w\w+')
    return tuple(stemmer.stem(w) for w in regex.findall(string))
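# Quick check of the tokenizer above (stems of the \w\w+ matches):
print(tokenizer("Running dogs run quickly"))
# expected: ('run', 'dog', 'run', 'quick')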
def process(directory, save=".", threshold=0.7, include=False, subdirs=True):
    try:
        threshold = float(threshold)
    except ValueError:
        print("Invalid threshold value provided.")
        print("Using default 0.7")
        threshold = 0.7
    if threshold > 1:
        threshold /= 100

    # get filepaths
    filepaths = generate_file_list(directory, subdirs)

    # terminate program if no paths
    if len(filepaths) < 1:
        print("No files found")
        return "No files found"

    # terminate if only one file
    if len(filepaths) == 1:
        print("The comparison process requires at least two documents.")
        return "The comparison process requires at least two documents."

    # invalid files are reported in the error report;
    # valid files are processed further
    containers, invalid_files = list(), list()

    # record vocabulary
    vocabulary = list()

    stemmer = snowball.EnglishStemmer()
    for index, path in enumerate(filepaths, 1):
        try:
            file_contents = read_file(path)  # file can be opened
        except Exception:  # file cannot be opened
            print("File %s could not be opened\n" % path)
            invalid_files.append((path, "File could not be opened."))
            continue

        if file_contents is False:  # file contents cannot be read
            invalid_files.append((path, "File is not in an acceptable format."))
            continue

        # file contents can be read
        print("\n")
        print("Parsing %s" % path)
        c = Container()
        c.set_path(path)
        c.set_index(index)

        # store raw file contents
        c.set_raw_text(file_contents)

        # retrieve named entities
        c.set_named_entities(get_named_entities(c.get_raw_text()))

        # tokenise file contents
        c.set_token_list(word_tokenize(c.get_raw_text()))

        # only keep token stems if tokens aren't punctuation or stop words
        normalised = [stemmer.stem(token.lower())
                      for token in c.get_token_list()
                      if token not in punctuation
                      and token.lower() not in stopwords.words("english")]

        # add named entities to the list of normalised tokens
        normalised.extend(c.get_named_entities())
        c.set_normalised_text_list(normalised)

        # cannot compare files with no text; report error and skip
        if len(c.get_normalised_text_list()) < 1:
            print("File %s is either empty or contains no significant terms."
                  % path)
            invalid_files.append(
                (path, "File is either empty or contains no significant terms"))
            continue

        # add any new words to the global vocabulary
        for token in c.normalised:
            if token not in vocabulary:
                vocabulary.append(token)

        # store container to allow further processing later
        containers.append(c)
        print("File %s parsed" % path)

    # refuse further processing if there aren't at least two files
    # containing text
    if len(containers) < 2:
        return "At least two documents containing text are required."
    # collect normalised documents in a single list for the idf calculation
    print("\nGathering normalised documents...")
    normalised_documents = [container.get_normalised_text_list()
                            for container in containers]

    # calculate term frequencies and inverse document frequencies
    # for each term in each file
    print("Calculating inverse document frequencies...")
    inverse_document_frequencies = dict()
    for word in vocabulary:
        inverse_document_frequencies[word] = calculate_idf(
            word, normalised_documents)

    print("Calculating term frequencies...")
    for container in containers:
        term_freqs = dict()
        for word in vocabulary:
            term_freqs[word] = (container.normalised.count(word)
                                / len(container.normalised))
        container.set_term_frequencies(term_freqs)

    # shallow-copying the already-existing idf dict is faster than
    # recreating it each time
    for container in containers:
        container.set_inverse_document_frequencies(
            inverse_document_frequencies.copy())

    print("Comparing documents...")
    results = []
    threshold *= 100

    # calculate similarity for every pair of documents
    for x in range(0, len(containers)):
        for y in range(x + 1, len(containers)):
            container_x = containers[x]
            container_y = containers[y]
            vector_x, vector_y = create_dense_vectors(container_x, container_y)
            if len(vector_x) == 0 or len(vector_y) == 0:
                similarity = 0.0
            else:
                similarity = round(
                    cosine_similarity(vector_x, vector_y) * 100, 2)
            if similarity >= threshold or include:
                results.append((container_x.get_path(),
                                container_y.get_path(),
                                similarity))

    print("Ordering results...")
    results = order_similarity_tuples(results)

    print("Generating report...")
    # generate html string to write to file
    string = generate_html_string(results, invalid_files, threshold)

    # write html string to file
    write_path = "%s/similarity_report.html" % save
    try:
        write_string_to_file(string, write_path)
    except FileNotFoundError:
        write_path = "./similarity_report.html"
        write_string_to_file(string, write_path)

    print("Report generated!")
    return "Success"
def __init__(self, k=5, language='english'):
    self.documents = None
    self.k = k
    self.language = language
    self.stemmer = snowball.EnglishStemmer()
    self.stop_words = set(stopwords.words('english'))
from bs4 import BeautifulSoup, NavigableString, Tag
from nltk.stem import snowball
import re
import datetime
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split

stemmers = [snowball.FrenchStemmer(), snowball.EnglishStemmer()]


def _vectorizer(tfidf):
    if tfidf:
        return TfidfVectorizer(analyzer="word",
                               strip_accents='unicode',
                               tokenizer=None,
                               encoding=u'utf-8',
                               preprocessor=None,
                               stop_words=None,
                               max_features=5000)
    else:
        return CountVectorizer(analyzer="word",
                               strip_accents='unicode',
                               tokenizer=None,
                               encoding=u'utf-8',
                               preprocessor=None,
                               stop_words=None,
                               max_features=5000)


def preprocess_posts(data, with_stemmer):
    print "Preprocessing posts..."
# Spanish version of preprocessing
# Bing translator Spanish->English
# English stopwords
# English SnowballStemmer
import sys
import re
from nltk.corpus import stopwords
from mstranslator import Translator
# English and Spanish stemmers available
from nltk.stem import snowball
import string
import regex

# English because we translate first
stemmer = snowball.EnglishStemmer(ignore_stopwords=False)
translator = Translator('60864ac93121426d8fbbb1e2581a8c3e')
stop_words_list = []
flat_stop_words_list = []
exclusion_list_en_es = []
has_hashtag_or_mention = [False] * 800
punctuation = []
punctuation.append(list(string.punctuation[2:6]))
punctuation.append(string.punctuation[9])
punctuation.append(list(string.punctuation[20:22]))


def make_stop_words_list():
    # exclude words which are in both dictionaries
def test_stem_document(self):
    before = ["Computer", "Science"]
    after = TextProcessing.stem_document(before, snowball.EnglishStemmer())
    assert after[0] == "comput" and after[1] == "scienc"
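# A minimal sketch of the stem_document() helper this test exercises; the
# real TextProcessing module is not shown, so this is an assumption
# consistent with the assertion above:
def stem_document(tokens, stemmer):
    # snowball's stem() lower-cases, so "Computer" -> "comput"
    return [stemmer.stem(token) for token in tokens]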
def __getStemmer(language):
    return {
        'russian': snowball.RussianStemmer(),
        'english': snowball.EnglishStemmer()
    }.get(language)
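# Usage note: dict.get() returns None for any language other than 'russian'
# or 'english', so callers must handle that case, e.g.:
#   stemmer = __getStemmer(language) or snowball.EnglishStemmer()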
def main(out_dir, source, years):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    # gets the urls of the 1gram datafiles
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    # These are dicts that contain the occurrence of a word in each year
    year_counts = {}
    year_doc_counts = {}
    year_pos = {}
    for year in years:
        # counts the occurrence of a word (distinguishing words by pos)
        year_pos[year] = {}
        # counts the occurrence of a word (not distinguishing words by pos)
        year_counts[year] = {}
        # counts the books where the word occurred
        year_doc_counts[year] = {}

    print "Start loop"
    for url in urls:  # iterates through the urls
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        print "Downloading", name

        success = False
        while not success:  # downloads the actual datafile
            with open(out_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print "Unzipping", name
        # unzips the downloaded datafile
        subprocess.call(['gunzip', '-f', out_dir + name + '.gz', '-d'])

        print "Going through", name
        # iterates through the lines of the datafile and counts the
        # occurrence of the words
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_")
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    if not word.isalpha():
                        continue
                    esb = snowball.EnglishStemmer()
                    word = str(esb.stem(word))
                    year = int(split[1])
                    doc_count = int(split[3])
                    if not year in years:
                        continue
                    if not word in year_counts[year]:
                        year_counts[year][word] = 0
                        year_doc_counts[year][word] = 0
                        year_pos[year][word] = collections.Counter()
                    year_counts[year][word] += count
                    year_doc_counts[year][word] += doc_count
                    year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                    pass

        print "Deleting", name
        # deletes the downloaded files
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.gz')
        except:
            pass

    print "Writing..."
    # writes the data into pkl files
    for year in years:
        ioutils.write_pickle(year_counts[year],
                             out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts[year],
                             out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos[year],
                             out_dir + str(year) + "-pos.pkl")
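# Note: the loop above rebuilds an EnglishStemmer for every qualifying line.
# A sketch of the usual fix is constructing it once before iterating:
#   esb = snowball.EnglishStemmer()  # hoisted out of the per-line loop
#   ...
#   word = str(esb.stem(word))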
def stem(word, stemmer=snowball.EnglishStemmer()):
    """Stem a word using Snowball by default."""
    return stemmer.stem(word)
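# The default argument is evaluated once at definition time, so every call
# shares a single EnglishStemmer instance (fine here: the stemmer keeps no
# state between calls). Hedged usage:
#   stem('running')                                # 'run'
#   stem('corriendo', snowball.SpanishStemmer())   # swap in another language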
# Stemming....
# Probably best to preprocess the whole data set once
# and stem it (and remove URLs) if stemming makes a big enough difference.
#
# Stemming in Vectorizer subclasses:
# See: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
# This is subtle:
# Vectorizers have a build_preprocessor() method that returns a
# preprocessor() function.
# The preprocessor() function is called for each document (string) to do
# any preprocessing, returning a string.
# What we do here: subclass each of the common Vectorizers
# and override the build_preprocessor() method to return a stemming
# preprocessor function.
# ---------------------------
stemmer = nltk.EnglishStemmer()
token_re = re.compile(r"\b([a-z_]\w+)\b", re.IGNORECASE)  # match words


class StemmedCountVectorizer(CountVectorizer):
    def build_preprocessor(self):  # override super's build_preprocessor
        ''' Return preprocessor function that stems. '''
        # get the super class's preprocessor function for this object.
        preprocessor = super(type(self), self).build_preprocessor()
        # Tokenize and stem the string returned by the super's preprocessor
        # method.
        # This should stem all words in {bi|tri|...}grams and preserve any
        # functionality implemented in the preprocessor.
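# A self-contained sketch of the override the comments above describe; this
# is an assumption about how the truncated method body finishes, wrapping
# the parent's preprocessor with tokenize-and-stem (it reuses the
# module-level stemmer and token_re defined just above):
class StemmedCountVectorizerSketch(CountVectorizer):
    def build_preprocessor(self):
        preprocessor = super().build_preprocessor()

        def stemming_preprocessor(doc):
            # stem every token in the already-preprocessed document string
            return ' '.join(stemmer.stem(m.group())
                            for m in token_re.finditer(preprocessor(doc)))

        return stemming_preprocessor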
class SampleRecord(object):
    """
    Represents a training sample or a sample to predict.
    A training sample has a known class that it belongs to.
    A sample to predict may or may not have a known class (sometimes we do
    predictions on samples for which we know what class they belong to).

    Knows how to take a text representation of a record (typically a text
    string with delimited fields) and parse it into its fields.

    Provides various methods to preprocess a sample record (if any).

    A SampleRecord can be marked as "reject". Has rejectReason, ...
    """
    def __init__(self, s):
        self.rejected = False
        self.rejectReason = None
        self.parseInput(s)

    #----------------------
    def parseInput(self, s):
        fields = s.split(FIELDSEP)
        if len(fields) == 6:  # have known class name as 1st field
            self.knownClassName = fields[0]
            fields = fields[1:]
        else:
            self.knownClassName = None
        self.ID = str(fields[0])
        self.isDiscard = str(fields[1])
        self.status = fields[2]
        self.journal = fields[3]
        self.doc = self.constructDoc(fields[4])

    #----------------------
    def constructDoc(self, text):
        # Do what needs to be done to construct the text portion
        return text

    #----------------------
    def getSampleAsText(self):
        # return this record as a text string
        if self.rejected:
            return None
        if self.knownClassName:
            fields = [self.knownClassName]
        else:
            fields = []
        fields += [
            self.ID,
            self.isDiscard,
            self.status,
            self.journal,
            self.doc,
        ]
        return FIELDSEP.join(fields) + RECORDSEP

    #----------------------
    def getSampleName(self):
        return self.ID

    def getDiscard(self):
        return self.isDiscard

    def getStatus(self):
        return self.status

    def getJournal(self):
        return self.journal

    def getDocument(self):
        return self.doc

    def getKnownClassName(self):
        return self.knownClassName

    def isReject(self):
        return self.rejected

    def getRejectReason(self):
        return self.rejectReason

    #----------------------
    # "Preprocessor" functions.
    # Each preprocessor should modify this sample and return itself.
    #----------------------
    refRemover = RefSectionRemover(maxFraction=0.4)  # finds ref sections

    def removeRefSection(self):
        self.doc = SampleRecord.refRemover.getBody(self.doc)
        return self

    # ---------------------------
    miceRegex = re.compile(r'\bmice\b', flags=re.IGNORECASE)

    def rejectIfNoMice(self):
        if not SampleRecord.miceRegex.search(self.doc):
            self.rejected = True
            self.rejectReason = "Mice not found"
        return self

    # ---------------------------
    urls_re = re.compile(r'\bhttps?://\S*', re.IGNORECASE)  # match URLs
    token_re = re.compile(r'\b([a-z_]\w+)\b')  # match lower case words
    stemmer = nltk.EnglishStemmer()

    def removeURLsCleanStem(self):
        ''' Remove URLs and punct, lower case everything,
            Convert '-/-' to 'mut_mut',
            Keep tokens that start w/ letter or _ and are 2 or more chars.
            Stem,
            Replace \n with spaces
        '''
        output = ''
        for s in SampleRecord.urls_re.split(self.doc):  # split, remove URLs
            s = s.replace('-/-', ' mut_mut ').lower()
            for m in SampleRecord.token_re.finditer(s):
                output += " " + SampleRecord.stemmer.stem(m.group())
        self.doc = output
        return self

    # ---------------------------
    def removeURLs(self):
        ''' Remove URLs, lower case everything,
            Convert '-/-' to 'mut_mut',
        '''
        output = ''
        for s in SampleRecord.urls_re.split(self.doc):
            s = s.replace('-/-', ' mut_mut ').lower()
            output += ' ' + s
        self.doc = output
        return self

    # ---------------------------
    def addJournalFeature(self):
        ''' add the journal name as a text token to the document '''
        jtext = 'journal__' + '_'.join(self.journal.split(' ')).lower()
        self.doc += " " + jtext
        return self

    # ---------------------------
    def truncateText(self):
        # for debugging, so you can see a sample record easily
        self.doc = self.doc[:50]
        return self