def __init__(self, filename):
    """Remember the corpus path and build the tokenizing helpers.

    filename: path to the corpus file iterated elsewhere.
    NOTE(review): reads the module-level ``stemming`` flag — confirm it is
    defined before this is instantiated.
    """
    self.filename = filename
    # Word filter: letters with at most one internal apostrophe.
    self.alpha_re = re.compile( "^[a-zA-Z]+'?[a-zA-Z]*?$" )  # allow single apostrophes but not double apostrophes: note, this doesn't allow 'ere
    if stemming:
        # Only created when stemming is enabled — self.stemmer does not
        # exist otherwise.
        self.stemmer = PorterStemmer()
    self.treebank_word_tokenizer = TreebankWordTokenizer()
class MySentences(object):
    """Iterable corpus: each iteration yields one sentence as a list of
    lower-cased word tokens (or their Porter stems when the module-level
    ``stemming`` flag is set).

    Designed for streaming consumers (e.g. gensim-style trainers) that
    re-iterate the corpus multiple times.
    """

    def __init__(self, filename):
        """Remember the corpus path and build the tokenizing helpers.

        filename: path to a tab-separated file, one sentence per line.
        """
        self.filename = filename
        # Letters with at most one internal apostrophe.
        self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")  # allow single apostrophes but not double apostrophes: note, this doesn't allow 'ere
        if stemming:
            self.stemmer = PorterStemmer()
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        # TODO: use http://www.nltk.org/howto/stem.html

    def __iter__(self):
        """Yield one token (or stem) list per line of the file."""
        # FIX: the original did `for line in open(...)` and leaked the file
        # handle, and called str.decode("UTF8"), which only works on
        # Python 2. codecs.open in a `with` block closes the handle and
        # yields already-decoded text on both Python 2 and 3. (`codecs` is
        # already imported at module level — it is used for the stopword
        # file elsewhere in this script.)
        with codecs.open(self.filename, "r", encoding="UTF8") as corpus:
            for line in corpus:
                # TODO find a better way to distinguish sentence-initial caps from proper noun
                # Lines look like "<id>\t<sentence>", e.g.:
                #   80\t10:11 p.m., an unwanted person was reported on College Avenue.
                # Drop the id column, then blank out periods so abbreviations
                # like "p.m." don't survive as tokens.
                sentence = line.split("\t", 1)[-1].replace(".", ' ')
                words = [
                    word.lower()
                    for word in self.treebank_word_tokenizer.tokenize(sentence)
                    # use the precompiled pattern directly instead of re.match()
                    if self.alpha_re.match(word)
                ]
                if stemming:
                    yield [self.stemmer.stem(word) for word in words]
                else:
                    yield words
class MySentences(object):
    """Streamable corpus wrapper: iterating yields one tokenized,
    lower-cased sentence at a time (stemmed when the module-level
    ``stemming`` flag is on).
    """

    def __init__(self, filename):
        """Keep the corpus path and set up tokenization helpers."""
        self.filename = filename
        # Single internal apostrophe permitted ("can't"); doubles rejected.
        # Note this also rejects leading-apostrophe words such as 'ere.
        self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")
        if stemming:
            self.stemmer = PorterStemmer()
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        # TODO: use http://www.nltk.org/howto/stem.html

    def __iter__(self):
        """Yield a list of word tokens (or stems) per input line."""
        tokenize = self.treebank_word_tokenizer.tokenize
        for raw in open(self.filename):
            # TODO find a better way to distinguish sentence-initial caps
            # from proper nouns. Rows look like "<id>\t<text>", e.g.
            #   80  10:11 p.m., an unwanted person was reported on College Avenue.
            # Keep only the text part; blank out periods.
            text = raw.decode("UTF8").split("\t", 1)[-1].replace(".", ' ')
            cleaned = []
            for tok in tokenize(text):
                if re.match(self.alpha_re, tok):
                    cleaned.append(tok.lower())
            if stemming:
                yield [self.stemmer.stem(w) for w in cleaned]
            else:
                yield cleaned
# Command line: argv[1] = input folder, argv[2] (optional) = output prefix.
prefix = ''
if len(sys.argv) > 2:
    prefix = sys.argv[2]
in_folder = sys.argv[1]


def ensure_dir(f):
    """Create the directory portion of path *f* if it does not exist.

    FIX: the original checked os.path.exists() and then called makedirs(),
    which races — another process could create the directory in between
    and makedirs() would raise. Attempt the creation and tolerate the
    directory already existing (works on Python 2 and 3).
    """
    d = os.path.dirname(f)
    if d:
        try:
            os.makedirs(d)
        except OSError:
            # Re-raise anything other than "already exists as a directory".
            if not os.path.isdir(d):
                raise


if prefix:
    ensure_dir(prefix)

tokr = TreebankWordTokenizer()
word_tokenize = tokr.tokenize

# Load stopwords, one per line, from the local "stop-en" file.
stopwordlist = set()
with codecs.open("stop-en", "r", encoding='utf-8') as stopfile:
    for word in stopfile:
        stopwordlist.add(word.strip())
## tokeniser changes punctuation in unfortunate ways..
stopwordlist.add(u"``")
stopwordlist.add(u"''")
stopwordlist.add(u"link")

# Matches a dotted-quad at the end of a string (no start anchor).
# NOTE(review): this name shadows the stdlib `ipaddress` module — left
# unchanged because other code in this file may reference it.
ipaddress = re.compile(r"[0-9]+(\.[0-9]+){3}$")
def __init__(self, filename):
    """Record the corpus path and prepare per-instance tokenizing helpers.

    filename: path to the corpus file consumed elsewhere.
    Reads the module-level ``stemming`` flag; self.stemmer only exists
    when that flag is truthy.
    """
    self.filename = filename
    self.treebank_word_tokenizer = TreebankWordTokenizer()
    # Letters plus at most one internal apostrophe ("don't" passes,
    # "it''s" does not); leading-apostrophe forms like 'ere are rejected.
    word_pattern = "^[a-zA-Z]+'?[a-zA-Z]*?$"
    self.alpha_re = re.compile(word_pattern)
    if stemming:
        self.stemmer = PorterStemmer()
def rephrase(sentence, theme=None): new_version = [] # def reducer(memo, char): #TODO: should not split on apostrophes in "can't" or "eatin'" but should for "spiders'" # if (char.isalpha() or char == "'") == (memo[-1][-1].isalpha() or memo[-1][-1] == "'"): # memo[-1] = memo[-1] + char # return memo # else: # return memo + [char] # words = reduce( reducer , list(sentence.strip()), [' ']) words = [] my_sentence = sentence[0:] tokens = TreebankWordTokenizer().tokenize(sentence) for token in tokens: split_idx = my_sentence.index(token) if (len(my_sentence[0:split_idx]) > 0): words.append(my_sentence[0:split_idx]) words.append(my_sentence[split_idx:(split_idx + len(token))]) my_sentence = my_sentence[(split_idx + len(token)):] # words = [word for sublist in [[word, ' '] for word in TreebankWordTokenizer().tokenize(sentence)] for word in sublist ] # # TODO: put this in the API # bigrams_model_name = 'bigrams_model_nyt_sentences_5.5M_5.bin' # trigrams_model_name = "trigrams_model_nyt_sentences_5.5M_5.bin" # ngrams_models = { # "bigrams": bigrams_model_name, # "trigrams": trigrams_model_name # } # which_ngrams_model = "trigrams" # ngrams_model = Phrases.load(ngrams_models[which_ngrams_model]) # print("ngrammized", ngrams_model[words]) resp = requests.get("http://localhost:5000/phrases/" + ','.join(words)).json() if len(resp["grouped"]) > 0: phrases = resp["grouped"] else: phrases = words for word in phrases: if word == '': continue if word in stopwords or not word[0].isalpha(): new_version.append(word) else: # TODO: retain punctuation prefixes = [] suffixes = [] for thing, fix in [(list(word), prefixes), (list(reversed(word)), suffixes)]: for char in thing: if non_alpha_chars.match(char): fix.append(char) else: break # print(word) # removes non-alphabetic chars in the word. # I don't recall why this is in here. 
# removing it so "you're" stays "you're" not "youre" # word = non_alpha_chars.sub('', word) # print(word) if stemming: word_stem = stemmer.stem(word) word_morphology = word_diff(word, word_stem) random_weird_changes = word_diff(word_stem, word) print("%s = %s + %s (- %s)" % (word, word_stem, word_morphology, random_weird_changes)) synonym_stem = get_synonym(word_stem, theme) if random_weird_changes: print("random_weird_changes: %s" % random_weird_changes) reversed_synonym_stem = list( synonym_stem[::-1] ) # [::-1] reverses a string because Python is intuitive for subtraction in reversed(random_weird_changes): if reversed_synonym_stem[0] == subtraction: print("removed %s" % reversed_synonym_stem[0]) reversed_synonym_stem = reversed_synonym_stem[1:] else: break synonym_stem = reversed_synonym_stem[:: -1] # [::-1] reverses a string because Python is intuitive misspelled_synonym = ''.join(synonym_stem) + "".join( word_morphology) synonym = spell_check.correct(misspelled_synonym) else: synonym = get_synonym(word, theme) print("new version" + str(new_version)) new_version.append(''.join(prefixes) + synonym + ''.join(reversed(suffixes))) print("new version" + str(new_version)) return titlecase(''.join(new_version).strip().replace(" 's", "'s"))