Example #1
from nltk.tokenize import RegexpTokenizer


def get_list(text):  # Tokenize the string and lowercase every token
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')  # words, dollar amounts, or any other non-space run
    tokens = tokenizer.tokenize(str(text))
    return [token.lower() for token in tokens]
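A quick sanity check of the pattern above (hypothetical input): dollar amounts survive as single tokens, while trailing punctuation becomes its own token.

print(get_list("Prices rose to $12.50 today!"))
# expected: ['prices', 'rose', 'to', '$12.50', 'today', '!']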
Example #2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


def filter_words(text_file):
    text = read_text(text_file)  # read_text: helper defined elsewhere in the project
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    pos_list = nltk.pos_tag(tokens)  # list of (word, POS tag) tuples
    filtered_words = [
        w for w in pos_list if w[0] not in stopwords.words('english')
    ]
    return filtered_words
Example #3
    def tokenization(self):
        """ Tokenize the contents of the posts and remove the strings that contains punctuations,
            numbers or only single letter

        :return: a dataframe which each row becomes list of tokens
        """

        tqdm.pandas()
        tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
        tokens_df = self.text_df.progress_apply(
            lambda x: tokenizer.tokenize(x.lower()))
        return tokens_df
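The same pattern can be illustrated outside the class on a throwaway pandas Series (hypothetical data; plain .apply is used instead of progress_apply so tqdm is not required): single letters, digits and punctuation are dropped.

import pandas as pd
from nltk.tokenize import RegexpTokenizer

text_df = pd.Series(["I have 2 cats!", "A well-known fact."])
tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
print(text_df.apply(lambda x: tokenizer.tokenize(x.lower())).tolist())
# expected: [['have', 'cats'], ['well', 'known', 'fact']]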
Example #4
import pickle

from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


def inference(model_dir, text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    tokens = tokenizer.tokenize(text.lower())
    text = ' '.join(tokens)

    # load the fitted tf-idf vectorizer
    with open(model_dir + "tfidf_transformer.pkl", 'rb') as f:
        transformer = pickle.load(f)

    # create a new TfidfVectorizer with the old vocabulary
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 2),
                                 lowercase=True,
                                 vocabulary=transformer.vocabulary_)

    vec = vectorizer.fit_transform([text])

    # load the trained classifier from file
    with open(model_dir + "bigram_SVM.dat", 'rb') as f:
        model = pickle.load(f)
    y_pred = model.predict(vec)
    return y_pred[0]  # 1: STRESS
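A hedged usage sketch: the directory name and input text below are placeholders, and model_dir must already contain the pickled tfidf_transformer.pkl and bigram_SVM.dat files referenced above.

label = inference("models/", "I can't sleep and everything feels overwhelming")
print("stress" if label == 1 else "no stress")  # per the comment above, 1 means STRESS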
Example #5
lines = file.readlines()

# skip the front matter before the "PROLOGUE" heading
for index, line in enumerate(lines):
  if "PROLOGUE" in line:
    lines = lines[index+1:]

sentences = []
for line in lines:
  if len(line) > 1:
    for sentence in sent_tokenize(line):
      sentences.append(sentence)

#print(sentences)

stop_words = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')
filtered_sentences = []

for sentence in sentences:
  sent = []
  for word in word_tokenize(" ".join(tokenizer.tokenize(str(sentence)))):
    if word not in stop_words:
      sent.append(word.lower())
  filtered_sentences.append(sent)  # keep token lists: Word2Vec expects lists of words, not joined strings

#print(filtered_sentences[:100])

model = gensim.models.Word2Vec(filtered_sentences)
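Once trained on the token lists, the model can be queried for nearest neighbours; a minimal sketch assuming gensim 4.x and that the chosen word actually occurs often enough in the corpus (words below Word2Vec's min_count, 5 by default, get no vector).

if 'king' in model.wv:
  print(model.wv.most_similar('king', topn=5))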
Example #6
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer

stop_words = stopwords.words("english")
tokenizer = RegexpTokenizer(r"\w+")


def semantic_info(word, lemma, context):
    #context = set(w for w in context
    #              if w != word
    #              and w != lemma)

    context.discard(word)
    context.discard(lemma)

    return simplified_lesk(lemma, context)


def synsets_for(word):
    return wn.synsets(word)


def remove_stopwords(sentence):
    return set(w for w in sentence if w not in stop_words)


def simplified_lesk(lemma, context):
    synsets = synsets_for(lemma)

    best_sense = synsets[0]
Example #7
f = open('data.txt')
documents = f.readlines()
f.close()

# stoplist = set('for a of the and to in'.split())
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in documents]

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from string import punctuation

tokenizer = RegexpTokenizer(r'\w+')
custom_set = set(stopwords.words('english') + list(punctuation))

texts = []
for sentence in documents:
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if w not in custom_set]
    texts.append(filtered_words)

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
from gensim import corpora
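The snippet stops right after these imports; a minimal sketch of the usual next step (building an id-to-token Dictionary and a bag-of-words corpus), under the assumption that this is where the code was heading:

dictionary = corpora.Dictionary(texts)                  # map each token to an integer id
corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words vectors per document
pprint(corpus[:2])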
Example #8
'''
Created on 06/05/2013

@author: Rodrigo
'''

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

english_stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\s+', gaps=True)  # split on whitespace instead of matching tokens
print([w for w in tokenizer.tokenize("This is not a common book") if w not in english_stops])
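For comparison, gaps=True makes the pattern act as a delimiter rather than as a token matcher; a quick check with the corrected import above (outputs shown as comments):

splitter = RegexpTokenizer(r'\s+', gaps=True)  # pattern marks the gaps between tokens
matcher = RegexpTokenizer(r'\w+')              # pattern marks the tokens themselves
print(splitter.tokenize("This is not a common book"))  # ['This', 'is', 'not', 'a', 'common', 'book']
print(matcher.tokenize("Don't stop!"))                 # ['Don', 't', 'stop']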
Example #9
from docopt import docopt

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    pattern = r'''(?ix)    # set flag to allow verbose regexps
          (?:sr\.|sra\.)      # Spanish honorifics, e.g. sr., sra.
        | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
    '''
    tokenizer = RegexpTokenizer(pattern)

    root = '.'
    corpus = PlaintextCorpusReader(root, r'books\.txt', word_tokenizer=tokenizer)

    sents = corpus.sents()

    # train the model
    n = int(opts['-n'])

    if opts['-m'] == 'addone':
        model = AddOneNGram(n, sents)
    elif opts['-m'] == 'inter':
        gamma = opts['-g']
        if gamma is None:
            model = InterpolatedNGram(n, sents, None, False)