def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:  # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:  # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(
                lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:  # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:  # simply tokenize based on whitespaces
        token_list = tokenizer.tokenize(row['text'])
    return token_list
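The snippet above leaves its imports and the nltk_to_wordnet tag map implicit; a minimal sketch of what it assumes (the exact mapping in the original project may differ) is:

import re
from nltk.tag import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import wordnet

# Map the first letter of a Penn Treebank tag to the WordNet POS constant
# expected by WordNetLemmatizer.lemmatize(word, pos).
nltk_to_wordnet = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}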
Example #2
    def _calculate_sentence_title_score(self, sentence):
        """Calculates a score based on how many words the sentence shares with the article title."""
        title = self._remove_punctuation(self.title)
        sentence = self._remove_punctuation(sentence)
        tokenizer = WhitespaceTokenizer()
        tokenized_title = tokenizer.tokenize(title)
        tokenized_sentence = tokenizer.tokenize(sentence)
        
        common_words = set()
        for word in tokenized_sentence:
            if word in tokenized_title:
                common_words.add(word)

        score = float(len(common_words)) / len(tokenized_sentence)
        return SENTENCE_SCORE_WEIGHTS['title'] * score
Example #3
def es_tokenize(sentence,cxt=False):

    tokenizer=WhitespaceTokenizer()
    token_sentence = []
    if cxt:
        for i in sentence:
            tmp = []
            for k in i:
                tmp.append(tokenizer.tokenize(k))
            token_sentence.append(tmp)

    else:
        for i in sentence:
            token_sentence.append(tokenizer.tokenize(i))

    return token_sentence
Example #4
def main(tweet):
    #tweet = input("enter tweet here: ")
    tk = WhitespaceTokenizer()
    words = tk.tokenize(tweet)
    words_with_pos = pos_tag(words)
    queries = formQueries(words_with_pos)
    return scrapeWebForEachQuery(queries)
Example #5
def read_session(lines):
    """
    it takes a path to a transcription file and returns a dictionary that maps conversation id to a list of words.
        :param lines: <class '_io.TextIOWrapper'>

    remember:
    *v: non-Dutch words,  *n: new non-existing words, *s: street  words,
    *a: incomplete words, *u: distorted words, *x: unclear word,
    xxx: unclear utterances, vvv: non-Dutch sentences, ggg: sounds made by the speaker
    """
    lines_to_words = lines.read()
    lines_to_words = re.sub(r'[0-9]*\.[0-9]*\t', '',
                            lines_to_words)  # to remove timestamps
    lines_to_words = re.sub(
        r'[A-Za-z]*\*[anuxANUX]{1}', '',
        lines_to_words)  # to remove words with *n, *a, *u, and *x
    lines_to_words = re.sub(r'[A-Za-z]*\*[etV]{1}', '',
                            lines_to_words)  # unknown notation
    lines_to_words = re.sub(r'[A-Za-z]*\*op', '', lines_to_words)  # a mistake?

    lines_to_words = lines_to_words.replace('start\tend\ttext\n', '').replace('.', '').replace('-', ' ')\
        .replace('?', '').replace('\n', ' ').replace('xxx', '').replace('ggg', '').replace('vvv', '')\
        .replace('*v', '').replace('*s', '')

    lines_to_words = re.sub(r'[A-Za-z]*\*', '',
                            lines_to_words)  # for words with missing notation

    # s = lines_to_words.translate({ord(c): None for c in string.punctuation if c != '*'})
    tk = WhitespaceTokenizer()
    words = tk.tokenize(lines_to_words)

    return words
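A small worked example for read_session, using io.StringIO in place of an opened transcription file:

import io

example = io.StringIO("start\tend\ttext\n0.00\t1.25\tdit is xxx een huis*a .\n")
print(read_session(example))
# -> ['dit', 'is', 'een']  (timestamps, the header line, the unclear 'xxx'
#    and the incomplete '*a' word are all stripped)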
Example #6
def clean_text(text):
    """ Removes punctuation, capitalizations, numbers, stop words, and stems words"""
    ps = PorterStemmer()

    stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = contractions.expandContractions(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r'\W', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    text = re.sub(
        r'(.)\1\1+', r'\1\1',
        text)  # letters repeated 3 or more times in a row are repeated twice
    text = re.sub(r'(ha)\1\1+', r'haha', text)
    text = re.sub(r'(lo)\1\1+', r'lol', text)
    text = text.strip(' ')

    # stem words
    tokenizer = WhitespaceTokenizer()
    tokenized_comment = tokenizer.tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if not w in stop_words]
    stemmed_comment = [ps.stem(word) for word in filtered_sentence]
    text = " ".join(stemmed_comment)
    return text
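clean_text relies on a project-local contractions helper exposing expandContractions(text), plus a few standard imports; a minimal sketch of what a call assumes:

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer
import contractions  # assumed project-local module with expandContractions(text)

print(clean_text("I can't believe it's sooooo good, hahahaha!"))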
Example #7
class PreProcess:
    def __init__(self, corpus_content):
        self.wst = WhitespaceTokenizer()
        self.all_tokens = []
        self.unique_token = []
        self.bigrams_collection = []
        self.bigrams_dict = defaultdict(list)
        self.trigrams_collection = []
        self.trigrams_dict = defaultdict(list)
        self.tokenize(corpus_content)
        self.trigrams_collection_generator()
        self.markov_form(self.bigrams_collection, self.bigrams_dict)
        self.markov_form(self.trigrams_collection, self.trigrams_dict)

    def tokenize(self, corpus_content):
        self.all_tokens = self.wst.tokenize(corpus_content)
        self.unique_token = list(set(self.all_tokens))
        self.bigrams_collection = list(bigrams(self.all_tokens))

    def markov_form(self, collection, dictionary):
        for key, value in collection:
            dictionary[key].append(value)
        for key, value in dictionary.items():
            dictionary[key] = Counter(value)

    def trigrams_collection_generator(self):
        for i in range(len(self.all_tokens) - 2):
            self.trigrams_collection.append(
                (" ".join([self.all_tokens[i],
                           self.all_tokens[i + 1]]), self.all_tokens[i + 2]))
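A short usage sketch for the PreProcess class above (it additionally assumes `from collections import defaultdict, Counter`, `from nltk import bigrams` and the WhitespaceTokenizer import):

pp = PreProcess("the cat sat on the mat the cat ran")
# bigrams_dict maps each token to a Counter of its successors:
#   pp.bigrams_dict["the"]      -> Counter({'cat': 2, 'mat': 1})
# trigrams_dict maps a two-token prefix to a Counter of following tokens:
#   pp.trigrams_dict["the cat"] -> Counter({'sat': 1, 'ran': 1})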
Example #8
    def whitespace_tokenizer(self, review):
        tokenizer = WhitespaceTokenizer()
        if self.features in [1, 2]:
            tokens = [
                process_word(word.lower())
                for word in tokenizer.tokenize(self.data[review]['Content'])
            ]
        else:
            tokens = [
                word.lower()
                for word in tokenizer.tokenize(self.data[review]['Content'])
            ]

        tags = nltk.pos_tag(tokens)

        return tokens, tags
Example #9
def withoutverbs(text):
    tokenizer = WhitespaceTokenizer()
    sent = nltk.pos_tag(tokenizer.tokenize(text))
    return [
        x for (x, y) in sent
        if y not in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'RB')
    ]
Example #10
def lemmatize_text(text):
    # lemmatize words
    w_tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w)+' ' for w in w_tokenizer.tokenize(text.lower())]
    text = ''.join(words)

    return text
Example #11
    def parse(self, corpus_filename, key):
        assert type(corpus_filename) == str, "the filename must be a string"
        assert type(key) == str, "the key must be a string"

        wst = WhitespaceTokenizer()
        with codecs.open(corpus_filename, encoding="utf8") as input:
            corpus = [wst.tokenize(l) for l in input]
        return {key: corpus}
Example #12
def es_tokenize(sentence):

    tokenizer = WhitespaceTokenizer()
    token_sentence = []

    for i in sentence:
        token_sentence.append(tokenizer.tokenize(i))

    return token_sentence
Example #13
def lemmatize_text(text):
    """
    :param text: string
    :return: bag of words array
    """
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    return ' '.join(map(str, [lemmatizer.lemmatize(word)
                              for word in w_tokenizer.tokenize(text)]))
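Because no POS tag is passed, WordNetLemmatizer defaults to noun lemmatization, so only noun inflections are reduced; for example:

print(lemmatize_text("the cats are running"))
# -> "the cat are running"  (verbs such as 'running' are left untouched)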
Example #14
 def get_texts_raw(self):
     """
     Parse documents analogously to SimpleCorpus.get_texts(),
     but tokenized by whitespace only
     """
     wst = WhitespaceTokenizer()
     with self.getstream() as stream:
         for doc in stream:
             yield [word for word in wst.tokenize(utils.to_unicode(doc))]
Example #15
 def tokenizeDoc(self, doc):
     """
     Get the tokens (words) from the doc
     uses nltk.
     """
     #print ("Tokenizing doc")
     tokenizer = WhitespaceTokenizer()
     docTokens = tokenizer.tokenize(doc)
     return docTokens
Example #16
def stemmed_words(text):
    
    stemmer = SnowballStemmer('english')
    w_tokenizer = WhitespaceTokenizer()
    wrdslist = []
    for w in w_tokenizer.tokenize(text):
        lemwrd = stemmer.stem(w)
        wrdslist.append(lemwrd)
    return " ".join(wrdslist)
Example #18
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
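An illustrative call (the snippet needs BlanklineTokenizer, PunktSentenceTokenizer and WhitespaceTokenizer from nltk.tokenize); each element of the result is one subtitle, i.e. a list of up to two lines kept near the 38-character limit:

dialogue = ("Hello there, how are you doing on this fine day?\n\n"
            "I am doing fine, thank you very much for asking me.")
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)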
Example #19
def correct_text(text, cdict):
    w = WhitespaceTokenizer()
    p = WordPunctTokenizer()
    token = w.tokenize(text=text)
    for i, s in enumerate(token):
        split = p.tokenize(s)
        for j, e in enumerate(split):
            if e in set(cdict.keys()):
                split[j] = cdict[e]
        token[i] = "".join(split)
    return " ".join(token)
Example #21
 def split_and_tokenize_reviews(self):
     # split sentences and tokenize each sentence to a list
     reviews = self.clean_and_separate_reviews()
     train_sentences = []
     tokenizer = WhitespaceTokenizer()
     for review in reviews:
         sentences = re.split("[.?!]", str(review))
         for sentence in sentences:
             train_sentences.append(tokenizer.tokenize(sentence))
     print("train_sentences length %s" % len(train_sentences))
     return train_sentences
Example #22
def main(args):
    tokenizer = WhitespaceTokenizer()
    voc = set()

    dir = args.train_dir

    dir_pos = os.path.join(dir, 'pos')
    cnt = 0
    fmt = 'Processed %d positive docs'
    for fname in os.listdir(dir_pos):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print(fmt % cnt)

        f = open(os.path.join(dir_pos, fname), 'r')
        voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
        f.close()
    print(fmt % cnt)

    dir_neg = os.path.join(dir, 'neg')
    cnt = 0
    fmt = 'Processed %d negative docs'
    for fname in os.listdir(dir_neg):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print(fmt % cnt)

        f = open(os.path.join(dir_neg, fname), 'r')
        voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
        f.close()
    print(fmt % cnt)

    voc = sorted(list(voc))
    f = open(args.output, 'wb')
    pickle.dump(voc, f)
    f.close()
Example #23
def skip_grams(sequence_df,
               feature_size=100,
               window=4,
               min_activity_count=0,
               **kwargs):
    """Vectorizes sequences by blank space and returns skip gram features for 
    each activity_ID in sequences
    
    Parameters
    ----------
    sequence_df : dataframe
                  Pandas dataframe (from activities.create_corpus func) containing sequences of activity_ID
    feature_size : integer (default=100)
                   Number of dimensions or size of vector to produce for each activity
    window : integer (default=4)
             Size of context window for each activity
    min_activity_count : integer (default=0)
                         Minimum number of activity instances to be considered
    
    
    Returns
    -------
    dictionary of activity_IDs and corresponding features from word2vec skip grams model
    """
    assert len(
        sequence_df
    ) > 0 and 'seq_str' in sequence_df.columns, "sequence_df must contain a 'seq_str' column to tokenize."
    try:
        feature_size, window, min_activity_count = int(feature_size), int(
            window), int(min_activity_count)
    except TypeError:
        print("feature_size, window, and min_activity_count must be integers.")
    tokenizer = WhitespaceTokenizer()
    tokenized_corpus = [
        tokenizer.tokenize(sequence) for sequence in sequence_df['seq_str']
    ]

    # Train model on corpus using skip-gram method
    w2v_model = word2vec.Word2Vec(tokenized_corpus,
                                  size=feature_size,
                                  window=window,
                                  min_count=min_activity_count,
                                  sg=1,
                                  **kwargs)  #,sample=1e-5, iter=50)

    # Get unique list of activities
    vocab_activities = [k for k in w2v_model.wv.vocab.keys()]

    # Zip activity_ID and features from w2v model
    w2v_dict = dict(zip(vocab_activities, w2v_model.wv[vocab_activities]))

    return w2v_dict
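A usage sketch, assuming the gensim 3.x API the snippet targets (size= and wv.vocab) and a dataframe with a 'seq_str' column of space-separated activity IDs:

import pandas as pd

sequence_df = pd.DataFrame({"seq_str": ["A1 A2 A3 A2 A1", "A2 A3 A4 A2"]})
features = skip_grams(sequence_df, feature_size=16, window=2, min_activity_count=1)
print(features["A2"].shape)  # each activity_ID maps to a 16-dimensional vector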
Example #24
def lemmatize_str(string, wordnet):
    '''
    Lemmatize string using nltk WordNet

    Input: string
    Output: string
    '''
    if wordnet:
        w_tokenizer = WhitespaceTokenizer()
        lemmatizer = WordNetLemmatizer()
        lemmed = " ".join(
            [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(string)])
        return lemmed
Example #25
def remove_low_freq_noverbs(text_to_modify):
    lists = df['Body']
    words2 = []
    for wordList in lists:
        words2 += wordList
    fdist = FreqDist(words2)
    fdist_min = sorted(w for w in set(words2) if fdist[w] < 21)
    sw_low_noverbs = set()
    sw_low_noverbs.update(fdist_min)
    stopwords = sw_low_noverbs
    tokenizer = WhitespaceTokenizer()
    words = tokenizer.tokenize(text_to_modify)
    return [word for word in words if word not in stopwords]
Example #26
def stemming_documents(documents):
    whitespace_tokenizer = WhitespaceTokenizer()
    stemmer = PorterStemmer()
    stemmed_documents = []

    for document in documents:
        sentence = ' '.join([
            stemmer.stem(word.lower())
            for word in whitespace_tokenizer.tokenize(document)
        ])
        stemmed_documents.append(sentence)

    return np.array(stemmed_documents, dtype='object')
Example #27
def lemmatize_series(series, lematize=False, spacy=False):
    if isinstance(series, pd.Series):
        series = series.copy()
        w_tokenizer = WhitespaceTokenizer()
        lemmatizer = WordNetLemmatizer()
        tokenize_lematize_word_list = []
        for i in list(series):
            if lematize:
                if spacy == True:
                    doc = nlp(i)
                    tokenize_lematize_word_list.append(
                        [token.lemma_ for token in doc])
                else:
                    tokenize_lematize_word_list.append([
                        lemmatizer.lemmatize(w)
                        for w in w_tokenizer.tokenize(i)
                    ])
            else:
                tokenize_lematize_word_list.append(
                    [w for w in w_tokenizer.tokenize(i)])
        return pd.Series(tokenize_lematize_word_list)
    else:
        raise ValueError("Need pandas series as input")
Example #28
def unidas(filename):
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images":2}
    chromeOptions.add_experimental_option("prefs",prefs)
    chromeOptions.add_argument("--incognito")
    browser = webdriver.Chrome(chrome_options=chromeOptions)


    unidasURL = "https://www.seminovosunidas.com.br/veiculos"
    unidasSection = "/page:"

    tknzr = WhitespaceTokenizer()

    browser.get(unidasURL)
    page_soup = soup(browser.page_source, "html5lib")
    numberOfPages = str(page_soup.body.find("ul", {"class": "list-unstyled list-inline header-paginator pull-right"}).findAll("li")[4].find("a"))
    numberOfPages = numberOfPages[numberOfPages.find('">')+2:numberOfPages.find("</")]
    print(numberOfPages)

    filename = filename + ".csv"
    f = open(filename, "w")


    for i in range(1, int(numberOfPages)+1, 1):
        url = unidasURL + unidasSection + str(i)
        browser.get(url)
        page_soup = soup(browser.page_source, "html5lib")
        containers = page_soup.body.find("div", {"class": "container busca-resultados"}).find("div", {"class": "resultados"}).ul.findAll("li")
        for vehicle in containers:
            car = vehicle
            fabricante = car.find("article").find('div').find("div", {"class":"col-sm-6 col-md-9"}).find("span", {"class": "makeModel"}).findAll("span")[0].text
            modelo = car.find("article").find('div').find("div", {"class":"col-sm-6 col-md-9"}).find("span", {"class": "makeModel"}).findAll("span")[1].text
            ano = car.find("article").find('div').find("div", {"class":"col-sm-6 col-md-9"}).find("span", {"class": "description"}).text
            km = car.find("article").find('div').find("div", {"class":"col-sm-6 col-md-9"}).find("span", {"class": "details"}).text
            preco = car.find("article").find('div').find("div", {"class":"col-sm-6 col-md-9"}).find("span", {"class": "valor"}).text
            ano = tknzr.tokenize(ano)
            ano = ano[len(ano)-1].replace(")", "")
            km = km[km.find("Km: ") + 4:]
            km = km[:km.find(",")]
            preco = preco[:len(preco)-3]
            if fabricante == 'MERCEDES':
                fabricante = "MERCEDES-BENZ"
            print("Fabricante: " + fabricante)
            print("Modelo: " + modelo)
            print("Ano: " + ano)
            print("Quilometragem: " + km)
            print("Preço: " + preco)
            print("\n")
            f.write("Unidas," + fabricante + "," + modelo.replace(" ","").upper() + "," + ano + "," + km.replace(".", "") + "," + preco.replace(".", "") + "\n")
Example #29
def get_words(document):
    '''
    Return a list of unique words in document
    '''
    regex1 = re.compile(r'\W')           # match non-alphanumeric
    regex2 = re.compile(r'&(#)*(\w)*;')  # match html entities
    regex3 = re.compile(r'( ){2,}')      # match 2 or more spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer  = WhitespaceTokenizer()
    # lowercase document, remove punctuation, and html entities
    document   = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
             lemmatizer.lemmatize(word)
             for word in tokenizer.tokenize(document)
             if word not in STOPWORDS and len(word) > 2
            ]
    return FreqDist(words)
Example #30
def preprocess_article_content(text_df):

    print('preprocessing article text...')

    # text_df is data frame from SQL query, column 'content' contains text content from each article
    article_list = []

    # define punctuation to remove
    punc = set('''`~!@#$%^&*()-_=+\|]}[{;:'",<.>/?''')

    tokenizer = WhitespaceTokenizer()
    stop_words = set(stopwords.words('english'))
    #stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()

    kept_rows = []

    for row, article in enumerate(text_df['content']):

        cleaned_tokens = []

        tokens = tokenizer.tokenize(
            article.decode('unicode-escape', 'ignore').lower())

        for token in tokens:
            token = ''.join(ch for ch in token if ch not in punc)

            if token not in stop_words:

                if len(token) > 0 and len(token) < 20:

                    if not token[0].isdigit() and not token[-1].isdigit():
                        #stemmed_token = stemmer.stem(token)
                        lemmatized_tokens = lemmatizer.lemmatize(token)
                        #cleaned_tokens.append(stemmed_token)
                        cleaned_tokens.append(lemmatized_tokens)

        # join cleaned tokens into a string for subsequent LDA
        # filtering out content that is likely noise (error messages etc)
        if len(cleaned_tokens) > 100:
            article_list.append(' '.join(wd for wd in cleaned_tokens))
            kept_rows.append(row)

    print('preprocessed content for %d articles' % len(article_list))

    return article_list, kept_rows
Example #31
def CleanAndTokenize(text):
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    text = re.sub(r, " URLURLURL", text)
    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
        text = soup.get_text()
    # Normalize everything to lower case
    text = text.lower()
    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)
    # get rid of em dashes
    # table = {
    #     ord(u'\u2018') : u"'",
    #     ord(u'\u2019') : u"'",
    #     ord(u'\u201C') : u'"',
    #     ord(u'\u201d') : u'"',
    #     ord(u'\u2026') : u'',
    #     ord(u'\u2014') : u'',
    # }
    # text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation
    # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
Example #33
 def extract(self, corpus):
     from nltk.stem import WordNetLemmatizer
     from nltk.corpus import stopwords
     from nltk.tokenize import WhitespaceTokenizer
     exclude_words = stopwords.words('english')
     exclude_words.append('rt')
     exclude_words.append('&amp;')
     tok = WhitespaceTokenizer()
     lem = WordNetLemmatizer()
     tsents = [tok.tokenize(sent) for sent in corpus]
     norm_words = []
     for sent in tsents:
         for word in sent:
             if word.startswith('http://'): continue
             nword = lem.lemmatize(word.lower())
             if nword not in exclude_words:
                 norm_words.append(nword)
     return nltk.FreqDist(norm_words)
Example #34
def CleanAndTokenize(text):
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    text = re.sub(r, " URLURLURL", text)
    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
        text = soup.get_text()
    # Normalize everything to lower case
    text = text.lower()
    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)
    # get rid of em dashes
    # table = {
    #     ord(u'\u2018') : u"'",
    #     ord(u'\u2019') : u"'",
    #     ord(u'\u201C') : u'"',
    #     ord(u'\u201d') : u'"',
    #     ord(u'\u2026') : u'',
    #     ord(u'\u2014') : u'',
    # }
    # text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation
    # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
Example #35
def build_word_frequency(filepath: Path, language: str):
    """ Parse the passed in text file (likely from Open Subtitles) into
        a word frequency list and write it out to disk

        Args:
            filepath (Path):
            language (str):
        Returns:
            Counter: The word frequency as parsed from the file
        Note:
            This only removes words that are proper nouns (attempts to...) and
            anything that starts or stops with something that is not in the alphabet.
    """
    try:
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
        from nltk.tokenize.toktok import ToktokTokenizer
    except ImportError as ex:
        raise ImportError("To build a dictionary from scratch, NLTK is required!\n{}".format(ex))

    word_frequency = Counter()
    tok = WhitespaceTokenizer()

    with open(filepath, mode="r") as fobj:
        for line in tqdm.tqdm(fobj):
            # tokenize into parts
            parts = tok.tokenize(line)

            # Attempt to remove proper nouns
            # Remove things that have leading or trailing non-alphabetic characters.
            tagged_sent = pos_tag(parts)
            words = [
                word[0].lower() for word in tagged_sent
                if word[0]
                and not word[1] == "NNP"
                and word[0][0].isalpha()
                and word[0][-1].isalpha()
            ]

            if words:
                word_frequency.update(words)

    return word_frequency
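A hypothetical call on a plain-text subtitle dump (the file path below is only an example):

from pathlib import Path

freqs = build_word_frequency(Path("subtitles_sample.txt"), "en")
print(freqs.most_common(10))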
Example #36
def tokenize(sent, tokenizer_type):
	# tokenizer_type is [0] the tokenizer name and [1] the REGEX / custom separator or ''
	tokenizer = 'not_implemented'
	# splitting on a custom separator is the only non-nltk tokenizer
	if tokenizer_type[0] == 'split_on_custom':
		return sent.split(tokenizer_type[1])
	if tokenizer_type[0] == 'whitespace':
		tokenizer = WhitespaceTokenizer()
	if tokenizer_type[0] == 'wordpunkt':
		tokenizer = WordPunctTokenizer()
	if tokenizer_type[0] == 'regexp':
		tokenizer = RegexpTokenizer(tokenizer_type[1])
	if tokenizer_type[0] == 'treebank':
		tokenizer = TreebankWordTokenizer()
	try:
		if tokenizer != "not_implemented":
			return tokenizer.tokenize(sent)
		else:
			return 'Tokenizer not implemented'
	except ValueError:  # if the input is not a string
		pass
Example #37
def instanciate_dict(message):
    tk = WhitespaceTokenizer()
    tokens = tk.tokenize(message)

    all_counts = dict()
    sorted_dict = dict()

    for size in 1, 2, 3:
        all_counts[size] = FreqDist(ngrams(tokens, size))

    for index in range(1, 4):
        all_counts[index] = {
            k: v
            for k, v in all_counts[index].items() if v >= 2
        }
        sorted_dict[index] = dict(
            sorted(all_counts[index].items(),
                   key=operator.itemgetter(1),
                   reverse=True))

    return sorted_dict
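An illustrative call to instanciate_dict (assuming FreqDist and ngrams from nltk, the operator module and the WhitespaceTokenizer import); only n-grams seen at least twice are kept, sorted by frequency:

result = instanciate_dict("to be or not to be that is the question to be")
# result[1] -> {('to',): 3, ('be',): 3}
# result[2] -> {('to', 'be'): 3}
# result[3] -> {}   (no trigram repeats in this message)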
Example #38
    def buildVocab(self):
        self.vocabSize = int(self.vocabSize)
        print ("Building vocab from frequencies")
        # get tokenized corpus and get word counts
        self.tokenizedCorpus = []
        self.vocabSet = set()
        
        tokenizer = WhitespaceTokenizer()
        
        for doc in self.corpus:
            # tokenize doc
            docTokens = tokenizer.tokenize(doc)
            self.tokenizedCorpus.extend(docTokens)
        print ("  Tokenized corpus = ", len(self.tokenizedCorpus))

        # vocab for entire corpus
        self.fullVocab = set(self.tokenizedCorpus)
        print ("  Full vocab = ", len(self.fullVocab))
        
        self.vocabCounts = {}        
        # Extremely inefficient since has to iterate entire corpus for each word
        # generate counts for each word
        #for w in self.fullVocab:
        #    self.vocabCounts[w] = self.tokenizedCorpus.count(w)
        
        # for each word in corpus
        for w in self.tokenizedCorpus:
            if w in self.vocabCounts:
                self.vocabCounts[w] += 1
            else:
                self.vocabCounts[w] = 1


        # sort counts with most frequent first
        sortedCounts = sorted(self.vocabCounts.items(), key=operator.itemgetter(1), reverse=True)
        
        # generate vocab from first vocabSize words
        vocabCounts = sortedCounts[0:self.vocabSize]
        self.vocab = [e[0] for e in vocabCounts]
        print ("  vocab = ", self.vocab)
Example #39
def bag_of_words(voc, doc, handle_negation=False, handle_bigrams=False):
    """
    Generate bag of words according to dictionary.
    No sanity check is done on the dictionary.
    Please make sure each word in the dictionary is unique and sorted.
    :param voc: list of words
    :param doc: string
    :return: list of feature vector.
        0 as not appearing.
        1 as appearing positive.
        -1 as appearing negative.
        Has the same size of dictionary.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(doc)
    fv = np.zeros_like(voc, np.int8)
    is_previous_negative = False
    is_previous_enhanced = False

    for token in tokens:
        word = token.lower()
        if is_skip_word(word):
            continue
        if is_negative(word):
            is_previous_negative = True
            continue
        if is_degree(word):
            is_previous_enhanced = True
            continue

        try:
            idx = voc.index(word)
            fv[idx] = 1
            fv[idx] *= -1 if handle_negation and is_previous_negative else 1
            fv[idx] *= 2 if handle_bigrams and is_previous_enhanced else 1
        except ValueError:
            pass
        is_previous_negative = False
        is_previous_enhanced = False

    return fv
Example #40
    def tokenize(self):
        '''
        tokenize, filter numbers and remove links and save in self.to_write
        :return:
        '''
        print("Tokenizing")
        tokenizer2 = RegexpTokenizer(r'\w+')
        tokenizer1 = WhitespaceTokenizer()
        tokens = []
        for i in range(len(self.texts)):
            raw = self.texts[i].lower()
            # white space tokenize
            token = tokenizer1.tokenize(raw)
            # extending contractions
            for i in range(0, len(token)):
                if token[i] in contractions.keys():
                    token[i] = contractions[str(token[i])]
                # removing links
                if (re.search('http', token[i])):
                    token[i] = ''

            raw = " ".join(token)
            # regex tokenizing
            token = tokenizer2.tokenize(raw)
            for i in range(0, len(token)):
                if token[i].isalnum() == False:
                    token[i] = ''
                if (token[i] not in self.unique_words):
                    self.vocab_size += 1
                    self.unique_words.append(token[i])
                self.words.append(token[i])

            tokens.append(token)
            raw = " ".join(token)
            self.to_write.append(raw)
        return tokens
Example #41
def CleanAndTokenize(text):
	
	# Strip URLs and replace with token "URLURLURL"
	r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
	text = re.sub(r, " URLURLURL", text)

	# Strip html tags
	soup = BeautifulSoup(text, "html.parser")
	for tag in soup.findAll(True):
		tag.replaceWithChildren()
		text = soup.get_text()

	# Normalize everything to lower case
	text = text.lower()
	
	# Strip line breaks and endings \r \n
	r = re.compile(r"[\r\n]+")
	text = re.sub(r, "", text)

	table = {
		ord(u'\u2018') : u"'",
		ord(u'\u2019') : u"'",
		ord(u'\u201C') : u'"',
		ord(u'\u201d') : u'"',
		ord(u'\u2026') : u'',
		ord(u'\u2014') : u'', # get rid of em dashes
	}
	text = text.translate(table)

	# Normalize contractions
	# e.g. can't => can not, it's => it is, he'll => he will
	text = text.replace("can't", "can not")
	text = text.replace("couldn't", "could not")
	text = text.replace("don't", "do not")
	text = text.replace("didn't", "did not")
	text = text.replace("doesn't", "does not")
	text = text.replace("shouldn't", "should not")
	text = text.replace("haven't", "have not")
	text = text.replace("aren't", "are not")
	text = text.replace("weren't", "were not")
	text = text.replace("wouldn't", "would not")
	text = text.replace("hasn't", "has not")
	text = text.replace("hadn't", "had not")
	text = text.replace("won't", "will not")
	text = text.replace("wasn't", "was not")
	text = text.replace("can't", "can not")
	text = text.replace("isn't", "is not")
	text = text.replace("ain't", "is not")    
	text = text.replace("it's", "it is")
	
	text = text.replace("i'm", "i am")
	text = text.replace("i'm", "i am")
	text = text.replace("i've", "i have")
	text = text.replace("i'll", "i will")
	text = text.replace("i'd", "i would")
 
	text = text.replace("we've", "we have")
	text = text.replace("we'll", "we will")
	text = text.replace("we'd", "we would")
	text = text.replace("we're", "we are")
	
	text = text.replace("you've", "you have")
	text = text.replace("you'll", "you will")
	text = text.replace("you'd", "you would")
	text = text.replace("you're", "you are")
	
	text = text.replace("he'll", "he will")
	text = text.replace("he'd", "he would")
	text = text.replace("he's", "he has")
	
	text = text.replace("she'll", "she will")
	text = text.replace("she'd", "she would")
	text = text.replace("she's", "she has")
	
	text = text.replace("they've", "they have")
	text = text.replace("they'll", "they will")
	text = text.replace("they'd", "they would")
	text = text.replace("they're", "they are")
	
	text = text.replace("that'll", "that will")
	text = text.replace("that's", "that is")
	text = text.replace("there's", "there is")

	
	# Strip punctuation (except for a few)
	punctuations = string.punctuation # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
	excluded_punctuations = ["$", "%"]
	for p in punctuations:
		if p not in excluded_punctuations:
			text = text.replace(p, " ")

	# Condense double spaces
	text = text.replace("  ", " ")

	# Tokenize the text 
	# NOTE: Using a simple tokenizer based on spaces ... 
	# Could also try a more sophisticated tokenizer if abbreviations / contractions should be conserved
	tokenizer = WhitespaceTokenizer()
	text_tokens = tokenizer.tokenize(text)
	
	return text_tokens
Example #42
for f in listdir('corpus/'):
	if f[-4:] == ".txt" and not f in skipOver:
		fileName = f

		F = open('corpus/'+f)
		text = F.read()
		F.close()

		alphanum = letters+octdigits

		paragraphs = [s for s in text.split("\n\n") if s != ""][:-1]
		numParagraphs = len(paragraphs)

		# average paragraph size
		wst = WhitespaceTokenizer()
		paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]

		# the approximate number of words in the document
		numWords = sum(paraWordCounts)

		# the average number of words per paragraph
		avgParagraphLen = mean(paraWordCounts)

		# rejoin the paragraphs
		text = ' '.join(paragraphs)

		# part-of-speech word list for the text
		text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]

		# remove symbols from the list by checking the first character of each word
		text = [word for word in text if word[0][0] in alphanum]
Example #43
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()

def read(file_name):
	try:
		f_in = '%s.txt' % file_name
		file_in = open(f_in, 'r')
		f_out = '%s.csv' % file_name
		file_out = open(f_out, 'w')
	except Exception as e:
		raise e

	data = ', '.join( [TOKENIZER.tokenize(line)[1] for line in file_in] )

	try:
		file_out.write(data)
	except Exception as e:
		raise e

#read()

if __name__ == "__main__":
    # Command line arguments
    import argparse
    parser = argparse.ArgumentParser(
        description='Converts a two-column space-separated file into a csv containing the second column'
    )
    parser.add_argument('file', help='The file to convert')
    args = parser.parse_args()
    read(args.file)
Example #44
def tokenize_text(text):
    whitespace_tokenizer = WhitespaceTokenizer()

    return whitespace_tokenizer.tokenize(text)
Example #45
class Prototype:
    """Prototype system that searches for RDF pattern (aka Q-Calculus pattern) to find textsnippets."""

    def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0):
        """Initialize a prototype with a specified configurations.

        Parameters:
        mongo_db -- Mongo DB connection
        postgre_db -- PostGre DB connection
        sentence_mode -- whether or not to use sentence window mode (default True)
        window_size -- the size of the sentence or word window (default 0)
        """
        self.__mongo_db = mongo_db
        self.__postgre_db = postgre_db
        self.__sentence_mode = sentence_mode
        self.___punctuation_mode = punctuation_mode
        self.__window_size = window_size
        self.tokenizer = WhitespaceTokenizer()
        self.parser = Parser()

    def exit(self):
        """Close down the prototype."""
        self.__mongo_db.close_connection()
        self.__postgre_db.close_connection()

    def create_new_collection(self, schema_name):
        self.__postgre_db.create_schema(schema_name)

    def get_window_size(self):
        """Gets the current window size."""
        return self.__window_size

    def get_sentence_mode(self):
        """Returns True if sentence window mode is activated, else False."""
        return self.__sentence_mode

    def change_window_size(self, size):
        """Change the current window size to a new size."""
        value = 0
        try:
            value = int(size)
        except ValueError:
            raise ValueError("Please type in a valid number.")

        if value >= 0:
            self.__window_size = value
        else:
            raise ValueError("Please type in a valid positive number.")

    def activate_sentence_window_mode(self):
        """Activate sentence window mode."""
        self.__sentence_mode = True

    def activate_word_window_mode(self):
        """De-activate sentence window mode."""
        self.__sentence_mode = False

    def activate_punctuation_mode(self):
        self.___punctuation_mode = True

    def deactivate_punctuation_mode(self):
        self.___punctuation_mode = False

    def get_punctuation_mode(self):
        return self.___punctuation_mode

    def get_word_window(self, pattern, tokens, constraints):
        """Get a word window list with a specific number of words.

        Parameters:
        pattern -- the pattern to search for
        tokens -- the tokens to search in
        constraints -- a constraint tuple list
        """
        split_pattern = pattern.split()
        if len(split_pattern) > 1:
            textsnippets = self.__get_word_window_more_words_help(split_pattern, tokens, constraints)
        else:
            textsnippets = self.__get_word_window_one_word_help(pattern, tokens, constraints)
        return textsnippets

    def __get_word_window_more_words_help(self, split_pattern, tokens, constraints):
        """Find pattern with more than one word.
        """
        textsnippets = []
        textlength = len(tokens)
        for ind, token in enumerate(tokens):
            p_index = 0
            end_index = ind
            while p_index < len(split_pattern):
                if self.check_pattern(split_pattern[p_index], tokens[end_index]):
                    p_index += 1
                    end_index += 1
                else:
                    break
            if p_index == len(split_pattern):
                if constraints is not None:
                    self.__check_constraints(constraints, (ind, end_index - 1), ind, split_pattern, None, None, textsnippets, tokens)
                else:
                    pattern = " ".join(item for item in split_pattern)
                    self.__get_word_window_help((ind, end_index - 1), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_one_word_help(self, pattern, tokens, constraints):
        """Find pattern with only one word."""
        textsnippets = []
        textlength = len(tokens)
        for ind, token in enumerate(tokens):
            if self.check_pattern(pattern, token):
                if constraints is not None:
                    self.__check_constraints(constraints, (ind, ind), ind, pattern, None, None, textsnippets, tokens)
                else:
                    self.__get_word_window_help((ind, ind), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_help(self, token_pos, textsnippets, textlength, tokens, pattern):
        snippet = self.__get_textsnippets(token_pos[0], token_pos[1], textlength, tokens)
        offset_start = re.search(pattern, snippet).span()[0]
        offset_end = offset_start + (len(pattern) - 1)
        SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end'])
        textsnippets.append(SentObj(snippet=snippet, offset_start=offset_start, offset_end=offset_end))

    def __get_textsnippets(self, indl, indr, textlength, tokens):
        if (indl - self.__window_size < 0) and (indr + self.__window_size > textlength):
            left_index = self.__window_size - 1
            while not (indl - left_index) == 0:
                left_index -= 1
            right_index = self.__window_size - 1
            while not (indr + right_index) == textlength:
                right_index -= 1
            return " ".join(tokens[indl - left_index:indr + right_index])

        elif indr + self.__window_size > textlength:
            right_index = self.__window_size - 1
            while not (indr + right_index) == textlength:
                right_index -= 1
            return " ".join(tokens[indl - self.__window_size:indr + right_index])

        elif indl - self.__window_size < 0:
            left_index = self.__window_size - 1
            while not (indl - left_index) == 0:
                left_index -= 1
            return " ".join(tokens[indl - left_index:indr + self.__window_size + 1])
        else:
            return " ".join(tokens[indl - self.__window_size:indr + (self.__window_size + 1)])

    def get_sentence_window(self, pattern, sentences, constraints):
        """Get a list with a specific number of sentences. size 0 will return the
        current sentence the pattern is found in. size n will return n sentences left and right
        from the initial sentence.

        Parameters:
        pattern -- the pattern to search for
        sentences -- the sentences to search in
        constraints -- the constraint tuple list
        """
        split_pattern = pattern.split()

        if len(split_pattern) > 1:
            textsnippets = self.__get_sentence_window_more_words(split_pattern, sentences, constraints)
        else:
            textsnippets = self.__get_sentence_window_one_word(pattern, sentences, constraints)
        return textsnippets

    def __get_sentence_window_one_word(self, pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of only one words according to window size."""
        textsnippets = []
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            for i, token in enumerate(tokens):
                if self.check_pattern(pattern, token):
                    if constraints is not None:
                        self.__check_constraints(constraints, (i, i), ind, pattern, sent, sentences, textsnippets, tokens)
                    else:
                        self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __check_constraints(self, constraints, token_pos, sent_num, pattern, sent, sentences, textsnippets, tokens):
        """Traverse the given list of constraints and find target words near the keyword. The number of word distance
        is given in the constraint list.
        add_info[0] is the keyword aka pattern.
        add_info[1] is the target_word aka the constraint.
        add_info[2] is the word distance from constraint to the pattern."""
        pos = 0
        more_words_flag = False
        if token_pos[0] == token_pos[1]:
            pos = token_pos[0]
        else:
            more_words_flag = True

        for add_info in constraints:
            # find pattern that matches target word
            index = add_info[2]
            found_constraint_flag = True
            if more_words_flag:
                constraint = add_info[0].split()
                i = 0
                while found_constraint_flag and i < len(pattern) and i < len(constraint):
                    if self.check_pattern(pattern[i], constraint[i]):
                        pass
                    else:
                        found_constraint_flag = False
                        break
                    i += 1

            if found_constraint_flag or self.check_pattern(pattern, add_info[0]):
                # set token_pos depending if index is positive or negative
                if more_words_flag and index > 0:
                    pos = token_pos[1]
                elif more_words_flag and index < 0:
                    pos = token_pos[0]

                if self.__sentence_mode:
                    if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                        self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern)
                    else:
                        while index != 0:
                            if index > 0:
                                index -= 1
                            else:
                                index += 1
                            if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                                self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern)
                                break
                else:
                    if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                        self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern)
                    else:
                        while index != 0:
                            if index > 0:
                                index -= 1
                            else:
                                index += 1
                            if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]):
                                self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern)
                                break

    def __get_sentence_window_help(self, ind, sentences, textsnippets, pattern):
        sentence = self.__get_sentences(ind, sentences)
        # get offsets
        offset_start = re.search(pattern, sentence).span()[0]
        offset_end = offset_start + (len(pattern) - 1)
        SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end'])
        textsnippets.append(SentObj(snippet=sentence, offset_start=offset_start, offset_end=offset_end))

    def __get_sentence_window_more_words(self, split_pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of more than 2 words according to window size."""
        textsnippets = []
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            p_index = 0
            begin_index = 0
            end_index = 0
            while p_index < len(split_pattern):
                if (end_index < len(tokens)) and self.check_pattern(split_pattern[p_index], tokens[end_index]):
                    if p_index == 0:
                        begin_index = end_index
                    else:
                        begin_index = begin_index + end_index - end_index
                    p_index += 1
                    end_index += 1
                else:
                    break
            end_index -= 1
            if p_index == len(split_pattern):
                # search for constraints in sentence
                if constraints is not None:
                    self.__check_constraints(constraints, (begin_index, end_index), ind, split_pattern, sent, sentences,
                                             textsnippets, tokens)
                else:
                    pattern = " ".join(item for item in split_pattern)
                    self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __get_sentences(self, ind, sentences):
        if self.__window_size == 0:
            return sentences[ind]

        elif self.__window_size > 0:
            left_window_border = ind - self.__window_size
            right_window_border = ind + self.__window_size + 1
            if left_window_border < 0:
                left_window_border = 0
            if right_window_border >= len(sentences):
                right_window_border = len(sentences)
            return " ".join(sentences[left_window_border:right_window_border])

    def find_text_window(self, schema, text, text_id, constraints=None):
        """Finds text windows with variable size and pushes the found results in the PostGre database.

        Parameters:
        text -- text to search in
        text_id -- id of the text
        constraints -- the constraint tuple list"""

        # this is only a quick and dirty fix: replace weird quotes to basic ones
        for ch in ['›', '‹', '»', '«']:
            if ch in text:
                text = text.replace(ch, '"')

        tokenized_text = self.tokenizer.tokenize(text)
        if self.___punctuation_mode:
            punctuation_text = re.split('[!?.,;:]', text)
            punctuation_text = [item for item in punctuation_text if item != '']
        for pattern in self.__postgre_db.get_data_from_table(schema, "single_pattern"):
            if self.___punctuation_mode and self.__sentence_mode:
                windows_objects = self.get_sentence_window(
                    pattern['single_pattern'], punctuation_text, constraints)
            elif self.__sentence_mode:
                windows_objects = self.get_sentence_window(
                    pattern['single_pattern'], sent_tokenize(text, language='german'), constraints)
            else:
                windows_objects = self.get_word_window(pattern['single_pattern'], tokenized_text, constraints)

            # push found snippets onto database
            if len(windows_objects) > 0:
                single_pattern_id = pattern['id']
                for sent_obj in windows_objects:
                    # push snippets
                    self.__push_snippets(schema, sent_obj.snippet)
                    snippet_id = self.__postgre_db.get_id(schema, "snippets", "snippet=" + add_quotes(
                        replace_special_characters(sent_obj.snippet)))
                    # push relations
                    self.__push_texts_snippets(schema, text_id, snippet_id)
                    self.__push_snippet_offsets(schema,
                        single_pattern_id, snippet_id, sent_obj.offset_start, sent_obj.offset_end)

    def __push_snippets(self, schema, snippet):
        """Push found snippets onto the snippets table in PostGre DB, if not already in the table.
        Afterwards push the single_pattern and snippets relation."""
        if not self.__postgre_db.is_in_table(schema, "snippets", "snippet=" + add_quotes(
                replace_special_characters(snippet))):
            self.__postgre_db.insert(schema, "snippets", {"snippet": snippet})

    def __push_texts_snippets(self, schema, text_id, snippet_id):
        """Get all saved snippets that occur in a text and push them onto PostGre DB."""
        self.__push_relation(schema, text_id, snippet_id, "text_id", "snippet_id", "texts_snippets")

    def __push_snippet_offsets(self, schema, single_pattern_id, snippet_id, offset_start, offset_end):
        """Push found single_pattern in snippets and their respective offset."""
        if not self.__postgre_db.is_in_table(
                schema, "snippet_offsets", "single_pattern_id=" + str(single_pattern_id) + " and snippet_id=" + str(
                    snippet_id)):
            self.__postgre_db.insert(schema, "snippet_offsets", {
                "single_pattern_id": single_pattern_id, "snippet_id": snippet_id, "offsets": [
                    [offset_start, offset_end]]})
        else:
            old_list = self.__postgre_db.get(schema, "snippet_offsets", "single_pattern_id=" + str(
                single_pattern_id) + " and snippet_id=" + str(snippet_id), "offsets")
            old_list.append([offset_start, offset_end])
            pid = self.__postgre_db.get_id(schema, "snippet_offsets", "single_pattern_id=" + str(
                single_pattern_id) + " and snippet_id=" + str(snippet_id))
            self.__postgre_db.update(schema, "snippet_offsets", "offsets=" + add_quotes(replace_brackets(str(
                old_list))), "id=" + str(pid))

    def __push_relation(self, schema, id1, id2, id1_name, id2_name, table):
        """Push a relation onto the PostGre DB. The relation has to have a primary key."""
        # case: No entry about relation is in DB yet
        if not self.__postgre_db.is_in_table(schema, table, id1_name + "=" + str(
                id1)):
            self.__postgre_db.insert(schema, table, {
                id1_name: id1, id2_name: [id2], "aggregation": 0})

        # case: Entry about single_pattern is in DB
        else:
            old_list = self.__postgre_db.get(schema, table, id1_name + "=" + str(
                id1), id2_name)
            new_list = list(set(old_list + [id2]))
            self.__postgre_db.update(schema, table, id2_name + "=" + add_quotes(replace_brackets(str(
                new_list))), id1_name + "=" + str(id1))

    def __push_aggregation_lowest_layer(self, schema, aggregation_object, aggregation_name, table, id_name):
        """Push the aggregated snippet numbers onto corresponding the lower layer tables."""
        for aggregation in aggregation_object:
            id = aggregation[aggregation_name][0]
            aggregation_value = aggregation[aggregation_name][1]
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation_value), id_name + "=" + str(id))

    def __push_aggregation(self, schema, table, sub_table, table_id, sub_table_id):
        """Calculate and push aggregation on the rest layer tables."""
        table_entries = self.__postgre_db.get_data_from_table(schema, table)
        for entry in table_entries:
            aggregation = 0
            entry_id = entry[table_id]
            entries_to_look_up = entry[sub_table_id]

            for look_up in entries_to_look_up:
                query = "SELECT SUM(aggregation) FROM " + schema + "." + sub_table + " WHERE " + sub_table_id + "=" + str(look_up)
                stored_value = self.__postgre_db.query(query)[0]['sum']
                if stored_value is None:
                    stored_value = 0
                aggregation += stored_value
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation), table_id + "=" + str(entry_id))

    def get_snippets(self, schema, constraints):
        """Get snippets for the whole corpus.

        Parameters:
        schema -- the database schema to work on
        constraints -- the constraint tuple list"""
        for ind, text in enumerate(self.__mongo_db.get(schema, {})):
            self.__postgre_db.insert(schema, "texts", {"title": text['title']})
            self.find_text_window(schema, text['text'], text['id'], constraints)
            print("Finished extracting snippets from chapter " + str(text['id']) + ".")

    def aggregation(self, schema):
        """Calculate aggregation bottom-up and store the interim data onto the database."""
        aggregation_texts_snippets = self.__postgre_db.query("SELECT " + schema + ".aggregate_texts_snippets()")
        aggregation_snippet_offsets = self.__postgre_db.query("SELECT " + schema + ".aggregate_snippet_offsets()")

        # push 2 lowest levels of the hierarchy
        self.__push_aggregation_lowest_layer(schema,
            aggregation_texts_snippets, str('aggregate_texts_snippets'), "texts_snippets", "text_id")
        self.__push_aggregation_lowest_layer(schema,
            aggregation_snippet_offsets, str('aggregate_snippet_offsets'), "snippet_offsets", "id")

        # push rest of the hierarchy
        self.__push_aggregation(schema,
            "pattern_single_pattern", "snippet_offsets", str('pattern_id'), str('single_pattern_id'))
        self.__push_aggregation(schema, "has_object", "pattern_single_pattern", str('bscale_id'), str('pattern_id'))
        self.__push_aggregation(schema, "has_attribute", "has_object", str('bsort_id'), str('bscale_id'))

    def aggregate_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args)
        if pattern_info is not None:
            pattern_ids = pattern_info[0]
            new_bscale_id = pattern_info[1]
            new_pattern_list = list(set.union(*[set(item) for item in pattern_ids]))
            aggregation = 0
            for item in new_pattern_list:
                aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation")
            self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation})

    def intersect_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args)
        if pattern_info is not None:
            pattern_ids = pattern_info[0]
            new_bscale_id = pattern_info[1]
            new_pattern_list = list(set.intersection(*[set(item) for item in pattern_ids]))
            aggregation = 0
            for item in new_pattern_list:
                aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation")
            self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation})

    def __add_new_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        if args is not None:
            bscale_table = self.__postgre_db.get_data_from_table(schema, "bscale")
            bscale_ids = []
            for scale in args:
                scale_found = False
                for bscale in bscale_table:
                    if scale == bscale['bscale']:
                        bscale_ids.append(bscale['id'])
                        scale_found = True
                if not scale_found:
                    raise Exception("Chosen Bscale does not exist.")
            if not self.__postgre_db.is_in_table(schema, "bscale", "bscale=" + add_quotes(new_bscale)):
                self.__postgre_db.insert(schema, "bscale", {"bscale": new_bscale, "nominal": False, "ordinal": False, "interval": False})
            new_bscale_id = self.__postgre_db.get_id(schema, "bscale", "bscale=" + add_quotes(new_bscale))
            self.__postgre_db.update(schema, "bscale", scale_type + "=" + add_quotes('True'), "id=" + str(new_bscale_id))
            bsort_id = self.__postgre_db.get_id(schema, "bsort", "bsort=" + add_quotes(bsort))
            if self.__postgre_db.is_in_table(schema, "has_attribute", "bsort_id=" + str(bsort_id)):
                old_list = self.__postgre_db.get(schema, "has_attribute", "bsort_id=" + str(bsort_id), "bscale_id")
                old_list.append(new_bscale_id)
                self.__postgre_db.update(schema, "has_attribute", "bscale_id=" + add_quotes(
                    replace_brackets(str(old_list))), "bsort_id=" + str(bsort_id))
            else:
                self.__postgre_db.insert(schema, "has_attribute",
                                         {"bsort_id": bsort_id, "bscale_id": [new_bscale_id], "aggregation": 0})

            scale_obj = self.__postgre_db.get_data_from_table(schema, "has_object")
            pattern_ids = []
            for scale_id in bscale_ids:
                for item in scale_obj:
                    if scale_id == item['bscale_id']:
                        pattern_ids.append(item['pattern_id'])

            return (pattern_ids, new_bscale_id)

    def find_correlating_pattern(self, schema):
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        all_bscales_table = self.__postgre_db.get_data_from_table(schema, "bscale")
        all_bscales = [bscale['id'] for bscale in all_bscales_table]

        for bscale_id in all_bscales:
            pattern_list = self.__postgre_db.get(schema, "has_object", "bscale_id=" + str(bscale_id), "pattern_id")
            for pattern_id in pattern_list:
                single_pattern_id_list = self.__postgre_db.get(
                    schema, "pattern_single_pattern", "pattern_id=" + str(pattern_id), "single_pattern_id")
                for single_pattern_id in single_pattern_id_list:
                    single_pattern = self.__postgre_db.get(schema, "single_pattern", "id=" + str(single_pattern_id), "single_pattern")
                    self.__postgre_db.insert(schema, "bscale_single_pattern", {"bscale_id": bscale_id, "single_pattern_id": single_pattern_id, "single_pattern": single_pattern , "count": 0})
        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            correlating_pattern = self.parser.get_correlating_nouns_and_adjectives(snippet)
            for ind, item in enumerate(correlating_pattern):
                if self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                 "single_pattern=" + add_quotes(item)):
                    pattern_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(item)), "single_pattern_id")
                    index = ind + 1
                    while index < len(correlating_pattern):
                        next_item = correlating_pattern[index]
                        if self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                 "single_pattern=" + add_quotes(next_item)):
                            pattern_next_item_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(next_item)), "single_pattern_id")
                            if pattern_id != pattern_next_item_id:
                                first_combination_in_table = self.__postgre_db.is_in_table(
                                        schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id))
                                second_combination_in_table = self.__postgre_db.is_in_table(
                                    schema, "correlating_pattern",
                                    "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(pattern_id))

                                # update entry if already exists in table correlating_pattern
                                if first_combination_in_table:
                                    old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id), "count")
                                    new_count = old_count + 1
                                    self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count), "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id))
                                elif second_combination_in_table:
                                    old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str(
                                        pattern_next_item_id) + " and pattern_b=" + str(pattern_id), "count")
                                    new_count = old_count + 1
                                    self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count),
                                                             "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(
                                                                 pattern_id))
                                else:
                                    # create new entry for pattern pair if none exists
                                    self.__postgre_db.insert(schema, "correlating_pattern", {
                                        "pattern_a": pattern_id, "pattern_b": pattern_next_item_id, "count": 1})
                        index += 1

    def find_spo_and_adjectives(self, schema):
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            spo = self.parser.get_SVO(snippet)
            for item in spo:
                if item is not None:
                    # subject is pattern
                    if item.subject != "'":
                        if self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.subject)):
                            self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(
                                schema, item.subject, item.verb, "subject_verb_occ", "subject", "verb")
                            if item.object != '':
                                self.push_parser_items(schema, item.object, "object_occ", "object")
                                self.push_parser_item_relationship(schema,
                                        item.subject, item.object, "subject_object_occ", "subject", "object")
                        #object is pattern
                        elif self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.object)):
                            self.push_parser_items(schema, item.object, "object_occ", "object")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(schema,
                                                               item.object, item.verb, "object_verb_occ", "object", "verb")
                            if item.subject != '':
                                self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                                self.push_parser_item_relationship(schema,
                                        item.subject, item.object, "subject_object_occ", "subject", "object")

            noun_adjectives = self.parser.nouns_adj_spacy(snippet)
            for item in noun_adjectives:
                subject = item['noun']
                adjective = item['adj']
                if self.__postgre_db.is_in_table(
                        schema, "single_pattern", "single_pattern=" + add_quotes(item['noun'])):
                    self.push_parser_items(schema, subject, "subject_occ", "subject")
                    self.push_parser_items(schema, adjective, "adjective_occ", "adjective")
                    self.push_parser_item_relationship(
                        schema, subject, adjective, "subject_adjective_occ", "subject", "adjective")

    def push_parser_items(self, schema, word, table, word_type):
        if not self.__postgre_db.is_in_table(schema, table, word_type + "=" + add_quotes(word)):
            self.__postgre_db.insert(schema, table, {word_type: word, "count": 0})

    def push_parser_item_relationship(self, schema, word1, word2, table, word_type1, word_type2):
        word1_id = self.__postgre_db.get_id(schema, word_type1 + "_occ", word_type1 + "=" + add_quotes(word1))
        word2_id = self.__postgre_db.get_id(schema, word_type2 + "_occ", word_type2 + "=" + add_quotes(word2))

        if not self.__postgre_db.is_in_table(schema, table, word_type1 + "=" + str(
                word1_id) + " and " + word_type2 + "=" + str(word2_id)):
            self.__postgre_db.insert(schema, table, {word_type1: word1_id, word_type2: word2_id, "count": 1})
        else:
            table_id = self.__postgre_db.get_id(schema, table, word_type1 + "=" + str(word1_id) + " and " + word_type2 + "=" + str(word2_id))
            old_count = self.__postgre_db.get(schema, table, "id=" + str(table_id), "count")
            self.__postgre_db.update(schema, table, "count=" + str(old_count + 1), "id=" + str(table_id))

    def aggregate_occurences_help(self, text_counter, word):
        """Return the corpus count of a word; fall back to 1 for unseen words so that the
        later PMI division never hits a zero count."""
        count = text_counter[word]
        if count == 0:
            return 1
        return count

    def calculate_pmi(self, schema):
        print("Calculating PMI for " + schema)
        corpus_count = 0
        for item in self.__mongo_db.get(schema, {}):
            corpus_count += len(word_tokenize(item['text']))
        print(corpus_count)
        print("Lemmatizing corpus.")
        lemmatized_text = []
        for ind, text in enumerate(self.__mongo_db.get(schema, {})):
            doc = text['text']
            for ch in ['›', '‹', '»', '«']:
                if ch in doc:
                    doc = doc.replace(ch, '"')
            lemmatized_text += self.parser.lemmatize_chunk(doc)
            print("Part " + str(ind) + " lemmatized.")
        self.aggregate_occurences(schema, "subject", lemmatized_text)
        self.aggregate_occurences(schema, "object", lemmatized_text)
        self.aggregate_occurences(schema, "adjective", lemmatized_text)
        self.aggregate_occurences(schema, "verb", lemmatized_text)
        print("Finished aggregating occurences.")
        self.calculate_pmi_helper(schema, corpus_count, "subject_adjective_occ", "subject", "adjective")
        self.calculate_pmi_helper(schema, corpus_count, "subject_verb_occ", "subject", "verb")
        self.calculate_pmi_helper(schema, corpus_count, "subject_object_occ", "subject", "object")
        self.calculate_pmi_helper(schema, corpus_count, "object_verb_occ", "object", "verb")

    def aggregate_occurences(self, schema, word_table, lemmatized_text):
        table = self.__postgre_db.get_data_from_table(schema, word_table + "_occ")
        for item in table:
            word = item[word_table]
            split_word = word.split(" ")
            length = len(split_word)
            if length > 1:
                if length == 2:
                    counter = list(bigrams(lemmatized_text))
                    word_tuple = (split_word[0], split_word[1])
                elif length == 3:
                    counter = list(trigrams(lemmatized_text))
                    word_tuple = (split_word[0], split_word[1], split_word[2])
                else:
                    counter = []
                count = counter.count(word_tuple)
            else:
                word = item[word_table]
                count = self.aggregate_occurences_help(Counter(lemmatized_text), word)
            print(word, str(count))
            self.__postgre_db.update(schema, word_table + "_occ", "count=" + str(count), "id=" + str(item['id']))

    def calculate_pmi_helper(self, schema, corpus_count, co_occurence, word1, word2):
        co_occ_table = self.__postgre_db.get_data_from_table(schema, co_occurence)
        for item in co_occ_table:
            item_id = item['id']
            co_occ_freq = float(item['count'] / corpus_count)
            word1_id = item[word1]
            word2_id = item[word2]
            word1_occ = self.__postgre_db.get(schema, word1 + "_occ", "id=" + str(word1_id), "count")
            word2_occ = self.__postgre_db.get(schema, word2 + "_occ", "id=" + str(word2_id), "count")
            pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
            self.__postgre_db.update(schema, co_occurence, "pmi=" + str(pmi), "id=" + str(item_id))

    def calculate_pmi_use_case2(self, schema):
        print("Calculating PMI for " + schema)
        corpus_count = 0
        text = []
        for item in self.__mongo_db.get(schema, {}):
            text += word_tokenize(item['text'], language='german')
            corpus_count += len(word_tokenize(item['text'], language='german'))
        print(corpus_count)
        counter = Counter(text)
        single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern")
        # counting single pattern occurrences
        for item in single_pattern_table:
            word = item['single_pattern']
            count = counter[word]
            self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word))

        # pmi calculation
        co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern")
        for item in co_occ_table:
            item_id = item['id']
            co_occ_freq = float(item['count'] / corpus_count)
            word1_id = item['pattern_a']
            word2_id = item['pattern_b']
            word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count")
            print(word1_occ)
            word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count")
            print(word2_occ)
            pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
            print(pmi)
            self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))

    def get_results_use_case2(self, schema):
        print("Colour + Nature")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 2 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Colour + Location")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Colour + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Nature + Location")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Nature + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))
        print("Location + Social")
        pprint(self.__postgre_db.query(
            """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 3 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC"""))

    def check_pattern(self, pattern, token):
        """Strip token and check if the token matches the defined pattern.

        Parameter:
        pattern -- the pattern to search for
        token -- the token to match with the pattern
        """
        split_token = re.split(r'\W+', token)
        if split_token[0] == '':
            split_token = split_token[1]
        else:
            split_token = split_token[0]
        return split_token == pattern
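    # Illustrative note (not part of the original class): check_pattern("Haus", "Haus,") and
    # check_pattern("Haus", '"Haus') both return True, because re.split(r'\W+', ...) strips
    # the surrounding punctuation before the comparison.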

    def get_result(self, schema):
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_verb_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.object_verb_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_object_occ SV"""))
        print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_adjective_occ SV"""))
        pprint(self.__postgre_db.query("""SELECT S.subject, V.verb, SV.pmi FROM """ + schema + """.subject_verb_occ SV, """ + schema + """.subject_occ S, """ + schema + """.verb_occ V WHERE SV.subject = S.id AND SV.verb = V.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT O.object, V.verb, OV.pmi FROM """ + schema + """.object_verb_occ OV, """ + schema + """.object_occ O, """ + schema + """.verb_occ V WHERE OV.object = O.id AND OV.verb = V.id ORDER BY object DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT O.object, S.subject, SO.pmi FROM """ + schema + """.subject_object_occ SO, """ + schema + """.subject_occ S, """ + schema + """.object_occ O WHERE SO.object = O.id AND SO.subject = S.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query("""SELECT S.subject, A.adjective, SA.pmi FROM """ + schema + """.subject_adjective_occ SA, """ + schema + """.subject_occ S, """ + schema + """.adjective_occ A WHERE SA.subject = S.id AND SA.adjective = A.id ORDER BY subject DESC, pmi DESC"""))
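The value written back by calculate_pmi_helper and calculate_pmi_use_case2 is plain pointwise mutual information, log2(P(a,b) / (P(a) * P(b))), with each probability estimated as a count divided by the corpus size. A minimal standalone sketch of that arithmetic, using made-up counts rather than values from the tables above:

from math import log2

def pmi(co_count, count_a, count_b, corpus_count):
    """Pointwise mutual information from raw counts (all counts assumed to be > 0)."""
    p_ab = co_count / corpus_count
    p_a = count_a / corpus_count
    p_b = count_b / corpus_count
    return log2(p_ab / (p_a * p_b))

# hypothetical counts: pair seen 4 times, words seen 20 and 50 times, corpus of 10,000 tokens
print(pmi(4, 20, 50, 10000))  # ~5.32, i.e. the pair co-occurs far more often than chance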
Пример #46
def tokenize(text):
	tknzr = WhitespaceTokenizer()
	tokens = tknzr.tokenize(text)
	# tokens = nltk.word_tokenize(text)
	return tokens
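A quick usage check for the function above (the sentence is made up): WhitespaceTokenizer splits only on whitespace, so punctuation stays attached to the neighbouring word, unlike the commented-out nltk.word_tokenize alternative.

print(tokenize("Hello, world! NLP is fun."))
# ['Hello,', 'world!', 'NLP', 'is', 'fun.']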
Пример #47
def word_parser( input_str ):
    tokenizer = WhitespaceTokenizer()
    return tokenizer.tokenize( input_str )
Пример #48
import nltk
nltk.download()  # opens the interactive downloader; at minimum the 'wordnet' data is needed for the lemmatizer below

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

text = "this is a block of text. I am writing a piece to explain the use of nlp packages."
text = 'Feet wolves talked cats'  # overrides the sentence above; these words show the effect of lemmatization

###### tokenize
tokenizer1 = WhitespaceTokenizer()  # split on whitespace only
tokenizer2 = WordPunctTokenizer()  # split words and punctuation into separate tokens
tokenizer3 = TreebankWordTokenizer()  # Penn Treebank tokenization conventions

tokens1 = tokenizer1.tokenize(text)
tokens2 = tokenizer2.tokenize(text)
tokens3 = tokenizer3.tokenize(text)

######
# best: lemmatize first, then stem
from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lem = WordNetLemmatizer()

lemmatized_tokens = []
for token in tokens3:
    lemmatized_tokens.append(lem.lemmatize(token))

# tokens are now lemmatized; a stemming pass is sketched below
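The comment above recommends lemmatizing first and stemming afterwards, but the snippet stops at the lemmas; a minimal continuation using the PorterStemmer created above (exact output can vary slightly between NLTK versions):

stemmed_tokens = [ps.stem(token) for token in lemmatized_tokens]
print(stemmed_tokens)  # roughly ['feet', 'wolf', 'talk', 'cat'] for the second example text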
Пример #49
# but now works
# imports needed by this snippet
import string

from nltk.tokenize import WhitespaceTokenizer

ic_dict = {}

cong = []
all_tokens = 0
# create IC dict
#tokenizer = RegexpTokenizer(r'\w+')
tokenizer = WhitespaceTokenizer()
filename = "../Subtlex.US.txt"
for line in open(filename, "r").readlines():
    line = line.lower()
    line = line.strip()
    # strip surrounding punctuation from every word before tokenizing
    line = ' '.join(word.strip(string.punctuation) for word in line.split())
    t_list = tokenizer.tokenize(line)
    print(t_list)

    for token in t_list:
        # normalize to plain lowercase ASCII and count the token
        token = token.encode("ascii", "ignore").decode("ascii").lower()
        if token in ic_dict:
            ic_dict[token] += 1
        else:
            ic_dict[token] = 1

        all_tokens += 1
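The per-token bookkeeping above can also be written with collections.Counter; a minimal sketch of the equivalent counting step (punctuation stripping left out for brevity, same file path assumed):

from collections import Counter

from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
ic_counter = Counter()
with open("../Subtlex.US.txt", "r") as infile:
    for line in infile:
        ic_counter.update(tokenizer.tokenize(line.lower().strip()))
all_tokens = sum(ic_counter.values())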
Пример #50
	SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
	ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
	ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),	]

tagged_sentences=[]
tokenizer = WhitespaceTokenizer()
with open("datascience_6.txt","r") as openfile:
	for line in openfile:
		words = line.lower().strip()
		words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\>|\?|\/|\;|\:|\"|\'', '',words)
		words=words.split('\r')
		jobposts = [s.lstrip() for s in words]
		for jobpost in jobposts:
			sentences=jobpost.split('.')
			for sentence in sentences:
				tokenized_sentence=tokenizer.tokenize(sentence)
				initial_tagged_sentence=nltk.pos_tag(tokenized_sentence)
				tagged_sentences.append(initial_tagged_sentence)
# drop empty tagged sentences before training the unigram tagger
tagged_no_empties = [sentence for sentence in tagged_sentences if sentence]
unigram_tagger=nltk.UnigramTagger(tagged_no_empties)
trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger,
                                 templates=templates, trace=3, deterministic=True)
brill_tagger = trainer.train(tagged_sentences, max_rules=10)
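Once trained, the Brill tagger can be applied to freshly tokenized text through the usual tag() interface; a small usage sketch (the sentence is made up, and the call assumes the same older NLTK version that provides FastBrillTaggerTrainer):

sample_tokens = tokenizer.tokenize("experience with python and machine learning required")
print(brill_tagger.tag(sample_tokens))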

Пример #51
	BiGramTriGram = Counter()
	TriGramTriGram = Counter()
	tokenizer = WhitespaceTokenizer()
	for line in openfile:
		words = line.lower().strip().replace('(',',').replace(')',',')
		words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\.|\>|\?|\/|\;|\:|\"|\'', '',words)
		words = pattern.sub('', words)
		words=words.split('\r')
		words = [s.lstrip() for s in words]
		ReservoirALL={}
		for word in words:
			CountWordGrams = Counter()
			CountBiGrams = Counter()
			CountTriGrams = Counter()
			
			wordsplit= tokenizer.tokenize(word)
			wordsplit = [s.lstrip() for s in wordsplit]
			NoDupes = list(set(wordsplit))
			TuplesNoDupes=[tuple(i.split()) for i in NoDupes]
			skillsonly=[x for x in TuplesNoDupes if x in SKILLS]
			skillsclean = [token for token in skillsonly if token not in Stopwords]
			
			BiGrams=bigrams(wordsplit)
			NoDupesBiGrams = list(set(BiGrams))
			BiGrams=[x for x in NoDupesBiGrams if x in SKILLS]
			TriGrams=trigrams(wordsplit)
			NoDupesTriGrams = list(set(TriGrams))
			TriGrams=[x for x in NoDupesTriGrams if x in SKILLS]
		
			CountWordGrams.update(skillsclean)
			CountBiGrams.update(BiGrams)