Example #1
import gensim

def frequency_vectorize_gensim(corpus):
    # Tokenize each document (tokenize is an external helper; see the sketch below).
    corpus = [tokenize(doc) for doc in corpus]
    # Map every unique token to an integer id.
    id2word = gensim.corpora.Dictionary(corpus)
    # Encode each document as a bag of words: (token_id, count) pairs.
    vectors = [
        id2word.doc2bow(doc) for doc in corpus
    ]
    return vectors
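A minimal usage sketch for the function above. The tokenize helper is not shown in these examples, so a simple lowercasing whitespace splitter is assumed here purely for illustration:

import gensim

def tokenize(doc):
    # Hypothetical stand-in for the tokenizer the examples rely on.
    return doc.lower().split()

corpus = [
    "the elephant sneezed at the sight of potatoes",
    "bats can see via echolocation",
]
vectors = frequency_vectorize_gensim(corpus)
print(vectors[0])  # list of (token_id, count) pairs for the first document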
Example #2
import gensim

def one_hot_vectorize_gensim(corpus):
    corpus = [tokenize(doc) for doc in corpus]
    id2word = gensim.corpora.Dictionary(corpus)
    # Keep only token presence: replace each bag-of-words count with 1.
    vectors = [
        [(token[0], 1) for token in id2word.doc2bow(doc)]
        for doc in corpus
    ]
    return vectors
Example #3
def post_number(sentence):
    words = utils.tokenize(sentence)

    abb_map = mapper.load(POSTNUM_ABB_PATH, first_caps=True)

    # Expand an abbreviation only when it is immediately followed by a number.
    for i in range(1, len(words)):
        word = utils.strip(words[i-1])
        if word in abb_map and utils.is_number(utils.strip(words[i])):
            words[i-1] = utils.replace(words[i-1], str(abb_map[word]))

    return " ".join(words)
Example #4
def basic(sentence):
    words = utils.tokenize(sentence)

    abb_map = mapper.load(ABB_PATH)

    # Replace every word that appears in the abbreviation map with its expansion.
    for i in range(len(words)):
        word = utils.strip(words[i])
        if word in abb_map:
            words[i] = utils.replace(words[i], str(abb_map[word]))

    return " ".join(words)
Example #5
def full_name(sentence):
    words = utils.tokenize(sentence)

    first_list = lister.load(FIRSTNAME_PATH)
    sur_list = lister.load(SURNAME_PATH)

    # When a first name is directly followed by a surname, shorten it to its initial.
    for i in range(1, len(words)):
        first = utils.strip(words[i-1])
        last = utils.strip(words[i])
        if last in sur_list and first in first_list:
            words[i-1] = utils.replace(words[i-1], str(first[:1] + "."))

    return " ".join(words)
Example #6
def build_model(self, corpus):
    """
    Build bigram model
    :param corpus: Space separated string of all sentences
    :return:
    """
    # Requires: from collections import Counter, defaultdict
    words = utils.tokenize(corpus)
    self.word_model = Counter(words)  # Count(word)
    bigrams = list(utils.get_chunks(
        words, 2))  # Can be changed to any arbitrary ngrams
    self.bigram_model = defaultdict(Counter)  # Count(word2|word1)
    for tup in bigrams:
        # defaultdict(Counter) never raises KeyError; only guard against a
        # trailing chunk that has fewer than two words.
        if len(tup) == 2:
            self.bigram_model[tup[0]][tup[1]] += 1
    self.save_model()
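For context, a self-contained sketch of the same counting idea without the class wrapper. It uses a sliding window of consecutive word pairs, which may differ from what utils.get_chunks produces:

from collections import Counter, defaultdict

def build_bigram_counts(words):
    # Count(word2 | word1) built from consecutive word pairs.
    bigram_counts = defaultdict(Counter)
    for w1, w2 in zip(words, words[1:]):
        bigram_counts[w1][w2] += 1
    return bigram_counts

counts = build_bigram_counts("the cat sat on the mat".split())
# Estimate P("cat" | "the") from the counts: "the" is followed once by "cat"
# and once by "mat", so the estimate is 0.5.
print(counts["the"]["cat"] / sum(counts["the"].values()))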
Example #7
def execute(sentence):
    # Convert each run of spelled-out number words into a single digit string,
    # accumulating units, hundreds, thousands and millions as it scans.
    words = utils.tokenize(sentence)
    parsed = []
    last_ok = -1
    num_map = mapper.load(NUM_PATH)
    for i in range(0, len(words)):
        word = words[i]
        changed = utils.strip(word)
        if changed in num_map and last_ok < i:
            number = 0
            buffer = 0
            last_ok = i
            for j in range(i, len(words)):
                actual = words[j]
                actual = utils.strip(actual)
                if actual not in num_map:
                    break

                if num_map[actual] == '1000000':
                    if buffer == 0:
                        break
                    else:
                        number += 1000000 * buffer
                        buffer = 0
                elif num_map[actual] == '1000':
                    if buffer == 0:
                        break
                    else:
                        number += 1000 * buffer
                        buffer = 0
                elif num_map[actual] == '100':
                    buffer = buffer * 100
                else:
                    buffer += int(num_map[actual])
                last_ok = j
                if actual != words[j]:
                    break

            number += buffer
            parsed.append(utils.replace(words[last_ok], str(number)))
        else:
            if last_ok < i:
                parsed.append(utils.replace(word, changed))
    return " ".join(parsed)
Example #8
import gensim

def tf_idf_vectorize_gensim(corpus):
    corpus = [tokenize(doc) for doc in corpus]
    lexicon = gensim.corpora.Dictionary(corpus)
    # Re-weight each (token_id, count) pair by its TF-IDF score.
    tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)
    vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]
    return vectors
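A usage sketch, reusing the hypothetical tokenize helper from the sketch after Example #1:

docs = [
    "bats can see via echolocation",
    "the elephant sneezed at the sight of potatoes",
]
vectors = tf_idf_vectorize_gensim(docs)
# Each vector is a sparse list of (token_id, weight) pairs; tokens concentrated
# in one document get higher weights than tokens spread across the corpus.
for vec in vectors:
    print(vec)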
Example #9
from collections import defaultdict

def frequency_vectorize_nltk(doc):
    # Count how many times each token occurs in the document.
    features = defaultdict(int)
    for token in tokenize(doc):
        features[token] += 1
    return features
Example #10
def one_hot_vectorize_nltk(doc):
    # Record only the presence of each token, not its count.
    vectors = {
        token: True
        for token in tokenize(doc)
    }
    return vectors
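These NLTK-style feature dictionaries can be fed straight into an NLTK classifier. A minimal sketch with made-up training data, again assuming the simple tokenize helper sketched after Example #1:

import nltk

# Hypothetical labelled documents; real data would come from a corpus reader.
train_data = [
    ("what a great film loved it", "pos"),
    ("terrible plot and bad acting", "neg"),
]
featuresets = [(one_hot_vectorize_nltk(text), label) for text, label in train_data]
classifier = nltk.NaiveBayesClassifier.train(featuresets)
print(classifier.classify(one_hot_vectorize_nltk("great acting")))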