Example #1
def preprocess_data(train_data, test_data):
    custom_stopwords = set(ENGLISH_STOP_WORDS)
    custom_stopwords.update(["say", "says", "said", "saying", "just", "year", "man", "men", "woman", \
     "women", "guy", "guys", "run", "running", "ran", "run", "do", "don't", "does", "doesn't" , \
     "doing", "did", "didn't",  "use", "used", "continue", "number", "great", "big", "good", "bad", \
     "better", "worse", "best", "worst", "actually", "fact", "way", "tell", "told", "include", "including", \
     "want", "wanting", "will", "won't", "give", "given", "month", "day", "place", "area", "look", \
     "looked", "far", "near", "get", "getting", "got", "know", "knows", "knew", "long", "week", "have", \
     "has", "haven't", "hasn't", "having", "had", "hadn't", "not", "think", "thinking", "Monday", \
     "Tuesday", "Wednesday", "Thursday", "Saturday", "Sunday", "high", "low", "thing", "there", "they're", \
     "It", "I've", "I'd", "He's", "She's", "They've", "I'm", "You're", "your", "their", "his", "hers", \
     "mine", "today", "yesterday", "it", "ve", "going", "go", "went", "lot", "don", "saw", "seen", "come", "came"])

    titled_train_data = add_titles(train_data['Content'], train_data['Title'])
    if test_data is not None:
        titled_test_data = add_titles(test_data['Content'], test_data['Title'])

    # Removing stopwords:
    new_train_data = []
    for doc in titled_train_data:
        doc_wordlist = doc.split()
        new_doc_wordlist = [
            word for word in doc_wordlist if word not in custom_stopwords
        ]
        new_doc = ' '.join(new_doc_wordlist)
        new_train_data.append(new_doc)
    if test_data is not None:
        new_test_data = []
        for doc in titled_test_data:
            doc_wordlist = doc.split()
            new_doc_wordlist = [
                word for word in doc_wordlist if word not in custom_stopwords
            ]
            new_doc = ' '.join(new_doc_wordlist)
            new_test_data.append(new_doc)

    p = PorterStemmer()
    train_docs = p.stem_documents(new_train_data)
    if test_data is not None:
        test_docs = p.stem_documents(new_test_data)
    print("my_method: Stemmed data.")

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_docs)
    if test_data is not None:
        Test = vectorizer.transform(test_docs)
    else:
        Test = None
    print("my_method: Vectorized data")

    svd_model = TruncatedSVD(n_components=200)  # random_state=13
    X = svd_model.fit_transform(X)
    if test_data is not None:
        Test = svd_model.transform(Test)
    print("SVD'd data")

    return X, Test
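
A minimal sketch of the imports Example #1 relies on (they live elsewhere in its original module), plus a hypothetical add_titles stand-in; the real helper is project-specific and not shown in the source:

from gensim.parsing.porter import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def add_titles(contents, titles):
    # Hypothetical stand-in: prepend each title to its document body.
    return [str(t) + ' ' + str(c) for t, c in zip(titles, contents)]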
Example #2
def assign_country_label_ids(country_scores, label_score, num_candidates, use_label_candidates):
    """Output: Dictionary --> key = country, value = label"""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()

    if use_label_candidates is True:
        # print('USING SOFT LABELING')
        final_labels = defaultdict(set)
        final_ids = defaultdict(set)

        for row in country_scores.itertuples():
            if len(final_labels[row.country]) <= num_candidates and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country].add((row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi))
                final_ids[row.country].add(int(row.label_id))
                used_stems.add(row.stem)
    else:

        final_labels = {}
        final_ids = {}

        for row in country_scores.itertuples():
            if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
                final_ids[row.country] = row.label_id
                used_stems.add(row.stem)
    return final_labels, final_ids
def get_top_labels(country_scores):
    """Output: two lists indexed by country id, holding each country's top label entries ranked by tfidf and by pmi."""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents(
        [str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    country_scores_pmi = country_scores.sort_values(by="pmi", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    top_labels_pmi = [[]
                      for x in range(country_scores_pmi['num_countries'][0])]

    used_stems = set()
    used_stems_pmi = set()

    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if len(top_labels[row.country]) < 40:
                top_labels[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(), row.tfidf,
                    row.pmi, row.country
                ])
                used_stems.add(row.stem)

    for row in country_scores_pmi.itertuples():
        if row.stem not in used_stems_pmi:
            if len(top_labels_pmi[row.country]) < 40:
                top_labels_pmi[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(), row.tfidf,
                    row.pmi, row.country
                ])
                used_stems_pmi.add(row.stem)

    return top_labels, top_labels_pmi
Example #4
def preprocess_documents(documents):
    # preprocess each doc
    documents = [preprocess_doc(doc) for doc in documents]

    # stem the documents
    stemmer = PorterStemmer()
    documents = stemmer.stem_documents(documents)

    # split all the documents into list of tokens
    documents = [doc.split() for doc in documents]

    return documents
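
For reference, gensim's PorterStemmer.stem_documents maps a list of document strings to a list of stemmed strings (one output string per input document), which is why tokenization happens only afterwards. A quick standalone check, independent of the project-specific preprocess_doc:

from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()
docs = ["running quickly through the cities", "the runner keeps running"]
print(stemmer.stem_documents(docs))  # one stemmed string per input document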
Example #5
def main():
    ############################## Setup Code #####################################
    global document_index
    path = "./myroot"
    file3 = open("cmptext.txt", "w+")
    number_of_documents = recursive_read(path, file3)
    file3.close()
    print('All files read')
    file3 = open("cmptext.txt", "r")
    preprocess(file3, number_of_documents)
    file3.close()
    print('All files processed')
    print('Word2Vec begins')
    model = get_word2vec(number_of_documents)  #includes trigrams
    model.save('vocab.txt')
    print('Word2Vec done')
    vocabulary = model.wv.vocab.keys()
    inverted_index = get_inverted_index(vocabulary)
    for item in list(inverted_index.keys()):
        if not inverted_index[item]:
            del inverted_index[item]
    with open("inverted-index.txt", "wb") as fp:
        pickle.dump(inverted_index, fp)
    get_tfidf_vectors(inverted_index, number_of_documents)
    get_norms()
    doc_num = 0
    file1 = open("cmptext.txt", "r")
    stemmer = PorterStemmer()
    for document in file1:
        spreprocessed = []
        doc_num += 1
        for line in document.split('. '):
            temp1 = []
            temp2 = []
            temp1 = gensim.utils.simple_preprocess(line, max_len=20)
            for word in temp1:
                if word not in stop_words:
                    temp2.append(word)
            spreprocessed.append(stemmer.stem_documents(temp2))
        with open("spreprocessed" + str(doc_num) + ".txt", "wb") as fp:
            pickle.dump(spreprocessed, fp)
        del spreprocessed[:]
    file1.close()
    with open("document-index.txt", "wb") as fp:
        pickle.dump(document_index, fp)
Example #6
def assign_country_label_ids(country_scores, label_score):
    """Output: Dictionary --> key = country, value = label"""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()

    final_labels = {}
    final_ids = {}

    for row in country_scores.itertuples():
        if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
            final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
            final_ids[row.country] = row.label_id
            used_stems.add(row.stem)
    return final_labels, final_ids
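
A hedged illustration of the input this function expects: a pandas DataFrame with country, label, label_id, tfidf and pmi columns, plus a module-level BLACK_LIST of stems to skip. The column values and BLACK_LIST entries below are made up:

import pandas as pd
from gensim.parsing.porter import PorterStemmer

BLACK_LIST = {'wa', 'articl'}  # hypothetical excluded stems
country_scores = pd.DataFrame({
    'country': ['France', 'France', 'Japan'],
    'label': ['wine_regions', 'wines', 'anime'],
    'label_id': [11, 12, 27],
    'tfidf': [0.81, 0.79, 0.92],
    'pmi': [0.40, 0.38, 0.55],
})
labels, ids = assign_country_label_ids(country_scores, label_score='tfidf')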
Example #7
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "wb")
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        preprocess_list2 = []
        for word in preprocess_list1:
            if word not in stop_words:
                preprocess_list2.append(word)
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word.encode('utf-8')))
            fp2.write(' ')
        fp2.write('\n')
    fp1.close()
    fp2.close()
Example #8
def get_top_labels(country_scores, label_score, num_candidates=5):
    """Output: list of [country, label_id, label, tfidf, pmi] rows, at most num_candidates per country."""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    num_labels_per_country = defaultdict(int)
    top_labels = []
    used_stems = set()

    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if num_labels_per_country[row.country] < num_candidates:
                top_labels.append([row.country, row.label_id, row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi])
                used_stems.add(row.stem)
                num_labels_per_country[row.country] += 1

    return top_labels
def get_top_labels(country_scores):
    """Output: list indexed by country id, each entry holding that country's top label data."""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents(
        [str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    used_stems = set()
    country_scores = country_scores.fillna(0)
    for row in country_scores.itertuples():
        if row.stem not in used_stems and not isinstance(row.stem, int):
            # cap the number of label entries collected per country
            if len(top_labels[row.country]) < 90:
                top_labels[row.country].extend([
                    str(row.label).lower().replace(' ', '_').strip(),
                    float(row.tfidf), row.country
                ])
                used_stems.add(row.stem)
    return top_labels
y = le.transform(train_data["Category"])

titled_train_data = add_titles(train_data['Content'], train_data['Title'])

# Removing stopwords:
new_train_data = []
for doc in titled_train_data:
    doc_wordlist = doc.split()
    new_doc_wordlist = [
        word for word in doc_wordlist if word not in custom_stopwords
    ]
    new_doc = ' '.join(new_doc_wordlist)
    new_train_data.append(new_doc)

p = PorterStemmer()
train_docs = p.stem_documents(new_train_data)
print("Stemmed data.")

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_docs)
print("Vectorized data")

svd_model5 = TruncatedSVD(n_components=5)  # random_state=42
svdX5 = svd_model5.fit_transform(X)
svd_model50 = TruncatedSVD(n_components=50)  # random_state=13
svdX50 = svd_model50.fit_transform(X)
svd_model200 = TruncatedSVD(n_components=200)  # random_state=13
svdX200 = svd_model200.fit_transform(X)
print("SVD'd data")

# Cross Validation:
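
The snippet breaks off at the cross-validation step. A hedged sketch of what it might look like over the SVD-reduced feature sets, using scikit-learn's cross_val_score with an arbitrary linear SVM (not necessarily the classifier the original code used):

from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Illustrative only: compare the three SVD dimensionalities with 5-fold CV.
for name, features in [('svd5', svdX5), ('svd50', svdX50), ('svd200', svdX200)]:
    scores = cross_val_score(LinearSVC(), features, y, cv=5)
    print(name, scores.mean())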
Example #11
documents = ["", ""]
with open(source_document_path) as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        documents[0] += line

with open(target_document_path) as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        documents[1] += line

# bag of words
texts = [[text for text in simple_preprocess(doc, deacc=True)]
         for doc in documents]

#stemming
p = PorterStemmer()
for k in range(len(texts)):
    texts[k] = p.stem_documents(texts[k])

#Reconvert documents to collection of words/bigrams
#texts_bigrams = [[text for text in bigram_phraser[ simple_preprocess(doc, deacc=True)]] for doc in documents]

# build N-gram
texts_bigrams = [[]] * 2
for k in range(len(texts)):
    texts_bigrams[k] = [""] * (len(texts[k]) - 1)
    for kk in range(len(texts[k])):
        if (kk < len(texts[k]) - 1):
            texts_bigrams[k][kk] = texts[k][kk] + "_" + texts[k][kk + 1]

# remove the most frequent words and the stop words
for k in range(len(texts)):
    word_counter = {}
Example #12
neg_list = pd.read_csv('./model_inputs/lm_word_lists/lm_negative.csv',
                       header=None)[0].str.lower().to_list()
pos_neg = pos_list + neg_list

# tokenize and remove punctuation
calls['text'] = calls['text'].apply(
    lambda x: [w for w in tokenize(x, deacc=True)])

# remove stopwords and uppercase words
calls['text'] = calls['text'].apply(lambda x: [remove_stopwords(w) for w in x])
calls['text'] = calls['text'].apply(
    lambda x: [w for w in x if (2 < len(w) < 15) and (w.islower())])

# stemming (porter)
p = PorterStemmer()
calls['text'] = calls['text'].apply(lambda x: p.stem_documents(x))

# create dictionary object
dictionary = corpora.Dictionary(calls['text'])

agg_word_freq = {dictionary[k]: v for k, v in dictionary.cfs.items()}
agg_word_freq = pd.DataFrame({
    'token': agg_word_freq.keys(),
    'cf': agg_word_freq.values()
})

agg_doc_freq = {dictionary[k]: v for k, v in dictionary.dfs.items()}
agg_doc_freq = pd.DataFrame({
    'token': agg_doc_freq.keys(),
    'df': agg_doc_freq.values()
})
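
dictionary.cfs holds each token's total count across the corpus and dictionary.dfs the number of documents it appears in, so the two DataFrames above are just those mappings keyed by the token string. A small follow-up sketch that merges them into one frequency table (illustrative, not from the original source):

freq_table = agg_word_freq.merge(agg_doc_freq, on='token')
print(freq_table.sort_values('cf', ascending=False).head(10))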
Example #13
def live(model, input_query, vocabulary, length_preprocessed, inverted_index,
         document_dictionary, norms, nod, list_of_document_tfidf_dicts,
         user_id):
    ################################ Live Code ########################################
    ERROR_MESSAGE = ""
    ANSWER = {
        "error": None,
        "main_ans": None,
        "Ans_1": None,
        "Ans_2": None,
        "Ans_3": None,
        "Ans_4": None,
        "Ans_5": None,
        "Ans_6": None,
        "Ans_7": None,
        "Ans_8": None,
        "Ans_9": None,
        "Ans_10": None
    }
    stemmer = PorterStemmer()
    preprocessed_query = gensim.utils.simple_preprocess(input_query,
                                                        max_len=20)
    filtered_sentence = []
    for word in preprocessed_query:
        if word not in stop_words:
            filtered_sentence.append(word)
    preprocessed_query = filtered_sentence
    preprocessed_query = stemmer.stem_documents(preprocessed_query)
    expanded_query = set(
        get_expanded_query(preprocessed_query, model, vocabulary))
    eq_vector = get_eq_tfidf_vector(inverted_index, expanded_query,
                                    preprocessed_query, length_preprocessed)
    relevant_docs = get_relevantdocs(expanded_query, inverted_index)
    try:
        tfidf_scores = get_scores(relevant_docs, eq_vector, norms, nod,
                                  list_of_document_tfidf_dicts)
        if not tfidf_scores:
            ANSWER['error'] = 'Match Not Found.'
            return ANSWER
    except ZeroDivisionError:
        ANSWER['error'] = 'Please be more specific.'
        return ANSWER
    _sae = torch.load('my_sae.pt')
    fp = open('document-index.txt', 'rb')
    doc_index = pickle.load(fp)
    fp.close()
    nb_documents = len(doc_index)
    user_document_array = np.zeros(nb_documents)
    doc_ids = list(doc_index.values())
    rows = json.load(
        urllib.urlopen("http://127.0.0.1:8000/clicks/" + str(user_id) +
                       '/'))['clicks']
    for row in rows:
        user_document_array[doc_ids.index(str(row[0]))] = row[1]
    user_document_array = torch.FloatTensor(user_document_array)
    reco_sys_scores = _sae.forward(Variable(user_document_array).unsqueeze(0))
    db.close()
    scores = dict()
    factor = 0.01
    #print reco_sys_scores
    #print reco_sys_scores.data[0]
    # print (reco_sys_scores.numpy())
    for doc_number in tfidf_scores.keys():
        scores[doc_number] = (
            0.9 * tfidf_scores[doc_number] +
            0.1 * factor * reco_sys_scores.data[0].data[doc_number]).item()
    print(scores)
    print(tfidf_scores)
    heap_docs = [(-value, key) for key, value in scores.items()]
    largest_docs = heapq.nsmallest(100, heap_docs)
    largest_docs = [(key, -value) for value, key in largest_docs]
    print(largest_docs)
    ############################### End of document ranking ######################################
    preprocessed_tuple = []
    for x in largest_docs:
        fp = open('spreprocessed' + str(x[0]) + '.txt', 'rb')
        list_temp = pickle.load(fp)
        preprocessed_tuple.extend([(x[0], y) for y in list_temp])
        fp.close()
    vocabulary2 = list(expanded_query)
    inverted_index2 = get_inverted_index_query_terms(preprocessed_tuple,
                                                     vocabulary2)
    relevant_sent = get_relevantdocs(expanded_query, inverted_index2)
    scores_bm25 = get_bm25(relevant_sent, inverted_index2, expanded_query,
                           preprocessed_tuple)
    heap_sentences = [(-value, key) for key, value in scores_bm25.items()]
    largest_sentences = heapq.nsmallest(100, heap_sentences)
    largest_sentences = [(key, -value) for value, key in largest_sentences]
    sentenced_docs = set()
    for sentence in [x[0] for x in largest_sentences]:
        sentenced_docs.add(preprocessed_tuple[sentence - 1][0])
        sentenced_docs_copy = [x for x in sentenced_docs]
    index_ans = get_index_ans(preprocessed_tuple,
                              sentence_number=largest_sentences[0][0])
    fp = open('cmptext.txt', 'rb')
    doc = 1
    for line in fp:
        if doc == preprocessed_tuple[largest_sentences[0][0] - 1][0]:
            #ANSWER +=  'MAIN ANSWER : ' + str(line.split('. ')[index_ans]) + '\n'
            ANSWER['main_ans'] = str(line.split('. ')[index_ans])
        doc += 1
    fp.close()
    link = open("links", "r")
    links = pickle.load(link)
    doc_num_ans = 0
    ans_count = 0
    for sentence in [x[0] for x in largest_sentences]:
        doc_number_of_sentence = preprocessed_tuple[sentence - 1][0]
        if doc_number_of_sentence in sentenced_docs:
            ans_count += 1
            #ANSWER += 'Document ' + str(doc_number_of_sentence) + '\n'
            doc_num_ans += 1
            index_ans = get_index_ans(preprocessed_tuple,
                                      sentence_number=sentence)
            fp = open('cmptext.txt', 'rb')
            doc = 1
            for line in fp:
                if doc == doc_number_of_sentence:
                    #ANSWER += 'SENTENCE : ' + str(line.split('. ')[index_ans]) + '\n'
                    temp = list()
                    temp.append(document_dictionary[doc_number_of_sentence])
                    temp.append(str(line.split('. ')[index_ans]))
                    temp.append(str(links[doc_num_ans]))
                    ANSWER['Ans_' + str(doc_num_ans)] = temp
                    #ANSWER['Ans_' + str(doc_num_ans)] = list(document_dictionary[doc_number_of_sentence], str(line.split('. ')[index_ans])).append(str(links[index_ans]))
                    #print 'Ans_' + str(doc_num_ans) + "\t" + str(list(document_dictionary[doc_number_of_sentence], str(line.split('. ')[index_ans])).append(str(links[index_ans])))
                    print(temp)
                doc += 1
            fp.close()
            sentenced_docs.remove(doc_number_of_sentence)
        else:
            continue
    ANSWER['count'] = ans_count
    # print ANSWER
    return ANSWER
Example #14
calls = calls_raw.loc[:, ['ticker_name','text']].groupby(['ticker_name'])['text'].apply(lambda x: ''.join(x)).reset_index()

# tokenize and remove punctuation
calls['text'] = calls['text'].apply(lambda x: simple_preprocess(x, min_len=2, max_len=15, deacc=True))

# Build the bigram model
bigram = models.Phrases(calls['text'], min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_model = models.phrases.Phraser(bigram)
calls['text'] = calls['text'].apply(lambda x: bigram_model[x])

# remove stopwords token-by-token (remove_stopwords returns '' for a stopword, so the filter drops it)
calls['text'] = calls['text'].apply(lambda x: [w for w in x if remove_stopwords(w)])

# stemming (porter), applied per token list so the dictionary step below still sees token lists
p = PorterStemmer()
calls['text'] = calls['text'].apply(lambda x: p.stem_documents(x))

# create dictionary object
dictionary = corpora.Dictionary(calls['text'])

# filter extremes
dictionary.filter_extremes(no_below=2, no_above=0.5)

# bag-of-words transformation
corpus = [dictionary.doc2bow(text) for text in calls['text']]

# tfidf transformation
tfidf = models.TfidfModel(corpus)  # fit model
corpus_tfidf = tfidf[corpus]  # apply model
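
corpus_tfidf yields one sparse vector of (token_id, weight) pairs per call transcript. A hedged sketch for eyeballing the heaviest-weighted terms of the first document (illustrative only):

first_vec = next(iter(corpus_tfidf))
top_terms = sorted(first_vec, key=lambda pair: pair[1], reverse=True)[:10]
print([(dictionary[token_id], round(weight, 3)) for token_id, weight in top_terms])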

# -------------------------------------------------------------------------------
Example #15
    def compare(self):
        with open (self.source_document_path , encoding = "ISO-8859-1") as f:
            tokens = sent_tokenize(f.read())
            for line in tokens:
                self.documents[0] += line

        with open (self.target_document_path , encoding = "ISO-8859-1") as f:
            tokens = sent_tokenize(f.read())
            for line in tokens:
                self.documents[1] += line

        # bag of words
        texts = [[text for text in simple_preprocess(doc, deacc=True)] for doc in self.documents]

        #stemming
        p = PorterStemmer()
        for k in range(len(texts)):
            texts[k] = p.stem_documents(texts[k])

        #Reconvert documents to collection of words/bigrams
        bigram_phraser = Phrases(texts, min_count=1)
        texts_bigrams = [[text for text in bigram_phraser[ simple_preprocess(doc, deacc=True)]] for doc in self.documents]


        # build N-gram
        texts_bigrams = [[]] * 2
        for k in range(len(texts)):
            texts_bigrams[k] = [""] * (len(texts[k])-1)
            for kk in range(len(texts[k])):
                if(kk<len(texts[k])-1):
                    texts_bigrams[k][kk]=texts[k][kk]+"_"+texts[k][kk+1]

        # remove the most frequent words and the stop words
        for k in range(len(texts)):
            word_counter = {}
            for word in texts_bigrams[k]:
                if word in word_counter:
                    word_counter[word] += 1
                else:
                    word_counter[word] = 1
            popular_words = sorted(word_counter, key = word_counter.get, reverse = True)
            top = popular_words[:3]
            for kk in range(len(top))[:]:
                texts_bigrams[k][:] = (value for value in texts_bigrams[k] if value != top[kk])



        #Create dictionary
        dictionary = corpora.Dictionary(texts_bigrams)

        #Create corpus
        corpus = [dictionary.doc2bow(docString) for docString in texts_bigrams]


        model = gensim.models.TfidfModel(corpus)  # fit model
        vector = model[corpus[0]] 

        #cosine similarity
        index = Similarity(corpus=corpus,num_features=len(dictionary),output_prefix='on_disk_output')

        for similarities in index:
            similar_docs = list(enumerate(similarities))
            break

        return similar_docs[1][1]
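
The Similarity index at the end streams one row of cosine similarities per indexed document. The same idea in a tiny self-contained form (toy documents, not the class above):

from gensim import corpora
from gensim.similarities import Similarity
from gensim.utils import simple_preprocess

docs = ["the cat sat on the mat", "a cat sat on a mat", "dogs chase cats"]
texts = [simple_preprocess(d) for d in docs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
index = Similarity(output_prefix='toy_index', corpus=corpus, num_features=len(dictionary))
for sims in index:  # similarities of each document against the whole corpus
    print(list(enumerate(sims)))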
Example #16
    for word in temp_doc.split():
        word=unicode(word)
        doc.append(word)
    tagged = nltk.pos_tag(doc)
        
    temp_doc=u""
    for word in tagged:
        tmp=unicode(word[0])
        temp_doc= temp_doc + lemmatizer.lemmatize(tmp,get_wordnet_pos(word[1]))+" "
    '''

    train_data["Content"].replace(to_replace=train_data["Content"][i],
                                  value=temp_doc,
                                  inplace=True)

train_data["Content"] = pstem.stem_documents(train_data["Content"])

for i in range(0, len(test_data["Content"])):
    temp_doc = u""
    for word in test_data["Content"][i].split():
        word = unicode(word)
        word = unicode(word.lower())
        word = unicode(strip_punctuation(word))
        #word =check_money(word)
        if unicode(word) not in stopwords and unicode(
                word) != u"–" and unicode(word) != u"…":
            temp_doc = temp_doc + word + " "
    '''doc=[]
    for word in temp_doc.split():
        word=unicode(word)
        doc.append(word)