class Chatbot:
    def __init__(self, questions):
        '''
        questions = []
        with codecs.open(file, 'r', 'utf-8') as f:
            for line in f:
                questions.append(line)

        status = []
        for i in range(len(questions)):
            status.append(i)
        self.__data = {
            'text': questions,
            'status': status
        }
        '''

        questions_text = []
        for question in questions:
            questions_text.append(question.Title)

        status = []
        for i in range(len(questions_text)):
            status.append(i)

        self.__data = {'text': questions_text, 'status': status}

    def frame(self):
        frame = pandas.DataFrame(self.__data)
        self.frame_x = frame['text']
        self.frame_y = frame['status']

    def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        self.x_trainvect.toarray()
        self.vect1 = TfidfVectorizer(min_df=1)
        self.x_trainvect = self.vect1.fit_transform(self.x_train)
        a = self.x_trainvect.toarray()
        self.vect1.inverse_transform(a[0])

    def bayes(self):
        self.mnb = MultinomialNB()
        self.y_train = self.y_train.astype('int')
        self.mnb.fit(self.x_trainvect, self.y_train)

    def ask(self, sentence):
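        # Note: frame(), learning() and bayes() run on every call, so the model is rebuilt for each question.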
        logger.debug('ask to bot for: {question}'.format(question=sentence))
        start = datetime.datetime.now()
        self.frame()
        self.learning()
        self.bayes()
        x_testvect = self.vect1.transform([sentence])
        pred = self.mnb.predict(x_testvect)
        end = datetime.datetime.now()
        logger.debug('time elapsed {time}'.format(time=end - start))
        return self.frame_x[pred[0]]
Example #2
def preprocess(train, test, max_feature=3000, stop_word=True):
    if stop_word:
        vectorizer = TfidfVectorizer(stop_words='english',
                                     max_features=max_feature)
    else:
        vectorizer = TfidfVectorizer(max_features=max_feature)
    # this may take a while
    tmp = vectorizer.fit_transform(train)
    tmp2 = vectorizer.transform(test)
    # inverse back to normal words
    tmp = vectorizer.inverse_transform(tmp)
    tmp2 = vectorizer.inverse_transform(tmp2)
    return tmp, tmp2
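
A minimal usage sketch for the helper above, assuming TfidfVectorizer is imported as in the snippet; the toy document lists are made up for illustration. Each returned element is a numpy array of the vocabulary terms that survived vectorization for that document:

train_docs = ["the cat sat on the mat", "dogs and cats play in the park"]
test_docs = ["a small dog chased the cat"]
train_terms, test_terms = preprocess(train_docs, test_docs, max_feature=50)
print(train_terms[0])  # e.g. the non-stop-word terms kept for the first training document
print(test_terms[0])   # only terms already in the fitted vocabulary can appear here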
Example #3
    def get(self, request, *args, **kwargs):
        lines = []
        contents = []
        dup_lines = []
        file = File.objects.get(id=self.kwargs['file_id'])
        df = pd.read_csv(file.file.url, sep='delimiter', header=None)
        start_time = time.time()  # Performance test

        # removing stopwords
        df[0] = df[0].apply(lambda x: ' '.join(
            [word for word in x.split() if word not in (stop)]))
        # removing special characters
        df[0] = df[0].str.replace(r"[^a-zA-Z ]+", " ").str.strip()
        pks = df[0].values
        pattern = '(?u)[^ ]+'
        tv = TfidfVectorizer(encoding='utf-8', token_pattern=pattern)
        vect = tv.fit_transform(df[0])
        vect_array = vect.toarray()
        distances = pd.DataFrame(
            distance.cdist(vect_array, vect_array, 'hamming'))
        for i in range(0, len(distances.columns)):
            # Finding duplicates
            dup_index = distances.index[distances[i] < file.threshold].tolist()
            if len(dup_index) > 1:
                for d in dup_index:
                    if d != i:
                        print(distances[i][d])
                        print('Line', i)
                        lines.append(i)
                        print(df[0][i])
                        print(tv.inverse_transform(vect_array[i]))
                        print('Line', d)
                        dup_lines.append(d)
                        print(tv.inverse_transform(vect_array[d]))
                        print(df[0][d])
                        print('-----------------------------')
                    if d == i:
                        print('here')

        contents = (df[0][lines].tolist())

        print("TFIDF vector generated.\nTime: %s seconds\n" %
              (time.time() - start_time))  # Performance test

        return render(request, self.template_name, {
            'lines': lines,
            'contents': contents,
            'dup_lines': dup_lines
        })
Example #4
def main(mbox_path: ('MBox Path', 'option', 'm')):
    messages = []
    texts = []
    for idx, message in enumerate(mailbox.mbox(mbox_path)):
        content = message.get_payload()[0].get_payload()
        stripped_content = striphtml(content).replace('\r', '').replace(
            '\n', '').replace('=2C', '').replace('=', '')

        matches = re.findall(r'"([^"]*)"', stripped_content)
        if len(matches) == 0:
            print("{}: Failed to extract message.".format(idx))
            continue

        messages.append(message)
        texts.append({'text': matches[0]})
    df = pd.DataFrame(texts)
    vectorizer = TfidfVectorizer()
    vectorized = vectorizer.fit_transform(df['text'].values).toarray()
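    # keep only each message's five highest-weighted terms: argsort each row, flip to descending order, take the first 5 column indices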
    indexes_to_keep = np.flip(vectorized.argsort(axis=-1), -1)[:, :5]
    arr = np.zeros(vectorized.shape)
    for idx, set_one_idxs in enumerate(indexes_to_keep):
        arr[idx][set_one_idxs] = 1.

    terms_per_document = vectorizer.inverse_transform(arr)
    all_terms = []

    for terms in terms_per_document:
        all_terms += terms.tolist()

    count = Counter(all_terms)
    print(count.most_common(20))
Example #5
    def __vectorize(self):
        """
        Vectorize the training data, i.e. perform 1- and 2-gram feature extraction and selection using a TF-IDF method.
        :return: a numeric, weighted feature vector for each item
        """
        logging.debug("Vectorizing text contents...")
        tfidf = TfidfVectorizer(analyzer='word',
                                ngram_range=(1, 2),
                                min_df=2,
                                max_df=0.5,
                                stop_words=stopwords.words('portuguese'))
        self.__tfidf_matrix = tfidf.fit_transform(
            self.__dataframe['video_contents'])
        vectors = self.__tfidf_matrix.toarray()

        i = 0
        for video_id, row in self.__dataframe.iterrows():
            tokens = ", ".join(tfidf.inverse_transform(vectors[i])[0])
            video_id = row['video_id']
            i += 1

            # videotokens = VideoTokens.objects.filter(video_id=video_id)
            # if videotokens.count() == 0:
            # 	videotokens = VideoTokens(video_id=video_id, tokens=tokens)
            # 	videotokens.save()
            # else:
            # 	videotokens[0].tokens = tokens
            # 	videotokens[0].save()

        logging.debug("Number of features found: %s" % len(tfidf.vocabulary_))
Example #6
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st + word + " "
            else:
                if st != "":
                    st = st[0:-1] + " "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                    'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                    'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                    'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                    'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                    'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                    'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    #now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english+list(sciencestopwords) 
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
        #X is a sparse matrix of docs x vocab size (7638). 
    #so X[doc_num] is the sparse vector of its words. 
    #the ||X[doc_num]|| = 1 there are 7638 unique words and 755 docs. with a total number of 38888 non-zeros.
    #Xinv[doc_num] is the list of words in the doc.
     
    return nounlist, vectorizer, X, Xinv
Example #7
    def keyword_extract(self):
        '''Extract keywords from tagline using tf-idf'''

        tagline_doc = []
        slug_list = []
        sql = "SELECT project_slug, tagline FROM project"
        self._cursor.execute(sql)
        results = self._cursor.fetchall()
        for row in results:
            tagline_doc.append(row[1])
            slug_list.append(row[0])

        vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
        response = vectorizer.fit_transform(tagline_doc)
        keyword_array = vectorizer.inverse_transform(response)

        for i in range(0, len(keyword_array), 1):
            try:
                sql = "UPDATE project SET keywords = '" + json.dumps(
                    keyword_array[i].tolist(
                    )) + "' WHERE project_slug = '" + slug_list[i] + "'"
                self._cursor.execute(sql)
                self._db.commit()
            except MySQLdb.Error as e:
                try:
                    logging.error("MySQL Error [%d]: %s; Error SQL: %s",
                                  e.args[0], e.args[1], sql)
                except IndexError:
                    logging.error("MySQL Error %s", str(e))
Example #8
File: dic.py  Project: Arthur-Null/rumor
def weibo():
    f_stop = open('dataset/stop_word', 'r', encoding='utf-8')
    stopwords=['ru0b1os', 'ruyjamg', 'zy1qwp0', 'oxukk', '\t']
    for line in f_stop.readlines():
        stopwords.append(line[:-1])
    train = []
    for (root, dir, files) in os.walk("dataset/weibo_train/"):
        for f in files:
            train += pkl.load(open('dataset/weibo_train/' + f, 'rb'))
    vec = TfidfVectorizer(max_features=5000, min_df=5, tokenizer=jieba.cut, stop_words=stopwords)
    vec.fit(train)
    attention = pkl.load(open('attention.pkl','rb'))
    l=vec.inverse_transform(attention)[0]
    fout = open('attention','w', encoding='utf-8')
    for w in l:
        fout.write(str(w))
    # print(np.sum(vec.transform(train).toarray()[:5]))
    f_train = open("dataset/weibo_train.pkl", 'wb')
    f_test = open("dataset/weibo_test.pkl", 'wb')
    print(vec.vocabulary_)

    for (root, dir, files) in os.walk("dataset/weibo_train/"):
        for f in files:
            l = pkl.load(open('dataset/weibo_train/' + f, 'rb'))
            pkl.dump(vec.transform(l).toarray(), f_train)
            # print(f.split('_')[1][0])
            pkl.dump(int(f.split('_')[1][0]), f_train)

    for (root, dir, files) in os.walk("dataset/weibo_test/"):
        for f in files:
            l = pkl.load(open('dataset/weibo_test/' + f, 'rb'))
            pkl.dump(vec.transform(l).toarray(), f_test)
            # print(f.split('_')[1][0])
            pkl.dump(int(f.split('_')[1][0]), f_test)
Example #9
class WeightedWordVectors(TransformerMixin):
    """    
    Libraries & Versions:
    Python==3.6.5
    Pandas=='0.23.1' as pd
    nltk=='3.3'
    numpy=='1.14.5'
    
    Keyword arguments:
    X -- Pandas Series of text as strings
    """
    def __init__(self, model=None, meta=None, disk=None):
        self.meta = meta
        
        if model is None:
            self.word2vec = Word2Vec(sentences=self.meta.str.split().tolist())
        else:
            self.word2vec =  model
            self.word2vec.train(sentences=self.meta.str.split().tolist())
        
        #Fit TFIDF
        self.tfidf = TfidfVectorizer()
        self.idfs = self.tfidf.fit_transform(self.meta)
        self.inverse_idfs = self.tfidf.inverse_transform(self.idfs)
    
        
    
    def fit(self, X, y=None, meta=None, P=None):
        #Split into sentences
        for i in X.index:
            self.word2vec.build_vocab(X.loc[i].str.split().tolist(), keep_raw_vocab=True, update=True)
            self.word2vec.model_trimmed_post_training = False
            self.word2vec.min_alpha_yet_reached = False
            self.word2vec.batch_words = X.loc[i].apply(lambda x: len(x.split())).max()
            self.word2vec.train(X.loc[i].str.split().tolist(), 
                           total_examples=X.loc[i].shape[0], start_alpha=0.05, 
                           end_alpha=0.01, epochs=1, compute_loss=True)
            
            disk.append( (euclidean_distances(self.transform(self, meta)) @ P).sum(1) )
            
        return self

    def transform(self, meta):        
        #Weighted Vectors
        weighted_docs = []        
        for idf, inv in zip(self.idfs, self.inverse_idfs):
            try:
                weight = idf[idf!=0]
                vector = self.word2vec[inv]
                weighted_doc = weight.dot(vector)
            except:
                weighted_doc = np.empty((1, 300), np.float64)    
            weighted_docs.append(weighted_doc.tolist()[0])
                
        return pd.DataFrame(weighted_docs).fillna(0)
        
Example #10
class MachineLearning():

    def __init__(self):
        questions = []
        with open("deneme.txt", "r") as f:
            for line in f:
                questions.append(line)

        status = []
        for i in range(len(questions)):
            status.append(i)
        self.__data = {'text': questions, 'status': status}


    def frame(self):
        frame = pandas.DataFrame(self.__data)
        self.frame_x=frame["text"]
        self.frame_y=frame["status"]



    def learning(self):
        self.vect = TfidfVectorizer(min_df=1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.frame_x, self.frame_y, test_size=0.2, random_state=4)
        self.x_trainvect = self.vect.fit_transform(self.x_train)
        self.x_trainvect.toarray()
        self.vect1 = TfidfVectorizer(min_df=1)
        self.x_trainvect = self.vect1.fit_transform(self.x_train)
        a = self.x_trainvect.toarray()
        self.vect1.inverse_transform(a[0])


    def bayes(self):
        self.mnb = MultinomialNB()
        self.y_train=self.y_train.astype('int')
        self.mnb.fit(self.x_trainvect,self.y_train)


    def find(self, sentence):
        self.frame()
        self.learning()
        self.bayes()
        x_testvect = self.vect1.transform([sentence])
        pred = self.mnb.predict(x_testvect)
        return self.frame_x[pred[0]]
Example #11
class TopicModeling(object):
    def __init__(self, n_topics, method='LSA'):
        assert method in ['LSA', 'LDA']
        if method == 'LDA': raise NotImplementedError
        self.method = method
        self.n_topics = n_topics
        self.tfidf = TfidfVectorizer()
        if method == 'LSA':
            self.model = TruncatedSVD(n_components=n_topics)
        else:
            self.model = LDA(n_components=n_topics)

    def __call__(self, corpus):
        self.term_matrix = self.tfidf.fit_transform(corpus)
        self.topic_matrix = self.model.fit_transform(self.term_matrix)
        self.topic_keys = self.topic_matrix.argmax(axis=1).tolist()

    def get_count_pairs(self):
        return np.unique(self.topic_keys, return_counts=True)

    def get_top_n_words(self, n):
        '''
        returns a list of n_topic strings, where each string contains the n most common
        words in a predicted category (topic), in order
        '''
        top_word_indices = []
        for topic in range(self.n_topics):
            temp_vector_sum = 0
            for j in range(len(self.topic_keys)):
                if self.topic_keys[j] == topic:
                    temp_vector_sum += self.term_matrix[j]
            temp_vector_sum = temp_vector_sum.toarray()
            top_n_word_indices = np.flip(
                np.argsort(temp_vector_sum)[0][-n:], 0)
            top_word_indices.append(top_n_word_indices)
        top_words = []
        for topic in top_word_indices:
            topic_words = []
            for index in topic:
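                # build a one-hot row for this term index; inverse_transform maps it back to the word itself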
                temp_word_vector = np.zeros((1, self.term_matrix.shape[1]))
                temp_word_vector[:, index] = 1
                the_word = self.tfidf.inverse_transform(temp_word_vector)[0][0]
                topic_words.append(the_word.encode('ascii').decode('utf-8'))
            top_words.append(" ".join(topic_words))
        return top_words

    def plot_tsne(self, n_components=2):
        topic_embedding = TSNE(n_components=n_components).fit_transform(
            self.topic_matrix)
        _, ax = plt.subplots(figsize=(16, 10))
        scatter = ax.scatter(topic_embedding[:, 0],
                             topic_embedding[:, 1],
                             c=self.topic_keys,
                             cmap='tab20')
        legend = ax.legend(*scatter.legend_elements(), title='Topics')
        ax.add_artist(legend)
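
A brief usage sketch of the class above, assuming TfidfVectorizer, TruncatedSVD and numpy are imported as in the snippet; the four-document corpus is made up for illustration:

docs = ["cats purr and sleep all day", "dogs bark and fetch the ball",
        "stocks rise on strong earnings", "markets fall on inflation fears"]
tm = TopicModeling(n_topics=2, method='LSA')
tm(docs)                               # fit TF-IDF, reduce with TruncatedSVD, assign one topic key per document
topics, counts = tm.get_count_pairs()  # distribution of documents over the predicted topics
print(dict(zip(topics.tolist(), counts.tolist())))
# tm.get_top_n_words(3) would then report the three highest-weight words for each predicted topic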
Example #12
def tfidf1(c1,c2):

    tfidf = TfidfVectorizer()

    data = tfidf.fit_transform([c1,c2])

    print(tfidf.get_feature_names())

    print(data.toarray())

    print(tfidf.inverse_transform(data))
Example #13
    def test_add_hashtag_bow_to_graph(self):
        g = IU.add_hastag_bow_to_graph(self.g_undecom)
        tfidf = TfidfVectorizer(preprocessor=None,
                                tokenizer=lambda s: s.split(),
                                stop_words=None)
        tfidf.fit([' '.join(g.node[n]['hashtags']) for n in g.nodes_iter()])

        for n in g.nodes_iter():
            assert_true(issparse(g.node[n]['hashtag_bow']))
            assert_equal(
                sorted(g.node[n]['hashtags']),
                sorted(
                    tfidf.inverse_transform(
                        g.node[n]['hashtag_bow'])[0].tolist()))
Example #14
def processing_question(ques, paragraphs, domain_lemma_cache, domain_pickle):
    """Return answer"""
    #Lemmatizing whole csv text column
    lemma_cache = domain_lemma_cache
    if not os.path.isfile(lemma_cache):
        lemmas = [lemmatize(par) for par in tqdm(paragraphs)]
        df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
        df.to_feather(lemma_cache)
    df = pd.read_feather(lemma_cache)
    paragraphs = df.context
    lemmas = df.lemmas
    #Vectorizer cache
    if not os.path.isfile(VEC_PICKLE_LOC):
        vectorizer = TfidfVectorizer(stop_words='english',
                                     min_df=5,
                                     max_df=.5,
                                     ngram_range=(1, 3))
        vectorizer.fit_transform(lemmas)
        pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb"))
    #Vectorized lemmas cache
    if not os.path.isfile(domain_pickle):
        tfidf = vectorizer.fit_transform(lemmas)
        pickle.dump(tfidf, open(domain_pickle, "wb"))
    #loading the pickle file
    vectorizer = pickle.load(open(VEC_PICKLE_LOC, "rb"))
    tfidf = pickle.load(open(domain_pickle, "rb"))
    question = ques
    #Transform the lemmatized questions and paragraph to vector representation
    query = vectorizer.transform([lemmatize(question)])
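    # inspection only: number of query terms found in the vocabulary, and the matched words (result is not used)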
    (query > 0).sum(), vectorizer.inverse_transform(query)
    scores = (tfidf * query.T).toarray()
    # find the cosine similarity between the question and each paragraph, take the top 10 paragraphs,
    # and feed those paragraphs into the question-answering pipeline
    results = (np.flip(np.argsort(scores, axis=0)))
    qapipe = pipeline('question-answering',
                      model='distilbert-base-uncased-distilled-squad',
                      tokenizer='bert-base-uncased',
                      device=0)
    # run the top candidates through the pipeline and collect the predicted answers in a dataframe
    candidate_idxs = [(i, scores[i]) for i in results[0:10, 0]]
    contexts = [(paragraphs[i], s) for (i, s) in candidate_idxs if s > 0.01]
    question_df = pd.DataFrame.from_records([{
        'question': question,
        'context': ctx
    } for (ctx, s) in contexts])
    preds = qapipe(question_df.to_dict(orient="records"))
    answer_df = pd.DataFrame.from_records(preds)
    answer_df["context"] = question_df["context"]
    answer_df = answer_df.sort_values(by="score", ascending=False)
    #return a dataframe that contains the answers
    return answer_df
Example #15
def calculateNgramAccuracy(range_min, range_max):
    vectorClassifier = TfidfVectorizer(min_df=1,
                                       stop_words='english',
                                       ngram_range=(range_min, range_max))
    messages_trainData, messages_testData, label_trainData, label_testData = train_test_split(
        messagesDataSet, labelDataSet, test_size=0.2, random_state=4)
    x_traincv = vectorClassifier.fit_transform(messages_trainData)
    trainData = x_traincv.toarray()
    print(trainData)
    featureNames = vectorClassifier.get_feature_names()
    print(featureNames)

    values = trainData[0]
    print(values)
    length = len(trainData[0])
    print(length)

    data = vectorClassifier.inverse_transform(trainData[0])
    print(data)

    actualData = messages_trainData.iloc[0]
    print(actualData)

    multiNB = MultinomialNB()
    label_trainData = label_trainData.astype('int')
    multiNBData = multiNB.fit(x_traincv, label_trainData)
    print(multiNBData)
    x_testcv = vectorClassifier.transform(messages_testData)
    predict = multiNB.predict(x_testcv)
    print(predict)

    actualTestDataLabels = np.array(label_testData)
    print(actualTestDataLabels)

    testEqualResult = 0
    for i in range(len(label_testData)):
        if (actualTestDataLabels[i] == predict[i]):
            testEqualResult += 1

    TtestEqualResult = testEqualResult
    PredictionDataLength = len(predict)

    #n-gram
    print("range", range_min, "-", range_max, "Ngram Equal Data count = ",
          TtestEqualResult)
    print("range", range_min, "-", range_max, "Ngram tested data count = ",
          PredictionDataLength)
    print("range", range_min, "-", range_max, "Ngram accuracy = ",
          TtestEqualResult * 100.0 / PredictionDataLength, "% ~> ",
          TtestEqualResult * 100.0 // PredictionDataLength, "%")
Example #16
def processing_question(ques, paragraphs, domain_lemma_cache, domain_pickle):
    """Return answer"""
    #Lemmatizing whole csv text column
    lemma_cache = domain_lemma_cache
    if not os.path.isfile(lemma_cache):
        lemmas = [lemmatize(par) for par in tqdm(paragraphs)]
        df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
        df.to_feather(lemma_cache)
    df = pd.read_feather(lemma_cache)
    paragraphs = df.context
    lemmas = df.lemmas
    #Vectorizer cache
    if not os.path.isfile(VEC_PICKLE_LOC):
        vectorizer = TfidfVectorizer(stop_words='english',
                                     min_df=5,
                                     max_df=.5,
                                     ngram_range=(1, 3))
        vectorizer.fit_transform(lemmas)
        pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb"))
    #Vectorized lemmas cache
    if not os.path.isfile(domain_pickle):
        tfidf = vectorizer.fit_transform(lemmas)
        pickle.dump(tfidf, open(domain_pickle, "wb"))
    vectorizer = pickle.load(open(VEC_PICKLE_LOC, "rb"))
    tfidf = pickle.load(open(domain_pickle, "rb"))
    question = ques
    query = vectorizer.transform([lemmatize(question)])
    (query > 0).sum(), vectorizer.inverse_transform(query)
    scores = (tfidf * query.T).toarray()
    results = (np.flip(np.argsort(scores, axis=0)))
    qapipe = pipeline('question-answering',
                      model='distilbert-base-uncased-distilled-squad',
                      tokenizer='bert-base-uncased',
                      device=0)
    candidate_idxs = [(i, scores[i]) for i in results[0:10, 0]]
    contexts = [(paragraphs[i], s) for (i, s) in candidate_idxs if s > 0.01]
    question_df = pd.DataFrame.from_records([{
        'question': question,
        'context': ctx
    } for (ctx, s) in contexts])
    preds = qapipe(question_df.to_dict(orient="records"))
    torch.cuda.empty_cache()
    gc.collect()
    answer_df = pd.DataFrame.from_records(preds)
    #torch.cuda.empty_cache()
    #gc.collect()
    answer_df['context'] = question_df['context']
    answer_df = answer_df.sort_values(by="score", ascending=False)
    return answer_df
Example #17
    def fit(self, preload=False):
        vectorizer = TfidfVectorizer(tokenizer=self.__custom,
                                     strip_accents='ascii',
                                     stop_words='english',
                                     token_pattern=None)
        vectorizer.fit(self.corpus)

        self.X = vectorizer.transform(self.corpus)

        self.vocab = vectorizer.get_feature_names()
        self.V = len(self.vocab)
        self.word_to_ix = vectorizer.vocabulary_

        self.idf = vectorizer.idf_
        self.doc_terms = vectorizer.inverse_transform(self.X)
Example #18
    def get_semantic_features(self, data):
        # Determine which tokens are used for each data point
        vectorizer_to_find_tokens = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True,
                                                    min_df=2, max_df=0.2, ngram_range=(1, 1),
                                                    stop_words=self.stop_word_list_longer)
        transformed_to_find_tokens = vectorizer_to_find_tokens.fit_transform(data)
        semantic_features = []
        for data_point in data:
            transformed_point = vectorizer_to_find_tokens.transform([data_point])
            inversed = vectorizer_to_find_tokens.inverse_transform(transformed_point)[0]
            # Sum the embedding of every vocabulary token present in this data point
            summed_tokens = numpy.copy(self.default_vector)
            for token in inversed:
                summed_tokens = summed_tokens + self.get_vector(token)
            semantic_features.append(summed_tokens)
        return semantic_features
Example #19
    def test_add_hashtag_bow_to_graph(self):
        g = IU.add_hastag_bow_to_graph(self.g_undecom)
        tfidf = TfidfVectorizer(preprocessor=None,
                                tokenizer=lambda s: s.split(),
                                stop_words=None)
        tfidf.fit([' '.join(g.node[n]['hashtags'])
                   for n in g.nodes_iter()])

        for n in g.nodes_iter():
            assert_true(issparse(g.node[n]['hashtag_bow']))
            assert_equal(
                sorted(g.node[n]['hashtags']),
                sorted(
                    tfidf.inverse_transform(
                        g.node[n]['hashtag_bow']
                    )[0].tolist()
                )
            )
Example #20
    def test11(self):
        corpus = [
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.', 'Is this the first document?'
        ]

        vectorizer = TfidfVectorizer()
        # X = vectorizer.fit_transform(corpus)
        model = vectorizer.fit(corpus)
        X = model.transform(corpus)
        print(vectorizer.get_feature_names())
        print(vectorizer.get_stop_words())
        print(vectorizer.inverse_transform(X))
        print(X.shape)
        print(X)
        print(model.vocabulary_)
        print(model.idf_)
Example #21
def tfidf_vectorizer():
    """
    Vectorize Chinese sentence text
    using the tf-idf method
    :return:
    """
    # Segment the Chinese sentences to obtain the tokenised words
    c1, c2 = cut_words()

    tf = TfidfVectorizer()

    # Call fit_transform to fit and transform the input data (passed as a list)
    response = tf.fit_transform([c1, c2])

    print(response)  # sparse matrix form
    print('*' * 15)
    print(tf.get_feature_names())  # the unique words appearing in the documents
    print('*' * 15)
    print(response.toarray())  # dense array form
    print('*' * 15)
    print(tf.inverse_transform(response))  # map the tf-idf matrix back to the words present in each sentence
Example #22
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st + word + " "
            else:
                if st != "":
                    st = st[0:-1] + " "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([
        u'model', 'according', 'data', u'models', 'function', 'properties',
        'approach', 'parameters', 'systems', 'number', 'order', u'data',
        'analysis', u'information', u'journal', 'results', 'using', 'research',
        'consumers', 'scientists', 'model', 'models', 'journal', 'researchers',
        'paper', 'new', 'study', 'time', 'case', 'simulation', u'simulation',
        'equation', 'based', 'years', 'better', 'theory', 'particular', 'many',
        'due', 'much', 'set', 'studies', 'systems', 'simple', 'example',
        'work', 'non', 'experiments', 'large', 'small', 'experiment',
        u'experiments', 'provide', 'analysis', 'problem', 'method', 'used',
        'methods'
    ])
    #now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english + list(sciencestopwords)
    vectorizer = TfidfVectorizer(min_df=1,
                                 max_df=.5,
                                 stop_words=newstop,
                                 decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
    #X is a sparse matrix of docs x vocab size (7638).
    #so X[doc_num] is the sparse vector of its words.
    #the ||X[doc_num]|| = 1 there are 7638 unique words and 755 docs. with a total number of 38888 non-zeros.
    #Xinv[doc_num] is the list of words in the doc.

    return nounlist, vectorizer, X, Xinv
Example #23
def vectorizerAllData(newsgroups_data):
    # Create vectorizer
    vectorizer = TfidfVectorizer()

    # check dataset
    print('Total Datasets : ')
    print(newsgroups_data.data.__len__())

    print('First Dataset : ')
    print(newsgroups_data.data[0])

    arr = vectorizer.fit_transform(newsgroups_data.data).toarray()
    print(arr[0])

    # What's the length?
    print('First Dataset (vectorized) length: ')
    print(len(arr[0]))

    # Check words?
    print('To the source:')
    print(vectorizer.inverse_transform(arr[0]))
    print()
Example #24
tfidf = TfidfVectorizer(ngram_range=(1,1))
tfidf.fit(example_doc)

top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()
for i in range(top):
    print('%s: %.2f' % (tfidf.get_feature_names()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf.transform(example_doc).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
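# inverse_transform on an all-ones vector yields every vocabulary term in feature-index order,
# so reindexing that array by argsort(tfidf_sum) pairs each term with its summed tf-idf weight below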
for tok, v in zip(tfidf.inverse_transform(np.ones(tfidf_sum.shape[0]))[0][tfidf_sum.argsort()[::-1]][:top], 
                  np.sort(tfidf_sum)[::-1][:top]):
    print('%s: %f' % (tok, v))

# [vocabularies with smallest idf scores]
# 蟋蟀: 2.87
# 可以: 4.36
# 就是: 4.41
# 聲音: 4.46
# 這樣: 4.46
# 你們: 4.56
# 真的: 4.62
# 還有: 4.68
# 豆油伯: 4.68
# 比較: 4.68
Example #25
df.head()
df_x=df["messege"]
df_y=df["class"]
cv = TfidfVectorizer(min_df=1,stop_words='english')

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.5, random_state=4)
x_train=x_train.astype('str')
x_test=x_test.astype('str')
x_train.head()

cv1 = TfidfVectorizer(min_df=1,stop_words='english')

x_traincv=cv1.fit_transform(x_train)

a=x_traincv.toarray()
cv1.inverse_transform(a[0])
x_train.iloc[0]
x_testcv=cv1.transform(x_test)
#mnb = MultinomialNB()
mnb = svm.SVC()


y_train
mnb.fit(x_traincv,y_train)
y_sc=mnb.decision_function(x_testcv)


y_predict=mnb.predict(x_testcv)
Example #26
X = df['CONTENT']
y = df['CLASS']

# Split dataset into training and testing data.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=2)

# Use a TF-IDF vectorizer to weight all the words used.
# This is what the classifier will be using to distinguish between ham and spam.
cv = TfidfVectorizer(min_df=1, stop_words='english')

# Fit the vectorizer, each word counts as a feature.
X_traincv = cv.fit_transform(X_train)
X_traincv = X_traincv.toarray()
cv.inverse_transform(X_traincv[0])

X_testcv = cv.transform(X_test)
X_testcv = X_testcv.toarray()
cv.inverse_transform(X_testcv[0])

# Train the classifier.
clf = MultinomialNB()
clf.fit(X_traincv, y_train)
pred = clf.predict(X_testcv)

# See how well it performed (note: this scores the classifier on the training set).
accuracy = clf.score(X_traincv, y_train)
print(accuracy)
Example #27
df.head()


df_x=df["Message"]
df_y=df["label"]
tfd=TfidfVectorizer(min_df=1, stop_words='english')
x_train, x_test, y_train, y_test = train_test_split(
   df_x, df_y, test_size=0.20, random_state=0)


x_traincv = tfd.fit_transform(x_train)
a = x_traincv.toarray()
tfd.inverse_transform(a[0])

x_train.iloc[0]

#from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
#clf = RandomForestClassifier(max_depth=2, random_state=0)

Example #28
def main(args):
    logger.debug("Arguments: %r", args)
    tfidf_vect = TfidfVectorizer(
        preprocessor=get_preprocessor(args.fields),
        analyzer='word',  # maybe callable
        token_pattern=r'\b[a-z]\w+\b',
        ngram_range=(args.min_ngrams, args.max_ngrams),
        max_df=args.max_df,
        max_features=args.max_features,
        sublinear_tf=args.sublinear_tf,
        stop_words=STOP_WORDS,
        norm=args.norm,
    )

    with LogRuntime("Loaded input data in {elapsed} seconds", logger):
        data = get_data(args)
    if data:
        logger.debug("Corpus size: {0}".format(len(data)))
    else:
        logger.error("Empty data")
        return

    with LogRuntime("Fitted in {0.elapsed} seconds", logger):
        X = tfidf_vect.fit_transform(data)

    logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_)))
    logger.debug("Max DF stop words size: {}".format(
        len(tfidf_vect.stop_words_)))
    logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words)))

    if args.clusters:
        true_k = args.clusters
    else:
        # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases
        m_docs, n_terms = X.shape
        t_nonzeros = len(X.nonzero()[0])
        true_k = (m_docs * n_terms) / t_nonzeros
        logger.debug("Calculated number of clusters: {}".format(true_k))

    if args.minibatch:
        km = MiniBatchKMeans(
            n_clusters=true_k,
            init='k-means++',
            n_init=10,
            init_size=1000,
            batch_size=1000,
            verbose=-1)
    else:
        km = KMeans(
            n_clusters=args.clusters,
            init='random',
            max_iter=100,
            n_init=10,
            verbose=1,
            n_jobs=-1)

    with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger):
        km.fit(X)

    if args.sample_random and args.sample_size:
        sample = [
            data[i]
            for i in np.random.random_integers(0, len(data), args.sample_size)
        ]
    elif args.sample_size:
        sample = data[args.sample_skip:args.sample_size]
    else:
        sample = data

    Y = tfidf_vect.transform(sample)
    sample_terms = tfidf_vect.inverse_transform(Y)

    labels = km.predict(Y)
    distances = km.transform(Y)
    center_terms = tfidf_vect.inverse_transform(km.cluster_centers_)

    clusters = defaultdict(list)
    vocabulary = tfidf_vect.vocabulary_

    for i, doc in enumerate(sample):
        clusters[labels[i]].append((i, doc))

    truncate = lambda t: t[:100] + '...' if len(t) > 100 else t

    for label, result in sorted(clusters.iteritems()):
        # skip single results
        if len(result) < args.cluster_minsize:
            continue
        terms_joined = ', '.join(
            sorted(
                center_terms[label],
                reverse=True,
                key=lambda t: km.cluster_centers_[label, vocabulary[t]]))
        print '=' * 79
        print '=' * 79
        print '=' * 79
        print '-> ' + truncate(terms_joined) + '\n\n'
        result = sorted(
            result,
            key=lambda (i, _): distances[i, label],
        )

        j = 0
        for i, doc in result:
            j += 1
            doc_terms = ', '.join(
                sorted(
                    sample_terms[i],
                    reverse=True,
                    key=lambda t: Y[i, vocabulary[t]],
                ))
            print doc['headline']
            print get_corpus_key(doc)
            print doc['url']
            print truncate(doc_terms)
            print
            if j > 10:
                print '...'
                break

        print

    if args.shell:
        from IPython import embed
        embed()
Example #29
            npindL = np.array(indL)
            freq_th = max(3, int(X.shape[0]*0.0025))
            cluster_score = {}
            # score_tweet = {}
            for clfreq in freqTwCl.most_common(50):
                cl = clfreq[0]
                freq = clfreq[1]
                cluster_score[cl] = 0
                # only keep clusters whose frequency is above the frequency threshold
                if freq >= freq_th:
                    clidx = (npindL == cl).nonzero()[0].tolist()
                    cluster_centroid = X[clidx].sum(axis=0)
                    # print("center ", cluster_centroid.shape)
                    try:
                        cluster_tweet = vectorizer.inverse_transform(cluster_centroid)
                        # print("ttt ", cluster_tweet)
                        for term in np.nditer(cluster_tweet):
                            try:
                                # cluster_score[cl] = max(cluster_score[cl], boosted_wtfVoc[str(term).strip()])
                                # print(term)
                                cluster_score[cl] += boosted_wtfVoc[str(term).strip()]
                            except: pass
                    except: pass
                    # print("cscs, ", cluster_score)
                    cluster_score[cl] /= freq
                else: break
            sorted_clusters = sorted( ((v,k) for k,v in cluster_score.items()), reverse=True)
            # print ("sorted cluster_score:")
            # print (sorted_clusters)
            # print(cluster_score)
Example #30
class CommentsAnalyzer(pmlutil.Configurable):
    
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float, use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn,None)
                if len(ddm.meta()['comments'])>0:
                    self.data.append(ddm)
                    count +=1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data=[]

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram,self._max_ngram),
                                     min_df=self._min_df, max_df=self._max_df, norm='l2', smooth_idf=True, use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()

        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray,labels)

        for i,l1_ratio in enumerate(self._l1_ratio):
            for j,alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:]))

        print self.vectorizer.inverse_transform(self.elasticNet.coef_)
Example #31
def summarize_cisco_support_forum_texts():
    # cisco_plain_text = LazyCorpusLoader(
    #    'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader(
        "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1"
    )
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)

    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore")

    sys.stdout.flush()

    # creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()

    # problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print "\n*** SUMMARY ***"
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print "\n*** ORIGINAL ***"
        print article_text

    # problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)

    # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    # eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    """
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
         0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
         0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
         0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
         0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
         0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
         0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
         0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
         0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
         0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
         0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
         0.03677206,  0.02553581]])
    The first document is compared to the rest, with the most similar to it being itself with score of 1, next most similar to it is document with score 0.22185251
    """

    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()

    # mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    """
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
       55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
        9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
       39, 19, 17, 25,  1,  0])

       docs 0 and 1 are very similar which are the following posts (last 2 array elements above when sorted)
        https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
        and
        supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    """

    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value
    # find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]
Example #32
#!/usr/bin/env python
# encoding: utf-8
""" 
@author: payneLi  
@time: 18-7-11 2:29 PM
@email: [email protected]  

"""
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

str_1 = "今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天"

str_2 = "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去"

str_3 = "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系"

tf_idf = TfidfVectorizer()

data = [str_1, str_2, str_3]
content = [" ".join(jieba.cut(sentence)) for sentence in data]

result = tf_idf.fit_transform(content)
feature_names = tf_idf.get_feature_names()
target = result.toarray()
content_inverse = tf_idf.inverse_transform(result)
print("feature_name:", feature_names, "\ntarget:", target,
      "\ncontent_inverse:", content_inverse)
Example #33
class DSOM(object):
    
    def __init__(self, inputFile=None, fileType=None, widthOfMap=2, useGPU=True):
        self.inputFile = inputFile
        self.fileType = fileType
        self.widthOfMap = widthOfMap
        self.useGPU = useGPU
        self.arrayTrain = []
        self.Y = None
        self.vectorizer = None
        self.nodeHolder = dict()
        self.text = ""
        self.dataset = ""
        
        
        
    def readDocument(self):
        if(self.fileType == 'pdf'):
            self.text = readPDF.pdfparser(self.inputFile)
        else:   
            self.text = open(self.inputFile, "r").read()    
        self.dataset = self.text.split("\n\n")         
        
    def train(self, inputFile=None):
        ###############################################################################
        #clean_file = open("data/paragraph_vector_output (copy).txt")
        #dataset = clean_file.read().split("\n\n")
    #     print(dataset)
    #     print("%d Paragraphs " % len(dataset))
    #     print()
    #     print("Extracting features from the dataset using a sparse vectorizer")
        #t0 = time()
        self.vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000,
                                         min_df=2, stop_words='english',
                                         use_idf=True, sublinear_tf=True)
        self.Y = self.vectorizer.fit_transform(self.dataset)
        
        #arrayTrain = X.toarray()
        svd = TruncatedSVD(n_components=100, random_state=42)
        X = svd.fit_transform(self.Y)
        self.arrayTrain = X
        #print("done in %fs" % (time() - t0))
        #print("n_samples: %d, n_features: %d" % X.shape)
        #print()
        ###############################################################################
        ## SOM
        #For plotting the images
        
        #Train a 20x30 SOM with 400 iterations
        #print("<-- Starting SOM -- >")
        mapSide = self.widthOfMap
        som = SOM.SOM(DATA=self.arrayTrain, num_units=mapSide*mapSide, width=mapSide, height=mapSide)
        #print("<-- Training SOM -- >")
        #t0 = time()
        if(self.useGPU == True):
            try:
                import theano.sandbox.cuda
                theano.sandbox.cuda.use('gpu')
            except: 
                print("Switching to GPU didn't work, will fallback to CPU.")
            som.train_batch_theano(verbose=False)
        else:
            som.train_batch(verbose=False)
        #print("<-- Done Training SOM %fs -- >" %(time()-t0))
        #Get output grid
        #print("<-- Testing SOM -- >")
        #print("<-- Begin Output -- >")
        #np.set_printoptions(threshold='nan')
        clusters = som.ins_unit_assign
        #print(clusters)
        
        
        for i in range(mapSide*mapSide):
                self.nodeHolder[i] = []
                
        for i, m in enumerate(clusters):
            if (m) in self.nodeHolder:
                self.nodeHolder[m].append(i)
            else:
                self.nodeHolder[m] = [i]
    
    def getClusters(self):
        return self.nodeHolder
    
    def getDataset(self):
        return self.dataset
    
    def tfIDFArray(self):
        inverse = self.vectorizer.inverse_transform(self.Y)
        outList = []
        for x in inverse:
            outList.append([y.encode('UTF8') for y in x])
        return outList
Example #34
def build(summary,
          genre,
          text_feature=1,
          baseline=1,
          top_genre=10,
          top_phrases=10):
    """parameter tuned classify models"""
    #remove punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    no_punct = summary.apply(lambda x: tokenizer.tokenize(x))

    #label binarizer
    multilabel_binarizer = sklearn.preprocessing.MultiLabelBinarizer()
    multilabel_binarizer.fit(genre)
    label = multilabel_binarizer.transform(genre)

    #split training and validation set
    xtrain, xval, ytrain, yval = train_test_split(summary,
                                                  label,
                                                  test_size=0.2,
                                                  random_state=1000)
    tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                       stop_words='english',
                                       max_df=0.8,
                                       max_features=10000)
    xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
    xval_tfidf = tfidf_vectorizer.transform(xval)

    #hyperparameter grid search
    parameters = {
        "estimator__C": [0.1, 1, 5, 10, 15],
    }

    if baseline:
        lr = sklearn.linear_model.LogisticRegression()
        clf = OneVsRestClassifier(lr, n_jobs=8)
        clf.fit(xtrain_tfidf, ytrain)
        y_pred = clf.predict(xval_tfidf)
    else:
        svc = svm.LinearSVC()
        clf = OneVsRestClassifier(svc, n_jobs=8)
        clf = GridSearchCV(clf,
                           param_grid=parameters,
                           cv=3,
                           verbose=3,
                           scoring='f1_micro',
                           refit=True)
        clf.fit(xtrain_tfidf, ytrain)
        y_pred = clf.predict(xval_tfidf)
        clf = clf.best_estimator_

    # Predicted label
    actual_genre = multilabel_binarizer.inverse_transform(yval)
    predicted_genre = multilabel_binarizer.inverse_transform(y_pred)

    #evaluation
    f1 = "f1-score: " + str(
        sklearn.metrics.f1_score(yval, y_pred, average="micro"))

    e1 = 'percentage of genres that are correctly predicted: '+ str(np.sum([len(set(a).intersection(b)) for a, b in \
                  zip(pd.Series(predicted_genre), pd.Series(actual_genre))])/sum(genre.apply(len)))
    e2 = 'percentage of movies that have at least one gnere predicted right: '+str(np.sum([len(set(a).intersection(b))>0 for a, b in\
                  zip(pd.Series(predicted_genre), pd.Series(actual_genre))])/len(genre))

    lst = []
    new_genre_label = []
    genre_label = multilabel_binarizer.classes_
    for a, b in zip(clf.estimators_, genre_label):
        try:
            lst.append(a.coef_)
            new_genre_label.append(b)
        except:
            pass

    dist = genre.explode().value_counts(ascending=False)
    genre_coef = dict(zip(new_genre_label, np.vstack(lst)))
    fig, ax = plt.subplots(top_genre // 3 + 1, 3, figsize=(20, top_genre * 2))
    for o, g in enumerate(dist[:top_genre].index):
        c = genre_coef[g]
        words = tfidf_vectorizer.inverse_transform(c)[0]
        evd = [t for t in c if t > 0]
        d = dict(zip(words, evd))
        sorted_words = sorted(d.items(),
                              key=lambda item: item[1])[-top_phrases:]
        x = [i[0] for i in sorted_words]
        y = [i[1] for i in sorted_words]
        ax[o // 3][o % 3].barh(x, y)
        ax[o // 3][o % 3].set_title(g)
    fig.tight_layout()
    if text_feature:
        if baseline:
            fig.savefig(
                'data/figures/baseline model with summary text results.png')
        else:
            fig.savefig(
                'data/figures/final model with summary text results.png')
    else:
        if baseline:
            fig.savefig('data/figures/baseline model with phrases results.png')
        else:
            fig.savefig('data/figures/final model with phrases results.png')
    return (f1 + "\n" + e1 + "\n" + e2 + "\n")
Example #35
def main():
    """
    Read sgm files and parse each article from the individual documents
    :return:
    """
    t0 = time()
    article_list = []
    article_info = {}

    for i in range(0, 22):
        filename = 'data{}'.format(str(i).zfill(2))
        with open('dataset/{}.sgm'.format(filename), 'r') as f:
            data = f.read()
        parser = BeautifulSoup(data, 'html.parser')
        '''
        Looping over each article (distinguished by the reuters tag), creating a dictionary out of each article of the format:
        {
            'Body': [u'jaguar', u'jaguar', u'plc', u'jagrl', u'sell', u'new', u'xj', u'model', u'us', u'japanes' ],
            'Places': [u'uk'],
            'Title': [u'jaguar', u'see', u'strong', u'growth', u'new', u'model', u'sale'],
            'Topics': [u'earn'],
            u'topics': u'YES',
            u'lewissplit': u'TRAIN',
            u'newid': u'2001',
            u'oldid': u'18419',
            'Date': [u'mar'],
            u'cgisplit': u'TRAINING-SET'
        }

        The content of each dictionary entry is the text left after removing stop words and stemming the contents
        '''

        for article in parser.findAll('reuters'):
            try:
                article_list.append(article.body.text)

            except AttributeError:
                continue

            article_info[article['newid']] = {}
            article_info[article['newid']]['topic'] = []
            article_info[article['newid']]['place'] = []

            place_parser = article.places
            topic_parser = article.topics
            topic_list = []
            for topic in topic_parser.findAll('d'):
                topic_list.append(topic.text)

            for place in place_parser.findAll('d'):
                article_info[article['newid']]['place'].append(place.text)

            article_info[article['newid']]['label'] = article['lewissplit']
            if(len(topic_list)==0):
                article_list.pop()
                article_info.popitem()
            else:
                article_info[article['newid']]['topic'].append(topic_list)


        '''
        Extracting the dictionary of features into a .csv file
        Format :
            Article ID,Topic,Place, Label
            20057,[u'south-korea'],[],TEST
        '''

    # with open('dictionary.csv', 'wb') as f:
    #     f.write('Article ID,Topic,Place,Label')
    #     f.write('\n')
    #     for key, value in article_info.iteritems():
    #         f.write(key)
    #         f.write(',')
    #         for inner_key,inner_value in value.items():
    #             f.write(str(inner_value))
    #             f.write(',')
    #         f.write('\n')

    print 'No of valid articles = {}'.format(len(article_list))
    # Create a global list of topics, used during tokenization (in the tokenize function) to make sure feature words do not belong to the topic list
    global topics_list
    topics_list = list()
    topics = getTopics(article_info,[])

    for topic_article in topics:
        if topic_article:
            for topic in topic_article:
                if topic:
                    for top in topic:
                        topics_list.append(top)


    with open('topic_labels', 'wb') as outfile:
        pickle.dump(topics, outfile, pickle.HIGHEST_PROTOCOL)

        # with open('initial_word_count.txt', 'wb') as ini:
        #     sum =0
        #     for word in article_list:
        #         sum += len(word.split())
        #  ini.write('Total words in body tag of all the 21578 documents initially :'+str(sum))


    vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.9, tokenizer=tokenize, strip_accents='unicode', smooth_idf=True)

    feature_vector = vectorizer.fit_transform(article_list)

    feature_list = vectorizer.get_feature_names()

    with open('feature_vector', 'wb') as outfile:
        pickle.dump(feature_vector, outfile, pickle.HIGHEST_PROTOCOL)

    with open('features_list', 'wb') as f:
        pickle.dump(feature_list, f, pickle.HIGHEST_PROTOCOL)

# with open('feature_list.csv','wb') as feature:
#     for value in feature_list:
#         feature.write(str(value)+'\n')

    counter_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_, strip_accents='unicode')

    # for the word frequency counts
    data_matrix = counter_vectorizer.fit_transform(article_list)  # data matrix
    transaction_matrix = vectorizer.inverse_transform(feature_vector)  # transaction matrix
    # terms = counter_vectorizer.get_feature_names()
    # freqs = np.asarray(data_matrix.sum(axis=0)).ravel()  # total count of each term
    # result = dict(zip(terms, freqs))
    # print(result)
    # print(len(result))



## Un-comment from here to generate data_matrix and transaction_matrix

# with open('data_matrix.dat', 'wb') as outfile:
#     pickle.dump(data_matrix, outfile, pickle.HIGHEST_PROTOCOL)
#
# with open('transaction_matrix.dat', 'wb') as outfile:
#     pickle.dump(transaction_matrix, outfile, pickle.HIGHEST_PROTOCOL)

# with open('unigram_word_count.txt','wb') as ini:
#         sum = len(vectorizer.get_feature_names())
#         ini.write('Total words in body tag remaining after stemming , removing stop words and computing tf-idf counts :'+str(sum))

    bigram_vectorizer = TfidfVectorizer(min_df=0.001, tokenizer=tokenize, ngram_range=(2,2), strip_accents='unicode', max_df=0.9, smooth_idf=True)

    bigram_feature_vector = bigram_vectorizer.fit_transform(article_list)

    # Rank bigrams by descending idf, i.e. the rarest / most distinctive ones.
    indices = np.argsort(bigram_vectorizer.idf_)[::-1]
    features = bigram_vectorizer.get_feature_names()
    top_n = 20
    top_features = [features[i] for i in indices[:top_n]]
    print(top_features)
    # with open('top_20_bigrams.txt','wb') as ini:
    #          ini.write(str(top_features))
    print("Done in %0.3fs" % (time() - t0))
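

# Neither `tokenize` (passed to the vectorizers above) nor
# `build_corpus_from_dir` (used in the __main__ block below) is defined in
# this excerpt. The two sketches below are assumptions, not the original
# implementations: `tokenize_sketch` assumes NLTK stop words and a Porter
# stemmer and honours the global `topics_list` built in main();
# `build_corpus_from_dir_sketch` simply reads every .txt file under a
# directory into a list of documents.
import os
import re

from nltk.corpus import stopwords          # requires the NLTK stopwords corpus
from nltk.stem.porter import PorterStemmer


def tokenize_sketch(text):
    # Lowercase, keep alphabetic tokens, drop stop words and topic labels,
    # then stem what remains.
    stemmer = PorterStemmer()
    stop_set = set(stopwords.words('english'))
    words = re.findall(r'[a-z]+', text.lower())
    return [stemmer.stem(w) for w in words
            if w not in stop_set and w not in topics_list]


def build_corpus_from_dir_sketch(dirpath):
    # Collect the raw text of every .txt file under dirpath.
    corpus = []
    for root, _, files in os.walk(dirpath):
        for name in files:
            if name.endswith('.txt'):
                with open(os.path.join(root, name), 'r') as fh:
                    corpus.append(fh.read())
    return corpus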

if __name__ == '__main__':
    corpus = build_corpus_from_dir('.')
    #corpus=["Hi how are you, what you doingg?", "Hey what's up bro? you are cool","Hi what are you up to? Such a cool day"]

    vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                 stop_words='english',
                                 max_features=250,
                                 min_df=5,
                                 max_df=0.5)
    train_cv = vectorizer.fit_transform(corpus)

    a = train_cv.toarray()
    #print('\nThis is A:\n',a)
    b = vectorizer.inverse_transform(a)
    features = vectorizer.get_feature_names()
    #print('\nThis is B:\n',b)
    print('\n\n\n\n\n\n')
    #print(features)
    #print(len(features))

    #dist = 1 - cosine_similarity(a)

    num_clusters = 4

    km = KMeans(n_clusters=num_clusters)

    km.fit(a)

    centroids = km.cluster_centers_
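
    # A short illustrative addition (not in the original script): list the
    # highest-weighted terms of each cluster centroid to see what the
    # clusters are about.
    order_centroids = centroids.argsort()[:, ::-1]
    for cluster_id in range(num_clusters):
        top_terms = [features[idx] for idx in order_centroids[cluster_id, :10]]
        print('Cluster {}: {}'.format(cluster_id, ', '.join(top_terms)))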
Example #37
class Chatbot():
    def __init__(self, method=None):
        self.method = method
        self.spacy_model = spacy.load('en')
        self.labels = self.get_labels("annotations.json")

        self.library_df = pd.read_csv("library_data.csv")
        self.library_df = self.library_df.drop_duplicates(subset=['name'])
        self.lemmatized_library_descriptions = self.lemmatize_text(
            list(self.library_df['description']))
        self.library_vectorizer = TfidfVectorizer(
            stop_words=list(stop_words.ENGLISH_STOP_WORDS) + [
                'a', 'python', 'framework', 'library', 'should', 'import',
                'want', 'use', 'pron'
            ],
            ngram_range=(1, 1))
        self.library_desc_vectors = self.library_vectorizer.fit_transform(
            self.lemmatized_library_descriptions)
        self.library_desc_vectors = csr_matrix(
            self.library_desc_vectors).toarray()

        self.error_df = pd.read_csv(
            "C:\\Users\\user\\EECE 634\\chatbot\\error_data.csv")
        self.error_lemmatized_descriptions = self.lemmatize_text(
            list(self.error_df['error']))
        self.error_vectorizer = TfidfVectorizer(
            stop_words=list(stop_words.ENGLISH_STOP_WORDS) +
            ['python', 'should', 'want', 'use', 'pron'],
            ngram_range=(1, 1))
        self.error_desc_vectors = self.error_vectorizer.fit_transform(
            self.error_lemmatized_descriptions)
        self.error_desc_vectors_arr = csr_matrix(
            self.error_desc_vectors).toarray()

        self.k = []
        self.threshold = [0.8, 0.5, 0.55, 0.55, 0.5]
        self.vectorizers = []
        self.dff = []
        self.df = pd.read_csv("data.csv", encoding="ISO-8859-1")
        for cat in range(2, 7):
            if cat == 2:  # represents category 0
                vectorizer = TfidfVectorizer(stop_words=None,
                                             ngram_range=(1, 1))
                self.vectorizers.append(vectorizer)
                df1 = self.df[self.df['Type'] == 0]
            else:
                vectorizer = TfidfVectorizer(stop_words=[
                    'a', 'the', 'python', 'should', 'want', 'use', 'pron'
                ],
                                             ngram_range=(1, 1))
                self.vectorizers.append(vectorizer)
                df1 = self.df[self.df['Type'] == cat]
            df1 = df1.reset_index(drop=True)
            self.dff.append(df1)
            corpus = list(df1['user1'])
            lemmatized_corpus = self.lemmatize_text(corpus)
            X = vectorizer.fit_transform(lemmatized_corpus)
            self.k.append(csr_matrix(X).toarray())

    def lemmatize_text(self, input_list):
        lemmatized_descriptions = []
        for desc in input_list:
            current_desc = []
            doc = self.spacy_model(desc)
            for token in doc:
                current_desc.append(token.lemma_)
            lemmatized_descriptions.append(" ".join(current_desc))
        return lemmatized_descriptions

    def get_labels(self, arg):
        with open(arg) as json_file:
            data = json.load(json_file)
            labels = {
                "Greetings": [0, []],
                "Library": [1, []],
                "Error": [2, []],
                "Syntax": [3, []],
                "Interpreted": [4, []],
                "Methods": [5, []],
                "Directory": [6, []]
            }

            for item in data["entities"]:
                value = item["offsets"][0]["text"]
                if (item["classId"] == "e_7"):
                    if value not in labels["Greetings"][1]:
                        labels["Greetings"][1].append(value)
                elif (item["classId"] == "e_8"):
                    if value not in labels["Library"][1]:
                        labels["Library"][1].append(value)
                elif (item["classId"] == "e_9"):
                    if value not in labels["Error"][1]:
                        labels["Error"][1].append(value)
                elif (item["classId"] == "e_10"):
                    if value not in labels["Syntax"][1]:
                        labels["Syntax"][1].append(value)
                elif (item["classId"] == "e_11"):
                    if value not in labels["Interpreted"][1]:
                        labels["Interpreted"][1].append(value)
                elif (item["classId"] == "e_12"):
                    if value not in labels["Methods"][1]:
                        labels["Methods"][1].append(value)
                elif (item["classId"] == "e_13"):
                    if value not in labels["Directory"][1]:
                        labels["Directory"][1].append(value)

            for category in labels:
                txt_file = "features/annotated_" + str(
                    labels[category][0]) + "_" + category + ".txt"
                with open(txt_file, 'w') as file:
                    file.write(json.dumps(labels[category][1]))

            for category in labels:
                txt_file = "features/added_" + str(
                    labels[category][0]) + "_" + category + ".txt"
                with open(txt_file, 'r') as file:
                    x = file.read().splitlines()
                    for value in x:
                        if value not in labels[category][1]:
                            labels[category][1].append(value)
            return labels
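
    # For reference, get_labels expects an annotations file shaped roughly like
    # the hypothetical example below: a top-level "entities" list whose items
    # carry a classId between e_7 and e_13 and the annotated phrase in
    # offsets[0]["text"].
    #
    # {
    #   "entities": [
    #     {"classId": "e_7", "offsets": [{"text": "hello"}]},
    #     {"classId": "e_9", "offsets": [{"text": "ModuleNotFoundError"}]}
    #   ]
    # }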

    def answer(self, question, cat):
        if cat == 1:
            v = self.library_vectorizer.transform(
                self.lemmatize_text([question.lower()]))
            isAnswered = 0
            # If none of the question's terms are in the vocabulary, the query
            # vector is empty and cosine similarity is undefined.
            if self.library_vectorizer.inverse_transform(v)[0].shape[0] == 0:
                scores = [0] * len(self.library_desc_vectors)
            else:
                scores = []
                for item in self.library_desc_vectors:
                    scores.append(
                        1 - spatial.distance.cosine(item,
                                                    csr_matrix(v).toarray()))
                scores = np.array(scores)
                answer_list = []
                for item in scores.argsort()[-3:][::-1]:
                    if scores[item] > 0.173:
                        if isAnswered:
                            answer_list.append("Maybe " +
                                               self.library_df['name'][item] +
                                               " would help")
                        else:
                            answer_list.append(self.library_df['name'][item] +
                                               " is a good choice")
                            isAnswered = 1
                    elif 0.173 > scores[item] > 0.129:
                        answer_list.append("I'm not sure, but " +
                                           self.library_df['name'][item] +
                                           " may help")
                        isAnswered = 1
            if isAnswered == 0:
                return 'Sorry, I cannot answer this question yet :)'
            else:
                return ". ".join(answer_list)
        elif cat == 2:
            lemmatized_qs = self.lemmatize_text([question])
            for i, qs in enumerate(lemmatized_qs):
                v = self.error_vectorizer.transform([qs.lower()])
                isAnswered = 0
                if self.error_vectorizer.inverse_transform(v)[0].shape[0] == 0:
                    scores = [0] * len(self.error_desc_vectors_arr)
                else:
                    scores = []
                    for item in self.error_desc_vectors_arr:
                        scores.append(
                            1 -
                            spatial.distance.cosine(item,
                                                    csr_matrix(v).toarray()))
                    scores = np.array(scores)
                    for item in scores.argsort()[-3:][::-1]:
                        if scores[item] > 0.45:
                            isAnswered = 1
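                            # If the stored fix is "pip install <package>", try to
                            # recover the missing module name from the question
                            # (e.g. "No module named requests" -> "requests") and
                            # substitute it for the placeholder; otherwise return
                            # the stored answer as-is.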
                            if "pip install <package>" in self.error_df[
                                    'how to solve'][item]:
                                try:
                                    return self.error_df['how to solve'][
                                        item].replace(
                                            '<package>',
                                            re.search(
                                                r'(?<=named\s)\s*(.)*?(?=([\s;,\.\n]|$))',
                                                question.lower().replace(
                                                    "'", "")).group(0))
                                except:
                                    return self.error_df['how to solve'][item]

                            else:
                                return self.error_df['how to solve'][item]

                if isAnswered == 0:
                    return 'Sorry, I cannot answer this question yet :)'
        else:
            c = 0 if cat == 0 else cat - 2
            lemmatized_qs = self.lemmatize_text([question])
            for i, qs in enumerate(lemmatized_qs):
                v = self.vectorizers[c].transform([qs.lower()])
                scores = []
                for item in self.k[c]:
                    scores.append(
                        1 - spatial.distance.cosine(item,
                                                    csr_matrix(v).toarray()))
                scores = np.array(scores)
                index = scores.argsort()[-3:][::-1][0]
                if scores[index] > self.threshold[c]:
                    return self.dff[c]['user2'][index]
                else:
                    return 'Sorry, I cannot answer this question yet :)'

    def classify_functional(self, question):
        cat = -1
        cat_found = []
        for category in self.labels:
            for phrase in self.labels[category][1]:
                # Escape the phrase so regex metacharacters in annotations
                # (e.g. '.', '+', '?') cannot break or distort the search.
                x = re.search("(^|[^a-zA-Z])" + re.escape(phrase) + "($|[^a-zA-Z])",
                              question, re.IGNORECASE)
                if (x is not None):
                    cat_found.append(category)
                    break
        if (cat_found == []):
            cat = -1
        elif (cat_found == ["Greetings"]):
            cat = 0
        elif (len(cat_found) >= 1):
            if ("Greetings" in cat_found): cat_found.remove("Greetings")
            if (len(cat_found) == 1):
                cat = self.labels[cat_found[0]][0]
            elif ("Error" in cat_found):
                cat = 2
            elif ("Syntax" in cat_found):
                cat = 3
            elif ("Interpreted" in cat_found):
                cat = 4
            elif ("Directory" in cat_found):
                cat = 6
            elif ("Methods" in cat_found):
                cat = 5
            else:
                cat = 1
        if (cat == -1):
            return "I don't understand, please be more specific."
        else:
            return self.answer(question, cat)
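

# A minimal usage sketch (illustrative only): assuming the CSV/JSON files
# referenced in __init__ sit next to this script, route a question through
# the keyword classifier and print the reply. The sample question is made up.
if __name__ == '__main__':
    bot = Chatbot()
    print(bot.classify_functional("How do I fix a ModuleNotFoundError in Python?"))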