Example #1
def test():
    logger.info("loading models --->")
    le = pickle_load(label_enco)
    vectorizer = pickle_load(semhash_feature)
    clf = pickle_load(model_NB)
    list_of_nnp = pickle_load(nnp)
    list_of_day = pickle_load(day)
    list_of_dayperiod = pickle_load(dayperiod)
    logger.info("loading testing data --->")
    with open(path_nlp_test_data, 'r', encoding="utf-8-sig") as f:
        reader = csv.reader(f)
        test_data = [(remove_stopwords(row[0]), row[1]) for row in reader]
    count = 0
    for i in tqdm(test_data):
        # remove unstable words such as proper nouns, weekday names and day
        # periods, since the classification should be independent of them
        test_query = pos_remove(i[0], list_of_nnp, list_of_day,
                                list_of_dayperiod)
        test_query = remove_stopwords(test_query)
        if (len(test_query) > 0):
            test_quer = vectorizer.transform([test_query])
            pred = clf.predict(test_quer)
            pro = clf.predict_proba(test_quer)[0]
            PRO = pro[np.argmax(pro)]
            pred = le.inverse_transform(pred)[0]
            #slot filling modifies the intent according to certain keywords
            pred = slot_filling(test_query, pred, flight_no, airfare, airline)
            if pred == i[1]:
                count += 1

    return count, len(test_data)
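Examples #1, #12 and #28 appear to come from the same ATIS intent-classification module; a minimal driver sketch (hedged: it assumes the module-level paths and pickle names from Example #28 are in place, and train() is the function shown in Example #12):

if __name__ == "__main__":
    # train() fits and pickles the models; test() reloads them and returns
    # (correct predictions, total test queries) over the ATIS test set
    train()
    correct, total = test()
    print("Accuracy on test data --> {:.2f}%".format(100.0 * correct / total))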
Example #2
def generate_terms(content_list, stopword_list):
    cleared_content = ' '.join(content_list)

    tokens = cleared_content.split(' ')
    tokens = list(set(tokens))  # remove duplicates

    stopword_list.append('')
    stopword_list.extend(list(map(chr, range(97, 123))))  # single letters a-z
    updated_tokens = remove_stopwords(stopword_list, tokens)

    stemmed_tokens = [PorterStemmer().stem(token) for token in updated_tokens]
    stemmed_tokens = list(set(stemmed_tokens))  # remove duplicates
    terms = remove_stopwords(stopword_list, stemmed_tokens)

    return terms
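A toy invocation sketch for generate_terms (the content and stopword lists below are made up for illustration; remove_stopwords here is the two-argument helper used inside the function):

# hypothetical inputs
content_list = ["cats are running in the garden", "a cat runs faster than dogs"]
stopword_list = ["are", "in", "than"]
# terms come back stemmed, de-duplicated and free of stopwords and single letters
terms = generate_terms(content_list, stopword_list)
print(terms)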
Example #3
    def getPreprocessing(self):
        if "preprocessed_corpus.pkl" in os.listdir(self.path):
            print("loading preprocessed corpus")
            with open(self.path + "/preprocessed_corpus.pkl", "rb") as file:
                self.corpus = pickle.load(file)
        else:
            corpus_preprocessed = []
            tweet_tokenizer = TweetTokenizer()
            for text in tqdm(self.corpus):
                text = utils.remove_stopwords(text)
                #text = clean(text,fix_unicode=True, to_ascii=True,lower=True,
                #                 no_line_breaks=False, no_urls=True,no_emails=True,
                #                 no_phone_numbers=True, no_numbers=False, no_digits=False,
                #                 no_currency_symbols=True, no_punct=False,
                #                 replace_with_url="<URL>",replace_with_email="<EMAIL>",
                #                 replace_with_phone_number="<PHONE>",replace_with_number="<NUMBER>",
                #                 replace_with_digit="0",replace_with_currency_symbol="<CUR>",
                #                 lang="en")
                #text_normalized = []
                #for token in gensim.utils.simple_preprocess(text):
                #    if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in self.stopwords:
                #        text_normalized.append(token)
                #corpus_preprocessed.append(tweet_tokenizer.tokenize(" ".join(text_normalized)))
                corpus_preprocessed.append(text.split(" "))
            with open(self.path + "/preprocessed_corpus.pkl", "wb") as file:
                pickle.dump(corpus_preprocessed, file)
            self.corpus = corpus_preprocessed
            del corpus_preprocessed
        print("Size in memory:", sys.getsizeof(self.corpus))
        return
Example #4
def get_most_frequent_keywords(tweets):
    tweetsWithoutStopwords = ([
        utils.remove_stopwords(tweet) for tweet in tweets
    ])
    counter = Counter()
    # get rid of sentence structure after tokenization
    newTweets = [sum(tweet, []) for tweet in tweetsWithoutStopwords]
    for tweet in newTweets:
        counter.update(tweet)
    return counter.most_common(25)
Example #5
    def cluster(self, doc_file):
        texts = pre_process_doc(doc_file)
        clean_text = utils.remove_stopwords(texts, self.stopwords)
        corpus = [self.dictionary.doc2bow(text) for text in clean_text]
        corpus_tfidf = self.tfidf[corpus]
        top_topics = self.model.get_document_topics(corpus_tfidf[0],
                                                    minimum_probability=0.0)
        topic_vec = [top_topics[j][1] for j in range(self.num_topics)]

        return [topic_vec]
Example #6
def calculate_class_score(sentence, class_name):
    score = 0
    sentence = normalize(sentence)
    sentence = remove_stopwords(sentence)
    sentence = stemming(sentence)
    dados = load_corpus()
    for word in sentence:
        if word in dados[class_name]:
            score += dados[class_name][word]
    return score
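Together with learning() from Example #10, this score supports a simple argmax classifier; a minimal sketch, assuming load_corpus() returns the same per-class word counts that learning() builds (the classify helper itself is hypothetical):

def classify(sentence):
    # score the sentence against every known class and keep the best one
    dados = load_corpus()
    scores = {class_name: calculate_class_score(sentence, class_name)
              for class_name in dados}
    best = max(scores, key=scores.get)
    return best, scores[best]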
Example #7
def old_calculate_similarity(first_text, second_text):
    lang = language_detection(first_text + second_text)

    first_text = remove_stopwords(preprocess(first_text), lang)
    second_text = remove_stopwords(preprocess(second_text), lang)

    index2word_set = set(get_index_to_word(lang))

    #get average vector for sentence 1
    first_text_avg_vector = avg_feature_vector(first_text, lang, 200,
                                               index2word_set)

    #get average vector for sentence 2
    second_text_avg_vector = avg_feature_vector(second_text, lang, 200,
                                                index2word_set)

    similarity = 1 - spatial.distance.cosine(first_text_avg_vector,
                                             second_text_avg_vector)
    if math.isnan(similarity):
        similarity = 0
    return similarity
Example #8
    def create_corpus(self, type_corpus='train', mode='create'):
        if mode == 'create':
            print('Creating ' + type_corpus + ' corpus ...')
            if type_corpus == 'train':
                X_train, y = get_data('./data/Train_Full', mode='from_file')
                texts = utils.remove_stopwords(X_train, self.stopwords)
                corpus = [self.dictionary.doc2bow(text) for text in texts]

            elif type_corpus == 'test':
                X_test, y = get_data('./data/Test_Full', mode='from_file')
                texts = utils.remove_stopwords(X_test, self.stopwords)
                corpus = [self.dictionary.doc2bow(text) for text in texts]

        elif mode == 'load':
            print('load corpus')
            with open('./data/' + type_corpus + '_corpus.pkl', 'rb') as f:
                corpus = pickle.load(f)

            with open('./data/y_' + type_corpus + '.pkl', 'rb') as f:
                y = pickle.load(f)

        return corpus, y
Example #9
def calculate_similarity(first_text, second_text):
    lang = language_detection(first_text + second_text)

    first_text = remove_stopwords(preprocess(first_text), lang)
    second_text = remove_stopwords(preprocess(second_text), lang)

    index2word_set = set(get_index_to_word(lang))

    #get average vector for sentence 1
    first_text_avg_vector = getAvgFeatureVecs(first_text, lang, 200,
                                              index2word_set)

    #get average vector for sentence 2
    second_text_avg_vector = getAvgFeatureVecs(second_text, lang, 200,
                                               index2word_set)

    # stack the two averaged vectors so that each row represents one text
    mat = np.array([first_text_avg_vector, second_text_avg_vector])
    mat_sparse = sparse.csr_matrix(mat)
    similarity = cosine_similarity(mat_sparse)
    return similarity
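Because cosine_similarity is applied to the two stacked vectors, calculate_similarity returns a pairwise similarity matrix rather than a single number; a hypothetical call (the input strings are placeholders, and the word vectors for the detected language are assumed to be available):

sim_matrix = calculate_similarity("first example text", "second example text")
# the score between the two texts is the off-diagonal entry
print(sim_matrix[0, 1])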
Example #10
File: train.py Project: luizfan/nlp
def learning(training_data):
    corpus_words = load_corpus()
    for data in training_data:
        phrase = data['phrase']
        phrase = normalize(phrase)
        phrase = remove_stopwords(phrase)
        phrase = stemming(phrase)

        class_name = data['class']
        if class_name not in corpus_words:
            corpus_words[class_name] = {}
        for word in phrase:
            if word not in corpus_words[class_name]:
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    return corpus_words
Example #11
def requre(link, id):
    r = requests.get(link + id)
    # r = requests.get('http://localhost:61101/api/Motels/GetDataPython/'+ id)
    d = r.json()
    data = []
    if len(d) != 0:
        for i in range(len(d)):
            merge = d[i]['title']+' '+ utils.cleanhtml(d[i]['description'])
            key = utils.clean_text(merge)
            removeKey = utils.remove_stopwords(key)
            motel = Motel(d[i]['id'], d[i]['title'], d[i]['price'], d[i]['priceType'], d[i]['dateUpdate'], d[i]['dateDue'], d[i]['time'], d[i]['status'], 
                    d[i]['verify'], d[i]['address'], utils.cleanhtml(d[i]['description']), d[i]['phone'], d[i]['typemotel'], d[i]['areaZone'], d[i]['areaZoneType'], 
                    d[i]['typeservice'], d[i]['bathroom'], d[i]['livingroom'], d[i]['latitude'], d[i]['longitude'], d[i]['city'], d[i]['province'], 
                    d[i]['district'], d[i]['street'], d[i]['user'], merge, removeKey)
            data.append(motel.__dict__)
    return data
    
Example #12
def train():
    logger.info("loading training data --->")
    with open(path_nlp_data, 'r', encoding="utf-8-sig") as f:
        reader = csv.reader(f)
        training_data = [(remove_stopwords(row[0]), row[1]) for row in reader]
    # NER tags extracted using BIO tags
    list_of_nnp, list_of_day, list_of_rt, list_of_dayperiod = ner_tagging(
        path_ner_data)
    # training using NB + CV
    vectorizer, clf, le = training(training_data, list_of_nnp, list_of_day,
                                   list_of_dayperiod)
    # saving models and data
    logger.info("trained NB on training data and saving models --->")
    pickle_dump(clf, model_NB)
    pickle_dump(le, label_enco)
    pickle_dump(vectorizer, semhash_feature)
    pickle_dump(list_of_nnp, nnp)
    pickle_dump(list_of_day, day)
    pickle_dump(list_of_rt, rt)
    pickle_dump(list_of_dayperiod, dayperiod)
Example #13
def predict():
    maxlen = 31
    print("Loading model .....and predicting again")
    import tensorflow as tf
    from keras.models import model_from_json
    import gensim
    filep = 'LSTM_spatial_WOW11_{epoch:02d}-{val_acc:.2f}.h5'
    early = keras.callbacks.EarlyStopping(monitor='val_acc', min_delta=0, patience=4, verbose=0, mode='max')
    early_stopping = keras.callbacks.ModelCheckpoint(filep, monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='max', period=2)
    model = load_model('LSTM_spatial_WOW.h5',custom_objects={'sensitivity': sensitivity,'f1score':f1score,'precision':precision})
    with open(os.path.join(path_models,'lstm_weightsWOW.pickle'), 'rb') as handle:
        weight = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_tkWOW.pickle'), 'rb') as handle:
        tokenizer = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_labelWOW.pickle'), 'rb') as handle:
        label_encoder = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_X_trainWOW.pickle'), 'rb') as handle:
        X_train = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_y_trainWOW.pickle'), 'rb') as handle:
        y_train = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_X_testWOW.pickle'), 'rb') as handle:
        X_test = pickle.load(handle)
    with open(os.path.join(path_models,'lstm_y_testWOW.pickle'), 'rb') as handle:
        y_test = pickle.load(handle)
    count = 0
    # test_data is expected to be defined at module level (see Example #28)
    for i in test_data:
        test_query = remove_stopwords(i[0])
        print(test_query)
        if len(test_query) > 0:
            seq = tokenizer.texts_to_sequences([test_query])
            X = sequence.pad_sequences(seq, maxlen=maxlen, padding='post')
            y_pred = model.predict(X)
            y_pred1 = np.argmax(y_pred, axis=1)
            classes = label_encoder.inverse_transform(y_pred1)[0]
            print(classes)
            if classes == i[1]:
                count += 1
    print("Accuracy on test data -->", (count / len(test_data)) * 100)

    print("DONE VALIDATING....")
Example #14
    def create_dictionary(self):
        print('Creating dictionary and tfidf ...')
        X_train, _ = get_data('./data/Train_Full', mode='from_file')
        texts = utils.remove_stopwords(X_train, self.stopwords)

        dictionary = corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=DICT_SIZE)

        # create tfidf
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(corpus)

        # saving dictionary
        with open('./data/dictionary.pkl', 'wb') as f:
            pickle.dump(dictionary, f)

        with open('./data/tfidf.pkl', 'wb') as f:
            pickle.dump(tfidf, f)

        print('Created dictionary and tfidf.')

        return dictionary, tfidf
Example #15
File: svm.py Project: mquezada/cc6909
def generate_documents():
    events = r.keys('event:*:title')
    for event_key in events:
        event_id = id(event_key)
        lang = r.get('event:' + event_id + ':lang')
        docs = r.keys('document:*:' + event_id)
        documents[event_id] = []
        for doc_key in docs:
            doc_id = id(doc_key)
            tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
            document = []
            for tweet_id in tweet_ids:
                # this could be improved...
                tweet = utils.remove_entities(tweet_id)
                tweet = parser.unescape(' '.join(tweet.split()))
                if len(tweet) == 0 or len(tweet.split()) == 0:
                    continue
                tweet = utils.strip_accents(tweet)
                tweet = utils.remove_stopwords(tweet, lang)
                tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
                document.append(tweet)
            documents[event_id].append(' '.join(document))
Example #16
File: svm.py Project: mquezada/cc6909
def generate_documents_for(event_id):
    lang = r.get('event:' + event_id + ':lang')
    if lang is None:
        lang = 'spanish'
    docs = r.keys('document:*:' + event_id)
    documents[event_id] = []
    documents_ids[event_id] = []

    keys = []
    for eid in docs:
        keys.append(id(eid))

    docs = set(keys)
    for doc_id in docs:
        #doc_id = id(doc_key)

        # Facebook links could not be resolved and many documents end up pointing to unsupportedbrowser,
        # so Facebook is skipped until this problem is fixed
        url = r.get('document:%s:url' % doc_id)
        if urlparse(url).netloc == 'www.facebook.com':
            continue

        documents_real_ids.append(doc_id)
        tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
        documents_ids[event_id].append(tweet_ids)

        document = []
        for tweet_id in tweet_ids:
            # this could be improved...
            tweet = utils.remove_entities(tweet_id)
            tweet = parser.unescape(' '.join(tweet.split()))
            if len(tweet) == 0 or len(tweet.split()) == 0:
                continue
            tweet = utils.strip_accents(tweet)
            tweet = utils.remove_stopwords(tweet, lang)
            tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
            document.append(tweet)
        documents[event_id].append(' '.join(document))
Example #17
def get_search_terms_news(redis, news_id, lang):
    # get all child pages of the event with id=news_id
    # that have not been processed before*
    keys = redis.keys('page:*:news_%s' % news_id)

    terms = []
    for key in keys:
        id = key.split(':')[1]

        got = redis.get('page:%s:searched' % id)

        # allows tweets for an event page to be searched up to 2 times
        if got is None or got < 2:
            title = redis.get('page:%s:title' % id)
            title = title.decode('utf-8', errors='ignore')
            title = h.unescape(title)
            title = utils.strip_accents(title)
            title = utils.remove_stopwords(title, lang=lang)
            terms.append(title)

            redis.incr('page:%s:searched' % id)

    print tag, 'got', len(terms), 'search terms for news'
    return terms
Example #18
File: server.py Project: luizfan/nlp
def return_stemming():
    phrase = request.form.get('phrase')
    return create_response(
        200, {"phrase": stemming(remove_stopwords(normalize(phrase)))})
Example #19
def get_tokens(question):
    seg_list = jieba.lcut(question)
    return remove_stopwords(seg_list)
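A hypothetical call, just to illustrate the flow: jieba.lcut segments a Chinese question into tokens, which are then filtered by remove_stopwords.

tokens = get_tokens("今天北京的天气怎么样")
print(tokens)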
Example #20
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

data_words = list(sent_to_words(data))

# Build bigram and trigram models
# https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Apply the bigram/trigram models to our documents
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Remove stopwords
data_words_nostops = remove_stopwords(data_words, stop_words)
# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
# python3 -m spacy download en_core_web_lg
# Initialize the 'en_core_web_lg' model with only the POS components
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# Lemmatize, keeping only noun, adj, verb and adv
data_lemmatized = lemmatization(data_words_bigrams,
                                nlp,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create the dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
Example #21
File: ytpy.py Project: mac389/ytpy
def list_comments(video):
	return tech.lemmingtize(tech.remove_stopwords(video['comments']))
Example #22
    def test_no_stopwords(self):
        s = [["i", "couldn", "t", "wouldn", "t", "to", "do", "this"]]
        self.assertEqual(ut.remove_stopwords(s), [[]])
Example #23
File: class_map.py Project: zjzh/data
            swift_class_name_list["NSString"]])
    all_path_list.extend(swift_path_list)
    print("all_class_num: ", len(all_path_list))

    # build the document corpus
    all_class_corpus = []  # each element is one class
    for i, class_info_path in enumerate(all_path_list):
        #def extrac_class_des(path,is_class_name_des=True,is_method_des=False,is_para_des=False,is_return_des=False,is_signature=False) :
        sen = extrac_class_des(class_info_path, True, True, True, True, True)
        #        print("sen: ",sen)
        sen = preprocess_sen_new(sen, True)
        #        print("sen_pre: ",sen)
        lancaster_stemmer = LancasterStemmer()
        input_str = word_tokenize(sen)
        sen = [lancaster_stemmer.stem(value) for value in input_str]
        sen = " ".join(remove_stopwords(sen))
        #        print("sen: ",sen)
        #        if i>2:
        #            break
        all_class_corpus.append(sen)
#    all_class_corpus=["Swift makes it easy to create arrays in your code using an array literal: simply surround a comma-separated list of values with square brackets. Without any other information, Swift creates an array that includes the specified values, automatically inferring the array’s Element type."
#                      ,"Arrays are one of the most commonly used data types in an app. You use arrays to organize your app’s data. Specifically, you use the Array type to hold elements of a single type, the array’s Element type. An array can store any kind of elements—from integers to strings to classes."]
#    for class_1 in all_class_corpus:
#        print("class: ",class_1)
#        break
    weight, word = Tfidf(all_class_corpus)

    top_k = 10
    filter_weight = get_all_key_words(weight, top_k)
    sim_matrixs = cos_two_matrixs(np.array(filter_weight[:java_class_num]),
                                  np.array(filter_weight[java_class_num:]))
Example #24
def get_keywords(question):
    tokens = basicTokenizer.tokenize(question)
    return remove_stopwords(tokens)
Example #25
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: python3 %s <test txt> [model_name]" % sys.argv[0])
        sys.exit(2)

    test_thing = sys.argv[1]

    if len(sys.argv) > 2:
        model_name = sys.argv[2]
    else:
        model_name = 'knn_model'

    if not model_name.endswith('.clf'):
        model_name += '.clf'

    line_segs = jieba.cut(test_thing)
    line_segs = [i for i in line_segs if len(i) > 0]
    line_df = pd.DataFrame({'segment': line_segs})
    line_df = utils.remove_stopwords(line_df)

    # Using the trained classifier, make predictions for unknown text
    start_time = datetime.now()
    predictions = knn.predict(line_df,
                              model_path=model_name,
                              distance_threshold=1000)
    print('[Time taken: {!s}]'.format(datetime.now() - start_time))

    # Print results on the console
    print(predictions[0])
Example #26
def predict(sentence, model_path='./model/svm_classifier.joblib'):
    classifier = load(abspath(join(*model_path.split('/'))))
    sentence = remove_stopwords(sentence)
    pred_class = 'BeneficialTweet' if classifier.predict(
        sentence_vectorizer(sentence))[0] == 1 else 'Non-BeneficialTweet'
    return pred_class
Example #27
def process_content(real_content, lang):
    real_content = utils.strip_accents(real_content)
    real_content = utils.remove_stopwords(real_content, lang)
    real_content = utils.stem(real_content, lang)
    return real_content
Example #28
dir_path = os.path.dirname(os.path.realpath(__file__))
path_data = os.path.join(dir_path,'data')
path_models = os.path.join(dir_path,'models')
path_nlp_data = os.path.join(dir_path,'data/ATIS_TRAINING_DATA.csv')
path_ner_data = os.path.join(dir_path,'data/ATIS_NER.csv')
path_nlp_test_data = os.path.join(dir_path,'data/ATIS_TEST_DATA.csv')
pattern = re.compile(r'\w*\s["*"]')
flight_no = re.compile(r'flight number')
airline = re.compile(r'airline')
airfare = re.compile(r'flight fare')

list_of_nnp, list_of_day, list_of_rt, list_of_dayperiod = [], [], [], []

with open(path_nlp_data, 'r', encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    training_data = [(remove_stopwords(row[0]), row[1]) for row in reader]
with open(path_nlp_test_data, 'r', encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    test_data = [(remove_stopwords(row[0]), row[1]) for row in reader]
with open(path_ner_data, 'r', encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for row in reader:
        row = list(filter(None, row))
        row = row[:-1]
        rem = int((len(row) - 1) / 2)
        del row[rem]
        expression_length = len(row)
        list_of_nnp_indices = [i - int(expression_length / 2) for i, x in enumerate(row)
                               if x in ["B-fromloc.city_name", "B-toloc.city_name"]]
        lis = list(map(row.__getitem__, list_of_nnp_indices))
        list_of_nnp.extend(lis)
        list_of_day_indices = [i - int(expression_length / 2) for i, x in enumerate(row)
                               if x in ["B-depart_date.day_name", "B-arrive_date.day_name"]]
Example #29
def text_analysis(
        data_path,
        column,
        groups,
        language,
        lemmatize,
        ngram_range,
        num_topics,
        num_words,
        manual_mappings,
        generate_word_cloud,
        word_cloud_filename,
        frequent_words_filename,
        frequent_words_plot_filename,
        top_tfidf_words_filename,
        top_tfidf_words_plot_filename,
        predict_topics,
        topics_filename,
        predicted_topics_filename,
        ldavis_filename_prefix,
        predict_sentiment,
        predicted_sentiment_filename,
        should_upload_db,
        account_key_path
):
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_word_count_pair_list = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]

    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_tfidf_pair_list = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    print("Saving frequent words...")
    save_words(
        all_word_count_pair_list,
        frequent_words_filename
    )
    print("Frequent words saved to:", frequent_words_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
    else:
        db_client = None

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'frequent_words', {
            column: {w: int(c) for w, c in word_count_pair_list}
        })
        print('Done')
        print()

    print("Generating frequent word plot...")
    plot_top_words(word_count_pair_list, frequent_words_plot_filename)
    print("Frequent word plot saved to:", frequent_words_plot_filename)
    print()

    print("Saving top tfidf words...")
    save_words(
        all_tfidf_pair_list,
        top_tfidf_words_filename
    )
    print("Top tfidf words saved to:", top_tfidf_words_filename)
    print()

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'top_tfidf', {
            column: {w: int(c) for w, c in tfidf_pair_list}
        })
        print('Done')
        print()

    print("Generating top tfidf word plot...")
    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename)
    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename)
    print()

    if groups:
        group_unique_vals = {}
        for group in groups:
            group_unique_vals[group] = data_df[group].unique()

        splits = {}
        for group, unique_vals in group_unique_vals.items():
            for val in unique_vals:
                splits[(group, val)] = data_df[group] == val

        for i in range(len(groups) - 1):
            splits = concat_splits(splits)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split = data_df[split_idcs]
            split_texts = split[column]

            if len(split_texts) > 0 and any(split_texts.str.len() > 0):
                word_cloud_filename_val = add_prefix_to_filename(
                    word_cloud_filename, key
                )
                frequent_words_filename_val = add_prefix_to_filename(
                    frequent_words_filename, key
                )
                frequent_words_plot_filename_val = add_prefix_to_filename(
                    frequent_words_plot_filename, key
                )
                top_tfidf_words_filename_val = add_prefix_to_filename(
                    top_tfidf_words_filename, key
                )
                top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                    top_tfidf_words_plot_filename, key
                )

                if generate_word_cloud:
                    print("Generating word cloud...")
                    plot_word_cloud(split_texts, word_cloud_filename_val, language)
                    print("word_cloud saved to:", word_cloud_filename_val)
                    print()

                try:
                    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_word_count_pair_list = most_frequent_words(
                        count_data, count_vectorizer, count_data.shape[0] + 1
                    )
                    word_count_pair_list = all_word_count_pair_list[:num_words]

                    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_tfidf_pair_list = most_frequent_words(
                        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
                    )
                    tfidf_pair_list = all_tfidf_pair_list[:num_words]

                    print("Saving frequent words...")
                    save_words(
                        all_word_count_pair_list,
                        frequent_words_filename_val
                    )
                    print("Frequent words saved to:", frequent_words_filename_val)
                    print()

                    print("Generating frequent word plot...")
                    plot_top_words(word_count_pair_list, frequent_words_plot_filename_val)
                    print("Frequent word plot saved to:", frequent_words_plot_filename_val)
                    print()

                    print("Saving top tfidf words...")
                    save_words(
                        all_tfidf_pair_list,
                        top_tfidf_words_filename_val
                    )
                    print("Top tfidf words saved to:", top_tfidf_words_filename_val)
                    print()

                    print("Generating top tfidf word plot...")
                    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename_val)
                    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename_val)
                    print()

                    grouped_words_counts[key[1::2]] = {
                        w: int(c) for w, c in all_word_count_pair_list
                    }
                    grouped_words_tfidf[key[1::2]] = {
                        w: int(c) for w, c in all_tfidf_pair_list
                    }
                except Exception:
                    print("Error processing", key,
                          "skipping it. texts are probably all stopwords")

        print("Saving grouped frequent words...")
        group_frequent_words_filename = add_prefix_to_filename(
            frequent_words_filename, groups
        )
        remapped_grouped_words_counts = remap_keys(grouped_words_counts, groups)
        with open(group_frequent_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_counts, f, ensure_ascii=False)
        print("Frequent words saved to:", group_frequent_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_counts to db...")
            upload_db(db_client, 'grouped_words_counts', {
                column: remap_to_dict(remapped_grouped_words_counts)
            })
            print('Done')
            print()

        print("Saving grouped top tfidf words...")
        group_top_tfidf_words_filename = add_prefix_to_filename(
            top_tfidf_words_filename, groups
        )
        remapped_grouped_words_tfidf = remap_keys(grouped_words_tfidf, groups)
        with open(group_top_tfidf_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_tfidf, f, ensure_ascii=False)
        print("Top tfidf words saved to:", group_top_tfidf_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_tfidf to db...")
            upload_db(db_client, 'grouped_words_tfidf', {
                column: remap_to_dict(remapped_grouped_words_tfidf)
            })
            print('Done')
            print()

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            print("Uploading predicted topics to db...")
            upload_db(db_client, 'predicted_topics', {
                column: json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                ))
            })
            print('Done')
            print()

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language == 'it':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()

        elif language == 'en':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        else:
            print("Sentiment analysis on {} language is not supported")
            print()
Example #30
def manual_classes_classifier(data_path, column, language, lemmatize,
                              manual_mappings, manual_classes,
                              predicted_classes_filename, should_upload_db,
                              account_key_path):
    print("Build classifier...")
    with open(manual_classes, encoding="utf8") as json_data:
        manual_classes_dict = json.load(json_data)
    classifier = Classifier(manual_classes_dict, language)
    print("Classifier built")
    print()

    print("Loading data...")
    data_df = load_data(data_path, column)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stopwors...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column],
                                                manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    print("Predict classes...")
    predicted_classes = predict(classifier, data_df[column])
    save_classes(predicted_classes, predicted_classes_filename)
    print("Predicted classes saved to:", predicted_classes_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
        print("Uploading predicted classes to db...")
        upload_db(
            db_client, 'predicted_classes', {
                column:
                json.loads(
                    pd.DataFrame(predicted_classes).to_json(orient='index',
                                                            force_ascii=False))
            })
        print('Done')
        print()
Example #31
def test_remove_stopwords():
    """
    test stopword removal
    """
    assert utils.remove_stopwords("is it fake news ?") == "fake news"
Example #32
#wordlist = open('list_body_parts.list.txt','rb').read().splitlines()
data = []
filename = '/Users/andymckenzie/Dropbox/ToxTweet/APIs/YouTube/nonrated_robotrip_comments.csv'
with open(filename,'rU') as f:
	reader = csv.reader(f,delimiter=',')
	for row in reader:
		try:
			data.append((tech.clean(row[4]),row[0]))
		except:
			pass
	
neg = [datum[0] for datum in data if datum[1] == '-1']
pos = [datum[0] for datum in data if datum[1] == '1']

neg_flat = tech.remove_stopwords((' '.join(neg)).split())
pos_flat = tech.remove_stopwords((' '.join(pos)).split())
cPickle.dump((pos_flat,neg_flat),open('%s-flat-yt.pkl'%drug,'wb'))

print neg_flat[0]

'''
txt = []	
with open('/Volumes/My Book/Dropbox/toxtweet/APIs/YouTube/nonrated_marijuana_YT_comments_CLASSIFIED_NO_CANCER.csv','r') as f:
	text = f.read().splitlines()
	text = [comment.split() for comment in text]
	text = [item for sublist in text for item in sublist]
	txt.extend(text)
'''