Example #1
def scrape(visited, vocab):
    i=0
    for url in visited:
        try:
            response = urlopen("http://"+url)
            print("scraping",i)
            i+=1
        except Exception:
            continue
        base = [urlparse("http://" + url).netloc]  # domain of this page, used to keep the crawl on-site
        bs = BeautifulSoup(response,'html.parser')
        try:
            title = bs.find('title').text
        except Exception:
            continue
        tags=['p','span','h1','h2','h3','h4','h5','h6','div']
        if title:
            content=title
        else:
            content=''
        for tag in tags:
            text_tag=bs.find_all(tag)
            textContent=[x.text for x in text_tag]
            content +=' '.join(textContent)
        
        content = re.sub('\n', ' ', content)
        page_content[url] = {'data': content}

        tokens = process.tokenizer_fun(content)
        cleaned = process.remove_stopwords(tokens)
        stemmed = process.stemming(cleaned)
        cleaned2 = process.remove_stopwords(stemmed)
        cleaned_text = process.length2(cleaned2)

        word_count[url] = {}
        v_flag = True
        for token in cleaned_text:
            
            if token not in vocab:
                vocab[token] = 1
            elif v_flag:
                vocab[token] += 1
                v_flag = False
                
            if token in word_count[url].keys():
                word_count[url][token] += 1
            else:
                word_count[url][token] = 1
                
        links = [urljoin("http://" + url, l.get('href')) for l in bs.findAll('a')]
        links = [l.rstrip("/") for l in links if urlparse(l).netloc in base]
        finalData = (url,cleaned_text,list(set(links)))
        crawler_tuple[url] = finalData

    return crawler_tuple
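A minimal driver sketch for the scrape() excerpt above. It assumes the module-level dicts page_content, word_count and crawler_tuple and the project's process helpers exist as in the snippet; the seed URLs are placeholders.

# Usage sketch (hypothetical seed set; scrape() prefixes "http://" itself
# and writes into the module-level dicts shown above).
page_content, word_count, crawler_tuple = {}, {}, {}
vocab = {}
visited = {"example.com", "example.com/about"}

index = scrape(visited, vocab)
for url, (page, tokens, out_links) in index.items():
    print(url, len(tokens), "tokens,", len(out_links), "in-domain links")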
def one_classifier(text, lang, embedding_name, model_path, model_file):

    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------
    print(model_file)
    with open(model_path + model_file, 'rb') as f:
        cls = pickle.load(f)

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)

    no_stpw_text = remove_stopwords(processed_text, lang)

    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        prob = np.zeros(1)  # no in-vocabulary tokens: default probability 0
    #print(cls.classes_) # check that class at second position is L1

    return float(prob[0])
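A hedged example of calling one_classifier(). The model path and file-name pattern mirror the conventions used in classifier() further down (./data/probability/insikt/ and <user>_<case>_<name>_classifier.model), but the concrete names here are placeholders.

# Hypothetical call: score one text with a previously trained binary classifier.
prob = one_classifier("Some text to score", "en", "embedding-EN",
                      "./data/probability/insikt/",
                      "user1_case1_topicA_classifier.model")
print("P(positive class) =", prob)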
def processTitle(title):
    # print('Title before', title)
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    # print('Title: ', title)
    return title
def processBody(text):
    # print('Body: ',text)
    data = re.sub(r'\{\{.*\}\}', r' ', text)
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    # print('Body: ',data)
    return data
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        if re.match(r'\*[\ ]*\[', line):
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Links: ', )
    return data
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Categories: ', data)
    return data
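The four field processors above (and processInfo further down) share the tokenise / remove_stopwords / stem pipeline; a small illustration on invented wiki markup:

# Illustrative only: the raw wiki markup below is made up for the example.
raw_text = ("{{infobox language\n| name = Python\n}}\n"
            "python is a programming language.\n"
            "* [http://python.org official site]\n"
            "[[category:programming languages]]")

doc = {
    'title': processTitle("Python (programming language)"),
    'body': processBody(raw_text),
    'links': processLinks(raw_text),
    'categories': processCategories(raw_text),
}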
Example #7
 def full_preprocessing(self):
     """General preprocessing on document sample. This method include:
     remove punctuation ( . and , are kept), remove english stop words,
     tokenize to sentences, words and list of tokenized sentences to words."""
     self.text = pre.remove_punctuation(self.text)
     self.text = pre.to_lowercase(self.text)
     self.words = pre.tokenize_to_words(self.text)
     self.words = pre.remove_stopwords(self.words)
     self.text = ' '.join(self.words)
     self.sentences = pre.tokenize_to_sentences(self.text)
     self.normalized_sample = [pre.tokenize_to_words(sent) for sent in self.sentences]
     return self.sentences
Example #8
	def __init__(self,docs,num_clu):
		self.no_clusters = num_clu
		#self.sentences =  preprocessing.load_sentences(docs)
		self.sentences =  preprocessing.load_duc_xml(docs)
		self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
		#self.full_doc = helper.fulldoc(self.sentences)
		#self.sent_no_swords.append(self.full_doc)
		self.unique_terms = helper.uniqueterms(self.sent_no_swords)
		self.sent_weight = helper.tfisf(self.sent_no_swords,self.unique_terms)
		#self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms)
		self.sent_similarity = helper.similarity(self.sent_weight,self.sent_weight)
		self.clusters = cluster.kmedoid(self.sent_similarity,self.no_clusters)
Example #9
def tokenize_corpus(corpus_file):
    raw_document_ko = read_txt(corpus_file)
    lines = raw_document_ko.split('\n')

    processed_doc_ko = remove_stopwords(
        remove_extraneous(remove_english(remove_extraneous(lines))))
    doc_ko = ' '.join(str(word) for line in processed_doc_ko for word in line)

    t = Okt()
    tokens_ko = t.morphs(doc_ko)

    return tokens_ko
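Okt comes from the konlpy package; a minimal call sketch, with the corpus file name as a placeholder and read_txt plus the remove_* helpers assumed from the same project:

# Hypothetical Korean corpus file.
tokens_ko = tokenize_corpus('news_corpus_ko.txt')
print(tokens_ko[:20])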
Example #10
def make_vectorize():
    try:
        #Load the data
        data = request.get_json()

    except Exception as e:
        raise e

    if data == {}:
        return (bad_request())
    else:
        #Get the text and the language
        try:
            lang = data['lang']
        except KeyError:
            try:
                lang = detect_language(data['text'])
                print(lang)
            except Exception:
                responses = jsonify(
                    "Error in vectorize: language field is missing")
                return responses
        try:
            text = data['text']
        except KeyError:
            responses = jsonify("Error in vectorize: text is missing")
            return responses

        if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
            responses = jsonify(
                "Language not available. Language must be in ['en','es','ar','ro','fr']"
            )
            return responses
        #Preprocess the text
        print("Vectorize...")

        embeddings = Embeddings(emb_dict[lang])

        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                      len(no_stpw_text))

        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros((300, ) * 1)
            print(vectorized_text)

        #Send the response codes
        responses = jsonify(vector=vectorized_text.tolist())
        responses.status_code = 200
        return responses
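make_vectorize() is a Flask view; the route it is registered under is not shown in the excerpt, so the /vectorize URL below is an assumption. A client-side sketch:

import requests

# Assumed endpoint; adjust host/route to wherever make_vectorize() is registered.
resp = requests.post("http://localhost:5000/vectorize",
                     json={"text": "This is a short example text.", "lang": "en"})
print(resp.status_code, resp.json()["vector"][:5])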
Example #11
	def __init__(self,docs,num_clu):
		self.no_clusters = num_clu
		print "Loading Sentences..."
		self.sentences =  preprocessing.load_sentences(docs)
		print "Preprocessing..."
		self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
		self.unique_terms = helper.uniqueterms(self.sent_no_swords)
		self.sent_weight = helper.tfisf(self.sent_no_swords,self.unique_terms)
		#self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms)
		print "Finding Similarity Graph..."
		self.sent_similarity = helper.similarity(self.sent_weight,self.sent_weight)
		print "Clustering..."
		self.clusters = cluster.kmedoid(self.sent_similarity,self.no_clusters)
Example #12
 def __init__(self, docs, num_clu):
     self.no_clusters = num_clu
     print "Loading Sentences..."
     self.sentences = preprocessing.load_sentences(docs)
     print "Preprocessing..."
     self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
     self.unique_terms = helper.uniqueterms(self.sent_no_swords)
     self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms)
     #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms)
     print "Finding Similarity Graph..."
     self.sent_similarity = helper.similarity(self.sent_weight,
                                              self.sent_weight)
     print "Clustering..."
     self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters)
Example #13
def get_topic_keywords(qnas, embedding_model=None):
    """Define the topic keywords.

    Args:
        qnas (list): List of questions and answers.
        embedding_model (wordembedding.WordEmbedding): Word Embedding
            model.

    Returns:
        set: The topic keywords.
    """
    answers_kwords = list()
    questions_kwords = list()
    similar_answers_kwords = list()
    similar_questions_kwords = list()

    for question, answer in qnas:
        question_kwords = list()
        # Obtain keywords
        aux = re.sub(r'((\*~\d+)|(\[.*?\]))', ' ', question)
        # Split with two spaces to preserve words pairs
        for q in aux.split('  '):
            word = q.strip()
            if ' ' in word:
                question_kwords.append('\"{}\"'.format(word))
            elif word:
                question_kwords.append(word)

        questions_kwords.extend(question_kwords)
        answer_no_sw = preprocessing.remove_stopwords(answer)
        answers_kwords.extend(find_keywords.find_entities(answer_no_sw))

    if embedding_model is not None:
        for word in questions_kwords:
            word_similars = embedding_model.get_similar(word, top_n=2)
            similar_questions_kwords.extend(word_similars)

        for word in answers_kwords:
            word_similars = embedding_model.get_similar(word, top_n=2)
            similar_answers_kwords.extend(word_similars)

    result = set(questions_kwords + similar_questions_kwords
                 # + answers_kwords + similar_answers_kwords
                 )

    return result
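A small call of get_topic_keywords(); the question/answer pairs are invented and the embedding model is omitted, which the signature allows.

# Illustrative Q&A pairs; double spaces in a question preserve a word pair.
qnas = [
    ("what is  machine learning  *~2", "Machine learning builds models from data."),
    ("who invented python", "Guido van Rossum created Python."),
]
keywords = get_topic_keywords(qnas)
print(keywords)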
Example #14
def analyze(text, lang, registry):

    topics_path = registry['topics']["topics_path"]
    patterns_path = registry["key_ideas"]["patterns_path"]

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)

    tagger = Tagger(lang, registry['pos_models'])
    pos = tagger.pos_tag(processed_text)

    concepts = get_concepts(pos, lang)
    key_ideas = get_key_ideas(pos, lang, patterns_path)
    topics = get_topics(no_stpw_text, lang, topics_path)

    result = [concepts, key_ideas, topics]
    #return {"concepts": concepts, "key_ideas": key_ideas, "topics": topics}
    return result
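analyze() only reads a few keys from registry; a sketch of the expected shape, with placeholder paths:

# Placeholder registry: the keys mirror the lookups inside analyze().
registry = {
    "topics": {"topics_path": "./data/topics/"},
    "key_ideas": {"patterns_path": "./data/patterns/"},
    "pos_models": "./data/pos_models/",
}
concepts, key_ideas, topics = analyze("Some input text to analyze.", "en", registry)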
def processInfo(text):
    data = text.split('\n')
    flag = -1
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    # print("Info: ", data)
    return data
Example #16
    def process_data(
        self, data_id: int, data: str, add_document: bool
    ) -> Optional[dict[str, float]]:
        """
        Preprocesses and processes a document.

        :param data_id: The document's ID.
        :param data: The content of the document.
        :param add_document: Whether the document should immediately be added to the
            TF.IDF collection.
        :return: The TF.IDF scores of each term in the document, unless the document was
            added to the collection. In that case nothing is returned.
        """
        preprocessed = remove_stopwords(lemmatize(split_text(data)))
        if not add_document:
            return self.tfidf.process_document(preprocessed)
        else:
            index = self.tfidf.add_document(preprocessed)
            self.data_ids[data_id] = index
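A sketch of the two modes of process_data(). The enclosing class is not shown in the excerpt, so `engine` below stands in for an instance that exposes tfidf and data_ids as used above.

# Hypothetical instance of the (unshown) enclosing class.
scores = engine.process_data(1, "Cats chase mice in the garden.", add_document=False)
print(scores)  # term -> TF.IDF score
engine.process_data(2, "Dogs chase cats.", add_document=True)  # indexed, returns None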
Example #17
def create_pre_calculated_result_csv(start, end, csv_name):

    essays_scores = pd.read_csv('essays_and_scores.csv', encoding="ISO-8859-1")
    essays_scores = essays_scores.iloc[start:end, :]
    essays = essays_scores['essay'].values
    scores1 = essays_scores['rater1_domain1'].values
    scores2 = essays_scores['rater2_domain1'].values

    sentence_counts = sentence.find_counts(essays)
    words_without_stopwords = preprocessing.remove_stopwords(essays)
    tf_idf_values = vectorization.find_word_vector(words_without_stopwords)
    dataset = combine_lists(sentence_counts, words_without_stopwords, tf_idf_values, scores1, scores2)

    # Create dataframe for Random Forest Algorithm
    df = DataFrame(dataset,
                   columns=['sentence_count', 'english_word', 'non_english_word', 'CC', 'DT-PDT', 'IN', 'JJ', 'JJR',
                            'JJS',
                            'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'RB2', 'RBR', 'RBS', 'VB', 'VBD-VBN', 'VBG',
                            'VBP-VBZ',
                            'other_tags', 'word_count', 'td_idf', 'unique', 'score', 'essay_wo_stopwords'])

    df.to_csv(csv_name, index=False)
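A hypothetical invocation that pre-computes features for the first 200 rows of essays_and_scores.csv:

# start/end select a slice of essays_and_scores.csv; the output name matches
# the file read later by predict() in Example #23.
create_pre_calculated_result_csv(0, 200, 'result.csv')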
Example #18
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH,
                   model_file_EXR):
    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------

    with open(model_path + model_file_JIH, 'rb') as f:
        cls_JIH = pickle.load(f)
    with open(model_path + model_file_EXR, 'rb') as f:
        cls_EXR = pickle.load(f)

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        prob_JIH = np.zeros(1)  # no in-vocabulary tokens: default probability 0
        prob_EXR = np.zeros(1)

    # Keep the larger of the two class probabilities
    prob = prob_JIH if prob_JIH[0] > prob_EXR[0] else prob_EXR

    return float(prob[0])
def process_query(query):
    cleaned_query = pro.tokenizer_fun(query)
    cleaned_query = pro.remove_stopwords(cleaned_query)
    cleaned_query = pro.stemming(cleaned_query)
    return cleaned_query
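process_query() mirrors the tokenizer / stop-word / stemming pipeline used in scrape() (Example #1); for example:

# The `pro` alias is assumed to be the same preprocessing module used above.
terms = process_query("How do search engines rank web pages?")
print(terms)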
Example #20
 def testStripStopwords(self):
     self.assertEqual(remove_stopwords("the world is square"),
                      " world  square")
     self.assertEqual(remove_stopwords(u"一般使用的单位是人数或居住的人口数。"),
                      u"使用单位人数居住人口数。")
Example #21
def begin_search():
    f = open('./inverted_index/fileNumber.txt', 'r')
    global number_of_files
    number_of_files = int(f.read().strip())
    f.close()

    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()
    data = ""
    for query in queries:
        global K
        K = query.split(', ')[0]
        K = int(K)
        query = query.split(', ')[1:]
        temp_query = ''
        for i in query:
            temp_query += i + ' '
        query = temp_query
        query = query.lower()
        start = timeit.default_timer()
        if re.match(r'[tbicl]:', query):
            tempFields = re.findall(r'([tbcil]):', query)
            words = re.findall(r'[tbcil]:([^:]*)(?!\S)', query)
            # print(tempFields, words)
            fields, tokens = [], []
            si = len(words)
            i = 0
            while i < si:
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
                i += 1
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            # print(fields, tokens)
            results = field_query_ranking(tokens, fields)
            # print(results)

        else:
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
            # print(results)
        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if len(results) > K:
                results = results[:K]
            for key in results:
                key = key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num
                data += ', '
                # print(title_doc_num, end = ' ')
                if title is not None:
                    for i in title:
                        data += i + ' '
                        # print(i, end = ' ')
                    data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"
        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
        # print('\n')
    # print('data', data)
    with open('queries_op.txt', 'w') as f:
        f.write(data)
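begin_search() reads the query file passed as sys.argv[1]; each line starts with K (the number of results wanted) followed by the query, optionally with t:/b:/c:/i:/l: field prefixes. A sketch that builds such a file (the search script's name is an assumption):

# Hypothetical query file: "K, query" per line; t:/b:/c:/i:/l: select fields.
with open('queries.txt', 'w') as q:
    q.write('5, machine learning algorithms\n')
    q.write('3, t:world cup b:football\n')
# Run as: python search.py queries.txt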
Example #22
	def preprocess(self):
		l_0=pre.tokenize_tweet(self.txt)
		l_1=pre.remove_stopwords(l_0)
		#self.term=pre.stemming(l_1)
		self.term=l_1
Example #23
def predict(essay, selected_topic):
    if selected_topic == 'Computer':
        dataframe = pd.read_csv('result.csv', encoding="ISO-8859-1")
    elif selected_topic == 'Library':
        dataframe = pd.read_csv('library_result.csv', encoding="ISO-8859-1")
    elif selected_topic == 'Cyclist':
        dataframe = pd.read_csv('cyclist_result.csv', encoding="ISO-8859-1")
    else:
        raise ValueError('Unknown topic: ' + selected_topic)

    essays_without_stopwords = dataframe.iloc[:, 26].values

    essay = [essay]
    essay_sentence_counts = sentence.find_counts(essay)
    essay_words_without_stopwords = preprocessing.remove_stopwords(essay)
    essay_words_without_stopwords_count = len(essay_words_without_stopwords[0])
    essay_without_stopwords = ' '.join(essay_words_without_stopwords[0])
    essays_without_stopwords = np.append(essays_without_stopwords,
                                         essay_without_stopwords)
    tf_idf_scores = vectorization.find_word_vector_v2(essays_without_stopwords)

    for i in range(len(dataframe)):
        dataframe.iloc[i, 23] = tf_idf_scores[i][0]

    essay_data = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0
    ]
    for i in range(len(essay_sentence_counts[0])):
        essay_data[i] = essay_sentence_counts[0][i]
    essay_data[22] = essay_words_without_stopwords_count
    essay_data[23] = tf_idf_scores[-1][0]
    essay_data[24] = tf_idf_scores[-1][1]
    essay_data = [np.array(essay_data)]
    """ Random Forest Algorithm """
    # Split train and test sets
    X = dataframe.iloc[:, 0:25].values
    y = dataframe.iloc[:, 25].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # Scale values
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    essay_test = sc.transform(essay_data)

    # Run Random Forest
    regressor = RandomForestRegressor(n_estimators=100, random_state=0)
    regressor.fit(X_train, y_train)
    essay_score_pred = regressor.predict(essay_test)[0]
    ''' End of the Random Forest '''

    # Predict Topic
    essay_topic_pred = topic.find_topic(essay_words_without_stopwords)

    if selected_topic != str(essay_topic_pred).capitalize():
        print(
            'Selected topic and predicted topic did not match. Are you sure you selected right topic?'
        )
        print('Predicted Topic: ', str(essay_topic_pred).capitalize())
        essay_score_pred *= 0.6
    print('Predicted Score: ', essay_score_pred)
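A hedged call of predict(); it expects the pre-computed CSV for the chosen topic (for example result.csv from Example #17) to be present.

# The essay text is invented; 'Computer' selects result.csv as in the code above.
predict("Computers let students explore ideas and learn at their own pace ...",
        "Computer")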
Example #24
def classifier(annotated_data, lang, user_id, case_id, clas_name):

    #--------------------------------------------------------------------------------------------
    #--- DEFINE FILES AND LANGUAGE
    #--------------------------------------------------------------------------------------------

    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'

    if (lang == 'en'):
        embedding_name = 'embedding-EN'

    if (lang == 'ar'):
        embedding_name = 'embedding-AR'

    if (lang == 'es'):
        embedding_name = 'embedding-ES'

    if (lang == 'ro'):
        embedding_name = 'embedding-RO'

    if (lang == 'fr'):
        embedding_name = 'embedding-FR'

    embedding = Embeddings(embedding_name)
    #--------------------------------------------------------------------------------------------
    #--- GENERAL SCRIPT
    #--------------------------------------------------------------------------------------------

    ########## Tokenize + stopwords
    #print(annotated_data)
    #raw_data=np.array(annotated_data)
    x_train = [i[0] for i in annotated_data]
    #print(x_train)
    y_train = [i[1] for i in annotated_data]  #replace N0 for L0...!!!
    #print(y_train)
    x_train_DL = []

    print('Data training with ' + str(len(x_train)) + ' texts')

    for text in x_train:
        #print(text)
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        if len(
                to_vector_single_nonzeros(no_stpw_text, embedding,
                                          len(no_stpw_text))) > 0:
            vectorized_text = np.mean(to_vector_single_nonzeros(
                no_stpw_text, embedding, len(no_stpw_text)),
                                      axis=0)
        else:
            vectorized_text = np.zeros((300, ) * 1)
        #print(vectorized_text)
        #x_train_DL.append(np.reshape(vectorized_text,(1,-1)))
        x_train_DL.append(vectorized_text)

    ########## Build and test classifiers with 10-fold cross-validation

    skf = StratifiedKFold(n_splits=10, shuffle=True)

    # Stochastic Gradient Descent

    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls,
                             x_train_DL,
                             y_train,
                             cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V EN: %2.1f (+/- %2.1f)" %
          (100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that class at the second position is 'Yes'
    accuracy = round((100 * scores.mean()), 2)
    ########## Save the model

    with open(model_path + model_file, 'wb') as f:
        pickle.dump(cls, f)
    return (accuracy)
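A sketch of training with classifier(): annotated_data is a list of (text, label) pairs, and the IDs below only determine the saved model's file name; all values are hypothetical.

# Hypothetical training data and IDs; the model is written to
# ./data/probability/insikt/user1_case7_topicA_classifier.model
annotated_data = [("first annotated text ...", "Yes"),
                  ("second annotated text ...", "No")] * 20
accuracy = classifier(annotated_data, "en", "user1", "case7", "topicA")
print("10-fold CV accuracy:", accuracy)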
from gensim.models.word2vec import FAST_VERSION 
FAST_VERSION=1

import sys
sys.path.append('../lib/')

file=[]
path=r'rvm.txt' # path to file
for string in open(path,'r',encoding='cp1251'):
    file.append(string.lower())
    

file_split=split_file(file)
text=clean_text([file_split[i][0] for i in range(len(file_split))]) # remove symbols from the text
clear_text=remove_stopwords(text) # remove stop-words from the text
s=func_lemma(func_container(clear_text)) # lemmatization step
w=func_tokenize(s) # w is the training dataset after preprocessing


path=r'lenta-ru-news.csv' # path to test dataset
df = pd.read_csv(path,engine='python', delimiter=',',encoding = "utf-8-sig")

# plot topic news distribution
y_pos=np.arange(len(df['topic'].value_counts()))
performance=df['topic'].value_counts()
plt.figure(figsize=(8,6))
plt.bar(y_pos,performance,align='center',alpha=0.5,color='g',width=0.8)
plt.xticks(y_pos,df['topic'].value_counts().index.tolist(),rotation=90,size=15)
plt.yticks(size=15)
plt.xlabel('Topics',size=15)