Example No. 1
def stemming(data):
    """Stem each word in the input iterable with the Italian Snowball stemmer."""
    stemmer = ItalianStemmer()
    filtered = []
    for word in data:
        filtered.append(stemmer.stem(word))
    return filtered
def item_preprocessing(descr_text):
    """
    Preprocess of a text string
    :param descr_text:
    :return:
    """

    # Tokenizing the string, keeping only alphanumeric characters and underscores
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(descr_text.lower())

    # Creating an Italian stop words set
    stop_words = set(stopwords.words('italian'))

    # Creating both Italian and English stemmers because the dataset mixes the two languages
    ita_stemmer = ItalianStemmer()
    eng_stemmer = PorterStemmer()

    # Removing stop words
    filtered_token = [token for token in token_list if token not in stop_words]

    # Removing tokens composed only of digits
    filtered_token = [
        token for token in filtered_token
        if not re.search(r'\b[0-9]+\b\s*', token)
    ]

    # Stemming the tokens for both italian and english
    filtered_token = [ita_stemmer.stem(token) for token in filtered_token]
    filtered_token = [eng_stemmer.stem(token) for token in filtered_token]

    filtered_token = FreqDist(filtered_token).most_common(50)

    return filtered_token
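
A minimal usage sketch (assuming the NLTK pieces the snippet relies on are imported: re, RegexpTokenizer, stopwords, ItalianStemmer, PorterStemmer and FreqDist; the sample string is only illustrative):

top_tokens = item_preprocessing("Splendido appartamento di 120 mq con vista sul Colosseo")
print(top_tokens)  # list of (stem, count) pairs, at most 50 entries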
Example No. 3
def preprocess(text, NUM_DOCS, num_preprocessed, stemming):
    global i
    if i == 0:
        i = num_preprocessed
    i += 1
    result = []
    stemmer = ItalianStemmer()
    if i % 20 == 0:
        print(f"\t{i} out of {NUM_DOCS+num_preprocessed} documents preprocessed")
    nlp = Italian()
    t0 = text.split("Lingua processuale")[0].split("Sentenza")[-1]
    t1 = "".join(t0)
    t1 =  re.sub(r"’|'|«|»|\d{1,4}\/\d{1,4}\/(cee|ce)|\d+|---\|*|^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$", " ", t1, flags=re.IGNORECASE)
    # print(t1)
    doc = nlp(t1)
    for token in doc:
        if token.text.lower() not in it_stopwords and not (token.is_punct or token.is_space) and len(token) > 3:
            assert token.lang_ == "it"
            if stemming:
                result.append(stemmer.stem(word=token.text))
            else:
                result.append(token.lemma_.lower())
            if "'" in result[-1] or "’" in result[-1]:
                raise Exception(f"Detected_ {token.lemma_}")
    return result
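
A hedged usage sketch for the spaCy-based preprocessor above (it expects spaCy's Italian class, an it_stopwords collection and a module-level counter i; the stop-word import and sample text below are assumptions for illustration):

from spacy.lang.it import Italian
from spacy.lang.it.stop_words import STOP_WORDS as it_stopwords

i = 0  # module-level progress counter read and updated by preprocess
sample = "Sentenza Il ricorso viene respinto integralmente. Lingua processuale: l'italiano."
tokens = preprocess(sample, NUM_DOCS=1, num_preprocessed=0, stemming=True)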
def stemm(reviews):
    stemm_reviews = []
    stemmer = ItalianStemmer()
    for review in reviews:
        clean = ' '.join([stemmer.stem(w) for w in review])
        stemm_reviews.append(clean)
    return stemm_reviews
def clean_stop_words(df, column, lang, stem=True):
    """
    (df, str, str, bool) -> df
    Cleans a dataframe column from stopwords of the given language.
    :param df: dataframe to clean
    :param column: column of dataframe to clean
    :param lang: language of the stopword list
    :param stem: if True, also apply the Italian Snowball stemmer
    :return: cleaned dataframe
    """
    for i in range(df.shape[0]):  # iterate over rows, not columns
        df.loc[i, column] = re.sub('[^a-zA-Z]', ' ', df[column][i])
    document = df[column].str.lower().str.split()
    sentence_stem = []
    document_stem = []

    nltk_stop = stopwords.words(lang)
    clean_document = document.apply(
        lambda x: [item for item in x if item not in nltk_stop])
    stemmer = ItalianStemmer()
    if stem:
        for sentence in clean_document:
            for word in sentence:
                word = stemmer.stem(word)
                sentence_stem.append(word)
            document_stem.append(sentence_stem)
            sentence_stem = []
        sentences = [' '.join(i) for i in document_stem]
        cleaned_series = pd.Series((v for v in sentences))
        df[column] = cleaned_series
    else:
        sentences = [' '.join(i) for i in clean_document]
        cleaned_series = pd.Series((v for v in sentences))
        df[column] = cleaned_series
    return df
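
A short usage sketch (assuming pandas as pd, re, NLTK stopwords and ItalianStemmer are imported as the function expects):

df = pd.DataFrame({"text": ["Questo appartamento è davvero luminoso", "Vendesi casa con giardino"]})
df = clean_stop_words(df, column="text", lang="italian", stem=True)
print(df["text"].tolist())  # stemmed, stopword-free strings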
Example No. 6
    def __init__(self):
        self.data_dir = "./data/"
        if not isdir(self.data_dir):
            os.mkdir(self.data_dir)

        self.info = None
        self.desc = None
        self.desc_index = None

        self.stemmer = ItalianStemmer()
        self.stop_words = set(stopwords.words('italian'))

        self.vocab = None
        self.documents = None
        self.inv_index = None  # no inverted index yet available
        self.idf = None  # no inverse document frequency yet available

        self.nltk_check_downloaded()

        self.url_base = "https://www.immobiliare.it"
        self.url_search = "/vendita-case/roma/?criterio=rilevanza&pag="
        try:
            html = requests.get(self.url_base + self.url_search + "1").content
            soup = BeautifulSoup(html, "html.parser")

            pag_number_list = soup.find("ul", class_="pagination pagination__number")
            self.max_pag_nr = int(pag_number_list.find_all("li")[-1].text)
        except requests.exceptions.ConnectionError:
            pass
Example No. 7
def stem_words(wrd):
    stemmer = ItalianStemmer()  # Selects the stemmer from nltk
    stems = []  # List of updated words

    for word in wrd:
        stem = stemmer.stem(word)  # Stems the word
        stems.append(stem)  # and appends it to the list

    return stems
Example No. 8
def preprocess_string(s,
                      lower=True,
                      stem=True,
                      remove_stopwords=True,
                      remove_punctuation=True):
    """ Cleanup a string

    Keyword arguments:
        s       --  the input string
        lower   --  lower every char
        stem    --  extract root of every word
        remove_stopwords    --  well, self-explanatory
        remove_punctuation  --  self-explanatory too
    """

    # lower every char
    # entity recognition also uses upper chars
    if lower:
        s = s.lower()

    # replace accents
    accent_chars = {
        "è": "e",
        "é": "e",
        "à": "a",
        "ò": "o",
        "ó": "o",
        "ù": "u",
        "ì": "i",
    }

    for char in accent_chars:
        s = s.replace(char, accent_chars[char])

    # tokenize and remove punctuation
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r"\w+")
    s = tokenizer.tokenize(s)

    # stem the words
    if stem:
        from nltk.stem.snowball import ItalianStemmer
        stemmer = ItalianStemmer()
        s = [stemmer.stem(word) for word in s]

    # remove stopwords (italian)
    if remove_stopwords:
        from stop_words import get_stop_words
        s = [word for word in s if word not in get_stop_words('it')]

    return " ".join(s)
Example No. 9
def preprocessQuery(query):
    # initialize tokenizer, stop words stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stopWords = set(stopwords.words('italian'))
    stemmer = ItalianStemmer()
    rawText = query.lower()
    #tokenize
    tokens = tokenizer.tokenize(rawText)
    # remove stop words
    effectiveTokens = [t for t in tokens if t not in stopWords]
    # stemming
    result = [stemmer.stem(t) for t in effectiveTokens]
    return result
Example No. 10
    def __init__(self,
                 no_numbers=True,
                 min_length=1,
                 clean_emoji=True,
                 stop_words_bool=True,
                 whitelist_stop_words=True,
                 stemmer=False):

        self.no_numbers = no_numbers
        self.min_length = min_length
        self.stemmer = stemmer
        self.ita_stemmer = ItalianStemmer()

        self.replace = [
            '#', '>', '_', '<', '-', '|', '\\', '/', '^', '\n', '”', '“', '"',
            '’', '‘', '€', '´', '.', '…'
        ]

        self.emoji = None
        self.clean_emoji = clean_emoji
        if self.clean_emoji:
            try:
                # UCS-4
                self.emoji = re.compile(u'[\U00010000-\U0010ffff]')
            except re.error:
                # UCS-2
                self.emoji = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

            self.pos_emoji = [
                '👍', '😀', '💪', '😎', '👌', '😁', '😃', '😄', '😊', '😋', '😍', '😻',
                '🤗', '👏🏻', '😘', '🎉', '💗', '🔝', '😉'
            ]
            self.neg_emoji = [
                '👎', '😒', '😖', '😠', '😡', '😤', '😨', '😱', '😳', '😬', '😞', '🤐',
                '😕', '😢'
            ]

        self.stop_words_bool = stop_words_bool
        if self.stop_words_bool:
            sw = StopWords(whitelist=whitelist_stop_words)
            sw_list = sw.getStopWords()

            stop_words_dict = defaultdict(lambda: -1)
            for i, word in enumerate(sw_list):
                stop_words_dict[word] = 1

            self.stop_words_dict = stop_words_dict

        return
Example No. 11
    def __init__(self,
                 stopwords=None,
                 punct=None,
                 lower=True,
                 strip=True,
                 Language='English'):
        self.lower = lower
        self.strip = strip
        self.punct = punct or set(string.punctuation)
        if Language == 'English':
            self.stopwords = stopwords or set(sw.words('english'))
            self.stemmer = PorterStemmer()
        elif Language == 'Italian':
            self.stopwords = stopwords or set(sw.words('italian'))
            self.stemmer = ItalianStemmer()
Example No. 12
def get_TFIDFmatrix_vect(data, do_stemming):
    """
    data: input textual collection
    do_stemming: boolean. If True, apply stemming; otherwise analyze only tokenized words (tokens of at least 2 characters that contain no digits)

    returns a tuple (matrix, tf) where matrix is the normalized tf-idf matrix and tf is the fitted vectorizer
    """

    min_df = 10

    if do_stemming:
        italian_stemmer = ItalianStemmer()
        tf = textual_analysis.StemmedCountVectorizer(
            token_pattern=u'([a-z]{2,})',
            min_df=min_df,
            analyzer="word",
            stop_words=stopwords.words('italian'),
            norm='l2')

    else:
        tf = TfidfVectorizer(
            token_pattern=u'([a-z]{2,})',
            sublinear_tf=True,
            use_idf=True,
            stop_words=stopwords.words('italian'),
            max_df=0.1,
            min_df=min_df,
            norm='l2'
        )  #CountVectorizer supports counts of N-grams of words or consecutive characters.

    matrix = tf.fit_transform(data)

    return matrix, tf
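
A hedged usage sketch for the non-stemming branch (assumes TfidfVectorizer and NLTK stopwords are imported; min_df=10 and max_df=0.1 need a reasonably large corpus, so the loader below is a hypothetical placeholder):

corpus = load_italian_descriptions()  # hypothetical helper returning a list of strings
matrix, tf = get_TFIDFmatrix_vect(corpus, do_stemming=False)
print(matrix.shape)  # (n_documents, n_terms surviving the min_df/max_df pruning)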
Example No. 13
def preprocessData(documents):
    # initialize tokenizer, stop words stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stopWords = set(stopwords.words('italian'))
    stemmer = ItalianStemmer()
    texts = []
    # loop through document list
    for doc in documents:
        rawText = doc.lower()
        #tokenize
        tokens = tokenizer.tokenize(rawText)
        # remove stop words
        effectiveTokens = [t for t in tokens if t not in stopWords]
        # stemming
        stemmedTokens = [stemmer.stem(t) for t in effectiveTokens]
        # add tokens to list
        texts.append(stemmedTokens)
    return texts
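
A one-line usage sketch (same NLTK prerequisites as the query variant above):

texts = preprocessData(["Splendido attico con terrazzo panoramico", "Monolocale arredato vicino alla metro"])
# texts holds one list of stemmed tokens per input document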
def remove_step(doc):
    """
    takes as input the string of the document
    removes stopwords, punctuation and makes stemming 
    input:
    - string of document
    output:
    - list of term after stemming process
    
    """

    # check if it's a nan value

    if isinstance(doc, float):
        return str(doc)

    sp = string.punctuation + '“”–’°•€'

    doc = doc.replace("\\n", " ")
    # punctuations
    doc = [c if c not in sp else " " for c in doc]
    doc = ''.join(doc)
    # stopwords
    doc = [
        word for word in doc.split()
        if word.lower() not in stopwords.words('italian')
    ]
    doc = ' '.join(doc)

    # stemming
    ps = ItalianStemmer()
    words = word_tokenize(doc)

    w_lst = []
    for w in words:
        w_lst.append(ps.stem(w))

    # join the stems back into a single string

    return ' '.join(w_lst)
Example No. 15
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 stopwords=None,
                 punct=None,
                 lower=True,
                 strip=True,
                 Language='English'):
        self.lower = lower
        self.strip = strip
        self.punct = punct or set(string.punctuation)
        if Language == 'English':
            self.stopwords = stopwords or set(sw.words('english'))
            self.stemmer = PorterStemmer()
        elif Language == 'Italian':
            self.stopwords = stopwords or set(sw.words('italian'))
            self.stemmer = ItalianStemmer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):

        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Stem the token and yield
                lemma = self.stemmer.stem(token)
                yield lemma
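
A brief usage sketch (assumes scikit-learn's BaseEstimator/TransformerMixin, the NLTK imports used above — sent_tokenize, pos_tag, wordpunct_tokenize, stopwords as sw, the Snowball stemmers, string — and that the punkt and averaged_perceptron_tagger NLTK resources are downloaded):

prep = NLTKPreprocessor(Language='Italian')
print(prep.transform(["Questa è una bellissima casa nel centro di Roma."]))
# -> one list of stemmed, stopword-free tokens per input document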
Example No. 16
    def myTokenizer(self,
                    corpus,
                    no_numbers=True,
                    min_length=1,
                    stop_words_bool=False,
                    stemmer=False):
        tokens = []
        unique_words = set()
        replace = [
            '>', '<', '-', '|', '\\', '/', '^', '\n', '”', '“', '"', '’', '...'
        ]

        ita_stemmer = ItalianStemmer()

        for doc in corpus:
            if no_numbers:
                doc = re.sub(r'\d+', '', doc)

            for punct in string.punctuation:
                doc = doc.replace(punct, " ")

            for specialChar in replace:
                doc = doc.replace(specialChar, ' ')

            split_doc = [
                token.lower().strip() for token in doc.split(" ") if token
            ]

            split_doc = [word for word in split_doc if len(word) > min_length]

            if stemmer:
                split_doc = [ita_stemmer.stem(word) for word in split_doc]

            unique_words.update(set(split_doc))
            tokens.append(split_doc)
        return tokens, unique_words
def description_preproc(description):

    description = description.strip()
    description = description.replace("\n", " ")
    description = description.replace('\r', " ")
    description = description.replace('’', " ")

    sp = string.punctuation + '“”–’°•€'
    punctuation_remover = str.maketrans('', '', sp)

    description = description.split(' ')

    # removing punctuation
    description = [word.translate(punctuation_remover) for word in description]

    #removing empty spaces in the list
    description = filter(None, description)

    # Italian stemmer
    stemmer = ItalianStemmer()
    # stemmed list
    stemmed_list = [stemmer.stem(word) for word in description]

    return ' '.join(stemmed_list)
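
A one-line usage sketch (assumes string and ItalianStemmer are imported):

print(description_preproc("Luminoso trilocale ristrutturato, a due passi dal centro!"))
# -> space-joined Italian stems with punctuation stripped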
Example No. 18
def run_BM25_collection(output_dir,documents,queries,qrels,train,validation,test,k,language):
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    
    corpus = [] 
    doc_indexes = []
    for key,value in documents.items():
        doc_indexes.append(key)
        # stem the document and drop stop words so it matches the query-side preprocessing
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)
    
    print("Running BM25",flush=True)
    
    results = dict()
    for i,elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
        if i%1000==0:
            print('Processing query',i,'/',len(train),flush=True)
    save_BM25_res(output_dir+'/training/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv',results,qrels,True)
    
    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/validation/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv',results,qrels,False)
    
    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/test/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv',results,qrels,False)
Example No. 19
class StemmedTfidf(TfidfVectorizer):
    def __init__(
        self,
        input="content",
        encoding="utf-8",
        decode_error="ignore",
        strip_accents=None,
        lowercase=True,
        stop_file="stopwords_it.txt",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
    ):

        self.stemmer = ItalianStemmer()
        stopwords = [line.strip() for line in codecs.open(stop_file, "r", "utf-8")]

        super(StemmedTfidf, self).__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            analyzer=self.stemmed_analyzer,
            stop_words=stopwords,
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf,
        )

    def stemmed_analyzer(self, document):
        tokens = wordpunct_tokenize(self.decode(document))
        stopwords = self.get_stop_words()
        stems = [self.stemmer.stem(token) for token in tokens if token.isalpha() and token not in stopwords]
        return stems
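
A small sketch exercising the custom analyzer directly (assumes the module's imports — numpy as np, codecs, wordpunct_tokenize, ItalianStemmer, TfidfVectorizer — and a stopwords_it.txt file with one stop word per line):

vect = StemmedTfidf(stop_file="stopwords_it.txt")
print(vect.stemmed_analyzer("Vendesi appartamento luminoso in zona centrale"))
# -> alphabetic tokens not in the stop list, reduced to their Italian stems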
Example No. 20
class StemTokenizer(object):
    def __init__(self):
        self.stemmer = ItalianStemmer()

    def __call__(self, document):
        lemmas = []
        for t in word_tokenize(document, language='italian'):
            t = t.strip()  # strip leading/trailing whitespace
            lemma = self.stemmer.stem(t)  # Stemmer
            # filter stopwords
            if t not in stopwords:  #  and detect(t) == 'it' # to detect language
                if (len(lemma) > 2) and (len(lemma) < 16):
                    lemmas.append(lemma)
            # allow words in the whitelist
            if t in whitelist:
                lemmas.append(lemma)
        return lemmas
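
A minimal sketch of plugging the tokenizer into scikit-learn (assumes NLTK's punkt data is available; the module-level stopwords and whitelist collections the class reads are stubbed here for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set()   # hypothetical stand-in for the module-level stop word set
whitelist = set()   # hypothetical stand-in for the whitelist of words to always keep
vect = TfidfVectorizer(tokenizer=StemTokenizer())
X = vect.fit_transform(["Bellissima villa con ampio giardino e piscina privata"])
print(sorted(vect.vocabulary_))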
Example No. 21
    def __init__(
        self,
        input="content",
        encoding="utf-8",
        decode_error="ignore",
        strip_accents=None,
        lowercase=True,
        stop_file="stopwords_it.txt",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
    ):

        self.stemmer = ItalianStemmer()
        stopwords = [line.strip() for line in codecs.open(stop_file, "r", "utf-8")]

        super(StemmedTfidf, self).__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            analyzer=self.stemmed_analyzer,
            stop_words=stopwords,
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf,
        )
Example No. 22
def run_BM25_query(query,bm25,doc_indexes,k,language):
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key],doc_scores[key]] for key in top_k]
    return results
Example No. 23
import sys
import logging
import logging.config
logging.config.fileConfig('logging.conf')
logging.getLogger('chatbot')

import nltk
logging.info('nltk [imported]')

from nltk.stem.snowball import ItalianStemmer
stemmer = ItalianStemmer()
logging.info('ItalianStemmer [imported]')

# import our chat-bot intents file
import json
with open('assets/intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)
logging.info('intents [loaded]')

words = []
classes = []
documents = []
ignore_words = ['?', ',', '.']
# loop through each sentence in our intents patterns
logging.info('Loop on intents...')
for intent in intents['intents']:
    for pattern in intent['patterns']:
        logging.debug('Evaluate pattern: ' + pattern)
        # tokenize each word in the sentence
        w = nltk.word_tokenize(pattern)
        print('Tokenize words:', w)
Example No. 24
    def __init__(self):
        self.stemmer = ItalianStemmer()
Example No. 25
class AdCluster:
    def __init__(self):
        self.data_dir = "./data/"
        if not isdir(self.data_dir):
            os.mkdir(self.data_dir)

        self.info = None
        self.desc = None
        self.desc_index = None

        self.stemmer = ItalianStemmer()
        self.stop_words = set(stopwords.words('italian'))

        self.vocab = None
        self.documents = None
        self.inv_index = None  # no inverted index yet available
        self.idf = None  # no inverse document frequency yet available

        self.nltk_check_downloaded()

        self.url_base = "https://www.immobiliare.it"
        self.url_search = "/vendita-case/roma/?criterio=rilevanza&pag="
        try:
            html = requests.get(self.url_base + self.url_search + "1").content
            soup = BeautifulSoup(html, "html.parser")

            pag_number_list = soup.find("ul", class_="pagination pagination__number")
            self.max_pag_nr = int(pag_number_list.find_all("li")[-1].text)
        except requests.exceptions.ConnectionError:
            pass

    def load_data(self, info_fname, desc_fname, convert_to_tfidf=True, skip_scrape=False):
        info_file = self.data_dir + info_fname
        desc_file = self.data_dir + desc_fname
        info_exists = isfile(info_file)
        desc_exists = isfile(desc_file)
        if info_exists and desc_exists:
            info, desc = pd.read_csv(info_file, sep=",", index_col=None, header=None), \
                         pd.read_csv(desc_file, sep=",", index_col=None, header=None)
        elif not skip_scrape:
            info, desc = self.scrape_immobiliare()
        else:
            raise ValueError(f"No files present and 'skip_scrape'={skip_scrape}.")

        info.drop(columns=[0], inplace=True)
        info.columns = ['ID', 'Price', 'Rooms', 'Area', 'Bathrooms', 'Floor']
        desc.drop(columns=[0, 2], inplace=True)
        desc.columns = ['ID', 'Description']

        info.reset_index(drop=True, inplace=True)
        desc.reset_index(drop=True, inplace=True)
        info[info["Floor"] == "A"] = 12
        info[info["Floor"].isin(("R", "T"))] = 0
        info[info["Floor"] == "S"] = -1

        # remove duplicates
        info = info.loc[(-1 * info["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc = desc.loc[(-1 * desc["ID"].duplicated(keep=False) + 1).astype(bool)]

        desc_ids = desc["ID"]
        info_ids = info["ID"]
        info_corr = info_ids[info_ids.isin(desc_ids)]
        desc_corr = desc_ids[desc_ids.isin(info_ids)]
        rem_ids = pd.unique(pd.concat((info_corr, desc_corr)))

        info = info[info["ID"].isin(rem_ids)]
        desc = desc[desc["ID"].isin(rem_ids)]

        nans = lambda df: df.isnull().any(axis=1)  # handy func to find NANs

        nan_info = nans(info)
        nan_desc = nans(desc)
        # drop all ads where any of the two matrices encounter NANs
        info = info.drop(index=info.index[nan_info | nan_desc]).reset_index(drop=True)
        desc = desc.drop(index=desc.index[nan_info | nan_desc]).reset_index(drop=True)

        if convert_to_tfidf:
            desc = self.build_desc_matrix(desc)

        self.info = info
        self.desc = desc

        return info, desc

    @staticmethod
    def get_ad_from_url(url, parser):
        response = requests.get(url)

        html_soup = BeautifulSoup(response.text, parser)
        ad_containers = html_soup.find_all('p', class_='titolo text-primary')

        urls = []

        for container in ad_containers:
            if "/nuove_costruzioni/" not in container.a['href']:
                urls.append(container.a['href'])

        return urls

    @staticmethod
    def get_data(url):

        id = re.findall(r'(\d+)', url)[0]  # Get ad ID parsing the url

        response = requests.get(url)

        html_soup = BeautifulSoup(response.text, 'html.parser')
        data_container = html_soup.find('ul', class_='list-inline list-piped features__list')

        if data_container is not None:
            find = lambda itm: itm.find('div', class_='features__label')

            for item in data_container.children:

                # Locate rooms number
                found = find(item)
                if found:
                    if found.contents[0] == 'locali':
                        rooms = item.find('span', class_='text-bold').contents[0]
                        rooms = re.sub('[^A-Za-z0-9]+', '', rooms)

                    # Locate surface extension
                    elif found.contents[0] == 'superficie':
                        area = item.find('span', class_='text-bold').contents[0]
                        area = re.sub('[^A-Za-z0-9]+', '', area)

                    # Locate bathrooms number
                    elif found.contents[0] == 'bagni':
                        bathrooms = item.find('span', class_='text-bold').contents[0]
                        bathrooms = re.sub('[^A-Za-z0-9]+', '', bathrooms)

                    # Locate floor number
                    elif found.contents[0] == 'piano':
                        floor = item.find('abbr', class_='text-bold').contents[0]
                        floor = re.sub('[^A-Za-z0-9]+', '', floor)

                # Extract the description
                try:
                    cl = 'col-xs-12 description-text text-compressed'
                    description = html_soup.find('div', class_=cl).div.contents[0]
                    description = re.sub('[^a-zA-Z0-9-_*. ]', '', description)  # Remove special characters
                    description = description.lstrip(' ')  # Remove leading blank spaces
                except AttributeError:
                    return False

        try:
            return [[id, rooms, area, bathrooms, floor], [id, description]]
        except NameError:
            return False

    def scrape_immobiliare(self):
        row_info, row_desc, url_list = [], [], []

        try:
            import lxml
            parser = "lxml"
        except ImportError:
            parser = "html.parser"

        base_url = "https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="

        for i in tqdm(range(450)):
            url_list += self.get_ad_from_url(base_url + str(i), parser)

        for url in tqdm(url_list):

            print(url)

            # This while loop is needed to retry the request in case of connection error
            while True:
                try:
                    cont = self.get_data(url)
                    if cont:
                        # Convert list in dataframe
                        row_data = np.asarray(cont[0]).reshape(1, 5)
                        row_data = pd.DataFrame(data=row_data,
                                                columns=['ID', 'Rooms', 'Area', 'Bathrooms', 'Floor'])

                        # Append results to info dataframe
                        row_info.append(row_data)

                        # Convert list in dataframe
                        row_description = np.asarray(cont[1]).reshape(1, 2)
                        row_description = pd.DataFrame(data=row_description,
                                                       columns=['ID', 'Description'])

                        # Append results to description dataframe
                        row_desc.append(row_description)

                        # Create two csv files line by line
                        with open('data/data.csv', 'a') as f:
                            row_data.to_csv(f, header=False)
                        with open('data/description.csv', 'a') as f:
                            row_description.to_csv(f, header=False)

                # Wait a second in case of connection error and retry
                except ConnectionError:
                    print('Connection Error')
                    time.sleep(1)
                    continue
                break

        info = pd.concat(row_info)
        desc = pd.concat(row_desc)

        # remove duplicates
        info = info.loc[(-1 * info["ID"].duplicated(keep=False) + 1).astype(bool)]
        desc = desc.loc[(-1 * desc["ID"].duplicated(keep=False) + 1).astype(bool)]

        desc_ids = desc["ID"]
        info_ids = info["ID"]
        info_corr = info_ids[info_ids.isin(desc_ids)]
        desc_corr = desc_ids[desc_ids.isin(info_ids)]
        rem_ids = pd.unique(pd.concat((info_corr, desc_corr)))

        info = info[info["ID"].isin(rem_ids)]
        desc = desc[desc["ID"].isin(rem_ids)]

        nans = lambda df: df.isnull().any(axis=1)  # handy func to find NANs

        nan_info = nans(info)
        nan_desc = nans(desc)
        # drop all ads where any of the two matrices encounter NANs
        info = info.drop(index=info.index[nan_info | nan_desc])
        desc = desc.drop(index=desc.index[nan_info | nan_desc])

        return info, desc

    @timeit
    def build_desc_matrix(self, desc_df):
        self.desc_index = desc_df.index
        docs = desc_df["Description"]
        docs = self._process_docs(docs)
        self._build_invert_idx(docs, proc=False)

        # In the following, the one-hot-encoding of the relevant documents is computed
        # and its tfidf values stored in sparse matrix.
        col = []  # list of non zero column indices
        row = []  # list of non zero row indices
        data = []  # data of the non zero indices
        for d_nr, content in docs.items():
            for term in content:
                col.append(self.vocab.loc[term, "term_id"])
                row.append(d_nr)
                # find the tfidf (the data) of the term in this document
                found = False
                for termset in self.inv_index[term]:
                    if termset.docID == d_nr:
                        data.append(termset.tfidf)
                        found = True
                        break  # value found, no other termset needs to be found after
                if not found:
                    raise ValueError(f"Term {term} in document {d_nr} not found.")
        shape = len(docs), len(self.vocab)
        desc_sparse = sparse.csr_matrix((data, (row, col)), shape=shape, dtype=float)
        return desc_sparse

    def desc_sparse_to_dense(self, desc_sparse):
        if isinstance(desc_sparse, sparse.csr_matrix):
            return pd.DataFrame(desc_sparse.toarray(),
                                index=self.desc_index,
                                columns=self.vocab.index)
        else:
            return desc_sparse

    @staticmethod
    def cluster_kmeans_elbow(X, normalize_=False):
        if normalize_:
            X_clust = normalize(X)
        else:
            X_clust = X
        i = 0
        ks, fits, scores = [], [], []
        while True:
            new_range = [k for k in range(10 * i + 1, 10 * i + 11)]
            ks += new_range
            KM = [KMeans(n_clusters=i) for i in new_range]
            f = [km.fit(X_clust) for km in KM]
            fits += f
            scores += [km.inertia_ for km in f]
            plt.plot(ks, scores)
            plt.show()
            print("Choose number of clusters: ", end="")
            new_k = input()
            if new_k != "":
                try:
                    new_k = int(new_k)
                    if new_k > 0:
                        break
                except ValueError:
                    pass
            i += 1
        km_fit = fits[new_k-1]
        return km_fit

    def find_similar_clusters(self, clusters_info, clusters_desc):
        if self.info is None or self.desc is None:
            raise ValueError("Information and/or description dataframe not yet assigned.")

        labels_info = clusters_info.predict(self.info)
        labels_desc = clusters_desc.predict(self.desc)
        n_clusters_info = clusters_info.n_clusters
        n_clusters_desc = clusters_desc.n_clusters

        cluster_sim = heapdict.heapdict()
        for i in range(n_clusters_info):
            ind_info = np.where(labels_info == i)[0]
            for j in range(n_clusters_desc):
                ind_desc = np.where(labels_desc == j)[0]
                all_ind = np.concatenate((ind_info, ind_desc))
                intersec = 0
                if len(ind_info) < len(ind_desc):
                    for idx in ind_info:
                        if idx in ind_desc:
                            intersec += 1
                else:
                    for idx in ind_desc:
                        if idx in ind_info:
                            intersec += 1
                union = np.unique(all_ind)
                cluster_sim[i, j] = -intersec / len(union)

        return cluster_sim

    def top_words_clusters(self, data, labels, nr_top_k_words):
        top_data = data.apply(
            lambda x: pd.Series(x.sort_values(ascending=False).iloc[:nr_top_k_words].index,
                                index=[f"top{i}" for i in range(1, nr_top_k_words + 1)]),
            axis=1
        )
        _, desc_df = self.load_data("data.csv", "description.csv", convert_to_tfidf=False)
        desc_df = self._process_docs(desc_df["Description"], stem=False)

        top_data["cluster"] = labels
        top_data.sort_values(by=["cluster"], inplace=True)

        for cluster in pd.unique(top_data["cluster"]):
            d = top_data[top_data["cluster"] == cluster].drop(columns=["cluster"])
            freqs = dict()
            for x in d.itertuples():
                idx = x.Index
                actual_ad = desc_df[idx]
                words = x[1:]
                for word in words:
                    for act_w in actual_ad:
                        if self.stemmer.stem(act_w) == word:
                            actual_word = act_w
                            break
                    freqs[actual_word] = data.loc[idx, word]

            wordcloud = WordCloud(width=1600, height=800, background_color="white")
            wordcloud.generate_from_frequencies(freqs)
            plt.figure(num=None, figsize=(20, 10), facecolor='w', edgecolor='k')
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"Cluster {cluster} word-cloud of top {nr_top_k_words} words of each ad within cluster.\n"
                      f"The size of words corresponds to their TFIDF value.")
            plt.show()

        return top_data

    @timeit
    def _create_vocab(self, docs, proc=True):
        """
        Creates the vocabulary from the documents and writes it to "vocabulary.csv",
        with the word as index and its term id as column entry.
        :param docs: dict or pd.DataFrame, the collection of documents (only essential parts)
        :return: the vocabulary
        """

        fname = f"{self.data_dir}vocabulary.csv"
        if proc:
            docs = self._process_docs(docs)
        self.vocab = set()
        for doc in docs.values():
            self.vocab.update(doc)
        self.vocab = pd.DataFrame(pd.Series(np.arange(len(self.vocab)), index=self.vocab),
                                  columns=["term_id"])
        self.vocab.to_csv(fname)
        return self.vocab

    def _process_text(self, text, stem=True):
        """
        Remove special characters and superfluous whitespace from the text body. Also
        lower-cases the text, tokenizes it, drops stop words and stems the terms.
        :param text: str, the text to process.
        :return: generator, yields the processed words in iteration
        """
        if stem:
            stem_func = self.stemmer.stem
        else:
            stem_func = lambda x: x

        text = self.doc_to_string(text).lower()
        sub_re = r"[^A-Za-z']"
        text = re.sub(sub_re, " ", text)
        for i in word_tokenize(text):
            if i not in self.stop_words:
                w = stem_func(i)
                if len(w) > 1:
                    yield(w)

    def _process_docs(self, docs=None, stem=True):
        """
        Takes a collection of documents and processes them iteratively. The docs
        can be a pd.DataFrame, pd.Series or dictionary.
        :param docs: pd.DataFrame, pd.Series or dictionary
        :return: dict, indexed by doc number and lists (processed doc) as values
        """

        if isinstance(docs, pd.DataFrame):
            docs_generator = docs.iterrows()
        elif isinstance(docs, pd.Series):
            docs_generator = docs.items()
        elif isinstance(docs, dict):
            docs_generator = docs.items()
        else:
            raise ValueError("Container type has no handler.")

        d_out = dict()
        for docnr, doc in docs_generator:
            d_out[docnr] = list(self._process_text(doc, stem=stem))
        return d_out

    @staticmethod
    def nltk_check_downloaded():
        """
        Check the prerequisite NLTK tools, download if not found
        """

        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    @staticmethod
    def doc_to_string(doc):
        """
        Converts a document to a string. Can take a list, DataFrame, tuple to convert to str
        :param doc: iterable, container of the document
        :return: str, the to string converted document.
        """

        if isinstance(doc, str):
            return doc
        elif isinstance(doc, np.ndarray):
            doc = " ".join(list(map(str, doc.flatten())))
        elif isinstance(doc, (list, tuple)):
            doc = " ".join(doc)
        elif isinstance(doc, (pd.DataFrame, pd.Series)):
            doc = " ".join(list(map(str, doc.values.flatten())))
        else:
            raise ValueError(f"Can't convert file type {type(doc)} to string.")
        return doc

    @timeit
    def _build_invert_idx(self, docs=None, proc=False, read_fname="inverted_index.txt",
                          write_fname="inverted_index.txt", load_from_file=False):
        """
        Build the inverted index for the terms in a collection of documents. Will load a
        previously build inverted index from file if it detects the file existing (and
        param load_from_file is True).
        :param docs: pd.DataFrame/dict, collection of documents
        :param read_fname: str, filename of the inverted index txt to load. It must have been
                                written in the format this method uses.
        :param write_fname: str, filename to write the inverted index to.
        :param load_from_file: bool, load the index from the filename provided if True
        :return: dict, the inverted index with terms as keys and [TermSet(docID, tfidf),...]
                       as values.
        """

        if self.vocab is None:
            self._create_vocab(docs, proc=proc)
        file = f"{self.data_dir}{read_fname}"
        TermSet = namedtuple("TermSet", "docID tfidf")
        if isfile(file) and load_from_file:
            idf_dict = dict()
            inv_index = dict()
            with open(file, "r") as f:
                # load all the information from the file into memory
                for rowidx, line in enumerate(f):
                    if rowidx > 0:
                        term, idf_doclist = line.strip().split(":", 1)
                        idf, doclist = idf_doclist.split("|", 1)
                        idf_dict[term] = idf
                        doclist = list(map(lambda x: re.search(r"\d+,\s?(\d[.])?\d+", x).group().split(","),
                                           doclist.split(";")))
                        inv_index[term] = [TermSet(*list(map(float, docl))) for docl in doclist]
        else:
            # the final inverted index container, defaultdict, so that new terms
            # can be searched and get an empty list back
            inv_index = defaultdict(list)
            docs, idf_dict, term_freqs, doc_counters = self._build_idf(docs, proc)
            for docnr, doc in docs.items():
                # word, frequency pairs for this document
                freqs = doc_counters[docnr]
                for word, word_freq in freqs.items():
                    # nr of words in this document
                    n_terms = sum(freqs.values())
                    # store which document and frequency
                    inv_index[word].append(TermSet(docnr, word_freq / n_terms * idf_dict[word]))
            # write the built index to file
            with open(f"{self.data_dir}{write_fname}", "w") as f:
                f.write("Word: [Documents list]\n")
                for word, docs in inv_index.items():
                    docs = [(doc.docID, doc.tfidf) for doc in docs]
                    f.write(f"{word}: {idf_dict[word]} | {';'.join([str(doc) for doc in docs])}\n")
        self.inv_index = inv_index
        self.idf = idf_dict
        return inv_index

    @timeit
    def _build_idf(self, docs, proc=True):
        """
        Builds the IDF values for terms in docs.
        :param docs: dict/pd.DataFrame, the documents
        :return: tuple; a tuple of (docs_dict, idf_dict, termFrequencies_dict, docCounters_dict).
        The idf_dict contains the IDF value for each term in the documents.
        The termFrequencies_dict contains the global number of occurrences of each term in all the docs.
        The docCounters_dict contains the local number of occurrences of each term in the respective doc.
        """

        if proc:
            docs = self._process_docs(docs)
        nr_docs = len(docs)
        idf = defaultdict(lambda: np.log(len(docs) + 1))
        # dict to track nr of occurences of each term
        term_freqs = dict()
        # dict to store counter of words in each doc
        doc_counters = dict()
        for docnr, doc in docs.items():
            freqs = Counter(doc)
            doc_counters[docnr] = freqs
            for word in freqs.keys():
                if word in term_freqs:
                    term_freqs[word] += 1
                else:
                    term_freqs[word] = 1
        for word in self.vocab.index:
            # nr of documents with this term in it
            nr_d_with_term = term_freqs[word]
            # inverse document frequency for this term and this document
            idf[word] = np.log(float(nr_docs + 1) / (1 + nr_d_with_term))
        self.idf = idf
        return docs, idf, term_freqs, doc_counters
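
A hedged end-to-end sketch for the class above (scraping needs network access plus requests, BeautifulSoup and tqdm; with data.csv and description.csv already in ./data/ the scrape is skipped):

ac = AdCluster()
info, desc_tfidf = ac.load_data("data.csv", "description.csv", convert_to_tfidf=True)
km = ac.cluster_kmeans_elbow(desc_tfidf, normalize_=True)  # interactive elbow-based choice of k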
Example No. 26
    def stem_words(self, words):
        stemmer = ItalianStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        return stemmed_words
Example No. 27
class MyTokenizer:

    # Constructor
    def __init__(self,
                 no_numbers=True,
                 min_length=1,
                 clean_emoji=True,
                 stop_words_bool=True,
                 whitelist_stop_words=True,
                 stemmer=False):

        self.no_numbers = no_numbers
        self.min_length = min_length
        self.stemmer = stemmer
        self.ita_stemmer = ItalianStemmer()

        self.replace = [
            '#', '>', '_', '<', '-', '|', '\\', '/', '^', '\n', '”', '“', '"',
            '’', '‘', '€', '´', '.', '…'
        ]

        self.emoji = None
        self.clean_emoji = clean_emoji
        if self.clean_emoji:
            try:
                # UCS-4
                self.emoji = re.compile(u'[\U00010000-\U0010ffff]')
            except re.error:
                # UCS-2
                self.emoji = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

            self.pos_emoji = [
                '👍', '😀', '💪', '😎', '👌', '😁', '😃', '😄', '😊', '😋', '😍', '😻',
                '🤗', '👏🏻', '😘', '🎉', '💗', '🔝', '😉'
            ]
            self.neg_emoji = [
                '👎', '😒', '😖', '😠', '😡', '😤', '😨', '😱', '😳', '😬', '😞', '🤐',
                '😕', '😢'
            ]

        self.stop_words_bool = stop_words_bool
        if self.stop_words_bool:
            sw = StopWords(whitelist=whitelist_stop_words)
            sw_list = sw.getStopWords()

            stop_words_dict = defaultdict(lambda: -1)
            for i, word in enumerate(sw_list):
                stop_words_dict[word] = 1

            self.stop_words_dict = stop_words_dict

        return

    def __call__(self, doc):

        doc = re.sub(r'[hHtTpP]+[sS]?:[A-Za-z0-9-#_./]+', ' ', doc)

        if self.no_numbers:
            doc = re.sub(r'\d+', ' ', doc)

        for punct in string.punctuation:
            doc = doc.replace(punct, " ")

        for specialChar in self.replace:
            doc = doc.replace(specialChar, ' ')

        if self.clean_emoji:

            for specialEmoji in self.pos_emoji:
                doc = doc.replace(specialEmoji, ' positiveemoji ')

            for specialEmoji in self.neg_emoji:
                doc = doc.replace(specialEmoji, ' negativeemoji ')

            doc = self.emoji.sub(u' ', doc)

        split_doc = [
            token.lower().strip() for token in doc.split(" ") if token
        ]

        if self.stop_words_bool:
            split_doc = [
                word for word in split_doc if len(word) > self.min_length
                and len(word) < 16 and self.stop_words_dict[word] != 1
            ]
        else:
            split_doc = [
                word for word in split_doc
                if len(word) > self.min_length and len(word) < 16
            ]

        if self.stemmer:
            split_doc = [self.ita_stemmer.stem(word) for word in split_doc]

        return split_doc
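
A quick sketch (stop_words_bool=False sidesteps the custom StopWords helper; assumes re, string, collections.defaultdict and ItalianStemmer are imported):

tok = MyTokenizer(stop_words_bool=False, stemmer=True)
print(tok("Che bella giornata al mare! 😀 https://example.com"))
# -> lowercased Italian stems; URLs and digits are stripped, known emoji become marker tokens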
Example No. 28
import pickle
import json
import tflearn
import tensorflow as tf
import numpy as np
import re
import random

import nltk
from nltk.stem.snowball import ItalianStemmer

stemmer = ItalianStemmer()

data = pickle.load(open("training_data", "rb"))
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']

# import our chat-bot intents file
with open('intents.json') as json_data:
    intents = json.load(json_data)


def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(re.sub(r'[^\w\s]', ' ', sentence),
                                        language='italian')
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
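
A one-line sketch of the helper above (requires NLTK's punkt data for the Italian tokenizer):

print(clean_up_sentence("Ciao, vorrei prenotare un tavolo per due persone"))
# -> list of lowercased Italian stems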

Example No. 29
    def stem_words(self, words):
        stemmer = ItalianStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        return stemmed_words