Example #1
    def __math_garden_area(plot_surface, build_surface):

        # Only proceed when both raw values were supplied (non-empty, non-None)
        if plot_surface and build_surface:

            # Convert string to int
            plot_surface = Cleaner.string_to_int(plot_surface.strip())
            build_surface = Cleaner.string_to_int(build_surface.strip())

            # If both values were successfully converted:
            if plot_surface and build_surface:
                print(plot_surface - build_surface)
                return plot_surface - build_surface

        return None
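This snippet leans on a Cleaner.string_to_int helper that is not shown. A minimal sketch of what such a helper could look like, assuming it keeps only the digits of the input and falls back to an optional default (Example #3 passes one):

import re

class Cleaner:

    @staticmethod
    def string_to_int(value, default=None):
        # Keep only the digit characters and convert them to an int;
        # return `default` when nothing numeric is left. This is an
        # assumed implementation, not the project's actual helper.
        digits = re.sub(r"\D", "", value or "")
        return int(digits) if digits else default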
Example #2
    def compile(self):

        # Build the search token from the raw question, then geocode it
        self.token = Cleaner(self.questn).make_final_string()
        self.location = Geocoder(self.token).get_location()

        if self.location[0] == "OK":

            try:
                # Enrich the geocoded location with a Wikipedia summary and url
                self.wikiresult = WikiSearcher(self.location).geolookup()
                self.wikiurl = WikiSearcher(self.location).get_url()

                self.finalData = {
                    "status": self.location[0],
                    "lat": self.location[1][0],
                    "long": self.location[1][1],
                    "wikiresult": self.wikiresult,
                    "wikiurl": self.wikiurl,
                    "granpyMessage": random.choice(ok_res),
                }

            except IndexError:

                self.finalData = {
                    "status": "NOK",
                    "warningMessage": random.choice(no_res),
                }

        else:

            self.finalData = {
                "status": self.location[0],
                "warningMessage": random.choice(nok_res),
            }

        return self.finalData
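A hypothetical call site for this pipeline; the owning class name (Granpy below) and its constructor argument are assumptions, only compile() and the returned keys come from the snippet above:

bot = Granpy("Granpy quelle est l'adresse de la Tour Eiffel?")  # hypothetical class name
data = bot.compile()
if data["status"] == "OK":
    print(data["lat"], data["long"], data["wikiurl"], data["granpyMessage"])
else:
    print(data["warningMessage"])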
Example #3
        def search_col_xs_7(title, room_type):
            """
            Search for a given room type in a 'col-xs-7 info-name' field under 'More Info'
            and add its count to 'rooms_number'.
            """
            nonlocal rooms_number

            result = self.__get_text(
                self.__scrap_field_value('div', 'col-xs-7 info-name', title))
            if result:
                rooms_number += Cleaner.string_to_int(result, 1)
                has_found[room_type] = True
Example #4
    def clean(self, tfile):
        """
        Load and clean the raw data.

        Parameters
        ----------
        tfile: string
            path of the tab-separated data file to be processed

        Returns
        -------
        new_data: pd.DataFrame
            the cleaned dataframe
        """

        # load data
        data = pd.read_csv(
            tfile,
            sep='\t',
            header=None,
            names=['polarity', 'aspect', 'target', 'startend', 'message'])
        # clean the data
        cleaner = Cleaner()
        new_data = cleaner.remove_punctuation_dataframe(data)
        new_data = cleaner.remove_digits_dataframe(new_data)
        new_data = cleaner.lemmatization_dataframe(new_data)
        new_data = cleaner.lower_case(new_data)
        return new_data
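The dataframe helpers called above are not shown. A rough sketch of what they might do, assuming the cleaning targets the 'message' column (the column name is a guess) and leaving out the lemmatization step, which would need an NLP library:

import string
import pandas as pd

class Cleaner:

    def remove_punctuation_dataframe(self, df, column='message'):
        # Strip punctuation characters from the text column.
        table = str.maketrans('', '', string.punctuation)
        out = df.copy()
        out[column] = out[column].astype(str).str.translate(table)
        return out

    def remove_digits_dataframe(self, df, column='message'):
        # Drop digit characters from the text column.
        out = df.copy()
        out[column] = out[column].str.replace(r'\d+', '', regex=True)
        return out

    def lower_case(self, df, column='message'):
        # Lower-case the text column.
        out = df.copy()
        out[column] = out[column].str.lower()
        return out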
Example #5
    def start(self):

        # Grab the urls to scrape
        manager = Manager()
        manager.grabber(10)

        # Print starting message
        total_urls_number = len(manager.urls)
        print(f"[+] Scrapping phase started: 0/{total_urls_number}.")

        scrapped_urls = 0

        # Group the urls into pools of url_pool_size
        grouped_total_urls = self.grouper(manager.urls, self.url_pool_size)

        with ThreadPoolExecutor(max_workers=self.scrapper_workers) as executor:
            futures = [
                executor.submit(Manager.scrapper, urls)
                for urls in grouped_total_urls
            ]

            for entry in concurrent.futures.as_completed(futures):

                # Increment the scraped-url counter and print a status message
                scrapped_urls += self.url_pool_size
                print(
                    f"[i] Urls scrapped: {scrapped_urls}/{total_urls_number}.")

                Cleaner(entry.result()).clean()

        print(
            f"[i] Urls scrapped: {total_urls_number}/{total_urls_number} - Complete !"
        )

        # Merge the pickles
        Merger().merge()
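self.grouper is not defined in this snippet; a plausible implementation, following the standard itertools 'grouper' recipe, would be (the class name below is a stand-in):

from itertools import zip_longest

class Scraper:  # stand-in for the class that owns start()

    def grouper(self, iterable, n, fillvalue=None):
        # Split `iterable` into tuples of length `n`, padding the last
        # tuple with `fillvalue` when the items do not divide evenly.
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)

Note that this recipe pads the last pool with None, which Manager.scrapper would have to tolerate.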
Example #6
def split():
    """
    This endpoint expects:
        1 - a file with raw OCR text
        2 - an email address for the patient/submitter of pathology report
    :return: the JSON response returned by the downstream endpoint
    """
    PRODUCTION_ENDPOINT, TEST_ENDPOINT = get_endpoints()
    content = request.get_json()
    text, email = content['text'], content['email'].strip()

    # Remove PHI
    de_identified_text = filter_task(text, scispacy)

    # Clean text
    cleaner = Cleaner(de_identified_text)
    cleaned_text = cleaner.text

    # Finally preprocess
    preprocessor = SpacyPreProcessor(scispacy)
    text, tokens_list = preprocessor.preprocess_sentences(cleaned_text)
    m = {'text': text, 'tokens': tokens_list, 'email': email}
    response = requests.post(url=TEST_ENDPOINT, json=m)
    return response.json()
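A hypothetical client call for this endpoint; the '/split' route and the host/port are assumptions, only the 'text' and 'email' fields come from the snippet above:

import requests

payload = {
    "text": "raw OCR text of the pathology report",
    "email": "submitter@example.com",
}
response = requests.post("http://localhost:5000/split", json=payload)  # placeholder URL
print(response.json())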
Example #7
    #mongo_cols = {'acct_type','user_type','email_domain','venue_state','venue_name'}
    client = MongoClient()
    db = client[client_name]
    tab = db[tab_name]
    cursor = tab.find(None)  #mongo_cols)
    df = pd.DataFrame(list(cursor))
    return df


if __name__ == '__main__':
    # read data
    dataframe = get_data()
    # print(dataframe)
    # clean data
    y = dataframe['acct_type'].str.contains('fraud').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe, y, random_state=142)

    print('cleaning....')
    clean = Cleaner()
    clean.fit(X_train)
    X_train = clean.transform(X_train)
    X_test = clean.transform(X_test)

    print('Fitting....')
    # fit model
    gb = GBModel()
    gb.fit(X_train, y_train)

    print('score: {}'.format(gb.score(X_test, y_test)))
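The first snippet of this example is cut off before the def line of get_data; a hypothetical reconstruction, with placeholder default names for the database and collection:

import pandas as pd
from pymongo import MongoClient

def get_data(client_name='fraud_db', tab_name='events'):
    # Pull every document of the MongoDB collection into a DataFrame.
    # The default names here are placeholders, not the project's real ones.
    client = MongoClient()
    db = client[client_name]
    tab = db[tab_name]
    cursor = tab.find(None)
    return pd.DataFrame(list(cursor))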
Example #8
def test_get_question():
    sut = Cleaner("Granpy quelle est l'adresse de la Poste?")
    assert sut.question == "Granpy quelle est l'adresse de la Poste?"
    """ Turn question into all lowercases with lowercase method
Example #9
def test_final_string():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"
Example #10
def test_concatenate():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"
    """ Returns the final string """
Example #11
def test_token():
    sut = Cleaner("Granpy quelle est l'adresse de la Poste?")
    assert sut.make_final_string() == "poste"
    """ Takes words from wordlist and adds a + in between """
Example #12
def test_turn_string_to_wordlist():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"
    """ Clean worldlist from unwanted commun words with stopword method """
Example #13
def test_turn_lowercase():
    sut = Cleaner("Granpy quelle est l'adresse de la POSTE?")
    assert sut.make_final_string() == "poste"
    """ Turn question (string) in a world list with wordlist method