Example #1
    def regex_detection(self, sentence, full_text=None, offset=0):
        """Detect entities with regexes in a sentence.

        Keyword arguments:
        sentence -- a sentence in plain text
        full_text -- full document text, used to check proximity conditions (default None)
        offset -- offset of the sentence within full_text (default 0)

        """
        # list to store broad detections that are not confirmed by a strict match
        unconsolidated_broad_list = []

        result_broad_list = self._detect_regexp(sentence, "broad")
        strict_list = copy.deepcopy(self._detect_regexp(sentence, "strict"))

        consolidated_list = [clean_text(regexp[0]) for regexp in strict_list]

        for _broad_regexp in result_broad_list:
            if clean_text(_broad_regexp[0]) not in consolidated_list:
                unconsolidated_broad_list.append(_broad_regexp)

        # check proximity conditions of broad regexp detections
        # If this is not initialized to [], results are duplicated
        consolidated_broad_list = self._check_proximity_conditions(
            unconsolidated_broad_list, full_text, offset)

        # Validate
        validate_list, strict_list, consolidated_broad_list = self._validate(
            strict_list, consolidated_broad_list)

        unconsolidated_broad_list = self._remove_unconsolidated_matches(
            consolidated_broad_list, unconsolidated_broad_list)

        return strict_list, consolidated_broad_list, unconsolidated_broad_list, validate_list
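A hedged usage sketch, not from the original source: the method returns four lists of matches, and a caller that only wants confirmed detections might combine the strict and consolidated broad lists. The names detector, document and sentence_start are assumptions.

strict, broad, unconsolidated, validated = detector.regex_detection(
    sentence, full_text=document, offset=sentence_start)
candidates = strict + broad  # strict matches plus broad matches kept by the proximity checks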
Example #2
    def predict(self, text):
        # Clean, tokenize, and pad/truncate to the model's fixed sequence length
        words = self.tokenizer.preprocess(clean_text(text))
        if len(words) < self.seq_length:
            words += ['<pad>'] * (self.seq_length - len(words))
        elif len(words) > self.seq_length:
            words = words[:self.seq_length]
        # Map words to vocabulary indices and add a batch dimension
        tokens = [self.tokenizer.vocab.stoi[word] for word in words]
        tokens = torch.LongTensor(tokens).unsqueeze(0)
        # All-padding baseline used as the attribution reference
        reference_tokens = self.token_reference.generate_reference(
            self.seq_length, device='cpu').unsqueeze(0)
        # Forward pass: class scores and the predicted label index
        pred = self.model(tokens)
        plabel = int(torch.argmax(pred, 1))
        pred = pred.tolist()[0]

        # Indices and words of the non-padding tokens
        unpad_index = [
            idx for idx, word in enumerate(words) if word != '<pad>'
        ]
        unpad_words = [word for word in words if word != '<pad>']
        # Token-level attributions computed for every output class
        attributions = []
        for label in range(len(pred)):
            attributions.append(
                list(
                    self.attribute(tokens, reference_tokens, label,
                                   unpad_index)))
        return unpad_words, pred, plabel, attributions
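A usage sketch under assumptions not stated in the snippet: the object (called clf here, hypothetically) holds a fitted torchtext-style tokenizer, a fixed seq_length, a trained model, and a Captum token_reference, and self.attribute is assumed to return one score per unpadded token.

words, probs, label, attributions = clf.predict("some raw input text")
print(label)                                   # index of the predicted class
print(list(zip(words, attributions[label])))   # token-level attributions for that class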
Example #3
    def _validate_list(self, validate_list, _list):
        new_list = []
        for regexp in _list:
            ent = clean_text(regexp[0])
            # print("ent: " + ent)
            if self._func_validate(ent):
                # print("\tent: " + ent + " is valid!")
                validate_list.append(regexp)
            else:
                # print("\tent: " + ent + " is NOT valid!")
                new_list.append(regexp)
        return validate_list, new_list
Example #4
def parse_table(soup):
    table = {}
    name = clean_text(soup.find('tbody').find('tr').text)
    table[name] = []
    if 'No Record Found' in soup.text:
        return {}
    headers = [
        clean_text(x.text)
        for x in soup.find('tbody').find_all('tr')[1].find_all('td')
    ]
    table[name].append(headers)
    for row in soup.find_all('tbody')[1].find_all('tr'):
        if 'Total' in row.text:
            # Skip the row that gives the total
            continue
        values = [clean_text(x.text) for x in row.find_all('td')]
        if len(values) < len(headers):
            # HACK: assume that if there are fewer columns than expected,
            # the left-hand cells are merged
            values.insert(0, table[name][1][0])
        table[name].append(values)
    return table
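A minimal illustration (hypothetical HTML, assuming clean_text only trims whitespace on these already-clean strings) of the layout parse_table expects: the first tbody carries the table name row and the header row, the second tbody carries the data rows, and any "Total" row is skipped.

from bs4 import BeautifulSoup

html = """
<table>
  <tbody>
    <tr><td>Account Summary For Current Schemes</td></tr>
    <tr><td>Scheme Name</td><td>Units</td><td>Value</td></tr>
  </tbody>
  <tbody>
    <tr><td>Scheme A</td><td>10.0</td><td>100.0</td></tr>
    <tr><td>Total</td><td>10.0</td><td>100.0</td></tr>
  </tbody>
</table>
"""
print(parse_table(BeautifulSoup(html, 'html.parser')))
# {'Account Summary For Current Schemes': [['Scheme Name', 'Units', 'Value'],
#                                          ['Scheme A', '10.0', '100.0']]}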
Example #5
def train_classifiers(data_path, save_path, multithreading, lang='eng'):

    # Read data from provided location
    dataframe = pd.read_csv(data_path, header=0, sep=";")
    dataframe = dataframe.dropna()

    print("Cleaning text...", end="")
    dataframe['text'] = dataframe['text'].map(
        lambda x: utils.clean_text(x, lang))
    print("Done")

    # Train/test split
    train, test, label_train, label_test = utils.create_train_test(
        dataframe["text"], dataframe["label"])

    # TF-IDF bag-of-words extraction. These lines extract features from the
    # texts based on word frequencies across the corpus
    print("Creating TFIDF data")
    unwanted = stopwords.words(lang)
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 ngram_range=(1, 3),
                                 max_features=3000,
                                 stop_words=unwanted,
                                 max_df=0.5,
                                 min_df=3)
    tfidf_vect.fit(train)

    # Save the fitted vectorizer model
    models_dir = os.path.join(save_path, "Models")
    os.makedirs(models_dir, exist_ok=True)
    save = os.path.join(models_dir, "vectorizer.pkl")
    with open(save, 'wb') as pklfile:
        pkl.dump(tfidf_vect, pklfile)

    train_bow = tfidf_vect.transform(train)
    test_bow = tfidf_vect.transform(test)

    print("Beginning supervised training...")
    if multithreading:
        parallel_pipeline(train_bow, test_bow, label_train, label_test,
                          save_path)
    else:
        classifiers_pipeline(train_bow, test_bow, label_train, label_test,
                             save_path)
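A hypothetical invocation, assuming a semicolon-separated CSV with "text" and "label" columns; the fitted vectorizer is pickled into a Models folder inside save_path. Note that lang is passed both to utils.clean_text and to nltk.corpus.stopwords.words, and the NLTK stopwords corpus expects full names such as 'english'.

train_classifiers("data/dataset.csv", "output", multithreading=False, lang="english")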
Example #6
    def test_clean_text(self):
        processed_test = clean_text("http://test.site @mention")
        self.assertEqual(processed_test, " ")
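The snippet does not show clean_text itself; a minimal sketch consistent with this test (strip URLs and @mentions, leave the remaining whitespace), not the project's actual implementation, might look like this:

import re

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)  # drop URLs
    text = re.sub(r'@\w+', '', text)          # drop @mentions
    return text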
Example #7
def main(args) -> dict:
    """Execute the command.

    :param args: parsed command-line arguments
    :return: scheme data keyed by scheme name
    """
    output = {}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://cra-nsdl.com/CRA/',
    }
    sensor = NpsSensor('finance/nps', URLS['base'], headers=headers,
                       creds=True)

    @retry(CaptchaError, tries=5, delay=5)
    def login():
        sensor.download_captcha(URLS['login'])
        captcha = sensor.solve_captcha()
        logger.info(f"Captcha = {captcha}")
        login_url = URLS['login'] + ';' + sensor.session.cookies['JSESSIONID']
        data = {
            'userID': sensor.credentials['username'],
            'password': sensor.credentials['password'],
            'subCaptchaVal': captcha,
        }
        # Log in
        sensor.post(url=login_url, data=data)
        if "Your Password has expired." in sensor.response.text:
            raise LoginError("The Password has expired")
        sensor.dump_html('login-out.html')
        if sensor.soup.find('div', {'class': 'login-tab'}):
            # sensor.dump_html('login-error.html')
            # if 'Please enter correct captcha code' in sensor.response.text:
            raise CaptchaError("Captcha was not validated")
        logger.info("Success!!")

    logger.info("Logging in now ..")
    login()
    if "Welcome Subscriber" not in sensor.soup.text:
        sensor.dump_html("login-error.html")
        raise LoginError("Login did not work")
    # sensor.dump_html('login-success.html')
    account_id = sensor.get_id()
    sensor.get(URLS['account_details'].format(id=account_id))
    # sensor.dump_html('account.html')
    # sensor.read_html('account.html')

    def parse_table(soup):
        table = {}
        name = clean_text(soup.find('tbody').find('tr').text)
        table[name] = []
        if 'No Record Found' in soup.text:
            return {}
        headers = [
            clean_text(x.text)
            for x in soup.find('tbody').find_all('tr')[1].find_all('td')
        ]
        table[name].append(headers)
        for row in soup.find_all('tbody')[1].find_all('tr'):
            if 'Total' in row.text:
                # Skip the row that gives the total
                continue
            values = [clean_text(x.text) for x in row.find_all('td')]
            if len(values) < len(headers):
                # HACK: assume that if there are fewer columns than expected,
                # the left-hand cells are merged
                values.insert(0, table[name][1][0])
            table[name].append(values)
        return table

    rawdata = {}
    for t in sensor.soup.find_all('table', {'class': 'table-newnorow'}):
        table_data = parse_table(t)
        rawdata.update(table_data)

    prefs = list_of_lists_to_dict(rawdata['Current Scheme Preference'],
                                  "Scheme Details")
    summary = list_of_lists_to_dict(
        rawdata['Account Summary For Current Schemes'], "Scheme Name")

    date = clean_text(sensor.soup.find(id='stddate').span.text)
    pran = clean_text(sensor.soup.find(id='pranno').text)
    for scheme, scheme_data in summary.items():
        scheme_data['Percentage'] = prefs[scheme]['Percentage']
        scheme_data['Date'] = date
        scheme_data['PRAN Number'] = pran
        output[scheme] = scheme_data

    logger.info("Logging off ..")
    sensor.get(URLS['logoff'].format(id=sensor.get_id()))
    return output
Example #8
# Dropping N/A's
df_randomized = df_randomized.dropna()

n_features = 1000
tf_vectorizer = TfidfVectorizer(stop_words='english',
                                max_df=0.98,
                                min_df=3,
                                max_features=n_features)
cleaned_texts = []
wnl = WordNetLemmatizer()
for text in list(df_randomized.loc[:, 'A/P']):
    # Keep only nouns (NN) and adjectives (JJ), clean them, then lemmatize each word
    blob = TextBlob(str(text))
    good_words = [word for word, tag in blob.tags if tag in ('NN', 'JJ')]
    good_words = utils.clean_text(' '.join(good_words))
    good_words = [wnl.lemmatize(word) for word in good_words.split(' ')]
    cleaned_texts.append(' '.join(good_words))
tf = tf_vectorizer.fit_transform(cleaned_texts)


# Objective for Bayesian optimization: x = [loss, penalty, alpha].
# Returns the negative mean macro-F1 over 5-fold CV so that minimizing it maximizes F1.
def optimize(x):
    clf = SGDClassifier(loss=x[0], penalty=x[1], alpha=x[2], random_state=0)
    scores = cross_val_score(clf,
                             tf.toarray(),
                             df_randomized['target'],
                             cv=5,
                             scoring='f1_macro')
    return -1 * scores.mean()
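A sketch of how this objective might be minimized with scikit-optimize; the optimizer is an assumption, since the original does not name one. gp_minimize passes each candidate as a list, which matches the x[0], x[1], x[2] indexing above; the loss names depend on the scikit-learn version ('log' in older releases, 'log_loss' in newer ones).

from skopt import gp_minimize
from skopt.space import Categorical, Real

search_space = [
    Categorical(['hinge', 'log_loss', 'modified_huber']),  # loss
    Categorical(['l2', 'l1', 'elasticnet']),               # penalty
    Real(1e-6, 1e-2, prior='log-uniform'),                 # alpha
]
result = gp_minimize(optimize, search_space, n_calls=30, random_state=0)
print(result.x, -result.fun)  # best hyperparameters and their mean macro-F1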
Example #9
def clean_entity(text):
    return clean_text(text)