Example #1
    def __init__(self, config):
        self.config = config
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.bert_wwm_model = BertForMaskedLM.from_pretrained(
            self.config["Model"]["bert_wwm_ext_chinese"])
        self.bert_wwm_model.eval()
        self.bert_wwm_model = self.bert_wwm_model.to(self.device)
        self.bert_wwm_tokenizer = BertTokenizer.from_pretrained(
            self.config["Model"]["bert_wwm_ext_chinese"])
        self.bert_base_model = BertForMaskedLM.from_pretrained(
            self.config["Model"]["bert_base_chinese"])
        self.bert_base_model.eval()
        self.bert_base_model = self.bert_base_model.to(self.device)
        self.bert_base_tokenizer = BertTokenizer.from_pretrained(
            self.config["Model"]["bert_base_chinese"])
        self.utils = Utils(self.config)
        self.dict_trie = self.utils.loadDictionaryTrie(
            self.config["Data"]["dictionary"], True)
        self.pinyin = self.utils.loadPinYin(self.config["Data"]["pinyin"])
        self.stroke = self.utils.loadStroke(self.config["Data"]["stroke"])
        self.place = self.utils.loadPlace(self.config["Data"]["place"])
        self.person = self.utils.loadPerson(self.config["Data"]["person"])
        self.ner_model = NER(self.config["Model"]["ner"], self.pinyin,
                             self.stroke, self.place, self.person,
                             self.config["Data"]["ssc"])
        self.charSet = self.utils.loadCharSet(
            self.config['Data']['common_char_set'])
        self.customConfusionDict = self.utils.loadCustomConfusion(
            self.config['Data']['confusion'])
        self.ngram_model = NGRAM(self.config["Model"]["ngram"])
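For reference, the constructor above only reads nested keys from its config mapping, so a minimal config dict would look roughly like the sketch below. The key names mirror the lookups in __init__; every path value is a placeholder, not the project's real configuration.

# Illustrative only: keys taken from the lookups above, values are placeholders.
config = {
    "Model": {
        "bert_wwm_ext_chinese": "path/to/bert-wwm-ext-chinese",
        "bert_base_chinese": "path/to/bert-base-chinese",
        "ner": "path/to/ner-model",
        "ngram": "path/to/ngram-model",
    },
    "Data": {
        "dictionary": "path/to/dictionary.txt",
        "pinyin": "path/to/pinyin.txt",
        "stroke": "path/to/stroke.txt",
        "place": "path/to/place.txt",
        "person": "path/to/person.txt",
        "ssc": "path/to/ssc.txt",
        "common_char_set": "path/to/common_char_set.txt",
        "confusion": "path/to/confusion.txt",
    },
}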
Example #2
class Aggregate:
    # Variables
    __file = ''
    __delimiter = ''
    __col_name = ''
    log = cl.customLogs()

    def __init__(self):
        self.utils = Utils()
        pass

    def __initiate_vars(self,file_path, delimiter):
        """
        Creates the Data Frame objects and returns back in array
        :param file_path:
        :param delimiter:
        :return:
        """
        df_pandas = pd.read_csv(file_path, delimiter=delimiter)
        rzt_data = RZTData(cntx.experiment)
        df_rzt = rzt_data.read({
            'path': file_path,
            'delimiter': delimiter,
            'encoding': 'utf-8'
        })

        return [df_pandas, df_rzt]

    def validate_count(self, file_path, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        count_pd = len(data_frames[0])
        count_rzt = data_frames[1].count()
        if self.utils.verifyArrayMatch(actual=count_rzt, expected=count_pd):
            self.log.info("Verified the row count matches in both Data Frames")
        else:
            self.log.error("Row count match failed")

    def validate_minima(self, file_path, delimiter=',', col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        min_pd = data_frames[0][col_name].min()
        min_rzt = data_frames[1][col_name].min()
        if self.utils.verifyArrayMatch(actual=min_rzt, expected=min_pd):
            self.log.info("Verified the Minima matches in both Data Frames on column name : {}".format(col_name).upper())
        else:
            self.log.error("Minima does not match")

    def validate_maxima(self, file_path, delimiter=',', col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        max_pd = data_frames[0][col_name].max()
        max_rzt = data_frames[1][col_name].max()
        if self.utils.verifyArrayMatch(actual=max_rzt, expected=max_pd):
            self.log.info(
                "Verified the Maxima matches in both Data Frames on column name : {}".format(col_name).upper())
        else:
            self.log.error("Maxima does not match")
Example #3
    def get_scan_status(scan_id):
        scan_status_from_db = get_scan_status_by_id(scan_id)

        if scan_status_from_db:
            current_status = Utils.convert_scan_status(
                scan_status_from_db[0][0])
        else:  # Check if task is cached for 20 minutes
            if is_cached(scan_id):
                current_status = Utils.convert_scan_status("PENDING")
            else:
                current_status = Utils.convert_scan_status("NOT_FOUND")
        return current_status
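The helper takes only a scan_id, so (assuming it is exposed as a static method on its class, which the missing self suggests) a caller would use it roughly as sketched below; the id is a placeholder.

# Illustrative only: the scan id is a placeholder and the enclosing class is
# assumed to expose get_scan_status as a @staticmethod.
status = get_scan_status("1234-abcd")
print(status)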
Example #4
    def _get_db_conn(self):
        '''
        Retrieve a database connection using environment variables
        :return: db_conn: connection client to the database
        '''
        # host='AQICAHhMz1Wf7tTGilIGlJZDb3LaCAXhS5a/i+jj8GeI1TQa6gHQuQO7vW3uvEq6M6eSJZRWAAAAljCBkwYJKoZIhvcNAQcGoIGFMIGCAgEAMH0GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM/5hbtKGk/bpYfcDpAgEQgFCGmwoxqJoKriZ3QRQLKH/OjtdJhQhzKN7AeRey2snliLdcN401RUu/WfN74QMOmC5dQPF/Qa6tp2MkJFj2nGG0sT7BPiCGvpZcI6/+yxO/7Q=='
        # user='******'
        # password='******'
        utils = Utils()
        db_conn = mysql.connector.connect(
            host=utils.decrypt(os.environ.get('db_url')),
            user=utils.decrypt(os.environ.get('db_user')),
            password=utils.decrypt(os.environ.get('db_password')),
            # host=utils.decrypt(host),
            # user=utils.decrypt(user),
            # password=utils.decrypt(password),
            database=self.database)

        return db_conn
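Once _get_db_conn returns, the result is an ordinary mysql.connector connection. A minimal consumption sketch follows; the query and table name are purely illustrative, and it would have to live in a method of the same class since _get_db_conn relies on self.database.

# Minimal sketch; table_name is a placeholder.
db_conn = self._get_db_conn()
cursor = db_conn.cursor()
cursor.execute("SELECT COUNT(*) FROM table_name")
row_count = cursor.fetchone()[0]
cursor.close()
db_conn.close()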
Example #5
    def validate_usedecimals(usedecimals):
        """
        """
        try:
            """Validate and parse useDecimals"""
            decimals = ['false', 'true']
            if usedecimals:
                if usedecimals.lower() in decimals:
                    usedecimals = usedecimals.lower()
                else:
                    usedecimals = 'error'
            else:
                usedecimals = 'false'
        except:
            '''Get the stack trace and print it'''
            err = traceback.format_exc()
            logger.error(err)
            utils = Utils()
            utils.send_err_email_process(
                err, 'parameters do not satisfy the expected values')

        finally:
            return usedecimals
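The validator above normalises the flag and falls back to 'false' when nothing is passed. A behavior sketch follows, assuming it is exposed as a static method on Extract, as the handler in Example #10 suggests.

# Behavior sketch for validate_usedecimals (values follow the branches above).
print(Extract.validate_usedecimals("true"))   # -> 'true'
print(Extract.validate_usedecimals(None))     # -> 'false' (default)
print(Extract.validate_usedecimals("maybe"))  # -> 'error'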
Example #6
    def validate_granularity(granularity):
        """
        """
        try:
            """Validate and parse granularity"""
            granular_opt = ['hourly', 'daily']
            if granularity:
                if granularity.lower() in granular_opt:
                    granularity = granularity.upper()
                else:
                    granularity = 'error'
            else:
                granularity = 'DAILY'

        except:
            '''Get the stack trace and print it'''
            err = traceback.format_exc()
            logger.error(err)
            utils = Utils()
            utils.send_err_email_process(
                err, 'parameters do not satisfy the expected values')

        finally:
            return granularity
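validate_granularity follows the same pattern but upper-cases valid values and defaults to 'DAILY'; again assuming a static method on Extract.

# Behavior sketch for validate_granularity (values follow the branches above).
print(Extract.validate_granularity("hourly"))  # -> 'HOURLY'
print(Extract.validate_granularity(None))      # -> 'DAILY' (default)
print(Extract.validate_granularity("weekly"))  # -> 'error'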
Example #7
class ScreenDetails(BasePage):

    log = logging.getLogger('flipkartSearch.pdp')
    com_utils = Utils()

    _purchase_item_pdp_title = (By.XPATH, "//h1/span[starts-with(@class,'_')]")
    pdp_item_cost = (By.XPATH, "(//div[contains(text(),'₹')])[1]")
    _add_to_basket = (
        By.XPATH,
        "//div[@class='row']/button[contains(text(),'ADD TO BASKET')]")
    _add_to_cart = (By.XPATH, "//ul/li/button")
    _cart_icon = (By.XPATH, "//a/span[contains(text(),'Cart')]")

    def get_pdp_item_title_cost(self):
        """
        To return the cost of the item from PDP
        """
        purchase_item_pdp_cost = self.getText(self.pdp_item_cost,
                                              info="Item Price value")
        print("PDP cost value :: " + str(purchase_item_pdp_cost))
        return purchase_item_pdp_cost

    def add_navigate_cart(self):
        """
        Add to basket and Navigate to cart
        """
        basket_elem = self.getElement(self._add_to_basket,
                                      elementName="Add to Basket button")
        if basket_elem is None:
            # cart_elem = self.getElement(self._add_to_cart, elementName="Add to Cart button")
            # self.javascript_execute("arguments[0].scrollIntoView();", cart_elem)
            self.elementClick(self._add_to_cart,
                              elementName="Add to Cart button")
            time.sleep(2)
        else:
            # self.javascript_execute("arguments[0].scrollIntoView();", basket_elem)
            self.elementClick(self._add_to_basket,
                              elementName="Add to Basket button")
            time.sleep(2)

        # self.elementClick(self._add_to_basket, elementName="Add to Basket button")
        # time.sleep(2)
        # self.elementClick(self._add_to_cart, elementName="Add to Cart button")
        # time.sleep(2)
        self.elementClick(self._cart_icon, elementName="Cart icon")
        time.sleep(2)
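In a test, the page object above would typically be driven as sketched below. The constructor signature is an assumption (BasePage subclasses commonly take the WebDriver), and the driver setup is not shown.

# Hypothetical test flow; the ScreenDetails(driver) signature is assumed.
pdp = ScreenDetails(driver)
cost = pdp.get_pdp_item_title_cost()
pdp.add_navigate_cart()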
Example #8
class ZaiLaGan():
    # Initialize config, device, model, tokenizer, and utilities
    def __init__(self, config):
        self.config = config
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.bert_wwm_model = BertForMaskedLM.from_pretrained(
            self.config["Model"]["bert_wwm_ext_chinese"])
        self.bert_wwm_model.eval()
        self.bert_wwm_model = self.bert_wwm_model.to(self.device)
        self.bert_wwm_tokenizer = BertTokenizer.from_pretrained(
            self.config["Model"]["bert_wwm_ext_chinese"])
        self.bert_base_model = BertForMaskedLM.from_pretrained(
            self.config["Model"]["bert_base_chinese"])
        self.bert_base_model.eval()
        self.bert_base_model = self.bert_base_model.to(self.device)
        self.bert_base_tokenizer = BertTokenizer.from_pretrained(
            self.config["Model"]["bert_base_chinese"])
        self.utils = Utils(self.config)
        self.dict_trie = self.utils.loadDictionaryTrie(
            self.config["Data"]["dictionary"], True)
        self.pinyin = self.utils.loadPinYin(self.config["Data"]["pinyin"])
        self.stroke = self.utils.loadStroke(self.config["Data"]["stroke"])
        self.place = self.utils.loadPlace(self.config["Data"]["place"])
        self.person = self.utils.loadPerson(self.config["Data"]["person"])
        self.ner_model = NER(self.config["Model"]["ner"], self.pinyin,
                             self.stroke, self.place, self.person,
                             self.config["Data"]["ssc"])
        self.charSet = self.utils.loadCharSet(
            self.config['Data']['common_char_set'])
        self.customConfusionDict = self.utils.loadCustomConfusion(
            self.config['Data']['confusion'])
        self.ngram_model = NGRAM(self.config["Model"]["ngram"])
        #self.GEC = grammarErrorCorrector(self.config["Data"]["label_map"], self.config["Model"]["ngram"], self.config["Model"]["pos_model"])

    # Detect named-entities and return their corrections & positions
    def detectNamedEntity(self, sentences: List[str],
                          task_name: str) -> List[Tuple[str, List[int]]]:
        return self.ner_model.check_ner(sentences, task_name)

    # Detect potential spelling errors in a given sentence/paragraph and return detected error positions & top predictions from BERT
    def detectSpellingError(
            self, text: str, threshold: float,
            topk: int) -> Tuple[List[int], Dict[int, List[str]]]:
        positions = []
        predictions = {}
        # Mask each word and predict it
        for i in range(len(text)):
            # Check if current word is a chinese character
            if (not self.utils.isChineseChar(text[i])):
                continue
            # Add mask
            masked_text = "[CLS]" + text[:i] + "[MASK]" + text[i +
                                                               1:] + "[SEP]"
            # Tokenize input text
            tokenized_masked_text = self.bert_wwm_tokenizer.tokenize(
                masked_text)
            masked_token_index = tokenized_masked_text.index("[MASK]")
            # Construct token ids and segment ids
            token_ids = torch.tensor([
                self.bert_wwm_tokenizer.convert_tokens_to_ids(
                    tokenized_masked_text)
            ])
            segment_ids = torch.tensor([[0] * token_ids.shape[1]])
            # Set up ids on GPU
            token_ids = token_ids.to(self.device)
            segment_ids = segment_ids.to(self.device)
            # Predict masked token
            with torch.no_grad():
                outputs = self.bert_wwm_model(token_ids,
                                              token_type_ids=segment_ids)
                scores = outputs[0][0, masked_token_index]
                # Classify the token as a potential spelling error if predicted probability is lower than given threshold
                token_probability = torch.nn.Softmax(0)(scores)[
                    self.bert_wwm_tokenizer.convert_tokens_to_ids(text[i])]
                if (token_probability < threshold):
                    # Extract top predictions from BERT
                    token_scores, token_indices = scores.topk(topk)
                    top_predicted_tokens = self.bert_wwm_tokenizer.convert_ids_to_tokens(
                        token_indices)
                    positions.append(i)
                    predictions[i] = top_predicted_tokens
        return (positions, predictions)

    # Give top n suggestions of spelling error correction
    def correctSpellingError(
            self, text: str, err_positions: Set[int],
            predictions: Dict[int, List[str]], ne_positions: Set[int],
            candidate_num: int,
            similar_bonus: float) -> List[Tuple[str, int, float]]:
        # Initialize a dictionary to record starting positions of potentially correct tokens/words
        starting_positions = {}
        # Add original tokens
        for i in range(len(text)):
            token = text[i]
            # Separate all tokens/words from tokens/words that are similar in stroke or pinyin
            starting_positions[i] = (set(token), set(token))
        # Add similar tokens in stroke or pinyin
        for err_position in err_positions:
            # Check if the error token is included in a named-entity
            if (err_position in ne_positions):
                continue
            else:
                error_token = text[err_position]
                ssc_target = self.ner_model.ssc.getSoundCode(error_token)
                if (error_token in self.stroke):
                    for similar_token in self.stroke[error_token][:3]:
                        starting_positions[err_position][0].add(similar_token)
                        starting_positions[err_position][1].add(similar_token)
                if (error_token in self.pinyin):
                    for similar_token in self.pinyin[error_token][:7]:
                        starting_positions[err_position][0].add(similar_token)
                        starting_positions[err_position][1].add(similar_token)
                for predicted_token in predictions[err_position]:
                    # Check if BERT's prediction is a chinese character
                    if (len(predicted_token) == 1
                            and self.utils.isChineseChar(predicted_token)):
                        starting_positions[err_position][0].add(
                            predicted_token)
                        ssc_pred = self.ner_model.ssc.getSoundCode(
                            predicted_token)
                        ssc_score = self.ner_model.ssc.computeSoundCodeSimilarity(
                            ssc_target, ssc_pred)
                        if ssc_score >= 0.7:
                            starting_positions[err_position][1].add(
                                predicted_token)
        # Construct candidate sentences
        candidates = []
        prefixes = list(starting_positions[0][0])
        # Initialize counts of tokens/words that are similar in stroke or pinyin
        for i in range(len(prefixes)):
            if (prefixes[i] in starting_positions[0][1]):
                prefixes[i] = (prefixes[i], 1)
            else:
                prefixes[i] = (prefixes[i], 0)
        while (len(prefixes) > 0):
            prefix = prefixes.pop(0)
            if (len(prefix[0]) == len(text)):
                candidates.append((prefix[0], prefix[1],
                                   self.ngram_model.get_ppl(prefix[0])))
            else:
                for suffix in starting_positions[len(prefix[0])][0]:
                    if (suffix in starting_positions[len(prefix[0])][1]):
                        prefixes.append((prefix[0] + suffix, prefix[1] + 1))
                    else:
                        prefixes.append((prefix[0] + suffix, prefix[1]))
        # Sort candidate sentences by perplexities from ngram model
        candidates.sort(key=lambda x: x[2])
        # Compute top candidate sentences' perplexities again with GPT2 and sort
        candidates = candidates[:50]
        for i in range(len(candidates)):
            candidates[i] = (candidates[i][0], candidates[i][1],
                             self.utils.getSentencePpl(candidates[i][0]))
        candidates.sort(key=lambda x: x[2])
        # Extract top n suggestions
        recommendations = []
        for i in range(min(len(candidates), candidate_num)):
            recommendations.append(candidates[i])
        # Take counts of tokens/words that are similar in stroke or pinyin into consideration and sort again
        for i in range(len(recommendations)):
            recommendations[i] = (recommendations[i][0], recommendations[i][1],
                                  recommendations[i][2] /
                                  pow(similar_bonus, recommendations[i][1]))
        recommendations.sort(key=lambda x: x[2])
        return recommendations

    # Find spelling error correction candidates from dictionary
    def generate_correction_cand(self, word):
        correction_candidates = []
        if len(word) == 1:
            # Add similar tokens in pinyin
            confusion_word_set = set()
            for char in self.charSet:
                if lazy_pinyin(char) == lazy_pinyin(word):
                    confusion_word_set.add(char)
            confusion_word_set = confusion_word_set.union(
                set(self.pinyin[word]))
            correction_candidates.extend(confusion_word_set)
            # Add similar tokens in stroke
            correction_candidates.extend(self.stroke[word])

        if len(word) > 1:
            edit_cand = set()
            word_splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
            transposes = [
                L + R[1] + R[0] + R[2:] for L, R in word_splits if len(R) > 1
            ]
            replaces = [
                L + c + R[1:] for L, R in word_splits if R
                for c in self.charSet
            ]
            edit_set = set(transposes + replaces)
            for edit in edit_set:
                if self.dict_trie.getWordFreq(edit) > 0:
                    edit_cand.add(edit)
            correction_candidates.extend(edit_cand)

            confusion_word_set = set()
            if word in self.customConfusionDict:
                confusion_word_set = {self.customConfusionDict[word]}
            correction_candidates.extend(confusion_word_set)

            if len(word) == 2:
                # Add similar tokens in pinyin
                correction_candidates.extend(
                    set(ele + word[1:] for ele in self.pinyin[word[0]] if ele))
                correction_candidates.extend(
                    set(word[:-1] + ele for ele in self.pinyin[word[-1]]
                        if ele))

            if len(word) > 2:
                correction_candidates.extend(
                    set(word[0] + ele + word[2:]
                        for ele in self.pinyin[word[1]] if ele))
                correction_candidates.extend(
                    set(ele + word[-1] for ele in self.pinyin[word[1]] if ele))
                correction_candidates.extend(
                    set(word[0] + ele for ele in self.pinyin[word[1]] if ele))

        return correction_candidates

    # Detect and correct spelling errors given input text
    def bertDetectAndCorrect(self, text: str, topk: int,
                             ner_pos_list: List[int]) -> Tuple[str, List[int]]:
        positions = []
        text_list = list(text)
        # split input text into short texts
        re_punctuation = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&]+)", re.U)
        short_texts = []
        components = re_punctuation.split(text)
        components = list(filter(('').__ne__, components))
        start_idx = 0
        for comp in components:
            if re_punctuation.match(comp):
                short_texts.append((comp, start_idx))
                start_idx += len(comp)
                start_idx += 1
        # character-based detection and correction
        for (short_text, start_idx) in short_texts:
            for idx, single_word in enumerate(short_text):
                if self.utils.isChineseChar(single_word):
                    if start_idx + idx not in ner_pos_list:
                        # bert-based model generates topk candidates
                        masked_text = "[CLS]" + text[:idx] + "[MASK]" + text[
                            idx + 1:] + "[SEP]"
                        tokenized_masked_text = self.bert_base_tokenizer.tokenize(
                            masked_text)
                        token_ids = torch.tensor([
                            self.bert_base_tokenizer.convert_tokens_to_ids(
                                tokenized_masked_text)
                        ])
                        segment_ids = torch.tensor([[0] * token_ids.shape[1]])
                        token_ids = token_ids.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        with torch.no_grad():
                            outputs = self.bert_base_model(
                                token_ids, token_type_ids=segment_ids)
                            scores = outputs[0][0, idx + 1]
                            token_probability = torch.nn.Softmax(0)(scores)[
                                self.bert_base_tokenizer.convert_tokens_to_ids(
                                    text[idx])]
                            scores_list = torch.nn.Softmax(0)(scores)
                            _, pred = scores_list.topk(topk, 0, True, True)
                            topk_bert_candidates = [
                                self.bert_base_tokenizer.convert_ids_to_tokens(
                                    ele.item()) for ele in pred
                            ]

                        if topk_bert_candidates and (
                                single_word not in topk_bert_candidates):
                            candidates = self.generate_correction_cand(
                                short_text[idx])
                            candidates_sorted = sorted(
                                candidates,
                                key=lambda k: self.dict_trie.getWordFreq(k),
                                reverse=True)
                            if candidates_sorted:
                                for topk_bert_cand in topk_bert_candidates:
                                    if topk_bert_cand in candidates_sorted:
                                        #print(['- '+single_word, '+ '+topk_bert_cand + '_'+str(start_idx+idx)])
                                        text_list[start_idx +
                                                  idx] = topk_bert_cand
                                        positions.append(start_idx + idx)
                                        single_word = topk_bert_cand
                                        break

        # word-based detection and correction
        for (short_text, start_idx) in short_texts:
            for n in [2, 3, 4, 5]:
                for idx in range(len(short_text) - n + 1):
                    if not ner_pos_list or (
                            ner_pos_list and
                        (start_idx + idx > ner_pos_list[-1]
                         or start_idx + idx + n < ner_pos_list[0])):
                        if self.utils.isChineseChar(short_text[idx]):
                            word = short_text[idx:idx + n]
                            # bert-based model generates topk candidates
                            masked_text = "[CLS]" + text[:
                                                         idx] + "[MASK]" + text[
                                                             idx +
                                                             1:] + "[SEP]"
                            tokenized_masked_text = self.bert_base_tokenizer.tokenize(
                                masked_text)
                            token_ids = torch.tensor([
                                self.bert_base_tokenizer.convert_tokens_to_ids(
                                    tokenized_masked_text)
                            ])
                            segment_ids = torch.tensor([[0] *
                                                        token_ids.shape[1]])
                            token_ids = token_ids.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            with torch.no_grad():
                                outputs = self.bert_base_model(
                                    token_ids, token_type_ids=segment_ids)
                                scores = outputs[0][0, idx + 1]
                                token_probability = torch.nn.Softmax(0)(
                                    scores)[self.bert_base_tokenizer.
                                            convert_tokens_to_ids(text[idx])]
                                scores_list = torch.nn.Softmax(0)(scores)
                                _, pred = scores_list.topk(topk, 0, True, True)
                                topk_bert_candidates = [
                                    self.bert_base_tokenizer.
                                    convert_ids_to_tokens(ele.item())
                                    for ele in pred
                                ]

                            candidates = self.generate_correction_cand(word)
                            candidates = [
                                ele for ele in candidates
                                if self.dict_trie.getWordFreq(ele) > 0
                            ]
                            if candidates:
                                for topk_bert_cand in topk_bert_candidates:
                                    tmp_word = topk_bert_cand + word[1:]
                                    if tmp_word in candidates and tmp_word != word:
                                        #print(['- '+short_text[idx], '+ '+topk_bert_cand + '_'+str(start_idx+idx)])
                                        text_list[start_idx +
                                                  idx] = topk_bert_cand
                                        positions.append(start_idx + idx)
                                        break
        # return corrected string and error position list
        return (''.join(text_list), sorted(list(set(positions))))

    # Divide a long text into multiple parts and correct spelling errors separately
    def divideAndCorrectSpellingError(self, text: str) -> Tuple[str, str]:
        # Perform named-entity recognition first
        ner_processed_text, ne_positions, _ = self.detectNamedEntity(
            [text], 'correction')[0]
        ne_positions = set(ne_positions)
        # Detect spelling errors
        err_positions, bert_predictions = self.detectSpellingError(
            ner_processed_text, 1e-5, 3)
        err_positions = set(err_positions)
        # Split long text into multiple parts
        punctuations = {"。", "?", "!", ",", "、", ";", ":"}
        splitted_text = []
        sub_ne_positions, sub_err_positions = set(), set()
        sub_bert_predictions = {}
        start = 0
        count = 0
        for i in range(len(ner_processed_text)):
            # Check if current character is included in a named-entity or is an error
            if (i in ne_positions):
                sub_ne_positions.add(i - start)
            if (i in err_positions):
                sub_err_positions.add(i - start)
                sub_bert_predictions[i - start] = bert_predictions[i]
            # Check if current character is a punctuation
            if (ner_processed_text[i] in punctuations):
                count += 1
            # Check if a short text has been completed
            if (count == 2):
                splitted_text.append(
                    (ner_processed_text[start:i + 1], sub_err_positions,
                     sub_bert_predictions, sub_ne_positions))
                sub_ne_positions, sub_err_positions = set(), set()
                sub_bert_predictions = {}
                start = i + 1
                count = 0
            elif (i == len(ner_processed_text) - 1):
                splitted_text.append(
                    (ner_processed_text[start:], sub_err_positions,
                     sub_bert_predictions, sub_ne_positions))
        # Correct spelling errors in each short text and combine corrected results
        corrections = []
        for short_text in splitted_text:
            correction = self.correctSpellingError(short_text[0],
                                                   short_text[1],
                                                   short_text[2],
                                                   short_text[3], 10,
                                                   1.5)[0][0]
            corrections.append(correction)
        return (ner_processed_text, "".join(corrections))

    # Get substitution words
    def getWordSub(self, text):
        res = os.popen(
            "conda run -n wordSub python ./utilities/wordSubJob.py " +
            text).read()
        dic = ast.literal_eval(res)
        return dic
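Putting the ZaiLaGan pieces together, a caller would normally load a config, detect errors, then ask for ranked corrections. The sketch below is hypothetical: config is assumed to be loaded elsewhere with the "Model"/"Data" keys referenced in __init__, the sentence is sample text, and the thresholds reuse the values seen in divideAndCorrectSpellingError.

# Hypothetical end-to-end usage of the class above.
zlg = ZaiLaGan(config)
text = "今天天气很好"  # sample input only

# Named-entity pass, then BERT-based spelling-error detection.
ner_text, ne_positions, _ = zlg.detectNamedEntity([text], "correction")[0]
err_positions, predictions = zlg.detectSpellingError(ner_text, threshold=1e-5, topk=3)

# Ranked correction candidates (candidate_num=10, similar_bonus=1.5, as in
# divideAndCorrectSpellingError).
suggestions = zlg.correctSpellingError(ner_text, set(err_positions), predictions,
                                       set(ne_positions), 10, 1.5)
print(suggestions[0][0])  # best-scoring corrected sentence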
Example #9
    def run(self, tdids, granularity, usedecimals):
        '''
        Function that controls the extraction of data from the Aislelabs endpoint
        :return: No value to return
        '''

        logger.info('Initiating run')
        '''Initiate the Utils'''
        utils = Utils()
        '''Retrieve values from config file'''
        config = utils.get_yaml_config()
        baseurl = config['aislelabs']['baseurl']
        extension = config['aislelabs']['domain']
        bucket = config['aws']['bucket']
        database = config['status_table']['database']
        table = config['status_table']['table']
        '''Get the URL'''
        url = utils.concatenate_url(baseurl, extension)
        '''Retrieve values from environment vars'''
        sender = os.environ['sender']
        recipients = os.environ['recipients'].split(',')
        region = os.environ['aws_region']
        # sender = "*****@*****.**"
        # recipients = "*****@*****.**"
        # region = 'use-east-1'
        '''Get the api key stored in environment variables and decrypt it'''
        apikey = utils.decrypt(os.environ.get('apikey'))
        #apikey = '3c06767b873c483fc6295fbc7bc421e1'
        '''Get the request parameters and send the request to the Aislelabs endpoint'''
        try:
            """Set the query datetimes"""
            ts1 = int(self.lambda_start_time) / 1000
            ts2 = int(self.lambda_end_time) / 1000
            query_start_date = datetime.fromtimestamp(ts1).strftime(
                '%Y-%m-%d %H:%M:%S')
            query_end_date = datetime.fromtimestamp(ts2).strftime(
                '%Y-%m-%d %H:%M:%S')
            """Executing HTTP GET request"""
            request_parameters = self._craft_request(self.lambda_start_time,
                                                     self.lambda_end_time,
                                                     apikey, tdids,
                                                     granularity, usedecimals)
            response_json = self._make_request(url, request_parameters)

            filename_ts = self.lambda_start_time
            filename_end_ts = self.lambda_end_time
            '''Create the filename and upload the JSON response to S3 if possible'''
            if granularity.lower() == 'hourly':
                filename = '{}/{}/{}-{}-{}.json'.format(
                    'aislelabs', 'hourly-unfiltered-traffic', 'hourly',
                    filename_ts, filename_end_ts)
            else:
                filename = '{}/{}/{}-{}-{}.json'.format(
                    'aislelabs', 'daily-unfiltered-traffic', 'daily',
                    filename_ts, filename_end_ts)

            if response_json:
                logger.info(
                    'Uploading the file to S3 with the following filepath: {}/{}'
                    .format(bucket, filename))
                utils.json_to_s3(bucket, filename, response_json)
                '''Create a dict to pass to the Load class to send the status to RDS'''
                dict_to_db = {
                    'filename': filename,
                    'tdid': tdids,
                    'query_start_date': query_start_date,
                    'query_end_date': query_end_date,
                    'workflow_step_rds': 1,
                    'date_created':
                    datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                    'date_updated':
                    datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                    'processing_complete_rds': False,
                    'error_rds': False,
                }

                load = Load(database, table)

                load.send_status_to_db(dict_to_db)
            else:
                logger.warning(
                    "We did not receive a successful response back from the endpoint. No file will be uploaded to S3"
                )

                dict_to_db = {
                    'filename': filename,
                    'tdid': tdids,
                    'query_start_date': query_start_date,
                    'query_end_date': query_end_date,
                    'workflow_step_rds': 1,
                    'date_created':
                    datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                    'date_updated':
                    datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                    'processing_complete_rds': True,
                    'error_rds': False,
                }

                load = Load(database, table)

                load.send_status_to_db(dict_to_db)
        except:
            '''Get the stack trace and print it'''
            err = traceback.format_exc()
            logger.error(err)
            utils.send_err_email(err, 'Data retrieval from Aislelabs', sender,
                                 recipients, filename_ts, region)
            '''If we get an error, we still want to send a record to the DB for tracking'''
            dict_to_db = {
                'filename': filename,
                'tdid': tdids,
                'query_start_date': query_start_date,
                'query_end_date': query_end_date,
                'workflow_step_rds': 1,
                'date_created':
                datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                'date_updated':
                datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                'processing_complete_rds': False,
                'error_rds': True,
            }

            load = Load(database, table)

            load.send_status_to_db(dict_to_db)
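run is normally invoked from the Lambda handler (Example #10 below), but it can be exercised directly once the expected environment variables and YAML config are in place. The epochs and tdids below are placeholders; the constructor signature is inferred from the handler's call.

# Hypothetical direct invocation with placeholder values.
extract = Extract("1609459200000", "1609545600000")
extract.run(tdids="td-123", granularity="DAILY", usedecimals="false")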
Example #10
def lambda_handler(event, context):
    '''
    Function called by the Lambda trigger. It delegates work to other classes and functions so the handler itself stays lightweight.
    :param event: Event passed in by the lambda trigger
    :type event: dict
    :param context: Context passed in by the lambda trigger
    :return: No value to return
    '''

    try:
        """Initial process flow"""
        param = event['queryStringParameters']
        if param:
            """Validate parameters"""
            if "granularity" in param:
                granularity = Extract.validate_granularity(
                    param['granularity'])
            else:
                granularity = 'DAILY'

            if "usedecimals" in param:
                usedecimals = Extract.validate_usedecimals(
                    param['usedecimals'])
            else:
                usedecimals = 'false'

            assert granularity.lower() in ['hourly',
                                           'daily'], "Please validate"
            assert usedecimals.lower() in ['false', 'true'], "Please validate"
            """Read the main parameters"""
            lambda_start_time = param['startEpoch']
            lambda_end_time = param['endEpoch']
            tdids = param['tdids']
            Extract(lambda_start_time,
                    lambda_end_time).run(tdids, granularity, usedecimals)
            return {
                "statusCode":
                200,
                "body":
                json.dumps('Data retrieved successfully from {} to {}!'.format(
                    lambda_start_time, lambda_end_time))
            }

        else:
            return {
                "statusCode":
                400,
                "body":
                json.dumps(
                    'The request has been sent without parameters, please try again!'
                )
            }

    except Exception as err:
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        utils = Utils()
        utils.send_err_email_process(
            err, 'parameters do not satisfy the expected values')
        logger.error('Process failed due to an initial parameters error')

        return {
            "statusCode":
            400,
            "body":
            json.dumps(
                'There is a request error on the query parameters, please validate the request and try again.'
            )
        }
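For a quick local check, lambda_handler can be fed a dict shaped like an API Gateway proxy event. Only the queryStringParameters the handler actually reads are shown; all values are placeholders.

# Illustrative event; epochs and tdids are placeholders.
event = {
    "queryStringParameters": {
        "granularity": "hourly",
        "usedecimals": "true",
        "startEpoch": "1609459200000",
        "endEpoch": "1609545600000",
        "tdids": "td-123",
    }
}
print(lambda_handler(event, None))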
Example #11
    def __init__(self):
        self.utils = Utils()
        pass
Example #12
class StringFunctions:
    log = cl.customLogs()

    def __init__(self):
        self.utils = Utils()
        pass

    def __initiate_vars(self, file_path, delimiter):
        """
        Creates the Data Frame objects and returns back in array
        :param file_path:
        :param delimiter:
        :return:
        """
        df_pandas = pd.read_csv(file_path, delimiter=delimiter)
        rzt_data = RZTData(cntx.experiment)
        df_rzt = rzt_data.read({
            'path': file_path,
            'delimiter': delimiter,
            'encoding': 'utf-8'
        })

        return [df_pandas, df_rzt]

    def compareDataFrame(self, actual, expected, fnc='', index_col=1):
        actual.to_csv("Test.csv")
        actual_tmp = pd.read_csv("Test.csv", index_col=index_col)
        expected.to_csv("Test1.csv")
        expected_tmp = pd.read_csv("Test1.csv", index_col=index_col)

        if self.utils.verifyDFMatch(actual=actual_tmp,
                                    expected=expected_tmp) is None:
            self.log.info("Verified the DF Matches for {}() function".format(
                fnc).upper())
        else:
            self.log.error(
                "DF match failed for {}() function".format(fnc).upper())

    def validate_asc_sort(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        asc_pd = data_frames[0].sort_values(by=col_name, ascending=True)
        asc_rzt = data_frames[1].asc_sort(col_name)

        self.compareDataFrame(actual=asc_rzt, expected=asc_pd, fnc="ascending")

    def validate_desc_sort(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        dsc_pd = data_frames[0].sort_values(by=col_name, ascending=False)
        dsc_rzt = data_frames[1].desc_sort(col_name)

        self.compareDataFrame(actual=dsc_rzt,
                              expected=dsc_pd,
                              fnc="descending")

    def validate_to_upper(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        to_upper_rzt = data_frames[1].to_upper(col_name)
        to_upper_pd = data_frames[0]
        to_upper_pd[col_name] = to_upper_pd[col_name].str.upper()

        self.compareDataFrame(actual=to_upper_rzt,
                              expected=to_upper_pd,
                              fnc="to_upper")

    def validate_to_lower(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        to_lower_rzt = data_frames[1].to_lower(col_name)
        to_lower_pd = data_frames[0]
        to_lower_pd[col_name] = to_lower_pd[col_name].str.lower()

        self.compareDataFrame(actual=to_lower_rzt,
                              expected=to_lower_pd,
                              fnc="to_lower")

    def validate_to_title(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        to_title_rzt = data_frames[1].to_titlecase(col_name)
        to_title_pd = data_frames[0]
        to_title_pd[col_name] = to_title_pd[col_name].str.title()

        self.compareDataFrame(actual=to_title_rzt,
                              expected=to_title_pd,
                              fnc="to_title")

    def validate_trim(self, file_path, col_name, delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        trim_rzt = data_frames[1].trim(col_name)
        trim_pd = data_frames[0]
        trim_pd[col_name] = trim_pd[col_name].str.strip()

        self.compareDataFrame(actual=trim_rzt, expected=trim_pd, fnc="trim")

    def validate_format_date(self,
                             file_path,
                             col_name,
                             new_format='%d/%m/%Y',
                             delimiter=','):
        data_frames = self.__initiate_vars(file_path, delimiter)
        format_rzt = data_frames[1].format_date(key=col_name,
                                                destinationformat=new_format)
        format_pd = data_frames[0]
        format_pd[col_name] = pd.to_datetime(format_pd[col_name],
                                             format=new_format)

        self.compareDataFrame(actual=format_rzt,
                              expected=format_pd,
                              fnc="format_date")
Example #13
class Mathematical:
    # Variables
    __file = ''
    __delimiter = ''
    __col_name = ''
    log = cl.customLogs()

    def __init__(self):
        self.utils = Utils()
        pass

    def __initiate_vars(self, file_path, delimiter):
        """
        Creates the Data Frame objects and returns back in array
        :param file_path:
        :param delimiter:
        :return:
        """
        df_pandas = pd.read_csv(file_path, delimiter=delimiter)
        rzt_data = RZTData(cntx.experiment)
        df_rzt = rzt_data.read({
            'path': file_path,
            'delimiter': delimiter,
            'encoding': 'utf-8'
        })

        return [df_pandas, df_rzt]

    def compareDataFrame(self, actual, expected, fnc='', index_col=1):
        actual.to_csv("Test.csv")
        actual_tmp = pd.read_csv("Test.csv", index_col=index_col)
        expected.to_csv("Test1.csv")
        expected_tmp = pd.read_csv("Test1.csv", index_col=index_col)

        if self.utils.verifyDFMatch(actual=actual_tmp,
                                    expected=expected_tmp) is None:
            self.log.info("Verified the DF Matches for {}() function".format(
                fnc).upper())
        else:
            self.log.error(
                "DF match failed for {}() function".format(fnc).upper())

    def validate_sqrt(self,
                      file_path,
                      col_name,
                      delimiter=',',
                      new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            sqrt_rzt = data_frames[1].sqrt(col_name)
            sqrt_pd = data_frames[0][col_name]**0.5

        else:
            sqrt_rzt = data_frames[1].sqrt(col_name, new_col_name)
            sqrt_pd = data_frames[0]
            sqrt_pd[new_col_name] = sqrt_pd[col_name]**0.5

        self.compareDataFrame(actual=sqrt_rzt, expected=sqrt_pd, fnc="sqrt")

    def validate_inverse(self,
                         file_path,
                         col_name,
                         delimiter=',',
                         new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            inverse_rzt = data_frames[1].inverse(col_name)
            inverse_pd = data_frames[0]
            inverse_pd[col_name] = 1 / inverse_pd[col_name]
        else:
            inverse_rzt = data_frames[1].inverse(col_name, new_col_name)
            inverse_pd = data_frames[0]
            inverse_pd[new_col_name] = 1 / inverse_pd[col_name]
        self.compareDataFrame(actual=inverse_rzt,
                              expected=inverse_pd,
                              fnc="inverse")

    def validate_power(self,
                       file_path,
                       col_name,
                       exponent,
                       delimiter=',',
                       new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            power_rzt = data_frames[1].pow(col_name, exponent=exponent)
            power_pd = data_frames[0]
            power_pd[col_name] = power_pd[col_name]**exponent
        else:
            power_rzt = data_frames[1].pow(col_name,
                                           new_col_name,
                                           exponent=exponent)
            power_pd = data_frames[0]
            power_pd[new_col_name] = power_pd[col_name]**exponent
        self.compareDataFrame(actual=power_rzt, expected=power_pd, fnc="power")

    def validate_log2(self,
                      file_path,
                      col_name,
                      delimiter=',',
                      new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            log_rzt = data_frames[1].log2(col_name)
            log_pd = data_frames[0]
            log_pd[col_name] = np.log2(log_pd[col_name])
        else:
            log_rzt = data_frames[1].log2(col_name, new_col_name)
            log_pd = data_frames[0]
            log_pd[new_col_name] = np.log2(log_pd[col_name])
        self.compareDataFrame(actual=log_rzt, expected=log_pd, fnc="log2")

    def validate_log10(self,
                       file_path,
                       col_name,
                       delimiter=',',
                       new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            log_rzt = data_frames[1].log10(col_name)
            log_pd = data_frames[0]
            log_pd[col_name] = np.log10(log_pd[col_name])
        else:
            log_rzt = data_frames[1].log10(col_name, new_col_name)
            log_pd = data_frames[0]
            log_pd[new_col_name] = np.log10(log_pd[col_name])
        self.compareDataFrame(actual=log_rzt, expected=log_pd, fnc="log10")

    def validate_log(self,
                     file_path,
                     col_name,
                     base=10,
                     delimiter=',',
                     new_col_name=''):
        data_frames = self.__initiate_vars(file_path, delimiter)
        if not new_col_name:
            log_rzt = data_frames[1].log(col_name, base=base)
            log_pd = data_frames[0]
            log_pd[col_name] = np.log(log_pd[col_name]) / np.log(base)
        else:
            log_rzt = data_frames[1].log(col_name, new_col_name, base=base)
            log_pd = data_frames[0]
            log_pd[new_col_name] = np.log(log_pd[col_name]) / np.log(base)
        self.compareDataFrame(actual=log_rzt, expected=log_pd, fnc="log")
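And a corresponding driver for the numeric checks, again with placeholder file and column names.

# Hypothetical usage; "sample.csv" and "amount" are placeholders.
m = Mathematical()
m.validate_sqrt("sample.csv", col_name="amount", new_col_name="amount_sqrt")
m.validate_power("sample.csv", col_name="amount", exponent=2, new_col_name="amount_sq")
m.validate_log("sample.csv", col_name="amount", base=10)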
Example #14
class PageHome(BasePage):

    com_utils = Utils()
    email_value = os.environ.get("flip_username")
    password_value = os.environ.get("flip_password")

    _signIn_user_name = (By.XPATH, "(//input[@type='text'])[last()]")
    _signIn_password = (By.CSS_SELECTOR, "input[type='password']")
    _signIn_login = (By.XPATH, "(//button[@type='submit'])[last()]")
    _signIn_userName_val = '//div[contains(text(),"{0}")]'
    username_elem = None
    _search_results_index_path = None
    _signIn_searchBox = (By.CSS_SELECTOR, "input[type='text'][name='q']")
    _search_icon = (By.CSS_SELECTOR, "button[type='submit']")
    _results_breadcrumb = "//div[@class='bhgxx2']//following::span[contains(text(),'{}')]"
    _search_results_index = "(//div[contains(@class,'bhgxx2 col-12-12')]/div/child::div[@data-id])[{}]//following::a"
    _purchase_item_pdp_title = "//h1/span[contains(text(), '{}')]"

    def invoke_url(self, url):
        """
        Open the given URL and wait for the page to load
        """
        self.open(url)
        pageLoaded = self.page_loaded()
        return pageLoaded

    def signIn_enter_emailaddress(self):
        self.clearTextVals(self._signIn_user_name, "Email/Mobile Number field")
        self.sendKeys(self.email_value, self._signIn_user_name, "Email/Mobile Number text field")

    def signIn_enter_password(self):
        self.clearTextVals(self._signIn_password, "Password field")
        self.sendKeys(self.password_value, self._signIn_password, "Password text field")

    def signIn_click_login(self, userName_val):
        self.elementClick(self._signIn_login, elementName="Login button")
        self._signIn_userName_assert = self.com_utils.addTextValXpath(userName_val, self._signIn_userName_val)
        self.username_elem = self.waitForElement(self._signIn_userName_assert, elementName="User Name text")

    def signIn_validate_credentials(self, userName_val):
        self.signIn_enter_emailaddress()
        self.signIn_enter_password()
        self.signIn_click_login(userName_val)
        return self.username_elem

    def enter_value_search(self, searchValue):
        """
        Enter values based on searchValue parameter
        """
        self.sendKeys(searchValue, self._signIn_searchBox, elementName="Search box")
        self.elementClick(self._search_icon, elementName="Search icon")
        _search_results_breadcrumb = self.com_utils.addTextValXpath(searchValue, self._results_breadcrumb)
        self.waitForElement(_search_results_breadcrumb, elementName="Breadcrumb text")
        breadcrumb_searchVal = self.isElementPresent(locator=_search_results_breadcrumb)
        return breadcrumb_searchVal

    def select_random_item_search_results(self, searchLabel):
        """
        Select any random item from the search results
        """
        _search_results_breadcrumb = self.com_utils.addTextValXpath(searchLabel, self._results_breadcrumb)
        results_text = self.getText(_search_results_breadcrumb, info="Getting results count in page")
        #print("Breadcrumb for search results :: " + results_count)
        results_index = results_text.split(" ")
        #print("Results index :: " + str(results_index))
        results_start_index = results_index[1]
        #print("Results start index :: " + str(results_start_index))
        results_end_index = results_index[3]
        #print("Results End index :: " + str(results_end_index))
        random_index = random.randint(int(results_start_index) + 1, int(results_end_index) - 1)
        #print("Selecting the item in index :: " + str(random_index))
        self._search_results_index_path = self.com_utils.addTextValXpath(random_index, self._search_results_index)
        purchase_item_title = self.getText(self._search_results_index_path, info="Purchase Item Title")
        if purchase_item_title.endswith("..."):
            purchase_item_title = purchase_item_title.replace("...", "", 1)
        return purchase_item_title

    def click_item_search_results(self, purchase_item_title):
        self.elementClick(self._search_results_index_path, elementName="Purchase Item")
        time.sleep(3)
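A typical flow through the PageHome page object, as a sketch: the constructor signature, URL, display name, and search term are assumptions, and the WebDriver setup is not shown.

# Hypothetical test flow; driver, URL, user name, and search term are placeholders.
home = PageHome(driver)
home.invoke_url("https://www.example.com")
home.signIn_validate_credentials("Test User")
if home.enter_value_search("headphones"):
    title = home.select_random_item_search_results("headphones")
    home.click_item_search_results(title)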