예제 #1
0
 def get_discipline(major_phrase,
                    prefer_index,
                    discipline_phrase_dic,
                    debug=False):
     """Return the longest discipline phrase matched for a profile's majors.

     The major at ``prefer_index`` is looked up first. When
     ``TextHelper.get_dict_pattern`` finds nothing for it, all majors joined
     with single spaces are looked up instead.

     :param major_phrase: sequence of major strings.
     :param prefer_index: index into ``major_phrase`` of the preferred major.
     :param discipline_phrase_dic: dictionary handed to
         ``TextHelper.get_dict_pattern``.
     :param debug: when true, print a message for every failed lookup.
     :return: the first key of maximal length among the matches, or ``None``
         when neither lookup matches anything.
     """

     def lookup(text):
         # Same clean-up pipeline for both the preferred and the joined text.
         cleaned = SegmentHelper.normalize(TextHelper.unicode_to_ascii(text))
         return TextHelper.get_dict_pattern(cleaned, discipline_phrase_dic)

     matches = lookup(major_phrase[prefer_index])
     if len(matches) == 0:
         if debug:
             print("prefer major can not found match phrase in dict: %s" %
                   major_phrase[prefer_index])
         matches = lookup(' '.join(major_phrase))
         if len(matches) == 0:
             if debug:
                 print("Can not found major words: %s" % str(major_phrase))
             return None
     # max() keeps the first item among equally long keys.
     return max(matches, key=len)
예제 #2
0
 def get_highest_education(profile, education_phrase_dic,
                           discipline_phrase_dic):
     """Return the highest degree found in a profile and its discipline.

     Every entry of ``profile['education']`` is normalized and matched with
     ``TextHelper.get_dict_pattern``; the first key of each non-empty match is
     remembered together with the entry's index (a later entry with the same
     key overrides an earlier one). Degrees are then checked in the order
     Doctor, Master, Bachelor.

     :param profile: mapping with at least the keys ``'education'`` and
         ``'major'``.
     :param education_phrase_dic: dictionary used to match education entries.
     :param discipline_phrase_dic: dictionary forwarded to
         ``ProfileHelper.get_discipline``.
     :return: ``([degree], [discipline])`` for the highest degree found, or
         ``([], [])`` when none of the three degrees is present.
     """
     degree_to_index = {}
     for index, raw_education in enumerate(profile['education']):
         education = SegmentHelper.normalize(
             TextHelper.unicode_to_ascii(raw_education))
         matches = TextHelper.get_dict_pattern(education,
                                               education_phrase_dic)
         if len(matches) > 0:
             # dict views are not subscriptable on Python 3, so take the
             # first key through an iterator instead of ``keys()[0]``.
             degree_to_index[next(iter(matches))] = index

     for degree in ('Doctor', 'Master', 'Bachelor'):
         if degree in degree_to_index:
             return [degree], [
                 ProfileHelper.get_discipline(profile['major'],
                                              degree_to_index[degree],
                                              discipline_phrase_dic)
             ]
     return [], []
예제 #3
0
    def remove_reduntant_text_part(self, caption: CaptionLine,
                                   to_cut_text: str, at_start: bool) -> str:
        """
        Strip from the caption only the part that is still surplus and
        unformatted.

        When the caption has no unformatted text left, or that text is not
        similar to ``to_cut_text`` according to ``is_similar_strings``, the
        caption is left untouched and ``to_cut_text`` is returned unchanged.
        Otherwise the caption's ``left_unformatted_text`` and
        ``correct_unformatted_text`` are replaced with the two parts of the
        split result.

        :param caption: caption line whose unformatted text is trimmed.
        :param to_cut_text: text that should be removed.
        :param at_start: cut at the start of the caption text when true,
            at its end otherwise.
        :return: the substring that still has to be removed.
        """
        cut_len = TextHelper.define_smallest_len_to_cut(
            caption.left_unformatted_text, to_cut_text)

        nothing_left = caption.left_unformatted_text == ''
        if nothing_left or not self.is_similar_strings(
                caption.left_unformatted_text, to_cut_text, at_start):
            return to_cut_text

        if at_start:
            cutter = TextHelper.cut_text_from_start
        else:
            cutter = TextHelper.cut_text_from_end
        pieces = cutter(caption.left_unformatted_text, cut_len)

        caption.left_unformatted_text = pieces[1]
        caption.correct_unformatted_text = pieces[0]

        # Adjust what is still left to delete.
        return TextHelper.cut_text_from_start(to_cut_text, cut_len)[0]
예제 #4
0
 def get_skills(profile, skills_dic, debug=False):
     """Return the skill phrases of a profile that appear in ``skills_dic``.

     The entries of ``profile['skills']`` are joined with spaces, normalized
     and matched with ``TextHelper.get_dict_pattern``.

     :param profile: mapping with a ``'skills'`` sequence of strings.
     :param skills_dic: dictionary handed to ``TextHelper.get_dict_pattern``.
     :param debug: when true, print the normalized text before matching.
     :return: list of matched keys; an empty list when nothing matches.
     """
     skill_phrases = ' '.join(profile['skills'])
     skill_phrases = SegmentHelper.normalize(
         TextHelper.unicode_to_ascii(skill_phrases))
     if debug:
         print("right after normalize: %s" % skill_phrases)
     skill_phrases_dict = TextHelper.get_dict_pattern(
         skill_phrases, skills_dic)
     # Always hand back a real list: on Python 3 ``keys()`` is a view, which
     # made the return type differ from the ``[]`` of the no-match case.
     return list(skill_phrases_dict.keys())
예제 #5
0
 def is_similar_strings(self,
                        str1: str,
                        str2: str,
                        at_start_to_cut=True) -> bool:
     """Tell whether one string is a prefix (or suffix) of the other.

     ``TextHelper.define_smallest_len_to_cut`` decides which string is
     searched for: when its result equals ``len(str1)``, ``str1`` is looked
     for in ``str2``; otherwise ``str2`` is looked for in ``str1``.

     :param str1: first string.
     :param str2: second string.
     :param at_start_to_cut: compare at the start (prefix) when true, at the
         end (suffix) when false.
     :return: ``True`` when the prefix/suffix relation holds.
     """
     str1_is_needle = len(str1) == TextHelper.define_smallest_len_to_cut(
         str1, str2)
     needle, haystack = (str1, str2) if str1_is_needle else (str2, str1)
     if at_start_to_cut:
         return haystack.startswith(needle)
     return haystack.endswith(needle)
예제 #6
0
 def calculate_years(profile):
     """Sum the year spans listed in ``profile['years']``.

     Each entry is converted with ``TextHelper.unicode_to_ascii``, split on
     ``'-'`` and every piece is passed to ``TextHelper.get_year``; pieces for
     which that returns ``None`` are dropped. Only entries that leave exactly
     two values contribute ``second - first``; the others are reported with a
     printed message.

     :param profile: mapping with a ``'years'`` iterable of strings.
     :return: tuple ``(number of usable spans, sum of those spans)``.
     """
     durations = []
     for entry in profile['years']:
         endpoints = [
             TextHelper.get_year(piece)
             for piece in TextHelper.unicode_to_ascii(entry).split('-')
         ]
         endpoints = [value for value in endpoints if value is not None]
         if len(endpoints) != 2:
             print("can not calculate %s" % str(endpoints))
             continue
         try:
             durations.append(endpoints[1] - endpoints[0])
         except TypeError:
             print("Can not minus between %s" % str(endpoints))
     return len(durations), sum(durations)
예제 #7
0
 def __init__(self, original_sentence: str):
     """Keep the sentence and the representations derived from it.

     :param original_sentence: sentence text; it is stored as is, reduced
         with ``TextHelper.leave_only_letters`` and converted with
         ``LetterListManager.create_letters_list``.
     """
     letters_only = TextHelper.leave_only_letters(original_sentence)

     self.original_sentence = original_sentence
     self.unused_original_sentence = original_sentence
     self.without_punctuation = letters_only
     self.len_without_punctuation = len(letters_only)
     self.letters = LetterListManager.create_letters_list(original_sentence)
예제 #8
0
 def _get_education_words(self, education_dict):
     """Match ``self.raw_position`` against ``education_dict``.

     :param education_dict: dictionary handed to
         ``TextHelper.get_dict_pattern``.
     :return: the match result; when it is empty, ``"Bachelor"`` is appended
         to ``self.new_words_list`` and ``{"Bachelor": 1}`` is returned.
     """
     found = TextHelper.get_dict_pattern(self.raw_position, education_dict)
     if len(found) != 0:
         return found

     # Nothing matched: fall back to the default requirement.
     fallback = "Bachelor"
     self.new_words_list.append(fallback)
     return {fallback: 1}
예제 #9
0
 def get_prompt_url_from_web(url,
                             start_tag,
                             end_tag,
                             user='******',
                             password='******'):
     """Fetch ``url`` with credentials and build the prompt URL found in it.

     The text between ``start_tag`` and ``end_tag`` in the fetched content is
     joined onto ``url`` with ``WebHelper.join_url``; the result is printed
     and returned.

     :param url: page to download.
     :param start_tag: marker passed to ``TextHelper.find_text_between_tag``.
     :param end_tag: marker passed to ``TextHelper.find_text_between_tag``.
     :param user: user name for ``WebHelper.get_auth_url_content``.
     :param password: password for ``WebHelper.get_auth_url_content``.
     :return: the joined prompt URL.
     """
     _, page_text = WebHelper.get_auth_url_content(url, user, password)
     relative_link = TextHelper.find_text_between_tag(page_text, start_tag,
                                                      end_tag)
     full_link = WebHelper.join_url(url, relative_link)
     print("get new prompt url: %s" % full_link)
     return full_link
예제 #10
0
 def _get_working_year_words(self, year_convert_file=None):
     """Count the year patterns found in ``self.raw_position``.

     :param year_convert_file: optional file loaded with
         ``StoreHelper.load_data``; when given and patterns were found, each
         pattern is replaced by its entry in the loaded mapping and patterns
         without an entry are dropped.
     :return: result of ``DictHelper.dict_from_count_list`` for the final
         list. When no pattern is found, ``"[0]"`` is appended to
         ``self.new_words_list`` and used as the only list element.
     """
     found_years = TextHelper.get_years_pattern(self.raw_position)

     if len(found_years) == 0:
         fallback = "[0]"
         self.new_words_list.append(fallback)
         return DictHelper.dict_from_count_list([fallback])

     if year_convert_file is not None:
         conversion = StoreHelper.load_data(year_convert_file, {})
         found_years = [
             conversion[year] for year in found_years if year in conversion
         ]
     return DictHelper.dict_from_count_list(found_years)
예제 #11
0
 def __init__(self, raw_line: str, line_number: int):
     """Initialise the caption line state from its raw text.

     :param raw_line: caption text as read; it is stored as is, converted
         with ``LetterListManager.create_letters_list`` and reduced with
         ``TextHelper.leave_only_letters``.
     :param line_number: number of the line, stored unchanged.
     """
     letters = LetterListManager.create_letters_list(raw_line)
     letters_only = TextHelper.leave_only_letters(raw_line)

     self.raw_line = raw_line
     self.line_number = line_number
     self.caption_letters = letters
     self.without_punctuation = letters_only
     self.len_without_punctuation = len(letters_only)
     self.__start_pos_in_cap_text = self.__end_pos_in_cap_text = 0
     # Text that is still waiting to be formatted.
     self.left_unformatted_text = letters_only
     self.correct_unformatted_text = letters_only
     self.formatted_text = ''
     self.is_formatted = self.checked = self.checked_twice = False
예제 #12
0
 def get_post(web_source):
     """Locate the job-post element inside a downloaded page.

     Pages that ``TextHelper.contain`` does not report as containing
     'data scientist' are rejected immediately. Otherwise several known page
     layouts are tried in a fixed order and the first one that matches
     decides the result.

     :param web_source: page markup, parsed with BeautifulSoup and 'lxml'.
     :return: tuple ``(ok, element)``. ``ok`` is ``True`` only for an
         unambiguous match; ``element`` is the matched element, the first of
         several ambiguous matches, or ``None``.
     """
     if not TextHelper.contain(web_source, 'data scientist'):
         print("Not contain data scientist")
         return False, None
     soup = BeautifulSoup(web_source, 'lxml')

     # <article id="jobview"> is deliberately not used: it includes the header.
     sections = soup.find_all('div', class_='jobview-section')
     if len(sections) > 1:
         print("Too many jobview-section")
         return False, sections[0]
     if len(sections) == 1:
         return True, sections[0]

     panels = soup.findAll(True,
                           {'class': re.compile('^panel panel-default')})
     if len(panels) > 1:
         print("Too many panel panel-default")
         # Several panels: accept only a single fully qualified article.
         articles = soup.find_all(
             'article',
             class_='panel panel-default m-job-view panel-body')
         if len(articles) == 1:
             return True, articles[0]
         print("Can not solve multi panel")
         return False, None
     if len(panels) == 1:
         return True, panels[0]

     scrollers = soup.find_all('div', class_='job-show-description-scroller')
     if len(scrollers) > 1:
         print("Too many job-show-description-scroller")
         return False, scrollers[0]
     if len(scrollers) == 1:
         return True, scrollers[0]

     # Last resort: containers identified by id, first hit wins.
     for div_id in ('details-job-content', 'jobcopy', 'bodycol',
                    'JobDescription'):
         container = soup.find('div', id=div_id)
         if container is not None:
             return True, container
     return False, None
    def get_part_of_string_by_percent(self, percent: int) -> str:
        """Cut and return a leading part of ``self.raw_sentence``.

        The cut position is ``percent`` percent of the length of
        ``self.original_sentence`` (values of 90 and above count as 100),
        moved forward to the first character from there on that
        ``TextHelper.is_letter`` rejects or that is one of ``. ? ! ,``.
        The returned part is removed from ``self.raw_sentence``. If the
        remainder begins with ``.``, ``?`` or ``!``, the whole remainder is
        appended to the result and ``self.raw_sentence`` becomes empty.

        :param percent: share of the original sentence to take.
        :return: the part that was cut off.
        """
        if percent >= 90:
            percent = 100

        cut_at = math.floor(len(self.original_sentence) * percent / 100)

        # The cut may land in the middle of a word, so it is pushed forward.
        sentence = self.raw_sentence
        for position in range(max(cut_at, 0), len(sentence)):
            symbol = sentence[position]
            if not TextHelper.is_letter(symbol) or re.match(
                    r'[.?!,]', symbol) is not None:
                cut_at = position
                break

        head, tail = sentence[:cut_at], sentence[cut_at:]
        self.raw_sentence = tail

        if re.match(r'[.?!]', tail) is not None:
            head += tail
            self.raw_sentence = ''
        return head
    def start(self):
        """Format every caption row and save the resulting list.

        Rows are read from ``self.captions_filename`` with
        ``FileManager.get_file_rows`` and wrapped in ``CaptionLine``. Rows
        for which ``TextHelper.is_new_line`` is true are kept without
        formatting. For every other row, each substring is sized as a
        percentage of the current unhandled English sentence, and that
        percentage of the current non-empty Russian sentence is added to the
        caption line as formatted text. The list of caption lines is finally
        passed to ``self.save_list_to_file``.
        """
        captions_raw = FileManager.get_file_rows(self.captions_filename)
        result = []

        # For every caption row, determine the sentence number and the number of characters
        for caption_raw in captions_raw:
            caption_line = CaptionLine(caption_raw)

            if TextHelper.is_new_line(caption_raw):
                result.append(caption_line)
                continue

            # Single substring: the whole caption length is compared with the
            # English sentence length.
            if caption_line.substrings_cnt == 1:
                eng_sentence = self.get_first_not_handled_english_sentence()
                percent = int(caption_line.get_full_len() /
                              eng_sentence.get_full_len() * 100)
                eng_sentence.add_percent(percent)
                sentence_part = self.get_first_not_empty_russian_sentence(
                ).get_part_of_string_by_percent(percent)
                caption_line.add_formatted_text(sentence_part)

            # Several substrings: each one is handled on its own, fetching
            # the current English and Russian sentences again every time.
            if caption_line.substrings_cnt > 1:
                for i in range(caption_line.substrings_cnt):
                    eng_sentence = self.get_first_not_handled_english_sentence(
                    )
                    len_cap_line = len(caption_line.substrings[i])

                    percent = int(len_cap_line / eng_sentence.get_full_len() *
                                  100)
                    eng_sentence.add_percent(percent)
                    sentence_part = self.get_first_not_empty_russian_sentence(
                    ).get_part_of_string_by_percent(percent)
                    caption_line.add_formatted_text(sentence_part)

            # NOTE(review): a row with substrings_cnt < 1 that is not a new
            # line reaches this point without any formatted text.
            result.append(caption_line)

        self.save_list_to_file(result)
예제 #15
0
 def get_fortune_500_dict():
     """Map cleaned Fortune 500 company names to their row index.

     Reads the ``company`` column of ``./resource/fortune500.csv`` and cleans
     each name with ``TextHelper.company_name_clean``.

     :return: dict of cleaned company name -> index of the row in the CSV.
     """
     companies = pd.read_csv("./resource/fortune500.csv")['company'].to_dict()
     return {
         TextHelper.company_name_clean(company): row
         for row, company in companies.items()
     }
예제 #16
0
if __name__ == '__main__':
    # Entry point: load the run parameters, build the matching helper,
    # load data and model, then choose the adversarial participants.
    print('Start training')
    time_start_load_everything = time.time()

    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='params_words.json')
    args = parser.parse_args()

    # NOTE(review): yaml.load is called without an explicit Loader. Depending
    # on the installed PyYAML release this is either unsafe for untrusted
    # files or rejected outright; consider yaml.safe_load if the parameter
    # file holds plain data only.
    with open(f'./{args.params}', 'r') as f:
        params_loaded = yaml.load(f)
    current_time = datetime.datetime.now().strftime('%b.%d_%H.%M.%S')
    # The 'type' parameter selects the helper; anything but "image" is
    # handled as text.
    if params_loaded['type'] == "image":
        helper = ImageHelper(current_time=current_time, params=params_loaded,
                             name=params_loaded.get('name', 'image'))
    else:
        helper = TextHelper(current_time=current_time, params=params_loaded,
                            name=params_loaded.get('name', 'text'))

    helper.load_data()
    helper.create_model()

    # Choose adversaries: participant 0 is always included, the remaining
    # number_of_adversaries - 1 are sampled at random.
    # NOTE(review): the sample range starts at 0, so participant 0 can be
    # drawn a second time; the log line reports only the list length.
    if helper.params['is_poison']:
        helper.params['adversary_list'] = [0]+ \
                                random.sample(range(helper.params['number_of_total_participants']),
                                                      helper.params['number_of_adversaries']-1)
        logger.info(f"Poisoned following participants: {len(helper.params['adversary_list'])}")
    else:
        helper.params['adversary_list'] = list()

    best_loss = float('inf')
    vis.text(text=dict_html(helper.params, current_time=helper.params["current_time"]),
예제 #17
0
 def _get_skill_words(self, skill_dict):
     """Match ``self.raw_position`` against ``skill_dict``.

     :param skill_dict: dictionary handed to ``TextHelper.get_dict_pattern``.
     :return: whatever ``TextHelper.get_dict_pattern`` returns.
     """
     skill_phrase_dict = TextHelper.get_dict_pattern(self.raw_position,
                                                     skill_dict)
     return skill_phrase_dict
예제 #18
0
                    'SearchJobJserp":{"jobPosting":"urn:li:fs_normalized_jobPosting:(\d*)"',
                    web_source)))
        return job_id

    def get_web_source(self, url):
        """Open ``url`` in the driver and return the page source.

        A randomly chosen number of seconds is slept after the navigation
        and then reused as the timeout while waiting for the element with id
        ``clientPageInstance``.

        :param url: address to open.
        :return: the page source encoded as UTF-8 bytes, or ``None`` when
            not authenticated or when the wait times out.
        """
        if not self.__has_authentication:
            print("Error! Not have authenticate yet!")
            return None

        self.__driver.get(url)
        # Random delay in seconds.
        pause_seconds = random.choice(
            [5, 3, 3, 13, 4, 15, 7, 60, 5, 3, 4, 5, 7, 9])
        time.sleep(pause_seconds)
        try:
            page_marker = EC.presence_of_element_located(
                (By.ID, 'clientPageInstance'))
            WebDriverWait(self.__driver, pause_seconds).until(page_marker)
            print("Page is ready!")
            page_bytes = self.__driver.page_source.encode("utf-8")
        except TimeoutException:
            print("Loading took too much time!")
            return None
        return page_bytes


if __name__ == "__main__":
    # Manual run: log in, download one job page and store it prettified.
    chrome = ChromeHelper()
    # NOTE(review): the credentials are hard-coded in the source; they
    # should come from the environment or a local config file instead.
    chrome.authenticate("https://www.linkedin.com/uas/login-cap",
                        ("*****@*****.**", "Linkedin0405"))
    web_content = chrome.get_web_source(
        "https://www.linkedin.com/jobs/view/309059479")
    # get_web_source returns None when not authenticated or on timeout;
    # stop with a clear message instead of failing on None.decode().
    if web_content is None:
        raise SystemExit("Can not download job page 309059479")
    soup = BeautifulSoup(web_content.decode('utf-8'), 'html.parser')
    TextHelper.store_html(soup.prettify().encode('utf-8'), "309059479.html")
예제 #19
0
 def _get_discipline_words(self, discipline_dict):
     """Match ``self.raw_position`` against ``discipline_dict``.

     :param discipline_dict: dictionary handed to
         ``TextHelper.get_dict_pattern``.
     :return: whatever ``TextHelper.get_dict_pattern`` returns.
     """
     return TextHelper.get_dict_pattern(self.raw_position, discipline_dict)

if __name__ == '__main__':
    # Entry point: parse the arguments, set up the summary writer, load the
    # parameters, build the helper, configure logging and read the
    # hyper-parameters.
    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='utils/params.yaml')
    parser.add_argument('--name', dest='name', required=True)

    args = parser.parse_args()
    d = datetime.now().strftime('%b.%d_%H.%M.%S')
    # Summaries are written to runs/<name>.
    writer = SummaryWriter(log_dir=f'runs/{args.name}')
    writer.add_custom_scalars(layout)

    # NOTE(review): yaml.load is called without an explicit Loader. Depending
    # on the installed PyYAML release this is either unsafe for untrusted
    # files or rejected outright; consider yaml.safe_load if the parameter
    # file holds plain data only.
    with open(args.params) as f:
        params = yaml.load(f)
    # model == 'word' selects the text helper; anything else, including a
    # missing key, selects the image helper.
    if params.get('model', False) == 'word':
        helper = TextHelper(current_time=d, params=params, name='text')

        # NOTE(review): torch.load unpickles the file; only load a corpus
        # from a trusted source.
        helper.corpus = torch.load(helper.params['corpus'])
        logger.info(helper.corpus.train.shape)
    else:
        helper = ImageHelper(current_time=d, params=params, name='utk')
    # Log both to <folder_path>/log.txt and to the console; the helper has
    # to exist first because it provides folder_path.
    logger.addHandler(
        logging.FileHandler(filename=f'{helper.folder_path}/log.txt'))
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    logger.info(f'current path: {helper.folder_path}')
    # Hyper-parameters are converted explicitly to int or float.
    batch_size = int(helper.params['batch_size'])
    num_microbatches = int(helper.params['num_microbatches'])
    lr = float(helper.params['lr'])
    momentum = float(helper.params['momentum'])
    decay = float(helper.params['decay'])
if __name__ == '__main__':
    # Entry point: parse the arguments, set up the summary writer, load the
    # parameters, build the helper, configure logging and read the
    # hyper-parameters.
    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='utils/params.yaml')
    # parser.add_argument('--name', dest='name', required=True)
    # The run name is optional here and defaults to 'test'.
    parser.add_argument('--name', dest='name', default='test')

    args = parser.parse_args()
    d = datetime.now().strftime('%b.%d_%H.%M.%S')
    # Summaries are written to runs/<name>.
    writer = SummaryWriter(log_dir=f'runs/{args.name}')
    writer.add_custom_scalars(layout)

    # NOTE(review): yaml.load is called without an explicit Loader. Depending
    # on the installed PyYAML release this is either unsafe for untrusted
    # files or rejected outright; consider yaml.safe_load if the parameter
    # file holds plain data only.
    with open(args.params) as f:
        params = yaml.load(f)
    # model == 'word' selects the text helper; anything else, including a
    # missing key, selects the image helper.
    if params.get('model', False) == 'word':
        helper = TextHelper(current_time=d, params=params, name='text')

        # NOTE(review): torch.load unpickles the file; only load a corpus
        # from a trusted source.
        helper.corpus = torch.load(helper.params['corpus'])
        logger.info(helper.corpus.train.shape)
    else:
        helper = ImageHelper(current_time=d, params=params, name='utk')
    # Log both to <folder_path>/log.txt and to the console; the helper has
    # to exist first because it provides folder_path.
    logger.addHandler(logging.FileHandler(filename=f'{helper.folder_path}/log.txt'))
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    logger.info(f'current path: {helper.folder_path}')
    # Hyper-parameters are converted explicitly to int or float.
    batch_size = int(helper.params['batch_size'])
    num_microbatches = int(helper.params['num_microbatches'])
    lr = float(helper.params['lr'])
    momentum = float(helper.params['momentum'])
    decay = float(helper.params['decay'])
    epochs = int(helper.params['epochs'])
예제 #22
0
 def _get_responsibility_words(self, education_dict):
     """Match ``self.raw_position`` against the given dictionary.

     :param education_dict: dictionary handed to
         ``TextHelper.get_dict_pattern``.
     :return: whatever ``TextHelper.get_dict_pattern`` returns.
     """
     responsibility_phrase_dict = TextHelper.get_dict_pattern(
         self.raw_position, education_dict)
     return responsibility_phrase_dict
예제 #23
0
 def _feature4(element):
     """Return ``TextHelper.get_data_length`` of ``element``."""
     data_length = TextHelper.get_data_length(element)
     return data_length
예제 #24
0
    def find_caption_list_which_best_suit_for_sentence(
            self, punctuation_file_sentence: PunctuationSentence,
            captions_prepared: List[CaptionLine]) -> List[CaptionLine]:
        """
        Return the list of CaptionLine objects that hold the lines needed
        for formatting the given sentence.

        A caption is taken when it has not been checked twice, still has
        unformatted text, is not formatted yet and
        ``TextHelper.has_similar_substring`` accepts its unformatted text
        against the sentence without punctuation. Every taken caption is
        marked ``checked`` (and ``checked_twice`` if it was already checked).
        Collection stops once the taken text is longer than the sentence
        without punctuation.
        """
        selected = []
        collected_len = 0
        sentence_text = punctuation_file_sentence.without_punctuation
        for caption in captions_prepared:
            if caption.checked_twice:
                continue
            if caption.left_unformatted_text == '':
                continue
            if caption.is_formatted:
                continue
            if not TextHelper.has_similar_substring(
                    caption.left_unformatted_text, sentence_text):
                continue

            # Accumulate the length of the text taken so far.
            collected_len += len(caption.left_unformatted_text)

            if caption.checked:
                caption.checked_twice = True
            caption.checked = True

            selected.append(caption)
            if collected_len > punctuation_file_sentence.len_without_punctuation:
                break

        return selected