def test_date_extraction(self):
    self.assertEqual(extract_date('02.01.2010'), datetime(2010, 1, 2).date())
    self.assertEqual(extract_date('2.1.2010'), datetime(2010, 1, 2).date())
    self.assertEqual(extract_date('2.1.20100'), '')
    self.assertEqual(extract_date('2.1.20'), '')
    self.assertEqual(extract_date(''), '')
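# A minimal sketch of the extract_date helper the test above assumes: it
# parses day.month.year strings (zero-padded or not) into a date and returns
# '' when the input does not match. This is an illustration consistent with
# the assertions, not the project's actual implementation.
from datetime import datetime

def extract_date(text):
    try:
        # %d and %m accept both '2' and '02'; %Y requires exactly four digits,
        # so '2.1.20' (too few) and '2.1.20100' (trailing unconverted data)
        # both raise ValueError and fall through to ''.
        return datetime.strptime(text, '%d.%m.%Y').date()
    except ValueError:
        return ''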
def parse_arrete(filename):
    with open(filename, encoding='utf-8') as infile:  # avoid shadowing the builtin `input`
        soup = BeautifulSoup(infile.read())
    prev_article = None
    data = {'refs': [], 'articles': []}
    for tag in soup.findAll(['p']):
        line = tag.text.replace('\n', ' ').strip()
        if not line:
            continue
        if re_end.search(line) and not re_date.search(line):
            continue
        if re_date.search(line):
            try:
                data['date'] = extract_date(line)
            except Exception:  # narrowed from a bare except
                pass
        if re_end.search(line) and re_date.search(line):
            data['date'] = extract_date(line)
            break
        if line.startswith('Portant'):
            data['titre'] = line
            numero = re_numero_arrete.search(line)
            if numero:
                data['numero'] = numero.group(1).strip()
            continue
        reference = re_references.search(line)
        if reference:
            data['refs'].append(line[:-1])  # drop the trailing punctuation
            continue
        current_article = re_articles.search(line)
        if current_article:
            data['articles'].append(line)
            prev_article = current_article
            continue
        if prev_article:
            # Continuation line of the previous article
            data['articles'][-1] += ' ' + line
    return data
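# Hypothetical sketches of the module-level regexes parse_arrete relies on;
# the real patterns are defined elsewhere in the project. Every pattern below
# is an assumption, based on the usual structure of a French arrêté: a date,
# "Vu ..." references, numbered articles, and a closing "Fait à ..." formula.
import re

re_date = re.compile(r'\b\d{1,2}\.\d{1,2}\.\d{4}\b')              # assumption
re_end = re.compile(r'Fait à', re.IGNORECASE)                      # assumption
re_numero_arrete = re.compile(r'n°\s*([\w/-]+)', re.IGNORECASE)    # assumption
re_references = re.compile(r'^Vu\b', re.IGNORECASE)                # assumption
re_articles = re.compile(r'^Article\s+\S+', re.IGNORECASE)         # assumption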
def create_task(self, args):
    """Create a document with the args provided."""
    doc = {}
    doc['id'] = self.get_free_id()
    try:
        doc['description'] = ' '.join(args['description'])
    except TypeError:
        doc['description'] = ' '.join(args['parameter'])
    doc['project'] = args['project'] if args['project'] else 'default'
    doc['status'] = 'incomplete'
    doc['date'] = datetime.now()
    doc['priority'] = prioritize(args['priority'])
    try:
        doc['tags'] = clean_tags(args['tags'])
    except TypeError:
        pass
    try:
        actual_date = extract_date(args['due_date'])
        if actual_date:
            doc['due_date'] = actual_date
    except TypeError:
        pass
    try:
        parent_id = self.get_id(int(args['parent']))
        doc['parent'] = parent_id
        doc['ancestors'] = self.get_ancestors(parent_id)
    except TypeError:
        doc['parent'] = None
    return doc
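# Hypothetical usage sketch for create_task, assuming `args` is the vars()
# dict of an argparse namespace: options left off the command line arrive as
# None, and the TypeError handlers above are what turn those Nones into
# defaults (no tags, no due date, parent set to None). `task_manager` is an
# assumed instance of the enclosing class.
args = {'description': ['buy', 'milk'], 'parameter': None, 'project': None,
        'priority': None, 'tags': None, 'due_date': None, 'parent': None}
doc = task_manager.create_task(args)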
def query_manifests(self, depot_id):
    """Query steamdb.info for a list of manifests for a specific depot
    and return that list.

    Returns a list of manifests.
    """
    url_base = "https://steamdb.info/"
    response = requests.get(self.__build_url(url_base, f"depot/{depot_id}/manifests/"),
                            headers=self.headers)
    result = []
    # Bail out on a failed request
    if not self.__is_response_successful(response):
        self.__print_response_error(response)
        sys.exit()
    soup = BeautifulSoup(response.content, "html.parser")
    div = soup.find("div", {'id': 'manifests'})
    tbody = div.find("tbody")
    # Prevent errors for depots without history
    if tbody is not None:
        for tr in tbody.findAll("tr"):
            tds = tr.findAll("td")
            date = utils.extract_date(tds[0].text)
            manifest_id = tds[2].text  # avoid shadowing the builtin `id`
            result.append({'date': date, 'id': manifest_id})
    return result
def test_extract_date(self):
    app = MagicMock()
    test_cases = [
        ("I will travel on 05/09/2020", '05/09/2020'),
        ("on 05/09/2021 and on 03/09/2021", False),
        ("2021/06/30", False),
        ("13/05/2021", '13/05/2021'),
        ("on 13/05/2021", '13/05/2021'),
    ]
    for option, answer in test_cases:
        self.assertEqual(extract_date(option, app), answer)
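# A hedged sketch of the two-argument extract_date variant this test
# exercises: it appears to return the single DD/MM/YYYY substring found in
# the text, and False when there are zero or several candidates. The `app`
# argument is a MagicMock in the test, so this sketch ignores it.
import re

def extract_date(text, app):
    matches = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
    # Exactly one date is unambiguous; anything else is rejected.
    return matches[0] if len(matches) == 1 else False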
def __init__(self, **kwargs):
    user_data = {
        'full_name': '{} {}'.format(
            kwargs.get('last_name', ''),
            kwargs.get('first_name', '')
        ),
        'first_name': kwargs.get('first_name', ''),
        'last_name': kwargs.get('last_name', ''),
        'bdate': extract_date(kwargs.get('bdate', '')),
        'phone': extract_phone(kwargs.get('home_phone', '')),
        'nickname': kwargs.get('nickname', ''),
        'site': kwargs.get('site', ''),
    }
    super().__init__(**user_data)
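# Hypothetical sketch of the extract_phone helper used above; the real
# implementation lives alongside extract_date in the project. The assumption
# here is that it strips a free-form home-phone string down to its digits and
# returns '' when too little is left to be a phone number.
import re

def extract_phone(raw):
    digits = re.sub(r'\D', '', raw or '')
    return digits if len(digits) >= 7 else ''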
def job_parser(file):
    imageString = pytesseract.image_to_string(Image.open(file), lang="eng")
    raw_text = imageString.replace('\n', ' ')
    input_file_name = file.split("/")[-1]
    # Date of posting is encoded in the filename
    date_of_posting = utils.extract_date(''.join(file.split('_')[1:3]))
    details_dict = {
        "file_name": input_file_name,
        "date_of_posting": date_of_posting,
    }
    # Duration of the job
    job_duration = utils.extract_job_duration(raw_text)
    details_dict["job_duration"] = job_duration
    # Level of expertise
    exp_level = utils.experience_level(raw_text)
    details_dict["experience_level"] = exp_level
    # Short job description
    job_desc = utils.extract_job_description(imageString)
    details_dict["job_description"] = job_desc
    # Job related to (DL/ML/DA)
    job_main_skill = utils.job_main_skill(imageString)
    details_dict["job_main_skill"] = job_main_skill
    # Detailed job description
    full_desc = utils.extract_full_job_description(raw_text)
    details_dict["job_full_description"] = full_desc
    # Skills required (not extracted yet)
    skill_req = np.nan
    details_dict["required_skill"] = str(skill_req)
    # How many connects are required
    temp_conn = raw_text.split('Worldwide')[1]
    if 'Send a proposal for:' in temp_conn:
        req_con = temp_conn.split('Send a proposal for:')[1].split()[0]
    else:
        req_con = temp_conn.split()[0]
    details_dict["required_connects"] = str(req_con)
    return details_dict
def _createOpinions(self, hotel_id):
    for opinion in self.opinions:
        opinionObj = ed.model.Opinion()
        opinionObj.user = opinion['name']
        opinionObj.country = (self._getOrCreateCountry(opinion['country']).id
                              if opinion['country'] is not None else None)
        opinionObj.age_range = self._getOrCreateAgeRange(opinion['age_range'])
        opinionObj.date = extract_date(opinion['date'])
        opinionObj.hotel_id = hotel_id
        opinionObj.positive = opinion['positive']
        opinionObj.negative = opinion['negative']
        opinionObj.grade = opinion['grade'].replace(",", ".")  # normalise decimal comma
        opinionObj.title = opinion['title']
        opinionObj.user_opinions = opinion['visits']
        self.session.add(opinionObj)
        self.session.flush()
        tags = [self._getOrCreateTags(tag) for tag in opinion['tags']]
        optags = [ed.model.OpinionTag(tag=tag.id, opinion=opinionObj.id)
                  for tag in tags]
        self.session.add_all(optags)
        self.session.flush()
def query_manifests(self, depot_id):
    """Query steamdb.info for a list of manifests for a specific depot
    and return that list.

    Returns a list of manifests.
    """
    url = f"https://steamdb.info/depot/{depot_id}/manifests/"
    response = self._query_website(url, headers=self.headers)
    result = []
    soup = BeautifulSoup(response.content, "html.parser")
    div = soup.find("div", {'id': 'manifests'})
    tbody = div.find("tbody")
    # Prevent errors for depots without history
    if tbody is not None:
        for tr in tbody.findAll("tr"):
            tds = tr.findAll("td")
            date = utils.extract_date(tds[0].text)
            manifest_id = tds[2].text  # avoid shadowing the builtin `id`
            result.append({'date': date, 'id': manifest_id})
    return result
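# Hypothetical sketch of the _query_website helper the refactored
# query_manifests above delegates to: the assumption is that it wraps
# requests.get and aborts on an unsuccessful response, centralising the error
# handling that the earlier version of the method performed inline (the
# __is_response_successful / __print_response_error names come from that
# earlier version).
def _query_website(self, url, headers=None):
    response = requests.get(url, headers=headers)
    if not self.__is_response_successful(response):
        self.__print_response_error(response)
        sys.exit()
    return response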
def extract_reviews_from_url(url):
    try:
        # Avoid raising an exception in the except section below
        review_div = None
        logger.debug('Getting reviews from %s', url)
        # Clean the end of the URL to get the main beer URL
        beer_url = url
        if '?' in url:
            beer_url = re.sub(r'\?.*', '', url)
        # Keep the starting review order as an int
        review_order = 0
        if 'start=' in url:
            parse_result = urlparse(url)
            query_dict = parse_qs(parse_result.query)
            try:
                review_order = int(query_dict['start'][0])
            except (KeyError, IndexError, ValueError):  # narrowed from a bare except
                logger.error('Could not get start index. Query string: %s', query_dict)
                raise
        empty_rating_dict = {'beer_url': beer_url, 'user_url': '', 'score': '',
                             'rdev': '', 'date': '', 'review': '',
                             'scrap_time': '', 'review_order': ''}
        list_ratings_reviews = []
        soup = make_soup(url)
        review_divs = soup.findAll(id='rating_fullview_content_2')
        for review_div in review_divs:
            rating_dict = empty_rating_dict.copy()
            rating_dict['review_order'] = str(review_order)
            review_order += 1
            now = datetime.datetime.now()
            rating_dict['scrap_time'] = str(now)
            # Date
            muted = review_div.find_all(class_='muted')
            date = muted[-1].find_all('a')[-1].contents[0]
            real_date = utils.extract_date(date, current_date=now)
            rating_dict['date'] = str(real_date)
            # User URL
            rating_dict['user_url'] = review_div.find(class_='username')['href']
            # Score
            bascore = review_div.find(class_='BAscore_norm')
            rating_dict['score'] = bascore.contents[0]
            # Now we'll process line by line... always ugly
            # rdev - useful?
            norm_line = review_div.find(class_='rAvg_norm')
            rdev_line = norm_line.next_sibling
            rdev_string = rdev_line.string
            # Need to take rDev 0% into account
            if '%' not in rdev_string:
                rdev_line = rdev_line.next_sibling
                rdev_string = rdev_line.string
            rdev = rdev_line.string.replace('%', '').replace('rDev', '').strip()
            rating_dict['rdev'] = rdev
            # If there is a review, then we have more info
            next_el = rdev_line.next_sibling
            next_el_sibl = next_el.next_sibling
            current_el = next_el_sibl
            # Get all siblings, in any case
            all_siblings = current_el.next_siblings
            # Keep only the plain strings among the siblings, dropping tags
            true_siblings = [x for x in all_siblings
                             if isinstance(x, NavigableString) or not x.name]
            # It's a review, let's parse it
            review_string = " ".join(true_siblings[0:-2])
            rating_dict['review'] = review_string
            list_ratings_reviews.append(rating_dict)
        return list_ratings_reviews
    except Exception:
        logger.error('Error fetching reviews and ratings from %s', url)
        if review_div:
            logger.error('Div: ')
            logger.error(review_div)
        raise