def en_parsers_speed(self):
    """Run every English extractor over one long sample text via check_time.

    Reads ``long_parsed_text.txt`` and the geo-entity dictionary files from
    the test data directory, passes each extractor to ``self.check_time``
    together with a label and the shared ``times`` dict, and finally asserts
    that ``times`` contains a ``'get_amounts'`` entry.
    """
    sample_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
    with codecs.open(sample_path, 'r', encoding='utf-8') as sample_file:
        text = sample_file.read()

    geo_dir = os.path.join(
        lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
    geo_config = list(DictionaryEntry.load_entities_from_files(
        geo_dir + 'geoentities.csv', geo_dir + 'geoaliases.csv'))

    times = {}  # type: Dict[str, float]

    def fully_consumed(extract):
        # Extractors are wrapped in list() so the whole result is produced
        # inside the timed call.
        return lambda s: list(extract(s))

    # (label, extractor) pairs, in the order they are timed. Note that the
    # courts extractor is ``_get_courts`` but is reported as 'get_courts'.
    labelled_extractors = (
        ('get_amounts', get_amounts),
        ('get_acts', get_acts),
        ('get_citations', get_citations),
        ('get_conditions', get_conditions),
        ('get_constraints', get_constraints),
        ('get_copyright', get_copyright),
        ('get_courts', _get_courts),
        ('get_cusip', get_cusip),
        ('get_dates', get_dates),
        ('get_definitions', get_definitions),
        ('get_distances', get_distances),
        ('get_durations', get_durations),
        # The geo-entity extractor needs the loaded dictionary as well.
        ('get_geoentities', lambda s: get_geoentities(s, geo_config)),
        ('get_money', get_money),
        ('get_percents', get_percents),
        ('get_pii', get_pii),
        ('get_ratios', get_ratios),
        ('get_regulations', get_regulations),
        ('get_trademarks', get_trademarks),
        ('get_urls', get_urls),
    )
    for label, extract in labelled_extractors:
        self.check_time(text, fully_consumed(extract), label, times)

    self.assertTrue('get_amounts' in times)
def get_salary(text, return_source=False):
    """Extract the largest money amount from text that talks about a salary.

    The text must contain one of the salary trigger phrases and at least one
    time-unit phrase from ``TRIGGER_LIST_TIME_UNIT``. The largest amount
    returned by ``get_money`` is accepted only when, multiplied by the
    smallest matching time-unit multiplier, it exceeds ``min_annual_salary``.

    :param text: text to search.
    :param return_source: when true, ``text`` is appended to the result tuple.
    :return: ``(money, time_unit)``, or ``(money, time_unit, text)`` when
        ``return_source`` is true; ``None`` when no qualifying salary is found.
    """
    salary_triggers = ("salary", "rate of pay")
    min_annual_salary = 20000  # sample is mostly executives- so this is safe.

    if not any(findWholeWordorPhrase(trigger)(text) is not None
               for trigger in salary_triggers):
        return None

    # Each TRIGGER_LIST_TIME_UNIT entry holds the text to be found at index 0
    # and the multiplier to get a yearly amount at index 1.
    multipliers = [
        unit[1] for unit in TRIGGER_LIST_TIME_UNIT
        if findWholeWordorPhrase(unit[0])(text) is not None
    ]
    if not multipliers:
        # Without a time unit there is nothing to multiply by; bail out
        # instead of multiplying the amount by None.
        return None
    found_time_unit = min(multipliers)

    found_money = list(get_money(text))
    if not found_money:
        return None

    # Results are compared on their first element (the amount).
    money = max(found_money, key=lambda item: item[0])
    if not money[0] * found_time_unit > min_annual_salary:
        return None

    if return_source:
        return money, found_time_unit, text
    return money, found_time_unit
def test_get_money_problem1(self):
    """
    Regression check for decimal precision: a value such as 23.62
    used to come back as 23.6.
    """
    source = 'Exercise Price per Share: 23.62'
    found = list(get_money(source, return_sources=False, float_digits=6))
    first_amount = found[0][0]
    self.assertEqual(first_amount, 23.62)
def test_money(self):
    """Check the single USD amount extracted from the sample sentence.

    Both ``get_money`` and ``get_money_annotations`` are expected to yield
    exactly one result; the annotation must have locale ``'en'``, currency
    ``'USD'`` and amount ``100.0``.
    """
    text = "100 bucks, 100 dollars, 100 greens"

    ds = list(get_money(text))
    self.assertEqual(1, len(ds))

    ants = list(get_money_annotations(text))
    # Check the annotation list itself: re-checking ``ds`` here would let
    # unexpected extra annotations go unnoticed.
    self.assertEqual(1, len(ants))
    self.assertEqual('en', ants[0].locale)
    self.assertEqual('USD', ants[0].currency)
    self.assertEqual(100.0, ants[0].amount)
def test_get_money_order(self):
    """
    Regression check for result ordering: get_money() once returned
    amounts in reversed order, so the first result must be the first
    amount written in the text.
    """
    lease_clause = (
        ' $96,844.00 per month ($31.00 per square foot per year), '
        'beginning on the date which is 90 days after the '
        'Commencement Date and ending on the Expiration Date.'
    )
    found = list(get_money(lease_clause, return_sources=False, float_digits=6))
    leading_amount = found[0][0]
    self.assertEqual(leading_amount, 96844.0)
def parse(self, text, text_unit_id, _text_unit_lang, **kwargs) -> ParseResults:
    """Build ``CurrencyUsage`` records for the money amounts found in ``text``.

    Identical ``get_money`` results are collapsed into a single record whose
    ``count`` is the number of times that result occurred.

    :param text: text to scan with ``money.get_money``.
    :param text_unit_id: id stored on every produced ``CurrencyUsage``.
    :param _text_unit_lang: not used by this parser.
    :param kwargs: not used by this parser.
    :return: ``ParseResults`` holding the records under the ``CurrencyUsage``
        key, or ``None`` when no money amount is found.
    """
    # Local import so the block does not depend on the file's import header.
    from collections import Counter

    # One pass counts every distinct result; this replaces a list.count()
    # call per unique item and yields records in first-seen order.
    occurrences = Counter(money.get_money(text, return_sources=True))
    if not occurrences:
        return None

    return ParseResults({
        CurrencyUsage: [
            CurrencyUsage(text_unit_id=text_unit_id,
                          amount=item[0],
                          amount_str=item[2],
                          currency=item[1],
                          count=count)
            for item, count in occurrences.items()
        ]
    })
def _extract_variants_from_text(self, field, text: str, **kwargs):
    """Return the money amounts found in ``text`` as currency/amount dicts.

    :param field: not used by this implementation.
    :param text: text to scan with ``get_money``.
    :param kwargs: not used by this implementation.
    :return: list of ``{'currency': ..., 'amount': ...}`` dicts, one per
        result, or ``None`` when nothing is found.
    """
    # Materialise the results first: a lazy iterator is always truthy, so
    # the emptiness check below would otherwise never trigger.
    money = list(get_money(text, return_sources=False))
    if not money:
        return None
    return [{'currency': m[1], 'amount': m[0]} for m in money]
def getMoney(self):
    """Store the amounts found in ``self.bill_text`` on the bill.

    The first element of every ``get_money`` result is converted to ``str``
    and the resulting list is saved as ``self.bill.info['money']``.
    """
    amounts = [str(found[0]) for found in get_money(self.bill_text)]
    self.bill.info['money'] = amounts
rep_date_list = [] for elem in elems: date_lim = elem[1] if (date_lim[1] - date_lim[0]) <= 6: continue rep_text = text[date_lim[0]:date_lim[1]] rep_date_list.append(rep_text) for i in rep_date_list: text = text.replace(i, ' <DATE> ') text = re.sub(dates1, ' <DATE> ', text) start_time = time.time() rep_money_list = set() elems = (list(money.get_money(text, return_sources=True))) print("Money") print(elems) for elem in elems: rep_money_list.add(elem[-1]) rep_money_list = list(rep_money_list) for i in rep_money_list: text = text.replace(i, ' <MON> ') print(time.time() - start_time) # rep_amt_list=[] # elems=list(amounts.get_amounts(text, return_sources=True)) # print(elems)