def _read_coords(rawtext, **kwargs):
    """Pull (latitude, longitude) pairs out of the text of a notice.

    The text is first passed through ``_strip_ws``. If ``MINUTE_COORDS``
    matches anywhere, those matches are handed to ``_clean_minute_coords``
    (together with ``kwargs``) and its result is returned directly; the
    other formats are not consulted.

    Otherwise pairs are accumulated, in this order, from
    ``DECIMAL_COORDS``, ``PARANTHETICAL_COORDS`` and ``PORTLAND_COORDS``
    and returned as a list of ``(float, float)`` tuples.
    """
    text = _strip_ws(rawtext)

    minute_matches = re.findall(MINUTE_COORDS, text)
    if minute_matches:
        return _clean_minute_coords(minute_matches, **kwargs)

    coords = []

    # Decimal matches are paired with their neighbour via a width-2 sliding
    # window; pairs with an empty member or non-numeric text are skipped.
    for lat, lng in window(re.findall(DECIMAL_COORDS, text), n=2):
        if not (lat and lng):
            continue
        try:
            pair = (float(lat), float(lng))
        except ValueError:
            continue
        coords.append(pair)

    # Parenthetical matches already come as (lat, lng) groups.
    coords.extend(
        (float(lat), float(lng))
        for lat, lng in re.findall(PARANTHETICAL_COORDS, text)
    )

    # Portland-style matches carry a direction next to each magnitude: the
    # latitude is positive only when its direction contains "n", and the
    # longitude is negative only when its direction contains "w".
    for lat_text, lat_dir, lng_text, lng_dir in re.findall(PORTLAND_COORDS, text):
        lat_sign = 1 if 'n' in lat_dir.lower() else -1
        lng_sign = -1 if 'w' in lng_dir.lower() else 1
        coords.append((abs(float(lat_text)) * lat_sign,
                       abs(float(lng_text)) * lng_sign))

    return coords
def makefile_to_slides(fp):
    """Yield ``(title, command)`` pairs from the lines of a Makefile.

    Lines that start with ``all``, bare newline lines and lines containing
    ``wget`` are ignored. A line that does not start with a tab sets the
    current title to the text before its first ``:``. A tab-indented line
    is treated as a command and re-flowed before being yielded with the
    current title:

    * a ``--option`` followed by a value goes on its own
      backslash-continued line;
    * a ``--option`` that is not followed by a value is not emitted;
    * any other token is appended to the current output line while that
      line stays short, and otherwise starts a new continued line.
    """
    for raw in fp:
        if raw.startswith('all') or raw == '\n' or 'wget' in raw:
            continue

        if not raw.startswith('\t'):
            # Target line: remember its name for the commands that follow.
            title = raw.split(':')[0]
            continue

        tokens = raw.strip().split()
        lines = tokens[0]
        for prevtoken, thistoken in window(tokens, 2):
            if thistoken.startswith('--'):
                # An option is only written once its value shows up as the
                # second member of the next pair.
                continue
            if prevtoken.startswith('--'):
                lines += ' \\\n %s %s' % (prevtoken, thistoken)
            elif len(lines.split('\n')[-1] + thistoken) < 15:
                lines += ' %s' % thistoken
            else:
                lines += ' \\\n %s' % thistoken
        yield title, lines
def test_window():
    """A width-3 window over ``range(8)`` yields the six consecutive triples."""
    expected = [(start, start + 1, start + 2) for start in range(6)]
    observed = [tuple(chunk) for chunk in window(range(8), n=3)]
    n.assert_list_equal(observed, expected)
def character(text):
    """Collect the text that follows the "Full Public Notice / Project Plans" line.

    The text is split on carriage returns and newlines and walked as
    consecutive ``(line, following_line)`` pairs. Once a line containing
    both ``Full Public Notice`` and ``Project Plans`` has been seen, the
    non-blank ``following_line`` of each later pair is collected, up to
    ten of them. Because collection only begins on the pair after the
    marker pair, the line directly below the marker line is not included.

    Returns the collected lines joined by single spaces (an empty string
    when nothing was collected).
    """
    stripped = (line.strip() for line in re.split(r'[\r\n]', text))

    collected = []
    seen_marker = False
    for l, r in window(stripped):
        if 'Full Public Notice' in l and 'Project Plans' in l:
            seen_marker = True
        elif seen_marker and len(collected) < 10 and r.strip():
            collected.append(r.strip())

    return ' '.join(collected)
def applicant(text):
    """Return the applicant named in ``text``, or ``None`` if there is none.

    Non-blank lines are scanned as consecutive pairs for the first line
    that starts with ``applicant:`` or ``name of applicant:`` (matched
    case-insensitively). The name is whatever follows the colon on that
    line, or the next non-blank line when nothing follows the colon. The
    name is then cut at the first match of ``APPLICANT_REGEXES['shorten']``
    and stripped of surrounding whitespace.
    """
    nonblank = (
        line.strip()
        for line in re.split(r'[\n\r]+', text)
        if line.strip()
    )

    found = None
    for l, r in window(nonblank):
        m = re.match(r'^(?:name of |)applicant:(.*)', l or '', flags=re.IGNORECASE)
        if not m:
            continue
        # Prefer text on the same line; fall back to the following line.
        found = m.group(1).strip() or r
        break

    if not found:
        return None
    return re.split(APPLICANT_REGEXES['shorten'], found, maxsplit=0)[0].strip()
def amici(brief:str) -> list:
    """Split a brief's description into the list of amici it names.

    The description is checked against ``MANUAL_OVERRIDE`` first; otherwise
    it is trimmed down to the part that lists the amici, split on a
    separator chosen from the punctuation it contains, and the pieces are
    cleaned, re-joined where a piece is only a suffix ("Inc", "Jr",
    "et al"), and tidied.

    :param brief: description text of the brief
    :returns: list of amicus names (or the matching ``MANUAL_OVERRIDE``
        result, returned as-is)
    """
    # Hand-maintained overrides: the first pattern that matches at the start
    # of the description wins and its stored result is returned untouched.
    for member, result in MANUAL_OVERRIDE:
        if re.match(member, brief):
            return result

    # Phrases that introduce the amici, optionally followed by " of".
    _amicus_regex = re.compile(r'(?:amicus brief|amici brief|amici curiae|amicus curiae|motion for leave to file and brief)(?: of)?', flags = re.IGNORECASE)
    # NOTE(review): _remove_date is defined elsewhere; presumably it strips a
    # date from the description -- confirm.
    amici_section = _remove_date(brief)
    # Drop numbered "<digits>. Brief," markers.
    amici_section = re.sub(r'[0-9]+\. +Brief,', '', amici_section, flags = re.IGNORECASE)

    # An introducing phrase that starts within the first 30 characters is a
    # prefix: keep only what follows it.
    match = re.search(_amicus_regex, amici_section)
    if match != None and match.start() < 30:
        amici_section = amici_section[match.end():]

    # An introducing phrase that starts beyond two fifths of the remaining
    # text is a suffix: keep only what precedes it.
    match = re.search(_amicus_regex, amici_section)
    if match != None and match.start() > len(amici_section)*2/5:
        amici_section = amici_section[:match.start()]

    # Everything from " in support of " onwards is not part of the names.
    amici_section = re.sub(r' in support of .*', '', amici_section, flags = re.IGNORECASE)

    # Insert the missing ", " where "The " is glued to the preceding text,
    # e.g. "Inc.The " becomes "Inc., The ".
    amici_section = re.sub(r'([^ ])The ', r'\1, The ', amici_section)

    # Choose the separator. Semicolons take priority. Otherwise, when the
    # comma counts suggest a comma-separated list, a bare " and " is not
    # treated as a separator (only "," / " and the" / " and other").
    # Failing both, split on "," or " and".
    onlycomma = r'(?:,| and the| and other) '
    l = amici_section.lower()
    if l.count(';') > 0:
        _regex = r'; '
    elif l.count(',') > 3 or ', and' in l or l.count(',') > l.count('and') or l.count(',') == l.count(', inc'):
        _regex = onlycomma
    else:
        _regex = r'(?:,| and) '
    amicus_separator = re.compile(_regex, flags = re.IGNORECASE)

    def clean(result):
        """Strip filler words and a leading or trailing "brief ..." phrase from one piece."""
        r = result.strip()
        # Trailing " as" and leading "for" / "of" / "and" / "amic(i|us) curiae".
        r = re.sub(r' as ?$', '', r, flags = re.IGNORECASE)
        r = re.sub(r'^ ?(for|of|and|amic(i|us) curiae) ?', '', r, flags = re.IGNORECASE)
        # NOTE(review): in this view the pattern literal below is broken by a
        # raw line-break character right after "curiae)? "; confirm which
        # character the file really contains at that position.
        match = re.search(r'brief(?: for| of)?(?: the)?(?: amic(?:us|i) curiae)? 
', r, flags = re.IGNORECASE)
        # A phrase starting in the first half is a prefix (keep what follows);
        # one ending in the second half is a suffix (keep what precedes).
        # NOTE(review): the halfway point is measured on the untrimmed
        # `result`, while the match offsets refer to the trimmed `r`.
        if match and match.start() < len(result) / 2:
            return r[match.end():]
        elif match and match.end() > len(result) / 2:
            return r[:match.start()]
        else:
            return r

    # Clean twice, so that text exposed by the first pass is cleaned as well.
    results = map(clean, map(clean, _amicus(unidecode(amici_section), amicus_separator, 0)))
    # Pad with an empty string on both sides so that every piece is visited
    # as the middle element of a window, together with its two neighbours.
    slider = window(chain([''], results, ['']), n = 3)
    out = []
    for previous_result, current_result, next_result in slider:
        if re.match(r'^(|as|amic(i|us) curiae)$', current_result, flags = re.IGNORECASE):
            # Empty or filler-only piece: drop it.
            pass
        elif re.match(r'^[^a-z]{0,2}(inc|jr)[^a-z]{0,2}', next_result, flags = re.IGNORECASE):
            # The next piece starts with "inc" / "jr", i.e. it is a suffix that
            # the separator split off: glue it back on, then advance the
            # iterator so the suffix is not visited as a piece of its own.
            out.append(current_result + ', ' + next_result)
            next(slider)
        elif re.match(r'^et al\.?,?$', next_result, flags = re.IGNORECASE):
            # Same treatment for a stand-alone "et al".
            out.append(current_result + ', ' + next_result)
            next(slider)
        else:
            out.append(current_result)

    # With the comma-only separator a final "X and Y" is still one piece;
    # split it once the list has at least three entries.
    if len(out) >= 3 and _regex == onlycomma:
        out = out[:-1] + re.split(r' and ', out[-1], flags = re.IGNORECASE)

    def finalize(result):
        """Remove a leading "amici" / "of ", an " on behalf of ..." tail and a trailing comma."""
        result = re.sub(r'^ ?(amici|of )', '', result, flags = re.IGNORECASE)
        result = re.sub(r' (on behalf of).*$', '', result, flags = re.IGNORECASE)
        result = re.sub(r',$', '', result)
        return result
    return list(map(finalize, out))
def derivative(counter):
    """Return the first difference of a counter taken over its sorted keys.

    Each key of ``counter`` is mapped to its own count minus the count of
    the key that precedes it in sorted order. The smallest key is compared
    against ``min(counter) - 1``, which is never a key itself.

    :param counter: mapping from numeric keys to counts. Assumed to be a
        ``collections.Counter`` (or similar) whose missing keys read as 0,
        since the value just below the smallest key is looked up.
    :returns: a new ``Counter`` with the same keys holding the differences;
        an empty ``Counter`` when ``counter`` is empty.
    """
    dcounter = Counter()
    if not counter:
        # Nothing to difference; min() below would raise ValueError.
        return dcounter

    # Prepend the value just below the smallest key so that the smallest key
    # also has a left-hand neighbour, then walk consecutive (left, right)
    # key pairs.
    keys = itertools.chain([min(counter) - 1], sorted(counter))
    for left, right in window(keys):
        dcounter[right] = counter[right] - counter[left]
    return dcounter
def process(self, root_dir, **kwargs):
    """Aggregate monthly currency-pair archives and write windowed sequences.

    ``root_dir`` is expected to contain one sub-directory per currency
    pair, each holding one archive per month. Archives are matched across
    pairs by their position in the sorted directory listing, so every pair
    directory is assumed to cover the same months under names that sort
    chronologically.

    For each month, the archives of the feature pairs are aggregated into
    one mapping, keys recorded in ``self.__remove_timebounds`` are dropped,
    and the remaining values are visited in key order with a sliding
    window of ``window_size + 1`` items; every window is passed to
    ``self.__print_result`` together with the output file object.

    :param root_dir: directory containing the per-pair sub-directories
    :keyword estimate_curpair: pair handed to ``__print_result``
        (default ``"EURUSD"``)
    :keyword feature_curpairs: pairs whose archives are aggregated
        (default: the fifteen pairs listed below)
    :keyword output: path of the file to write (default ``"output"``)
    :keyword window_size: number of feature items per window (default 3);
        one further item is appended to each window as the label
    :keyword timeslot_len: time slot length in minutes (default 60)
    """
    # Read keyword arguments
    est_curpair = kwargs.get("estimate_curpair", "EURUSD")
    feature_curpairs = kwargs.get(
        "feature_curpairs",
        [
            "AUDJPY", "AUDNZD", "AUDUSD", "CADJPY", "CHFJPY",
            "EURCHF", "EURGBP", "EURJPY", "EURUSD", "GBPJPY",
            "GBPUSD", "NZDUSD", "USDCAD", "USDCHF", "USDJPY",
        ],
    )
    output_file = kwargs.get("output", "output")
    window_size = kwargs.get("window_size", 3)
    timeslot_len = timedelta(minutes=kwargs.get("timeslot_len", 60))
    # NOTE(review): presumably filled in by __aggregate_data -- confirm.
    self.__remove_timebounds = set()

    # Collect the archive paths of every currency pair. os.listdir returns
    # entries in arbitrary order, so both listings are sorted: the month
    # index below must refer to the same month in every pair directory.
    zipfiles = []
    for pair_name in sorted(os.listdir(root_dir)):
        pair_dir = os.path.join(root_dir, pair_name)
        zipfiles.append(
            [os.path.join(pair_dir, name) for name in sorted(os.listdir(pair_dir))]
        )

    # Number of months in the given dataset (none if root_dir is empty).
    months = len(zipfiles[0]) if zipfiles else 0

    with open(output_file, "w") as fp:
        for month in range(months):
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as the Python 3 print function.
            print("working in %sth month" % month)

            # get zip files of current month
            month_zipfiles = [e[month] for e in zipfiles]

            # initialize data
            data = {}
            for filename in month_zipfiles:
                # Filter on the currency pair before unpacking the archive,
                # so that unwanted archives are never opened.
                curpair = self.__get_currency_pair(filename)
                if curpair not in feature_curpairs:
                    continue
                with self.__unarchive_zipfile(filename) as file_obj:
                    print("Aggregating %s" % filename)
                    self.__aggregate_data(file_obj, curpair, timeslot_len, data)

            data = {k: v for k, v in data.items()
                    if k not in self.__remove_timebounds}

            # Iterate data with a sliding window, for each sequence, the
            # first n items serve as feature and last item serves as label
            data_it = iter(data[i] for i in sorted(data))
            for seq in window(data_it, window_size + 1):
                print("Generaing log...")
                self.__print_result(fp, seq, est_curpair)