def download_formadvs():
    """Download every Form ADV zip archive linked from the SEC FOIA page.

    Scrapes links matching six digits + ``.zip`` and saves each archive
    into ``zipfolder`` under its original basename.
    """
    browser = HomeBrowser(starturl=r'https://www.sec.gov/help/foiadocsinvafoiahtm.html')
    for tag in browser.filterlinks(r'\d{6}\.zip'):
        # Link hrefs are site-relative; prefix the SEC host.
        full_url = "https://www.sec.gov/%s" % tag.url
        basename = OSPath.split(full_url)[-1]
        browser.download(full_url, outfile=mkpath(zipfolder, basename))
def read_dailyxml():
    """Decompress today's gzipped IA feed into xmlfolder/daily.xml and parse it.

    Streams the feed in 100-line chunks to bound memory, then returns the
    parsed ``Firm``/``FormInfo`` records as a DataFrame.

    Returns:
        pd.DataFrame: one row per parsed Firm/FormInfo record.
    """
    daily_path = mkpath(xmlfolder, 'daily.xml')
    # Context manager guarantees the gzip handle is closed even when
    # appendData raises (the original leaked it — see its '#catch IOerror').
    with gzip.open(get_dailyxml_path(), 'rb') as feed:
        for chunk in chunker(feed, chunksize=100):
            appendData(daily_path, ''.join(chunk))
    # BUG FIX: parse the file we just wrote. The original parsed
    # 'dailyxml.xml', a path nothing in this module ever creates.
    return pd.DataFrame(parsexml(daily_path, 'Firm', 'FormInfo'))
def load_scheduleDs():
    """Load every all-digit data folder's predictiveops.json into the database.

    Folders whose JSON is missing/unreadable (IOError) are skipped silently,
    preserving the original best-effort behavior.
    """
    # Iterate the listing directly — the original copied it into a list
    # via a no-op comprehension for no benefit.
    for folder in Folder.listdir('data', pattern=r'\d+$'):
        try:
            # try deliberately covers both calls, matching the original:
            # an IOError from load_scheduleD is also treated as skip.
            data = from_json(mkpath(folder, 'predictiveops.json'))
            load_scheduleD(data)
        except IOError:
            continue
def normfile(self, formadv, dailyxml=False, **kwds):
    """Read one Form ADV file, normalize it, write the CSV, return the frame.

    NOTE(review): duplicates FormadvStage.normfile; `dailyxml` is accepted
    but unused here, mirroring the original signature.
    """
    frame = read_formadv(formadv.filename)
    frame = self.normdf(frame, formadv, **kwds)
    self.writefile(frame, mkpath(preprocessed, formadv.outfile))
    return frame
class FormadvStage(Stage):
    """Pipeline stage that normalizes Form ADV extracts into preprocessed CSVs."""

    FIELDSPATH = mkpath('config', 'fieldsconfig.json')

    def __init__(self):
        super(FormadvStage, self).__init__('formadv')

    @classmethod
    def processfiles(cls, start=1, **kwds):
        """Normalize every stored FormADV row with id >= start."""
        advprsr = cls()
        advprsr.info("Starting at entry number {}".format(start))
        for formadv in db.FormADV.select():
            if formadv.id >= start:
                advprsr.info("Currently processing '{}'".format(
                    formadv.filename))
                advprsr.normfile(formadv)

    @staticmethod
    def get_number(df, field='numberofclients'):
        """Return a numeric rank series for *field*, merging the companion
        '<field>_specify' column where the main column says "specify"."""
        data = df[field].copy()
        mask = data.notnull()
        # Values matching re_NUMBERSPECIFY are placeholders ("see specify"),
        # blank them so the *_specify column can fill them below.
        data.loc[data.contains(re_NUMBERSPECIFY)] = np.nan
        __ = df.loc[mask, '{}_specify'.format(field)]
        return data.modify(mask, data.fillna(__)).quickmap(numericrank)

    @staticmethod
    def cleantext(text, key):
        """Strip the category prefix from *text* and title-case it;
        otherwise just collapse whitespace."""
        if text.startswith(key):
            return ' '.join(
                i.capitalize()
                for i in text.replace("{}_".format(key), '').split('_'))
        return to_single_space(text)

    @staticmethod
    def get_types(df):
        """Build per-category description frames plus a deduplicated
        'descriptions' list from the client/compensation/aum/disclosure
        columns of *df*."""
        categories = ['client_types', 'compensation', 'pct_aum', 'disclosures']
        fields = ['adviser', 'text', 'specific', 'percentage']
        typesmap = {'descriptions': []}
        for key in categories:
            data = df.filter(regex=key).stack().reset_index()
            if data.empty:
                continue
            __maps = {}
            for _key in (
                    'specify',
                    'other',
            ):
                # Mask of rows whose stacked column name is the category's
                # "(other_)specify" / "(other_)other" variant.
                __ = data.level_1.contains("{}_(?:other_)?{}$".format(
                    key, _key))
                __maps.update({
                    _key: {
                        'map': data.loc[__].get_mapper('level_0', 0),
                        'mask': __
                    }
                })
            mask_o = __maps['other']['mask']
            map_s = __maps['specify']['map']
            map_o = __maps['other']['map']
            descriptions = data.assign(
                text=data.level_1.modify(mask_o,
                                         data.level_0.map(map_s)).quickmap(
                                             FormadvStage.cleantext, key),  #
                specific=data.level_1.modify(mask_o, True, elsevalue=False),  #
                adviser=data.level_0.map(df.crd.to_dict()),  #
                percentage=data[0].quickmap(percentrank)  #
            ).ix[:, fields]
            qty = descriptions.percentage.to_numeric(force=True)
            # Disclosures are counts, everything else is a percentage.
            if key == 'disclosures':
                descriptions['number'] = qty
            else:
                descriptions['percentage'] = qty
            dropmask = (descriptions.text != 'Other Specify') & (qty != 0) & (qty.notnull())
            descriptions = descriptions.loc[dropmask].dropna()
            typesmap['descriptions']\
                .extend(descriptions
                        .ix[:, ['text', 'specific']]
                        .drop_duplicates(subset=['text'])
                        .to_dict(orient='records'))
            # NOTE(review): descriptions is already filtered by dropmask and
            # dropna above; the second application is redundant but kept to
            # preserve the original's exact behavior.
            typesmap.update({
                key: descriptions.loc[dropmask]
                     .dropna().rename(columns={'text': 'description'})
            })
        # Deduplicate the accumulated description dicts.
        typesmap['descriptions'] = [
            dict(t)
            for t in {tuple(d.items()) for d in typesmap['descriptions']}
        ]
        return typesmap

    @staticmethod
    def addnames(df):
        """Split contactperson into name-part columns, if the column exists."""
        if hasattr(df, 'contactperson'):
            df = pd.concat([df, df.contactperson.to_name()], axis=1)
        return df

    def normdf(self, df, formadv, **kwds):
        """Normalize *df*: zero-fill numeric fields, attach formadv metadata,
        derive client/employee counts, clean addresses, split names."""
        df = super(FormadvStage, self).normdf(df, **kwds)
        nflds = self.numeric_fields
        num = df[nflds].copy()
        # BUG FIX: the original tested `_num`, a name never bound, which
        # raised NameError here; the intended variable is `num`.
        if num.any(axis=1).any():  # these did not provide a value
            df[nflds] = num.fillna(0)
        return df.assign(
            formadv=formadv.id,
            adviser=df.crd,
            numberofclients=self.get_number(df),
            numberofemployees=self.get_number(df, field='numberofemployees'),
            date=formadv.date,
        ).clean_addresses().addnames()

    def writefile(self, df, outfile, **kwds):
        """Write *df* to CSV, retrying once with utf-8 on encoding errors.

        BUG FIX: the original's `while True` retried forever when the utf-8
        fallback itself raised UnicodeEncodeError; now the error propagates
        once the fallback has been tried.
        """
        while True:
            try:
                df.to_csv(outfile, index=False, **kwds)
                break
            except UnicodeEncodeError as e:
                self.error("Encoding troubles")
                self.error(e)
                if kwds.get('encoding') == 'utf-8':
                    raise  # fallback already attempted; don't loop forever
                kwds['encoding'] = 'utf-8'

    def normfile(self, formadv, dailyxml=False, **kwds):
        """Read one Form ADV file, normalize it, persist it, return the frame."""
        df = read_formadv(formadv.filename)
        df = self.normdf(df, formadv, **kwds)
        self.writefile(df, mkpath(preprocessed, formadv.outfile))
        return df
def get_dailyxml_path():
    """Absolute path of today's gzipped SEC IA firm feed inside xmlfolder."""
    feed_name = utcnow().strftime(r'IA_FIRM_SEC_Feed_%m_%d_%Y.xml.gz')
    return OSPath.abspath(mkpath(xmlfolder, feed_name))
def get_outfile(date):
    """Path of the preprocessed output CSV for *date* (``MMDDYY_output.csv``)."""
    filename = "{:%m%d%y}_output.csv".format(date)
    return mkpath(preprocessed_folder, filename)