def make_trinomial_instances_df(doc_dir):
    """Collect candidate site trinomials from the ``.txt`` files under *doc_dir*.

    Every file reached by ``os.walk(doc_dir)`` whose name ends in ``.txt`` is
    read in full and searched for tokens made of one or two digits, two or
    more capital letters, and one or more digits. Each distinct token in a
    file is split with ``TrinomialManage.parse_trinomial``; tokens that start
    with ``0`` or whose state number is outside 1-50 are discarded.

    A progress line is printed for every accepted token.

    Args:
        doc_dir: Root directory to search recursively.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename`` (base file name,
        not the full path), ``pos_trinomial``, ``state_num``, ``region_abbr``
        and ``site_number``; one row per distinct accepted token per file.
        Within a file, rows are ordered by the token's string sort order.
    """
    tri_man = TrinomialManage()
    # Parser option set up front; its precise effect is defined by
    # TrinomialManage, which is not visible from this function.
    tri_man.remove_prepended_zeros = True

    columns = [
        'filename',
        'pos_trinomial',
        'state_num',
        'region_abbr',
        'site_number',
    ]

    # Compiled once instead of once per file. Both capture groups span the
    # same text, so the whole match (group 0) is the candidate token.
    candidate_re = re.compile(r'(\b([0-9]{1,2}[A-Z]{2,}[0-9]{1,})\b)')

    # Rows are accumulated in a plain list and turned into a frame once at
    # the end; growing a DataFrame with ``df.loc[i] = ...`` copies the frame
    # on every insert and becomes quadratic on large corpora.
    rows = []
    for subdir, _dirs, files in os.walk(doc_dir):
        for filename in files:
            if not filename.endswith('.txt'):
                continue

            filepath = os.path.join(subdir, filename)
            # NOTE(review): the file is decoded with the platform default
            # encoding; confirm that matches how the corpus was written.
            with open(filepath, 'r') as file_obj:
                content = file_obj.read()

            candidates = {
                match.group(0) for match in candidate_re.finditer(content)
            }
            # Sorted so the rows for a file come out in the same order on
            # every run; bare set iteration order depends on the hash seed.
            for trinomial in sorted(candidates):
                if trinomial.startswith('0'):
                    # not a trinomial
                    continue

                tri_parts = tri_man.parse_trinomial(trinomial)
                state = int(tri_parts['state'])
                if state < 1 or state > 50:
                    # not a state, skip
                    continue

                rows.append([
                    filename,
                    trinomial,
                    state,
                    tri_parts['county'],
                    tri_parts['site'],
                ])
                # len(rows) is the 1-based running count of accepted tokens.
                print('[{}] Found {} in {} ({}, {}, {})'.format(
                    len(rows),
                    trinomial,
                    filename,
                    state,
                    tri_parts['county'],
                    tri_parts['site'],
                ))

    return pd.DataFrame(rows, columns=columns)