def parsed_grant(grant_tuple):
    """Parse one raw grant entry into a CSV row dict (2001 report).

    Args:
        grant_tuple: ``(program, sub_area, grant)`` where ``grant`` is the
            raw entry text, typically "Grantee (City, ST) ... $1,234".

    Returns:
        dict with year, program, sub_area, grantee, grantee_location,
        same_year_awards, duration, support_type and purpose keys.
        Fields that cannot be parsed are left as "" / 0.
    """
    program, sub_area, grant = grant_tuple
    # Leading "Grantee Name (Location)" prefix of the entry.
    m = re.match(r"([^()]+)\(([^)]+)\)", grant)
    # Trailing dollar amount, e.g. "$1,234,567".
    m2 = re.search(r"\$((\d{1,3},)?\d{1,3},)?\d{1,3}$", grant)
    grantee = ""
    location = ""
    duration = ""
    # BUG FIX: support_type and purpose were previously bound only inside
    # the `if m:` branch, so an entry without a "Name (Loc)" prefix raised
    # NameError when building the return dict. Default them here.
    support_type = ""
    purpose = ""
    amount = 0
    if m:
        grantee = title_cased(util.cleaned(m.group(1)))
        location = title_cased(util.cleaned(m.group(2)))
        # Locations are expected to look like "City Name, ST" (or be empty).
        assert re.match(r"[A-Za-z -]+, [A-Z][A-Z]", location) or not location
        # Middle part sits between the "(Location)" and the ". . ." leader.
        (duration, support_type, purpose) = parsed_middle_part(
            grant[len(m.group(0)):grant.find(". . .")].strip())
    if m2:
        amount = int(m2.group(0).strip().replace("$", "").replace(",", ""))
    return {
        "year": 2001,  # this parser handles the 2001 report only
        "program": program,
        "sub_area": sub_area,
        "grantee": grantee,
        "grantee_location": location,
        "same_year_awards": amount,
        "duration": duration,
        "support_type": support_type,
        "purpose": purpose,
    }
def main():
    """Convert the scraped grant HTML tables to CSV rows on stdout.

    Reads each file in FILE_PATHS (paths shaped like "data/YYYY..."),
    parses every <table>, and writes one row per grantee. Table layout
    varies by year: 1991-92 have three money columns (awards, payments,
    end-of-year grants payable); 1993-94 have two (awards, payments).
    """
    writer = csv.DictWriter(sys.stdout, fieldnames=util.fieldnames)
    for fp in FILE_PATHS:
        with open(fp, "r") as f:
            soup = BeautifulSoup(f, "lxml")
            # Year is encoded in the path right after the "data/" prefix.
            year = int(fp[len("data/"):len("data/YYYY")])
            for table in soup.find_all("table"):
                trs = table.find_all("tr")
                # Sanity-check the column count against the year's layout.
                num_col = len(trs[1].find_all("td"))
                if year in [1991, 1992]:
                    assert num_col == 5, (fp, num_col)
                if year in [1993, 1994]:
                    assert num_col == 4, (fp, num_col)
                label = sub_area(table)
                for tr in trs:
                    cols = tr.find_all("td")
                    if util.cleaned(cols[0]) == "RECIPIENT":
                        continue  # header row
                    d = {
                        "url": SOURCE[fp],
                        "program": program_name(fp),
                        "year": year,
                        "sub_area": label,
                    }
                    try:
                        # Location is the italicized part of the first cell;
                        # extract() removes it so only the grantee remains.
                        loc = util.cleaned(cols[0].i.extract().text)
                        assert re.match(r"[A-Za-z. -]+, [A-Za-z ]+", loc)
                        d["grantee_location"] = loc
                    except AssertionError:
                        raise ValueError("Does not look like a location", loc, fp)
                    except AttributeError:
                        # BUG FIX: was a bare `except: pass`, which silently
                        # swallowed every error. Only the "no <i> tag in this
                        # cell" case (cols[0].i is None) is expected here.
                        pass
                    d["grantee"] = util.cleaned(cols[0].text)
                    d["purpose"] = util.cleaned(cols[1].text)
                    if year in [1991, 1992]:
                        # Hoisted: parse each money cell once, then verify.
                        awards = amount(cols[2].text)
                        payments = amount(cols[3].text)
                        payable = amount(cols[4].text)
                        assert awards is not None, fp
                        assert payments is not None, fp
                        assert payable is not None, fp
                        d["same_year_awards"] = awards
                        d["same_year_payments"] = payments
                        d["same_year_eoy_grants_payable"] = payable
                    if year in [1993, 1994]:
                        awards = amount(cols[2].text)
                        payments = amount(cols[3].text)
                        assert awards is not None, fp
                        assert payments is not None, fp
                        d["same_year_awards"] = awards
                        d["same_year_payments"] = payments
                    writer.writerow(d)
def run(input, output, f):
    """Clean the Paper_title column of a TSV citation dataset.

    Args:
        input: path to a tab-separated file with columns Paper_Id,
            Paper_title, Publication_venue, Cited_Papers,
            Cited_Papers_Venues (no header).
        output: destination path for the cleaned TSV (no header/index).
        f: path to a pickled vocabulary of allowed (frequent) words;
            rebuilt from the corpus and saved to "filter_word.pkl" if it
            cannot be loaded.
    """
    dataset = pd.read_csv(
        input, sep='\t',
        names=['Paper_Id', 'Paper_title', 'Publication_venue',
               'Cited_Papers', 'Cited_Papers_Venues'],
        quoting=csv.QUOTE_NONE)
    titles = dataset['Paper_title'].values
    pre_clean_sentences, word_list = pre_clean(titles)
    print("Done 1.lowercase 2.Remove criteria 3.Tokenize...")
    try:
        print("Loading pre-define", f, "...")
        # BUG FIX: previously `pkl.load(open(f, 'rb'))` leaked the handle
        # and a bare `except:` masked every failure (even KeyboardInterrupt).
        with open(f, 'rb') as vocab_file:
            filter_word = pkl.load(vocab_file)
    except (OSError, EOFError, pkl.UnpicklingError):
        print("Didn't load pre-define, generate new...")
        freq = nltk.FreqDist(word_list)
        # Keep only words that occur at least 5 times in the corpus.
        filter_word = {word for word, count in freq.items() if count >= 5}
        with open("filter_word.pkl", 'wb') as vocab_file:
            pkl.dump(filter_word, vocab_file)
        print("From {0} words reduce to {1} words by filtering freq out < 5.".format(len(freq), len(filter_word)))
    cleaned_sentences = cleaned(pre_clean_sentences, filter_word)
    print("Done 4.filter out low freq...")
    dataset['Paper_title'] = cleaned_sentences
    print("Saving", output, "...")
    dataset.to_csv(output, sep='\t', index=False, header=False)
def parsed_middle_part(middle_part):
    """Split a grant entry's middle text into (duration, support_type, purpose).

    A trailing "N year(s)"/"N month(s)" phrase, when present, is peeled off
    as the duration. Whatever remains is matched as an optional leading
    support-type line followed by the purpose, which starts with "To",
    "For", or "Multi". Unmatched pieces come back as "".
    """
    duration = ""
    dur_match = re.search(r"\d+[ ](?:year|years|month|months)$", middle_part)
    if dur_match:
        duration = util.cleaned(dur_match.group(0))
        middle_part = middle_part[:dur_match.start(0)]
    body_match = re.search(r"""(?:(.+)\n)?
          ((?:To|For|Multi).+)""",
                           middle_part,
                           flags=re.DOTALL | re.VERBOSE | re.MULTILINE)
    if not body_match:
        return (duration, "", "")
    return (duration,
            util.cleaned(body_match.group(1)),
            util.cleaned(body_match.group(2)))
def sub_area(table):
    """Return the grant sub-area label for *table*.

    The label is the bold (<b>) element that sits immediately above the
    table in the document, separated only by bare text nodes.
    """
    node = table.previous_sibling
    # Walk backwards past text-only siblings until we reach an actual tag.
    while isinstance(node, bs4.element.NavigableString):
        node = node.previous_sibling
    assert node.name == "b"
    return util.cleaned(node)
def amount(x):
    """Parse a dollar-amount cell into an int.

    "$" and "*" are stripped first. An empty cell yields 0; "1,234,567"
    yields the integer; a parenthesized value "(1,234)" is an accounting
    negative and yields -1234. Anything else is echoed to stderr and
    yields None so the caller can flag it.
    """
    text = util.cleaned(x).replace("$", "").replace("*", "")
    if not text:
        return 0
    if re.match(r"^((\d{1,3},)?\d{1,3},)?\d{1,3}$", text):
        return int(text.replace(",", ""))
    if re.match(r"^\(((\d{1,3},)?\d{1,3},)?\d{1,3}\)$", text):
        stripped = text.replace("(", "").replace(")", "").replace(",", "")
        return -int(stripped)
    print(text, file=sys.stderr)
    return None
def write_grant(grant, sub_area, file_path, writer):
    """Write one parsed grant tuple to *writer* as a CSV row.

    ``grant`` is ``(purpose, grantee, location_amount)``: a list of purpose
    elements, a grantee element (or None), and a list of location/amount
    elements. Rows that are blank, that are section headings, or that lack
    a grantee are skipped (the last case is logged to stderr).
    """
    purpose, grantee, location_amount = grant
    # Year is encoded in the path right after the "data/" prefix.
    year = int(file_path[len("data/"):len("data/YYYY")])
    if grant == (['\n'], None, []):
        # Completely empty entry — nothing to write.
        pass
    elif (any(map(lambda x: x.name == "h2", grant[0])) and
          grant[1] is None and grant[2] == []):
        # A section heading (<h2>) masquerading as a grant — skip it.
        pass
    elif not grantee:
        # Unexpected shape: log for manual inspection, don't emit a row.
        print(file_path, grant, file=sys.stderr)
    else:
        # Flatten the location/amount fragments into one cleaned string,
        # then pull the location, notes, and first dollar figure out of it.
        la_str = " ".join(map(util.cleaned, location_amount))
        loc = find_location(la_str)
        # Accept "City Name, ST", a bare "ST" state code, or no location.
        assert not loc or re.match(r"[A-Za-z -]+, [A-Z][A-Z]", loc) or re.match(r"[A-Z][A-Z]", loc)
        writer.writerow({"program": program_name(file_path),
                         "sub_area": sub_area,
                         "year": year,
                         "url": SOURCE[file_path],
                         "purpose": " ".join(map(util.cleaned, purpose)),
                         "notes": find_extra(la_str),
                         "grantee_location": loc,
                         "same_year_awards": first_dollar(la_str),
                         "grantee": util.cleaned(grantee.text)})
def find_sub_area(elem):
    """Return the nearest <h2> heading that precedes *elem* among its siblings.

    NOTE(review): if no <h2> precedes *elem*, the walk ends with None and
    None is passed to util.cleaned — confirm that helper tolerates it.
    """
    node = elem
    while not (node is None or node.name == "h2"):
        node = node.previous_sibling
    return util.cleaned(node)