def parsed_grant(grant_tuple):
    """Parse one raw grant entry into a row dictionary.

    Args:
        grant_tuple: A ``(program, sub_area, grant)`` triple where ``grant``
            is the raw text of a single grant entry.

    Returns:
        A dict with the keys ``year``, ``program``, ``sub_area``, ``grantee``,
        ``grantee_location``, ``same_year_awards``, ``duration``,
        ``support_type`` and ``purpose``. Text fields that cannot be extracted
        are left as ``""`` and the amount defaults to ``0``.

    Raises:
        AssertionError: If a non-empty location was extracted but does not
            look like ``"City, ST"``.
    """
    program, sub_area, grant = grant_tuple

    # Every extracted field gets a default up front so the function still
    # returns a complete row when the entry has no "Grantee (Location)" prefix.
    grantee = ""
    location = ""
    duration = ""
    support_type = ""
    purpose = ""
    amount = 0

    # "Grantee name (City, ST)" at the very start of the entry.
    m = re.match(r"([^()]+)\(([^)]+)\)", grant)
    if m:
        grantee = title_cased(util.cleaned(m.group(1)))
        location = title_cased(util.cleaned(m.group(2)))
        assert re.match(r"[A-Za-z -]+, [A-Z][A-Z]", location) or not location
        # The middle part is whatever sits between the "Grantee (Location)"
        # prefix and the ". . ." leader.
        # NOTE(review): if ". . ." is absent, find() returns -1 and the slice
        # drops the final character instead -- confirm that is acceptable.
        (duration, support_type, purpose) = parsed_middle_part(
            grant[len(m.group(0)):grant.find(". . .")].strip())

    # Dollar amount with optional thousands separators at the end of the entry.
    m2 = re.search(r"\$((\d{1,3},)?\d{1,3},)?\d{1,3}$", grant)
    if m2:
        amount = int(m2.group(0).strip().replace("$", "").replace(",", ""))

    return {
        "year": 2001,
        "program": program,
        "sub_area": sub_area,
        "grantee": grantee,
        "grantee_location": location,
        "same_year_awards": amount,
        "duration": duration,
        "support_type": support_type,
        "purpose": purpose
    }
예제 #2
0
def main():
    """Write one CSV row to stdout for every grant row in every input file.

    Each path in ``FILE_PATHS`` is parsed as HTML; the four-digit year is read
    from the characters right after the ``data/`` prefix of the path. Every
    ``<table>`` contributes its rows, labelled with the sub-area returned by
    ``sub_area(table)``. Rows whose first cell is ``RECIPIENT`` are skipped.

    Raises:
        ValueError: If the italic text in the first cell of a row does not
            look like a location.
        AssertionError: If a table has an unexpected number of columns for its
            year, or an amount cell cannot be parsed.
    """
    writer = csv.DictWriter(sys.stdout, fieldnames=util.fieldnames)

    for fp in FILE_PATHS:
        with open(fp, "r") as f:
            soup = BeautifulSoup(f, "lxml")
            year = int(fp[len("data/"):len("data/YYYY")])

            # The amount columns (starting at the third cell) depend on the
            # report year; other years carry no amount fields.
            if year in [1991, 1992]:
                amount_keys = ("same_year_awards",
                               "same_year_payments",
                               "same_year_eoy_grants_payable")
            elif year in [1993, 1994]:
                amount_keys = ("same_year_awards", "same_year_payments")
            else:
                amount_keys = ()

            for table in soup.find_all("table"):
                trs = table.find_all("tr")
                num_col = len(trs[1].find_all("td"))
                if year in [1991, 1992]:
                    assert num_col == 5, (fp, num_col)
                if year in [1993, 1994]:
                    assert num_col == 4, (fp, num_col)
                label = sub_area(table)
                for tr in trs:
                    cols = tr.find_all("td")
                    if util.cleaned(cols[0]) == "RECIPIENT":
                        continue
                    d = {
                        "url": SOURCE[fp],
                        "program": program_name(fp),
                        "year": year,
                        "sub_area": label,
                    }

                    # The location, when present, is the <i> element inside
                    # the first cell. Rows without one simply carry no
                    # location. Removing the tag leaves only the grantee name
                    # in the cell text read below.
                    italic = cols[0].i
                    if italic is not None:
                        loc = util.cleaned(italic.extract().text)
                        # Explicit check rather than assert so the validation
                        # still runs under ``python -O``.
                        if not re.match(r"[A-Za-z. -]+, [A-Za-z ]+", loc):
                            raise ValueError("Does not look like a location", loc, fp)
                        d["grantee_location"] = loc

                    d["grantee"] = util.cleaned(cols[0].text)
                    d["purpose"] = util.cleaned(cols[1].text)

                    # Parse each amount cell once; amount() reports
                    # unparseable values on stderr and returns None.
                    for col_index, key in enumerate(amount_keys, start=2):
                        value = amount(cols[col_index].text)
                        assert value is not None, fp
                        d[key] = value

                    writer.writerow(d)
예제 #3
0
def run(input, output, f):
    """Clean the paper titles of a tab-separated dataset and save the result.

    Args:
        input: Path (or buffer) of the tab-separated input read by
            ``pd.read_csv``; it is read without a header row.
        output: Destination path for the cleaned tab-separated file.
        f: Path of a pickled collection of words to keep. If it cannot be
            loaded, a new word set is built from the titles (words occurring
            at least 5 times) and pickled to ``filter_word.pkl``.
    """
    dataset = pd.read_csv(
        input,
        sep='\t',
        names=['Paper_Id', 'Paper_title', 'Publication_venue',
               'Cited_Papers', 'Cited_Papers_Venues'],
        quoting=csv.QUOTE_NONE)
    titles = dataset['Paper_title'].values

    pre_clean_sentences, word_list = pre_clean(titles)
    print("Done 1.lowercase 2.Remove criteria 3.Tokenize...")

    try:
        print("Loading pre-define", f, "...")
        # NOTE(review): unpickling executes arbitrary code; only point ``f``
        # at files produced by this script or another trusted source.
        with open(f, 'rb') as fh:
            filter_word = pkl.load(fh)
    except Exception:
        # Any failure to load (missing file, f is None, corrupt pickle) falls
        # back to regenerating the list. KeyboardInterrupt/SystemExit are
        # deliberately not caught.
        print("Didn't load pre-define, generate new...")
        freq = nltk.FreqDist(word_list)
        # Filter out low-frequency words.
        filter_word = {word for word, count in freq.items() if count >= 5}
        # NOTE(review): the regenerated set is written to a fixed file name,
        # not to ``f`` -- confirm this is intended.
        with open("filter_word.pkl", 'wb') as fh:
            pkl.dump(filter_word, fh)
        print("From {0} words reduce to {1} words by filtering freq out < 5.".format(len(freq), len(filter_word)))

    cleaned_sentences = cleaned(pre_clean_sentences, filter_word)
    print("Done 4.filter out low freq...")

    dataset['Paper_title'] = cleaned_sentences
    print("Saving", output, "...")
    dataset.to_csv(output, sep='\t', index=False, header=False)
def parsed_middle_part(middle_part):
    """Split the middle of a grant entry into its components.

    A trailing ``"<number> year(s)/month(s)"`` phrase, if present, becomes the
    duration and is cut off. In the remainder, the text from the first
    ``To``/``For``/``Multi`` onwards becomes the purpose, and an optional
    preceding chunk ending in a newline becomes the support type.

    Returns:
        A ``(duration, support_type, purpose)`` tuple; parts that are not
        found stay ``""``.
    """
    duration_match = re.search(r"\d+[ ](?:year|years|month|months)$", middle_part)
    if duration_match is None:
        duration = ""
    else:
        duration = util.cleaned(duration_match.group(0))
        # Drop the duration phrase so it does not leak into the purpose.
        middle_part = middle_part[:duration_match.start(0)]

    body_match = re.search(r"""(?:(.+)\n)?
                      ((?:To|For|Multi).+)""",
                           middle_part,
                           flags=re.DOTALL | re.VERBOSE | re.MULTILINE)
    if body_match is None:
        return (duration, "", "")

    support_type = util.cleaned(body_match.group(1))
    purpose = util.cleaned(body_match.group(2))
    return (duration, support_type, purpose)
예제 #5
0
def sub_area(table):
    """Return the cleaned sub-area label that precedes ``table``.

    The label is expected to be the nearest preceding sibling that is not a
    plain text node, and it must be a ``<b>`` element.
    """
    node = table.previous_sibling

    # Step backwards over bare text nodes until something else is reached.
    while True:
        if not isinstance(node, bs4.element.NavigableString):
            break
        node = node.previous_sibling

    assert node.name == "b"
    return util.cleaned(node)
예제 #6
0
def amount(x):
    """Convert the text of an amount cell into an integer.

    ``$`` and ``*`` characters are removed after cleaning. An empty result
    counts as ``0``; a number wrapped in parentheses is returned as a
    negative value. Anything else is echoed to stderr and ``None`` is
    returned.
    """
    text = util.cleaned(x).replace("$", "").replace("*", "")
    if not text:
        return 0

    without_commas = text.replace(",", "")

    # Plain amount such as "1,234,567".
    if re.match(r"^((\d{1,3},)?\d{1,3},)?\d{1,3}$", text):
        return int(without_commas)

    # Parenthesised amount such as "(1,234)".
    if re.match(r"^\(((\d{1,3},)?\d{1,3},)?\d{1,3}\)$", text):
        return -int(without_commas.replace("(", "").replace(")", ""))

    print(text, file=sys.stderr)
    return None
예제 #7
0
def write_grant(grant, sub_area, file_path, writer):
    """Write a single grant as one row through ``writer``.

    ``grant`` is a ``(purpose, grantee, location_amount)`` triple. Nothing is
    written for an empty placeholder triple or for a triple that only holds
    an ``h2`` heading. A triple without a grantee is reported on stderr and
    skipped as well.
    """
    purpose, grantee, location_amount = grant
    year = int(file_path[len("data/"):len("data/YYYY")])

    # Empty placeholder entry.
    if grant == (['\n'], None, []):
        return

    # Heading-only entry: an h2 among the purpose nodes and nothing else.
    contains_heading = any(node.name == "h2" for node in purpose)
    if contains_heading and grantee is None and location_amount == []:
        return

    if not grantee:
        print(file_path, grant, file=sys.stderr)
        return

    la_str = " ".join(util.cleaned(piece) for piece in location_amount)
    loc = find_location(la_str)
    if loc:
        # Either "City, ST" or a bare two-letter code.
        assert (re.match(r"[A-Za-z -]+, [A-Z][A-Z]", loc)
                or re.match(r"[A-Z][A-Z]", loc))

    writer.writerow(dict(
        program=program_name(file_path),
        sub_area=sub_area,
        year=year,
        url=SOURCE[file_path],
        purpose=" ".join(util.cleaned(piece) for piece in purpose),
        notes=find_extra(la_str),
        grantee_location=loc,
        same_year_awards=first_dollar(la_str),
        grantee=util.cleaned(grantee.text),
    ))
예제 #8
0
def find_sub_area(elem):
    """Return the cleaned nearest ``h2`` at or before ``elem`` among siblings.

    The search starts at ``elem`` itself and follows ``previous_sibling``
    links. If the siblings run out, ``None`` is handed to ``util.cleaned``.
    """
    node = elem
    while not (node is None or node.name == "h2"):
        node = node.previous_sibling
    return util.cleaned(node)