Exemplo n.º 1
0
        so_far = (sortpl(so_far))[:k]

    return so_far

# argparse ``type=`` callback: accept only natural numbers (integers >= 0).
def nat(string):
    """Convert a command-line argument into a natural number.

    Returns ``int(string)`` when the result is zero or positive.

    Raises ``argparse.ArgumentTypeError`` when the parsed integer is
    negative.  Text that ``int()`` cannot parse is not handled here: the
    ``ValueError`` from ``int()`` propagates to the caller.
    """
    number = int(string)
    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("%r is not a natural number" % string)

## describe and parse arguments from the command line
# The description is assembled from adjacent pieces with "+", which inserts
# no separator, so every piece except the last must end in a space.
parser = argparse.ArgumentParser(description="Count the n most-used words in "
                                 + "a corpus of English text.")
# baseargs is defined outside this excerpt; presumably it registers the
# shared options (pdf, gutenberg, filename, ...) read below -- TODO confirm.
baseargs(parser)
parser.add_argument("-n", "--number",
                    help="number of most frequently used words to compute. "
                         + "defaults to 4.",
                    type=nat,      # rejects negative values while parsing
                    default=4)
args = parser.parse_args()
# opentext is defined outside this excerpt; its result is iterated line by
# line further down.
txtsrc = opentext(args.pdf, args.gutenberg, args.filename)

# traverse the whole file, adding canonical forms of valid words into a
# dictionary counting the number of appearances.
# NOTE(review): only the dash clean-up step of the loop body is visible in
# this excerpt; the code that fills d is assumed to follow -- confirm.
d = dict()
for line in txtsrc:
    # Replace ASCII em dashes ("---") and en dashes ("--") with a single
    # space.  The longer sequence is handled first; otherwise "---" would be
    # turned into a space followed by a stray "-".
    line = (line.replace("---", " ")).replace("--", " ")
Exemplo n.º 2
0
    print "looking for adjectives near nouns requires that you install nltk"
    exit(1)

# argparse ``type=`` callback: accept only words that nltk tags as a noun.
def noun(string):
    """Return ``string`` unchanged if nltk tags it as a noun.

    The word is tagged on its own with ``nltk.pos_tag``; only the tags
    ``'NN'`` and ``'NNP'`` are accepted, every other tag is rejected.

    Raises ``argparse.ArgumentTypeError`` when the tag is neither of those.
    """
    pos = nltk.pos_tag([string])[0][1]
    if pos in ('NN', 'NNP'):
        return string
    raise argparse.ArgumentTypeError(
        "%r is not a noun according to nltk" % string)

# Build the command-line interface, then parse the arguments.
parser = argparse.ArgumentParser(
    description=(
        "Count the number of times a "
        "noun appears after an adjective in a "
        "corpus of English text."
    )
)
# baseargs is defined outside this excerpt; the second argument is passed
# through to it unchanged.
baseargs(parser, "nicer output if you have progressbar installed.")
# The positional argument is validated by noun() while parsing.
parser.add_argument("noun", type=noun, help="the noun for which to search")
args = parser.parse_args()

# opentext is defined outside this excerpt; its result is iterated line by
# line below.
txtsrc = opentext(args.pdf, args.gutenberg, args.filename)

# dump the corpus into one lower-cased string so nltk can tokenize it.
# str.join assembles the result in a single pass; appending with "+" inside
# a loop may re-copy the growing string on every line.
corpus = "".join(line.lower() for line in txtsrc)
# Only the plain-file case is closed explicitly.
# NOTE(review): presumably the pdf/gutenberg sources returned by opentext
# have no close() or are cleaned up elsewhere -- confirm.
if not (args.pdf or args.gutenberg):
    txtsrc.close()

if args.verbose:
    # Parenthesised single-argument form: same output under Python 2's print
    # statement, and also valid as a Python 3 function call.
    print("read in text")