Пример #1
0
import re
import sys
import lib

# Read the raw data file named on the command line.
# NOTE(review): file handle is never closed; prefer `with open(...)`.
f = open(sys.argv[1])
raw = f.read()

# Parse the raw text into data lines, then extract the English terms.
# (Exact semantics of get_dat/get_eterms live in the project-local `lib`.)
lines = lib.get_dat(raw)
list = lib.get_eterms(lines)  # NOTE(review): shadows builtin `list`

# Normalize terms to lowercase.
list = [entry.lower() for entry in list]

terms = [entry.strip() for entry in list]
# Flatten the per-term word lists into one word list.
words = lib.collapse([entry.split() for entry in list])

termset = set(terms)
wordset = set(words)

# Per-word counters: (total, single-word-term, term-initial, presumably
# term-medial, term-final) — the medial branch is truncated in this view.
worddict = dict([(el, (0,0,0,0,0)) for el in wordset])

for term in terms:
	term = term.split()
	for i in range(len(term)):
		# NOTE(review): rebinds `f` (the file handle) as a counter.
		f,a,b,c,d = worddict[term[i]]
		if len(term) == 1:
			# Word is itself a whole single-word term.
			worddict[term[i]] = f+1,a+1,b,c,d
		elif i == 0:
			# First word of a multi-word term.
			worddict[term[i]] = f+1,a,b+1,c,d
		elif i == (len(term)-1):
			# Last word of a multi-word term.
			worddict[term[i]] = f+1,a,b,c,d+1
		else:
Пример #2
0
import re
import sys
import lib

# Read the raw data file named on the command line; `with` closes the
# handle even on error (the original leaked it).
with open(sys.argv[1]) as infile:
    raw = infile.read()

# term -> frequency mapping extracted by the project-local helper.
terms = lib.get_freq(raw)

# Drop the empty-string term, if present (pop with default replaces the
# racy check-then-pop of the original).
terms.pop('', None)

# Every distinct word appearing in any term.
# (Renamed from `list`, which shadowed the builtin in the original.)
words = lib.collapse([term.split() for term in terms.keys()])
wordset = set(words)

# Each word inherits the frequency of every term it occurs in.
wfreq = dict.fromkeys(wordset, 0)
for term, freq in terms.items():
    for word in term.split():
        wfreq[word] += freq

# Emit words sorted by descending frequency, tab-separated.
# (Parenthesized print works identically under Python 2 and 3 here.)
for word, freq in sorted(wfreq.items(), key=lambda entry: entry[1], reverse=True):
    print(word + '\t' + str(freq))
Пример #3
0
# NOTE(review): chunk starts mid-script — `max_dist`, `args`, `lowercase`,
# `byword`, `sys`, `lib`, and `itertools` are bound earlier, outside this view.
sys.stderr.write("Max dist: " + str(max_dist) + "\n")

# Read input from stdin, or from the file named by the first positional arg.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    # NOTE(review): file handle is never closed; prefer `with open(...)`.
    f = open(args[0])
    raw = f.read()

# Keep only the third field of each parsed data line.
lines = lib.get_dat(raw)
lines = [line[2] for line in lines]

# Optional normalization: lowercase, then split entries into single words.
if lowercase:
    lines = [line.lower() for line in lines]

if byword:
    lines = lib.collapse([line.split() for line in lines])

# Deduplicate before the O(n^2) pairwise comparison below.
wordset = set(lines)

sys.stderr.write(str(len(wordset)) + "\n")

allcount = 0  # unordered pairs examined so far
scount = 0    # pairs with Levenshtein distance <= max_dist
for str1, str2 in itertools.combinations(wordset, 2):
    # Progress report to stderr every 10000 pairs.
    if allcount % 10000 == 0:
        sys.stderr.write("allcount: " + str(allcount) + "\n")
        sys.stderr.write("  scount: " + str(scount) + "\n")
    allcount += 1
    dist = lib.levenshtein(str1, str2)
    if dist <= max_dist:
        scount += 1
Пример #4
0
import re
import sys
import lib

# Read the raw data file named on the command line; `with` closes the
# handle even on error (the original leaked it, and later rebound `f`
# as a counter inside the loop).
with open(sys.argv[1]) as infile:
    raw = infile.read()

# term -> frequency mapping extracted by the project-local helper.
terms = lib.get_freq(raw)

# Drop the empty-string term, if present.
terms.pop('', None)

# Every distinct word appearing in any term.
wordset = set(lib.collapse([term.split() for term in terms.keys()]))

# Per-word counters, weighted by the containing term's frequency:
# (total, single-word term, term-initial, term-medial, term-final).
worddict = dict((el, (0, 0, 0, 0, 0)) for el in wordset)

for term, count in terms.items():
    words = term.split()
    last = len(words) - 1
    for i, word in enumerate(words):
        total, alone, first, middle, final = worddict[word]
        if last == 0:
            # Word is itself a whole single-word term.
            worddict[word] = total + count, alone + count, first, middle, final
        elif i == 0:
            # First word of a multi-word term.
            worddict[word] = total + count, alone, first + count, middle, final
        elif i == last:
            # Last word of a multi-word term.
            worddict[word] = total + count, alone, first, middle, final + count
        else:
            # Interior word of a multi-word term.
            worddict[word] = total + count, alone, first, middle + count, final

# Emit, sorted by descending total frequency, tab-separated.
# (Parenthesized print works identically under Python 2 and 3 here.)
for word, (total, alone, first, middle, final) in sorted(
        worddict.items(), key=lambda entry: entry[1][0], reverse=True):
    print(word + '\t' + str(total) + '\t' + str(alone) + '\t' + str(first)
          + '\t' + str(middle) + '\t' + str(final))
Пример #5
0
import re
import sys
import lib

# Read the raw data file named on the command line; `with` closes the
# handle even on error (the original leaked it).
with open(sys.argv[1]) as infile:
    raw = infile.read()

# Parse raw text into data lines, then extract the English terms.
lines = lib.get_dat(raw)
entries = lib.get_eterms(lines)  # renamed from `list` (shadowed the builtin)

# Lowercase each term and flatten into a single word list.
words = lib.collapse([entry.lower().split() for entry in entries])

# Count occurrences of each word. dict.get with a default replaces the
# original's separate set() pass + zero-initialized dict.
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Emit words sorted by descending count, tab-separated.
# (Parenthesized print works identically under Python 2 and 3 here.)
for word, count in sorted(counts.items(), key=lambda entry: entry[1], reverse=True):
    print(word + '\t' + str(count))
Пример #6
0
# NOTE(review): chunk starts mid-script — `max_dist`, `args`, `lowercase`,
# `byword`, `sys`, `lib`, and `itertools` are bound earlier, outside this view.
sys.stderr.write("Max dist: " + str(max_dist) + "\n")

# Read input from stdin, or from the file named by the first positional arg.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    # NOTE(review): file handle is never closed; prefer `with open(...)`.
    f = open(args[0])
    raw = f.read()

# Keep only the third field of each parsed data line.
lines = lib.get_dat(raw)
lines = [line[2] for line in lines]

# Optional normalization: lowercase, then split entries into single words.
if lowercase:
    lines = [line.lower() for line in lines]

if byword:
    lines = lib.collapse([line.split() for line in lines])

# Deduplicate before the O(n^2) pairwise comparison below.
wordset = set(lines)

sys.stderr.write(str(len(wordset)) + "\n")

allcount = 0  # unordered pairs examined so far
scount = 0    # pairs with Levenshtein distance <= max_dist
for str1, str2 in itertools.combinations(wordset, 2):
    # Progress report to stderr every 10000 pairs.
    if allcount%10000 == 0:
        sys.stderr.write("allcount: " + str(allcount) + "\n")
        sys.stderr.write("  scount: " + str(scount) + "\n")
    allcount += 1
    dist = lib.levenshtein(str1, str2)
    if dist <= max_dist:
        scount += 1
Пример #7
0
# NOTE(review): chunk starts mid-script — `lines`, `PunktSentenceTokenizer`,
# `word_tokenize`, `pos_tag`, `tag`, `re`, `sys`, and `lib` are bound earlier,
# outside this view (the tokenizer/tagger names look like NLTK — confirm).
sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()

# Simplified POS tags of interest; presumably consumed further below,
# past the truncated end of this view.
taking_pos = set(["ADJ", "ADV", "FW", "N", "NP", "NUM", "VG", "VN"])

for i in range(len(lines)):
    # Progress report to stderr every 100 entries.
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    # Only process entries having both English keywords and an abstract.
    if "EKYWD" in line and "EABST" in line:
        # Sentence-split the abstract, word-tokenize each sentence, flatten
        # to one token list, then POS-tag it with simplified WSJ tags.
        abstract = line["EABST"]
        abstract = p.tokenize(abstract)
        abstract = [word_tokenize(sent) for sent in abstract]
        abstract = lib.collapse(abstract)
        pos_abstract = pos_tag(abstract)
        pos_abstract = [(word, tag.simplify.simplify_wsj_tag(t)) for word, t in pos_abstract]
        # Keywords are tab-separated; tokenize each keyword phrase.
        keywords = re.split("\t", line["EKYWD"])
        keywords = [word_tokenize(keyword) for keyword in keywords]
        # Scan the abstract for token-level matches of any keyword phrase.
        j = 0
        while j < len(abstract):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                # Match when the next keyword_len tokens equal the keyword.
                if keyword_len > 0 and keyword == abstract[j:j+keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        this_pos = pos_abstract[j+l][1]
                        # NOTE(review): loop body truncated in this view.
                        out = ""
Пример #8
0
import re
import sys
import lib

# Read the raw data file named on the command line.
# NOTE(review): file handle is never closed; prefer `with open(...)`.
f = open(sys.argv[1])
raw = f.read()

# term -> frequency mapping extracted by the project-local helper.
terms = lib.get_freq(raw)

# Drop the empty-string term, if present.
if '' in terms: terms.pop('')

# Every distinct word appearing in any term.
wordset = set(lib.collapse([term.split() for term in terms.keys()]))

# Per-word counters, weighted by the containing term's frequency:
# (total, single-word term, term-initial, term-medial, term-final).
worddict = dict([(el, (0, 0, 0, 0, 0)) for el in wordset])

for term, count in terms.items():
    term = term.split()
    for i in range(len(term)):
        # NOTE(review): rebinds `f` (the file handle) as a counter.
        f, a, b, c, d = worddict[term[i]]
        if len(term) == 1:
            # Word is itself a whole single-word term.
            worddict[term[i]] = f + count, a + count, b, c, d
        elif i == 0:
            # First word of a multi-word term.
            worddict[term[i]] = f + count, a, b + count, c, d
        elif i == (len(term) - 1):
            # Last word of a multi-word term.
            worddict[term[i]] = f + count, a, b, c, d + count
        else:
            # Interior word of a multi-word term.
            worddict[term[i]] = f + count, a, b, c + count, d

# Emit sorted by descending total frequency.
# NOTE(review): loop body is truncated in this view (output line missing).
for word, (f, a, b, c, d) in sorted(worddict.items(),
                                    key=lambda entry: entry[1][0],
                                    reverse=True):
Пример #9
0
# NOTE(review): chunk starts mid-script — `lines`, `PunktSentenceTokenizer`,
# `word_tokenize`, `pos_tag`, `tag`, `re`, `sys`, and `lib` are bound earlier,
# outside this view (the tokenizer/tagger names look like NLTK — confirm).
sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()

# Simplified POS tags of interest; presumably consumed further below,
# past the truncated end of this view.
taking_pos = set(["ADJ", "ADV", "FW", "N", "NP", "NUM", "VG", "VN"])

for i in range(len(lines)):
    # Progress report to stderr every 100 entries.
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    # Only process entries having both English keywords and an abstract.
    if "EKYWD" in line and "EABST" in line:
        # Sentence-split the abstract, word-tokenize each sentence, flatten
        # to one token list, then POS-tag it with simplified WSJ tags.
        abstract = line["EABST"]
        abstract = p.tokenize(abstract)
        abstract = [word_tokenize(sent) for sent in abstract]
        abstract = lib.collapse(abstract)
        pos_abstract = pos_tag(abstract)
        pos_abstract = [(word, tag.simplify.simplify_wsj_tag(t))
                        for word, t in pos_abstract]
        # Keywords are tab-separated; tokenize each keyword phrase.
        keywords = re.split("\t", line["EKYWD"])
        keywords = [word_tokenize(keyword) for keyword in keywords]
        # Scan the abstract for token-level matches of any keyword phrase.
        j = 0
        while j < len(abstract):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                # Match when the next keyword_len tokens equal the keyword.
                if keyword_len > 0 and keyword == abstract[j:j + keyword_len]:
                    # NOTE(review): loop body truncated in this view.
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        this_pos = pos_abstract[j + l][1]