Example #1
def load_bnc_word_freqs():
#	filename = "/home/rob/git/thinklink/reference/bnc_corpus_all.num.o5.txt"
	docfreqs = {}
	for line in localfile("data/bnc_corpus_all.num.o5.txt"):
		# each BNC .num line: total frequency, word, POS tag, document frequency
		termfreq, term, pos, docfreq = line.strip().split(" ")
		if term not in docfreqs:	# TODO: not sure if this is correct
			docfreqs[term] = int(docfreq)
	return docfreqs
Example #2
def load_bnc_word_freqs():
    #	filename = "/home/rob/git/thinklink/reference/bnc_corpus_all.num.o5.txt"
    docfreqs = {}
    for line in localfile("data/bnc_corpus_all.num.o5.txt"):
        # each BNC .num line: total frequency, word, POS tag, document frequency
        termfreq, term, pos, docfreq = line.strip().split(" ")
        if term not in docfreqs:  # TODO: not sure if this is correct
            docfreqs[term] = int(docfreq)
    return docfreqs
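For context, a document-frequency table like the one returned above typically feeds an IDF weighting. A minimal sketch of that use, assuming the table from load_bnc_word_freqs(); num_docs is an illustrative placeholder, not a value taken from the code above:

import math

def idf(term, docfreqs, num_docs=4054):
    # Inverse document frequency: terms appearing in few documents score
    # high, ubiquitous terms score near zero. num_docs is an assumed
    # corpus size, not a value from the original code.
    return math.log(float(num_docs) / (docfreqs.get(term, 0) + 1))

docfreqs = load_bnc_word_freqs()
weight = idf("aardvark", docfreqs)  # rare term -> large weight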
Example #3
def load_ranges():
	# open for reading: mode "w" here would truncate the pickle before it could be read
	return pickle.load(settings.localfile("data/svm_range.range", "r"))
Example #4
def save_ranges(ranges):
	# pickle the ranges to disk; load_ranges() reads this file back
	outfile = settings.localfile("data/svm_range.range", "w")
	pickle.dump(ranges, outfile)
	outfile.close()
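A quick round trip of the two helpers above, assuming settings.localfile opens a file relative to the project data directory the way the built-in open() does, and using a toy range table:

toy_ranges = {"feature_1": (0.0, 1.0)}  # illustrative data, not from the original
save_ranges(toy_ranges)                 # write the pickle
assert load_ranges() == toy_ranges      # read it back unchanged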
Example #5
from urlcheck.models import FirstWords
import urllib2
import settings
from nlptools.html_to_text import html_to_text
import re


def tokenize(claim):
    return re.split(r"(\W+)", claim)


def sentences(text):
    return text.split(".")


firstwords_keys = set(
    settings.localfile("data/firstwords_keys.list").read().split("|"))

#TODO: replace caches with limited-size caches that throw away data when they get too big

firstwords_cache = {}


def get_firstwords(first):
    if first not in firstwords_cache:
        try:
            firstwords_cache[first] = FirstWords.objects.get(
                firstword=first).secondwords_set()
        except FirstWords.DoesNotExist:
            firstwords_cache[first] = set()
    return firstwords_cache[first]
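The TODO above asks for caches that throw data away once they grow too big. One minimal sketch, assuming Python 3 (functools.lru_cache; the urllib2 import above implies Python 2, where an OrderedDict-based cache would be needed instead), with an arbitrary illustrative maxsize:

from functools import lru_cache

@lru_cache(maxsize=100000)  # least-recently-used entries are evicted past 100k keys
def get_firstwords_bounded(first):
    # Same lookup as get_firstwords, but the decorator bounds the cache.
    try:
        return FirstWords.objects.get(firstword=first).secondwords_set()
    except FirstWords.DoesNotExist:
        return set()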
Example #6
def load_ranges():
    # open for reading: mode "w" here would truncate the pickle before it could be read
    return pickle.load(settings.localfile("data/svm_range.range", "r"))
Example #7
def save_ranges(ranges):
    # pickle the ranges to disk; load_ranges() reads this file back
    outfile = settings.localfile("data/svm_range.range", "w")
    pickle.dump(ranges, outfile)
    outfile.close()
Example #8
from urlcheck.models import FirstWords, WordPair, WordTriple
import urllib2
import settings
from nlptools.html_to_text import html_to_text
import re

def tokenize(claim): return re.split(r"(\W+)", claim)
def sentences(text): return text.split(".")


firstwords_keys = set(settings.localfile("data/firstwords_keys.list").read().split("|"))

#TODO: replace caches with limited-size caches that throw away data when they get too big

firstwords_cache = {}
def get_firstwords(first):
	if first not in firstwords_cache:
		try: firstwords_cache[first] = FirstWords.objects.get(firstword=first).secondwords_set()
		except FirstWords.DoesNotExist: firstwords_cache[first] = set()
	return firstwords_cache[first]
		
wordpair_cache = {}
def get_wordpair(pair):
	if pair not in wordpair_cache:
		try:
			pairobj = WordPair.objects.get(pair=pair)
			wordpair_cache[pair] = (pairobj.triples_set(), pairobj.claims_list())
		except WordPair.DoesNotExist:
			# assumed fallback, mirroring get_firstwords: empty results on a miss
			wordpair_cache[pair] = (set(), [])
	return wordpair_cache[pair]
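get_firstwords and get_wordpair repeat the same cache-then-query shape. A sketch of one way to share it; the helper name, the lambda, and the empty-default convention are illustrative, not part of the original:

from django.core.exceptions import ObjectDoesNotExist

def cached_lookup(cache, key, fetch, default):
    # Generic memoized lookup: run the query once per key and fall back
    # to a default value when no row matches.
    if key not in cache:
        try:
            cache[key] = fetch(key)
        except ObjectDoesNotExist:
            cache[key] = default
    return cache[key]

def get_firstwords(first):
    return cached_lookup(firstwords_cache, first,
        lambda w: FirstWords.objects.get(firstword=w).secondwords_set(), set())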