Пример #1
	def runRule(self, buf, index, r):
		curindex = index
		sm = StringMatcher()
		dm = DateMatcher()
		tm = TimeMatcher()
		ri = RuleInstance(r)
		if r.getType() == 'stringmatch':
			p = sm.seek_until_keys(buf[curindex:], r.getKeys(), r.getExpectedConfidence())
			if p[0] != -1: # Found something
				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+(p[0]+len(p[1]))])
				curindex += (p[0]+len(p[1]))
		elif r.getType() == 'datematch':
			p = dm.locate_date(buf[curindex:], r.getExpectedConfidence())
			if p[0] != -1:
				condate = ""
				for v in p[1]:
					condate += v[0]+" "

				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(condate)])
				curindex += p[0]
		elif r.getType() == 'timematch':
			p = tm.locate_time(buf[curindex:], r.getExpectedConfidence())
			if p[0] != -1:
				contime = ""
				for v in p[1]:
					contime += v[0]+" "
				print "Time offset: ",curindex," and ",p[0]," = ", curindex+p[0]+len(contime)
				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(contime)])
				curindex += p[0]

		return (curindex, ri)
def LevDistCorrect(token, suggestions):
    """Return the suggestion closest to ``token``.

    Closeness is measured with ``StringMatcher.distance`` (presumably the
    Levenshtein edit distance -- confirm against the imported module).
    When several suggestions tie for the smallest distance, the earliest
    one in ``suggestions`` wins.

    Args:
        token: the word to correct.
        suggestions: non-empty sequence of candidate replacements.

    Returns:
        The element of ``suggestions`` with the smallest distance to ``token``.

    Raises:
        IndexError: if ``suggestions`` is empty.
    """
    corrected_token = suggestions[0]
    best_dist = StringMatcher.distance(token, corrected_token)

    for word in suggestions[1:]:
        dist = StringMatcher.distance(token, word)

        if dist < best_dist:
            # Track the new minimum as well as the word. Without updating
            # the minimum, every later word is compared only against the
            # first suggestion and the last such word wins.
            best_dist = dist
            corrected_token = word

    return corrected_token
Пример #3
    def runRule(self, buf, index, r):
        curindex = index
        sm = StringMatcher()
        dm = DateMatcher()
        tm = TimeMatcher()
        ri = RuleInstance(r)
        if r.getType() == 'stringmatch':
            p = sm.seek_until_keys(buf[curindex:], r.getKeys(),
            if p[0] != -1:  # Found something
                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + (p[0] + len(p[1]))
                curindex += (p[0] + len(p[1]))
        elif r.getType() == 'datematch':
            p = dm.locate_date(buf[curindex:], r.getExpectedConfidence())
            if p[0] != -1:
                condate = ""
                for v in p[1]:
                    condate += v[0] + " "

                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + p[0] + len(condate)
                curindex += p[0]
        elif r.getType() == 'timematch':
            p = tm.locate_time(buf[curindex:], r.getExpectedConfidence())
            if p[0] != -1:
                contime = ""
                for v in p[1]:
                    contime += v[0] + " "
                print "Time offset: ", curindex, " and ", p[
                    0], " = ", curindex + p[0] + len(contime)
                ri.addMatchedKeys(r.getType(), [
                    p[1], p[2], curindex + p[0], curindex + p[0] + len(contime)
                curindex += p[0]

        return (curindex, ri)
Пример #4
def getDifflibOrPyLev(seq2=None, junk=None, forceDifflib=False):
    """Return a sequence matcher preloaded with ``seq2`` as second sequence.

    Returns either a difflib.SequenceMatcher or a pyLevenshtein
    StringMatcher.StringMatcher object depending on what is installed.
    If forceDifflib is True then difflib is used even if pyLevenshtein is
    installed.

    Args:
        seq2: the second sequence handed to the matcher; the first sequence
            is initialised to the empty string.
        junk: passed through as the matcher's first positional argument
            (the ``isjunk`` callable for difflib).
        forceDifflib: must be exactly ``True`` to skip pyLevenshtein.

    Returns:
        The matcher object.
    """
    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            # Optional dependency: fall back to difflib when it is missing.
            import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)

    return smObject
Пример #5
nodeLists = [dom.xpath('//exercises//problem-set/entry | //exercises//multi-part/entry') for dom in doms]
assert len(nodeLists[0]) == len(nodeLists[1])

for tagName in ['solution','correct']:
    for nodeIndex in range(len(nodeLists[0])):
        entries = [nodeList[nodeIndex] for nodeList in nodeLists]
        solutions = [entry.find(tagName) for entry in entries]
        solutionStrings = []
        for solution in solutions:
            if solution is None:
                solutionStrings.append(strip_namespaces(etree.tostring(solution, with_tail=False)))
        if solutionStrings[0] != solutionStrings[1]:
            blocks = StringMatcher.matching_blocks(StringMatcher.editops(solutionStrings[0], solutionStrings[1]), solutionStrings[0], solutionStrings[1])
            if sum([block[2] for block in blocks])/max(len(solutionStrings[0]), len(solutionStrings[1])) < 0.1:
                blocks = []
            for i, col in [(0,'old'), (1,'new')]:
                pos = 0
                output = ''
                for block in blocks:
                    if block[i] > pos:
                        output += termColors[col] + solutionStrings[i][pos:block[i]] + termColors['reset']
                    output += solutionStrings[i][block[i]:block[i]+block[2]]
                    pos = block[i]+block[2]
                if pos < len(solutionStrings[i]):
                    output += termColors[col] + solutionStrings[i][pos:] + termColors['reset']
                print '===', col.upper(), '===================================================='
                print output
            print '============================================================'
Пример #6
from RuleSet import *
from Rule import *
from RuleInstance import *
from SpellChecker import *
from DateParser import *
from Record import *
from XLSProcessor import *
from HOCRParser import *
from kitchen.text.converters import getwriter
from time import strftime
# Replace stdout with a writer that encodes everything written to it as UTF-8.
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

# Module-level helpers shared by the code below.
# NOTE(review): 'out.xls' is presumably the output spreadsheet -- confirm
# against XLSProcessor.
xls = XLSProcessor('out.xls')
s = StringMatcher()

#  Could be something like this
#                             programtype
#                           >0.9 /    \  <0.9
#                            date    fail
#                      >0.7 /   \  <0.7
#                        time   fail
#                      / | |  \
#                             fail

Пример #7
     #print("test ",i)
     naiveMatch(stri, substr)
 end3 = time.time()
 print("Time elapsed ", end3 - start3)
 # Benchmark 1: time RK.KarpRabin over successive 100000-character windows
 # of `string`, each searched for a 10-character slice of `substring`.
 # NOTE(review): range(1, n) runs n - 1 iterations, so the last full window
 # is never searched -- confirm this is intended.
 start1 = time.time()
 for i in range(1, int(len(string) / 100000)):
     j = 100000 * (i - 1)
     stri = string[j:j + 100000]
     k = 10 * (i - 1)
     substr = substring[k:k + 10]
     #print("test ",i)
     # The result is overwritten on every iteration; only the elapsed time
     # is used. (The variable is named `kmp` even for this search.)
     kmp = RK.KarpRabin(substr, stri)
 end1 = time.time()
 print("Time elapsed ", end1 - start1)
 # Benchmark 2: same windows and patterns, searched with KMP.KMP().search.
 # Note the argument order (text, pattern) is the reverse of RK.KarpRabin.
 start2 = time.time()
 for i in range(1, int(len(string) / 100000)):
     j = 100000 * (i - 1)
     stri = string[j:j + 100000]
     k = 10 * (i - 1)
     substr = substring[k:k + 10]
     #print("test ",i)
     kmp = KMP.KMP().search(stri, substr)
 end2 = time.time()
Пример #8
    def __init__(self, lang, filePath, hotFilePath, muti_name_file, att_file,
                 batch_size, cache_size, maxSampCount, shuffle, word_vocab,
                 kb_vocab, kbp_type_vocab, kb_type_vocab):
        """Store the configuration, create empty caches and load the IDF table.

        ``lang`` must be 'ENG', 'CMN' or 'SPA'. The remaining arguments are
        stored unchanged on the instance (file paths, batching options and
        the vocabularies). A ``StringMatcher`` and a ``DocMatcher`` are
        created, and the ``DocMatcher`` loads the IDF file that belongs to
        the chosen language.
        """
        self.lang = lang
        assert self.lang in ('ENG', 'CMN', 'SPA')

        # Per-language constants, keyed by language code.
        # Disabled alternative values: avg 0.0 / max 1.0 for ENG and CMN
        # (SPA unchanged).
        # NOTE(review): presumably average / maximum document distances --
        # confirm where they are consumed.
        self.doc_avg_dis = {'ENG': 0.0445, 'CMN': 0, 'SPA': 0.1397}
        self.doc_max_dis = {'ENG': 0.2, 'CMN': 0.8, 'SPA': 0.3}

        # Input locations and iteration options.
        self.filePath = filePath
        self.hotFilePath = hotFilePath
        self.mutiNamePath = muti_name_file
        self.att_file = att_file
        self.shuffle = shuffle

        # Caches, all empty at construction time.
        self.stopWord = {}
        self.wikiContext = {}
        self.docContext = {}
        self.wikiContextIDs = {}
        self.docContextIDs = {}
        self.wordsINDoc = {}
        self.wordsINWiki = {}
        self.hotValue = {}
        self.samples = []
        self.candAttWord = {}
        self.mutiName = {}

        # Vocabularies supplied by the caller.
        self.word_vocab = word_vocab
        self.kb_vocab = kb_vocab
        self.kbp_type_vocab = kbp_type_vocab
        self.kb_type_vocab = kb_type_vocab

        # Batching / reading state.
        self.batch_size = batch_size
        self.cache_size = cache_size
        self.FileEnd = 0
        # NOTE(review): `file` is not a parameter; under Python 2 this binds
        # the builtin `file` type -- confirm this is intended.
        self.file = file
        self.samples_count = 0
        self.stringMatcher = StringMatcher.StringMatcher()
        self.docMatcher = DocMatcher.DocMatcher()

        self.maxSampCount = maxSampCount
        self.group = None

        # English and Spanish share one IDF file; Chinese has its own.
        if self.lang in ('ENG', 'SPA'):
            self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
        elif self.lang == 'CMN':
            self.docMatcher._loadIDF('../data/IDF.txt', self.word_vocab)


Пример #9
    def statistic_similarity(self, paper, min_similarity):
        """Find ontology topics that are similar to n-grams of the paper.

        The paper text is tokenised and scanned as trigrams, then bigrams,
        then unigrams. Each n-gram is compared, via
        ``ls.StringMatcher(None, topic, gram).ratio()``, with every topic in
        ``self.cso['topics']`` that starts with the n-gram's first four
        characters. An n-gram is skipped when a longer n-gram starting at
        the same or the previous position has already matched.

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            min_similarity (number): minimum similarity ratio between the
                n-gram and a topic for the topic to be reported.

        Returns:
            found_topics (dictionary): maps each found topic (its primary
            label) to a list of ``{'matched': n-gram, 'similarity': ratio}``
            entries.
        """
        found_topics = {}
        # Tokenise once; all three passes read the same token list.
        tokens = word_tokenize(paper)

        def match_ngrams(size, skip_positions):
            """Match all n-grams of ``size`` tokens; return the 1-based
            positions of those that matched at least one topic."""
            matched_positions = set()
            for idx, grams in enumerate(ngrams(tokens, size), 1):
                if idx in skip_positions or (idx - 1) in skip_positions:
                    continue
                gram = " ".join(grams)
                prefix = gram[:4]
                # Cheap pre-filter before the costlier similarity ratio.
                topics = [
                    key for key in self.cso['topics']
                    if key.startswith(prefix)
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        found_topics.setdefault(topic, []).append({
                            'matched': gram,
                            'similarity': m
                        })
                        matched_positions.add(idx)
            return matched_positions

        matched_trigrams = match_ngrams(3, set())
        matched_bigrams = match_ngrams(2, matched_trigrams)
        # NOTE(review): a trigram starting at idx - 2 also covers the unigram
        # at idx, but only idx and idx - 1 are checked -- confirm whether
        # that overlap should be skipped as well.
        match_ngrams(1, matched_trigrams | matched_bigrams)

        return found_topics
Пример #10
def get_seq(i, source=None):
    """Return record number ``i`` (1-based) as a ``(header, data)`` tuple.

    Records are read from ``source`` or, when it is omitted, from the
    module-level ``lines``. A record is a line starting with ">" (the
    header) followed by its data line(s); only the last data line of a
    record is kept.

    Args:
        i: 1-based record number. ``0`` yields ``("", "")`` as soon as the
            first header is seen.
        source: optional iterable of text lines used instead of ``lines``.

    Returns:
        ``(header, data)`` with the header's last character (the newline)
        removed and the data line stripped of its first 10 and last 11
        characters, or ``None`` when there is no such record.
    """
    records = lines if source is None else source
    header = ""
    data = ""
    reached = False
    for line in records:
        if line.startswith(">"):
            if i == 0:
                return (header, data)
            i = i - 1
            header = line[:-1]
            data = ""  # do not let the header text leak into the data
            reached = True
        else:
            data = line[10:-11]  # remove MIDs and \n
    # The last record has no following header to trigger the return above.
    if i == 0 and reached:
        return (header, data)
    return None

# Number of sequences to process; get_seq_count is defined elsewhere.
seq_count = get_seq_count()
# Matcher instance shared by the code below.
sm = StringMatcher()

def index(a, x):
    """Report whether ``x`` is present in the sorted sequence ``a``.

    Uses a binary search, so ``a`` must already be sorted in ascending order.
    """
    pos = bisect.bisect_left(a, x)
    # bisect_left gives the insertion point; x is present only if that
    # slot exists and already holds an equal element.
    return bool(pos < len(a) and a[pos] == x)

# Create the "<first argument>_result" output directory if it does not exist.
# os.mkdir is used instead of a shell "mkdir" so that spaces or shell
# metacharacters in the argument cannot be interpreted by the shell, and so
# that a failure to create the directory raises instead of going unnoticed.
if not os.path.isdir(sys.argv[1] + "_result"):
    os.mkdir(sys.argv[1] + "_result")

for i in range(seq_count):
    if index(matched, i + 1):
    header, data = get_seq(i + 1)