Example #1
	# Apply a single rule r to buf starting at index; returns a tuple
	# (new scan index, RuleInstance) describing the match, if any.
	def runRule(self, buf, index, r):
		curindex = index
		sm = StringMatcher()
		dm = DateMatcher()
		tm = TimeMatcher()
		ri = RuleInstance(r)
		if r.getType() == 'stringmatch':
			p = sm.seek_until_keys(buf[curindex:], r.getKeys(), r.getExpectedConfidence())
			if p[0] != -1: # Found something
				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+(p[0]+len(p[1]))])
				curindex += (p[0]+len(p[1]))
		elif r.getType() == 'datematch':
			p = dm.locate_date(buf[curindex:], r.getExpectedConfidence())
			if p[0] != -1:
				condate = ""
				for v in p[1]:
					condate += v[0]+" "

				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(condate)])
				curindex += p[0]
		elif r.getType() == 'timematch':
			p = tm.locate_time(buf[curindex:], r.getExpectedConfidence())
			if p[0] != -1:
				contime = ""
				for v in p[1]:
					contime += v[0]+" "
				print "Time offset: ",curindex," and ",p[0]," = ", curindex+p[0]+len(contime)
				ri.addMatchedKeys(r.getType(), [p[1], p[2], curindex+p[0], curindex+p[0]+len(contime)])
				curindex += p[0]

		return (curindex, ri)
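Below is a minimal, hypothetical driver for runRule (the runRules name and the engine argument are not part of the original code): it applies each rule in order and lets every successful match advance the scan position inside the buffer.

def runRules(engine, buf, rules):
    # 'engine' is any object exposing runRule() as defined above.
    index = 0
    instances = []
    for r in rules:
        index, ri = engine.runRule(buf, index, r)  # (new scan index, RuleInstance)
        instances.append(ri)
    return instances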
def LevDistCorrect(token, suggestions):
    # Start with the first suggestion as the current best candidate.
    temp = StringMatcher.distance(token, suggestions[0])
    corrected_token = suggestions[0]

    for word in suggestions[1:]:
        dist = StringMatcher.distance(token, word)

        if dist < temp:
            temp = dist  # remember the new smallest distance, not just the word
            corrected_token = word

    return corrected_token
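A quick usage sketch with made-up inputs, assuming StringMatcher.distance returns an ordinary Levenshtein edit distance:

best = LevDistCorrect("kitten", ["sitting", "mitten", "kitchen"])
# best == "mitten"  (distance 1, versus 3 for "sitting" and 2 for "kitchen")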
Example #4
import difflib


def getDifflibOrPyLev(seq2=None, junk=None, forceDifflib=False):
    '''
    Returns either a difflib.SequenceMatcher or a pyLevenshtein
    StringMatcher.StringMatcher object, depending on what is installed.

    If forceDifflib is True, difflib is used even if pyLevenshtein is installed.
    '''

    if forceDifflib is True:
        smObject = difflib.SequenceMatcher(junk, '', seq2)
    else:
        try:
            import StringMatcher as pyLevenshtein
            smObject = pyLevenshtein.StringMatcher(junk, '', seq2)
        except ImportError:
            smObject = difflib.SequenceMatcher(junk, '', seq2)

    return smObject
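A minimal usage sketch (made-up strings), relying only on the SequenceMatcher-style interface that both backends share:

sm = getDifflibOrPyLev(seq2='colour')
sm.set_seq1('color')
similarity = sm.ratio()  # roughly 0.91 with either backend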
Example #5
nodeLists = [dom.xpath('//exercises//problem-set/entry | //exercises//multi-part/entry') for dom in doms]
assert len(nodeLists[0]) == len(nodeLists[1])

for tagName in ['solution','correct']:
    for nodeIndex in range(len(nodeLists[0])):
        entries = [nodeList[nodeIndex] for nodeList in nodeLists]
        solutions = [entry.find(tagName) for entry in entries]
        solutionStrings = []
        for solution in solutions:
            if solution is None:
                solutionStrings.append('')
            else:
                solutionStrings.append(strip_namespaces(etree.tostring(solution, with_tail=False)))
        if solutionStrings[0] != solutionStrings[1]:
            blocks = StringMatcher.matching_blocks(StringMatcher.editops(solutionStrings[0], solutionStrings[1]), solutionStrings[0], solutionStrings[1])
            # use float division so the matched fraction is not truncated to 0 under Python 2
            if float(sum([block[2] for block in blocks])) / max(len(solutionStrings[0]), len(solutionStrings[1])) < 0.1:
                blocks = []
            for i, col in [(0,'old'), (1,'new')]:
                pos = 0
                output = ''
                for block in blocks:
                    if block[i] > pos:
                        output += termColors[col] + solutionStrings[i][pos:block[i]] + termColors['reset']
                    output += solutionStrings[i][block[i]:block[i]+block[2]]
                    pos = block[i]+block[2]
                if pos < len(solutionStrings[i]):
                    output += termColors[col] + solutionStrings[i][pos:] + termColors['reset']
                print '===', col.upper(), '===================================================='
                print output
            print '============================================================'
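This snippet assumes a termColors dictionary defined elsewhere in the script; a minimal sketch of what it could look like (hypothetical ANSI escape codes):

termColors = {
    'old':   '\033[31m',  # red for text present only in the old version
    'new':   '\033[32m',  # green for text present only in the new version
    'reset': '\033[0m',   # switch back to the default terminal colour
}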
Example #6
import sys

from RuleSet import *
from Rule import *
from RuleInstance import *
from SpellChecker import *
from DateParser import *
from Record import *
from XLSProcessor import *
from HOCRParser import *
from kitchen.text.converters import getwriter
from time import strftime
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

xls = XLSProcessor('out.xls')
xls.save()
s = StringMatcher()

#  Could be something like this
#
#                             programtype
#                           >0.9 /    \  <0.9
#                            date    fail
#                      >0.7 /   \  <0.7
#                        time   fail
#                      / | |  \
#                             fail
#
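A minimal rendering of that cascade in code (the run_cascade helper and its thresholds are hypothetical, not part of the original pipeline): each stage only continues if its match met the expected confidence.

def run_cascade(buf, stages):
    # stages: ordered list of (matcher, threshold); each matcher(buf, index)
    # is assumed to return (confidence, new scan index).
    index = 0
    for matcher, threshold in stages:
        confidence, index = matcher(buf, index)
        if confidence < threshold:
            return 'fail'
    return 'ok'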



Example #7
     #print("test ",i)
     #print(substr)
     naiveMatch(stri, substr)
 end3 = time.time()
 print("###################################")
 print("Time elapsed ", end3 - start3)
 print("###################################")
 start1 = time.time()
 for i in range(1, int(len(string) / 100000)):
     j = 100000 * (i - 1)
     stri = string[j:j + 100000]
     k = 10 * (i - 1)
     substr = substring[k:k + 10]
     #print("test ",i)
     #print(substr)
     kmp = RK.KarpRabin(substr, stri)
 end1 = time.time()
 print("###################################")
 print("Time elapsed ", end1 - start1)
 print("###################################")
 start2 = time.time()
 for i in range(1, int(len(string) / 100000)):
     j = 100000 * (i - 1)
     stri = string[j:j + 100000]
     k = 10 * (i - 1)
     substr = substring[k:k + 10]
     #print("test ",i)
     #print(substr)
     kmp = KMP.KMP().search(stri, substr)
 end2 = time.time()
 print("###################################")
Example #8
    def __init__(self, lang, filePath, hotFilePath, muti_name_file, att_file,
                 batch_size, cache_size, maxSampCount, shuffle, word_vocab,
                 kb_vocab, kbp_type_vocab, kb_type_vocab):
        self.lang = lang
        assert (self.lang == 'ENG' or self.lang == 'CMN' or self.lang == 'SPA')
        self.doc_avg_dis = {}
        self.doc_max_dis = {}
        #para1
        self.doc_avg_dis['ENG'] = 0.0445
        self.doc_avg_dis['CMN'] = 0
        self.doc_avg_dis['SPA'] = 0.1397
        self.doc_max_dis['ENG'] = 0.2
        self.doc_max_dis['CMN'] = 0.8
        self.doc_max_dis['SPA'] = 0.3

        #para2
        #self.doc_avg_dis['ENG'] = 0.0
        #self.doc_avg_dis['CMN'] = 0.0
        #self.doc_avg_dis['SPA'] = 0.1397
        #self.doc_max_dis['ENG'] = 1.0
        #self.doc_max_dis['CMN'] = 1.0
        #self.doc_max_dis['SPA'] = 0.3

        self.filePath = filePath
        self.hotFilePath = hotFilePath
        self.mutiNamePath = muti_name_file
        self.att_file = att_file
        self.shuffle = shuffle
        self.stopWord = {}
        self.wikiContext = {}
        self.docContext = {}
        self.wikiContextIDs = {}
        self.docContextIDs = {}
        self.wordsINDoc = {}
        self.wordsINWiki = {}
        self.hotValue = {}
        self.samples = []
        self.candAttWord = {}
        self.mutiName = {}

        self.word_vocab = word_vocab
        self.kb_vocab = kb_vocab
        self.kbp_type_vocab = kbp_type_vocab
        self.kb_type_vocab = kb_type_vocab

        self.batch_size = batch_size
        self.cache_size = cache_size
        self.FileEnd = 0
        self.file = file
        self.samples_count = 0
        self.stringMatcher = StringMatcher.StringMatcher()
        self.docMatcher = DocMatcher.DocMatcher()

        self.maxSampCount = maxSampCount
        self.group = None

        if (self.lang == 'ENG'):
            self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
            self.__loadStopWord('../file/StopWord.txt')
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)
        elif (self.lang == 'CMN'):
            self.docMatcher._loadIDF('../data/IDF.txt', self.word_vocab)
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)
        elif (self.lang == 'SPA'):
            self.docMatcher._loadIDF('../file/idf.txt', self.word_vocab)
            self.__readDocWikiContext(self.filePath)
            self.__loadHotFile(self.hotFilePath)
            self.__loadMutiName(self.mutiNamePath)

        self.reset()
Example #9
    def statistic_similarity(self, paper, min_similarity):
        """Splits the paper text into n-grams (unigrams, bigrams, trigrams)
        and uses Levenshtein similarity to check each of them against the
        topics in the ontology (self.cso, loaded beforehand from file).

        Args:
            paper (string): The paper to analyse. At this stage it is a string.
            min_similarity (float): minimum Levenshtein similarity between the n-gram and the topics within the CSO.

        Returns:
            found_topics (dictionary): the found topics, each with its similarity and the n-gram that matched it.
        """

        # analysing grams
        found_topics = {}

        idx = 0
        trigrams = ngrams(word_tokenize(paper), 3)
        matched_trigrams = []
        for grams in trigrams:
            idx += 1
            gram = " ".join(grams)
            topics = [
                key for key, _ in self.cso['topics'].items()
                if key.startswith(gram[:4])
            ]
            for topic in topics:
                m = ls.StringMatcher(None, topic, gram).ratio()
                if m >= min_similarity:
                    topic = self.get_primary_label(topic,
                                                   self.cso['primary_labels'])
                    if topic in found_topics:
                        found_topics[topic].append({
                            'matched': gram,
                            'similarity': m
                        })
                    else:
                        found_topics[topic] = [{
                            'matched': gram,
                            'similarity': m
                        }]
                    matched_trigrams.append(idx)

        idx = 0
        bigrams = ngrams(word_tokenize(paper), 2)
        matched_bigrams = []
        for grams in bigrams:
            idx += 1
            if (idx not in matched_trigrams) and ((idx - 1)
                                                  not in matched_trigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]
                        matched_bigrams.append(idx)

        idx = 0
        unigrams = ngrams(word_tokenize(paper), 1)
        for grams in unigrams:
            idx += 1
            if (idx not in matched_trigrams) and (
                (idx - 1) not in matched_trigrams) and (
                    idx not in matched_bigrams) and (
                        (idx - 1) not in matched_bigrams):
                gram = " ".join(grams)
                topics = [
                    key for key, _ in self.cso['topics'].items()
                    if key.startswith(gram[:4])
                ]
                for topic in topics:
                    m = ls.StringMatcher(None, topic, gram).ratio()
                    if m >= min_similarity:
                        topic = self.get_primary_label(
                            topic, self.cso['primary_labels'])
                        if topic in found_topics:
                            found_topics[topic].append({
                                'matched': gram,
                                'similarity': m
                            })
                        else:
                            found_topics[topic] = [{
                                'matched': gram,
                                'similarity': m
                            }]

        return found_topics
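For clarity, the returned dictionary maps each (primary-label) topic to the list of n-grams that matched it; with made-up values it might look like this:

example_result = {
    'machine learning': [{'matched': 'machine learning', 'similarity': 1.0}],
    'neural networks': [{'matched': 'neural network', 'similarity': 0.96}],
}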
Example #10
def get_seq(i):
    global lines
    data = ""
    header = ""
    for line in lines:
        if line.startswith(">"):
            if i == 0:
                return (header, data)
            i = i - 1
            header = line[:-1]
        else:
            data = line[10:-11]  # remove MIDs and \n
    # Also return the final record: the loop above only returns when it
    # reaches the *next* header line, so the last sequence would otherwise be lost.
    return (header, data)


seq_count = get_seq_count()
sm = StringMatcher()


def index(a, x):
    i = bisect.bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return True
    return False


os.system("mkdir " + sys.argv[1] + "_result")

for i in range(seq_count):
    if index(matched, i + 1):
        continue
    header, data = get_seq(i + 1)