Example No. 1
def PrepareJSandJM(nodes):
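    # Insert an empty sentence-start (JS) node at the head and propagate JS2
    # forward past any leading SYM tokens; if the sentence does not already end
    # with punctuation, append an empty punctuation node, then apply JM/JM2 to
    # the tail and propagate JM2 backwards past any trailing SYM tokens.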
    nodes.head.ApplyFeature(utils.FeatureID_JS2)
    JSnode = Tokenization.SentenceNode('')
    JSnode.ApplyFeature(utils.FeatureID_JS)
    JSnode.ApplyFeature(utils.FeatureID_JS2)
    nodes.insert(JSnode, 0)
    p = nodes.head.next
    while p.next:
        if utils.FeatureID_SYM not in p.features:
            p.ApplyFeature(utils.FeatureID_JS2)
            break
        p.ApplyFeature(utils.FeatureID_JS2)
        p = p.next

    PUNCSet = {".", "?", "!", ";", "...", ":", "。"}
    if utils.FeatureID_SYM not in nodes.tail.features and \
            nodes.tail.text not in PUNCSet:
        JMnode = Tokenization.SentenceNode('')
        JMnode.StartOffset = nodes.tail.EndOffset
        JMnode.EndOffset = nodes.tail.EndOffset
        JMnode.ApplyFeature(utils.FeatureID_punc)
        nodes.append(JMnode)
    nodes.tail.ApplyFeature(utils.FeatureID_JM)
    nodes.tail.ApplyFeature(utils.FeatureID_JM2)
    p = nodes.tail.prev
    while p.prev:
        if utils.FeatureID_SYM not in p.features:
            # the first node that is not punctuation is the real JM2:
            p.ApplyFeature(utils.FeatureID_JM2)
            break
        p.ApplyFeature(utils.FeatureID_JM2)
        p = p.prev
Example No. 2
    def test_LogicOr(self):
        """Logic Or"""
        node = Tokenization.SentenceNode('being')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "being|getting", [Rules.RuleToken()], 0))
Example No. 3
    def test_LogicNotOr(self):
        """Logic And/Or"""
        node =  Tokenization.SentenceNode('d')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        RuleTokenList = [Rules.RuleToken()]

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))
        node.text = "f"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))
        node.text = "e"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))
        node.text = "f"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))
        node.text = "c"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
        node.text = "d"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
        node.text = "e"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|e !d|f|g|e", RuleTokenList, 0))
        node.text = "e"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|c", RuleTokenList, 0))
        node.text = "f"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
Example No. 4
    def test_simple(self):
        """exact match"""
        node =  Tokenization.SentenceNode('')
        node.features.add(FeatureOntology.GetFeatureID('NN'))
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN", None, 0))
Example No. 5
    def test_LogicAnd(self):
        """Logic And"""
        node =  Tokenization.SentenceNode("c")
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        ruletokenlist = [Rules.RuleToken()]

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c d", ruletokenlist, 0))
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c c", ruletokenlist, 0))
Example No. 6
    def test_LogicAndOr(self):
        """Logic And/Or"""
        node =  Tokenization.SentenceNode('d')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        ruletokenlist = [Rules.RuleToken()]
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))
        node.text = "c"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))
Example No. 7
    def test_And(self):
        node =  Tokenization.SentenceNode("abc")
        node.features.add(FeatureOntology.GetFeatureID('NN'))
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))

        node.features.add(FeatureOntology.GetFeatureID('percent'))
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))
Example No. 8
def searchQueryUnion(query,dictionary):
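    #Tokenize and normalize the query, collect the postings of every token
    #found in the dictionary, and return the sorted union of their document
    #IDs (an empty list if no token matches).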
    result=[]
    IDresult=[]
    postings=[]
    DocID=[]
    #Tokenise the query for later processing
    tokens=Tokenization.tokenization(query)
    #Normalize the tokens for later processing
    for index in range(len(tokens)):
        tokens[index] = Normalization.cleanStopWords150(tokens[index])
        tokens[index] = Normalization.cleanedTokens(tokens[index])
        tokens[index] = Normalization.caseFoldedTokens(tokens[index])
        tokens[index] = Normalization.stemmedTokens(tokens[index])
        if tokens[index] in dictionary:
            DocID=DocID+dictionary.get(tokens[index])
    if DocID!=[]:
        #change docID from string to int
        for index in range(len(DocID)):
            DocID[index]=DocID[index].split(',')
            DocID[index]=DocID[index][0].split()
            DocID[index]=map(int, DocID[index])
        #Take the union of the DocIDs for the different tokens in the query
        DocID=[set(id) for id in DocID]
        #print DocID
        finalID=sorted(set.union(*DocID))
        return finalID
    else:
        finalID=DocID
        return finalID
Example No. 9
def getInvertedIndexTokens(fileNum):
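    #Build an inverted index {token: [newid, ...]} from the <BODY> text of
    #every <REUTERS> element in one reuters/reut2-*.sgm file, applying the
    #same cleaning, case-folding, stopword-removal and stemming pipeline.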
    invertedIndex={}
    #loading the original documents for tokenization and normalization later
    fileNum= '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    allReuters=soup.find_all('reuters')
    for reuters in allReuters:
        if reuters.body is not None:
            #Tokenize the text inside the <BODY> tag in the file
            tokenslist=reuters.body.text.rsplit('reuters',1)
            for e in tokenslist:
                tokens=''.join(e).encode('utf8')
                tokens=Tokenization.tokenization(tokens)
                for token in tokens:
                    #Normalize after getting the tokens
                    token = Normalization.cleanedTokens(token)
                    token = Normalization.caseFoldedTokens(token)
                    token = Normalization.cleanStopWords150(token)
                    token = Normalization.stemmedTokens(token)
                    #Construct the inverted index for tokens
                    if token != '':
                        if token in invertedIndex:
                            if reuters['newid'] not in invertedIndex[token]:
                                invertedIndex[token].append(reuters['newid'])
                                
                        else:
                            invertedIndex[token] = [reuters['newid']]
                            
    return invertedIndex
Example No. 10
    def test_ApplyLexicon(self):
        node = Tokenization.SentenceNode('0')
        ApplyLexicon(node)
        CDFeatureID = GetFeatureID('CD')
        self.assertTrue(CDFeatureID in node.features)

        LoadLexicon(dir_path + '/../../../fsa/X/LexX-ChinesePunctuate.txt')

        node = Tokenization.SentenceNode(':')
        ApplyLexicon(node)
        self.assertTrue(utils.FeatureID_SYM in node.features)
        self.assertFalse(utils.FeatureID_OOV in node.features)

        node = Tokenization.SentenceNode(':')
        ApplyLexicon(node)
        self.assertTrue(utils.FeatureID_SYM in node.features)
        self.assertFalse(utils.FeatureID_OOV in node.features)
Example No. 11
    def test_LexiconLookup(self):
        LoadLexicon(dir_path + '/../../../fsa/X/defLexX.txt', lookupSource=LexiconLookupSource.defLex)
        LoadLexicon(dir_path + '/../../../fsa/X/defPlus.txt', lookupSource=LexiconLookupSource.defLex)

        Sentence="喝不惯"
        NodeList = Tokenization.Tokenize(Sentence)
        import ProcessSentence
        ProcessSentence.PrepareJSandJM(NodeList)
        LexiconLookup(NodeList, LexiconLookupSource.defLex)
        self.assertEqual(NodeList.size, 3)

        Sentence="李四"
        NodeList = Tokenization.Tokenize(Sentence)
        #import ProcessSentence
        ProcessSentence.PrepareJSandJM(NodeList)
        LexiconLookup(NodeList, LexiconLookupSource.defLex)
        self.assertEqual(NodeList.size, 3)
        self.assertFalse(utils.FeatureID_OOV in NodeList.head.features)
Example No. 12
def walkOverContribution(contributionPath, ignoreFunc = None, endsWith = '.tokens.json'):
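	# Walk contributionPath and tokenize every file whose name ends with
	# `endsWith`, skipping any path or file for which ignoreFunc (when given)
	# returns True; returns a dict mapping file name to its token list.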
	result = {}
	for (path, dirs, files) in os.walk(contributionPath):
		if ignoreFunc is not None and ignoreFunc(path):
			continue
		for file in files:
			if file.endswith(endsWith):
				if ignoreFunc is not None and ignoreFunc(file):
					continue
				result[file] = Tokenization.tokenizeFile(os.path.join(path,file))
	return result
Example No. 13
def purrify(query):
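    # Tokenize the query, then clean, case-fold, strip stopwords and stem each
    # token; only the non-empty normalized tokens are returned.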
    # purrify the query
    normalizedTokens = []
    tokens = Tokenization.tokenization(query)
    for token in tokens:
        token = Normalization.cleanedTokens(token)
        token = Normalization.caseFoldedTokens(token)
        token = Normalization.cleanStopWords150(token)
        token = Normalization.stemmedTokens(token)
        if token != "":
            normalizedTokens.append(token)
    #print normalizedTokens
    return normalizedTokens
Example No. 14
def main():
    """
	The program must accept two command line arguments: 
	-train.json
	-test.json
	"""
    # first handle user input
    trainJSONData, testJSONData = command_parser()

    # import the text process after checking user input
    import Normalization
    import Tokenization

    # init text processing classes
    global normalization, tokenization
    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()

    print("Pre-processing begin >>>>>>>>")
    # Perform Data pre-processing (text processing and get each document terms)
    Document_vectors, corpus, number_of_document, corpus_count = pre_processing(
        trainJSONData)
    print("<<<<<<<< Pre-processing done")
    # apply the kNN
    best_accuracy = -1
    best_k = -1
    decrease = 0
    k_parameter_accuracy = []
    # try different values of k until the accuracy
    # decreases twice consecutively, then stop
    for k in range(1, number_of_document):
        print("Apply kNN begin with K=%d  >>>>>>>>" % (k))
        accuracy = apply_kNN_on_test_documents(testJSONData, Document_vectors,
                                               corpus, number_of_document,
                                               corpus_count, k)
        k_parameter_accuracy.append(accuracy)
        print("<<<<<<<< Apply kNN done with K=%d" % (k))
        print("Accuracy: " + str(accuracy) + "  with K=%d" % (k))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
        if k > 1 and accuracy < k_parameter_accuracy[k - 2]:
            decrease += 1
        if decrease == 2:
            # break after two consecutive decreases
            print("Accuracy decreased twice consecutively! Stopping here")
            break
    print("")
    print("Best Accuracy: %f  with parameter K=%d" % (best_accuary, best_k))
Example No. 15
def refineTokens(data, debug = False, force = True):
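	# Re-tokenize every derived *.tokens.json file under data['data'], build a
	# map with createMap, and write it next to the input with outputFileExt;
	# already-processed files are skipped unless force is set, and the raw
	# tokenization is also dumped when debug is set.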
	#find all .tokens.json files
	files = Helper.derivedFiles(Helper.relevantFiles(data['data']), inputFileExt)

	if (not force):
		files = Helper.disregardFiles(files, inputFileExt, outputFileExt)

	for file in files:
		tokenized = Tokenization.tokenizeFile(file)
		if (debug):
			json.dump(tokenized, open(file.replace(inputFileExt, outputDebugFileExt), 'w'))
		map = createMap(tokenized)
		json.dump(map, open(file.replace('.tokens.json', outputFileExt), 'w'))
		Helper.incProgress()
	print ''
Example No. 16
def cleantxt(line, cleanfile):
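    # Write to cleanfile every tokenized word from `line` that is either a
    # multi-character, non-numeric English dictionary word, a known disease
    # name, or a word for which p.singular_noun() returns a singular form
    # (i.e. a plural noun); terminate the output line with a newline.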

    word_list = tk.TokenizeMultiWord(line, Taglist)

    for word in word_list:

        if (not word.isnumeric() and len(word) > 1
                and word in words.words()) or (word in DiseaseName):
            cleanfile.write(word)
            cleanfile.write(" ")
        else:
            if p.singular_noun(word):
                cleanfile.write(word)
                cleanfile.write(" ")

    cleanfile.write("\n")
Example No. 17
def LexicalAnalyzeTask(SubSentence, schema):
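    # Tokenize the sub-sentence, apply the lexicon to every node, add the JS/JM
    # sentence-boundary nodes, and run the dynamic pipeline; returns
    # (NodeList, Dag, WinningRules), or (None, None) when tokenization yields
    # no nodes.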

    NodeList = Tokenization.Tokenize(SubSentence)
    if not NodeList or NodeList.size == 0:
        return None, None

    Lexicon.ApplyLexiconToNodes(NodeList)
    # print("after ApplyLexiconToNodes" + OutputStringTokens_oneliner(NodeList))

    PrepareJSandJM(NodeList)
    #Lexicon.LexiconoQoCLookup(NodeList)

    NodeList, Dag, WinningRules = DynamicPipeline(NodeList, schema)
        # t = Thread(target=Cache.WriteSentenceDB, args=(SubSentence, NodeList))
        # t.start()

    return NodeList, Dag, WinningRules
Example No. 18
def refineTokens(data, debug=False, force=True):
    #find all .tokens.json files
    files = Helper.derivedFiles(Helper.relevantFiles(data['data']),
                                inputFileExt)

    if (not force):
        files = Helper.disregardFiles(files, inputFileExt, outputFileExt)

    for file in files:
        tokenized = Tokenization.tokenizeFile(file)
        if (debug):
            json.dump(
                tokenized,
                open(file.replace(inputFileExt, outputDebugFileExt), 'w'))
        map = createMap(tokenized)
        json.dump(map, open(file.replace('.tokens.json', outputFileExt), 'w'))
        Helper.incProgress()
    print ''
Example No. 19
    def test_LogicCombined(self):
        """Logic Combined"""

        blocks = SeparateOrBlocks("a|b|c")
        self.assertEqual(len(blocks), 3)

        blocks = SeparateOrBlocks("a")
        self.assertEqual(len(blocks), 1)

        blocks = SeparateOrBlocks("'a|b'|c")
        self.assertEqual(len(blocks), 2)


        strtokenlist = Tokenization.Tokenize('d')
        RuleTokenList = [Rules.RuleToken()]

        self.assertTrue(LogicMatch(strtokenlist, 0, 'd', RuleTokenList, 0))

        #strtokenlist = Tokenization.Tokenize("notfeature|'d'|notfeature2")
        self.assertTrue(LogicMatch(strtokenlist, 0, "notfeature|'d'|notfeature2", RuleTokenList, 0))
Example No. 20
def getAllTokens(fileNum): 
    # Create inverted index, loop through all articles in one file   
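    # Returns {token: {docId: term frequency}}; each document's token count is
    # appended to 'invertedIndex/tokensLength' and its raw body text to
    # 'invertedIndex/Content'.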
    invertedIndex = {}
    tokensLength= open('invertedIndex/tokensLength', 'a')
    Content = open('invertedIndex/Content', 'a')
    #loading the original documents for tokenization and normalization later
    fileNum= '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    for doc in soup.find_all('reuters'):
        docId = int(doc['newid'].encode('utf8'))
        tokenCounter = 0
        if doc.body is not None:
            content = doc.body.text
            length=len(content)
            Content.write (str(docId) + ' Start ' + content.encode('utf8') + ' End ')
            tokens = Tokenization.tokenization(content)
            for token in tokens:
                # Normalization
                token = Normalization.cleanedTokens(token)
                token = Normalization.caseFoldedTokens(token)
                token = Normalization.cleanStopWords150(token)
                token = Normalization.stemmedTokens(token)
                if token != '':
                    tokenCounter += 1
                    # Add to the postings list if the word exists
                    if token in invertedIndex:
                        if docId in invertedIndex[token]:
                            tf = invertedIndex[token][docId]
                            invertedIndex[token][docId] = tf +1
                        else:
                            invertedIndex[token][docId] = 1
                    else:
                        invertedIndex[token] = {docId:1}
        tokensLength.write (str(docId) + ':' + str(tokenCounter) +'\n')
    tokensLength.close()
    Content.close()                
    return invertedIndex
Example No. 21
def refineTokens(debug = False, force = True, fragments=False):
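    # Re-tokenize every derived tokens file (fragment files when fragments=True,
    # regular ones otherwise), build a map with createMap, and write it with the
    # matching output extension; already-processed files are skipped unless
    # force is set, and the raw tokenization is dumped when debug is set.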
    #find all .tokens.json or fragments.tokens.json files
    if fragments:
        ending = inputFragFileExt
        outEnding = outputFragFileExt
        files = Helper.derivedFiles(Helper.relevantFiles(), inputFragFileExt)
        if (not force):
            files = Helper.disregardFiles(files, inputFragFileExt, outputFragFileExt)
    else:
        ending = inputFileExt
        outEnding = outputFileExt
        files = Helper.derivedFiles(Helper.relevantFiles(), inputFileExt)
        if (not force):
            files = Helper.disregardFiles(files, inputFileExt, outputFileExt)

    for file in files:
        tokenized = Tokenization.tokenizeFile(file, fragments=fragments)
        if (debug):
            json.dump(tokenized, open(file.replace(ending, outputDebugFileExt), 'w'))
        map = createMap(tokenized, fragments=fragments)
        json.dump(map, open(file.replace(ending, outEnding), 'w'))
        Helper.incProgress()
    print ''
Example No. 22
def ApplyLexicon(node, lex=None, stemming_version="stem"):
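    # Look up node.text in the lexicon and copy the matching entry's
    # norm/atom/features onto the node.  If no entry is found and the word has
    # at least 4 characters, try stem+suffix splits (the order depends on
    # stemming_version) and validate each candidate by running the inflection
    # rule file; if the lexicon still fails, fall back to CD / SYM / CM /
    # NNP+OOV tagging.  Word-length features and FeatureID_0 are always
    # applied at the end.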
    global _SuffixList

    if not C1ID:
        InitLengthSet()

    OOVFeatureSet = {
        utils.FeatureID_JM, utils.FeatureID_JM2, utils.FeatureID_JS,
        utils.FeatureID_JS2
    }
    OOVFeatureSet |= LengthSet

    if not lex:
        lex = SearchLexicon(node.text)
    # if not node.lexicon:    # If lexicon is assigned before, then don't do the search
    #                         #  because the node.word is not as reliable as stem.
    #     node.lexicon = SearchLexicon(node.word)

    #attempt stemming if lexicon fails (O.O)
    word = node.text.lower()
    if lex is None and len(word) >= 4:
        if stemming_version == "stem":
            start = len(word) - 1
            stop = 2
            step = -1
        else:
            start = 3
            stop = len(word)
            step = 1

        for stem_length in range(start, stop, step):
            stem_word = word[:stem_length]

            lex_copy = SearchStem(stem_word)

            suffix = word[stem_length:].lower()

            if lex_copy is not None and suffix in _SuffixList:  # both the stem_word exists and the suffix exists
                lex = LexiconNode(word)
                lex.atom = lex_copy.atom
                lex.norm = lex_copy.norm
                lex.features.update(lex_copy.features)

                # temporarily overwrite the node with lex's values, so that lex is effectively what gets passed to MatchAndApplyRuleFile
                o_norm = node.norm
                o_atom = node.atom
                o_text = node.text

                node.norm = lex.norm
                node.atom = lex.atom
                node.text = suffix
                if utils.FeatureID_NEW in lex.features:
                    node.features = set()
                    node.features.update(lex.features)
                    node.features.remove(utils.FeatureID_NEW)
                else:
                    node.features.update(lex.features)

                orig_feature = len(node.features)

                SingleNodeList = Tokenization.SentenceLinkedList()
                SingleNodeList.append(node)
                ProcessSentence.MatchAndApplyRuleFile(SingleNodeList, _InfFile)

                node = SingleNodeList.head

                # all we want is the updated features
                lex.features = set()
                lex.features.update(node.features)
                new_feature = len(node.features)

                node.norm = o_norm
                node.atom = o_atom
                node.text = o_text
                node.features = set()

                # if the feature count changed, the rule matched and stemming succeeded; otherwise stemming failed
                if orig_feature != new_feature:
                    break
                else:
                    lex = None
                    if stemming_version == "stem":  # a failure on a short suffix may still succeed with a longer one
                        continue
                    else:  # since we start from the longest suffix, a failed match means the shorter ones would fail too
                        break

    if lex is None:
        if utils.IsCD(node.text):
            node.ApplyFeature(utils.FeatureID_CD)
        elif node.text in string.punctuation:
            node.ApplyFeature(utils.FeatureID_SYM)
        elif node.norm == " ":
            node.ApplyFeature(utils.FeatureID_CM)
            # not to apply NNP/OOV to space.
        else:
            node.ApplyFeature(utils.FeatureID_NNP)
            node.ApplyFeature(utils.FeatureID_OOV)
    else:
        node.norm = lex.norm

        #to have correct stem, e.g. carries -> carrie -> carry
        if lex.norm in _StemDict:
            stem_lex = SearchStem(lex.norm)
            if stem_lex.norm:
                node.norm = stem_lex.norm

        node.atom = lex.atom
        if utils.FeatureID_NEW in lex.features:
            node.features = set()
            node.features.update(lex.features)
            node.features.remove(utils.FeatureID_NEW)
        else:
            node.features.update(lex.features)
        # _ApplyWordStem(node, lex) (o.o)
        if len(node.features) == 0 or \
                len(node.features - OOVFeatureSet) == 0:
            node.ApplyFeature(utils.FeatureID_OOV)
            # node.features.add(utils.FeatureID_OOV)

    ApplyWordLengthFeature(node)
    node.ApplyFeature(utils.FeatureID_0)
    return node
Example No. 23
    def runTest(self):
        token = 'StyledEditorKit'
        expected = ['Styled', 'Editor', 'Kit']
        self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                        token + ' is not correctly tokenized')
Example No. 24
    def runTest(self):
        token = 'HTMLEditor'
        expected = ['HTML', 'Editor']
        self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                        token + ' is not correctly tokenized')
Example No. 25
import Normalization
import Tokenization


def cleanText(text, tokenization, normalization):
    """
	Input: string of text
	Return: a list of term/vocabulary after tokenization and normalization 
	"""
    # perform tokenization
    tokens = tokenization.tokenize(text)
    # perform normalization
    tokens = normalization.lemmatize(tokens)
    # get rid of non-meaningful character after tokenization
    tokens = tokenization.getRidPuncuation(tokens)
    return tokens


normalization = Normalization.Normalizer()
tokenization = Tokenization.Tokenizer()

dd = cleanText(
    "adad.adad ada...adad..ad 1941.http u.s.a. #Dadad #Rats sgsgs...",
    tokenization, normalization)
print(dd)
Example No. 26
	def runTest(self):
		token = 'isOSGiCompatible'
		expected = ['is', 'OSGi', 'Compatible']
		self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected), token + ' not correctly tokenized')
Example No. 27
	def runTest(self):
		token = 'HTMLEditor'
		expected = ['HTML', 'Editor']
		self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected), token + ' is not correctly tokenized')
Example No. 28
	def runTest(self):
		token = 'StyledEditorKit'
		expected = ['Styled', 'Editor', 'Kit']
		self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected), token + ' is not correctly tokenized')
Example No. 29
    def test_ApplyWordLengthFeature(self):
        Sentence="李四abc456,sab98中文"
        NodeList = Tokenization.Tokenize(Sentence)
        ApplyLexiconToNodes(NodeList)
        self.assertTrue(C1ID in NodeList.head.features)
        self.assertTrue(D1ID in NodeList.get(1).features)
Example No. 30
    def runTest(self):
        token = 'isOSGiCompatible'
        expected = ['is', 'OSGi', 'Compatible']
        self.assertTrue(isEqual(Tokenization.tokenizeToken(token), expected),
                        token + ' not correctly tokenized')
Example No. 31
def main():
	"""
	The program must accept two command line arguments: 
	the first is the directory containing the documents to be indexed, 
	and the second must be the directory where the index will be stored.
	"""
	# first handle user input
	if len(sys.argv) != 3:
		# number of argument is not correct
		print("Two arguments are needed:")
		print("1. the directory containing the documents to be indexed")
		print("2. the directory where the index will be stored")
		return 
	docDir = sys.argv[1]
	indexDir = sys.argv[2]
	if not os.path.isdir(docDir) or not os.path.isdir(indexDir):
		# the given input dir are invalid
		print("The given directory is invalid")
		return 
	# append / if not present in the directory
	if docDir[-1] != "/":
		docDir += "/"
	if indexDir[-1] != "/":
		indexDir += "/"
	if indexDir == "/":
		indexDir = "." + indexDir
	if docDir == "/":
		docDir = "." + docDir
	# retrieve all documents in the given directory
	allDoc = []
	for subDir in os.walk(docDir):
		# recursively retrieve all files in each subDir
		# docDir is also a subDir of itself
		for doc in subDir[2]:
			# all documents in subDir
			allDoc.append(doc)

	#######################################################################################################################

	# initialization for building the index
	import Normalization 
	import Tokenization 
	import SQLite3database 
	# init text processing classes
	normalization = Normalization.Normalizer()
	tokenization = Tokenization.Tokenizer()
	# create a SQLite3 database
	indexDatabase = SQLite3database.Database(indexDir+"index.db")
	# create title index database
	titleDatabase = SQLite3database.Database(indexDir+"title.db")
	# create table
	createTable(indexDatabase)
	createTable(titleDatabase)
	# init final insert string
	indexDatabase.initInsertString()
	indexDatabase.addBeginTransactionString()
	titleDatabase.initInsertString()
	titleDatabase.addBeginTransactionString()
	# initializing insert strings
	insertDocument = "INSERT INTO document VALUES"
	insertDictionary = "INSERT INTO dictionary VALUES"
	insertTermPosition = "INSERT INTO termPosition VALUES"
	insertDocumentFrequency = "INSERT INTO documentFrequency VALUES"
	insertTermFrequency = "INSERT INTO termFrequency VALUES"

	insertDocumentTitle = "INSERT INTO document VALUES"
	insertDictionaryTitle = "INSERT INTO dictionary VALUES"
	insertTermPositionTitle = "INSERT INTO termPosition VALUES"
	insertDocumentFrequencyTitle = "INSERT INTO documentFrequency VALUES"
	insertTermFrequencyTitle = "INSERT INTO termFrequency VALUES" 

	# store document frequency of each vocabulary
	dictionary = {} # contains all vocabulary (vocabulary as key, document frequency as value)
	titleDic = {} 
	for doc in allDoc:
		# First read and process text from the current document
		# open file to read
		text = open(docDir+doc,"r").read()

		noTxt = doc[:-len(".txt")] if doc.endswith(".txt") else doc  # strip the ".txt" suffix (rstrip removes characters, not a suffix)
		title = " ".join(noTxt.split("_")[2:])

		# process raw text from document
		tokens = cleanText(text, tokenization, normalization) # return a list of term/vocabulary after tokenization and normalization
		titleTokens = cleanText(title.lower(), tokenization, normalization) 
		# Then
		# Traverse the term/vocabulary list and record the information
		# -position 
		# -count
		# init 
		termFrequency = {} # (vocabulary and documentID as key, term frequency as value)
		titleTermFrequency = {}
		documentID = int(doc.split("_")[1]) # extract document ID
		insertDocument += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(tokens))
		insertDocumentTitle += """ ({docID},"{docName}",{docLength}),""".format(docID=documentID, docName=doc, docLength=len(titleTokens))
		alreadyIncrement = {} # used to check whether the document frequency has already been incremented for this document
		alreadyIncrementTitle = {}
		for index,token in enumerate(tokens):
			# insert position of this token in the document
			insertTermPosition += """ ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index+1)
			if token not in dictionary:
				dictionary[token] = 1
				alreadyIncrement[token] = None
				# insert if this is the first time the token is encountered overall
				insertDictionary += """ ("{word}"),""".format(word=token)
			elif token not in alreadyIncrement:
				dictionary[token] += 1
				alreadyIncrement[token] = None
			if token not in termFrequency:
				termFrequency[token] = 1
			else:
				termFrequency[token] += 1
		for key,val in termFrequency.items():
			insertTermFrequency += """ ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)

		for index,token in enumerate(titleTokens):
			# insert position of this token in the document
			insertTermPositionTitle += """ ("{word}",{docID},{position}),""".format(word=token, docID=documentID, position=index+1)
			if token not in titleDic:
				titleDic[token] = 1
				alreadyIncrementTitle[token] = None
				# insert if this is the first time the token is encountered overall
				insertDictionaryTitle += """ ("{word}"),""".format(word=token)
			elif token not in alreadyIncrementTitle:
				titleDic[token] += 1
				alreadyIncrementTitle[token] = None
			if token not in titleTermFrequency:
				titleTermFrequency[token] = 1
			else:
				titleTermFrequency[token] += 1
		for key,val in titleTermFrequency.items():
			insertTermFrequencyTitle += """ ("{word}",{docID},{termFreq}),""".format(word=key, docID=documentID, termFreq=val)


	# insert the document frequency
	for key,val in dictionary.items():
		insertDocumentFrequency += """ ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)

	for key,val in titleDic.items():
		insertDocumentFrequencyTitle += """ ("{word}",{docFrequency}),""".format(word=key, docFrequency=val)

	# get rid of the ',' at the end of each insert string
	# replace it with ';'
	insertDocument = insertDocument[:-1] + ";"
	insertDictionary = insertDictionary[:-1] + ";"
	insertTermPosition = insertTermPosition[:-1] + ";"
	insertTermFrequency = insertTermFrequency[:-1] + ';'
	insertDocumentFrequency = insertDocumentFrequency[:-1] + ";"


	insertDocumentTitle = insertDocumentTitle[:-1] + ";"
	insertDictionaryTitle = insertDictionaryTitle[:-1] + ";"
	insertTermPositionTitle = insertTermPositionTitle[:-1] + ";"
	insertTermFrequencyTitle = insertTermFrequencyTitle[:-1] + ';'
	insertDocumentFrequencyTitle = insertDocumentFrequencyTitle[:-1] + ";"

	# add all insert string to the final insert string
	indexDatabase.addInsertString(insertDocument)
	indexDatabase.addInsertString(insertDictionary)
	indexDatabase.addInsertString(insertTermPosition)
	indexDatabase.addInsertString(insertTermFrequency)
	indexDatabase.addInsertString(insertDocumentFrequency)
	indexDatabase.addCommitString()
	indexDatabase.execute(indexDatabase.getInsertString())
	createBtreeIndex(indexDatabase)
	indexDatabase.close()

	titleDatabase.addInsertString(insertDocumentTitle)
	titleDatabase.addInsertString(insertDictionaryTitle)
	titleDatabase.addInsertString(insertTermPositionTitle)
	titleDatabase.addInsertString(insertTermFrequencyTitle)
	titleDatabase.addInsertString(insertDocumentFrequencyTitle)
	titleDatabase.addCommitString()
	titleDatabase.execute(titleDatabase.getInsertString())
	createBtreeIndex(titleDatabase)
	titleDatabase.close()
Example No. 32
    for i in range(0, 22):
        if i < 10:
            filename = 'file\\reut2-00%i.sgm' % (i)
        else:
            filename = 'file\\reut2-0%i.sgm' % (i)

        f = open(filename, 'r', errors='ignore')
        s = f.read()
        '''parse the file and output the results'''
        parser = Preprocess.Preprocess()
        parser.parse(s)
    '''create another text directory'''
    if not os.path.isdir('tokenization'):
        os.makedirs('tokenization')

    tokenization = Tokenization.Tokenization()

    for j in range(1, 21579):
        filename_txt = 'text\\%d.txt' % (j)

        f = open(filename_txt, 'r')
        s = f.read()

        s = tokenization.case_folding(s)
        #         print('******case folding *****\n%s'%(s))
        s = tokenization.remove_num(s)
        #         print('******num*****\n%s'%(s))
        s = tokenization.remove_punctuation(s)
        s = tokenization.toke_nize(s)
        # #         print ('*******removal punctuation*******\n%s'%s)
def main():
    # First of all check the user input
    indexFilePath, k, printScore, queryTermString = checkInput()
    # open the database file that is given
    indexDatabase = SQLite3database.Database(
        sys.argv[1])  # This also handles file errors
    # cursor
    cursor = indexDatabase.getCursor()
    # check if the tables needed exists in the index storage file
    tablesNeeded = [
        "dictionary", "document", "termPosition", "documentFrequency",
        "termFrequency"
    ]
    if checkIfTableNeedExist(indexDatabase, cursor, tablesNeeded) == False:
        print(
            "The given index storage file does not contain the required Tables."
        )
        indexDatabase.close()
        return
    # last check for k
    cursor.execute("SELECT COUNT(*) FROM document;")
    NumberOfDocument = cursor.fetchall()[0][0]
    if k > int(NumberOfDocument):
        print(
            "The second argument k is larger than the number of document in the input collection."
        )
        print("Arugmnet k should be less or equal to: %d" %
              (int(NumberOfDocument)))
        indexDatabase.close()
        sys.exit(-1)

    ##################################################################################################################################
    """
	At this point, all input has been validated and the database file has been opened.
	The database file has all the information representing each document's language model:
	-tf (term frequency) in each document
	-document length of each document
	along with some other extra information
	"""

    # First of all, do text processing(clean text) on the query term
    # (The same way that is done to the input data document terms)
    import Normalization
    import Tokenization

    normalization = Normalization.Normalizer()
    tokenization = Tokenization.Tokenizer()
    queryTermsList = cleanText(queryTermString, tokenization, normalization)
    print("Query Terms:")
    print(queryTermsList)
    # Perform the computation of probability of generating the query terms on the document model
    topKdocument = ComputeProbabilityGeneratingQueryTerms(
        queryTermsList, cursor, k)
    if printScore == "y":
        print(" %4s %63s" % ("Document Name:", "Query Likelyhood:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]), end="")
            print(document[1])
    else:
        print(" %4s" % ("Document Name:"))
        for index, document in enumerate(topKdocument):
            print("%4d. %-60s" % (index + 1, document[0]))
    # close the database file after
    indexDatabase.close()