Пример #1
0
def similar(docSet, InvIndex, word):
    """Print up to ten terms that most often occur next to ``word``.

    Every document of ``docSet`` that ``InvIndex`` lists for ``word`` is
    re-read and passed through parseXML -> lowercase -> tokenize ->
    filterData -> stemmer.  For each occurrence of ``word`` the terms within
    three positions on either side are collected, and the most frequent of
    them are printed.

    Args:
        docSet: Iterable of document identifiers.  Each one is zero-padded to
            four digits and appended to "cranfield" to form the file name.
        InvIndex: Inverted index.  It must hold the corpus directory under the
            key "path_of_documents", and ``InvIndex[word]`` must support
            ``in`` tests on document identifiers.
        word: Term to look up; presumably in the same stemmed form as the
            index keys -- TODO confirm against the caller.

    Returns:
        None.  The outcome is written to standard output.
    """
    # Function-scope import so the block does not rely on a file-level one.
    from collections import Counter

    window = 3          # neighbours taken on each side of an occurrence
    max_context = 1000  # soft cap on collected neighbours (checked per occurrence)
    top_n = 10          # number of terms reported

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    if word not in InvIndex:
        print("")
        print("Word is not in the index.")
        print("")
        return

    postings = InvIndex[word]
    doc_dir = InvIndex["path_of_documents"]
    context = []
    for doc in docSet:
        if len(context) >= max_context:
            # Cap reached: parsing further documents cannot change the result.
            break
        if doc not in postings:
            continue
        doc_path = doc_dir + "/" + "cranfield" + zfill(str(doc), 4)
        tokens = stemmer(filterData(tokenize(lowercase(parseXML(doc_path)))))
        tokens = [t for t in tokens if t != ""]
        for pos, token in enumerate(tokens):
            if token != word:
                continue
            if len(context) >= max_context:
                break
            # Clamp at the document start instead of wrapping with a modulo:
            # a wrapped bound yields an empty (or far-away) slice.  Slicing
            # past the end is already safe in Python.
            before = tokens[max(0, pos - window):pos]
            after = tokens[pos + 1:pos + 1 + window]
            # The query term is not a useful neighbour of itself.
            context.extend(t for t in after + before if t != word)

    # Counter ranks in one pass; list.count per distinct term was quadratic.
    result = [term for term, _count in Counter(context).most_common(top_n)]
    print("")
    print("List of Similar terms (in stemmed form) is")
    print(result)
    print("")
Пример #2
0
def MakeIndex(path):
    """Build an inverted index from every entry of the directory ``path``.

    Each entry is read with parseXML, then passed through lowercase ->
    tokenize -> filterData -> stemmer, and the resulting terms are merged
    into the index by ``invertedListAppend`` together with the entry's name.
    The directory itself is stored under the key 'path_of_documents'.

    Args:
        path: Directory that holds the documents to index.

    Returns:
        The index dictionary: the 'path_of_documents' entry plus whatever
        ``invertedListAppend`` contributes for each file.

    Raises:
        SystemExit: with status 1 when ``path`` is not an existing directory.
    """
    # Function-scope imports so the block does not rely on file-level ones.
    import sys
    from os.path import isdir

    # isdir rather than exists: a regular file "exists" but cannot be listed.
    if not isdir(path):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s is not a valid path, exiting..." % (path,))
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    InvertedIndex = {'path_of_documents': path}
    # listdir returns entries in arbitrary order; sorting makes the order in
    # which documents are added to the index reproducible.
    for file_name in sorted(listdir(path)):
        text = parseXML(path + "/" + file_name)
        text = stemmer(filterData(tokenize(lowercase(text))))
        InvertedIndex = invertedListAppend(text, file_name, InvertedIndex)
    return InvertedIndex