Exemplo n.º 1
0
    def __init__(self, text, collection, stemmer=None, stopwordVocab=None):
        """Build a query-like object's term vocabulary from raw text.

        Args:
            text: raw query string to tokenize.
            collection: document collection, used later for IDF weights.
            stemmer: optional stemmer forwarded to the parser.
            stopwordVocab: optional stopword vocabulary for filtering.
        """
        self.text = text
        self.collection = collection
        self.results = {}
        self.vocab = Vocabulary()

        # Lazily-computed caches; None means "not computed yet".
        self.norm = None
        self.weights = None

        # Parse the text into (stemmed, stopword-filtered) terms.
        self.parser = UtteranceTextParser(stemmer, stopwordVocab)
        for word in self.parser.getWords(text):
            self.vocab.add(word)

        # Highest term frequency across the vocabulary.
        # FIX: call getWordCount once per word (original called it twice).
        self.maxFreq = 0
        for word in self.getWordList():
            count = self.getWordCount(word)
            if count > self.maxFreq:
                self.maxFreq = count
Exemplo n.º 2
0
def main():
    """CLI entry point.

    Parses arguments, optionally builds a stemmer and stopword vocabulary,
    constructs (or unpickles) an UtteranceCollection, then runs an optional
    text query and word-statistics report.

    Returns an errno-style int on failure, None on success.
    """
    # ARGUMENT PARSING
    argParser = buildArguments()
    args = argParser.parse_args()

    print("----------")
    # STEMMING CREATION
    stemmer = None
    if args.stem:
        stemmer = PorterStemmer()

    # STOPWORD CREATION
    stopwordVocab = None
    if args.stopword_filename:
        if args.stopword_filename[-4:] != '.txt':
            print('Wrong stopword file format: please provide a .txt file')
            return errno.EINVAL
        try:
            with open(args.stopword_filename) as raw_data_file:
                print('Building stopword vocabulary: %s' % (args.stopword_filename))
                stopwordList = raw_data_file.readlines()
        except OSError as e:
            if e.errno == errno.ENOENT:
                # FIX: was args.file, which is not a defined argument.
                print('Could not find stopword file %s' % (args.stopword_filename))
                return errno.ENOENT
            # FIX: don't fall through with stopwordList unbound.
            raise
        stopwordVocab = Vocabulary()
        for w in stopwordList:
            stopwordVocab.add(w.strip())
        print('Done!')
        print("----------")

    # COLLECTION CREATION
    collection = None
    if args.filename[-5:] == '.json':
        try:
            with open(args.filename) as raw_data_file:
                print('Processing JSON Utterances file: %s' % (args.filename))
                try:
                    jsonData = json.load(raw_data_file)
                except ValueError:
                    print("failed to open JSON file")
                    return errno.EINVAL
        except OSError as e:
            if e.errno == errno.ENOENT:
                # FIX: was args.file, which is not a defined argument.
                print('Could not find file %s' % (args.filename))
                return errno.ENOENT
            # FIX: don't fall through with jsonData unbound.
            raise
        collection = UtteranceCollection(
            jsonData, stemmer=stemmer, stopwordVocab=stopwordVocab,
            dedup=args.dedup, metadata=args.metadata)
        print('Done!')
        print("----------")
        # PICKLE THE COLLECTION
        if args.pickle:
            try:
                pickleFilename = args.filename.replace(".", "_") + ".pickle"
                print("Saving to file %s" % (pickleFilename))
                with open(pickleFilename, "wb") as outFile:
                    pickle.dump(collection, outFile)
                    print("Save Successful!")
                    print("----------")
            except OSError as e:
                if e.errno == errno.ENOENT:
                    print('Could not open file %s' % (args.filename))
                    return errno.ENOENT
                raise
    elif args.filename[-7:] == '.pickle':
        # OPEN FROM PICKLED FILE
        print("Opening processed file %s" % (args.filename))
        try:
            # FIX: pickle files must be opened in binary mode ("rb");
            # text mode raises under Python 3.
            with open(args.filename, "rb") as pickleFile:
                collection = pickle.load(pickleFile)
                print("Load Successful!")
                print("----------")
        # FIX: catch specific failures instead of a bare except,
        # and return errno.EINVAL instead of a magic 22.
        except (OSError, pickle.UnpicklingError):
            print("Something went very wrong")
            return errno.EINVAL
    else:
        print('File issue, make sure you include a queryfile with the .pickle results')
        return errno.EINVAL

    # QUERYING
    query = None  # FIX: guard so args.word below can't hit an unbound name
    if args.query_filename and args.query_filename[-4:] == '.txt':
        # TOP RESULTS
        topResultNum = int(args.top) if args.top else DEF_TOP
        # OPEN QUERY FILE
        print("Opening query file: %s" % (args.query_filename))
        try:
            with open(args.query_filename) as qfile:
                queryText = qfile.read().replace("\n", " ").strip()
        except OSError as e:
            if e.errno == errno.ENOENT:
                # FIX: report the query file, not the collection file.
                print('Could not find file %s' % (args.query_filename))
                return errno.ENOENT
            else:
                raise
        query = Query(queryText, collection, stemmer=stemmer, stopwordVocab=stopwordVocab)
        print("Finding documents related to:")
        print('~' * WIDTH)
        dedented_text = textwrap.dedent(queryText).strip()
        print(textwrap.fill(dedented_text, initial_indent='    ', subsequent_indent='    ', width=100))
        query.findResults()
        print('~' * WIDTH)
        print("Finding top %s results" % topResultNum)
        print('~' * WIDTH)
        # Pretty-print each result with its similarity score and metadata.
        i = 1
        for res in query.getResults(topResultNum):
            doc = res.getDocument()
            resultStr = "%-2d|%.3f\tpid:%d\tperson:%s" % \
                (i, res.getSimilarity(), doc.pid, doc.PersonType)
            print(resultStr)
            print('~' * WIDTH)
            dedented_text = textwrap.dedent(doc.text).strip()
            print(textwrap.fill(dedented_text, initial_indent=' ' * INDENT, subsequent_indent=' ' * INDENT, width=WIDTH))
            print('~' * WIDTH)
            i += 1

    # WORD STATISTICS
    if args.word:
        print("----------")
        collection.printStatistics(args.word)
        print("----------")
        # FIX: original raised NameError here when no query file was given.
        if query is not None:
            query.printStatistics(args.word)
            print("----------")
Exemplo n.º 3
0
class Query(object):
    """A parsed free-text query scored against a document collection.

    Terms are extracted with UtteranceTextParser, weighted by TF * IDF
    against `collection`, and documents are ranked via QueryResult's
    ordering (kept sorted with bisect.insort).
    """

    def __init__(self, text, collection, stemmer=None, stopwordVocab=None):
        """Tokenize `text` and build the query's term vocabulary.

        Args:
            text: raw query string.
            collection: document collection providing IDF statistics.
            stemmer: optional stemmer forwarded to the parser.
            stopwordVocab: optional stopword vocabulary for filtering.
        """
        self.text = text
        self.collection = collection
        self.results = {}
        self.vocab = Vocabulary()

        # Lazily-computed caches; None means "not computed yet".
        self.norm = None
        self.weights = None

        # Parse the query text into terms.
        self.parser = UtteranceTextParser(stemmer, stopwordVocab)
        for word in self.parser.getWords(text):
            self.vocab.add(word)

        # Highest term frequency in the query.
        # FIX: call getWordCount once per word (original called it twice).
        self.maxFreq = 0
        for word in self.getWordList():
            count = self.getWordCount(word)
            if count > self.maxFreq:
                self.maxFreq = count

    def __iter__(self):
        return self.vocab.__iter__()

    def addWord(self, word):
        """Add a single term to the query vocabulary."""
        self.vocab.add(word)

    def getWordCount(self, word):
        """Return the term frequency of `word` in this query."""
        return self.vocab.getWordCount(word)

    def getWordList(self):
        """Return the distinct terms of this query."""
        return self.vocab.getWordList()

    # Term Weights
    def calculateNorm(self):
        """Compute the Euclidean norm of the TF-IDF weight vector."""
        sumSquares = 0.0
        weights = self.getWeights()
        for w in weights:
            sumSquares += weights[w] * weights[w]
        self.norm = math.sqrt(sumSquares)

    def getNorm(self):
        """Return the weight-vector norm, computing it on first use."""
        # FIX: compare against None so a legitimate 0.0 norm (empty
        # query) is cached instead of being recomputed on every call.
        if self.norm is None:
            self.calculateNorm()
        return self.norm

    def calculateWeights(self):
        """Compute the TF * IDF weight of every query term."""
        self.weights = {}
        for term in self.getWordList():
            tf = self.getWordCount(term)
            idf = self.collection.inverseDocumentFrequency(term)
            self.weights[term] = tf * idf

    def getWeights(self):
        """Return the term-weight dict, computing it on first use."""
        # FIX: compare against None so an empty weight dict (no terms)
        # is cached instead of being recomputed on every call.
        if self.weights is None:
            self.calculateWeights()
        return self.weights

    def getTermWeight(self, term):
        """Return the TF-IDF weight of `term`, or 0 if it is not a query term."""
        weights = self.getWeights()
        if term not in weights:
            return 0
        return weights[term]

    def printStatistics(self, string):
        """Print TF / IDF / weight statistics for each parsed word of `string`."""
        for w in self.parser.getWords(string):
            print("Query Stats")
            print("Word: %s"   % w)
            print("TF  : %d"   % self.getWordCount(w))
            print("IDF : %.3f" % self.collection.inverseDocumentFrequency(w))
            print("WGHT: %.3f" % self.getTermWeight(w))

    # Results
    def findResults(self):
        """Score every document in the collection, keeping results sorted."""
        self.results = []
        for doc in self.collection.getDocuments():
            result = QueryResult(doc, self)
            bisect.insort(self.results, result)

    def getResults(self, topK=None):
        """Return scored results, computing them if needed.

        Args:
            topK: when truthy, return only the first topK results
                (topK=0 returns all — historical behavior preserved).
        """
        if not self.results:
            self.findResults()
        if topK:
            return self.results[:topK]
        return self.results