import html
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def make_preprocessed_voca_list(raw_list, stem_opt):
    # stem_opt selects Porter stemming ('stem'), WordNet lemmatization
    # ('lem'), or no normalization ('N')
    preprocessed_voca_list = []
    assert stem_opt in ['stem', 'lem', 'N']
    stopword_list = set(stopwords.words('english'))

    if stem_opt == 'stem':
        stem = PorterStemmer()
    elif stem_opt == 'lem':
        stem = WordNetLemmatizer()

    for line in raw_list:
        # missing values (e.g. NaN from pandas) arrive as floats; append a
        # placeholder so the output stays aligned with the input
        if isinstance(line, float):
            preprocessed_voca_list.append('')
        else:
            # unescape HTML entities, lowercase, and keep only alphabetic
            # tokens of three or more characters
            sentence_temp = html.unescape(line).lower()
            sentence_temp = re.compile('[a-z]{3,}').findall(sentence_temp)
            sentence_temp = list(
                filter(lambda x: x not in stopword_list, sentence_temp))
            if stem_opt == 'stem':
                sentence_temp = [stem.stem(w) for w in sentence_temp]
            elif stem_opt == 'lem':
                sentence_temp = [stem.lemmatize(w) for w in sentence_temp]
            preprocessed_voca_list.append(sentence_temp)

    return preprocessed_voca_list
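A minimal usage sketch for the function above, assuming the NLTK 'stopwords'
and 'wordnet' corpora are already downloaded; the sample input and the
printed results are illustrative only:

# one-time setup: nltk.download('stopwords'); nltk.download('wordnet')
docs = ["The &amp; quick brown foxes were running!", float('nan')]

print(make_preprocessed_voca_list(docs, 'stem'))
# expected, roughly: [['quick', 'brown', 'fox', 'run'], '']
print(make_preprocessed_voca_list(docs, 'N'))
# expected, roughly: [['quick', 'brown', 'foxes', 'running'], '']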
Example #2
    def main(self):
        # create the command line argument parser
        parser = argparse.ArgumentParser()

        # optional argument for a stoplist file
        parser.add_argument('-s',
                            action='store',
                            dest='stop_list_path',
                            help='use stoplist file')

        # mandatory argument for the collection file
        parser.add_argument('-c',
                            action='store',
                            help='the file path to the documents',
                            dest='collection',
                            required=True)

        # mandatory argument for the index (output) file
        parser.add_argument('-i',
                            action='store',
                            help='the index file',
                            dest='index',
                            required=True)

        # optional flag to reuse the existing index file
        parser.add_argument('-I',
                            action='store_true',
                            help='use existing index file',
                            dest='use_index')

        # optional flag to compute weights on the fly
        parser.add_argument('-F',
                            action='store_true',
                            help='use on the fly computation',
                            dest='use_fly')

        # optional argument selecting the type of stemmer
        parser.add_argument('-m',
                            action='store',
                            help='specify type of stemmer',
                            dest='stem')

        # optional argument for running a single query by id
        parser.add_argument('-q',
                            action='store',
                            help='use query id',
                            dest='query_id',
                            type=int)

        # optional argument for the results output file name
        parser.add_argument('-o',
                            action='store',
                            help='output file name',
                            dest='output_file')

        # optional argument for the term frequency weighting type
        parser.add_argument('-tf',
                            action='store',
                            help='term frequency calculation type',
                            dest='tf_type')

        # optional argument for the inverse document frequency weighting type
        parser.add_argument('-idf',
                            action='store',
                            help='inverse doc frequency calculation type',
                            dest='idf_type')

        # optional argument for the top N results, i.e. the number of
        # results returned per query
        parser.add_argument('-n',
                            action='store',
                            help='the number of documents to return for query',
                            dest='numberOfDocs')

        # optional argument to use Rocchio relevance feedback; defaults to
        # showing 10 documents when -r is given without a value
        parser.add_argument('-r',
                            action='store',
                            help='rocchio relevance mode',
                            type=int,
                            nargs='?',
                            const=10,
                            dest='rr')

        # parse the supplied arguments
        args = parser.parse_args()

        # if a stop list was supplied, load its data
        if args.stop_list_path:
            from StopListLoader import loadStopList
            data = loadStopList(args.stop_list_path)
            self.stops.update(data)

        # if a name for the index file was supplied, set it
        if args.index:
            self.outputName = args.index

        # if the stemming flag is set we can choose between several
        # stemmers as well as a lemmatizer
        if args.stem:
            stemmer = None
            if args.stem == "p":
                from nltk.stem import PorterStemmer
                stemmer = PorterStemmer()
                self.stem = stemmer.stem
            elif args.stem == "s":
                from nltk.stem.snowball import SnowballStemmer
                stemmer = SnowballStemmer("english")
                self.stem = stemmer.stem
            elif args.stem == "l":
                from nltk.stem.lancaster import LancasterStemmer
                stemmer = LancasterStemmer()
                self.stem = stemmer.stem
            elif args.stem == "lem":
                import nltk
                nltk.download("wordnet")
                from nltk.stem import WordNetLemmatizer
                stemmer = WordNetLemmatizer()
                self.stem = stemmer.lemmatize
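        # for context: the three stemmers heuristically strip suffixes
        # (PorterStemmer().stem("running") returns "run"), whereas the
        # WordNet lemmatizer maps tokens to dictionary entries and leaves
        # anything it cannot resolve unchanged (lemmatize("running")
        # returns "running" when no part-of-speech tag is given)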

        # if -I is set then reuse the existing index file
        if args.use_index:
            self.useExistingIndexFile = True

        # if we wish to get the result set for just a single query
        if args.query_id:
            self.queryId = args.query_id

        # the name of the results file when output
        if args.output_file:
            self.outputFileName = args.output_file

        # the number of documents to return for a query;
        # -1 means an optimized result set
        if args.numberOfDocs:
            self.numberOfDocs = int(args.numberOfDocs)

        # the type of term frequency weighting to use
        if args.tf_type:
            if args.tf_type == "n":
                self.tfWeighter = Tf_Weighting()
            elif args.tf_type == "l":
                self.tfWeighter = Logarithmic_Tf_Weighting()
            elif args.tf_type == "a":
                self.tfWeighter = Augmented_Tf_Weighting()
            elif args.tf_type == "b":
                self.tfWeighter = Boolean_Tf_Weighting()
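        # these letters appear to mirror the usual SMART notation for tf
        # variants: n (natural) tf, l (logarithm) 1 + log(tf),
        # a (augmented) 0.5 + 0.5 * tf / max_tf,
        # b (boolean) 1 if the term occurs, else 0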

        # the type of inverse document frequency weighting to use
        if args.idf_type:
            if args.idf_type == "p":
                self.idfType = "p"
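        # "p" presumably selects probabilistic idf in the SMART sense,
        # i.e. log((N - df) / df) instead of the plain log(N / df)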

        # sets how many documents should be displayed
        # when using Rocchio relevance feedback
        if args.rr:
            self.rrAmount = args.rr

        # relevance feedback mode can only be used when a query id is specified
        if self.queryId == -1 and self.rrAmount != -1:
            sys.exit(
                "If you are using rocchio relevance you must supply a query id[-q id]"
            )

        if args.use_fly:
            self.onTheFly = 1

        # on the fly generation cannot be combined with relevance feedback
        if self.onTheFly == 1 and self.rrAmount != -1:
            sys.exit(
                "If you are using rocchio relevance you cannot use on the fly generation"
            )

        # run the IR system and report timing
        start = time.time()
        self.runIRS(args.collection)
        self.printOutputTiming(start, args)
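A minimal invocation sketch for the parser above; the script name irs.py is
an assumption for illustration, while the flags come directly from the
add_argument calls in main:

# hypothetical command lines (script name assumed):
#   python irs.py -c collection.txt -i index.out -m p -tf l -n 10
#   python irs.py -c collection.txt -i index.out -q 3 -r 20
# The first builds/uses an index with Porter stemming ('-m p') and
# logarithmic tf weighting ('-tf l'), returning 10 documents per query.
# The second runs query 3 with Rocchio relevance feedback, displaying
# 20 feedback documents; note that -r requires -q, as checked in main.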