Example #1
    def create(self):
        docs = os.listdir(self.doc_dir)
        index = {}

        for doc in docs:
            file_name = os.path.join(self.doc_dir, doc)
            if file_name.endswith('.txt'):
                doc_name = doc.replace('CACM-', '').replace('.txt', '')
                doc_text = read_file(file_name).strip().split()
                for term in set(doc_text):
                    # Posting entry: [doc_id, term_frequency, position_1, ..., position_n]
                    entry = [doc_name, doc_text.count(term)]
                    entry += [i for i, x in enumerate(doc_text) if x == term]
                    # Keep any existing postings sorted by term frequency, then append
                    inv_list = sorted(index[term], key=itemgetter(1)) if term in index else []
                    inv_list.append(entry)
                    index[term] = inv_list

        self.index = index
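
For orientation, here is a minimal standalone sketch (toy in-memory corpus, hypothetical doc ids, not part of the original code) of the posting-list layout that create() builds, where each term maps to entries of the form [doc_id, term_frequency, position_1, ..., position_n]:

# Toy corpus standing in for the CACM .txt files read by create()
toy_corpus = {'0001': 'a b a'.split(), '0002': 'b c'.split()}

index = {}
for doc_name, doc_text in toy_corpus.items():
    for term in set(doc_text):
        entry = [doc_name, doc_text.count(term)]
        entry += [i for i, x in enumerate(doc_text) if x == term]
        index.setdefault(term, []).append(entry)

print(index['b'])  # [['0001', 1, 1], ['0002', 1, 0]]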
Example #2
    def scores(self, query):
        scores = {}
        docs = os.listdir(self.doc_dir)
        corpus_len = len(docs)

        for term in query.split():
            if term in self.index:
                # In mode 1, stopwords contribute nothing to the score
                if self.mode == 1 and term in self.stoplist:
                    continue

                inv_list = self.index[term]
                df = len(inv_list) / corpus_len

                for entry in inv_list:
                    doc_id = entry[0]
                    doc_name = 'CACM-' + doc_id + '.txt'
                    # Document length in tokens, for term-frequency normalization
                    doc_len = len(read_file(os.path.join(self.doc_dir, doc_name)).split())
                    tf = entry[1] / doc_len

                    # tf-idf: normalized term frequency times log inverse document frequency
                    score = tf * math.log(1 / df)
                    scores[doc_id] = scores.get(doc_id, 0) + score

        return scores
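
As a quick sanity check of the weighting used above, a standalone sketch of the same tf * log(1/df) computation with made-up counts (the numbers are purely illustrative):

import math

# Hypothetical figures: a term appearing in 10 of 1000 documents,
# 3 times in a 150-token document.
corpus_len = 1000
df = 10 / corpus_len            # normalized document frequency
tf = 3 / 150                    # normalized term frequency
score = tf * math.log(1 / df)   # same weighting as scores() above
print(round(score, 4))          # 0.0921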
Example #3
    def __init__(self, mode):
        paths = get_model_paths(mode)
        self.mode = mode
        self.doc_dir = paths['doc_dir']
        self.index = file_to_dict(paths['index_file'])
        # Per-document lengths in tokens, keyed by bare doc id ('CACM-1234.txt' -> '1234')
        self.dlens = {
            doc.replace('CACM-', '').replace('.txt', ''):
                len(read_file(os.path.join(self.doc_dir, doc)).split())
            for doc in os.listdir(self.doc_dir)
        }
        self.clen = sum(self.dlens.values())
        self.stoplist = get_stoplist()
Example #4
    def __init__(self, model):
        config = load_config()
        data_dir = config.get('DIRS', 'data_dir')
        stopwords_file = abspath(data_dir, config.get('FILES', 'common_words'))
        corpus_dir = config.get('DIRS', 'corpus_dir')
        self.stopwords = read_file(stopwords_file).split('\n')
        self.parsed_dir = abspath(corpus_dir, config.get('DIRS', 'parsed_dir'))
        self.model = model
Example #5
    def get_freq_terms(self, doc_id):
        doc_path = os.path.join(self.parsed_dir, 'CACM-' + doc_id + '.txt')
        terms = read_file(doc_path).split()
        most_common = Counter(terms).most_common()
        # Drop stopwords, then keep the three most frequent (term, count) pairs
        freq_terms = [
            word_tuple for word_tuple in most_common
            if not self.is_stop_word(word_tuple[0])
        ]
        return freq_terms[:3]
Example #6
    def get_run(self):
        run = {}
        run_text = read_file(self.results_file_path)
        run_text = run_text.replace('\n\n', '\n')
        # Each result line: query id in column 0, document id in column 2
        for line in run_text.split('\n')[:-1]:
            data = line.split()
            query_id = data[0]
            doc_id = data[2]
            run.setdefault(query_id, []).append(doc_id)
        return run
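
get_run() appears to parse a TREC-style run file (query id in the first column, document id in the third); a self-contained sketch with made-up lines, assuming that format:

# Hypothetical run lines in 'query_id Q0 doc_id rank score tag' form
run_text = "1 Q0 1938 1 12.7 tfidf\n1 Q0 2433 2 11.9 tfidf\n"

run = {}
for line in run_text.split('\n')[:-1]:
    data = line.split()
    run.setdefault(data[0], []).append(data[2])

print(run)  # {'1': ['1938', '2433']}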
Example #7
    def get_queries(self):
        queries = {}
        self.data_parser.initialize()
        self.data_parser.feed(read_file(self.query_file))
        qdata = self.data_parser.get_data()
        # Query ids and texts sit at fixed offsets in the parser output:
        # the id at index i, the query text two slots later, repeating every 8 entries
        i = 3
        while i < len(qdata):
            queries[int(qdata[i].strip())] = parse_stuff(qdata[i + 2])
            i += 8
        return queries
Example #8
    def get_freq_terms(self, doc):
        data = read_file(os.path.join(self.parsed_dir, 'CACM-' + doc + '.txt'))
        words = data.split()
        # Document length measured in blocks of ~15 tokens, as used by the threshold rule below
        doc_len = len(words) / 15
        word_freq = Counter(words)
        sig_words = []

        # Luhn-style significance threshold: lower for short documents, higher for long ones
        if doc_len < 25:
            threshold = 7 - 0.1 * (25 - doc_len)
        elif 25 <= doc_len <= 40:
            threshold = 7
        else:
            threshold = 7 + 0.1 * (doc_len - 40)

        for word, freq in word_freq.items():
            if freq >= threshold:
                sig_words.append(word)

        return sig_words
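
The piecewise cut-off above looks like a Luhn-style significance threshold keyed to document length; a standalone restatement of just the threshold rule, evaluated at a few hypothetical lengths:

def luhn_threshold(doc_len):
    # Same piecewise rule as get_freq_terms() above
    if doc_len < 25:
        return 7 - 0.1 * (25 - doc_len)
    elif doc_len <= 40:
        return 7
    return 7 + 0.1 * (doc_len - 40)

print(luhn_threshold(10))   # 5.5
print(luhn_threshold(30))   # 7
print(luhn_threshold(100))  # 13.0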
Example #9
    def get_snippet(self):

        parsed_sentences = {}
        org_sentences = {}
        for item in self.doc_scores:
            doc = item[0]
            content = read_file(
                os.path.join(self.raw_docs, 'CACM-' + doc + '.html'))
            self.dataparser.initialize()
            self.dataparser.feed(content)
            data = []
            dataparser_op = self.dataparser.get_data()
            # Truncate the parsed output at the last 'pre' marker, if any
            end = len(dataparser_op)
            for i in range(end):
                if dataparser_op[i] == 'pre':
                    end = i
            if end > 3:
                org = " ".join(dataparser_op[3:end]).split('\n\n')
            else:
                org = dataparser_op[3].split('\n\n')

            self.titles[doc] = org[1]
            org_data = org[2:-2]

            # parsing document
            for line in org_data:
                data.append(parse_stuff(line, period=True))

            parsed_sentences[doc] = [line.split('.') for line in data]
            org_sentences[doc] = [line.split('.') for line in org_data]

            sig_words = self.significant_words.union(
                set(self.get_freq_terms(doc)))

            for portion_index, portion in enumerate(parsed_sentences[doc]):
                for sent_index, sent in enumerate(portion):
                    if sent:
                        words = sent.strip().split()
                        first_sig = None   # index of the first significant word
                        last_sig = 0       # index of the most recent significant word counted
                        sig_count = 0
                        non_sig_count = 0
                        max_non_sig = 20   # cap on non-significant words counted after the first significant one
                        for i in range(len(words)):
                            if words[i] in sig_words:
                                if first_sig is None:
                                    first_sig = i
                                    sig_count += 1
                                elif non_sig_count <= max_non_sig:
                                    last_sig = i
                                    sig_count += 1
                            elif first_sig is not None and non_sig_count <= max_non_sig:
                                non_sig_count += 1
                        cts = Counter(word in sig_words for word in words)
                        if first_sig is not None and last_sig > first_sig:
                            # Luhn-style factor: squared significant-word count over the span they bracket
                            sig_factor = sig_count**2 / (last_sig - first_sig)
                        elif cts[True]:
                            sig_factor = cts[True] / max_non_sig
                        else:
                            sig_factor = 0
                        self.snippets.setdefault(doc, []).append(
                            (org_sentences[doc][portion_index]
                             [sent_index].strip(), sig_factor))

        # Rank each document's candidate snippet sentences by significance factor, best first
        for doc in self.snippets:
            self.snippets[doc] = sorted(self.snippets[doc],
                                        key=lambda k: k[1],
                                        reverse=True)
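
The per-sentence scoring inside get_snippet() resembles Luhn's sentence-significance factor: the squared count of significant words divided by the span they bracket. A simplified standalone sketch with a made-up sentence and word set (it ignores the max_non_sig gap limit that the method applies):

# Hypothetical significant-word set and sentence
sig_words = {'index', 'retrieval'}
words = 'an index of retrieval terms'.split()

positions = [i for i, w in enumerate(words) if w in sig_words]
if len(positions) > 1:
    # (number of significant words)^2 / span between the first and last of them
    sig_factor = len(positions) ** 2 / (positions[-1] - positions[0])
elif positions:
    sig_factor = len(positions) / 20   # fallback mirroring cts[True] / max_non_sig above
else:
    sig_factor = 0

print(sig_factor)  # 2.0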