Example #1
0
    def __init__(self):
        config = load_config()
        self.query_file = abspath(config.get('DIRS', 'data_dir'),
                                  config.get('FILES', 'query_file'))
        self.stem_query_file = abspath(config.get('DIRS', 'data_dir'),
                                       config.get('FILES', 'stem_query_file'))
        self.data_parser = DataParser()
Example #2
0
class QueryParser:
    def __init__(self):
        config = load_config()
        self.query_file = abspath(config.get('DIRS', 'data_dir'),
                                  config.get('FILES', 'query_file'))
        self.stem_query_file = abspath(config.get('DIRS', 'data_dir'),
                                       config.get('FILES', 'stem_query_file'))
        self.data_parser = DataParser()

    # get queries from file
    def get_queries(self):
        queries = {}
        self.data_parser.initialize()
        self.data_parser.feed(read_file(self.query_file))
        qdata = self.data_parser.get_data()
        i = 3
        while i < len(qdata):
            queries[int(qdata[i].strip())] = parse_stuff(qdata[i + 2])
            i += 8
        return queries

    # get stemmed queries from file
    def get_stem_queries(self):
        squeries = {}
        with open(self.stem_query_file) as f:
            content = f.readlines()
            content = [x.strip(' \t\n') for x in content]
            count = 1
            for each in content:
                if each != "":
                    squeries[count] = each
                    count += 1
        return squeries
Example #3
0
def create_maps(file=None, month=None):
    if month:
        my_data_parser = DataParser(file, month)
        my_data_parser.obtain_by_date(month)
    else:
        my_data_parser = DataParser(file)
        my_data_parser.obtain_all()
Example #4
0
    def __init__(self):
        config = load_config()
        corpus_dir = config.get('DIRS', 'corpus_dir')
        self.raw_docs = abspath(corpus_dir, config.get('DIRS', 'raw_docs'))
        self.parsed_dir = abspath(corpus_dir, config.get('DIRS', 'parsed_dir'))
        self.data_parser = DataParser()
        create_dir(self.parsed_dir)
        self.parsed_content = ""
        self.raw_corpus = os.listdir(self.raw_docs)
        self.stem_dir = abspath(corpus_dir, config.get('DIRS', 'stem_dir'))
        self.stem_file = abspath(config.get('DIRS', 'data_dir'),
                                 config.get('FILES', 'stemmed_docs'))
        create_dir(self.stem_dir)
        self.docs = []
Example #5
0
    def __init__(self, query, scores):
        config = load_config()
        self.raw_docs = abspath(config.get('DIRS', 'corpus_dir'),
                                config.get('DIRS', 'raw_docs'))
        self.parsed_dir = abspath(config.get('DIRS', 'corpus_dir'),
                                  config.get('DIRS', 'parsed_dir'))
        self.stoplist = get_stoplist()
        self.significant_words = set(
            [term for term in query.split() if term not in self.stoplist])

        self.dataparser = DataParser()
        self.snippets = {}
        self.snippet_dir = abspath(config.get('DIRS', 'results'),
                                   config.get('DIRS', 'snippet_dir'))
        create_dir(self.snippet_dir)
        self.doc_scores = sorted(scores.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[:100]
        self.titles = {}
Example #6
0
class SnippetGenerator:
    def __init__(self, query, scores):
        config = load_config()
        self.raw_docs = abspath(config.get('DIRS', 'corpus_dir'),
                                config.get('DIRS', 'raw_docs'))
        self.parsed_dir = abspath(config.get('DIRS', 'corpus_dir'),
                                  config.get('DIRS', 'parsed_dir'))
        self.stoplist = get_stoplist()
        self.significant_words = set(
            [term for term in query.split() if term not in self.stoplist])

        self.dataparser = DataParser()
        self.snippets = {}
        self.snippet_dir = abspath(config.get('DIRS', 'results'),
                                   config.get('DIRS', 'snippet_dir'))
        create_dir(self.snippet_dir)
        self.doc_scores = sorted(scores.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[:100]
        self.titles = {}

    # use Luhn's formula to get medium-frequency words from the document
    def get_freq_terms(self, doc):
        data = read_file(os.path.join(self.parsed_dir, 'CACM-' + doc + '.txt'))
        words = data.split()
        doc_len = len(words) / 15
        word_freq = Counter(words)
        sig_words = []

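        # Assumed reading of the cutoff below: doc_len is the word count
        # scaled down by 15; shorter documents get a lower frequency
        # threshold, mid-length ones a flat 7, longer ones a higher one.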
        if doc_len < 25:
            threshold = 7 - 0.1 * (25 - doc_len)
        elif 25 <= doc_len <= 40:
            threshold = 7
        else:
            threshold = 7 + 0.1 * (doc_len - 40)

        for word in word_freq.keys():
            if word_freq[word] >= threshold:
                sig_words.append(word)

        return sig_words

    def get_snippet(self):

        parsed_sentences = {}
        org_sentences = {}
        for item in self.doc_scores:
            doc = item[0]
            content = read_file(
                os.path.join(self.raw_docs, 'CACM-' + doc + '.html'))
            self.dataparser.initialize()
            self.dataparser.feed(content)
            data = []
            dataparser_op = self.dataparser.get_data()
            end = len(dataparser_op)
            for i in range(end):
                if dataparser_op[i] == 'pre':
                    end = i
            if end > 3:
                org = " ".join(dataparser_op[3:end]).split('\n\n')
            else:
                org = dataparser_op[3].split('\n\n')

            self.titles[doc] = org[1]
            org_data = org[2:-2]

            # parsing document
            for line in org_data:
                data.append(parse_stuff(line, period=True))

            parsed_sentences[doc] = [line.split('.') for line in data]
            org_sentences[doc] = [line.split('.') for line in org_data]

            sig_words = self.significant_words.union(
                set(self.get_freq_terms(doc)))

            for portion_index in range(len(parsed_sentences[doc])):
                portion = parsed_sentences[doc][portion_index]
                for sent_index in range(len(portion)):
                    sent = portion[sent_index]
                    # print("Doc = {}, Data = {}".format(doc, sent))
                    if sent:
                        words = sent.strip().split()
                        first_sig = None
                        last_sig = 0
                        sig_count = 0
                        non_sig_count = 0
                        max_non_sig = 20
                        for i in range(len(words)):
                            if words[i] in sig_words:
                                if first_sig is None:
                                    first_sig = i
                                    sig_count += 1
                                elif non_sig_count <= max_non_sig:
                                    last_sig = i
                                    sig_count += 1
                            elif first_sig is not None and non_sig_count <= max_non_sig:
                                non_sig_count += 1
                        cts = Counter(word in sig_words for word in words)
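                        # Significance factor in the spirit of Luhn's heuristic:
                        # (number of significant words in the bracketed span)^2
                        # over the span length, with a fallback ratio based on
                        # the raw count of significant words when no span could
                        # be bracketed.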
                        if first_sig is not None and last_sig > first_sig:
                            sig_factor = sig_count**2 / (last_sig - first_sig)
                        elif cts[True]:
                            sig_factor = cts[True] / max_non_sig
                        else:
                            sig_factor = 0
                        org_sent = org_sentences[doc][portion_index][sent_index]
                        self.snippets.setdefault(doc, []).append(
                            (org_sent.strip(), sig_factor))

        for doc in self.snippets.keys():
            self.snippets[doc] = sorted(self.snippets[doc],
                                        key=lambda k: k[1],
                                        reverse=True)

    # results -> snippets -> QueryID -> Rank_DocumentID.html
    def save_snippets(self, qno):
        create_dir(os.path.join(self.snippet_dir, str(qno)))
        notfound = []
        for i in range(len(self.doc_scores)):
            doc = self.doc_scores[i][0]
            if not self.snippets.get(doc):
                notfound.append(doc)
                continue
            snips = self.snippets[doc]
            with open(
                    os.path.join(self.snippet_dir, str(qno),
                                 'Rank_' + str(i + 1) + '_' + doc + '.html'),
                    'w+') as f:
                f.write('<html><pre><h2>' + self.titles[doc] + '</h2>')
                for j in range(len(snips[:2])):
                    f.write('<div>Snippet ' + str(j + 1) + ':<p>')
                    for word in snips[j][0].replace('\n', ' <br>').split():
                        if parse_stuff(word.lower()) in self.significant_words:
                            f.write('<b>' + word + '</b> ')
                        else:
                            f.write(word + ' ')
                    f.write('</p></div>')
                f.write('</pre></html>')
        if notfound:
            print("Snippets not found for {} ... {}".format(
                len(notfound), notfound))
Example #7
0
def create_maps(file=None):
    my_data_parser = DataParser(file)
    my_data_parser.obtain_all()
Example #8
0
class Parser:
    def __init__(self):
        config = load_config()
        corpus_dir = config.get('DIRS', 'corpus_dir')
        self.raw_docs = abspath(corpus_dir, config.get('DIRS', 'raw_docs'))
        self.parsed_dir = abspath(corpus_dir, config.get('DIRS', 'parsed_dir'))
        self.data_parser = DataParser()
        create_dir(self.parsed_dir)
        self.parsed_content = ""
        self.raw_corpus = os.listdir(self.raw_docs)
        self.stem_dir = abspath(corpus_dir, config.get('DIRS', 'stem_dir'))
        self.stem_file = abspath(config.get('DIRS', 'data_dir'),
                                 config.get('FILES', 'stemmed_docs'))
        create_dir(self.stem_dir)
        self.docs = []

    # generates clean corpus from raw documents
    def parse_documents(self):
        for doc in self.raw_corpus:
            with open(os.path.join(self.raw_docs, doc), 'r') as f:
                content = f.read()
                self.data_parser.initialize()
                self.data_parser.feed(content)
                dataparser_op = self.data_parser.get_data()
                end = len(dataparser_op)
                for i in range(end):
                    if dataparser_op[i] == 'pre':
                        end = i
                if end > 3:
                    self.parsed_content = parse_stuff(" ".join(
                        dataparser_op[3:end]))
                else:
                    self.parsed_content = parse_stuff(dataparser_op[3])
                # remove trailing numbers (specific to the CACM corpus)
                pmindex = self.parsed_content.rfind('pm')
                if pmindex == -1:
                    amindex = self.parsed_content.rfind('am')
                    self.parsed_content = self.parsed_content[:amindex + 2]
                else:
                    self.parsed_content = self.parsed_content[:pmindex + 2]
            write_file(
                os.path.join(self.parsed_dir, doc.replace('.html', '.txt')),
                self.parsed_content)

    # generates clean corpus from stemmed documents
    def stem_parse_documents(self):
        with open(self.stem_file) as f:
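            # Assumption: the stemmed CACM file separates documents with '#'
            # markers, so splitting on '#' yields one entry per document.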
            content = f.read().split('#')
            for each in content:
                self.data_parser.initialize()
                self.data_parser.feed(each)
                each = parse_stuff(each)
                pmindex = each.rfind('pm')
                if pmindex == -1:
                    each = each[:each.rfind('am') + 2]
                else:
                    each = each[:pmindex + 2]
                self.docs.append(each)
            for doc in self.docs:
                if doc != "":
                    write_file(
                        os.path.join(
                            self.stem_dir, 'CACM-' +
                            str(doc).split(" ")[0].zfill(4) + ".txt"),
                        " ".join(str(doc).split(" ")[1:]))