Example No. 1
import os
import sqlite3
import string

from nltk.tokenize import sent_tokenize, word_tokenize

# `stem` (a stemming function) and `stop` (a stopword set) are assumed to be
# defined elsewhere in the original module.

def map():
    # Emit a (stemmed_word, next_word, 1) triple for every bigram in the corpus.
    with sqlite3.connect(os.getcwd() + '/../../data/database.sqlite') as database:
        cursor = database.cursor()
        cursor.execute('SELECT paper_text FROM papers')
        for doc in cursor.fetchall():
            text = doc[0]
            lines = sent_tokenize(text)
            for line in lines:
                words = word_tokenize(line.lower())
                # Drop stopwords, single characters, and tokens containing
                # digits or punctuation.
                words = [w for w in words
                         if w not in stop
                         and not any(c.isdigit() or c in string.punctuation for c in w)
                         and len(w) != 1]
                for first, second in zip(words, words[1:]):
                    sFirst = stem(first)
                    print(sFirst, second, 1)
    return True
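The (stemmed word, next word, 1) triples printed by map() read like the map half of a MapReduce-style bigram count; a minimal, hypothetical reduce step over that output could look like the sketch below (the line format and the helper name reduce_bigram_counts are assumptions, not part of the original code).

import sys
from collections import defaultdict

def reduce_bigram_counts(lines):
    # Hypothetical reducer: sum the trailing count of each
    # "<stemmed_first> <second> <count>" line emitted by map() above.
    counts = defaultdict(int)
    for line in lines:
        first, second, count = line.split()
        counts[(first, second)] += int(count)
    return counts

if __name__ == "__main__":
    for (first, second), total in reduce_bigram_counts(sys.stdin).items():
        print(first, second, total)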
Example No. 2
def stemWord(matchobj):
    # If necessary, keywords could be excluded from stemming, but the Porter stemmer doesn't affect them anyway.
    return stem(matchobj.group(0))
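The matchobj parameter indicates stemWord is meant as a regex substitution callback; a minimal usage sketch with re.sub (the pattern and the sample text are illustrative assumptions):

import re

# Hypothetical illustration: stem every alphabetic token in a string by
# passing stemWord as the replacement callback to re.sub.
text = "stemming stemmed words"
print(re.sub(r"[A-Za-z]+", stemWord, text))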
Example No. 3
import operator

from pyparsing import Word, alphas

# `stem`, `GetDbPostings`, `GetDbIdf`, and `GetDbDocs` are assumed to be
# defined elsewhere in the original module.

def Search(query, type="BM25"):
    """
    This method searches in the corpus using VSM or BM25
    :param query: query to search in the corpus
    :param type: VSM or BM25
    :return: list of doc ids ranked by their similarity to query
    """

    # get tokens from query
    q_grammar = Word(alphas)
    q_tf = {}
    q_length = 0
    for token, start, end in q_grammar.scanString(query):
        token = stem(str(token[0]).lower())
        q_tf[token] = q_tf.get(token, 0) + 1
        q_length += 1

    # index = Index()
    # vocabulary = index[0]
    # d_idf = index[1]
    # d_length = index[2]
    q_terms = q_tf.keys()
    vocabulary = GetDbPostings(q_terms)
    d_idf = GetDbIdf(q_terms)
    d_length = GetDbDocs(q_terms)

    q_score = 0
    d_score_list = {}
    first_pos = {}

    # Average document length over the retrieved documents.
    d_avg = 0.0
    for doc in d_length:
        d_avg += d_length[doc]

    d_count = len(d_length)
    d_avg = d_avg / d_count

    # get the collection's tf idf with the query's tf idf
    for term in q_terms:
        if term in d_idf:
            q_score = q_tf[term] * d_idf[term]
            for doc in vocabulary[term]:
                posting = vocabulary[term][doc]
                d_tf = len(posting)
                if type == "BM25":
                    k = 2.0
                    b = 0.75
                    d_score = (((k + 1) * d_tf) / (
                        (k *
                         (1 - b + b *
                          (d_count / d_avg))) + d_tf)) * d_idf[term] * q_score
                if type == "VSM":
                    d_score = d_tf * d_idf[term] * q_score
                if d_score_list.has_key(doc):
                    d_score_list[doc] += d_score
                else:
                    d_score_list[doc] = d_score

    #print q_score
    #print sorted(d_score_list.items(), key=operator.itemgetter(0))

    # normalize the weights of the documents
    if type == "VSM":
        for d in d_score_list:
            d_score_list[d] = d_score_list[d] / d_length[d]

    # sort the results descendingly
    sorted_scores = sorted(d_score_list.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    #print sorted(d_score_list.items(), key=operator.itemgetter(0))
    #print sorted_scores

    result = []
    for doc, weight in sorted_scores:
        result.append(str(doc))

    #print result

    return result
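A minimal usage sketch, assuming the backing index and the GetDb* helpers are in place (the query string is illustrative):

# Hypothetical call: rank documents for a free-text query with each model.
bm25_ranking = Search("information retrieval", type="BM25")
vsm_ranking = Search("information retrieval", type="VSM")
print(bm25_ranking[:10])  # ids of the ten highest-scoring documents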
Example No. 4
import functools
import math
import os
import pickle
import sqlite3
from collections import defaultdict

from pyparsing import Word, alphas

# `stem` is assumed to be provided elsewhere in the original module.

def Index(file_dataset="../../data/papers.csv",
          file_dump="../../data/derived/",
          id_col=0,
          text_col=-1,
          title_col=2):

    try:
        # Reuse a previously built index if the pickled dumps are present.
        collection = pickle.load(open(file_dump + "index.lol", "rb"))
        idf = pickle.load(open(file_dump + "idf.lol", "rb"))
        docs = pickle.load(open(file_dump + "doc_length.lol", "rb"))
        # print("Retrieved index from file " + file_dump)
    except Exception:
        # Otherwise rebuild the index from the database.
        collection = defaultdict(functools.partial(defaultdict, list))
        doc_nr = 0
        idf = {}
        docs = {}

        print("Building indexer...")

        #with open(file_dataset, "r") as paperscsv:
        #papersreader = csv.reader(paperscsv,delimiter=',')
        with sqlite3.connect(os.getcwd() +
                             '/../../data/database.sqlite') as database:
            cursor = database.cursor()
            cursor.execute('SELECT * FROM papers')
            for doc in cursor.fetchall():
                #for doc in papersreader:
                # skip the header
                #if doc_nr > 0:
                id = doc[id_col]
                title = doc[title_col]
                text = doc[text_col]
                # extract tokens from the title and the text
                token_pos = 0
                print "Examining doc: " + str(id)
                tokenized_text = Word(alphas).searchString(title)
                docs[id] = len(tokenized_text)
                for token in tokenized_text:
                    token = stem(str(token[0]).lower())
                    token_pos += 1
                    collection[token][id].append(token_pos)
                tokenized_text = Word(alphas).searchString(text)
                docs[id] += len(tokenized_text)
                for token in tokenized_text:
                    token = stem(str(token[0]).lower())
                    token_pos += 1
                    collection[token][id].append(token_pos)
                doc_nr += 1
                # if doc_nr > 10:
                #     break

        print "Calculating idf..."

        for term in collection.keys():
            idf[term] = math.log10(doc_nr / float(len(collection[term])))

        print "Dumping index..."

        pickle.dump(collection, open(file_dump + "index.lol", "wb"))

        print "Dumping idf..."

        pickle.dump(idf, open(file_dump + "idf.lol", "wb"))

        print "Dumping doc_length..."

        pickle.dump(docs, open(file_dump + "doc_length.lol", "wb"),
                    pickle.HIGHEST_PROTOCOL)

    return [collection, idf, docs]
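The commented-out lines in Search() above hint at how the return value is consumed; a minimal sketch, assuming the dataset and dump paths exist as given by the defaults:

# Hypothetical usage: build (or load) the index and unpack its three parts.
vocabulary, d_idf, d_length = Index()
print(len(vocabulary), "distinct terms indexed across", len(d_length), "documents")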