Exemplo n.º 1
0
def process(collection, reverseType):
    """Run every parsed query against ``collection`` and yield the results.

    Queries are obtained from ``parseQueries()``. For each query ``item`` the
    key is ``item[0]`` with trailing whitespace stripped, and the query terms
    are the concatenation of ``replacePunct(line[1:])`` over the remaining
    entries of ``item`` whose first element is ``"W"``. The queries are
    gathered in a dict, so when two queries share a key the later one wins.

    This is a generator: nothing is parsed or searched until it is iterated.

    Args:
        collection: Passed through unchanged to ``vectorial.main``.
        reverseType: Passed through unchanged to ``vectorial.main``.

    Yields:
        ``(index, terms, result)`` tuples where ``terms`` is the list of query
        terms and ``result`` is whatever
        ``vectorial.main(collection, reverseType, " ".join(terms))`` returns.
    """
    terms_by_index = {}
    for item in parseQueries():
        fragments = [replacePunct(line[1:])
                     for line in item[1:]
                     if line[0] == "W"]
        # from_iterable flattens the fragments without unpacking them into
        # call arguments.
        terms_by_index[item[0].rstrip()] = list(
            itertools.chain.from_iterable(fragments))

    # items() instead of iteritems(): iteritems() exists only on Python 2
    # dicts, while items() iterates the same pairs on Python 2 and Python 3.
    for index, terms in terms_by_index.items():
        yield (index, terms,
               vectorial.main(collection, reverseType, " ".join(terms)))
Exemplo n.º 2
0
def searchToQuery(search):
    """Turn a raw search string into a parsed query.

    The file ``../CACM/common_words`` (resolved against the current working
    directory) is read and passed through ``replacePunct``. The search string
    is split on whitespace, and both are handed to ``count_words``.

    Args:
        search: The search string; it is split with ``str.split()``.

    Returns:
        Whatever ``count_words`` returns -- per the original author's note,
        the parsed query with its words and their occurrences.
    """
    with open("../CACM/common_words", "r") as stopword_file:
        raw_stopwords = stopword_file.read()
    stopwords = replacePunct(raw_stopwords)
    query_terms = search.split()
    return count_words(stopwords, query_terms)
Exemplo n.º 3
0
def normalize_wiki(open_file):
    """Strip the XML markup from a document and clean the remaining text.

    Args:
        open_file: The XML document as a string. Despite the name, this value
            is handed straight to ``ElementTree.fromstring``, so it must be
            the document's content rather than a file object.

    Returns:
        The result of ``replacePunct`` applied to the concatenated text
        content of the parsed XML.
    """
    text = ''.join(ElementTree.fromstring(open_file).itertext())
    # Python 2 only: ``unicode`` text is decomposed (NFKD) and every character
    # that cannot be encoded as ASCII is dropped, so replacePunct always
    # receives a byte string. isinstance() is the idiomatic type check and,
    # unlike ``type(x) == unicode``, also accepts unicode subclasses.
    if isinstance(text, unicode):
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return replacePunct(text)
Exemplo n.º 4
0
import os
import json
from xml.etree import ElementTree
import unicodedata
from parseCollection import replacePunct, count_words


def normalize_wiki(open_file):
    """Strip the XML markup from a document and clean the remaining text.

    Args:
        open_file: The XML document as a string. Despite the name, this value
            is handed straight to ``ElementTree.fromstring``, so it must be
            the document's content rather than a file object.

    Returns:
        The result of ``replacePunct`` applied to the concatenated text
        content of the parsed XML.
    """
    text = ''.join(ElementTree.fromstring(open_file).itertext())
    # Python 2 only: ``unicode`` text is decomposed (NFKD) and every character
    # that cannot be encoded as ASCII is dropped, so replacePunct always
    # receives a byte string. isinstance() is the idiomatic type check and,
    # unlike ``type(x) == unicode``, also accepts unicode subclasses.
    if isinstance(text, unicode):
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return replacePunct(text)


# Script entry point: for every file under ../Wiki/<elem>/, strip the XML,
# count its words against the common-words list and dump the result as JSON
# to ../WIKIindexes/wikifreq/<elem>/<basename>.json.
if __name__ == "__main__":
    # The common-words file never changes during the run, so read it once
    # instead of re-opening it for every document.
    with open("../CACM/common_words", "r") as cw:
        common_words_text = cw.read()

    for elem in os.listdir("../Wiki"):
        print(elem)
        source_dir = "../Wiki/" + elem
        dump_dir_path = "../WIKIindexes/wikifreq/" + elem + "/"
        for i, filename in enumerate(os.listdir(source_dir)):
            if i % 1000 == 0:
                # 660000 is a hard-coded denominator -- presumably the expected
                # total number of documents (TODO confirm). Note that ``i``
                # restarts at 0 for each sub-directory of ../Wiki.
                print("%.2f%%" % (100. * i / 660000))

            # Close each input file as soon as it has been read; the script
            # walks a very large number of files and must not leak handles.
            with open(source_dir + "/" + filename, "r") as wiki_file:
                clean_file = normalize_wiki(wiki_file.read())

            # Rebuilt per document so count_words always receives a fresh
            # object, exactly as before.
            # NOTE(review): if count_words does not mutate its first argument,
            # this call can be hoisted next to the read above.
            common_words = replacePunct(common_words_text)
            freq = count_words(common_words, clean_file)

            if not os.path.exists(dump_dir_path):
                os.makedirs(dump_dir_path)
            # The output name is the input name up to its first "."
            with open(dump_dir_path + filename.split(".")[0] + ".json", "w") as dump:
                dump.write(json.dumps(freq, indent=2))