Example #1
 def __init__(self, device):
     super(PostagEmbedding, self).__init__(device=device)
     self.indexer = Indexer(
         special_tokens={
             '<s>': 0,
             '<unk>': 1,
             '<pad>': 2,
             '<\s>': 3,
             '<mask>': 4
         },
         with_del_stopwords=False)  # must always be False for the postag embedding
     datasets = Dataset().get_instance()
     sentences = [
         nltk.pos_tag(self.indexer.tokenize(pairs[0]))
         for pairs in datasets['train']
     ]
     sentences = [[pairs[1] for pairs in sentence]
                  for sentence in sentences]
     for sentence in sentences:
         self.indexer.add_sentence(sentence, with_raw=True)
     self.embedding_dim = 10
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #2
 def __init__(self):
     self.parameter = list()
     self.total = list()
     self.paramCount = 0
     self.type = 'string'
     self.actionIndexer = Indexer(self.type)
     self.featureIndexer = Indexer(self.type)
def indexDirs():
    click.echo("Indexing Files....")
    libraries = jhandler.getLibs()
    indxer = Indexer(libraries)
    nols = indxer.index()
    click.echo("{} library items detected!".format(nols))
    click.echo("Done Indexing Files....")
    def processFile(self):
        interpreter = Interpreter()

        # return all the documents present in the file
        output = self.path + '.bin'
        if isfile(output):
            print('loading tokens')
            self.index = pickle.load(open(output, 'rb'))
            self.indexer = Indexer(self.tokenizer, index=self.index)
        else:
            self.indexer = Indexer(self.tokenizer)
            file = open(self.path, 'r', encoding='utf-8', errors='ignore')
            maximum = os.stat(self.path).st_size
            # initialize the variables
            i = 0
            progress = 0
            document = []
            for line in file:
                progress += len(line)
                if line == '\n':
                    interpreter.process(self.indexer, document)
                    document = []
                else:
                    document += [line]
                i += 1
                if i >= 5000:
                    i = 0
                    log(progress, maximum)

            file.close()
            self.index = self.indexer.index
            print('\nsaving tokens')
            pickle.dump(self.index, open(output, 'wb'))
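The branch above is a compute-or-load cache: tokens are pickled to '<path>.bin' on the first run and reloaded on later runs. A minimal standalone sketch of that pattern, with the hypothetical build_index callable standing in for the tokenize-and-index work:

import os
import pickle

def load_or_build(path, build_index):
    # Load a pickled index from path + '.bin' if present, otherwise build and cache it.
    cache = path + '.bin'
    if os.path.isfile(cache):
        with open(cache, 'rb') as f:
            return pickle.load(f)
    index = build_index(path)  # the expensive step, e.g. tokenize and index the file
    with open(cache, 'wb') as f:
        pickle.dump(index, f)
    return index

# usage sketch: index = load_or_build('corpus.txt', lambda p: {'documents': 0})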
class AbsolutePositionalEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(AbsolutePositionalEmbedding, self).__init__(device=device)
        self.max_length = 150
        self.indexer = Indexer(special_tokens={
            '<s>': 0,
            '<unk>': 1,
            '<pad>': 2,
            '<\s>': 3,
            '<mask>': 4
        },
                               with_del_stopwords=self.with_del_stopwords)
        self.indexer.add_sentence(list(map(str, range(self.max_length))),
                                  with_raw=True)
        self.embedding_dim = 20
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        sentences = [self.indexer.tokenize(sentence) for sentence in sentences]
        sentences = [[str(i) for i, _ in enumerate(sentence)]
                     for sentence in sentences]
        indexes = [[self.indexer.get_index(word) for word in sentence]
                   for sentence in sentences]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.Tensor(pad_indexes).long().to(self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
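Stripped of the project-specific Indexer, the forward pass above looks up an embedding for each token position rather than for the token itself. A minimal plain-PyTorch sketch of the same idea (mapping position i to index i + 1 and padding to index 0 is an assumption made for this sketch):

import torch
import torch.nn as nn

max_length, embedding_dim = 150, 20
pos_embedding = nn.Embedding(num_embeddings=max_length + 1,
                             embedding_dim=embedding_dim,
                             padding_idx=0)

sentences = [['the', 'cat', 'sat'], ['hello', 'world']]
max_len = max(len(s) for s in sentences)
# position i of a real token -> index i + 1, padding positions -> index 0
pad_indexes = [[i + 1 for i in range(len(s))] + [0] * (max_len - len(s))
               for s in sentences]
vectors = pos_embedding(torch.tensor(pad_indexes, dtype=torch.long))
print(vectors.shape)  # torch.Size([2, 3, 20])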
Example #6
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    #prof1, nouns1 = get_profiles(ind, args.start)
    #prof2, nouns2 = get_profiles(ind, args.end)
   
    cur = ind.get_db_for_date(args.start)

    prof, nouns = get_profiles(ind, args.start)
    
    replys_v = set()
    for p in prof:
        replys_v |= set(prof[p].replys.keys())

    m = []
    for p in prof:
        m_i = []
        for r in replys_v:
            if r in prof[p].replys:
                m_i.append(prof[p].replys[r])
            else:
                m_i.append(0)
        m.append(m_i)

    logging.info("%s x %s" % (len(m), len(m[0])))

    u, s, v = numpy.linalg.svd(m, full_matrices=False)

    k = 50

    uk = numpy.transpose(numpy.transpose(u)[:k])
    sk = s[:k]

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute("create table if not exists noun_sim_svd as select * from noun_similarity limit 0")

    p_keys = prof.keys()

    sims = []
    for i in range(0, len(p_keys)):
        for j in range(i + 1, len(p_keys)):
            
            p1_ = map(lambda x: u[i][x] * sk[x] , range(0, k))
            p2_ = map(lambda x: u[j][x] * sk[x] , range(0, k))

            sim = numpy.dot(p1_, p2_) / (numpy.linalg.norm(p1_) * numpy.linalg.norm(p2_))
            sims.append((p_keys[i], p_keys[j], sim))
 
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []

                logging.info("Another 20k sims saved")

    save_sims(cur, sims)

    logging.info("done")
Example #7
def main():
    # Indexer Initialization
    indexer = Indexer()
    indexer.build_dictionary()
    # indexer.write_dict_to_file()

    # classify files
    run_classifier(indexer)
Example #8
def main():
    start = time.time()
    indexer = Indexer(logging.DEBUG)
    word_dict = indexer.get_normalized_fequency()
    cosine = CosineScorer(word_dict, "the cat and the dog jkhgdh", logging.DEBUG)
    score = cosine.get_score("httpwwwvimncompressnickseriessam uSam jkhgdh ampcat", "http://www.google.com")
    print score
    print str(time.time() - start) + " seconds"
Example #9
 def __init__(self):
     self._max_url_length = 100
     self._url_list = []
     self._title_list = []
     self._max_stay_on_site = 100
     self._current_on_site = 0
     self._previous_domain = None
     self._max_urls_in_list = 500
     self._max_new_urls_per_page = 100
     self._aggressive_pruning = True
     self._indexer = Indexer("localhost", 9200)
Example #10
    def __init__(self, numOfLayer):
        self.num = numOfLayer
        self.parent = []
        self.children = []
        self.handled = []
        self.Indexer = Indexer()
        self.Processor = Processor()
        self.Porter = PorterStemmer()
        self.db = []

        link = "http://www.cse.ust.hk/"
        self.parent.append(link)
Example #11
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    logging.info("Start")
    ind = Indexer(DB_DIR)

    grades = (1, 10, 100, 1000)
    data = [["date", "nouns", "tweets", "tweet_chains"] +
            map(lambda x: "cnt > %s" % x, grades)]
    print data

    dates = []
    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue
        cur = ind.get_db_for_date(date)

        tables = cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' and name = 'tweets_nouns'"
        ).fetchall()
        if len(tables) == 0:
            logging.error("No tweets_nouns for date %s" % date)
            continue

        stats.create_given_tables(cur, ["post_cnt"])
        post_cnt = cur.execute("select count(*) from post_cnt").fetchone()[0]
        if post_cnt == 0:
            cur.execute(
                "insert or ignore into post_cnt select noun_md5, count(*) from tweets_nouns group by noun_md5"
            )

        cnt = [date]
        nouns_cnt = cur.execute("select count(*) from nouns").fetchone()[0]
        cnt.append(nouns_cnt)
        tweets = cur.execute("select count(*) from tweets").fetchone()[0]
        cnt.append(tweets if tweets is not None else "~")
        tweet_chains = cur.execute(
            "select count(*) from tweet_chains").fetchone()[0]
        cnt.append(tweet_chains if tweet_chains is not None else "~")

        for i in grades:
            cnti = cur.execute(
                "select count(*) from (select 1 from post_cnt where post_cnt > %s group by post_md5)"
                % i).fetchone()[0]
            cnt.append("%.2f" % ((cnti + 0.0) / nouns_cnt))

        data.append(cnt)

    for row in data:
        print_cols(row)
Example #12
 def load(self):
     self.indexer = Indexer(self.posting_path)
     if self.to_stem:
         self.indexer.to_stem = True
     self.languages = self.indexer.load()
     self.avg_doc_length = self.indexer.docs_avg_length
     self.searcher = Searcher(self.main_path, self.posting_path,
                              self.indexer.terms_dict,
                              self.indexer.cities_dict,
                              self.indexer.docs_dict, self.avg_doc_length,
                              self.to_stem, self.with_semantics)
     self.searcher.model = Word2Vec.load(self.posting_path + '//model.bin')
Example #13
    def start(self):
        self.indexer = Indexer(self.posting_path)
        if self.to_stem:
            self.indexer.to_stem = True
        dirs_list = os.listdir(self.main_path + '\\corpus')
        # Create temp postings Multiprocessing
        dirs_dict = ParallelMain.start(self.main_path, self.posting_path,
                                       self.to_stem, dirs_list)
        # Merging dictionaries that were created by the processes
        docs = {}
        files_names = []
        post_files_lines = []
        total_length = 0
        for dir in dirs_dict.keys():
            tmp_docs_dict = dirs_dict[dir][2]
            for doc_id in tmp_docs_dict:
                docs[doc_id] = tmp_docs_dict[doc_id]
                total_length += docs[doc_id].length
            for lang in dirs_dict[dir][3]:
                self.languages.add(lang)
            old_post_files_lines = dirs_dict[dir][0]
            for i in range(0, len(old_post_files_lines)):
                files_names.append(dir + "\\Posting" +
                                   str(i) if not self.to_stem else dir +
                                   "\\sPosting" + str(i))
                post_files_lines.append(old_post_files_lines[i])

        self.avg_doc_length = total_length / len(docs)

        # Gets Cities that appear in the corpus
        i = 0
        while i < len(dirs_list):
            self.reader.read_cities(self.main_path + '\\corpus', dirs_list[i])
            i += 1

        terms_dicts = [
            dirs_dict["\\Postings1"][1], dirs_dict["\\Postings2"][1],
            dirs_dict["\\Postings3"][1], dirs_dict["\\Postings4"][1]
        ]

        terms_dict = Merge.start_merge(files_names, post_files_lines,
                                       terms_dicts, self.posting_path,
                                       self.to_stem)

        self.indexer.docs_avg_length = self.avg_doc_length
        self.indexer.terms_dict = terms_dict
        self.indexer.docs_dict = docs
        self.indexer.index_cities(self.reader.cities)
        self.indexer.post_pointers(self.languages)
    def __init__(self, data_path = ""):
        ''' Either read the Indexer info from files or generate it.
            If data_path is empty: read the previously saved Indexer files from self.indexer_path.
            Otherwise: data_path is the path to the data dir and the index is generated from it.'''

        self.createQueryDir()
        Indexer.__init__(self)
        index_timer_start = time.time()
        if len(data_path) == 0: # No need to calculate again
            self.read_files(self.indexer_path) # Indexer_Data
            print("Read index in {0} seconds".format(str(time.time() - index_timer_start)))
        else:
            self.handle_dir(data_path)
            self.create_tf_idf()
            print("Generated index in seconds: {0}".format(str(time.time() - index_timer_start)))
            self.save_indexer_to_files()
        self.inverse_doc_lookup = self.get_inverse_doc_lookup()
Example #15
    def __init__(self):

        DBCrawl.connect()
        DBUnCrawl.connect()
        DBRobot.connect()
        DBWebPage.connect()
        DBPageRank.connect()
        DBIndexer.connect()
        indexedCount.connect()
        #DBQuery.connect()

        self._getDBTables()
        self.indexer = Indexer()
        self.numberOfThreads = 1
        self._setNumOfThreads()
        self.crawlerObjs = []
        self._createCrawlerObjects()
Example #16
class StanfordTwitterEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(StanfordTwitterEmbedding, self).__init__(device=device)
        self.path = Path(
            '../data/models/glove.twitter.27B/glove.twitter.27B.200d.txt')
        with_raw_file = False
        if with_raw_file:
            with self.path.open('r', encoding='utf-8-sig') as f:
                texts = f.readlines()
            headers = [len(texts), None]
            vocab, weights = map(
                list,
                zip(*Parallel(n_jobs=10)
                    ([delayed(self.get_weights)(text) for text in texts])))
            with (self.path.parent / 'vocab.pkl').open('wb') as f:
                pickle.dump(vocab, f)
            with (self.path.parent / 'weights.pkl').open('wb') as f:
                pickle.dump(weights, f)
        else:
            with (self.path.parent / 'vocab.pkl').open('rb') as f:
                vocab = pickle.load(f)
            with (self.path.parent / 'weights.pkl').open('rb') as f:
                weights = pickle.load(f)

        self.indexer = Indexer(special_tokens={
            '<s>': 0,
            '<unk>': 1,
            '<pad>': 2,
            '<\s>': 3,
            '<mask>': 4
        },
                               with_del_stopwords=self.with_del_stopwords)
        for word in vocab:
            self.indexer.count_word(word)
            self.indexer.add_word(word)
        self.embedding_dim = len(weights[0])
        special_weights = [[0.0] * self.embedding_dim] * 5
        weights = torch.FloatTensor(special_weights + weights)
        self.embedding = nn.Embedding.from_pretrained(
            embeddings=weights, padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def get_weights(self, text):
        content = text.split(' ')
        return content[0], list(map(float, content[1:]))
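get_weights above splits one GloVe-format text line into a token and its float vector. A minimal self-contained sketch of turning a few such lines into a frozen nn.Embedding (the two toy vectors are made up, and the five zero rows mirror the special tokens declared above):

import torch
import torch.nn as nn

lines = ['hello 0.1 0.2 0.3', 'world 0.4 0.5 0.6']    # GloVe text format: word then floats
vocab, weights = [], []
for line in lines:
    parts = line.rstrip().split(' ')
    vocab.append(parts[0])
    weights.append([float(x) for x in parts[1:]])

embedding_dim = len(weights[0])
special_weights = [[0.0] * embedding_dim] * 5          # rows for <s>, <unk>, <pad>, <\s>, <mask>
embedding = nn.Embedding.from_pretrained(
    torch.FloatTensor(special_weights + weights), padding_idx=2)
print(embedding(torch.tensor([5, 6])).shape)           # vectors for the two real words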
 def __init__(self, device):
     super(AbsolutePositionalEmbedding, self).__init__(device=device)
     self.max_length = 150
     self.indexer = Indexer(special_tokens={
         '<s>': 0,
         '<unk>': 1,
         '<pad>': 2,
         '<\s>': 3,
         '<mask>': 4
     },
                            with_del_stopwords=self.with_del_stopwords)
     self.indexer.add_sentence(list(map(str, range(self.max_length))),
                               with_raw=True)
     self.embedding_dim = 20
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #18
 def __init__(self, device):
     super(NtuaTwitterEmbedding, self).__init__(device=device)
     self.path = Path('../data/models/ntua-slp-semeval2018/ntua_twitter_300.txt')
     with self.path.open('r', encoding='utf-8-sig') as f:
         texts = f.readlines()
     headers = texts[0].strip().split(' ')
     contents = [text.strip().split(' ') for text in texts[1:]]
     vocab = [content[0] for content in contents]
     weights = [list(map(float, content[1:])) for content in contents]
     self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4}, with_del_stopwords=self.with_del_stopwords)
     for word in vocab:
         self.indexer.count_word(word)
         self.indexer.add_word(word)
     self.embedding_dim = int(headers[1])
     special_weights = [[0.0] * self.embedding_dim] * 5
     weights = torch.FloatTensor(special_weights + weights)
     self.embedding = nn.Embedding.from_pretrained(embeddings=weights, padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #19
    def __init__(self, data_path=""):
        ''' Either read the Indexer info from files or generate it.
            If data_path is empty: read the previously saved Indexer files from self.indexer_path.
            Otherwise: data_path is the path to the data dir and the index is generated from it.'''

        self.createQueryDir()
        Indexer.__init__(self)
        index_timer_start = time.time()
        if len(data_path) == 0:  # No need to calculate again
            self.read_files(self.indexer_path)  # Indexer_Data
            print("Read index in {0} seconds".format(
                str(time.time() - index_timer_start)))
        else:
            self.handle_dir(data_path)
            self.create_tf_idf()
            print("Generated index in seconds: {0}".format(
                str(time.time() - index_timer_start)))
            self.save_indexer_to_files()
        self.inverse_doc_lookup = self.get_inverse_doc_lookup()
Example #20
 def __init__(self, device):
     super(RawEmbedding, self).__init__(device=device)
     self.indexer = Indexer(special_tokens={
         '<s>': 0,
         '<unk>': 1,
         '<pad>': 2,
         '<\s>': 3,
         '<mask>': 4
     },
                            with_del_stopwords=self.with_del_stopwords)
     datasets = Dataset().get_instance()
     sentences = [pairs[0] for pairs in datasets['train']]
     self.indexer.count_word_in_text(sentences)
     self.indexer.add_sentences(sentences)
     self.embedding_dim = 100
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #21
def do_query(query, idf_t):
    q_terms = set(ind.parse_html(query))

    db = sqlite3.connect("data/pages.db")
    cursor = db.cursor()

    page_ids = set()

    for term in q_terms:
        db_q = """SELECT indexTable.pageId FROM indexTable, freqTable
            WHERE freqTable.id = indexTable.termID
            AND freqTable.term = "{0}"
            """.format(term)
        cursor.execute(db_q)

        tmp_page_ids = set()
        if len(page_ids) == 0:
            for page_id in cursor.fetchall():
                page_ids.add(page_id[0])
        else:
            for page_id in cursor.fetchall():
                tmp_page_ids.add(page_id[0])
            page_ids = page_ids.intersection(tmp_page_ids)

    import time
    page_score = []
    i = 0.0

    for page_id in page_ids:
        t = time.time()
        db_q = """SELECT html FROM htmlParsed WHERE pageId = ?"""
        cursor.execute(db_q, (page_id,))
        times[0].append(time.time() - t)
        doc = str(cursor.fetchall()[0][0]).split()  # ind.parse_html(str(cursor.fetchall()[0][0]))
        times[1].append(time.time() - t)
        page_score.append((page_id, compare_page_query(doc, ind.parse_html(query), idf_t)))
        times[2].append(time.time() - t)
        print('{1:.2%} {0}'.format(page_id, 1.0/len(page_ids)*i), end='\r')
        i += 1

    db.close()
    return page_score
def interact(c):
    # Create a indexer object
    indexer = Indexer(root)
    while True:
        #  Get Directory details
        curr_directory, folders, files = indexer.get_dir_details()
        print "Current Directory :", curr_directory
        #  Send the details to the client

        c.send(curr_directory + "\n")
        for item in folders:
            c.send(item + '\n')
        c.send("\n")
        for item in files:
            c.send(item[0] + '\n' + str(item[1]) + "\n")
        c.send("/")
        # Receive response from client
        choice = c.recv(1024)

        # Is -1 disconnect
        if choice == "-1":
            print "Disconnecting from Client"
            return
        elif int(choice) <= len(
                folders):  # Change directory if folder is selected
            indexer.make_choice(int(choice))
        else:
            send_file(c, indexer.get_file_path(
                int(choice)))  #Send file if file is selected
    return
Example #23
def start_indexing(dirs_list, dirs_dicts, main_path, posting_path, to_stem,
                   start_index, end_index, directory):
    dirs_dicts[directory] = None
    reader = ReadFile()
    parser = Parse(main_path)
    indexer = Indexer(posting_path + directory)

    if to_stem:
        parser.to_stem = True
        indexer.to_stem = True
    if not os.path.exists(posting_path + directory):
        os.makedirs(posting_path + directory)

    documents = {}
    i = start_index
    while i < end_index:
        docs = reader.separate_docs_in_file(main_path + '\\corpus',
                                            dirs_list[i])
        j = 0
        for doc_id in docs:
            doc_dict = parser.main_parser(docs[doc_id].text, docs[doc_id])
            docs[doc_id].text = None
            if i == end_index - 1 and j == len(docs) - 1:
                indexer.finished_parse = True
            indexer.index_terms(doc_dict, doc_id)
            documents[doc_id] = docs[doc_id]
            j += 1
        i += 1
    dirs_dicts[directory] = [
        indexer.post_files_lines, indexer.terms_dict, documents,
        reader.languages
    ]
Example #24
    def crawl(current_url):
        print('Total in Queue', len(Crawler.queue), '| Total Crawled', len(Crawler.crawled))
        if '.vhd' not in current_url:
            try:
                with urllib.request.urlopen(current_url) as response:
                    html = response.read()

                soup = BeautifulSoup(html, "html.parser")
                print(" crawling", current_url)
                for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
                    href = link.get('href')
                    if href not in Crawler.queue and href not in Crawler.crawled:
                        Crawler.queue.add(href)
                Crawler.crawled.add(current_url)
                Crawler.queue.discard(current_url)
                Indexer.indexer(current_url, soup)
                Crawler.save_lists()
            except:
                print("ERROR", current_url)
                Crawler.queue.discard(current_url)
                Crawler.save_lists()
                pass
Example #25
def init():
    bot_id = '1437569240:AAEd2sZ0faC1EwPvQGJPPW4xf7ohP1hTzV8'
    updater = Updater(bot_id)
    updater.setPhotoHandler(imageHandler)

    QualityChecker.init()
    ShoeDetector.init()
    FeatureExtractor.init()
    data_structure = Indexer.build_data_structure(config.DATASET_PATH)
    Matcher.init(data_structure)

    print("Bot is running...")
    updater.start()
Example #26
def imageHandler(bot, message, chat_id, local_filename):
    bot.sendMessage(chat_id, "Hi, I'm processing your request")
    print("Processing request...")
    is_good_quality = QualityChecker.is_good_quality(
        Indexer.load_image(local_filename,
                           im_size=config.QUALITYCHECKER_IMSIZE))
    if not is_good_quality:
        bot.sendMessage(
            chat_id,
            "Your image is of a poor quality. Please, send me a better one")
        print("Message sent: image is of a poor quality.")
    else:
        is_shoe = ShoeDetector.classify_image(
            Indexer.load_image(local_filename,
                               im_size=config.CLASSIFIER_IM_SIZE))
        if not is_shoe:
            bot.sendMessage(
                chat_id,
                "Ops! Something went wrong... Make sure your image contains a shoe"
            )
            print("Message sent: the photo doesn't contain a shoe.")
        else:
            try:
                most_similar = Matcher.get_most_similar(
                    Indexer.load_image(local_filename))
                retrieved_images = Matcher.retrieve_items(most_similar)
                bot.sendMessage(chat_id,
                                "These are the most similar shoes I've found")
                for im in retrieved_images:
                    bot.sendImage(chat_id, config.DATASET_PATH + im, "")
                print("Most similar images sent.")
            except FeatureExtractionException:
                bot.sendMessage(
                    chat_id,
                    "I couldn't process your photo. Please, send me a better one"
                )
                print("Message sent: the photo can't be processed.")
    print("Request processed.")
Example #27
def main():
    indexer = Indexer()
    numDocs = 0

    for subdir, dirs, files in os.walk(
            r'C:\Users\Justin Ho\Documents\CS 121\developer\DEV'):
        for filename in files:
            filepath = subdir + os.sep + filename
            f = open(filepath)
            data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1

    sortedTokens = sorted(indexer.invertedIndex.items(),
                          key=lambda x: x[1]["tf-idf"],
                          reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(
        len(indexer.invertedIndex.keys())))
    file1 = open("index.txt", "a")
    for k, v in sortedTokens:
        file1.write("{}:{}\n".format(k, v))
    file1.close()
Example #28
class PostagEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(PostagEmbedding, self).__init__(device=device)
        self.indexer = Indexer(
            special_tokens={
                '<s>': 0,
                '<unk>': 1,
                '<pad>': 2,
                '<\s>': 3,
                '<mask>': 4
            },
            with_del_stopwords=False)  # must always be False for the postag embedding
        datasets = Dataset().get_instance()
        sentences = [
            nltk.pos_tag(self.indexer.tokenize(pairs[0]))
            for pairs in datasets['train']
        ]
        sentences = [[pairs[1] for pairs in sentence]
                     for sentence in sentences]
        for sentence in sentences:
            self.indexer.add_sentence(sentence, with_raw=True)
        self.embedding_dim = 10
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        if self.with_del_stopwords:
            postags = [
                nltk.pos_tag(self.indexer.tokenize(sentence))
                for sentence in sentences
            ]
            sentences = [[pairs[0] for pairs in postag] for postag in postags]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
            is_stopword = self.indexer.is_stopword(sentences)
            postags = [[tag for sw, tag in zip(stopword, postag) if sw != 1]
                       for stopword, postag in zip(is_stopword, postags)]
        else:
            postags = [
                nltk.pos_tag(self.indexer.tokenize(sentence))
                for sentence in sentences
            ]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
        indexes = [[self.indexer.get_index(tag) for tag in postag]
                   for postag in postags]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.Tensor(pad_indexes).long().to(self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
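Aside from the project-specific Indexer and Dataset, the class above embeds POS tags rather than words. A minimal sketch of that idea with hard-coded tag sequences (of the kind nltk.pos_tag produces) and plain PyTorch; the tag-to-index mapping and padding index here are assumptions made for this sketch:

import torch
import torch.nn as nn

# toy POS-tag sequences, as nltk.pos_tag would produce over tokenized sentences
tagged = [['DT', 'NN', 'VBD', 'IN', 'DT', 'NN'], ['NNS', 'VBP', 'RB']]

tag_to_index = {'<pad>': 0}
for tags in tagged:
    for tag in tags:
        tag_to_index.setdefault(tag, len(tag_to_index))

max_len = max(len(tags) for tags in tagged)
pad_indexes = [[tag_to_index[t] for t in tags] + [0] * (max_len - len(tags))
               for tags in tagged]

embedding = nn.Embedding(num_embeddings=len(tag_to_index), embedding_dim=10, padding_idx=0)
vectors = embedding(torch.tensor(pad_indexes, dtype=torch.long))
print(vectors.shape)  # torch.Size([2, 6, 10])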
Example #29
def probe_vocabs():
    datasets, tags = get_datasets()
    indexer = Indexer(with_preprocess=False)

    n_grams = [1, 2, 3]
    raw_texts = datasets
    multi_stats = {i: {
        'vocabs': {tag: set() for tag in tags},
        'counts': {tag: {} for tag in tags},
        'vocabs_by_labels': {tag: {'INFORMATIVE': set(), 'UNINFORMATIVE': set()} for tag in tags},
        'counts_by_labels': {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags},
        'ann_texts': {tag: [] for tag in tags},
        'del_texts': {tag: [] for tag in tags}
    } for i in n_grams}

    del_items = set(
        ['<hashtag>', '</hashtag>', '<allcaps>', '</allcaps>', '<user>', 'covid19', 'coronavirus', 'covid',
         '<number>', 'httpurl', 19, '19'])
    del_items |= set(["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", "<", ">", "(", ")", "/"])
    del_items |= set(nltk_stopwords.words('english'))
    for n_gram in n_grams:
        for tag in tags:
            for text, label in datasets[tag]:
                words = indexer.text_processor.pre_process_doc(text)
                label = get_label_text(label)
                multi_stats[n_gram]['ann_texts'][tag].extend([['_'.join(words[i: i+n_gram]) for i in range(0, len(words) - n_gram + 1)]])
                del_words = [word for word in words if word not in del_items]
                multi_stats[n_gram]['del_texts'][tag].extend([['_'.join(del_words[i: i+n_gram]) for i in range(0, len(del_words) - n_gram + 1)]])
                if n_gram != 0:
                    words = del_words
                for word in ['_'.join(words[i: i+n_gram]) for i in range(0, len(words) - n_gram + 1)]:
                    multi_stats[n_gram]['vocabs'][tag].add(word)
                    multi_stats[n_gram]['vocabs_by_labels'][tag][label].add(word)
                    if word in multi_stats[n_gram]['counts'][tag].keys():
                        multi_stats[n_gram]['counts'][tag][word] += 1
                    else:
                        multi_stats[n_gram]['counts'][tag][word] = 1
                    if word in multi_stats[n_gram]['counts_by_labels'][tag][label].keys():
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] += 1
                    else:
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] = 1
    return {'multi_stats': multi_stats, 'raw_texts': raw_texts}
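The nested dictionaries above implement plain n-gram counting; the counting step itself can be expressed with collections.Counter. A minimal sketch over already-tokenized texts:

from collections import Counter

def ngram_counts(tokenized_texts, n):
    # Count n-grams (joined with '_') across a list of token lists.
    counts = Counter()
    for words in tokenized_texts:
        counts.update('_'.join(words[i:i + n]) for i in range(len(words) - n + 1))
    return counts

texts = [['stay', 'home', 'stay', 'safe'], ['wear', 'a', 'mask']]
print(ngram_counts(texts, 2).most_common(3))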
Example #30
    def retrieve_items(sorted_rsv, n=3):
        """
        Given the sorted list of similar images, retrieve the n most similar images belonging to different classes

        :param sorted_rsv: a sorted np array. The first column contains scores, the second one contains image names
        :param n: the number of images to retrieve
        :return: list of retrieved image names
        """
        images = sorted_rsv[:, 1]
        image_classes_dict = Indexer.extract_classes()
        classes_already_retrieved = []
        retrieved = []
        for image in images:
            class_id = image_classes_dict[image]
            if class_id not in classes_already_retrieved:
                retrieved.append(image)
                classes_already_retrieved.append(class_id)
                if len(retrieved) == n:
                    break
        return retrieved
Example #31
def probe_sentence_length():
    datasets, tags = get_datasets()
    counts = {tag: {} for tag in tags}
    counts_by_labels = {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags}
    indexer = Indexer()
    for tag in tags:
        for text, label in datasets[tag]:
            words = indexer.text_processor.pre_process_doc(text)
            label = get_label_text(label)
            if len(words) in counts[tag].keys():
                counts[tag][len(words)].append(words)
            else:
                counts[tag][len(words)] = [words]

            if len(words) in counts_by_labels[tag][label].keys():
                counts_by_labels[tag][label][len(words)] += 1
            else:
                counts_by_labels[tag][label][len(words)] = 1

    return {'counts': counts, 'counts_by_labels': counts_by_labels}
Example #32
    def _build_index(self):
        """
        Takes the extracted terms and stop words and builds up the term frequency index and the 
        document frequency index.
        
        """
        print "Building index ..."        
        
        for website_and_terms in self._extracted_terms:
            website = website_and_terms[0]
            terms = website_and_terms[1]
            self._document_lengths[website] = len(terms)
    
        self._indexer = Indexer(self._extracted_terms, self._stopwords)
        index = self._indexer.buidlindex()
        
        self._document_frequency = index[0]
        self._term_frequency = index[1]
        self._extracted_terms = index[2]
        self._document_lengths = index[3]
        #pprint(self._term_frequency) 

        
        #print "  Document index:"
        #for term in sorted(self._document_frequency):
        #    print "   - " + term + ": " + str(self._document_frequency[term]) + " times"
        
        print
        print "  Term frequency:"
        for term in sorted(self._term_frequency):
            print "   - " + term + ":"
            print "      -   Document Frequency: "+ str(self._document_frequency[term])
            for document_and_count in self._term_frequency[term]:
            
                print "      - " + document_and_count[0] + ": " + str(document_and_count[1]) + " times"
        print
        print " Document Lengthes"
        pprint (self._document_lengths)
        print "Index build up."
Example #33
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:",
                                   ["collectionFile=", "tokenizerType="])
    except getopt.GetoptError:
        print(
            'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
        )
        sys.exit()

    if len(opts) != 2:
        print(
            'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
        )
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print(
                'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
            )
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            elif not path.isfile(arg):
                print('File doesn\'t exist')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print(
                    'Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.'
                )
                sys.exit()
            tokenizerType = arg

    indexer = Indexer(collectionFile, tokenizerType)
    indexer.listTermsInOneDoc()
    indexer.listHighestDocFreqTerms()
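The getopt handling above can be written more compactly with argparse, which provides -h and argument validation itself. A rough equivalent sketch, keeping the same Indexer calls as the snippet (Indexer is the project's class imported elsewhere, so this is not standalone):

import argparse
from os import path

parser = argparse.ArgumentParser()
parser.add_argument('-f', '--collectionFile', required=True)
parser.add_argument('-t', '--tokenizerType', required=True, choices=['0', '1'],
                    help='0 - Simple, 1 - Better')
args = parser.parse_args()

if not path.isfile(args.collectionFile):
    parser.error('Incorrect path to collection file.')

indexer = Indexer(args.collectionFile, args.tokenizerType)  # same Indexer class as above
indexer.listTermsInOneDoc()
indexer.listHighestDocFreqTerms()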
Example #34
def main(inputDir, outputDir):

    m = Indexer()
    files = os.listdir(inputDir)

    #for measuring elapsed time
    elapsed_time = []
    start = time.time()

    #Loop over all files in the given directory
    for file in files:
        if file.endswith(".html"):
            inputFile = os.path.join(inputDir, file)

            m.parse(inputFile)  #the tokenization happens inside this function
            m.mappings.append((m.doc_num, inputFile))

            end = time.time()
            elapsed_time.append(end - start)

    m.writeFiles(outputDir, N=len(m.mappings))
    print("Ran in {} seconds.".format(elapsed_time[-1]))
Example #35
class SearchEngine(object):
    """
    Manages crawler, indexer and page rank computer and provides a convenient method for querying
    the search index.
    
    The class is initialized with a set of seed URLs for the crawler and a list of stop words for
    the indexer. It will then run the crawler, indexer and page rank computer and provides a ready-
    to-use search index which can be queried using query().
    
    """

    # Words that are filtered out before term/document frequency calculation.
    _stopwords = []
    
    # Seed URLs for the web crawler, these are the URLs which the web crawler uses to start 
    # crawling.
    _seed_urls = []
    
    # The web crawler, starts with the seed URLs and build up the web graph and extracts terms from
    # the web sites.
    _crawler = None
    
    # Computes the page ranks for every website using the web graph.
    _page_rank_computer = None
    
    # The indexer that creates the term frequency and document frequency indexes.
    _indexer = None
    
    # The webgraph is a dictionary mapping websites to a list of outlinks (websites linked to by
    # that website).
    _webgraph = {}
    
    # A dictionary mapping websites to extracted terms.
    _extracted_terms = {}
    
    # A dictionary mapping websites to computed page ranks.
    _page_ranks = {}
    
    # A dictionary mapping terms to the number of documents they occur in.
    _document_frequency = {}
    
    # A dictionary mapping terms to a list of tuples of documents and the number of times the term
    # occurs in that document, e.g. { 'term' : [('document', 123), ('anotherdocument', 1234)] }
    _term_frequency = {}
    
    # A dictionary mapping websites to the length of their content (number of words).
    _document_lengths = {}
    
    def __init__(self, seed_urls, stopwords = None):
        """
        Initializes the search engine with the given seed urls and the given stop words and does the
        crawling, computes page ranks and builds up the index. 
        
        Args:
            seed_urls: The seed urls for the crawler.
            stopwords: The stop words for the indexer.
        
        """
        if stopwords is not None:
            self._stopwords = stopwords
        self._seed_urls = seed_urls
        
        self._do_crawling()
        self._compute_page_ranks()
        self._build_index()
        
    def _sanitize_query(self, query):
        """
        Sanitizes the query by lowercasing all characters, then creates a dictionary mapping query
        terms to occurrence count.
        
        Args:
            query: The search query, terms separated by whitespace.
            
        Returns:
            A dictionary mapping query terms to occurrence count.
        
        """
        query_terms = query.lower().split()
        
        terms = {}
        for term in query_terms:
            if term in terms:
                terms[term] = terms[term] + 1
            else:
                terms[term] = 1
        
        return terms
    
    def query(self, query):
        """
        Searches the index for every term (separated by whitespace), then sorts the 
        resulting documents by relevance using the cosine score algorithm and prints them.
        
        Args:
            query: The search query, terms separated by whitespace (all terms will be converted to
                   lowercase).
        
        """
        terms = self._sanitize_query(query)
        
        if not terms:
            print "No search terms entered."
            return
        
        documents = []
        scores = defaultdict(int)
        querryLength = 0
        for term in terms:
            
            if not term in self._term_frequency:
                continue
            
            # The inverse document frequency weight is a measure of informativeness of a term and 
            # is calculated by dividing the number of documents in the webgraph by the number of 
            # documents the term occurs in.
            #
            # idf = log10(number of documents in webgraph/number of documents containing term)
            
            idf = log10(len(self._document_frequency) / self._document_frequency[term])
                        
            # The weight of a term in the query is the product of the term frequency weight and the
            # inverse document frequency weight.
            #
            # tqw = (1 + log10(term frequency in the query)) * idf
            
            term_query_weight = (1 + log10(terms[term])) * idf;
            querryLength = querryLength + pow(term_query_weight, 2)
        
        for term in terms:
            documents_containing_term = []
            
            if not term in self._term_frequency:
                continue
            
            # The inverse document frequency weight is a measure of informativeness of a term and 
            # is calculated by dividing the number of documents in the webgraph by the number of 
            # documents the term occurs in.
            #
            # idf = log10(number of documents in webgraph/number of documents containing term)
            
            idf = log10(len(self._document_frequency) / self._document_frequency[term])
                        
            # The weight of a term in the query is the product of the term frequency weight and the
            # inverse document frequency weight.
            #
            # tqw = (1 + log10(term frequency in the query)) * idf
            
            term_query_weight = (1 + log10(terms[term])) * idf;
            term_document_weights = {}
            
            for document_and_count in self._term_frequency[term]:
                document, count,tfidf = document_and_count
                    
                documents_containing_term.append(document)
                    
                # The weight of a term in the document is the product of the weighted term frequency
                # and the inverse document frequency weight.
                #
                # tdw = (1 + log10(frequency of the term in the document)) * idf
                    
                term_document_weights[document] = tfidf
            
            # Merge documents containing the term with the result list.
            
            documents = list(set(documents + documents_containing_term))
            
            # Add the product of the term query weight and the term document weight to each 
            # document.
            
            for document in documents_containing_term:
                score = scores[document] + (term_query_weight * term_document_weights[document])
                scores[document] = score
        
        # Divide the score of each document d by the length of document d, so that longer and 
        # shorter documents have scores in the same order of magnitude. 

        for doc in scores:
            scores[doc] = scores[doc] / (self._document_lengths[doc] *sqrt(querryLength))
        
        print 
        
        if not documents:
            print ("No documents match your search terms (\"" 
                   "" + ', '.join(str(term) for term in terms) + "\").")
            return
        
        print "Results:"
        
        for document in sorted(documents, 
                               key = lambda url : self._page_ranks[url] * scores[url], 
                               reverse = True):
            print "  - " + document
            print ("      (Score: " + str(scores[document]) + ""
                   ", PageRank: " + str(self._page_ranks[document]) + ""
                   ", Combined: " + str(self._page_ranks[document] * scores[document]) + ")")
    
    def _do_crawling(self):
        """
        Initializes the crawler with the seed urls and starts crawling, then stores the resulting
        webgraph and the extracted terms in the attributes.
        
        Also counts the extracted words in every website and stores each website's length in the 
        document_lengths attribute.
        
        """
        
        print "Starting crawler ..."
        print "  Seed URLs: "
        
        for url in self._seed_urls:
            print "   - " + url
        
        self._crawler = Crawler(self._seed_urls)
        results = self._crawler.startCrawling()
        
        self._webgraph = results[0]
        self._extracted_terms = results[1]

        
        print "  Web graph: "
        for url in self._webgraph.keys():
            print "   - " + url
            for outlink in self._webgraph[url]:
                print "     -> " + outlink
        
        #print "  Extracted terms: "
        #for website in self._extracted_terms:
        #    print "   - " + website[0] + ": "
        #    print ', '.join(str(token) for token in website[1])
        
        print "Crawler finished."
        print
        
    def _compute_page_ranks(self):
        """
        Initializes the page rank computer with the webgraph and computes the page ranks.
        
        """
        print "Computing page ranks ..."

        self._page_rank_computer = Computer(self._webgraph)
        self._page_rank_computer.dampening_factor = 0.95
        self._page_rank_computer.compute()
        self._page_ranks = self._page_rank_computer.page_ranks
        
        print "  Page ranks:"
        
        result_sum = 0
        for website in sorted(self._page_ranks.keys()):
            result_sum += self._page_ranks[website]
            
            print "   - " + website + ": " + str(self._page_ranks[website])
        
        #print
        #print "  Sum: " + str(result_sum)
        
        print "Page ranks computed."
        print
        
    def _build_index(self):
        """
        Takes the extracted terms and stop words and builds up the term frequency index and the 
        document frequency index.
        
        """
        print "Building index ..."        
        
        for website_and_terms in self._extracted_terms:
            website = website_and_terms[0]
            terms = website_and_terms[1]
            self._document_lengths[website] = len(terms)
    
        self._indexer = Indexer(self._extracted_terms, self._stopwords)
        index = self._indexer.buidlindex()
        
        self._document_frequency = index[0]
        self._term_frequency = index[1]
        self._extracted_terms = index[2]
        self._document_lengths = index[3]
        #pprint(self._term_frequency) 

        
        #print "  Document index:"
        #for term in sorted(self._document_frequency):
        #    print "   - " + term + ": " + str(self._document_frequency[term]) + " times"
        
        print
        print "  Term frequency:"
        for term in sorted(self._term_frequency):
            print "   - " + term + ":"
            print "      -   Document Frequency: "+ str(self._document_frequency[term])
            for document_and_count in self._term_frequency[term]:
            
                print "      - " + document_and_count[0] + ": " + str(document_and_count[1]) + " times"
        print
        print " Document Lengthes"
        pprint (self._document_lengths)
        print "Index build up."
 def __init__(self, seed_URL):
     self.seed_URL = seed_URL
     self.indexer = Indexer("people", "person")
     self.__walk()
Example #37
from Analyzer import Analyzer

from java.io import StringReader
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.util import Version
from org.apache.lucene.search import Explanation

lucene.initVM()


analyzer = Analyzer(Version.LUCENE_CURRENT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

indexer = Indexer(config, '/home/hnguyen/Projects/CLIFinder/cli.index')
indexer.index('/home/hnguyen/Projects/CLIFinder/cli')

searcher = Searcher(analyzer, '/home/hnguyen/Projects/CLIFinder/cli.index')

while True:
	strQuery = raw_input("Query:")
	if strQuery == '':
		sys.exit(1)

	docs, query = searcher.search(strQuery, 'content', 'name')

	print '"%s" has %s result(s)' % (strQuery, len(docs))
	for d in docs:
		print 'Score: %s \nFile: %s \nDesc: %s \n' % (d.score, searcher.mIndexSearcher.doc(d.doc).get('name'), searcher.mIndexSearcher.doc(d.doc).get('content'))
Example #38
 def __init__(self,root):
     self.store = gtk.ListStore(str, str, gtk.gdk.Pixbuf, str)
     self.root  = os.path.abspath(root)
     self.index = Indexer(self.root)
class Crawler(object):

    __resourcesQueue = set()

    def __init__(self, seed_URL):
        self.seed_URL = seed_URL
        self.indexer = Indexer("people", "person")
        self.__walk()

    def __walk(self):
        #Extract the resources from the seed URL
        self.__resourcesQueue |= self.__extractResources(self.seed_URL)

        #Extract the people from the seed URL
        self.__extractPeople(self.seed_URL)


        while ((len(self.__resourcesQueue) != 0)):
            resource_url = self.__resourcesQueue.pop()

            # Find the linked resources from this resource
            self.__resourcesQueue |= self.__extractResources(resource_url)

            self.__extractPeople(resource_url)


    def __extractResources(self, resource_url):
        print "Looking for resources in %s" % resource_url

        resources_query = """
        PREFIX foaf:<http://xmlns.com/foaf/0.1/>
        PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        SELECT ?resource
        WHERE {
            <%s> ?p ?resource
        }""" % resource_url

        resources = set(self.__run_query(resources_query, 'Resources'))

        print "Found %s resources" % len(resources)

        return resources


    def __extractPeople(self, resource_url):
        print "Looking for people in %s" %resource_url
        people_query = """
        PREFIX foaf:<http://xmlns.com/foaf/0.1/>
        PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?person, ?label, ?mentioned_by
        WHERE {
            <%s> ?p ?person .
            ?person rdf:type foaf:Person .
            ?person rdfs:label ?label .
            ?mentioned_by ?pred ?person
        }""" % resource_url

        people = self.__run_query(people_query, 'People', resource_url)

        print "Found %s people" % len(people)


        # Index the returned people object
        if (len(people) != 0):
            for person in people:
                # Remove duplicate entries in the mentioned_by field
                people[person]['mentioned_by'] = list(people[person]['mentioned_by'])
                self.indexer.index(person, people[person])

            print "Done indexing batch"

    def __run_query(self, query, type, resource_url=None):

        query_result_list = []
        people_json = {}
        people_found =  set()

        try:
            # Set the SPARQL endpoint to run queries against
            sparql = SPARQLWrapper("http://dbpedia.org/sparql")
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)

            results = sparql.query().convert()

            if (type == 'Resources'):

                for result in results['results']['bindings']:
                    # Only follow resources that are URIs
                    if (result['resource']['type'] == 'uri'):
                        query_result_list.append(result['resource']['value'])

                return query_result_list


            if (type == 'People'):

                for result in results['results']['bindings']:

                    people_found.add(result['person']['value'])

                # For each distinct person found, create an object for them
                for person in people_found:
                    people_json[person] = {
                        'uri': person,
                        'mentioned_by': set()
                    }

                for result in results['results']['bindings']:
                    # For each result, get the label and append to the mentioned_by array
                    person_uri = result['person']['value']
                    person_label = result['label']['value']
                    label_lang = result['label']['xml:lang']
                    person_mentioned_by = result['mentioned_by']['value']

                    if (label_lang == 'en'):
                        people_json[person_uri]['label'] = person_label
                    people_json[person_uri]['mentioned_by'].add(person_mentioned_by)

                return people_json

        except Exception:
            if (type == 'Resources'):
                return query_result_list
            elif (type == 'People'):
                return people_json
Example #40
        for i in postings1:
            for k in postings2:
                if i == k:
                    intersect.append(i)
        """

        """
        intersect = set(postings[0]).intersection(*postings)
        """

        

        return intersect


x = Indexer()

lines = file('tweets.txt').read().split('\n')
for r in range(1):
    for i in range(1000):
        x.process(docId = i+r*10000, text = lines[i])
#
#for i in x.data.items():
#    print i

#shelf = shelve.open('dump.txt')
#shelf['indexes'] = x.data

#f = file('dump.txt', 'w')
#for i in x.data.items():
#    f.write(str(i) + '\n')
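The commented-out blocks near the top of this example sketch posting-list intersection, once as a nested loop and once with sets; as a standalone function over per-term posting lists the set version is simply:

def intersect_postings(postings):
    # Return the document ids present in every posting list.
    if not postings:
        return set()
    result = set(postings[0])
    for p in postings[1:]:
        result &= set(p)
    return result

print(intersect_postings([[1, 2, 3, 7], [2, 3, 9], [3, 2]]))  # {2, 3}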
Example #41
File: crawl.py Project: noxerit/cms
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]

def printWebGraph(webGraph):
    print
    print '-*( Web Graph )*-'
    print
    for entry in sorted(webGraph.keys()):
        print entry + ' -> ' + ', '.join(webGraph[entry])

def printIndex(index):
    print
    print '-*( Indices )*-'
    print
    for term,occurences in sorted(index.iteritems()):
        print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
        print re.sub('(u)?\'', '', str(occurences))
Example #42
class IndexerTest(unittest.TestCase):
    def setUp(self):
        self.store_mock = IndexStoreMock()
        self.tokenizer_mock = TokenizerMock()
        self.indexer = Indexer(self.store_mock, self.tokenizer_mock)

    def test_term_document_frequency(self):
        # Arrange
        term = "foo"
        document = uuid.uuid4()

        # Act
        self.indexer.term_document_frequency(document, term)

        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("term_document_frequency"))
        arguments =  self.store_mock.get_arguments("term_document_frequency")
        self.assertEqual(document, arguments[0])
        self.assertEqual(term, arguments[1])

    def test_document_frequency_normalized(self):
        # Arrange
        term = "foo"
        document_frequency = 22
        num_documents = 100
        self.store_mock.set_document_frequency(document_frequency)
        self.store_mock.set_num_documents(num_documents)

        # Act
        result = self.indexer.document_frequency_normalized(term)

        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("document_frequency"))
        document_frequency_args = self.store_mock.get_arguments("document_frequency")
        self.assertEqual(term, document_frequency_args[0])

        self.assertEqual(1, self.store_mock.num_method_calls("num_documents"))

        self.assertEqual(result, 0.22)

    def test_index_empty_text(self):
        # Arrange
        document = uuid.uuid4()
        text = ""
        self.tokenizer_mock.set_tokens([])

        # Act
        self.indexer.index(text, document)

        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])

        self.assertFalse(self.store_mock.was_called("add"))

    def test_index_one_token(self):
        # Arrange
        document = uuid.uuid4()
        text = "foo"
        self.tokenizer_mock.set_tokens([text])

        # Act
        self.indexer.index(text, document)

        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])

        self.assertEqual(1, self.store_mock.num_method_calls("add"))
        add_arguments = self.store_mock.get_arguments("add")
        self.assertEqual(document, add_arguments[0])
        self.assertEqual(text, add_arguments[1])

    def test_index_two_tokens(self):
        # Arrange
        document = uuid.uuid4()
        tokens = ["foo", "bar"]
        text = " ".join(tokens)
        self.tokenizer_mock.set_tokens(tokens)

        # Act
        self.indexer.index(text, document)

        # Assert
        self.assertEqual(1, self.tokenizer_mock.num_method_calls("tokenize"))
        tokenize_arguments = self.tokenizer_mock.get_arguments("tokenize")
        self.assertEqual(text, tokenize_arguments[0])

        self.assertEqual(2, self.store_mock.num_method_calls("add"))

        add_arguments1 = self.store_mock.get_arguments("add", 1)
        self.assertEqual(document, add_arguments1[0])
        self.assertEqual(tokens[0], add_arguments1[1])

        add_arguments2 = self.store_mock.get_arguments("add", 2)
        self.assertEqual(document, add_arguments2[0])
        self.assertEqual(tokens[1], add_arguments2[1])

    def test_get_posting_list(self):
        # Arrange
        term = "foo"

        # Act
        self.indexer.get_posting_list(term)

        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("posting_list"))
        arguments = self.store_mock.get_arguments("posting_list")
        self.assertEqual(term, arguments[0])

    def test_get_terms(self):
        # Arrange
        terms = {"foo", "bar"}
        document = uuid.uuid4()

        self.store_mock.set_terms(terms)

        # Act
        result = self.indexer.get_terms(document)

        # Assert
        self.assertEqual(1, self.store_mock.num_method_calls("get_terms"))
        arguments = self.store_mock.get_arguments("get_terms")
        self.assertEqual(document, arguments[0])
        self.assertEqual(terms, result)
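
These tests drive the Indexer entirely through hand-rolled recording mocks; the IndexStoreMock and TokenizerMock implementations are not part of this excerpt. Two details are worth noting: get_arguments("add", 2) in test_index_two_tokens implies 1-based call numbering, and the expected 0.22 in test_document_frequency_normalized implies document_frequency / num_documents (22 / 100). A minimal sketch of the recording pattern such mocks might implement (the class names and internals below are assumptions, not the project's actual code):

class RecordingMock(object):
    """Call-recording base: remembers each call's arguments per method name."""

    def __init__(self):
        self._calls = {}  # method name -> list of recorded argument tuples

    def _record(self, name, *args):
        self._calls.setdefault(name, []).append(args)

    def was_called(self, name):
        return name in self._calls

    def num_method_calls(self, name):
        return len(self._calls.get(name, []))

    def get_arguments(self, name, call=1):
        # call numbers are 1-based, matching get_arguments("add", 2) above
        return self._calls[name][call - 1]

class TokenizerMockSketch(RecordingMock):
    """Sketch of a tokenizer mock: returns whatever set_tokens() configured."""

    def set_tokens(self, tokens):
        self._tokens = tokens

    def tokenize(self, text):
        self._record("tokenize", text)
        return self._tokens
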
Example #43
0
 def setUp(self):
     self.store_mock = IndexStoreMock()
     self.tokenizer_mock = TokenizerMock()
     self.indexer = Indexer(self.store_mock, self.tokenizer_mock)
Example #44
0
import os
import re
import cgi
import urllib

import gtk  # PyGTK 2.x; provides gtk.gdk and the stock icon constants
# Indexer, GeneratorTask and NoDirException come from the surrounding project;
# their import paths are not part of this excerpt.

class FileManager:
    COL_TITLE  = 0
    COL_PATH   = 1
    COL_PIXBUF = 2
    COL_TYPE   = 3

    store = None
    root  = None
    index = None

    thumbnailer = None

    def __init__(self,root):
        self.store = gtk.ListStore(str, str, gtk.gdk.Pixbuf, str)
        self.root  = os.path.abspath(root)
        self.index = Indexer(self.root)

    def search(self,query):
        """ Search the index for the given query

            query needs to be something the Whoosh query parser can parse,
            otherwise Whoosh exceptions are bubbled up
        """
        self.stop_thumbnailer()
        self.store.clear()
        results = self.index.search(unicode(query))

        for fields in results:
            title = fields['title']
            if title == '':
                title = os.path.basename(fields['path'])

            self.store.append([
                title,
                fields['path'],
                self.get_icon(gtk.STOCK_FILE),
                'jpg'])
        self.start_thumbnailer()


    def browse(self,folder):
        """ Browse the given folder

        folder needs to exist and be relative to the library root,
        otherwise a NoDirException is thrown
        """
        self.stop_thumbnailer()
        self.store.clear()

        folder = folder.replace('..', '')
        full = os.path.join(self.root, folder)
        full = os.path.abspath(full)
        imgre = re.compile(r'\.jpe?g$', re.IGNORECASE)

        if not os.path.isdir(full):
            raise NoDirException("No such directory in library: " + folder)

        # add upper dir
        if folder:
            upper = os.path.dirname(folder)
            self.store.append([
                        '..',
                        upper,
                        self.get_icon(gtk.STOCK_GO_UP),
                        'dir'])

        for fl in os.listdir(full):
            if fl[0] == '.':
                continue  # skip hidden files

            fn    = os.path.join(full, fl)
            rel   = os.path.relpath(fn, self.root)
            title = os.path.basename(fn)

            if os.path.isdir(fn):
                self.store.append([
                    title,
                    rel,
                    self.get_icon(gtk.STOCK_DIRECTORY),
                    'dir'])
            elif imgre.search(fn):
                self.store.append([
                    title,
                    rel,
                    self.get_icon(gtk.STOCK_FILE),
                    'jpg'])
        self.start_thumbnailer()


    def start_thumbnailer(self):
        """ Start thumbnailing for the current ListStore

            Thumbnailing is done in a separate thread
        """
        self.stop_thumbnailer()
        self.thumbnailer = GeneratorTask(self._create_thumbnails)
        self.thumbnailer.start()

    def stop_thumbnailer(self):
        """ Stop any running thumbnailer

            Always call this before the ListStore is cleared!
        """
        if self.thumbnailer is not None:
            self.thumbnailer.stop()
            self.thumbnailer.wait()
            self.thumbnailer = None

    def _create_thumbnails(self):
        """ The thumbnailing process

            FIXME: reading additional image info from EXIF might be
            sensible here
        """
        for row in self.store:
            path  = row[self.COL_PATH]
            ftype = row[self.COL_TYPE]
            fn    = os.path.join(self.root,path)
            if ftype == 'jpg':
                buf = gtk.gdk.pixbuf_new_from_file_at_size(fn, 48, 48)
                row[self.COL_PIXBUF] = buf
            yield None


    def get_itemat(self,pos):
        """ Return the item at the given postion """
        row = self.store[pos];
        path  = row[self.COL_PATH]
        ftype = row[self.COL_TYPE]
        fn    = os.path.join(self.root,path)
        return {'fn':fn, 'ft': ftype}

    def get_nextimagepos(self,pos):
        """ Get the position of the next image (not dir) after the given
            position. If the given positon is None, the search sats at the
            beginning of the store

            FIXME there is probably a much more elegant way doing the
            whole iteration stuff, but I can't figure it out
        """
        if pos is None:
            pos = 0
        else:
            pos = pos + 1

        try:
            rowiter = self.store.get_iter(pos)
            while rowiter is not None:
                if self.store.get_value(rowiter, self.COL_TYPE) == 'jpg':
                    return self.store.get_path(rowiter)[0]
                # iter_next() returns the following iterator (or None); the
                # result must be reassigned or this loop never advances
                rowiter = self.store.iter_next(rowiter)
        except ValueError:  # we're out of range
            pass
        return None

    def get_previmagepos(self,pos):
        """ Get the position of the next image (not dir) before the given
            position.

            FIXME there is probably a much more elegant way. And I have no
            idea how to iterate backwards anyway
        """
        while pos >= 0:
            pos -= 1
            try:
                rowiter = self.store.get_iter(pos)
                if self.store.get_value(rowiter, self.COL_TYPE) == 'jpg':
                    return self.store.get_path(rowiter)[0]
            except ValueError:  # we're out of range
                pass
        return None

    def get_icon(self, name):
        """ Helper to load a stock icon """
        theme = gtk.icon_theme_get_default()
        return theme.load_icon(name, 48, 0)

    def get_tagcloudstring(self):
        tags = self.index.tagcloud()

        cloud = ''
        for tag in sorted(tags):
            cloud += '<a href="%s"><span size="%d" underline="none" foreground="black">%s</span></a> ' % (
                         urllib.quote('"' + tag + '"'),
                         (10 + (tags[tag] * 5)) * 1000,
                         cgi.escape(tag))

        return cloud
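
A rough usage sketch of the class above (the library path and the query string are placeholders, and a working PyGTK environment with a Whoosh index is assumed):

fm = FileManager('/path/to/photo/library')   # placeholder library root
fm.browse('')                                # list the library root folder
pos = fm.get_nextimagepos(None)              # position of the first image, if any
if pos is not None:
    print fm.get_itemat(pos)                 # e.g. {'fn': '/abs/path/img.jpg', 'ft': 'jpg'}
fm.search(u'title:holiday')                  # any query Whoosh's parser accepts
fm.stop_thumbnailer()                        # stop the background thread before exiting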