示例#1
0
    def test_idf(self):
        """Check idf("dog") on the current index, then document_count("first") on ap8889."""
        dog_idf = self.index_.idf("dog")
        self.assertEqual(dog_idf, 1.0986122886681098)

        # Point the shared parameters at the ap8889 repository and rebuild
        # the index from them before the second assertion.
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        # NOTE(review): this second check uses document_count, not idf.
        first_count = self.index_.document_count("first")
        self.assertEqual(first_count, 0)
示例#2
0
 def info(self, info='', col_bits=5, pagenum=100):
     """Crawl, preprocess, index and search pages for ``info``, then describe the result.

     :param info: text to process; it is segmented and its top keywords extracted.
     :param col_bits: forwarded unchanged to ``Location``.
     :param pagenum: forwarded unchanged to ``SpiderLink.crawl``.
     :return: whatever ``Location.describe`` returns for the keyword-derived name.
     """
     keywords = PreDeal.seg(info)
     # 1. Keyword extraction
     keys = jieba.analyse.textrank(
         info, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
     # 2. Use the search engine to crawl related web pages
     # 2.1 Collect the links
     link_spider = SpiderLink(keys, self.root)
     link_spider.crawl(pagenum)
     # 2.2 Fetch the page contents
     tag = '_'.join(keys)  # shared base name for every artefact of this run
     page_spider = SpiderTo(tag + '.html')
     page_spider.crawl()
     # 3. Text preprocessing: de-duplicate, drop stop words, segment,
     #    keep the url and keyword sets
     preprocessor = PreDeal()
     raw_dir = os.path.join(config.spidertext, tag)
     prepared_dir = os.path.join(config.prepapath, tag)
     preprocessor.savetexts(filepath=raw_dir, prepath=prepared_dir)
     # 4. Build the index and search it for pages containing the keywords
     # 4.1 Index construction
     index_dir = os.path.join(config.indexpath, tag)
     Index.build(datapath=prepared_dir, indexpath=index_dir)
     searcher = Search(keys=keys, pindexp=index_dir)
     # 4.2 Search and save
     # NOTE(review): ``keywords`` is passed here before the copy below is
     # taken; confirm retrieve() does not consume the list.
     searcher.retrieve(keywords=keywords)
     # 5. Pick the best page, describe the location information, encode
     locator = Location(keywords=keywords[:], col_bits=col_bits)
     return locator.describe('_'.join(keys))
示例#3
0
    def test_term_count(self):
        """Check term_count on the current index and again on the ap8889 repository."""
        dog_count = self.index_.term_count("dog")
        self.assertEqual(dog_count, 2)

        # Rebuild the index from the parameters after switching repo_dir.
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_count = self.index_.term_count("emotional")
        self.assertEqual(emotional_count, 3515)
示例#4
0
    def test_document_count(self):
        """Check document_count on the current index and again on the ap8889 repository."""
        dog_docs = self.index_.document_count("dog")
        self.assertEqual(dog_docs, 1)

        # Rebuild the index from the parameters after switching repo_dir.
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_docs = self.index_.document_count("emotional")
        self.assertEqual(emotional_docs, 2973)
示例#5
0
class SimpleDocument:
    """Turn a text file into a list of normalised words checked against an index."""

    def __init__(self, parameters):
        """Create the backing ``Index`` and load the NLTK English stop-word set."""
        self.index_ = Index(parameters)
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def replace_stemmed_similar_words_list(self, l):
        """Replace, in place, each later word sharing a stem with an earlier one.

        For every position, all following words for which the index reports
        the same stem are overwritten with the word at that position.
        The same (mutated) list is returned.
        """
        size = len(l)
        for i, anchor in enumerate(l):
            for j in range(i + 1, size):
                if self.index_.check_if_have_same_stem(anchor, l[j]):
                    l[j] = anchor
        return l

    def remove_non_existent_words_in_repo(self, l):
        """Return a new list holding only the words the index reports as existing."""
        exists = self.index_.check_if_exists_in_index
        return [word for word in l if exists(word)]

    def get_words(self, doc_file_name):
        """Read ``doc_file_name`` and return its filtered, normalised word list.

        Steps, in order: tokenise on ``\\w+``, lower-case, drop stop words,
        drop words missing from the index, unify words sharing a stem, then
        keep only alphabetic words longer than two characters.
        """
        with open(doc_file_name, 'r') as f:
            text = f.read()
        tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
        words = [
            word for word in (token.lower() for token in tokens)
            if word not in self.stop_words
        ]
        words = self.remove_non_existent_words_in_repo(words)
        words = self.replace_stemmed_similar_words_list(words)
        return [word for word in words if word.isalpha() and len(word) > 2]
示例#6
0
class GensimCorpus(TextCorpus):
    """Text corpus whose input file is generated from the documents of an ``Index``.

    The file named by ``parameters.params["lda"]["file_name"]`` holds one
    line per document. It is written on first use and reused afterwards.
    """

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Intentionally do nothing; all arguments are ignored."""
        pass

    def __init__(self, parameters):
        """Set up tokenizer, stop words and index, then initialise the base corpus.

        The collection file is created (if missing) before it is handed to
        the base-class constructor as its input.
        """
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.parameters = parameters
        self.index_ = Index(self.parameters)
        self.store_collection_if_not_exists()
        input_ = self.parameters.params["lda"]["file_name"]
        super().__init__(input_)

    def read_collection(self):
        """Return the whole collection as text, one processed document per line.

        Documents are fetched by id from 1 to ``index_.total_count()``
        inclusive. Each one is tokenised, reduced to lower-cased alphabetic
        tokens longer than two characters, stripped of stop words and passed
        through ``index_.index.process_term``.
        """
        # Collect the lines and join once instead of growing a string with
        # ``+=``, which can be quadratic in the collection size.
        lines = []
        for i in range(1, self.index_.total_count() + 1):
            doc_text = self.index_.obtain_text_of_a_document(i)
            doc_words = self.tokenizer.tokenize(doc_text)
            doc_words = [w.lower() for w in doc_words if w.isalpha() and len(w) > 2]
            doc_words = [w for w in doc_words if w not in self.stop_words]
            doc_words = [self.index_.index.process_term(w) for w in doc_words]
            lines.append(' '.join(doc_words) + '\n')
        return ''.join(lines)

    def store_collection_if_not_exists(self):
        """Write the collection file unless a file with that name already exists."""
        file_name = self.parameters.params["lda"]["file_name"]
        if os.path.exists(file_name):
            return
        print("reading collection...", file=sys.stderr)
        collection_lines = self.read_collection()
        print("writing collection to:", file_name, file=sys.stderr)
        with open(file_name, "w") as f:
            f.write(collection_lines)
 def test_index_with_xml_libraries(self):
     """Index resource_a with XML libraries available and look for their keywords."""
     xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
     db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
     Scanner(xml_libs).scan(self.suite_dir, 'robot', db_dir_with_xml)
     index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
     index.index_consturctor(self.resource_a_table_name)
     index_files = os.listdir(self.index_dir)
     # Exactly one index file is expected in the index directory.
     self.assertEqual(len(index_files), 1)
     with open(os.path.join(self.index_dir, index_files[0])) as handle:
         data = json.load(handle)
     # Position 2 of a keyword record is compared with the library name,
     # position 0 with the keyword name.
     self.assertTrue(any(kw[2] == 'SwingLibrary' for kw in data['keyword']))
     for keyword_name in ('Add Table Cell Selection',
                          'Select From Popup Menu'):
         self.assertTrue(
             any(kw[0] == keyword_name for kw in data['keyword']))
 def test_index_with_xml_libraries(self):
     """Index resource_a with XML libraries available and look for their keywords."""
     xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
     db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
     Scanner(xml_libs).scan(self.suite_dir, 'robot', db_dir_with_xml)
     index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
     index.index_consturctor(self.resource_a_table_name)
     index_files = os.listdir(self.index_dir)
     # Exactly one index file is expected in the index directory.
     self.assertEqual(len(index_files), 1)
     with open(os.path.join(self.index_dir, index_files[0])) as handle:
         data = json.load(handle)
     # This variant reads the records from the 'keywords' key.
     self.assertTrue(any(kw[2] == 'SwingLibrary' for kw in data['keywords']))
     for keyword_name in ('Add Table Cell Selection',
                          'Select From Popup Menu'):
         self.assertTrue(
             any(kw[0] == keyword_name for kw in data['keywords']))
示例#9
0
文件: hide1.py 项目: loinly/TextStego
 def info(self, fi='', pagenum=100):
     """Crawl, preprocess, index and search pages for the text stored in file ``fi``.

     :param fi: file read with ``FileUtil.readfile``; also passed to ``Search1``.
     :param pagenum: forwarded unchanged to ``SpiderLink.crawl``.
     :return: ``(keywords, num)`` - the segmented keywords and the value
         returned by ``Search1.retrieve``.
     """
     text = FileUtil.readfile(fi)
     keywords = PreDeal.seg(text)
     # 1. Keyword extraction
     keys = jieba.analyse.textrank(
         text, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
     # 2. Use the search engine to crawl related web pages
     # 2.1 Collect the links
     link_spider = SpiderLink(keys, self.root)
     link_spider.crawl(pagenum)
     # 2.2 Fetch the page contents
     tag = '_'.join(keys)  # shared base name for every artefact of this run
     page_spider = SpiderTo(tag + '.html')
     page_spider.crawl()
     # 3. Text preprocessing: de-duplicate, drop stop words, segment,
     #    keep the url and keyword sets
     preprocessor = PreDeal()
     raw_dir = os.path.join(config.spidertext, tag)
     prepared_dir = os.path.join(config.prepapath, tag)
     preprocessor.savetexts(filepath=raw_dir, prepath=prepared_dir)
     # 4. Build the index and search it for pages containing the keywords
     # 4.1 Index construction
     index_dir = os.path.join(config.indexpath, tag)
     builder = Index()
     builder.build(datapath=prepared_dir, indexpath=index_dir)
     searcher = Search1(filename=fi, pindexp=index_dir)
     # 4.2 Search and save; a copy is passed so ``keywords`` itself is
     # returned as segmented.
     num = searcher.retrieve(keywords=keywords[:])
     return keywords, num
示例#10
0
 def __init__(self, parameters):
     """Set up tokenizer, stop words and index, then initialise the base class.

     The collection file is created (if missing) before its name is passed
     to the base-class constructor.
     """
     self.parameters = parameters
     self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
     self.stop_words = set(nltk.corpus.stopwords.words('english'))
     self.index_ = Index(self.parameters)
     self.store_collection_if_not_exists()
     super().__init__(self.parameters.params["lda"]["file_name"])
示例#11
0
    def test_check_if_exists_in_index(self):
        """Check which terms the ap8889 index reports as existing."""
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        # (term, expected existence), checked in this order.
        expectations = (
            ("emotional", True),
            ("first", False),
            ("included", False),
            ("includes", True),
        )
        for term, should_exist in expectations:
            found = self.index_.check_if_exists_in_index(term)
            if should_exist:
                self.assertTrue(found)
            else:
                self.assertFalse(found)
示例#12
0
 def query(self, keywords, kwpath=''):
     """Greedily group ``keywords`` into combined index queries and collect the hits.

     Keywords are consumed from the front of ``keywords`` (the caller's list
     is emptied). Each keyword is appended to the running query ``q``; when
     the combined query stops matching, similar words from
     ``WV.similarwords`` are tried in its place. If that also fails, the
     result of the query built so far is stored and a new group is started,
     or, when nothing has been built, the keyword is recorded as
     unmatched ('0' in the written keyword string, ``None`` in the result).

     :param keywords: list of keywords; mutated in place via ``pop(0)``.
     :param kwpath: path passed to ``FileUtil.writefile`` for the
         space-joined list of keywords actually used.
     :return: list of search results, one entry per finished group
         (``None`` for an unmatched keyword).
     """
     path = []  # search results of the articles found so far
     num = []  # number of combined keywords per stored article (never returned)
     unmatch = 0  # count of unmatched keywords (never returned)
     maxh = 0  # number of keywords in the current combination
     q = ''  # combined query string
     flag = True  # mismatch flag; False means the current keyword failed
     hidekey = []
     while keywords:
         kw = keywords[0]
         paper = Index.search(self.pindexp, q + ' ' + kw, limit=None)
         if paper:
             keywords.pop(0)
             hidekey.append(kw)
             q = q + ' ' + kw
             maxh += 1
         else:  # the combined search cannot continue: look for similar keywords
             simikeys = WV.similarwords(kw)
             t_paper = []
             if not simikeys:
                 print(
                     ".................Failed to find similar words................"
                 )
                 flag = False
             else:
                 # Take the first similar word that keeps the combined query matching.
                 for skw, similarity in simikeys:
                     sq = q + ' ' + skw
                     t_paper = Index.search(self.pindexp, sq, limit=None)
                     if t_paper:
                         hidekey.append(skw)
                         keywords.pop(0)
                         q = sq
                         maxh += 1
                         break
                 if not t_paper:  # similar words exist but the combined search still fails
                     flag = False
             # mismatch
             if not flag:
                 doc = Index.search(self.pindexp, q, limit=None)
                 if not doc:
                     print("The keyword  '%s' is unMatch !" % kw)
                     unmatch += 1
                     hidekey.append('0')
                     keywords.pop(0)
                     path.append(None)
                     # flag = True
                 else:
                     # Close the current group; kw stays at the front and is
                     # retried as the start of a new group.
                     path.append(doc)
                     num.append(maxh)
                     maxh = 0
                     q = ''
                 flag = True
         # NOTE(review): ``paper`` is the result of the first search of this
         # iteration; when the last keyword was resolved through a similar
         # word or was unmatched, it is the empty result rather than
         # ``t_paper`` - confirm this is intended.
         if not keywords:
             path.append(paper)
     hide_string = ' '.join(hidekey)
     FileUtil.writefile(hide_string, kwpath)
     return path
def index_single(db_path, db_table, index_path, module_search_path,
                 libs_in_xml):
    """Build the index for one database table.

    Every entry of ``module_search_path`` is appended to ``sys.path``,
    ``index_path`` is created when it does not exist, and an ``Index`` is
    constructed and asked to index ``db_table``.
    """
    sys.path.extend(module_search_path)
    if not path.exists(index_path):
        makedirs(index_path)
    indexer = Index(
        db_path=db_path,
        index_path=index_path,
        xml_libraries=libs_in_xml,
    )
    indexer.index_consturctor(table=db_table)
示例#14
0
    def test_tf(self):
        """Check tf against the words of the sample document, before and after switching to ap8889."""
        doc_path = "../configs/others/pride_and_prejudice_wiki.txt"
        doc_words = SimpleDocument(self.parameters).get_words(doc_path)

        dog_tf = self.index_.tf("dog", doc_words)
        self.assertEqual(dog_tf, 0.5)

        # Rebuild the index from the parameters after switching repo_dir;
        # the word list computed above is reused.
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        emotional_tf = self.index_.tf("emotional", doc_words)
        self.assertEqual(emotional_tf, 0.5)
示例#15
0
def index_single(db_path, db_table, index_path, module_search_path,
                 libs_in_xml):
    """Build the index for one database table.

    Every entry of ``module_search_path`` is appended to ``sys.path``,
    ``index_path`` is created when it does not exist, and an ``Index`` is
    constructed and asked to index ``db_table``.
    """
    sys.path.extend(module_search_path)
    index_dir_missing = not path.exists(index_path)
    if index_dir_missing:
        makedirs(index_path)
    index_kwargs = {
        'db_path': db_path,
        'index_path': index_path,
        'xml_libraries': libs_in_xml,
    }
    Index(**index_kwargs).index_consturctor(table=db_table)
 def setUp(self):
     """Give the test an empty index directory and an ``Index`` bound to it."""
     index_dir = os.path.join(env.RESULTS_DIR, 'index_dir')
     self.index_dir = index_dir
     # Keep removing (with a short pause) until the directory is gone.
     while os.path.exists(index_dir):
         shutil.rmtree(index_dir)
         sleep(0.1)
     os.makedirs(index_dir)
     self.index = Index(self.db_dir, index_dir)
示例#17
0
    def test_tfidf(self):
        """Check tfidf of 'emotional' on ap8889 and the error raised for 'is'."""
        params = self.parameters.params
        params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        doc_path = "../configs/others/pride_and_prejudice_wiki.txt"
        doc_words = SimpleDocument(self.parameters).get_words(doc_path)

        emotional_tfidf = self.index_.tfidf('emotional', doc_words)
        print(emotional_tfidf, file=sys.stderr)
        self.assertEqual(emotional_tfidf, 2.0455597255490345)

        # 'is' must raise, and the message must name the missing unigram.
        with self.assertRaises(Exception) as context:
            self.index_.tfidf('is', doc_words)
        expected_message = (
            'unigram "is" not exist. Probably was a stopword in indexing.')
        raised_message = str(context.exception)
        self.assertTrue(expected_message in raised_message)
示例#18
0
class Original:
    """Yield the unigrams of a text that pass the alphabetic, stop-word and index filters."""

    def __init__(self, parameters):
        """Store the parameters and create the dictionary, stop-word list and index."""
        self.parameters = parameters
        self.enchant_dict = enchant.Dict("en_US")
        self.stopwords = stopwords.words('english')
        self.index_ = Index(self.parameters)

    def check_if_unigram_should_be_added(self, unigram):
        """Return True when the lower-cased unigram passes every filter.

        A unigram is accepted when it is purely alphabetic, is not a stop
        word and the index reports that it exists. The checks run in that
        order and stop at the first failure.
        """
        word = unigram.lower()
        passes_text_filters = word.isalpha() and word not in self.stopwords
        # The enchant dictionary check is deliberately disabled; to enable
        # it, also require ``self.enchant_dict.check(word)`` here.
        return bool(
            passes_text_filters
            and self.index_.check_if_exists_in_index(word))

    def find_unigrams(self, text):
        """Yield ``(unigram, [(unigram, 1)])`` for each accepted token of ``text``."""
        yield from (
            (token, [(token, 1)])
            for token in word_tokenize(text)
            if self.check_if_unigram_should_be_added(token)
        )
示例#19
0
 def test_obtain_term_ids_of_a_document(self):
     """Check the document name and term-id tuple returned for document 1 of ap8889.

     The expected value is a pair: the document identifier string and the
     tuple of term ids as stored in the repository.
     """
     # Rebuild the index from the parameters after switching repo_dir.
     self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)
     res = self.index_.obtain_term_ids_of_a_document(1)
     self.assertEqual(
         res,
         ('AP881107-0001',
          (147, 771, 0, 78064, 26, 2828, 1283, 92, 126, 147, 175009, 159395,
           771, 55, 0, 0, 2362, 26, 2828, 919, 0, 0, 115, 8, 461, 1624,
           1826, 0, 35, 693, 1198, 0, 195412, 0, 724, 430, 621, 340, 0, 771,
           0, 1502, 20649, 4327, 1620, 9, 247, 0, 0, 866, 0, 643, 0, 2828,
           415, 101374, 1289, 2015, 276, 1246, 0, 24, 29, 586, 0, 272, 0,
           856, 0, 101374, 1826, 2153, 0, 174, 693, 0, 195412, 0, 0, 158999,
           1037, 0, 117013, 137162, 123, 157, 0, 4415, 159395, 0, 0, 2262,
           0, 0, 56, 0, 101374, 251, 0, 0, 189, 101374, 0, 569, 0, 332,
           3095, 1873, 0, 2974, 0, 0, 13, 63630, 0, 485, 461, 91464, 0, 0,
           91, 0, 50405, 0, 0, 156, 159395, 0, 690, 0, 347, 0, 24, 4049, 0,
           101374, 0, 0, 0, 771, 0, 0, 92, 126, 690, 131907, 609, 0, 56, 0,
           88, 2222, 0, 0, 1624, 0, 160974, 0, 9, 0, 0, 436, 0, 2362, 273,
           0, 774, 1620, 13, 0, 263, 0, 887, 339, 176, 0, 0, 0, 91, 0,
           50405, 0, 1620, 0, 0, 0, 289, 0, 1202, 101374, 0, 254, 0, 4543,
           8, 193, 91, 0, 979, 3597, 0, 3095, 0, 791, 0, 2768, 937, 0, 0,
           264, 91, 0, 0, 461, 2464, 0, 9, 0, 0, 1333, 2198, 45622, 0, 4433,
           0, 1624, 0, 678, 0, 0, 0, 0, 3790, 0, 40, 0, 0, 118, 135313,
           1620, 727, 136295, 0, 0, 0, 3408, 0, 2362, 6, 0, 245, 0, 0, 494,
           0, 415, 101374, 0, 2042, 435, 0, 0, 0, 0, 0, 586, 0, 347, 1461,
           0, 0, 116, 0, 0, 354, 0, 0, 9, 145130, 0, 48310, 120515, 51, 0,
           0, 1, 956, 540, 430, 32, 32, 0, 0, 0, 221, 340, 0, 9, 771,
           101374, 0, 1431, 897, 123, 0, 248, 0, 0, 91, 0, 50405, 0, 919, 0,
           114, 667, 0, 70387, 0, 62918, 0, 0, 101374, 261, 0, 0, 311, 212,
           0, 0, 61, 0, 4322, 767, 144897, 0, 62, 0, 0, 91, 0, 50405, 0,
           375, 8, 546, 0, 0, 26, 3527, 543, 3657, 3224, 0, 0, 0, 2223, 0,
           533, 0, 1019, 367, 0, 674, 0, 165, 101374, 0, 0, 436, 0, 4322,
           767, 144897, 0, 62, 0, 91, 0, 0, 445, 8, 0, 0, 550, 0, 0, 26, 91,
           3533, 1512, 285, 0, 0, 543, 0, 0, 0, 0, 0, 192, 0, 0, 1798,
           101374, 0, 328, 0, 26, 0, 2362, 45369, 1390, 0, 156, 167, 0, 0,
           91, 0, 50405, 0, 791, 0, 28, 0, 0, 264, 4415, 1977, 0, 187, 15,
           0, 0, 550, 1483, 273, 0, 247, 101374, 887, 0, 91, 0, 50405, 168,
           0, 303, 0, 515, 34, 77704, 0, 156, 23, 701, 273, 114, 1056, 0, 0,
           409, 490, 101374, 771, 897, 899, 0, 248, 0, 77704, 0, 0, 0, 0,
           1502, 599, 463, 147, 415, 4327, 2512, 0, 0, 70387, 0, 91, 0,
           50405, 221, 62, 0, 17, 41, 8, 1548, 0, 0, 3657, 3224, 550, 0, 0,
           171, 0, 2597, 4327, 62, 4, 0, 512, 0, 0, 1, 73, 532, 7, 0, 0, 0,
           192, 0, 3657, 3224, 0, 0, 0, 1548, 101374, 626, 918, 455, 0, 114,
           219, 0, 0, 245, 3095, 0, 0, 333, 0)))
 def setUp(self):
     """Give the test an empty index directory and an ``Index`` bound to it."""
     index_dir = os.path.join(env.RESULTS_DIR, 'index_dir')
     self.index_dir = index_dir
     # Keep removing (with a short pause) until the directory is gone.
     while os.path.exists(index_dir):
         shutil.rmtree(index_dir)
         sleep(0.1)
     os.makedirs(index_dir)
     self.index = Index(self.db_dir, index_dir)
class TestIndexing(unittest.TestCase):
    """The content of the db_fir was created with scanner by scanning the
    TEST_DATA_DIR/suite_tree folder. If scanner is changed, db_dir must
    be recreated."""
    @classmethod
    def setUpClass(cls):
        """Scan the suite_tree test data into db_dir once for the whole class."""
        db_dir = os.path.join(env.RESULTS_DIR, 'db_dir')
        suite_dir = os.path.join(env.TEST_DATA_DIR, 'suite_tree')
        cls.db_dir, cls.suite_dir = db_dir, suite_dir
        Scanner().scan(suite_dir, 'robot', db_dir)
        cls.xml_libs = os.path.join(env.RESOURCES_DIR, 'library')

    def setUp(self):
        """Give the test an empty index directory and an ``Index`` bound to it."""
        index_dir = os.path.join(env.RESULTS_DIR, 'index_dir')
        self.index_dir = index_dir
        # Keep removing (with a short pause) until the directory is gone.
        while os.path.exists(index_dir):
            shutil.rmtree(index_dir)
            sleep(0.1)
        os.makedirs(index_dir)
        self.index = Index(self.db_dir, index_dir)

    def test_parse_table_data(self):
        """Parse the BuiltIn table fixture and check three variables are reported."""
        table = os.path.join(
            env.RESOURCES_DIR,
            'BuiltIn-ca8f2e8d70641ce17b9b304086c19657.json')
        self.index.queue.add(table, None, None)
        table_path = os.path.join(env.RESOURCES_DIR, table)
        data, _status = self.index.read_table(table_path)
        variables, _kw_index = self.index.parse_table_data(data, table)
        for variable in (u'${/}', '${OUTPUT_FILE}', '@{TEST_TAGS}'):
            self.assertTrue(variable in variables)

    def test_add_builtin(self):
        """Adding the built-in library must leave at least one item in the queue."""
        self.index.add_builtin_to_queue(self.db_dir)
        queued_items = self.index.queue.queue
        self.assertTrue(len(queued_items) > 0)

    def test_read_table(self):
        """Read the test_b table from db_dir and check its recorded file name."""
        table_path = os.path.join(self.db_dir, self.test_b_table_name)
        data, _read_status = self.index.read_table(table_path)
        # assertTrue(value, msg) only checks truthiness and treats the second
        # argument as the failure message; assertEqual compares the value.
        self.assertEqual(data['file_name'], 'test_b.robot')

    def test_get_keywords_resource(self):
        """Check keyword names and arguments parsed from a resource, a suite and a library table."""
        data = self.get_resource_b()
        expected_kw_list = ['Resource B Keyword 2', 'Resource B Keyword 1']
        expected_arg_list = [['kwb1'], []]
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        # list.sort() returns None, so comparing two .sort() results always
        # passed; compare sorted copies to check the contents.
        self.assertEqual(sorted(arg_list), sorted(expected_arg_list))

        data = self.get_test_a()
        expected_kw_list = ['Test A Keyword', 'Keyword']
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        self.assertEqual(arg_list, [[], []])

        data = self.get_s2l()
        parsed_kw, arg_list = self.index.get_keywords(data)
        self.assertTrue('Set Window Position' in parsed_kw)
        self.assertTrue('Get Cookies' in parsed_kw)
        self.assertTrue('Unselect Frame' in parsed_kw)
        self.assertTrue(['name'] in arg_list)
        expected_args = ['driver_name', 'alias', 'kwargs', '**init_kwargs']
        self.assertTrue(expected_args in arg_list)
        self.assertTrue(['*code'] in arg_list)

    def test_get_imports(self):
        """Check the imported table names reported for a resource, a suite and a library."""
        data = self.get_resource_b()
        import_list = [self.process_table_name]
        self.assertEqual(self.index.get_imports(data), import_list)

        data = self.get_test_a()
        import_list = [self.common_table_name, self.resource_a_table_name]
        # list.sort() returns None, so the old comparison was always
        # None == None; compare sorted copies for an order-insensitive check.
        self.assertEqual(
            sorted(self.index.get_imports(data)), sorted(import_list))

        data = self.get_s2l()
        self.assertEqual(self.index.get_imports(data), [])

    def test_get_variables(self):
        """Check the variables reported for a resource, a suite, a library and common.robot."""
        data = self.get_resource_b()
        var = ['${RESOURCE_B}']
        self.assertEqual(self.index.get_variables(data), var)

        data = self.get_test_a()
        var = ['${TEST_A}']
        # list.sort() returns None, so the old comparison was always
        # None == None; compare sorted copies for an order-insensitive check.
        self.assertEqual(sorted(self.index.get_variables(data)), sorted(var))

        data = self.get_s2l()
        self.assertEqual(self.index.get_variables(data), [])

        data = self.get_common()
        self.assertEqual(self.index.get_variables(data), [])

    def test_get_kw_for_index(self):
        """Check get_kw_for_index against the expected records of three tables.

        Each builder returns the expected record list together with the
        inputs (keywords, arguments, object name, table name) that are fed
        to ``get_kw_for_index``.
        """
        KeywordRecord = namedtuple('KeywordRecord',
                                   'keyword argument object_name table_name')
        # The original assigned table_name from resource_b_table_name and
        # overwrote it immediately; that dead store is gone. The three
        # identical stanzas are folded into one loop, run in the same order.
        builders = (
            self.get_resource_b_kw_index,
            self.get_test_a_kw_index,
            self.get_s2l_kw_index,
        )
        for build in builders:
            expected, kw_list, arg_list, object_name, table_name = \
                build(KeywordRecord)
            self.assertEqual(
                self.index.get_kw_for_index(kw_list, arg_list, table_name,
                                            object_name), expected)

    def test_index_creation_test_a(self):
        """Build the index for test_a and compare it with the expected keywords and variables."""
        table_name = self.test_a_table_name
        KeywordRecord = namedtuple('KeywordRecord',
                                   'keyword argument object_name table_name')
        # Expected keywords: test_a's own plus those of everything it uses.
        builders = (
            self.get_test_a_kw_index,
            self.get_common_kw_index,
            self.get_resource_a_kw_index,
            self.get_s2l_kw_index,
            self.get_os_kw_index,
            self.get_builtin_kw_index,
            self.get_LibNoClass_kw_index,
        )
        kw_list = []
        for build in builders:
            kw_list.extend(build(KeywordRecord)[0])
        var_list = [
            u'${TEST_A}', u'${RESOURCE_A}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # list.sort() returns None, so the old ``a.sort() == b.sort()``
        # assertions always passed. Compare sorted copies instead.
        self.assertEqual(sorted(r_index['variable']),
                         sorted(t_index['variable']))
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        # Only the keyword names (position 0 of each record) are compared:
        # whole records may hold None or list arguments that cannot be
        # ordered against each other.
        self.assertEqual(sorted(kw[0] for kw in r_index['keyword']),
                         sorted(kw[0] for kw in t_index['keyword']))

    def test_index_creation_test_b(self):
        """Build the index for test_b and compare it with the expected keywords and variables."""
        table_name = self.test_b_table_name
        KeywordRecord = namedtuple('KeywordRecord',
                                   'keyword argument object_name table_name')
        # Expected keywords: test_b's own plus those of everything it uses.
        builders = (
            self.get_test_b_kw_index,
            self.get_common_kw_index,
            self.get_resource_b_kw_index,
            self.get_s2l_kw_index,
            self.get_process_kw_index,
            self.get_builtin_kw_index,
        )
        kw_list = []
        for build in builders:
            kw_list.extend(build(KeywordRecord)[0])
        var_list = [
            u'${TEST_B}', u'${RESOURCE_B}', u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        t_index = {'keyword': kw_list, 'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # list.sort() returns None, so the old ``a.sort() == b.sort()``
        # assertions always passed. Compare sorted copies instead.
        self.assertEqual(sorted(r_index['variable']),
                         sorted(t_index['variable']))
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        # Only the keyword names (position 0 of each record) are compared:
        # whole records may hold None or list arguments that cannot be
        # ordered against each other.
        self.assertEqual(sorted(kw[0] for kw in r_index['keyword']),
                         sorted(kw[0] for kw in t_index['keyword']))

    def test_index_consturctor(self):
        """Index resource_a and check the single index file that is written."""
        self.index.index_consturctor(self.resource_a_table_name)
        index_files = os.listdir(self.index_dir)
        self.assertEqual(len(index_files), 1)
        index_file = os.path.join(self.index_dir, index_files[0])
        with open(index_file) as handle:
            data = json.load(handle)
        for section in ('variable', 'keyword'):
            self.assertIn(section, data)
        # Position 0 of each keyword record is compared with the keyword name.
        has_test_a_kw = any(
            kw[0] == 'Test A Keyword' for kw in data['keyword'])
        self.assertFalse(has_test_a_kw)
        has_resource_a_kw = any(
            kw[0] == 'Resource A Keyword 1' for kw in data['keyword'])
        self.assertTrue(has_resource_a_kw)

    def test_get_kw_arguments(self):
        """Check argument normalisation for defaults, varargs, kwargs and Robot-style names."""
        # (input, expected); None means the result is compared with the
        # input list object itself.
        cases = (
            ([u'item', u'msg=None'], [u'item', u'msg']),
            ([u'name', u'*args'], None),
            ([], None),
            ([u'object=None', u'*args', u'**kwargs'],
             [u'object', u'*args', u'**kwargs']),
            ([u'${kwa1}', '@{list}', '&{kwargs}'],
             [u'kwa1', '*list', '**kwargs']),
        )
        for kw_args, expected in cases:
            result = self.index.get_kw_arguments(kw_args)
            self.assertEqual(result, kw_args if expected is None else expected)

    def test_add_xml_libraries(self):
        """The queue must go from empty to two entries after adding the XML libraries."""
        queued_before = len(self.index.queue.queue)
        self.assertEqual(queued_before, 0)
        self.index.add_xml_libraries(self.xml_libs)
        queued_after = len(self.index.queue.queue)
        self.assertEqual(queued_after, 2)

    def test_index_with_xml_libraries(self):
        """Index resource_a with XML libraries available and look for their keywords."""
        xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
        db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
        Scanner(xml_libs).scan(self.suite_dir, 'robot', db_dir_with_xml)
        index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
        index.index_consturctor(self.resource_a_table_name)
        index_files = os.listdir(self.index_dir)
        # Exactly one index file is expected in the index directory.
        self.assertEqual(len(index_files), 1)
        index_file = os.path.join(self.index_dir, index_files[0])
        with open(index_file) as handle:
            data = json.load(handle)
        # Position 2 of a keyword record is compared with the library name,
        # position 0 with the keyword name.
        self.assertTrue(any(kw[2] == 'SwingLibrary' for kw in data['keyword']))
        for keyword_name in ('Add Table Cell Selection',
                             'Select From Popup Menu'):
            self.assertTrue(
                any(kw[0] == keyword_name for kw in data['keyword']))

    def test_get_object_name(self):
        """Check the object name derived from four different tables."""
        # (table loader, expected object name), checked in this order.
        cases = (
            (self.get_libnoclass, 'LibNoClass'),
            (self.get_resource_b, 'resource_b'),
            (self.get_os, 'OperatingSystem'),
            (self.get_s2l, 'Selenium2Library'),
        )
        for load_table, expected_name in cases:
            object_name = self.index.get_object_name(load_table())
            self.assertEqual(object_name, expected_name)

    @property
    def common_table_name_index(self):
        """Path in index_dir of the 'index-<common table name>' file."""
        file_name = 'index-{0}'.format(self.common_table_name)
        index_path = os.path.join(self.index_dir, file_name)
        return index_path

    @property
    def test_a_table_name_index(self):
        """Path in index_dir of the 'index-<test_a table name>' file."""
        file_name = 'index-{0}'.format(self.test_a_table_name)
        index_path = os.path.join(self.index_dir, file_name)
        return index_path

    @property
    def real_suite_table_name(self):
        """Table name of test/real_suite.robot under real_suite_dir.

        NOTE(review): real_suite_dir is not assigned in the visible setup
        code; confirm where it is defined.
        """
        robot_file = os.path.join(
            self.real_suite_dir, 'test', 'real_suite.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def resource_b_table_name(self):
        """Table name for resource_b.robot in suite_dir, via rf_table_name on the case-normalised path."""
        robot_file = os.path.join(self.suite_dir, 'resource_b.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def common_table_name(self):
        """Table name for common.robot in suite_dir, via rf_table_name on the case-normalised path."""
        robot_file = os.path.join(self.suite_dir, 'common.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def test_a_table_name(self):
        """Table name for test_a.robot in suite_dir, via rf_table_name on the case-normalised path."""
        robot_file = os.path.join(self.suite_dir, 'test_a.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def test_b_table_name(self):
        """Table name for test_b.robot in suite_dir, via rf_table_name on the case-normalised path."""
        robot_file = os.path.join(self.suite_dir, 'test_b.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def resource_a_table_name(self):
        """Table name for resource_a.robot in suite_dir, via rf_table_name on the case-normalised path."""
        robot_file = os.path.join(self.suite_dir, 'resource_a.robot')
        return rf_table_name(os.path.normcase(robot_file))

    @property
    def s2l_table_name(self):
        """Table name that lib_table_name gives for 'Selenium2Library'."""
        library = 'Selenium2Library'
        return lib_table_name(library)

    @property
    def os_table_name(self):
        """Table name that lib_table_name gives for 'OperatingSystem'."""
        library = 'OperatingSystem'
        return lib_table_name(library)

    @property
    def process_table_name(self):
        """Table name that lib_table_name gives for 'Process'."""
        library = 'Process'
        return lib_table_name(library)

    @property
    def builtin_table_name(self):
        """Table name that lib_table_name gives for 'BuiltIn'."""
        library = 'BuiltIn'
        return lib_table_name(library)

    @property
    def libnoclass_table_name(self):
        """Table name that lib_table_name gives for 'LibNoClass'."""
        library = 'LibNoClass'
        return lib_table_name(library)

    def get_resource_b(self):
        """Load and return the parsed JSON table of resource_b from db_dir."""
        table_path = os.path.join(self.db_dir, self.resource_b_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_common(self):
        """Load and return the parsed JSON table of common.robot from db_dir."""
        table_path = os.path.join(self.db_dir, self.common_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_test_a(self):
        """Load and return the parsed JSON table of test_a from db_dir."""
        table_path = os.path.join(self.db_dir, self.test_a_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_s2l(self):
        """Load and return the parsed JSON table of Selenium2Library from db_dir."""
        table_path = os.path.join(self.db_dir, self.s2l_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_os(self):
        """Load and return the parsed JSON table of OperatingSystem from db_dir."""
        table_path = os.path.join(self.db_dir, self.os_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_process(self):
        """Load and return the parsed JSON table of the Process library from db_dir."""
        table_path = os.path.join(self.db_dir, self.process_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def getbuiltin(self):
        """Load and return the parsed JSON table of the BuiltIn library from db_dir."""
        table_path = os.path.join(self.db_dir, self.builtin_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_libnoclass(self):
        """Load and return the parsed JSON table of LibNoClass from db_dir."""
        table_path = os.path.join(self.db_dir, self.libnoclass_table_name)
        # Use a context manager so the file handle is closed; the original
        # opened the file and never closed it.
        with open(table_path) as f:
            return json.load(f)

    def get_s2l_kw_index(self, keywordrecord):
        """Build the expected keyword records for Selenium2Library.

        :param keywordrecord: record factory called with the keyword arguments
            keyword, argument, object_name and table_name.
        :return: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        table_data = self.get_s2l()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'Selenium2Library'
        table_name = self.s2l_table_name
        # One record per (keyword, argument) pair, in keyword order.
        records = [
            keywordrecord(keyword=kw,
                          argument=arg,
                          object_name=object_name,
                          table_name=table_name)
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_os_kw_index(self, keywordrecord):
        """Build the expected keyword records for the OperatingSystem library.

        :param keywordrecord: record factory called with the keyword arguments
            keyword, argument, object_name and table_name.
        :return: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        table_data = self.get_os()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'OperatingSystem'
        table_name = self.os_table_name
        # One record per (keyword, argument) pair, in keyword order.
        records = [
            keywordrecord(keyword=kw,
                          argument=arg,
                          object_name=object_name,
                          table_name=table_name)
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_process_kw_index(self, keywordrecord):
        """Build the expected keyword records for the Process library.

        :param keywordrecord: record factory called with the keyword arguments
            keyword, argument, object_name and table_name.
        :return: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        table_data = self.get_process()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'Process'
        table_name = self.process_table_name
        # One record per (keyword, argument) pair, in keyword order.
        records = [
            keywordrecord(keyword=kw,
                          argument=arg,
                          object_name=object_name,
                          table_name=table_name)
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_builtin_kw_index(self, keywordrecord):
        """Build the expected keyword records for the BuiltIn table.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, arg_list, object_name, table_name)``.
            Keywords and arguments are paired with ``zip``, so pairing stops
            at the shorter of the two lists.
        """
        table_data = self.getbuiltin()
        keywords = self.index.get_keywords(table_data)[0]
        arguments = self.get_kw_args(table_data)
        library = 'BuiltIn'
        table = self.builtin_table_name
        records = [
            keywordrecord(keyword=name,
                          argument=argument,
                          object_name=library,
                          table_name=table)
            for name, argument in zip(keywords, arguments)
        ]
        return records, keywords, arguments, library, table

    def get_LibNoClass_kw_index(self, keywordrecord):
        """Build the expected keyword records for the LibNoClass table.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, arg_list, object_name, table_name)``.
            Keywords and arguments are paired with ``zip``, so pairing stops
            at the shorter of the two lists.
        """
        table_data = self.get_libnoclass()
        keywords = self.index.get_keywords(table_data)[0]
        arguments = self.get_kw_args(table_data)
        library = 'LibNoClass'
        table = self.libnoclass_table_name
        records = [
            keywordrecord(keyword=name,
                          argument=argument,
                          object_name=library,
                          table_name=table)
            for name, argument in zip(keywords, arguments)
        ]
        return records, keywords, arguments, library, table

    def get_test_a_kw_index(self, keywordrecord):
        """Return the hard-coded expected keyword records for ``test_a.robot``.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        names = [u'Test A Keyword', u'Keyword']
        arguments = [None, None]
        table = self.test_a_table_name
        source = u'test_a.robot'
        records = [
            keywordrecord(keyword=name,
                          argument=argument,
                          object_name=source,
                          table_name=table)
            for name, argument in zip(names, arguments)
        ]
        return records, names, arguments, source, table

    def get_test_b_kw_index(self, keywordrecord):
        """Return the expected (empty) keyword index data for the test_b table.

        *keywordrecord* is accepted for signature parity with the sibling
        builders but is never called, because no records are produced.

        :returns: ``([], [], [None], object_name, table_name)``.
        """
        # NOTE(review): the object name is 'test_a.robot' although this
        # builder describes the test_b table -- looks like a copy/paste
        # leftover; confirm before relying on it.
        return [], [], [None], u'test_a.robot', self.test_b_table_name

    def get_resource_a_kw_index(self, keywordrecord):
        """Return the hard-coded expected keyword records for ``resource_a.robot``.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        names = [u'Resource A Keyword 1', u'resource A Keyword 2']
        arguments = ['kwa1', None]
        table = self.resource_a_table_name
        source = u'resource_a.robot'
        records = [
            keywordrecord(keyword=name,
                          argument=argument,
                          object_name=source,
                          table_name=table)
            for name, argument in zip(names, arguments)
        ]
        return records, names, arguments, source, table

    def get_resource_b_kw_index(self, keywordrecord):
        """Return the hard-coded expected keyword records for ``resource_b.robot``.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, arg_list, object_name, table_name)``.
        """
        names = [u'Resource B Keyword 1', u'resource B Keyword 2']
        arguments = ['kwb1', None]
        table = self.resource_b_table_name
        source = u'resource_b.robot'
        records = [
            keywordrecord(keyword=name,
                          argument=argument,
                          object_name=source,
                          table_name=table)
            for name, argument in zip(names, arguments)
        ]
        return records, names, arguments, source, table

    def get_common_kw_index(self, keywordrecord):
        """Return the hard-coded expected keyword records for ``common.robot``.

        Every record is created with ``argument=None``; the argument list in
        the returned tuple is the single-element list ``[None]``.

        :param keywordrecord: record factory called with the keyword
            arguments ``keyword``, ``argument``, ``object_name`` and
            ``table_name``.
        :returns: ``(records, kw_list, [None], object_name, table_name)``.
        """
        names = [
            u'Common Keyword 2', u'common Keyword 1',
            u'Really Long Keyword To Test With Jumping To Keyword Does Not Scroll The Visible Area To A Wrong Place Should There Be More Words'
        ]
        table = self.common_table_name
        source = u'common.robot'
        records = [
            keywordrecord(keyword=name,
                          argument=None,
                          object_name=source,
                          table_name=table)
            for name in names
        ]
        return records, names, [None], source, table

    def get_kw_args(self, data):
        """Flatten the argument names of every keyword in *data* into one list.

        :param data: mapping with a ``"keywords"`` entry whose values each
            carry a ``'keyword_arguments'`` list.
        :returns: a single flat list of arguments in iteration order. For an
            argument containing ``=`` only the part before the first ``=``
            is kept; other arguments are kept unchanged.
        """
        arg_list = []
        # Iterate the values directly: dict.iterkeys() only exists on
        # Python 2, while values() is available on both Python 2 and 3.
        for keyword in data["keywords"].values():
            for arg in keyword['keyword_arguments']:
                arg_list.append(arg.split('=')[0] if '=' in arg else arg)
        return arg_list
示例#22
0
    def test_obtain_text_of_a_document(self):
        # Re-point the index at the repository under
        # /scratch/index/indri_5_7/ap8889 and check that the text returned
        # for document 1 equals the fixture below exactly, including the
        # leading newline and the three-space paragraph indents.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_text_of_a_document(1)
        self.assertEqual(
            res, """
   Public Order Minister Tassos Sehiotis
resigned Monday after a Greek-American banker indicted in a $30
million financial scandal fled the country, apparently aboard a
yacht.
   The conservative opposition immediately demanded the resignation
of Premier Andreas Papandreou's socialist government, claiming it
was staging a cover-up.
   Banker George Koskotas, 34, disappeared Saturday afternoon. A
police officer, speaking on condition of anonymity, said Koskotas
fled abroad on Sunday, apparently by yacht from the seaside village
of Megalo Pefko, 20 miles from Athens.
   Sehiotis said a warrant had been issued for Koskotas' arrest. One
week ago, Koskotas was banned from leaving Greece pending the
outcome of an official enquiry into alleged financial irregularities
at the Bank of Crete, which he controls.
   Sehiotis, whose ministry was responsible for police surveillance
of Koskotas, said he was resigning ``since such (public order
ministry) omissions ... create an issue of political sensitivity.''
   The scandal has shaken the government because of accusations in
Greek newspapers that senior socialist officials were involved in
illegal deals set up by the Bank of Crete.
   The socialists also have been criticized for permitting Koskotas
to build a multi-million dollar banking and media empire in Greece
since 1984 without adequate checks by the central bank on his
financial background.
   The government last week pledged ``absolute clarity'' in
uncovering the scandal and warned there will be ``no pardons' for
members of the ruling Panhellenic Socialist Movement (PASOK) who may
be implicated.
   ``The Greek people are left with the conviction that George
Koskotas was spirited away so that he would not speak. The
responsibility goes all the way to the top of the government
pyramid,'' Constantine Mitsotakis, leader of the New Democracy main
opposition party party, said in a statement demanding the government
resign.
   Koskotas was suspended Oct. 20 as chairman of the Bank of Crete
and indicted on five counts of forgery and embezzlement.
   Last week Koskotas appeared before a district attorney on a
charge of forging documents purporting to show that the Bank of
Crete had $13 million invested with the American brokerage firm
Merrill Lynch.
   He was not detained but given until Nov. 14 to prepare his
defense.
   Koskotas also is accused of forging documents purporting to show
his bank had another $17 million in an account with an American
bank, Irving Trust Corp. Both U.S. firms have said they had no
record of the deposits.
   Koskotas, who holds both American and Greek citizenship, bought a
controlling interest in the Bank of Crete in 1984 after working in
its central Athens branch for six years as an accountant.
   Rival newspapers have claimed Koskotas illegally used Bank of
Crete money to fund his publishing group Grammi, which controls
three daily newspapers, five magazines and a radio station.
   Koskotas resigned Oct. 29 as chairman of Grammi, the day after
the premier's son, Education Minister George Papandreou, denounced
as a forgery a Bank of Crete statement showing a $2.3 million
transfer to a Merrill Lynch account in his name.
   The younger Papandreou showed reporters a letter from a New York
lawyer saying there was no record at Merrill Lynch of such a
transfer.
   Koskotas' parents, brother, wife and five children all have left
Greece during the past week.
""")
示例#23
0
class TestIndex(TestCase):
    def setUp(self):
        # Each test starts with fresh parameters pointing at the index under
        # ../index/test_files/index and an Index built from them.
        params = Parameters()
        params.params["repo_dir"] = '../index/test_files/index'
        self.parameters = params
        self.index_ = Index(params)

    def test_uw_expression_count(self):
        # "SAMPSON Dog" with the numeric argument 12 is expected to be
        # counted twice in the default test index.
        # NOTE(review): 12 is presumably the window size -- confirm.
        count = self.index_.uw_expression_count("SAMPSON Dog", 12)
        self.assertEqual(count, 2)

    def test_od_expression_count(self):
        # "SAMPSON True" with the numeric argument 12 is expected to be
        # counted once in the default test index.
        count = self.index_.od_expression_count("SAMPSON True", 12)
        self.assertEqual(count, 1)

    def test_uw_document_expression_count(self):
        # Exactly one document of the default test index is expected to
        # match "SAMPSON True" via uw_expression_document_count.
        doc_count = self.index_.uw_expression_document_count(
            "SAMPSON True", 12)
        self.assertEqual(doc_count, 1)

    def test_od_document_expression_count(self):
        # Exactly one document of the default test index is expected to
        # match "SAMPSON True" via od_expression_document_count.
        doc_count = self.index_.od_expression_document_count(
            "SAMPSON True", 12)
        self.assertEqual(doc_count, 1)

    def test_term_count(self):
        # First check the default test index ...
        small_count = self.index_.term_count("dog")
        self.assertEqual(small_count, 2)

        # ... then rebuild the Index against the ap8889 repository and check
        # a second term there.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        large_count = self.index_.term_count("emotional")
        self.assertEqual(large_count, 3515)

    def test_document_count(self):
        # First check the default test index ...
        small_count = self.index_.document_count("dog")
        self.assertEqual(small_count, 1)

        # ... then rebuild the Index against the ap8889 repository and check
        # a second term there.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        large_count = self.index_.document_count("emotional")
        self.assertEqual(large_count, 2973)

    def test_check_if_have_same_stem(self):
        # One pair expected to share a stem, followed by two pairs that are
        # expected not to.
        same_stem = self.index_.check_if_have_same_stem
        self.assertEqual(same_stem("goes", "goe"), True)
        self.assertEqual(same_stem("goes", "g"), False)
        self.assertEqual(same_stem("first", "mr"), False)

    def test_idf(self):
        # idf of "dog" on the default test index, compared exactly.
        idf_value = self.index_.idf("dog")
        self.assertEqual(idf_value, 1.0986122886681098)

        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        # NOTE(review): this second check exercises document_count(), not
        # idf(), despite the test name -- confirm that is intended.
        first_count = self.index_.document_count("first")
        self.assertEqual(first_count, 0)

    def test_tfidf(self):
        # Score one known term against the ap8889 repository, then verify
        # that asking for "is" raises with the expected message.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        words = SimpleDocument(self.parameters).get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")
        score = self.index_.tfidf('emotional', words)
        # The computed value is echoed to stderr for debugging.
        print(score, file=sys.stderr)
        self.assertEqual(score, 2.0455597255490345)

        expected_message = (
            'unigram "is" not exist. Probably was a stopword in indexing.')
        with self.assertRaises(Exception) as context:
            self.index_.tfidf('is', words)
        self.assertTrue(expected_message in str(context.exception))

    def test_tf(self):
        # The document words are read once, before the index is switched,
        # and reused for both checks.
        words = SimpleDocument(self.parameters).get_words(
            "../configs/others/pride_and_prejudice_wiki.txt")

        small_tf = self.index_.tf("dog", words)
        self.assertEqual(small_tf, 0.5)

        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        large_tf = self.index_.tf("emotional", words)
        self.assertEqual(large_tf, 0.5)

    def test_check_if_exists_in_index(self):
        # Against the ap8889 repository: "emotional" and "includes" are
        # expected to be found, "first" and "included" are not.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)

        exists = self.index_.check_if_exists_in_index
        self.assertTrue(exists("emotional"))
        self.assertFalse(exists("first"))
        self.assertFalse(exists("included"))
        self.assertTrue(exists("includes"))

    def test_obtain_text_of_a_document(self):
        # Re-point the index at the repository under
        # /scratch/index/indri_5_7/ap8889 and check that the text returned
        # for document 1 equals the fixture below exactly, including the
        # leading newline and the three-space paragraph indents.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_text_of_a_document(1)
        self.assertEqual(
            res, """
   Public Order Minister Tassos Sehiotis
resigned Monday after a Greek-American banker indicted in a $30
million financial scandal fled the country, apparently aboard a
yacht.
   The conservative opposition immediately demanded the resignation
of Premier Andreas Papandreou's socialist government, claiming it
was staging a cover-up.
   Banker George Koskotas, 34, disappeared Saturday afternoon. A
police officer, speaking on condition of anonymity, said Koskotas
fled abroad on Sunday, apparently by yacht from the seaside village
of Megalo Pefko, 20 miles from Athens.
   Sehiotis said a warrant had been issued for Koskotas' arrest. One
week ago, Koskotas was banned from leaving Greece pending the
outcome of an official enquiry into alleged financial irregularities
at the Bank of Crete, which he controls.
   Sehiotis, whose ministry was responsible for police surveillance
of Koskotas, said he was resigning ``since such (public order
ministry) omissions ... create an issue of political sensitivity.''
   The scandal has shaken the government because of accusations in
Greek newspapers that senior socialist officials were involved in
illegal deals set up by the Bank of Crete.
   The socialists also have been criticized for permitting Koskotas
to build a multi-million dollar banking and media empire in Greece
since 1984 without adequate checks by the central bank on his
financial background.
   The government last week pledged ``absolute clarity'' in
uncovering the scandal and warned there will be ``no pardons' for
members of the ruling Panhellenic Socialist Movement (PASOK) who may
be implicated.
   ``The Greek people are left with the conviction that George
Koskotas was spirited away so that he would not speak. The
responsibility goes all the way to the top of the government
pyramid,'' Constantine Mitsotakis, leader of the New Democracy main
opposition party party, said in a statement demanding the government
resign.
   Koskotas was suspended Oct. 20 as chairman of the Bank of Crete
and indicted on five counts of forgery and embezzlement.
   Last week Koskotas appeared before a district attorney on a
charge of forging documents purporting to show that the Bank of
Crete had $13 million invested with the American brokerage firm
Merrill Lynch.
   He was not detained but given until Nov. 14 to prepare his
defense.
   Koskotas also is accused of forging documents purporting to show
his bank had another $17 million in an account with an American
bank, Irving Trust Corp. Both U.S. firms have said they had no
record of the deposits.
   Koskotas, who holds both American and Greek citizenship, bought a
controlling interest in the Bank of Crete in 1984 after working in
its central Athens branch for six years as an accountant.
   Rival newspapers have claimed Koskotas illegally used Bank of
Crete money to fund his publishing group Grammi, which controls
three daily newspapers, five magazines and a radio station.
   Koskotas resigned Oct. 29 as chairman of Grammi, the day after
the premier's son, Education Minister George Papandreou, denounced
as a forgery a Bank of Crete statement showing a $2.3 million
transfer to a Merrill Lynch account in his name.
   The younger Papandreou showed reporters a letter from a New York
lawyer saying there was no record at Merrill Lynch of such a
transfer.
   Koskotas' parents, brother, wife and five children all have left
Greece during the past week.
""")

    def test_obtain_term_ids_of_a_document(self):
        # Against the ap8889 repository, document 1 is expected to come back
        # as a pair: the document name 'AP881107-0001' and a tuple of
        # integer term ids.
        # NOTE(review): id 0 presumably marks positions with no indexed
        # term (e.g. stopwords) -- confirm against Index.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_term_ids_of_a_document(1)
        self.assertEqual(
            res,
            ('AP881107-0001',
             (147, 771, 0, 78064, 26, 2828, 1283, 92, 126, 147, 175009, 159395,
              771, 55, 0, 0, 2362, 26, 2828, 919, 0, 0, 115, 8, 461, 1624,
              1826, 0, 35, 693, 1198, 0, 195412, 0, 724, 430, 621, 340, 0, 771,
              0, 1502, 20649, 4327, 1620, 9, 247, 0, 0, 866, 0, 643, 0, 2828,
              415, 101374, 1289, 2015, 276, 1246, 0, 24, 29, 586, 0, 272, 0,
              856, 0, 101374, 1826, 2153, 0, 174, 693, 0, 195412, 0, 0, 158999,
              1037, 0, 117013, 137162, 123, 157, 0, 4415, 159395, 0, 0, 2262,
              0, 0, 56, 0, 101374, 251, 0, 0, 189, 101374, 0, 569, 0, 332,
              3095, 1873, 0, 2974, 0, 0, 13, 63630, 0, 485, 461, 91464, 0, 0,
              91, 0, 50405, 0, 0, 156, 159395, 0, 690, 0, 347, 0, 24, 4049, 0,
              101374, 0, 0, 0, 771, 0, 0, 92, 126, 690, 131907, 609, 0, 56, 0,
              88, 2222, 0, 0, 1624, 0, 160974, 0, 9, 0, 0, 436, 0, 2362, 273,
              0, 774, 1620, 13, 0, 263, 0, 887, 339, 176, 0, 0, 0, 91, 0,
              50405, 0, 1620, 0, 0, 0, 289, 0, 1202, 101374, 0, 254, 0, 4543,
              8, 193, 91, 0, 979, 3597, 0, 3095, 0, 791, 0, 2768, 937, 0, 0,
              264, 91, 0, 0, 461, 2464, 0, 9, 0, 0, 1333, 2198, 45622, 0, 4433,
              0, 1624, 0, 678, 0, 0, 0, 0, 3790, 0, 40, 0, 0, 118, 135313,
              1620, 727, 136295, 0, 0, 0, 3408, 0, 2362, 6, 0, 245, 0, 0, 494,
              0, 415, 101374, 0, 2042, 435, 0, 0, 0, 0, 0, 586, 0, 347, 1461,
              0, 0, 116, 0, 0, 354, 0, 0, 9, 145130, 0, 48310, 120515, 51, 0,
              0, 1, 956, 540, 430, 32, 32, 0, 0, 0, 221, 340, 0, 9, 771,
              101374, 0, 1431, 897, 123, 0, 248, 0, 0, 91, 0, 50405, 0, 919, 0,
              114, 667, 0, 70387, 0, 62918, 0, 0, 101374, 261, 0, 0, 311, 212,
              0, 0, 61, 0, 4322, 767, 144897, 0, 62, 0, 0, 91, 0, 50405, 0,
              375, 8, 546, 0, 0, 26, 3527, 543, 3657, 3224, 0, 0, 0, 2223, 0,
              533, 0, 1019, 367, 0, 674, 0, 165, 101374, 0, 0, 436, 0, 4322,
              767, 144897, 0, 62, 0, 91, 0, 0, 445, 8, 0, 0, 550, 0, 0, 26, 91,
              3533, 1512, 285, 0, 0, 543, 0, 0, 0, 0, 0, 192, 0, 0, 1798,
              101374, 0, 328, 0, 26, 0, 2362, 45369, 1390, 0, 156, 167, 0, 0,
              91, 0, 50405, 0, 791, 0, 28, 0, 0, 264, 4415, 1977, 0, 187, 15,
              0, 0, 550, 1483, 273, 0, 247, 101374, 887, 0, 91, 0, 50405, 168,
              0, 303, 0, 515, 34, 77704, 0, 156, 23, 701, 273, 114, 1056, 0, 0,
              409, 490, 101374, 771, 897, 899, 0, 248, 0, 77704, 0, 0, 0, 0,
              1502, 599, 463, 147, 415, 4327, 2512, 0, 0, 70387, 0, 91, 0,
              50405, 221, 62, 0, 17, 41, 8, 1548, 0, 0, 3657, 3224, 550, 0, 0,
              171, 0, 2597, 4327, 62, 4, 0, 512, 0, 0, 1, 73, 532, 7, 0, 0, 0,
              192, 0, 3657, 3224, 0, 0, 0, 1548, 101374, 626, 918, 455, 0, 114,
              219, 0, 0, 245, 3095, 0, 0, 333, 0)))

    def test_obtain_terms_of_a_document(self):
        # Against the ap8889 repository, document 1 is expected to come back
        # as a pair: the document name 'AP881107-0001' and a tuple of term
        # strings, with '' at positions that carry no term.
        # NOTE(review): the terms look stemmed ('minist', 'mondai') --
        # presumably the index's stemmer output; confirm.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        res = self.index_.obtain_terms_of_a_document(1)
        # The raw result is echoed to stderr for debugging.
        print(res, file=sys.stderr)
        self.assertEqual(res, ('AP881107-0001', (
            'minist', 'resign', '', 'gree', 'american', 'banker', 'escap',
            'public', 'order', 'minist', 'tasso', 'sehioti', 'resign',
            'mondai', '', '', 'greek', 'american', 'banker', 'indict', '', '',
            '30', 'million', 'financi', 'scandal',
            'fled', '', 'countri', 'appar', 'aboard', '', 'yacht', '',
            'conserv', 'opposit', 'immedi', 'demand', '', 'resign', '',
            'premier', 'andrea', 'papandr', 'socialist', 'govern', 'claim', '',
            '', 'stage', '', 'cover', '', 'banker', 'georg', 'koskota', '34',
            'disappear', 'saturdai', 'afternoon', '', 'polic', 'offic',
            'speak', '', 'condit', '', 'anonym', '', 'koskota', 'fled',
            'abroad', '', 'sundai', 'appar', '', 'yacht', '', '', 'seasid',
            'villag', '', 'megalo', 'pefko', '20', 'mile', '', 'athen',
            'sehioti', '', '', 'warrant', '', '', 'issu', '', 'koskota',
            'arrest', '', '', 'ago', 'koskota', '', 'ban', '', 'leav', 'greec',
            'pend', '', 'outcom', '', '', 'offici', 'enquiri', '', 'alleg',
            'financi', 'irregular', '', '', 'bank', '', 'crete', '', '',
            'control', 'sehioti', '', 'ministri', '', 'respons', '', 'polic',
            'surveil', '', 'koskota', '', '', '', 'resign', '', '', 'public',
            'order', 'ministri', 'omiss', 'creat', '', 'issu', '', 'polit',
            'sensit', '', '', 'scandal', '', 'shaken', '', 'govern', '', '',
            'accus', '', 'greek', 'newspap', '', 'senior', 'socialist',
            'offici', '', 'involv', '', 'illeg', 'deal', 'set', '', '', '',
            'bank', '', 'crete', '', 'socialist', '', '', '', 'critic', '',
            'permit', 'koskota', '', 'build', '', 'multi', 'million', 'dollar',
            'bank', '', 'media', 'empir', '', 'greec', '', '1984', '', 'adequ',
            'check', '', '', 'central', 'bank', '', '', 'financi',
            'background', '', 'govern', '', '', 'pledg', 'absolut', 'clariti',
            '', 'uncov', '', 'scandal', '', 'warn', '', '', '', '', 'pardon',
            '', 'member', '', '', 'rule', 'panhellen', 'socialist', 'movement',
            'pasok', '', '', '', 'implic', '', 'greek', 'peopl', '', 'left',
            '', '', 'convict', '', 'georg', 'koskota', '', 'spirit', 'awai',
            '', '', '', '', '', 'speak', '', 'respons', 'goe', '', '', 'wai',
            '', '', 'top', '', '', 'govern', 'pyramid', '', 'constantin',
            'mitsotaki', 'leader', '', '', 'new', 'democraci', 'main',
            'opposit', 'parti', 'parti', '', '', '', 'statement', 'demand', '',
            'govern', 'resign', 'koskota', '', 'suspend', 'oct', '20', '',
            'chairman', '', '', 'bank', '', 'crete', '', 'indict', '', 'five',
            'count', '', 'forgeri', '', 'embezzl', '', '', 'koskota', 'appear',
            '', '', 'district', 'attornei', '', '', 'charg', '', 'forg',
            'document', 'purport', '', 'show', '', '', 'bank', '', 'crete', '',
            '13', 'million', 'invest', '', '', 'american', 'brokerag', 'firm',
            'merril', 'lynch', '', '', '', 'detain', '', 'given', '', 'nov',
            '14', '', 'prepar', '', 'defens', 'koskota', '', '', 'accus', '',
            'forg', 'document', 'purport', '', 'show', '', 'bank', '', '',
            '17', 'million', '', '', 'account', '', '', 'american', 'bank',
            'irv', 'trust', 'corp', '', '', 'firm', '', '', '', '', '',
            'record', '', '', 'deposit', 'koskota', '', 'hold', '', 'american',
            '', 'greek', 'citizenship', 'bought', '', 'control', 'interest',
            '', '', 'bank', '', 'crete', '', '1984', '', 'work', '', '',
            'central', 'athen', 'branch', '', 'six', 'year', '', '', 'account',
            'rival', 'newspap', '', 'claim', 'koskota', 'illeg', '', 'bank',
            '', 'crete', 'monei', '', 'fund', '', 'publish', 'group', 'grammi',
            '', 'control', 'three', 'daili', 'newspap', 'five', 'magazin', '',
            '', 'radio', 'station', 'koskota', 'resign', 'oct', '29', '',
            'chairman', '', 'grammi', '', '', '', '', 'premier', 'son', 'educ',
            'minist', 'georg', 'papandr', 'denounc', '', '', 'forgeri', '',
            'bank', '', 'crete', 'statement', 'show', '', '2', '3', 'million',
            'transfer', '', '', 'merril', 'lynch', 'account', '', '', 'name',
            '', 'younger', 'papandr', 'show', 'report', '', 'letter', '', '',
            'new', 'york', 'lawyer', 'sai', '', '', '', 'record', '', 'merril',
            'lynch', '', '', '', 'transfer', 'koskota', 'parent', 'brother',
            'wife', '', 'five', 'children', '', '', 'left', 'greec', '', '',
            'past', '')))

    def test_term(self):
        # Term id 147 in the ap8889 repository is expected to be 'minist'.
        self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
        self.index_ = Index(self.parameters)
        term_text = self.index_.term(147)
        # The looked-up value is echoed to stderr for debugging.
        print(term_text, file=sys.stderr)
        self.assertEqual(term_text, 'minist')

    def test_expression_list(self):
        # expression_list is expected to return a mapping from document
        # name to a count for the given "#uw" expression.
        sampson_dog = self.index_.expression_list("SAMPSON Dog", "#uw", 12)
        self.assertEqual(sampson_dog, {'romeo': 2})

        your_counts = self.index_.expression_list("your", "#uw", 12)
        self.assertEqual(your_counts, {'hamlet': 1, 'romeo': 3})

    def test_run_query(self):
        # The query environment is initialised before querying; the result
        # is compared exactly against two (number, score) pairs.
        self.index_.init_query_env()
        expected = ((3, -4.207161834249701), (2, -4.27477458192466))
        ranked = self.index_.run_query("you")
        self.assertEqual(ranked, expected)

    def test_run_query_doc_names(self):
        # Same query as test_run_query, but the result is expected as a
        # list of document names.
        self.index_.init_query_env()
        names = self.index_.run_query_doc_names("you")
        self.assertEqual(names, ['romeo', 'hamlet'])

    def test_get_ext_document_id(self):
        # Document 1 of the default test index is expected to be 'lorem'.
        external_id = self.index_.get_ext_document_id(1)
        self.assertEqual(external_id, 'lorem')

    def test_expression_list_in_top_docs(self):
        # The document names returned for the query "a" are passed as the
        # last argument; only the fourth argument (2, then 1) differs
        # between the two calls.
        self.index_.init_query_env()
        runs = self.index_.run_query_doc_names("a")

        top_two = self.index_.expression_list_in_top_docs(
            "you", "#uw", 12, 2, runs)
        self.assertEqual(top_two, {'hamlet': 1, 'romeo': 9})

        top_one = self.index_.expression_list_in_top_docs(
            "you", "#uw", 12, 1, runs)
        self.assertEqual(top_one, {'romeo': 9})

    def test_document_length_doc_name(self):
        # Expected lengths of two documents, looked up by name.
        for doc_name, expected_length in (('lorem', 88), ('hamlet', 71)):
            self.assertEqual(
                self.index_.document_length_doc_name(doc_name),
                expected_length)

    def test_document_length_docs_names(self):
        # The combined length for 'lorem' and 'hamlet' is expected to be 159.
        total = self.index_.document_length_docs_names(['lorem', 'hamlet'])
        self.assertEqual(total, 159)

    def test_expand_query(self):
        # The expansion is compared as an ordered list of ten terms.
        expected_terms = [
            'francisco', 'bernardo', 'i', 'at', 'nulla', 'consectetur',
            'in', 'eget', 'and', 'the'
        ]
        expanded = self.index_.expand_query(
            'consectetur adipiscing', 10, 10, ['lorem', 'hamlet'])
        self.assertEqual(expanded, expected_terms)
class TestIndexing(unittest.TestCase):

    """The content of the db_fir was created with scanner by scanning the
    TEST_DATA_DIR/suite_tree folder. If scanner is changed, db_dir must
    be recreated."""

    @classmethod
    def setUpClass(cls):
        # Scan the suite_tree test data once for the whole class, writing
        # the scanner output for 'robot' files into RESULTS_DIR/db_dir.
        cls.db_dir = os.path.join(env.RESULTS_DIR, 'db_dir')
        cls.suite_dir = os.path.join(env.TEST_DATA_DIR, 'suite_tree')
        Scanner().scan(cls.suite_dir, 'robot', cls.db_dir)
        cls.xml_libs = os.path.join(env.RESOURCES_DIR, 'library')

    def setUp(self):
        # Give every test an empty index directory and a fresh Index.
        self.index_dir = os.path.join(env.RESULTS_DIR, 'index_dir')
        # Keep removing until the directory is really gone, pausing 0.1 s
        # after each attempt, before it is created again.
        while os.path.exists(self.index_dir):
            shutil.rmtree(self.index_dir)
            sleep(0.1)
        os.makedirs(self.index_dir)
        self.index = Index(self.db_dir, self.index_dir)

    def test_parse_table_data(self):
        # Parse the BuiltIn table from RESOURCES_DIR and check that three
        # known variables are present in the parsed variable collection.
        t_name = os.path.join(
            env.RESOURCES_DIR,
            'BuiltIn-ca8f2e8d70641ce17b9b304086c19657.json')
        self.index.queue.add(t_name, None, None)
        table_path = os.path.join(env.RESOURCES_DIR, t_name)
        data, _status = self.index.read_table(table_path)
        var, _kw_index = self.index.parse_table_data(data, t_name)
        for expected_var in (u'${/}', '${OUTPUT_FILE}', '@{TEST_TAGS}'):
            self.assertTrue(expected_var in var)

    def test_add_builtin(self):
        # After add_builtin_to_queue the index queue must not be empty.
        self.index.add_builtin_to_queue(self.db_dir)
        queued = self.index.queue.queue
        self.assertTrue(len(queued) > 0)

    def test_read_table(self):
        # Read the test_b table from db_dir and verify which file it
        # describes.
        table_path = os.path.join(self.db_dir, self.test_b_table_name)
        data, _read_status = self.index.read_table(table_path)
        # assertEqual, not assertTrue: assertTrue(value, msg) treats its
        # second argument as the failure message, so the earlier check only
        # verified that the file name was truthy.
        self.assertEqual(data['file_name'], 'test_b.robot')

    def test_get_keywords_resource(self):
        # get_keywords() returns (keyword names, per-keyword argument
        # lists); it is checked for a resource file, a test suite and the
        # Selenium2Library table.
        data = self.get_resource_b()
        expected_kw_list = ['Resource B Keyword 2', 'Resource B Keyword 1']
        expected_arg_list = [['kwb1'], []]
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        # Compare sorted copies: list.sort() sorts in place and returns
        # None, so comparing its results always passed regardless of content.
        self.assertEqual(sorted(arg_list), sorted(expected_arg_list))

        data = self.get_test_a()
        expected_kw_list = ['Test A Keyword', 'Keyword']
        kw_list, arg_list = self.index.get_keywords(data)
        self.assertEqual(kw_list, expected_kw_list)
        self.assertEqual(arg_list, [[], []])

        data = self.get_s2l()
        parsed_kw, arg_list = self.index.get_keywords(data)
        for keyword in ('Set Window Position', 'Get Cookies',
                        'Unselect Frame'):
            self.assertTrue(keyword in parsed_kw)
        self.assertTrue(['name'] in arg_list)
        init_args = ['driver_name', 'alias', 'kwargs', '**init_kwargs']
        self.assertTrue(init_args in arg_list)
        self.assertTrue(['*code'] in arg_list)

    def test_get_imports(self):
        # get_imports() is checked for a resource file, a test suite and a
        # library table.
        data = self.get_resource_b()
        import_list = [self.process_table_name]
        self.assertEqual(self.index.get_imports(data), import_list)

        data = self.get_test_a()
        import_list = [
            self.common_table_name,
            self.resource_a_table_name]
        # Compare sorted copies: list.sort() returns None, so the earlier
        # assertEqual(a.sort(), b.sort()) compared None with None and could
        # never fail.
        self.assertEqual(
            sorted(self.index.get_imports(data)), sorted(import_list))

        data = self.get_s2l()
        self.assertEqual(self.index.get_imports(data), [])

    def test_get_variables(self):
        # get_variables() is checked for a resource file, a test suite, a
        # library table and the common resource.
        data = self.get_resource_b()
        var = ['${RESOURCE_B}']
        self.assertEqual(self.index.get_variables(data), var)

        data = self.get_test_a()
        var = ['${TEST_A}']
        # Compare sorted copies: list.sort() returns None, so the earlier
        # assertEqual(a.sort(), b.sort()) compared None with None and could
        # never fail.
        self.assertEqual(
            sorted(self.index.get_variables(data)), sorted(var))

        data = self.get_s2l()
        self.assertEqual(self.index.get_variables(data), [])

        data = self.get_common()
        self.assertEqual(self.index.get_variables(data), [])

    def test_get_kw_for_index(self):
        # For each fixture builder, get_kw_for_index() must turn the
        # builder's keyword/argument lists into the builder's own records.
        KeywordRecord = namedtuple(
            'KeywordRecord',
            'keyword argument object_name table_name')
        # The unused ``table_name`` pre-assignment was dropped: it was
        # overwritten by the first unpacking before ever being read.
        builders = (
            self.get_resource_b_kw_index,
            self.get_test_a_kw_index,
            self.get_s2l_kw_index,
        )
        for build in builders:
            expected, kw_list, arg_list, object_name, table_name = build(
                KeywordRecord)
            self.assertEqual(
                self.index.get_kw_for_index(
                    kw_list, arg_list, table_name, object_name), expected)

    def test_index_creation_test_a(self):
        # Build the expected index for the test_a table from the fixture
        # builders and compare it with create_index_for_table().
        table_name = self.test_a_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord',
            'keyword argument object_name table_name')
        kw_list = []
        kw_list.extend(self.get_test_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_common_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_resource_a_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_s2l_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_os_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_builtin_kw_index(KeywordRecord)[0])
        kw_list.extend(self.get_LibNoClass_kw_index(KeywordRecord)[0])
        var_list = [
            u'${TEST_A}',
            u'${RESOURCE_A}',
            u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        t_index = {
            'keyword': kw_list,
            'variable': var_list}
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # Compare sorted copies: list.sort() sorts in place and returns
        # None, so the earlier assertEqual(a.sort(), b.sort()) compared None
        # with None and always passed.
        # NOTE(review): because these checks were vacuous before, this is
        # the first time the expected data above is really compared --
        # confirm the fixtures if an assertion now fails.
        self.assertEqual(
            sorted(r_index['variable']), sorted(t_index['variable']))
        self.assertEqual(len(r_index['keyword']), len(t_index['keyword']))
        self.assertEqual(
            sorted(r_index['keyword']), sorted(t_index['keyword']))

    def test_index_creation_test_b(self):
        """The index created for the test_b table matches the fixtures.

        Expected keywords are the concatenation of the record lists from
        the fixture providers below; expected variables are listed inline.
        """
        table_name = self.test_b_table_name
        KeywordRecord = namedtuple(
            'KeywordRecord',
            'keyword argument object_name table_name')
        providers = (
            self.get_test_b_kw_index,
            self.get_common_kw_index,
            self.get_resource_b_kw_index,
            self.get_s2l_kw_index,
            self.get_process_kw_index,
            self.get_builtin_kw_index,
        )
        kw_list = []
        for provider in providers:
            kw_list.extend(provider(KeywordRecord)[0])
        var_list = [
            u'${TEST_B}',
            u'${RESOURCE_B}',
            u'${COMMON_VARIABLE_1}',
            u'${COMMON_VARIABLE_2}'
        ]
        r_index = self.index.create_index_for_table(self.db_dir, table_name)
        # list.sort() sorts in place and returns None, so comparing its
        # return values always passes; compare sorted copies instead.
        self.assertEqual(sorted(r_index['variable']), sorted(var_list))
        self.assertEqual(len(r_index['keyword']), len(kw_list))
        self.assertEqual(sorted(r_index['keyword']), sorted(kw_list))

    def test_index_consturctor(self):
        """index_consturctor leaves exactly one JSON index file in index_dir.

        The file must contain 'variable' and 'keyword' entries, must not
        list 'Test A Keyword' and must list 'Resource A Keyword 1'.
        """
        self.index.index_consturctor(self.resource_a_table_name)
        index_files = os.listdir(self.index_dir)
        self.assertEqual(len(index_files), 1)
        index_path = os.path.join(self.index_dir, index_files[0])
        with open(index_path) as handle:
            data = json.load(handle)
        for section in ('variable', 'keyword'):
            self.assertIn(section, data)
        has_test_a_kw = any(
            kw[0] == 'Test A Keyword' for kw in data['keyword'])
        self.assertFalse(has_test_a_kw)
        has_resource_a_kw = any(
            kw[0] == 'Resource A Keyword 1' for kw in data['keyword'])
        self.assertTrue(has_resource_a_kw)

    def test_get_kw_arguments(self):
        """get_kw_arguments maps raw argument specs to the expected names.

        Each case pairs the input list with the list the result is compared
        against. Two cases compare the result with the input list object
        itself, so the same object is used on both sides of the pair.
        """
        star_args = [u'name', u'*args']
        no_args = []
        cases = (
            ([u'item', u'msg=None'], [u'item', u'msg']),
            (star_args, star_args),
            (no_args, no_args),
            ([u'object=None', u'*args', u'**kwargs'],
             [u'object', u'*args', u'**kwargs']),
            ([u'${kwa1}', '@{list}', '&{kwargs}'],
             [u'kwa1', '*list', '**kwargs']),
        )
        for kw_args, expected in cases:
            result = self.index.get_kw_arguments(kw_args)
            self.assertEqual(result, expected)

    def test_add_xml_libraries(self):
        """add_xml_libraries grows the index queue from 0 to 2 entries."""
        # The queue must start empty so the count below is attributable
        # to add_xml_libraries alone.
        self.assertEqual(len(self.index.queue.queue), 0)
        self.index.add_xml_libraries(self.xml_libs)
        self.assertEqual(len(self.index.queue.queue), 2)

    def test_index_with_xml_libraries(self):
        """An index built from a scan with XML libraries has SwingLibrary keywords.

        The suite is scanned into a separate db directory, an Index is
        built on it, and the single resulting index file is inspected.
        """
        xml_libs = os.path.join(env.RESOURCES_DIR, 'library')
        db_dir_with_xml = os.path.join(env.RESULTS_DIR, 'db_dir_with_xml')
        scanner = Scanner(xml_libs)
        scanner.scan(self.suite_dir, 'robot', db_dir_with_xml)
        index = Index(db_dir_with_xml, self.index_dir, self.xml_libs)
        index.index_consturctor(self.resource_a_table_name)
        index_files = os.listdir(self.index_dir)
        self.assertEqual(len(index_files), 1)
        index_path = os.path.join(self.index_dir, index_files[0])
        with open(index_path) as handle:
            data = json.load(handle)
        # Position 2 of a keyword entry is matched against the library name.
        self.assertTrue(
            any(kw[2] == 'SwingLibrary' for kw in data['keyword'])
        )
        # Position 0 of a keyword entry is matched against the keyword name.
        for kw_name in ('Add Table Cell Selection', 'Select From Popup Menu'):
            self.assertTrue(
                any(kw[0] == kw_name for kw in data['keyword'])
            )

    @property
    def common_table_name_index(self):
        """Path in ``index_dir`` of the file 'index-<common_table_name>'."""
        return os.path.join(
            self.index_dir,
            'index-{0}'.format(self.common_table_name)
        )

    @property
    def test_a_table_name_index(self):
        index = 'index-{0}'.format(self.test_a_table_name)
        return os.path.join(self.index_dir, index)

    @property
    def real_suite_table_name(self):
        """Table name for ``<real_suite_dir>/test/real_suite.robot``.

        The path is passed through ``os.path.normcase`` before being
        handed to ``rf_table_name``.
        """
        return rf_table_name(
            os.path.normcase(
                os.path.join(
                    self.real_suite_dir,
                    'test',
                    'real_suite.robot'
                )
            )
        )

    @property
    def resource_b_table_name(self):
        """Table name for ``<suite_dir>/resource_b.robot`` (normcased path)."""
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'resource_b.robot'))
        )

    @property
    def common_table_name(self):
        """Table name for ``<suite_dir>/common.robot`` (normcased path)."""
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'common.robot'))
        )

    @property
    def test_a_table_name(self):
        """Table name for ``<suite_dir>/test_a.robot`` (normcased path)."""
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_a.robot'))
        )

    @property
    def test_b_table_name(self):
        """Table name for ``<suite_dir>/test_b.robot`` (normcased path)."""
        return rf_table_name(
            os.path.normcase(os.path.join(self.suite_dir, 'test_b.robot'))
        )

    @property
    def resource_a_table_name(self):
        """Table name for ``<suite_dir>/resource_a.robot`` (normcased path)."""
        return rf_table_name(os.path.normcase(
            os.path.join(self.suite_dir, 'resource_a.robot'))
        )

    @property
    def s2l_table_name(self):
        """Table name that ``lib_table_name`` gives for 'Selenium2Library'."""
        return lib_table_name('Selenium2Library')

    @property
    def os_table_name(self):
        """Table name that ``lib_table_name`` gives for 'OperatingSystem'."""
        return lib_table_name('OperatingSystem')

    @property
    def process_table_name(self):
        """Table name that ``lib_table_name`` gives for 'Process'."""
        return lib_table_name('Process')

    @property
    def builtin_table_name(self):
        """Table name that ``lib_table_name`` gives for 'BuiltIn'."""
        return lib_table_name('BuiltIn')

    @property
    def libnoclass_table_name(self):
        """Table name that ``lib_table_name`` gives for 'LibNoClass'."""
        return lib_table_name('LibNoClass')

    def get_resource_b(self):
        """Load the table file named by ``resource_b_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.resource_b_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_common(self):
        """Load the table file named by ``common_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.common_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_test_a(self):
        """Load the table file named by ``test_a_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.test_a_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_s2l(self):
        """Load the table file named by ``s2l_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.s2l_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_os(self):
        """Load the table file named by ``os_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.os_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_process(self):
        """Load the table file named by ``process_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.process_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def getbuiltin(self):
        """Load the table file named by ``builtin_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.builtin_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_libnoclass(self):
        """Load the table file named by ``libnoclass_table_name`` as JSON.

        The file is read from ``self.db_dir`` inside a ``with`` block so
        the handle is closed as soon as parsing finishes.

        :returns: the object decoded by ``json.load``.
        """
        table_path = os.path.join(self.db_dir, self.libnoclass_table_name)
        with open(table_path) as table_file:
            return json.load(table_file)

    def get_s2l_kw_index(self, keywordrecord):
        """Build the expected keyword records for the Selenium2Library table.

        Keywords come from ``self.index.get_keywords`` (first element of its
        result) and arguments from ``self.get_kw_args``; the two lists are
        paired positionally with ``zip``.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        table_data = self.get_s2l()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'Selenium2Library'
        table_name = self.s2l_table_name
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_os_kw_index(self, keywordrecord):
        """Build the expected keyword records for the OperatingSystem table.

        Keywords come from ``self.index.get_keywords`` (first element of its
        result) and arguments from ``self.get_kw_args``; the two lists are
        paired positionally with ``zip``.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        table_data = self.get_os()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'OperatingSystem'
        table_name = self.os_table_name
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_process_kw_index(self, keywordrecord):
        """Build the expected keyword records for the Process table.

        Keywords come from ``self.index.get_keywords`` (first element of its
        result) and arguments from ``self.get_kw_args``; the two lists are
        paired positionally with ``zip``.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        table_data = self.get_process()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'Process'
        table_name = self.process_table_name
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_builtin_kw_index(self, keywordrecord):
        """Build the expected keyword records for the BuiltIn table.

        Keywords come from ``self.index.get_keywords`` (first element of its
        result) and arguments from ``self.get_kw_args``; the two lists are
        paired positionally with ``zip``.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        table_data = self.getbuiltin()
        kw_list = self.index.get_keywords(table_data)[0]
        arg_list = self.get_kw_args(table_data)
        object_name = 'BuiltIn'
        table_name = self.builtin_table_name
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_LibNoClass_kw_index(self, keywordrecord):
        """Build the expected keyword records for the LibNoClass table.

        Keywords come from ``self.index.get_keywords`` (first element of its
        result) and arguments from ``self.get_kw_args``; the two lists are
        paired positionally with ``zip``.

        The records are attributed to 'LibNoClass' and its own table name,
        matching the data that is loaded. Previously they were labelled
        with BuiltIn's object name and table name.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        data = self.get_libnoclass()
        kw_list = self.index.get_keywords(data)[0]
        arg_list = self.get_kw_args(data)
        object_name = 'LibNoClass'
        table_name = self.libnoclass_table_name
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_test_a_kw_index(self, keywordrecord):
        """Build the expected keyword records for the test_a.robot table.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        kw_list = [u'Test A Keyword', u'Keyword']
        arg_list = [None, None]
        table_name = self.test_a_table_name
        object_name = u'test_a.robot'
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_test_b_kw_index(self, keywordrecord):
        """Return the (empty) expected keyword records for the test_b table.

        No keywords are listed for this table, so no records are built and
        ``keywordrecord`` is unused. It is accepted so that every provider
        has the same signature.

        :param keywordrecord: record factory (unused).
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)`` with empty ``records`` and ``kw_list``.
        """
        kw_list = []
        table_name = self.test_b_table_name
        # The object name must match the test_b table it is returned with;
        # it previously said u'test_a.robot'.
        object_name = u'test_b.robot'
        records = []
        return records, kw_list, [None], object_name, table_name

    def get_resource_a_kw_index(self, keywordrecord):
        """Build the expected keyword records for the resource_a.robot table.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        kw_list = [u'Resource A Keyword 1', u'resource A Keyword 2']
        arg_list = ['kwa1', None]
        table_name = self.resource_a_table_name
        object_name = u'resource_a.robot'
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_resource_b_kw_index(self, keywordrecord):
        """Build the expected keyword records for the resource_b.robot table.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, arg_list, object_name,
            table_name)``.
        """
        kw_list = [u'Resource B Keyword 1', u'resource B Keyword 2']
        arg_list = ['kwb1', None]
        table_name = self.resource_b_table_name
        object_name = u'resource_b.robot'
        records = [
            keywordrecord(
                keyword=kw,
                argument=arg,
                object_name=object_name,
                table_name=table_name
            )
            for kw, arg in zip(kw_list, arg_list)
        ]
        return records, kw_list, arg_list, object_name, table_name

    def get_common_kw_index(self, keywordrecord):
        """Build the expected keyword records for the common.robot table.

        Every record is created with ``argument=None``; the returned
        argument list is the single-element list ``[None]``.

        :param keywordrecord: callable accepting ``keyword``, ``argument``,
            ``object_name`` and ``table_name`` keyword arguments.
        :returns: tuple ``(records, kw_list, [None], object_name,
            table_name)``.
        """
        kw_list = [
            u'Common Keyword 2',
            u'common Keyword 1',
            u'Really Long Keyword To Test With Jumping To Keyword Does Not Scroll The Visible Area To A Wrong Place Should There Be More Words'
        ]
        table_name = self.common_table_name
        object_name = u'common.robot'
        records = [
            keywordrecord(
                keyword=kw,
                argument=None,
                object_name=object_name,
                table_name=table_name
            )
            for kw in kw_list
        ]
        return records, kw_list, [None], object_name, table_name

    def get_kw_args(self, data):
        """Flatten the argument names of every keyword in a table.

        :param data: mapping with a ``"keywords"`` entry whose values each
            carry a ``'keyword_arguments'`` list of strings.
        :returns: one flat list of argument names, in keyword iteration
            order, with everything from the first ``'='`` onwards removed.
        """
        # Iterating the values works on Python 2 and 3 alike;
        # dict.iterkeys() exists only on Python 2.
        return [
            arg.split('=')[0]
            for kw in data["keywords"].values()
            for arg in kw['keyword_arguments']
        ]
示例#25
0
 def __init__(self, word2vec_, parameters):
     """Store the word2vec model and build the helpers that need ``parameters``.

     :param word2vec_: object whose ``model`` attribute is kept as
         ``self.word2vec_model``.
     :param parameters: passed unchanged to ``SimpleDocument`` and ``Index``.
     """
     self.word2vec_model = word2vec_.model
     self.single_document = SimpleDocument(parameters)
     self.index_ = Index(parameters)
     # NLTK's English stop-word list, held as a set.
     self.stop_words = set(nltk.corpus.stopwords.words('english'))
示例#26
0
 def test_term(self):
     """Index.term(147) on the ap8889 repository returns 'minist'."""
     # Re-point the index at the ap8889 repository before querying.
     self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)
     term = self.index_.term(147)
     print(term, file=sys.stderr)
     self.assertEqual(term, 'minist')
示例#27
0
    def setUp(self):
        """Create Parameters pointing at the test index and build an Index."""
        self.parameters = Parameters()
        # Relative path, so it resolves against the current working directory.
        self.parameters.params["repo_dir"] = '../index/test_files/index'

        self.index_ = Index(self.parameters)
示例#28
0
 def __init__(self, parameters):
     """Keep ``parameters`` and set up the dictionary, stop words and index.

     :param parameters: stored as ``self.parameters`` and passed to ``Index``.
     """
     self.parameters = parameters
     # enchant dictionary for US English.
     self.enchant_dict = enchant.Dict("en_US")
     # English stop-word list as returned by ``stopwords.words``.
     self.stopwords = stopwords.words('english')
     self.index_ = Index(self.parameters)
示例#29
0
import sys
from datetime import datetime

from index.document import IndexDocument
from index.index import Index
from server.server import fields
from index.utils import kendal_tau
from metrics.utils import avg_sd

# Module-level index used by the query helpers in this module. It is
# constructed and loaded as a side effect of importing the module.
index = Index()
index.load()

def validate_queries(queries):
    """Exit with status 1 at the first query whose field is not in ``fields``.

    :param queries: iterable of queries; element 0 of each query is the
        field name that is checked.
    """
    for position, query in enumerate(queries, start=1):
        field = query[0]
        if field in fields:
            continue
        print('{} is not a valid field (in query #{}). Try one of: {}'.format(field, position, str(fields)))
        sys.exit(1)

def ranking_correlation(queries):
    """Print the Kendall tau correlation between two rankings per query.

    For each query the module-level ``index`` is asked for documents twice,
    once with the last flag True and once with it False (presumably
    TF-IDF versus raw ranking, going by the original variable names —
    confirm against ``Index.get_documents_for_query``).

    :param queries: iterable of queries; elements 0, 1 and 2 of each query
        are passed to ``index.get_documents_for_query`` in that order.
    """
    validate_queries(queries)

    report = 'For query [{}] on field [{}]: {} Kendal Tau correlation. Rankings with {} documents.'
    for query in queries:
        field, text, size = query[0], query[1], query[2]
        with_flag = index.get_documents_for_query(field, text, size, True)
        without_flag = index.get_documents_for_query(field, text, size, False)

        correlation = kendal_tau(with_flag, without_flag)
        print(report.format(text, field, correlation, size))


def query_response_time(queries):
    """Validate ``queries`` with ``validate_queries``.

    NOTE(review): only the validation step is present here; presumably
    the timing logic is missing from this excerpt — confirm against the
    full source.
    """
    validate_queries(queries)
示例#30
0
 def test_obtain_terms_of_a_document(self):
     """obtain_terms_of_a_document(1) on ap8889 returns the expected pair.

     The expected value is a tuple of the document name 'AP881107-0001'
     and a tuple of term strings, many of which are empty strings.
     """
     # Re-point the index at the ap8889 repository before querying.
     self.parameters.params["repo_dir"] = '/scratch/index/indri_5_7/ap8889'
     self.index_ = Index(self.parameters)
     res = self.index_.obtain_terms_of_a_document(1)
     print(res, file=sys.stderr)
     self.assertEqual(res, ('AP881107-0001', (
         'minist', 'resign', '', 'gree', 'american', 'banker', 'escap',
         'public', 'order', 'minist', 'tasso', 'sehioti', 'resign',
         'mondai', '', '', 'greek', 'american', 'banker', 'indict', '', '',
         '30', 'million', 'financi', 'scandal',
         'fled', '', 'countri', 'appar', 'aboard', '', 'yacht', '',
         'conserv', 'opposit', 'immedi', 'demand', '', 'resign', '',
         'premier', 'andrea', 'papandr', 'socialist', 'govern', 'claim', '',
         '', 'stage', '', 'cover', '', 'banker', 'georg', 'koskota', '34',
         'disappear', 'saturdai', 'afternoon', '', 'polic', 'offic',
         'speak', '', 'condit', '', 'anonym', '', 'koskota', 'fled',
         'abroad', '', 'sundai', 'appar', '', 'yacht', '', '', 'seasid',
         'villag', '', 'megalo', 'pefko', '20', 'mile', '', 'athen',
         'sehioti', '', '', 'warrant', '', '', 'issu', '', 'koskota',
         'arrest', '', '', 'ago', 'koskota', '', 'ban', '', 'leav', 'greec',
         'pend', '', 'outcom', '', '', 'offici', 'enquiri', '', 'alleg',
         'financi', 'irregular', '', '', 'bank', '', 'crete', '', '',
         'control', 'sehioti', '', 'ministri', '', 'respons', '', 'polic',
         'surveil', '', 'koskota', '', '', '', 'resign', '', '', 'public',
         'order', 'ministri', 'omiss', 'creat', '', 'issu', '', 'polit',
         'sensit', '', '', 'scandal', '', 'shaken', '', 'govern', '', '',
         'accus', '', 'greek', 'newspap', '', 'senior', 'socialist',
         'offici', '', 'involv', '', 'illeg', 'deal', 'set', '', '', '',
         'bank', '', 'crete', '', 'socialist', '', '', '', 'critic', '',
         'permit', 'koskota', '', 'build', '', 'multi', 'million', 'dollar',
         'bank', '', 'media', 'empir', '', 'greec', '', '1984', '', 'adequ',
         'check', '', '', 'central', 'bank', '', '', 'financi',
         'background', '', 'govern', '', '', 'pledg', 'absolut', 'clariti',
         '', 'uncov', '', 'scandal', '', 'warn', '', '', '', '', 'pardon',
         '', 'member', '', '', 'rule', 'panhellen', 'socialist', 'movement',
         'pasok', '', '', '', 'implic', '', 'greek', 'peopl', '', 'left',
         '', '', 'convict', '', 'georg', 'koskota', '', 'spirit', 'awai',
         '', '', '', '', '', 'speak', '', 'respons', 'goe', '', '', 'wai',
         '', '', 'top', '', '', 'govern', 'pyramid', '', 'constantin',
         'mitsotaki', 'leader', '', '', 'new', 'democraci', 'main',
         'opposit', 'parti', 'parti', '', '', '', 'statement', 'demand', '',
         'govern', 'resign', 'koskota', '', 'suspend', 'oct', '20', '',
         'chairman', '', '', 'bank', '', 'crete', '', 'indict', '', 'five',
         'count', '', 'forgeri', '', 'embezzl', '', '', 'koskota', 'appear',
         '', '', 'district', 'attornei', '', '', 'charg', '', 'forg',
         'document', 'purport', '', 'show', '', '', 'bank', '', 'crete', '',
         '13', 'million', 'invest', '', '', 'american', 'brokerag', 'firm',
         'merril', 'lynch', '', '', '', 'detain', '', 'given', '', 'nov',
         '14', '', 'prepar', '', 'defens', 'koskota', '', '', 'accus', '',
         'forg', 'document', 'purport', '', 'show', '', 'bank', '', '',
         '17', 'million', '', '', 'account', '', '', 'american', 'bank',
         'irv', 'trust', 'corp', '', '', 'firm', '', '', '', '', '',
         'record', '', '', 'deposit', 'koskota', '', 'hold', '', 'american',
         '', 'greek', 'citizenship', 'bought', '', 'control', 'interest',
         '', '', 'bank', '', 'crete', '', '1984', '', 'work', '', '',
         'central', 'athen', 'branch', '', 'six', 'year', '', '', 'account',
         'rival', 'newspap', '', 'claim', 'koskota', 'illeg', '', 'bank',
         '', 'crete', 'monei', '', 'fund', '', 'publish', 'group', 'grammi',
         '', 'control', 'three', 'daili', 'newspap', 'five', 'magazin', '',
         '', 'radio', 'station', 'koskota', 'resign', 'oct', '29', '',
         'chairman', '', 'grammi', '', '', '', '', 'premier', 'son', 'educ',
         'minist', 'georg', 'papandr', 'denounc', '', '', 'forgeri', '',
         'bank', '', 'crete', 'statement', 'show', '', '2', '3', 'million',
         'transfer', '', '', 'merril', 'lynch', 'account', '', '', 'name',
         '', 'younger', 'papandr', 'show', 'report', '', 'letter', '', '',
         'new', 'york', 'lawyer', 'sai', '', '', '', 'record', '', 'merril',
         'lynch', '', '', '', 'transfer', 'koskota', 'parent', 'brother',
         'wife', '', 'five', 'children', '', '', 'left', 'greec', '', '',
         'past', '')))
示例#31
0
 def __init__(self, parameters):
     """Build the index and load the English stop words.

     :param parameters: passed unchanged to ``Index``.
     """
     self.index_ = Index(parameters)
     # NLTK's English stop-word list, held as a set.
     self.stop_words = set(nltk.corpus.stopwords.words('english'))
示例#32
0
class Neighborhood:
    """Find, merge, prune and rank groups of similar words in a document.

    Similarity comes from a word2vec model, stemming checks and TF-IDF
    weights come from ``Index``, and document words come from
    ``SimpleDocument``.
    """

    def __init__(self, word2vec_, parameters):
        """Store the word2vec model and build the helpers.

        :param word2vec_: object whose ``model`` attribute is the word2vec
            model to query.
        :param parameters: passed unchanged to ``SimpleDocument`` and
            ``Index``.
        """
        self.word2vec_model = word2vec_.model
        self.single_document = SimpleDocument(parameters)
        self.index_ = Index(parameters)
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def find_nearest_neighbor_in_a_list(self, unigram, other_unigrams,
                                        min_distance, neighbor_size):
        """Collect at most ``neighbor_size`` words similar to ``unigram``.

        A candidate is accepted when it differs from ``unigram``, is not
        already collected, is in the model vocabulary, and its similarity
        to ``unigram`` is strictly greater than ``min_distance``.

        :returns: list of accepted words in the order they were found;
            empty when ``unigram`` is not in the vocabulary.
        """
        neighbor = []
        vocab = self.word2vec_model.wv.vocab
        if unigram not in vocab:
            return neighbor
        for other_unigram in other_unigrams:
            # Stop once the list is full. The previous ``>`` test let the
            # list grow to neighbor_size + 1, and find_significant_neighbors
            # then discarded it for not being exactly neighbor_size long.
            if len(neighbor) >= neighbor_size:
                break
            # Compare by value: ``is not`` only worked while both strings
            # happened to be the same object.
            if other_unigram == unigram or other_unigram in neighbor:
                continue
            if other_unigram not in vocab:
                continue
            sim = self.word2vec_model.similarity(unigram, other_unigram)
            if sim > min_distance:
                neighbor.append(other_unigram)
        return neighbor

    def find_significant_neighbors(self, doc_words, min_distance,
                                   neighbor_size):
        """Return the distinct neighbor lists that reached ``neighbor_size``.

        Every in-vocabulary word of ``doc_words`` is used as a seed. A seed
        counts only when its neighbor list has exactly ``neighbor_size``
        entries. Duplicate lists are removed through a set, so the order of
        the result is not defined.
        """
        vocab = self.word2vec_model.wv.vocab
        significant_neighbors = []
        for other_unigram in doc_words:
            if other_unigram in vocab:
                neighbor = self.find_nearest_neighbor_in_a_list(
                    other_unigram, doc_words, min_distance, neighbor_size)
                if len(neighbor) == neighbor_size:
                    significant_neighbors.append(neighbor)
        return [
            list(x) for x in set(tuple(x) for x in significant_neighbors)
        ]

    @staticmethod
    def merge_close_neighbors(neighbors, minimum_merge_intersection):
        """Merge neighbor groups that share enough words.

        Each group absorbs every later group whose intersection with it has
        at least ``minimum_merge_intersection`` elements. Absorbed groups
        are not visited again. This is a single pass: groups before the
        current one are not re-checked after a merge.

        :param neighbors: sequence of word collections; it is not modified.
        :returns: list of sets, one per surviving group.
        """
        # Work on a copy so the caller's list is not emptied as a side
        # effect of merging.
        pending = list(neighbors)
        merged_neighbors = []
        i = 0
        while i < len(pending):
            merged = set(pending[i])
            j = i + 1
            while j < len(pending):
                shared = merged.intersection(pending[j])
                if len(shared) >= minimum_merge_intersection:
                    merged = merged.union(pending[j])
                    del pending[j]
                else:
                    j += 1
            merged_neighbors.append(merged)
            i += 1
        return merged_neighbors

    def find_significant_merged_neighbors(self, doc_words, min_distance,
                                          neighbor_size,
                                          minimum_merge_intersection):
        """Find significant neighbor lists and merge the overlapping ones."""
        significant_neighbors = self.find_significant_neighbors(
            doc_words, min_distance, neighbor_size)
        return self.merge_close_neighbors(
            significant_neighbors, minimum_merge_intersection)

    def remove_stopwords_neighbors(self, neighbors, max_stop_words):
        """Drop stop-word-heavy groups and strip stop words from the rest.

        A group containing at least ``max_stop_words`` distinct stop words
        is removed entirely. Otherwise its stop words are removed from it.
        ``neighbors`` is modified in place and also returned.
        """
        i = 0
        while i < len(neighbors):
            stop_hits = set(neighbors[i]).intersection(self.stop_words)
            if len(stop_hits) >= max_stop_words:
                del neighbors[i]
            else:
                # Iterate a copy because the group shrinks while we scan.
                for word in neighbors[i].copy():
                    if word in stop_hits:
                        neighbors[i].remove(word)
                i += 1
        return neighbors

    def remove_stemmed_similar_words_neighbors(self, neighbors):
        """Replace each group with a set free of same-stem duplicates.

        ``neighbors`` is modified in place and also returned.
        """
        for k, neighbor in enumerate(neighbors):
            neighbors[k] = set(
                self.remove_stemmed_similar_words_list(list(neighbor)))
        return neighbors

    def remove_stemmed_similar_words_list(self, l):
        """Remove later words sharing a stem with an earlier word.

        The stem check is delegated to
        ``self.index_.check_if_have_same_stem``. ``l`` is modified in place
        and also returned.
        """
        i = 0
        while i < len(l):
            j = i + 1
            while j < len(l):
                if self.index_.check_if_have_same_stem(l[i], l[j]):
                    del l[j]
                else:
                    j += 1
            i += 1
        return l

    def find_significant_pruned_neighbors(self, doc_words, min_distance,
                                          neighbor_size,
                                          minimum_merge_intersection,
                                          max_stop_words):
        """Run the full find / merge / stop-word / stemming pipeline.

        ``doc_words`` is de-duplicated first.

        :returns: list of word sets.
        """
        doc_words = list(set(doc_words))
        significant_neighbors = self.find_significant_merged_neighbors(
            doc_words, min_distance, neighbor_size,
            minimum_merge_intersection)
        significant_neighbors = self.remove_stopwords_neighbors(
            significant_neighbors, max_stop_words)
        significant_neighbors = self.remove_stemmed_similar_words_neighbors(
            significant_neighbors)
        return significant_neighbors

    def find_significant_pruned_neighbors_in_doc(self, doc_file_name,
                                                 min_distance, neighbor_size,
                                                 minimum_merge_intersection,
                                                 max_stop_words):
        """Read the words of ``doc_file_name`` and run the pruning pipeline."""
        doc_words = self.single_document.get_words(doc_file_name)
        return self.find_significant_pruned_neighbors(
            doc_words, min_distance, neighbor_size, minimum_merge_intersection,
            max_stop_words)

    def find_significant_neighbors_weight(self, doc_words,
                                          significant_neighbors_ind):
        """Map each group index to the mean TF-IDF of the group's terms.

        TF-IDF values come from ``self.index_.tfidf(term, doc_words)``.
        """
        return {
            ind: np.mean(
                [self.index_.tfidf(term, doc_words) for term in neighbor])
            for ind, neighbor in significant_neighbors_ind.items()
        }

    @staticmethod
    def sort_significant_neighbors(significant_neighbors_weight,
                                   significant_neighbors_ind):
        """Return ``(group, weight)`` pairs sorted by weight, highest first."""
        sorted_w = sorted(significant_neighbors_weight.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        return [(significant_neighbors_ind[k], v) for (k, v) in sorted_w]

    @staticmethod
    def index_neighbors(neighbors):
        """Return a dict mapping each group's position to the group."""
        return dict(enumerate(neighbors))

    def run(self, doc_file_name, min_distance, neighbor_size,
            minimum_merge_intersection, max_stop_words):
        """Find, weight and rank the neighbor groups of a document.

        The size of each intermediate result is printed.

        :returns: list of ``(group, weight)`` pairs, highest weight first.
        """
        doc_words = self.single_document.get_words(doc_file_name)
        print("doc_words length =", len(doc_words))
        significant_neighbors = self.find_significant_pruned_neighbors(
            doc_words, min_distance, neighbor_size, minimum_merge_intersection,
            max_stop_words)
        print("significant_neighbors length =", len(significant_neighbors))
        significant_neighbors_ind = self.index_neighbors(significant_neighbors)
        print("significant_neighbors_ind length =",
              len(significant_neighbors_ind))
        significant_neighbors_weight = self.find_significant_neighbors_weight(
            doc_words, significant_neighbors_ind)
        print("significant_neighbors_weight length =",
              len(significant_neighbors_weight))
        sorted_significant_neighbors = self.sort_significant_neighbors(
            significant_neighbors_weight, significant_neighbors_ind)
        print("sorted_significant_neighbors length =",
              len(sorted_significant_neighbors))
        return sorted_significant_neighbors