def test_add_file_raises_file_not_found_error(self):
    """add_file must propagate a FileNotFoundError raised by open()."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()) as open_mock:
        open_mock.side_effect = FileNotFoundError
        idx = Index()
        with self.assertRaises(FileNotFoundError):
            idx.add_file('doc1')
def test_add_file_raises_permission_error(self):
    """add_file must propagate a PermissionError raised by open()."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()) as open_mock:
        open_mock.side_effect = PermissionError
        idx = Index()
        with self.assertRaises(PermissionError):
            idx.add_file('doc1')
def test_add_file_raises_is_a_directory_error(self):
    """add_file must propagate an IsADirectoryError raised by open()."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()) as open_mock:
        open_mock.side_effect = IsADirectoryError
        idx = Index()
        with self.assertRaises(IsADirectoryError):
            idx.add_file('doc1')
def test_add_file_with_empty_file_name(self):
    """An empty file name still goes through open(); its error propagates."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        with mock.patch('builtins.open', mock.mock_open()) as open_mock:
            open_mock.side_effect = FileNotFoundError
            idx = Index()
            with self.assertRaises(FileNotFoundError):
                idx.add_file('')
def test_add_file_with_stop_words_and_stemming_enabled(self):
    """Stop-word removal and stemming are both applied while indexing."""
    data = 'this is some data that needs stemming ;continue hello world'
    expected = defaultdict(list, {
        'this': [1], 'some': [1], 'data': [1], 'that': [1], 'need': [1],
        'stem': [1], 'continu': [1], 'hello': [1], 'world': [1],
    })
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = True
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = True
        with mock.patch('builtins.open', mock.mock_open(read_data=data)):
            idx = Index()
            idx.add_file('doc1')
            assert idx._index == expected
def test_add_file_with_remove_stop_word_enabled(self):
    """Only non-stop-words survive indexing when removal is enabled."""
    data = 'at some I am here before as so me'
    expected = defaultdict(list, {'some': [1], 'here': [1], 'before': [1]})
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = True
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        with mock.patch('builtins.open', mock.mock_open(read_data=data)):
            idx = Index()
            idx.add_file('doc1')
            assert idx._index == expected
def test_add_file_with_identic_words(self):
    """Repeated occurrences of a word collapse into a single posting."""
    data = 'data data data'
    expected = {'data': [1]}
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        with mock.patch('builtins.open', mock.mock_open(read_data=data)):
            idx = Index()
            idx.add_file('doc1')
            assert idx._index == expected
def test_index_structure_with_stemming_disabled(self):
    """Words are indexed verbatim when stemming is turned off."""
    data = 'cycling continuos continue'
    expected = {'cycling': [1], 'continuos': [1], 'continue': [1]}
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        with mock.patch('builtins.open', mock.mock_open(read_data=data)):
            idx = Index()
            idx.add_file('doc1')
            assert idx._index == expected
def test_empty_query(self):
    """An empty query string yields no result (None)."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        idx = Index()
        idx._index = defaultdict(list, {'data': [1, 3], 'some': [1, 2],
                                        'hello': [1], 'world': [3]})
        idx._files = ['doc1', 'doc2', 'doc3', 'doc4']
        assert idx.get_result_for_query('') is None
def test_wrong_query(self):
    """A malformed query makes get_result_for_query raise ValueError."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        idx = Index()
        idx._index = defaultdict(list, {'data': [1, 3], 'some': [1, 2],
                                        'hello': [1], 'world': [3]})
        idx._files = ['doc1', 'doc2', 'doc3', 'doc4']
        with self.assertRaises(ValueError):
            idx.get_result_for_query('data. some hello this is wrong query !')
def test_query_with_stemming_enabled(self):
    """Query terms are stemmed before lookup when stemming is enabled."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = True
        idx = Index()
        idx._index = defaultdict(list, {'continuo': [1, 3], 'cycl': [1, 2],
                                        'hello': [1], 'world': [3]})
        idx._files = ['doc1', 'doc2', 'doc3', 'doc4']
        assert idx.get_result_for_query('continuos && cycling') == ['doc1']
def test_add_file_with_wrong_type(self):
    """Every non-string file name is rejected with ValueError."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        with mock.patch('builtins.open', mock.mock_open()):
            idx = Index()
            # Same four rejected inputs as before, driven by one loop.
            for bad_name in (None, [], True, {}):
                with self.assertRaises(ValueError):
                    idx.add_file(bad_name)
def test_add_file_correct_input_with_more_files(self):
    """Distinct files are tracked in order; re-adding one raises IndexError."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()):
        idx = Index()
        for name in ('doc1', 'doc2', 'doc3'):
            idx.add_file(name)
        with self.assertRaises(IndexError):
            idx.add_file('doc3')
        assert idx._files == ['doc1', 'doc2', 'doc3']
def assign_indexes(self, publications):
    """Attach a 'WOS' Index to each publication that carries a 'WOK'
    identifier but no WOS index yet.

    Publications are keyed by their UT (WOK identifier value); the
    matching editions are resolved in one batch via self._find_editions.
    """
    pub_by_id = {}
    for pub in publications:
        # Skip publications that already have a WOS index assigned.
        if list(Index.find_by_type(pub.indexes, 'WOS')):
            continue
        uts = list(Identifier.find_by_type(pub.identifiers, 'WOK'))
        if not uts:
            continue
        pub_by_id[uts[0].value] = pub
    editions = self._find_editions(pub_by_id.keys())
    # BUG FIX: dict.iteritems() exists only on Python 2 and raises
    # AttributeError on Python 3; items() behaves correctly on both.
    for ut, edition in editions.items():
        pub_by_id[ut].indexes.append(Index(edition, type='WOS'))
def create_index(name, desc, final_photo, projectName):
    """Create an Index card under the named project and persist it."""
    project = get_project_by_name(projectName)
    card = Index(
        project_id=project.project_id,
        name=name,
        desc=desc,
        index_url=final_photo,
    )
    db.session.add(card)
    db.session.commit()
def test_add_file_raises_index_error(self):
    """Adding the same file a second time raises IndexError."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()):
        idx = Index()
        idx.add_file('doc1')
        with self.assertRaises(IndexError):
            idx.add_file('doc1')
def main():
    """Build an index from files.txt (if present) plus interactive input."""
    # Create a new index; a bad config file makes Index() raise ValueError.
    try:
        index = Index()
    except ValueError as e:
        # BUG FIX: the original only printed the error and fell through,
        # leaving `index` unbound so every later use raised NameError.
        print(e)
        return
    # Seed the index from files.txt, skipping blank lines and '#' comments;
    # only the first whitespace-separated token of each line is the path.
    try:
        with open('files.txt') as file:
            for line in file:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue
                add_file_to_index(line.split()[0], index)
    except FileNotFoundError:
        print('files.txt was not found, continuing with manual file addition.')
    # Let the user add more files by hand until an empty line is entered.
    prompt = 'File to add to index (or simply press enter for query): '
    while (file := input(prompt)) != "":
        add_file_to_index(file, index)
def test_add_file_correct_index_structure_with_more_files(self):
    """Postings from several files accumulate under the same term."""
    expected = {'some': [1], 'data': [1, 2], 'here': [2]}
    with mock.patch('model.index.Config', autospec=True, spec_set=True) as mock_config:
        cfg = mock_config.return_value
        cfg.remove_stopwords.return_value = False
        cfg.language.return_value = 'english'
        cfg.use_stemming.return_value = False
        idx = Index()
        for doc, data in (('document1', 'some data'),
                          ('document2', 'data here')):
            with mock.patch('builtins.open', mock.mock_open(read_data=data)):
                idx.add_file(doc)
        assert idx._index == expected
def _parse_csv(self, content, encoding='UTF-8'):
    """Parse a SCOPUS CSV export and yield Publication objects.

    `content` is the raw CSV text (possibly with a BOM); one Publication
    is yielded per data row, populated with authors, source title,
    volume/issue/pages, identifiers (SCOPUS eid, ISSN, ISBN, DOI) and a
    SCOPUS index entry.
    """
    csv = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                encoding=encoding)

    def empty_to_none(s):
        # Normalize missing or blank strings to None.
        # FIX: identity comparison (`is None`) instead of `== None`.
        if s is None:
            return None
        s = s.strip()
        if len(s) == 0:
            return None
        return s

    def list_remove_empty(l):
        # Drop empty/blank items, stripping the ones that remain.
        r = []
        for x in l:
            v = empty_to_none(x)
            if v:
                r.append(v)
        return r

    def to_num(x):
        # Blank numeric fields count as 0.
        x = x.strip()
        if len(x) == 0:
            return 0
        return int(x)

    for line in csv:
        if line['Authors'] == '[No author name available]':
            authors = []
        else:
            # SCOPUS separates both surnames and given-name initials of the
            # authors with commas, which is ambiguous.  A list such as
            #
            #   Brejová, B., Brown, D.G., Li, M., Vinař, T.
            #
            # is therefore preprocessed: the comma that ends each complete
            # name (the one after a period) is replaced with a semicolon,
            # and the name parser is told to split on semicolons.
            line['Authors'] = re.sub(r'\.,', ';', line['Authors'])
            authors = Author.parse_sn_first_list(line['Authors'], separator=u';')
        pub = Publication(line['Title'], authors, to_num(line['Year']))
        source_title = empty_to_none(line['Source title'])
        if source_title:
            # Book series carry an '(including subseries ...)' suffix; if
            # one was stripped, the title names a series, otherwise a venue.
            source_title, replacements = re.subn(
                r' \(including subseries [^)]+\)', '', source_title)
            source_title = source_title.strip()
            if replacements:
                pub.series = source_title
            else:
                pub.published_in = source_title
        pub.volume = empty_to_none(line['Volume'])
        pub.issue = empty_to_none(line['Issue'])
        pub.pages = make_page_range(empty_to_none(line['Page start']),
                                    empty_to_none(line['Page end']))
        # For reasons unknown SCOPUS now returns a combined 'DOILink' field
        # that concatenates the DOI and the link.  This hack splits them on
        # the quote character that separates the two parts.
        splits = line['DOILink'].split('"')
        if len(splits) > 1:
            line['Link'] = splits[1]
            line['DOI'] = splits[0]
        else:
            line['Link'] = splits[0]
            line['DOI'] = None
        pub.times_cited = empty_to_none(line['Cited by'])
        pub.article_no = empty_to_none(line['Art. No.'])
        pub.publisher = empty_to_none(line['Publisher'])
        url = empty_to_none(line['Link'])
        if url:
            pub.source_urls.append(
                URL(url, type='SCOPUS', description='SCOPUS'))
            url_parts = urlparse(url)
            url_query = parse_qs(url_parts.query)
            # BUG FIX: the original tested `and len`, i.e. the builtin
            # function object, which is always truthy; test the actual
            # list of 'eid' values instead.
            if 'eid' in url_query and len(url_query['eid']) > 0:
                pub.identifiers.append(
                    Identifier(url_query['eid'][0], type='SCOPUS'))
        for issn in list_remove_empty(line['ISSN'].split(u';')):
            pub.identifiers.append(Identifier(issn, type='ISSN'))
        for isbn in list_remove_empty(line['ISBN'].split(u';')):
            pub.identifiers.append(Identifier(isbn, type='ISBN'))
        doi = empty_to_none(line['DOI'])
        if doi:
            pub.identifiers.append(Identifier(doi, type='DOI'))
        pub.indexes.append(Index('SCOPUS', type='SCOPUS'))
        yield pub
def test_add_file_correct_input_one_file(self):
    """add_file opens exactly the file it was given, exactly once."""
    with mock.patch('model.index.Config', autospec=True, spec_set=True), \
            mock.patch('builtins.open', mock.mock_open()) as open_mock:
        idx = Index()
        idx.add_file('document')
        open_mock.assert_called_once_with('document')
def test_index_raises_value_error_when_wrong_config_file(self, mock_config):
    """Index() propagates the ValueError coming from a broken Config."""
    mock_config.side_effect = ValueError
    with self.assertRaises(ValueError):
        Index()
class Application(object):
    """Wires together the index, database, worker queue and search module.

    NOTE(review): the original source had its indentation collapsed; the
    statement nesting below is a reconstruction — verify against history.
    """

    # Class-level configuration shared by all instances.
    config = Config

    def __init__(self):
        # Order matters: the DB must exist before the queue workers and
        # the search module (both receive a back-reference to us) start.
        self.index = Index()
        self.__init_db()
        self.queue = TaskQueue(application=self,
                               num_workers=self.config.WORKER_THREADS)
        self.search_module = Search(application=self)

    def __init_db(self):
        # Create the SQLite engine, the schema and the session factory.
        # The database file lives under ./datastore next to this module.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        print(dir_path)
        self.db_path = "sqlite:///%s/datastore/%s" % \
            (dir_path, self.config.DB)
        self.engine = create_engine(self.db_path)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def init(self):
        # Seed the processing queue: on a fresh database enqueue the
        # configured ROOT_NODES; otherwise re-enqueue every page that has
        # not reached the PROCESSED state yet.
        pages_added = 0
        with session_scope(self.Session) as session:
            if session.query(Page).count() == 0:
                for i in self.config.ROOT_NODES:
                    page = Page(url=i)
                    session.add(page)
                    # Commit before reading page_id so it is populated.
                    session.commit()
                    self.queue.add_page(page.page_id)
                    pages_added += 1
            else:
                for page in session.query(Page).filter(
                        Page.state != Page.State.PROCESSED).all():
                    self.queue.add_page(page.page_id)
                    pages_added += 1
            session.expunge_all()
        if pages_added != 0:
            print('No movie data in our system. We need to scrape IMDB for data...')
            print('Started pipeline! Added %s root pages to processing queue'
                  % pages_added)
            # Block until the queue workers have drained everything.
            self.queue.join()
            print("Finished processing!")
            self.search_module.build_index()

    def search(self, search_str):
        # Delegate the query string to the search module.
        return self.search_module.search(search_str)

    def rebuild(self):
        # Drop all stored data and run the pipeline from scratch.
        self.clear()
        self.init()

    def clear(self):
        # Reset the in-memory index, wipe every table (children first via
        # reversed sorted_tables), recreate the schema, then delete every
        # file/directory in the on-disk object store.
        self.index.reset()
        meta = Base.metadata
        with session_scope(self.Session) as session:
            for table in reversed(meta.sorted_tables):
                session.execute(table.delete())
            session.commit()
        Base.metadata.create_all(self.engine)
        dir_path = os.path.dirname(os.path.realpath(__file__))
        folder = "%s/datastore/object_store" % dir_path
        for cur_file in os.listdir(folder):
            file_path = os.path.join(folder, cur_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                # Best-effort cleanup: report and keep deleting the rest.
                print(e)
def __init__(self):
    """Wire up the index, the database, the worker queue and search.

    Assumes the enclosing class provides `config` with a WORKER_THREADS
    attribute and a name-mangled `__init_db` method — confirm in context.
    """
    self.index = Index()
    self.__init_db()
    self.queue = TaskQueue(application=self,
                           num_workers=self.config.WORKER_THREADS)
    self.search_module = Search(application=self)
def entries_to_publications(self, entries):
    """Convert SCOPUS JSON entry data into internal Publication objects.

    One Publication is yielded per entry, populated with authors, year,
    citation count, venue/series, page/volume/issue data, identifiers
    (DOI, ISBN, ISSN, SCOPUS eid) and a SCOPUS index entry.
    """

    def empty_to_none(s):
        # Normalize missing or blank strings to None.
        if s is None:
            return None
        s = s.strip()
        if len(s) == 0:
            return None
        return s

    def exists_to_none(d, key):
        # Fetch d[key] if present, unwrapping the SCOPUS '$' wrapper;
        # list values are unwrapped element-wise.
        if key in d:
            if type(d[key]) is list:
                return [empty_to_none(x['$']) for x in d[key]]
            else:
                return empty_to_none(d[key])
        else:
            return None

    def append_identifier(d, key, obj, type):
        # Append one Identifier per non-empty value found under `key`.
        ids = exists_to_none(d, key)
        if ids:
            if isinstance(ids, list):
                for id in ids:
                    obj.identifiers.append(Identifier(id, type=type))
            else:
                obj.identifiers.append(Identifier(ids, type=type))

    for entry in entries:
        author_count = int(entry['author-count']['$'])
        if author_count == 0:
            authors = []
        else:
            authors = self.authors_from_json(entry['author'])
        # 'prism:coverDate' is 'YYYY-MM-DD'; keep only the year.
        year = empty_to_none(entry['prism:coverDate'])
        if year:
            year = int(year.split('-')[0])
        pub = Publication(empty_to_none(entry['dc:title']), authors, year)
        pub.times_cited = empty_to_none(entry['citedby-count'])
        source_title = exists_to_none(entry, 'prism:publicationName')
        if source_title:
            # If an '(including subseries ...)' suffix was stripped, the
            # title names a series, otherwise the publication venue.
            source_title, replacements = re.subn(INCLUDING_RE, '',
                                                 source_title)
            source_title = source_title.strip()
            if replacements:
                pub.series = source_title
            else:
                pub.published_in = source_title
        url = self.find_next_url(entry['link'], ref='scopus')
        pub.source_urls.append(URL(url, type='SCOPUS', description='SCOPUS'))
        citedby_url = self.find_next_url(entry['link'], ref='scopus-citedby')
        if citedby_url is not None:
            pub.cite_urls.append(URL(citedby_url, type='SCOPUS',
                                     description='SCOPUS'))
        # BUG FIX: 'prism:pageRange' was assigned twice in the original;
        # the duplicate assignment is removed.
        pub.pages = exists_to_none(entry, 'prism:pageRange')
        pub.volume = exists_to_none(entry, 'prism:volume')
        pub.issue = exists_to_none(entry, 'prism:issueIdentifier')
        append_identifier(entry, 'prism:doi', pub, 'DOI')
        append_identifier(entry, 'prism:isbn', pub, 'ISBN')
        append_identifier(entry, 'prism:issn', pub, 'ISSN')
        append_identifier(entry, 'eid', pub, 'SCOPUS')
        pub.indexes.append(Index('SCOPUS', type='SCOPUS'))
        yield pub