def __insert_journal(self):
    """Add the journals listed in ``jl.txt`` to the venue index.

    Every non-blank line of ``jl.txt`` is a ``~``-separated record whose
    fields 0, 1, 2, 5 and 6 are stored as ``key``, ``title``, ``year``,
    ``url`` and ``ee``; the remaining document fields are stored empty.
    ``jl.txt`` is deleted once the documents have been committed.

    Raises:
        IndexError: if a non-blank record has fewer than seven fields.
            The pending writes are cancelled and ``jl.txt`` is kept.
    """
    cprint('Journal commit started', 'pink')
    vix = open_dir(self.ven_index_path)
    print('\tVenues count without journal: ' + str(vix.doc_count()))
    # Used as a context manager the writer commits on success and cancels
    # on error, so a bad record cannot leave the index write-locked.
    with vix.writer() as writer, open('jl.txt', 'r') as f:
        for line in f:
            # Strip the line terminator, otherwise the last field used
            # (``ee``) would be stored with a trailing newline.
            fields = line.rstrip('\r\n').split('~')
            if fields == ['']:
                continue  # tolerate blank lines
            writer.add_document(
                key=fields[0],
                pubtype='journal',
                title=fields[1],
                year=fields[2],
                url=fields[5],
                ee=fields[6],
                author='',
                publisher='',
                isbn='',
            )
    print('\tVenues count with journal: ' + str(vix.doc_count()))
    cprint('Journal commit ended', 'purple')
    os.remove('jl.txt')
def print_alternative(alt):
    """Print the other publications contained in a given venue.

    At most the first ten entries of ``alt`` are considered; empty strings
    among them are skipped. When ``alt`` holds more than ten entries an
    ellipsis line is printed at the end.
    """
    cprint('Pubs Included', *alt_obj, start='\t')
    for pub in alt[:10]:
        if pub == '':
            continue
        cprint(pub.strip(), *argument, start='\t- ')
    if len(alt) > 10:
        cprint(' ...', *argument, start='\t- ')
def menu_text(*args, start='', end='\n'):
    """Print the ASCII-art search-engine banner through ``cprint``.

    Args:
        *args: extra positional arguments forwarded to ``cprint``.
        start: forwarded to ``cprint`` as ``start`` (previously ignored).
        end: forwarded to ``cprint`` as ``end`` (previously ignored).
    """
    # Raw string: the banner is full of backslashes that are not meant to be
    # escape sequences; the resulting text is unchanged.
    cprint(
        r""" ___ ___ __ ___ ____ __ ____ _ / _ \/ _ )/ / / _ \ / __/__ ___ _________/ / / __/__ ___ _(_)__ ___ / // / _ / /__/ ___/ _\ \/ -_) _ `/ __/ __/ _ \ / _// _ \/ _ `/ / _ \/ -_) /____/____/____/_/ /___/\__/\_,_/_/ \__/_//_/ /___/_//_/\_, /_/_//_/\__/ /___/ """,
        *args, start=start, end=end)
def welcome_text(*args, start='', end='\n'):
    """Print the block-letter welcome banner through ``cprint``.

    Args:
        *args: extra positional arguments forwarded to ``cprint``.
        start: forwarded to ``cprint`` as ``start`` (previously ignored).
        end: forwarded to ``cprint`` as ``end`` (previously ignored).
    """
    cprint(
        """ ██╗ ██╗███████╗██╗ ██████╗ ██████╗ ███╗ ███╗███████╗ ██║ ██║██╔════╝██║ ██╔════╝██╔═══██╗████╗ ████║██╔════╝ ██║ █╗ ██║█████╗ ██║ ██║ ██║ ██║██╔████╔██║█████╗ ██║███╗██║██╔══╝ ██║ ██║ ██║ ██║██║╚██╔╝██║██╔══╝ ╚███╔███╔╝███████╗███████╗╚██████╗╚██████╔╝██║ ╚═╝ ██║███████╗ ╚══╝╚══╝ ╚══════╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ """,
        *args, start=start, end=end)
def __indexing(self, handler, schema, parser, index_path):
    """Create one index and fill it by parsing ``self.db_path``.

    Args:
        handler: callable that receives the index writer and returns the
            content handler installed on ``parser``.
        schema: schema passed to ``create_in``.
        parser: object exposing ``setContentHandler`` and ``parse``.
        index_path: location of the index; a path containing ``'Pub'``
            selects the publication messages, anything else the venue ones.
    """
    index = create_in(index_path, schema)
    # ** expands the resources dictionary into keyword arguments
    writer = index.writer(**self.__resources(self))
    parser.setContentHandler(handler(writer))
    parser.parse(self.db_path)

    if 'Pub' in index_path:
        started, ended, color = ('Pubs commit started',
                                 'Pubs commit ended.', 'green')
    else:
        started, ended, color = ('Venues commit started.',
                                 'Venues commit ended.', 'lightcyan')

    cprint(started, color)
    writer.commit()
    cprint(ended, color)
def check_ixs(silent=False):
    """Open the indexes, creating them interactively when they are missing.

    First tries ``check_open_ixs(silent=silent)``. If that fails, the user
    is asked for the DBLP file path until ``Index.create_ixs`` succeeds,
    then the indexes are opened again.

    Args:
        silent: forwarded to the first ``check_open_ixs`` call.

    Returns:
        Whatever ``check_open_ixs`` returns, or ``None`` when the indexes
        were created but still cannot be opened.
    """
    # Only ordinary errors mean "indexes missing": ``except Exception`` lets
    # KeyboardInterrupt / SystemExit propagate instead of re-prompting.
    try:
        return check_open_ixs(silent=silent)
    except Exception:
        pass

    while True:
        cprint('Indexes not found. Search Engine needs to create them.',
               'orange', 'bold')
        db_path = input(form('Insert the DBLP file path: ', 'orange'))
        print()
        db_path = abspath(db_path)
        try:
            Index.create_ixs(Index(db_path))
        except Exception:
            cprint(
                'It seems there is an error with the path. Please retry',
                'red', 'bold')
            continue
        try:
            return check_open_ixs()
        except Exception:
            cprint('It seems there is an error.', 'red', 'bold')
            break
    return None
def frequency(self, fuzzy):
    """Rank documents by the frequency of the searched terms.

    The user query is converted with ``to_whoosh_query`` and split on
    ``' OR '``; each clause is searched on its own and the partial results
    are merged, then handed to the result printer together with the venue
    results. Set ``fuzzy=True`` to parse the terms as fuzzy terms.
    """
    pquery, vquery = to_whoosh_query(self.__ask_query())
    # Frequency weighting does not support OR queries: search every clause
    # separately and merge the hits afterwards.
    pub_clauses = pquery.split(' OR ')
    ven_clauses = vquery.split(' OR ')
    pqprint = set()
    vqprint = set()
    print()

    def parse(field, schema, clause, seen):
        """Parse ``clause`` with ``field`` as default and record its text."""
        if fuzzy:
            parsed = QueryParser(field, schema,
                                 termclass=FuzzyTerm).parse(clause)
        else:
            parsed = QueryParser(field, schema).parse(clause)
        seen.add(str(parsed))
        return parsed

    def collect(hits, own, other):
        """Turn search hits into the result dictionaries used downstream."""
        entries = []
        for hit in hits:
            entry = {
                'key': '',
                'score': hit.score,
                own: {},
                other: {},
                'alternative': []
            }
            for name, value in hit.items():
                entry[own][name] = value
            entry[own]['o_score'] = entry['score']
            entries.append(entry)
        return entries

    # ----------- PUBLICATIONS ----------------------
    with self.pix.searcher(weighting=Frequency) as ps:
        # "" searches a phrase whose words are at most 1 position apart;
        # '' wraps terms containing characters the parser treats specially
        # (spaces, colons, brackets).
        presults = None
        for clause in pub_clauses:
            title_query = parse('title', self.pix.schema, clause, pqprint)
            title_hits = ps.search(title_query, limit=None)
            if presults is None:
                presults = title_hits
            else:
                presults.upgrade_and_extend(title_hits)

            if clause.startswith(('title', 'author', 'year')):
                continue
            # No leading field name: also look the clause up as author
            # and as year.
            for field in ('author', 'year'):
                field_query = parse(field, self.pix.schema, clause, pqprint)
                presults.upgrade_and_extend(
                    ps.search(field_query, limit=None))

        cprint("Pub Query: " + ' OR '.join(pqprint), 'lightgrey', 'italic',
               start='\t')
        cprint('Publications found: ' + str(len(presults)), 'bold',
               'lightgrey', 'url', start='\t')
        plist = collect(presults, 'pub', 'ven')

    # --------------- VENUES --------------------------
    vresults = None
    with self.vix.searcher(weighting=Frequency) as vs:
        for clause in ven_clauses:
            title_query = parse('title', self.vix.schema, clause, vqprint)
            title_hits = vs.search(title_query, limit=None)
            if vresults is None:
                vresults = title_hits
            else:
                vresults.upgrade_and_extend(title_hits)

            if not clause.startswith(('title:', 'publisher')):
                publisher_query = parse('publisher', self.vix.schema,
                                        clause, vqprint)
                vresults.upgrade_and_extend(
                    vs.search(publisher_query, limit=None))

        cprint("Ven Query: " + ' OR '.join(vqprint), 'lightgrey', 'italic',
               start='\t')
        cprint('Venues found: ' + str(len(vresults)), 'bold', 'lightgrey',
               'url', start='\t')
        vlist = collect(vresults, 'ven', 'pub')

    self.__results(plist, vlist)  # merge and print the results
def bm25f(self, fuzzy):
    """Rank documents with the searcher's default weighting.

    The user query is converted with ``to_whoosh_query``, parsed over
    several fields at once for publications and for venues, and both hit
    lists are handed to the result printer. Set ``fuzzy=True`` to parse
    the terms as fuzzy terms.
    """
    pquery, vquery = to_whoosh_query(self.__ask_query())
    print()

    # Extra parser options shared by both indexes.
    parser_options = {'termclass': FuzzyTerm} if fuzzy else {}

    # ----------- PUBLICATIONS ----------------------
    with self.pix.searcher() as ps:
        # "" searches a phrase whose words are at most 1 position apart;
        # '' wraps terms containing characters the parser treats specially
        # (spaces, colons, brackets).
        if 'pubtype' in pquery:
            # pubtype is already in the query text: leave it out of the
            # default fields to prevent a bad search on the search term
            pub_fields = ['author', 'title', 'year']
        else:
            pub_fields = ['pubtype', 'author', 'title', 'year']
        pquery = MultifieldParser(pub_fields, self.pix.schema,
                                  **parser_options).parse(pquery)
        cprint("Pub Query: " + str(pquery), 'lightgrey', 'italic',
               start='\t')
        presults = ps.search(pquery, limit=None)
        cprint('Publications found: ' + str(len(presults)), 'bold',
               'lightgrey', 'url', start='\t')

        plist = []
        for hit in presults:
            entry = {
                'key': '',
                'score': hit.score,
                'pub': {},
                'ven': {},
                'alternative': []
            }
            for name, value in hit.items():
                entry['pub'][name] = value
            entry['pub']['o_score'] = entry['score']
            plist.append(entry)

    # --------------- VENUES --------------------------
    with self.vix.searcher() as vs:
        vquery = MultifieldParser(['title', 'publisher'], self.vix.schema,
                                  **parser_options).parse(vquery)
        cprint("Ven Query: " + str(vquery), 'lightgrey', 'italic',
               start='\t')
        vresults = vs.search(vquery, limit=None)
        cprint('Venues found: ' + str(len(vresults)), 'bold', 'lightgrey',
               'url', start='\t')

        vlist = []
        for hit in vresults:
            entry = {
                'key': '',
                'score': hit.score,
                'ven': {},
                'pub': {},
                'alternative': []
            }
            for name, value in hit.items():
                entry['ven'][name] = value
            entry['ven']['o_score'] = entry['score']
            vlist.append(entry)

    self.__results(plist, vlist)  # merge and print the results
def __results(self, plist, vlist):
    """Merge publication and venue hits, then print the best ones.

    Called at the end of a ranking function. Both lists are sorted by
    descending ``'score'``; when one of them is empty the other becomes the
    result list, otherwise they are combined by ``tr``. Entries that carry
    both a publication and a venue are then grouped by venue key, the list
    is sorted again and at most ``self.__result_limit`` entries are printed
    with ``q_print``.

    Args:
        plist: result dictionaries built from publication hits.
        vlist: result dictionaries built from venue hits.
    """
    plist = sorted(plist, key=lambda s: s['score'], reverse=True)
    vlist = sorted(vlist, key=lambda s: s['score'], reverse=True)
    if len(plist) == 0:
        # only venues matched: each entry takes the key of its venue
        for el in vlist:
            el['key'] = el['ven']['key']
        results = vlist
    elif len(vlist) == 0:
        # only publications matched: each entry takes the key of its pub
        for el in plist:
            el['key'] = el['pub']['key']
        results = plist
    else:
        # presumably joins pubs with their venues; verify against tr()
        results = tr(plist, vlist)
    # merge publications that have the same crossref
    # same_venue remembers, per venue key already met, the position in
    # `results` recorded when the venue was first seen.
    same_venue = list()
    # NOTE(review): `results` is mutated while it is being iterated;
    # end_cycle / end_tot are the counters used to stop the loop early.
    end_cycle = len(results)
    end_tot = 0
    for r in results:
        if end_tot >= end_cycle:
            break
        if len(r['pub']) and len(r['ven']):
            if len(same_venue):
                id = None
                f = False
                for i in range(len(same_venue)):
                    if same_venue[i]['key'] == r['ven']['key']:
                        f = True  # found
                        id = i  # position inside same_venue
                        break
                if not f:
                    # first time this venue is met
                    same_venue.append({
                        'key': r['ven']['key'],
                        'index': results.index(r)
                    })
                # NOTE(review): `id` is a position in same_venue, yet it
                # indexes `results` here and in the branches below, while
                # the new element reads results[same_venue[id]['index']]
                # -- confirm which one is intended.
                elif isinstance(results[id]['pub'], dict):
                    # create a new element holding both publications
                    tmp = {
                        'key': r['ven']['key'],
                        'score': r['pub']['o_score'] + results[same_venue[id]['index']]['score'],
                        'pub': [
                            r['pub'],
                            results[same_venue[id]['index']]['pub'],
                        ],
                        'ven': r['ven'],
                        'alternative': [],
                    }
                    del results[
                        id]  # remove the id element and the current element
                    results.remove(r)
                    results.append(tmp)  # add the element created
                    same_venue[id]['index'] = results.index(
                        tmp)  # update the index
                    end_cycle -= 2  # two elements were removed
                else:
                    # the stored element already holds a list of pubs
                    results[id]['pub'].append(r['pub'])
                    results[id]['score'] += r['pub']['o_score']
                    results.remove(r)
                    end_cycle -= 1  # one element was removed
            else:
                # nothing recorded yet: remember this venue
                same_venue.append({
                    'key': r['ven']['key'],
                    'index': results.index(r)
                })
        end_tot += 1
    results = sorted(results, key=lambda s: s['score'], reverse=True)
    # find correlations
    if self.__output_level == 3:
        # presumably fills self.__output (it is read right below) -- TODO confirm
        self.__find_correlations(results)
    else:
        self.__output = results
    cprint('RESULTS:', 'yellow', 'bold', 'url', start='\n\t', end='\n\n')
    count = 0
    for element in self.__output:
        if count == self.__result_limit:
            break
        q_print(element, count + 1, self.__output_level)
        count += 1
    # reset the buffer once everything has been printed
    self.__output = list()
def q_print(element, count, level):
    """Print one result entry to the user.

    Three layouts are handled: a single publication (optionally followed
    by its venue), a venue without publications, and a venue together with
    several relevant publications.

    Args:
        element: result dictionary with ``'pub'``, ``'ven'``,
            ``'alternative'`` and ``'score'`` entries.
        count: 1-based position shown in front of the score.
        level: output level; venues of publications need ``level >= 2``,
            alternatives need ``level >= 3``.
    """
    alternatives = element['alternative']
    pubs = element['pub']

    def header(value):
        """Print the '<count>) score: <value>' line."""
        cprint(' ' * 2 + str(count) + ')\t' + 'score: '
               + str(round(value, 5)), *score)

    pub_is_dict = isinstance(pubs, dict)
    lone_pub = (pub_is_dict and len(pubs)) or (
        isinstance(pubs, list) and len(pubs) == 1)

    if len(alternatives) == 0 and lone_pub:
        # - pub in venue --> score = pub.score
        header(element['score'])
        cprint('Publication', *main_obj, start='\t')
        print_pub(pubs if pub_is_dict else pubs[0], level)
        if len(element['ven']) and level >= 2:
            if 'added' in element:
                venue_title = 'In Venue'
            else:
                venue_title = 'In Relevant Venue'
            cprint(venue_title, *alt_obj, start='\n\t')
            print_inven(element['ven'], level)
    elif len(pubs) == 0:
        # - venue with alternatives --> score = venue.score
        header(element['score'])
        # ------- Venue -----------------
        cprint('Venue', *main_obj, start='\t')
        print_venue(element['ven'], level)
        # alternative
        if len(alternatives) and level >= 3:
            print_alternative(alternatives)
    else:
        # - venue with pubs and alternatives
        #   --> score = original(venue.score + pubs.score)
        total = element['ven']['o_score']
        for pub in pubs:
            total += pub['o_score']
        header(total)
        cprint('Venue', *main_obj, start='\t')
        print_venue(element['ven'], level)
        print()
        cprint('Relevant Publications', *main_obj, start='\t')
        for pub in pubs:
            print_pub(pub, level)
            print()
        # alternative
        if len(alternatives) and level >= 3:
            print_alternative(alternatives)
    print()
def start(self):
    """Run the interactive menu loop until the user chooses to exit.

    Choices: ``1`` runs a search with the current settings, ``2`` edits a
    setting (ranking, result limit, fuzzy flag, output level, reset),
    ``3`` prints the current settings and ``4`` returns.
    """
    check_ixs()
    while True:
        cprint('MAIN MENU\n', 'green', 'bold', 'url', start='\n\t')
        for choice in self.__choices_list:
            print(form(choice[0], *self.__colornumber),
                  form(choice[1], *self.__colortext))
        self.__last_selected = input(
            form('\nType your choice:\n> ', *self.__colorinput))

        # ----------- Search ---------------------
        if self.__last_selected == '1':
            try:
                rank = Rank(self.__result_limit, self.__output_level)
                if self.__ranking == 'frequency':
                    rank.frequency(self.__fuzzy)
                else:
                    rank.bm25f(self.__fuzzy)
            # Exception, not a bare except: Ctrl-C / SystemExit must not be
            # reported as a query syntax problem.
            except Exception:
                cprint('Please, retry using the right sintax.', 'orange',
                       'bold', 'url', start='\r\t', end='\n\n')

        # ------------ Settings ---------------------
        elif self.__last_selected == '2':
            for option in self.__options_list:
                print(form(option[0], *self.__colornumber),
                      form(option[1], *self.__colortext))
            c = input(form('\nWhich options do you want to edit?\n> ',
                           *self.__colorinput))
            if c == '1':
                for rank in self.__ranking_list:
                    print(form(rank[0], *self.__colornumber),
                          form(rank[1], *self.__colortext))
                c = input(form('\nWhich options do you want to choose?\n> ',
                               *self.__colorinput))
                # anything other than '2' falls back to bm25f
                if c == '2':
                    self.__ranking = 'frequency'
                else:
                    self.__ranking = 'bm25f'
            elif c == '2':
                limit = input(
                    form('\nHow many results do you want to print?\n> ',
                         *self.__colorinput))
                try:
                    self.__result_limit = int(limit)
                except ValueError:
                    # keep the previous limit instead of crashing the menu
                    cprint('Please, insert a valid number.', 'orange',
                           'bold', 'url', start='\t', end='\n\n')
            elif c == '3':
                print('Fuzzyterm: ', self.__fuzzy)
                c = input(form('\nDo you want to change it? [y/n]\n> ',
                               *self.__colorinput))
                if c == 'y':
                    self.__fuzzy = not self.__fuzzy
            elif c == '4':
                for level in self.__level_list:
                    print(form(level[0], *self.__colornumber),
                          form(level[1], *self.__colortext))
                c = input(form('\nWhich options do you want to choose?\n> ',
                               *self.__colorinput))
                # accept only the numbers shown in the level list
                if c in [x[0].replace('. ', '') for x in self.__level_list]:
                    self.__output_level = int(c)
            elif c == '5':
                self.reset()

        # ------- Print Settings ----------------------
        elif self.__last_selected == '3':
            o_color_key = ('pink', 'bold',)
            o_color_value = ('pink', 'italic',)
            cprint('Options: ', *o_color_key, start='\n')
            print('\t{}{}'.format(form('Ranking: ', *o_color_key),
                                  form(self.__ranking, *o_color_value)))
            print('\t{}{}'.format(form('Results limit: ', *o_color_key),
                                  form(self.__result_limit, *o_color_value)))
            print('\t{}{}'.format(form('Fuzzy: ', *o_color_key),
                                  form(self.__fuzzy, *o_color_value)))
            print('\t{}{}'.format(form('Output level: ', *o_color_key),
                                  form(self.__output_level, *o_color_value)))
            print()

        # --------- Exit -------------------------
        elif self.__last_selected == '4':
            return
        else:
            cprint('Try again, you will be luckier!', 'orange', 'bold',
                   'url', start='\t', end='\n\n')