def add_doc(self, doc):
    """Build an UnprocessedDocument from ``doc`` and add it to ``self.dbconn``.

    doc: a dict. It must contain the keys 'text' and 'id'. 'text' is stored
    as-is in a 'text' field; every other key except 'id' becomes one field
    per value, with list values expanded to one field per item and each value
    passed through ``ensure_unicode``. ``str(doc['id'])`` becomes the
    document id.

    The add is performed while holding ``self.lock``. An
    ``errors.IndexerError`` raised by the add is printed and not re-raised;
    nothing is returned.
    """
    content = doc['text']
    document = UnprocessedDocument()
    document.fields.append(Field('text', content))

    for key, value in doc.items():
        if key in ('text', 'id'):
            # 'text' was added above; 'id' is stored as the document id.
            continue
        # Treat a scalar as a one-item list so both cases share one code path.
        items = value if isinstance(value, list) else [value]
        for item in items:
            document.fields.append(Field(key, ensure_unicode(item)))

    document.id = str(doc['id'])

    # Acquire *before* entering the try block: if acquire() itself fails, the
    # finally clause must not release a lock this call never obtained.
    self.lock.acquire()
    try:
        self.dbconn.add(document)
    except errors.IndexerError as e:
        print(str(e))
    finally:
        self.lock.release()
def search(ctx):
    """Query Google Scholar for ``ctx.query`` and collect candidate links.

    The lower-cased query is substituted into ``GOOGLE_SCHOLAR_URL``, the
    page is fetched with ``requests`` and parsed with BeautifulSoup. Every
    element with class ``gs_r`` is treated as one result entry; an entry is
    used only if ``title_correct(query, title)[0]`` is truthy.

    Returns a dict with two keys:
      * 'ctx_update': dict that may contain 'title' (set once, from the first
        accepted entry whose ``title_correct(...)[1]`` is truthy) and
        'citecnt' (an int; the last accepted entry that yields a count wins).
      * 'results': list of ``SearchResult`` objects -- ``SearchResult(None,
        url)`` for the entry's heading link and ``SearchResult('directpdf',
        url)`` for a link found under class ``gs_ggs``.

    A failure while parsing one entry is logged with ``log_exc`` and that
    entry is skipped; exceptions from the HTTP request itself propagate.
    """
    query = ctx.query.lower()
    ret = {'ctx_update': {}}
    srs = []

    # NOTE(review): no timeout is passed, so a stalled connection can block
    # here indefinitely -- confirm whether callers rely on that.
    r = requests.get(GOOGLE_SCHOLAR_URL.format(query))
    text = r.text.encode('utf-8')

    def find_citecnt(dom):
        """Return the first integer found in the text of the first <a> under
        the first 'gs_fl' element of the first 'gs_ri' element, or None.

        Presumably this is the "Cited by N" link -- verify against the
        current page markup.
        """
        try:
            find = dom.findAll(attrs={'class': 'gs_ri'})[0]
            find = find.findAll(attrs={'class': 'gs_fl'})[0]
            find = find.findAll('a')[0].text
            return int(re.search(r'[0-9]+', find).group())
        except Exception:
            # Any missing element or non-numeric text means "no count".
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'gs_r'})
    title_updated = None
    for rst in results:
        try:
            h3 = rst.findAll('h3')[0]
            real_title = filter_title_fileformat(h3.get_text())
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue

            if not title_updated and tc[1]:
                title_updated = ensure_unicode(title_beautify(real_title))
                # Repeatedly drop one leading bracketed tag, to fix things
                # like '[citation][c] Title', until nothing changes.
                while True:
                    new_title = re.sub(
                        r'^\[[^\]]*\]', '', title_updated).strip()
                    if new_title == title_updated:
                        break
                    title_updated = new_title
                log_info(u"Title updated: {0}".format(title_updated))
                ret['ctx_update']['title'] = title_updated

            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            try:
                href = h3.find('a').get('href')
                # Skip a link without an href: str(None) would otherwise be
                # recorded as the bogus URL 'None'.
                if href:
                    srs.append(SearchResult(None, str(href)))
            except Exception:
                # Best effort: a heading without a usable link must not
                # prevent the direct-PDF lookup below.
                pass

            findpdf = rst.findAll(attrs={'class': 'gs_ggs'})
            if findpdf:
                pdflink = findpdf[0].find('a').get('href')
                if pdflink:
                    srs.append(SearchResult('directpdf', str(pdflink)))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))

    ret['results'] = srs
    return ret
def update_new_title(self, title):
    """Replace ``self.title`` with ``title`` when the two differ.

    Returns True if ``self.title`` was changed (the new title is logged via
    ``log_info`` first), and False if ``title`` already equals ``self.title``.
    """
    if title != self.title:
        # Use a unicode template: under Python 2, formatting a byte-string
        # template with a non-ASCII unicode argument raises
        # UnicodeEncodeError. Assumes ensure_unicode returns a unicode
        # object -- TODO confirm.
        log_info(u"Using new title: {0}".format(ensure_unicode(title)))
        self.title = title
        return True
    return False
def name_clean(name):
    """Remove every parenthesised segment from ``name``.

    Each shortest run from '(' to the next ')' is deleted (the match may span
    newlines), surrounding whitespace is stripped, and the result is returned
    through ``ensure_unicode``.
    """
    # Raw string: '\(' in a normal literal is an invalid escape sequence and
    # triggers a warning on newer Python versions. The pattern is unchanged.
    paren_re = re.compile(r'\(.*?\)', re.DOTALL)
    cleaned = paren_re.sub('', name).strip()
    return ensure_unicode(cleaned)