def insert_terms(db_path=None, db_url=None, input_path=None):
    assert (db_path is None and db_url is not None) or (db_path is not None and db_url is None)
    assert input_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)
    # session = get_session(engine)
    con = engine.connect()
    meta = MetaData(engine)
    term_table = Table('terms', meta, autoload=True)

    with open(input_path, mode='r', encoding='utf-8') as file:
        insert_list = []
        for line in file:
            line_split = line.strip().split('\t')
            term1 = line_split[0]
            term2 = line_split[-1] if len(line_split) > 1 else None
            insert_list.append({'term': term1, 'term2': term2})
        con.execute(term_table.insert(), insert_list)
    con.close()
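
# A minimal usage sketch for insert_terms. The expected input layout (one term
# per line, with an optional tab-separated second form in the last column) is
# inferred from the parsing loop above; the file names are hypothetical.
def _example_insert_terms():
    # terms.txt, e.g.:
    #   gene expression\tgene expressions
    #   crispr
    insert_terms(db_path='wos.db', input_path='terms.txt')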
def get_split_title_keyword_abstract(db_path=None, db_url=None, output_path='', foreground=False):
    assert (db_path is None and db_url is not None) or (db_path is not None and db_url is None)
    assert output_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)
    session = get_session(engine)
    data = session.query(WosDocument).all()

    # Write under output_path instead of a hardcoded desktop directory;
    # output_path is already a required argument.
    path = output_path
    if foreground:
        inner_path = path + '/foreground'
    else:
        inner_path = path + '/background'

    for document in data:
        title = document.title.strip() + '.'

        kw_str = ''
        for kw in document.keywords:
            kw_str += kw.keyword + '. '

        kp_str = ''
        for kp in document.keyword_plus:
            kp_str += kp.keyword_plus + '. '

        if document.abs:
            abs_str = document.abs.replace('. ', '.\n')
        else:
            abs_str = ''

        out_str = '\n'.join([title, kw_str, kp_str, abs_str])
        filename = inner_path + '/{}-{}.txt'.format(document.unique_id, document.pub_year)
        with open(filename, mode='w', encoding='utf-8') as file:
            file.write(out_str)
        with open(path + ('/foreground.list' if foreground else '/background.list'),
                  mode='a', encoding='utf-8') as list_file:
            list_file.write(('foreground' if foreground else 'background')
                            + '/{}-{}.txt\n'.format(document.unique_id, document.pub_year))
def parse(input_dir=None, db_path=None, db_url=None):
    assert input_dir is not None and (db_path is not None or db_url is not None)

    init_set = set()
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file[-4:] == '.txt':
                exist_set = parse_single(os.path.join(root, file), db_path, db_url, init_set)
                init_set = init_set.union(exist_set)

    # Finally, build the internal citation relations.
    print('Building internal citation relations...')
    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)
    session.execute(
        'INSERT INTO wos_inner_reference '
        'SELECT DISTINCT t1.document_unique_id AS citing_paper_id, t2.unique_id AS cited_paper_id '
        'FROM wos_reference t1 INNER JOIN wos_document t2 '
        'ON t1.document_md5 = t2.document_md5 OR t1.doi = t2.doi '
        'ORDER BY citing_paper_id, cited_paper_id')
    session.commit()
    session.execute(
        'DELETE FROM wos_inner_reference WHERE citing_paper_id = cited_paper_id'
    )
    session.commit()
    session.close()
    print('All parsing finished')
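
# Hedged usage sketch for the whole pipeline: walk a directory of WoS .txt
# exports into SQLite, then let parse() materialise the internal citation
# table. The paths are hypothetical.
def _example_parse():
    parse(input_dir='./wos_exports', db_path='wos.db')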
def draw_cooccurrence_network(net_type=None, db_path=None, output_path=None, top_n=30):
    assert net_type is not None and output_path is not None and db_path is not None
    engine = get_engine(db_path)
    session = get_session(engine)

    print('Preparing co-occurrence data')
    graph_data = []
    data = []
    title = None
    if net_type == 'keyword':
        title = 'Author Keyword Co-occurrence Network'
        data = session.query(WosDocument.unique_id, func.group_concat(WosKeyword.keyword, ';'))\
            .join(WosKeyword).group_by(WosDocument.unique_id)
        filter_data = session.query(WosKeyword.keyword, func.count('*').label('num')) \
            .group_by(WosKeyword.keyword).order_by(desc('num'))
    elif net_type == 'keyword_plus':
        title = 'WoS Keyword Co-occurrence Network'
        data = session.query(WosDocument.unique_id, func.group_concat(WosKeywordPlus.keyword_plus, ';'))\
            .join(WosKeywordPlus).group_by(WosDocument.unique_id)
        filter_data = session.query(WosKeywordPlus.keyword_plus, func.count('*').label('num')) \
            .group_by(WosKeywordPlus.keyword_plus).order_by(desc('num'))
    elif net_type == 'author':
        title = 'Author Co-authorship Network'
        data = session.query(WosDocument.unique_id,
                             func.group_concat(WosAuthor.last_name + ',' + WosAuthor.first_name, ';'))\
            .join(WosAuthor).group_by(WosDocument.unique_id)
        filter_data = session.query(WosAuthor.last_name + ',' + WosAuthor.first_name,
                                    func.count('*').label('num')) \
            .group_by(WosAuthor.last_name + ',' + WosAuthor.first_name).order_by(desc('num'))
    else:
        print('Unhandled network type:', net_type)
        exit(-1)

    # Every pair of items co-occurring in one document becomes an edge.
    for row in data:
        row_split = row[1].split(';')
        if len(row_split) > 1:
            graph_data += list(combinations(row_split, 2))

    # network is the co-occurrence network over all items.
    print('Building the co-occurrence network')
    network = get_network(graph_data, directed=False)
    session.close()

    nx.write_graphml(network, 'test.gml')

    # Hide every node outside the top_n most frequent ones.
    filter_nodes = [i[0] for i in filter_data[top_n:]]
    sub = nx.restricted_view(network, filter_nodes, [])

    # Largest connected component
    # sub = sorted(nx.connected_component_subgraphs(sub), key=len, reverse=True)[0]
    # print('Drawing')
    draw_net(sub, title=title, output_path=os.path.join(output_path, net_type))
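
# A small self-contained sketch of the top-n filtering idea used above:
# nx.restricted_view hides every node NOT in the top-n frequency list, so the
# drawn subgraph keeps only the most frequent items. Toy graph, not WoS data.
def _example_top_n_filter():
    import networkx as nx
    g = nx.Graph([('a', 'b'), ('b', 'c'), ('c', 'd')])
    keep = {'a', 'b', 'c'}                       # pretend these are the top-n nodes
    hidden = [n for n in g.nodes if n not in keep]
    sub = nx.restricted_view(g, hidden, [])      # read-only view without the filtered nodes
    return list(sub.edges)                       # [('a', 'b'), ('b', 'c')]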
def save_transfer(transfer: Dict):
    transfer_id = None
    try:
        with Session(model.get_engine()) as session:
            with session.begin():
                operation_id = transfer['operation_id']
                operation = session.query(model.Operation).get(operation_id)
                operation.state = transfer['next_state']
                # Use a separate name for the ORM row so the incoming dict is
                # still intact if we need to log it in the except block.
                transfer_row = model.Transfer(tran_id=transfer['tranId'],
                                              type=transfer['type'],
                                              asset=transfer['asset'],
                                              amount=transfer['amount'],
                                              state='FILLED')
                operation.transfer = transfer_row
                session.flush()  # assign the primary key before reading it
                transfer_id = transfer_row.id
    except Exception as ex:
        print(f"Error saving transfer = {transfer}")
        print(ex)
        traceback.print_stack()
    return transfer_id
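
# Shape of the dict save_transfer expects, reconstructed from the keys it
# reads; every value below is hypothetical.
_EXAMPLE_TRANSFER = {
    'operation_id': 1,
    'next_state': 'TRANSFERRED',   # hypothetical state name
    'tranId': 123456789,
    'type': 'SPOT_TO_FUTURE',      # hypothetical transfer type
    'asset': 'BTC',
    'amount': '0.001',
}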
def save_future_sell(future_order_dict: Dict):
    future_sell_id = None
    try:
        with Session(model.get_engine()) as session:
            with session.begin():
                operation_id = future_order_dict['operation_id']
                operation = session.query(model.Operation).get(operation_id)
                operation.state = 'FUTURE_SELL'

                order_id = future_order_dict['orderId']
                future_order = session.query(model.FutureOrder).filter_by(order_id=order_id).first()
                future_sell = model_helper.sync_future_order(future_order_dict, future_order)
                operation.future_order = future_sell
                session.flush()  # assign the primary key before reading it
                future_sell_id = future_sell.id
    except Exception as ex:
        print(f"Error saving future sell = {future_order_dict}")
        print(ex)
        traceback.print_stack()
    return future_sell_id
def save_operation_buy_spot(position_dict, spot_order_dict):
    with Session(model.get_engine()) as session:
        with session.begin():
            # spot_order
            order_id = spot_order_dict['orderId']
            spot_order = session.query(model.SpotOrder).filter_by(order_id=order_id).first()
            spot_order = model_helper.sync_spot_order(spot_order_dict, spot_order)

            # operation
            operation = spot_order.operation

            # position
            position = operation.position if operation else None
            if not position:
                position_id = position_dict['position_id']
                if position_id:
                    position = session.query(model.Position).get(position_id)
                else:
                    position = model.Position()
                    session.add(position)
                position = model_helper.sync_position({
                    **position_dict, 'state': 'CREATED'
                }, position)

            if not operation:
                operation = model.Operation()
                position.operations.append(operation)
                operation.spot_order = spot_order

            operation = model_helper.sync_operation({
                **position_dict, 'kind': 'OPEN', 'state': 'SPOT_BUY'
            }, operation)
def save_spot_sell(spot_order: Dict):
    spot_sell_id = None
    with Session(model.get_engine()) as session:
        with session.begin():
            operation_id = spot_order['operation_id']
            operation = session.query(model.Operation).get(operation_id)
            operation.state = 'SPOT_SELL'
            operation.position.state = 'CLOSING'
            # spot_sell = model_helper.sync_spot_order(spot_order)
            # operation.spot_order = spot_sell
            # spot_sell_id = spot_sell.id
    return spot_sell_id
def task_avg_ratio(tickers, field, quantity):
    engine = model.get_engine()
    engine.dispose()  # drop inherited connections; this runs as a background task

    # Spread one full pass over the tickers across roughly quantity/10 minutes.
    sleep_time = quantity / 10 * 60 / len(tickers)

    # Prime the averages once before entering the polling loop.
    for ticker in tickers:
        avg = model_service.get_data_ratio(engine, ticker=ticker, quantity=quantity)
        model_service.save_avg_ratio(engine, ticker, field, avg)

    while app.running:
        for ticker in tickers:
            try:
                avg = model_service.get_data_ratio(engine, ticker=ticker, quantity=quantity)
                model_service.save_avg_ratio(engine, ticker, field, avg)
            except Exception as ex:
                print(field)
                print(ex)
            time.sleep(sleep_time)
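
# Worked example of the sleep_time formula above: one full pass over the
# tickers spans quantity/10 minutes, whatever the ticker count.
def _example_sleep_time():
    quantity, tickers = 100, ['BTCUSDT', 'ETHUSDT']   # hypothetical inputs
    sleep_time = quantity / 10 * 60 / len(tickers)    # 100/10 * 60 / 2
    return sleep_time                                 # 300.0 seconds per ticker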
def save_future_buy(operation_dict: Dict, future_order_dict: Dict):
    future_buy_id = None
    with Session(model.get_engine()) as session:
        with session.begin():
            position_id = operation_dict['position_id']
            position = session.query(model.Position).get(position_id)
            position.state = 'CLOSING'
            position.message = ''

            operation = model_helper.sync_operation({
                **operation_dict, 'kind': 'CLOSE', 'state': 'FUTURE_BUY'
            })
            position.operations.append(operation)

            order_id = future_order_dict['orderId']
            future_order = session.query(model.FutureOrder).filter_by(order_id=order_id).first()
            if future_order:
                print(f"Found future order = {future_order.id}, {future_order.status}")
            # Sync unconditionally: sync_future_order creates the row when
            # future_order is None, so future_buy is always bound.
            future_buy = model_helper.sync_future_order(future_order_dict, future_order)
            operation.future_order = future_buy
            # future_object = session.query(model.Future).get(operation.future)
            # future_object.balance.outdated = True
            session.flush()  # assign the primary key before reading it
            future_buy_id = future_buy.id
    return future_buy_id
def parse_single(input_file=None, db_path=None, db_url=None, exist_set=None):
    assert input_file is not None and (db_path is not None or db_url is not None) and exist_set is not None
    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    volume_pattern = re.compile(r'^v\d+$')
    page_pattern = re.compile(r'^p\w*\d+$')
    doi_pattern = re.compile(r'^doi \d+.+$')
    year_pattern = re.compile(r'^\d{4}$')

    print('Parsing {}...'.format(input_file))
    with open(input_file, 'r', encoding='utf-8') as file:
        cur_field = None
        author_dict = {}
        initials_list = []
        author_order = 1
        wos_document = WosDocument()
        wos_document_list = []

        # Multi-line fields are buffered here and resolved at the end of each record.
        journal_line = None
        wos_category_line = None
        research_area_line = None
        keyword_line = None
        keyword_plus_line = None
        funding_line = None

        for line in file.readlines():
            line = line[:-1].lower()
            # The first three characters identify the field tag; continuation
            # lines start with three spaces.
            tmp = line[:3]
            if tmp != '   ':
                cur_field = tmp
            if cur_field == 'pt ':
                del wos_document
                wos_document = WosDocument()
            # Collect the authors' abbreviated names.
            elif cur_field == 'au ':
                if tmp == cur_field:
                    initials_list.clear()
                full_name = line[3:]
                initials_list.append(full_name)
            # Parse full author names to get the author order; reprint authors
            # and addresses are bound later. Handle names split by spaces,
            # anonymous authors with no separator, and group authors stored
            # separately in the CA field.
            elif cur_field == 'af ':
                if tmp == cur_field:
                    author_dict.clear()
                    author_order = 1
                full_name = line[3:]
                try:
                    pos = full_name.index(',')
                except ValueError:
                    try:
                        pos = full_name.index(' ')
                    except ValueError:
                        pos = len(full_name)
                author = WosAuthor(full_name[pos + 1:], full_name[:pos].strip(),
                                   initials_list[author_order - 1].replace(',', ''),
                                   author_order, 0)
                author.document = wos_document
                author_dict[full_name] = author
                if author_order == 1:
                    wos_document.first_author = initials_list[author_order - 1].replace(',', '')
                author_order += 1
            elif cur_field == 'ca ':
                group_author = line[3:]
                initials_list.append(group_author)
                author = WosAuthor(group_author, None,
                                   initials_list[author_order - 1].replace(',', ''),
                                   author_order, 0)
                author_dict[group_author] = author
                if author_order == 1:
                    wos_document.first_author = initials_list[author_order - 1].replace(',', '')
                author_order += 1
            elif cur_field == 'c1 ':
                # Bind affiliation addresses to the authors extracted above.
                author_affiliation = line[3:]
                try:
                    pos = author_affiliation.index(']')
                except ValueError:
                    # print('Affiliation {} has no authors; discarded'.format(author_affiliation))
                    continue
                authors = author_affiliation[1:pos].split('; ')
                for author in authors:
                    affiliation = WosAffiliation(author_affiliation[pos + 2:-1])
                    affiliation.author = author_dict[author]
            elif cur_field == 'rp ':
                # Identify the reprint (corresponding) authors.
                rp_author_affiliations = line[3:].split('; ')
                for rp_author_affiliation in rp_author_affiliations:
                    try:
                        pos = rp_author_affiliation.index(' (')
                        rp_author = rp_author_affiliation[:pos]
                    except ValueError:
                        rp_author = rp_author_affiliation
                    try:
                        rp_index = initials_list.index(rp_author) + 1
                    except ValueError:
                        rp_index = 1
                    for author in author_dict.keys():
                        if author_dict[author].author_order == rp_index:
                            author_dict[author].is_reprint_author = 1
            elif cur_field == 'ti ':
                title = line[3:]
                if wos_document.title is not None:
                    wos_document.title += ' ' + title
                else:
                    wos_document.title = title
            elif cur_field == 'so ':
                if journal_line is not None:
                    journal_line += ' ' + line[3:]
                else:
                    journal_line = line[3:]
            elif cur_field == 'la ':
                wos_document.language = line[3:]
            elif cur_field == 'dt ':
                wos_document.document_type = line[3:]
            elif cur_field == 'de ':
                if keyword_line is not None:
                    keyword_line += ' ' + line[3:]
                else:
                    keyword_line = line[3:]
            elif cur_field == 'id ':
                if keyword_plus_line is not None:
                    keyword_plus_line += ' ' + line[3:]
                else:
                    keyword_plus_line = line[3:]
            elif cur_field == 'ab ':
                if wos_document.abs is not None:
                    wos_document.abs += ' ' + line[3:]
                else:
                    wos_document.abs = line[3:]
            elif cur_field == 'em ':
                wos_document.author_email = line[3:].replace(' ', '')
            elif cur_field == 'fu ':
                if funding_line is not None:
                    funding_line += ' ' + line[3:]
                else:
                    funding_line = line[3:]
            elif cur_field == 'fx ':
                if wos_document.funding_text is not None:
                    wos_document.funding_text += ' ' + line[3:]
                else:
                    wos_document.funding_text = line[3:]
            elif cur_field == 'cr ':
                # Parse a cited reference.
                reference = line[3:]
                ref_split = reference.split(', ')
                first_author = None
                pub_year = None
                journal = None
                volume = None
                start_page = None
                doi = None
                year_flag = False
                if len(ref_split) < 2:
                    journal = ref_split[0]
                else:
                    i_list = []
                    for i_part in range(len(ref_split)):
                        volume_match = volume_pattern.match(ref_split[i_part])
                        page_match = page_pattern.match(ref_split[i_part])
                        doi_match = doi_pattern.match(ref_split[i_part])
                        if not year_flag:
                            year_match = year_pattern.match(ref_split[i_part])
                        else:
                            year_match = None
                        if year_match:
                            pub_year = ref_split[i_part]
                            i_list.append(i_part)
                            year_flag = True
                        elif volume_match:
                            volume = ref_split[i_part][1:]
                            i_list.append(i_part)
                        elif page_match:
                            start_page = ref_split[i_part][1:]
                            i_list.append(i_part)
                        elif doi_match:
                            doi = ref_split[i_part].replace('doi ', '').replace('[', '').replace(']', '')
                            i_list.append(i_part)
                    i_list.sort()
                    if len(i_list) > 0:
                        if min(i_list) > 0:
                            first_author = ref_split[0]
                        start_pos = None
                        end_pos = None
                        pos = 0
                        while pos < len(i_list) - 1:
                            if i_list[pos + 1] - i_list[pos] > 1:
                                start_pos = i_list[pos] + 1
                            if start_pos is not None and i_list[pos + 1] - i_list[pos] == 1:
                                end_pos = i_list[pos]
                                break
                            pos += 1
                        if start_pos is not None or end_pos is not None:
                            if start_pos == end_pos:
                                journal = ref_split[start_pos]
                            elif end_pos is None:
                                journal = ', '.join(ref_split[start_pos:i_list[-1]])
                            else:
                                journal = ', '.join(ref_split[start_pos:end_pos])
                        else:
                            if year_flag:
                                try:
                                    journal = ref_split[i_list[-1] + 1]
                                except IndexError:
                                    journal = None
                            else:
                                journal = ref_split[i_list[0] - 1]
                    else:
                        first_author = ref_split[0]
                        journal = ref_split[1]
                # The reference field is very irregular and often overlong, so truncate.
                if first_author is not None and len(first_author) > 254:
                    first_author = first_author[:254]
                if journal is not None and len(journal) > 254:
                    journal = journal[:254]
                ref = WosReference(
                    first_author.replace('.', '').replace('. ', '').replace(',', '')
                    if first_author else first_author,
                    pub_year, journal, volume, start_page, doi)
                ref.document = wos_document
            elif cur_field == 'nr ':
                wos_document.reference_num = int(line[3:])
            elif cur_field == 'tc ':
                wos_document.cited_times = int(line[3:])
            elif cur_field == 'u1 ':
                wos_document.usage_180 = int(line[3:])
            elif cur_field == 'u2 ':  # tag is three chars, including the trailing space
                wos_document.usage_since_2013 = int(line[3:])
            elif cur_field == 'pu ':
                wos_document.publisher = line[3:]
            elif cur_field == 'ji ':
                wos_document.journal_iso = line[3:]
            elif cur_field == 'j9 ':
                wos_document.journal_29 = line[3:]
            elif cur_field == 'pd ':
                wos_document.pub_month_day = line[3:]
            elif cur_field == 'py ':
                wos_document.pub_year = line[3:]
            elif cur_field == 'vl ':
                wos_document.volume = line[3:]
            elif cur_field == 'is ':
                wos_document.issue = line[3:]
            elif cur_field == 'bp ':
                wos_document.start_page = line[3:]
            elif cur_field == 'ep ':
                wos_document.end_page = line[3:]
            elif cur_field == 'di ':
                wos_document.doi = line[3:]
            elif cur_field == 'wc ':
                if wos_category_line is not None:
                    wos_category_line += ' ' + line[3:]
                else:
                    wos_category_line = line[3:]
            elif cur_field == 'sc ':
                if research_area_line is not None:
                    research_area_line += ' ' + line[3:]
                else:
                    research_area_line = line[3:]
            elif cur_field == 'ut ':
                wos_document.unique_id = line[7:]  # skip the 'ut wos:' prefix
            elif cur_field == 'er':
                # End of record: resolve the buffered multi-line fields.
                if journal_line is not None:
                    wos_document.journal = journal_line
                    journal_line = None
                if keyword_line is not None:
                    keywords = keyword_line.split('; ')
                    for keyword in keywords:
                        if len(keyword) > 254:
                            keyword = keyword[:254]
                        key = WosKeyword(keyword)
                        key.document = wos_document
                    keyword_line = None
                if keyword_plus_line is not None:
                    keyword_plus = keyword_plus_line.split('; ')
                    for kp in keyword_plus:
                        if len(kp) > 254:
                            kp = kp[:254]
                        keyp = WosKeywordPlus(kp)
                        keyp.document = wos_document
                    keyword_plus_line = None
                if wos_category_line is not None:
                    categories = wos_category_line.split('; ')
                    for category in categories:
                        if len(category) > 254:
                            category = category[:254]
                        cat = WosCategory(category)
                        cat.document = wos_document
                    wos_category_line = None
                if research_area_line is not None:
                    areas = research_area_line.split('; ')
                    for area in areas:
                        if len(area) > 254:
                            area = area[:254]
                        a = WosResearchArea(area)
                        a.document = wos_document
                    research_area_line = None
                if funding_line is not None:
                    fundings = funding_line.split('; ')
                    for fund in fundings:
                        pos = find_nth(fund, '[', -1)
                        if pos != -1:
                            funding = [fund[:pos], fund[pos:]]
                            agent = funding[0]
                            numbers = funding[1].replace('[', '').replace(']', '').split(', ')
                            for number in numbers:
                                f = WosFunding(agent, number)
                                f.document = wos_document
                        else:
                            agent = fund
                            f = WosFunding(agent, None)
                            f.document = wos_document
                    funding_line = None

                wos_document.document_md5 = document_hash(wos_document)

                # TODO: exclude documents that are not articles or reviews; remove this after use.
                # if (not 'article' in wos_document.document_type and not 'review' in wos_document.document_type) \
                #         or 'early access' in wos_document.document_type or 'retracted' in wos_document.document_type \
                #         or 'software' in wos_document.document_type or 'hardware' in wos_document.document_type \
                #         or 'exhibit' in wos_document.document_type or 'database' in wos_document.document_type \
                #         or 'book' in wos_document.document_type:
                #     continue

                # Uniformly truncate overlong titles.
                if len(wos_document.title) > 499:
                    wos_document.title = wos_document.title[:499]

                # Skip records already seen in earlier files.
                if wos_document.unique_id in exist_set:
                    continue
                else:
                    exist_set.add(wos_document.unique_id)
                wos_document_list.append(wos_document)

    print('Parsed {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
    return exist_set
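
# A sketch of the field-tagged export format the parser above consumes, using
# only tags it handles (pt/au/af/ti/so/py/ut/er). Values are hypothetical and
# shown lowercased, since the parser lowercases every line; continuation lines
# start with three spaces, and 'er' closes a record.
_EXAMPLE_WOS_PLAINTEXT_RECORD = """\
pt j
au smith, j
af smith, john
ti an example title
so journal of examples
py 2010
ut wos:000000000001
er
"""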
import traceback
import keys
import time
import app
from model_service import sync_spot_prices_calc, spot_symbols_with_futures
import model
import config

cache = dict()
engine = model.get_engine()


def price_risk_safe(symbol, book, risk=config.RISK, safe=config.SAFE):
    sum_total_risk, sum_total_safe, sum_size_risk, sum_size_safe = 0, 0, 0, 0
    sum_provisorio_total, sum_provisorio_size = 0, 0
    price_risk, price_safe = 0, 0
    # print(book)
    for i in book:
        # Each order-book level is (price, size); cast defensively to float.
        size = float(i[1])
        price = float(i[0])
        sum_provisorio_total += price * size
def resetFutureBalance(future_symbol):
    with Session(model.get_engine()) as session, session.begin():
        future_object = session.query(model.Future).get(future_symbol)
        future_object.balance.outdated = True
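
# Hedged usage sketch: after an order fills, mark the cached balance stale so
# a separate refresher reloads it; the symbol is hypothetical.
def _example_reset_future_balance():
    resetFutureBalance('BTCUSDT')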
        'R': False,
        'wt': 'CONTRACT_PRICE',
        'ot': 'MARKET',
        'ps': 'BOTH',
        'cp': False,
        'ma': 'BTC',
        'rp': '0.00038223',
        'pP': False,
        'si': 0,
        'ss': 0
    }
}
future_order_dict = future_order_dict['o']

with Session(model.get_engine()) as session:
    with session.begin():
        order_id = future_order_dict['i']
        future_order = session.query(model.FutureOrder).filter_by(order_id=order_id).first()
        if future_order:
            print(f"Found future order = {future_order.id}, {future_order.status}")
        future_buy = model_helper.sync_future_order(future_order_dict, future_order)
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None or db_url is not None)
    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    print('Parsing {}...'.format(input_file))

    # TODO: live metrics such as citation and usage counts are absent from the
    # official export format. Author emails are too detailed to parse for now.
    # Read the XML with ElementTree; the root tag is `records`. WoS XML carries
    # a default namespace that must be prefixed onto every node lookup.
    tree = ET.parse(input_file)
    records = tree.getroot()
    name_space = records.tag[:records.tag.index('}') + 1]

    wos_document_list = []
    for record in records:
        wos_document = WosDocument()
        wos_document.unique_id = get_unique_id(name_space, record)
        wos_document.title = get_title(name_space, record)
        wos_document.abs = get_abs(name_space, record)
        wos_document.journal = get_journal(name_space, record)
        wos_document.journal_iso = get_journal_iso(name_space, record)
        wos_document.journal_29 = get_journal_29(name_space, record)
        wos_document.publisher = get_publisher(name_space, record)
        wos_document.volume = get_volume(name_space, record)
        wos_document.issue = get_issue(name_space, record)
        wos_document.start_page = get_start_page(name_space, record)
        wos_document.end_page = get_end_page(name_space, record)
        wos_document.pub_year = get_pub_year(name_space, record)
        wos_document.pub_month_day = get_pub_month_day(name_space, record)
        wos_document.document_type = get_document_type(name_space, record)
        wos_document.doi = get_doi(name_space, record)
        wos_document.reference_num = get_reference_num(name_space, record)
        wos_document.funding_text = get_funding_text(name_space, record)
        wos_document.language = get_language(name_space, record)

        authors = record.find('./{0}static_data/{0}summary/{0}names'.format(name_space))
        wos_document.authors = get_authors(name_space, authors, record)

        references = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}references'.format(name_space))
        wos_document.references = get_references(name_space, references)

        categories = record.findall(
            './{0}static_data/{0}fullrecord_metadata/{0}category_info/{0}subjects/{0}subject[@ascatype="traditional"]'
            .format(name_space))
        wos_document.categories = get_categories(name_space, categories)

        areas = record.findall(
            './{0}static_data/{0}fullrecord_metadata/{0}category_info/{0}subjects/{0}subject[@ascatype="extended"]'
            .format(name_space))
        wos_document.research_areas = get_research_areas(name_space, areas)

        keywords = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}keywords'.format(name_space))
        wos_document.keywords = get_keywords(name_space, keywords)

        keyword_plus = record.find('./{0}static_data/{0}item/{0}keywords_plus'.format(name_space))
        wos_document.keyword_plus = get_keyword_plus(name_space, keyword_plus)

        fundings = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}fund_ack/{0}grants'.format(name_space))
        wos_document.fundings = get_fundings(name_space, fundings)

        wos_document_list.append(wos_document)

        # Flush the buffer to the database periodically.
        if len(wos_document_list) > 499:
            print('Buffer threshold reached, writing to database...')
            session.add_all(wos_document_list)
            session.commit()
            wos_document_list.clear()

    print('Parsed {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
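
# A minimal, standalone sketch of the namespace handling used above: WoS XML
# declares a default namespace, so every tag in find()/findall() paths must be
# prefixed with '{namespace}'. Toy document, not real WoS data.
def _example_namespace_find():
    import xml.etree.ElementTree as ET
    xml = '<records xmlns="http://example.com/ns"><REC><UID>x</UID></REC></records>'
    records = ET.fromstring(xml)
    name_space = records.tag[:records.tag.index('}') + 1]  # '{http://example.com/ns}'
    rec = records.find('./{0}REC'.format(name_space))
    return rec.find('./{0}UID'.format(name_space)).text     # 'x'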
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None or db_url is not None)

    print('Parsing {}...'.format(input_file))
    bibtex_filename = input_file
    with open(bibtex_filename, 'r', encoding='utf-8') as file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_db = bibtexparser.load(file, parser=parser)

    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    for i in range(len(bib_db.entries)):
        author_list = []
        category_list = []
        area_list = []
        keyword_list = []
        keyword_plus_list = []
        reference_list = []
        funding_list = []

        # Parse the core document fields for the wos_document table.
        try:
            wos_document = WosDocument(
                bib_db.entries[i]['unique-id'][5:-1].lower()
                if 'unique-id' in bib_db.entries[i] else None,
                bib_db.entries[i]['title'][1:-1].lower().replace('\n', ' ').replace('\\', '')
                if 'title' in bib_db.entries[i] else None,
                bib_db.entries[i]['abstract'][1:-1].lower().replace('\n', ' ').replace('\\', '')
                if 'abstract' in bib_db.entries[i] else None,
                bib_db.entries[i]['journal'][1:-1].lower().replace('\\', '')
                if 'journal' in bib_db.entries[i]
                else bib_db.entries[i]['booktitle'][1:-1].lower().replace('\n', ' ').replace('\\', '')
                if 'booktitle' in bib_db.entries[i] else None,
                bib_db.entries[i]['journal-iso'][1:-1].lower().replace('\\', '')
                if 'journal-iso' in bib_db.entries[i] else None,
                # The bibtex format has no 29-character journal abbreviation.
                None,
                bib_db.entries[i]['publisher'][1:-1].lower().replace('\\', '')
                if 'publisher' in bib_db.entries[i] else None,
                bib_db.entries[i]['volume'][1:-1].lower()
                if 'volume' in bib_db.entries[i] else None,
                bib_db.entries[i]['number'][1:-1].lower()
                if 'number' in bib_db.entries[i] else None,
                bib_db.entries[i]['pages'][1:-1].lower().split('-')[0]
                if 'pages' in bib_db.entries[i]
                and len(bib_db.entries[i]['pages'][1:-1].lower().split('-')) > 1
                else bib_db.entries[i]['pages'][1:-1].lower().split('+')[0]
                if 'pages' in bib_db.entries[i] else None,
                bib_db.entries[i]['pages'][1:-1].lower().split('-')[1]
                if 'pages' in bib_db.entries[i]
                and len(bib_db.entries[i]['pages'][1:-1].lower().split('-')) > 1
                else '+' if 'pages' in bib_db.entries[i] else None,
                bib_db.entries[i]['year'][1:-1].lower()
                if 'year' in bib_db.entries[i] else None,
                bib_db.entries[i]['month'][1:-1].lower()
                if 'month' in bib_db.entries[i] else None,
                bib_db.entries[i]['type'][1:-1].lower()
                if 'type' in bib_db.entries[i] else None,
                bib_db.entries[i]['doi'][1:-1].lower()
                if 'doi' in bib_db.entries[i] else None,
                bib_db.entries[i]['times-cited'][1:-1].lower()
                if 'times-cited' in bib_db.entries[i] else None,
                bib_db.entries[i]['number-of-cited-references'][1:-1].lower()
                if 'number-of-cited-references' in bib_db.entries[i] else None,
                bib_db.entries[i]['usage-count-last-180-days'][1:-1].lower()
                if 'usage-count-last-180-days' in bib_db.entries[i] else None,
                bib_db.entries[i]['usage-count-since-2013'][1:-1].lower()
                if 'usage-count-since-2013' in bib_db.entries[i] else None,
                bib_db.entries[i]['funding-text'][1:-1].lower().replace('\n', ' ').replace('\\', '')
                if 'funding-text' in bib_db.entries[i] else None,
                bib_db.entries[i]['language'][1:-1].lower()
                if 'language' in bib_db.entries[i] else None,
                bib_db.entries[i]['author-email'][1:-1].lower().replace('\n', ';').replace('\\', '')
                if 'author-email' in bib_db.entries[i] else None)
        except Exception as e:
            print(bib_db.entries[i])
            print('This entry failed to parse:', e)
            exit(-1)

        # TODO: for now, skip records with malformed fields.
        if wos_document.unique_id is None:
            print('{} contains a malformed record; skipped'.format(input_file))
            continue

        # Parse authors and their affiliations.
        if bib_db.entries[i]['affiliation'] is not None:
            for author_info, addresses in bib_db.entries[i]['affiliation'].items():
                affiliation_list = []
                # The bibtex format provides no standardized name abbreviation.
                author = WosAuthor(author_info[0], author_info[1], None,
                                   author_info[2], author_info[3])
                session.add(author)
                session.flush()
                if addresses is None:
                    no_affiliation = WosAffiliation(None)
                    affiliation_list.append(no_affiliation)
                else:
                    for address in addresses:
                        affiliation = WosAffiliation(address)
                        affiliation_list.append(affiliation)
                author.affiliations = affiliation_list
                author_list.append(author)
            wos_document.authors = author_list

        # Parse WoS categories.
        if bib_db.entries[i]['web-of-science-categories'] is not None:
            for category in bib_db.entries[i]['web-of-science-categories']:
                cat = WosCategory(category)
                category_list.append(cat)
            wos_document.categories = category_list

        # Parse research areas.
        if bib_db.entries[i]['research-areas'] is not None:
            for area in bib_db.entries[i]['research-areas']:
                a = WosResearchArea(area)
                area_list.append(a)
            wos_document.research_areas = area_list

        # Parse author keywords.
        if bib_db.entries[i]['keywords'] is not None:
            for keyword in bib_db.entries[i]['keywords']:
                key = WosKeyword(keyword)
                keyword_list.append(key)
            wos_document.keywords = keyword_list

        # Parse WoS KeywordPlus.
        if bib_db.entries[i]['keywords-plus'] is not None:
            for keyword_plus in bib_db.entries[i]['keywords-plus']:
                kp = WosKeywordPlus(keyword_plus)
                keyword_plus_list.append(kp)
            wos_document.keyword_plus = keyword_plus_list

        # Parse funding information.
        if bib_db.entries[i]['funding-acknowledgement'] is not None:
            for agent, numbers in bib_db.entries[i]['funding-acknowledgement'].items():
                for number in numbers:
                    fund = WosFunding(agent, number)
                    funding_list.append(fund)
            wos_document.fundings = funding_list

        # Parse cited references.
        if bib_db.entries[i]['cited-references'] is not None:
            for reference in bib_db.entries[i]['cited-references']:
                ref = WosReference(reference[0], reference[1], reference[2],
                                   reference[3], reference[4], reference[5])
                reference_list.append(ref)
            wos_document.references = reference_list

        session.add(wos_document)

    print('Parsed {}, inserting...'.format(input_file))
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
def term_feature_extraction(db_path=None, db_url=None, output_path=''):
    assert (db_path is None and db_url is not None) or (db_path is not None and db_url is None)
    assert output_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)
    meta = MetaData(engine)
    con = engine.connect()
    session = get_session(engine)
    term_table = Table('terms', meta, autoload=True)

    dataframe = []
    abbr = {}
    query_terms = select([term_table.c.tid, term_table.c.term, term_table.c.term2]) \
        .where(term_table.c.tid > 9281).order_by(asc(term_table.c.tid))
    result = con.execute(query_terms)
    terms = list(result)
    result.close()

    # with open('terms.csv', mode='w', encoding='utf-8') as _:
    #     _.write('"' + '","'.join(
    #         ['term', 'year', 'document_count', 'document_increment_ratio', 'author_count',
    #          'citation_count', 'funding_count', 'reference_count', 'acc_document_count',
    #          'acc_author_count', 'acc_citation_count', 'acc_funding_count',
    #          'acc_reference_count']) + '"\n')

    # Map short terms through an abbreviation table to their expansions.
    with open(r'abbreviations.txt', mode='r', encoding='utf-8') as abbr_file:
        for line in abbr_file:
            line_split = line.strip().split('\t')
            abbr[line_split[2].lower()] = line_split[1].lower()

    for term in terms:
        if len(term[1]) < 6:
            new_term = abbr.get(term[1], term[1])
        else:
            new_term = term[1]
        print(term[0], ':', new_term)

        acc_document_count = 0
        acc_author_count = 0
        acc_reference_count = 0
        acc_funding_count = 0
        acc_citation_count = 0
        last_document_count = 0

        for year in range(2003, 2013):
            document_count = 0
            author_count = 0
            reference_count = 0
            funding_count = 0
            citation_count = 0
            documents = session.query(WosDocument).filter(WosDocument.pub_year == year).all()
            for document in documents:
                keyword_plus = ', '.join(
                    [kp.keyword_plus for kp in document.keyword_plus]) if document.keyword_plus else ''
                keyword = ', '.join([kw.keyword for kw in document.keywords]) if document.keywords else ''
                title = document.title if document.title else ''
                abstract = document.abs if document.abs else ''
                string = ' '.join([keyword_plus, keyword, title, abstract])

                # Match the term (optionally pluralized) on word boundaries.
                term_pattern = re.compile(r'\b{}s?\b'.format(re.escape(new_term)))
                if term[2]:
                    term2_pattern = re.compile(r'\b{}s?\b'.format(re.escape(term[2])))
                else:
                    term2_pattern = re.compile(r'(?!)')  # never matches
                if term_pattern.search(string) or term2_pattern.search(string):
                    document_count += 1
                    author_count += len(document.authors)
                    reference_count += len(document.references)
                    funding_count += len(document.fundings)
                    citation_count += document.cited_times

            acc_document_count += document_count
            acc_author_count += author_count
            acc_reference_count += reference_count
            acc_funding_count += funding_count
            acc_citation_count += citation_count
            document_increment_ratio = (document_count / last_document_count) if last_document_count != 0 else 0
            last_document_count = document_count

            # dataframe.append((new_term, year, document_count, document_increment_ratio,
            #                   author_count, citation_count, funding_count, reference_count,
            #                   acc_document_count, acc_author_count, acc_citation_count,
            #                   acc_funding_count, acc_reference_count))
            with open('terms.csv', mode='a', encoding='utf-8') as file:
                file.write('"' + '","'.join(map(str, [
                    new_term, year, document_count, document_increment_ratio,
                    author_count, citation_count, funding_count, reference_count,
                    acc_document_count, acc_author_count, acc_citation_count,
                    acc_funding_count, acc_reference_count])) + '"\n')

    # df = pd.DataFrame(dataframe, columns=['term', 'year', 'document_count',
    #     'document_increment_ratio', 'author_count', 'citation_count', 'funding_count',
    #     'reference_count', 'acc_document_count', 'acc_author_count', 'acc_citation_count',
    #     'acc_funding_count', 'acc_reference_count'])
    # df.to_csv('terms.csv', index=None)
    session.close()
    con.close()
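
# Layout of each row appended to terms.csv above; the column names follow the
# commented-out header block, and the sample values are hypothetical:
# "term","year","document_count","document_increment_ratio","author_count",
# "citation_count","funding_count","reference_count","acc_document_count",
# "acc_author_count","acc_citation_count","acc_funding_count","acc_reference_count"
# e.g. "crispr","2010","12","1.5","40","230","8","310","55","180","900","30","1400"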
def resetSpotBalance(asset):
    with Session(model.get_engine()) as session, session.begin():
        spot_balance = session.query(model.SpotBalance).get(asset)
        spot_balance.outdated = True
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None or db_url is not None)
    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    print('Parsing {}...'.format(input_file))

    # TODO: live metrics such as citation and usage counts are absent from the
    # official export format. Author emails are too detailed to parse for now.
    # Instead of reading the whole file at once with ElementTree (root tag
    # `records`, namespace-prefixed lookups), stream one <REC> element at a time:
    # tree = ET.parse(input_file)
    # records = tree.getroot()
    # name_space = records.tag[:records.tag.index('}') + 1]

    wos_document_list = []
    # ns_pattern = re.compile(r'<records xmlns="(.+?)"')
    # name_space = None
    with open(input_file, mode='r', encoding='utf-8') as file:
        single_record = ''
        for line in file:
            if '<REC' in line:
                single_record = line
            elif '</REC>' in line:
                single_record += line
                start = time.time()
                record = ET.fromstring(single_record)

                wos_document = WosDocument()
                wos_document.unique_id = get_unique_id(record)
                wos_document.title = get_title(record)
                wos_document.abs = get_abs(record)
                wos_document.journal = get_journal(record)
                wos_document.journal_iso = get_journal_iso(record)
                wos_document.journal_29 = get_journal_29(record)
                wos_document.publisher = get_publisher(record)
                wos_document.volume = get_volume(record)
                wos_document.issue = get_issue(record)
                wos_document.start_page = get_start_page(record)
                wos_document.end_page = get_end_page(record)
                wos_document.pub_year = get_pub_year(record)
                wos_document.pub_month_day = get_pub_month_day(record)
                wos_document.document_type = get_document_type(record)
                wos_document.doi = get_doi(record)
                wos_document.reference_num = get_reference_num(record)
                wos_document.funding_text = get_funding_text(record)
                wos_document.language = get_language(record)

                authors = record.find('./static_data/summary/names')
                wos_document.authors = get_authors(authors, record)

                references = record.find('./static_data/fullrecord_metadata/references')
                wos_document.references = get_references(references)

                categories = record.findall(
                    './static_data/fullrecord_metadata/category_info/subjects/subject[@ascatype="traditional"]')
                wos_document.categories = get_categories(categories)

                areas = record.findall(
                    './static_data/fullrecord_metadata/category_info/subjects/subject[@ascatype="extended"]')
                wos_document.research_areas = get_research_areas(areas)

                keywords = record.find('./static_data/fullrecord_metadata/keywords')
                wos_document.keywords = get_keywords(keywords)

                keyword_plus = record.find('./static_data/item/keywords_plus')
                wos_document.keyword_plus = get_keyword_plus(keyword_plus)

                fundings = record.find('./static_data/fullrecord_metadata/fund_ack/grants')
                wos_document.fundings = get_fundings(fundings)

                wos_document_list.append(wos_document)

                # Flush the buffer to the database periodically.
                if len(wos_document_list) > 499:
                    print('Buffer threshold reached, writing to database...', end='')
                    session.add_all(wos_document_list)
                    session.commit()
                    print(' done - {} s'.format(time.time() - start))
                    wos_document_list.clear()
            elif line == '\n':
                pass
            else:
                single_record += line

    print('Parsed {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
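
# Design note: the loop above buffers one <REC>...</REC> element at a time and
# parses it with ET.fromstring, so a large export never has to fit in memory as
# one tree. A minimal standalone sketch of the same pattern:
def _example_stream_records(path):
    import xml.etree.ElementTree as ET
    with open(path, mode='r', encoding='utf-8') as file:
        buffer = ''
        for line in file:
            if '<REC' in line:
                buffer = line                  # start a fresh record
            elif '</REC>' in line:
                buffer += line
                yield ET.fromstring(buffer)    # parse just this record
            elif line != '\n':
                buffer += line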