def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('q_model', help='Path to Q.joblib (deep-deep link model)')
    arg('data', help='Path to jl.gz file in CDR format')
    arg('output_folder', help='Where to put html output files N.html')
    arg('--offset', type=int, default=0, help='0-based start index in data')
    arg('--limit', type=int, default=100, help='Number of documents to analyze')
    args = parser.parse_args()

    output_path = Path(args.output_folder)
    output_path.mkdir(exist_ok=True)
    q_model = joblib.load(args.q_model)
    assert not q_model.get('page_vectorizer'), 'TODO'
    le = DictLinkExtractor()
    styles = format_html_styles()
    with json_lines.open(args.data, broken=True) as items:
        items = islice(items, args.offset, None)
        if args.limit:
            items = islice(items, args.limit)
        with multiprocessing.Pool() as pool:
            for idx, expls in enumerate(
                    pool.imap(partial(links_expls, q_model, le), items)):
                expls.sort(reverse=True)
                (output_path.joinpath('{}.html'.format(idx + args.offset))
                 .write_text(styles + '\n'.join(expl for _, expl in expls)))
def convert_item_set_to_dict(file_dir):
    """Converts a dataset file into a dict of item information."""
    item_info = {
        "item_id": [],
        "title": [],
        "price": [],
        "category_id": [],
        "product_id": [],
        "domain_id": [],
        "condition": []
    }
    with json_lines.open(file_dir) as f:
        for item in f:
            item_info['item_id'].append(item['item_id'])
            item_info['title'].append(item['title'])
            item_info['price'].append(item['price'])
            item_info['category_id'].append(item['category_id'])
            item_info['product_id'].append(item['product_id'])
            item_info['domain_id'].append(item['domain_id'])
            item_info['condition'].append(item['condition'])
    print("Finished reading file, proceeding to DataFrame")
    return item_info
def test_reader_broken_json_partial(tmpdir):
    # with broken=True broken json lines are skipped, but reading continues
    p = tmpdir.join('myfile.jl')
    p.write_binary(b'{"a": 1}\n{"a": 2\n{"b": 1}\n')
    with json_lines.open(str(p), broken=True) as f:
        lines = list(f)
    assert lines == [{'a': 1}, {'b': 1}]
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('input', help='.jl or .jl.gz file in CDRv2 format')
    arg('output', help='path to .jl or .jl.gz output in CDRv3 format')
    arg('--broken', action='store_true',
        help='specify if input might be broken (incomplete)')
    args = parser.parse_args()
    assert args.input != args.output

    with json_lines.open(args.input, broken=args.broken) as f:
        opener = gzip.open if args.output.endswith('.gz') else open
        with opener(args.output, 'wt') as outf:
            for v2_item in f:
                dt = datetime.fromtimestamp(v2_item['timestamp'] / 1e3)
                timestamp_crawl = format_timestamp(dt)
                assert v2_item['version'] == 2.0
                v3_item = CDRItem(
                    _id=format_id(v2_item['url'], timestamp_crawl),
                    crawler=v2_item['crawler'],
                    team=v2_item['team'],
                    timestamp_crawl=timestamp_crawl,
                    version=3.0,
                    url=v2_item['url'],
                    raw_content=v2_item['raw_content'],
                    content_type=v2_item['content_type'],
                    response_headers={'content-type': v2_item['content_type']},
                )
                outf.write(json.dumps(dict(v3_item)))
                outf.write('\n')
def run(self, args, opts):
    if not args:
        raise UsageError()
    if len(args) == 1 and '*' in args[0]:  # paths were not expanded (docker)
        filenames = glob.glob(args[0])
    else:
        filenames = args
    del args
    filtered_filenames = [
        f for f in filenames
        if re.match(r'[a-z0-9]{12}\.csv$', os.path.basename(f))
    ]
    filenames = filtered_filenames or filenames
    if not filenames:
        raise UsageError()
    response_logs = []
    for filename in filenames:
        with json_lines.open(filename) as f:
            response_logs.append(pd.DataFrame(f))
    print('Read data from {} files'.format(len(filenames)))
    all_rpms = [
        rpms for rpms in (
            get_rpms(name, rlog, step=opts.step, smooth=opts.smooth)
            for name, rlog in zip(filenames, response_logs))
        if rpms is not None
    ]
    if all_rpms:
        print_rpms(all_rpms, opts)
    print_scores(response_logs, opts)
def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='Pages in .jl.gz')
    parser.add_argument(
        'out_prefix',
        help='Output prefix (two files are written: one with '
             'full data and one with meta: status, url, domain, lang)')
    args = parser.parse_args(args)
    with json_lines.open(args.infile, broken=True) as f:
        items_file = gzip.open(args.out_prefix + '.items.jl.gz', 'wt')
        meta_file = gzip.open(args.out_prefix + '.meta.jl.gz', 'wt')
        n_errors = 0
        with multiprocessing.Pool() as pool:
            for text_item in pool.imap_unordered(convert_item, f, chunksize=100):
                if text_item is None:
                    n_errors += 1
                else:
                    items_file.write(json.dumps(text_item))
                    items_file.write('\n')
                    meta_file.write(json.dumps({
                        key: text_item[key]
                        for key in ['url', 'domain', 'lang', 'status', 'mangled_url']
                    }))
                    meta_file.write('\n')
        print('Number of errors: {}'.format(n_errors))
def convert_set_to_dict(file_dir):
    """Converts a dataset file into a user-item-timestamp dict."""
    user_item_tmstmp = {
        "user_id": [],
        "item_id": [],
        "rating": [],
        "timestamp": []
    }
    with json_lines.open(file_dir) as f:
        for index, item in enumerate(f):
            # Save user events
            for event in item['user_history']:
                if check_item_id(event['event_info']):
                    user_item_tmstmp['user_id'].append(index)
                    user_item_tmstmp['item_id'].append(event['event_info'])
                    user_item_tmstmp['rating'].append(1)
                    user_item_tmstmp['timestamp'].append(
                        datetime.datetime.strptime(event['event_timestamp'],
                                                   '%Y-%m-%dT%H:%M:%S.%f%z'))
            # Save the bought item, timestamped after the last user event
            user_item_tmstmp['user_id'].append(index)
            user_item_tmstmp['item_id'].append(item['item_bought'])
            user_item_tmstmp['rating'].append(5)
            user_item_tmstmp['timestamp'].append(
                user_item_tmstmp['timestamp'][-1] + datetime.timedelta(hours=2))
    print("Finished reading file, proceeding to DataFrame")
    return user_item_tmstmp
def read_samples(in_file, out_dir, args):
    ''' 'LR' version stores [Left headline/description,
    Right headline/description, left/right index]'''
    left_labeled_data_LR = []
    right_labeled_data_LR = []

    ''' 'LCR' version stores [Left headline/description,
    Center headline/description, Right headline/description, left/right index]'''
    left_labeled_data_LCR = []
    right_labeled_data_LCR = []

    ''' 'Left_Center_desc_Right' version stores [Left headline/description,
    Center description, Right headline/description, left/right index]'''
    left_labeled_data_Left_Center_Desc_Right = []
    right_labeled_data_Left_Center_Desc_Right = []

    LCR_data_nonpairs = []
    with json_lines.open(in_file) as f:
        for item in f:
            LCR_data_nonpairs = read_samples_unpaired(
                args.data_type, item["articles"], LCR_data_nonpairs)
            # left_labeled_data_LR, right_labeled_data_LR = read_samples_util_LR(args.data_type, item['articles'], left_labeled_data_LR, right_labeled_data_LR)
            # left_labeled_data_LCR, right_labeled_data_LCR = read_samples_util_LCR(args.data_type, item['articles'], left_labeled_data_LCR, right_labeled_data_LCR)
            # left_labeled_data_Left_Center_Desc_Right, right_labeled_data_Left_Center_Desc_Right = read_samples_util_Left_Center_Desc_Right("article_headline", item['articles'], left_labeled_data_Left_Center_Desc_Right, right_labeled_data_Left_Center_Desc_Right)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    pickle.dump(
        LCR_data_nonpairs,
        open(os.path.join(out_dir, args.data_type + "_LCR_nonpairs.pickle"), 'wb'))
def test_reader_broken_json_partial_gzipped(tmpdir):
    # For gzip files broken=True only means gzip recovery, inside a single
    # archive processing stops at first broken json line
    p = tmpdir.join('myfile.jl.gz')
    write_gz(p, b'{"a": 1}\n{"a": 2\n{"b": 1}\n')
    with json_lines.open(str(p), broken=True) as f:
        lines = list(f)
    assert lines == [{'a': 1}]
def read_input(input_file):
    """This method reads the input file which is in gzip format."""
    logging.info("reading file {0}...this may take a while".format(input_file))
    # Yield at most 20000 items; `remaining` avoids shadowing the builtin `max`
    remaining = 20000
    with json_lines.open(input_file, 'rb') as f:
        for item in f:
            if remaining > 0:
                remaining -= 1
                yield item['s']
def read_blind_file():
    x = []
    with json_lines.open('test_dataset_blind.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))
    return x
def init_factions(factions):
    print("Searching factions...")
    with json_lines.open(settings.factions_jsonl) as reader:
        for item in reader:
            for faction in factions:
                if isinstance(faction, int):
                    if item['id'] == faction:
                        save_faction(item)
                else:
                    if item['name'] == faction:
                        save_faction(item)
def read_file_opt():
    x = []
    opt = list()
    with json_lines.open('train_dataset.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))
            opt.append(line['opt'])
    return x, opt
def read_file_comp():
    x = []
    comp = list()
    with json_lines.open('train_dataset.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))
            comp.append(line['compiler'])
    return x, comp
def read_samples_description_sep_headline(in_file, out_dir, args):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    headline_desc_pair_data = []
    # desc_sep_headline_output_file = open(os.path.join(out_dir, "description_sep_headline.txt"), "w")
    newsdesc_sep_headline_output_file_prefix = "description_sep_headline"
    newsdesc_sep_ideologylabel_headline_output_file_prefix = "description_sep_ideologylabel_headline"
    allsides_desc_sep_LCR_headline_output_file_prefix = "allsides_desc_sep_LCR_headline"
    allsides_desc_news_desc_ideology_headline_output_file_prefix = "allsides_desc_news_desc_ideology_headline"

    newsdesc_sep_headline_data = []
    newsdesc_ideology_headline_data = []
    allsides_desc_sep_ideology_headline_data = []
    allsides_desc_news_desc_ideology_headline_data = []

    with json_lines.open(in_file) as f:
        for item in f:
            for article in item["articles"]:
                newsdesc_sep_headline_data.append(
                    "<BOS> "
                    + article["article_description"].lower().replace("\n", " ")
                    + " <SEP> "
                    + article["article_headline"].replace("\n", " ")
                    + " <EOS>")
                ideology_label = "<" + article["political_spectrum"].upper() + ">"
                newsdesc_ideology_headline_data.append(
                    "<BOS> "
                    + article["article_description"].lower().replace("\n", " ")
                    + " <SEP> "
                    + ideology_label
                    + article["article_headline"].replace("\n", " ")
                    + " <EOS>")
            allsides_desc_sep_ideology_headline_data = get_allsides_desc_ideology_headline(
                allsides_desc_sep_ideology_headline_data, item)
            allsides_desc_news_desc_ideology_headline_data = get_allsides_desc_news_desc_ideology_headline(
                allsides_desc_news_desc_ideology_headline_data, item)

    write_to_text_file_util(newsdesc_sep_headline_data,
                            out_dir,
                            newsdesc_sep_headline_output_file_prefix)
    write_to_text_file_util(newsdesc_ideology_headline_data,
                            out_dir,
                            newsdesc_sep_ideologylabel_headline_output_file_prefix)
    write_to_text_file_util(allsides_desc_sep_ideology_headline_data,
                            out_dir,
                            allsides_desc_sep_LCR_headline_output_file_prefix)
    write_to_text_file_util(allsides_desc_news_desc_ideology_headline_data,
                            out_dir,
                            allsides_desc_news_desc_ideology_headline_output_file_prefix)
def read_geo_peers():
    peer_ip_countries = dict()
    try:
        with json_lines.open(peer_address_geo_file) as f:
            #thing = json.load(f)
            for item in f:
                #print(item["collector"])
                if item["collector"] in ignore_multi_hop_collectors:
                    continue  #ignore multi-hop collectors for now
                col = item["collector"]
                #print (col)
                for monitor in item["peers"]:
                    if col != "caida":
                        ip_address = str(monitor["peer_address"])
                        if ip_address in peer_ip_countries:
                            continue  #ignore peers already read from a different collector
                        #checksum to determine if monitor is both full feed and confidence == 1
                        try:
                            ffeed = int(monitor["full_feed"])
                        except ValueError:
                            #print ("monitorFails full-feed " + str(monitor["peer_address"]))
                            continue  #no full feed value, ignore
                        try:
                            conf = int(monitor["confidence"])
                        except ValueError:
                            #print ("monitorFails confidence " + str(monitor["peer_address"]))
                            continue  #no confidence value, ignore
                        csum = conf + ffeed
                        if csum < 2:
                            #print ("monitorFails confidence or full feed " + str(monitor["peer_address"]))
                            continue  #ignore monitors that aren't both full feed and confidence 1
                    else:
                        ip_address = str(monitor["dns_name"])
                    try:
                        country = str(monitor["final_country"])
                    except (KeyError, ValueError):
                        #print ("monitorFails final country " + str(monitor["peer_address"]))
                        continue  #no final country value, ignore
                    #print ("monitorPass " + str(monitor["peer_address"]))
                    peer_ip_countries[ip_address] = country
    except:
        sys.stderr.write("\n something went wrong opening " + peer_address_geo_file + "\n")
        sys.exit()
    return peer_ip_countries
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('input', help='In .jl.gz format with html in "text" field')
    arg('html_field', help='Field name where html is stored')
    arg('output', help='Output in jl.gz format with text in "text" field')
    args = parser.parse_args()
    with json_lines.open(args.input, broken=True) as f, \
            gzip.open(args.output, 'wt') as outf:
        with multiprocessing.Pool() as pool:
            for text_item in pool.imap_unordered(
                    partial(text_worker, html_field=args.html_field), f):
                if text_item is not None:
                    outf.write(json.dumps(text_item))
                    outf.write('\n')
def extract_classifications(file_path, set_type):
    i = 0
    with json_lines.open(file_path) as reader:
        for obj in reader:
            classification = obj["truthMean"]
            # classification = 1 if obj["truthClass"] == "clickbait" else 0
            if set_type == "training":
                training_classifications.append(classification)
            else:
                test_classifications.append(classification)
                test_ids.append(obj["id"])
            # Only collect features for the number of samples specified
            # (takes too long for all 17,000)
            i += 1
            if set_type == "training" and i == max_samples:
                break
def load_system_stats_file(*, je, path, metrics_data, node, q):
    """
    Extract relevant data from a Xcalar system stats file.
    """
    with json_lines.open(path) as f:
        for dikt in f:
            for metric_id in metrics_data.ids_for_source(source="_SYSTEM_STATS"):
                mcfg = metrics_data.cfg_for_id(metric_id=metric_id).dikt
                if 'xy_expr' in mcfg:
                    points = je.extract_xy(xy_expr=mcfg.get('xy_expr'), dikt=dikt)
                elif 'key_expr' in mcfg and 'val_expr' in mcfg:
                    points = je.extract_kv(key_expr=mcfg.get('key_expr'),
                                           val_expr=mcfg.get('val_expr'),
                                           dikt=dikt)
                else:
                    raise ValueError("invalid metric config: {}".format(mcfg))
                put_points(node=node, metric_id=metric_id, points=points, q=q)
def init_systems():
    print("Searching systems...")
    faction_ids = []
    sql = 'SELECT id FROM minor_factions WHERE name IN %s' % (
        in_clause(settings.monitored_factions))
    c = db.cursor()
    c.execute(sql)
    for row in c:
        faction_ids.append(row[0])
    print(faction_ids)
    with json_lines.open(settings.systems_jsonl) as reader:
        for item in reader:
            print(item['name'], " \r", end="")
            for faction in item['minor_faction_presences']:
                if faction['minor_faction_id'] in faction_ids:
                    save_system(item)
    print("")
def read_geo_peers():
    peer_ip_countries = dict()
    with json_lines.open(peer_address_geo_file) as f:
        #thing = json.load(f)
        for item in f:
            #print(item["collector"])
            if item["collector"] in ignore_multi_hop_collectors:
                continue  #ignore multi-hop collectors for now
            for monitor in item["peers"]:
                ip_address = str(monitor["peer_address"])
                if ip_address in peer_ip_countries:
                    continue  #ignore peers already read from a different collector
                #checksum to determine if monitor is both full feed and confidence == 1
                try:
                    ffeed = int(monitor["full_feed"])
                except ValueError:
                    print("monitorFails full-feed " + str(monitor["peer_address"]))
                    continue  #no full feed value, ignore
                try:
                    conf = int(monitor["confidence"])
                except ValueError:
                    print("monitorFails confidence " + str(monitor["peer_address"]))
                    continue  #no confidence value, ignore
                csum = conf + ffeed
                if csum < 2:
                    print("monitorFails confidence or full feed " + str(monitor["peer_address"]))
                    continue  #ignore monitors that aren't both full feed and confidence 1
                try:
                    country = str(monitor["final_country"])
                except (KeyError, ValueError):
                    print("monitorFails final country " + str(monitor["peer_address"]))
                    continue  #no final country value, ignore
                print("monitorPass " + str(monitor["peer_address"]))
                peer_ip_countries[ip_address] = country
    return peer_ip_countries
def load_data(train_jsonl_file, data_type):
    # data_type in [train, dev, test]
    doc_num = 0
    qa_num = 0
    documents = []
    question_answer = {}
    dataset = {"documents": documents, 'question_answer': question_answer}
    with json_lines.open(train_jsonl_file) as f:
        for item in f:
            if 'header' in item:
                dataset['name'] = item['header']['dataset']
            if 'qas' in item:
                document = Document()
                document.context = item['context']
                document.context_token = item['context_tokens']
                qas = item['qas']
                for qa in qas:
                    qa_ = QA()
                    qa_.question = qa['question']
                    # qa_.id = qa['id']  # some datasets don't have id
                    qa_.qid = qa['qid']
                    qa_.question_tokens = qa['question_tokens']
                    if data_type != 'test':
                        qa_.answers = qa['answers']
                        qa_.detected_answers = qa['detected_answers']
                        question_answer[qa['qid']] = qa['answers']
                    document.qas.append(qa_)
                    qa_num += 1
                documents.append(document)
                doc_num += 1
    logging.info("{}: {} documents, {} questions".format(
        train_jsonl_file, doc_num, qa_num))
    return dataset
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('dmoz_urls_topics', help='In .csv.gz format')
    arg('dmoz_text', help='Items with url and text fields in .jl.gz format')
    arg('output', help='output file for fasttext training')
    args = parser.parse_args()

    with gzip.open(args.dmoz_urls_topics, 'rt') as f:
        topics_by_url = dict(csv.reader(f))
    with json_lines.open(args.dmoz_text) as f, open(args.output, 'wt') as outf:
        for item in f:
            topics = topics_by_url[item['url']]
            topics = topics.split('/')
            if topics[0] == 'Top':
                topics = topics[1:]
            topics = [t for t in topics if not (len(t) == 1 and t.isupper())]
            for topic in topics:
                outf.write('__label__{} '.format(topic))
            outf.write(to_single_line(item['text']))
            outf.write('\n')
def read_samples(in_file, international_keywords, domestic_keywords):
    domestic_articles = []
    international_articles = []
    with json_lines.open(in_file) as f:
        count = 0
        for item in f:
            international_flag = False
            count += 1
            if item['allsides_description']:
                international_flag = international_flag or check_keywords(
                    item['allsides_description'], international_keywords)
            for article in item['articles']:
                if article['article_description']:
                    international_flag = international_flag or check_keywords(
                        article['article_description'], international_keywords)
            if check_keywords(item['allsides_description'], domestic_keywords):
                international_flag = False
            for article in item['articles']:
                if article['article_description']:
                    if check_keywords(article['article_description'],
                                      domestic_keywords):
                        international_flag = False
            if not international_flag:
                item["type"] = "domestic"
                domestic_articles.append(item)
            else:
                item["type"] = "international"
                international_articles.append(item)
        # print(count)
    return domestic_articles, international_articles
def plot(*args, ymin=None, ymax=None, xmin=None, xmax=None, params=False,
         max_points=200):
    """ Use in the notebook like this:
    plot('./runs/oc2', './runs/oc1', 'loss', 'valid_loss')
    """
    paths, keys = [], []
    for x in args:
        if x.startswith('.') or x.startswith('/'):
            if '*' in x:
                paths.extend(glob.glob(x))
            else:
                paths.append(x)
        else:
            keys.append(x)
    plt.figure(figsize=(12, 8))
    keys = keys or ['loss', 'valid_loss']
    ylim_kw = {}
    if ymin is not None:
        ylim_kw['ymin'] = ymin
    if ymax is not None:
        ylim_kw['ymax'] = ymax
    if ylim_kw:
        plt.ylim(**ylim_kw)
    xlim_kw = {}
    if xmin is not None:
        xlim_kw['xmin'] = xmin
    if xmax is not None:
        xlim_kw['xmax'] = xmax
    if xlim_kw:
        plt.xlim(**xlim_kw)
    for path in paths:
        path = Path(path)
        with json_lines.open(str(path.joinpath('train.log')), broken=True) as f:
            events = list(f)
        if params:
            print(path)
            pprint(json.loads(path.joinpath('params.json').read_text()))
        for key in sorted(keys):
            xs, ys = [], []
            for e in events:
                if key in e:
                    xs.append(e['step'])
                    ys.append(e[key])
            if xs:
                if len(xs) > 2 * max_points:
                    indices = (np.arange(0, len(xs), len(xs) / max_points)
                               .astype(np.int32))
                    xs = np.array(xs)[indices[1:]]
                    ys = [np.mean(ys[idx:indices[i + 1]])
                          for i, idx in enumerate(indices[:-1])]
                plt.plot(xs, ys, label='{}: {}'.format(path, key))
    plt.legend()
def plot(*args, ymin=None, ymax=None, xmin=None, xmax=None, params=False,
         max_points=200, legend=True, title=None, print_keys=False,
         print_paths=False, plt=None, newfigure=True, x_scale=1):
    """ Use in the notebook like this::

        %matplotlib inline
        from imet.utils import plot
        plot('./runs/oc2', './runs/oc1', 'loss', 'valid_loss')
    """
    import json_lines  # not available on Kaggle
    if plt is None:
        from matplotlib import pyplot as plt
    paths, keys = [], []
    for x in args:
        if x.startswith('.') or '/' in x:
            if '*' in x:
                paths.extend(glob.glob(x))
            else:
                paths.append(x)
        else:
            keys.append(x)
    if print_paths:
        print('Found paths: {}'.format(' '.join(sorted(paths))))
    if newfigure:
        plt.figure(figsize=(12, 8))
    keys = keys or ['loss', 'valid_loss']
    ylim_kw = {}
    if ymin is not None:
        ylim_kw['bottom'] = ymin
    if ymax is not None:
        ylim_kw['top'] = ymax
    if ylim_kw:
        plt.ylim(**ylim_kw)
    xlim_kw = {}
    if xmin is not None:
        xlim_kw['left'] = xmin
    if xmax is not None:
        xlim_kw['right'] = xmax
    if xlim_kw:
        plt.xlim(**xlim_kw)
    all_keys = set()
    for path in sorted(paths):
        path = Path(path)
        with json_lines.open(path / 'train.log', broken=True) as f:
            events = list(f)
        all_keys.update(k for e in events for k in e)
        for key in sorted(keys):
            xs, ys, ys_err = [], [], []
            for e in events:
                if key in e:
                    xs.append(e['step'] * x_scale)
                    ys.append(e[key])
                    std_key = key + '_std'
                    if std_key in e:
                        ys_err.append(e[std_key])
            if xs:
                if np.isnan(ys).any():
                    print('Warning: NaN {} for {}'.format(key, path))
                if len(xs) > 2 * max_points:
                    indices = (np.arange(0, len(xs) - 1, len(xs) / max_points)
                               .astype(np.int32))
                    xs = np.array(xs)[indices[1:]]
                    ys = _smooth(ys, indices)
                    if ys_err:
                        ys_err = _smooth(ys_err, indices)
                label = '{}: {}'.format(path, key)
                if label.startswith('_'):
                    label = ' ' + label
                if ys_err:
                    ys_err = 1.96 * np.array(ys_err)
                    plt.errorbar(xs, ys, yerr=ys_err, fmt='-o', capsize=5,
                                 capthick=2, label=label)
                else:
                    plt.plot(xs, ys, label=label)
                plt.legend()
    if newfigure:
        plt.grid()
    if legend:
        plt.legend()
    if title:
        plt.title(title)
    if print_keys:
        print('Found keys: {}'.format(', '.join(
            sorted(all_keys - {'step', 'dt'}))))
def iter_html(path):
    with json_lines.open(path, broken=True) as lines:
        for line in lines:
            yield line['raw_content']
def read_samples(file):
    data_dir = os.path.dirname(file)
    # headlines = {
    #     'left': [],
    #     'center': [],
    #     'right': []
    # }
    # descriptions = {
    #     'left': [],
    #     'center': [],
    #     'right': []
    # }
    headlines = []
    descriptions = []
    publisher_spectrum_map = {}
    label_map = {'left': 0, 'center': 1, 'right': 2}
    counts = {'left': 0, 'center': 0, 'right': 0}
    split_ratio = 0.9
    publisher_info_flag = False
    with json_lines.open(file) as f:
        for item in f:
            headline = " ".join(
                [val.lower() for val in item["article_headline"].split()])
            desc = " ".join(
                [val.lower() for val in item["article_description"].split()])
            # print(item['publisher'])
            news_publisher = None
            if item['publisher'] is not None:
                news_publisher = " ".join(
                    [val.lower() for val in item["publisher"].split()])
            if item['publisher'] is not None:
                if item['publisher'] not in publisher_spectrum_map:
                    publisher_spectrum_map[item['publisher']] = []
                publisher_spectrum_map[item['publisher']].append(
                    item["political_spectrum"].lower())
            if item["political_spectrum"].lower() != "" and item['publisher'] is not None:
                if publisher_info_flag:
                    headlines.append([
                        news_publisher + ". " + headline,
                        label_map[item["political_spectrum"].lower()]
                    ])
                    descriptions.append([
                        news_publisher + ". " + desc,
                        label_map[item["political_spectrum"].lower()]
                    ])
                else:
                    headlines.append([
                        headline,
                        label_map[item["political_spectrum"].lower()]
                    ])
                    descriptions.append(
                        [desc, label_map[item["political_spectrum"].lower()]])
                # headlines[item["political_spectrum"].lower()].append(news_publisher+". "+headline)
                # descriptions[item["political_spectrum"].lower()].append(news_publisher+". "+desc)
                # counts[item["political_spectrum"].lower()] += 1
            # except:
            #     print("Val: ", item["political_spectrum"].lower())

    # train_headlines = []
    # test_headlines = []
    # train_descriptions = []
    # test_descriptions = []
    # for label in ["left", "center", "right"]:
    #     train_size = int(len(headlines[label])*split_ratio)
    #     for val in headlines[label][:train_size]:
    #         train_headlines.append([val, label_map[label]])
    #     for val in headlines[label][train_size:]:
    #         test_headlines.append([val, label_map[label]])
    #     train_size = int(len(descriptions[label])*split_ratio)
    #     for val in descriptions[label][:train_size]:
    #         train_descriptions.append([val, label_map[label]])
    #     for val in descriptions[label][train_size:]:
    #         test_descriptions.append([val, label_map[label]])

    if publisher_info_flag:
        pickle.dump(
            headlines,
            open(os.path.join(data_dir,
                              'article_headlines_train_with_publisher.pickle'), 'wb'))
        # pickle.dump(test_headlines, open(os.path.join(data_dir, 'article_headlines_test_with_publisher.pickle'), 'wb'))
        pickle.dump(
            descriptions,
            open(os.path.join(data_dir,
                              'article_descriptions_train_with_publisher.pickle'), 'wb'))
        # pickle.dump(test_descriptions, open(os.path.join(data_dir, 'article_descriptions_test_with_publisher.pickle'), 'wb'))
    else:
        pickle.dump(
            headlines,
            open(os.path.join(data_dir, 'article_headlines_train.pickle'), 'wb'))
        pickle.dump(
            descriptions,
            open(os.path.join(data_dir, 'article_descriptions_train.pickle'), 'wb'))

    for key, val in publisher_spectrum_map.items():
        # print(key, np.unique(np.array(val)))
        print(key, val)
        if len(np.unique(np.array(val))) > 1:
            print(key, np.unique(np.array(val)))
            talf = alf.split("(")[0].replace('-', '').strip()
            det['affli'] += talf + "\n"
        details.append(det)
    else:
        for a in al:
            det = {}
            det['auth'] = a.split("(")[0].replace('-', '').strip()
            det['affli'] = afflistr.split("(")[0].replace('-', '').strip()
            details.append(det)
    return details


filename = 'output'
records = []
with json_lines.open(Ingredients.getOutputFilesPath() + 'wcd2019ltabstracts.jl') as f:
    for item in f:
        records.append(item)
with json_lines.open(Ingredients.getOutputFilesPath() + 'wcd2019lterabstracts.jl') as f:
    for item in f:
        records.append(item)

links = []
with json_lines.open(Ingredients.getOutputFilesPath() + 'abslblinks.jl') as f:
    for item in f:
        links.append(item)

rows = []
for item in records:
# -*- coding: utf-8 -*-
import ujson
import elasticsearch
import json_lines

props = set()
with json_lines.open('index_dbpedia201510.json') as data_file:
    es = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200}],
                                     timeout=60)
    if es.indices.exists(index="dbpedia201510"):
        es.indices.delete(index="dbpedia201510")
    body = '{"settings": {"index.auto_expand_replicas": "1-all","index.number_of_shards": 1}}'
    es.indices.create(index="dbpedia201510", body=body)
    for line in data_file:
        dico = ujson.loads(line, encoding='utf-8')
        for k, v in dico.items():
            props.update(v.keys())
            es.index(index="dbpedia201510", doc_type="entity", id=k, body=v,
                     timeout='60s')