def load_models(cursor):
    print("Loading data..")
    for row in cursor.execute('SELECT * FROM papers'):
        paper = Paper(row[0], row[1], row[2], row[3], row[4], row[5], row[6])
        Data.add_paper(paper)
    for row in cursor.execute('SELECT * FROM authors'):
        # create new author with id and name
        if row[1] != 'None':
            author = Author(row[0], row[1])
            Data.add_author(author)
    for row in cursor.execute('SELECT * FROM paper_authors'):
        if row[1] in Data.papers and row[2] in Data.authors:
            Data.papers[row[1]].add_author(row[2])
            Data.authors[row[2]].add_paper(row[1])
    for key, paper in Data.papers.items():
        for author in paper.authors:
            if author in Data.authors:
                Data.authors[author].add_co_author(paper.authors)
    print("Loaded data")
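# Illustrative usage sketch (assumption, not part of the original module): load_models()
# expects a DB-API cursor over a database that contains `papers`, `authors`, and
# `paper_authors` tables. The SQLite filename below is hypothetical.
if __name__ == '__main__':
    import sqlite3

    connection = sqlite3.connect('papers.db')  # hypothetical database file
    load_models(connection.cursor())
    connection.close()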
def main(args):
    print args
    data_dir = args[1]
    pdf_dir = os.path.join(data_dir, 'pdfs')
    review_dir = os.path.join(data_dir, 'reviews')

    if os.path.exists(os.path.join(data_dir, 'reviews.json')) and not os.path.exists(review_dir):
        print 'Loading reviews from a review file'
        papers = Paper.from_softconf_dump(os.path.join(data_dir, 'reviews.json'))
        os.makedirs(review_dir)
        for paper in papers:
            paper.to_json('{}/reviews/{}.json'.format(data_dir, paper.ID))

    if not os.path.exists(pdf_dir) or not os.path.exists(review_dir):
        print 'PDF/REVIEW dataset must be ready', pdf_dir, review_dir

    pdf_files = glob.glob(pdf_dir + '/*.pdf')
    print 'Number of pdfs:', len(pdf_files)
    review_files = glob.glob(review_dir + '/*.json')
    print 'Number of papers:', len(review_files)

    # checking the decision distributions
    decisions, recs, reviews = [], [], []
    category_dict = {}
    category_types = ['cs.cl', 'cs.lg', 'cs.ai']
    for review_file in review_files:
        paper = Paper.from_json(review_file)
        if not paper:
            continue
        reviews += paper.REVIEWS
        decisions.append(paper.get_accepted())
        if len(paper.get_reviews()) > 0:
            recs.append(paper.get_reviews()[0].get_recommendation())

        # count categories
        matched = False
        categories = paper.SUBJECTS.lower().split(' ')
        for c in categories:
            if c in category_types:
                matched = c
        if matched:
            if matched in category_dict:
                category_dict[matched] += 1
            else:
                category_dict[matched] = 1
        else:
            print categories, paper.ID

    print 'Paper Decisions:', Counter(decisions)
    print 'Review Recommendations:', Counter(recs)
    print 'Number of reviews:', len(reviews)
    print 'Categories: ', category_dict

    # science parser
    print 'Generating science parses...'
    science_dir = os.path.join(data_dir, 'scienceparse/')
    if not os.path.exists(science_dir):
        print 'Parsing papers using science-parse...'
        os.makedirs(science_dir)
        os.system(
            'java -Xmx6g -jar ../lib/science-parse-cli-assembly-1.2.9-SNAPSHOT.jar %s -o %s'
            % (pdf_dir, science_dir))
        science_files = glob.glob(science_dir + '/*.pdf.json')
    else:
        print 'Reading parsed science parses...'
        science_files = glob.glob(science_dir + '/*.pdf.json')
    print 'Number of science parses:', len(science_files)

    # split to train/dev/test by acceptance
    data_types = ['train', 'dev', 'test']
    print 'Splitting paper/review/science-parses into train/dev/test'
    split_again = False
    for dtype in data_types:
        data_type_dir = os.path.join(data_dir, dtype)
        if os.path.exists(data_type_dir):
            num_pdfs = len(glob.glob(data_type_dir + '/pdfs' + '/*.pdf'))
            print 'directory already exists:', data_type_dir, num_pdfs
        else:
            split_again = True

    if split_again:
        print 'splitting ...'
        #os.makedirs(data_type_dir)
        pids = [
            os.path.basename(pfile).replace('.pdf', '') for pfile in pdf_files
        ]
        rids = [
            os.path.basename(rfile).replace('.json', '') for rfile in review_files
        ]
        sids = [
            os.path.basename(sfile).replace('.pdf.json', '') for sfile in science_files
        ]

        # keep only papers that have a pdf, a review file, and a science parse
        ids = []
        for pid in pids:
            if pid in rids and pid in sids:
                ids.append(pid)

        train, validtest = train_test_split(ids, test_size=0.1, random_state=42)
        dev, test = train_test_split(validtest, test_size=0.5, random_state=42)

        for didx, data_set in enumerate([train, dev, test]):
            if os.path.exists(os.path.join(data_dir, data_types[didx])):
                rmtree(os.path.join(data_dir, data_types[didx]))
            os.makedirs(os.path.join(data_dir, data_types[didx], 'pdfs'))
            os.makedirs(os.path.join(data_dir, data_types[didx], 'reviews'))
            os.makedirs(os.path.join(data_dir, data_types[didx], 'parsed_pdfs'))
            print 'Splitting..', data_types[didx], len(data_set)
            for d in data_set:
                copyfile(
                    os.path.join(pdf_dir, d + '.pdf'),
                    os.path.join(data_dir, data_types[didx], 'pdfs', d + '.pdf'))
                copyfile(
                    os.path.join(review_dir, d + '.json'),
                    os.path.join(data_dir, data_types[didx], 'reviews', d + '.json'))
                copyfile(
                    os.path.join(science_dir, d + '.pdf.json'),
                    os.path.join(data_dir, data_types[didx], 'parsed_pdfs', d + '.pdf.json'))
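# Illustrative entry point (assumption): main() reads the dataset root from its first
# positional argument, so the script would typically be invoked as
# `python prepare_dataset.py <data_dir>`; the script name here is hypothetical, only the
# expected directory layout (pdfs/, reviews/, scienceparse/) follows from the code above.
if __name__ == '__main__':
    import sys
    main(sys.argv)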
def main(args, limit=False):
    fout_dic = {}
    for len_sent in range(args.min_sent, args.max_sent + 1):
        fout_dic[len_sent] = open('%s_%d.txt' % (args.out_file, len_sent), 'w')

    review_files = glob.glob(args.data_dir + '/reviews' + '/*.json')
    print 'Number of papers:', len(review_files)

    cnt = 0
    topic = ''
    topic_changed = False
    category_types = ['cs.cl', 'cs.lg', 'cs.ai']
    #nlp = spacy.load('en', parser=False)

    for rid, review_file in enumerate(review_files):
        if rid % 1000 == 0:
            print '[%d/%d]' % (rid, len(review_files))
        paper = Paper.from_json(review_file)
        if not paper:
            continue
        paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
            paper.ID, paper.TITLE, paper.ABSTRACT,
            args.data_dir + '/scienceparse/')

        # paper ID
        file_prefix = paper.ID

        # sentences
        sections = paper.SCIENCEPARSE.get_sections_dict()
        for topic, content in sections.items():
            paragraphs = content.split('\n')
            for paragraph in paragraphs:
                if paragraph == '':
                    continue
                sents = sentence_tokenizer(
                    paragraph,
                    min_sent=args.min_sent,
                    max_sent=args.max_sent,
                    min_sent_len=args.min_sent_len,
                    max_sent_len=args.max_sent_len)
                if sents is None:
                    continue
                cnt += 1
                avg_len = np.average([len(sent) for sent in sents])
                fout = fout_dic[len(sents)]
                fout.write('%s\t%s\t%d\t%.2f\t%s\n' % (
                    topic, file_prefix, len(sents), avg_len,
                    '\t'.join([' '.join(sent) for sent in sents])))
                if cnt % 1000 == 0:
                    print '\t%d paragraphs, %d files ..' % (cnt, rid)
                    fout.flush()
                if limit and cnt == limit:
                    sys.exit(1)

    for len_sent, fout in fout_dic.items():
        fout.close()
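# Illustrative argument parser (assumption): main() only relies on the attributes
# data_dir, out_file, min_sent, max_sent, min_sent_len, and max_sent_len, so an
# argparse namespace like the one below is one way to drive it. The flag names and
# default values here are hypothetical.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', required=True)
    parser.add_argument('--out_file', required=True)
    parser.add_argument('--min_sent', type=int, default=2)
    parser.add_argument('--max_sent', type=int, default=8)
    parser.add_argument('--min_sent_len', type=int, default=5)
    parser.add_argument('--max_sent_len', type=int, default=60)
    main(parser.parse_args())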
def prepare_data(data_dir,
                 vocab_path='vocab',
                 max_vocab_size=20000,
                 max_len_paper=1000,
                 max_len_review=200):
    data_type = data_dir.split('/')[-1]
    vocab_path += '.' + data_type
    if max_vocab_size:
        vocab_path += '.' + str(max_vocab_size)
    vocab_path = data_dir + '/' + vocab_path

    label_scale = 5
    if 'iclr' in data_dir.lower():
        fill_missing = False
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'RECOMMENDATION_ORIGINAL'
        ]
        review_dir_postfix = '_annotated'
    elif 'acl' in data_dir.lower():
        fill_missing = True
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'REVIEWER_CONFIDENCE'
        ]
        review_dir_postfix = ''
    else:
        print('wrong dataset:', data_dir)
        sys.exit(1)

    # Loading datasets
    print('Reading datasets..')
    datasets = ['train', 'dev', 'test']
    paper_content_all = []
    review_content_all = []
    data = defaultdict(list)
    for dataset in datasets:
        review_dir = os.path.join(data_dir, dataset,
                                  'reviews%s/' % (review_dir_postfix))
        scienceparse_dir = os.path.join(data_dir, dataset, 'parsed_pdfs/')
        model_dir = os.path.join(data_dir, dataset, 'model/')
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_dir)))

        # add all paper/review content to generate corpus for building vocab
        paper_content = []
        review_content = []
        for paper_json_filename in paper_json_filenames:
            d = {}
            paper = Paper.from_json(paper_json_filename)
            paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
                paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)

            review_contents = []
            reviews = []
            for review in paper.REVIEWS:
                review_contents.append(
                    preprocess(review.COMMENTS, only_char=False, lower=True,
                               stop_remove=False))
                reviews.append(review)

            d['paper_content'] = preprocess(
                paper.SCIENCEPARSE.get_paper_content(),
                only_char=False, lower=True, stop_remove=False)
            d['reviews_content'] = review_contents
            d['reviews'] = reviews
            data[dataset].append(d)

    print('Total number of papers %d' %
          (np.sum([len(d) for _, d in list(data.items())])))
    print('Total number of reviews %d' %
          (np.sum([len(r['reviews']) for _, d in list(data.items()) for r in d])))

    # Loading VOCAB
    print('Building vocab...')
    words = []
    for _, d in list(data.items()):
        for p in d:
            words += p['paper_content'].split(' ')
            for r in p['reviews_content']:
                words += r.split(' ')
    print("Total words in corpus", len(words))

    vocab = OrderedDict()
    word_counter = Counter(words)
    vocab['PAD'] = 0
    vocab['UNK'] = 1
    for w, c in word_counter.most_common():
        if max_vocab_size:
            if len(vocab) >= max_vocab_size:
                break
        if len(w) and w not in vocab:
            vocab[w] = len(vocab)
    with open(vocab_path, 'w') as fout:
        for w, id in list(vocab.items()):
            fout.write('%s\t%s\n' % (w, id))
    vocab_inv = {int(i): v for v, i in list(vocab.items())}
    print("Total vocab of size", len(vocab))

    # Loading DATA
    print('Reading reviews from...')
    data_padded = []
    for dataset in datasets:
        ds = data[dataset]
        x_paper = []  #[None] * len(reviews)
        x_review = []  #[None] * len(reviews)
        y = []  #[None] * len(reviews)
        for d in ds:
            paper_content = d['paper_content']
            reviews_content = d['reviews_content']
            reviews = d['reviews']
            for rid, (review_content, review) in enumerate(
                    zip(reviews_content, reviews)):
                paper_ids = [
                    vocab[w] if w in vocab else 1
                    for w in paper_content.split(' ')
                ]
                review_ids = [
                    vocab[w] if w in vocab else 1
                    for w in review_content.split(' ')
                ]
                paper_ids = pad_sentence(paper_ids, max_len_paper, 0)
                review_ids = pad_sentence(review_ids, max_len_review, 0)

                xone = (paper_ids, review_ids)
                yone = [np.nan] * len(aspects)
                for aid, aspect in enumerate(aspects):
                    if aspect in review.__dict__ and review.__dict__[aspect] is not None:
                        yone[aid] = float(review.__dict__[aspect])
                #print rid,len(xone[0]), len(xone[1]), yone
                x_paper.append(xone[0])
                x_review.append(xone[1])
                y.append(yone)
        x_paper = np.array(x_paper, dtype=np.int32)
        x_review = np.array(x_review, dtype=np.int32)
        y = np.array(y, dtype=np.float32)

        # add average value of missing aspect value
        if fill_missing:
            col_mean = np.nanmean(y, axis=0)
            inds = np.where(np.isnan(y))
            y[inds] = np.take(col_mean, inds[1])

        print('Total %s dataset: %d/%d' %
              (dataset, len(x_paper), len(x_review)),
              x_paper.shape, x_review.shape, y.shape)
        data_padded.append((x_paper, x_review))
        data_padded.append(y)

    return data_padded, vocab, vocab_inv, label_scale, aspects
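# Illustrative usage sketch (assumption): prepare_data() returns the padded train/dev/test
# splits interleaved as (x_paper, x_review), y pairs, followed by the vocabulary, its
# inverse, the label scale, and the aspect list. The data directory below is hypothetical.
if __name__ == '__main__':
    data_padded, vocab, vocab_inv, label_scale, aspects = prepare_data(
        '../../data/iclr_2017', max_vocab_size=20000)
    (x_paper_train, x_review_train), y_train = data_padded[0], data_padded[1]
    print('train:', x_paper_train.shape, x_review_train.shape, y_train.shape)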
def main():
    # Loading annotations
    annots = get_annots_dic()
    print 'Loaded annots: %d papers and %d reviews' % (
        len(annots), sum([len(v) for k, v in annots.items()]))

    # Loading reviews, merging them with annotations, and saving into new directory
    data_dir = "../../data/iclr_2017"  # args[1] #train/reviews
    datasets = ['train', 'dev', 'test']
    print 'Reading reviews from...'
    for dataset in datasets:
        cnt_p, cnt_r = 0, 0
        review_dir = os.path.join(data_dir, dataset, 'reviews_raw/')
        review_annotated_dir = os.path.join(data_dir, dataset, 'reviews/')
        scienceparse_dir = os.path.join(data_dir, dataset, 'scienceparse/')
        model_dir = os.path.join(data_dir, dataset, 'model')
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        review_files = sorted(glob.glob('{}/*.json'.format(review_dir)))
        pids = []
        for paper_json_filename in review_files:
            paper = Paper.from_json(paper_json_filename)

            reviews_combined = []
            reviews_annotated = annots[paper.ID]
            reviews_original = paper.REVIEWS

            # overwrite annotated aspect scores onto the matching original reviews
            for r_original in reviews_original:
                r_combined = None  #r_original
                for r_annotated in reviews_annotated:
                    if r_annotated['OTHER_KEYS'] == r_original.OTHER_KEYS:
                        r_combined = r_original
                        for k, v in r_annotated.items():
                            if k in [
                                    'RECOMMENDATION_UNOFFICIAL', 'SUBSTANCE',
                                    'APPROPRIATENESS', 'MEANINGFUL_COMPARISON',
                                    'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
                                    'CLARITY', 'IMPACT'
                            ]:
                                setattr(r_combined, k, v)
                        setattr(r_combined, 'IS_ANNOTATED', True)
                if r_combined is None:
                    reviews_combined.append(r_original)
                else:
                    reviews_combined.append(r_combined)

            paper.REVIEWS = reviews_combined
            cnt_r += len(paper.REVIEWS)

            # save to /reviews_annotated
            json.dump(
                paper.to_json_object(),
                open(review_annotated_dir + '/%s.json' % (paper.ID), 'w'))
            print paper.ID, len(paper.REVIEWS)
            cnt_p += 1
        print dataset, cnt_p, cnt_r
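# Illustrative entry point and annotation shape (assumption): get_annots_dic() is defined
# elsewhere in this script and, judging from how it is used above, maps a paper ID to a
# list of per-review annotation dicts keyed by 'OTHER_KEYS' plus the annotated aspect
# scores, e.g. {'B1xY2hG5x': [{'OTHER_KEYS': 'B1xY2hG5x|AnonReviewer1', 'SUBSTANCE': 4}]}.
# The IDs and scores shown are hypothetical.
if __name__ == '__main__':
    main()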
def main(args, lower=True, max_vocab_size=False, encoder='bowtfidf'):
    argc = len(args)
    if argc < 9:
        print(
            "Usage:", args[0],
            "<paper-json-dir> <scienceparse-dir> <out-dir> <submission-year> <feature output file> <tfidf vector file> <max_vocab_size> <encoder> <hand-feature>"
        )
        return -1

    paper_json_dir = args[1]  #train/reviews
    scienceparse_dir = args[2]  #train/parsed_pdfs
    out_dir = args[3]  #train/dataset
    feature_output_file = args[4]  #train/dataset/features.dat
    vect_file = args[5]  #train/dataset/vect.pkl
    max_vocab_size = False if args[6] == 'False' else int(args[6])  # False or integer
    encoder = False if args[7] == 'False' else str(args[7])
    hand = False if args[8] == 'False' else str(args[8])

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    is_train = True
    vect = None
    idToFeature = None
    if os.path.isfile(feature_output_file):
        is_train = False
        idToFeature = read_features(feature_output_file)
        if encoder:
            print 'Loading vector file from...', vect_file
            vect = load_vect(vect_file)
    else:
        print 'Loading vector file from scratch..'
        idToFeature = dict()

    outLabelsFile = open(
        out_dir + '/labels_%s_%s_%s.tsv' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')
    outIDFile = open(
        out_dir + '/ids_%s_%s_%s.tsv' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')
    outSvmLiteFile = open(
        out_dir + '/features.svmlite_%s_%s_%s.txt' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')

    ################################
    # read reviews
    ################################
    print 'Reading reviews from...', paper_json_dir
    paper_content_corpus = []  #""
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir)))
    papers = []
    for paper_json_filename in paper_json_filenames:
        paper = Paper.from_json(paper_json_filename)
        paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
            paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)
        paper_content_corpus.append(paper.SCIENCEPARSE.get_paper_content())
        papers.append(paper)
    random.shuffle(papers)
    print 'Total number of papers', len(papers)

    def get_feature_id(feature):
        if feature in idToFeature:
            return idToFeature[feature]
        else:
            return None

    def addFeatureToDict(fname):
        id = len(idToFeature)
        idToFeature[fname] = id

    ################################
    # Initialize vocabulary
    ################################
    outCorpusFilename = out_dir + '/corpus.pkl'
    if not os.path.isfile(outCorpusFilename):
        paper_content_corpus = [
            preprocess(p, only_char=True, lower=True, stop_remove=True)
            for p in paper_content_corpus
        ]
        paper_content_corpus_words = []
        for p in paper_content_corpus:
            paper_content_corpus_words += p.split(' ')
        pkl.dump(paper_content_corpus_words, open(outCorpusFilename, 'wb'))
    else:
        paper_content_corpus_words = pkl.load(open(outCorpusFilename, 'rb'))
    print 'Total words in corpus', len(paper_content_corpus_words)

    ################################
    # Encoding
    ################################
    print 'Encoding..', encoder
    # 1) tf-idf features on title/author_names/domains
    if not encoder:
        print 'No encoder', encoder
    elif encoder in ['bow', 'bowtfidf']:
        word_counter = Counter(paper_content_corpus_words)
        # vocab limit by frequency
        if max_vocab_size:
            word_counter = dict(word_counter.most_common()[:max_vocab_size])
        vocabulary = dict()
        for w in word_counter:
            if len(w) and w not in vocabulary:
                if is_train:
                    vocabulary[w] = len(vocabulary)
                    addFeatureToDict(w)
                else:
                    fid = get_feature_id(w)
                    if fid is not None:
                        vocabulary[w] = fid
        print("Got vocab of size", len(vocabulary))
        if is_train:
            print 'Saving vectorizer', vect_file
            if encoder == 'bow':
                vect = CountVectorizer(max_df=0.5,
                                       analyzer='word',
                                       stop_words='english',
                                       vocabulary=vocabulary)
            else:
                vect = TfidfVectorizer(sublinear_tf=True,
                                       max_df=0.5,
                                       analyzer='word',
                                       stop_words='english',
                                       vocabulary=vocabulary)
            vect.fit([p for p in paper_content_corpus])
            save_vect(vect, vect_file)
    # 2) sentence encoder features
    elif encoder in ['w2v', 'w2vtfidf']:
        from sent2vec import MeanEmbeddingVectorizer, TFIDFEmbeddingVectorizer, import_embeddings
        if is_train:
            w2v = import_embeddings()
            vect = MeanEmbeddingVectorizer(
                w2v) if encoder == 'w2v' else TFIDFEmbeddingVectorizer(w2v)
            for f in range(vect.dim):
                #fid = get_feature_id()
                addFeatureToDict('%s%d' % (encoder, f))
            print 'Saving vectorizer', vect_file
            if encoder == 'w2vtfidf':
                vect.fit([p for p in paper_content_corpus])
            save_vect(vect, vect_file)
    else:
        print 'Wrong type of encoder', encoder
        sys.exit(1)

    ################################
    # Add features
    ################################
    if encoder:
        all_titles = []
        for p in papers:
            sp = p.get_scienceparse()
            title = p.get_title()
            all_title = preprocess(title, only_char=True, lower=True, stop_remove=True)
            all_titles.append(all_title)
        all_titles_features = vect.transform(all_titles)

    if is_train:
        print 'saving features to file', feature_output_file
        if hand:
            addFeatureToDict("get_most_recent_reference_year")
            addFeatureToDict("get_num_references")
            addFeatureToDict("get_num_refmentions")
            addFeatureToDict("get_avg_length_reference_mention_contexts")
            addFeatureToDict("abstract_contains_deep")
            addFeatureToDict("abstract_contains_neural")
            addFeatureToDict("abstract_contains_embedding")
            addFeatureToDict("abstract_contains_outperform")
            addFeatureToDict("abstract_contains_novel")
            addFeatureToDict("abstract_contains_state_of_the_art")
            addFeatureToDict("abstract_contains_state-of-the-art")
            addFeatureToDict("get_num_recent_references")
            addFeatureToDict("get_num_ref_to_figures")
            addFeatureToDict("get_num_ref_to_tables")
            addFeatureToDict("get_num_ref_to_sections")
            addFeatureToDict("get_num_uniq_words")
            addFeatureToDict("get_num_sections")
            addFeatureToDict("get_avg_sentence_length")
            addFeatureToDict("get_contains_appendix")
            addFeatureToDict("proportion_of_frequent_words")
            addFeatureToDict("get_title_length")
            addFeatureToDict("get_num_authors")
            addFeatureToDict("get_num_ref_to_equations")
            addFeatureToDict("get_num_ref_to_theorems")
        save_features_to_file(idToFeature, feature_output_file)

    id = 1
    hfws, most_frequent_words, least_frequent_words = count_words(
        paper_content_corpus_words, 0.01, 0.05, 3)
    for p in papers:
        outIDFile.write(str(id) + "\t" + str(p.get_title()) + "\n")
        rs = [r.get_recommendation() for r in p.get_reviews()]
        rec = int(p.get_accepted() == True)
        outLabelsFile.write(str(rec))
        outSvmLiteFile.write(str(rec) + " ")
        sp = p.get_scienceparse()

        if encoder:
            title_tfidf = all_titles_features[id - 1]
            if encoder.startswith('bow'):
                nz = title_tfidf.nonzero()[1]
                for word_id in sorted(nz):
                    outSvmLiteFile.write(
                        str(word_id) + ":" + str(title_tfidf[0, word_id]) + " ")
            elif encoder.startswith('w2v'):
                for word_id in range(vect.dim):
                    outSvmLiteFile.write(
                        str(word_id) + ":" + str(title_tfidf[word_id]) + " ")
            else:
                print 'wrong encoder', encoder
                sys.exit(1)

        if hand:
            outSvmLiteFile.write(
                str(get_feature_id("get_most_recent_reference_year")) + ":" +
                str(sp.get_most_recent_reference_year() - 2000) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_references")) + ":" +
                str(sp.get_num_references()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_refmentions")) + ":" +
                str(sp.get_num_refmentions()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_avg_length_reference_mention_contexts")) + ":" +
                str(sp.get_avg_length_reference_mention_contexts()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_deep")) + ":" +
                str(int(p.abstract_contains_a_term("deep"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_neural")) + ":" +
                str(int(p.abstract_contains_a_term("neural"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_embedding")) + ":" +
                str(int(p.abstract_contains_a_term("embedding"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_outperform")) + ":" +
                str(int(p.abstract_contains_a_term("outperform"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_novel")) + ":" +
                str(int(p.abstract_contains_a_term("novel"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_state_of_the_art")) + ":" +
                str(int(p.abstract_contains_a_term("state of the art"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_state-of-the-art")) + ":" +
                str(int(p.abstract_contains_a_term("state-of-the-art"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_recent_references")) + ":" +
                str(sp.get_num_recent_references(2017)) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_figures")) + ":" +
                str(sp.get_num_ref_to_figures()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_tables")) + ":" +
                str(sp.get_num_ref_to_tables()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_sections")) + ":" +
                str(sp.get_num_ref_to_sections()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_uniq_words")) + ":" +
                str(sp.get_num_uniq_words()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_sections")) + ":" +
                str(sp.get_num_sections()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_avg_sentence_length")) + ":" +
                str(sp.get_avg_sentence_length()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_contains_appendix")) + ":" +
                str(sp.get_contains_appendix()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("proportion_of_frequent_words")) + ":" +
                str(round(
                    sp.get_frequent_words_proportion(
                        hfws, most_frequent_words, least_frequent_words), 3)) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_title_length")) + ":" +
                str(p.get_title_len()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_authors")) + ":" +
                str(sp.get_num_authors()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_equations")) + ":" +
                str(sp.get_num_ref_to_equations()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_theorems")) + ":" +
                str(sp.get_num_ref_to_theorems()) + " ")

        outSvmLiteFile.write("\n")
        id += 1

    outLabelsFile.close()
    outIDFile.close()
    outSvmLiteFile.close()
    print 'saved', outLabelsFile.name
    print 'saved', outIDFile.name
    print 'saved', outSvmLiteFile.name
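# Illustrative follow-up (assumption, not part of the original script): the
# features.svmlite_*.txt file written above is in SVMLight format, so it can be read back
# with scikit-learn for a quick accept/reject classifier. The path and filename suffix
# below are hypothetical and depend on the arguments passed to main().
if __name__ == '__main__':
    from sklearn.datasets import load_svmlight_file
    from sklearn.linear_model import LogisticRegression

    X, y = load_svmlight_file('train/dataset/features.svmlite_False_bowtfidf_False.txt')
    clf = LogisticRegression().fit(X, y)
    print 'train accuracy:', clf.score(X, y)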
def prepare_data(data_dir,
                 vocab_path='vocab',
                 max_vocab_size=20000,
                 max_len_paper=1000,
                 max_len_review=200):
    data_type = data_dir.split('/')[-1]
    vocab_path += '.' + data_type
    if max_vocab_size:
        vocab_path += '.' + str(max_vocab_size)
    vocab_path = data_dir + '/' + vocab_path

    label_scale = 5
    if 'iclr' in data_dir.lower():
        fill_missing = False
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'RECOMMENDATION_ORIGINAL'
        ]
        review_dir_postfix = ''
    elif 'acl' in data_dir.lower():
        fill_missing = True
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'REVIEWER_CONFIDENCE'
        ]
        review_dir_postfix = ''
    else:
        print 'wrong dataset:', data_dir
        sys.exit(1)

    # Loading datasets
    print 'Reading datasets..'
    datasets = ['train', 'dev', 'test']
    paper_content_all = []
    review_content_all = []
    data = defaultdict(list)
    for dataset in datasets:
        review_dir = os.path.join(data_dir, dataset,
                                  'reviews%s/' % (review_dir_postfix))
        scienceparse_dir = os.path.join(data_dir, dataset, 'parsed_pdfs/')
        model_dir = os.path.join(data_dir, dataset, 'model/')
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_dir)))

        # add all paper/review content to generate corpus for building vocab
        paper_content = []
        review_content = []
        for paper_json_filename in paper_json_filenames:
            d = {}
            paper = Paper.from_json(paper_json_filename)
            paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
                paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)

            review_contents = []
            reviews = []
            for review in paper.REVIEWS:
                review_contents.append(review.COMMENTS)  #preprocess(review.COMMENTS, only_char=False, lower=True, stop_remove=False))
                reviews.append(review)

            d['paper'] = paper
            d['paper_content'] = paper.SCIENCEPARSE.get_paper_content()  #preprocess(#, only_char=False, lower=True,stop_remove=False)#
            d['reviews_content'] = review_contents
            d['reviews'] = reviews
            data[dataset].append(d)

    print 'Total number of papers %d' % (np.sum([len(d) for _, d in data.items()]))
    print 'Total number of reviews %d' % (np.sum(
        [len(r['reviews']) for _, d in data.items() for r in d]))

    # Loading DATA
    print 'Reading reviews from...'
    data_padded = []
    for dataset in datasets:
        ds = data[dataset]
        papers = []
        x_paper = []  #[None] * len(reviews)
        x_review = []  #[None] * len(reviews)
        y = []  #[None] * len(reviews)
        num_reviews = []
        x_reviews = []
        decision = []
        for d in ds:
            paper = d['paper']
            paper_content = d['paper_content']
            reviews_content = d['reviews_content']
            reviews = d['reviews']

            decision.append(paper.ACCEPTED)
            papers.append(paper)

            paper_sent = nltk.sent_tokenize(paper_content)
            reviews_sent = nltk.sent_tokenize(' '.join(reviews_content))
            x_paper.append(paper_sent)
            x_reviews.append(reviews_sent)
            num_reviews.append(len(reviews))

            for rid, (review_content, review) in enumerate(zip(reviews_content, reviews)):
                yone = [np.nan] * len(aspects)
                review_sent = nltk.sent_tokenize(review.__dict__['COMMENTS'])
                for aid, aspect in enumerate(aspects):
                    if aspect in review.__dict__ and review.__dict__[aspect] is not None:
                        yone[aid] = float(review.__dict__[aspect])
                x_review.append(review_sent)
                y.append(yone)
        y = np.array(y, dtype=np.float32)

        # add average value of missing aspect value
        if fill_missing:
            col_mean = np.nanmean(y, axis=0)
            inds = np.where(np.isnan(y))
            y[inds] = np.take(col_mean, inds[1])

        data_padded.append((x_paper, papers, x_review, num_reviews, x_reviews, decision))
        data_padded.append(y)

    return data_padded, label_scale, aspects
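# Illustrative usage sketch (assumption): this prepare_data() variant returns
# sentence-tokenized papers and reviews rather than padded id matrices; each dataset
# contributes one tuple followed by its aspect matrix y. The data directory below is
# hypothetical.
if __name__ == '__main__':
    data_padded, label_scale, aspects = prepare_data('../../data/acl_2017')
    (x_paper, papers, x_review, num_reviews, x_reviews, decision) = data_padded[0]
    y_train = data_padded[1]
    print 'train: %d papers, %d reviews' % (len(x_paper), len(x_review))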