def load_models(cursor):

    print("Loading data..")
    for row in cursor.execute('SELECT * FROM papers'):
        paper = Paper(row[0], row[1], row[2], row[3], row[4], row[5], row[6])
        Data.add_paper(paper)

    for row in cursor.execute('SELECT * FROM authors'):
        # create new author with id and name
        if row[1] != 'None':
            author = Author(row[0], row[1])
            Data.add_author(author)

    for row in cursor.execute('SELECT * FROM paper_authors'):
        if row[1] in Data.papers and row[2] in Data.authors:
            Data.papers[row[1]].add_author(row[2])
            Data.authors[row[2]].add_paper(row[1])

    for key, paper in Data.papers.items():
        for author in paper.authors:
            if author in Data.authors:
                Data.authors[author].add_co_author(paper.authors)
    print("Loaded data")
Example #2
def main(args):
    print args
    data_dir = args[1]

    pdf_dir = os.path.join(data_dir, 'pdfs')
    review_dir = os.path.join(data_dir, 'reviews')

    if os.path.exists(os.path.join(
            data_dir, 'reviews.json')) and not os.path.exists(review_dir):
        print 'Loading reviews from a review file'
        papers = Paper.from_softconf_dump(
            os.path.join(data_dir, 'reviews.json'))
        os.makedirs(review_dir)
        for paper in papers:
            paper.to_json('{}/reviews/{}.json'.format(data_dir, paper.ID))

    if not os.path.exists(pdf_dir) or not os.path.exists(review_dir):
        print 'PDF/REVIEW dataset must be ready:', pdf_dir, review_dir
        return

    pdf_files = glob.glob(pdf_dir + '/*.pdf')
    print 'Number of pdfs:', len(pdf_files)

    review_files = glob.glob(review_dir + '/*.json')
    print 'Number of papers:', len(review_files)
    # checking the decision distributions
    decisions, recs, reviews = [], [], []
    category_dict = {}
    category_types = ['cs.cl', 'cs.lg', 'cs.ai']
    for review_file in review_files:
        paper = Paper.from_json(review_file)
        if not paper: continue
        reviews += paper.REVIEWS
        decisions.append(paper.get_accepted())
        if len(paper.get_reviews()) > 0:
            recs.append(paper.get_reviews()[0].get_recommendation())

        # count categories
        matched = False
        categories = paper.SUBJECTS.lower().split(' ')
        for c in categories:
            if c in category_types:
                matched = c
        if matched:
            if matched in category_dict:
                category_dict[matched] += 1
            else:
                category_dict[matched] = 1
        else:
            print categories, paper.ID

    print 'Paper Decisions:', Counter(decisions)
    print 'Review Recommendations:', Counter(recs)
    print 'Number of reviews:', len(reviews)
    print 'Categories: ', category_dict

    # science parser
    print 'Generating science parses...'
    science_dir = os.path.join(data_dir, 'scienceparse/')
    if not os.path.exists(science_dir):
        print 'Parsing papers using science-parser...'
        os.makedirs(science_dir)
        os.system(
            'java -Xmx6g -jar ../lib/science-parse-cli-assembly-1.2.9-SNAPSHOT.jar %s -o %s'
            % (pdf_dir, science_dir))
        science_files = glob.glob(science_dir + '/*.pdf.json')
    else:
        print 'Reading parsed science parses...'
        science_files = glob.glob(science_dir + '/*.pdf.json')
    print 'Number of science parses:', len(science_files)

    # split to train/dev/test by acceptance
    data_types = ['train', 'dev', 'test']
    print 'Splitting paper/review/science-parses into train/dev/test'
    split_again = False
    for dtype in data_types:
        data_type_dir = os.path.join(data_dir, dtype)
        if os.path.exists(data_type_dir):
            num_pdfs = len(glob.glob(data_type_dir + '/pdfs' + '/*.pdf'))
            print 'directory already exists:', data_type_dir, num_pdfs
        else:
            split_again = True

    if split_again:
        print 'splitting ...'
        #os.makedirs(data_type_dir)
        pids = [
            os.path.basename(pfile).replace('.pdf', '') for pfile in pdf_files
        ]
        rids = [
            os.path.basename(rfile).replace('.json', '')
            for rfile in review_files
        ]
        sids = [
            os.path.basename(sfile).replace('.pdf.json', '')
            for sfile in science_files
        ]
        ids = []
        for pid in pids:
            if pid in rids and pid in sids:
                ids.append(pid)
        train, validtest = train_test_split(ids,
                                            test_size=0.1,
                                            random_state=42)
        dev, test = train_test_split(validtest, test_size=0.5, random_state=42)

        for didx, data_set in enumerate([train, dev, test]):
            if os.path.exists(os.path.join(data_dir, data_types[didx])):
                rmtree(os.path.join(data_dir, data_types[didx]))
            os.makedirs(os.path.join(data_dir, data_types[didx], 'pdfs'))
            os.makedirs(os.path.join(data_dir, data_types[didx], 'reviews'))
            os.makedirs(os.path.join(data_dir, data_types[didx],
                                     'parsed_pdfs'))
            print 'Splitting..', data_types[didx], len(data_set)
            for d in data_set:
                copyfile(
                    os.path.join(pdf_dir, d + '.pdf'),
                    os.path.join(data_dir, data_types[didx], 'pdfs',
                                 d + '.pdf'))

                copyfile(
                    os.path.join(review_dir, d + '.json'),
                    os.path.join(data_dir, data_types[didx], 'reviews',
                                 d + '.json'))

                copyfile(
                    os.path.join(science_dir, d + '.pdf.json'),
                    os.path.join(data_dir, data_types[didx], 'parsed_pdfs',
                                 d + '.pdf.json'))
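
Example #2 assumes several imports and a command-line entry point that are not shown. A hedged sketch of what the top of the script likely needs follows; the Paper import path is a placeholder.

import os
import sys
import glob
from collections import Counter
from shutil import copyfile, rmtree

from sklearn.model_selection import train_test_split
# from models import Paper  # placeholder import path for the project's Paper class

if __name__ == '__main__':
    main(sys.argv)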
Example #3
def main(args, limit=False):

    fout_dic = {}
    for len_sent in range(args.min_sent, args.max_sent + 1):
        fout_dic[len_sent] = open('%s_%d.txt' % (args.out_file, len_sent), 'w')

    review_files = glob.glob(args.data_dir + '/reviews' + '/*.json')
    print 'Number of papers:', len(review_files)

    cnt = 0
    topic = ''
    topic_changed = False
    category_types = ['cs.cl', 'cs.lg', 'cs.ai']

    #nlp = spacy.load('en', parser=False)
    for rid, review_file in enumerate(review_files):

        if rid % 1000 == 0:
            print '[%d/%d]' % (rid, len(review_files))

        paper = Paper.from_json(review_file)
        if not paper:
            continue

        paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
            paper.ID, paper.TITLE, paper.ABSTRACT, args.data_dir + '/scienceparse/')

        # paper ID
        file_prefix = paper.ID

        # sentences
        sections = paper.SCIENCEPARSE.get_sections_dict()

        for topic, content in sections.items():

            paragraphs = content.split('\n')

            for paragraph in paragraphs:
                if paragraph == '':
                    continue
                sents = sentence_tokenizer(
                    paragraph,
                    min_sent=args.min_sent,
                    max_sent=args.max_sent,
                    min_sent_len=args.min_sent_len,
                    max_sent_len=args.max_sent_len
                )
                if sents is None:
                    continue
                cnt += 1
                avg_len = np.average([len(sent) for sent in sents])

                fout = fout_dic[len(sents)]
                fout.write('%s\t%s\t%d\t%.2f\t%s\n' % (topic, file_prefix, len(
                    sents), avg_len, '\t'.join([' '.join(sent) for sent in sents])))
                if cnt % 1000 == 0:
                    print '\t%d paragraphs, %d files ..' % (cnt, rid)
                    fout.flush()
                if limit and cnt == limit:
                    sys.exit(1)

    for len_sent, fout in fout_dic.items():
        fout.close()
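
sentence_tokenizer is not defined in the snippet; judging from how it is called, it splits a paragraph into token lists and returns None when the sentence count or sentence lengths fall outside the configured bounds. A minimal sketch under those assumptions:

import nltk

def sentence_tokenizer(paragraph, min_sent, max_sent, min_sent_len, max_sent_len):
    # Split the paragraph into sentences, each sentence into tokens.
    sents = [s.split() for s in nltk.sent_tokenize(paragraph)]
    # Reject paragraphs with too few or too many sentences.
    if not (min_sent <= len(sents) <= max_sent):
        return None
    # Reject paragraphs containing sentences that are too short or too long.
    if any(not (min_sent_len <= len(s) <= max_sent_len) for s in sents):
        return None
    return sents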
Example #4
def prepare_data(data_dir,
                 vocab_path='vocab',
                 max_vocab_size=20000,
                 max_len_paper=1000,
                 max_len_review=200):

    data_type = data_dir.split('/')[-1]
    vocab_path += '.' + data_type
    if max_vocab_size: vocab_path += '.' + str(max_vocab_size)
    vocab_path = data_dir + '/' + vocab_path

    label_scale = 5
    if 'iclr' in data_dir.lower():
        fill_missing = False
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'RECOMMENDATION_ORIGINAL'
        ]
        review_dir_postfix = '_annotated'
    elif 'acl' in data_dir.lower():
        fill_missing = True
        aspects = [
            'RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS',
            'MEANINGFUL_COMPARISON', 'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
            'CLARITY', 'IMPACT', 'REVIEWER_CONFIDENCE'
        ]
        review_dir_postfix = ''
    else:
        print('wrong dataset:', data_dir)
        sys.exit(1)

    # Loading datasets
    print('Reading datasets..')
    datasets = ['train', 'dev', 'test']
    paper_content_all = []
    review_content_all = []

    data = defaultdict(list)
    for dataset in datasets:

        review_dir = os.path.join(data_dir, dataset,
                                  'reviews%s/' % (review_dir_postfix))
        scienceparse_dir = os.path.join(data_dir, dataset, 'parsed_pdfs/')
        model_dir = os.path.join(data_dir, dataset, 'model/')
        if not os.path.exists(model_dir): os.makedirs(model_dir)

        paper_json_filenames = sorted(glob.glob(
            '{}/*.json'.format(review_dir)))

        # add all paper/review content to generate corpus for building vocab
        paper_content = []
        review_content = []
        for paper_json_filename in paper_json_filenames:
            d = {}
            paper = Paper.from_json(paper_json_filename)
            paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
                paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)

            review_contents = []
            reviews = []
            for review in paper.REVIEWS:
                review_contents.append(
                    preprocess(review.COMMENTS,
                               only_char=False,
                               lower=True,
                               stop_remove=False))
                reviews.append(review)

            d['paper_content'] = preprocess(
                paper.SCIENCEPARSE.get_paper_content(),
                only_char=False,
                lower=True,
                stop_remove=False)
            d['reviews_content'] = review_contents
            d['reviews'] = reviews
            data[dataset].append(d)

    print('Total number of papers %d' %
          (np.sum([len(d) for _, d in list(data.items())])))
    print(
        'Total number of reviews %d' %
        (np.sum([len(r['reviews']) for _, d in list(data.items())
                 for r in d])))

    # Loading VOCAB
    print('Building vocab...')
    words = []
    for _, d in list(data.items()):
        for p in d:
            words += p['paper_content'].split(' ')
            for r in p['reviews_content']:
                words += r.split(' ')
    print("Total words in corpus", len(words))

    vocab = OrderedDict()
    word_counter = Counter(words)
    vocab['PAD'] = 0
    vocab['UNK'] = 1
    for w, c in word_counter.most_common():
        if max_vocab_size:
            if len(vocab) >= max_vocab_size:
                break
        if len(w) and w not in vocab:
            vocab[w] = len(vocab)
    with open(vocab_path, 'w') as fout:
        for w, id in list(vocab.items()):
            fout.write('%s\t%s\n' % (w, id))
    vocab_inv = {int(i): v for v, i in list(vocab.items())}
    print("Total vocab of size", len(vocab))

    # Loading DATA
    print('Reading reviews from...')
    data_padded = []
    for dataset in datasets:

        ds = data[dataset]

        x_paper = []  #[None] * len(reviews)
        x_review = []  #[None] * len(reviews)
        y = []  #[None] * len(reviews)

        for d in ds:
            paper_content = d['paper_content']
            reviews_content = d['reviews_content']
            reviews = d['reviews']

            for rid, (review_content,
                      review) in enumerate(zip(reviews_content, reviews)):
                paper_ids = [
                    vocab[w] if w in vocab else 1
                    for w in paper_content.split(' ')
                ]
                review_ids = [
                    vocab[w] if w in vocab else 1
                    for w in review_content.split(' ')
                ]

                paper_ids = pad_sentence(paper_ids, max_len_paper, 0)
                review_ids = pad_sentence(review_ids, max_len_review, 0)

                xone = (paper_ids, review_ids)
                yone = [np.nan] * len(aspects)

                for aid, aspect in enumerate(aspects):
                    if aspect in review.__dict__ and review.__dict__[
                            aspect] is not None:
                        yone[aid] = float(review.__dict__[aspect])
                #print rid,len(xone[0]), len(xone[1]), yone

                x_paper.append(xone[0])
                x_review.append(xone[1])
                y.append(yone)

        x_paper = np.array(x_paper, dtype=np.int32)
        x_review = np.array(x_review, dtype=np.int32)
        y = np.array(y, dtype=np.float32)

        # add average value of missing aspect value
        if fill_missing:
            col_mean = np.nanmean(y, axis=0)
            inds = np.where(np.isnan(y))
            y[inds] = np.take(col_mean, inds[1])

        print(
            'Total %s dataset: %d/%d' % (dataset, len(x_paper), len(x_review)),
            x_paper.shape, x_review.shape, y.shape)
        data_padded.append((x_paper, x_review))
        data_padded.append(y)

    return data_padded, vocab, vocab_inv, label_scale, aspects
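
pad_sentence is assumed by the code above but not shown; a minimal sketch consistent with its usage (truncate or right-pad a list of token ids to a fixed length with a pad id):

def pad_sentence(token_ids, max_len, pad_id):
    # Truncate long sequences, right-pad short ones with pad_id.
    if len(token_ids) >= max_len:
        return token_ids[:max_len]
    return token_ids + [pad_id] * (max_len - len(token_ids))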
Example #5
def main():

    # Loading annotations
    annots = get_annots_dic()
    print 'Loaded annots: %d papers and %d reviews' % (
        len(annots), sum([len(v) for k, v in annots.items()]))

    # Loading reviews, merging them with annotations, and saving into new directory
    data_dir = "../../data/iclr_2017"  # args[1]   #train/reviews
    datasets = ['train', 'dev', 'test']
    print 'Reading reviews from...'
    for dataset in datasets:

        cnt_p, cnt_r = 0, 0

        review_dir = os.path.join(data_dir, dataset, 'reviews_raw/')
        review_annotated_dir = os.path.join(data_dir, dataset, 'reviews/')
        scienceparse_dir = os.path.join(data_dir, dataset, 'scienceparse/')
        model_dir = os.path.join(data_dir, dataset, 'model')
        if not os.path.exists(model_dir): os.makedirs(model_dir)

        review_files = sorted(glob.glob('{}/*.json'.format(review_dir)))
        pids = []
        for paper_json_filename in review_files:
            paper = Paper.from_json(paper_json_filename)
            reviews_combined = []
            reviews_annotated = annots[paper.ID]
            reviews_original = paper.REVIEWS

            # overwrite annotated fields onto the original reviews; each original
            # review is appended below, so reviews_combined is not pre-populated
            # here (doing so would duplicate every review)

            for r_original in reviews_original:

                r_combined = None  #r_original
                for r_annotated in reviews_annotated:

                    if r_annotated['OTHER_KEYS'] == r_original.OTHER_KEYS:

                        r_combined = r_original
                        for k, v in r_annotated.items():
                            if k in [
                                    'RECOMMENDATION_UNOFFICIAL', 'SUBSTANCE',
                                    'APPROPRIATENESS', 'MEANINGFUL_COMPARISON',
                                    'SOUNDNESS_CORRECTNESS', 'ORIGINALITY',
                                    'CLARITY', 'IMPACT'
                            ]:
                                setattr(r_combined, k, v)
                        setattr(r_combined, 'IS_ANNOTATED', True)

                if r_combined is None:
                    reviews_combined.append(r_original)
                else:
                    reviews_combined.append(r_combined)

            paper.REVIEWS = reviews_combined
            cnt_r += len(paper.REVIEWS)

            # save to /reviews_annotated
            json.dump(
                paper.to_json_object(),
                open(review_annotated_dir + '/%s.json' % (paper.ID), 'w'))
            print paper.ID, len(paper.REVIEWS)
            cnt_p += 1
        print dataset, cnt_p, cnt_r
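
get_annots_dic is not shown; from the way annots is used, it presumably maps a paper ID to a list of annotation dicts keyed by OTHER_KEYS plus the aspect names listed above. An illustrative, assumed shape:

# Illustrative shape only; the real loader and values are assumptions.
annots = {
    '1234': [                       # paper ID
        {
            'OTHER_KEYS': 'r1',     # matched against review.OTHER_KEYS
            'RECOMMENDATION_UNOFFICIAL': 3,
            'SUBSTANCE': 4,
            'CLARITY': 3,
            'IMPACT': 2,
        },
    ],
}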
Example #6
def main(args, lower=True, max_vocab_size=False, encoder='bowtfidf'):
    argc = len(args)

    if argc < 9:
        print(
            "Usage:", args[0],
            "<paper-json-dir> <scienceparse-dir> <out-dir> <submission-year> <feature output file> <tfidf vector file> <max_vocab_size> <encoder> <hand-feature>"
        )
        return -1

    paper_json_dir = args[1]  #train/reviews
    scienceparse_dir = args[2]  #train/parsed_pdfs
    out_dir = args[3]  #train/dataset
    feature_output_file = args[4]  #train/dataset/features.dat
    vect_file = args[5]  #train/dataset/vect.pkl
    max_vocab_size = False if args[6] == 'False' else int(
        args[6])  # False or integer
    encoder = False if args[7] == 'False' else str(args[7])
    hand = False if args[8] == 'False' else str(args[8])

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    is_train = True
    vect = None
    idToFeature = None
    if os.path.isfile(feature_output_file):
        is_train = False
        idToFeature = read_features(feature_output_file)
        if encoder:
            print 'Loading vector file from...', vect_file
            vect = load_vect(vect_file)
    else:
        print 'Loading vector file from scratch..'
        idToFeature = dict()

    outLabelsFile = open(
        out_dir + '/labels_%s_%s_%s.tsv' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')
    outIDFile = open(
        out_dir + '/ids_%s_%s_%s.tsv' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')
    outSvmLiteFile = open(
        out_dir + '/features.svmlite_%s_%s_%s.txt' %
        (str(max_vocab_size), str(encoder), str(hand)), 'w')

    ################################
    # read reviews
    ################################
    print 'Reading reviews from...', paper_json_dir
    paper_content_corpus = []  #""
    paper_json_filenames = sorted(glob.glob(
        '{}/*.json'.format(paper_json_dir)))
    papers = []
    for paper_json_filename in paper_json_filenames:
        paper = Paper.from_json(paper_json_filename)
        paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(
            paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)
        paper_content_corpus.append(paper.SCIENCEPARSE.get_paper_content())
        papers.append(paper)
    random.shuffle(papers)
    print 'Total number of reviews', len(papers)

    def get_feature_id(feature):
        if feature in idToFeature:
            return idToFeature[feature]
        else:
            return None

    def addFeatureToDict(fname):
        id = len(idToFeature)
        idToFeature[fname] = id

    ################################
    # Initialize vocabulary
    ################################
    outCorpusFilename = out_dir + '/corpus.pkl'
    if not os.path.isfile(outCorpusFilename):
        paper_content_corpus = [
            preprocess(p, only_char=True, lower=True, stop_remove=True)
            for p in paper_content_corpus
        ]
        paper_content_corpus_words = []
        for p in paper_content_corpus:
            paper_content_corpus_words += p.split(' ')
        pkl.dump(paper_content_corpus_words, open(outCorpusFilename, 'wb'))
    else:
        paper_content_corpus_words = pkl.load(open(outCorpusFilename, 'rb'))
    print 'Total words in corpus', len(paper_content_corpus_words)

    ################################
    # Encoding
    ################################
    print 'Encoding..', encoder
    # 1) tf-idf features on title/author_names/domains
    if not encoder:
        print 'No encoder', encoder
    elif encoder in ['bow', 'bowtfidf']:
        word_counter = Counter(paper_content_corpus_words)
        # vocab limit by frequency
        if max_vocab_size:
            word_counter = dict(word_counter.most_common()[:max_vocab_size])
        vocabulary = dict()
        for w in word_counter:
            if len(w) and w not in vocabulary:
                if is_train:
                    vocabulary[w] = len(vocabulary)
                    addFeatureToDict(w)
                else:
                    fid = get_feature_id(w)
                    if fid is not None:
                        vocabulary[w] = fid
        print("Got vocab of size", len(vocabulary))
        if is_train:
            print 'Saving vectorizer to', vect_file
            if encoder == 'bow':
                vect = CountVectorizer(max_df=0.5,
                                       analyzer='word',
                                       stop_words='english',
                                       vocabulary=vocabulary)
            else:
                vect = TfidfVectorizer(sublinear_tf=True,
                                       max_df=0.5,
                                       analyzer='word',
                                       stop_words='english',
                                       vocabulary=vocabulary)
            vect.fit([p for p in paper_content_corpus])
            save_vect(vect, vect_file)

    # 2) sentence encoder features
    elif encoder in ['w2v', 'w2vtfidf']:
        from sent2vec import MeanEmbeddingVectorizer, TFIDFEmbeddingVectorizer, import_embeddings
        if is_train:
            w2v = import_embeddings()
            vect = MeanEmbeddingVectorizer(
                w2v) if encoder == 'w2v' else TFIDFEmbeddingVectorizer(w2v)
            for f in range(vect.dim):
                #fid = get_feature_id()
                addFeatureToDict('%s%d' % (encoder, f))
            print 'Saving vectorizer to', vect_file
            if encoder == 'w2vtfidf':
                vect.fit([p for p in paper_content_corpus])
            save_vect(vect, vect_file)
    else:
        print 'Wrong type of encoder', encoder
        sys.exit(1)

    ################################
    # Add features
    ################################
    if encoder:
        all_titles = []
        for p in papers:
            sp = p.get_scienceparse()
            title = p.get_title()
            all_title = preprocess(title,
                                   only_char=True,
                                   lower=True,
                                   stop_remove=True)
            all_titles.append(all_title)
        all_titles_features = vect.transform(all_titles)

    if is_train:
        print 'saving features to file', feature_output_file
        if hand:
            addFeatureToDict("get_most_recent_reference_year")
            addFeatureToDict("get_num_references")
            addFeatureToDict("get_num_refmentions")
            addFeatureToDict("get_avg_length_reference_mention_contexts")
            addFeatureToDict("abstract_contains_deep")
            addFeatureToDict("abstract_contains_neural")
            addFeatureToDict("abstract_contains_embedding")
            addFeatureToDict("abstract_contains_outperform")
            addFeatureToDict("abstract_contains_novel")
            addFeatureToDict("abstract_contains_state_of_the_art")
            addFeatureToDict("abstract_contains_state-of-the-art")

            addFeatureToDict("get_num_recent_references")
            addFeatureToDict("get_num_ref_to_figures")
            addFeatureToDict("get_num_ref_to_tables")
            addFeatureToDict("get_num_ref_to_sections")
            addFeatureToDict("get_num_uniq_words")
            addFeatureToDict("get_num_sections")
            addFeatureToDict("get_avg_sentence_length")
            addFeatureToDict("get_contains_appendix")
            addFeatureToDict("proportion_of_frequent_words")
            addFeatureToDict("get_title_length")
            addFeatureToDict("get_num_authors")

            addFeatureToDict("get_num_ref_to_equations")
            addFeatureToDict("get_num_ref_to_theorems")

        save_features_to_file(idToFeature, feature_output_file)

    id = 1
    hfws, most_frequent_words, least_frequent_words = count_words(
        paper_content_corpus_words, 0.01, 0.05, 3)
    for p in papers:
        outIDFile.write(str(id) + "\t" + str(p.get_title()) + "\n")
        rs = [r.get_recommendation() for r in p.get_reviews()]
        rec = int(p.get_accepted() == True)
        outLabelsFile.write(str(rec) + "\n")
        outSvmLiteFile.write(str(rec) + " ")

        sp = p.get_scienceparse()

        if encoder:
            title_tfidf = all_titles_features[id - 1]
            if encoder.startswith('bow'):
                nz = title_tfidf.nonzero()[1]
                for word_id in sorted(nz):
                    outSvmLiteFile.write(
                        str(word_id) + ":" + str(title_tfidf[0, word_id]) +
                        " ")
            elif encoder.startswith('w2v'):
                for word_id in range(vect.dim):
                    outSvmLiteFile.write(
                        str(word_id) + ":" + str(title_tfidf[word_id]) + " ")
            else:
                print 'wrong encoder', encoder
                sys.exit(1)

        if hand:
            outSvmLiteFile.write(
                str(get_feature_id("get_most_recent_reference_year")) + ":" +
                str(sp.get_most_recent_reference_year() - 2000) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_references")) + ":" +
                str(sp.get_num_references()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_refmentions")) + ":" +
                str(sp.get_num_refmentions()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id(
                    "get_avg_length_reference_mention_contexts")) + ":" +
                str(sp.get_avg_length_reference_mention_contexts()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_deep")) + ":" +
                str(int(p.abstract_contains_a_term("deep"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_neural")) + ":" +
                str(int(p.abstract_contains_a_term("neural"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_embedding")) + ":" +
                str(int(p.abstract_contains_a_term("embedding"))) + " ")

            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_outperform")) + ":" +
                str(int(p.abstract_contains_a_term("outperform"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_novel")) + ":" +
                str(int(p.abstract_contains_a_term("novel"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_state_of_the_art")) +
                ":" +
                str(int(p.abstract_contains_a_term("state of the art"))) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("abstract_contains_state-of-the-art")) +
                ":" +
                str(int(p.abstract_contains_a_term("state-of-the-art"))) + " ")

            outSvmLiteFile.write(
                str(get_feature_id("get_num_recent_references")) + ":" +
                str(sp.get_num_recent_references(2017)) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_figures")) + ":" +
                str(sp.get_num_ref_to_figures()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_tables")) + ":" +
                str(sp.get_num_ref_to_tables()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_sections")) + ":" +
                str(sp.get_num_ref_to_sections()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_uniq_words")) + ":" +
                str(sp.get_num_uniq_words()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_sections")) + ":" +
                str(sp.get_num_sections()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_avg_sentence_length")) + ":" +
                str(sp.get_avg_sentence_length()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_contains_appendix")) + ":" +
                str(sp.get_contains_appendix()) + " ")

            outSvmLiteFile.write(
                str(get_feature_id("proportion_of_frequent_words")) + ":" +
                str(
                    round(
                        sp.get_frequent_words_proportion(
                            hfws, most_frequent_words, least_frequent_words),
                        3)) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_title_length")) + ":" +
                str(p.get_title_len()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_authors")) + ":" +
                str(sp.get_num_authors()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_equations")) + ":" +
                str(sp.get_num_ref_to_equations()) + " ")
            outSvmLiteFile.write(
                str(get_feature_id("get_num_ref_to_theorems")) + ":" +
                str(sp.get_num_ref_to_theorems()) + " ")

        outSvmLiteFile.write("\n")
        id += 1

    outLabelsFile.close()
    outIDFile.close()
    outSvmLiteFile.close()
    print 'saved', outLabelsFile.name
    print 'saved', outIDFile.name
    print 'saved', outSvmLiteFile.name
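
read_features and save_features_to_file are assumed by the script but not shown; the sketches below are hedged guesses that match how they are used (a two-column TSV mapping feature name to feature id).

def save_features_to_file(idToFeature, path):
    # Write one "name<TAB>id" line per feature.
    with open(path, 'w') as fout:
        for name, fid in idToFeature.items():
            fout.write('%s\t%d\n' % (name, fid))


def read_features(path):
    # Inverse of save_features_to_file: rebuild the name -> id dict.
    idToFeature = {}
    with open(path) as fin:
        for line in fin:
            name, fid = line.rstrip('\n').split('\t')
            idToFeature[name] = int(fid)
    return idToFeature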
Example #7
def prepare_data(
    data_dir,
    vocab_path='vocab',
    max_vocab_size = 20000,
    max_len_paper=1000,
    max_len_review=200):


  data_type = data_dir.split('/')[-1]
  vocab_path += '.' + data_type
  if max_vocab_size: vocab_path += '.'+str(max_vocab_size)
  vocab_path = data_dir +'/'+ vocab_path

  label_scale = 5
  if 'iclr' in data_dir.lower():
    fill_missing = False
    aspects = ['RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS','MEANINGFUL_COMPARISON','SOUNDNESS_CORRECTNESS','ORIGINALITY','CLARITY', 'IMPACT', 'RECOMMENDATION_ORIGINAL']
    review_dir_postfix = ''
  elif 'acl' in data_dir.lower():
    fill_missing = True
    aspects = ['RECOMMENDATION', 'SUBSTANCE', 'APPROPRIATENESS','MEANINGFUL_COMPARISON','SOUNDNESS_CORRECTNESS','ORIGINALITY','CLARITY','IMPACT', 'REVIEWER_CONFIDENCE' ]
    review_dir_postfix = ''
  else:
    print 'wrong dataset:',data_dir
    sys.exit(1)


  #Loading datasets
  print 'Reading datasets..'
  datasets = ['train','dev','test']
  paper_content_all = []
  review_content_all = []

  data = defaultdict(list)
  for dataset in datasets:

    review_dir = os.path.join(data_dir,  dataset, 'reviews%s/'%(review_dir_postfix))
    scienceparse_dir = os.path.join(data_dir, dataset, 'parsed_pdfs/')
    model_dir = os.path.join(data_dir, dataset, 'model/')
    if not os.path.exists(model_dir): os.makedirs(model_dir)

    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_dir)))

    # add all paper/review content to generate corpus for building vocab
    paper_content = []
    review_content = []
    for paper_json_filename in paper_json_filenames:
      d = {}
      paper = Paper.from_json(paper_json_filename)
      paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(paper.ID, paper.TITLE, paper.ABSTRACT, scienceparse_dir)

      review_contents = []
      reviews = []
      for review in paper.REVIEWS:
        # preprocessing of review comments is disabled in this variant
        review_contents.append(review.COMMENTS)
        reviews.append(review)
      d['paper'] = paper
      # preprocessing of the paper content is likewise disabled in this variant
      d['paper_content'] = paper.SCIENCEPARSE.get_paper_content()
      d['reviews_content'] = review_contents
      d['reviews'] = reviews
      data[dataset].append(d)

  print 'Total number of papers %d' %(np.sum([len(d) for _,d in data.items()]))
  print 'Total number of reviews %d' %(np.sum([len(r['reviews']) for _,d in data.items() for r in d ]))
    
  # Loading DATA
  print 'Reading reviews from...'
  data_padded = []
  for dataset in datasets:

    ds = data[dataset]
    papers = []
    x_paper = [] #[None] * len(reviews)
    x_review = [] #[None] * len(reviews)
    y = [] #[None] * len(reviews)
    num_reviews = []
    x_reviews = []
    decision = []
    for d in ds:
      paper = d['paper']
      paper_content = d['paper_content']
      reviews_content = d['reviews_content']
      reviews = d['reviews']
      decision.append(paper.ACCEPTED)
      papers.append(paper)
      paper_sent = nltk.sent_tokenize(paper_content)
      reviews_sent = nltk.sent_tokenize(' '.join(reviews_content))
      x_paper.append(paper_sent)
      x_reviews.append(reviews_sent)
      num_reviews.append(len(reviews))
      for rid, (review_content, review) in enumerate(zip(reviews_content, reviews)):
        yone = [np.nan] * len(aspects)
        review_sent = nltk.sent_tokenize(review.__dict__['COMMENTS'])
        for aid, aspect in enumerate(aspects):
          if aspect in review.__dict__ and review.__dict__[aspect] is not None:
            yone[aid] = float(review.__dict__[aspect])
        x_review.append(review_sent)
        y.append(yone)

    y = np.array(y, dtype=np.float32)
    # add average value of missing aspect value
    if fill_missing:
      col_mean = np.nanmean(y,axis=0)
      inds = np.where(np.isnan(y))
      y[inds] = np.take(col_mean, inds[1])

    data_padded.append((x_paper,papers, x_review, num_reviews, x_reviews, decision))
    data_padded.append(y)

  return data_padded,label_scale,aspects
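
A hedged usage sketch of this sentence-level prepare_data variant follows; the data directory is borrowed from Example #5 and is an assumption here. data_padded alternates, per split, between the tuple (x_paper, papers, x_review, num_reviews, x_reviews, decision) and the corresponding aspect matrix y.

# Assumed data layout; adjust the path to wherever the splits actually live.
data_padded, label_scale, aspects = prepare_data('../../data/iclr_2017')

x_paper, papers, x_review, num_reviews, x_reviews, decision = data_padded[0]
y_train = data_padded[1]
print('%d training papers, %d training reviews' % (len(papers), len(x_review)))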