Example #1
def main():
    usage = "%prog parsed.ids.jsonlist articles.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('-v', dest='vocab_size', default=1000,
    #                  help='Maximum number of words to keep: default=%default')
    parser.add_option(
        '-m',
        dest='min_df',
        default=3,
        help='Minimum occurrence count for context words: default=%default')
    #parser.add_option('-d', dest='max_depth', default=2,
    #                  help='Max depth in parse tree: default=%default')
    #parser.add_option('-p', dest='pos', default=None,
    #                  help='Filter by POS tag(s) (e.g. JJ): default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    csv_file = args[1]
    output_dir = args[2]

    #max_depth = int(options.max_depth)
    min_df = int(options.min_df)
    #pos = options.pos

    lines = fh.read_jsonlist(infile)
    df = pd.read_csv(csv_file, header=0, index_col=0)

    stopwords = {'mr.', 'ms.', 'mrs.', 'major', 'maj.'}

    # go through all documents and build a vocab of relevant tuple words
    word_counts, entity_contexts = process_lines(lines, stopwords)

    print(word_counts.most_common(n=30))

    print("Size of full vocab = {:d}".format(len(word_counts)))
    vocab = [w for w, c in word_counts.items() if c >= min_df]
    vocab_size = len(vocab)
    print("Size of filtered vocab = {:d}".format(vocab_size))
    vocab.sort()

    vocab_index = dict(zip(vocab, range(len(vocab))))
    outlines = []
    for doc_id, words in entity_contexts.items():
        words = [word for word in words if word in vocab_index]
        if len(words) > 2:
            event_name = df.loc[doc_id, 'title']
            outlines.append({
                'id': doc_id,
                'text': ' '.join(words),
                'event_name': event_name,
                'name': event_name + '_' + str(doc_id)
            })

    fh.write_jsonlist(outlines, os.path.join(output_dir, 'tuples.jsonlist'))
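Every example on this page goes through a small file-handling helper imported as fh, which is not shown. A minimal sketch that is consistent with how it is used here (one JSON object per line, optional key sorting and gzip compression) might look like the following; the function names are taken from the calls in these examples, but the bodies and default arguments are assumptions, not the original module.

# Assumed stand-in for the unshown `fh` helper module; signatures are
# inferred from the calls in these examples, not copied from the project.
import gzip
import json
import os

def read_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def read_jsonlist(path):
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def write_jsonlist(items, path, sort_keys=True, do_gzip=False):
    opener = gzip.open if do_gzip else open
    with opener(path, 'wt', encoding='utf-8') as f:
        for item in items:
            f.write(json.dumps(item, sort_keys=sort_keys) + '\n')

def makedirs(path):
    os.makedirs(path, exist_ok=True)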
Example #2
def main():
    usage = "%prog msa_db.csv data_dir output_file.jsonlist"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    msa_db = args[0]
    data_dir = args[1]
    output_filename = args[2]

    articles = []

    exclude = [
        'murderpedia.org', 'www.gunviolencearchive.org', 'www.fbi.gov',
        'en.wikipedia.org', 'www.history.com', 'web.archive.org'
    ]

    df = pd.read_csv(msa_db, header=0)
    index = df.index
    for i in index:
        row = df.loc[i]
        caseid = row['CaseID']
        title = row['Title']
        names = row['Shooter Name'].split()
        #subdirs = glob.glob(os.path.join(data_dir, '*_*'))
        subdir = os.path.join(data_dir, str(caseid) + '_' + '_'.join(names))
        if not os.path.exists(subdir):
            files = glob.glob(
                os.path.join(data_dir,
                             str(caseid) + '_*', '*.json'))
        else:
            files = glob.glob(os.path.join(subdir, '*.json'))
        print(subdir, len(files))
        for f in files:
            data = fh.read_json(f)
            text = data['text']
            url = data['url']
            parts = url.split('/')
            domain = parts[2]
            if len(text) > 200:
                if domain not in exclude:
                    articles.append({
                        'id': str(i),
                        'caseid': str(caseid),
                        'event_name': title,
                        'text': text
                    })

    fh.write_jsonlist(articles, output_filename, sort_keys=False)
Example #3
def main():
    usage = "%prog input.jsonlist output_dir [labels.csv covariates.csv ...]"
    parser = OptionParser(usage=usage)
    parser.add_option('--test_prop', dest='test_prop', default=0.2,
                      help='proportion of documents to use for test data: default=%default')
    parser.add_option('--train', dest='train', default='train',
                      help='output prefix for training data: default=%default')
    parser.add_option('--test', dest='test', default='test',
                      help='output prefix for test data: default=%default')

    (options, args) = parser.parse_args()
    infile = args[0]
    output_dir = args[1]
    if len(args) > 2:
        csv_files = args[2:]
    else:
        csv_files = []

    test_prop = float(options.test_prop)
    train_prefix = options.train
    test_prefix = options.test

    print("Reading", infile)
    items = fh.read_jsonlist(infile)
    n_items = len(items)

    n_test = int(n_items * test_prop)
    print("Creating random test set of %d items" % n_test)
    n_train = n_items - n_test
    train_indices = np.random.choice(np.arange(n_items), size=n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_items = [items[i] for i in train_indices]
    test_items = [items[i] for i in test_indices]

    fh.write_jsonlist(train_items, os.path.join(output_dir, train_prefix + '.jsonlist'))
    fh.write_jsonlist(test_items, os.path.join(output_dir, test_prefix + '.jsonlist'))

    for file in csv_files:
        print(file)
        basename = os.path.basename(file)
        df = pd.read_csv(file, header=0, index_col=0)
        train_df_index = [df.index[i] for i in train_indices]
        train_df = df.loc[train_df_index]
        train_df.to_csv(os.path.join(output_dir, train_prefix + '.' + basename))

        test_df_index = [df.index[i] for i in test_indices]
        test_df = df.loc[test_df_index]
        test_df.to_csv(os.path.join(output_dir, test_prefix + '.' + basename))
Example #4
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):

    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)

    #assert n_files == n_rows

    coref_input = []

    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)

        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])

        if valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)

            # get the text and convert to tokens
            sentences, lemmas, pos_tags, speakers, dependencies, target_mentions, age_pos_tags = process_parse(parse, names, age)
            pos_tags_all.update(age_pos_tags)

            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "lemmas": lemmas,
                                "speakers": speakers,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]
                                })

            print(i, names, age, len(target_mentions))

        fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
Example #5
def download_articles(name, categories, subset):

    data = []
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())

    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data.append({'text': line, 'group': newsgroups_data['target_names'][newsgroups_data['target'][i]]})

    print(len(data))
    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist'))
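A hypothetical invocation of download_articles, using two of the standard 20 Newsgroups category names; the arguments the original script actually passes are not shown in this snippet.

# Hypothetical call; 'sci.med' and 'sci.space' are real 20 Newsgroups
# categories, but the name/categories arguments here are assumptions.
download_articles('sci', ['sci.med', 'sci.space'], 'train')
download_articles('sci', ['sci.med', 'sci.space'], 'test')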
Example #6
    def preprocess(self):
        """Preprocess the raw data file"""
        if self._check_processed_exists():
            return

        train_lines = []
        test_lines = []
        unlabeled_lines = []

        print("Opening tar file")
        # read in the raw data
        tar = tarfile.open(os.path.join(self.root, self.raw_filename), "r:gz")
        # process all the data files in the archive
        print("Processing documents")
        for m_i, member in enumerate(tar.getmembers()):
            # Display occasional progress
            if (m_i + 1) % 5000 == 0:
                print("Processed {:d} / 100000".format(m_i+1))
            # get the internal file name
            parts = member.name.split(os.sep)

            if len(parts) > 3:
                split = parts[1]  # train or test
                label = parts[2]  # pos, neg, or unsup
                name = parts[3].split('.')[0]
                doc_id, rating = name.split('_')
                doc_id = int(doc_id)
                rating = int(rating)

                # read the text from the archive
                f = tar.extractfile(member)
                raw = f.read()
                text = raw.decode("utf-8")
                # tokenize it using spacy
                if label != 'unsup':
                    # save the text, label, and original file name
                    doc = {'id': split + '_' + str(doc_id), 'text': text, 'sentiment': label, 'orig': member.name, 'rating': rating}
                    if split == 'train':
                        train_lines.append(doc)
                    elif split == 'test':
                        test_lines.append(doc)
                else:
                    doc = {'id': 'unlabeled_' + str(doc_id), 'text': text, 'sentiment': None, 'orig': member.name, 'rating': rating}
                    unlabeled_lines.append(doc)

        print("Saving processed data to {:s}".format(self.root))
        fh.write_jsonlist(train_lines, os.path.join(self.root, self.train_file))
        fh.write_jsonlist(test_lines, os.path.join(self.root, self.test_file))
        fh.write_jsonlist(unlabeled_lines, os.path.join(self.root, self.unlabeled_file))
Example #7
def main():
    usage = "%prog articles.jsonlist metadata.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    meta_file = args[1]
    output_dir = args[2]

    articles = fh.read_jsonlist(infile)
    df = pd.read_csv(meta_file, header=0, index_col=0)
    df.index = [str(i) for i in df.index]
    print(df.head())

    victim_counts = []
    fatality_counts = []
    white = []
    black = []
    mental = []

    outlines = []
    for line_i, line in enumerate(articles):
        if line_i % 1000 == 0:
            print(line_i)
        caseid = int(line['caseid'])
        name = line['name']
        if caseid == 156 or caseid == 168:
            # differentiate on name for two ids that have duplicates
            row = df[(df['CaseID'] == caseid) & (df['name'] == name)]
        else:
            # otherwise, just use the id
            row = df[df['CaseID'] == caseid]
        line['state'] = str(row['state'].values[0])
        line['white'] = int(row['ekg_white'])
        line['black'] = int(row['ekg_black'])
        white.append(int(row['ekg_white']))
        black.append(int(row['ekg_black']))
        line['mental'] = int(row['mental'])
        mental.append(int(row['mental']))
        line['fate'] = str(row['fate_at_scene'].values[0])
        line['fatalities'] = int(row['fatalities'])
        line['victims'] = int(row['victims'])
        victim_counts.append(int(row['victims']))
        fatality_counts.append(int(row['fatalities']))
        outlines.append(line)

    fh.write_jsonlist(outlines,
                      os.path.join(output_dir, 'articles.metadata.jsonlist'),
                      sort_keys=False)

    ids = list(range(len(victim_counts)))
    victims_df = pd.DataFrame(victim_counts, index=ids, columns=['victims'])
    victims_df.to_csv(os.path.join(output_dir, 'train.victims.csv'))

    fatalities_df = pd.DataFrame(fatality_counts,
                                 index=ids,
                                 columns=['fatalities'])
    fatalities_df.to_csv(os.path.join(output_dir, 'train.fatalities.csv'))

    white_df = pd.DataFrame(white, index=ids, columns=['white'])
    white_df.to_csv(os.path.join(output_dir, 'train.white.csv'))

    black_df = pd.DataFrame(black, index=ids, columns=['black'])
    black_df.to_csv(os.path.join(output_dir, 'train.black.csv'))

    mental_df = pd.DataFrame(mental, index=ids, columns=['mental'])
    mental_df.to_csv(os.path.join(output_dir, 'train.mental.csv'))
Example #8
def main():
    usage = "%prog data_dir output_dir output_prefix"
    parser = OptionParser(usage=usage)
    #parser.add_option('--year', dest='year', default=1987,
    #                  help='Year: default=%default')
    parser.add_option('--gzip',
                      action="store_true",
                      dest="gzip",
                      default=False,
                      help='gzip output: default=%default')
    #parser.add_option('--word2vec', action="store_true", dest="word2vec", default=False,
    #                  help='Output data processed for word2vec: default=%default')
    parser.add_option('--lower',
                      action="store_true",
                      dest="lower",
                      default=False,
                      help='Lower-case words: default=%default')
    parser.add_option('--replace_digits',
                      action="store_true",
                      dest="replace_digits",
                      default=False,
                      help='Replace digits with #: default=%default')
    parser.add_option('--fix_punct',
                      action="store_true",
                      dest="fix_punct",
                      default=False,
                      help='Fix some punctuation: default=%default')
    #parser.add_option('--timestamp', dest='timestamp', default=None,
    #                  help='List of words to timestamp (comma-separated): default=%default')

    (options, args) = parser.parse_args()
    base_dir = args[0]
    outdir = args[1]
    output_prefix = args[2]

    years = [str(year) for year in range(1987, 2008)]
    do_gzip = options.gzip
    word2vec = False
    lower = options.lower
    replace_digits = options.replace_digits
    fix_punct = options.fix_punct
    #words_to_timestamp = options.timestamp
    #if words_to_timestamp is not None:
    #    words_to_timestamp = words_to_timestamp.split(',')

    outfile = None
    if word2vec:
        outfile = os.path.join(outdir, output_prefix + '.txt')
        if os.path.exists(outfile):
            sys.exit("Error: output file already exists.")

        with codecs.open(outfile, 'w') as f:
            f.write('')

    n_words = 0
    for year in years:
        outlines = []
        data_dir = os.path.join(base_dir, year)
        files = glob.glob(os.path.join(data_dir, '*.tgz'))
        files.sort()
        for tgz in files:
            print(tgz)
            tar = tarfile.open(tgz, "r:gz")
            for member in tar.getmembers():
                #print(tgz, member.name)
                f = tar.extractfile(member)
                if f is not None:
                    name = member.name
                    parts = name.split('/')
                    month = int(parts[0])
                    day = int(parts[1])
                    #print(tgz, member, f)
                    xml_string = f.read()
                    root = et.fromstring(xml_string)
                    headlines = []
                    paragraphs = []
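                    # walk the NITF-style markup: headlines sit under
                    # body.head/hedline, article text under body.content
                    # blocks tagged class="full_text" (LEAD: summaries are skipped)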
                    for body in root.findall('body'):
                        #print(body)
                        for head in body.findall('body.head'):
                            #print(content)
                            for headline in head.findall('hedline'):
                                for child in headline:
                                    if child.text is not None:
                                        if 'class' not in child.attrib:
                                            headlines.append(child.text)

                        for content in body.findall('body.content'):
                            #print(content)
                            for block in content.findall('block'):
                                #print(block)
                                if block.attrib.get('class') == 'full_text':
                                    for child in block:
                                        if child.text is not None:
                                            if child.text[:5] != 'LEAD:':
                                                paragraphs.append(child.text)
                    if len(paragraphs) > 0:
                        try:
                            if word2vec:
                                if len(headlines) > 0:
                                    for headline in headlines:
                                        lines = headline.split('\n')
                                        for line in lines:
                                            n_words += len(line.split())
                                            outlines.append(line)
                                for paragraph in paragraphs:
                                    lines = paragraph.split('\n')
                                    for line in lines:
                                        n_words += len(line.split())
                                        outlines.append(line)
                            else:
                                headline = '\n\n'.join(headlines)
                                body = '\n\n'.join(paragraphs)

                                headline = fix_line(headline, lower,
                                                    replace_digits, fix_punct)
                                body = fix_line(body, lower, replace_digits,
                                                fix_punct)

                                outlines.append({
                                    'body': body,
                                    'headline': headline,
                                    'year': year,
                                    'month': month,
                                    'day': day
                                })
                        except Exception:
                            print(tgz, member.name)
                            print(headlines)
                            print(paragraphs)
                            print(year)
                            print(month)
                            print(day)
                            sys.exit()
        if word2vec:
            output_line = ''
            for line in outlines:
                output_line += line + '\n'
            output_line = fix_line(output_line, lower, replace_digits,
                                   fix_punct)
            #if words_to_timestamp is not None:
            #    for word in words_to_timestamp:
            #        output_line = re.sub(word, word + '_' + str(year), output_line)
            with codecs.open(outfile, 'a') as f:
                f.write(output_line)
        else:
            outfile = os.path.join(outdir,
                                   output_prefix + '_' + year + '.jsonlist')
            if do_gzip:
                outfile += '.gz'
            fh.write_jsonlist(outlines,
                              outfile,
                              sort_keys=False,
                              do_gzip=do_gzip)

    print("Total tokens = %d" % n_words)
Example #9
        'stance': s['Stance'],
        'body_id': s['Body ID'],
        'body': bodies[s['Body ID']]
    } for s in stances]
    return data


if __name__ == '__main__':
    random.seed(3000)
    split = 0.8

    train_stances = 'fnc-1/train_stances.csv'
    train_bodies = 'fnc-1/train_bodies.csv'
    test_stances = 'fnc-1/competition_test_stances.csv'
    test_bodies = 'fnc-1/competition_test_bodies.csv'

    train_data = 'data/train_data.csv'
    dev_data = 'data/dev_data.csv'
    test_data = 'data/test_data.csv'

    train = merge_data(train_stances, train_bodies)
    random.shuffle(train)
    split_ind = int(len(train) * split)

    train, dev = train[:split_ind], train[split_ind:]
    test = merge_data(test_stances, test_bodies)

    fh.write_jsonlist(train, train_data, sort_keys=True)
    fh.write_jsonlist(dev, dev_data, sort_keys=True)
    fh.write_jsonlist(test, test_data, sort_keys=True)
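The top of merge_data() is cut off in this snippet. Assuming the FNC-1 CSV layout (stance files with Headline / Body ID / Stance columns, body files with Body ID / articleBody), a plausible reconstruction of the full function is sketched below; the column names and csv-based reading are assumptions.

# Hypothetical reconstruction of the truncated merge_data(); column names
# follow the FNC-1 CSV files but are assumed here, not taken from the source.
import csv

def merge_data(stances_file, bodies_file):
    with open(bodies_file, encoding='utf-8') as f:
        bodies = {row['Body ID']: row['articleBody'] for row in csv.DictReader(f)}
    with open(stances_file, encoding='utf-8') as f:
        stances = list(csv.DictReader(f))
    data = [{
        'headline': s['Headline'],
        'stance': s['Stance'],
        'body_id': s['Body ID'],
        'body': bodies[s['Body ID']]
    } for s in stances]
    return data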
Example #10
def main():
    usage = "%prog parsed.ids.jsonlist articles.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('-v', dest='vocab_size', default=1000,
    #                  help='Maximum number of words to keep: default=%default')
    parser.add_option(
        '-m',
        dest='min_df',
        default=3,
        help='Minimum occurrence count for context words: default=%default')
    parser.add_option('-d',
                      dest='max_depth',
                      default=2,
                      help='Max depth in parse tree: default=%default')
    parser.add_option('-p',
                      dest='pos',
                      default=None,
                      help='Filter by POS tag(s) (e.g. JJ): default=%default')
    #parser.add_option('--filter', action="store_true", dest="filter", default=False,
    #                  help='Filter out unknown mental: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    csv_file = args[1]
    output_dir = args[2]

    max_depth = int(options.max_depth)
    min_df = int(options.min_df)
    pos = options.pos
    #filter = options.filter

    lines = fh.read_jsonlist(infile)
    df = pd.read_csv(csv_file, header=0, index_col=0)

    stopwords = set()

    # go through all documents and build a vocab of relevant tuple words
    search_terms = ['mental', 'terrorism']
    word_counts, entity_contexts, words_found = process_lines(
        lines, stopwords, search_terms, max_depth=max_depth, pos=pos)

    print(word_counts.most_common(n=30))

    print("Size of full vocab = {:d}".format(len(word_counts)))
    vocab = [w for w, c in word_counts.items() if c >= min_df]
    vocab_size = len(vocab)
    print("Size of filtered vocab = {:d}".format(vocab_size))
    vocab.sort()

    vocab_index = dict(zip(vocab, range(len(vocab))))

    outlines = []
    for doc_id, words in entity_contexts.items():
        # keep only words that are in the filtered vocab
        words = [word for word in words if word in vocab_index]

        if len(words) > 2:
            event_name = df.loc[doc_id, 'title']
            text = ' '.join(words)
            outline = {'id': doc_id, 'text': text, 'event_name': event_name}
            outline['name'] = event_name + '_' + str(doc_id)
            outline['simple_race'] = df.loc[doc_id, 'simple_race']
            outline['white'] = int(df.loc[doc_id, 'white'])
            for term in search_terms:
                if words_found[doc_id][term] > 0:
                    outline[term] = 1
                else:
                    outline[term] = 0

            #if filter:
            #    if outline['mental'] != 'Unknown':
            #        outlines.append(outline)
            #else:
            outlines.append(outline)
    """
    all_events = {}
    for doc_id, words in entity_contexts.items():
        # filter out duplicates
        words = [word for word in words if word in vocab_index]
        event_name = df.loc[doc_id, 'title']
        if event_name in all_events:
            all_events[event_name]['words'] = all_events[event_name]['words'] + words
        else:
            all_events[event_name] = {'id': doc_id, 'words': words, 'event_name': event_name, 'name': event_name + '_' + str(doc_id)}

    outlines = []
    for key, value in all_events.items():
        if len(value['words']) > 2:
            outlines.append({'id': value['id'], 'text': ' '.join(value['words']), 'event_name': key})
    """
    fh.write_jsonlist(outlines, os.path.join(output_dir, 'contexts.jsonlist'))
Example #11
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):

    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)

    #assert n_files == n_rows

    coref_input = []

    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)

        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        event_name = 'msa-' + re.sub(r'\s', '-', df.loc[i, 'title'])

        msa_index = int(df.loc[i, 'df_index'])

        if msa_index == 272:
            # Kalamzoo duplicate
            print("Skipping", i, event_name)
        elif msa_index == 276:
            # Belfair duplicate
            print("Skipping", i, event_name)
        elif msa_index == 293:
            # Sherman, Texas duplicate
            print("Skipping", i, event_name)
        elif msa_index == 280:
            # Chelsea, MA duplicate
            print("Skipping", i, event_name)
        elif msa_index == 283:
            # Kansas City duplicate
            print("Skipping", i, event_name)
        elif msa_index == 331:
            # Cape Coral
            print("Skipping", i, event_name)

        elif valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)

            # get the text and convert to tokens
            sentences, sentences_tagged, target_mentions, pos_tags, dependencies = process_parse(parse, names, age, event_name)

            sentences_pruned = []
            for sent in sentences_tagged:
                tokens = [token for token in sent if token != '__DROP__']
                sentences_pruned.append(' '.join(tokens))
            text_pruned = ' '.join(sentences_pruned)

            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "text_tagged": text_pruned,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]
                                })

            print(i, names, age, len(target_mentions))

        fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
Example #12
def append_sims(split_data, filepath, namespace):
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    data = [d.item() for d in data]
    assert len(split_data) == len(data)

    for ex, d in tqdm(zip(split_data, data), total=len(data)):
        ex[namespace] = d


train_data = fh.read_jsonlist('train_data.csv')
dev_data = fh.read_jsonlist('dev_data.csv')
test_data = fh.read_jsonlist('test_data.csv')

append_data(train_data, 'train_bodies_senti.pkl', 'body_senti')
append_data(dev_data, 'dev_bodies_senti.pkl', 'body_senti')
append_data(test_data, 'test_bodies_senti.pkl', 'body_senti')

append_data(train_data, 'train_headline_senti.pkl', 'headline_senti')
append_data(dev_data, 'dev_headline_senti.pkl', 'headline_senti')
append_data(test_data, 'test_headline_senti.pkl', 'headline_senti')

append_sims(train_data, 'train_cos_sims.pkl', 'cos_sim')
append_sims(dev_data, 'dev_cos_sims.pkl', 'cos_sim')
append_sims(test_data, 'test_cos_sims.pkl', 'cos_sim')

fh.write_jsonlist(train_data, 'train_data.jsonl')
fh.write_jsonlist(dev_data, 'dev_data.jsonl')
fh.write_jsonlist(test_data, 'test_data.jsonl')
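append_data() is called above but never defined in this snippet; it presumably mirrors append_sims() and attaches each pickled entry (e.g. a sentiment score) to the matching example. A hypothetical version under that assumption:

# Hypothetical helper, not part of the original snippet; assumed to behave
# like append_sims() without the per-element .item() conversion.
import pickle
from tqdm import tqdm

def append_data(split_data, filepath, namespace):
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    assert len(split_data) == len(data)
    for ex, d in tqdm(zip(split_data, data), total=len(data)):
        ex[namespace] = d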