Example #1
def build_dataset(args):
    for split in ['train', 'val', 'test']:
        print(split)
        t1 = time()

        print('start building dataset')
        if args.worker_num == 1 and cpu_count() > 1:
            print(
                '[INFO] There are %d CPUs in your device, please increase -worker_num to speed up'
                % (cpu_count()))
            print(
                "       It's an IO-intensive application, so 2~10 may be a good choice"
            )

        files = list(iter_files(os.path.join(args.source_dir, split)))
        data_num = len(files)
        group_size = data_num // args.worker_num
        groups = []
        for i in range(args.worker_num):
            if i == args.worker_num - 1:
                groups.append(files[i * group_size:])
            else:
                groups.append(files[i * group_size:(i + 1) * group_size])
        p = Pool(processes=args.worker_num)
        multi_res = [p.apply_async(worker, (fs, )) for fs in groups]
        res = [res.get() for res in multi_res]
        p.close()
        p.join()

        with open(os.path.join(args.target_dir, "%s.json" % split), 'w') as f:
            for row in chain(*res):
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

        t2 = time()
        print('Time Cost : %.1f seconds' % (t2 - t1))
Example #2
def eval_distance_to_orig():
    all_sents = None
    with open(sys.argv[1]) as f:
        timeline = timelines.Timeline.from_file(f)
        all_sents = list(sent for date in timeline for sent in timeline[date])

    docs = []

    reader = StanfordXMLReader()
    for dirname in iter_dirs(sys.argv[2]):
        for filename in iter_files(dirname, ".htm.cont.tokenized"):
            try:
                docs.append(reader.run(filename))
            except Exception:
                # Skip documents the reader fails to parse.
                pass

    tfidf = TfidfVectorizer(stop_words=None)
    tfidf.fit(map(lambda d: d.plaintext, docs))

    sent_vecs = tfidf.transform(
        map(lambda s: " ".join(s.as_token_attr_sequence("form")),
            [sent for doc in docs for sent in doc.sentences]))
    tl_vecs = tfidf.transform(all_sents)

    sims = cosine_similarity(tl_vecs, sent_vecs)

    max_sims = np.max(sims, 1)

    print(max_sims)
    print("Median", np.median(max_sims))
    print("Min", np.min(max_sims))
    print(all_sents[np.argmin(max_sims)])
    print("Max", np.max(max_sims))
Example #3
def remove_old(path):
    files = iter_files(path)
    for file in files:
        filename = os.path.basename(file)
        print(filename)
        with open(os.path.join('./final', filename), 'a+') as f:
            lines = readlines(file)
            i = 0
            cnt = 0
            for line in lines:
                article = json.loads(line)
                if 'year' in article:
                    if int(article['year']) >= 2000:
                        if "author" in article:
                            article['author'] = list(set(article['author']))
                            tmp = json.dumps(article)
                            f.write(tmp + '\n')
                            f.flush()

                            i += 1
                            # if i % 100000 == 0:
                            #     print(i)
                        else:
                            cnt += 1

                else:
                    cnt += 1
        print('%s  skip:%d, save %d' % (filename, cnt, i))
Example #4
def extract_json_dir(src, des):
    i = 0
    files = list(iter_files(src))
    length = len(files)
    for id, file in enumerate(files):
        print("%d/%d" % (id, length))
        with open(file) as f_in:
            paper = json.load(f_in)

        abs_len = len(paper['abstract'].split())
        art_len = len(paper['article'].split())
        # Skip papers whose abstract is longer than the article or longer than 210 tokens.
        if abs_len > art_len:
            continue
        if abs_len > 210: continue

        abstract = clean_abs(paper['abstract'])
        if len(abstract) < 2: continue
        article = clean_text(paper["article"])
        if len(article) < 2: continue

        conclusion = clean_text(paper["conclusion"])

        paper["abstract"] = abstract
        paper["article"] = article
        paper["conclusion"] = conclusion

        with open(os.path.join(des, "%d.json" % i), 'w') as f_out:
            json.dump(paper, f_out, indent=4)
        i += 1
Example #5
def read_duc2004_gold_summaries(gold_dir):
    gold_summaries = defaultdict(dict)

    for dirname in iter_dirs(gold_dir):
        for filename in iter_files(dirname, ""):
            if os.path.basename(filename).startswith("APW"):
                content = read_summary_file(filename)
                gold_summaries[os.path.basename(dirname)][
                    os.path.basename(filename)] = content

    return gold_summaries
Example #6
def parse(dir_name='交运物流'):
    generator = iter_files(dir_name)
    for f in generator:
        try:
            with open(f, 'r') as o:
                doc = json.load(o)
                _parseDoc(doc)
                print('succeed to parse file {}'.format(f))
        except Exception:
            print('fail to parse file {}'.format(f))
Example #7
    def _extract_paths_labels_groups(
            self) -> Tuple[List[Path], List[str], List[int]]:
        file_paths = []
        labels = []
        groups = []
        paths_labels_groups = None
        metadata = None

        if self._metadata is not None:
            metadata = pd.read_csv(self._metadata)
            if self._path_column in metadata:
                metadata[self._path_column] = metadata[
                    self._path_column].apply(Path)
                if self._label_column in metadata:
                    path_existence = metadata[self._path_column].apply(
                        Path.exists)
                    metadata = metadata.loc[path_existence]
                    metadata[self._label_column] = metadata[
                        self._label_column].apply(str)
                    paths_labels_groups = metadata[[
                        self._path_column, self._label_column
                    ]]

        if paths_labels_groups is None and self._data_root:
            # DataFrame.append is deprecated; collect rows first and build the
            # frame in a single call.
            rows = []
            for class_folder in utils.iter_folders(self._data_root):
                label = class_folder.name
                for file_path in utils.iter_files(class_folder):
                    if self.is_file_integral(file_path):
                        rows.append({
                            self._path_column: file_path,
                            self._label_column: label
                        })
            paths_labels_groups = pd.DataFrame(
                rows, columns=[self._path_column, self._label_column])

        if paths_labels_groups is not None:
            if self._with_shuffle:
                paths_labels_groups = paths_labels_groups.sample(
                    frac=1, replace=False, axis=0, random_state=self._seed)
            file_paths = paths_labels_groups[self._path_column].to_list()
            labels = paths_labels_groups[self._label_column].to_list()
            if self._group and metadata is not None and \
               all(c in metadata.columns for c in [self._path_column, self._group]):
                groups = metadata.set_index(self._path_column)\
                                 .loc[file_paths, self._group]

        return file_paths, labels, groups
Example #8
def store_contents(data_path, save_path, num_workers=4, num_files=5):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        num_workers: Number of parallel processes to use when reading docs.
        num_files: Split the db into num_files files.
    """

    logger.info('Reading into database...')

    files = [f for f in utils.iter_files(data_path)]

    if num_files == 1:
        filelist = [files]
    else:
        # Slice the file list into num_files chunks; slicing avoids indexing
        # past the end of the list on the last evenly-sized chunk.
        one_length = len(files) // num_files + 1
        filelist = [
            files[i * one_length:(i + 1) * one_length]
            for i in range(num_files - 1)
        ]
        filelist.append(files[one_length * (num_files - 1):])

    for i, files in enumerate(filelist):
        logger.info('Building %i-th db...' % i)

        temp_save_path = os.path.join(save_path, 'fever%i.db' % i)

        if os.path.isfile(temp_save_path):
            raise RuntimeError('%s already exists! Not overwriting.' %
                               temp_save_path)

        conn = sqlite3.connect(temp_save_path)
        c = conn.cursor()
        c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

        workers = ProcessPool(num_workers)
        count = 0
        with tqdm(total=len(files)) as pbar:
            for pairs in workers.imap_unordered(get_contents, files):
                count += len(pairs)
                c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
                pbar.update()
        workers.close()
        workers.join()
        logger.info('Read %d docs.' % count)
        logger.info('Committing...')
        conn.commit()
        conn.close()
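A minimal usage sketch for store_contents, assuming it is called from the same module; the directory names below are placeholders, not paths from the original project:

if __name__ == "__main__":
    # Read JSON documents under data/docs and write five sqlite shards
    # (fever0.db ... fever4.db) into data/db.
    store_contents(data_path='data/docs', save_path='data/db',
                   num_workers=4, num_files=5)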
Example #9
def extract_json(src, des):
    i = 0
    files = list(iter_files(src))

    for file in tqdm(files):
        # print(file)
        with open(file) as f_in:
            tmp = json.load(f_in)['paper']
        paper = dict()
        if "abstract" in tmp:
            conclusion = ''
            abs_len = len(' '.join(tmp["abstract"]).split())
            if abs_len > 210: continue
            flag = False
            for sec in tmp['sections']:
                if "introduction" in sec.lower():
                    int_len = len(' '.join(tmp["sections"][sec]).split())
                    if int_len > 1000 or abs_len > int_len:
                        break
                    abstract = clean_abs(' '.join(tmp["abstract"]))

                    if len(abstract) < 2:
                        break
                    introduction = clean_text(' '.join(tmp["sections"][sec]))
                    if len(introduction) < 2:
                        break
                    flag = True

                if "conclusion" in sec.lower() and flag:
                    con_len = len(' '.join(tmp["sections"][sec]).split())
                    if con_len > 800:
                        conclusion = ''
                        break
                    conclusion = clean_text(' '.join(tmp["sections"][sec]))
                    break

            if flag:
                paper["abstract"] = abstract
                paper["article"] = introduction
                paper["conclusion"] = conclusion
                name = os.path.basename(file)
                name, _ = os.path.splitext(name)
                paper["id"] = name

                with open(os.path.join(des, "%s.json" % name), 'w') as f_out:
                    json.dump(paper, f_out, indent=4)
                i += 1
Example #10
def split(src, ratio=0.94):
    files = list(iter_files(src))
    random.shuffle(files)
    len_train = int(len(files) * ratio)
    len_val = int(len(files) * (1 - ratio) / 2)
    train = files[:len_train]
    val = files[len_train:len_train + len_val]
    test = files[len_train + len_val:]

    os.makedirs(join(src, 'train'), exist_ok=True)
    os.makedirs(join(src, 'test'), exist_ok=True)
    os.makedirs(join(src, 'val'), exist_ok=True)
    for each in train:
        shutil.move(each, join(src, 'train'))
    for each in test:
        shutil.move(each, join(src, 'test'))
    for each in val:
        shutil.move(each, join(src, 'val'))
Example #11
def dump(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    dump_dir = join(DATA_DIR, 'refs', split)
    n_data = count_data(data_dir)

    for i, file in enumerate(iter_files(data_dir)):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data,
                                                    100 * i / n_data),
              end='')
        name = os.path.basename(file)
        name, _ = os.path.splitext(name)

        with open(join(data_dir, '{}.json'.format(name))) as f:
            data = json.loads(f.read())
        abs_sents = data['abstract']
        with open(join(dump_dir, '{}.ref'.format(name)), 'w') as f:
            f.write(make_html_safe('\n'.join(abs_sents)))
    print('finished in {}'.format(timedelta(seconds=time() - start)))
Example #12
def cross_eval_main():
    parser = argparse.ArgumentParser()

    parser.add_argument("corpus_def")
    parser.add_argument("config")
    parser.add_argument("param_file")

    args = parser.parse_args()

    corpora_and_timelines = []

    with open(args.corpus_def) as f:
        corpus_defs = json.load(f)

    for corpus_def in corpus_defs["corpora"]:
        timeline_dir = corpus_def["tl_dir"]
        corpus_pickle = corpus_def["corpus_pkl"]

        corpus = load_corpus(corpus_pickle)

        timelines = []
        for tl_fname in iter_files(timeline_dir, ".txt"):
            with open(tl_fname, encoding="latin-1") as f:
                timeline = Timeline.from_file(f)
                timelines.append((os.path.basename(tl_fname), timeline))

        corpora_and_timelines.append((corpus, timelines))

    with open(args.config) as f:
        config = json.load(f)

    tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)

    parameters = tl_gen.run_scoring_cv_train_mode(corpora_and_timelines)

    with open(args.param_file, "wb") as f_out:
        pickle.dump(parameters, f_out)
Example #13
    #parser.add_argument("human_tl_dir")
    parser.add_argument("system_tl_dir")

    parser.add_argument("relevant_systems", nargs="+")

    parser.add_argument("outfile")

    args = parser.parse_args()
    relevant_systems = set(args.relevant_systems)

    all_relevant_timelines = defaultdict(lambda: defaultdict(dict))

    for directory in iter_dirs(args.system_tl_dir):
        system_name = os.path.basename(directory)
        for tl_dir in iter_dirs(directory):
            for tlfilename in iter_files(tl_dir, ".txt"):
                #print(system_name, relevant_systems)
                if system_name in relevant_systems:
                    with open(tlfilename) as tlfile:
                        tl_name = os.path.basename(tl_dir)
                        fname = os.path.basename(tlfilename)
                        all_relevant_timelines[system_name][tl_name][fname] = \
                            Timeline.from_file(tlfile)

    #for directory in iter_dirs(args.human_tl_dir):
    #    source_name = os.path.basename(directory)
    #    for tlfilename in iter_files(directory, ".txt"):
    #        with open(tlfilename, errors='ignore') as tlfile:
    #            all_relevant_timelines["human"][source_name][os.path.basename(tlfilename)] = Timeline.from_file(tlfile)

    vectorized_timelines = vectorize_timelines(all_relevant_timelines)
Example #14
from utils import iter_files, make_vocab
import json
import re
import shutil
from tqdm import tqdm
from utils import split

s_path = r'E:\HTML\a'
d_path = r'E:\DATASET\arxiv_json\arxiv_html'

for file in iter_files(s_path):
    try:
        shutil.move(file, d_path)
    except Exception:
        # Skip files that cannot be moved (e.g. already present at the destination).
        pass
# in_path = '/home/yhj/dataset/conference_json'
# path = '/home/yhj/dataset/emnlp'
# path = r'E:\DATASET\arxiv_tex'
# for file in tqdm(list(iter_files(path))):
#     paper = json.load(open(file))
#     art = paper['article']
#     abs = paper['abstract']
#     con = paper['conclusion']
#     paper['article'] = [' '.join(each.split()) for each in art]
#     paper['abstract'] = [' '.join(each.split()) for each in abs]
#     paper['conclusion'] = [' '.join(each.split()) for each in con]
#     json.dump(paper,open(file,'w'),indent=4)

# split(path,'/home/yhj/dataset/emnlp')

# make_vocab(in_path,path)
Example #15
    parser = argparse.ArgumentParser()
    parser.add_argument('db_path', type=str, default=None,
                        help='Path to sqlite db holding document texts')
    parser.add_argument('out_dir', type=str, default=None,
                        help='Directory for saving output files')
    parser.add_argument('--ngram', type=int, default=1,
                        help=('Use up to N-size n-grams '
                              '(e.g. 2 = unigrams + bigrams)'))
    parser.add_argument('--hash-size', type=int, default=int(math.pow(2, 24)),
                        help='Number of buckets to use for hashing ngrams')
    parser.add_argument('--num-workers', type=int, default=4,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()


    db_files = [f for f in utils.iter_files(args.db_path)]

    for i, f in enumerate(db_files):
        logger.info('Processing file %i...' % i)
        
        logger.info('Counting words...')

        count_matrix, doc_dict = get_count_matrix(
            args, 'sqlite', {'db_path': f}
        )

        logger.info('Getting word-doc frequencies...')
        freqs = get_doc_freqs(count_matrix)

        basename = os.path.splitext(os.path.basename(f))[0]
        basename += ('-ngram=%d-hash=%d' %
Example #16
            break


    paper['extracted'] = extracted
    paper['score'] = scores


    json.dump(paper,open(save_name,'w'),indent=4)


if __name__ == "__main__":
    path = '/home/yhj/dataset/emnlp_mix_int_he'
    for split in ['train', 'test', 'val']:
        print("labeling %s..." % split)

        data_path = os.path.join(path, split)
        files = list(iter_files(data_path))

        t1 = time.time()
        save_path = os.path.join(path, "%s" % split)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with mp.Pool() as pool:
            list(pool.imap_unordered(label(save_path), files, chunksize=1024))
        # for file in tqdm(files):
        #
        #     p.apply_async(func=label, args=(file, save_name))

        t2 = time.time()
        print('%s time cost : %.1f seconds' % (split, (t2 - t1)))
Example #17
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("Merge count matrix")

    parser = argparse.ArgumentParser()
    parser.add_argument('ct_path',
                        type=str,
                        default=None,
                        help='Path to count matrices')
    parser.add_argument('out_dir',
                        type=str,
                        default=None,
                        help='Directory for saving output files')
    args = parser.parse_args()

    ct_files = [f for f in utils.iter_files(args.ct_path)]

    logger.info('Loading the zeroth count matrix...')
    mat, metadata = utils.load_sparse_csr(ct_files[0])

    DOC2IDX, doc_ids = metadata['doc_dict']

    for i in range(1, len(ct_files)):

        logger.info('Loading %ith count matrix...' % i)
        nxt_mat, nxt_metadata = utils.load_sparse_csr(ct_files[i])

        if metadata['hash_size'] != nxt_metadata['hash_size']:
            raise RuntimeError('hash_size not equal in %ith file' % i)
        if metadata['ngram'] != nxt_metadata['ngram']:
            raise RuntimeError('ngram not equal in %ith file' % i)
Example #18

if __name__ == "__main__":

    save_root = '/home/yhj/dataset/conference_json'
    root_path = '/home/yhj/dataset/conference'

    # debug = False

    global save_path
    for path in os.listdir(root_path):
        path = os.path.join(root_path, path)
        print(path)

        save_path = os.path.join(save_root, basename(path))
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        start = len(os.listdir(save_path))

        # if debug:start=0

        print("start with %d" % start)
        files = list(iter_files(path))
        start_time = time.time()
        for i in tqdm(range(start, len(files))):
            extract(i, files[i])

    delete_skip(save_root)

    # split(r'E:\conference')
Example #19
def delete_skip(path):
    for file in iter_files(path):
        if file.endswith('skip'):
            os.remove(file)
Example #20
batch_size = 64  # Batch size for training.
epochs = 2  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

# Path to the directory of data txt files on disk.
data_path = 'data/raw_sent_pair/fluency/'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for file in utils.iter_files(data_path):
    with open(file, 'r') as f:
        lines = f.read().split('\n')
    for line in lines[:min(num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)
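After these loops, the standard Keras character-level seq2seq recipe turns the two character sets into sorted vocabularies and integer lookup tables; a minimal sketch (variable names are the conventional ones, not taken from the original script):

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Map each character to an integer id for one-hot encoding.
input_token_index = dict((char, i) for i, char in enumerate(input_characters))
target_token_index = dict((char, i) for i, char in enumerate(target_characters))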